├── .gitignore ├── .gitmodules ├── .travis.yml ├── AUTHORS.rst ├── ChangeLog.rst ├── LICENSE ├── MANIFEST.in ├── README.rst ├── bootstrap.py ├── buildout.cfg ├── setup.py └── stop_words ├── __init__.py └── tests.py /.gitignore: -------------------------------------------------------------------------------- 1 | # IDE specific 2 | .idea/ 3 | .spyderproject 4 | # file 5 | *.pyc 6 | manage.py 7 | dump.json 8 | MANIFEST 9 | # dir 10 | build/ 11 | dist/ 12 | *.egg-info/ 13 | logs/ 14 | src/ 15 | .c9/ 16 | bin/ 17 | develop-eggs/ 18 | eggs/ 19 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "stop_words/stop-words"] 2 | path = stop_words/stop-words 3 | url = https://github.com/Alir3z4/stop-words.git 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | - "3.4" 5 | - "3.5" 6 | - "3.6" 7 | - "3.7-dev" # 3.7 development branch 8 | - "nightly" # currently points to 3.7-dev 9 | install: 10 | - git submodule init 11 | - git submodule update 12 | - git submodule foreach git pull origin master 13 | - pip install -U setuptools coveralls 14 | - python bootstrap.py 15 | - ./bin/buildout 16 | before_script: 17 | - ./bin/flake8 stop_words 18 | script: 19 | - ./bin/cover 20 | notifications: 21 | irc: 22 | - "irc.freenode.org#python-stop-words" 23 | after_success: 24 | coveralls 25 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ``python-stop-words`` was originally created in middle 2014 at home, the bedroom 2 | division of the Alireza's place somewhere on planet earth maybe. 
3 | 4 | The PRIMARY AUTHORS are (and/or have been): 5 | 6 | * Alireza Savand 7 | * François 8 | 9 | And here is an inevitably incomplete list of MUCH-APPRECIATED CONTRIBUTORS -- 10 | people who have submitted patches, reported bugs, added translations, helped 11 | answer newbie questions, and generally made ``python-stop-words`` that much better: 12 | 13 | * Alireza Savand 14 | * Julien Fache 15 | * David Miró 16 | * Taras Labiak 17 | 18 | 19 | A big THANK YOU goes to: 20 | 21 | * François for convincing Alireza to start the project. 22 | * Guido van Rossum for creating Python. 23 | -------------------------------------------------------------------------------- /ChangeLog.rst: -------------------------------------------------------------------------------- 1 | 2018.7.23 2 | ========= 3 | 4 | * Fixed #14: `languages.json` is missing, if you don't git clone with `--recursive`. 5 | * Feature: Support latest version of Python (3.7+). 6 | * Feature #22: Enforces packaging of eggs into folders. 7 | * Update the `stop-words` repository to get the latest languages. 8 | * Fixed Travis failing and tests due to bootstrap. 9 | 10 | 11 | 2015.2.23.1 12 | =========== 13 | 14 | * Fixed #9: Missing ``languages.json`` file that breaks the installation. 15 | 16 | 17 | 2015.2.23 18 | ========= 19 | 20 | * Feature: Using the cache is optional 21 | * Feature: Filtering stopwords 22 | 23 | 2015.2.21 24 | ========= 25 | 26 | * Feature: ``LANGUAGE_MAPPING`` is loaded from stop-words/languages.json 27 | * Fixed: Made paths OS-independent 28 | 29 | 30 | 2015.1.31 31 | ========= 32 | 33 | * Feature #5: Decode error AND Add ``catalan`` language to ``LANGUAGE_MAPPING``. 34 | * Feature: Update `stop-words` dictionary. 
35 | 36 | 37 | 2015.1.22 38 | ========= 39 | 40 | * Feature: Tests 41 | * Feature: Python 3 support 42 | * Feature: Dev installation via zc.buildout 43 | * Feature: Continuous integration via Travis 44 | 45 | 46 | 2015.1.19 47 | ========= 48 | 49 | * Feature #3: Handle language code, cache and custom errors 50 | 51 | 52 | 2014.5.26 53 | ========= 54 | 55 | * Initial release. 56 | * Package on pypi. 57 | * github.com/Alir3z4/stop-words as submodule. 58 | 59 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Alireza Savand, Contributors 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of the {organization} nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.rst 3 | include ChangeLog.rst 4 | include AUTHORS.rst 5 | recursive-include stop_words/stop-words *.txt 6 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | Python Stop Words 3 | ================= 4 | 5 | .. contents:: Table of contents 6 | 7 | Overview 8 | -------- 9 | 10 | Get list of common stop words in various languages in Python. 11 | 12 | .. image:: https://secure.travis-ci.org/Alir3z4/python-stop-words.png 13 | :alt: Build Status 14 | :target: http://travis-ci.org/Alir3z4/python-stop-words 15 | 16 | .. image:: https://coveralls.io/repos/Alir3z4/python-stop-words/badge.png 17 | :alt: Coverage Status 18 | :target: https://coveralls.io/r/Alir3z4/python-stop-words 19 | 20 | .. image:: http://badge.kloud51.com/pypi/v/stop-words.svg 21 | :target: https://pypi.python.org/pypi/stop-words 22 | :alt: PyPI Version 23 | 24 | .. image:: http://badge.kloud51.com/pypi/s/stop-words.svg 25 | :target: https://pypi.python.org/pypi/stop-words 26 | :alt: PyPI Status 27 | 28 | .. 
image:: http://badge.kloud51.com/pypi/l/stop-words.svg 29 | :target: https://github.com/Alir3z4/python-stop-words/blob/master/LICENSE 30 | :alt: License 31 | 32 | .. image:: http://badge.kloud51.com/pypi/p/stop-words.svg 33 | :target: https://pypi.python.org/pypi/stop-words 34 | :alt: PyPI Py_versions 35 | 36 | 37 | Available languages 38 | ------------------- 39 | 40 | * Arabic 41 | * Bulgarian 42 | * Catalan 43 | * Czech 44 | * Danish 45 | * Dutch 46 | * English 47 | * Finnish 48 | * French 49 | * German 50 | * Hungarian 51 | * Indonesian 52 | * Italian 53 | * Norwegian 54 | * Polish 55 | * Portuguese 56 | * Romanian 57 | * Russian 58 | * Spanish 59 | * Swedish 60 | * Turkish 61 | * Ukrainian 62 | 63 | 64 | Installation 65 | ------------ 66 | ``stop-words`` is available on PyPI 67 | 68 | http://pypi.python.org/pypi/stop-words 69 | 70 | So easily install it by ``pip`` 71 | :: 72 | 73 | $ pip install stop-words 74 | 75 | Another way is by cloning ``stop-words``'s `git repo <https://github.com/Alir3z4/python-stop-words>`_ :: 76 | 77 | $ git clone --recursive git://github.com/Alir3z4/python-stop-words.git 78 | 79 | Then install it by running: 80 | :: 81 | 82 | $ python setup.py install 83 | 84 | 85 | Basic usage 86 | ----------- 87 | :: 88 | 89 | from stop_words import get_stop_words 90 | 91 | stop_words = get_stop_words('en') 92 | stop_words = get_stop_words('english') 93 | 94 | from stop_words import safe_get_stop_words 95 | 96 | stop_words = safe_get_stop_words('unsupported language') 97 | 98 | Python compatibility 99 | -------------------- 100 | 101 | Python Stop Words is compatible with: 102 | 103 | * Python 2.7 104 | * Python 3.4 105 | * Python 3.5 106 | * Python 3.6 107 | * Python 3.7 108 | -------------------------------------------------------------------------------- /bootstrap.py: -------------------------------------------------------------------------------- 1 | ############################################################################## 2 | # 3 | # Copyright (c) 2006 Zope Foundation and
# Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Bootstrap a buildout-based project

Simply run this script in a directory containing a buildout.cfg.
The script accepts buildout command-line options, so you can
use the -c option to specify an alternate configuration file.
"""

import os
import shutil
import sys
import tempfile

from optparse import OptionParser

__version__ = '2015-07-01'
# See zc.buildout's changelog if this version is up to date.

# Scratch directory that receives the downloaded setuptools/zc.buildout eggs.
# It is removed only by the shutil.rmtree() call on the very last line.
# NOTE(review): it is created before the options are parsed, so the
# ``--version`` early exit and any exception raised further down leave the
# directory behind.
tmpeggs = tempfile.mkdtemp(prefix='bootstrap-')

usage = '''\
[DESIRED PYTHON FOR BUILDOUT] bootstrap.py [options]

Bootstraps a buildout-based project.

Simply run this script in a directory containing a buildout.cfg, using the
Python that you want bin/buildout to use.

Note that by using --find-links to point to local resources, you can keep
this script from going over the network.
'''

parser = OptionParser(usage=usage)
parser.add_option("--version",
                  action="store_true", default=False,
                  help=("Return bootstrap.py version."))
parser.add_option("-t", "--accept-buildout-test-releases",
                  dest='accept_buildout_test_releases',
                  action="store_true", default=False,
                  help=("Normally, if you do not specify a --buildout-version, "
                        "the bootstrap script and buildout gets the newest "
                        "*final* versions of zc.buildout and its recipes and "
                        "extensions for you. If you use this flag, "
                        "bootstrap and buildout will get the newest releases "
                        "even if they are alphas or betas."))
parser.add_option("-c", "--config-file",
                  help=("Specify the path to the buildout configuration "
                        "file to be used."))
parser.add_option("-f", "--find-links",
                  help=("Specify a URL to search for buildout releases"))
parser.add_option("--allow-site-packages",
                  action="store_true", default=False,
                  help=("Let bootstrap.py use existing site packages"))
parser.add_option("--buildout-version",
                  help="Use a specific zc.buildout version")
parser.add_option("--setuptools-version",
                  help="Use a specific setuptools version")
parser.add_option("--setuptools-to-dir",
                  help=("Allow for re-use of existing directory of "
                        "setuptools versions"))

options, args = parser.parse_args()
if options.version:
    print("bootstrap.py version %s" % __version__)
    sys.exit(0)


######################################################################
# load/install setuptools

# Python 3 location first, Python 2 location as the fallback.
try:
    from urllib.request import urlopen
except ImportError:
    from urllib2 import urlopen

# Namespace the ez_setup script is executed in; ``use_setuptools`` is looked
# up in it further down.
ez = {}
if os.path.exists('ez_setup.py'):
    # A local ez_setup.py in the current directory takes precedence.
    # NOTE(review): the file object is never closed explicitly.
    exec(open('ez_setup.py').read(), ez)
else:
    # SECURITY NOTE(review): this downloads a script and exec()s it without
    # any checksum or signature verification.
    exec(urlopen('https://bootstrap.pypa.io/ez_setup.py').read(), ez)

if not options.allow_site_packages:
    # ez_setup imports site, which adds site packages
    # this will remove them from the path to ensure that incompatible versions
    # of setuptools are not in the path
    import site
    # inside a virtualenv, there is no 'getsitepackages'.
    # We can't remove these reliably
    if hasattr(site, 'getsitepackages'):
        for sitepackage_path in site.getsitepackages():
            # Strip all site-packages directories from sys.path that
            # are not sys.prefix; this is because on Windows
            # sys.prefix is a site-package directory.
            if sitepackage_path != sys.prefix:
                # Substring match: every sys.path entry that contains the
                # site-packages path is dropped, not only the exact entry.
                sys.path[:] = [x for x in sys.path
                               if sitepackage_path not in x]

# Keyword arguments for ez_setup's use_setuptools(); the command-line options
# below can pin the version and redirect the download directory.
setup_args = dict(to_dir=tmpeggs, download_delay=0)

if options.setuptools_version is not None:
    setup_args['version'] = options.setuptools_version
if options.setuptools_to_dir is not None:
    setup_args['to_dir'] = options.setuptools_to_dir

ez['use_setuptools'](**setup_args)
import setuptools
import pkg_resources

# This does not (always?) update the default working set. We will
# do it.
for path in sys.path:
    if path not in pkg_resources.working_set.entries:
        pkg_resources.working_set.add_entry(path)

######################################################################
# Install buildout

ws = pkg_resources.working_set

setuptools_path = ws.find(
    pkg_resources.Requirement.parse('setuptools')).location

# Fix sys.path here as easy_install.pth added before PYTHONPATH
# The command runs easy_install in a child interpreter with the located
# setuptools first on sys.path and tmpeggs as the target directory.
cmd = [sys.executable, '-c',
       'import sys; sys.path[0:0] = [%r]; ' % setuptools_path +
       'from setuptools.command.easy_install import main; main()',
       '-mZqNxd', tmpeggs]

# Precedence: the 'bootstrap-testing-find-links' environment variable, then
# --find-links, then downloads.buildout.org (only when test releases are
# accepted), otherwise None.
find_links = os.environ.get(
    'bootstrap-testing-find-links',
    options.find_links or
    ('http://downloads.buildout.org/'
     if options.accept_buildout_test_releases else None)
    )
if find_links:
    cmd.extend(['-f', find_links])

requirement = 'zc.buildout'
version = options.buildout_version
if version is None and not options.accept_buildout_test_releases:
    # Figure out the most recent final version of zc.buildout.
    import setuptools.package_index
    _final_parts = '*final-', '*final'

    def _final_version(parsed_version):
        """Return True when ``parsed_version`` is not a pre-release."""
        try:
            return not parsed_version.is_prerelease
        except AttributeError:
            # Older setuptools
            # Fallback when there is no ``is_prerelease`` attribute: the
            # parsed version is iterated, and any part that starts with '*'
            # but is not one of _final_parts marks it as non-final.
            for part in parsed_version:
                if (part[:1] == '*') and (part not in _final_parts):
                    return False
            return True

    index = setuptools.package_index.PackageIndex(
        search_path=[setuptools_path])
    if find_links:
        index.add_find_links((find_links,))
    req = pkg_resources.Requirement.parse(requirement)
    if index.obtain(req) is not None:
        # Collect every distribution that shares the highest final version.
        best = []
        bestv = None
        for dist in index[req.project_name]:
            distv = dist.parsed_version
            if _final_version(distv):
                if bestv is None or distv > bestv:
                    best = [dist]
                    bestv = distv
                elif distv == bestv:
                    best.append(dist)
        if best:
            best.sort()
            version = best[-1].version
if version:
    # Pin the requirement, e.g. 'zc.buildout==<version>'.
    requirement = '=='.join((requirement, version))
cmd.append(requirement)

import subprocess
if subprocess.call(cmd) != 0:
    raise Exception(
        "Failed to execute command:\n%s" % repr(cmd)[1:-1])

######################################################################
# Import and run buildout

ws.add_entry(tmpeggs)
ws.require(requirement)
import zc.buildout.buildout

# When every remaining argument contains '=' (or there are none), default to
# the 'bootstrap' command.
if not [a for a in args if '=' not in a]:
    args.append('bootstrap')

# if -c was provided, we push it back into args for buildout' main function
if options.config_file is not None:
    args[0:0] = ['-c', options.config_file]

zc.buildout.buildout.main(args)
shutil.rmtree(tmpeggs)
3 | parts = test 4 | cover 5 | flake8 6 | evolve 7 | show-picked-versions = true 8 | 9 | [evolve] 10 | arguments = '-s buildout.cfg -w --indent 32 --sorting alpha' 11 | eggs = buildout-versions-checker 12 | recipe = zc.recipe.egg 13 | scripts = check-buildout-updates=${:_buildout_section_name_} 14 | 15 | [test] 16 | defaults = --with-progressive 17 | eggs = nose 18 | nose-progressive 19 | recipe = pbp.recipe.noserunner 20 | 21 | [cover] 22 | <= test 23 | defaults = --with-coverage 24 | --cover-erase 25 | --cover-package=stop_words 26 | eggs = nose 27 | coverage 28 | 29 | [flake8] 30 | eggs = flake8 31 | recipe = zc.recipe.egg 32 | 33 | [versions] 34 | blessings = 1.6 35 | buildout-versions-checker = 1.5.1 36 | coverage = 3.7.1 37 | flake8 = 2.3.0 38 | futures = 2.2.0 39 | mccabe = 0.3 40 | nose = 1.3.4 41 | nose-progressive = 1.5.1 42 | pbp.recipe.noserunner = 0.2.6 43 | pep8 = 1.5.7 44 | pyflakes = 0.8.1 45 | six = 1.10.0 46 | zc.buildout = 2.12.1 47 | zc.recipe.egg = 2.0.7 48 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='stop-words', 5 | version=__import__("stop_words").get_version(), 6 | description='Get list of common stop words in various languages in Python', 7 | long_description=open('README.rst').read(), 8 | license=open('LICENSE').read(), 9 | author='Alireza Savand', 10 | author_email='alireza.savand@gmail.com', 11 | url='https://github.com/Alir3z4/python-stop-words', 12 | packages=find_packages(), 13 | zip_safe=False, 14 | package_data={ 15 | 'stop_words': [ 16 | 'stop-words/*.txt', 17 | 'stop-words/languages.json', 18 | ] 19 | }, 20 | classifiers=[ 21 | 'Programming Language :: Python', 22 | 'Intended Audience :: Developers', 23 | 'Operating System :: OS Independent', 24 | 'Topic :: Software Development', 25 | 'Development Status :: 6 - Mature', 26 | 
"""Load lists of common stop words from the bundled ``stop-words`` data."""
import json
import os

__VERSION__ = (2018, 7, 23)
CURRENT_DIR = os.path.dirname(os.path.realpath(__file__))
STOP_WORDS_DIR = os.path.join(CURRENT_DIR, 'stop-words')
# Language name -> stop word list already loaded (and filtered) for it.
STOP_WORDS_CACHE = {}

# ``languages.json`` maps accepted aliases (e.g. ``'en'``) to the language
# name that is also the base name of the ``<name>.txt`` data file.
# A missing file still fails loudly at import time, exactly as before; the
# scratch ``buffer`` name (a Python 2 builtin) is no longer left in the module.
with open(os.path.join(STOP_WORDS_DIR, 'languages.json'), 'rb') as map_file:
    LANGUAGE_MAPPING = json.loads(map_file.read().decode('ascii'))

AVAILABLE_LANGUAGES = list(LANGUAGE_MAPPING.values())


def get_version():
    """
    Return the package version as a dotted string built from ``__VERSION__``.

    :rtype: basestring
    """
    return ".".join(str(v) for v in __VERSION__)


class StopWordError(Exception):
    """Raised when a language is unknown or its data file cannot be read."""


def get_stop_words(language, cache=True):
    """
    Return the stop words of ``language``.

    ``language`` may be a key of ``LANGUAGE_MAPPING`` or one of
    ``AVAILABLE_LANGUAGES``. With ``cache`` enabled a previously loaded list
    is returned as-is (the very same list object, so copy it before mutating)
    and a freshly loaded list is stored for later calls.

    :type language: basestring
    :type cache: bool

    :rtype: list
    :raises StopWordError: if the language is unavailable, or if an
        ``IOError`` occurs while reading the data file or applying filters.
    """
    try:
        language = LANGUAGE_MAPPING[language]
    except KeyError:
        # Not an alias: accept it only if it already is a language name.
        if language not in AVAILABLE_LANGUAGES:
            raise StopWordError(
                '"{0}" language is unavailable.'.format(language)
            )

    if cache and language in STOP_WORDS_CACHE:
        return STOP_WORDS_CACHE[language]

    language_filename = os.path.join(STOP_WORDS_DIR, language + '.txt')
    try:
        with open(language_filename, 'rb') as language_file:
            # Read as bytes and decode explicitly so the result does not
            # depend on the platform's default text encoding.
            stop_words = [line.decode('utf-8').strip()
                          for line in language_file]
        stop_words = apply_filters(stop_words, language)
    except IOError:
        raise StopWordError(
            '"{0}" file is unreadable, check your installation.'.format(
                language_filename
            )
        )

    if cache:
        STOP_WORDS_CACHE[language] = stop_words

    return stop_words


# Registered filters: language name -> list of callables. The ``None`` key
# holds the filters that apply to every language.
_filters = {None: []}


def apply_filters(stopwords, language):
    """
    Apply registered filters to stopwords.

    Filters registered for ``language`` run first and are called as
    ``func(stopwords)``; filters registered for all languages run afterwards
    and are called as ``func(stopwords, language)``.

    :param stopwords: list
    :param language: string
    :return: filtered stopwords
    """
    for func in _filters.get(language, ()):
        stopwords = func(stopwords)

    for func in _filters[None]:
        stopwords = func(stopwords, language)

    return stopwords


def add_filter(func, language=None):
    """
    Register filters for specific language.
    If language == None the filter applies for all languages.
    Filter will not apply for stop words in cache.

    :param func: callable
    :param language: string|None
    :return:
    """
    _filters.setdefault(language, []).append(func)


def remove_filter(func, language=None):
    """
    Unregister a filter previously added with ``add_filter``.

    :param func: callable
    :param language: string|None
    :return: True if the filter was registered and has been removed,
        False otherwise
    """
    try:
        _filters[language].remove(func)
    except (KeyError, ValueError):
        # No filters for this language, or func is not among them.
        return False
    return True


def safe_get_stop_words(language, cache=True):
    """
    Like ``get_stop_words`` but return an empty list instead of raising
    ``StopWordError``.

    :type language: basestring
    :type cache: bool

    :rtype: list
    """
    try:
        return get_stop_words(language, cache)
    except StopWordError:
        return []
"""
Tests for stop-words
"""
import random
from unittest import TestCase
from unittest import TestSuite
from unittest import TestLoader

import stop_words
from stop_words import get_stop_words
from stop_words import safe_get_stop_words
from stop_words import StopWordError
from stop_words import LANGUAGE_MAPPING
from stop_words import AVAILABLE_LANGUAGES


class StopWordsTestCase(TestCase):
    """Cover loading, caching, error handling and filtering of stop words."""

    number_of_english_stop_words = 174

    def setUp(self):
        # The cache is module-level state; start each test with it empty so
        # the outcome does not depend on which tests ran before.
        stop_words.STOP_WORDS_CACHE.clear()

    def _use_missing_stop_words_dir(self):
        """Point STOP_WORDS_DIR at a missing directory for the current test.

        The original value is restored by a cleanup, so it is put back even
        when an assertion fails.
        """
        self.addCleanup(
            setattr, stop_words, 'STOP_WORDS_DIR', stop_words.STOP_WORDS_DIR
        )
        stop_words.STOP_WORDS_DIR = 'not-existing-directory'

    def test_get_stop_words(self):
        sw = get_stop_words('english')
        self.assertEqual(len(sw), self.number_of_english_stop_words)

    def test_get_stop_words_language_mapping(self):
        sw = get_stop_words('en')
        self.assertEqual(len(sw), self.number_of_english_stop_words)
        self.assertEqual(sw, get_stop_words('english'))

    def test_get_stop_words_cache(self):
        self.assertNotIn('french', stop_words.STOP_WORDS_CACHE)
        sw = get_stop_words('fr')
        self.assertIn('french', stop_words.STOP_WORDS_CACHE)
        # With the data directory gone, only the cache can serve this call.
        self._use_missing_stop_words_dir()
        self.assertEqual(sw, get_stop_words('french'))
        # An unknown language must fail with StopWordError (nothing broader
        # is swallowed) and must not leave a cache entry behind.
        self.assertRaises(StopWordError, get_stop_words, 'klingon')
        self.assertNotIn('klingon', stop_words.STOP_WORDS_CACHE)

    def test_get_stop_words_unavailable_language(self):
        self.assertRaises(StopWordError, get_stop_words, 'sindarin')

    def test_get_stop_words_install_issue(self):
        self._use_missing_stop_words_dir()
        self.assertRaises(StopWordError, get_stop_words, 'german')

    def test_safe_get_stop_words(self):
        self.assertRaises(StopWordError, get_stop_words, 'huttese')
        self.assertEqual(safe_get_stop_words('huttese'), [])

    def test_random_language_stop_words_load(self):
        languages = list(LANGUAGE_MAPPING.keys()) + list(AVAILABLE_LANGUAGES)
        # Visit every alias and language name in a random order.
        random.shuffle(languages)
        for language in languages:
            # Named ``words`` so the imported ``stop_words`` module is not
            # shadowed inside this method.
            words = safe_get_stop_words(language)
            self.assertTrue(
                len(words) > 0,
                'Cannot load stopwords for {0} language'.format(language)
            )

    def test_filters(self):
        language = 'en'
        before = get_stop_words(language, False)
        # Pick the letter from a non-empty word: random.choice('') raises.
        letter = random.choice(random.choice([w for w in before if w]))

        def remove_letter(stopwords, language):
            return [word for word in stopwords if letter not in word]

        stop_words.add_filter(remove_letter)
        # Safety net: unregister the filter even if an assertion below fails.
        # After the explicit removal at the end this is a harmless no-op.
        self.addCleanup(stop_words.remove_filter, remove_letter)
        after = get_stop_words(language, False)
        for stopword in after:
            self.assertNotIn(letter, stopword)
        self.assertTrue(stop_words.remove_filter(remove_letter))


loader = TestLoader()

test_suite = TestSuite(
    [
        loader.loadTestsFromTestCase(StopWordsTestCase),
    ]
)