├── .gitignore
├── .gitmodules
├── .travis.yml
├── AUTHORS.rst
├── ChangeLog.rst
├── LICENSE
├── MANIFEST.in
├── README.rst
├── bootstrap.py
├── buildout.cfg
├── setup.py
└── stop_words
    ├── __init__.py
    └── tests.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # IDE specific
 2 | .idea/
 3 | .spyderproject
 4 | # file
 5 | *.pyc
 6 | manage.py
 7 | dump.json
 8 | MANIFEST
 9 | # dir
10 | build/
11 | dist/
12 | *.egg-info/
13 | logs/
14 | src/
15 | .c9/
16 | bin/
17 | develop-eggs/
18 | eggs/
19 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "stop_words/stop-words"]
2 | 	path = stop_words/stop-words
3 | 	url = https://github.com/Alir3z4/stop-words.git
4 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | python:
 3 |     - "2.7"
 4 |     - "3.4"
 5 |     - "3.5"
 6 |     - "3.6"
 7 |     - "3.7-dev" # 3.7 development branch
 8 |     - "nightly" # currently points to 3.7-dev
 9 | install:
10 |     - git submodule init
11 |     - git submodule update
12 |     - git submodule foreach git pull origin master
13 |     - pip install -U setuptools coveralls
14 |     - python bootstrap.py
15 |     - ./bin/buildout
16 | before_script:
17 |     - ./bin/flake8 stop_words
18 | script:
19 |     - ./bin/cover
20 | notifications:
21 |     irc:
22 |         - "irc.freenode.org#python-stop-words"
23 | after_success:
24 |     coveralls
25 | 


--------------------------------------------------------------------------------
/AUTHORS.rst:
--------------------------------------------------------------------------------
 1 | ``python-stop-words`` was originally created in middle 2014 at home, the bedroom
 2 | division of the Alireza's place somewhere on planet earth maybe.
 3 | 
 4 | The PRIMARY AUTHORS are (and/or have been):
 5 | 
 6 | * Alireza Savand <alireza.savand@gmail.com>
 7 | * François‎
 8 | 
 9 | And here is an inevitably incomplete list of MUCH-APPRECIATED CONTRIBUTORS --
10 | people who have submitted patches, reported bugs, added translations, helped
11 | answer newbie questions, and generally made ``python-stop-words`` that much better:
12 | 
13 | * Alireza Savand <alireza.savand@gmail.com>
14 | * Julien Fache <fantomas42@gmail.com>
15 | * David Miró <lite.3engine@gmail.com>
16 | * Taras Labiak <kissarat@gmail.com>
17 | 
18 | 
19 | A big THANK YOU goes to:
20 | 
21 | * François‎ for convincing Alireza to start the project.
22 | * Guido van Rossum for creating Python.
23 | 


--------------------------------------------------------------------------------
/ChangeLog.rst:
--------------------------------------------------------------------------------
 1 | 2018.7.23
 2 | =========
 3 | 
 4 | * Fixed #14: `languages.json` is missing, if you don't git clone with `--recursive`.
 5 | * Feature: Support latest version of Python (3.7+).
 6 | * Feature #22: Enforces packaging of eggs into folders.
 7 | * Update the `stop-words` repository to get the latest languages.
 8 | * Fixed Travis failing and tests due to bootstrap.
 9 | 
10 | 
11 | 2015.2.23.1
12 | ===========
13 | 
14 | * Fixed #9: Missing ``languages.json`` file that breaks the installation.
15 | 
16 | 
17 | 2015.2.23
18 | =========
19 | 
20 | * Feature: Using the cache is optional
21 | * Feature: Filtering stopwords
22 | 
23 | 2015.2.21
24 | =========
25 | 
26 | * Feature: ``LANGUAGE_MAPPING`` is loads from stop-words/languages.json
27 | * Fixed: Made paths OS-independent
28 | 
29 | 
30 | 2015.1.31
31 | =========
32 | 
33 | * Feature #5: Decode error AND Add ``catalan`` language to ``LANGUAGE_MAPPING`.
34 | * Feature: Update `stop-words` dictionary.
35 | 
36 | 
37 | 2015.1.22
38 | =========
39 | 
40 | * Feature: Tests
41 | * Feature: Python 3 support
42 | * Feature: Dev installation via zc.buildout
43 | * Feature: Continuous integration via Travis
44 | 
45 | 
46 | 2015.1.19
47 | =========
48 | 
49 | * Feature #3: Handle language code, cache and custom errors 
50 | 
51 | 
52 | 2014.5.26
53 | =========
54 | 
55 | * Initial release.
56 | * Package on pypi.
57 | * github.com/Alir3z4/stop-words as submodule.
58 | 
59 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2014, Alireza Savand, Contributors
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are met:
 6 | 
 7 | * Redistributions of source code must retain the above copyright notice, this
 8 |   list of conditions and the following disclaimer.
 9 | 
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 |   this list of conditions and the following disclaimer in the documentation
12 |   and/or other materials provided with the distribution.
13 | 
14 | * Neither the name of the {organization} nor the names of its
15 |   contributors may be used to endorse or promote products derived from
16 |   this software without specific prior written permission.
17 | 
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include README.rst
3 | include ChangeLog.rst
4 | include AUTHORS.rst
5 | recursive-include stop_words/stop-words *.txt
6 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
  1 | =================
  2 | Python Stop Words
  3 | =================
  4 | 
  5 | .. contents:: Table of contents
  6 | 
  7 | Overview
  8 | --------
  9 | 
 10 | Get list of common stop words in various languages in Python.
 11 | 
 12 | .. image:: https://secure.travis-ci.org/Alir3z4/python-stop-words.png
 13 |    :alt: Build Status
 14 |    :target: http://travis-ci.org/Alir3z4/python-stop-words
 15 | 
 16 | .. image:: https://coveralls.io/repos/Alir3z4/python-stop-words/badge.png
 17 |    :alt: Coverage Status
 18 |    :target: https://coveralls.io/r/Alir3z4/python-stop-words
 19 | 
 20 | .. image:: http://badge.kloud51.com/pypi/v/stop-words.svg
 21 |     :target: https://pypi.python.org/pypi/stop-words
 22 |     :alt: PyPI Version
 23 | 
 24 | .. image:: http://badge.kloud51.com/pypi/s/stop-words.svg
 25 |     :target: https://pypi.python.org/pypi/stop-words
 26 |     :alt: PyPI Status
 27 | 
 28 | .. image:: http://badge.kloud51.com/pypi/l/stop-words.svg
 29 |     :target: https://github.com/Alir3z4/python-stop-words/blob/master/LICENSE
 30 |     :alt: License
 31 | 
 32 | .. image:: http://badge.kloud51.com/pypi/p/stop-words.svg
 33 |     :target: https://pypi.python.org/pypi/stop-words
 34 |     :alt: PyPI Py_versions
 35 | 
 36 | 
 37 | Available languages
 38 | -------------------
 39 | 
 40 | * Arabic
 41 | * Bulgarian
 42 | * Catalan
 43 | * Czech
 44 | * Danish
 45 | * Dutch
 46 | * English
 47 | * Finnish
 48 | * French
 49 | * German
 50 | * Hungarian
 51 | * Indonesian
 52 | * Italian
 53 | * Norwegian
 54 | * Polish
 55 | * Portuguese
 56 | * Romanian
 57 | * Russian
 58 | * Spanish
 59 | * Swedish
 60 | * Turkish
 61 | * Ukrainian
 62 | 
 63 | 
 64 | Installation
 65 | ------------
 66 | ``stop-words`` is available on PyPI
 67 | 
 68 | http://pypi.python.org/pypi/stop-words
 69 | 
 70 | So easily install it by ``pip``
 71 | ::
 72 | 
 73 |     $ pip install stop-words
 74 | 
 75 | Another way is by cloning ``stop-words``'s `git repo <https://github.com/Alir3z4/python-stop-words>`_ ::
 76 | 
 77 |     $ git clone --recursive git://github.com/Alir3z4/python-stop-words.git
 78 | 
 79 | Then install it by running:
 80 | ::
 81 | 
 82 |     $ python setup.py install
 83 | 
 84 | 
 85 | Basic usage
 86 | -----------
 87 | ::
 88 | 
 89 |     from stop_words import get_stop_words
 90 | 
 91 |     stop_words = get_stop_words('en')
 92 |     stop_words = get_stop_words('english')
 93 | 
 94 |     from stop_words import safe_get_stop_words
 95 | 
 96 |     stop_words = safe_get_stop_words('unsupported language')
 97 | 
 98 | Python compatibility
 99 | --------------------
100 | 
101 | Python Stop Words is compatibe with:
102 | 
103 | * Python 2.7
104 | * Python 3.4
105 | * Python 3.5
106 | * Python 3.6
107 | * Python 3.7
108 | 


--------------------------------------------------------------------------------
/bootstrap.py:
--------------------------------------------------------------------------------
  1 | ##############################################################################
  2 | #
  3 | # Copyright (c) 2006 Zope Foundation and Contributors.
  4 | # All Rights Reserved.
  5 | #
  6 | # This software is subject to the provisions of the Zope Public License,
  7 | # Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
  8 | # THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
  9 | # WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 10 | # WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
 11 | # FOR A PARTICULAR PURPOSE.
 12 | #
 13 | ##############################################################################
 14 | """Bootstrap a buildout-based project
 15 | 
 16 | Simply run this script in a directory containing a buildout.cfg.
 17 | The script accepts buildout command-line options, so you can
 18 | use the -c option to specify an alternate configuration file.
 19 | """
 20 | 
 21 | import os
 22 | import shutil
 23 | import sys
 24 | import tempfile
 25 | 
 26 | from optparse import OptionParser
 27 | 
 28 | __version__ = '2015-07-01'
 29 | # See zc.buildout's changelog if this version is up to date.
 30 | 
 31 | tmpeggs = tempfile.mkdtemp(prefix='bootstrap-')
 32 | 
 33 | usage = '''\
 34 | [DESIRED PYTHON FOR BUILDOUT] bootstrap.py [options]
 35 | 
 36 | Bootstraps a buildout-based project.
 37 | 
 38 | Simply run this script in a directory containing a buildout.cfg, using the
 39 | Python that you want bin/buildout to use.
 40 | 
 41 | Note that by using --find-links to point to local resources, you can keep
 42 | this script from going over the network.
 43 | '''
 44 | 
 45 | parser = OptionParser(usage=usage)
 46 | parser.add_option("--version",
 47 |                   action="store_true", default=False,
 48 |                   help=("Return bootstrap.py version."))
 49 | parser.add_option("-t", "--accept-buildout-test-releases",
 50 |                   dest='accept_buildout_test_releases',
 51 |                   action="store_true", default=False,
 52 |                   help=("Normally, if you do not specify a --buildout-version, "
 53 |                         "the bootstrap script and buildout gets the newest "
 54 |                         "*final* versions of zc.buildout and its recipes and "
 55 |                         "extensions for you.  If you use this flag, "
 56 |                         "bootstrap and buildout will get the newest releases "
 57 |                         "even if they are alphas or betas."))
 58 | parser.add_option("-c", "--config-file",
 59 |                   help=("Specify the path to the buildout configuration "
 60 |                         "file to be used."))
 61 | parser.add_option("-f", "--find-links",
 62 |                   help=("Specify a URL to search for buildout releases"))
 63 | parser.add_option("--allow-site-packages",
 64 |                   action="store_true", default=False,
 65 |                   help=("Let bootstrap.py use existing site packages"))
 66 | parser.add_option("--buildout-version",
 67 |                   help="Use a specific zc.buildout version")
 68 | parser.add_option("--setuptools-version",
 69 |                   help="Use a specific setuptools version")
 70 | parser.add_option("--setuptools-to-dir",
 71 |                   help=("Allow for re-use of existing directory of "
 72 |                         "setuptools versions"))
 73 | 
 74 | options, args = parser.parse_args()
 75 | if options.version:
 76 |     print("bootstrap.py version %s" % __version__)
 77 |     sys.exit(0)
 78 | 
 79 | 
 80 | ######################################################################
 81 | # load/install setuptools
 82 | 
 83 | try:
 84 |     from urllib.request import urlopen
 85 | except ImportError:
 86 |     from urllib2 import urlopen
 87 | 
 88 | ez = {}
 89 | if os.path.exists('ez_setup.py'):
 90 |     exec(open('ez_setup.py').read(), ez)
 91 | else:
 92 |     exec(urlopen('https://bootstrap.pypa.io/ez_setup.py').read(), ez)
 93 | 
 94 | if not options.allow_site_packages:
 95 |     # ez_setup imports site, which adds site packages
 96 |     # this will remove them from the path to ensure that incompatible versions
 97 |     # of setuptools are not in the path
 98 |     import site
 99 |     # inside a virtualenv, there is no 'getsitepackages'.
100 |     # We can't remove these reliably
101 |     if hasattr(site, 'getsitepackages'):
102 |         for sitepackage_path in site.getsitepackages():
103 |             # Strip all site-packages directories from sys.path that
104 |             # are not sys.prefix; this is because on Windows
105 |             # sys.prefix is a site-package directory.
106 |             if sitepackage_path != sys.prefix:
107 |                 sys.path[:] = [x for x in sys.path
108 |                                if sitepackage_path not in x]
109 | 
110 | setup_args = dict(to_dir=tmpeggs, download_delay=0)
111 | 
112 | if options.setuptools_version is not None:
113 |     setup_args['version'] = options.setuptools_version
114 | if options.setuptools_to_dir is not None:
115 |     setup_args['to_dir'] = options.setuptools_to_dir
116 | 
117 | ez['use_setuptools'](**setup_args)
118 | import setuptools
119 | import pkg_resources
120 | 
121 | # This does not (always?) update the default working set.  We will
122 | # do it.
123 | for path in sys.path:
124 |     if path not in pkg_resources.working_set.entries:
125 |         pkg_resources.working_set.add_entry(path)
126 | 
127 | ######################################################################
128 | # Install buildout
129 | 
130 | ws = pkg_resources.working_set
131 | 
132 | setuptools_path = ws.find(
133 |     pkg_resources.Requirement.parse('setuptools')).location
134 | 
135 | # Fix sys.path here as easy_install.pth added before PYTHONPATH
136 | cmd = [sys.executable, '-c',
137 |        'import sys; sys.path[0:0] = [%r]; ' % setuptools_path +
138 |        'from setuptools.command.easy_install import main; main()',
139 |        '-mZqNxd', tmpeggs]
140 | 
141 | find_links = os.environ.get(
142 |     'bootstrap-testing-find-links',
143 |     options.find_links or
144 |     ('http://downloads.buildout.org/'
145 |      if options.accept_buildout_test_releases else None)
146 |     )
147 | if find_links:
148 |     cmd.extend(['-f', find_links])
149 | 
150 | requirement = 'zc.buildout'
151 | version = options.buildout_version
152 | if version is None and not options.accept_buildout_test_releases:
153 |     # Figure out the most recent final version of zc.buildout.
154 |     import setuptools.package_index
155 |     _final_parts = '*final-', '*final'
156 | 
157 |     def _final_version(parsed_version):
158 |         try:
159 |             return not parsed_version.is_prerelease
160 |         except AttributeError:
161 |             # Older setuptools
162 |             for part in parsed_version:
163 |                 if (part[:1] == '*') and (part not in _final_parts):
164 |                     return False
165 |             return True
166 | 
167 |     index = setuptools.package_index.PackageIndex(
168 |         search_path=[setuptools_path])
169 |     if find_links:
170 |         index.add_find_links((find_links,))
171 |     req = pkg_resources.Requirement.parse(requirement)
172 |     if index.obtain(req) is not None:
173 |         best = []
174 |         bestv = None
175 |         for dist in index[req.project_name]:
176 |             distv = dist.parsed_version
177 |             if _final_version(distv):
178 |                 if bestv is None or distv > bestv:
179 |                     best = [dist]
180 |                     bestv = distv
181 |                 elif distv == bestv:
182 |                     best.append(dist)
183 |         if best:
184 |             best.sort()
185 |             version = best[-1].version
186 | if version:
187 |     requirement = '=='.join((requirement, version))
188 | cmd.append(requirement)
189 | 
190 | import subprocess
191 | if subprocess.call(cmd) != 0:
192 |     raise Exception(
193 |         "Failed to execute command:\n%s" % repr(cmd)[1:-1])
194 | 
195 | ######################################################################
196 | # Import and run buildout
197 | 
198 | ws.add_entry(tmpeggs)
199 | ws.require(requirement)
200 | import zc.buildout.buildout
201 | 
202 | if not [a for a in args if '=' not in a]:
203 |     args.append('bootstrap')
204 | 
205 | # if -c was provided, we push it back into args for buildout' main function
206 | if options.config_file is not None:
207 |     args[0:0] = ['-c', options.config_file]
208 | 
209 | zc.buildout.buildout.main(args)
210 | shutil.rmtree(tmpeggs)
211 | 


--------------------------------------------------------------------------------
/buildout.cfg:
--------------------------------------------------------------------------------
 1 | [buildout]
 2 | develop                         = .
 3 | parts                           = test
 4 |                                   cover
 5 |                                   flake8
 6 |                                   evolve
 7 | show-picked-versions            = true
 8 | 
 9 | [evolve]
10 | arguments                       = '-s buildout.cfg -w --indent 32 --sorting alpha'
11 | eggs                            = buildout-versions-checker
12 | recipe                          = zc.recipe.egg
13 | scripts                         = check-buildout-updates=${:_buildout_section_name_}
14 | 
15 | [test]
16 | defaults                        = --with-progressive
17 | eggs                            = nose
18 |                                   nose-progressive
19 | recipe                          = pbp.recipe.noserunner
20 | 
21 | [cover]
22 | <=                                test
23 | defaults                        = --with-coverage
24 |                                   --cover-erase
25 |                                   --cover-package=stop_words
26 | eggs                            = nose
27 |                                   coverage
28 | 
29 | [flake8]
30 | eggs                            = flake8
31 | recipe                          = zc.recipe.egg
32 | 
33 | [versions]
34 | blessings                       = 1.6
35 | buildout-versions-checker       = 1.5.1
36 | coverage                        = 3.7.1
37 | flake8                          = 2.3.0
38 | futures                         = 2.2.0
39 | mccabe                          = 0.3
40 | nose                            = 1.3.4
41 | nose-progressive                = 1.5.1
42 | pbp.recipe.noserunner           = 0.2.6
43 | pep8                            = 1.5.7
44 | pyflakes                        = 0.8.1
45 | six                             = 1.10.0
46 | zc.buildout                     = 2.12.1
47 | zc.recipe.egg                   = 2.0.7
48 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | setup(
 4 |     name='stop-words',
 5 |     version=__import__("stop_words").get_version(),
 6 |     description='Get list of common stop words in various languages in Python',
 7 |     long_description=open('README.rst').read(),
 8 |     license=open('LICENSE').read(),
 9 |     author='Alireza Savand',
10 |     author_email='alireza.savand@gmail.com',
11 |     url='https://github.com/Alir3z4/python-stop-words',
12 |     packages=find_packages(),
13 |     zip_safe=False,
14 |     package_data={
15 |         'stop_words': [
16 |             'stop-words/*.txt',
17 |             'stop-words/languages.json',
18 |         ]
19 |     },
20 |     classifiers=[
21 |         'Programming Language :: Python',
22 |         'Intended Audience :: Developers',
23 |         'Operating System :: OS Independent',
24 |         'Topic :: Software Development',
25 |         'Development Status :: 6 - Mature',
26 |         'Programming Language :: Python :: 2',
27 |         'Programming Language :: Python :: 2.7',
28 |         'Programming Language :: Python :: 3',
29 |         'Programming Language :: Python :: 3.4',
30 |         'Programming Language :: Python :: 3.5',
31 |         'Programming Language :: Python :: 3.6',
32 |         'Programming Language :: Python :: 3.7',
33 |         'Topic :: Text Processing',
34 |         'Topic :: Text Processing :: Filters',
35 |         'License :: OSI Approved :: BSD License',
36 |     ],
37 | )
38 | 


--------------------------------------------------------------------------------
/stop_words/__init__.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import os
  3 | 
  4 | __VERSION__ = (2018, 7, 23)
  5 | CURRENT_DIR = os.path.dirname(os.path.realpath(__file__))
  6 | STOP_WORDS_DIR = os.path.join(CURRENT_DIR, 'stop-words')
  7 | STOP_WORDS_CACHE = {}
  8 | 
  9 | with open(os.path.join(STOP_WORDS_DIR, 'languages.json'), 'rb') as map_file:
 10 |     buffer = map_file.read()
 11 |     buffer = buffer.decode('ascii')
 12 |     LANGUAGE_MAPPING = json.loads(buffer)
 13 | 
 14 | AVAILABLE_LANGUAGES = list(LANGUAGE_MAPPING.values())
 15 | 
 16 | 
 17 | def get_version():
 18 |     """
 19 |     :rtype: basestring
 20 |     """
 21 |     return ".".join(str(v) for v in __VERSION__)
 22 | 
 23 | 
 24 | class StopWordError(Exception):
 25 |     pass
 26 | 
 27 | 
 28 | def get_stop_words(language, cache=True):
 29 |     """
 30 |     :type language: basestring
 31 | 
 32 |     :rtype: list
 33 |     """
 34 |     try:
 35 |         language = LANGUAGE_MAPPING[language]
 36 |     except KeyError:
 37 |         if language not in AVAILABLE_LANGUAGES:
 38 |             raise StopWordError('{0}" language is unavailable.'.format(
 39 |                 language
 40 |             ))
 41 | 
 42 |     if cache and language in STOP_WORDS_CACHE:
 43 |         return STOP_WORDS_CACHE[language]
 44 | 
 45 |     language_filename = os.path.join(STOP_WORDS_DIR, language + '.txt')
 46 |     try:
 47 |         with open(language_filename, 'rb') as language_file:
 48 |             stop_words = [line.decode('utf-8').strip()
 49 |                           for line in language_file.readlines()]
 50 |             stop_words = apply_filters(stop_words, language)
 51 |     except IOError:
 52 |         raise StopWordError(
 53 |             '{0}" file is unreadable, check your installation.'.format(
 54 |                 language_filename
 55 |             )
 56 |         )
 57 | 
 58 |     if cache:
 59 |         STOP_WORDS_CACHE[language] = stop_words
 60 | 
 61 |     return stop_words
 62 | 
 63 | _filters = {None: []}
 64 | 
 65 | 
 66 | def apply_filters(stopwords, language):
 67 |     """
 68 |     Apply registered filters to stopwords
 69 |     :param stopwords: list
 70 |     :param language: string
 71 |     :return: filtered stopwords
 72 |     """
 73 |     if language in _filters:
 74 |         for func in _filters[language]:
 75 |             stopwords = func(stopwords)
 76 | 
 77 |     for func in _filters[None]:
 78 |         stopwords = func(stopwords, language)
 79 | 
 80 |     return stopwords
 81 | 
 82 | 
 83 | def add_filter(func, language=None):
 84 |     """
 85 |     Register filters for specific language.
 86 |     If language == None the filter applies for all languages.
 87 |     Filter will not apply for stop words in cache.
 88 |     :param func: callable
 89 |     :param language: string|None
 90 |     :return:
 91 |     """
 92 |     if language not in _filters:
 93 |         _filters[language] = []
 94 |     _filters[language].append(func)
 95 | 
 96 | 
 97 | def remove_filter(func, language=None):
 98 |     """
 99 |     :param func:
100 |     :param language:
101 |     :return:
102 |     """
103 |     if not (language in _filters and func in _filters[language]):
104 |         return False
105 |     _filters[language].remove(func)
106 |     return True
107 | 
108 | 
109 | def safe_get_stop_words(language):
110 |     """
111 |     :type language: basestring
112 | 
113 |     :rtype: list
114 |     """
115 |     try:
116 |         return get_stop_words(language)
117 |     except StopWordError:
118 |         return []
119 | 


--------------------------------------------------------------------------------
/stop_words/tests.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Tests for stop-words
 3 | """
 4 | import random
 5 | from unittest import TestCase
 6 | from unittest import TestSuite
 7 | from unittest import TestLoader
 8 | 
 9 | import stop_words
10 | from stop_words import get_stop_words
11 | from stop_words import safe_get_stop_words
12 | from stop_words import StopWordError
13 | from stop_words import LANGUAGE_MAPPING
14 | from stop_words import AVAILABLE_LANGUAGES
15 | 
16 | 
17 | class StopWordsTestCase(TestCase):
18 |     number_of_english_stop_words = 174
19 | 
20 |     def test_get_stop_words(self):
21 |         sw = get_stop_words('english')
22 |         self.assertEqual(len(sw), self.number_of_english_stop_words)
23 | 
24 |     def test_get_stop_words_language_mapping(self):
25 |         sw = get_stop_words('en')
26 |         self.assertEqual(len(sw), self.number_of_english_stop_words)
27 |         self.assertEqual(sw, get_stop_words('english'))
28 | 
29 |     def test_get_stop_words_cache(self):
30 |         self.assertFalse('french' in stop_words.STOP_WORDS_CACHE)
31 |         sw = get_stop_words('fr')
32 |         self.assertTrue('french' in stop_words.STOP_WORDS_CACHE)
33 |         original_stop_words_dir = stop_words.STOP_WORDS_DIR
34 |         stop_words.STOP_WORDS_DIR = 'not-existing-directory'
35 |         self.assertEqual(sw, get_stop_words('french'))
36 |         stop_words.STOP_WORDS_DIR = original_stop_words_dir
37 |         try:
38 |             get_stop_words('klingon')
39 |         except:
40 |             pass
41 |         self.assertFalse('klingon' in stop_words.STOP_WORDS_CACHE)
42 | 
43 |     def test_get_stop_words_unavailable_language(self):
44 |         self.assertRaises(StopWordError, get_stop_words, 'sindarin')
45 | 
46 |     def test_get_stop_words_install_issue(self):
47 |         original_stop_words_dir = stop_words.STOP_WORDS_DIR
48 |         stop_words.STOP_WORDS_DIR = 'not-existing-directory'
49 |         self.assertRaises(StopWordError, get_stop_words, 'german')
50 |         stop_words.STOP_WORDS_DIR = original_stop_words_dir
51 | 
52 |     def test_safe_get_stop_words(self):
53 |         self.assertRaises(StopWordError, get_stop_words, 'huttese')
54 |         self.assertEqual(safe_get_stop_words('huttese'), [])
55 | 
56 |     def test_random_language_stop_words_load(self):
57 |         languages = list(LANGUAGE_MAPPING.keys()) + list(AVAILABLE_LANGUAGES)
58 |         sample = random.sample(languages, len(languages))
59 |         for language in sample:
60 |             stop_words = safe_get_stop_words(language)
61 |             self.assertTrue(
62 |                 len(stop_words) > 0,
63 |                 'Cannot load stopwords for {0} language'.format(language)
64 |             )
65 | 
66 |     def test_filters(self):
67 |             language = 'en'
68 |             before = get_stop_words(language, False)
69 |             letter = random.choice(random.choice(before))
70 | 
71 |             def remove_letter(stopwords, language):
72 |                 return [word for word in stopwords if letter not in word]
73 |             stop_words.add_filter(remove_letter)
74 |             after = get_stop_words(language, False)
75 |             for stopword in after:
76 |                 self.assertFalse(letter in stopword)
77 |             self.assertTrue(stop_words.remove_filter(remove_letter))
78 | 
79 | 
80 | loader = TestLoader()
81 | 
82 | test_suite = TestSuite(
83 |     [
84 |         loader.loadTestsFromTestCase(StopWordsTestCase),
85 |     ]
86 | )
87 | 


--------------------------------------------------------------------------------