├── tests
│   ├── fixtures
│   │   ├── __init__.py
│   │   ├── synonyms.json
│   │   ├── makes_models_in_farsi_short.csv
│   │   ├── makes_models_short.csv
│   │   ├── sample_words.json
│   │   └── makes_models_from_wikipedia.csv
│   ├── animation
│   │   └── short.gif
│   ├── conftest.py
│   ├── test_misc.py
│   ├── test_normalize.py
│   ├── test_loader.py
│   ├── test_lfucache.py
│   ├── test_autocomplete.py
│   └── AutoCompleteWithSynonymsShort_Graph.svg
├── .coveragerc
├── .github
│   ├── FUNDING.yml
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.md
│   │   └── feature_request.md
│   └── workflows
│       └── main.yaml
├── AUTHORS
├── requirements-dev.txt
├── setup.cfg
├── fast_autocomplete
│   ├── __init__.py
│   ├── demo.py
│   ├── normalize.py
│   ├── misc.py
│   ├── draw.py
│   ├── loader.py
│   ├── lfucache.py
│   └── dwg.py
├── LICENSE
├── setup.py
├── .gitignore
└── README.md

--------------------------------------------------------------------------------
/tests/fixtures/__init__.py:
--------------------------------------------------------------------------------
1 | 

--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | source =
3 |     autocomplete

--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: [seperman]
2 | ko_fi: seperman

--------------------------------------------------------------------------------
/tests/animation/short.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/seperman/fast-autocomplete/HEAD/tests/animation/short.gif

--------------------------------------------------------------------------------
/AUTHORS:
--------------------------------------------------------------------------------
1 | Authors:
2 | - Autocomplete by Sep Dehpour (zepworks.com)
3 | - LFU Cache by Shane Wang (medium.com/@epicshane)

--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | bump2version==1.0.1
2 | click>=8.0.3
3 | deepdiff==5.5.0
4 | flake8==4.0.1
5 | pygraphviz==1.7
6 | pytest==6.2.5
7 | pytest_cov==3.0.0

--------------------------------------------------------------------------------
/tests/fixtures/synonyms.json:
--------------------------------------------------------------------------------
1 | {
2 |     "alfa romeo": ["alfa"],
3 |     "bmw": ["beemer", "bimmer"],
4 |     "mercedes-benz": ["mercedes", "benz"],
5 |     "volkswagen": ["vw"],
6 |     "truck": ["trucks"]
7 | }
8 | 

--------------------------------------------------------------------------------
/tests/fixtures/makes_models_in_farsi_short.csv:
--------------------------------------------------------------------------------
1 | make,model
2 | آکیورا,zdx
3 | آلفا,4c
4 | آلفا,4c coupe
5 | آلفا,giulia
6 | بی ام و,1 series
7 | بی ام و,2 series
8 | 2007,2007
9 | 2017,2017
10 | 2018,2018

--------------------------------------------------------------------------------
/tests/fixtures/makes_models_short.csv:
--------------------------------------------------------------------------------
1 | make,model
2 | acura,zdx
3 | alfa romeo,4c
4 | alfa romeo,4c coupe
5 | alfa romeo,giulia
6 | bmw,1 series
7 | bmw,2 series
8 | 2007,2007
9 | 2017,2017
10 | 2018,2018

--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | 4 | current_file = os.path.dirname(__file__) 5 | path1 = os.path.abspath(os.path.join(current_file, '..')) 6 | path2 = os.path.abspath(os.path.join(current_file, 'tests')) 7 | sys.path.append(path1) # noqa 8 | sys.path.append(path2) # noqa 9 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.9.0 3 | commit = True 4 | tag = True 5 | tag_name = {new_version} 6 | 7 | [bumpversion:file:setup.py] 8 | 9 | [bumpversion:file:README.md] 10 | 11 | [flake8] 12 | max-line-length = 120 13 | builtins = json 14 | statistics = true 15 | ignore = E202 16 | exclude = ./data,./src,./tests,.svn,CVS,.bzr,.hg,.git,__pycache__,./venv 17 | -------------------------------------------------------------------------------- /fast_autocomplete/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | import sys 3 | import pkg_resources 4 | 5 | if (sys.version_info[0], sys.version_info[1]) < (3, 6): 6 | sys.exit('fast-autocomplete requires Python 3.6 or later.') 7 | 8 | __version__ = pkg_resources.get_distribution("fast-autocomplete").version 9 | 10 | from fast_autocomplete.dwg import AutoComplete 11 | from fast_autocomplete.draw import DrawGraphMixin 12 | from fast_autocomplete.demo import demo 13 | from fast_autocomplete.loader import autocomplete_factory 14 | from fast_autocomplete.normalize import Normalizer 15 | -------------------------------------------------------------------------------- /tests/test_misc.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from fast_autocomplete.misc import _extend_and_repeat 3 | 4 | 5 | class TestMisc: 6 | 7 | @pytest.mark.parametrize("list1, list2, expected_result", [ 8 | (['a', 'b'], ['c', 'd'], [['a', 'b', 'c'], ['a', 'b', 'd']]), 9 | (['a', 'b'], ['a', 'd'], [['a', 'b', 'd']]), 10 | (['a', 'b'], ['b model2', 'd'], [['a', 'b model2'], ['a', 'b', 'd']]), 11 | ([], ['c', 'd'], [['c'], ['d']]), 12 | ]) 13 | def test_extend_and_repeat(self, list1, list2, expected_result): 14 | result = _extend_and_repeat(list1, list2) 15 | assert expected_result == result 16 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior 15 | 16 | **Expected behavior** 17 | A clear and concise description of what you expected to happen. 18 | 19 | **OS, Fast Autocomplete version and Python version (please complete the following information):** 20 | - OS: [e.g. Ubuntu] 21 | - Version [e.g. 20LTS] 22 | 23 | **Additional context** 24 | Add any other context about the problem here. 25 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? 
Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2018 - 2019 Fair Financial Corp 4 | Copyright (c) 2020 - 2021 Sep Dehpour 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | 24 | -------------------------------------------------------------------------------- /fast_autocomplete/demo.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pprint import pprint 3 | from fast_autocomplete.misc import read_single_keypress, termios 4 | 5 | 6 | def demo(running_modules, max_cost, size): 7 | """ 8 | Gets an Autocomplete instance that has already data in it and you can then run search on it in real time 9 | """ 10 | 11 | word_list = [] 12 | 13 | running_modules = running_modules if isinstance(running_modules, dict) else {running_modules.__class__.__name__: running_modules} 14 | 15 | if termios is None: 16 | sys.exit('termios and/or fcntl packages are not available in your system. This is possibly because you are not on a Linux Distro.') 17 | 18 | print('FAST AUTOCOMPLETE DEMO') 19 | print('Press any key to search for. 
Press ctrl+c to exit') 20 | 21 | while True: 22 | pressed = read_single_keypress() 23 | if pressed == '\x7f': 24 | if word_list: 25 | word_list.pop() 26 | elif pressed == '\x03': 27 | break 28 | else: 29 | word_list.append(pressed) 30 | 31 | joined = ''.join(word_list) 32 | print(chr(27) + "[2J") 33 | print(joined) 34 | results = {} 35 | for module_name, module in running_modules.items(): 36 | results[module_name] = module.search(word=joined, max_cost=max_cost, size=size) 37 | pprint(results) 38 | print('') 39 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | version = '0.9.0' 4 | 5 | 6 | try: 7 | with open('README.md') as file: 8 | long_description = file.read() 9 | except Exception: 10 | long_description = "Autocomplete" 11 | 12 | setup( 13 | name='fast-autocomplete', 14 | description='Fast Autocomplete using Directed Word Graph', 15 | long_description=long_description, 16 | long_description_content_type='text/markdown', 17 | author='Sep Dehpour', 18 | url='https://github.com/seperman/fast-autocomplete', 19 | author_email='sep@zepworks.com', 20 | version=version, 21 | install_requires=[], 22 | extras_require={ 23 | 'levenshtein': ['python-Levenshtein>=0.12.2'], 24 | 'pylev': ['pylev>=1.4.0'], 25 | }, 26 | dependency_links=[], 27 | packages=find_packages(exclude=('tests', 'docs')), 28 | include_package_data=True, 29 | scripts=[], 30 | test_suite="tests", 31 | tests_require=['mock'], 32 | license='MIT', 33 | classifiers=[ 34 | "Intended Audience :: Developers", 35 | "Operating System :: OS Independent", 36 | "Topic :: Software Development", 37 | "Programming Language :: Python :: 3.6", 38 | "Programming Language :: Python :: 3.7", 39 | "Programming Language :: Python :: 3.8", 40 | "Programming Language :: Python :: 3.9", 41 | "Development Status :: 4 - Beta", 42 | ] 43 | ) 44 | -------------------------------------------------------------------------------- /tests/test_normalize.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from fast_autocomplete.normalize import Normalizer 3 | 4 | normalizer = Normalizer() 5 | normalizer_unicode = Normalizer( 6 | valid_chars_for_string='زرتپبا' 7 | ) 8 | 9 | 10 | class TestNormalizer: 11 | 12 | @pytest.mark.parametrize("name, expected_result", [ 13 | ('type-r', 'type-r'), 14 | ('HONDA and Toyota!', 'honda and toyota'), 15 | (r'bmw? \#1', 'bmw 1'), 16 | (r'bmw? \#', 'bmw'), 17 | (None, ''), 18 | ]) 19 | def test_remove_any_special_character(self, name, expected_result): 20 | result = normalizer.remove_any_special_character(name) 21 | assert expected_result == result 22 | 23 | @pytest.mark.parametrize("name, extra_chars, expected_result", [ 24 | ('type-r', None, 'type r'), 25 | ('HONDA and Toyota!', None, 'honda and toyota'), 26 | (r'bmw? \#1', None, 'bmw 1'), 27 | (r'bmw? \#', None, 'bmw'), 28 | (r'bmw? 
\#', {'#'}, 'bmw #'), 29 | (None, None, ''), 30 | ]) 31 | def test_normalize_node_name(self, name, extra_chars, expected_result): 32 | result = normalizer.normalize_node_name(name, extra_chars=extra_chars) 33 | assert expected_result == result 34 | 35 | @pytest.mark.parametrize("name, extra_chars, expected_result", [ 36 | ('درپب', None, 'رپب'), 37 | ]) 38 | def test_normalize_unicode_node_name(self, name, extra_chars, expected_result): 39 | result = normalizer_unicode.normalize_node_name(name, extra_chars=extra_chars) 40 | assert expected_result == result 41 | -------------------------------------------------------------------------------- /tests/test_loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | from fast_autocomplete import autocomplete_factory, AutoComplete 4 | from fast_autocomplete.loader import WordValue 5 | 6 | current_dir = os.path.dirname(os.path.abspath(__file__)) 7 | fixture_dir = os.path.join(current_dir, 'fixtures') 8 | 9 | content_files = { 10 | 'words': { 11 | 'filepath': os.path.join(fixture_dir, 'sample_words.json'), 12 | 'compress': True # means compress the graph data in memory 13 | } 14 | } 15 | 16 | autocomplete = autocomplete_factory(content_files=content_files) 17 | 18 | 19 | class AutoCompleteIgnoreCount(AutoComplete): 20 | SHOULD_INCLUDE_COUNT = False 21 | 22 | 23 | autocomplete_ignore_count = autocomplete_factory(content_files=content_files, module=AutoCompleteIgnoreCount) 24 | 25 | 26 | class TestLoader: 27 | 28 | @pytest.mark.parametrize('word, expected_result, expected_unsorted_result', [ 29 | ('acu', 30 | [['acura'], ['acura mdx'], ['acura rdx']], 31 | [['acura'], ['acura rlx'], ['acura rdx']]), 32 | ]) 33 | def test_loader(self, word, expected_result, expected_unsorted_result): 34 | result = autocomplete.search(word=word, size=3) 35 | assert expected_result == result 36 | expected_word_value = WordValue(context={'make': 'acura'}, display='Acura', count=130123, original_key=None) 37 | assert autocomplete.words['acura'] == expected_word_value 38 | assert 'Acura' == autocomplete.words['acura'].display 39 | result = autocomplete_ignore_count.search(word=word, size=3) 40 | assert expected_unsorted_result == result 41 | -------------------------------------------------------------------------------- /tests/fixtures/sample_words.json: -------------------------------------------------------------------------------- 1 | { 2 | "acura rlx": [ 3 | { 4 | "model": "rlx", 5 | "make": "acura" 6 | }, 7 | "Acura RLX", 8 | 3132 9 | ], 10 | "rlx": [ 11 | { 12 | "model": "rlx", 13 | "make": "acura" 14 | }, 15 | "Acura RLX", 16 | 3132 17 | ], 18 | "acura": [ 19 | { 20 | "make": "acura" 21 | }, 22 | "Acura", 23 | 130123 24 | ], 25 | "acura rlx sport hybrid": [ 26 | { 27 | "model": "rlx sport hybrid", 28 | "make": "acura" 29 | }, 30 | "Acura RLX Sport Hybrid", 31 | 4 32 | ], 33 | "rlx sport hybrid": [ 34 | { 35 | "model": "rlx sport hybrid", 36 | "make": "acura" 37 | }, 38 | "Acura RLX Sport Hybrid", 39 | 4 40 | ], 41 | "acura ilx": [ 42 | { 43 | "model": "ilx--ilx hybrid", 44 | "make": "acura" 45 | }, 46 | "Acura ILX", 47 | 19936 48 | ], 49 | "ilx": [ 50 | { 51 | "model": "ilx--ilx hybrid", 52 | "make": "acura" 53 | }, 54 | "Acura ILX", 55 | 19936 56 | ], 57 | "acura mdx": [ 58 | { 59 | "model": "mdx", 60 | "make": "acura" 61 | }, 62 | "Acura MDX", 63 | 35290 64 | ], 65 | "mdx": [ 66 | { 67 | "model": "mdx", 68 | "make": "acura" 69 | }, 70 | "Acura MDX", 71 | 35290 72 | ], 73 | "acura nsx": [ 74 | { 
75 | "model": "nsx", 76 | "make": "acura" 77 | }, 78 | "Acura NSX", 79 | 271 80 | ], 81 | "nsx": [ 82 | { 83 | "model": "nsx", 84 | "make": "acura" 85 | }, 86 | "Acura NSX", 87 | 271 88 | ], 89 | "acura rdx": [ 90 | { 91 | "model": "rdx", 92 | "make": "acura" 93 | }, 94 | "Acura RDX", 95 | 33905 96 | ] 97 | } 98 | -------------------------------------------------------------------------------- /tests/test_lfucache.py: -------------------------------------------------------------------------------- 1 | import random 2 | import pytest 3 | import concurrent.futures 4 | from deepdiff import DeepDiff 5 | from fast_autocomplete.lfucache import LFUCache 6 | 7 | 8 | class TestLFUcache: 9 | 10 | @pytest.mark.parametrize("items, size, expected_results", [ 11 | (['a', 'a', 'b', 'a', 'c', 'b', 'd'], 3, [('a', 2), ('b', 1), ('d', 0)]), 12 | (['a', 'a', 'b', 'a', 'c', 'b', 'd', 'e', 'c', 'b'], 3, [('a', 2), ('b', 2), ('c', 0)]), 13 | (['a', 'a', 'b', 'a', 'c', 'b', 'd', 'e', 'c', 'b', 'b', 'c', 'd', 'b'], 3, [('b', 4), ('a', 2), ('d', 0)]), 14 | ]) 15 | def test_autocomplete(self, items, size, expected_results): 16 | lfucache = LFUCache(size) 17 | for item in items: 18 | lfucache.set(item, f'{item}_cached') 19 | results = lfucache.get_sorted_cache_keys() 20 | diff = DeepDiff(expected_results, results) 21 | assert not diff 22 | 23 | def test_get_multithreading(self): 24 | keys = 'aaaaaaaaaaaaaaaaaaaaaaaaaaabbc' 25 | lfucache = LFUCache(2) 26 | 27 | def _do_set(cache, key): 28 | cache.set(key, f'{key}_cached') 29 | 30 | def _do_get(cache, key): 31 | return cache.get(key) 32 | 33 | def _key_gen(): 34 | i = 0 35 | while i < 30000: 36 | i += 1 37 | yield random.choice(keys) 38 | 39 | def _random_func(cache, key): 40 | return random.choice([_do_get, _do_get, _do_set])(cache, key) 41 | 42 | with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor: 43 | futures = (executor.submit(_random_func, lfucache, key) for key in _key_gen()) 44 | for future in concurrent.futures.as_completed(futures): 45 | future.result() 46 | -------------------------------------------------------------------------------- /.github/workflows/main.yaml: -------------------------------------------------------------------------------- 1 | name: Unit Tests 2 | 3 | on: 4 | push: 5 | branches: [ "master", "dev" ] 6 | pull_request: 7 | branches: [ "master", "dev" ] 8 | 9 | jobs: 10 | build: 11 | 12 | runs-on: ubuntu-latest 13 | strategy: 14 | matrix: 15 | python-version: [3.6, 3.7, 3.8, 3.9] 16 | architecture: ["x64"] 17 | 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Setup Python ${{ matrix.python-version }} on ${{ matrix.architecture }} 21 | uses: actions/setup-python@v2 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | architecture: ${{ matrix.architecture }} 25 | - name: Cache pip 26 | uses: actions/cache@v2 27 | with: 28 | # This path is specific to Ubuntu 29 | path: ~/.cache/pip 30 | # Look to see if there is a cache hit for the corresponding requirements file 31 | key: ${{ runner.os }}-pip-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }} 32 | restore-keys: | 33 | ${{ runner.os }}-pip- 34 | ${{ runner.os }}- 35 | - name: Install c dependencies 36 | run: sudo apt install graphviz 37 | - name: Install dependencies 38 | run: pip install -r requirements-dev.txt 39 | - name: Lint with flake8 40 | run: | 41 | # stop the build if there are Python syntax errors or undefined names 42 | flake8 fast_autocomplete --count --select=E9,F63,F7,F82 --show-source --statistics 43 | - 
name: Test with pytest 44 | run: | 45 | pytest --cov-report=xml --cov=fast_autocomplete tests/ 46 | - name: Upload coverage to Codecov 47 | uses: codecov/codecov-action@v1 48 | if: matrix.python-version == 3.9 49 | with: 50 | file: ./coverage.xml 51 | env_vars: OS,PYTHON 52 | fail_ci_if_error: true 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | pytest.ini 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # Environments 87 | .env 88 | .venv 89 | env/ 90 | venv/ 91 | ENV/ 92 | env.bak/ 93 | venv.bak/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | 108 | .DS_Store 109 | 110 | temp[0-9]* 111 | makes_models_fair.csv 112 | test_autocomplete_fair.py 113 | 114 | short_*.svg 115 | -------------------------------------------------------------------------------- /fast_autocomplete/normalize.py: -------------------------------------------------------------------------------- 1 | import string 2 | from fast_autocomplete.lfucache import LFUCache 3 | 4 | 5 | NORMALIZED_CACHE_SIZE = 2048 6 | MAX_WORD_LENGTH = 40 7 | 8 | _normalized_lfu_cache = LFUCache(NORMALIZED_CACHE_SIZE) 9 | 10 | 11 | class Normalizer: 12 | 13 | def __init__( 14 | self, 15 | valid_chars_for_string=None, 16 | valid_chars_for_integer=None, 17 | valid_chars_for_node_name=None 18 | ): 19 | if valid_chars_for_string: 20 | self.valid_chars_for_string = frozenset(valid_chars_for_string) 21 | else: 22 | self.valid_chars_for_string = frozenset({i for i in string.ascii_letters.lower()}) 23 | if valid_chars_for_integer: 24 | self.valid_chars_for_integer = frozenset(valid_chars_for_integer) 25 | else: 26 | self.valid_chars_for_integer = frozenset({i for i in string.digits}) 27 | if valid_chars_for_node_name: 28 | self.valid_chars_for_node_name = valid_chars_for_node_name 29 | else: 30 | self.valid_chars_for_node_name = self._get_valid_chars_for_node_name() 31 | 32 | def _get_valid_chars_for_node_name(self): 33 | return {' ', 
'-', ':', '_'} | self.valid_chars_for_string | self.valid_chars_for_integer 34 | 35 | def normalize_node_name(self, name, extra_chars=None): 36 | if name is None: 37 | return '' 38 | name = name[:MAX_WORD_LENGTH] 39 | key = name if extra_chars is None else f"{name}{extra_chars}" 40 | result = _normalized_lfu_cache.get(key) 41 | if result == -1: 42 | result = self._get_normalized_node_name(name, extra_chars=extra_chars) 43 | _normalized_lfu_cache.set(key, result) 44 | return result 45 | 46 | def _remove_invalid_chars(self, x): 47 | result = x in self.valid_chars_for_node_name 48 | if x == '-' == self.prev_x: 49 | result = False 50 | self.prev_x = x 51 | return result 52 | 53 | def remove_any_special_character(self, name): 54 | """ 55 | Only remove invalid characters from a name. Useful for cleaning the user's original word. 56 | """ 57 | if name is None: 58 | return '' 59 | name = name.lower()[:MAX_WORD_LENGTH] 60 | self.prev_x = '' 61 | 62 | return ''.join(filter(self._remove_invalid_chars, name)).strip() 63 | 64 | def _get_normalized_node_name(self, name, extra_chars=None): 65 | name = name.lower() 66 | result = [] 67 | last_i = None 68 | for i in name: 69 | if i in self.valid_chars_for_node_name or (extra_chars and i in extra_chars): 70 | if i == '-': 71 | i = ' ' 72 | elif (i in self.valid_chars_for_integer and last_i in self.valid_chars_for_string) or (i in self.valid_chars_for_string and last_i in self.valid_chars_for_integer): 73 | result.append(' ') 74 | if not(i == last_i == ' '): 75 | result.append(i) 76 | last_i = i 77 | return ''.join(result).strip() 78 | -------------------------------------------------------------------------------- /fast_autocomplete/misc.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import csv 4 | import sys 5 | try: 6 | import termios 7 | import fcntl 8 | except Exception: 9 | termios = fcntl = None 10 | 11 | 12 | class FileNotFound(ValueError): 13 | pass 14 | 15 | 16 | def _check_file_exists(path): 17 | if not os.path.exists(path): 18 | raise FileNotFound(f'{path} does not exist') 19 | 20 | 21 | def read_csv_gen(path_or_stringio, csv_func=csv.reader, **kwargs): 22 | """ 23 | Takes a path_or_stringio to a file or a StringIO object and creates a CSV generator 24 | """ 25 | if isinstance(path_or_stringio, (str, bytes)): 26 | _check_file_exists(path_or_stringio) 27 | encoding = kwargs.pop('encoding', 'utf-8-sig') 28 | with open(path_or_stringio, 'r', encoding=encoding) as csvfile: 29 | for i in csv_func(csvfile, **kwargs): 30 | yield i 31 | elif isinstance(path_or_stringio, io.StringIO): 32 | for i in csv_func(path_or_stringio, **kwargs): 33 | yield i 34 | else: 35 | raise TypeError('Either a path to the file or StringIO object needs to be passed.') 36 | 37 | 38 | def _extend_and_repeat(list1, list2): 39 | if not list1: 40 | return [[i] for i in list2] 41 | 42 | result = [] 43 | for item in list2: 44 | if item not in list1: 45 | list1_copy = list1.copy() 46 | if item.startswith(list1_copy[-1]): 47 | list1_copy.pop() 48 | list1_copy.append(item) 49 | result.append(list1_copy) 50 | 51 | return result 52 | 53 | 54 | def read_single_keypress(): 55 | """Waits for a single keypress on stdin. 56 | https://stackoverflow.com/a/6599441/1497443 57 | 58 | This is a silly function to call if you need to do it a lot because it has 59 | to store stdin's current setup, setup stdin for reading single keystrokes 60 | then read the single keystroke then revert stdin back after reading the 61 | keystroke. 
62 | 
63 |     Returns the character of the key that was pressed (zero on
64 |     KeyboardInterrupt which can happen when a signal gets handled)
65 | 
66 |     """
67 |     if fcntl is None or termios is None:
68 |         raise ValueError('termios and/or fcntl packages are not available in your system. This is possibly because you are not on a Linux Distro.')
69 |     fd = sys.stdin.fileno()
70 |     # save old state
71 |     flags_save = fcntl.fcntl(fd, fcntl.F_GETFL)
72 |     attrs_save = termios.tcgetattr(fd)
73 |     # make raw - the way to do this comes from the termios(3) man page.
74 |     attrs = list(attrs_save)  # copy the stored version to update
75 |     # iflag
76 |     attrs[0] &= ~(termios.IGNBRK | termios.BRKINT | termios.PARMRK |
77 |                   termios.ISTRIP | termios.INLCR | termios.IGNCR |
78 |                   termios.ICRNL | termios.IXON)
79 |     # oflag
80 |     attrs[1] &= ~termios.OPOST
81 |     # cflag
82 |     attrs[2] &= ~(termios.CSIZE | termios.PARENB)
83 |     attrs[2] |= termios.CS8
84 |     # lflag
85 |     attrs[3] &= ~(termios.ECHONL | termios.ECHO | termios.ICANON |
86 |                   termios.ISIG | termios.IEXTEN)
87 |     termios.tcsetattr(fd, termios.TCSANOW, attrs)
88 |     # turn off non-blocking
89 |     fcntl.fcntl(fd, fcntl.F_SETFL, flags_save & ~os.O_NONBLOCK)
90 |     # read a single keystroke
91 |     try:
92 |         ret = sys.stdin.read(1)  # returns a single character
93 |     except KeyboardInterrupt:
94 |         ret = 0
95 |     finally:
96 |         # restore old state
97 |         termios.tcsetattr(fd, termios.TCSAFLUSH, attrs_save)
98 |         fcntl.fcntl(fd, fcntl.F_SETFL, flags_save)
99 |     return ret
100 | 
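A quick usage sketch for `read_csv_gen` (this example is mine, not part of the repo): it accepts either a file path or a `StringIO` object, and any `csv` reader via `csv_func`.

```py
import csv
import io

from fast_autocomplete.misc import read_csv_gen

csv_text = 'make,model\nacura,zdx\nbmw,1 series\n'

# Works the same with a path to a .csv file instead of a StringIO object
for row in read_csv_gen(io.StringIO(csv_text), csv_func=csv.DictReader):
    print(row['make'], row['model'])
```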
--------------------------------------------------------------------------------
/fast_autocomplete/draw.py:
--------------------------------------------------------------------------------
1 | import collections
2 | 
3 | 
4 | class DrawGraphMixin:
5 | 
6 |     DRAW_POPULATION_ANIMATION = False
7 |     DRAW_POPULATION_ANIMATION_PATH = ''
8 |     DRAW_POPULATION_ANIMATION_FILENO_PADDING = 6
9 |     SHOW_OBJ_IDS_OF_WORDS = {}
10 | 
11 |     def draw_graph(self, file_path, starting_word=None, agraph_kwargs=None, prog='dot'):
12 |         """
13 |         Draws the graph of autocomplete words.
14 | 
15 |         parameters:
16 | 
17 |         - file_path: the full path to the file to save the graph into.
18 |           The Graphviz library will determine the format of the file based on the extension you choose.
19 |         - starting_word: what word to start from. All descendants of this word will be in the graph.
20 |           If left as None, the graph will start from the root node.
21 |         - agraph_kwargs: kwargs that will be passed to the PyGraphViz AGraph creator. You can control how the graph
22 |           will be rendered using these kwargs.
23 |         """
24 |         try:
25 |             import pygraphviz as pgv
26 |         except ImportError:
27 |             raise ImportError('You need to install pygraphviz in order to draw graphs')
28 | 
29 |         agraph_kwargs = agraph_kwargs if agraph_kwargs else {}
30 |         graph = pgv.AGraph(strict=False, directed=True, **agraph_kwargs)
31 | 
32 |         edges = set()
33 |         que = collections.deque()
34 |         if starting_word:
35 |             matched_prefix_of_last_word, rest_of_word, new_node, matched_words = self._prefix_autofill(word=starting_word)
36 |             try:
37 |                 matched_word = matched_words[-1]
38 |             except IndexError:
39 |                 new_node = self._dwg
40 |                 matched_word = 'root'
41 |         else:
42 |             new_node = self._dwg
43 |             matched_word = 'root'
44 |         que.append((matched_word, new_node, ''))
45 |         node_alternative_names = {}
46 |         while que:
47 |             parent_name, node, edge_name = que.popleft()
48 |             node_id = id(node)
49 |             if node_id not in node_alternative_names:
50 |                 node_alternative_names[node_id] = f'.{len(node_alternative_names)}'
51 |             if node.word:
52 |                 node_name = node.word
53 |                 if node_name in self.SHOW_OBJ_IDS_OF_WORDS:
54 |                     node_name = f'{node_name} {id(node)}'
55 |                 else:
56 |                     try:
57 |                         node_name = self.words[node_name].display
58 |                     except (KeyError, AttributeError):
59 |                         pass
60 |                 graph.add_node(node_name, fontcolor='blue', fontname='Arial', shape='rectangle')
61 |             else:
62 |                 node_name = node_alternative_names[node_id]
63 |                 graph.add_node(node_name, color='grey', shape='point')
64 |             edge_name = "' '" if edge_name == ' ' else edge_name
65 |             edge = (parent_name, node_name)
66 |             if edge not in edges:
67 |                 edges.add(edge)
68 |                 graph.add_edge(*edge, color='blue', label=edge_name)
69 |             for edge_name, child in node.children.items():
70 |                 que.append((node_name, child, edge_name))
71 |         graph.draw(file_path, prog=prog)
72 | 
73 |     def insert_word_callback(self, word):
74 |         """
75 |         Once a word is inserted, this callback is run.
76 | """ 77 | if self.DRAW_POPULATION_ANIMATION: 78 | if not hasattr(self, '_graph_fileno'): 79 | self._graph_fileno = 0 80 | self._graph_filepath = self.DRAW_POPULATION_ANIMATION_PATH.replace('.', r'{}.') 81 | 82 | fileno = str(self._graph_fileno).zfill(self.DRAW_POPULATION_ANIMATION_FILENO_PADDING) 83 | file_path = self._graph_filepath.format(fileno) 84 | self.draw_graph(file_path=file_path) 85 | self._graph_fileno += 1 86 | -------------------------------------------------------------------------------- /fast_autocomplete/loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gzip 3 | import json 4 | import logging 5 | try: 6 | from redis import StrictRedis 7 | except ImportError: 8 | StrictRedis = None 9 | 10 | from typing import Any, Dict, List, NamedTuple, Optional, Set, Tuple, Union 11 | from fast_autocomplete import AutoComplete 12 | 13 | 14 | def read_local_dump(filepath: str): 15 | with open(filepath, 'r') as the_file: 16 | return the_file.read() 17 | 18 | 19 | def _simple_compress(item: str, hash_to_val: Dict[int, str]) -> str: 20 | item_hash = hash(item) 21 | if item_hash in hash_to_val: 22 | item = hash_to_val[item_hash] 23 | else: 24 | hash_to_val[item_hash] = item 25 | return item 26 | 27 | 28 | class WordValue(NamedTuple): 29 | context: Any 30 | display: Any 31 | count: int = 0 32 | original_key: 'WordValue' = None 33 | 34 | def get(self, key: str, default: Optional[str] = None) -> str: 35 | result = getattr(self, key) 36 | if result is None: 37 | result = default 38 | return result 39 | 40 | 41 | def get_all_content(content_files, redis_client=None, redis_key_prefix=None, logger=None): 42 | """ 43 | Get all content that is needed to initialize Autocomplete. 44 | 45 | :param: redis_client (optional) If passed, it tries to load from Redis if there is already cached data 46 | """ 47 | kwargs = {} 48 | for key, info in content_files.items(): 49 | kwargs[key] = get_data( 50 | filepath=info['filepath'], 51 | compress=info['compress'], 52 | redis_client=redis_client, 53 | redis_key_prefix=redis_key_prefix, 54 | logger=logger 55 | ) 56 | if logger: 57 | kwargs['logger'] = logger 58 | return kwargs 59 | 60 | 61 | def get_data(filepath: str, compress: bool = False, 62 | redis_client: Optional[StrictRedis] = None, 63 | redis_key_prefix: Optional[str] = None, 64 | logger: Optional[logging.RootLogger] = None) -> Dict[str, List[str]]: 65 | data_json = None 66 | filename = os.path.basename(filepath) 67 | if redis_client and redis_key_prefix: 68 | key = redis_key_prefix.format(filename) 69 | try: 70 | data_json = redis_client.get(key) 71 | except Exception: 72 | if logger: 73 | logger.exception('Unable to get the search graph words from Redis.') 74 | else: 75 | print('Unable to get the search graph words from Redis.') 76 | if data_json: 77 | data_json = gzip.decompress(data_json).decode('utf-8') 78 | if not data_json: 79 | data_json = read_local_dump(filepath) 80 | data = json.loads(data_json) 81 | 82 | if compress: 83 | hash_to_val = {} 84 | 85 | for word, value in data.items(): 86 | context, display, count = value 87 | display = _simple_compress(item=display, hash_to_val=hash_to_val) 88 | for key, val in context.items(): 89 | context[key] = _simple_compress( 90 | item=context[key], hash_to_val=hash_to_val 91 | ) 92 | data[word] = WordValue(context=context, display=display, count=count) 93 | 94 | return data 95 | 96 | 97 | def populate_redis(content_files, redis_client, redis_cache_prefix): 98 | """ 99 | Populate Redis with 
data based on the local files
100 |     """
101 |     for key, info in content_files.items():
102 |         filename = os.path.basename(info['filepath'])
103 |         redis_key = redis_cache_prefix.format(filename)
104 |         data = read_local_dump(info['filepath'])
105 |         compressed = gzip.compress(data.encode('utf-8'))
106 |         redis_client.set(redis_key, compressed)
107 | 
108 | 
109 | def autocomplete_factory(
110 |     content_files, redis_client=None, module=AutoComplete, logger=None
111 | ):
112 |     """
113 |     Factory function to initialize the proper AutoComplete object
114 | 
115 |     :param: content_files: The file paths and options where data is stored.
116 | 
117 |     Example
118 | 
119 |     content_files = {
120 |         'synonyms': {
121 |             'filepath': 'path/to/synonyms.json',
122 |             'compress': False
123 |         },
124 |         'words': {
125 |             'filepath': 'path/to/words.json',
126 |             'compress': True
127 |         },
128 |         'full_stop_words': {
129 |             'filepath': 'path/to/full_stop_words.json',
130 |             'compress': False
131 |         }
132 |     }
133 | 
134 |     :param: redis_client: (optional) If passed, the factory function tries to load the data from Redis
135 |     and if that fails, it will load the local data.
136 |     :param: module: (optional) The AutoComplete module to initialize
137 |     """
138 |     kwargs = get_all_content(content_files, redis_client=redis_client, logger=logger)
139 |     return module(**kwargs)
140 | 

--------------------------------------------------------------------------------
/fast_autocomplete/lfucache.py:
--------------------------------------------------------------------------------
1 | """
2 | LFU cache Written by Shane Wang
3 | https://medium.com/@epicshane/a-python-implementation-of-lfu-least-frequently-used-cache-with-o-1-time-complexity-e16b34a3c49b
4 | https://github.com/luxigner/lfu_cache
5 | Modified by Sep Dehpour
6 | """
7 | from threading import Lock
8 | 
9 | 
10 | class CacheNode:
11 |     def __init__(self, key, value, freq_node, pre, nxt):
12 |         self.key = key
13 |         self.value = value
14 |         self.freq_node = freq_node
15 |         self.pre = pre  # previous CacheNode
16 |         self.nxt = nxt  # next CacheNode
17 | 
18 |     def free_myself(self):
19 |         if self.freq_node.cache_head == self.freq_node.cache_tail:
20 |             self.freq_node.cache_head = self.freq_node.cache_tail = None
21 |         elif self.freq_node.cache_head == self:
22 |             self.nxt.pre = None
23 |             self.freq_node.cache_head = self.nxt
24 |         elif self.freq_node.cache_tail == self:
25 |             self.pre.nxt = None
26 |             self.freq_node.cache_tail = self.pre
27 |         else:
28 |             self.pre.nxt = self.nxt
29 |             self.nxt.pre = self.pre
30 | 
31 |         self.pre = None
32 |         self.nxt = None
33 |         self.freq_node = None
34 | 
35 | 
36 | class FreqNode:
37 |     def __init__(self, freq, pre, nxt):
38 |         self.freq = freq
39 |         self.pre = pre  # previous FreqNode
40 |         self.nxt = nxt  # next FreqNode
41 |         self.cache_head = None  # CacheNode head under this FreqNode
42 |         self.cache_tail = None  # CacheNode tail under this FreqNode
43 | 
44 |     def count_caches(self):
45 |         if self.cache_head is None and self.cache_tail is None:
46 |             return 0
47 |         elif self.cache_head == self.cache_tail:
48 |             return 1
49 |         else:
50 |             return '2+'
51 | 
52 |     def remove(self):
53 |         if self.pre is not None:
54 |             self.pre.nxt = self.nxt
55 |         if self.nxt is not None:
56 |             self.nxt.pre = self.pre
57 | 
58 |         pre = self.pre
59 |         nxt = self.nxt
60 |         self.pre = self.nxt = self.cache_head = self.cache_tail = None
61 | 
62 |         return (pre, nxt)
63 | 
64 |     def pop_head_cache(self):
65 |         if self.cache_head is None and self.cache_tail is None:
66 |             return None
67 |         elif self.cache_head == self.cache_tail:
68 
| cache_head = self.cache_head 69 | self.cache_head = self.cache_tail = None 70 | return cache_head 71 | else: 72 | cache_head = self.cache_head 73 | self.cache_head.nxt.pre = None 74 | self.cache_head = self.cache_head.nxt 75 | return cache_head 76 | 77 | def append_cache_to_tail(self, cache_node): 78 | cache_node.freq_node = self 79 | 80 | if self.cache_head is None and self.cache_tail is None: 81 | self.cache_head = self.cache_tail = cache_node 82 | else: 83 | cache_node.pre = self.cache_tail 84 | cache_node.nxt = None 85 | self.cache_tail.nxt = cache_node 86 | self.cache_tail = cache_node 87 | 88 | def insert_after_me(self, freq_node): 89 | freq_node.pre = self 90 | freq_node.nxt = self.nxt 91 | 92 | if self.nxt is not None: 93 | self.nxt.pre = freq_node 94 | 95 | self.nxt = freq_node 96 | 97 | def insert_before_me(self, freq_node): 98 | if self.pre is not None: 99 | self.pre.nxt = freq_node 100 | 101 | freq_node.pre = self.pre 102 | freq_node.nxt = self 103 | self.pre = freq_node 104 | 105 | 106 | class LFUCache: 107 | 108 | def __init__(self, capacity): 109 | self.cache = {} # {key: cache_node} 110 | self.capacity = capacity 111 | self.freq_link_head = None 112 | self.lock = Lock() 113 | 114 | def get(self, key): 115 | with self.lock: 116 | if key in self.cache: 117 | cache_node = self.cache[key] 118 | freq_node = cache_node.freq_node 119 | value = cache_node.value 120 | 121 | self.move_forward(cache_node, freq_node) 122 | 123 | return value 124 | else: 125 | return -1 126 | 127 | def set(self, key, value): 128 | with self.lock: 129 | if self.capacity <= 0: 130 | return -1 131 | 132 | if key not in self.cache: 133 | if len(self.cache) >= self.capacity: 134 | self.dump_cache() 135 | 136 | self.create_cache_node(key, value) 137 | else: 138 | cache_node = self.cache[key] 139 | freq_node = cache_node.freq_node 140 | cache_node.value = value 141 | 142 | self.move_forward(cache_node, freq_node) 143 | 144 | def move_forward(self, cache_node, freq_node): 145 | if freq_node.nxt is None or freq_node.nxt.freq != freq_node.freq + 1: 146 | target_freq_node = FreqNode(freq_node.freq + 1, None, None) 147 | target_empty = True 148 | else: 149 | target_freq_node = freq_node.nxt 150 | target_empty = False 151 | 152 | cache_node.free_myself() 153 | target_freq_node.append_cache_to_tail(cache_node) 154 | 155 | if target_empty: 156 | freq_node.insert_after_me(target_freq_node) 157 | 158 | if freq_node.count_caches() == 0: 159 | if self.freq_link_head == freq_node: 160 | self.freq_link_head = target_freq_node 161 | 162 | freq_node.remove() 163 | 164 | def dump_cache(self): 165 | head_freq_node = self.freq_link_head 166 | self.cache.pop(head_freq_node.cache_head.key) 167 | head_freq_node.pop_head_cache() 168 | 169 | if head_freq_node.count_caches() == 0: 170 | self.freq_link_head = head_freq_node.nxt 171 | head_freq_node.remove() 172 | 173 | def create_cache_node(self, key, value): 174 | cache_node = CacheNode(key, value, None, None, None) 175 | self.cache[key] = cache_node 176 | 177 | if self.freq_link_head is None or self.freq_link_head.freq != 0: 178 | new_freq_node = FreqNode(0, None, None) 179 | new_freq_node.append_cache_to_tail(cache_node) 180 | 181 | if self.freq_link_head is not None: 182 | self.freq_link_head.insert_before_me(new_freq_node) 183 | 184 | self.freq_link_head = new_freq_node 185 | else: 186 | self.freq_link_head.append_cache_to_tail(cache_node) 187 | 188 | def get_sorted_cache_keys(self): 189 | result = [(i, freq.freq_node.freq) for i, freq in self.cache.items()] 190 | 
result.sort(key=lambda x: -x[1]) 191 | return result 192 | -------------------------------------------------------------------------------- /tests/fixtures/makes_models_from_wikipedia.csv: -------------------------------------------------------------------------------- 1 | make,model,count 2 | Toyota,Aurion,6094 3 | Toyota,Avalon,8803 4 | Toyota,Avensis,1630 5 | Toyota,Camry,5371 6 | Toyota,Crown,9443 7 | Toyota,Etios,5806 8 | Toyota,Mirai,4272 9 | Toyota,Prius,9425 10 | Toyota,Vios,8322 11 | Toyota,Auris,4025 12 | Toyota,Aygo,2115 13 | Toyota,Yaris,6274 14 | Toyota,86,1298 15 | Toyota,Avanza,1760 16 | Toyota,Innova,4250 17 | Toyota,Noah,3462 18 | Toyota,Sienna,3992 19 | Toyota,Sienta,4992 20 | Toyota,Previa,8404 21 | Toyota,Verso,3765 22 | Toyota,Wish,3735 23 | Toyota,4Runner,5616 24 | Toyota,Fortuner,7003 25 | Toyota,Highlander,6235 26 | Toyota,RAV4,3182 27 | Toyota,Sequoia,2900 28 | Toyota,HiAce,5402 29 | Toyota,Tacoma,7371 30 | Toyota,Tundra,6608 31 | Toyota,Coaster,3503 32 | Toyota,Dyna,8426 33 | Lexus,CT,6770 34 | Lexus,IS,4028 35 | Lexus,HS,7415 36 | Lexus,ES,827 37 | Lexus,GS,3557 38 | Lexus,LS,1916 39 | Lexus,SC,6595 40 | Lexus,RC,7647 41 | Lexus,LC,8265 42 | Lexus,LFA,7897 43 | Lexus,NX,3177 44 | Lexus,RX,2663 45 | Lexus,GX,2696 46 | Lexus,LX,5592 47 | BMW,1 series,9969 48 | BMW,2 series,4590 49 | BMW,303,2047 50 | BMW,328,4334 51 | BMW,326,5637 52 | BMW,327,5377 53 | BMW,320,4309 54 | BMW,321,4170 55 | BMW,340,1178 56 | BMW,501,3430 57 | BMW,503,423 58 | BMW,507,3448 59 | BMW,700,3304 60 | BMW,E9,4675 61 | BMW,E3,3190 62 | BMW,M1,130 63 | BMW,E28,9404 64 | BMW,E30,8655 65 | BMW,E32,786 66 | BMW,E34,7726 67 | BMW,Z1,3143 68 | BMW,E31,4817 69 | BMW,Z3,688 70 | BMW,Z8,2607 71 | BMW,i3,2 72 | BMW,i8,4246 73 | Audi,F103,8261 74 | Audi,100,3565 75 | Audi,80,876 76 | Audi,50,6248 77 | Audi,200,3625 78 | Audi,5,6191 79 | Audi,90,4520 80 | Audi,V8,1751 81 | Audi,Cabriolet,2344 82 | Audi,A8,3985 83 | Audi,A4,9554 84 | Audi,A3,2451 85 | Audi,A6,8058 86 | Audi,TT,656 87 | Audi,A2,1032 88 | Audi,Q7,1988 89 | Audi,A5,1568 90 | Audi,Q5,9372 91 | Audi,R8,7835 92 | Audi,A1,2102 93 | Audi,A7,5228 94 | Audi,Q2,1222 95 | Audi,Q3,2371 96 | Audi,Q8,5076 97 | Audi,e,4462 98 | Mercedes-Benz,SSK,671 99 | Mercedes-Benz,W10,1574 100 | Mercedes-Benz,130H,7603 101 | Mercedes-Benz,150H,3934 102 | Mercedes-Benz,W31,6563 103 | Mercedes-Benz,W136,1007 104 | Mercedes-Benz,770,7990 105 | Mercedes-Benz,500K,4089 106 | Mercedes-Benz,540K,8136 107 | Mercedes-Benz,260,6160 108 | Mercedes-Benz,W125,7458 109 | Mercedes-Benz,W154,5619 110 | Mercedes-Benz,T80,9497 111 | Mercedes-Benz,W191,8567 112 | Mercedes-Benz,W120,8652 113 | Mercedes-Benz,190SL,2584 114 | Mercedes-Benz,W187,647 115 | Mercedes-Benz,W105,8822 116 | Mercedes-Benz,W180,1470 117 | Mercedes-Benz,W186,6734 118 | Mercedes-Benz,W188,3572 119 | Mercedes-Benz,W189,2539 120 | Mercedes-Benz,W196,9903 121 | Mercedes-Benz,300,2989 122 | Mercedes-Benz,300SL,3393 123 | Mercedes-Benz,W110,9213 124 | Mercedes-Benz,W111,2363 125 | Mercedes-Benz,W112,767 126 | Mercedes-Benz,W108,1470 127 | Mercedes-Benz,W114,7292 128 | Mercedes-Benz,W100,2874 129 | Mercedes-Benz,W113,8493 130 | Mercedes-Benz,W123,10 131 | Mercedes-Benz,G,3926 132 | Mercedes-Benz,S,1871 133 | Mercedes-Benz,W116,3454 134 | Mercedes-Benz,SL,6343 135 | Mercedes-Benz,R107,5253 136 | Mercedes-Benz,W201,4450 137 | Mercedes-Benz,C123,1913 138 | Mercedes-Benz,W126,4950 139 | Mercedes-Benz,S123,1403 140 | Mercedes-Benz,C126,100 141 | Mercedes-Benz,W124,3632 142 | Mercedes-Benz,C124,7291 143 | Mercedes-Benz,A,3129 
144 | Mercedes-Benz,C,7321 145 | Mercedes-Benz,CLK,3968 146 | Mercedes-Benz,E,7247 147 | Mercedes-Benz,M,1164 148 | Mercedes-Benz,R129,6156 149 | Mercedes-Benz,Vaneo,8457 150 | Mercedes-Benz,W168,9009 151 | Mercedes-Benz,W169,8059 152 | Mercedes-Benz,B,212 153 | Mercedes-Benz,W203,897 154 | Mercedes-Benz,W204,4603 155 | Mercedes-Benz,W205,1626 156 | Mercedes-Benz,CL,7195 157 | Mercedes-Benz,W215,3007 158 | Mercedes-Benz,W216,8348 159 | Mercedes-Benz,CLC,1313 160 | Mercedes-Benz,CLS,6176 161 | Mercedes-Benz,W210,6893 162 | Mercedes-Benz,W211,1415 163 | Mercedes-Benz,W212,7213 164 | Mercedes-Benz,GL,6275 165 | Mercedes-Benz,W163,9507 166 | Mercedes-Benz,W164,1198 167 | Mercedes-Benz,R,4807 168 | Mercedes-Benz,W220,5300 169 | Mercedes-Benz,W221,2417 170 | Mercedes-Benz,W222,156 171 | Mercedes-Benz,SLK,1540 172 | Mercedes-Benz,R170,398 173 | Mercedes-Benz,R171,8498 174 | Mercedes-Benz,R230,3168 175 | Mercedes-Benz,SLR,1324 176 | Mercedes-Benz,CLA,3896 177 | Mercedes-Benz,GLA,267 178 | Mercedes-Benz,R231,2692 179 | Alfa Romeo,4C,3411 180 | Alfa Romeo,6C,9492 181 | Alfa Romeo,8C,1203 182 | Alfa Romeo,12C,2353 183 | Alfa Romeo,33,5014 184 | Alfa Romeo,75,6092 185 | Alfa Romeo,90,9659 186 | Alfa Romeo,105,8850 187 | Alfa Romeo,145,3173 188 | Alfa Romeo,146,6418 189 | Alfa Romeo,147,6209 190 | Alfa Romeo,155,8973 191 | Alfa Romeo,156,8905 192 | Alfa Romeo,159,4991 193 | Alfa Romeo,164,5324 194 | Alfa Romeo,166,1749 195 | Alfa Romeo,1750,309 196 | Alfa Romeo,1900,6706 197 | Alfa Romeo,2000,5057 198 | Alfa Romeo,2300,9805 199 | Alfa Romeo,2600,4120 200 | Alfa Romeo,Type,2662 201 | Alfa Romeo,Alfasud,2735 202 | Alfa Romeo,Alfetta,7667 203 | Alfa Romeo,AR6,1982 204 | Alfa Romeo,AR8,1004 205 | Alfa Romeo,Arna,462 206 | Alfa Romeo,Brera,5686 207 | Alfa Romeo,Dauphine,8051 208 | Alfa Romeo,G1,4370 209 | Alfa Romeo,Giulia,357 210 | Alfa Romeo,Giulietta,4924 211 | Alfa Romeo,Gran,3989 212 | Alfa Romeo,GT,2729 213 | Alfa Romeo,GTA,3848 214 | Alfa Romeo,GTV,9649 215 | Alfa Romeo,Matta,2519 216 | Alfa Romeo,MiTo,4991 217 | Alfa Romeo,Montreal,9744 218 | Alfa Romeo,RL,7160 219 | Alfa Romeo,RM,871 220 | Alfa Romeo,Spider,5029 221 | Alfa Romeo,Sprint,435 222 | Alfa Romeo,Stelvio,6508 223 | Alfa Romeo,SZ,3626 224 | Volkswagen,Amarok,6016 225 | Volkswagen,Ameo,350 226 | Volkswagen,Arteon,4740 227 | Volkswagen,Atlas,62 228 | Volkswagen,Caddy,5908 229 | Volkswagen,California,5950 230 | Volkswagen,Fox,4201 231 | Volkswagen,Gol,5813 232 | Volkswagen,Golf,9087 233 | Volkswagen,Jetta,3159 234 | Volkswagen,Lamando,2122 235 | Volkswagen,Lavida,3107 236 | Volkswagen,Beetle,533 237 | Volkswagen,Passat,1604 238 | Volkswagen,Passat,2373 239 | Volkswagen,Polo,6275 240 | Volkswagen,Polo,5747 241 | Volkswagen,Santana,9798 242 | Volkswagen,Sharan,7338 243 | Volkswagen,Tiguan,8213 244 | Volkswagen,Touareg,20 245 | Volkswagen,Touran,1586 246 | Volkswagen,Transporter,3823 247 | Volkswagen,Up,9100 248 | Volkswagen,Vento,7540 249 | Volkswagen,XL,888 250 | Chrysler,150,7529 251 | Chrysler,180,4813 252 | Chrysler,200,8641 253 | Chrysler,300,1638 254 | Chrysler,300M,2061 255 | Chrysler,300,2088 256 | Chrysler,Airflow,1330 257 | Chrysler,Airstream,6486 258 | Chrysler,Aspen,1092 259 | Chrysler,Centura,7034 260 | Chrysler,Australia,606 261 | Chrysler,Charger,9594 262 | Chrysler,by,5575 263 | Chrysler,Cirrus,5222 264 | Chrysler,Colt,6522 265 | Chrysler,Concorde,6028 266 | Chrysler,Conquest,9079 267 | Chrysler,Cordoba,7228 268 | Chrysler,Crossfire,4830 269 | Chrysler,D,9559 270 | Chrysler,Drifter,5391 271 | Chrysler,Executive,165 272 | 
Chrysler,Fifth,4183
273 | Chrysler,Galant,8517
274 | Chrysler,Horizon,2723
275 | Chrysler,Hunter,5119
276 | Chrysler,Imperial,2807
277 | Chrysler,Imperial,9531
278 | Chrysler,L300,786
279 | Chrysler,Lancer,5398
280 | Chrysler,Laser,325
281 | Chrysler,LeBaron,4387
282 | Chrysler,LHS,1989
283 | Chrysler,Newport,9134
284 | Chrysler,Neon,758
285 | Chrysler,New,5002
286 | Chrysler,Fifth,6742
287 | Chrysler,Pacifica,3467
288 | Chrysler,Prowler,5390
289 | Chrysler,PT,4102
290 | Chrysler,Regal,6030
291 | Chrysler,Royal,4960
292 | Chrysler,Royal,863
293 | Chrysler,Royal,20
294 | Chrysler,Saratoga,8312
295 | Chrysler,Sebring,1267
296 | Chrysler,Sigma,683
297 | Chrysler,Sunbeam,7414
298 | Chrysler,TC,4384
299 | Chrysler,Touring,325
300 | Chrysler,Town,2340
301 | Chrysler,Turbine,6708
302 | Chrysler,Valiant,9309
303 | Chrysler,Valiant,3872
304 | Chrysler,Vogue,5589
305 | Chrysler,Voyager,3797
306 | Chrysler,Royal,4695
307 | Chrysler,Windsor,5449
308 | Honda,Accord,5547
309 | Honda,Amaze,3084
310 | Honda,Avancier,9269
311 | Honda,Ballade,1666
312 | Honda,Brio,1899
313 | Honda,City,4908
314 | Honda,Civic,6317
315 | Honda,Civic Type R,4415
316 | Honda,Clarity,6472
317 | Honda,Crider,2453
318 | Honda,Elysion,5302
319 | Honda,Fit,2572
320 | Honda,Freed,5982
321 | Honda,Freed,7931
322 | Honda,City,8034
323 | Honda,City,8319
324 | Honda,City,7527
325 | Honda,Jade,8594
326 | Honda,Fit,4487
327 | Honda,Legend,5208
328 | Honda,Mobilio,6348
329 | Honda,NSX,196
330 | Honda,Pilot,7059
331 | Honda,Ridgeline,8671
332 | Honda,S660,9805
333 | Honda,Shuttle,1230
334 | Honda,Spirior,9906
335 | Honda,StepWGN,4061
336 | Honda,Avancier,9107
337 | Honda,Vamos,466
338 | Honda,Vezel,4760
339 | Honda,Type R,5449
340 | Jaguar,F-Type,8457
341 | Jaguar,Type,191
342 | 2007,2007,3276
343 | 2017,2017,1741
344 | 2018,2018,59
345 | 

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

# Fast Autocomplete 0.9.0

[zepworks.com](https://zepworks.com)

Fast autocomplete using Directed Word Graph (DWG) and Levenshtein Edit Distance.

The results are cached via LFU (Least Frequently Used).

# Why

Read about why fast-autocomplete was built here:

This library was written when we came to the conclusion that Elasticsearch's Autocomplete suggester is not fast enough and doesn't do everything that we need:

1. Once we switched to Fast Autocomplete, our average latency went from 120ms to 30ms, a 3-4x improvement in performance, and errors went down to zero.
2. Elasticsearch's Autocomplete suggester does not handle any sort of combination of the words you have put in. For example, Fast Autocomplete can handle `2018 Toyota Camry in Los Angeles` when the words `2018`, `Toyota Camry` and `Los Angeles` are separately fed into it, while Elasticsearch's autocomplete needs that whole sentence to be fed to it to show it in autocomplete results.

You might say:

1. Regarding #1: Yes, but you are using caching. Answer: shhh Yes, keep it quiet. We are also doing Levenshtein Edit distance using a C library so it improves there too.
2. Regarding #2: Cool. Answer: Ok, now we are talking.

# How

Read about how fast-autocomplete works here:

In a nutshell, what Fast Autocomplete does is:

1. Populate the DWG with your words.
2. Follow the graph nodes letter by letter until it finds nodes that have words in them.
3. Continue after words are found on the graph until it reaches the leaf node.
4. Restart from the root node again until it reaches a letter that doesn't exist on the graph.
5. Depending on how much is left from the rest of the word, return all the descendant words from where it got stuck.
6. Or run Levenshtein edit distance to find the closest words to what is left and then continue from there (see the sketch below).
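To see those steps in action, here is a minimal sketch (the `words` dictionary below is made up for illustration; the exact results can vary with your data):

```py
>>> from fast_autocomplete import AutoComplete
>>> words = {'2018': {}, 'toyota camry': {}, 'in': {}, 'los angeles': {}}
>>> autocomplete = AutoComplete(words=words)
>>> autocomplete.search(word='2018 toyota camry in los angeles', max_cost=3, size=3)
```

The result is a list of word sequences, along the lines of `[['2018', 'toyota camry', 'in', 'los angeles']]`.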
By doing so, it can tokenize a text such as:

`2018 Toyota Camry in Los Angeles` into [`2018`, `toyota camry`, `in`, `los angeles`]

And return autocomplete results as you type.

# Install

`pip install fast-autocomplete`

**Note: Fast Autocomplete only works with Python 3.6 and newer.**

Are you still on Python 2? TIME TO UPGRADE.

# Licence

MIT

# DWG

The data structure we use in this library is called a DWG.

DWG stands for Directed Word Graph. Here is an example DWG based on the "makes_models_short.csv" that is provided in the tests:

![dwg](tests/animation/short.gif)

![dwg](tests/AutoCompleteWithSynonymsShort_Graph.svg)


# Usage

First of all, let's start with your data. The library leaves it up to you how to prepare your data.
If you want to go straight to the factory function that lets you use the library in its easiest and most common case, skip all of this and jump to the [sorting](#sorting) example.

## Example 1

```py
>>> from fast_autocomplete import AutoComplete
>>> words = {'book': {}, 'burrito': {}, 'pizza': {}, 'pasta':{}}
>>> autocomplete = AutoComplete(words=words)
>>> autocomplete.search(word='b', max_cost=3, size=3)
[['book'], ['burrito']]
>>> autocomplete.search(word='bu', max_cost=3, size=3)
[['burrito']]
>>> autocomplete.search(word='barrito', max_cost=3, size=3)  # mis-spelling
[['burrito']]
```

`words` is a dictionary and each word can have a context: for example the "count", how to display the word, or some other context around the word. In this example the words didn't have any context.

## Example 2

Imagine that we have a csv with the following content from vehicles' make and models:

```csv
make,model
acura,zdx
alfa romeo,4c
alfa romeo,4c coupe
alfa romeo,giulia
bmw,1 series
bmw,2 series
2007,2007
2017,2017
2018,2018
```

What we want to do is to convert this to a dictionary of words and their context.


```py
import csv
from fast_autocomplete.misc import read_csv_gen


def get_words(path):

    csv_gen = read_csv_gen(path, csv_func=csv.DictReader)

    words = {}

    for line in csv_gen:
        make = line['make']
        model = line['model']
        if make != model:
            local_words = [model, '{} {}'.format(make, model)]
            while local_words:
                word = local_words.pop()
                if word not in words:
                    words[word] = {}
        if make not in words:
            words[make] = {}
    return words
```

The `read_csv_gen` is just a helper function. You don't really need it; the sketch below does the same with the plain `csv` module.
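A minimal version (my sketch, not from the repo, using the same csv layout as above):

```py
import csv


def get_words_plain(path):
    words = {}
    with open(path, 'r', encoding='utf-8-sig') as csvfile:
        for line in csv.DictReader(csvfile):
            make, model = line['make'], line['model']
            if make != model:
                # index both the model and the "make model" combination
                words[model] = {}
                words['{} {}'.format(make, model)] = {}
            words[make] = {}
    return words
```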
The whole point is that we are converting that csv to a dictionary that looks like this:

```py
>>> words = get_words('path to the csv')
>>> words
{'acura zdx': {},
 'zdx': {},
 'acura': {},
 'alfa romeo 4c': {},
 '4c': {},
 'alfa romeo': {},
 'alfa romeo 4c coupe': {},
 '4c coupe': {},
 'alfa romeo giulia': {},
 'giulia': {},
 'bmw 1 series': {},
 '1 series': {},
 'bmw': {},
 'bmw 2 series': {},
 '2 series': {},
 '2007': {},
 '2017': {},
 '2018': {}}
```

This is a dictionary of words to their context. We have decided that we don't want any context for the words in this example, so all the contexts are empty. Generally, however, you will want some context around the words for more complicated logic. The context of a word is simply the value of that word's key in the words dictionary.

In addition to words, we usually want a dictionary of synonyms. Something like this:

```py
synonyms = {
    "alfa romeo": ["alfa"],
    "bmw": ["beemer", "bimmer"],
    "mercedes-benz": ["mercedes", "benz"],
    "volkswagen": ["vw"]
}
```

Note that synonyms are optional. Maybe in your use case you don't need synonyms.

Now we can use the above to initialize Autocomplete:

```py

from fast_autocomplete import AutoComplete

autocomplete = AutoComplete(words=words, synonyms=synonyms)
```

At this point, AutoComplete has created a [dwg](#DWG) structure.

Now you can search!

- word: the word to return autocomplete results for
- max_cost: Maximum Levenshtein edit distance to be considered when calculating results
- size: The max number of results to return

```py
>>> autocomplete.search(word='2018 bmw 1', max_cost=3, size=3)
[['2018', 'bmw'], ['2018', 'bmw 1 series']]
```

Now what if we pressed `a` by mistake? It still works. No problem.

```py
>>> autocomplete.search(word='2018 bmw 1a', max_cost=3, size=3)
[['2018', 'bmw'], ['2018', 'bmw 1 series']]
```

Ok, let's search for Alfa now:

```py
>>> autocomplete.search(word='alfa', max_cost=3, size=3)
[['alfa romeo'], ['alfa romeo 4c'], ['alfa romeo giulia']]
```

What if we don't know how to pronounce alfa and we type `alpha`?

```py
>>> autocomplete.search(word='alpha', max_cost=3, size=3)
[['alfa romeo'], ['alfa romeo 4c'], ['alfa romeo giulia']]
```

It still works!

Fast-Autocomplete makes sure the results make sense!

Ok, let's add the words `los angeles` and `in` to the words:


```py
>>> words['los angeles'] = {}
>>> words['in'] = {}
>>> autocomplete.search(word='2007 alfa in los', max_cost=3, size=3)
[['2007', 'alfa romeo', 'in'], ['2007', 'alfa romeo', 'in', 'los angeles']]
```

So far we have not used the context, and this library leaves it up to you how to use it. But basically, if we give a context to each one of those words, then the above response can easily be translated to a list of those contexts.

## context

If our words dictionary was:

```py
words = {
    'in': {},
    'alfa romeo': {'type': 'make'},
    '2007': {'type': 'year'},
    'los angeles': {'type': 'location'},
}
```

Then `autocomplete.words` can be used to map the results into their context:

```
[['2007', 'alfa romeo', 'in'], ['2007', 'alfa romeo', 'in', 'los angeles']]

converted to contexts:

[[{'year': '2007'}, {'make': 'alfa romeo'}], [{'year': '2007'}, {'make': 'alfa romeo'}, {'location': 'los angeles'}]]
```
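Here is a minimal sketch of that conversion (a hypothetical helper, assuming single-entry `{'type': ...}` contexts as above; words with an empty context, like `in`, are skipped):

```py
def to_contexts(results, words):
    converted = []
    for result in results:
        contexts = []
        for word in result:
            context = words.get(word) or {}
            if context:
                # e.g. 'alfa romeo' with {'type': 'make'} becomes {'make': 'alfa romeo'}
                contexts.append({context['type']: word})
        converted.append(contexts)
    return converted
```

Calling `to_contexts` on the search output above would produce the converted list shown.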

## context

If our words dictionary was:

```py
words = {
    'in': {},
    'alfa romeo': {'type': 'make'},
    '2007': {'type': 'year'},
    'los angeles': {'type': 'location'},
}
```

Then `autocomplete.words` can be used to map the results into their contexts:

```
[['2007', 'alfa romeo', 'in'], ['2007', 'alfa romeo', 'in', 'los angeles']]

converted to contexts:

[[{'year': '2007'}, {'make': 'alfa romeo'}], [{'year': '2007'}, {'make': 'alfa romeo'}, {'location': 'los angeles'}]]
```

## Sorting

Most people who use Fast Autocomplete want to control how results are sorted. If you don't, the results will be sorted based on the order in which Autocomplete found the matching nodes in the graph.

The easiest way to sort is to give each item a count. **Fast AutoComplete will use the count to sort items that are partial matches.**

For example:

1. Make a json file that is a dictionary of words to their context.

The format of the file needs to be:

```json
{
    word: [
        context,
        display value,
        count
    ]
}
```

An example is included in [sample_words.json](tests/fixtures/sample_words.json):

```json
{
    "acura rlx": [
        {
            "model": "rlx",
            "make": "acura"
        },
        "Acura RLX",
        3132
    ],
    "rlx": [
        {
            "model": "rlx",
            "make": "acura"
        },
        "Acura RLX",
        3132
    ],
    "acura": [
        {
            "make": "acura"
        },
        "Acura",
        130123
    ],
    ...
}
```

You might be wondering why things are in this format. It is to save space: this json can easily become very big, and the keys become repetitive, so we use a list with a predefined order of keys. For your use case, you can leave the context and display values as None for now. We will open-source other factory functions soon that fully utilize those keys in the context.

2. Launch Autocomplete via the factory function:

```py
from fast_autocomplete import autocomplete_factory

content_files = {
    'words': {
        'filepath': 'path/to/sample_words.json',
        'compress': True  # compress the graph data in memory
    }
}

autocomplete = autocomplete_factory(content_files=content_files)
```

3. Now you can use Autocomplete, and the results are ordered by count!

```py
>>> autocomplete.search(word='acu')
[['acura'], ['acura mdx'], ['acura rdx']]
```

4. How do we use the context and display value now?

Great question. You need to extend the AutoComplete class to use these items. I will write a blog post about it.

Here is a simple example without any extending:

```py
>>> autocomplete.words['acura']
WordValue(context={'make': 'acura'}, display='Acura', count=130123, original_key=None)
>>> autocomplete.words['acura'].display
'Acura'
```
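Until then, here is a hedged sketch of one way to do it: subclass AutoComplete and post-process `search` results through the `autocomplete.words` mapping shown above. The subclass and method names are ours, not part of the library, and this assumes the word values are `WordValue` records like the ones above:

```py
from fast_autocomplete import AutoComplete


class DisplayAutoComplete(AutoComplete):
    # Hypothetical subclass: translate search results into display values.
    def search_display(self, word, max_cost=2, size=5):
        results = self.search(word=word, max_cost=max_cost, size=size)
        return [
            [self.words[w].display if w in self.words else w for w in row]
            for row in results
        ]
```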
### Change the sorting by updating counts

Fast Autocomplete by default uses the "count" of the items to sort the items in the results. Think about these counts as a "guide" to Fast Autocomplete so it can polish its results. Depending on whether or not Fast Autocomplete finds exact matches to the user's query, the counts will be used to refine the results. You can update the counts on an autocomplete object live.

For example, in the [sample csv of car makes and models](tests/fixtures/makes_models_from_wikipedia.csv) we have:

```csv
make,model,count
Toyota,Aurion,6094
Toyota,Avalon,8803
Toyota,Avensis,1630
Toyota,Auris,4025
Toyota,Aygo,2115
```

If we use the autocomplete to search:

```py
>>> autocomplete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS, full_stop_words=['bmw', 'alfa romeo'])
>>> autocomplete.search(word='toyota a')
[['toyota'], ['toyota avalon'], ['toyota aurion'], ['toyota auris']]
```

As you can see, `toyota aygo` had a count of 2115 and thus didn't make it into the top 3 results.

We can set the count for `toyota aygo` to a higher number to boost it in the results using `update_count_of_word`.

`update_count_of_word` can change the count either by setting the word's count directly or by offsetting its current value.

```py
>>> autocomplete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS, full_stop_words=['bmw', 'alfa romeo'])
>>> autocomplete.update_count_of_word(word='toyota aygo', count=10000)
10000
```

Now if we search:

```py
>>> autocomplete.search(word='toyota a')
[['toyota'], ['toyota aygo'], ['toyota avalon'], ['toyota aurion']]
```

We can double-check the count of a node:

```py
>>> autocomplete.get_count_of_word('toyota aygo')
10000
```

Now let's use `offset` to shift the current count of a different node:

```py
>>> autocomplete.update_count_of_word(word='toyota aurion', offset=-6000)
94
```

When we search, `toyota aurion` is not in the top 3 results anymore!

```py
>>> autocomplete.search(word='toyota a')
[['toyota'], ['toyota aygo'], ['toyota avalon'], ['toyota auris']]
```


## Unicode

By default this package only accepts lowercase ASCII letters, a-z. However, you can pass the characters that you want to be acceptable via `valid_chars_for_string` for strings and `valid_chars_for_integer` for numbers. For example, here we tell Autocomplete to accept the Farsi alphabet as string characters:

```python
AutoComplete(
    words=SHORT_WORDS_UNICODE,
    valid_chars_for_string='اآبپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی')
```

If you want to pass other characters in addition to ASCII letters, such as punctuation marks, you need to set `valid_chars_for_string` to include all of the characters you need. For example, the following code block sets ASCII letters a-z along with periods and apostrophes:

```python
import string

valid_chars = ".'"
valid_chars += string.ascii_lowercase
AutoComplete(
    words=WORDS_WITH_PUNCTUATION,
    valid_chars_for_string=valid_chars)
```
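The examples above only exercise `valid_chars_for_string`. `valid_chars_for_integer` works the same way for digit characters; as a sketch (the Farsi digit set below is our own illustration, not something from the library's docs):

```python
AutoComplete(
    words=SHORT_WORDS_UNICODE,
    valid_chars_for_string='اآبپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی',
    valid_chars_for_integer='۰۱۲۳۴۵۶۷۸۹')
```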

## Draw

This package can also draw the dwg for you, either as it is being populated or once it is fully populated!

Here is the animation of populating the dwg with words from "makes_models_short.csv":


### Draw animation of dwg populating

```py
from fast_autocomplete import AutoComplete, DrawGraphMixin


class AutoCompleteDraw(DrawGraphMixin, AutoComplete):
    DRAW_POPULATION_ANIMATION = True
    DRAW_POPULATION_ANIMATION_PATH = 'animation/short_.svg'
    DRAW_POPULATION_ANIMATION_FILENO_PADDING = 6


autocomplete = AutoCompleteDraw(words=words, synonyms=synonyms)
```

As soon as you initialize the above AutoCompleteDraw class, it will populate the dwg and generate the animation!
For an example of this code properly set up, take a look at the tests. In fact, the animation in the [dwg](#dwg) section is generated the same way via unit tests!

Note that if you have many words, the graph file will be big. Instead of drawing all frames as the dwg is being populated, you can just draw the final stage:

### Draw the final graph

To draw just one graph that shows the final stage of the dwg, use the draw mixin and run the draw_graph function:

```py
from fast_autocomplete import AutoComplete, DrawGraphMixin


class AutoCompleteDraw(DrawGraphMixin, AutoComplete):
    pass

autocomplete = AutoCompleteDraw(words=words, synonyms=synonyms)
autocomplete.draw_graph('path to file')
```

## Demo

If you want real-time interaction with Autocomplete results in your terminal, you can use the demo module.

Just pass it an instance of the autocomplete and the search configs:

```py
from fast_autocomplete import demo

demo(autocomplete, max_cost=3, size=5)
```

# Develop

1. Clone the repo
2. Make a virtualenv with Python 3.6 or newer
3. `pip install -r requirements-dev.txt`

## Run tests

`pytest`

We try to maintain a high standard of code coverage. Currently the `dwg` module's coverage is around 99%!

## Releases

We use bump2version to bump and tag releases.

```bash
git checkout master && git pull
bump2version {patch|minor|major}
git push && git push --tags
```

# Authors

- Autocomplete written by [Sep Dehpour](http://zepworks.com).
- LFU Cache by [Shane Wang](https://medium.com/@epicshane)

# Other ways of doing AutoComplete

1. Elasticsearch. Yes, Elasticsearch generally is a *better* Autocomplete solution than this library. I said generally. In our specific use case, we wanted Autocomplete to be faster than Elasticsearch and to handle combinations of words. Otherwise Elasticsearch would have been perfect. Behind the scenes, Elasticsearch uses a Finite State Transducer (FST) in Lucene to achieve autocomplete. FST is more complicated than what we have used in fast-autocomplete.

2. If your autocomplete is supposed to return results based on a big blob of text (for example based on some book contents), then a better solution is to go with Markov chains and conditional probability. Yes, there is already a library out there for it, and it looks great. Disclaimer: we have not actually used it since it doesn't fit our specific use case.


# FAQ

## Why DWG

DWG stands for Directed Word Graph. Originally we were using a Trie structure.
But it soon became obvious that some branches needed to merge back into other branches. For example, the `beemer` and `bmw` branches both need to end in the same node since they are synonyms. Thus we used a DWG.

## What are synonyms, clean synonyms and partial synonyms

Synonyms are words that should produce the same results.

- For example `beemer` and `bmw` should both give you `bmw`.
- `alfa` and `alfa romeo` should both give you `alfa romeo`.

The synonyms get divided into 2 groups:

1. Clean synonyms: the 2 words share little or no text. For example `beemer` vs. `bmw`.
2. Partial synonyms: one of the 2 words is a substring of the other one. For example `alfa` and `alfa romeo`, or `gm` vs. `gmc`.

Internally these 2 types of synonyms are treated differently, but as a user of the library you don't really need to care about it. You just provide the synonyms dictionary when initializing AutoComplete.

## Why do you have a whole subtree for partial synonyms

Q: A partial synonym means the synonym is a part of the original word, such as `alfa` being a partial synonym for `alfa romeo`. In that case you are inserting both `alfa` and `alfa romeo` in the dwg. `alfa` will have `alfa 4c` and `alfa romeo` will have `alfa romeo 4c` branches. Why not just have `alfa` branch to `alfa romeo`, so that it automatically gets all the sub-branches of `alfa romeo`?

Answer: We use letters for edges. So `alfa` can have only one edge coming out of it that is a space (` `). That edge goes to a node that has sub-branches to `alfa romeo`, `alfa 4c`, etc. It can't have one ` ` going to that node and another ` ` going to `alfa romeo`'s immediate child. That way, when we are traversing the dwg for the input `alfa 4`, we get to the correct node.

## I put Toyota in the Dawg but when I type `toy`, it doesn't show up.

Answer: If you put `Toyota` with a capital T in the dwg, it expects the search word to start with a capital T too. We suggest that you lowercase everything before putting it in the dwg. Fast-autocomplete does not automatically do that for you, since it assumes the `words` dictionary is exactly what you want in the dwg. It is up to you to clean your own data before putting it in the dwg.
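For instance, a tiny sketch of that cleanup before building the dwg:

```py
# Lowercase every key so searches like 'toy' can match 'Toyota'.
words = {key.lower(): context for key, context in words.items()}
autocomplete = AutoComplete(words=words)
```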
547 | -------------------------------------------------------------------------------- /tests/test_autocomplete.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | import os 4 | import pytest 5 | import string 6 | from pprint import pprint 7 | from typing import NamedTuple 8 | 9 | from fast_autocomplete.misc import read_csv_gen 10 | from fast_autocomplete import AutoComplete, DrawGraphMixin 11 | from fast_autocomplete.dwg import FindStep 12 | 13 | 14 | current_dir = os.path.dirname(os.path.abspath(__file__)) 15 | 16 | WHAT_TO_PRINT = {'word', 'results', 'expected_results', 'result', 17 | 'find_steps', 'expected_steps', 'search_results', 'search_results_immutable'} 18 | 19 | 20 | class Info(NamedTuple): 21 | make: 'Info' = None 22 | model: 'Info' = None 23 | original_key: 'Info' = None 24 | count: int = 0 25 | 26 | def get(self, key, default=None): 27 | return getattr(self, key, default) 28 | 29 | __get__ = get 30 | 31 | 32 | def parameterize_cases(cases): 33 | return [tuple(i.values()) for i in cases] 34 | 35 | 36 | def print_results(local_vars): 37 | common = WHAT_TO_PRINT & set(local_vars.keys()) 38 | for key in common: 39 | print(f'- {key}:') 40 | pprint(local_vars[key]) 41 | 42 | 43 | def get_words(path): 44 | 45 | file_path = os.path.join(current_dir, path) 46 | csv_gen = read_csv_gen(file_path, csv_func=csv.DictReader) 47 | 48 | words = {} 49 | 50 | for line in csv_gen: 51 | make = line['make'].lower() 52 | model = line['model'].lower() 53 | if make != model: 54 | local_words = [model, '{} {}'.format(make, model)] 55 | while local_words: 56 | word = local_words.pop() 57 | if word not in words: 58 | words[word] = dict(line) 59 | if make not in words: 60 | words[make] = {"make": make} 61 | 62 | words['truck'] = {'make': 'truck'} 63 | return words 64 | 65 | 66 | WIKIPEDIA_WORDS = get_words('fixtures/makes_models_from_wikipedia.csv') 67 | 68 | SHORT_WORDS = get_words('fixtures/makes_models_short.csv') 69 | 70 | SHORT_WORDS_UNICODE = get_words('fixtures/makes_models_in_farsi_short.csv') 71 | 72 | SHORT_WORDS_IMMUTABLE_INFO = {key: Info(**value) for key, value in SHORT_WORDS.items()} 73 | 74 | 75 | with open(os.path.join(current_dir, 'fixtures/synonyms.json'), 'r') as the_file: 76 | SYNONYMS = json.loads(the_file.read()) 77 | 78 | 79 | class TestAutocomplete: 80 | 81 | @pytest.mark.parametrize("word, max_cost, size, expected_results", [ 82 | ('bmw', 2, 3, {0: [['bmw']], 1: [['bmw 1 series'], ['bmw e28'], ['bmw e30'], ['bmw e34']]}), 83 | ('beemer', 2, 3, {}), 84 | ('honda covic', 2, 3, {0: [['honda']], 1: [['honda', 'civic'], ['honda', 'civic type r']]}), 85 | ]) 86 | def test_search_without_synonyms(self, word, max_cost, size, expected_results): 87 | auto_complete = AutoComplete(words=WIKIPEDIA_WORDS) 88 | results, find_steps = auto_complete._find(word, max_cost, size) 89 | results = dict(results) 90 | print_results(locals()) 91 | assert expected_results == results 92 | 93 | @pytest.mark.parametrize("word, max_cost, size, expected_results", [ 94 | ('بی ام و', 2, 3, {0: [['بی ام و']], 1: [['بی ام و 1 series'], ['بی ام و 2 series']]}), 95 | ]) 96 | def test_search_unicode_without_synonyms(self, word, max_cost, size, expected_results): 97 | auto_complete = AutoComplete( 98 | words=SHORT_WORDS_UNICODE, 99 | valid_chars_for_string='اآبپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی') 100 | results, find_steps = auto_complete._find(word, max_cost, size) 101 | results = dict(results) 102 | print_results(locals()) 103 | assert expected_results == 
results 104 | 105 | def test_autocomplete_synonym_part_of_another_word(self): 106 | words = {'cartoon': {}, 'vehicle': {}} 107 | synonyms = {'vehicle': ['car']} 108 | autocomplete = AutoComplete(words=words, synonyms=synonyms) 109 | result = autocomplete.search(word='ca') 110 | assert [['vehicle'], ['cartoon']] == result 111 | 112 | def test_special_characters(self): 113 | words = {'abcd(efgh)ijk': {}, 'u (2 off)': {}} 114 | autocomplete = AutoComplete(words=words, valid_chars_for_string=string.ascii_letters + string.punctuation) 115 | # result = autocomplete.search(word='abcd(efgh)') 116 | # assert [['abcd(efgh)ijk']] == result 117 | 118 | result2 = autocomplete.search(word='u (2 o') 119 | assert [['u (2 off)']] == result2 120 | 121 | 122 | STEP_DESCENDANTS_ONLY = [FindStep.descendants_only] 123 | STEP_FUZZY_FOUND = [FindStep.fuzzy_try, FindStep.fuzzy_found] 124 | 125 | SEARCH_CASES = [ 126 | {'word': ' ', 127 | 'max_cost': 3, 128 | 'size': 3, 129 | 'expected_find_results': {1: [['1 series'], ['bmw 1 series'], ['spirior'], ['honda spirior']]}, 130 | 'expected_steps': STEP_DESCENDANTS_ONLY, 131 | 'expected_find_and_sort_results': [['1 series'], ['bmw 1 series'], ['spirior']], 132 | }, 133 | {'word': '', 134 | 'max_cost': 3, 135 | 'size': 3, 136 | 'expected_find_results': {1: [['1 series'], ['bmw 1 series'], ['spirior'], ['honda spirior']]}, 137 | 'expected_steps': STEP_DESCENDANTS_ONLY, 138 | 'expected_find_and_sort_results': [['1 series'], ['bmw 1 series'], ['spirior']], 139 | }, 140 | {'word': 'c', 141 | 'max_cost': 3, 142 | 'size': 3, 143 | 'expected_find_results': {0: [['c']], 1: [['charger'], ['chrysler charger'], ['chrysler d'], ['crown']]}, 144 | 'expected_steps': STEP_DESCENDANTS_ONLY, 145 | 'expected_find_and_sort_results': [['c'], ['charger'], ['chrysler charger']], 146 | }, 147 | {'word': 'ca', 148 | 'max_cost': 3, 149 | 'size': 3, 150 | 'expected_find_results': {1: [['california'], ['caddy'], ['camry'], ['cabriolet']]}, 151 | 'expected_steps': STEP_DESCENDANTS_ONLY, 152 | 'expected_find_and_sort_results': [['california'], ['caddy'], ['camry']], 153 | }, 154 | {'word': 'camr', 155 | 'max_cost': 3, 156 | 'size': 6, 157 | 'expected_find_results': {1: [['camry']]}, 158 | 'expected_steps': STEP_DESCENDANTS_ONLY, 159 | 'expected_find_and_sort_results': [['camry']], 160 | }, 161 | {'word': '4d', 162 | 'max_cost': 3, 163 | 'size': 3, 164 | 'expected_find_results': {1: [['4runner'], ['4c']]}, 165 | 'expected_steps': STEP_DESCENDANTS_ONLY, 166 | 'expected_find_and_sort_results': [['4runner'], ['4c']], 167 | }, 168 | {'word': '2018 alpha ', 169 | 'max_cost': 3, 170 | 'size': 3, 171 | 'expected_find_results': {0: [['2018']], 172 | 2: [['2018', 'alfa romeo'], 173 | ['2018', 'alfa romeo 2300'], 174 | ['2018', 'alfa romeo montreal'], 175 | ['2018', 'alfa romeo 90'], 176 | ['2018', 'alfa romeo gtv']]}, 177 | 'expected_steps': STEP_FUZZY_FOUND, 178 | 'expected_find_and_sort_results': [['2018'], ['2018', 'alfa romeo'], ['2018', 'alfa romeo 2300']], 179 | }, 180 | {'word': '2018 alpha romeo 4d', 181 | 'max_cost': 3, 182 | 'size': 4, 183 | 'expected_find_results': {0: [['2018']], 184 | 1: [['2018', 'alfa romeo 2300'], 185 | ['2018', 'alfa romeo montreal'], 186 | ['2018', 'alfa romeo 90'], 187 | ['2018', 'alfa romeo gtv'], 188 | ['2018', 'alfa romeo 6c']], 189 | 2: [['2018', 'alfa romeo', 'ameo']]}, 190 | 'expected_steps': [FindStep.fuzzy_try, FindStep.fuzzy_found, {FindStep.rest_of_fuzzy_round2: [FindStep.fuzzy_try, FindStep.fuzzy_found]}, FindStep.not_enough_results_add_some_descandants], 191 | 
'expected_find_and_sort_results': [['2018'], 192 | ['2018', 'alfa romeo 2300'], 193 | ['2018', 'alfa romeo montreal'], 194 | ['2018', 'alfa romeo 90']], 195 | }, 196 | {'word': '2018 alpha', 197 | 'max_cost': 3, 198 | 'size': 3, 199 | 'expected_find_results': {0: [['2018']], 200 | 2: [['2018', 'alfa romeo'], 201 | ['2018', 'alfa romeo 2300'], 202 | ['2018', 'alfa romeo montreal'], 203 | ['2018', 'alfa romeo 90'], 204 | ['2018', 'alfa romeo gtv']]}, 205 | 'expected_steps': STEP_FUZZY_FOUND, 206 | 'expected_find_and_sort_results': [['2018'], ['2018', 'alfa romeo'], ['2018', 'alfa romeo 2300']], 207 | }, 208 | {'word': '2018 alfa', 209 | 'max_cost': 3, 210 | 'size': 3, 211 | 'expected_find_results': {0: [['2018', 'alfa romeo']], 212 | 1: [['2018', 'alfa romeo 2300'], 213 | ['2018', 'alfa romeo montreal'], 214 | ['2018', 'alfa romeo 90'], 215 | ['2018', 'alfa romeo gtv']]}, 216 | 'expected_steps': STEP_DESCENDANTS_ONLY, 217 | 'expected_find_and_sort_results': [['2018', 'alfa romeo'], ['2018', 'alfa romeo 2300'], ['2018', 'alfa romeo montreal']], 218 | }, 219 | {'word': '2018 alfg', 220 | 'max_cost': 3, 221 | 'size': 3, 222 | 'expected_find_results': {0: [['2018']], 223 | 1: [['2018', 'alfa romeo 2300'], 224 | ['2018', 'alfa romeo montreal'], 225 | ['2018', 'alfa romeo 90'], 226 | ['2018', 'alfa romeo gtv']]}, 227 | 'expected_steps': STEP_DESCENDANTS_ONLY, 228 | 'expected_find_and_sort_results': [['2018'], ['2018', 'alfa romeo 2300'], ['2018', 'alfa romeo montreal']], 229 | }, 230 | {'word': '2018 glfa', 231 | 'max_cost': 3, 232 | 'size': 3, 233 | 'expected_find_results': {0: [['2018']], 1: [['2018', 'gla']]}, 234 | 'expected_steps': STEP_DESCENDANTS_ONLY, 235 | 'expected_find_and_sort_results': [['2018'], ['2018', 'gla']], 236 | }, 237 | {'word': '2018 doyota', 238 | 'max_cost': 3, 239 | 'size': 3, 240 | 'expected_find_results': {0: [['2018']], 241 | 1: [['2018', 'toyota'], 242 | ['2018', 'toyota crown'], 243 | ['2018', 'toyota prius'], 244 | ['2018', 'toyota avalon'], 245 | ['2018', 'toyota dyna']]}, 246 | 'expected_steps': STEP_FUZZY_FOUND, 247 | 'expected_find_and_sort_results': [['2018'], ['2018', 'toyota'], ['2018', 'toyota crown']], 248 | }, 249 | {'word': '2018 doyota camr', 250 | 'max_cost': 3, 251 | 'size': 3, 252 | 'expected_find_results': {0: [['2018']], 253 | 1: [['2018', 'toyota', 'camry'], 254 | ['2018', 'dyna'], 255 | ['2018', 'dauphine'], 256 | ['2018', 'drifter']]}, 257 | 'expected_steps': [FindStep.fuzzy_try, FindStep.fuzzy_found, {FindStep.rest_of_fuzzy_round2: [FindStep.descendants_only]}, FindStep.not_enough_results_add_some_descandants], 258 | 'expected_find_and_sort_results': [['2018'], ['2018', 'toyota', 'camry'], ['2018', 'dyna']], 259 | }, 260 | {'word': '2018 beemer', 261 | 'max_cost': 3, 262 | 'size': 3, 263 | 'expected_find_results': {0: [['2018', 'bmw']], 264 | 1: [['2018', 'bmw 1 series'], 265 | ['2018', 'bmw e28'], 266 | ['2018', 'bmw e30'], 267 | ['2018', 'bmw e34']]}, 268 | 'expected_steps': STEP_DESCENDANTS_ONLY, 269 | 'expected_find_and_sort_results': [['2018', 'bmw'], ['2018', 'bmw 1 series'], ['2018', 'bmw e28']], 270 | }, 271 | {'word': '2018 beener', 272 | 'max_cost': 3, 273 | 'size': 3, 274 | 'expected_find_results': {0: [['2018']], 275 | 1: [['2018', 'bmw 1 series'], 276 | ['2018', 'bmw e28'], 277 | ['2018', 'bmw e30'], 278 | ['2018', 'bmw e34']]}, 279 | 'expected_steps': [FindStep.fuzzy_try, FindStep.not_enough_results_add_some_descandants], 280 | 'expected_find_and_sort_results': [['2018'], ['2018', 'bmw 1 series'], ['2018', 'bmw e28']], 281 | }, 282 
| {'word': 'vw bea', 283 | 'max_cost': 3, 284 | 'size': 3, 285 | 'expected_find_results': {0: [['volkswagen']], 1: [['volkswagen beetle']]}, 286 | 'expected_steps': STEP_DESCENDANTS_ONLY, 287 | 'expected_find_and_sort_results': [['volkswagen'], ['volkswagen beetle']], 288 | }, 289 | {'word': 'toyota camry 2018', 290 | 'max_cost': 3, 291 | 'size': 5, 292 | 'expected_find_results': {0: [['toyota camry', '2018']]}, 293 | 'expected_steps': STEP_DESCENDANTS_ONLY, 294 | 'expected_find_and_sort_results': [['toyota camry', '2018']], 295 | }, 296 | {'word': 'type r', 297 | 'max_cost': 3, 298 | 'size': 5, 299 | 'expected_find_results': {0: [['type r']]}, 300 | 'expected_steps': STEP_DESCENDANTS_ONLY, 301 | 'expected_find_and_sort_results': [['type r']], 302 | }, 303 | {'word': 'truck', 304 | 'max_cost': 3, 305 | 'size': 5, 306 | 'expected_find_results': {0: [['truck']]}, 307 | 'expected_steps': STEP_DESCENDANTS_ONLY, 308 | 'expected_find_and_sort_results': [['truck']], 309 | }, 310 | {'word': 'trucks', 311 | 'max_cost': 3, 312 | 'size': 5, 313 | 'expected_find_results': {0: [['truck']]}, 314 | 'expected_steps': STEP_DESCENDANTS_ONLY, 315 | 'expected_find_and_sort_results': [['truck']], 316 | }, 317 | {'word': '1se', 318 | 'max_cost': 3, 319 | 'size': 5, 320 | 'expected_find_results': {1: [['1 series']]}, 321 | 'expected_steps': STEP_DESCENDANTS_ONLY, 322 | 'expected_find_and_sort_results': [['1 series']], 323 | }, 324 | ] 325 | 326 | 327 | SEARCH_CASES_PARAMS = parameterize_cases(SEARCH_CASES) 328 | 329 | 330 | class TestAutocompleteWithSynonyms: 331 | 332 | @pytest.mark.parametrize("word, max_cost, size, expected_find_results, expected_steps, expected_find_and_sort_results", SEARCH_CASES_PARAMS) 333 | def test_find(self, word, max_cost, size, expected_find_results, expected_steps, expected_find_and_sort_results): 334 | expected_results = expected_find_results 335 | auto_complete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS) 336 | results, find_steps = auto_complete._find(word, max_cost, size) 337 | results = dict(results) 338 | print_results(locals()) 339 | assert expected_results == results 340 | assert expected_steps == find_steps 341 | 342 | @pytest.mark.parametrize("word, max_cost, size, expected_find_results, expected_steps, expected_find_and_sort_results", SEARCH_CASES_PARAMS) 343 | def test__find_and_sort(self, word, max_cost, size, expected_find_results, expected_steps, expected_find_and_sort_results): 344 | expected_results = expected_find_and_sort_results 345 | auto_complete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS) 346 | results = auto_complete._find_and_sort(word, max_cost, size) 347 | results = list(results) 348 | search_results = auto_complete.search(word, max_cost, size) 349 | print_results(locals()) 350 | assert expected_results == results 351 | if word.strip(): 352 | assert expected_results == search_results 353 | else: 354 | assert [] == search_results 355 | 356 | @pytest.mark.parametrize("word", [ 357 | 'alf', 358 | ]) 359 | def test_immutable_info(self, word): 360 | auto_complete = AutoComplete(words=SHORT_WORDS, synonyms=SYNONYMS) 361 | auto_complete_immutable = AutoComplete(words=SHORT_WORDS_IMMUTABLE_INFO, synonyms=SYNONYMS) 362 | search_results = auto_complete._find(word, max_cost=3, size=3) 363 | search_results_immutable = auto_complete_immutable._find(word, max_cost=3, size=3) 364 | print_results(locals()) 365 | assert search_results_immutable == search_results 366 | 367 | 368 | class AutoCompleteWithSynonymsShort(DrawGraphMixin, AutoComplete): 
369 | pass 370 | 371 | 372 | class AutoCompleteWithSynonymsShortWithAnim(AutoCompleteWithSynonymsShort): 373 | 374 | DRAW_POPULATION_ANIMATION = True 375 | DRAW_POPULATION_ANIMATION_PATH = os.path.join(current_dir, 'animation/short_.svg') 376 | DRAW_POPULATION_ANIMATION_FILENO_PADDING = 6 377 | 378 | 379 | class TestAutoCompleteWithSynonymsShortGraphDraw: 380 | 381 | def test_draw_graph(self): 382 | auto_complete = AutoCompleteWithSynonymsShort(words=SHORT_WORDS) 383 | file_path = os.path.join(current_dir, 'AutoCompleteWithSynonymsShort_Graph.svg') 384 | auto_complete.draw_graph(file_path) 385 | 386 | def test_draw_graph_animation(self): 387 | AutoCompleteWithSynonymsShortWithAnim(words=SHORT_WORDS) 388 | 389 | 390 | class TestPrefixAndDescendants: 391 | 392 | @pytest.mark.parametrize("word, expected_matched_prefix_of_last_word, expected_rest_of_word, expected_matched_words, expected_node_path", [ 393 | ('2018 alpha blah blah', 'al', 'pha blah blah', ['2018'], 'a,l'), 394 | ('2018 alpha ', 'al', 'pha ', ['2018'], 'a,l'), 395 | ('2018 alfa', '', '', ['2018', 'alfa romeo'], 'a,l,f,a'), 396 | ('2018 alf', 'alf', '', ['2018'], 'a,l,f'), 397 | ('2018 alfa romeo', '', '', ['2018', 'alfa romeo'], 'a,l,f,a, ,r,o,m,e,o'), 398 | ('1 series bmw 2007 2018', '', '', ['1 series', 'bmw', '2007', '2018'], '2,0,1,8'), 399 | ('200 chrysler', '', '', ['200', 'chrysler'], 'c,h,r,y,s,l,e,r'), 400 | ('200 chrysler 200', '', '', ['200', 'chrysler 200'], 'c,h,r,y,s,l,e,r, ,2,0,0'), 401 | ('chrysler 2007', '', '', ['chrysler', '2007'], '2,0,0,7'), 402 | ('type r', '', '', ['type r'], 't,y,p,e, ,r'), 403 | ]) 404 | def test_prefix_autofill(self, word, expected_matched_prefix_of_last_word, 405 | expected_rest_of_word, expected_matched_words, expected_node_path): 406 | auto_complete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS) 407 | matched_prefix_of_last_word, rest_of_word, node, matched_words = auto_complete._prefix_autofill(word) 408 | print(f'word: {word}') 409 | print(f'expected_matched_prefix_of_last_word: {expected_matched_prefix_of_last_word}') 410 | print(f'matched_prefix_of_last_word: {matched_prefix_of_last_word}') 411 | print(f'expected_rest_of_word: {expected_rest_of_word}') 412 | print(f'rest_of_word: {rest_of_word}') 413 | print(f'node: {node}') 414 | print(f'expected_matched_words: {expected_matched_words}') 415 | print(f'matched_words: {matched_words}') 416 | expected_node = auto_complete._dwg 417 | for k in expected_node_path.split(','): 418 | expected_node = expected_node[k] 419 | assert expected_node is node 420 | assert expected_matched_prefix_of_last_word == matched_prefix_of_last_word 421 | assert expected_rest_of_word == rest_of_word 422 | assert expected_matched_words == matched_words 423 | 424 | @pytest.mark.parametrize("word, expected_results", [ 425 | ('2018 alpha ', ['alfa', 'alfa rl', 'alfa rm']), 426 | ('1 series bmw 2', ['bmw 2 series']), 427 | ('2018 alfa', ['alfa rl', 'alfa rm', 'alfa 33']), 428 | ]) 429 | def test_get_descendants_nodes(self, word, expected_results): 430 | auto_complete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS) 431 | matched_prefix_of_last_word, rest_of_word, node, matched_words = auto_complete._prefix_autofill(word) 432 | size = 2 433 | found_words_gen = node.get_descendants_nodes(size=size) 434 | found_words = [_node.word for _node in found_words_gen][:size + 1] 435 | print(f'word: {word}') 436 | print(f'expected_results: {expected_results}') 437 | print(f'found_words: {found_words}') 438 | assert expected_results == list(found_words) 439 
| 440 | @pytest.mark.parametrize("word, expected_results", [ 441 | ('r', ['rc', 'rx', 'rl', 'rm', 'r8', 'rav4', 'r107', 'r129', 'r170', 'r171', 'r230', 'r231', 'regal', 'royal', 'ridgeline']), 442 | ('benz', []), 443 | ]) 444 | def test_get_all_descendent_words_for_condition1(self, word, expected_results): 445 | auto_complete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS) 446 | 447 | def condition(word_info): 448 | return 'model' in word_info 449 | 450 | size = 10 451 | results = auto_complete.get_all_descendent_words_for_condition(word=word, size=size, condition=condition) 452 | print_results(locals()) 453 | # So by default we insert counts and that makes the size to be set to infinity. 454 | # I don't remember why. 455 | # This line fails then. Note that test_get_all_descendent_words_for_condition is only used in search tokenizer. 456 | # assert expected_results == results[:size + 1] 457 | 458 | 459 | class TestOther: 460 | 461 | @pytest.mark.parametrize("word, expected_results", [ 462 | ('bmw', ['bmw']), 463 | ('al', ['alfa romeo']), 464 | ]) 465 | def test_get_all_descendent_words_for_condition2(self, word, expected_results): 466 | auto_complete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS, full_stop_words=['bmw', 'alfa romeo']) 467 | 468 | results = auto_complete.get_tokens_flat_list(word, max_cost=0, size=3) 469 | print_results(locals()) 470 | assert expected_results == results 471 | 472 | @pytest.mark.parametrize("word, expected_results", [ 473 | ('bmw', {'make': 'bmw'}), 474 | ('bMw', {'make': 'bmw'}), 475 | ('al', None), 476 | ]) 477 | def test_get_word_context(self, word, expected_results): 478 | auto_complete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS, full_stop_words=['bmw', 'alfa romeo']) 479 | results = auto_complete.get_word_context(word) 480 | print_results(locals()) 481 | assert expected_results == results 482 | 483 | @pytest.mark.parametrize("word, update_dict, expected_results, expected_new_count", [ 484 | ('toyota a', None, [['toyota'], ['toyota avalon'], ['toyota aurion'], ['toyota auris']], None), 485 | ('toyota a', {'word': 'toyota aygo', 'count': 10000}, [['toyota'], ['toyota aygo'], ['toyota avalon'], ['toyota aurion']], 10000), 486 | ('toyota a', {'word': 'toyota aurion', 'offset': -6000}, [['toyota'], ['toyota avalon'], ['toyota auris'], ['toyota aygo']], 94), 487 | ]) 488 | def test_update_count_of_word(self, word, update_dict, expected_results, expected_new_count): 489 | auto_complete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS, full_stop_words=['bmw', 'alfa romeo']) 490 | if update_dict: 491 | new_count = auto_complete.update_count_of_word(**update_dict) 492 | assert expected_new_count == new_count 493 | assert expected_new_count == auto_complete.get_count_of_word(update_dict['word']) 494 | results = auto_complete.search(word, max_cost=2, size=4) 495 | print_results(locals()) 496 | assert expected_results == results 497 | -------------------------------------------------------------------------------- /fast_autocomplete/dwg.py: -------------------------------------------------------------------------------- 1 | from collections import ( 2 | defaultdict, 3 | deque 4 | ) 5 | from itertools import islice 6 | from enum import Enum 7 | from threading import Lock 8 | from fast_autocomplete.lfucache import LFUCache 9 | from fast_autocomplete.misc import _extend_and_repeat 10 | from fast_autocomplete.normalize import Normalizer 11 | 12 | # Prefer the 'Levenshtein' library implementation 13 | try: 14 | from Levenshtein 
import distance as levenshtein_distance 15 | except ImportError: 16 | try: 17 | from pylev import levenshtein as levenshtein_distance 18 | except ImportError: 19 | raise RuntimeError(""" 20 | Unable to import a levenshtein distance calculation module. 21 | Please add python-Levenshtein or pylev to your Python dependencies. 22 | 23 | Installing this package as 24 | 25 | pip install fast-autocomplete[levenshtein] 26 | 27 | or 28 | 29 | pip install fast-autocomplete[pylev] 30 | 31 | Note that fast-autocomplete[levenshtein] is preferred and is much faster than fast-autocomplete[pylev] 32 | """) 33 | 34 | DELIMITER = '__' 35 | ORIGINAL_KEY = 'original_key' 36 | INF = float('inf') 37 | 38 | 39 | class NodeNotFound(ValueError): 40 | pass 41 | 42 | 43 | class FindStep(Enum): 44 | start = 0 45 | descendants_only = 1 46 | fuzzy_try = 2 47 | fuzzy_found = 3 48 | rest_of_fuzzy_round2 = 4 49 | not_enough_results_add_some_descandants = 5 50 | 51 | 52 | class AutoComplete: 53 | 54 | CACHE_SIZE = 2048 55 | SHOULD_INCLUDE_COUNT = True 56 | 57 | def __init__( 58 | self, 59 | words, 60 | synonyms=None, 61 | full_stop_words=None, 62 | logger=None, 63 | valid_chars_for_string=None, 64 | valid_chars_for_integer=None, 65 | valid_chars_for_node_name=None, 66 | ): 67 | """ 68 | Initializes the Autocomplete module 69 | 70 | :param words: A dictionary of words mapped to their context 71 | :param synonyms: (optional) A dictionary of words to their synonyms. 72 | The synonym words should only be here and not repeated in words parameter. 73 | """ 74 | self._lock = Lock() 75 | self._dwg = None 76 | self._raw_synonyms = synonyms or {} 77 | self._lfu_cache = LFUCache(self.CACHE_SIZE) 78 | self._clean_synonyms, self._partial_synonyms = self._get_clean_and_partial_synonyms() 79 | self._reverse_synonyms = self._get_reverse_synonyms(self._clean_synonyms) 80 | self._full_stop_words = set(full_stop_words) if full_stop_words else None 81 | self.logger = logger 82 | self.words = words 83 | self.normalizer = Normalizer( 84 | valid_chars_for_string=valid_chars_for_string, 85 | valid_chars_for_integer=valid_chars_for_integer, 86 | valid_chars_for_node_name=valid_chars_for_node_name, 87 | ) 88 | new_words = self._get_partial_synonyms_to_words() 89 | self.words.update(new_words) 90 | self._populate_dwg() 91 | 92 | def _get_clean_and_partial_synonyms(self): 93 | """ 94 | Synonyms are words that should produce the same results. 95 | 96 | - For example `beemer` and `bmw` should both give you `bmw`. 97 | - `alfa` and `alfa romeo` should both give you `alfa romeo` 98 | 99 | The synonyms get divided into 2 groups: 100 | 101 | 1. clean synonyms: The 2 words share little or no words. For example `beemer` vs. `bmw`. 102 | 2. partial synonyms: One of the 2 words is a substring of the other one. For example `alfa` and `alfa romeo` or `gm` vs. `gmc`. 
103 | 104 | """ 105 | clean_synonyms = {} 106 | partial_synonyms = {} 107 | 108 | for key, synonyms in self._raw_synonyms.items(): 109 | key = key.strip().lower() 110 | _clean = [] 111 | _partial = [] 112 | for syn in synonyms: 113 | syn = syn.strip().lower() 114 | if key.startswith(syn): 115 | _partial.append(syn) 116 | else: 117 | _clean.append(syn) 118 | if _clean: 119 | clean_synonyms[key] = _clean 120 | if _partial: 121 | partial_synonyms[key] = _partial 122 | 123 | return clean_synonyms, partial_synonyms 124 | 125 | def _get_reverse_synonyms(self, synonyms): 126 | result = {} 127 | if synonyms: 128 | for key, value in synonyms.items(): 129 | for item in value: 130 | result[item] = key 131 | return result 132 | 133 | def _get_partial_synonyms_to_words(self): 134 | new_words = {} 135 | for key, value in self.words.items(): 136 | # data is mutable so we copy 137 | try: 138 | value = value.copy() 139 | # data must be named tuple 140 | except Exception: 141 | new_value = value._asdict() 142 | new_value[ORIGINAL_KEY] = key 143 | value = type(value)(**new_value) 144 | else: 145 | value[ORIGINAL_KEY] = key 146 | for syn_key, syns in self._partial_synonyms.items(): 147 | if key.startswith(syn_key): 148 | for syn in syns: 149 | new_key = key.replace(syn_key, syn) 150 | new_words[new_key] = value 151 | return new_words 152 | 153 | def _populate_dwg(self): 154 | if not self._dwg: 155 | with self._lock: 156 | if not self._dwg: 157 | self._dwg = _DawgNode() 158 | for word, value in self.words.items(): 159 | original_key = value.get(ORIGINAL_KEY) 160 | # word = word.strip().lower() 161 | count = value.get('count', 0) 162 | leaf_node = self.insert_word_branch( 163 | word, 164 | original_key=original_key, 165 | count=count 166 | ) 167 | if leaf_node and self._clean_synonyms: 168 | for synonym in self._clean_synonyms.get(word, []): 169 | self.insert_word_branch( 170 | synonym, 171 | leaf_node=leaf_node, 172 | add_word=False, 173 | count=count 174 | ) 175 | 176 | def insert_word_callback(self, word): 177 | """ 178 | Once word is inserted, run this. 179 | """ 180 | pass 181 | 182 | def insert_word_branch(self, word, leaf_node=None, add_word=True, original_key=None, count=0): 183 | """ 184 | Inserts a word into the Dawg. 185 | 186 | :param word: The word to be inserted as a branch of dwg 187 | :param leaf_node: (optional) The leaf node for the node to merge into in the dwg. 188 | :param add_word: (Boolean, default: True) Add the word itself at the end of the branch. 189 | Usually this is set to False if we are merging into a leaf node and do not 190 | want to add the actual word there. 191 | :param original_key: If the word that is being added was originally another word. 192 | For example with synonyms, you might be inserting the word `beemer` but the 193 | original key is `bmw`. This parameter might be removed in the future. 
194 | 195 | """ 196 | # if word == 'u (2 off)': 197 | # import pytest; pytest.set_trace() 198 | normalized_word = self.normalizer.normalize_node_name(word) 199 | # sometimes if the word does not have any valid characters, the normalized_word will be empty 200 | if not normalized_word: 201 | return 202 | last_char = normalized_word[-1] 203 | 204 | if leaf_node: 205 | temp_leaf_node = self._dwg.insert( 206 | word=word, 207 | normalized_word=normalized_word[:-1], 208 | add_word=add_word, 209 | original_key=original_key, 210 | count=count, 211 | insert_count=self.SHOULD_INCLUDE_COUNT 212 | ) 213 | # It already has children 214 | if temp_leaf_node.children and last_char in temp_leaf_node.children: 215 | temp_leaf_node.children[last_char].word = leaf_node.word 216 | # otherwise merge into the leaf node 217 | else: 218 | temp_leaf_node.children[last_char] = leaf_node 219 | else: 220 | leaf_node = self._dwg.insert( 221 | word=word, 222 | normalized_word=normalized_word, 223 | original_key=original_key, 224 | count=count, 225 | insert_count=self.SHOULD_INCLUDE_COUNT 226 | ) 227 | self.insert_word_callback(word) 228 | return leaf_node 229 | 230 | def _find_and_sort(self, word, max_cost, size): 231 | output_keys_set = set() 232 | results, find_steps = self._find(word, max_cost, size) 233 | results_keys = list(results.keys()) 234 | results_keys.sort() 235 | for key in results_keys: 236 | for output_items in results[key]: 237 | for i, item in enumerate(output_items): 238 | reversed_item = self._reverse_synonyms.get(item) 239 | if reversed_item: 240 | output_items[i] = reversed_item 241 | elif item not in self.words: 242 | output_items[i] = item 243 | output_items_str = DELIMITER.join(output_items) 244 | if output_items and output_items_str not in output_keys_set: 245 | output_keys_set.add(output_items_str) 246 | yield output_items 247 | if len(output_keys_set) >= size: 248 | return 249 | 250 | def get_tokens_flat_list(self, word, max_cost=3, size=10): 251 | """ 252 | Gets a flat list of tokens. 253 | This requires the original search function from this class to be run, 254 | instead of subclasses of AutoComplete. 
255 | """ 256 | result = AutoComplete.search(self, word, max_cost=max_cost, size=size) 257 | return [item for sublist in result for item in sublist] 258 | 259 | def get_word_context(self, word): 260 | """ 261 | Gets the word's context from the words dictionary 262 | """ 263 | word = self.normalizer.normalize_node_name(word) 264 | return self.words.get(word) 265 | 266 | def search(self, word, max_cost=2, size=5): 267 | """ 268 | parameters: 269 | - word: the word to return autocomplete results for 270 | - max_cost: Maximum Levenshtein edit distance to be considered when calculating results 271 | - size: The max number of results to return 272 | """ 273 | word = self.normalizer.normalize_node_name(word) 274 | if not word: 275 | return [] 276 | key = f'{word}-{max_cost}-{size}' 277 | result = self._lfu_cache.get(key) 278 | if result == -1: 279 | result = list(self._find_and_sort(word, max_cost, size)) 280 | self._lfu_cache.set(key, result) 281 | return result 282 | 283 | @staticmethod 284 | def _len_results(results): 285 | return sum(map(len, results.values())) 286 | 287 | @staticmethod 288 | def _is_enough_results(results, size): 289 | return AutoComplete._len_results(results) >= size 290 | 291 | def _is_stop_word_condition(self, matched_words, matched_prefix_of_last_word): 292 | return (self._full_stop_words and matched_words and matched_words[-1] in self._full_stop_words and not matched_prefix_of_last_word) 293 | 294 | def _find(self, word, max_cost, size, call_count=0): 295 | """ 296 | The search function returns a list of all words that are less than the given 297 | maximum distance from the target word 298 | """ 299 | results = defaultdict(list) 300 | fuzzy_matches = defaultdict(list) 301 | rest_of_results = {} 302 | fuzzy_matches_len = 0 303 | 304 | fuzzy_min_distance = min_distance = INF 305 | matched_prefix_of_last_word, rest_of_word, new_node, matched_words = self._prefix_autofill(word=word) 306 | 307 | last_word = matched_prefix_of_last_word + rest_of_word 308 | 309 | if matched_words: 310 | results[0] = [matched_words.copy()] 311 | min_distance = 0 312 | # under certain condition with finding full stop words, do not bother with finding more matches 313 | if self._is_stop_word_condition(matched_words, matched_prefix_of_last_word): 314 | find_steps = [FindStep.start] 315 | return results, find_steps 316 | if len(rest_of_word) < 3: 317 | find_steps = [FindStep.descendants_only] 318 | self._add_descendants_words_to_results(node=new_node, size=size, matched_words=matched_words, results=results, distance=1) 319 | else: 320 | find_steps = [FindStep.fuzzy_try] 321 | word_chunks = deque(filter(lambda x: x, last_word.split(' '))) 322 | new_word = word_chunks.popleft() 323 | 324 | # TODO: experiment with the number here 325 | # 'in los angeles' gets cut into `in los` so it becomes a closer match to `in lodi` 326 | # but if the number was bigger, we could have matched with `in los angeles` 327 | while len(new_word) < 5 and word_chunks: 328 | new_word = f'{new_word} {word_chunks.popleft()}' 329 | fuzzy_rest_of_word = ' '.join(word_chunks) 330 | 331 | for _word in self.words: 332 | if abs(len(_word) - len(new_word)) > max_cost: 333 | continue 334 | dist = levenshtein_distance(new_word, _word) 335 | if dist < max_cost: 336 | fuzzy_matches_len += 1 337 | _value = self.words[_word].get(ORIGINAL_KEY, _word) 338 | fuzzy_matches[dist].append(_value) 339 | fuzzy_min_distance = min(fuzzy_min_distance, dist) 340 | if fuzzy_matches_len >= size or dist < 2: 341 | break 342 | if fuzzy_matches_len: 343 | 
find_steps.append(FindStep.fuzzy_found) 344 | if fuzzy_rest_of_word: 345 | call_count += 1 346 | if call_count < 2: 347 | rest_of_results, rest_find_steps = self._find(word=fuzzy_rest_of_word, max_cost=max_cost, size=size, call_count=call_count) 348 | find_steps.append({FindStep.rest_of_fuzzy_round2: rest_find_steps}) 349 | for _word in fuzzy_matches[fuzzy_min_distance]: 350 | if rest_of_results: 351 | rest_of_results_min_key = min(rest_of_results.keys()) 352 | for _rest_of_matched_word in rest_of_results[rest_of_results_min_key]: 353 | results[fuzzy_min_distance].append(matched_words + [_word] + _rest_of_matched_word) 354 | else: 355 | results[fuzzy_min_distance].append(matched_words + [_word]) 356 | _matched_prefix_of_last_word_b, not_used_rest_of_word, fuzzy_new_node, _matched_words_b = self._prefix_autofill(word=_word) 357 | if self._is_stop_word_condition(matched_words=_matched_words_b, matched_prefix_of_last_word=_matched_prefix_of_last_word_b): 358 | break 359 | self._add_descendants_words_to_results(node=fuzzy_new_node, size=size, matched_words=matched_words, results=results, distance=fuzzy_min_distance) 360 | 361 | if matched_words and not self._is_enough_results(results, size): 362 | find_steps.append(FindStep.not_enough_results_add_some_descandants) 363 | total_min_distance = min(min_distance, fuzzy_min_distance) 364 | self._add_descendants_words_to_results(node=new_node, size=size, matched_words=matched_words, results=results, distance=total_min_distance+1) 365 | 366 | return results, find_steps 367 | 368 | def _prefix_autofill(self, word, node=None): 369 | len_prev_rest_of_last_word = INF 370 | matched_words = [] 371 | matched_words_set = set() 372 | 373 | def _add_words(words): 374 | is_added = False 375 | for word in words: 376 | if word not in matched_words_set: 377 | matched_words.append(word) 378 | matched_words_set.add(word) 379 | is_added = True 380 | return is_added 381 | 382 | matched_prefix_of_last_word, rest_of_word, node, matched_words_part, matched_condition_ever, matched_condition_in_branch = self._prefix_autofill_part(word, node) 383 | _add_words(matched_words_part) 384 | result = (matched_prefix_of_last_word, rest_of_word, node, matched_words) 385 | len_rest_of_last_word = len(rest_of_word) 386 | 387 | while len_rest_of_last_word and len_rest_of_last_word < len_prev_rest_of_last_word: 388 | word = matched_prefix_of_last_word + rest_of_word 389 | word = word.strip() 390 | len_prev_rest_of_last_word = len_rest_of_last_word 391 | matched_prefix_of_last_word, rest_of_word, node, matched_words_part, matched_condition_ever, matched_condition_in_branch = self._prefix_autofill_part(word, node=self._dwg, matched_condition_ever=matched_condition_ever, matched_condition_in_branch=matched_condition_in_branch) 392 | is_added = _add_words(matched_words_part) 393 | if is_added is False: 394 | break 395 | len_rest_of_last_word = len(rest_of_word) 396 | result = (matched_prefix_of_last_word, rest_of_word, node, matched_words) 397 | 398 | return result 399 | 400 | def prefix_autofill_part_condition(self, node): 401 | pass 402 | 403 | PREFIX_AUTOFILL_PART_CONDITION_SUFFIX = '' 404 | 405 | def _add_to_matched_words(self, node, matched_words, matched_condition_in_branch, matched_condition_ever, matched_prefix_of_last_word): 406 | if matched_words: 407 | last_matched_word = matched_words[-1].replace(self.PREFIX_AUTOFILL_PART_CONDITION_SUFFIX, '') 408 | if node.value.startswith(last_matched_word): 409 | matched_words.pop() 410 | value = node.value 411 | if 
self.PREFIX_AUTOFILL_PART_CONDITION_SUFFIX: 412 | if self._node_word_info_matches_condition(node, self.prefix_autofill_part_condition): 413 | matched_condition_in_branch = True 414 | if matched_condition_ever and matched_prefix_of_last_word: 415 | value = f"{matched_prefix_of_last_word}{self.PREFIX_AUTOFILL_PART_CONDITION_SUFFIX}" 416 | matched_words.append(value) 417 | return matched_words, matched_condition_in_branch 418 | 419 | def _prefix_autofill_part(self, word, node=None, matched_condition_ever=False, matched_condition_in_branch=False): 420 | node = node or self._dwg 421 | que = deque(word) 422 | 423 | matched_prefix_of_last_word = '' 424 | matched_words = [] 425 | nodes_that_words_were_extracted = set() 426 | 427 | while que: 428 | char = que.popleft() 429 | 430 | if node.children: 431 | if char not in node.children: 432 | space_child = node.children.get(' ') 433 | if space_child and char in space_child.children: 434 | node = space_child 435 | else: 436 | que.appendleft(char) 437 | break 438 | node = node.children[char] 439 | if char != ' ' or matched_prefix_of_last_word: 440 | matched_prefix_of_last_word += char 441 | if node.word: 442 | if que: 443 | next_char = que[0] 444 | if next_char != ' ': 445 | continue 446 | matched_words, matched_condition_in_branch = self._add_to_matched_words(node, matched_words, matched_condition_in_branch, matched_condition_ever, matched_prefix_of_last_word) 447 | nodes_that_words_were_extracted.add(node) 448 | matched_prefix_of_last_word = '' 449 | else: 450 | if char == ' ': 451 | node = self._dwg 452 | if matched_condition_in_branch: 453 | matched_condition_ever = True 454 | else: 455 | que.appendleft(char) 456 | break 457 | 458 | if not que and node.word and node not in nodes_that_words_were_extracted: 459 | matched_words, matched_condition_in_branch = self._add_to_matched_words(node, matched_words, matched_condition_in_branch, matched_condition_ever, matched_prefix_of_last_word) 460 | matched_prefix_of_last_word = '' 461 | 462 | rest_of_word = "".join(que) 463 | if matched_condition_in_branch: 464 | matched_condition_ever = True 465 | 466 | return matched_prefix_of_last_word, rest_of_word, node, matched_words, matched_condition_ever, matched_condition_in_branch 467 | 468 | def _add_descendants_words_to_results(self, node, size, matched_words, results, distance, should_traverse=True): 469 | descendant_words = list(node.get_descendants_words(size, should_traverse, full_stop_words=self._full_stop_words)) 470 | extended = _extend_and_repeat(matched_words, descendant_words) 471 | if extended: 472 | results[distance].extend(extended) 473 | return distance 474 | 475 | def _node_word_info_matches_condition(self, node, condition): 476 | _word = node.word 477 | word_info = self.words.get(_word) 478 | if word_info: 479 | return condition(word_info) 480 | else: 481 | return False 482 | 483 | def get_all_descendent_words_for_condition(self, word, size, condition): 484 | """ 485 | This is used in the search tokenizer not in the fast autocomplete itself. 
486 |         """
487 |         new_tokens = []
488 | 
489 |         matched_prefix_of_last_word, rest_of_word, node, matched_words_part, matched_condition_ever, matched_condition_in_branch = self._prefix_autofill_part(word=word)
490 |         if not rest_of_word and self._node_word_info_matches_condition(node, condition):
491 |             found_nodes_gen = node.get_descendants_nodes(size, insert_count=self.SHOULD_INCLUDE_COUNT)
492 |             for node in found_nodes_gen:
493 |                 if self._node_word_info_matches_condition(node, condition):
494 |                     new_tokens.append(node.word)
495 |         return new_tokens
496 | 
497 |     def update_count_of_word(self, word, count=None, offset=None):
498 |         """
499 |         Update the count attribute of a node in the dwg. This only affects the autocomplete
500 |         object and not the original count of the node in the data that was fed into fast_autocomplete.
501 |         """
502 |         matched_prefix_of_last_word, rest_of_word, node, matched_words_part, matched_condition_ever, matched_condition_in_branch = self._prefix_autofill_part(word=word)
503 |         if node:
504 |             if offset:
505 |                 with self._lock:
506 |                     node.count += offset
507 |             elif count:
508 |                 with self._lock:
509 |                     node.count = count
510 |         else:
511 |             raise NodeNotFound(f'Unable to find a node for word {word}')
512 |         return node.count
513 | 
514 |     def get_count_of_word(self, word):
515 |         return self.update_count_of_word(word)
516 | 
517 | 
518 | class _DawgNode:
519 |     """
520 |     The Dawg data structure keeps a set of words, organized with one node for
521 |     each letter. Each node has a branch for each letter that may follow it in the
522 |     set of words.
523 |     """
524 | 
525 |     __slots__ = ("word", "original_key", "children", "count")
526 | 
527 |     def __init__(self):
528 |         self.word = None
529 |         self.original_key = None
530 |         self.children = {}
531 |         self.count = 0
532 | 
533 |     def __getitem__(self, key):
534 |         return self.children[key]
535 | 
536 |     def __repr__(self):
537 |         return f'<_DawgNode word={self.word} count={self.count}>'  # repr body reconstructed; the original angle-bracket text was lost in extraction
538 | 
539 |     @property
540 |     def value(self):
541 |         return self.original_key or self.word
542 | 
543 |     def insert(self, word, normalized_word, add_word=True, original_key=None, count=0, insert_count=True):
544 |         node = self
545 |         for letter in normalized_word:
546 |             if letter not in node.children:
547 |                 node.children[letter] = _DawgNode()
548 | 
549 |             node = node.children[letter]
550 | 
551 |         if add_word:
552 |             node.word = word
553 |             node.original_key = original_key
554 |             if insert_count:
555 |                 node.count = int(count)  # converts any str to int
556 |         return node
557 | 
558 |     def get_descendants_nodes(self, size, should_traverse=True, full_stop_words=None, insert_count=True):
559 |         if insert_count is True:
560 |             size = INF
561 | 
562 |         que = deque()
563 |         unique_nodes = {self}
564 |         found_nodes_set = set()
565 |         full_stop_words = full_stop_words if full_stop_words else set()
566 | 
567 |         for letter, child_node in self.children.items():
568 |             if child_node not in unique_nodes:
569 |                 unique_nodes.add(child_node)
570 |                 que.append((letter, child_node))
571 | 
572 |         while que:
573 |             letter, child_node = que.popleft()
574 |             child_value = child_node.value
575 |             if child_value:
576 |                 if child_value in full_stop_words:
577 |                     should_traverse = False
578 |                 if child_value not in found_nodes_set:
579 |                     found_nodes_set.add(child_value)
580 |                     yield child_node
581 |                     if len(found_nodes_set) > size:
582 |                         break
583 | 
584 |             if should_traverse:
585 |                 for letter, grand_child_node in child_node.children.items():
586 |                     if grand_child_node not in unique_nodes:
587 |                         unique_nodes.add(grand_child_node)
588 |                         que.append((letter, grand_child_node))
589 | 
590 | def get_descendants_words( 591 | self, size, should_traverse=True, full_stop_words=None, insert_count=True): 592 | found_nodes_gen = self.get_descendants_nodes( 593 | size, 594 | should_traverse=should_traverse, 595 | full_stop_words=full_stop_words, 596 | insert_count=insert_count 597 | ) 598 | 599 | if insert_count is True: 600 | found_nodes = sorted( 601 | found_nodes_gen, 602 | key=lambda node: node.count, 603 | reverse=True 604 | )[:size + 1] 605 | else: 606 | found_nodes = islice(found_nodes_gen, size) 607 | 608 | return map(lambda word: word.value, found_nodes) 609 | -------------------------------------------------------------------------------- /tests/AutoCompleteWithSynonymsShort_Graph.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 9 | 10 | 11 | 12 | 13 | .0 14 | 15 | 16 | 17 | 18 | .1 19 | 20 | 21 | 22 | 23 | .0->.1 24 | 25 | 26 | a 27 | 28 | 29 | 30 | 4c coupe 31 | 32 | 4c coupe 33 | 34 | 35 | 36 | .0->4c coupe 37 | 38 | 39 | 4 40 | 41 | 42 | 43 | .3 44 | 45 | 46 | 47 | 48 | .0->.3 49 | 50 | 51 | g 52 | 53 | 54 | 55 | .4 56 | 57 | 58 | 59 | 60 | .0->.4 61 | 62 | 63 | b 64 | 65 | 66 | 67 | .5 68 | 69 | 70 | 71 | 72 | .0->.5 73 | 74 | 75 | 1 76 | 77 | 78 | 79 | .6 80 | 81 | 82 | 83 | 84 | .0->.6 85 | 86 | 87 | 2 88 | 89 | 90 | 91 | .7 92 | 93 | 94 | 95 | 96 | .0->.7 97 | 98 | 99 | t 100 | 101 | 102 | 103 | .8 104 | 105 | 106 | 107 | 108 | .1->.8 109 | 110 | 111 | c 112 | 113 | 114 | 115 | .9 116 | 117 | 118 | 119 | 120 | .1->.9 121 | 122 | 123 | l 124 | 125 | 126 | 127 | .10 128 | 129 | 130 | 131 | 132 | 4c coupe->.10 133 | 134 | 135 | ' ' 136 | 137 | 138 | 139 | .11 140 | 141 | 142 | 143 | 144 | .3->.11 145 | 146 | 147 | i 148 | 149 | 150 | 151 | .12 152 | 153 | 154 | 155 | 156 | .4->.12 157 | 158 | 159 | m 160 | 161 | 162 | 163 | .13 164 | 165 | 166 | 167 | 168 | .5->.13 169 | 170 | 171 | ' ' 172 | 173 | 174 | 175 | .14 176 | 177 | 178 | 179 | 180 | .6->.14 181 | 182 | 183 | ' ' 184 | 185 | 186 | 187 | .15 188 | 189 | 190 | 191 | 192 | .6->.15 193 | 194 | 195 | 0 196 | 197 | 198 | 199 | .16 200 | 201 | 202 | 203 | 204 | .7->.16 205 | 206 | 207 | r 208 | 209 | 210 | 211 | root 212 | 213 | root 214 | 215 | 216 | 217 | root->.0 218 | 219 | 220 | 221 | 222 | 223 | .17 224 | 225 | 226 | 227 | 228 | .8->.17 229 | 230 | 231 | u 232 | 233 | 234 | 235 | .18 236 | 237 | 238 | 239 | 240 | .9->.18 241 | 242 | 243 | f 244 | 245 | 246 | 247 | 4c 248 | 249 | 4c 250 | 251 | 252 | 253 | .10->4c 254 | 255 | 256 | c 257 | 258 | 259 | 260 | .20 261 | 262 | 263 | 264 | 265 | .11->.20 266 | 267 | 268 | u 269 | 270 | 271 | 272 | bmw 273 | 274 | bmw 275 | 276 | 277 | 278 | .12->bmw 279 | 280 | 281 | w 282 | 283 | 284 | 285 | .22 286 | 287 | 288 | 289 | 290 | .13->.22 291 | 292 | 293 | s 294 | 295 | 296 | 297 | .23 298 | 299 | 300 | 301 | 302 | .14->.23 303 | 304 | 305 | s 306 | 307 | 308 | 309 | .24 310 | 311 | 312 | 313 | 314 | .15->.24 315 | 316 | 317 | 0 318 | 319 | 320 | 321 | .25 322 | 323 | 324 | 325 | 326 | .15->.25 327 | 328 | 329 | 1 330 | 331 | 332 | 333 | .26 334 | 335 | 336 | 337 | 338 | .16->.26 339 | 340 | 341 | u 342 | 343 | 344 | 345 | .27 346 | 347 | 348 | 349 | 350 | .17->.27 351 | 352 | 353 | r 354 | 355 | 356 | 357 | alfa 358 | 359 | alfa 360 | 361 | 362 | 363 | .18->alfa 364 | 365 | 366 | a 367 | 368 | 369 | 370 | .29 371 | 372 | 373 | 374 | 375 | .20->.29 376 | 377 | 378 | l 379 | 380 | 381 | 382 | .30 383 | 384 | 385 | 386 | 387 | bmw->.30 388 | 389 | 390 | ' ' 391 | 392 | 393 | 394 | .31 395 | 396 | 397 | 398 | 
/tests/AutoCompleteWithSynonymsShort_Graph.svg:
--------------------------------------------------------------------------------
[SVG source omitted: a Graphviz-generated rendering of the autocomplete DAWG
built from the short makes/models fixture. Edges are labeled with single
characters (including ' ' at word boundaries), and terminal nodes carry the
completed words, e.g. "acura zdx", "alfa romeo 4c coupe", "bmw 1 series",
"giulia", "truck", "2007", "2017", and "2018".]
--------------------------------------------------------------------------------
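[Editor's note: update_count_of_word and get_count_of_word from dwg.py are
exposed on the public AutoComplete object. A hedged sketch follows; the words
and counts are invented, and it assumes the 'count' key in each word's context
dict is what seeds the node counts at load time.]

    from fast_autocomplete import AutoComplete

    words = {'bmw': {'count': 10}, 'benz': {'count': 2}}
    autocomplete = AutoComplete(words=words)

    # Nudge a count by an offset, or set it outright. Calling
    # update_count_of_word with neither argument just reads the count,
    # which is exactly what get_count_of_word does.
    autocomplete.update_count_of_word(word='bmw', offset=5)
    autocomplete.update_count_of_word(word='benz', count=50)
    print(autocomplete.get_count_of_word('bmw'))   # -> 15
    print(autocomplete.get_count_of_word('benz'))  # -> 50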