├── tests
│   ├── fixtures
│   │   ├── __init__.py
│   │   ├── synonyms.json
│   │   ├── makes_models_in_farsi_short.csv
│   │   ├── makes_models_short.csv
│   │   ├── sample_words.json
│   │   └── makes_models_from_wikipedia.csv
│   ├── animation
│   │   └── short.gif
│   ├── conftest.py
│   ├── test_misc.py
│   ├── test_normalize.py
│   ├── test_loader.py
│   ├── test_lfucache.py
│   ├── test_autocomplete.py
│   └── AutoCompleteWithSynonymsShort_Graph.svg
├── .coveragerc
├── .github
│   ├── FUNDING.yml
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.md
│   │   └── feature_request.md
│   └── workflows
│       └── main.yaml
├── AUTHORS
├── requirements-dev.txt
├── setup.cfg
├── fast_autocomplete
│   ├── __init__.py
│   ├── demo.py
│   ├── normalize.py
│   ├── misc.py
│   ├── draw.py
│   ├── loader.py
│   ├── lfucache.py
│   └── dwg.py
├── LICENSE
├── setup.py
├── .gitignore
└── README.md
/tests/fixtures/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | source =
3 |     fast_autocomplete
4 |
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: [seperman]
2 | ko_fi: seperman
3 |
--------------------------------------------------------------------------------
/tests/animation/short.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/seperman/fast-autocomplete/HEAD/tests/animation/short.gif
--------------------------------------------------------------------------------
/AUTHORS:
--------------------------------------------------------------------------------
1 | Authors:
2 | - Autocomplete by Sep Dehpour (zepworks.com)
3 | - LFU Cache by Shane Wang (medium.com/@epicshane)
4 |
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | bump2version==1.0.1
2 | click>=8.0.3
3 | deepdiff==5.5.0
4 | flake8==4.0.1
5 | pygraphviz==1.7
6 | pytest==6.2.5
7 | pytest_cov==3.0.0
8 |
--------------------------------------------------------------------------------
/tests/fixtures/synonyms.json:
--------------------------------------------------------------------------------
1 | {
2 | "alfa romeo": ["alfa"],
3 | "bmw": ["beemer", "bimmer"],
4 | "mercedes-benz": ["mercedes", "benz"],
5 | "volkswagen": ["vw"],
6 | "truck": ["trucks"]
7 | }
8 |
--------------------------------------------------------------------------------
/tests/fixtures/makes_models_in_farsi_short.csv:
--------------------------------------------------------------------------------
1 | make,model
2 | آکیورا,zdx
3 | آلفا,4c
4 | آلفا,4c coupe
5 | آلفا,giulia
6 | بی ام و,1 series
7 | بی ام و,2 series
8 | 2007,2007
9 | 2017,2017
10 | 2018,2018
11 |
--------------------------------------------------------------------------------
/tests/fixtures/makes_models_short.csv:
--------------------------------------------------------------------------------
1 | make,model
2 | acura,zdx
3 | alfa romeo,4c
4 | alfa romeo,4c coupe
5 | alfa romeo,giulia
6 | bmw,1 series
7 | bmw,2 series
8 | 2007,2007
9 | 2017,2017
10 | 2018,2018
11 |
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | current_file = os.path.dirname(__file__)
5 | path1 = os.path.abspath(os.path.join(current_file, '..'))
6 | path2 = os.path.abspath(os.path.join(current_file, 'tests'))
7 | sys.path.append(path1) # noqa
8 | sys.path.append(path2) # noqa
9 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bumpversion]
2 | current_version = 0.9.0
3 | commit = True
4 | tag = True
5 | tag_name = {new_version}
6 |
7 | [bumpversion:file:setup.py]
8 |
9 | [bumpversion:file:README.md]
10 |
11 | [flake8]
12 | max-line-length = 120
13 | builtins = json
14 | statistics = true
15 | ignore = E202
16 | exclude = ./data,./src,./tests,.svn,CVS,.bzr,.hg,.git,__pycache__,./venv
17 |
--------------------------------------------------------------------------------
/fast_autocomplete/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | import sys
3 | import pkg_resources
4 |
5 | if (sys.version_info[0], sys.version_info[1]) < (3, 6):
6 | sys.exit('fast-autocomplete requires Python 3.6 or later.')
7 |
8 | __version__ = pkg_resources.get_distribution("fast-autocomplete").version
9 |
10 | from fast_autocomplete.dwg import AutoComplete
11 | from fast_autocomplete.draw import DrawGraphMixin
12 | from fast_autocomplete.demo import demo
13 | from fast_autocomplete.loader import autocomplete_factory
14 | from fast_autocomplete.normalize import Normalizer
15 |
--------------------------------------------------------------------------------
/tests/test_misc.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from fast_autocomplete.misc import _extend_and_repeat
3 |
4 |
5 | class TestMisc:
6 |
7 | @pytest.mark.parametrize("list1, list2, expected_result", [
8 | (['a', 'b'], ['c', 'd'], [['a', 'b', 'c'], ['a', 'b', 'd']]),
9 | (['a', 'b'], ['a', 'd'], [['a', 'b', 'd']]),
10 | (['a', 'b'], ['b model2', 'd'], [['a', 'b model2'], ['a', 'b', 'd']]),
11 | ([], ['c', 'd'], [['c'], ['d']]),
12 | ])
13 | def test_extend_and_repeat(self, list1, list2, expected_result):
14 | result = _extend_and_repeat(list1, list2)
15 | assert expected_result == result
16 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior
15 |
16 | **Expected behavior**
17 | A clear and concise description of what you expected to happen.
18 |
19 | **OS, Fast Autocomplete version and Python version (please complete the following information):**
20 | - OS: [e.g. Ubuntu]
21 | - Version [e.g. 20LTS]
22 |
23 | **Additional context**
24 | Add any other context about the problem here.
25 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2018 - 2019 Fair Financial Corp
4 | Copyright (c) 2020 - 2021 Sep Dehpour
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining a copy
7 | of this software and associated documentation files (the "Software"), to deal
8 | in the Software without restriction, including without limitation the rights
9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 |
24 |
--------------------------------------------------------------------------------
/fast_autocomplete/demo.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from pprint import pprint
3 | from fast_autocomplete.misc import read_single_keypress, termios
4 |
5 |
6 | def demo(running_modules, max_cost, size):
7 | """
8 |     Takes one or more AutoComplete instances that already have data in them and lets you run searches against them in real time.
9 | """
10 |
11 | word_list = []
12 |
13 | running_modules = running_modules if isinstance(running_modules, dict) else {running_modules.__class__.__name__: running_modules}
14 |
15 | if termios is None:
16 | sys.exit('termios and/or fcntl packages are not available in your system. This is possibly because you are not on a Linux Distro.')
17 |
18 | print('FAST AUTOCOMPLETE DEMO')
19 |     print('Start typing to search. Press Ctrl+C to exit')
20 |
21 | while True:
22 | pressed = read_single_keypress()
23 | if pressed == '\x7f':
24 | if word_list:
25 | word_list.pop()
26 | elif pressed == '\x03':
27 | break
28 | else:
29 | word_list.append(pressed)
30 |
31 | joined = ''.join(word_list)
32 | print(chr(27) + "[2J")
33 | print(joined)
34 | results = {}
35 | for module_name, module in running_modules.items():
36 | results[module_name] = module.search(word=joined, max_cost=max_cost, size=size)
37 | pprint(results)
38 | print('')
39 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | version = '0.9.0'
4 |
5 |
6 | try:
7 | with open('README.md') as file:
8 | long_description = file.read()
9 | except Exception:
10 | long_description = "Autocomplete"
11 |
12 | setup(
13 | name='fast-autocomplete',
14 | description='Fast Autocomplete using Directed Word Graph',
15 | long_description=long_description,
16 | long_description_content_type='text/markdown',
17 | author='Sep Dehpour',
18 | url='https://github.com/seperman/fast-autocomplete',
19 | author_email='sep@zepworks.com',
20 | version=version,
21 | install_requires=[],
22 | extras_require={
23 | 'levenshtein': ['python-Levenshtein>=0.12.2'],
24 | 'pylev': ['pylev>=1.4.0'],
25 | },
26 | dependency_links=[],
27 | packages=find_packages(exclude=('tests', 'docs')),
28 | include_package_data=True,
29 | scripts=[],
30 | test_suite="tests",
31 | tests_require=['mock'],
32 | license='MIT',
33 | classifiers=[
34 | "Intended Audience :: Developers",
35 | "Operating System :: OS Independent",
36 | "Topic :: Software Development",
37 | "Programming Language :: Python :: 3.6",
38 | "Programming Language :: Python :: 3.7",
39 | "Programming Language :: Python :: 3.8",
40 | "Programming Language :: Python :: 3.9",
41 | "Development Status :: 4 - Beta",
42 | ]
43 | )
44 |
--------------------------------------------------------------------------------
/tests/test_normalize.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from fast_autocomplete.normalize import Normalizer
3 |
4 | normalizer = Normalizer()
5 | normalizer_unicode = Normalizer(
6 | valid_chars_for_string='زرتپبا'
7 | )
8 |
9 |
10 | class TestNormalizer:
11 |
12 | @pytest.mark.parametrize("name, expected_result", [
13 | ('type-r', 'type-r'),
14 | ('HONDA and Toyota!', 'honda and toyota'),
15 | (r'bmw? \#1', 'bmw 1'),
16 | (r'bmw? \#', 'bmw'),
17 | (None, ''),
18 | ])
19 | def test_remove_any_special_character(self, name, expected_result):
20 | result = normalizer.remove_any_special_character(name)
21 | assert expected_result == result
22 |
23 | @pytest.mark.parametrize("name, extra_chars, expected_result", [
24 | ('type-r', None, 'type r'),
25 | ('HONDA and Toyota!', None, 'honda and toyota'),
26 | (r'bmw? \#1', None, 'bmw 1'),
27 | (r'bmw? \#', None, 'bmw'),
28 | (r'bmw? \#', {'#'}, 'bmw #'),
29 | (None, None, ''),
30 | ])
31 | def test_normalize_node_name(self, name, extra_chars, expected_result):
32 | result = normalizer.normalize_node_name(name, extra_chars=extra_chars)
33 | assert expected_result == result
34 |
35 | @pytest.mark.parametrize("name, extra_chars, expected_result", [
36 | ('درپب', None, 'رپب'),
37 | ])
38 | def test_normalize_unicode_node_name(self, name, extra_chars, expected_result):
39 | result = normalizer_unicode.normalize_node_name(name, extra_chars=extra_chars)
40 | assert expected_result == result
41 |
--------------------------------------------------------------------------------
/tests/test_loader.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | from fast_autocomplete import autocomplete_factory, AutoComplete
4 | from fast_autocomplete.loader import WordValue
5 |
6 | current_dir = os.path.dirname(os.path.abspath(__file__))
7 | fixture_dir = os.path.join(current_dir, 'fixtures')
8 |
9 | content_files = {
10 | 'words': {
11 | 'filepath': os.path.join(fixture_dir, 'sample_words.json'),
12 | 'compress': True # means compress the graph data in memory
13 | }
14 | }
15 |
16 | autocomplete = autocomplete_factory(content_files=content_files)
17 |
18 |
19 | class AutoCompleteIgnoreCount(AutoComplete):
20 | SHOULD_INCLUDE_COUNT = False
21 |
22 |
23 | autocomplete_ignore_count = autocomplete_factory(content_files=content_files, module=AutoCompleteIgnoreCount)
24 |
25 |
26 | class TestLoader:
27 |
28 | @pytest.mark.parametrize('word, expected_result, expected_unsorted_result', [
29 | ('acu',
30 | [['acura'], ['acura mdx'], ['acura rdx']],
31 | [['acura'], ['acura rlx'], ['acura rdx']]),
32 | ])
33 | def test_loader(self, word, expected_result, expected_unsorted_result):
34 | result = autocomplete.search(word=word, size=3)
35 | assert expected_result == result
36 | expected_word_value = WordValue(context={'make': 'acura'}, display='Acura', count=130123, original_key=None)
37 | assert autocomplete.words['acura'] == expected_word_value
38 | assert 'Acura' == autocomplete.words['acura'].display
39 | result = autocomplete_ignore_count.search(word=word, size=3)
40 | assert expected_unsorted_result == result
41 |
--------------------------------------------------------------------------------
/tests/fixtures/sample_words.json:
--------------------------------------------------------------------------------
1 | {
2 | "acura rlx": [
3 | {
4 | "model": "rlx",
5 | "make": "acura"
6 | },
7 | "Acura RLX",
8 | 3132
9 | ],
10 | "rlx": [
11 | {
12 | "model": "rlx",
13 | "make": "acura"
14 | },
15 | "Acura RLX",
16 | 3132
17 | ],
18 | "acura": [
19 | {
20 | "make": "acura"
21 | },
22 | "Acura",
23 | 130123
24 | ],
25 | "acura rlx sport hybrid": [
26 | {
27 | "model": "rlx sport hybrid",
28 | "make": "acura"
29 | },
30 | "Acura RLX Sport Hybrid",
31 | 4
32 | ],
33 | "rlx sport hybrid": [
34 | {
35 | "model": "rlx sport hybrid",
36 | "make": "acura"
37 | },
38 | "Acura RLX Sport Hybrid",
39 | 4
40 | ],
41 | "acura ilx": [
42 | {
43 | "model": "ilx--ilx hybrid",
44 | "make": "acura"
45 | },
46 | "Acura ILX",
47 | 19936
48 | ],
49 | "ilx": [
50 | {
51 | "model": "ilx--ilx hybrid",
52 | "make": "acura"
53 | },
54 | "Acura ILX",
55 | 19936
56 | ],
57 | "acura mdx": [
58 | {
59 | "model": "mdx",
60 | "make": "acura"
61 | },
62 | "Acura MDX",
63 | 35290
64 | ],
65 | "mdx": [
66 | {
67 | "model": "mdx",
68 | "make": "acura"
69 | },
70 | "Acura MDX",
71 | 35290
72 | ],
73 | "acura nsx": [
74 | {
75 | "model": "nsx",
76 | "make": "acura"
77 | },
78 | "Acura NSX",
79 | 271
80 | ],
81 | "nsx": [
82 | {
83 | "model": "nsx",
84 | "make": "acura"
85 | },
86 | "Acura NSX",
87 | 271
88 | ],
89 | "acura rdx": [
90 | {
91 | "model": "rdx",
92 | "make": "acura"
93 | },
94 | "Acura RDX",
95 | 33905
96 | ]
97 | }
98 |
--------------------------------------------------------------------------------
/tests/test_lfucache.py:
--------------------------------------------------------------------------------
1 | import random
2 | import pytest
3 | import concurrent.futures
4 | from deepdiff import DeepDiff
5 | from fast_autocomplete.lfucache import LFUCache
6 |
7 |
8 | class TestLFUcache:
9 |
10 | @pytest.mark.parametrize("items, size, expected_results", [
11 | (['a', 'a', 'b', 'a', 'c', 'b', 'd'], 3, [('a', 2), ('b', 1), ('d', 0)]),
12 | (['a', 'a', 'b', 'a', 'c', 'b', 'd', 'e', 'c', 'b'], 3, [('a', 2), ('b', 2), ('c', 0)]),
13 | (['a', 'a', 'b', 'a', 'c', 'b', 'd', 'e', 'c', 'b', 'b', 'c', 'd', 'b'], 3, [('b', 4), ('a', 2), ('d', 0)]),
14 | ])
15 | def test_autocomplete(self, items, size, expected_results):
16 | lfucache = LFUCache(size)
17 | for item in items:
18 | lfucache.set(item, f'{item}_cached')
19 | results = lfucache.get_sorted_cache_keys()
20 | diff = DeepDiff(expected_results, results)
21 | assert not diff
22 |
23 | def test_get_multithreading(self):
24 | keys = 'aaaaaaaaaaaaaaaaaaaaaaaaaaabbc'
25 | lfucache = LFUCache(2)
26 |
27 | def _do_set(cache, key):
28 | cache.set(key, f'{key}_cached')
29 |
30 | def _do_get(cache, key):
31 | return cache.get(key)
32 |
33 | def _key_gen():
34 | i = 0
35 | while i < 30000:
36 | i += 1
37 | yield random.choice(keys)
38 |
39 | def _random_func(cache, key):
40 | return random.choice([_do_get, _do_get, _do_set])(cache, key)
41 |
42 | with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor:
43 | futures = (executor.submit(_random_func, lfucache, key) for key in _key_gen())
44 | for future in concurrent.futures.as_completed(futures):
45 | future.result()
46 |
--------------------------------------------------------------------------------
/.github/workflows/main.yaml:
--------------------------------------------------------------------------------
1 | name: Unit Tests
2 |
3 | on:
4 | push:
5 | branches: [ "master", "dev" ]
6 | pull_request:
7 | branches: [ "master", "dev" ]
8 |
9 | jobs:
10 | build:
11 |
12 | runs-on: ubuntu-latest
13 | strategy:
14 | matrix:
15 | python-version: [3.6, 3.7, 3.8, 3.9]
16 | architecture: ["x64"]
17 |
18 | steps:
19 | - uses: actions/checkout@v2
20 | - name: Setup Python ${{ matrix.python-version }} on ${{ matrix.architecture }}
21 | uses: actions/setup-python@v2
22 | with:
23 | python-version: ${{ matrix.python-version }}
24 | architecture: ${{ matrix.architecture }}
25 | - name: Cache pip
26 | uses: actions/cache@v2
27 | with:
28 | # This path is specific to Ubuntu
29 | path: ~/.cache/pip
30 | # Look to see if there is a cache hit for the corresponding requirements file
31 | key: ${{ runner.os }}-pip-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
32 | restore-keys: |
33 | ${{ runner.os }}-pip-
34 | ${{ runner.os }}-
35 | - name: Install c dependencies
36 | run: sudo apt install graphviz
37 | - name: Install dependencies
38 | run: pip install -r requirements-dev.txt
39 | - name: Lint with flake8
40 | run: |
41 | # stop the build if there are Python syntax errors or undefined names
42 | flake8 fast_autocomplete --count --select=E9,F63,F7,F82 --show-source --statistics
43 | - name: Test with pytest
44 | run: |
45 | pytest --cov-report=xml --cov=fast_autocomplete tests/
46 | - name: Upload coverage to Codecov
47 | uses: codecov/codecov-action@v1
48 | if: matrix.python-version == 3.9
49 | with:
50 | file: ./coverage.xml
51 | env_vars: OS,PYTHON
52 | fail_ci_if_error: true
53 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | pytest.ini
51 |
52 | # Translations
53 | *.mo
54 | *.pot
55 |
56 | # Django stuff:
57 | *.log
58 | local_settings.py
59 | db.sqlite3
60 |
61 | # Flask stuff:
62 | instance/
63 | .webassets-cache
64 |
65 | # Scrapy stuff:
66 | .scrapy
67 |
68 | # Sphinx documentation
69 | docs/_build/
70 |
71 | # PyBuilder
72 | target/
73 |
74 | # Jupyter Notebook
75 | .ipynb_checkpoints
76 |
77 | # pyenv
78 | .python-version
79 |
80 | # celery beat schedule file
81 | celerybeat-schedule
82 |
83 | # SageMath parsed files
84 | *.sage.py
85 |
86 | # Environments
87 | .env
88 | .venv
89 | env/
90 | venv/
91 | ENV/
92 | env.bak/
93 | venv.bak/
94 |
95 | # Spyder project settings
96 | .spyderproject
97 | .spyproject
98 |
99 | # Rope project settings
100 | .ropeproject
101 |
102 | # mkdocs documentation
103 | /site
104 |
105 | # mypy
106 | .mypy_cache/
107 |
108 | .DS_Store
109 |
110 | temp[0-9]*
111 | makes_models_fair.csv
112 | test_autocomplete_fair.py
113 |
114 | short_*.svg
115 |
--------------------------------------------------------------------------------
/fast_autocomplete/normalize.py:
--------------------------------------------------------------------------------
1 | import string
2 | from fast_autocomplete.lfucache import LFUCache
3 |
4 |
5 | NORMALIZED_CACHE_SIZE = 2048
6 | MAX_WORD_LENGTH = 40
7 |
8 | _normalized_lfu_cache = LFUCache(NORMALIZED_CACHE_SIZE)
9 |
10 |
11 | class Normalizer:
12 |
13 | def __init__(
14 | self,
15 | valid_chars_for_string=None,
16 | valid_chars_for_integer=None,
17 | valid_chars_for_node_name=None
18 | ):
19 | if valid_chars_for_string:
20 | self.valid_chars_for_string = frozenset(valid_chars_for_string)
21 | else:
22 |             self.valid_chars_for_string = frozenset(string.ascii_lowercase)
23 | if valid_chars_for_integer:
24 | self.valid_chars_for_integer = frozenset(valid_chars_for_integer)
25 | else:
26 |             self.valid_chars_for_integer = frozenset(string.digits)
27 | if valid_chars_for_node_name:
28 | self.valid_chars_for_node_name = valid_chars_for_node_name
29 | else:
30 | self.valid_chars_for_node_name = self._get_valid_chars_for_node_name()
31 |
32 | def _get_valid_chars_for_node_name(self):
33 | return {' ', '-', ':', '_'} | self.valid_chars_for_string | self.valid_chars_for_integer
34 |
35 | def normalize_node_name(self, name, extra_chars=None):
36 | if name is None:
37 | return ''
38 | name = name[:MAX_WORD_LENGTH]
39 | key = name if extra_chars is None else f"{name}{extra_chars}"
40 | result = _normalized_lfu_cache.get(key)
41 | if result == -1:
42 | result = self._get_normalized_node_name(name, extra_chars=extra_chars)
43 | _normalized_lfu_cache.set(key, result)
44 | return result
45 |
46 | def _remove_invalid_chars(self, x):
47 | result = x in self.valid_chars_for_node_name
48 | if x == '-' == self.prev_x:
49 | result = False
50 | self.prev_x = x
51 | return result
52 |
53 | def remove_any_special_character(self, name):
54 | """
55 | Only remove invalid characters from a name. Useful for cleaning the user's original word.
56 | """
57 | if name is None:
58 | return ''
59 | name = name.lower()[:MAX_WORD_LENGTH]
60 | self.prev_x = ''
61 |
62 | return ''.join(filter(self._remove_invalid_chars, name)).strip()
63 |
64 | def _get_normalized_node_name(self, name, extra_chars=None):
65 | name = name.lower()
66 | result = []
67 | last_i = None
68 | for i in name:
69 | if i in self.valid_chars_for_node_name or (extra_chars and i in extra_chars):
70 | if i == '-':
71 | i = ' '
72 | elif (i in self.valid_chars_for_integer and last_i in self.valid_chars_for_string) or (i in self.valid_chars_for_string and last_i in self.valid_chars_for_integer):
73 | result.append(' ')
74 |                 if not (i == last_i == ' '):
75 | result.append(i)
76 | last_i = i
77 | return ''.join(result).strip()
78 |
--------------------------------------------------------------------------------
/fast_autocomplete/misc.py:
--------------------------------------------------------------------------------
1 | import io
2 | import os
3 | import csv
4 | import sys
5 | try:
6 | import termios
7 | import fcntl
8 | except Exception:
9 | termios = fcntl = None
10 |
11 |
12 | class FileNotFound(ValueError):
13 | pass
14 |
15 |
16 | def _check_file_exists(path):
17 | if not os.path.exists(path):
18 | raise FileNotFound(f'{path} does not exist')
19 |
20 |
21 | def read_csv_gen(path_or_stringio, csv_func=csv.reader, **kwargs):
22 | """
23 | Takes a path_or_stringio to a file or a StringIO object and creates a CSV generator
24 | """
25 | if isinstance(path_or_stringio, (str, bytes)):
26 | _check_file_exists(path_or_stringio)
27 | encoding = kwargs.pop('encoding', 'utf-8-sig')
28 | with open(path_or_stringio, 'r', encoding=encoding) as csvfile:
29 | for i in csv_func(csvfile, **kwargs):
30 | yield i
31 | elif isinstance(path_or_stringio, io.StringIO):
32 | for i in csv_func(path_or_stringio, **kwargs):
33 | yield i
34 | else:
35 | raise TypeError('Either a path to the file or StringIO object needs to be passed.')
36 |
37 |
38 | def _extend_and_repeat(list1, list2):
39 | if not list1:
40 | return [[i] for i in list2]
41 |
42 | result = []
43 | for item in list2:
44 | if item not in list1:
45 | list1_copy = list1.copy()
46 | if item.startswith(list1_copy[-1]):
47 | list1_copy.pop()
48 | list1_copy.append(item)
49 | result.append(list1_copy)
50 |
51 | return result
52 |
53 |
54 | def read_single_keypress():
55 | """Waits for a single keypress on stdin.
56 | https://stackoverflow.com/a/6599441/1497443
57 |
58 |     This is a silly function to call if you need to do it a lot because it has
59 |     to save stdin's current setup, set stdin up for reading single keystrokes,
60 |     read the single keystroke, and then revert stdin back to its original
61 |     state afterwards.
62 |
63 | Returns the character of the key that was pressed (zero on
64 | KeyboardInterrupt which can happen when a signal gets handled)
65 |
66 | """
67 | if fcntl is None or termios is None:
68 |         raise ValueError('termios and/or fcntl packages are not available on your system. This is possibly because you are not on a Linux distro.')
69 | fd = sys.stdin.fileno()
70 | # save old state
71 | flags_save = fcntl.fcntl(fd, fcntl.F_GETFL)
72 | attrs_save = termios.tcgetattr(fd)
73 | # make raw - the way to do this comes from the termios(3) man page.
74 | attrs = list(attrs_save) # copy the stored version to update
75 | # iflag
76 | attrs[0] &= ~(termios.IGNBRK | termios.BRKINT | termios.PARMRK |
77 | termios.ISTRIP | termios.INLCR | termios.IGNCR |
78 | termios.ICRNL | termios.IXON)
79 | # oflag
80 | attrs[1] &= ~termios.OPOST
81 | # cflag
82 |     attrs[2] &= ~(termios.CSIZE | termios.PARENB)
83 | attrs[2] |= termios.CS8
84 | # lflag
85 | attrs[3] &= ~(termios.ECHONL | termios.ECHO | termios.ICANON |
86 | termios.ISIG | termios.IEXTEN)
87 | termios.tcsetattr(fd, termios.TCSANOW, attrs)
88 | # turn off non-blocking
89 | fcntl.fcntl(fd, fcntl.F_SETFL, flags_save & ~os.O_NONBLOCK)
90 | # read a single keystroke
91 | try:
92 | ret = sys.stdin.read(1) # returns a single character
93 | except KeyboardInterrupt:
94 | ret = 0
95 | finally:
96 | # restore old state
97 | termios.tcsetattr(fd, termios.TCSAFLUSH, attrs_save)
98 | fcntl.fcntl(fd, fcntl.F_SETFL, flags_save)
99 | return ret
100 |
--------------------------------------------------------------------------------
/fast_autocomplete/draw.py:
--------------------------------------------------------------------------------
1 | import collections
2 |
3 |
4 | class DrawGraphMixin:
5 |
6 | DRAW_POPULATION_ANIMATION = False
7 | DRAW_POPULATION_ANIMATION_PATH = ''
8 | DRAW_POPULATION_ANIMATION_FILENO_PADDING = 6
9 | SHOW_OBJ_IDS_OF_WORDS = {}
10 |
11 | def draw_graph(self, file_path, starting_word=None, agraph_kwargs=None, prog='dot'):
12 | """
13 | Draws the graph of autocomplete words.
14 |
15 | parameters:
16 |
17 |         - file_path: the full path to the file to save the graph into.
18 |           The Graphviz library will determine the format of the file based on the extension you choose.
19 |         - starting_word: the word to start from. All descendants of this word will be in the graph.
20 |           If left as None, the graph will start from the root node.
21 |         - agraph_kwargs: kwargs that will be passed to the PyGraphviz AGraph constructor. You can control how the graph
22 |           will be rendered using these kwargs.
23 | """
24 | try:
25 | import pygraphviz as pgv
26 | except ImportError:
27 |             raise ImportError('You need to install pygraphviz in order to draw graphs')
28 |
29 | agraph_kwargs = agraph_kwargs if agraph_kwargs else {}
30 | graph = pgv.AGraph(strict=False, directed=True, **agraph_kwargs)
31 |
32 | edges = set()
33 | que = collections.deque()
34 | if starting_word:
35 | matched_prefix_of_last_word, rest_of_word, new_node, matched_words = self._prefix_autofill(word=starting_word)
36 | try:
37 | matched_word = matched_words[-1]
38 | except IndexError:
39 | new_node = self._dwg
40 | matched_word = 'root'
41 | else:
42 | new_node = self._dwg
43 | matched_word = 'root'
44 | que.append((matched_word, new_node, ''))
45 | node_alternative_names = {}
46 | while que:
47 | parent_name, node, edge_name = que.popleft()
48 | node_id = id(node)
49 | if node_id not in node_alternative_names:
50 | node_alternative_names[node_id] = f'.{len(node_alternative_names)}'
51 | if node.word:
52 | node_name = node.word
53 | if node_name in self.SHOW_OBJ_IDS_OF_WORDS:
54 | node_name = f'{node_name} {id(node)}'
55 | else:
56 | try:
57 | node_name = self.words[node_name].display
58 | except (KeyError, AttributeError):
59 | pass
60 | graph.add_node(node_name, fontcolor='blue', fontname='Arial', shape='rectangle')
61 | else:
62 | node_name = node_alternative_names[node_id]
63 | graph.add_node(node_name, color='grey', shape='point')
64 | edge_name = "' '" if edge_name == ' ' else edge_name
65 | edge = (parent_name, node_name)
66 | if edge not in edges:
67 | edges.add(edge)
68 | graph.add_edge(*edge, color='blue', label=edge_name)
69 | for edge_name, child in node.children.items():
70 | que.append((node_name, child, edge_name))
71 | graph.draw(file_path, prog=prog)
72 |
73 | def insert_word_callback(self, word):
74 | """
75 | Once word is inserted, this call back is run.
76 | """
77 | if self.DRAW_POPULATION_ANIMATION:
78 | if not hasattr(self, '_graph_fileno'):
79 | self._graph_fileno = 0
80 | self._graph_filepath = self.DRAW_POPULATION_ANIMATION_PATH.replace('.', r'{}.')
81 |
82 | fileno = str(self._graph_fileno).zfill(self.DRAW_POPULATION_ANIMATION_FILENO_PADDING)
83 | file_path = self._graph_filepath.format(fileno)
84 | self.draw_graph(file_path=file_path)
85 | self._graph_fileno += 1
86 |
--------------------------------------------------------------------------------
/fast_autocomplete/loader.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gzip
3 | import json
4 | import logging
5 | try:
6 | from redis import StrictRedis
7 | except ImportError:
8 | StrictRedis = None
9 |
10 | from typing import Any, Dict, List, NamedTuple, Optional, Set, Tuple, Union
11 | from fast_autocomplete import AutoComplete
12 |
13 |
14 | def read_local_dump(filepath: str):
15 | with open(filepath, 'r') as the_file:
16 | return the_file.read()
17 |
18 |
19 | def _simple_compress(item: str, hash_to_val: Dict[int, str]) -> str:
20 | item_hash = hash(item)
21 | if item_hash in hash_to_val:
22 | item = hash_to_val[item_hash]
23 | else:
24 | hash_to_val[item_hash] = item
25 | return item
26 |
27 |
28 | class WordValue(NamedTuple):
29 | context: Any
30 | display: Any
31 | count: int = 0
32 | original_key: 'WordValue' = None
33 |
34 | def get(self, key: str, default: Optional[str] = None) -> str:
35 | result = getattr(self, key)
36 | if result is None:
37 | result = default
38 | return result
39 |
40 |
41 | def get_all_content(content_files, redis_client=None, redis_key_prefix=None, logger=None):
42 | """
43 | Get all content that is needed to initialize Autocomplete.
44 |
45 | :param: redis_client (optional) If passed, it tries to load from Redis if there is already cached data
46 | """
47 | kwargs = {}
48 | for key, info in content_files.items():
49 | kwargs[key] = get_data(
50 | filepath=info['filepath'],
51 | compress=info['compress'],
52 | redis_client=redis_client,
53 | redis_key_prefix=redis_key_prefix,
54 | logger=logger
55 | )
56 | if logger:
57 | kwargs['logger'] = logger
58 | return kwargs
59 |
60 |
61 | def get_data(filepath: str, compress: bool = False,
62 | redis_client: Optional[StrictRedis] = None,
63 | redis_key_prefix: Optional[str] = None,
64 |              logger: Optional[logging.Logger] = None) -> Dict[str, List[str]]:
65 | data_json = None
66 | filename = os.path.basename(filepath)
67 | if redis_client and redis_key_prefix:
68 | key = redis_key_prefix.format(filename)
69 | try:
70 | data_json = redis_client.get(key)
71 | except Exception:
72 | if logger:
73 | logger.exception('Unable to get the search graph words from Redis.')
74 | else:
75 | print('Unable to get the search graph words from Redis.')
76 | if data_json:
77 | data_json = gzip.decompress(data_json).decode('utf-8')
78 | if not data_json:
79 | data_json = read_local_dump(filepath)
80 | data = json.loads(data_json)
81 |
82 | if compress:
83 | hash_to_val = {}
84 |
85 | for word, value in data.items():
86 | context, display, count = value
87 | display = _simple_compress(item=display, hash_to_val=hash_to_val)
88 |             for key, val in context.items():
89 |                 context[key] = _simple_compress(
90 |                     item=val, hash_to_val=hash_to_val
91 |                 )
92 | data[word] = WordValue(context=context, display=display, count=count)
93 |
94 | return data
95 |
96 |
97 | def populate_redis(content_files, redis_client, redis_cache_prefix):
98 | """
99 | Populate Redis with data based on the local files
100 | """
101 | for key, info in content_files.items():
102 | filename = os.path.basename(info['filepath'])
103 | redis_key = redis_cache_prefix.format(filename)
104 | data = read_local_dump(info['filepath'])
105 | compressed = gzip.compress(data.encode('utf-8'))
106 | redis_client.set(redis_key, compressed)
107 |
108 |
109 | def autocomplete_factory(
110 | content_files, redis_client=None, module=AutoComplete, logger=None
111 | ):
112 | """
113 |     Factory function to initialize the proper AutoComplete object.
114 |
115 | :param: content_files: The file paths and options where data is stored.
116 |
117 | Example
118 |
119 |     content_files = {
120 |         'synonyms': {
121 |             'filepath': 'path/to/synonyms.json',
122 |             'compress': False
123 |         },
124 |         'words': {
125 |             'filepath': 'path/to/words.json',
126 |             'compress': True
127 |         },
128 |         'full_stop_words': {
129 |             'filepath': 'path/to/full_stop_words.json',
130 |             'compress': False
131 |         }
132 |     }
133 |
134 |     :param: redis_client: (optional) If passed, the factory function tries to load the data from Redis
135 | and if that fails, it will load the local data.
136 | :param: module: (optional) The AutoComplete module to initialize
137 | """
138 | kwargs = get_all_content(content_files, redis_client=redis_client, logger=logger)
139 | return module(**kwargs)
140 |
--------------------------------------------------------------------------------
/fast_autocomplete/lfucache.py:
--------------------------------------------------------------------------------
1 | """
2 | LFU cache written by Shane Wang
3 | https://medium.com/@epicshane/a-python-implementation-of-lfu-least-frequently-used-cache-with-o-1-time-complexity-e16b34a3c49b
4 | https://github.com/luxigner/lfu_cache
5 | Modified by Sep Dehpour
6 | """
7 | from threading import Lock
8 |
9 |
10 | class CacheNode:
11 | def __init__(self, key, value, freq_node, pre, nxt):
12 | self.key = key
13 | self.value = value
14 | self.freq_node = freq_node
15 | self.pre = pre # previous CacheNode
16 | self.nxt = nxt # next CacheNode
17 |
18 | def free_myself(self):
19 | if self.freq_node.cache_head == self.freq_node.cache_tail:
20 | self.freq_node.cache_head = self.freq_node.cache_tail = None
21 | elif self.freq_node.cache_head == self:
22 | self.nxt.pre = None
23 | self.freq_node.cache_head = self.nxt
24 | elif self.freq_node.cache_tail == self:
25 | self.pre.nxt = None
26 | self.freq_node.cache_tail = self.pre
27 | else:
28 | self.pre.nxt = self.nxt
29 | self.nxt.pre = self.pre
30 |
31 | self.pre = None
32 | self.nxt = None
33 | self.freq_node = None
34 |
35 |
36 | class FreqNode:
37 | def __init__(self, freq, pre, nxt):
38 | self.freq = freq
39 | self.pre = pre # previous FreqNode
40 | self.nxt = nxt # next FreqNode
41 | self.cache_head = None # CacheNode head under this FreqNode
42 | self.cache_tail = None # CacheNode tail under this FreqNode
43 |
44 | def count_caches(self):
45 | if self.cache_head is None and self.cache_tail is None:
46 | return 0
47 | elif self.cache_head == self.cache_tail:
48 | return 1
49 | else:
50 | return '2+'
51 |
52 | def remove(self):
53 | if self.pre is not None:
54 | self.pre.nxt = self.nxt
55 | if self.nxt is not None:
56 | self.nxt.pre = self.pre
57 |
58 | pre = self.pre
59 | nxt = self.nxt
60 | self.pre = self.nxt = self.cache_head = self.cache_tail = None
61 |
62 | return (pre, nxt)
63 |
64 | def pop_head_cache(self):
65 | if self.cache_head is None and self.cache_tail is None:
66 | return None
67 | elif self.cache_head == self.cache_tail:
68 | cache_head = self.cache_head
69 | self.cache_head = self.cache_tail = None
70 | return cache_head
71 | else:
72 | cache_head = self.cache_head
73 | self.cache_head.nxt.pre = None
74 | self.cache_head = self.cache_head.nxt
75 | return cache_head
76 |
77 | def append_cache_to_tail(self, cache_node):
78 | cache_node.freq_node = self
79 |
80 | if self.cache_head is None and self.cache_tail is None:
81 | self.cache_head = self.cache_tail = cache_node
82 | else:
83 | cache_node.pre = self.cache_tail
84 | cache_node.nxt = None
85 | self.cache_tail.nxt = cache_node
86 | self.cache_tail = cache_node
87 |
88 | def insert_after_me(self, freq_node):
89 | freq_node.pre = self
90 | freq_node.nxt = self.nxt
91 |
92 | if self.nxt is not None:
93 | self.nxt.pre = freq_node
94 |
95 | self.nxt = freq_node
96 |
97 | def insert_before_me(self, freq_node):
98 | if self.pre is not None:
99 | self.pre.nxt = freq_node
100 |
101 | freq_node.pre = self.pre
102 | freq_node.nxt = self
103 | self.pre = freq_node
104 |
105 |
106 | class LFUCache:
107 |
108 | def __init__(self, capacity):
109 | self.cache = {} # {key: cache_node}
110 | self.capacity = capacity
111 | self.freq_link_head = None
112 | self.lock = Lock()
113 |
114 | def get(self, key):
115 | with self.lock:
116 | if key in self.cache:
117 | cache_node = self.cache[key]
118 | freq_node = cache_node.freq_node
119 | value = cache_node.value
120 |
121 | self.move_forward(cache_node, freq_node)
122 |
123 | return value
124 | else:
125 | return -1
126 |
127 | def set(self, key, value):
128 | with self.lock:
129 | if self.capacity <= 0:
130 | return -1
131 |
132 | if key not in self.cache:
133 | if len(self.cache) >= self.capacity:
134 | self.dump_cache()
135 |
136 | self.create_cache_node(key, value)
137 | else:
138 | cache_node = self.cache[key]
139 | freq_node = cache_node.freq_node
140 | cache_node.value = value
141 |
142 | self.move_forward(cache_node, freq_node)
143 |
144 | def move_forward(self, cache_node, freq_node):
145 | if freq_node.nxt is None or freq_node.nxt.freq != freq_node.freq + 1:
146 | target_freq_node = FreqNode(freq_node.freq + 1, None, None)
147 | target_empty = True
148 | else:
149 | target_freq_node = freq_node.nxt
150 | target_empty = False
151 |
152 | cache_node.free_myself()
153 | target_freq_node.append_cache_to_tail(cache_node)
154 |
155 | if target_empty:
156 | freq_node.insert_after_me(target_freq_node)
157 |
158 | if freq_node.count_caches() == 0:
159 | if self.freq_link_head == freq_node:
160 | self.freq_link_head = target_freq_node
161 |
162 | freq_node.remove()
163 |
164 | def dump_cache(self):
165 | head_freq_node = self.freq_link_head
166 | self.cache.pop(head_freq_node.cache_head.key)
167 | head_freq_node.pop_head_cache()
168 |
169 | if head_freq_node.count_caches() == 0:
170 | self.freq_link_head = head_freq_node.nxt
171 | head_freq_node.remove()
172 |
173 | def create_cache_node(self, key, value):
174 | cache_node = CacheNode(key, value, None, None, None)
175 | self.cache[key] = cache_node
176 |
177 | if self.freq_link_head is None or self.freq_link_head.freq != 0:
178 | new_freq_node = FreqNode(0, None, None)
179 | new_freq_node.append_cache_to_tail(cache_node)
180 |
181 | if self.freq_link_head is not None:
182 | self.freq_link_head.insert_before_me(new_freq_node)
183 |
184 | self.freq_link_head = new_freq_node
185 | else:
186 | self.freq_link_head.append_cache_to_tail(cache_node)
187 |
188 | def get_sorted_cache_keys(self):
189 |         result = [(key, cache_node.freq_node.freq) for key, cache_node in self.cache.items()]
190 | result.sort(key=lambda x: -x[1])
191 | return result
192 |
--------------------------------------------------------------------------------
/tests/fixtures/makes_models_from_wikipedia.csv:
--------------------------------------------------------------------------------
1 | make,model,count
2 | Toyota,Aurion,6094
3 | Toyota,Avalon,8803
4 | Toyota,Avensis,1630
5 | Toyota,Camry,5371
6 | Toyota,Crown,9443
7 | Toyota,Etios,5806
8 | Toyota,Mirai,4272
9 | Toyota,Prius,9425
10 | Toyota,Vios,8322
11 | Toyota,Auris,4025
12 | Toyota,Aygo,2115
13 | Toyota,Yaris,6274
14 | Toyota,86,1298
15 | Toyota,Avanza,1760
16 | Toyota,Innova,4250
17 | Toyota,Noah,3462
18 | Toyota,Sienna,3992
19 | Toyota,Sienta,4992
20 | Toyota,Previa,8404
21 | Toyota,Verso,3765
22 | Toyota,Wish,3735
23 | Toyota,4Runner,5616
24 | Toyota,Fortuner,7003
25 | Toyota,Highlander,6235
26 | Toyota,RAV4,3182
27 | Toyota,Sequoia,2900
28 | Toyota,HiAce,5402
29 | Toyota,Tacoma,7371
30 | Toyota,Tundra,6608
31 | Toyota,Coaster,3503
32 | Toyota,Dyna,8426
33 | Lexus,CT,6770
34 | Lexus,IS,4028
35 | Lexus,HS,7415
36 | Lexus,ES,827
37 | Lexus,GS,3557
38 | Lexus,LS,1916
39 | Lexus,SC,6595
40 | Lexus,RC,7647
41 | Lexus,LC,8265
42 | Lexus,LFA,7897
43 | Lexus,NX,3177
44 | Lexus,RX,2663
45 | Lexus,GX,2696
46 | Lexus,LX,5592
47 | BMW,1 series,9969
48 | BMW,2 series,4590
49 | BMW,303,2047
50 | BMW,328,4334
51 | BMW,326,5637
52 | BMW,327,5377
53 | BMW,320,4309
54 | BMW,321,4170
55 | BMW,340,1178
56 | BMW,501,3430
57 | BMW,503,423
58 | BMW,507,3448
59 | BMW,700,3304
60 | BMW,E9,4675
61 | BMW,E3,3190
62 | BMW,M1,130
63 | BMW,E28,9404
64 | BMW,E30,8655
65 | BMW,E32,786
66 | BMW,E34,7726
67 | BMW,Z1,3143
68 | BMW,E31,4817
69 | BMW,Z3,688
70 | BMW,Z8,2607
71 | BMW,i3,2
72 | BMW,i8,4246
73 | Audi,F103,8261
74 | Audi,100,3565
75 | Audi,80,876
76 | Audi,50,6248
77 | Audi,200,3625
78 | Audi,5,6191
79 | Audi,90,4520
80 | Audi,V8,1751
81 | Audi,Cabriolet,2344
82 | Audi,A8,3985
83 | Audi,A4,9554
84 | Audi,A3,2451
85 | Audi,A6,8058
86 | Audi,TT,656
87 | Audi,A2,1032
88 | Audi,Q7,1988
89 | Audi,A5,1568
90 | Audi,Q5,9372
91 | Audi,R8,7835
92 | Audi,A1,2102
93 | Audi,A7,5228
94 | Audi,Q2,1222
95 | Audi,Q3,2371
96 | Audi,Q8,5076
97 | Audi,e,4462
98 | Mercedes-Benz,SSK,671
99 | Mercedes-Benz,W10,1574
100 | Mercedes-Benz,130H,7603
101 | Mercedes-Benz,150H,3934
102 | Mercedes-Benz,W31,6563
103 | Mercedes-Benz,W136,1007
104 | Mercedes-Benz,770,7990
105 | Mercedes-Benz,500K,4089
106 | Mercedes-Benz,540K,8136
107 | Mercedes-Benz,260,6160
108 | Mercedes-Benz,W125,7458
109 | Mercedes-Benz,W154,5619
110 | Mercedes-Benz,T80,9497
111 | Mercedes-Benz,W191,8567
112 | Mercedes-Benz,W120,8652
113 | Mercedes-Benz,190SL,2584
114 | Mercedes-Benz,W187,647
115 | Mercedes-Benz,W105,8822
116 | Mercedes-Benz,W180,1470
117 | Mercedes-Benz,W186,6734
118 | Mercedes-Benz,W188,3572
119 | Mercedes-Benz,W189,2539
120 | Mercedes-Benz,W196,9903
121 | Mercedes-Benz,300,2989
122 | Mercedes-Benz,300SL,3393
123 | Mercedes-Benz,W110,9213
124 | Mercedes-Benz,W111,2363
125 | Mercedes-Benz,W112,767
126 | Mercedes-Benz,W108,1470
127 | Mercedes-Benz,W114,7292
128 | Mercedes-Benz,W100,2874
129 | Mercedes-Benz,W113,8493
130 | Mercedes-Benz,W123,10
131 | Mercedes-Benz,G,3926
132 | Mercedes-Benz,S,1871
133 | Mercedes-Benz,W116,3454
134 | Mercedes-Benz,SL,6343
135 | Mercedes-Benz,R107,5253
136 | Mercedes-Benz,W201,4450
137 | Mercedes-Benz,C123,1913
138 | Mercedes-Benz,W126,4950
139 | Mercedes-Benz,S123,1403
140 | Mercedes-Benz,C126,100
141 | Mercedes-Benz,W124,3632
142 | Mercedes-Benz,C124,7291
143 | Mercedes-Benz,A,3129
144 | Mercedes-Benz,C,7321
145 | Mercedes-Benz,CLK,3968
146 | Mercedes-Benz,E,7247
147 | Mercedes-Benz,M,1164
148 | Mercedes-Benz,R129,6156
149 | Mercedes-Benz,Vaneo,8457
150 | Mercedes-Benz,W168,9009
151 | Mercedes-Benz,W169,8059
152 | Mercedes-Benz,B,212
153 | Mercedes-Benz,W203,897
154 | Mercedes-Benz,W204,4603
155 | Mercedes-Benz,W205,1626
156 | Mercedes-Benz,CL,7195
157 | Mercedes-Benz,W215,3007
158 | Mercedes-Benz,W216,8348
159 | Mercedes-Benz,CLC,1313
160 | Mercedes-Benz,CLS,6176
161 | Mercedes-Benz,W210,6893
162 | Mercedes-Benz,W211,1415
163 | Mercedes-Benz,W212,7213
164 | Mercedes-Benz,GL,6275
165 | Mercedes-Benz,W163,9507
166 | Mercedes-Benz,W164,1198
167 | Mercedes-Benz,R,4807
168 | Mercedes-Benz,W220,5300
169 | Mercedes-Benz,W221,2417
170 | Mercedes-Benz,W222,156
171 | Mercedes-Benz,SLK,1540
172 | Mercedes-Benz,R170,398
173 | Mercedes-Benz,R171,8498
174 | Mercedes-Benz,R230,3168
175 | Mercedes-Benz,SLR,1324
176 | Mercedes-Benz,CLA,3896
177 | Mercedes-Benz,GLA,267
178 | Mercedes-Benz,R231,2692
179 | Alfa Romeo,4C,3411
180 | Alfa Romeo,6C,9492
181 | Alfa Romeo,8C,1203
182 | Alfa Romeo,12C,2353
183 | Alfa Romeo,33,5014
184 | Alfa Romeo,75,6092
185 | Alfa Romeo,90,9659
186 | Alfa Romeo,105,8850
187 | Alfa Romeo,145,3173
188 | Alfa Romeo,146,6418
189 | Alfa Romeo,147,6209
190 | Alfa Romeo,155,8973
191 | Alfa Romeo,156,8905
192 | Alfa Romeo,159,4991
193 | Alfa Romeo,164,5324
194 | Alfa Romeo,166,1749
195 | Alfa Romeo,1750,309
196 | Alfa Romeo,1900,6706
197 | Alfa Romeo,2000,5057
198 | Alfa Romeo,2300,9805
199 | Alfa Romeo,2600,4120
200 | Alfa Romeo,Type,2662
201 | Alfa Romeo,Alfasud,2735
202 | Alfa Romeo,Alfetta,7667
203 | Alfa Romeo,AR6,1982
204 | Alfa Romeo,AR8,1004
205 | Alfa Romeo,Arna,462
206 | Alfa Romeo,Brera,5686
207 | Alfa Romeo,Dauphine,8051
208 | Alfa Romeo,G1,4370
209 | Alfa Romeo,Giulia,357
210 | Alfa Romeo,Giulietta,4924
211 | Alfa Romeo,Gran,3989
212 | Alfa Romeo,GT,2729
213 | Alfa Romeo,GTA,3848
214 | Alfa Romeo,GTV,9649
215 | Alfa Romeo,Matta,2519
216 | Alfa Romeo,MiTo,4991
217 | Alfa Romeo,Montreal,9744
218 | Alfa Romeo,RL,7160
219 | Alfa Romeo,RM,871
220 | Alfa Romeo,Spider,5029
221 | Alfa Romeo,Sprint,435
222 | Alfa Romeo,Stelvio,6508
223 | Alfa Romeo,SZ,3626
224 | Volkswagen,Amarok,6016
225 | Volkswagen,Ameo,350
226 | Volkswagen,Arteon,4740
227 | Volkswagen,Atlas,62
228 | Volkswagen,Caddy,5908
229 | Volkswagen,California,5950
230 | Volkswagen,Fox,4201
231 | Volkswagen,Gol,5813
232 | Volkswagen,Golf,9087
233 | Volkswagen,Jetta,3159
234 | Volkswagen,Lamando,2122
235 | Volkswagen,Lavida,3107
236 | Volkswagen,Beetle,533
237 | Volkswagen,Passat,1604
238 | Volkswagen,Passat,2373
239 | Volkswagen,Polo,6275
240 | Volkswagen,Polo,5747
241 | Volkswagen,Santana,9798
242 | Volkswagen,Sharan,7338
243 | Volkswagen,Tiguan,8213
244 | Volkswagen,Touareg,20
245 | Volkswagen,Touran,1586
246 | Volkswagen,Transporter,3823
247 | Volkswagen,Up,9100
248 | Volkswagen,Vento,7540
249 | Volkswagen,XL,888
250 | Chrysler,150,7529
251 | Chrysler,180,4813
252 | Chrysler,200,8641
253 | Chrysler,300,1638
254 | Chrysler,300M,2061
255 | Chrysler,300,2088
256 | Chrysler,Airflow,1330
257 | Chrysler,Airstream,6486
258 | Chrysler,Aspen,1092
259 | Chrysler,Centura,7034
260 | Chrysler,Australia,606
261 | Chrysler,Charger,9594
262 | Chrysler,by,5575
263 | Chrysler,Cirrus,5222
264 | Chrysler,Colt,6522
265 | Chrysler,Concorde,6028
266 | Chrysler,Conquest,9079
267 | Chrysler,Cordoba,7228
268 | Chrysler,Crossfire,4830
269 | Chrysler,D,9559
270 | Chrysler,Drifter,5391
271 | Chrysler,Executive,165
272 | Chrysler,Fifth,4183
273 | Chrysler,Galant,8517
274 | Chrysler,Horizon,2723
275 | Chrysler,Hunter,5119
276 | Chrysler,Imperial,2807
277 | Chrysler,Imperial,9531
278 | Chrysler,L300,786
279 | Chrysler,Lancer,5398
280 | Chrysler,Laser,325
281 | Chrysler,LeBaron,4387
282 | Chrysler,LHS,1989
283 | Chrysler,Newport,9134
284 | Chrysler,Neon,758
285 | Chrysler,New,5002
286 | Chrysler,Fifth,6742
287 | Chrysler,Pacifica,3467
288 | Chrysler,Prowler,5390
289 | Chrysler,PT,4102
290 | Chrysler,Regal,6030
291 | Chrysler,Royal,4960
292 | Chrysler,Royal,863
293 | Chrysler,Royal,20
294 | Chrysler,Saratoga,8312
295 | Chrysler,Sebring,1267
296 | Chrysler,Sigma,683
297 | Chrysler,Sunbeam,7414
298 | Chrysler,TC,4384
299 | Chrysler,Touring,325
300 | Chrysler,Town,2340
301 | Chrysler,Turbine,6708
302 | Chrysler,Valiant,9309
303 | Chrysler,Valiant,3872
304 | Chrysler,Vogue,5589
305 | Chrysler,Voyager,3797
306 | Chrysler,Royal,4695
307 | Chrysler,Windsor,5449
308 | Honda,Accord,5547
309 | Honda,Amaze,3084
310 | Honda,Avancier,9269
311 | Honda,Ballade,1666
312 | Honda,Brio,1899
313 | Honda,City,4908
314 | Honda,Civic,6317
315 | Honda,Civic Type R,4415
316 | Honda,Clarity,6472
317 | Honda,Crider,2453
318 | Honda,Elysion,5302
319 | Honda,Fit,2572
320 | Honda,Freed,5982
321 | Honda,Freed,7931
322 | Honda,City,8034
323 | Honda,City,8319
324 | Honda,City,7527
325 | Honda,Jade,8594
326 | Honda,Fit,4487
327 | Honda,Legend,5208
328 | Honda,Mobilio,6348
329 | Honda,NSX,196
330 | Honda,Pilot,7059
331 | Honda,Ridgeline,8671
332 | Honda,S660,9805
333 | Honda,Shuttle,1230
334 | Honda,Spirior,9906
335 | Honda,StepWGN,4061
336 | Honda,Avancier,9107
337 | Honda,Vamos,466
338 | Honda,Vezel,4760
339 | Honda,Type R,5449
340 | Jaguar,F-Type,8457
341 | Jaguar,Type,191
342 | 2007,2007,3276
343 | 2017,2017,1741
344 | 2018,2018,59
345 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Fast Autocomplete 0.9.0
2 |
3 | [zepworks.com](https://zepworks.com)
4 |
5 | Fast autocomplete using Directed Word Graph (DWG) and Levenshtein Edit Distance.
6 |
7 | The results are cached via LFU (Least Frequently Used).
8 |
9 | # Why
10 |
11 | Read about why fast-autocomplete was built here:
12 |
13 | This library was written when we came to the conclusion that Elasticsearch's Autocomplete suggester is not fast enough and doesn't do everything that we need:
14 |
15 | 1. Once we switched to Fast Autocomplete, our average latency went from 120ms to 30ms, a 3-4x improvement in performance, and errors went down to zero.
16 | 2. Elasticsearch's Autocomplete suggester does not handle any combination of the words you have put in. For example, Fast Autocomplete can handle `2018 Toyota Camry in Los Angeles` when the words `2018`, `Toyota Camry` and `Los Angeles` are separately fed into it, while Elasticsearch's autocomplete needs that whole sentence to be fed to it before it shows up in autocomplete results.
17 |
18 | You might say:
19 |
20 | 1. Regarding #1: Yes, but you are using caching. Answer: Shhh, yes, keep it quiet. We also do the Levenshtein edit distance via a C library, so it is faster there too.
21 | 2. Regarding #2: Cool. Answer: Ok, now we are talking.
22 |
23 | # How
24 |
25 | Read about how fast-autocomplete works here:
26 |
27 | In a nutshell, what Fast Autocomplete does is:
28 |
29 | 1. Populate the DWG with your words.
30 | 2. Follow the graph nodes letter by letter until it finds nodes that have words in them.
31 | 3. Continue after words are found on the graph until it reaches the leaf node.
32 | 4. Restart from the root node again until it reaches a letter that doesn't exist on the graph.
33 | 5. Depending on how much of the word is left, return all the descendant words from where it got stuck,
34 | 6. Or run Levenshtein edit distance to find the closest words to what is left and then continue from there.
35 |
36 | By doing so, it can tokenize a text such as:
37 |
38 | `2018 Toyota Camry in Los Angeles` into [`2018`, `toyota camry`, `in`, `los angeles`]
39 |
40 | And return Autocomplete results as you type.
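
A minimal sketch of that tokenization in action. The words fed in here and the exact output are illustrative assumptions; the API is the same one shown in the Usage examples below:

```py
>>> from fast_autocomplete import AutoComplete
>>> words = {'2018': {}, 'toyota camry': {}, 'in': {}, 'los angeles': {}}
>>> autocomplete = AutoComplete(words=words)
>>> autocomplete.search(word='2018 toyota camry in los angeles', max_cost=3, size=3)
[['2018', 'toyota camry', 'in', 'los angeles']]
```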
41 |
42 | # Install
43 |
44 | `pip install fast-autocomplete`
45 |
46 | **Note: Fast Autocomplete only works with Python 3.6 and newer.**
47 |
48 | Are you still on Python 2? TIME TO UPGRADE.
49 |
50 | # Licence
51 |
52 | MIT
53 |
54 | # DWG
55 |
56 | The data structure we use in this library is called a DWG.
57 |
58 | DWG stands for Directed Word Graph. Here is an example DWG based on the "makes_models_short.csv" that is provided in the tests:
59 |
60 | 
61 |
62 | 
63 |
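Graphs like the ones above can be drawn with the `DrawGraphMixin` from `fast_autocomplete/draw.py` (exported as `fast_autocomplete.DrawGraphMixin`). A minimal sketch, assuming pygraphviz is installed; the words and the output file name are arbitrary:

```py
from fast_autocomplete import AutoComplete, DrawGraphMixin


class AutoCompleteDraw(DrawGraphMixin, AutoComplete):
    pass


words = {'acura': {}, 'acura zdx': {}, 'zdx': {}}
autocomplete = AutoCompleteDraw(words=words)
# Graphviz infers the output format from the file extension.
autocomplete.draw_graph('short_graph.svg')
```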
64 |
65 | # Usage
66 |
67 | First of all, let's start with your data. The library leaves it up to you how to prepare your data.
68 | If you want to go straight to the factory function that lets you use the library in its easiest and most common case, skip all this and jump to the [sorting](#sorting) example.
69 |
70 | ## Example 1
71 |
72 | ```py
73 | >>> from fast_autocomplete import AutoComplete
74 | >>> words = {'book': {}, 'burrito': {}, 'pizza': {}, 'pasta':{}}
75 | >>> autocomplete = AutoComplete(words=words)
76 | >>> autocomplete.search(word='b', max_cost=3, size=3)
77 | [['book'], ['burrito']]
78 | >>> autocomplete.search(word='bu', max_cost=3, size=3)
79 | [['burrito']]
80 | >>> autocomplete.search(word='barrito', max_cost=3, size=3) # mis-spelling
81 | [['burrito']]
82 | ```
83 |
84 | `words` is a dictionary, and each word can have a context: for example a "count", how to display the word, or any other context around the word. In this example the words didn't have any context.
85 |
86 | ## Example 2
87 |
88 | Imagine that we have a csv with the following content from vehicles' make and models:
89 |
90 | ```csv
91 | make,model
92 | acura,zdx
93 | alfa romeo,4c
94 | alfa romeo,4c coupe
95 | alfa romeo,giulia
96 | bmw,1 series
97 | bmw,2 series
98 | 2007,2007
99 | 2017,2017
100 | 2018,2018
101 | ```
102 |
103 | What we want to do is to convert this to a dictionary of words and their context.
104 |
105 |
106 | ```py
107 | import csv
108 | from fast_autocomplete.misc import read_csv_gen
109 |
110 |
111 | def get_words(path):
112 |
113 | csv_gen = read_csv_gen(path, csv_func=csv.DictReader)
114 |
115 | words = {}
116 |
117 | for line in csv_gen:
118 | make = line['make']
119 | model = line['model']
120 | if make != model:
121 | local_words = [model, '{} {}'.format(make, model)]
122 | while local_words:
123 | word = local_words.pop()
124 | if word not in words:
125 | words[word] = {}
126 | if make not in words:
127 | words[make] = {}
128 | return words
129 | ```
130 |
131 | The `read_csv_gen` is just a helper function. You don't really need it. The whole point is that we are converting that csv into a dictionary that looks like this:
132 |
133 | ```py
134 | >>> words = get_words('path to the csv')
135 | >>> words
136 | {'acura zdx': {},
137 | 'zdx': {},
138 | 'acura': {},
139 | 'alfa romeo 4c': {},
140 | '4c': {},
141 | 'alfa romeo': {},
142 | 'alfa romeo 4c coupe': {},
143 | '4c coupe': {},
144 | 'alfa romeo giulia': {},
145 | 'giulia': {},
146 | 'bmw 1 series': {},
147 | '1 series': {},
148 | 'bmw': {},
149 | 'bmw 2 series': {},
150 | '2 series': {},
151 | '2007': {},
152 | '2017': {},
153 | '2018': {}}
154 | ```
155 |
156 | This is a dictionary of words to their context. We have decided that we don't want any context for the words in this example, so all the contexts are empty. Generally, however, you will want some context around the words for more complicated logic. The context is what each word "key" maps to: the value of that key in the words dictionary.
157 |
158 | In addition to words, we usually want a dictionary of synonyms. Something like this:
159 |
160 | ```py
161 | synonyms = {
162 | "alfa romeo": ["alfa"],
163 | "bmw": ["beemer", "bimmer"],
164 | "mercedes-benz": ["mercedes", "benz"],
165 | "volkswagen": ["vw"]
166 | }
167 | ```
168 |
169 | Note that synonyms are optional. Maybe in your use case you don't need synonyms.
170 |
171 | Now we can use the above to initialize AutoComplete:
172 |
173 | ```py
174 |
175 | from fast_autocomplete import AutoComplete
176 |
177 | autocomplete = AutoComplete(words=words, synonyms=synonyms)
178 | ```
179 |
180 | At this point, AutoComplete has created a [dwg](#dwg) structure.
181 |
182 | Now you can search!
183 |
184 | - word: the word to return autocomplete results for
185 | - max_cost: Maximum Levenshtein edit distance to be considered when calculating results
186 | - size: The max number of results to return
187 |
188 | ```py
189 | >>> autocomplete.search(word='2018 bmw 1', max_cost=3, size=3)
190 | [['2018', 'bmw'], ['2018', 'bmw 1 series']]
191 | ```
192 |
193 | Now what if we pressed `a` by mistake? It still works. No problem.
194 |
195 | ```py
196 | >>> autocomplete.search(word='2018 bmw 1a', max_cost=3, size=3)
197 | [['2018', 'bmw'], ['2018', 'bmw 1 series']]
198 | ```
199 |
200 | Ok let's search for Alfa now:
201 |
202 | ```py
203 | >>> autocomplete.search(word='alfa', max_cost=3, size=3)
204 | [['alfa romeo'], ['alfa romeo 4c'], ['alfa romeo giulia']]
205 | ```
206 |
207 | What if we don't know how to pronounce alfa and we type `alpha`?
208 |
209 | ```py
210 | >>> autocomplete.search(word='alpha', max_cost=3, size=3)
211 | [['alfa romeo'], ['alfa romeo 4c'], ['alfa romeo giulia']]
212 | ```
213 |
214 | It still works!
215 |
216 | Fast-Autocomplete makes sure the results make sense!
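
The fuzzy matching is driven by the `max_cost` parameter: a candidate word is only considered when its Levenshtein distance to your input is below `max_cost`. As a quick sanity check (using the same `autocomplete` object as above), setting `max_cost=0` disables fuzzy matching entirely:

```py
>>> autocomplete.search(word='alpha', max_cost=0, size=3)
[]
```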
217 |
218 | Ok, let's add the words `los angeles` and `in` to the words dictionary, and re-initialize the autocomplete so the new words get inserted into the graph:
219 |
220 |
221 | ```py
222 | >>> words['los angeles'] = {}
223 | >>> words['in'] = {}
224 | >>> autocomplete = AutoComplete(words=words, synonyms=synonyms); autocomplete.search(word='2007 alfa in los', max_cost=3, size=3)
225 | [['2007', 'alfa romeo', 'in'], ['2007', 'alfa romeo', 'in', 'los angeles']]
226 | ```
227 |
228 | So far we have not used the context, and this library leaves it up to you how to use it. But basically, if we give a context to each one of those words, then the above response can easily be translated into the list of those contexts.
229 |
230 | ## Context
231 |
232 | If our words dictionary was:
233 |
234 | ```py
235 | words = {
236 | 'in': {},
237 | 'alfa romeo': {'type': 'make'},
238 | '2007': {'type': 'year'},
239 | 'los angeles': {'type': 'location'},
240 | }
241 | ```
242 |
243 | Then the `autocomplete.words` can be used to map the results into their context:
244 |
245 | ```
246 | [['2007', 'alfa romeo', 'in'], ['2007', 'alfa romeo', 'in', 'los angeles']]
247 |
248 | converted to contexts:
249 |
250 | [[{'year': '2007'}, {'make': 'alfa romeo'}], [{'year': '2007'}, {'make': 'alfa romeo'}, {'location': 'los angeles'}]]
251 | ```
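
As a minimal sketch (assuming the `words` dictionary above, with the `'type'` keys shown), the conversion can be done by looking each matched word up in the words dictionary and skipping the words that have an empty context:

```py
def to_contexts(results, words):
    # Map each matched word to a {type: word} dict; words with an
    # empty context (like 'in') are dropped from the output.
    return [
        [{words[word]['type']: word} for word in result if words[word]]
        for result in results
    ]


results = [['2007', 'alfa romeo', 'in'], ['2007', 'alfa romeo', 'in', 'los angeles']]
print(to_contexts(results, words))
```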
252 |
253 | ## Sorting
254 |
255 | Most people who use Fast Autocomplete want to control how results are sorted. If you don't control that, the results will be sorted based on the order in which Autocomplete found the nodes in the graph that matched the criteria.
256 |
257 | The easiest way to sort is to give each item a count. **Fast AutoComplete will use the count to sort items that are partial matches.**
258 |
259 | For example:
260 |
261 | 1. Make a json file that is a dictionary of words to their context.
262 |
263 | The format of the file needs to be:
264 |
265 | ```json
266 |
267 | {
268 | word: [
269 | context,
270 | display value,
271 | count
272 | ]
273 | }
274 | ```
275 |
276 | An example is included in [sample_words.json](tests/fixtures/sample_words.json):
277 |
278 | ```json
279 | {
280 | "acura rlx": [
281 | {
282 | "model": "rlx",
283 | "make": "acura"
284 | },
285 | "Acura RLX",
286 | 3132
287 | ],
288 | "rlx": [
289 | {
290 | "model": "rlx",
291 | "make": "acura"
292 | },
293 | "Acura RLX",
294 | 3132
295 | ],
296 | "acura": [
297 | {
298 | "make": "acura"
299 | },
300 | "Acura",
301 | 130123
302 | ],
303 | ...
304 | }
305 | ```
306 |
307 | You might be wondering why things are in this format. It is to save space: this json can easily become very big, and the keys would become repetitive. That's why we are using a list with a predefined order of keys. For now you can leave the context and display values as None if you want. We will open source other factory functions soon that will fully utilize those keys in the context.
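
For example, a minimal (hypothetical) entry that only cares about the count could leave the context and display slots as `null`:

```json
{
    "toyota": [null, null, 130123]
}
```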
308 |
309 | 2. Launch Autocomplete via the factory function:
310 |
311 | ```py
312 | from fast_autocomplete import autocomplete_factory
313 |
314 | content_files = {
315 | 'words': {
316 |         'filepath': 'path/to/sample_words.json',
317 | 'compress': True # means compress the graph data in memory
318 | }
319 | }
320 |
321 | autocomplete = autocomplete_factory(content_files=content_files)
322 | ```
323 |
324 | 3. You can use Autocomplete and the results are ordered by count!
325 |
326 |
327 | ```py
328 | >>> autocomplete.search(word='acu')
329 | [['acura'], ['acura mdx'], ['acura rdx']]
330 | ```
331 |
332 | 4. How do we use the context and display value now?
333 |
334 | Great question. You need to extend the AutoComplete class to use these items. I will write a blog post about it.
335 |
336 | Here is a simple example without any extending:
337 |
338 | ```py
339 | >>> autocomplete.words['acura']
340 | WordValue(context={'make': 'acura'}, display='Acura', count=130123, original_key=None)
341 | >>> autocomplete.words['acura'].display
342 | 'Acura'
343 | ```
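
Until then, here is a minimal sketch (a helper of our own, not an official API) that assumes every matched word is a key in `autocomplete.words`, and maps each matched word to its display value:

```py
def search_display(autocomplete, word, max_cost=2, size=5):
    # Run the normal search, then replace each matched word with
    # the `display` attribute of its WordValue.
    results = autocomplete.search(word=word, max_cost=max_cost, size=size)
    return [[autocomplete.words[w].display for w in result] for result in results]


print(search_display(autocomplete, 'acu'))
# e.g. [['Acura'], ['Acura MDX'], ['Acura RDX']]
```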
344 |
345 | ### Change the sorting by updating counts
346 |
347 | Fast Autocomplete by default uses the "count" of the items to sort them in the results. Think of these counts as a "guide" that Fast Autocomplete uses to polish its results. Depending on whether or not Fast Autocomplete finds exact matches to the user's query, the counts will be used to refine the results. You can update the counts in an autocomplete object live.
348 |
349 | For example, in the [sample csv of car makes and models](tests/fixtures/makes_models_from_wikipedia.csv) we have:
350 |
351 | ```csv
352 | make,model,count
353 | Toyota,Aurion,6094
354 | Toyota,Avalon,8803
355 | Toyota,Avensis,1630
356 | Toyota,Auris,4025
357 | Toyota,Aygo,2115
358 | ```
359 |
360 | If we use the autocomplete to search:
361 |
362 | ```py
363 | >>> auto_complete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS, full_stop_words=['bmw', 'alfa romeo'])
364 | >>> auto_complete.search(word='toyota a')
365 | [['toyota'], ['toyota avalon'], ['toyota aurion'], ['toyota auris']]
366 | ```
367 |
368 | However, as you can see, `toyota aygo` had a count of only 2115 and thus didn't make it into the top 3 results.
369 |
370 | We can set the count for `toyota aygo` to a higher number to boost it in the results using `update_count_of_word`.
371 |
372 | The `update_count_of_word` can change the count via setting the word's count directly or by offsetting its current value.
373 |
374 | ```py
375 | >>> auto_complete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS, full_stop_words=['bmw', 'alfa romeo'])
376 | >>> auto_complete.update_count_of_word(word='toyota aygo', count=10000)
377 | 10000
378 | ```
379 |
380 | Now if we search:
381 |
382 | ```py
383 | >>> auto_complete.search(word='toyota a')
384 | [['toyota'], ['toyota aygo'], ['toyota avalon'], ['toyota aurion']]
385 | ```
386 |
387 | We can double check the count of a node:
388 |
389 | ```py
390 | >>> auto_complete.get_count_of_word('toyota aygo')
391 | 10000
392 | ```
393 |
394 | Now let's use the offset to shift the current count of a different node:
395 |
396 |
397 | ```py
398 | >>> auto_complete.update_count_of_word(word='toyota aurion', offset=-6000)
399 | 94
400 | ```
401 |
402 | When we search, `toyota aurion` is not in the top 3 results anymore!
403 |
404 | ```py
405 | >>> auto_complete.search(word='toyota a')
406 | [['toyota'], ['toyota aygo'], ['toyota avalon'], ['toyota auris']]
407 | ```
408 |
409 |
410 | ## Unicode
411 |
412 | By default this package only accepts lowercase ASCII letters, a-z. However, you can pass the characters that you want to be acceptable via `valid_chars_for_string` for strings, and `valid_chars_for_integer` for numbers. For example, here we tell Autocomplete to consider the Farsi alphabet characters as string characters.
413 |
414 | ```python
415 | AutoComplete(
416 | words=SHORT_WORDS_UNICODE,
417 | valid_chars_for_string='اآبپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی')
418 | ```
419 |
420 | If you want to pass other characters in addition to ASCII letters, such as punctuation marks, you need to set the `valid_chars_for_string` variable to include all of the characters you need. For example, the following code block sets ASCII letters a-z along with periods and apostrophes:
421 |
422 | ```python
423 | import string
424 | valid_chars = ".'" + string.ascii_lowercase
425 | AutoComplete(
426 | words=WORDS_WITH_PUNCTUATION,
427 | valid_chars_for_string=valid_chars)
428 | ```
429 |
430 |
431 | ## Draw
432 |
433 | This package can actually draw the dwg as it is being populated, or just once after the dwg is populated!
434 | Here is the animation of populating the dwg with words from "makes_models_short.csv":
435 |
436 |
437 | ### Draw animation of dwg populating
438 |
439 | ```py
440 | from fast_autocomplete import AutoComplete, DrawGraphMixin
441 |
442 |
443 | class AutoCompleteDraw(DrawGraphMixin, AutoComplete):
444 | DRAW_POPULATION_ANIMATION = True
445 | DRAW_POPULATION_ANIMATION_PATH = 'animation/short_.svg'
446 | DRAW_POPULATION_ANIMATION_FILENO_PADDING = 6
447 |
448 |
449 | autocomplete = AutoCompleteDraw(words=words, synonyms=synonyms)
450 | ```
451 |
452 | As soon as you initialize the above AutoCompleteDraw class, it will populate the dwg and generate the animation!
453 | For an example of this code properly set up, take a look at the tests. In fact, the animation in the [dwg](#dwg) section is generated the same way via unit tests!
454 |
455 | Note that if you have many words, the graph file will be big. Instead of drawing all the frames as the dwg is being populated, you can draw just the final stage:
456 |
457 | ### Draw the final graph
458 |
459 | To draw just one graph that shows the final stage of the dwg, use the draw mixin and run the draw_graph function:
460 |
461 | ```py
462 | from fast_autocomplete import AutoComplete, DrawGraphMixin
463 |
464 |
465 | class AutoCompleteDraw(DrawGraphMixin, AutoComplete):
466 | pass
467 |
468 | autocomplete = AutoCompleteDraw(words=words, synonyms=synonyms)
469 | autocomplete.draw_graph('path to file')
470 | ```
471 |
472 | ## Demo
473 |
474 | If you want to have real-time interaction with Autocomplete results in your terminal, you can use the demo module.
475 |
476 | Just pass it an instance of the autocomplete and the search configs:
477 |
478 | ```py
479 | from fast_autocomplete import demo
480 |
481 | demo(autocomplete, max_cost=3, size=5)
482 | ```
483 |
485 |
486 | # Develop
487 |
488 | 1. Clone the repo
489 | 2. Make a virtualenv with Python 3.6 or newer
490 | 3. `pip install -r requirements-dev.txt`
491 |
492 | ## Run tests
493 |
494 | `pytest`
495 |
496 | We try to maintain a high standard of code coverage. Currently the `dwg` module's coverage is around 99%!
497 |
498 | ## Releases
499 |
500 | We use bump2version to bump and tag releases.
501 |
502 | ```bash
503 | git checkout master && git pull
504 | bump2version {patch|minor|major}
505 | git push && git push --tags
506 | ```
507 |
508 | # Authors
509 |
510 | - Autocomplete written by [Sep Dehpour](http://zepworks.com).
511 | - LFU Cache by [Shane Wang](https://medium.com/@epicshane)
512 |
513 | # Other ways of doing AutoComplete
514 |
515 | 1. Elasticsearch. Yes, Elasticsearch generally is a *better* autocomplete solution than this library. I said generally. In our specific use case, we wanted autocomplete to be faster than Elasticsearch and to handle combinations of words. Otherwise Elasticsearch would have been perfect. Behind the scenes, Elasticsearch uses a Finite State Transducer (FST) in Lucene to achieve autocomplete. FST is more complicated than what we have used in fast-autocomplete.
516 |
517 | 2. If your autocomplete is supposed to return results based on a big blob of text (for example based on some book contents), then a better solution is to go with Markov chains and conditional probability. Yes, there is already a library out there for it, and it looks great. Disclaimer: we have not actually used it since it doesn't fit our specific use case.
518 |
519 |
520 | # FAQ
521 |
522 | ## Why DWG
523 | DWG stands for Directed Word Graph. Originally we were using a Trie-tree structure, but it soon became obvious that some branches needed to merge back into other branches. For example, the `beemer` and `bmw` branches both need to end in the same node since they are synonyms. Thus we used a DWG.
524 |
525 | ## What are synonyms, clean synonyms and partial synonyms
526 | Synonyms are words that should produce the same results.
527 |
528 | - For example `beemer` and `bmw` should both give you `bmw`.
529 | - `alfa` and `alfa romeo` should both give you `alfa romeo`
530 |
531 | The synonyms get divided into 2 groups:
532 |
533 | 1. clean synonyms: The 2 words share little or no words. For example `beemer` vs. `bmw`.
534 | 2. partial synonyms: One of the 2 words is a substring of the other one. For example `alfa` and `alfa romeo` or `gm` vs. `gmc`.
535 |
536 | Internally these 2 types of synonyms are treated differently, but as a user of the library you don't really need to care about it. You just provide the synonyms dictionary when constructing the AutoComplete object.
537 |
538 | ## Why do you have a whole subtree for partial synonyms
539 | Q: A partial synonym means the synonym is a part of the original word, such as `alfa` being a partial synonym for `alfa romeo`.
540 | In that case you are inserting both `alfa` and `alfa romeo` into the dwg. `alfa` will have `alfa 4c` branches and `alfa romeo` will have `alfa romeo 4c` branches. Why not just have `alfa` branch into `alfa romeo`'s node, so that it automatically gets all the sub-branches of `alfa romeo`?
541 |
542 | Answer: We use letters for edges. So `alfa` can have only one edge coming out of it that is a space (` `). That edge goes to a node that has sub-branches to `alfa romeo`, `alfa 4c`, etc. It can't have one ` ` going to that node and another ` ` going to `alfa romeo`'s immediate child. That way, when we are traversing the dwg for the input `alfa 4`, we get to the correct node.
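
Here is a minimal sketch that walks the internal graph letter by letter to confirm this, using the words and synonyms from Example 2 (note that `_dwg` and the node's `__getitem__` are implementation details, not a public API):

```py
>>> autocomplete = AutoComplete(words=words, synonyms=synonyms)
>>> node = autocomplete._dwg
>>> for char in 'alfa':
...     node = node[char]  # follow one letter-edge per character
...
>>> sorted(node.children.keys())  # the only edge out of `alfa` is a space
[' ']
```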
543 |
544 | ## I put Toyota in the Dawg but when I type `toy`, it doesn't show up.
545 |
546 | Answer: If you put `Toyota` with a capital T in the dwg, it expects the search word to start with a capital T too. We suggest that you lowercase everything before putting it in the dwg. Fast-autocomplete does not automatically do that for you, since it assumes the `words` dictionary is exactly what you want put in the dwg. It is up to you to clean your own data before putting it in the dwg.
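
A minimal sketch of that cleaning step (assuming a `raw_words` dictionary of your own):

```py
>>> raw_words = {'Toyota': {}, 'Toyota Camry': {}}
>>> words = {key.lower(): context for key, context in raw_words.items()}
>>> autocomplete = AutoComplete(words=words)
>>> autocomplete.search(word='toy', max_cost=3, size=3)
[['toyota'], ['toyota camry']]
```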
547 |
--------------------------------------------------------------------------------
/tests/test_autocomplete.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import json
3 | import os
4 | import pytest
5 | import string
6 | from pprint import pprint
7 | from typing import NamedTuple
8 |
9 | from fast_autocomplete.misc import read_csv_gen
10 | from fast_autocomplete import AutoComplete, DrawGraphMixin
11 | from fast_autocomplete.dwg import FindStep
12 |
13 |
14 | current_dir = os.path.dirname(os.path.abspath(__file__))
15 |
16 | WHAT_TO_PRINT = {'word', 'results', 'expected_results', 'result',
17 | 'find_steps', 'expected_steps', 'search_results', 'search_results_immutable'}
18 |
19 |
20 | class Info(NamedTuple):
21 |     make: str = None
22 |     model: str = None
23 |     original_key: str = None
24 | count: int = 0
25 |
26 | def get(self, key, default=None):
27 | return getattr(self, key, default)
28 |
29 | __get__ = get
30 |
31 |
32 | def parameterize_cases(cases):
33 | return [tuple(i.values()) for i in cases]
34 |
35 |
36 | def print_results(local_vars):
37 | common = WHAT_TO_PRINT & set(local_vars.keys())
38 | for key in common:
39 | print(f'- {key}:')
40 | pprint(local_vars[key])
41 |
42 |
43 | def get_words(path):
44 |
45 | file_path = os.path.join(current_dir, path)
46 | csv_gen = read_csv_gen(file_path, csv_func=csv.DictReader)
47 |
48 | words = {}
49 |
50 | for line in csv_gen:
51 | make = line['make'].lower()
52 | model = line['model'].lower()
53 | if make != model:
54 | local_words = [model, '{} {}'.format(make, model)]
55 | while local_words:
56 | word = local_words.pop()
57 | if word not in words:
58 | words[word] = dict(line)
59 | if make not in words:
60 | words[make] = {"make": make}
61 |
62 | words['truck'] = {'make': 'truck'}
63 | return words
64 |
65 |
66 | WIKIPEDIA_WORDS = get_words('fixtures/makes_models_from_wikipedia.csv')
67 |
68 | SHORT_WORDS = get_words('fixtures/makes_models_short.csv')
69 |
70 | SHORT_WORDS_UNICODE = get_words('fixtures/makes_models_in_farsi_short.csv')
71 |
72 | SHORT_WORDS_IMMUTABLE_INFO = {key: Info(**value) for key, value in SHORT_WORDS.items()}
73 |
74 |
75 | with open(os.path.join(current_dir, 'fixtures/synonyms.json'), 'r') as the_file:
76 | SYNONYMS = json.loads(the_file.read())
77 |
78 |
79 | class TestAutocomplete:
80 |
81 | @pytest.mark.parametrize("word, max_cost, size, expected_results", [
82 | ('bmw', 2, 3, {0: [['bmw']], 1: [['bmw 1 series'], ['bmw e28'], ['bmw e30'], ['bmw e34']]}),
83 | ('beemer', 2, 3, {}),
84 | ('honda covic', 2, 3, {0: [['honda']], 1: [['honda', 'civic'], ['honda', 'civic type r']]}),
85 | ])
86 | def test_search_without_synonyms(self, word, max_cost, size, expected_results):
87 | auto_complete = AutoComplete(words=WIKIPEDIA_WORDS)
88 | results, find_steps = auto_complete._find(word, max_cost, size)
89 | results = dict(results)
90 | print_results(locals())
91 | assert expected_results == results
92 |
93 | @pytest.mark.parametrize("word, max_cost, size, expected_results", [
94 | ('بی ام و', 2, 3, {0: [['بی ام و']], 1: [['بی ام و 1 series'], ['بی ام و 2 series']]}),
95 | ])
96 | def test_search_unicode_without_synonyms(self, word, max_cost, size, expected_results):
97 | auto_complete = AutoComplete(
98 | words=SHORT_WORDS_UNICODE,
99 | valid_chars_for_string='اآبپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی')
100 | results, find_steps = auto_complete._find(word, max_cost, size)
101 | results = dict(results)
102 | print_results(locals())
103 | assert expected_results == results
104 |
105 | def test_autocomplete_synonym_part_of_another_word(self):
106 | words = {'cartoon': {}, 'vehicle': {}}
107 | synonyms = {'vehicle': ['car']}
108 | autocomplete = AutoComplete(words=words, synonyms=synonyms)
109 | result = autocomplete.search(word='ca')
110 | assert [['vehicle'], ['cartoon']] == result
111 |
112 | def test_special_characters(self):
113 | words = {'abcd(efgh)ijk': {}, 'u (2 off)': {}}
114 | autocomplete = AutoComplete(words=words, valid_chars_for_string=string.ascii_letters + string.punctuation)
115 | # result = autocomplete.search(word='abcd(efgh)')
116 | # assert [['abcd(efgh)ijk']] == result
117 |
118 | result2 = autocomplete.search(word='u (2 o')
119 | assert [['u (2 off)']] == result2
120 |
121 |
122 | STEP_DESCENDANTS_ONLY = [FindStep.descendants_only]
123 | STEP_FUZZY_FOUND = [FindStep.fuzzy_try, FindStep.fuzzy_found]
124 |
125 | SEARCH_CASES = [
126 | {'word': ' ',
127 | 'max_cost': 3,
128 | 'size': 3,
129 | 'expected_find_results': {1: [['1 series'], ['bmw 1 series'], ['spirior'], ['honda spirior']]},
130 | 'expected_steps': STEP_DESCENDANTS_ONLY,
131 | 'expected_find_and_sort_results': [['1 series'], ['bmw 1 series'], ['spirior']],
132 | },
133 | {'word': '',
134 | 'max_cost': 3,
135 | 'size': 3,
136 | 'expected_find_results': {1: [['1 series'], ['bmw 1 series'], ['spirior'], ['honda spirior']]},
137 | 'expected_steps': STEP_DESCENDANTS_ONLY,
138 | 'expected_find_and_sort_results': [['1 series'], ['bmw 1 series'], ['spirior']],
139 | },
140 | {'word': 'c',
141 | 'max_cost': 3,
142 | 'size': 3,
143 | 'expected_find_results': {0: [['c']], 1: [['charger'], ['chrysler charger'], ['chrysler d'], ['crown']]},
144 | 'expected_steps': STEP_DESCENDANTS_ONLY,
145 | 'expected_find_and_sort_results': [['c'], ['charger'], ['chrysler charger']],
146 | },
147 | {'word': 'ca',
148 | 'max_cost': 3,
149 | 'size': 3,
150 | 'expected_find_results': {1: [['california'], ['caddy'], ['camry'], ['cabriolet']]},
151 | 'expected_steps': STEP_DESCENDANTS_ONLY,
152 | 'expected_find_and_sort_results': [['california'], ['caddy'], ['camry']],
153 | },
154 | {'word': 'camr',
155 | 'max_cost': 3,
156 | 'size': 6,
157 | 'expected_find_results': {1: [['camry']]},
158 | 'expected_steps': STEP_DESCENDANTS_ONLY,
159 | 'expected_find_and_sort_results': [['camry']],
160 | },
161 | {'word': '4d',
162 | 'max_cost': 3,
163 | 'size': 3,
164 | 'expected_find_results': {1: [['4runner'], ['4c']]},
165 | 'expected_steps': STEP_DESCENDANTS_ONLY,
166 | 'expected_find_and_sort_results': [['4runner'], ['4c']],
167 | },
168 | {'word': '2018 alpha ',
169 | 'max_cost': 3,
170 | 'size': 3,
171 | 'expected_find_results': {0: [['2018']],
172 | 2: [['2018', 'alfa romeo'],
173 | ['2018', 'alfa romeo 2300'],
174 | ['2018', 'alfa romeo montreal'],
175 | ['2018', 'alfa romeo 90'],
176 | ['2018', 'alfa romeo gtv']]},
177 | 'expected_steps': STEP_FUZZY_FOUND,
178 | 'expected_find_and_sort_results': [['2018'], ['2018', 'alfa romeo'], ['2018', 'alfa romeo 2300']],
179 | },
180 | {'word': '2018 alpha romeo 4d',
181 | 'max_cost': 3,
182 | 'size': 4,
183 | 'expected_find_results': {0: [['2018']],
184 | 1: [['2018', 'alfa romeo 2300'],
185 | ['2018', 'alfa romeo montreal'],
186 | ['2018', 'alfa romeo 90'],
187 | ['2018', 'alfa romeo gtv'],
188 | ['2018', 'alfa romeo 6c']],
189 | 2: [['2018', 'alfa romeo', 'ameo']]},
190 | 'expected_steps': [FindStep.fuzzy_try, FindStep.fuzzy_found, {FindStep.rest_of_fuzzy_round2: [FindStep.fuzzy_try, FindStep.fuzzy_found]}, FindStep.not_enough_results_add_some_descandants],
191 | 'expected_find_and_sort_results': [['2018'],
192 | ['2018', 'alfa romeo 2300'],
193 | ['2018', 'alfa romeo montreal'],
194 | ['2018', 'alfa romeo 90']],
195 | },
196 | {'word': '2018 alpha',
197 | 'max_cost': 3,
198 | 'size': 3,
199 | 'expected_find_results': {0: [['2018']],
200 | 2: [['2018', 'alfa romeo'],
201 | ['2018', 'alfa romeo 2300'],
202 | ['2018', 'alfa romeo montreal'],
203 | ['2018', 'alfa romeo 90'],
204 | ['2018', 'alfa romeo gtv']]},
205 | 'expected_steps': STEP_FUZZY_FOUND,
206 | 'expected_find_and_sort_results': [['2018'], ['2018', 'alfa romeo'], ['2018', 'alfa romeo 2300']],
207 | },
208 | {'word': '2018 alfa',
209 | 'max_cost': 3,
210 | 'size': 3,
211 | 'expected_find_results': {0: [['2018', 'alfa romeo']],
212 | 1: [['2018', 'alfa romeo 2300'],
213 | ['2018', 'alfa romeo montreal'],
214 | ['2018', 'alfa romeo 90'],
215 | ['2018', 'alfa romeo gtv']]},
216 | 'expected_steps': STEP_DESCENDANTS_ONLY,
217 | 'expected_find_and_sort_results': [['2018', 'alfa romeo'], ['2018', 'alfa romeo 2300'], ['2018', 'alfa romeo montreal']],
218 | },
219 | {'word': '2018 alfg',
220 | 'max_cost': 3,
221 | 'size': 3,
222 | 'expected_find_results': {0: [['2018']],
223 | 1: [['2018', 'alfa romeo 2300'],
224 | ['2018', 'alfa romeo montreal'],
225 | ['2018', 'alfa romeo 90'],
226 | ['2018', 'alfa romeo gtv']]},
227 | 'expected_steps': STEP_DESCENDANTS_ONLY,
228 | 'expected_find_and_sort_results': [['2018'], ['2018', 'alfa romeo 2300'], ['2018', 'alfa romeo montreal']],
229 | },
230 | {'word': '2018 glfa',
231 | 'max_cost': 3,
232 | 'size': 3,
233 | 'expected_find_results': {0: [['2018']], 1: [['2018', 'gla']]},
234 | 'expected_steps': STEP_DESCENDANTS_ONLY,
235 | 'expected_find_and_sort_results': [['2018'], ['2018', 'gla']],
236 | },
237 | {'word': '2018 doyota',
238 | 'max_cost': 3,
239 | 'size': 3,
240 | 'expected_find_results': {0: [['2018']],
241 | 1: [['2018', 'toyota'],
242 | ['2018', 'toyota crown'],
243 | ['2018', 'toyota prius'],
244 | ['2018', 'toyota avalon'],
245 | ['2018', 'toyota dyna']]},
246 | 'expected_steps': STEP_FUZZY_FOUND,
247 | 'expected_find_and_sort_results': [['2018'], ['2018', 'toyota'], ['2018', 'toyota crown']],
248 | },
249 | {'word': '2018 doyota camr',
250 | 'max_cost': 3,
251 | 'size': 3,
252 | 'expected_find_results': {0: [['2018']],
253 | 1: [['2018', 'toyota', 'camry'],
254 | ['2018', 'dyna'],
255 | ['2018', 'dauphine'],
256 | ['2018', 'drifter']]},
257 | 'expected_steps': [FindStep.fuzzy_try, FindStep.fuzzy_found, {FindStep.rest_of_fuzzy_round2: [FindStep.descendants_only]}, FindStep.not_enough_results_add_some_descandants],
258 | 'expected_find_and_sort_results': [['2018'], ['2018', 'toyota', 'camry'], ['2018', 'dyna']],
259 | },
260 | {'word': '2018 beemer',
261 | 'max_cost': 3,
262 | 'size': 3,
263 | 'expected_find_results': {0: [['2018', 'bmw']],
264 | 1: [['2018', 'bmw 1 series'],
265 | ['2018', 'bmw e28'],
266 | ['2018', 'bmw e30'],
267 | ['2018', 'bmw e34']]},
268 | 'expected_steps': STEP_DESCENDANTS_ONLY,
269 | 'expected_find_and_sort_results': [['2018', 'bmw'], ['2018', 'bmw 1 series'], ['2018', 'bmw e28']],
270 | },
271 | {'word': '2018 beener',
272 | 'max_cost': 3,
273 | 'size': 3,
274 | 'expected_find_results': {0: [['2018']],
275 | 1: [['2018', 'bmw 1 series'],
276 | ['2018', 'bmw e28'],
277 | ['2018', 'bmw e30'],
278 | ['2018', 'bmw e34']]},
279 | 'expected_steps': [FindStep.fuzzy_try, FindStep.not_enough_results_add_some_descandants],
280 | 'expected_find_and_sort_results': [['2018'], ['2018', 'bmw 1 series'], ['2018', 'bmw e28']],
281 | },
282 | {'word': 'vw bea',
283 | 'max_cost': 3,
284 | 'size': 3,
285 | 'expected_find_results': {0: [['volkswagen']], 1: [['volkswagen beetle']]},
286 | 'expected_steps': STEP_DESCENDANTS_ONLY,
287 | 'expected_find_and_sort_results': [['volkswagen'], ['volkswagen beetle']],
288 | },
289 | {'word': 'toyota camry 2018',
290 | 'max_cost': 3,
291 | 'size': 5,
292 | 'expected_find_results': {0: [['toyota camry', '2018']]},
293 | 'expected_steps': STEP_DESCENDANTS_ONLY,
294 | 'expected_find_and_sort_results': [['toyota camry', '2018']],
295 | },
296 | {'word': 'type r',
297 | 'max_cost': 3,
298 | 'size': 5,
299 | 'expected_find_results': {0: [['type r']]},
300 | 'expected_steps': STEP_DESCENDANTS_ONLY,
301 | 'expected_find_and_sort_results': [['type r']],
302 | },
303 | {'word': 'truck',
304 | 'max_cost': 3,
305 | 'size': 5,
306 | 'expected_find_results': {0: [['truck']]},
307 | 'expected_steps': STEP_DESCENDANTS_ONLY,
308 | 'expected_find_and_sort_results': [['truck']],
309 | },
310 | {'word': 'trucks',
311 | 'max_cost': 3,
312 | 'size': 5,
313 | 'expected_find_results': {0: [['truck']]},
314 | 'expected_steps': STEP_DESCENDANTS_ONLY,
315 | 'expected_find_and_sort_results': [['truck']],
316 | },
317 | {'word': '1se',
318 | 'max_cost': 3,
319 | 'size': 5,
320 | 'expected_find_results': {1: [['1 series']]},
321 | 'expected_steps': STEP_DESCENDANTS_ONLY,
322 | 'expected_find_and_sort_results': [['1 series']],
323 | },
324 | ]
325 |
326 |
327 | SEARCH_CASES_PARAMS = parameterize_cases(SEARCH_CASES)
328 |
329 |
330 | class TestAutocompleteWithSynonyms:
331 |
332 | @pytest.mark.parametrize("word, max_cost, size, expected_find_results, expected_steps, expected_find_and_sort_results", SEARCH_CASES_PARAMS)
333 | def test_find(self, word, max_cost, size, expected_find_results, expected_steps, expected_find_and_sort_results):
334 | expected_results = expected_find_results
335 | auto_complete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS)
336 | results, find_steps = auto_complete._find(word, max_cost, size)
337 | results = dict(results)
338 | print_results(locals())
339 | assert expected_results == results
340 | assert expected_steps == find_steps
341 |
342 | @pytest.mark.parametrize("word, max_cost, size, expected_find_results, expected_steps, expected_find_and_sort_results", SEARCH_CASES_PARAMS)
343 | def test__find_and_sort(self, word, max_cost, size, expected_find_results, expected_steps, expected_find_and_sort_results):
344 | expected_results = expected_find_and_sort_results
345 | auto_complete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS)
346 | results = auto_complete._find_and_sort(word, max_cost, size)
347 | results = list(results)
348 | search_results = auto_complete.search(word, max_cost, size)
349 | print_results(locals())
350 | assert expected_results == results
351 | if word.strip():
352 | assert expected_results == search_results
353 | else:
354 | assert [] == search_results
355 |
356 | @pytest.mark.parametrize("word", [
357 | 'alf',
358 | ])
359 | def test_immutable_info(self, word):
360 | auto_complete = AutoComplete(words=SHORT_WORDS, synonyms=SYNONYMS)
361 | auto_complete_immutable = AutoComplete(words=SHORT_WORDS_IMMUTABLE_INFO, synonyms=SYNONYMS)
362 | search_results = auto_complete._find(word, max_cost=3, size=3)
363 | search_results_immutable = auto_complete_immutable._find(word, max_cost=3, size=3)
364 | print_results(locals())
365 | assert search_results_immutable == search_results
366 |
367 |
368 | class AutoCompleteWithSynonymsShort(DrawGraphMixin, AutoComplete):
369 | pass
370 |
371 |
372 | class AutoCompleteWithSynonymsShortWithAnim(AutoCompleteWithSynonymsShort):
373 |
374 | DRAW_POPULATION_ANIMATION = True
375 | DRAW_POPULATION_ANIMATION_PATH = os.path.join(current_dir, 'animation/short_.svg')
376 | DRAW_POPULATION_ANIMATION_FILENO_PADDING = 6
377 |
378 |
379 | class TestAutoCompleteWithSynonymsShortGraphDraw:
380 |
381 | def test_draw_graph(self):
382 | auto_complete = AutoCompleteWithSynonymsShort(words=SHORT_WORDS)
383 | file_path = os.path.join(current_dir, 'AutoCompleteWithSynonymsShort_Graph.svg')
384 | auto_complete.draw_graph(file_path)
385 |
386 | def test_draw_graph_animation(self):
387 | AutoCompleteWithSynonymsShortWithAnim(words=SHORT_WORDS)
388 |
389 |
390 | class TestPrefixAndDescendants:
391 |
392 | @pytest.mark.parametrize("word, expected_matched_prefix_of_last_word, expected_rest_of_word, expected_matched_words, expected_node_path", [
393 | ('2018 alpha blah blah', 'al', 'pha blah blah', ['2018'], 'a,l'),
394 | ('2018 alpha ', 'al', 'pha ', ['2018'], 'a,l'),
395 | ('2018 alfa', '', '', ['2018', 'alfa romeo'], 'a,l,f,a'),
396 | ('2018 alf', 'alf', '', ['2018'], 'a,l,f'),
397 | ('2018 alfa romeo', '', '', ['2018', 'alfa romeo'], 'a,l,f,a, ,r,o,m,e,o'),
398 | ('1 series bmw 2007 2018', '', '', ['1 series', 'bmw', '2007', '2018'], '2,0,1,8'),
399 | ('200 chrysler', '', '', ['200', 'chrysler'], 'c,h,r,y,s,l,e,r'),
400 | ('200 chrysler 200', '', '', ['200', 'chrysler 200'], 'c,h,r,y,s,l,e,r, ,2,0,0'),
401 | ('chrysler 2007', '', '', ['chrysler', '2007'], '2,0,0,7'),
402 | ('type r', '', '', ['type r'], 't,y,p,e, ,r'),
403 | ])
404 | def test_prefix_autofill(self, word, expected_matched_prefix_of_last_word,
405 | expected_rest_of_word, expected_matched_words, expected_node_path):
406 | auto_complete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS)
407 | matched_prefix_of_last_word, rest_of_word, node, matched_words = auto_complete._prefix_autofill(word)
408 | print(f'word: {word}')
409 | print(f'expected_matched_prefix_of_last_word: {expected_matched_prefix_of_last_word}')
410 | print(f'matched_prefix_of_last_word: {matched_prefix_of_last_word}')
411 | print(f'expected_rest_of_word: {expected_rest_of_word}')
412 | print(f'rest_of_word: {rest_of_word}')
413 | print(f'node: {node}')
414 | print(f'expected_matched_words: {expected_matched_words}')
415 | print(f'matched_words: {matched_words}')
416 | expected_node = auto_complete._dwg
417 | for k in expected_node_path.split(','):
418 | expected_node = expected_node[k]
419 | assert expected_node is node
420 | assert expected_matched_prefix_of_last_word == matched_prefix_of_last_word
421 | assert expected_rest_of_word == rest_of_word
422 | assert expected_matched_words == matched_words
423 |
424 | @pytest.mark.parametrize("word, expected_results", [
425 | ('2018 alpha ', ['alfa', 'alfa rl', 'alfa rm']),
426 | ('1 series bmw 2', ['bmw 2 series']),
427 | ('2018 alfa', ['alfa rl', 'alfa rm', 'alfa 33']),
428 | ])
429 | def test_get_descendants_nodes(self, word, expected_results):
430 | auto_complete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS)
431 | matched_prefix_of_last_word, rest_of_word, node, matched_words = auto_complete._prefix_autofill(word)
432 | size = 2
433 | found_words_gen = node.get_descendants_nodes(size=size)
434 | found_words = [_node.word for _node in found_words_gen][:size + 1]
435 | print(f'word: {word}')
436 | print(f'expected_results: {expected_results}')
437 | print(f'found_words: {found_words}')
438 | assert expected_results == list(found_words)
439 |
440 | @pytest.mark.parametrize("word, expected_results", [
441 | ('r', ['rc', 'rx', 'rl', 'rm', 'r8', 'rav4', 'r107', 'r129', 'r170', 'r171', 'r230', 'r231', 'regal', 'royal', 'ridgeline']),
442 | ('benz', []),
443 | ])
444 | def test_get_all_descendent_words_for_condition1(self, word, expected_results):
445 | auto_complete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS)
446 |
447 | def condition(word_info):
448 | return 'model' in word_info
449 |
450 | size = 10
451 | results = auto_complete.get_all_descendent_words_for_condition(word=word, size=size, condition=condition)
452 | print_results(locals())
453 | # So by default we insert counts and that makes the size to be set to infinity.
454 | # I don't remember why.
455 | # This line fails then. Note that test_get_all_descendent_words_for_condition is only used in search tokenizer.
456 | # assert expected_results == results[:size + 1]
457 |
458 |
459 | class TestOther:
460 |
461 | @pytest.mark.parametrize("word, expected_results", [
462 | ('bmw', ['bmw']),
463 | ('al', ['alfa romeo']),
464 | ])
465 | def test_get_all_descendent_words_for_condition2(self, word, expected_results):
466 | auto_complete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS, full_stop_words=['bmw', 'alfa romeo'])
467 |
468 | results = auto_complete.get_tokens_flat_list(word, max_cost=0, size=3)
469 | print_results(locals())
470 | assert expected_results == results
471 |
472 | @pytest.mark.parametrize("word, expected_results", [
473 | ('bmw', {'make': 'bmw'}),
474 | ('bMw', {'make': 'bmw'}),
475 | ('al', None),
476 | ])
477 | def test_get_word_context(self, word, expected_results):
478 | auto_complete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS, full_stop_words=['bmw', 'alfa romeo'])
479 | results = auto_complete.get_word_context(word)
480 | print_results(locals())
481 | assert expected_results == results
482 |
483 | @pytest.mark.parametrize("word, update_dict, expected_results, expected_new_count", [
484 | ('toyota a', None, [['toyota'], ['toyota avalon'], ['toyota aurion'], ['toyota auris']], None),
485 | ('toyota a', {'word': 'toyota aygo', 'count': 10000}, [['toyota'], ['toyota aygo'], ['toyota avalon'], ['toyota aurion']], 10000),
486 | ('toyota a', {'word': 'toyota aurion', 'offset': -6000}, [['toyota'], ['toyota avalon'], ['toyota auris'], ['toyota aygo']], 94),
487 | ])
488 | def test_update_count_of_word(self, word, update_dict, expected_results, expected_new_count):
489 | auto_complete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS, full_stop_words=['bmw', 'alfa romeo'])
490 | if update_dict:
491 | new_count = auto_complete.update_count_of_word(**update_dict)
492 | assert expected_new_count == new_count
493 | assert expected_new_count == auto_complete.get_count_of_word(update_dict['word'])
494 | results = auto_complete.search(word, max_cost=2, size=4)
495 | print_results(locals())
496 | assert expected_results == results
497 |
--------------------------------------------------------------------------------
/fast_autocomplete/dwg.py:
--------------------------------------------------------------------------------
1 | from collections import (
2 | defaultdict,
3 | deque
4 | )
5 | from itertools import islice
6 | from enum import Enum
7 | from threading import Lock
8 | from fast_autocomplete.lfucache import LFUCache
9 | from fast_autocomplete.misc import _extend_and_repeat
10 | from fast_autocomplete.normalize import Normalizer
11 |
12 | # Prefer the 'Levenshtein' library implementation
13 | try:
14 | from Levenshtein import distance as levenshtein_distance
15 | except ImportError:
16 | try:
17 | from pylev import levenshtein as levenshtein_distance
18 | except ImportError:
19 | raise RuntimeError("""
20 | Unable to import a levenshtein distance calculation module.
21 | Please add python-Levenshtein or pylev to your Python dependencies.
22 |
23 | Installing this package as
24 |
25 | pip install fast-autocomplete[levenshtein]
26 |
27 | or
28 |
29 | pip install fast-autocomplete[pylev]
30 |
31 | Note that fast-autocomplete[levenshtein] is preferred and is much faster than fast-autocomplete[pylev]
32 | """)
33 |
34 | DELIMITER = '__'
35 | ORIGINAL_KEY = 'original_key'
36 | INF = float('inf')
37 |
38 |
39 | class NodeNotFound(ValueError):
40 | pass
41 |
42 |
43 | class FindStep(Enum):
44 | start = 0
45 | descendants_only = 1
46 | fuzzy_try = 2
47 | fuzzy_found = 3
48 | rest_of_fuzzy_round2 = 4
49 | not_enough_results_add_some_descandants = 5
50 |
51 |
52 | class AutoComplete:
53 |
54 | CACHE_SIZE = 2048
55 | SHOULD_INCLUDE_COUNT = True
56 |
57 | def __init__(
58 | self,
59 | words,
60 | synonyms=None,
61 | full_stop_words=None,
62 | logger=None,
63 | valid_chars_for_string=None,
64 | valid_chars_for_integer=None,
65 | valid_chars_for_node_name=None,
66 | ):
67 | """
68 | Initializes the Autocomplete module
69 |
70 | :param words: A dictionary of words mapped to their context
71 | :param synonyms: (optional) A dictionary of words to their synonyms.
72 | The synonym words should only be here and not repeated in words parameter.
73 | """
74 | self._lock = Lock()
75 | self._dwg = None
76 | self._raw_synonyms = synonyms or {}
77 | self._lfu_cache = LFUCache(self.CACHE_SIZE)
78 | self._clean_synonyms, self._partial_synonyms = self._get_clean_and_partial_synonyms()
79 | self._reverse_synonyms = self._get_reverse_synonyms(self._clean_synonyms)
80 | self._full_stop_words = set(full_stop_words) if full_stop_words else None
81 | self.logger = logger
82 | self.words = words
83 | self.normalizer = Normalizer(
84 | valid_chars_for_string=valid_chars_for_string,
85 | valid_chars_for_integer=valid_chars_for_integer,
86 | valid_chars_for_node_name=valid_chars_for_node_name,
87 | )
88 | new_words = self._get_partial_synonyms_to_words()
89 | self.words.update(new_words)
90 | self._populate_dwg()
91 |
92 | def _get_clean_and_partial_synonyms(self):
93 | """
94 | Synonyms are words that should produce the same results.
95 |
96 | - For example `beemer` and `bmw` should both give you `bmw`.
97 | - `alfa` and `alfa romeo` should both give you `alfa romeo`
98 |
99 | The synonyms get divided into 2 groups:
100 |
101 | 1. clean synonyms: The 2 words share little or no words. For example `beemer` vs. `bmw`.
102 | 2. partial synonyms: One of the 2 words is a substring of the other one. For example `alfa` and `alfa romeo` or `gm` vs. `gmc`.
103 |
104 | """
105 | clean_synonyms = {}
106 | partial_synonyms = {}
107 |
108 | for key, synonyms in self._raw_synonyms.items():
109 | key = key.strip().lower()
110 | _clean = []
111 | _partial = []
112 | for syn in synonyms:
113 | syn = syn.strip().lower()
114 | if key.startswith(syn):
115 | _partial.append(syn)
116 | else:
117 | _clean.append(syn)
118 | if _clean:
119 | clean_synonyms[key] = _clean
120 | if _partial:
121 | partial_synonyms[key] = _partial
122 |
123 | return clean_synonyms, partial_synonyms
124 |
125 | def _get_reverse_synonyms(self, synonyms):
126 | result = {}
127 | if synonyms:
128 | for key, value in synonyms.items():
129 | for item in value:
130 | result[item] = key
131 | return result
132 |
133 | def _get_partial_synonyms_to_words(self):
134 | new_words = {}
135 | for key, value in self.words.items():
136 | # data is mutable so we copy
137 | try:
138 | value = value.copy()
139 | # data must be named tuple
140 | except Exception:
141 | new_value = value._asdict()
142 | new_value[ORIGINAL_KEY] = key
143 | value = type(value)(**new_value)
144 | else:
145 | value[ORIGINAL_KEY] = key
146 | for syn_key, syns in self._partial_synonyms.items():
147 | if key.startswith(syn_key):
148 | for syn in syns:
149 | new_key = key.replace(syn_key, syn)
150 | new_words[new_key] = value
151 | return new_words
152 |
153 | def _populate_dwg(self):
154 | if not self._dwg:
155 | with self._lock:
156 | if not self._dwg:
157 | self._dwg = _DawgNode()
158 | for word, value in self.words.items():
159 | original_key = value.get(ORIGINAL_KEY)
160 | # word = word.strip().lower()
161 | count = value.get('count', 0)
162 | leaf_node = self.insert_word_branch(
163 | word,
164 | original_key=original_key,
165 | count=count
166 | )
167 | if leaf_node and self._clean_synonyms:
168 | for synonym in self._clean_synonyms.get(word, []):
169 | self.insert_word_branch(
170 | synonym,
171 | leaf_node=leaf_node,
172 | add_word=False,
173 | count=count
174 | )
175 |
176 | def insert_word_callback(self, word):
177 | """
178 | Once word is inserted, run this.
179 | """
180 | pass
181 |
182 | def insert_word_branch(self, word, leaf_node=None, add_word=True, original_key=None, count=0):
183 | """
184 | Inserts a word into the Dawg.
185 |
186 | :param word: The word to be inserted as a branch of dwg
187 | :param leaf_node: (optional) The leaf node for the node to merge into in the dwg.
188 | :param add_word: (Boolean, default: True) Add the word itself at the end of the branch.
189 | Usually this is set to False if we are merging into a leaf node and do not
190 | want to add the actual word there.
191 | :param original_key: If the word that is being added was originally another word.
192 | For example with synonyms, you might be inserting the word `beemer` but the
193 | original key is `bmw`. This parameter might be removed in the future.
194 |
195 | """
196 | # if word == 'u (2 off)':
197 | # import pytest; pytest.set_trace()
198 | normalized_word = self.normalizer.normalize_node_name(word)
199 | # sometimes if the word does not have any valid characters, the normalized_word will be empty
200 | if not normalized_word:
201 | return
202 | last_char = normalized_word[-1]
203 |
204 | if leaf_node:
205 | temp_leaf_node = self._dwg.insert(
206 | word=word,
207 | normalized_word=normalized_word[:-1],
208 | add_word=add_word,
209 | original_key=original_key,
210 | count=count,
211 | insert_count=self.SHOULD_INCLUDE_COUNT
212 | )
213 | # It already has children
214 | if temp_leaf_node.children and last_char in temp_leaf_node.children:
215 | temp_leaf_node.children[last_char].word = leaf_node.word
216 | # otherwise merge into the leaf node
217 | else:
218 | temp_leaf_node.children[last_char] = leaf_node
219 | else:
220 | leaf_node = self._dwg.insert(
221 | word=word,
222 | normalized_word=normalized_word,
223 | original_key=original_key,
224 | count=count,
225 | insert_count=self.SHOULD_INCLUDE_COUNT
226 | )
227 | self.insert_word_callback(word)
228 | return leaf_node
229 |
230 | def _find_and_sort(self, word, max_cost, size):
231 | output_keys_set = set()
232 | results, find_steps = self._find(word, max_cost, size)
233 | results_keys = list(results.keys())
234 | results_keys.sort()
235 | for key in results_keys:
236 | for output_items in results[key]:
237 | for i, item in enumerate(output_items):
238 | reversed_item = self._reverse_synonyms.get(item)
239 | if reversed_item:
240 | output_items[i] = reversed_item
241 | elif item not in self.words:
242 | output_items[i] = item
243 | output_items_str = DELIMITER.join(output_items)
244 | if output_items and output_items_str not in output_keys_set:
245 | output_keys_set.add(output_items_str)
246 | yield output_items
247 | if len(output_keys_set) >= size:
248 | return
249 |
250 | def get_tokens_flat_list(self, word, max_cost=3, size=10):
251 | """
252 | Gets a flat list of tokens.
253 | This requires the original search function from this class to be run,
254 | instead of subclasses of AutoComplete.
255 | """
256 | result = AutoComplete.search(self, word, max_cost=max_cost, size=size)
257 | return [item for sublist in result for item in sublist]
258 |
259 | def get_word_context(self, word):
260 | """
261 | Gets the word's context from the words dictionary
262 | """
263 | word = self.normalizer.normalize_node_name(word)
264 | return self.words.get(word)
265 |
266 | def search(self, word, max_cost=2, size=5):
267 | """
268 | parameters:
269 | - word: the word to return autocomplete results for
270 | - max_cost: Maximum Levenshtein edit distance to be considered when calculating results
271 | - size: The max number of results to return
272 | """
273 | word = self.normalizer.normalize_node_name(word)
274 | if not word:
275 | return []
276 | key = f'{word}-{max_cost}-{size}'
277 | result = self._lfu_cache.get(key)
278 | if result == -1:
279 | result = list(self._find_and_sort(word, max_cost, size))
280 | self._lfu_cache.set(key, result)
281 | return result
282 |
283 | @staticmethod
284 | def _len_results(results):
285 | return sum(map(len, results.values()))
286 |
287 | @staticmethod
288 | def _is_enough_results(results, size):
289 | return AutoComplete._len_results(results) >= size
290 |
291 | def _is_stop_word_condition(self, matched_words, matched_prefix_of_last_word):
292 | return (self._full_stop_words and matched_words and matched_words[-1] in self._full_stop_words and not matched_prefix_of_last_word)
293 |
294 | def _find(self, word, max_cost, size, call_count=0):
295 | """
296 | The search function returns a list of all words that are less than the given
297 | maximum distance from the target word
298 | """
299 | results = defaultdict(list)
300 | fuzzy_matches = defaultdict(list)
301 | rest_of_results = {}
302 | fuzzy_matches_len = 0
303 |
304 | fuzzy_min_distance = min_distance = INF
305 | matched_prefix_of_last_word, rest_of_word, new_node, matched_words = self._prefix_autofill(word=word)
306 |
307 | last_word = matched_prefix_of_last_word + rest_of_word
308 |
309 | if matched_words:
310 | results[0] = [matched_words.copy()]
311 | min_distance = 0
312 | # under certain condition with finding full stop words, do not bother with finding more matches
313 | if self._is_stop_word_condition(matched_words, matched_prefix_of_last_word):
314 | find_steps = [FindStep.start]
315 | return results, find_steps
316 | if len(rest_of_word) < 3:
317 | find_steps = [FindStep.descendants_only]
318 | self._add_descendants_words_to_results(node=new_node, size=size, matched_words=matched_words, results=results, distance=1)
319 | else:
320 | find_steps = [FindStep.fuzzy_try]
321 | word_chunks = deque(filter(lambda x: x, last_word.split(' ')))
322 | new_word = word_chunks.popleft()
323 |
324 | # TODO: experiment with the number here
325 | # 'in los angeles' gets cut into `in los` so it becomes a closer match to `in lodi`
326 | # but if the number was bigger, we could have matched with `in los angeles`
327 | while len(new_word) < 5 and word_chunks:
328 | new_word = f'{new_word} {word_chunks.popleft()}'
329 | fuzzy_rest_of_word = ' '.join(word_chunks)
330 |
331 | for _word in self.words:
332 | if abs(len(_word) - len(new_word)) > max_cost:
333 | continue
334 | dist = levenshtein_distance(new_word, _word)
335 | if dist < max_cost:
336 | fuzzy_matches_len += 1
337 | _value = self.words[_word].get(ORIGINAL_KEY, _word)
338 | fuzzy_matches[dist].append(_value)
339 | fuzzy_min_distance = min(fuzzy_min_distance, dist)
340 | if fuzzy_matches_len >= size or dist < 2:
341 | break
342 | if fuzzy_matches_len:
343 | find_steps.append(FindStep.fuzzy_found)
344 | if fuzzy_rest_of_word:
345 | call_count += 1
346 | if call_count < 2:
347 | rest_of_results, rest_find_steps = self._find(word=fuzzy_rest_of_word, max_cost=max_cost, size=size, call_count=call_count)
348 | find_steps.append({FindStep.rest_of_fuzzy_round2: rest_find_steps})
349 | for _word in fuzzy_matches[fuzzy_min_distance]:
350 | if rest_of_results:
351 | rest_of_results_min_key = min(rest_of_results.keys())
352 | for _rest_of_matched_word in rest_of_results[rest_of_results_min_key]:
353 | results[fuzzy_min_distance].append(matched_words + [_word] + _rest_of_matched_word)
354 | else:
355 | results[fuzzy_min_distance].append(matched_words + [_word])
356 | _matched_prefix_of_last_word_b, not_used_rest_of_word, fuzzy_new_node, _matched_words_b = self._prefix_autofill(word=_word)
357 | if self._is_stop_word_condition(matched_words=_matched_words_b, matched_prefix_of_last_word=_matched_prefix_of_last_word_b):
358 | break
359 | self._add_descendants_words_to_results(node=fuzzy_new_node, size=size, matched_words=matched_words, results=results, distance=fuzzy_min_distance)
360 |
361 | if matched_words and not self._is_enough_results(results, size):
362 | find_steps.append(FindStep.not_enough_results_add_some_descandants)
363 | total_min_distance = min(min_distance, fuzzy_min_distance)
364 | self._add_descendants_words_to_results(node=new_node, size=size, matched_words=matched_words, results=results, distance=total_min_distance+1)
365 |
366 | return results, find_steps
367 |
368 | def _prefix_autofill(self, word, node=None):
369 | len_prev_rest_of_last_word = INF
370 | matched_words = []
371 | matched_words_set = set()
372 |
373 | def _add_words(words):
374 | is_added = False
375 | for word in words:
376 | if word not in matched_words_set:
377 | matched_words.append(word)
378 | matched_words_set.add(word)
379 | is_added = True
380 | return is_added
381 |
382 | matched_prefix_of_last_word, rest_of_word, node, matched_words_part, matched_condition_ever, matched_condition_in_branch = self._prefix_autofill_part(word, node)
383 | _add_words(matched_words_part)
384 | result = (matched_prefix_of_last_word, rest_of_word, node, matched_words)
385 | len_rest_of_last_word = len(rest_of_word)
386 |
387 | while len_rest_of_last_word and len_rest_of_last_word < len_prev_rest_of_last_word:
388 | word = matched_prefix_of_last_word + rest_of_word
389 | word = word.strip()
390 | len_prev_rest_of_last_word = len_rest_of_last_word
391 | matched_prefix_of_last_word, rest_of_word, node, matched_words_part, matched_condition_ever, matched_condition_in_branch = self._prefix_autofill_part(word, node=self._dwg, matched_condition_ever=matched_condition_ever, matched_condition_in_branch=matched_condition_in_branch)
392 | is_added = _add_words(matched_words_part)
393 | if is_added is False:
394 | break
395 | len_rest_of_last_word = len(rest_of_word)
396 | result = (matched_prefix_of_last_word, rest_of_word, node, matched_words)
397 |
398 | return result
399 |
400 | def prefix_autofill_part_condition(self, node):
401 | pass
402 |
403 | PREFIX_AUTOFILL_PART_CONDITION_SUFFIX = ''
404 |
405 | def _add_to_matched_words(self, node, matched_words, matched_condition_in_branch, matched_condition_ever, matched_prefix_of_last_word):
406 | if matched_words:
407 | last_matched_word = matched_words[-1].replace(self.PREFIX_AUTOFILL_PART_CONDITION_SUFFIX, '')
408 | if node.value.startswith(last_matched_word):
409 | matched_words.pop()
410 | value = node.value
411 | if self.PREFIX_AUTOFILL_PART_CONDITION_SUFFIX:
412 | if self._node_word_info_matches_condition(node, self.prefix_autofill_part_condition):
413 | matched_condition_in_branch = True
414 | if matched_condition_ever and matched_prefix_of_last_word:
415 | value = f"{matched_prefix_of_last_word}{self.PREFIX_AUTOFILL_PART_CONDITION_SUFFIX}"
416 | matched_words.append(value)
417 | return matched_words, matched_condition_in_branch
418 |
419 | def _prefix_autofill_part(self, word, node=None, matched_condition_ever=False, matched_condition_in_branch=False):
420 | node = node or self._dwg
421 | que = deque(word)
422 |
423 | matched_prefix_of_last_word = ''
424 | matched_words = []
425 | nodes_that_words_were_extracted = set()
426 |
427 | while que:
428 | char = que.popleft()
429 |
430 | if node.children:
431 | if char not in node.children:
432 | space_child = node.children.get(' ')
433 | if space_child and char in space_child.children:
434 | node = space_child
435 | else:
436 | que.appendleft(char)
437 | break
438 | node = node.children[char]
439 | if char != ' ' or matched_prefix_of_last_word:
440 | matched_prefix_of_last_word += char
441 | if node.word:
442 | if que:
443 | next_char = que[0]
444 | if next_char != ' ':
445 | continue
446 | matched_words, matched_condition_in_branch = self._add_to_matched_words(node, matched_words, matched_condition_in_branch, matched_condition_ever, matched_prefix_of_last_word)
447 | nodes_that_words_were_extracted.add(node)
448 | matched_prefix_of_last_word = ''
449 | else:
450 | if char == ' ':
451 | node = self._dwg
452 | if matched_condition_in_branch:
453 | matched_condition_ever = True
454 | else:
455 | que.appendleft(char)
456 | break
457 |
458 | if not que and node.word and node not in nodes_that_words_were_extracted:
459 | matched_words, matched_condition_in_branch = self._add_to_matched_words(node, matched_words, matched_condition_in_branch, matched_condition_ever, matched_prefix_of_last_word)
460 | matched_prefix_of_last_word = ''
461 |
462 | rest_of_word = "".join(que)
463 | if matched_condition_in_branch:
464 | matched_condition_ever = True
465 |
466 | return matched_prefix_of_last_word, rest_of_word, node, matched_words, matched_condition_ever, matched_condition_in_branch
467 |
468 | def _add_descendants_words_to_results(self, node, size, matched_words, results, distance, should_traverse=True):
469 | descendant_words = list(node.get_descendants_words(size, should_traverse, full_stop_words=self._full_stop_words))
470 | extended = _extend_and_repeat(matched_words, descendant_words)
471 | if extended:
472 | results[distance].extend(extended)
473 | return distance
474 |
475 | def _node_word_info_matches_condition(self, node, condition):
476 | _word = node.word
477 | word_info = self.words.get(_word)
478 | if word_info:
479 | return condition(word_info)
480 | else:
481 | return False
482 |
483 | def get_all_descendent_words_for_condition(self, word, size, condition):
484 | """
485 | This is used in the search tokenizer not in the fast autocomplete itself.
486 | """
487 | new_tokens = []
488 |
489 | matched_prefix_of_last_word, rest_of_word, node, matched_words_part, matched_condition_ever, matched_condition_in_branch = self._prefix_autofill_part(word=word)
490 | if not rest_of_word and self._node_word_info_matches_condition(node, condition):
491 | found_nodes_gen = node.get_descendants_nodes(size, insert_count=self.SHOULD_INCLUDE_COUNT)
492 | for node in found_nodes_gen:
493 | if self._node_word_info_matches_condition(node, condition):
494 | new_tokens.append(node.word)
495 | return new_tokens
496 |
497 | def update_count_of_word(self, word, count=None, offset=None):
498 | """
499 | Update the count attribute of a node in the dwg. This only affects the autocomplete
500 | object and not the original count of the node in the data that was fed into fast_autocomplete.
501 | """
502 | matched_prefix_of_last_word, rest_of_word, node, matched_words_part, matched_condition_ever, matched_condition_in_branch = self._prefix_autofill_part(word=word)
503 | if node:
504 | if offset:
505 | with self._lock:
506 | node.count += offset
507 | elif count:
508 | with self._lock:
509 | node.count = count
510 | else:
511 | raise NodeNotFound(f'Unable to find a node for word {word}')
512 | return node.count
513 |
514 | def get_count_of_word(self, word):
515 | return self.update_count_of_word(word)
516 |
517 |
518 | class _DawgNode:
519 | """
520 | The Dawg data structure keeps a set of words, organized with one node for
521 | each letter. Each node has a branch for each letter that may follow it in the
522 | set of words.
523 | """
524 |
525 | __slots__ = ("word", "original_key", "children", "count")
526 |
527 | def __init__(self):
528 | self.word = None
529 | self.original_key = None
530 | self.children = {}
531 | self.count = 0
532 |
533 | def __getitem__(self, key):
534 | return self.children[key]
535 |
536 | def __repr__(self):
537 |         return f'<DawgNode {self.word}>'
538 |
539 | @property
540 | def value(self):
541 | return self.original_key or self.word
542 |
543 | def insert(self, word, normalized_word, add_word=True, original_key=None, count=0, insert_count=True):
544 | node = self
545 | for letter in normalized_word:
546 | if letter not in node.children:
547 | node.children[letter] = _DawgNode()
548 |
549 | node = node.children[letter]
550 |
551 | if add_word:
552 | node.word = word
553 | node.original_key = original_key
554 | if insert_count:
555 | node.count = int(count) # converts any str to int
556 | return node
557 |
558 | def get_descendants_nodes(self, size, should_traverse=True, full_stop_words=None, insert_count=True):
559 | if insert_count is True:
560 | size = INF
561 |
562 | que = deque()
563 | unique_nodes = {self}
564 | found_nodes_set = set()
565 | full_stop_words = full_stop_words if full_stop_words else set()
566 |
567 | for letter, child_node in self.children.items():
568 | if child_node not in unique_nodes:
569 | unique_nodes.add(child_node)
570 | que.append((letter, child_node))
571 |
572 | while que:
573 | letter, child_node = que.popleft()
574 | child_value = child_node.value
575 | if child_value:
576 | if child_value in full_stop_words:
577 | should_traverse = False
578 | if child_value not in found_nodes_set:
579 | found_nodes_set.add(child_value)
580 | yield child_node
581 | if len(found_nodes_set) > size:
582 | break
583 |
584 | if should_traverse:
585 | for letter, grand_child_node in child_node.children.items():
586 | if grand_child_node not in unique_nodes:
587 | unique_nodes.add(grand_child_node)
588 | que.append((letter, grand_child_node))
589 |
590 | def get_descendants_words(
591 | self, size, should_traverse=True, full_stop_words=None, insert_count=True):
592 | found_nodes_gen = self.get_descendants_nodes(
593 | size,
594 | should_traverse=should_traverse,
595 | full_stop_words=full_stop_words,
596 | insert_count=insert_count
597 | )
598 |
599 | if insert_count is True:
600 | found_nodes = sorted(
601 | found_nodes_gen,
602 | key=lambda node: node.count,
603 | reverse=True
604 | )[:size + 1]
605 | else:
606 | found_nodes = islice(found_nodes_gen, size)
607 |
608 | return map(lambda word: word.value, found_nodes)
609 |
--------------------------------------------------------------------------------
/tests/AutoCompleteWithSynonymsShort_Graph.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/seperman/fast-autocomplete/HEAD/tests/AutoCompleteWithSynonymsShort_Graph.svg
--------------------------------------------------------------------------------