├── tests ├── __init__.py ├── acceptance_tests │ ├── javascript │ │ ├── .eslintrc.js │ │ ├── mkdocs_serialization.js │ │ ├── mkdocs_query.js │ │ ├── package.json │ │ ├── language_serialize_index.js │ │ ├── mkdocs_load_serialized_index_and_search.js │ │ ├── language_query.js │ │ └── language_load_serialized_index_and_search.js │ ├── test_mkdocs.py │ ├── test_language_support.py │ └── fixtures │ │ ├── lang_es.json │ │ └── lang_es_en.json ├── test_complete_set.py ├── test_stemmer.py ├── conftest.py ├── test_trimmer.py ├── test_field_ref.py ├── test_token.py ├── test_serialization.py ├── utils.py ├── fixtures │ └── stemming_vocab.json ├── test_stop_word_filter.py ├── test_plugins.py ├── test_match_data.py ├── benchmarks.py ├── test_index.py ├── test_language_support.py ├── test_tokenizer.py ├── test_query.py ├── test_vector.py ├── test_builder.py ├── test_query_parser.py ├── test_pipeline.py ├── test_query_lexer.py └── test_token_set.py ├── setup.cfg ├── docs ├── changelog.md ├── Makefile ├── conf.py ├── languages.md ├── lunrjs-interop.md ├── index.md ├── customisation.md ├── usage.md └── indices.md ├── coverageio_token.txt ├── requirements ├── docs.txt ├── test.txt └── dev.txt ├── MANIFEST.in ├── lunr ├── exceptions.py ├── __init__.py ├── utils.py ├── trimmer.py ├── idf.py ├── languages │ ├── trimmer.py │ ├── stemmer.py │ └── __init__.py ├── token.py ├── field_ref.py ├── token_set_builder.py ├── tokenizer.py ├── match_data.py ├── __main__.py ├── stop_word_filter.py ├── query_lexer.py ├── query.py ├── vector.py ├── query_parser.py ├── pipeline.py └── token_set.py ├── .github ├── dependabot.yml └── workflows │ └── test-suite.yml ├── .gitignore ├── LICENSE ├── Makefile ├── tox.ini ├── setup.py ├── readme.rst ├── CHANGELOG.md └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [wheel] 2 | universal = 1 3 | -------------------------------------------------------------------------------- /docs/changelog.md: -------------------------------------------------------------------------------- 1 | ```{include} ../CHANGELOG.md 2 | ``` -------------------------------------------------------------------------------- /coverageio_token.txt: -------------------------------------------------------------------------------- 1 | b2c4c44b-baed-4d95-ae74-7f495bac7a35 -------------------------------------------------------------------------------- /requirements/docs.txt: -------------------------------------------------------------------------------- 1 | furo 2 | sphinx 3 | sphinx-autobuild 4 | myst-parser 5 | -------------------------------------------------------------------------------- /requirements/test.txt: -------------------------------------------------------------------------------- 1 | -e .[languages] 2 | pytest 3 | pytest-timeout 4 | mock 5 | tox 6 | coverage 7 | -------------------------------------------------------------------------------- /tests/acceptance_tests/javascript/.eslintrc.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | "extends": "standard" 3 | }; 4 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | recursive-exclude * 
__pycache__ 4 | recursive-exclude * *.py[co] -------------------------------------------------------------------------------- /lunr/exceptions.py: -------------------------------------------------------------------------------- 1 | class BaseLunrException(Exception): 2 | pass 3 | 4 | 5 | class QueryParseError(BaseLunrException): 6 | pass 7 | -------------------------------------------------------------------------------- /requirements/dev.txt: -------------------------------------------------------------------------------- 1 | -r test.txt 2 | -r docs.txt 3 | twine 4 | pytest-benchmark 5 | wheel 6 | mypy 7 | flake8 8 | black 9 | pdbpp 10 | ipython 11 | mypy 12 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: pip 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | open-pull-requests-limit: 10 8 | ignore: 9 | - dependency-name: nltk 10 | versions: 11 | - 3.6.1 12 | -------------------------------------------------------------------------------- /lunr/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from lunr.__main__ import lunr, get_default_builder 4 | 5 | __all__ = ("lunr", "get_default_builder") 6 | 7 | logging.basicConfig(format="%(levelname)-7s - %(message)s") 8 | 9 | __VERSION__ = "0.6.2" 10 | __TARGET_JS_VERSION__ = "2.3.9" 11 | -------------------------------------------------------------------------------- /lunr/utils.py: -------------------------------------------------------------------------------- 1 | def as_string(obj): 2 | return "" if not obj else str(obj) 3 | 4 | 5 | class CompleteSet(set): 6 | def union(self, other): 7 | return self 8 | 9 | def intersection(self, other): 10 | return set(other) 11 | 12 | def __contains__(self, y): 13 | return True 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.pyc 3 | *.egg-info/ 4 | .eggs/ 5 | .coverage 6 | coverage.xml 7 | htmlcov/ 8 | .tox/ 9 | .pytest_cache/ 10 | **/node_modules 11 | dist/ 12 | build/ 13 | .state 14 | .venv/ 15 | target/ 16 | site/ 17 | docs/_build 18 | 19 | .vscode/ 20 | *.code-workspace 21 | .python-version 22 | .DS_Store 23 | .benchmarks/ 24 | *TODO.md 25 | tests/profiles/ 26 | .mypy_cache/ 27 | .dev/ 28 | .direnv/ 29 | .envrc 30 | .tool-versions 31 | -------------------------------------------------------------------------------- /lunr/trimmer.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from lunr.pipeline import Pipeline 4 | 5 | full_re = re.compile(r"^\W*?([^\W]+)\W*?$") 6 | 7 | 8 | def trimmer(token, i=None, tokens=None): 9 | def trim(s, metadata=None): 10 | match = full_re.match(s) 11 | if match is None: 12 | return s 13 | return match.group(1) 14 | 15 | return token.update(trim) 16 | 17 | 18 | Pipeline.register_function(trimmer, "trimmer") 19 | -------------------------------------------------------------------------------- /tests/test_complete_set.py: -------------------------------------------------------------------------------- 1 | from lunr.utils import CompleteSet 2 | 3 | 4 | class TestCompleteSet: 5 | def test_always_contains_other_element(self): 6 | assert "foo" in CompleteSet() 7 | 8 | def 
test_intersection_returns_other(self): 9 | cs = CompleteSet({"bar"}) 10 | assert cs.intersection({"foo"}) == {"foo"} 11 | 12 | def test_union_returns_self(self): 13 | cs = CompleteSet({"bar"}) 14 | assert cs.union({"foo"}) == {"bar"} 15 | -------------------------------------------------------------------------------- /lunr/idf.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | 4 | def idf(posting, document_count): 5 | """A function to calculate the inverse document frequency for a posting. 6 | This is shared between the builder and the index. 7 | """ 8 | documents_with_term = 0 9 | for field_name in posting: 10 | if field_name == "_index": 11 | continue 12 | documents_with_term += len(posting[field_name].keys()) 13 | 14 | x = (document_count - documents_with_term + 0.5) / (documents_with_term + 0.5) 15 | return math.log(1 + abs(x)) 16 | -------------------------------------------------------------------------------- /tests/acceptance_tests/javascript/mkdocs_serialization.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs') 2 | const tmp = require('tmp') 3 | const lunr = require('lunr') 4 | 5 | const data = JSON.parse( 6 | fs.readFileSync(__dirname + '/../fixtures/mkdocs_index.json')) 7 | let documents = {} 8 | const idx = lunr(function () { 9 | this.field('title') 10 | this.field('text') 11 | this.ref('id') 12 | for (doc of data.docs) { 13 | this.add(doc) 14 | documents[doc.id] = doc 15 | } 16 | }) 17 | 18 | const tmpFile = tmp.fileSync({keep: true}) 19 | fs.writeFileSync(tmpFile.fd, JSON.stringify(idx)) 20 | process.stdout.write(tmpFile.name) 21 | -------------------------------------------------------------------------------- /tests/acceptance_tests/javascript/mkdocs_query.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs') 2 | const lunr = require('lunr') 3 | 4 | const data = JSON.parse( 5 | fs.readFileSync(__dirname + '/../fixtures/mkdocs_index.json')) 6 | let documents = {} 7 | const idx = lunr(function () { 8 | this.field('title') 9 | this.field('text') 10 | this.ref('id') 11 | for (doc of data.docs) { 12 | this.add(doc) 13 | documents[doc.id] = doc 14 | } 15 | }) 16 | 17 | let results = idx.search(process.argv[2]) 18 | for (result of results) { 19 | let doc = documents[result.ref] 20 | process.stdout.write(`${result.ref} "${doc.title}" [${result.score}]\n`) 21 | } 22 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /lunr/languages/trimmer.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def generate_trimmer(word_characters): 5 | """Returns a trimmer function from a string of word characters. 6 | 7 | TODO: lunr-languages ships with lists of word characters for each language 8 | I haven't found an equivalent in Python, we may need to copy it. 9 | """ 10 | full_re = re.compile(r"^[^{0}]*?([{0}]+)[^{0}]*?$".format(word_characters)) 11 | 12 | def trimmer(token, i=None, tokens=None): 13 | def trim(s, metadata=None): 14 | match = full_re.match(s) 15 | if match is None: 16 | return s 17 | return match.group(1) 18 | 19 | return token.update(trim) 20 | 21 | return trimmer 22 | -------------------------------------------------------------------------------- /tests/acceptance_tests/javascript/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "acceptance_tests", 3 | "version": "1.0.0", 4 | "description": "Acceptance tests for Lunr.py", 5 | "main": "test_mkdocs.js", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1" 8 | }, 9 | "author": "Yeray Diaz Diaz", 10 | "license": "MIT", 11 | "dependencies": { 12 | "lunr": "2.3.9", 13 | "lunr-languages": "1.0.0", 14 | "tmp": "0.0.33" 15 | }, 16 | "devDependencies": { 17 | "eslint": "^8.10.0", 18 | "eslint-config-standard": "^11.0.0", 19 | "eslint-plugin-import": "^2.11.0", 20 | "eslint-plugin-node": "^6.0.1", 21 | "eslint-plugin-promise": "^3.7.0", 22 | "eslint-plugin-standard": "^3.0.1" 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /tests/acceptance_tests/javascript/language_serialize_index.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs') 2 | const tmp = require('tmp') 3 | const lunr = require('lunr') 4 | require("lunr-languages/lunr.stemmer.support")(lunr) 5 | require("lunr-languages/lunr.es")(lunr) 6 | 7 | const data = JSON.parse( 8 | fs.readFileSync(__dirname + '/../fixtures/lang_es.json')) 9 | let documents = {} 10 | const idx = lunr(function () { 11 | this.use(lunr.es) 12 | this.field('title') 13 | this.field('text') 14 | this.ref('id') 15 | for (doc of data.docs) { 16 | this.add(doc) 17 | documents[doc.id] = doc 18 | } 19 | }) 20 | 21 | const tmpFile = tmp.fileSync({keep: true}) 22 | fs.writeFileSync(tmpFile.fd, JSON.stringify(idx)) 23 | process.stdout.write(tmpFile.name) 24 | -------------------------------------------------------------------------------- /tests/acceptance_tests/javascript/mkdocs_load_serialized_index_and_search.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs') 2 | const lunr = require('lunr') 3 | 4 | // Read the documents only to retrieve the title for the results 5 | const data = JSON.parse( 6 | fs.readFileSync(__dirname + '/../fixtures/mkdocs_index.json')) 7 | let documents = {} 8 | for (doc of data.docs) { 9 | documents[doc.id] = doc 10 | } 11 | 12 | // Load the index from the serialized path produced from Python 13 | const serializedIndex = JSON.parse(fs.readFileSync(process.argv[2])) 14 | let idx = lunr.Index.load(serializedIndex) 15 | let results = idx.search(process.argv[3]) 16 | for (result of results) { 17 | 
process.stdout.write(`${result.ref} "${documents[result.ref].title}" [${result.score}]\n`) 18 | } -------------------------------------------------------------------------------- /tests/acceptance_tests/javascript/language_query.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs') 2 | const lunr = require('lunr') 3 | require("lunr-languages/lunr.stemmer.support")(lunr); 4 | require("lunr-languages/lunr.es")(lunr); 5 | 6 | const data = JSON.parse( 7 | fs.readFileSync(__dirname + '/../fixtures/lang_es.json')) 8 | let documents = {} 9 | const idx = lunr(function () { 10 | this.use(lunr.es) 11 | this.field('title') 12 | this.field('text') 13 | this.ref('id') 14 | for (doc of data.docs) { 15 | this.add(doc) 16 | documents[doc.id] = doc 17 | } 18 | }) 19 | 20 | let results = idx.search(process.argv[2]) 21 | for (result of results) { 22 | let doc = documents[result.ref] 23 | process.stdout.write(`${result.ref} "${doc.title}" [${result.score}]\n`) 24 | } 25 | -------------------------------------------------------------------------------- /tests/test_stemmer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from lunr.token import Token 5 | from lunr.stemmer import stemmer 6 | from lunr.pipeline import Pipeline 7 | 8 | 9 | class TestStemmer: 10 | def test_reduces_words_to_their_stem(self): 11 | path = os.path.join( 12 | os.path.dirname(__file__), "fixtures", "stemming_vocab.json" 13 | ) 14 | with open(path) as f: 15 | data = json.loads(f.read()) 16 | 17 | for word, expected in data.items(): 18 | token = Token(word) 19 | result = str(stemmer(token)) 20 | 21 | assert result == expected 22 | 23 | def test_is_a_registered_pipeline_function(self): 24 | assert stemmer.label == "stemmer" 25 | assert Pipeline.registered_functions["stemmer"] == stemmer 26 | -------------------------------------------------------------------------------- /lunr/token.py: -------------------------------------------------------------------------------- 1 | class Token: 2 | def __init__(self, string="", metadata=None): 3 | self.string = string 4 | self.metadata = metadata or {} 5 | 6 | def __str__(self): 7 | return self.string 8 | 9 | def __repr__(self): 10 | return '<Token "{}">'.format(str(self)) 11 | 12 | def update(self, fn): 13 | """A token update function is used when updating or optionally 14 | when cloning a token.""" 15 | # TODO: we require functions to have two parameters, JS doesn't care 16 | self.string = fn(self.string, self.metadata) 17 | return self 18 | 19 | def clone(self, fn=None): 20 | """Applies the given function to the wrapped string token.""" 21 | fn = fn or (lambda s, m: s) 22 | return Token(fn(self.string, self.metadata), self.metadata) 23 | -------------------------------------------------------------------------------- /lunr/field_ref.py: -------------------------------------------------------------------------------- 1 | from lunr.exceptions import BaseLunrException 2 | 3 | 4 | class FieldRef: 5 | 6 | JOINER = "/" 7 | 8 | def __init__(self, doc_ref, field_name, string_value=None): 9 | self.doc_ref = doc_ref 10 | self.field_name = field_name 11 | self._string_value = string_value 12 | 13 | def __repr__(self): 14 | return '<FieldRef field="{}" ref="{}">'.format(self.field_name, self.doc_ref) 15 | 16 | @classmethod 17 | def from_string(cls, string): 18 | if cls.JOINER not in string: 19 | raise BaseLunrException("Malformed field ref string") 20 | field_ref, doc_ref = string.split(cls.JOINER, 1) 21 | return
cls(doc_ref, field_ref, string) 22 | 23 | def __str__(self): 24 | if self._string_value is None: 25 | self._string_value = self.field_name + self.JOINER + str(self.doc_ref) 26 | 27 | return self._string_value 28 | -------------------------------------------------------------------------------- /tests/acceptance_tests/javascript/language_load_serialized_index_and_search.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs') 2 | const lunr = require('lunr') 3 | require("lunr-languages/lunr.stemmer.support")(lunr) 4 | require("lunr-languages/lunr.es")(lunr) 5 | 6 | // Read the documents only to retrieve the title for the results 7 | const fixtureName = process.argv[4] || 'lang_es.json' 8 | const fixturePath = __dirname + '/../fixtures/' + fixtureName 9 | const data = JSON.parse(fs.readFileSync(fixturePath)) 10 | let documents = {} 11 | for (doc of data.docs) { 12 | documents[doc.id] = doc 13 | } 14 | 15 | // Load the index from the serialized path produced from Python 16 | const serializedIndex = JSON.parse(fs.readFileSync(process.argv[2])) 17 | let idx = lunr.Index.load(serializedIndex) 18 | let results = idx.search(process.argv[3]) 19 | for (result of results) { 20 | process.stdout.write(`${result.ref} "${documents[result.ref].title}" [${result.score}]\n`) 21 | } -------------------------------------------------------------------------------- /lunr/languages/stemmer.py: -------------------------------------------------------------------------------- 1 | def get_language_stemmer(language): 2 | """Retrieves the SnowballStemmer for a particular language. 3 | 4 | Args: 5 | language (str): ISO-639-1 code of the language. 6 | """ 7 | from lunr.languages import SUPPORTED_LANGUAGES 8 | from nltk.stem.snowball import SnowballStemmer # type: ignore 9 | 10 | return SnowballStemmer(SUPPORTED_LANGUAGES[language]) 11 | 12 | 13 | def nltk_stemmer(stemmer, token, i=None, tokens=None): 14 | """Wrapper around a NLTK SnowballStemmer, which includes stop words for 15 | each language. 16 | 17 | Args: 18 | stemmer (SnowballStemmer): Stemmer instance that performs the stemming. 19 | token (lunr.Token): The token to stem. 20 | i (int): The index of the token in a set. 21 | tokens (list): A list of tokens representing the set. 22 | """ 23 | 24 | def wrapped_stem(token, metadata=None): 25 | return stemmer.stem(token) 26 | 27 | return token.update(wrapped_stem) 28 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from lunr import lunr 4 | 5 | 6 | @pytest.fixture 7 | def documents(): 8 | return [ 9 | { 10 | "id": "a", 11 | "title": "Mr. Green kills Colonel Mustard", 12 | "body": """Mr. Green killed Colonel Mustard in the study with the 13 | candlestick. Mr. 
Green is not a very nice fellow.""", 14 | "word_count": 19, 15 | }, 16 | { 17 | "id": "b", 18 | "title": "Plumb waters plant", 19 | "body": "Professor Plumb has a green plant in his study", 20 | "word_count": 9, 21 | }, 22 | { 23 | "id": "c", 24 | "title": "Scarlett helps Professor", 25 | "body": """Miss Scarlett watered Professor Plumbs green plant 26 | while he was away from his office last week.""", 27 | "word_count": 16, 28 | }, 29 | ] 30 | 31 | 32 | @pytest.fixture 33 | def index(documents): 34 | return lunr(ref="id", fields=("title", "body"), documents=documents) 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright © 2019, Yeray Díaz Díaz. All rights reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
8 | -------------------------------------------------------------------------------- /tests/test_trimmer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from lunr.trimmer import trimmer 4 | from lunr.token import Token 5 | from lunr.pipeline import Pipeline 6 | 7 | 8 | class TestTrimmer: 9 | def test_latin_characters(self): 10 | token = Token("hello") 11 | assert str(trimmer(token)) == str(token) 12 | 13 | @pytest.mark.parametrize( 14 | "description, string, expected", 15 | [ 16 | ("full stop", "hello.", "hello"), 17 | ("inner apostrophe", "it's", "it's"), 18 | ("trailing apostrophe", "james'", "james"), 19 | ("exclamation mark", "stop!", "stop"), 20 | ("comma", "first,", "first"), 21 | ("brackets", "[tag]", "tag"), 22 | ], 23 | ) 24 | def test_punctuation(self, description, string, expected): 25 | token = Token(string) 26 | trimmed = str(trimmer(token)) 27 | 28 | assert trimmed == expected 29 | 30 | def test_is_a_registered_pipeline_function(self): 31 | assert trimmer.label == "trimmer" 32 | assert Pipeline.registered_functions["trimmer"] == trimmer 33 | -------------------------------------------------------------------------------- /tests/test_field_ref.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from lunr.field_ref import FieldRef 4 | from lunr.exceptions import BaseLunrException 5 | 6 | 7 | class TestFieldRef: 8 | def test_str_combines_document_ref_and_field_name(self): 9 | field_name = "title" 10 | document_ref = 123 11 | field_ref = FieldRef(document_ref, field_name) 12 | 13 | assert str(field_ref) == "title/123" 14 | assert repr(field_ref) == '<FieldRef field="title" ref="123">' 15 | 16 | def test_from_string_splits_string_into_parts(self): 17 | field_ref = FieldRef.from_string("title/123") 18 | 19 | assert field_ref.field_name == "title" 20 | assert field_ref.doc_ref == "123" 21 | 22 | def test_from_string_docref_contains_join_character(self): 23 | field_ref = FieldRef.from_string("title/http://example.com/123") 24 | 25 | assert field_ref.field_name == "title" 26 | assert field_ref.doc_ref == "http://example.com/123" 27 | 28 | def test_from_string_does_not_contain_join_character(self): 29 | string = "docRefOnly" 30 | 31 | with pytest.raises(BaseLunrException): 32 | FieldRef.from_string(string) 33 | -------------------------------------------------------------------------------- /tests/test_token.py: -------------------------------------------------------------------------------- 1 | from lunr.token import Token 2 | 3 | 4 | def test_str_repr(): 5 | token = Token("foo") 6 | assert str(token) == "foo" 7 | assert repr(token) == '<Token "foo">' 8 | 9 | 10 | class TestMetadata: 11 | def test_can_attach_arbitrary_metadata(self): 12 | token = Token("foo", {"length": 3}) 13 | assert token.metadata["length"] == 3 14 | 15 | def test_can_update_token_value(self): 16 | token = Token("foo", {"length": 3}) 17 | token.update(lambda s, m: s.upper()) 18 | 19 | assert str(token) == "FOO" 20 | 21 | def test_metadata_is_yielded_when_updating(self): 22 | # TODO: unsure what this test is asserting, a language feature?
23 | pass 24 | 25 | 26 | class TestClone: 27 | def setup_method(self, method): 28 | self.token = Token("foo", {"bar": True}) 29 | 30 | def test_clones_value(self): 31 | assert str(self.token) == str(self.token.clone()) 32 | 33 | def test_clones_metadata(self): 34 | assert self.token.metadata == self.token.clone().metadata 35 | 36 | def test_clone_and_modify(self): 37 | clone = self.token.clone(lambda s, m: s.upper()) 38 | 39 | assert str(clone) == "FOO" 40 | self.token.metadata == clone.metadata 41 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: tests tests-acceptance tests-full install-dev docs 2 | 3 | .state: 4 | mkdir .state 5 | 6 | .state/acceptance-npm: .state 7 | cd tests/acceptance_tests/javascript && \ 8 | npm install && \ 9 | cd ../../../ 10 | touch .state/acceptance-npm 11 | 12 | clean: 13 | rm .state/* 14 | 15 | install-dev: 16 | pip install -U pip wheel setuptools 17 | pip install -r requirements/dev.txt 18 | 19 | tests: 20 | coverage run -m pytest -m "not acceptance" 21 | coverage report 22 | 23 | tests-acceptance: .state/acceptance-npm 24 | pytest -m "acceptance" 25 | 26 | tests-full: tests tests-acceptance 27 | 28 | tests-benchmark: 29 | pytest tests/benchmarks.py --benchmark-warmup=on 30 | 31 | package: 32 | rm -fr dist/* 33 | python setup.py sdist 34 | python setup.py bdist_wheel --universal 35 | 36 | release-test: package 37 | @echo "Are you sure you want to release to test.pypi.org? [y/N]" && \ 38 | read ans && \ 39 | [ $${ans:-N} = y ] && \ 40 | twine upload --repository testpypi dist/* 41 | 42 | release-pypi: package 43 | @echo "Are you sure you want to release to pypi.org? [y/N]" && \ 44 | read ans && \ 45 | [ $${ans:-N} = y ] && \ 46 | twine upload dist/* 47 | 48 | lint: 49 | flake8 lunr tests 50 | black lunr tests 51 | mypy lunr 52 | 53 | docs: 54 | sphinx-build docs docs/_build/html 55 | 56 | docs-server: 57 | sphinx-autobuild docs docs/_build/html 58 | -------------------------------------------------------------------------------- /tests/test_serialization.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from lunr import lunr 4 | from lunr.index import Index 5 | 6 | 7 | class TestSerialization: 8 | def setup_method(self, method): 9 | documents = [ 10 | { 11 | "id": "a", 12 | "title": "Mr. Green kills Colonel Mustard", 13 | "body": """Mr. Green killed Colonel Mustard in the study with the 14 | candlestick. Mr. 
Green is not a very nice fellow.""", 15 | "word_count": 19, 16 | }, 17 | { 18 | "id": "b", 19 | "title": "Plumb waters plant", 20 | "body": "Professor Plumb has a green plant in his study", 21 | "word_count": 9, 22 | }, 23 | { 24 | "id": "c", 25 | "title": "Scarlett helps Professor", 26 | "body": """Miss Scarlett watered Professor Plumbs green plant 27 | while he was away from his office last week.""", 28 | "word_count": 16, 29 | }, 30 | ] 31 | 32 | self.idx = lunr(ref="id", fields=("title", "body"), documents=documents) 33 | 34 | def test_serialization(self): 35 | serialized_index = json.dumps(self.idx.serialize()) 36 | loaded_index = Index.load(json.loads(serialized_index)) 37 | 38 | assert self.idx.search("green") == loaded_index.search("green") 39 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | import subprocess 5 | 6 | import pytest 7 | 8 | PATTERN = r'([^\ ]+) "([^\"]+)" \[([\d\.]*)\]' 9 | DEFAULT_TOLERANCE = 1e-2 10 | 11 | 12 | def assert_field_vectors_equal(a, b, tol=DEFAULT_TOLERANCE): 13 | assert a[0] == b[0] 14 | for x, y in zip(a[1], b[1]): 15 | assert x == pytest.approx(y, rel=tol) 16 | 17 | 18 | def assert_vectors_equal(a, b, tol=DEFAULT_TOLERANCE): 19 | for x, y in zip(a, b): 20 | assert x == pytest.approx(y, rel=tol) 21 | 22 | 23 | def assert_results_match(results, js_results, tol=DEFAULT_TOLERANCE): 24 | assert len(results) == len(js_results) != 0 25 | for js_result, result in zip(js_results, results): 26 | id_, title, score = re.match(PATTERN, js_result).groups() 27 | assert result["ref"] == id_ 28 | assert result["score"] == pytest.approx(float(score), rel=tol) 29 | 30 | 31 | def read_json_fixture(filename): 32 | fixture_path = os.path.join( 33 | os.path.dirname(__file__), "acceptance_tests", "fixtures", filename 34 | ) 35 | with open(fixture_path) as f: 36 | return json.loads(f.read()) 37 | 38 | 39 | def run_node_script(filename, *args): 40 | js_path = os.path.join( 41 | os.path.dirname(__file__), "acceptance_tests", "javascript", filename 42 | ) 43 | js_output = subprocess.check_output(["node", js_path] + list(args)) 44 | return js_output.decode("utf-8").strip() 45 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py36,py37,py38,py39,py310,pypy3,flake8,black,mypy,docs 3 | 4 | [testenv] 5 | deps = -rrequirements/test.txt 6 | commands = 7 | coverage run -m pytest -m "not acceptance" 8 | coverage report 9 | coverage xml 10 | pytest -m "acceptance" 11 | 12 | [testenv:black] 13 | basepython = python3.8 14 | deps= 15 | black 16 | commands={envbindir}/black --check lunr tests 17 | 18 | [testenv:flake8] 19 | basepython = python3.8 20 | deps= 21 | flake8 22 | commands={envbindir}/flake8 lunr tests 23 | 24 | [testenv:docs] 25 | basepython = python3.8 26 | deps= 27 | furo 28 | sphinx 29 | sphinx-autobuild 30 | myst-parser 31 | commands={envbindir}/sphinx-build docs docs/_build/html 32 | 33 | [testenv:mypy] 34 | basepython = python3.8 35 | deps = mypy 36 | commands={envbindir}/mypy lunr 37 | 38 | [coverage:run] 39 | source=lunr 40 | branch=True 41 | 42 | [coverage:report] 43 | exclude_lines = 44 | if self.debug: 45 | pragma: no cover 46 | raise NotImplementedError 47 | if __name__ == .__main__.: 48 | ignore_errors = True 49 | omit = 50 | tests/* 51 | 
lunr/stemmer.py 52 | show_missing = True 53 | 54 | [flake8] 55 | exclude = lunr/stemmer.py 56 | max-line-length = 92 57 | ignore = E203 W503 58 | 59 | [pytest] 60 | markers = 61 | acceptance: mark test as an acceptance test 62 | 63 | [gh-actions] 64 | python = 65 | 3.6: py36 66 | 3.7: py37 67 | 3.8: py38,flake8,black,docs,mypy 68 | 3.9: py39 69 | 3.10: py310 70 | pypy3: pypy3 71 | -------------------------------------------------------------------------------- /tests/fixtures/stemming_vocab.json: -------------------------------------------------------------------------------- 1 | {"consign":"consign","consigned":"consign","consigning":"consign","consignment":"consign","consist":"consist","consisted":"consist","consistency":"consist","consistent":"consist","consistently":"consist","consisting":"consist","consists":"consist","consolation":"consol","consolations":"consol","consolatory":"consolatori","console":"consol","consoled":"consol","consoles":"consol","consolidate":"consolid","consolidated":"consolid","consolidating":"consolid","consoling":"consol","consols":"consol","consonant":"conson","consort":"consort","consorted":"consort","consorting":"consort","conspicuous":"conspicu","conspicuously":"conspicu","conspiracy":"conspiraci","conspirator":"conspir","conspirators":"conspir","conspire":"conspir","conspired":"conspir","conspiring":"conspir","constable":"constabl","constables":"constabl","constance":"constanc","constancy":"constanc","constant":"constant","knack":"knack","knackeries":"knackeri","knacks":"knack","knag":"knag","knave":"knave","knaves":"knave","knavish":"knavish","kneaded":"knead","kneading":"knead","knee":"knee","kneel":"kneel","kneeled":"kneel","kneeling":"kneel","kneels":"kneel","knees":"knee","knell":"knell","knelt":"knelt","knew":"knew","knick":"knick","knif":"knif","knife":"knife","knight":"knight","knights":"knight","knit":"knit","knits":"knit","knitted":"knit","knitting":"knit","knives":"knive","knob":"knob","knobs":"knob","knock":"knock","knocked":"knock","knocker":"knocker","knockers":"knocker","knocking":"knock","knocks":"knock","knopp":"knopp","knot":"knot","knots":"knot","lay":"lai","try":"try"} -------------------------------------------------------------------------------- /.github/workflows/test-suite.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: CI 3 | 4 | on: 5 | push: 6 | branches: ["master"] 7 | pull_request: 8 | branches: ["master"] 9 | 10 | jobs: 11 | tests: 12 | name: "Python ${{ matrix.python-version }}" 13 | runs-on: "ubuntu-latest" 14 | env: 15 | USING_COVERAGE: '3.8' 16 | 17 | strategy: 18 | matrix: 19 | python-version: ["3.6", "3.7", "3.8", "3.9", "3.10", "pypy3"] 20 | 21 | steps: 22 | - uses: "actions/checkout@v2" 23 | - uses: "actions/setup-python@v2" 24 | with: 25 | python-version: "${{ matrix.python-version }}" 26 | - uses: actions/setup-node@v1 27 | with: 28 | node-version: '14' 29 | 30 | - name: "Install dependencies" 31 | run: | 32 | set -xe 33 | python -VV 34 | python -m site 35 | python -m pip install --upgrade pip setuptools wheel 36 | python -m pip install --upgrade coverage[toml] virtualenv tox tox-gh-actions 37 | cd tests/acceptance_tests/javascript/ && npm install 38 | 39 | - name: "Run tox targets for ${{ matrix.python-version }}" 40 | run: "python -m tox" 41 | 42 | - name: "Convert coverage" 43 | if: "contains(env.USING_COVERAGE, matrix.python-version)" 44 | run: "python -m coverage xml" 45 | 46 | - name: "Upload coverage to Codecov" 47 | if: "contains(env.USING_COVERAGE, 
matrix.python-version)" 48 | uses: "codecov/codecov-action@v1" 49 | with: 50 | fail_ci_if_error: true 51 | 52 | -------------------------------------------------------------------------------- /tests/test_stop_word_filter.py: -------------------------------------------------------------------------------- 1 | from lunr.stop_word_filter import stop_word_filter, generate_stop_word_filter 2 | from lunr.pipeline import Pipeline 3 | 4 | STOP_WORDS = ["the", "and", "but", "than", "when"] 5 | 6 | 7 | class TestStopWordFilter: 8 | def test_filters_stop_words(self): 9 | for word in STOP_WORDS: 10 | assert stop_word_filter(word) is None 11 | 12 | def test_ignores_non_stop_words(self): 13 | non_stop_words = ["interesting", "words", "pass", "through"] 14 | for word in non_stop_words: 15 | assert stop_word_filter(word) == word 16 | 17 | def test_is_a_registered_pipeline_function(self): 18 | assert stop_word_filter.label == "stopWordFilter" 19 | assert Pipeline.registered_functions["stopWordFilter"] == stop_word_filter 20 | 21 | 22 | class TestGenerateStopWordFilter: 23 | def test_creates_correct_stop_words_filter(self): 24 | new_stop_word_filter = generate_stop_word_filter(STOP_WORDS) 25 | for word in STOP_WORDS: 26 | assert new_stop_word_filter(word) is None 27 | 28 | def test_registers_new_stop_words_filter(self): 29 | new_stop_word_filter = generate_stop_word_filter(STOP_WORDS) 30 | assert new_stop_word_filter.label == "stopWordFilter" 31 | assert Pipeline.registered_functions["stopWordFilter"] == new_stop_word_filter 32 | 33 | def test_passing_a_language_adds_to_registered_label(self): 34 | new_stop_word_filter = generate_stop_word_filter(STOP_WORDS, "es") 35 | assert new_stop_word_filter.label == "stopWordFilter-es" 36 | assert ( 37 | Pipeline.registered_functions["stopWordFilter-es"] == new_stop_word_filter 38 | ) 39 | -------------------------------------------------------------------------------- /tests/acceptance_tests/test_mkdocs.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tempfile 3 | 4 | import pytest 5 | 6 | from lunr import lunr 7 | from lunr.index import Index 8 | from tests.utils import read_json_fixture, run_node_script, assert_results_match 9 | 10 | 11 | @pytest.mark.acceptance 12 | def test_mkdocs_produces_same_results(): 13 | query_string = "plugins" 14 | js_results = run_node_script("mkdocs_query.js", query_string).split("\n") 15 | data = read_json_fixture("mkdocs_index.json") 16 | index = lunr(ref="id", fields=("title", "text"), documents=data["docs"]) 17 | results = index.search(query_string) 18 | assert_results_match(results, js_results) 19 | 20 | 21 | @pytest.mark.acceptance 22 | def test_js_serialized_index_can_be_loaded_and_produces_same_results(): 23 | json_path = run_node_script("mkdocs_serialization.js") 24 | with open(json_path) as fd: 25 | js_serialized_index = fd.read() 26 | 27 | index = Index.load(js_serialized_index) 28 | query_string = "plugins" 29 | results = index.search(query_string) 30 | js_results = run_node_script("mkdocs_query.js", query_string).split("\n") 31 | assert_results_match(results, js_results) 32 | 33 | 34 | @pytest.mark.acceptance 35 | def test_serialized_index_can_be_loaded_in_js_and_produces_same_results(): 36 | data = read_json_fixture("mkdocs_index.json") 37 | index = lunr(ref="id", fields=("title", "text"), documents=data["docs"]) 38 | query_string = "plugins" 39 | results = index.search(query_string) 40 | serialized_index = index.serialize() 41 | 42 | with 
tempfile.NamedTemporaryFile(delete=False) as fp: 43 | fp.write(json.dumps(serialized_index).encode()) 44 | 45 | js_results = run_node_script( 46 | "mkdocs_load_serialized_index_and_search.js", fp.name, query_string 47 | ).split("\n") 48 | assert_results_match(results, js_results) 49 | -------------------------------------------------------------------------------- /lunr/token_set_builder.py: -------------------------------------------------------------------------------- 1 | from lunr.token_set import TokenSet 2 | from lunr.exceptions import BaseLunrException 3 | 4 | 5 | class TokenSetBuilder: 6 | def __init__(self): 7 | self.previous_word = "" 8 | self.root = TokenSet() 9 | self.unchecked_nodes = [] 10 | self.minimized_nodes = {} 11 | 12 | def insert(self, word): 13 | if word < self.previous_word: 14 | raise BaseLunrException("Out of order word insertion") 15 | 16 | common_prefix = 0 17 | for i in range(min(len(word), len(self.previous_word))): 18 | if word[i] != self.previous_word[i]: 19 | break 20 | 21 | common_prefix += 1 22 | 23 | self.minimize(common_prefix) 24 | 25 | node = ( 26 | self.root if not self.unchecked_nodes else self.unchecked_nodes[-1]["child"] 27 | ) 28 | 29 | for i in range(common_prefix, len(word)): 30 | next_node = TokenSet() 31 | char = word[i] 32 | 33 | node.edges[char] = next_node 34 | 35 | self.unchecked_nodes.append( 36 | {"parent": node, "char": char, "child": next_node} 37 | ) 38 | 39 | node = next_node 40 | 41 | node.final = True 42 | self.previous_word = word 43 | 44 | def finish(self): 45 | self.minimize(0) 46 | 47 | def minimize(self, down_to): 48 | for i in range(len(self.unchecked_nodes) - 1, down_to - 1, -1): 49 | node = self.unchecked_nodes[i] 50 | child_key = str(node["child"]) 51 | 52 | if child_key in self.minimized_nodes: 53 | node["parent"].edges[node["char"]] = self.minimized_nodes[child_key] 54 | else: 55 | node["child"]._str = child_key 56 | self.minimized_nodes[child_key] = node["child"] 57 | 58 | self.unchecked_nodes.pop() 59 | -------------------------------------------------------------------------------- /tests/test_plugins.py: -------------------------------------------------------------------------------- 1 | from lunr import lunr, get_default_builder 2 | from lunr.pipeline import Pipeline 3 | from lunr.stemmer import stemmer 4 | from lunr.trimmer import trimmer 5 | from lunr.stop_word_filter import stop_word_filter 6 | 7 | documents = [ 8 | { 9 | "id": "a", 10 | "title": "Mr. Green kills Colonel Mustard", 11 | "body": """Mr. Green killed Colonel Mustard in the study with the 12 | candlestick. Mr. 
Green is not a very nice fellow.""", 13 | "word_count": 19, 14 | }, 15 | { 16 | "id": "b", 17 | "title": "Plumb waters plant", 18 | "body": "Professor Plumb has a green plant in his study", 19 | "word_count": 9, 20 | }, 21 | { 22 | "id": "c", 23 | "title": "Scarlett helps Professor", 24 | "body": """Miss Scarlett watered Professor Plumbs green plant 25 | while he was away from his office last week.""", 26 | "word_count": 16, 27 | }, 28 | ] 29 | 30 | 31 | def test_get_default_builder(): 32 | builder = get_default_builder() 33 | assert builder.pipeline._stack == [trimmer, stop_word_filter, stemmer] 34 | assert builder.search_pipeline._stack == [stemmer] 35 | 36 | 37 | def test_drop_pipeline_function(): 38 | builder = get_default_builder() 39 | builder.pipeline.remove(stemmer) 40 | 41 | idx = lunr("id", ("title", "body"), documents, builder=builder) 42 | 43 | assert idx.search("kill") == [] # no match because "killed" was not stemmed 44 | 45 | 46 | def test_add_token_metadata(): 47 | builder = get_default_builder() 48 | 49 | def token_length(token, i, tokens): 50 | token.metadata["token_length"] = len(str(token)) 51 | return token 52 | 53 | Pipeline.register_function(token_length) 54 | builder.pipeline.add(token_length) 55 | builder.metadata_whitelist.append("token_length") 56 | 57 | idx = lunr("id", ("title", "body"), documents, builder=builder) 58 | 59 | [result, _, _] = idx.search("green") 60 | assert result["match_data"].metadata["green"]["title"]["token_length"] == [5] 61 | assert result["match_data"].metadata["green"]["body"]["token_length"] == [5, 5] 62 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from setuptools import setup, find_packages 4 | 5 | 6 | PATH = os.path.abspath(os.path.dirname(__file__)) 7 | 8 | 9 | def read_file(filepath): 10 | with open(filepath, "r") as fd: 11 | return fd.read() 12 | 13 | 14 | def find_version(): 15 | version_path = os.path.join(PATH, "lunr", "__init__.py") 16 | contents = read_file(version_path) 17 | version_string = contents[contents.index("__VERSION__") :] 18 | try: 19 | return re.match(r'.*__VERSION__ = [\'"]([\d\w\.]+)[\'"]', version_string).group( 20 | 1 21 | ) 22 | except AttributeError: 23 | raise RuntimeError("Unable to find version string.") 24 | 25 | 26 | setup( 27 | name="lunr", 28 | version=find_version(), 29 | url="https://github.com/yeraydiazdiaz/lunr.py", 30 | project_urls={"Documentation": "https://lunr.readthedocs.io"}, 31 | license="MIT", 32 | description="A Python implementation of Lunr.js", 33 | long_description=read_file("README.md"), 34 | long_description_content_type="text/markdown", 35 | author="Yeray Diaz Diaz", 36 | author_email="yeraydiazdiaz@gmail.com", 37 | packages=find_packages(exclude=("tests",)), 38 | include_package_data=True, 39 | zip_safe=False, 40 | python_requires=">=3.6", 41 | install_requires=[], 42 | extras_require={ 43 | "languages": ["nltk"] 44 | }, 45 | keywords="lunr full text search", 46 | classifiers=[ 47 | "Development Status :: 4 - Beta", 48 | "Intended Audience :: Developers", 49 | "License :: OSI Approved :: MIT License", 50 | "Operating System :: OS Independent", 51 | "Programming Language :: Python", 52 | "Programming Language :: Python :: 3", 53 | "Programming Language :: Python :: 3.6", 54 | "Programming Language :: Python :: 3.7", 55 | "Programming Language :: Python :: 3.8", 56 | "Programming Language :: Python :: 3.9", 57 | 
"Programming Language :: Python :: 3.10", 58 | "Programming Language :: Python :: Implementation :: CPython", 59 | "Programming Language :: Python :: Implementation :: PyPy", 60 | "Topic :: Text Processing", 61 | ], 62 | ) 63 | -------------------------------------------------------------------------------- /lunr/tokenizer.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | from lunr.token import Token 4 | from lunr.utils import as_string 5 | 6 | SEPARATOR_CHARS = " \t\n\r\f\v\xa0-" 7 | 8 | 9 | def default_separator(char): 10 | return char and char in SEPARATOR_CHARS 11 | 12 | 13 | def Tokenizer(obj, metadata=None, separator=None): 14 | """Splits a string into tokens ready to be inserted into the search index. 15 | 16 | Args: 17 | metadata (dict): Optional metadata can be passed to the tokenizer, this 18 | metadata will be cloned and added as metadata to every token that is 19 | created from the object to be tokenized. 20 | separator (callable or compiled regex): This tokenizer will convert its 21 | parameter to a string by calling `str` and then will split this 22 | string on characters for which `separator` is True. Lists will have 23 | their elements converted to strings and wrapped in a lunr `Token`. 24 | 25 | Returns: 26 | List of Token instances. 27 | """ 28 | if obj is None: 29 | return [] 30 | 31 | metadata = metadata or {} 32 | 33 | if isinstance(obj, (list, tuple)): 34 | return [ 35 | Token(as_string(element).lower(), deepcopy(metadata)) for element in obj 36 | ] 37 | 38 | if separator is None: 39 | is_separator = default_separator 40 | elif callable(separator): 41 | is_separator = separator 42 | else: # must be a regex, remove when dropping support for 2.7 43 | is_separator = lambda c: separator.match(c) # noqa 44 | 45 | string = str(obj).lower() 46 | length = len(string) 47 | tokens = [] 48 | slice_start = 0 49 | for slice_end in range(length + 1): 50 | char = string[slice_end] if slice_end != length else "" 51 | slice_length = slice_end - slice_start 52 | if is_separator(char) or slice_end == length: 53 | if slice_length > 0: 54 | token_metadata = {} 55 | token_metadata["position"] = [slice_start, slice_length] 56 | token_metadata["index"] = len(tokens) 57 | token_metadata.update(metadata) 58 | 59 | sl = slice(slice_start, slice_end) 60 | tokens.append(Token(string[sl], token_metadata)) 61 | 62 | slice_start = slice_end + 1 63 | 64 | return tokens 65 | -------------------------------------------------------------------------------- /tests/test_match_data.py: -------------------------------------------------------------------------------- 1 | from lunr.match_data import MatchData 2 | 3 | 4 | class TestMatchData: 5 | def setup_method(self, method): 6 | self.match = MatchData("foo", "title", {"position": [1]}) 7 | self.match.combine(MatchData("bar", "title", {"position": [2]})) 8 | self.match.combine(MatchData("baz", "body", {"position": [3]})) 9 | self.match.combine(MatchData("baz", "body", {"position": [4]})) 10 | 11 | def test_repr(self): 12 | assert repr(self.match) == '' 13 | 14 | def test_create_empty_match_data(self): 15 | assert MatchData().metadata == {} 16 | 17 | def test_create_missing_field(self): 18 | assert MatchData("foo").metadata["foo"] == {} 19 | 20 | def test_create_missing_metadata(self): 21 | assert MatchData("foo", "title").metadata["foo"]["title"] == {} 22 | 23 | def test_combine_terms(self): 24 | assert sorted(list(self.match.metadata.keys())) == ["bar", "baz", "foo"] 25 | 26 | def 
test_combine_metadata(self): 27 | assert self.match.metadata["foo"]["title"]["position"] == [1] 28 | assert self.match.metadata["bar"]["title"]["position"] == [2] 29 | assert self.match.metadata["baz"]["body"]["position"] == [3, 4] 30 | 31 | def test_combine_does_not_mutate_source_data(self): 32 | metadata = {"foo": [1]} 33 | match_data1 = MatchData("foo", "title", metadata) 34 | match_data2 = MatchData("foo", "title", metadata) 35 | 36 | match_data1.combine(match_data2) 37 | 38 | assert metadata["foo"] == [1] 39 | 40 | def test_add_metadata_for_missing_term(self): 41 | self.match.add("spam", "title", {"position": [5]}) 42 | 43 | assert self.match.metadata["spam"]["title"]["position"] == [5] 44 | 45 | def test_add_metadata_for_missing_field(self): 46 | self.match.add("foo", "body", {"position": [6]}) 47 | 48 | assert self.match.metadata["foo"]["body"]["position"] == [6] 49 | 50 | def test_add_metadata_for_existing_term_field_and_metadata_key(self): 51 | self.match.add("foo", "title", {"position": [7]}) 52 | 53 | assert self.match.metadata["foo"]["title"]["position"] == [1, 7] 54 | 55 | def test_add_metadata_for_existing_term_and_field_and_missing_metadata_key(self): 56 | self.match.add("foo", "title", {"weight": [7]}) 57 | 58 | assert self.match.metadata["foo"]["title"] == {"position": [1], "weight": [7]} 59 | -------------------------------------------------------------------------------- /tests/benchmarks.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | import pytest 4 | 5 | from tests.utils import read_json_fixture 6 | 7 | from lunr import lunr 8 | from lunr.pipeline import Pipeline 9 | 10 | 11 | def get_mkdocs_index(): 12 | data = read_json_fixture("mkdocs_index.json") 13 | return lunr(ref="id", fields=("title", "text"), documents=data["docs"]) 14 | 15 | 16 | class TestSearchBenchmarks: 17 | @pytest.fixture(scope="session") 18 | def index(self): 19 | return get_mkdocs_index() 20 | 21 | def test_search(self, index, benchmark): 22 | benchmark(index.search, "styling") 23 | 24 | 25 | class TestPipelineBenchmarks: 26 | 27 | FEW_COUNT = 50 28 | MANY_COUNT = 1000 29 | 30 | @pytest.fixture(scope="session") 31 | def many_tokens(self): 32 | path = os.path.join(os.path.dirname(__file__), "fixtures/words.txt") 33 | with open(path) as words: 34 | self.many_tokens = [ 35 | words.readline().strip() for _ in range(self.MANY_COUNT) 36 | ] 37 | self.few_tokens = self.many_tokens[: self.FEW_COUNT] 38 | yield self.many_tokens 39 | 40 | @pytest.fixture(scope="session") 41 | def few_tokens(self, many_tokens): 42 | yield self.few_tokens 43 | 44 | @staticmethod 45 | def token_to_token(token, i, tokens): 46 | return token 47 | 48 | @staticmethod 49 | def token_to_token_array(token, i, tokens): 50 | return [token, token] 51 | 52 | def test_few_token_to_token(self, few_tokens, benchmark): 53 | token_to_token_pipeline = Pipeline() 54 | token_to_token_pipeline.add(self.token_to_token) 55 | benchmark(token_to_token_pipeline.run, few_tokens) 56 | 57 | def test_many_token_to_token(self, many_tokens, benchmark): 58 | token_to_token_pipeline = Pipeline() 59 | token_to_token_pipeline.add(self.token_to_token) 60 | benchmark(token_to_token_pipeline.run, many_tokens) 61 | 62 | def test_few_token_to_token_array(self, few_tokens, benchmark): 63 | token_to_token_array_pipeline = Pipeline() 64 | token_to_token_array_pipeline.add(self.token_to_token_array) 65 | benchmark(token_to_token_array_pipeline.run, few_tokens) 66 | 67 | def 
test_many_token_to_token_array(self, many_tokens, benchmark): 68 | token_to_token_array_pipeline = Pipeline() 69 | token_to_token_array_pipeline.add(self.token_to_token_array) 70 | benchmark(token_to_token_array_pipeline.run, many_tokens) 71 | 72 | 73 | if __name__ == "__main__": 74 | get_mkdocs_index() 75 | -------------------------------------------------------------------------------- /readme.rst: -------------------------------------------------------------------------------- 1 | |Build Status| |codecov| 2 | 3 | Lunr.py 4 | ======= 5 | 6 | A Python implementation of `Lunr.js <https://lunrjs.com>`__ by `Oliver 7 | Nightingale <https://github.com/olivernn>`__. 8 | 9 | A bit like Solr, but much smaller and not as bright. 10 | 11 | This Python version of Lunr.js aims to bring the simple and powerful 12 | full text search capabilities into Python, guaranteeing results as close 13 | to the original implementation as possible. 14 | 15 | Current state: 16 | -------------- 17 | 18 | Each version of lunr.py `targets a specific version of 19 | lunr.js `__ 20 | and produces the same results as it, in both Python 2.7 and 3, for 21 | a `non-trivial corpus of 22 | documents `__. 23 | 24 | Lunr.py also serializes ``Index`` instances respecting the 25 | ```lunr-schema`` `__, which are 26 | consumable by Lunr.js and vice versa. 27 | 28 | The API is in alpha stage and likely to change. 29 | 30 | Usage: 31 | ------ 32 | 33 | You’ll need a list of dicts representing the documents you want to 34 | search on. These documents must have a unique field which will serve as 35 | a reference and a series of fields you’d like to search on. 36 | 37 | Lunr provides a convenience ``lunr`` function to quickly index this set 38 | of documents: 39 | 40 | .. code:: python 41 | 42 | >>> from lunr import lunr 43 | >>> 44 | >>> documents = [{ 45 | ... 'id': 'a', 46 | ... 'title': 'Mr. Green kills Colonel Mustard', 47 | ... 'body': 'Mr. Green killed Colonel Mustard in the study with the candlestick.', 48 | ... }, { 49 | ... 'id': 'b', 50 | ... 'title': 'Plumb waters plant', 51 | ... 'body': 'Professor Plumb has a green plant in his study', 52 | ... }] 53 | >>> idx = lunr( 54 | ... ref='id', fields=('title', 'body'), documents=documents 55 | ... ) 56 | >>> idx.search('kill') 57 | [{'ref': 'a', 'score': 0.6931722372559913, 'match_data': <MatchData "kill">}] 58 | >>> idx.search('study') 59 | [{'ref': 'b', 'score': 0.23576799568081389, 'match_data': <MatchData "studi">}, {'ref': 'a', 'score': 0.2236629211724517, 'match_data': <MatchData "studi">}] 60 | 61 | .. |Build Status| image:: https://travis-ci.org/yeraydiazdiaz/lunr.py.svg?branch=master 62 | :target: https://travis-ci.org/yeraydiazdiaz/lunr.py 63 | .. |codecov| image:: https://codecov.io/gh/yeraydiazdiaz/lunr.py/branch/master/graph/badge.svg 64 | :target: https://codecov.io/gh/yeraydiazdiaz/lunr.py 65 | -------------------------------------------------------------------------------- /lunr/match_data.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | 4 | class MatchData: 5 | """Contains and collects metadata about a matching document. 6 | 7 | A single instance of lunr.MatchData is returned as part of every 8 | lunr.Index.Result.
9 | """ 10 | 11 | def __init__(self, term=None, field=None, metadata=None): 12 | self.metadata = {} 13 | if term is not None: 14 | self.metadata[term] = {} 15 | if field is not None: 16 | self.metadata[term][field] = ( 17 | deepcopy(metadata) if metadata is not None else {} 18 | ) 19 | 20 | def __repr__(self): 21 | return '<MatchData "{}">'.format(",".join(sorted(self.metadata.keys()))) 22 | 23 | def combine(self, other): 24 | """An instance of lunr.MatchData will be created for every term that 25 | matches a document. 26 | 27 | However only one instance is required in a lunr.Index~Result. This 28 | method combines metadata from another instance of MatchData with this 29 | object's metadata. 30 | """ 31 | for term in other.metadata.keys(): 32 | if term not in self.metadata: 33 | self.metadata[term] = {} 34 | 35 | fields = other.metadata[term].keys() 36 | for field in fields: 37 | if field not in self.metadata[term]: 38 | self.metadata[term][field] = {} 39 | 40 | keys = other.metadata[term][field].keys() 41 | for key in keys: 42 | if key not in self.metadata[term][field]: 43 | self.metadata[term][field][key] = other.metadata[term][field][ 44 | key 45 | ] 46 | else: 47 | self.metadata[term][field][key].extend( 48 | other.metadata[term][field][key] 49 | ) 50 | 51 | def add(self, term, field, metadata): 52 | """Add metadata for a term/field pair to this instance of match data""" 53 | if term not in self.metadata: 54 | self.metadata[term] = {field: metadata} 55 | return 56 | 57 | if field not in self.metadata[term]: 58 | self.metadata[term][field] = metadata 59 | return 60 | 61 | for key in metadata.keys(): 62 | if key in self.metadata[term][field]: 63 | self.metadata[term][field][key].extend(metadata[key]) 64 | else: 65 | self.metadata[term][field][key] = metadata[key] 66 | 67 | def __eq__(self, other): 68 | return self.metadata == other.metadata 69 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = "lunr.py" 21 | copyright = "2022, Yeray Diaz Diaz" 22 | author = "Yeray Diaz Diaz" 23 | 24 | 25 | # -- General configuration --------------------------------------------------- 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be 28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 29 | # ones. 30 | extensions = [ 31 | "sphinx.ext.autodoc", 32 | "sphinx.ext.extlinks", 33 | "sphinx.ext.intersphinx", 34 | "sphinx.ext.mathjax", 35 | "sphinx.ext.todo", 36 | "sphinx.ext.viewcode", 37 | "myst_parser", 38 | ] 39 | 40 | # Add any paths that contain templates here, relative to this directory.
41 | templates_path = ["_templates"] 42 | 43 | # List of patterns, relative to source directory, that match files and 44 | # directories to ignore when looking for source files. 45 | # This pattern also affects html_static_path and html_extra_path. 46 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 47 | 48 | 49 | # -- Options for HTML output ------------------------------------------------- 50 | 51 | # The theme to use for HTML and HTML Help pages. See the documentation for 52 | # a list of builtin themes. 53 | # 54 | html_theme = "furo" 55 | html_title = "Lunr.py" 56 | 57 | # Add any paths that contain custom static files (such as style sheets) here, 58 | # relative to this directory. They are copied after the builtin static files, 59 | # so a file named "default.css" will overwrite the builtin "default.css". 60 | html_static_path = ["_static"] 61 | 62 | # 63 | # -- Options for extlinks ---------------------------------------------------- 64 | # 65 | extlinks = {"pypi": ("https://pypi.org/project/%s/", "")} 66 | 67 | # 68 | # -- Options for intersphinx ------------------------------------------------- 69 | # 70 | intersphinx_mapping = { 71 | "python": ("https://docs.python.org/3", None), 72 | "sphinx": ("https://www.sphinx-doc.org/", None), 73 | } 74 | -------------------------------------------------------------------------------- /tests/test_index.py: -------------------------------------------------------------------------------- 1 | import json 2 | from mock import MagicMock, patch 3 | 4 | import pytest 5 | 6 | from lunr import __TARGET_JS_VERSION__ 7 | from lunr.index import Index 8 | from lunr.exceptions import BaseLunrException 9 | 10 | from tests.utils import assert_vectors_equal 11 | 12 | 13 | class TestIndex: 14 | def test_create_query_default_fields(self, index): 15 | query = index.create_query() 16 | assert query.all_fields == index.fields 17 | 18 | def test_create_query_subset_of_fields(self, index): 19 | query = index.create_query([index.fields[0]]) 20 | assert query.all_fields == [index.fields[0]] 21 | 22 | def test_create_query_non_contained_fields(self, index): 23 | with pytest.raises(BaseLunrException): 24 | index.create_query(["foo"]) 25 | 26 | def test_query_no_arguments_warns_and_returns_no_results(self, monkeypatch, index): 27 | from lunr.index import logger 28 | 29 | mock_logger = MagicMock() 30 | monkeypatch.setattr(logger, "warning", mock_logger) 31 | results = index.query() 32 | assert results == [] 33 | mock_logger.assert_called_once() 34 | 35 | def test_query_callback_argument_is_query_with_fields(self, index): 36 | def callback(query): 37 | assert query.all_fields == index.fields 38 | 39 | index.query(callback=callback) 40 | 41 | def test_query_callback_can_configure_query(self, index): 42 | def callback(query): 43 | query.clause("study") 44 | 45 | results = index.query(callback=callback) 46 | assert len(results) == 2 47 | assert results[0]["ref"] == "b" 48 | assert results[1]["ref"] == "a" 49 | 50 | 51 | class TestIndexSerialization: 52 | def test_serialization(self, index): 53 | serialized_index = index.serialize() 54 | assert serialized_index["version"] == __TARGET_JS_VERSION__ 55 | assert serialized_index["fields"] == index.fields 56 | for ref, vector in serialized_index["fieldVectors"]: 57 | assert ref in index.field_vectors 58 | assert_vectors_equal(vector, index.field_vectors[ref]) 59 | 60 | def test_json_deserialization(self, index): 61 | serialized_index = index.serialize() 62 | json_serialized_index = json.dumps(serialized_index) 63 | 64 | 
idx = Index.load(json_serialized_index) 65 | 66 | assert idx == index 67 | 68 | def test_load_warns_on_js_version_mismatch(self, index): 69 | serialized_index = index.serialize() 70 | serialized_index["version"] = "1.0.0" 71 | 72 | with patch("lunr.index.logger") as mock_log: 73 | Index.load(serialized_index) 74 | mock_log.warning.assert_called_once() 75 | -------------------------------------------------------------------------------- /tests/acceptance_tests/test_language_support.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tempfile 3 | 4 | import pytest 5 | 6 | from lunr import lunr 7 | from lunr.index import Index 8 | from tests.utils import read_json_fixture, run_node_script, assert_results_match 9 | 10 | 11 | @pytest.mark.acceptance 12 | def test_languages_query_results_match_javascript_results(): 13 | query_string = "resistencia" 14 | js_results = run_node_script("language_query.js", query_string).split("\n") 15 | data = read_json_fixture("lang_es.json") 16 | index = lunr( 17 | ref="id", fields=("title", "text"), documents=data["docs"], languages="es" 18 | ) 19 | results = index.search(query_string) 20 | assert_results_match(results, js_results, tol=0.1) 21 | 22 | 23 | @pytest.mark.acceptance 24 | def test_js_serialized_lang_index_can_be_loaded_and_produces_same_results(): 25 | json_path = run_node_script("language_serialize_index.js") 26 | with open(json_path) as fd: 27 | js_serialized_index = fd.read() 28 | 29 | index = Index.load(js_serialized_index) 30 | query_string = "imperio" 31 | results = index.search(query_string) 32 | js_results = run_node_script("language_query.js", query_string).split("\n") 33 | assert_results_match(results, js_results) 34 | 35 | 36 | @pytest.mark.acceptance 37 | def test_serialized_lang_index_can_be_loaded_in_js_and_produces_same_results(): 38 | data = read_json_fixture("lang_es.json") 39 | index = lunr( 40 | ref="id", fields=("title", "text"), documents=data["docs"], languages="es" 41 | ) 42 | query_string = "imperio" 43 | results = index.search(query_string) 44 | serialized_index = index.serialize() 45 | 46 | with tempfile.NamedTemporaryFile(delete=False) as fp: 47 | fp.write(json.dumps(serialized_index).encode()) 48 | 49 | js_results = run_node_script( 50 | "language_load_serialized_index_and_search.js", fp.name, query_string 51 | ).split("\n") 52 | assert_results_match(results, js_results) 53 | 54 | 55 | @pytest.mark.acceptance 56 | def test_serialized_multilang_index_can_be_loaded_in_js_and_results_equal(): 57 | data = read_json_fixture("lang_es_en.json") 58 | index = lunr( 59 | ref="id", 60 | fields=("title", "text"), 61 | documents=data["docs"], 62 | languages=["es", "en"], 63 | ) 64 | query_string = "taxation" 65 | results = index.search(query_string) 66 | serialized_index = index.serialize() 67 | 68 | with tempfile.NamedTemporaryFile(delete=False) as fp: 69 | fp.write(json.dumps(serialized_index).encode()) 70 | 71 | js_results = run_node_script( 72 | "language_load_serialized_index_and_search.js", 73 | fp.name, 74 | query_string, 75 | "lang_es_en.json", 76 | ).split("\n") 77 | assert_results_match(results, js_results) 78 | -------------------------------------------------------------------------------- /lunr/__main__.py: -------------------------------------------------------------------------------- 1 | from lunr import languages as lang 2 | from lunr.builder import Builder 3 | from lunr.stemmer import stemmer 4 | from lunr.trimmer import trimmer 5 | from lunr.stop_word_filter 
import stop_word_filter 6 | 7 | 8 | def lunr(ref, fields, documents, languages=None, builder=None): 9 | """A convenience function to configure and construct a lunr.Index. 10 | 11 | Args: 12 | ref (str): The key in the documents to be used a the reference. 13 | fields (list): A list of strings defining fields in the documents to 14 | index. Optionally a list of dictionaries with three keys: 15 | `field_name` defining the document's field, `boost` an integer 16 | defining a boost to be applied to the field, and `extractor` 17 | a callable taking the document as a single argument and returning 18 | a string located in the document in a particular way. 19 | documents (list): The list of dictonaries representing the documents 20 | to index. Optionally a 2-tuple of dicts, the first one being 21 | the document and the second the associated attributes to it. 22 | languages (str or list, optional): The languages to use if using 23 | NLTK language support, ignored if NLTK is not available. 24 | 25 | Returns: 26 | Index: The populated Index ready to search against. 27 | """ 28 | builder = builder or get_default_builder(languages) 29 | builder.ref(ref) 30 | for field in fields: 31 | if isinstance(field, dict): 32 | builder.field(**field) 33 | else: 34 | builder.field(field) 35 | 36 | for document in documents: 37 | if isinstance(document, (tuple, list)): 38 | builder.add(document[0], attributes=document[1]) 39 | else: 40 | builder.add(document) 41 | 42 | return builder.build() 43 | 44 | 45 | def get_default_builder(languages=None): 46 | """Creates a new pre-configured instance of Builder. 47 | 48 | Useful as a starting point to tweak the defaults. 49 | """ 50 | if languages is not None and lang.LANGUAGE_SUPPORT: 51 | if isinstance(languages, str): 52 | languages = [languages] 53 | 54 | unsupported_languages = set(languages) - set(lang.SUPPORTED_LANGUAGES) 55 | if unsupported_languages: 56 | raise RuntimeError( 57 | "The specified languages {} are not supported, " 58 | "please choose one of {}".format( 59 | ", ".join(unsupported_languages), 60 | ", ".join(lang.SUPPORTED_LANGUAGES.keys()), 61 | ) 62 | ) 63 | builder = lang.get_nltk_builder(languages) 64 | else: 65 | builder = Builder() 66 | builder.pipeline.add(trimmer, stop_word_filter, stemmer) 67 | builder.search_pipeline.add(stemmer) 68 | 69 | return builder 70 | -------------------------------------------------------------------------------- /lunr/stop_word_filter.py: -------------------------------------------------------------------------------- 1 | from lunr.pipeline import Pipeline 2 | 3 | WORDS = { 4 | "a", 5 | "able", 6 | "about", 7 | "across", 8 | "after", 9 | "all", 10 | "almost", 11 | "also", 12 | "am", 13 | "among", 14 | "an", 15 | "and", 16 | "any", 17 | "are", 18 | "as", 19 | "at", 20 | "be", 21 | "because", 22 | "been", 23 | "but", 24 | "by", 25 | "can", 26 | "cannot", 27 | "could", 28 | "dear", 29 | "did", 30 | "do", 31 | "does", 32 | "either", 33 | "else", 34 | "ever", 35 | "every", 36 | "for", 37 | "from", 38 | "get", 39 | "got", 40 | "had", 41 | "has", 42 | "have", 43 | "he", 44 | "her", 45 | "hers", 46 | "him", 47 | "his", 48 | "how", 49 | "however", 50 | "i", 51 | "if", 52 | "in", 53 | "into", 54 | "is", 55 | "it", 56 | "its", 57 | "just", 58 | "least", 59 | "let", 60 | "like", 61 | "likely", 62 | "may", 63 | "me", 64 | "might", 65 | "most", 66 | "must", 67 | "my", 68 | "neither", 69 | "no", 70 | "nor", 71 | "not", 72 | "of", 73 | "off", 74 | "often", 75 | "on", 76 | "only", 77 | "or", 78 | "other", 79 | "our", 80 | "own", 81 | 
"rather", 82 | "said", 83 | "say", 84 | "says", 85 | "she", 86 | "should", 87 | "since", 88 | "so", 89 | "some", 90 | "than", 91 | "that", 92 | "the", 93 | "their", 94 | "them", 95 | "then", 96 | "there", 97 | "these", 98 | "they", 99 | "this", 100 | "tis", 101 | "to", 102 | "too", 103 | "twas", 104 | "us", 105 | "wants", 106 | "was", 107 | "we", 108 | "were", 109 | "what", 110 | "when", 111 | "where", 112 | "which", 113 | "while", 114 | "who", 115 | "whom", 116 | "why", 117 | "will", 118 | "with", 119 | "would", 120 | "yet", 121 | "you", 122 | "your", 123 | } 124 | 125 | 126 | def generate_stop_word_filter(stop_words, language=None): 127 | """Builds a stopWordFilter function from the provided list of stop words. 128 | 129 | The built in `stop_word_filter` is built using this factory and can be used 130 | to generate custom `stop_word_filter` for applications or non English 131 | languages. 132 | """ 133 | 134 | def stop_word_filter(token, i=None, tokens=None): 135 | if token and str(token) not in stop_words: 136 | return token 137 | 138 | # camelCased for for compatibility with lunr.js 139 | label = ( 140 | "stopWordFilter-{}".format(language) 141 | if language is not None 142 | else "stopWordFilter" 143 | ) 144 | Pipeline.register_function(stop_word_filter, label) 145 | return stop_word_filter 146 | 147 | 148 | stop_word_filter = generate_stop_word_filter(WORDS) 149 | -------------------------------------------------------------------------------- /tests/test_language_support.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from lunr import lunr 4 | from lunr.languages import LANGUAGE_SUPPORT, SUPPORTED_LANGUAGES 5 | from lunr.pipeline import Pipeline 6 | 7 | documents = [ 8 | { 9 | "id": "a", 10 | "text": ( 11 | "Este es un ejemplo inventado de lo que sería un documento en el " 12 | "idioma que se más se habla en España." 13 | ), 14 | "title": "Ejemplo de documento en español", 15 | }, 16 | { 17 | "id": "b", 18 | "text": ( 19 | "Según un estudio que me acabo de inventar porque soy un experto en" 20 | "idiomas que se hablan en España." 
21 | ), 22 | "title": "Español es el tercer idioma más hablado del mundo", 23 | }, 24 | ] 25 | 26 | 27 | class TestLanguageSupport: 28 | @classmethod 29 | def setup_class(cls): 30 | assert ( 31 | LANGUAGE_SUPPORT is True 32 | ), "NLTK not found, please run `pip install -e .[languages]`" 33 | 34 | def test_lunr_function_raises_if_unsupported_language(self): 35 | with pytest.raises(RuntimeError): 36 | lunr("id", ["title", "text"], documents, "foo") 37 | 38 | def test_lunr_function_raises_if_any_unsupported_language_is_passed(self): 39 | with pytest.raises(RuntimeError): 40 | lunr("id", ["title", "text"], documents, ["es", "foo"]) 41 | 42 | def test_register_languages_in_pipeline_class(self): 43 | for lang in set(SUPPORTED_LANGUAGES) - {"en"}: 44 | assert "stemmer-{}".format(lang) in Pipeline.registered_functions 45 | 46 | def test_lunr_function_registers_nltk_stemmers_in_pipeline(self): 47 | idx = lunr("id", ["title", "text"], documents, ["es", "it"]) 48 | assert "stemmer-es" in repr(idx.pipeline) 49 | assert "stemmer-it" in repr(idx.pipeline) 50 | 51 | def test_lunr_registers_lun_stemmers_in_pipeline_if_language_is_en(self): 52 | idx = lunr("id", ["title", "text"], documents, ["en", "es"]) 53 | assert "stemmer,stemmer-es" in repr(idx.pipeline) 54 | 55 | def test_search_stems_search_terms(self): 56 | idx = lunr("id", ["title", "text"], documents, "es") 57 | results = idx.search("inventando") # stemmed to "invent" 58 | assert len(results) == 2 59 | 60 | def test_search_stems_search_terms_for_both_languages(self): 61 | italian_document = { 62 | "id": "c", 63 | "text": ( 64 | "Secondo uno studio che ho appena inventato perché sono un " 65 | "esperto di lingue parlate in Spagna." 66 | ), 67 | "title": "Lo spagnolo è la terza lingua più parlata al mondo", 68 | } 69 | idx = lunr( 70 | ref="id", 71 | fields=["title", "text"], 72 | documents=(documents + [italian_document]), 73 | languages=["es", "it"], 74 | ) 75 | results = idx.search("spagna") 76 | assert len(results) == 1 77 | 78 | results = idx.search("inventando") 79 | assert len(results) == 2 80 | -------------------------------------------------------------------------------- /docs/languages.md: -------------------------------------------------------------------------------- 1 | # Language support 2 | 3 | Lunr includes optional and experimental support for languages other than English via the [Natural Language Toolkit](http://www.nltk.org/). To install Lunr with this feature use `pip install lunr[languages]`. 4 | 5 | The currently supported languages are: 6 | 7 | - Arabic 8 | - Danish 9 | - Dutch 10 | - English 11 | - Finnish 12 | - French 13 | - German 14 | - Hungarian 15 | - Italian 16 | - Norwegian 17 | - Portuguese 18 | - Romanian 19 | - Russian 20 | - Spanish 21 | - Swedish 22 | 23 | ```python 24 | >>> documents = [ 25 | ... { 26 | ... "id": "a", 27 | ... "text": ( 28 | ... "Este es un ejemplo inventado de lo que sería un documento en el " 29 | ... "idioma que se más se habla en España."), 30 | ... "title": "Ejemplo de documento en español" 31 | ... }, 32 | ... { 33 | ... "id": "b", 34 | ... "text": ( 35 | ... "Según un estudio que me acabo de inventar porque soy un experto en" 36 | ... "idiomas que se hablan en España."), 37 | ... "title": "Español es el tercer idioma más hablado del mundo" 38 | ... }, 39 | ... 
] 40 | ``` 41 | 42 | > New in 0.5.1: the `lunr` function now accepts more than one language 43 | 44 | Simply specify one or more [ISO-639-1 codes](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) for the language(s) of your documents in the `languages` parameter to the `lunr` function. 45 | 46 | !!! Note 47 | In versions of Lunr prior to 0.5.0 the parameter's name was `language` and it accepted a single string. 48 | 49 | If you have a single language you can pass the language code in `languages`: 50 | 51 | ```python 52 | >>> from lunr import lunr 53 | >>> idx = lunr('id', ['title', 'text'], documents, languages='es') 54 | >>> idx.search('inventando') 55 | [{'ref': 'a', 'score': 0.130, 'match_data': }, 56 | {'ref': 'b', 'score': 0.089, 'match_data': }] 57 | ``` 58 | 59 | !!! Note 60 | In order to construct stemmers, trimmers and stop word filters Lunr imports corpus data from NLTK, which fetches data from GitHub and caches it in your home directory under `nltk_data` by default. You may see some logging indicating such activity during the creation of the index. 61 | 62 | If you have documents in multiple languages, pass a list of language codes: 63 | 64 | ```python 65 | >>> documents.append({ 66 | "id": "c", 67 | "text": "Let's say you also have documents written in English", 68 | "title": "A document in English" 69 | }) 70 | >>> idx = lunr('id', ['title', 'text'], documents, languages=['es', 'en']) 71 | >>> idx.search('english') 72 | [{'ref': 'c', 'score': 1.106, 'match_data': }] 73 | ``` 74 | 75 | ## Notes on language support 76 | 77 | - Using multiple languages means the terms will be stemmed once per language. This can yield unexpected results. 78 | - Compatibility with Lunr.js is ensured for languages that are supported by both platforms; however, results might differ slightly.
79 | + Languages supported by Lunr.js but not by Lunr.py: 80 | * Thai 81 | * Japanese 82 | * Turkish 83 | + Languages supported by Lunr.py but not Lunr.js: 84 | * Arabic 85 | - The usage of the language feature is subject to [NTLK corpus licensing clauses](https://github.com/nltk/nltk#redistributing) 86 | -------------------------------------------------------------------------------- /tests/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pytest 4 | 5 | from lunr.tokenizer import Tokenizer 6 | 7 | 8 | class TestTokenizer: 9 | def test_splitting_into_tokens(self): 10 | tokenizer = Tokenizer("foo bar baz") 11 | tokens = [str(token) for token in tokenizer] 12 | 13 | assert tokens == ["foo", "bar", "baz"] 14 | 15 | def test_run_downcases_tokens(self): 16 | tokenizer = Tokenizer("foo BAR BAZ") 17 | tokens = [str(token) for token in tokenizer] 18 | 19 | assert tokens == ["foo", "bar", "baz"] 20 | 21 | def test_array_of_strings(self): 22 | tokenizer = Tokenizer(["foo", "bar", "baz"]) 23 | tokens = [str(token) for token in tokenizer] 24 | 25 | assert tokens == ["foo", "bar", "baz"] 26 | 27 | def test_none_is_converted_to_empty_string(self): 28 | tokenizer = Tokenizer(["foo", None, "baz"]) 29 | tokens = [str(token) for token in tokenizer] 30 | 31 | assert tokens == ["foo", "", "baz"] 32 | 33 | def test_multiple_whitespace_is_stripped(self): 34 | tokenizer = Tokenizer(" foo bar baz ") 35 | tokens = [str(token) for token in tokenizer] 36 | 37 | assert tokens == ["foo", "bar", "baz"] 38 | 39 | def test_handling_null_like_arguments(self): 40 | assert len(Tokenizer(None)) == 0 41 | 42 | def test_converting_a_number_to_tokens(self): 43 | tokens = [str(token) for token in Tokenizer(41)] 44 | assert tokens == ["41"] 45 | 46 | def test_converting_a_boolean_to_tokens(self): 47 | tokens = [str(token) for token in Tokenizer(False)] 48 | assert tokens == ["false"] 49 | 50 | def test_converting_an_object_to_tokens(self): 51 | class Subject: 52 | def __str__(self): 53 | return "custom object" 54 | 55 | tokens = [str(token) for token in Tokenizer(Subject())] 56 | assert tokens == ["custom", "object"] 57 | 58 | def test_splits_strings_with_hyphens(self): 59 | tokens = [str(token) for token in Tokenizer("foo-bar")] 60 | assert tokens == ["foo", "bar"] 61 | 62 | def test_splits_strings_with_hyphens_and_spaces(self): 63 | tokens = [str(token) for token in Tokenizer("foo - bar")] 64 | assert tokens == ["foo", "bar"] 65 | 66 | def test_tracking_the_token_index(self): 67 | tokens = Tokenizer("foo bar") 68 | assert tokens[0].metadata["index"] == 0 69 | assert tokens[1].metadata["index"] == 1 70 | 71 | def test_tracking_the_token_position(self): 72 | tokens = Tokenizer("foo bar") 73 | assert tokens[0].metadata["position"] == [0, 3] 74 | assert tokens[1].metadata["position"] == [4, 3] 75 | 76 | def test_providing_additional_metadata(self): 77 | tokens = Tokenizer("foo bar", {"hurp": "durp"}) 78 | assert tokens[0].metadata["hurp"] == "durp" 79 | assert tokens[1].metadata["hurp"] == "durp" 80 | 81 | @pytest.mark.parametrize("separator", [re.compile(r"[_\-]+"), lambda c: c in "_-"]) 82 | def test_providing_separator(self, separator): 83 | tokens = [str(token) for token in Tokenizer("foo_bar-baz", separator=separator)] 84 | assert tokens == ["foo", "bar", "baz"] 85 | 86 | def test_tracking_token_position_with_left_hand_whitespace(self): 87 | tokens = Tokenizer(" foo bar") 88 | assert tokens[0].metadata["position"] == [1, 3] 89 | assert 
tokens[1].metadata["position"] == [5, 3] 90 | 91 | def test_tracking_token_position_with_right_hand_whitespace(self): 92 | tokens = Tokenizer("foo bar ") 93 | assert tokens[0].metadata["position"] == [0, 3] 94 | assert tokens[1].metadata["position"] == [4, 3] 95 | -------------------------------------------------------------------------------- /lunr/languages/__init__.py: -------------------------------------------------------------------------------- 1 | from itertools import chain 2 | from functools import partial 3 | 4 | import lunr 5 | from lunr.builder import Builder 6 | from lunr.languages.trimmer import generate_trimmer 7 | from lunr.languages.stemmer import nltk_stemmer, get_language_stemmer 8 | from lunr.pipeline import Pipeline 9 | from lunr.stop_word_filter import stop_word_filter, generate_stop_word_filter 10 | 11 | # map from ISO-639-1 codes to SnowballStemmer.languages 12 | # Languages not supported by nltk but by lunr.js: thai, japanese and turkish 13 | # Languages upported by nltk but not lunr.js: arabic 14 | 15 | SUPPORTED_LANGUAGES = { 16 | "ar": "arabic", 17 | "da": "danish", 18 | "nl": "dutch", 19 | "en": "english", 20 | "fi": "finnish", 21 | "fr": "french", 22 | "de": "german", 23 | "hu": "hungarian", 24 | "it": "italian", 25 | "no": "norwegian", 26 | "pt": "portuguese", 27 | "ro": "romanian", 28 | "ru": "russian", 29 | "es": "spanish", 30 | "sv": "swedish", 31 | } 32 | 33 | try: # pragma: no cover 34 | import nltk # type: ignore 35 | 36 | LANGUAGE_SUPPORT = True 37 | except ImportError: # pragma: no cover 38 | LANGUAGE_SUPPORT = False 39 | 40 | 41 | def _get_stopwords_and_word_characters(language): 42 | nltk.download("stopwords") 43 | verbose_language = SUPPORTED_LANGUAGES[language] 44 | stopwords = nltk.corpus.stopwords.words(verbose_language) 45 | # TODO: search for a more exhaustive list of word characters 46 | word_characters = {c for word in stopwords for c in word} 47 | return stopwords, word_characters 48 | 49 | 50 | def get_nltk_builder(languages): 51 | """Returns a builder with stemmers for all languages added to it. 52 | 53 | Args: 54 | languages (list): A list of supported languages. 
55 | """ 56 | all_stemmers = [] 57 | all_stopwords_filters = [] 58 | all_word_characters = set() 59 | 60 | for language in languages: 61 | if language == "en": 62 | # use Lunr's defaults 63 | all_stemmers.append(lunr.stemmer.stemmer) 64 | all_stopwords_filters.append(stop_word_filter) 65 | all_word_characters.update({r"\w"}) 66 | else: 67 | stopwords, word_characters = _get_stopwords_and_word_characters(language) 68 | all_stemmers.append( 69 | Pipeline.registered_functions["stemmer-{}".format(language)] 70 | ) 71 | all_stopwords_filters.append( 72 | generate_stop_word_filter(stopwords, language=language) 73 | ) 74 | all_word_characters.update(word_characters) 75 | 76 | builder = Builder() 77 | multi_trimmer = generate_trimmer("".join(sorted(all_word_characters))) 78 | Pipeline.register_function( 79 | multi_trimmer, "lunr-multi-trimmer-{}".format("-".join(languages)) 80 | ) 81 | builder.pipeline.reset() 82 | 83 | for fn in chain([multi_trimmer], all_stopwords_filters, all_stemmers): 84 | builder.pipeline.add(fn) 85 | for fn in all_stemmers: 86 | builder.search_pipeline.add(fn) 87 | 88 | return builder 89 | 90 | 91 | def register_languages(): 92 | """Register all supported languages to ensure compatibility.""" 93 | for language in set(SUPPORTED_LANGUAGES) - {"en"}: 94 | language_stemmer = partial(nltk_stemmer, get_language_stemmer(language)) 95 | Pipeline.register_function(language_stemmer, "stemmer-{}".format(language)) 96 | 97 | 98 | if LANGUAGE_SUPPORT: # pragma: no cover 99 | # TODO: registering all possible stemmers feels unnecessary but it solves 100 | # deserializing with arbitrary language functions. Ideally the schema would 101 | # provide the language(s) for the index and we could register the stemmers 102 | # as needed 103 | register_languages() 104 | -------------------------------------------------------------------------------- /docs/lunrjs-interop.md: -------------------------------------------------------------------------------- 1 | # Interoperability with Lunr.js 2 | 3 | A key goal of Lunr.py is interoperability with Lunr.js: building an index with 4 | Lunr.py and being able to read it using Lunr.js without having to build it 5 | on the client on each visit. 6 | 7 | The key step in this process is index serialization, which is possible thanks 8 | to [`lunr-schema`](https://github.com/olivernn/lunr-schema). 9 | 10 | The serialization process in Lunr.py consist on calling `Index.serialize`, 11 | here is a complete example with the data from the [introduction](index.md): 12 | 13 | ```python 14 | >>> import json 15 | >>> from lunr import lunr 16 | >>> documents = [{ 17 | ...: 'id': 'a', 18 | ...: 'title': 'Mr. Green kills Colonel Mustard', 19 | ...: 'body': """Mr. Green killed Colonel Mustard in the study with the 20 | ...: candlestick. Mr. 
Green is not a very nice fellow.""" 21 | ...: }, { 22 | ...: 'id': 'b', 23 | ...: 'title': 'Plumb waters plant', 24 | ...: 'body': 'Professor Plumb has a green and a yellow plant in his study', 25 | ...: }, { 26 | ...: 'id': 'c', 27 | ...: 'title': 'Scarlett helps Professor', 28 | ...: 'body': """Miss Scarlett watered Professor Plumbs green plant 29 | ...: while he was away on his murdering holiday.""", 30 | ...: }] 31 | >>> idx = lunr( 32 | ...: ref='id', 33 | ...: fields=[dict(field_name='title', boost=10), 'body'], 34 | ...: documents=documents 35 | ...: ) 36 | >>> serialized_idx = idx.serialize() 37 | >>> with open('idx.json', 'w') as fd: 38 | ...: json.dump(serialized_idx, fd) 39 | ``` 40 | 41 | As you can see `serialize` will produce a JSON friendly dict you can write to 42 | disk and read from Lunr.js. The following snippet shows how to read the index 43 | using Node.js: 44 | 45 | ```javascript 46 | > const fs = require('fs') 47 | > const lunr = require('lunr') 48 | > const serializedIndex = JSON.parse(fs.readFileSync('idx.json')) 49 | > let idx = lunr.Index.load(serializedIndex) 50 | > idx.search('plant') 51 | [ 52 | { 53 | ref: 'b', 54 | score: 1.599, 55 | matchData: { metadata: [Object: null prototype] } 56 | }, 57 | { 58 | ref: 'c', 59 | score: 0.13, 60 | matchData: { metadata: [Object: null prototype] } 61 | } 62 | ] 63 | ``` 64 | 65 | !!! Note 66 | The search will only the _references_ of the matching documents. 67 | It is up to you to keep mapping of the documents in memory to be able show richer 68 | results which means in a web environment you will need to serve _two_ files, 69 | one for the index and another the collection of documents. 70 | 71 | ## Loading a serialized index 72 | 73 | You can also do the reverse operation of reading a serialized index produced 74 | by Lunr.py or Lunr.js using the `Index.load` class method: 75 | 76 | ```python 77 | >>> import json 78 | >>> from lunr.index import Index 79 | >>> with open("idx.json") as fd: 80 | ... serialized_idx = json.loads(fd.read()) 81 | ... 82 | >>> idx = Index.load(serialized_idx) 83 | >>> idx.search("plant") 84 | [{'ref': 'b', 'score': 1.599, 'match_data': }, {'ref': 'c', 'score': 0.13, 'match_data': }] 85 | ``` 86 | 87 | ## Language support 88 | 89 | Lunr.js uses the 90 | [`lunr-languages`](https://lunrjs.com/guides/language_support.html) package, 91 | a community driven collection of stemmers and trimmers for many languages. 92 | 93 | Porting each of those into Python was not feasible so Lunr.py uses [NTLK](https://www.nltk.org/) 94 | for language support and will configure the serialized index as expected by Lunr.js 95 | to ensure compatibility. 96 | 97 | However, this produces differences in scoring when loading indices from Lunr.py 98 | into Lunr.js larger than those observed using the base english implementation, 99 | due to inherent differences in the implementation of said stemmers and trimmers. 
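To make the workflow concrete, here is a minimal sketch of exporting a language-aware index for Lunr.js; the documents and file name are illustrative, and the scoring caveat above still applies:

```python
import json
from lunr import lunr

documents = [
    {"id": "a", "title": "Episodio IV", "text": "La Princesa Leia vuela hacia su patria."},
    {"id": "b", "title": "Episodio V", "text": "Las tropas imperiales persiguen a los rebeldes."},
]

# Build a Spanish index; serialize() produces a lunr-schema dict that
# lunr.Index.load() on the JavaScript side can consume.
idx = lunr(ref="id", fields=("title", "text"), documents=documents, languages="es")

with open("idx_es.json", "w") as fd:
    json.dump(idx.serialize(), fd)
```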
100 | -------------------------------------------------------------------------------- /tests/test_query.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from lunr.query import Query, QueryPresence 4 | from lunr.token import Token 5 | from lunr.tokenizer import Tokenizer 6 | 7 | 8 | class BaseQuerySuite: 9 | ALL_FIELDS = ["title", "body"] 10 | 11 | def setup_method(self, method): 12 | self.query = Query(self.ALL_FIELDS) 13 | 14 | 15 | class TestQueryTerm(BaseQuerySuite): 16 | def test_single_string_term_adds_single_clause(self): 17 | self.query.clause(term="foo") 18 | 19 | assert len(self.query.clauses) == 1 20 | assert self.query.clauses[0].term == "foo" 21 | assert repr(self.query) == '' 22 | assert repr(self.query.clauses[0]) == '' 23 | 24 | def test_single_token_term_adds_single_clause(self): 25 | self.query.term(Token("foo")) 26 | 27 | assert len(self.query.clauses) == 1 28 | assert self.query.clauses[0].term == "foo" 29 | 30 | def test_multiple_string_terms_adds_multiple_clauses(self): 31 | self.query.term(["foo", "bar"]) 32 | 33 | assert len(self.query.clauses) == 2 34 | assert self.query.clauses[0].term == "foo" 35 | assert self.query.clauses[1].term == "bar" 36 | assert repr(self.query) == ('') 37 | 38 | def test_multiple_token_terms_adds_multiple_clauses(self): 39 | self.query.term(Tokenizer("foo bar")) 40 | 41 | assert len(self.query.clauses) == 2 42 | assert self.query.clauses[0].term == "foo" 43 | assert self.query.clauses[1].term == "bar" 44 | 45 | def test_multiple_string_terms_with_options(self): 46 | self.query.term(["foo", "bar"], use_pipeline=False) 47 | 48 | assert len(self.query.clauses) == 2 49 | assert self.query.clauses[0].term == "foo" 50 | assert self.query.clauses[1].term == "bar" 51 | 52 | 53 | class TestQueryClause(BaseQuerySuite): 54 | def test_clause_defaults(self): 55 | self.query.clause(term="foo") 56 | self.clause = self.query.clauses[0] 57 | 58 | assert self.clause.fields == self.ALL_FIELDS 59 | assert self.clause.boost == 1 60 | assert self.clause.use_pipeline is True 61 | 62 | def test_clause_specified(self): 63 | self.query.clause(term="foo", boost=10, fields=["title"], use_pipeline=False) 64 | self.clause = self.query.clauses[0] 65 | 66 | assert self.clause.fields == ["title"] 67 | assert self.clause.boost == 10 68 | assert self.clause.use_pipeline is False 69 | 70 | @pytest.mark.parametrize( 71 | "wildcard, expected_term", 72 | [ 73 | (Query.WILDCARD_NONE, "foo"), 74 | (Query.WILDCARD_LEADING, "*foo"), 75 | (Query.WILDCARD_TRAILING, "foo*"), 76 | (Query.WILDCARD_LEADING | Query.WILDCARD_TRAILING, "*foo*"), 77 | ], 78 | ) 79 | def test_clause_wildcard(self, wildcard, expected_term): 80 | self.query.clause(term="foo", wildcard=wildcard) 81 | self.clause = self.query.clauses[0] 82 | 83 | assert self.clause.term == expected_term 84 | 85 | def test_clause_wildcard_existing(self): 86 | self.query.clause( 87 | term="*foo*", wildcard=Query.WILDCARD_LEADING | Query.WILDCARD_TRAILING 88 | ) 89 | self.clause = self.query.clauses[0] 90 | 91 | assert self.clause.term == "*foo*" 92 | 93 | 94 | class TestQueryIsNegated(BaseQuerySuite): 95 | def test_all_prohibited(self): 96 | self.query.term("foo", presence=QueryPresence.PROHIBITED) 97 | self.query.term("bar", presence=QueryPresence.PROHIBITED) 98 | 99 | assert self.query.is_negated() is True 100 | 101 | def test_some_prohibited(self): 102 | self.query.term("foo", presence=QueryPresence.PROHIBITED) 103 | self.query.term("bar", 
presence=QueryPresence.REQUIRED) 104 | 105 | assert self.query.is_negated() is False 106 | 107 | def test_nome_prohibited(self): 108 | self.query.term("foo", presence=QueryPresence.OPTIONAL) 109 | self.query.term("bar", presence=QueryPresence.REQUIRED) 110 | 111 | assert self.query.is_negated() is False 112 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 0.6.2 (2022-02-27) 4 | 5 | - Add `Pipeline.skip` method to skip pipeline functions on specific fields 6 | @tristanlatr 7 | 8 | ### Deprecation warning 9 | 10 | - 0.6.2 will be the last release to support Python 3.6. 11 | 12 | ## 0.6.1 (2021-10-16) 13 | 14 | - Add support for Python 3.10. 15 | - Remove pin to NLTK < 3.5. 16 | 17 | ## 0.6.0 (2021-04-22) 18 | 19 | - Add index customisation, enabling build and search pipeline tweaks as well as 20 | meta-data whitelisting. 21 | 22 | ## 0.5.9 (2021-01-10) 23 | 24 | - Compatibility with Lunr.js 2.3.9: 25 | - Fix bug where clause matches are incorrectly initialized to a complete set. 26 | - Add support for Python 3.9 27 | - Drop support for Python 3.5 28 | 29 | ## 0.5.8 (2020-04-16) 30 | 31 | - Fix installing ntlk in 2.7 without `languages` extra. 32 | - Optimize regexes and avoid usage by default. 33 | 34 | ### Deprecation warning 35 | 36 | - 0.5.8 will be the last release to support Python 2.7. 37 | 38 | ## 0.5.7 (2020-04-14) 39 | 40 | - Prevent installing an unsupported version of NLTK in Python 2.7. 41 | 42 | ## 0.5.6 (2019-11-17) 43 | 44 | - Support for Python 3.8 45 | - Compatibility with Lunr.js 2.3.8: 46 | - Fix bug where leading white space would cause token position metadata to be reported incorrectly. 47 | 48 | ## 0.5.5 (2019-04-28) 49 | 50 | - Compatibility with Lunr.js 2.3.6: 51 | - Fix bug with fuzzy matching that meant deletions at the end of a word would not match. 52 | 53 | ## 0.5.4 (2018-11-10) 54 | 55 | - Compatibility with Lunr.js 2.3.5: 56 | - Fix bug on fuzzy matching ignoring matches on insertions at the end of the word. 57 | 58 | ## 0.5.3 (2018-09-08) 59 | 60 | - Performance improvements on indexing 61 | - Compatibility with Lunr.js 2.3.3: 62 | - Fixes catastrophic backtracking on leading wildcards 63 | 64 | ## 0.5.2 (2018-08-25) 65 | 66 | - Fix Python 2.7 support 67 | 68 | ## 0.5.1 (2018-08-25) 69 | 70 | - Added multilanguage support 71 | - Improved language support 72 | 73 | ### Deprecation warning 74 | 75 | - The `language` argument to the `lunr` has been renamed to `languages` to accomodate for multilanguage support. The `languages` argument accepts a string or an iterable of ISO-639-1 languages codes. If you're calling `lunr` with keyword arguments please update such calls accordingly. 76 | 77 | ## 0.4.3 (2018-08-18) 78 | 79 | - Target Lunr.js v2.3.2 80 | 81 | ## 0.4.2 (2018-07-28) 82 | 83 | - Target Lunr.js v2.3.1 84 | - Fix crash when using non-string document references. 85 | 86 | ## 0.4.1 (2018-07-07) 87 | 88 | - Added support for Python 3.7 89 | 90 | ## 0.4.0 (2018-06-25) 91 | 92 | - Compatibility with Lunr.js v2.3.0. Including: 93 | + Add support for build time field and document boosts. 94 | + Add support for indexing nested document fields using field extractors. 95 | + Prevent usage of problematic characters in field names 96 | 97 | ## 0.3.0 (2018-06-03) 98 | 99 | - Compatibility with Lunr.js v2.2.1. Including: 100 | + Add support for queries with term presence, e.g. required terms and prohibited terms. 
101 | + Add support for using the output of `lunr.Tokenizer` directly with `lunr.Query.term`. 102 | + Add field name metadata to tokens in build and search pipelines. 103 | 104 | ## 0.2.3 (2018-05-19) 105 | 106 | - Compatibility with Lunr.js v2.1.6 107 | 108 | ## 0.2.2 (2018-05-15) 109 | 110 | - Fix bug on whitelisting metadata in Builder. 111 | 112 | ## 0.2.1 (2018-04-21) 113 | 114 | - Refactor of multilanguage support. 115 | 116 | ## 0.2.0 (2018-04-15) 117 | 118 | - Experimental support for languages via NLTK, currently supported languages are arabic, danish, dutch, english, finnish, french, german, hungarian, italian, norwegian, portuguese, romanian, russian, spanish and swedish. Note compatibility with Lunr.js and lunr-languages is reduced. 119 | 120 | ## 0.1.2 (2018-03-17) 121 | 122 | - Add serialization tests passing serialized index from Python to JS and producing same results. 123 | - Added `Index.create_query` returning a preinitialized `Query` with the index's fields or a subset of them. 124 | - `Index.search` does not accept a callback function, instead expects a `Query` object the user should preconfigure first. 125 | - Various docstring and repr changes. 126 | 127 | ## 0.1.1a1 (2018-03-01) 128 | 129 | - Initial release -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | hide-toc: true 3 | --- 4 | 5 | # Lunr.py 🌖 6 | 7 | A Python implementation of [Lunr.js](https://lunrjs.com) by [Oliver Nightingale](https://github.com/olivernn). 8 | 9 | > A bit like Solr, but much smaller and not as bright. 10 | 11 | This Python version of Lunr.js aims to bring the simple and powerful full text search 12 | capabilities into Python guaranteeing results as close as the original 13 | implementation as possible. 14 | 15 | ## What does this even do? 16 | 17 | Lunr is a simple full text search solution for situations where deploying a full 18 | scale solution like Elasticsearch isn't possible, viable or you're simply prototyping. 19 | Lunr parses a set of documents and creates an inverted index for quick full text 20 | searches in the same way other more complicated solution. 21 | 22 | The trade-off is that Lunr keeps the inverted index in memory and requires you 23 | to recreate or read the index at the start of your application. 24 | 25 | ## Interoperability with Lunr.js 26 | 27 | A core objective of Lunr.py is to [provide interoperability with the JavaScript 28 | version](lunrjs-interop.md). 29 | 30 | An example can be found in the [MkDocs documentation library](http://www.mkdocs.org/). 31 | MkDocs produces a set of documents from the pages of the documentation and uses 32 | [Lunr.js](https://lunrjs.com) in the frontend to power its built-in searching 33 | engine. This set of documents is in the form of a JSON file which needs to be 34 | fetched and parsed by Lunr.js to create the inverted index at startup of your application. 35 | 36 | While this is not a problem for most sites, depending on the size of your document 37 | set, this can take some time. 38 | 39 | Lunr.py provides a backend solution, allowing you to parse the documents in Python 40 | of time and create a serialized Lunr.js index you can pass have the browser 41 | version read, minimizing start up time of your application. 
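A rough sketch of that pre-building workflow follows; the file names and the exact shape of the MkDocs search index (a `docs` list whose entries carry a unique `location` plus `title` and `text` fields) are assumptions for illustration:

```python
import json
from lunr import lunr

# Read the documents MkDocs generated for its client-side search (assumed layout).
with open("search_index.json") as fd:
    documents = json.load(fd)["docs"]

# Index ahead of time and ship the serialized index alongside the documents;
# the browser then only needs lunr.Index.load() instead of rebuilding the index.
idx = lunr(ref="location", fields=("title", "text"), documents=documents)

with open("prebuilt_index.json", "w") as fd:
    json.dump(idx.serialize(), fd)
```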
42 | 43 | Each version of lunr.py 44 | [targets a specific version of lunr.js](https://github.com/yeraydiazdiaz/lunr.py/blob/master/lunr/__init__.py#L12) 45 | and produces the same results as it both in Python 2.7 and 3 for 46 | [non-trivial corpus of documents](https://github.com/yeraydiazdiaz/lunr.py/blob/master/tests/acceptance_tests/fixtures/mkdocs_index.json). 47 | 48 | Lunr.py also serializes `Index` instances respecting the 49 | [`lunr-schema`](https://github.com/olivernn/lunr-schema) which are consumable by 50 | Lunr.js and viceversa. 51 | 52 | ## Installation 53 | 54 | `pip install lunr` 55 | 56 | An optional and experimental support for other languages thanks to the 57 | [Natural Language Toolkit](http://www.nltk.org/) stemmers is also available via 58 | `pip install lunr[languages]`. The usage of the language feature is subject to 59 | [NTLK corpus licensing clauses](https://github.com/nltk/nltk#redistributing). 60 | 61 | Please refer to the 62 | [documentation page on languages](https://lunr.readthedocs.io/en/latest/languages/) 63 | for more information. 64 | 65 | ## Usage 66 | 67 | First, you'll need a list of dicts representing the documents you want to search on. 68 | These documents must have a unique field which will serve as a reference and a 69 | series of fields you'd like to search on. 70 | 71 | Lunr provides a convenience `lunr` function to quickly index this set of documents: 72 | 73 | ```python 74 | >>> from lunr import lunr 75 | >>> 76 | >>> documents = [{ 77 | ... 'id': 'a', 78 | ... 'title': 'Mr. Green kills Colonel Mustard', 79 | ... 'body': 'Mr. Green killed Colonel Mustard in the study with the candlestick.', 80 | ... }, { 81 | ... 'id': 'b', 82 | ... 'title': 'Plumb waters plant', 83 | ... 'body': 'Professor Plumb has a green plant in his study', 84 | ... }] 85 | >>> idx = lunr( 86 | ... ref='id', fields=('title', 'body'), documents=documents 87 | ... ) 88 | >>> idx.search('kill') 89 | [{'ref': 'a', 'score': 0.6931722372559913, 'match_data': }] 90 | >>> idx.search('study') 91 | [{'ref': 'b', 'score': 0.23576799568081389, 'match_data': }, {'ref': 'a', 'score': 0.2236629211724517, 'match_data': }] 92 | ``` 93 | 94 | ```{toctree} 95 | :hidden: 96 | usage 97 | indices 98 | languages 99 | lunrjs-interop 100 | changelog 101 | customisation 102 | GitHub Repository 103 | ``` 104 | -------------------------------------------------------------------------------- /lunr/query_lexer.py: -------------------------------------------------------------------------------- 1 | from lunr.tokenizer import default_separator 2 | 3 | 4 | class QueryLexer: 5 | # TODO: use iteration protocol? 
6 | EOS = "EOS" 7 | FIELD = "FIELD" 8 | TERM = "TERM" 9 | EDIT_DISTANCE = "EDIT_DISTANCE" 10 | BOOST = "BOOST" 11 | PRESENCE = "PRESENCE" 12 | 13 | def __init__(self, string): 14 | self.lexemes = [] 15 | self.string = string 16 | self.length = len(string) 17 | self.pos = 0 18 | self.start = 0 19 | self.escape_char_positions = [] 20 | 21 | @property 22 | def width(self): 23 | return self.pos - self.start 24 | 25 | def ignore(self): 26 | if self.start == self.pos: 27 | self.pos += 1 28 | 29 | self.start = self.pos 30 | 31 | def backup(self): 32 | self.pos -= 1 33 | 34 | def accept_digit_run(self): 35 | char = self.next() 36 | while char != self.EOS and (47 < ord(char) < 58): 37 | char = self.next() 38 | 39 | if char != self.EOS: 40 | self.backup() 41 | 42 | def run(self): 43 | state = self.lex_text() 44 | while state: 45 | state = state() 46 | 47 | def slice_string(self): 48 | subslices = [] 49 | slice_start = self.start 50 | 51 | for escape_char_position in self.escape_char_positions: 52 | subslices.append(self.string[slice_start:escape_char_position]) 53 | slice_start = escape_char_position + 1 54 | 55 | subslices.append(self.string[slice_start : self.pos]) 56 | self.escape_char_positions = [] 57 | 58 | return "".join(subslices) 59 | 60 | def next(self): 61 | if self.pos >= self.length: 62 | return self.EOS 63 | 64 | char = self.string[self.pos] 65 | self.pos += 1 66 | return char 67 | 68 | def emit(self, type_): 69 | self.lexemes.append( 70 | { 71 | "type": type_, 72 | "string": self.slice_string(), 73 | "start": self.start, 74 | "end": self.pos, 75 | } 76 | ) 77 | self.start = self.pos 78 | 79 | def escape_character(self): 80 | self.escape_char_positions.append(self.pos - 1) 81 | self.pos += 1 82 | 83 | def lex_field(self): 84 | self.backup() 85 | self.emit(self.FIELD) 86 | self.ignore() 87 | return self.lex_text 88 | 89 | def lex_term(self): 90 | if self.width > 1: 91 | self.backup() 92 | self.emit(self.TERM) 93 | 94 | self.ignore() 95 | 96 | return self.lex_text 97 | 98 | def lex_edit_distance(self): 99 | self.ignore() 100 | self.accept_digit_run() 101 | self.emit(self.EDIT_DISTANCE) 102 | return self.lex_text 103 | 104 | def lex_boost(self): 105 | self.ignore() 106 | self.accept_digit_run() 107 | self.emit(self.BOOST) 108 | return self.lex_text 109 | 110 | def lex_EOS(self): 111 | if self.width > 0: 112 | self.emit(self.TERM) 113 | 114 | def lex_text(self): 115 | while True: 116 | char = self.next() 117 | if char == self.EOS: 118 | return self.lex_EOS 119 | 120 | if ord(char) == 92: # Escape character is '\' 121 | self.escape_character() 122 | continue 123 | 124 | if char == ":": 125 | return self.lex_field 126 | 127 | if char == "~": 128 | self.backup() 129 | if self.width > 0: 130 | self.emit(self.TERM) 131 | 132 | return self.lex_edit_distance 133 | 134 | if char == "^": 135 | self.backup() 136 | if self.width > 0: 137 | self.emit(self.TERM) 138 | 139 | return self.lex_boost 140 | 141 | # '+' indicates term presence is required, check for length to 142 | # ensure only a leading '+' is considered 143 | if char == "+" and self.width == 1: 144 | self.emit(self.PRESENCE) 145 | return self.lex_text 146 | 147 | # '-' indicates term presence is prohibited 148 | if char == "-" and self.width == 1: 149 | self.emit(self.PRESENCE) 150 | return self.lex_text 151 | 152 | if default_separator(char): 153 | return self.lex_term 154 | -------------------------------------------------------------------------------- /docs/customisation.md: 
-------------------------------------------------------------------------------- 1 | # Customisation 2 | 3 | Lunr.py ships with some sensible defaults to create indexes and search easily, 4 | but in some cases you may want to tweak how documents are indexed and searched. 5 | You can do that in lunr.py by passing your own `Builder` instance to the `lunr` 6 | function. 7 | 8 | ## Pipeline functions 9 | 10 | When the builder processes your documents it splits (tokenises) the text, and 11 | applies a series of functions to each token. These are called pipeline functions. 12 | 13 | The builder includes two pipelines, indexing and searching. 14 | 15 | If you want to change the way lunr.py indexes the documents you'll need to 16 | change the indexing pipeline. 17 | 18 | For example, say you wanted to support both the American and British spellings of 19 | certain words, you could use a normalisation pipeline function to force one 20 | token into the other: 21 | 22 | ```python 23 | from lunr import lunr, get_default_builder 24 | from lunr.pipeline import Pipeline 25 | 26 | documents = [...] 27 | 28 | builder = get_default_builder() 29 | def normalise_spelling(token, i, tokens): 30 | if str(token) == "gray": 31 | return token.update(lambda *_: "grey") 32 | else: 33 | return token 34 | 35 | Pipeline.register_function(normalise_spelling) 36 | builder.pipeline.add(normalise_spelling) 37 | 38 | idx = lunr(ref="id", fields=("title", "body"), documents=documents, builder=builder) 39 | ``` 40 | 41 | Note that pipeline functions take the token being processed, its position in the 42 | token list, and the token list itself. 43 | 44 | ## Skip a pipeline function for specific field names 45 | 46 | The `Pipeline.skip()` method allows you to skip a pipeline function for specific field names. 47 | This example skips the `stop_word_filter` pipeline function for the field `fullName`. 48 | 49 | ```python 50 | from lunr import lunr, get_default_builder, stop_word_filter 51 | 52 | documents = [...] 53 | 54 | builder = get_default_builder() 55 | 56 | builder.pipeline.skip(stop_word_filter.stop_word_filter, ["fullName"]) 57 | 58 | idx = lunr(ref="id", fields=("fullName", "body"), documents=documents, builder=builder) 59 | ``` 60 | 61 | ## Token meta-data 62 | 63 | Lunr.py `Token` instances include meta-data which can be used in 64 | pipeline functions. This meta-data is not stored in the index by default, but it 65 | can be by adding it to the builder's `metadata_whitelist` property. This will 66 | include the meta-data in the search results: 67 | 68 | ```python 69 | from lunr import lunr, get_default_builder 70 | from lunr.pipeline import Pipeline 71 | 72 | builder = get_default_builder() 73 | 74 | def token_length(token, i, tokens): 75 | token.metadata["token_length"] = len(str(token)) 76 | return token 77 | 78 | Pipeline.register_function(token_length) 79 | builder.pipeline.add(token_length) 80 | builder.metadata_whitelist.append("token_length") 81 | 82 | idx = lunr("id", ("title", "body"), documents, builder=builder) 83 | 84 | [result, _, _] = idx.search("green") 85 | assert result["match_data"].metadata["green"]["title"]["token_length"] == [5] 86 | assert result["match_data"].metadata["green"]["body"]["token_length"] == [5, 5] 87 | ``` 88 | 89 | ## Similarity tuning 90 | 91 | The algorithm used by Lunr to calculate similarity between a query and a document 92 | can be tuned using two parameters.
Lunr ships with sensible defaults, and these 93 | can be adjusted to provide the best results for a given collection of documents. 94 | 95 | - **b**: This parameter controls the importance given to the length of a 96 | document and its fields. This value must be between 0 and 1, and by default it 97 | has a value of 0.75. Reducing this value reduces the effect of different length 98 | documents on a term’s importance to that document. 99 | - **k1**: This controls how quickly the boost given by a common word reaches 100 | saturation. Increasing it will slow down the rate of saturation and lower values 101 | result in quicker saturation. The default value is 1.2. If the collection of 102 | documents being indexed have high occurrences of words that are not covered by 103 | a stop word filter, these words can quickly dominate any similarity calculation. 104 | In these cases, this value can be reduced to get more balanced results. 105 | 106 | These values can be changed in the builder: 107 | 108 | ```python 109 | from lunr import lunr, get_default_builder 110 | 111 | builder = get_default_builder() 112 | builder.k1(1.3) 113 | builder.b(0) 114 | 115 | idx = lunr("id", ("title", "body"), documents, builder=builder) 116 | ``` 117 | 118 | -------------------------------------------------------------------------------- /tests/test_vector.py: -------------------------------------------------------------------------------- 1 | from math import sqrt 2 | 3 | import pytest 4 | 5 | from lunr.vector import Vector 6 | from lunr.exceptions import BaseLunrException 7 | 8 | 9 | def _vector_from_args(*args): 10 | vector = Vector() 11 | for i, arg in enumerate(args): 12 | vector.insert(i, arg) 13 | return vector 14 | 15 | 16 | def test_vector_repr(): 17 | vector = _vector_from_args(1, 3, -5) 18 | assert repr(vector) == "".format(vector.magnitude) 19 | 20 | 21 | class TestVectorPositionForIndex: 22 | 23 | vector = Vector([1, "a", 2, "b", 4, "c", 7, "d", 11, "e"]) 24 | 25 | def test_position_for_index_at_the_beggining(self): 26 | assert self.vector.position_for_index(0) == 0 27 | 28 | def test_position_for_index_at_the_end(self): 29 | assert self.vector.position_for_index(20) == 10 30 | 31 | def test_position_for_index_consecutive(self): 32 | assert self.vector.position_for_index(3) == 4 33 | 34 | def test_position_for_index_non_consecutive_gap_after(self): 35 | assert self.vector.position_for_index(5) == 6 36 | 37 | def test_position_for_index_non_consecutive_gap_before(self): 38 | assert self.vector.position_for_index(6) == 6 39 | 40 | def test_position_for_index_non_consecutive_gap_before_and_after(self): 41 | assert self.vector.position_for_index(9) == 8 42 | 43 | def test_position_for_index_duplicate_at_the_beggining(self): 44 | assert self.vector.position_for_index(1) == 0 45 | 46 | def test_position_for_index_duplicate_at_the_end(self): 47 | assert self.vector.position_for_index(11) == 8 48 | 49 | def test_position_for_index_duplicate_consecutive(self): 50 | assert self.vector.position_for_index(4) == 4 51 | 52 | 53 | def test_magnitude_calculates_magnitude(): 54 | vector = _vector_from_args(4, 5, 6) 55 | assert sqrt(77) == vector.magnitude 56 | 57 | 58 | def test_dot_calculates_dot_product_of_two_vectors(): 59 | v1 = _vector_from_args(1, 3, -5) 60 | v2 = _vector_from_args(4, -2, -1) 61 | 62 | assert v1.dot(v2) == 3 63 | 64 | 65 | class TestSimilarity: 66 | def test_similarity_calculates_the_similarity_between_two_vectors(self): 67 | v1 = _vector_from_args(1, 3, -5) 68 | v2 = _vector_from_args(4, -2, -1) 
69 | 70 | assert v1.similarity(v2) == pytest.approx(0.5, 0.1) 71 | 72 | def test_empty_vector(self): 73 | v_empty = Vector() 74 | v1 = _vector_from_args(1) 75 | 76 | assert v1.similarity(v_empty) == 0 77 | assert v_empty.similarity(v1) == 0 78 | 79 | def test_non_overlapping_vector(self): 80 | v1 = Vector([1, 1]) 81 | v2 = Vector([2, 1]) 82 | 83 | assert v1.similarity(v2) == 0 84 | assert v2.similarity(v1) == 0 85 | 86 | 87 | class TestVectorInsert: 88 | def test_insert_invalidates_magnitude_cache(self): 89 | vector = _vector_from_args(4, 5, 6) 90 | assert sqrt(77) == vector.magnitude 91 | 92 | vector.insert(3, 7) 93 | 94 | assert sqrt(126) == vector.magnitude 95 | 96 | def test_insert_keeps_items_in_index_specified_order(self): 97 | vector = Vector() 98 | 99 | vector.insert(2, 4) 100 | vector.insert(1, 5) 101 | vector.insert(0, 6) 102 | 103 | assert vector.to_list() == [6, 5, 4] 104 | 105 | def test_insert_fails_when_duplicate_entry(self): 106 | vector = _vector_from_args(4, 5, 6) 107 | with pytest.raises(BaseLunrException): 108 | vector.insert(0, 44) 109 | 110 | 111 | class TestVectorUpsert: 112 | def test_upsert_invalidates_magnitude_cache(self): 113 | vector = _vector_from_args(4, 5, 6) 114 | assert vector.magnitude == sqrt(77) 115 | 116 | vector.upsert(3, 7) 117 | 118 | assert vector.magnitude == sqrt(126) 119 | 120 | def test_upsert_keeps_items_in_index_specified_order(self): 121 | vector = Vector() 122 | 123 | vector.upsert(2, 4) 124 | vector.upsert(1, 5) 125 | vector.upsert(0, 6) 126 | 127 | assert vector.to_list() == [6, 5, 4] 128 | 129 | def test_upsert_calls_fn_for_value_on_duplicate(self): 130 | vector = _vector_from_args(4, 5, 6) 131 | 132 | vector.upsert(0, 4, lambda current, passed: current + passed) 133 | 134 | assert vector.to_list() == [8, 5, 6] 135 | 136 | def test_upsert_defaults_to_passed_value_on_duplicate(self): 137 | vector = _vector_from_args(4, 5, 6) 138 | 139 | vector.upsert(0, 3) 140 | 141 | assert vector.to_list() == [3, 5, 6] 142 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://github.com/yeraydiazdiaz/lunr.py/workflows/CI/badge.svg?branch=master)](https://github.com/yeraydiazdiaz/lunr.py/actions?workflow=CI) 2 | [![codecov](https://codecov.io/gh/yeraydiazdiaz/lunr.py/branch/master/graph/badge.svg)](https://codecov.io/gh/yeraydiazdiaz/lunr.py) 3 | [![Supported Python Versions](https://img.shields.io/pypi/pyversions/lunr.svg)](https://pypi.org/project/lunr/) 4 | [![PyPI](https://img.shields.io/pypi/v/lunr.svg)](https://pypi.org/project/lunr/) 5 | [![Read the Docs](https://img.shields.io/readthedocs/lunr.svg)](http://lunr.readthedocs.io/en/latest/) 6 | [![Downloads](http://pepy.tech/badge/lunr)](http://pepy.tech/project/lunr) 7 | 8 | # Lunr.py 9 | 10 | A Python implementation of [Lunr.js](https://lunrjs.com) by [Oliver Nightingale](https://github.com/olivernn). 11 | 12 | > A bit like Solr, but much smaller and not as bright. 13 | 14 | This Python version of Lunr.js aims to bring the simple and powerful full text search 15 | capabilities into Python guaranteeing results as close as the original 16 | implementation as possible. 17 | 18 | - [Documentation](http://lunr.readthedocs.io/en/latest/) 19 | 20 | ## What does this even do? 21 | 22 | Lunr is a simple full text search solution for situations where deploying a full 23 | scale solution like Elasticsearch isn't possible, viable or you're simply prototyping. 
24 | Lunr parses a set of documents and creates an inverted index for quick full text 25 | searches in the same way other more complicated solution. 26 | 27 | The trade-off is that Lunr keeps the inverted index in memory and requires you 28 | to recreate or read the index at the start of your application. 29 | 30 | ## Interoperability with Lunr.js 31 | 32 | A core objective of Lunr.py is to provide 33 | [interoperability with the JavaScript version](https://lunr.readthedocs.io/en/latest/lunrjs-interop). 34 | 35 | An example can be found in the [MkDocs documentation library](http://www.mkdocs.org/). 36 | MkDocs produces a set of documents from the pages of the documentation and uses 37 | [Lunr.js](https://lunrjs.com) in the frontend to power its built-in searching 38 | engine. This set of documents is in the form of a JSON file which needs to be 39 | fetched and parsed by Lunr.js to create the inverted index at startup of your application. 40 | 41 | While this is not a problem for most sites, depending on the size of your document 42 | set, this can take some time. 43 | 44 | Lunr.py provides a backend solution, allowing you to parse the documents in Python 45 | of time and create a serialized Lunr.js index you can pass have the browser 46 | version read, minimizing start up time of your application. 47 | 48 | Each version of lunr.py 49 | [targets a specific version of lunr.js](https://github.com/yeraydiazdiaz/lunr.py/blob/master/lunr/__init__.py#L12) 50 | and produces the same results for a 51 | [non-trivial corpus of documents](https://github.com/yeraydiazdiaz/lunr.py/blob/master/tests/acceptance_tests/fixtures/mkdocs_index.json). 52 | 53 | ## Installation 54 | 55 | `pip install lunr` 56 | 57 | An optional and experimental support for other languages thanks to the 58 | [Natural Language Toolkit](http://www.nltk.org/) stemmers is also available via 59 | `pip install lunr[languages]`. The usage of the language feature is subject to 60 | [NTLK corpus licensing clauses](https://github.com/nltk/nltk#redistributing). 61 | 62 | Please refer to the 63 | [documentation page on languages](https://lunr.readthedocs.io/en/latest/languages.html) 64 | for more information. 65 | 66 | ## Usage 67 | 68 | First, you'll need a list of dicts representing the documents you want to search on. 69 | These documents must have a unique field which will serve as a reference and a 70 | series of fields you'd like to search on. 71 | 72 | Lunr provides a convenience `lunr` function to quickly index this set of documents: 73 | 74 | ```python 75 | >>> from lunr import lunr 76 | >>> 77 | >>> documents = [{ 78 | ... 'id': 'a', 79 | ... 'title': 'Mr. Green kills Colonel Mustard', 80 | ... 'body': 'Mr. Green killed Colonel Mustard in the study with the candlestick.', 81 | ... }, { 82 | ... 'id': 'b', 83 | ... 'title': 'Plumb waters plant', 84 | ... 'body': 'Professor Plumb has a green plant in his study', 85 | ... }] 86 | >>> idx = lunr( 87 | ... ref='id', fields=('title', 'body'), documents=documents 88 | ... ) 89 | >>> idx.search('kill') 90 | [{'ref': 'a', 'score': 0.6931722372559913, 'match_data': }] 91 | >>> idx.search('study') 92 | [{'ref': 'b', 'score': 0.23576799568081389, 'match_data': }, {'ref': 'a', 'score': 0.2236629211724517, 'match_data': }] 93 | ``` 94 | 95 | Please refer to the [documentation](http://lunr.readthedocs.io/en/latest/) 96 | for more usage examples. 
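As one more quick example, an index can be serialized to JSON and reloaded later without re-indexing; a minimal sketch continuing from the documents above:

```python
>>> import json
>>> from lunr.index import Index
>>> serialized = json.dumps(idx.serialize())
>>> restored = Index.load(serialized)
>>> [result['ref'] for result in restored.search('kill')]
['a']
```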
97 | -------------------------------------------------------------------------------- /tests/acceptance_tests/fixtures/lang_es.json: -------------------------------------------------------------------------------- 1 | { 2 | "docs": [ 3 | { 4 | "id": "a", 5 | "text": "La República Galáctica está sumida en el caos. Los impuestos de las rutas comerciales a los sistemas estelares exteriores están en disputa. Esperando resolver el asunto con un bloqueo de poderosas naves de guerra, la codiciosa Federación de Comercio ha detenido todos los envíos al pequeño planeta de Naboo. Mientras el Congreso de la República debate interminablemente esta alarmante cadena de acontecimientos, el Canciller Supremo ha enviado en secreto a dos Caballeros Jedi, guardianes de la paz y la justicia en la galaxia, para resolver el conflicto...", 6 | "title": "Episodio I: La Amenaza Fantasma" 7 | }, 8 | { 9 | "id": "b", 10 | "text": "En el Senado Galáctico reina la inquietud. Varios miles de sistemas solares han declarado su intención de abandonar la República. Este movimiento separatista, liderado por el misterioso Conde Dooku, ha provocado que al limitado número de Caballeros Jedi les resulte difícil mantener la paz y el orden en la galaxia. La senadora Amidala, la antigua reina de Naboo, regresa al Senado Galáctico para dar su voto en la crítica cuestión de crear un EJÉRCITO DE LA REPÚBLICA que ayude a los desbordados Jedi....", 11 | "title": "Episodio II: El Ataque de los Clones" 12 | }, 13 | { 14 | "id": "c", 15 | "text": "¡Guerra! La República se desmorona bajo los ataques del despiadado Lord Sith, el Conde Dooku. Hay héroes en ambos bandos. El mal está por doquier. En una contundente jugada, el diabólico líder droide, el General Grievous, ha irrumpido en la capital de la República y ha secuestrado al Canciller Palpatine, líder del Senado Galáctico. Mientras el ejército droide separatista trata de huir de la capital sitiada con su valioso rehén, dos Caballeros Jedi lideran una misión desesperada para rescatar al Canciller cautivo....", 16 | "title": "Episodio III: El Ataque de los Clones" 17 | }, 18 | { 19 | "id": "d", 20 | "text": "Nos encontramos en un periodo de guerra civil. Las naves espaciales rebeldes, atacando desde una base oculta, han logrado su primera victoria contra el malvado Imperio Galáctico. Durante la batalla, los espías rebeldes han conseguido apoderarse de los planos secretos del arma total y definitiva del Imperio, la ESTRELLA DE LA MUERTE, una estación espacial acorazada, llevando en sí potencia suficiente para destruir a un planeta entero. Perseguida por los siniestros agentes del Imperio, la Princesa Leia vuela hacia su patria, a bordo de su nave espacial, llevando consigo los planos robados, que pueden salvar a su pueblo y devolver la libertad a la galaxia....", 21 | "title": "Episodio IV: Una Nueva Esperanza" 22 | }, 23 | { 24 | "id": "e", 25 | "text": "Son tiempos adversos para la rebelión. Aunque la Estrella de la Muerte ha sido destruida, las tropas imperiales han hecho salir a las fuerzas rebeldes de sus bases ocultas y las han persiguen a través de la galaxia. Tras escapar de la terrible Flota Imperial, un grupo de guerreros de la libertad, encabezados por Luke Skywalker, ha establecido una nueva base secreta en el remoto mundo helado de Hoth. 
El malvado Lord Darth Vader, obsesionado por encontrar al joven skywalker, ha eviado miles de sondas espaciales hacía las infinitas distancias del espacio....", 26 | "title": "Episodio V: El Imperio Contraataca" 27 | }, 28 | { 29 | "id": "f", 30 | "text": "Luke Skywalker ha regresado a Tatooine, su planeta de origen, para intentar rescatar a su amigo Han Solo de las garras del malvado Jabba, el Hutt. Pero Luke ignora que el IMPERIO GALÁCTICO ha comenzado en secreto la construcción de una nueva estación espacial armada, más poderosa que la temida Estrella de la Muerte. Una vez terminada, este arma suprema significará la aniquilación del pequeño grupo de rebeldes que lucha para restaurar la libertad en la galaxia....", 31 | "title": "Episodio VI: El Retorno del Jedi" 32 | }, 33 | { 34 | "id": "g", 35 | "text": "Luke Skywalker ha desaparecido. En su ausencia, la siniestra PRIMERA ORDEN ha surgido de las cenizas del Imperio y no descansará hasta que Skywalker, el último Jedi, haya sido destruido. Con el apoyo de la REPÚBLICA, la General Leia Organa dirige una valiente RESISTENCIA. Desesperadamente busca a su hermano Luke con el fin de obtener su ayuda para restaurar la paz y la justicia en la galaxia. Leia ha enviado a su piloto más audaz en una misión secreta a Jakku, donde un viejo aliado ha descubierto una pista del paradero de Luke....", 36 | "title": "Episodio VII: El Despertar de la Fuerza" 37 | }, 38 | { 39 | "id": "h", 40 | "text": "La PRIMERA ORDEN impera. Luego de destruir a la pacífica República, el Líder Supremo Snoke ahora envía a sus despiadadas legiones a asumir el control militar de la galaxia. Sólo la general Leia Organa y su grupo de combatientes de la RESISTENCIA se oponen a la creciente tiranía, convencidos de que el Maestro Jedi Luke Skywalker regresará y restaurará la chispa de esperanza en la lucha. Pero la Resistencia ha sido expuesta. Mientras la Primera Orden se dirige hacia la base rebelde, los valientes héroes organizan un desesperado escape....", 41 | "title": "Episodio VIII: Los Últimos Jedi" 42 | } 43 | ] 44 | } 45 | -------------------------------------------------------------------------------- /lunr/query.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class QueryPresence(Enum): 5 | """Defines possible behaviours for the term's presence in a document.""" 6 | 7 | OPTIONAL = 1 # default 8 | REQUIRED = 2 9 | PROHIBITED = 3 # documents that contain this term will not be returned 10 | 11 | 12 | class Query: 13 | """A `lunr.Query` provides a programmatic way of defining queries to be 14 | performed against a `lunr.Index`. 15 | 16 | Prefer constructing a `lunr.Query` using `the lunr.Index.query` method 17 | so the query object is pre-initialized with the right index fields. 18 | """ 19 | 20 | # Constants for indicating what kind of automatic wildcard insertion will 21 | # be used when constructing a query clause. 22 | # This allows wildcards to be added to the beginning and end of a term 23 | # without having to manually do any string concatenation. 24 | # The wildcard constants can be bitwise combined to select both leading and 25 | # trailing wildcards. 
26 | WILDCARD = "*" 27 | WILDCARD_NONE = 0 28 | WILDCARD_LEADING = 1 29 | WILDCARD_TRAILING = 2 30 | 31 | def __init__(self, all_fields): 32 | self.clauses = [] 33 | self.all_fields = all_fields 34 | 35 | def __repr__(self): 36 | return ''.format( 37 | ",".join(self.all_fields), ",".join(c.term for c in self.clauses) 38 | ) 39 | 40 | def clause(self, *args, **kwargs): 41 | """Adds a `lunr.Clause` to this query. 42 | 43 | Unless the clause contains the fields to be matched all fields will be 44 | matched. In addition a default boost of 1 is applied to the clause. 45 | 46 | If the first argument is a `lunr.Clause` it will be mutated and added, 47 | otherwise args and kwargs will be used in the constructor. 48 | 49 | Returns: 50 | lunr.Query: The Query itself. 51 | """ 52 | if args and isinstance(args[0], Clause): 53 | clause = args[0] 54 | else: 55 | clause = Clause(*args, **kwargs) 56 | 57 | if not clause.fields: 58 | clause.fields = self.all_fields 59 | 60 | if (clause.wildcard & Query.WILDCARD_LEADING) and ( 61 | clause.term[0] != Query.WILDCARD 62 | ): 63 | clause.term = Query.WILDCARD + clause.term 64 | 65 | if (clause.wildcard & Query.WILDCARD_TRAILING) and ( 66 | clause.term[-1] != Query.WILDCARD 67 | ): 68 | clause.term = clause.term + Query.WILDCARD 69 | 70 | self.clauses.append(clause) 71 | return self 72 | 73 | def term(self, term, **kwargs): 74 | """Adds a term to the current query, creating a Clause and adds it to 75 | the list of clauses making up this Query. 76 | 77 | The term is not tokenized and used "as is". Any conversion to token 78 | or token-like strings should be performed before calling this method. 79 | 80 | For example: 81 | query.term(lunr.Tokenizer("foo bar")) 82 | 83 | Args: 84 | term (Token or iterable): Token or iterable of tokens to add. 85 | kwargs (dict): Additional properties to add to the Clause. 86 | """ 87 | if isinstance(term, (list, tuple)): 88 | for t in term: 89 | self.term(t, **kwargs) 90 | else: 91 | self.clause(str(term), **kwargs) 92 | 93 | return self 94 | 95 | def is_negated(self): 96 | """A negated query is one in which every clause has a presence of 97 | prohibited. These queries require some special processing to return 98 | the expected results. 99 | """ 100 | return all( 101 | clause.presence == QueryPresence.PROHIBITED for clause in self.clauses 102 | ) 103 | 104 | 105 | class Clause: 106 | """A single clause in a `lunr.Query` contains a term and details on 107 | how to match that term against a `lunr.Index` 108 | 109 | Args: 110 | term (str, optional): The term for the clause. 111 | field (iterable, optional): The fields for the term to be searched 112 | against. 113 | edit_distance (int, optional): The character distance to use, defaults 114 | to 0. 115 | use_pipeline (bool, optional): Whether the clause should be pre 116 | processed by the index's pipeline, default to True. 117 | boost (int, optional): Boost to apply to the clause, defaults to 1. 118 | wildcard (Query.WILDCARD_*, optional): Any of the Query.WILDCARD 119 | constants defining if a wildcard is to be used and how, defaults 120 | to Query.WILDCARD_NONE. 121 | presence (QueryPresence, optional): Behaviour for a terms presence 122 | in a document. 
123 | """ 124 | 125 | def __init__( 126 | self, 127 | term=None, 128 | fields=None, 129 | edit_distance=0, 130 | use_pipeline=True, 131 | boost=1, 132 | wildcard=Query.WILDCARD_NONE, 133 | presence=QueryPresence.OPTIONAL, 134 | ): 135 | super().__init__() 136 | self.term = term 137 | self.fields = fields or [] 138 | self.edit_distance = edit_distance 139 | self.use_pipeline = use_pipeline 140 | self.boost = boost 141 | self.wildcard = wildcard 142 | self.presence = presence 143 | 144 | def __repr__(self): 145 | return ''.format(self.term) 146 | -------------------------------------------------------------------------------- /lunr/vector.py: -------------------------------------------------------------------------------- 1 | from math import sqrt 2 | 3 | from lunr.exceptions import BaseLunrException 4 | 5 | 6 | class Vector: 7 | """A vector is used to construct the vector space of documents and queries. 8 | These vectors support operations to determine the similarity between two 9 | documents or a document and a query. 10 | 11 | Normally no parameters are required for initializing a vector, but in the 12 | case of loading a previously dumped vector the raw elements can be provided 13 | to the constructor. 14 | 15 | For performance reasons vectors are implemented with a flat array, where an 16 | elements index is immediately followed by its value. 17 | E.g. [index, value, index, value]. 18 | 19 | TODO: consider implemetation as 2-tuples. 20 | 21 | This allows the underlying array to be as sparse as possible and still 22 | offer decent performance when being used for vector calculations. 23 | """ 24 | 25 | def __init__(self, elements=None): 26 | self._magnitude = 0 27 | self.elements = elements or [] 28 | 29 | def __repr__(self): 30 | return "".format(self.magnitude) 31 | 32 | def __iter__(self): 33 | return iter(self.elements) 34 | 35 | def position_for_index(self, index): 36 | """Calculates the position within the vector to insert a given index. 37 | 38 | This is used internally by insert and upsert. If there are duplicate 39 | indexes then the position is returned as if the value for that index 40 | were to be updated, but it is the callers responsibility to check 41 | whether there is a duplicate at that index 42 | """ 43 | if not self.elements: 44 | return 0 45 | 46 | start = 0 47 | end = int(len(self.elements) / 2) 48 | slice_length = end - start 49 | pivot_point = int(slice_length / 2) 50 | pivot_index = self.elements[pivot_point * 2] 51 | 52 | while slice_length > 1: 53 | if pivot_index < index: 54 | start = pivot_point 55 | elif pivot_index > index: 56 | end = pivot_point 57 | else: 58 | break 59 | 60 | slice_length = end - start 61 | pivot_point = start + int(slice_length / 2) 62 | pivot_index = self.elements[pivot_point * 2] 63 | 64 | if pivot_index == index: 65 | return pivot_point * 2 66 | elif pivot_index > index: 67 | return pivot_point * 2 68 | else: 69 | return (pivot_point + 1) * 2 70 | 71 | def insert(self, insert_index, val): 72 | """Inserts an element at an index within the vector. 73 | 74 | Does not allow duplicates, will throw an error if there is already an 75 | entry for this index. 76 | """ 77 | 78 | def prevent_duplicates(index, val): 79 | raise BaseLunrException("Duplicate index") 80 | 81 | self.upsert(insert_index, val, prevent_duplicates) 82 | 83 | def upsert(self, insert_index, val, fn=None): 84 | """Inserts or updates an existing index within the vector. 85 | 86 | Args: 87 | - insert_index (int): The index at which the element should be 88 | inserted. 
89 | - val (int|float): The value to be inserted into the vector. 90 | - fn (callable, optional): An optional callable taking two 91 | arguments, the current value and the passed value to generate 92 | the final inserted value at the position in case of collision. 93 | """ 94 | fn = fn or (lambda current, passed: passed) 95 | self._magnitude = 0 96 | position = self.position_for_index(insert_index) 97 | if position < len(self.elements) and self.elements[position] == insert_index: 98 | self.elements[position + 1] = fn(self.elements[position + 1], val) 99 | else: 100 | self.elements.insert(position, val) 101 | self.elements.insert(position, insert_index) 102 | 103 | def to_list(self): 104 | """Converts the vector to an array of the elements within the vector""" 105 | output = [] 106 | for i in range(1, len(self.elements), 2): 107 | output.append(self.elements[i]) 108 | return output 109 | 110 | def serialize(self): 111 | # TODO: the JS version forces rounding on the elements upon insertion 112 | # to ensure symmetry upon serialization 113 | return [round(element, 3) for element in self.elements] 114 | 115 | @property 116 | def magnitude(self): 117 | if not self._magnitude: 118 | sum_of_squares = 0 119 | for i in range(1, len(self.elements), 2): 120 | value = self.elements[i] 121 | sum_of_squares += value * value 122 | 123 | self._magnitude = sqrt(sum_of_squares) 124 | 125 | return self._magnitude 126 | 127 | def dot(self, other): 128 | """Calculates the dot product of this vector and another vector.""" 129 | dot_product = 0 130 | a = self.elements 131 | b = other.elements 132 | a_len = len(a) 133 | b_len = len(b) 134 | i = j = 0 135 | 136 | while i < a_len and j < b_len: 137 | a_val = a[i] 138 | b_val = b[j] 139 | if a_val < b_val: 140 | i += 2 141 | elif a_val > b_val: 142 | j += 2 143 | else: 144 | dot_product += a[i + 1] * b[j + 1] 145 | i += 2 146 | j += 2 147 | 148 | return dot_product 149 | 150 | def similarity(self, other): 151 | """Calculates the cosine similarity between this vector and another 152 | vector.""" 153 | if self.magnitude == 0 or other.magnitude == 0: 154 | return 0 155 | 156 | return self.dot(other) / self.magnitude 157 | -------------------------------------------------------------------------------- /tests/acceptance_tests/fixtures/lang_es_en.json: -------------------------------------------------------------------------------- 1 | { 2 | "docs": [ 3 | { 4 | "id": "a", 5 | "text": "La República Galáctica está sumida en el caos. Los impuestos de las rutas comerciales a los sistemas estelares exteriores están en disputa. Esperando resolver el asunto con un bloqueo de poderosas naves de guerra, la codiciosa Federación de Comercio ha detenido todos los envíos al pequeño planeta de Naboo. Mientras el Congreso de la República debate interminablemente esta alarmante cadena de acontecimientos, el Canciller Supremo ha enviado en secreto a dos Caballeros Jedi, guardianes de la paz y la justicia en la galaxia, para resolver el conflicto...", 6 | "title": "Episodio I: La Amenaza Fantasma" 7 | }, 8 | { 9 | "id": "b", 10 | "text": "En el Senado Galáctico reina la inquietud. Varios miles de sistemas solares han declarado su intención de abandonar la República. Este movimiento separatista, liderado por el misterioso Conde Dooku, ha provocado que al limitado número de Caballeros Jedi les resulte difícil mantener la paz y el orden en la galaxia. 
La senadora Amidala, la antigua reina de Naboo, regresa al Senado Galáctico para dar su voto en la crítica cuestión de crear un EJÉRCITO DE LA REPÚBLICA que ayude a los desbordados Jedi....", 11 | "title": "Episodio II: El Ataque de los Clones" 12 | }, 13 | { 14 | "id": "c", 15 | "text": "¡Guerra! La República se desmorona bajo los ataques del despiadado Lord Sith, el Conde Dooku. Hay héroes en ambos bandos. El mal está por doquier. En una contundente jugada, el diabólico líder droide, el General Grievous, ha irrumpido en la capital de la República y ha secuestrado al Canciller Palpatine, líder del Senado Galáctico. Mientras el ejército droide separatista trata de huir de la capital sitiada con su valioso rehén, dos Caballeros Jedi lideran una misión desesperada para rescatar al Canciller cautivo....", 16 | "title": "Episodio III: El Ataque de los Clones" 17 | }, 18 | { 19 | "id": "d", 20 | "text": "Nos encontramos en un periodo de guerra civil. Las naves espaciales rebeldes, atacando desde una base oculta, han logrado su primera victoria contra el malvado Imperio Galáctico. Durante la batalla, los espías rebeldes han conseguido apoderarse de los planos secretos del arma total y definitiva del Imperio, la ESTRELLA DE LA MUERTE, una estación espacial acorazada, llevando en sí potencia suficiente para destruir a un planeta entero. Perseguida por los siniestros agentes del Imperio, la Princesa Leia vuela hacia su patria, a bordo de su nave espacial, llevando consigo los planos robados, que pueden salvar a su pueblo y devolver la libertad a la galaxia....", 21 | "title": "Episodio IV: Una Nueva Esperanza" 22 | }, 23 | { 24 | "id": "e", 25 | "text": "Son tiempos adversos para la rebelión. Aunque la Estrella de la Muerte ha sido destruida, las tropas imperiales han hecho salir a las fuerzas rebeldes de sus bases ocultas y las han persiguen a través de la galaxia. Tras escapar de la terrible Flota Imperial, un grupo de guerreros de la libertad, encabezados por Luke Skywalker, ha establecido una nueva base secreta en el remoto mundo helado de Hoth. El malvado Lord Darth Vader, obsesionado por encontrar al joven skywalker, ha eviado miles de sondas espaciales hacía las infinitas distancias del espacio....", 26 | "title": "Episodio V: El Imperio Contraataca" 27 | }, 28 | { 29 | "id": "f", 30 | "text": "Luke Skywalker ha regresado a Tatooine, su planeta de origen, para intentar rescatar a su amigo Han Solo de las garras del malvado Jabba, el Hutt. Pero Luke ignora que el IMPERIO GALÁCTICO ha comenzado en secreto la construcción de una nueva estación espacial armada, más poderosa que la temida Estrella de la Muerte. Una vez terminada, este arma suprema significará la aniquilación del pequeño grupo de rebeldes que lucha para restaurar la libertad en la galaxia....", 31 | "title": "Episodio VI: El Retorno del Jedi" 32 | }, 33 | { 34 | "id": "g", 35 | "text": "Luke Skywalker ha desaparecido. En su ausencia, la siniestra PRIMERA ORDEN ha surgido de las cenizas del Imperio y no descansará hasta que Skywalker, el último Jedi, haya sido destruido. Con el apoyo de la REPÚBLICA, la General Leia Organa dirige una valiente RESISTENCIA. Desesperadamente busca a su hermano Luke con el fin de obtener su ayuda para restaurar la paz y la justicia en la galaxia. 
Leia ha enviado a su piloto más audaz en una misión secreta a Jakku, donde un viejo aliado ha descubierto una pista del paradero de Luke....", 36 | "title": "Episodio VII: El Despertar de la Fuerza" 37 | }, 38 | { 39 | "id": "h", 40 | "text": "La PRIMERA ORDEN impera. Luego de destruir a la pacífica República, el Líder Supremo Snoke ahora envía a sus despiadadas legiones a asumir el control militar de la galaxia. Sólo la general Leia Organa y su grupo de combatientes de la RESISTENCIA se oponen a la creciente tiranía, convencidos de que el Maestro Jedi Luke Skywalker regresará y restaurará la chispa de esperanza en la lucha. Pero la Resistencia ha sido expuesta. Mientras la Primera Orden se dirige hacia la base rebelde, los valientes héroes organizan un desesperado escape....", 41 | "title": "Episodio VIII: Los Últimos Jedi" 42 | }, 43 | { 44 | "id": "i", 45 | "text": "Turmoil has engulfed the Galactic Republic. The taxation of trade routes to outlying star systems is in dispute. Hoping to resolve the matter with a blockade of deadly battleships, the greedy Trade Federation has stopped all shipping to the small planet of Naboo. While the Congress of the Republic endlessly debates this alarming chain of events, the Supreme Chancellor has secretly dispatched two Jedi Knights, the guardians of peace and justice in the galaxy, to settle the conflict.... ", 46 | "title": "Episode I: The Phantom Menace" 47 | } 48 | ] 49 | } 50 | -------------------------------------------------------------------------------- /lunr/query_parser.py: -------------------------------------------------------------------------------- 1 | from lunr.query_lexer import QueryLexer 2 | from lunr.query import Clause, QueryPresence 3 | from lunr.exceptions import QueryParseError 4 | 5 | 6 | class QueryParser: 7 | def __init__(self, string, query): 8 | self.lexer = QueryLexer(string) 9 | self.query = query 10 | self.current_clause = Clause() 11 | self.lexeme_idx = 0 12 | 13 | def parse(self): 14 | self.lexer.run() 15 | self.lexemes = self.lexer.lexemes 16 | 17 | state = self.__class__.parse_clause 18 | 19 | while state: 20 | state = state(self) 21 | 22 | return self.query 23 | 24 | def peek_lexeme(self): 25 | try: 26 | return self.lexemes[self.lexeme_idx] 27 | except IndexError: 28 | return None 29 | 30 | def consume_lexeme(self): 31 | lexeme = self.peek_lexeme() 32 | self.lexeme_idx += 1 33 | return lexeme 34 | 35 | def next_clause(self): 36 | self.query.clause(self.current_clause) 37 | self.current_clause = Clause() 38 | 39 | @classmethod 40 | def parse_clause(cls, parser): 41 | lexeme = parser.peek_lexeme() 42 | if lexeme is None: 43 | return 44 | 45 | if lexeme["type"] == QueryLexer.FIELD: 46 | return cls.parse_field 47 | elif lexeme["type"] == QueryLexer.TERM: 48 | return cls.parse_term 49 | elif lexeme["type"] == QueryLexer.PRESENCE: 50 | return cls.parse_presence 51 | else: 52 | raise QueryParseError( 53 | "Expected either a field or a term, found {}{}".format( 54 | lexeme["type"], 55 | 'with value "' + lexeme["string"] + '"' 56 | if len(lexeme["string"]) 57 | else "", 58 | ) 59 | ) 60 | 61 | @classmethod 62 | def parse_field(cls, parser): 63 | lexeme = parser.consume_lexeme() 64 | 65 | if lexeme["string"] not in parser.query.all_fields: 66 | raise QueryParseError( 67 | 'Unrecognized field "{}", possible fields {}'.format( 68 | lexeme["string"], ", ".join(parser.query.all_fields) 69 | ) 70 | ) 71 | 72 | parser.current_clause.fields = [lexeme["string"]] 73 | 74 | next_lexeme = parser.peek_lexeme() 75 | if next_lexeme is 
None: 76 | raise QueryParseError("Expected term, found nothing") 77 | 78 | if next_lexeme["type"] == QueryLexer.TERM: 79 | return cls.parse_term 80 | else: 81 | raise QueryParseError("Expected term, found {}".format(next_lexeme["type"])) 82 | 83 | @classmethod 84 | def parse_term(cls, parser): 85 | lexeme = parser.consume_lexeme() 86 | 87 | parser.current_clause.term = lexeme["string"].lower() 88 | if "*" in lexeme["string"]: 89 | parser.current_clause.use_pipeline = False 90 | 91 | return cls._peek_next_lexeme(parser) 92 | 93 | @classmethod 94 | def parse_presence(cls, parser): 95 | lexeme = parser.consume_lexeme() 96 | 97 | if lexeme is None: 98 | return 99 | 100 | if lexeme["string"] == "-": 101 | parser.current_clause.presence = QueryPresence.PROHIBITED 102 | elif lexeme["string"] == "+": 103 | parser.current_clause.presence = QueryPresence.REQUIRED 104 | else: 105 | raise QueryParseError( 106 | "Unrecognized parser operator: {}, expected `+` or `-`".format( 107 | lexeme.str 108 | ) 109 | ) 110 | 111 | next_lexeme = parser.peek_lexeme() 112 | if next_lexeme is None: 113 | raise QueryParseError("Expected either a field or a term, found nothing") 114 | 115 | if next_lexeme["type"] == QueryLexer.FIELD: 116 | return cls.parse_field 117 | elif next_lexeme["type"] == QueryLexer.TERM: 118 | return cls.parse_term 119 | else: 120 | raise QueryParseError( 121 | "Expected either a field or a term, found {}".format(lexeme["type"]) 122 | ) 123 | 124 | @classmethod 125 | def parse_edit_distance(cls, parser): 126 | lexeme = parser.consume_lexeme() 127 | 128 | try: 129 | edit_distance = int(lexeme["string"]) 130 | except ValueError as e: 131 | raise QueryParseError("Edit distance must be numeric") from e 132 | 133 | parser.current_clause.edit_distance = edit_distance 134 | 135 | return cls._peek_next_lexeme(parser) 136 | 137 | @classmethod 138 | def parse_boost(cls, parser): 139 | lexeme = parser.consume_lexeme() 140 | 141 | try: 142 | boost = int(lexeme["string"]) 143 | except ValueError as e: 144 | raise QueryParseError("Boost must be numeric") from e 145 | 146 | parser.current_clause.boost = boost 147 | 148 | return cls._peek_next_lexeme(parser) 149 | 150 | @classmethod 151 | def _peek_next_lexeme(cls, parser): 152 | next_lexeme = parser.peek_lexeme() 153 | if next_lexeme is None: 154 | parser.next_clause() 155 | return 156 | 157 | if next_lexeme["type"] == QueryLexer.TERM: 158 | parser.next_clause() 159 | return cls.parse_term 160 | elif next_lexeme["type"] == QueryLexer.FIELD: 161 | parser.next_clause() 162 | return cls.parse_field 163 | elif next_lexeme["type"] == QueryLexer.EDIT_DISTANCE: 164 | return cls.parse_edit_distance 165 | elif next_lexeme["type"] == QueryLexer.BOOST: 166 | return cls.parse_boost 167 | elif next_lexeme["type"] == QueryLexer.PRESENCE: 168 | parser.next_clause() 169 | return cls.parse_presence 170 | else: 171 | raise QueryParseError( 172 | "Unexpected lexeme type {}".format(next_lexeme["type"]) 173 | ) 174 | -------------------------------------------------------------------------------- /lunr/pipeline.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import logging 3 | from typing import Callable, Dict, List, Set 4 | 5 | from lunr.exceptions import BaseLunrException 6 | from lunr.token import Token 7 | 8 | log = logging.getLogger(__name__) 9 | 10 | 11 | class Pipeline: 12 | """lunr.Pipelines maintain a list of functions to be applied to all tokens 13 | in documents entering the search index 
and queries ran agains the index. 14 | 15 | """ 16 | 17 | registered_functions: Dict[str, Callable] = {} 18 | 19 | def __init__(self): 20 | self._stack: List[Callable] = [] 21 | self._skip: Dict[Callable, Set[str]] = defaultdict(set) 22 | 23 | def __len__(self): 24 | return len(self._stack) 25 | 26 | def __repr__(self): 27 | return ''.format(",".join(fn.label for fn in self._stack)) 28 | 29 | # TODO: add iterator methods? 30 | 31 | @classmethod 32 | def register_function(cls, fn, label=None): 33 | """Register a function with the pipeline.""" 34 | label = label or fn.__name__ 35 | if label in cls.registered_functions: 36 | log.warning("Overwriting existing registered function %s", label) 37 | 38 | fn.label = label 39 | cls.registered_functions[fn.label] = fn 40 | 41 | @classmethod 42 | def load(cls, serialised): 43 | """Loads a previously serialised pipeline.""" 44 | pipeline = cls() 45 | for fn_name in serialised: 46 | try: 47 | fn = cls.registered_functions[fn_name] 48 | except KeyError: 49 | raise BaseLunrException( 50 | "Cannot load unregistered function {}".format(fn_name) 51 | ) 52 | else: 53 | pipeline.add(fn) 54 | 55 | return pipeline 56 | 57 | def add(self, *args): 58 | """Adds new functions to the end of the pipeline. 59 | 60 | Functions must accept three arguments: 61 | - Token: A lunr.Token object which will be updated 62 | - i: The index of the token in the set 63 | - tokens: A list of tokens representing the set 64 | """ 65 | for fn in args: 66 | self.warn_if_function_not_registered(fn) 67 | self._stack.append(fn) 68 | 69 | def warn_if_function_not_registered(self, fn): 70 | try: 71 | return fn.label in self.registered_functions 72 | except AttributeError: 73 | log.warning( 74 | 'Function "{}" is not registered with pipeline. ' 75 | "This may cause problems when serialising the index.".format( 76 | getattr(fn, "label", fn) 77 | ) 78 | ) 79 | 80 | def after(self, existing_fn, new_fn): 81 | """Adds a single function after a function that already exists in the 82 | pipeline.""" 83 | self.warn_if_function_not_registered(new_fn) 84 | try: 85 | index = self._stack.index(existing_fn) 86 | self._stack.insert(index + 1, new_fn) 87 | except ValueError as e: 88 | raise BaseLunrException("Cannot find existing_fn") from e 89 | 90 | def before(self, existing_fn, new_fn): 91 | """Adds a single function before a function that already exists in the 92 | pipeline. 93 | 94 | """ 95 | self.warn_if_function_not_registered(new_fn) 96 | try: 97 | index = self._stack.index(existing_fn) 98 | self._stack.insert(index, new_fn) 99 | except ValueError as e: 100 | raise BaseLunrException("Cannot find existing_fn") from e 101 | 102 | def remove(self, fn): 103 | """Removes a function from the pipeline.""" 104 | try: 105 | self._stack.remove(fn) 106 | except ValueError: 107 | pass 108 | 109 | def skip(self, fn: Callable, field_names: List[str]): 110 | """ 111 | Make the pipeline skip the function based on field name we're processing. 112 | 113 | This relies on passing the field name to Pipeline.run(). 114 | """ 115 | self._skip[fn].update(field_names) 116 | 117 | def run(self, tokens, field_name=None): 118 | """ 119 | Runs the current list of functions that make up the pipeline against 120 | the passed tokens. 121 | 122 | :param tokens: The tokens to process. 123 | :param field_name: The name of the field these tokens belongs to, can be ommited. 124 | Used to skip some functions based on field names. 125 | """ 126 | for fn in self._stack: 127 | # Skip the function based on field name. 
128 | if field_name and field_name in self._skip[fn]: 129 | continue 130 | results = [] 131 | for i, token in enumerate(tokens): 132 | # JS ignores additional arguments to the functions but we 133 | # force pipeline functions to declare (token, i, tokens) 134 | # or *args 135 | result = fn(token, i, tokens) 136 | if not result: 137 | continue 138 | if isinstance(result, (list, tuple)): # simulate Array.concat 139 | results.extend(result) 140 | else: 141 | results.append(result) 142 | tokens = results 143 | 144 | return tokens 145 | 146 | def run_string(self, string, metadata=None): 147 | """Convenience method for passing a string through a pipeline and 148 | getting strings out. This method takes care of wrapping the passed 149 | string in a token and mapping the resulting tokens back to strings. 150 | 151 | .. note:: This ignores the skipped functions since we can't 152 | access field names from this context. 153 | """ 154 | token = Token(string, metadata) 155 | return [str(tkn) for tkn in self.run([token])] 156 | 157 | def reset(self): 158 | self._stack = [] 159 | 160 | def serialize(self): 161 | return [fn.label for fn in self._stack] 162 | -------------------------------------------------------------------------------- /tests/test_builder.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from lunr.builder import Builder 4 | from lunr.token_set import TokenSet 5 | from lunr.index import Index 6 | from lunr.vector import Vector 7 | 8 | 9 | def _assert_deep_keys(dict_, keys): 10 | d = dict_ 11 | for key in keys.split("."): 12 | d_keys_as_str = [str(k) for k in d] 13 | assert key in d_keys_as_str 14 | d = d[key] 15 | 16 | 17 | class TestBuilderBuild: 18 | def setup_method(self, method): 19 | self.builder = Builder() 20 | doc = {"id": "id", "title": "test", "body": "missing"} 21 | 22 | self.builder.ref("id") 23 | self.builder.field("title") 24 | self.builder.add(doc) 25 | self.index = self.builder.build() 26 | 27 | def test_adds_tokens_to_inverted_index(self): 28 | _assert_deep_keys(self.builder.inverted_index, "test.title.id") 29 | 30 | def test_builds_vector_space_of_the_document_fields(self): 31 | assert "title/id" in self.builder.field_vectors 32 | assert isinstance(self.builder.field_vectors["title/id"], Vector) 33 | 34 | def test_skips_fields_not_defined_for_indexing(self): 35 | assert "missing" not in self.builder.inverted_index 36 | 37 | def test_builds_a_token_set_for_the_corpus(self): 38 | needle = TokenSet.from_string("test") 39 | assert "test" in self.builder.token_set.intersect(needle).to_list() 40 | 41 | def test_calculates_document_count(self): 42 | assert self.builder.average_field_length["title"] == 1 43 | 44 | def test_index_is_returned(self): 45 | assert isinstance(self.index, Index) 46 | 47 | 48 | class TestBuilderAdd: 49 | def test_builder_casts_docrefs_to_strings(self): 50 | self.builder = Builder() 51 | self.builder.ref("id") 52 | self.builder.field("title") 53 | 54 | self.builder.add(dict(id=123, title="test", body="missing")) 55 | 56 | _assert_deep_keys(self.builder.inverted_index, "test.title.123") 57 | 58 | def test_builder_metadata_whitelist_includes_metadata_in_index(self): 59 | self.builder = Builder() 60 | self.builder.ref("id") 61 | self.builder.field("title") 62 | self.builder.metadata_whitelist = ["position"] 63 | 64 | self.builder.add(dict(id="a", title="test", body="missing")) 65 | self.builder.add(dict(id="b", title="another test", body="missing")) 66 | 67 | assert 
self.builder.inverted_index["test"]["title"]["a"] == { 68 | "position": [[0, 4]] 69 | } 70 | assert self.builder.inverted_index["test"]["title"]["b"] == { 71 | "position": [[8, 4]] 72 | } 73 | 74 | def test_builder_field_raises_if_contains_slash(self): 75 | self.builder = Builder() 76 | 77 | with pytest.raises(ValueError): 78 | self.builder.field("foo/bar") 79 | 80 | def test_builder_extracts_nested_properties_from_document(self): 81 | self.builder = Builder() 82 | self.builder.field("name", extractor=lambda d: d["person"]["name"]) 83 | 84 | self.builder.add({"id": "id", "person": {"name": "bob"}}) 85 | 86 | assert self.builder.inverted_index["bob"]["name"]["id"] == {} 87 | 88 | def test_builder_field_term_frequency_and_length(self): 89 | self.builder = Builder() 90 | self.builder.ref("id") 91 | self.builder.field("title") 92 | 93 | self.builder.add(dict(id="a", title="test a testing test", body="missing")) 94 | 95 | assert self.builder.field_term_frequencies == { 96 | "title/a": {"test": 2, "a": 1, "testing": 1} 97 | } 98 | assert self.builder.field_lengths == {"title/a": 4} 99 | 100 | 101 | class TestBuilderUse: 102 | def setup_method(self, method): 103 | self.builder = Builder() 104 | 105 | def test_calls_plugin_function(self): 106 | def plugin(*args): 107 | assert True 108 | 109 | self.builder.use(plugin) 110 | 111 | def test_plugin_is_called_with_builder_as_first_argument(self): 112 | def plugin(builder): 113 | assert builder is self.builder 114 | 115 | self.builder.use(plugin) 116 | 117 | def test_forwards_arguments_to_the_plugin(self): 118 | def plugin(builder, *args, **kwargs): 119 | assert args == (1, 2, 3) 120 | assert kwargs == {"foo": "bar"} 121 | 122 | self.builder.use(plugin, 1, 2, 3, foo="bar") 123 | 124 | 125 | class TestBuilderK1: 126 | def test_k1_default_value(self): 127 | builder = Builder() 128 | assert builder._k1 == 1.2 129 | 130 | def test_k1_can_be_set(self): 131 | builder = Builder() 132 | builder.k1(1.6) 133 | assert builder._k1 == 1.6 134 | 135 | 136 | class TestBuilderB: 137 | def test_b_default_value(self): 138 | builder = Builder() 139 | assert builder._b == 0.75 140 | 141 | def test_b_within_range(self): 142 | builder = Builder() 143 | builder.b(0.5) 144 | assert builder._b == 0.5 145 | 146 | def test_b_less_than_zero(self): 147 | builder = Builder() 148 | builder.b(-1) 149 | assert builder._b == 0 150 | 151 | def test_b_higher_than_one(self): 152 | builder = Builder() 153 | builder.b(1.5) 154 | assert builder._b == 1 155 | 156 | 157 | class TestBuilerRef: 158 | def test_default_reference(self): 159 | builder = Builder() 160 | assert builder._ref == "id" 161 | 162 | def test_defining_a_reference_field(self): 163 | builder = Builder() 164 | builder.ref("foo") 165 | assert builder._ref == "foo" 166 | 167 | 168 | class TestBuilderField: 169 | def test_define_fields_to_index(self): 170 | builder = Builder() 171 | builder.field("foo") 172 | assert len(builder._fields) == 1 173 | assert builder._fields["foo"].name == "foo" 174 | assert builder._fields["foo"].boost == 1 175 | assert builder._fields["foo"].extractor is None 176 | assert repr(builder._fields["foo"]) == '' 177 | assert hash(builder._fields["foo"]) == hash("foo") 178 | -------------------------------------------------------------------------------- /docs/usage.md: -------------------------------------------------------------------------------- 1 | # Quick start 2 | 3 | First, you'll need a list of dicts representing the documents you want to search on. 
These documents must have a unique field which will serve as a reference and a series of fields you'd like to search on. 4 | 5 | ```python 6 | >>> from lunr import lunr 7 | >>> 8 | >>> documents = [{ 9 | ...: 'id': 'a', 10 | ...: 'title': 'Mr. Green kills Colonel Mustard', 11 | ...: 'body': """Mr. Green killed Colonel Mustard in the study with the 12 | ...: candlestick. Mr. Green is not a very nice fellow.""" 13 | ...: }, { 14 | ...: 'id': 'b', 15 | ...: 'title': 'Plumb waters plant', 16 | ...: 'body': 'Professor Plumb has a green and a yellow plant in his study', 17 | ...: }, { 18 | ...: 'id': 'c', 19 | ...: 'title': 'Scarlett helps Professor', 20 | ...: 'body': """Miss Scarlett watered Professor Plumbs green plant 21 | ...: while he was away on his murdering holiday.""", 22 | ...: }] 23 | ``` 24 | 25 | Lunr provides a convenience `lunr` function to quickly index this set of documents: 26 | 27 | ```python 28 | >>> idx = lunr( 29 | ... ref='id', fields=('title', 'body'), documents=documents 30 | ... ) 31 | ``` 32 | 33 | For basic no-fuss searches just use the `search` method on the index: 34 | 35 | ```python 36 | >>> idx.search('kill') 37 | [{'ref': 'a', 'score': 0.6931722372559913, 'match_data': }] 38 | >>> idx.search('study') 39 | [{'ref': 'b', 'score': 0.23576799568081389, 'match_data': }, 40 | {'ref': 'a', 'score': 0.2236629211724517, 'match_data': }] 41 | ``` 42 | 43 | ## Using query strings 44 | 45 | The query string passed to `search` accepts multiple terms: 46 | 47 | ```python 48 | >>> idx.search('green plant') 49 | [{'ref': 'b', 'score': 0.5023294192217546, 'match_data': }, 50 | {'ref': 'a', 'score': 0.12544083739725947, 'match_data': }, 51 | {'ref': 'c', 'score': 0.07306110905506158, 'match_data': }] 52 | ``` 53 | 54 | The index will search for `green` OR `plant`; a few things to note about the results: 55 | 56 | - document `b` scores highest because `plant` appears in both fields and `green` appears in the body 57 | - document `a` is second; it includes only `green`, but in the title and twice in the body 58 | - document `c` includes both terms but only in one of the fields 59 | 60 | Query strings support a variety of modifiers: 61 | 62 | ### Wildcards 63 | 64 | You can use `*` as a wildcard anywhere in your query string: 65 | 66 | ```python 67 | >>> idx.search('pl*') 68 | [{'ref': 'b', 'score': 0.725901569004226, 'match_data': }, 69 | {'ref': 'c', 'score': 0.0816178155209697, 'match_data': }] 70 | >>> idx.search('*llow') 71 | [{'ref': 'b', 'score': 0.6210112024848421, 'match_data': }, 72 | {'ref': 'a', 'score': 0.30426104537491444, 'match_data': }] 73 | ``` 74 | 75 | Note that, when using wildcards, no stemming is performed on the search terms. 76 | 77 | ### Fields 78 | 79 | Prefixing a search term with a field name followed by `:` allows you to specify which field that particular term should be searched in: 80 | 81 | ```python 82 | >>> idx.search('title:green title:plant') 83 | [{'ref': 'b', 'score': 0.18604713274256787, 'match_data': }, 84 | {'ref': 'a', 'score': 0.07902963505882092, 'match_data': }] 85 | ``` 86 | 87 | Note the difference from the example above: document `c` is no longer in the results. 88 | 89 | Specifying an unindexed field will raise an exception: 90 | 91 | ```python 92 | >>> idx.search('foo:green') 93 | Traceback (most recent call last): 94 | ...
95 | lunr.exceptions.QueryParseError: Unrecognized field "foo", possible fields title, body 96 | ``` 97 | 98 | You can combine this with wildcards: 99 | 100 | ```python 101 | >>> idx.search('body:mu*') 102 | [{'ref': 'c', 'score': 0.3072276611029057, 'match_data': }, 103 | {'ref': 'a', 'score': 0.14581429988419872, 'match_data': }] 104 | ``` 105 | 106 | ### Boosts 107 | 108 | When searching for several terms you can use boosting to give more importance to some of the terms: 109 | 110 | ```python 111 | >>> idx.search('green plant^10') 112 | [{'ref': 'b', 'score': 0.831629678987025, 'match_data': }, 113 | {'ref': 'c', 'score': 0.06360184858161157, 'match_data': }, 114 | {'ref': 'a', 'score': 0.01756105367777591, 'match_data': }] 115 | ``` 116 | 117 | Note how document `c` now scores higher because of the boosting on the term `plant`. The `10` represents a multiplier on the relative score for the term and must be a positive integer. 118 | 119 | ### Fuzzy matches 120 | 121 | You can also use fuzzy matching for terms that are likely to be misspelled: 122 | 123 | ```python 124 | >>> idx.search('yellow~1') 125 | [{'ref': 'b', 'score': 0.621155860224936, 'match_data': }, 126 | {'ref': 'a', 'score': 0.3040972809936496, 'match_data': }] 127 | ``` 128 | 129 | The positive integer after `~` represents the edit distance, in this case 1 character, either by addition, removal or transposition. 130 | 131 | ### Term presence (new in 0.3.0) 132 | 133 | As mentioned above, Lunr defaults to searching for logical OR on terms, but it is possible to specify the presence of each term in matching documents. The default OR behaviour is represented by the term's presence being *optional* in a matching document. To specify that a term must be present in a matching document, prefix the term with `+`; to specify that a term must *not* be included in a matching document, prefix the term with `-`. 134 | 135 | The example below searches for documents that must contain "green", might contain "plant" and must not contain "study": 136 | 137 | ```python 138 | >>> idx.search("+green plant -study") 139 | [{'ref': 'c', 140 | 'score': 0.08090317236904906, 141 | 'match_data': }] 142 | ``` 143 | 144 | Contrast this with the default behaviour: 145 | 146 | ```python 147 | >>> idx.search('green plant study') 148 | [{'ref': 'b', 149 | 'score': 0.5178296383103647, 150 | 'match_data': }, 151 | {'ref': 'a', 152 | 'score': 0.22147889214939157, 153 | 'match_data': }, 154 | {'ref': 'c', 155 | 'score': 0.06605716362553504, 156 | 'match_data': }] 157 | ``` 158 | 159 | To simulate a logical AND search of "yellow AND plant" mark both terms as required: 160 | 161 | ```python 162 | >>> idx.search('+yellow +plant') 163 | [{'ref': 'b', 164 | 'score': 0.8915374700737615, 165 | 'match_data': }] 166 | ``` 167 | 168 | As opposed to the default: 169 | 170 | ```python 171 | >>> idx.search('yellow plant') 172 | [{'ref': 'b', 173 | 'score': 0.8915374700737615, 174 | 'match_data': }, 175 | {'ref': 'c', 176 | 'score': 0.045333674172311975, 177 | 'match_data': }] 178 | ``` 179 | 180 | Note that presence can also be combined with any of the other modifiers described above.
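For instance, a single query string can require a term in one field while prohibiting a wildcard term in another. A quick sketch against the example index above (assuming the same corpus; exact scores elided):

```python
>>> # documents must contain "plant" in the title and must not contain
>>> # any term starting with "mu" in the body, which should leave only "b"
>>> idx.search('+title:plant -body:mu*')
[{'ref': 'b', 'score': ..., 'match_data': ...}]
```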
181 | -------------------------------------------------------------------------------- /docs/indices.md: -------------------------------------------------------------------------------- 1 | # Building indices 2 | 3 | We briefly skimmed over creating indices in Lunr in the [searching](./usage.md) section, let's go into more detail around what we need to build a Lunr index. 4 | 5 | ## The `lunr` function 6 | 7 | The main entry point to Lunr is the `lunr` function. It provides a simple way to create an index, define fields we're interested in and start indexing a corpus of documents. 8 | 9 | We do that simply by providing: 10 | 11 | - A `ref` string specifying the field in the documents that should be used as a key for each document. 12 | - A `fields` list, which defines the fields in the documents that should be added to the index. 13 | - A `documents` list, including a set of dictionaries representing the documents we want to index. 14 | 15 | And that's it. The `lunr` function will create an index, configure it, add the documents and return the `lunr.Index` for you to start searching. 16 | 17 | ## Build time boosts 18 | 19 | > New in version 0.4.0 20 | 21 | Lunr also provides some very useful functionality for boosting at index building time. There are two types of boosts you can include: field boosts and document boosts. 22 | 23 | ### Field boosts 24 | 25 | Field boosts let Lunr know that, when searching, we care more about some fields than others, a typical example is adding a boost on the *title* of our documents so when searching for a term, if it is found in the title, the document will score higher. 26 | 27 | To include a field boost we use the `fields` argument of the `lunr` function, instead of passing a list of strings as usual, we pass a list of dictionaries with two keys: 28 | 29 | - `field_name` whose value will be the name of the field in the documents we want to index. 30 | - `boost` an integer to be multiplied to the score when a match is found on this field. 31 | 32 | For example: 33 | 34 | ```python 35 | >>> from lunr import lunr 36 | >>> documents = [{ 37 | ...: 'id': 'a', 38 | ...: 'title': 'Mr. Green kills Colonel Mustard', 39 | ...: 'body': """Mr. Green killed Colonel Mustard in the study with the 40 | ...: candlestick. Mr. Green is not a very nice fellow.""" 41 | ...: }, { 42 | ...: 'id': 'b', 43 | ...: 'title': 'Plumb waters plant', 44 | ...: 'body': 'Professor Plumb has a green and a yellow plant in his study', 45 | ...: }, { 46 | ...: 'id': 'c', 47 | ...: 'title': 'Scarlett helps Professor', 48 | ...: 'body': """Miss Scarlett watered Professor Plumbs green plant 49 | ...: while he was away on his murdering holiday.""", 50 | ...: }] 51 | >>> idx = lunr( 52 | ...: ref='id', 53 | ...: fields=[dict(field_name='title', boost=10), 'body'], 54 | ...: documents=documents 55 | ...: ) 56 | ``` 57 | 58 | Note how we're passing a dictionary only for `title`, `body` will have a neutral value for `boost`. 59 | 60 | 61 | ```python 62 | >>> idx.search('plumb') 63 | [{'match_data': , 'ref': 'b', 'score': 1.599}, 64 | {'match_data': , 'ref': 'c', 'score': 0.13}] 65 | ``` 66 | 67 | Note how the score for document `b` is much higher thanks to our field boost. 68 | 69 | ### Document boosts 70 | 71 | Document boosts let Lunr know that some documents are more important than others, for example we would like an FAQ page to show up higher in searches. 
72 | 73 | In Lunr we do this via the `documents` argument to the `lunr` function: instead of passing a list of dictionaries, we pass a 2-tuple (or list) with the document dictionary as the first item and another dictionary as the second item. This second dictionary must have a single `boost` key with an integer to be applied to any matches on this particular document. 74 | 75 | ```python 76 | documents = [ 77 | { 78 | 'id': 'a', 79 | 'title': 'Mr. Green kills Colonel Mustard', 80 | 'body': """Mr. Green killed Colonel Mustard in the study with the 81 | candlestick. Mr. Green is not a very nice fellow.""" 82 | }, { 83 | 'id': 'b', 84 | 'title': 'Plumb waters plant', 85 | 'body': 'Professor Plumb has a green and a yellow plant in his study', 86 | }, ( 87 | { 88 | 'id': 'c', 89 | 'title': 'Scarlett helps Professor', 90 | 'body': """Miss Scarlett watered Professor Plumbs green plant 91 | while he was away on his murdering holiday.""", 92 | }, { 93 | 'boost': 10 94 | } 95 | )] 96 | ``` 97 | 98 | Note how the third member of the list is a tuple; now if we pass these documents to the `lunr` function and perform a search: 99 | 100 | ```python 101 | >>> idx = lunr(ref='id', fields=('title', 'body'), documents=documents) 102 | >>> idx.search('plumb') 103 | [{'match_data': , 'ref': 'c', 'score': 1.297}, 104 | {'match_data': , 'ref': 'b', 'score': 0.3}] 105 | ``` 106 | 107 | The score for `c` is now higher than `b` even though there are fewer matches, thanks to our document boost. 108 | 109 | ## Field extractors 110 | 111 | Up until now we've been working with fairly simple documents, but what if you have large nested documents and only want to index parts of them? 112 | 113 | For this Lunr provides *field extractors*, which are simply callables that Lunr can use to fetch the field in the document you want to index. If you do not provide one, as we've been doing, Lunr assumes there's a key matching the field name, e.g. `title` or `body`. 114 | 115 | To pass a field extractor to Lunr we, once again, use the `fields` argument to the `lunr` function. Similarly to what we did to define field boosts, we pass a list of dictionaries, but this time we add an `extractor` key whose value is a callable with a single argument, the document being processed. Lunr will call the extractor when fetching the indexed field and will use its result in our index. 116 | 117 | Imagine our documents have a slightly different form where the reference is at the top level but our fields are nested under a `content` key: 118 | 119 | ```python 120 | documents = [{ 121 | 'id': 'a', 122 | 'content': { 123 | 'title': 'Mr. Green kills Colonel Mustard', 124 | 'body': """Mr. Green killed Colonel Mustard in the study with the 125 | candlestick. Mr. Green is not a very nice fellow.""" 126 | } 127 | }, { 128 | 'id': 'b', 129 | 'content': { 130 | 'title': 'Plumb waters plant', 131 | 'body': 'Professor Plumb has a green and a yellow plant in his study', 132 | } 133 | }, { 134 | 'id': 'c', 135 | 'content': { 136 | 'title': 'Scarlett helps Professor', 137 | 'body': """Miss Scarlett watered Professor Plumbs green plant 138 | while he was away on his murdering holiday.""", 139 | } 140 | }] 141 | ``` 142 | 143 | To work around this we simply need to add field extractors, callables that take a document as an argument and return the content of the field; in this case a simple `lambda` will do: 144 | 145 | ```python 146 | >>> idx = lunr( 147 | ... ref='id', 148 | ... fields=[ 149 | ...
dict(field_name='title', extractor=lambda d: d['content']['title']), 150 | ... dict(field_name='body', extractor=lambda d: d['content']['body']) 151 | ... ], 152 | ... documents=documents) 153 | ``` 154 | 155 | We can now search the index as usual: 156 | 157 | ```python 158 | >>> idx.search('plumb') 159 | [{'ref': 'b', 'score': 0.3, 'match_data': } 160 | {'ref': 'c', 'score': 0.13, 'match_data': }] 161 | ``` 162 | -------------------------------------------------------------------------------- /tests/test_query_parser.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from lunr.query import Query, QueryPresence 4 | from lunr.query_parser import QueryParser 5 | from lunr.exceptions import QueryParseError 6 | 7 | 8 | def parse(q): 9 | query = Query(["title", "body"]) 10 | parser = QueryParser(q, query) 11 | 12 | parser.parse() 13 | return query.clauses 14 | 15 | 16 | class TestQueryParser: 17 | def test_parse_empty_string(self): 18 | clauses = parse("") 19 | assert len(clauses) == 0 20 | 21 | def test_parse_single_term(self): 22 | clauses = parse("foo") 23 | assert len(clauses) == 1 24 | clause = clauses[0] 25 | assert clause.term == "foo" 26 | assert clause.fields == ["title", "body"] 27 | assert clause.use_pipeline is True 28 | assert clause.presence is QueryPresence.OPTIONAL 29 | 30 | def test_parse_single_term_uppercase(self): 31 | clauses = parse("FOO") 32 | assert len(clauses) == 1 33 | clause = clauses[0] 34 | assert clause.term == "foo" 35 | assert clause.fields == ["title", "body"] 36 | assert clause.use_pipeline is True 37 | 38 | def test_parse_single_term_with_wildcard(self): 39 | clauses = parse("fo*") 40 | assert len(clauses) == 1 41 | clause = clauses[0] 42 | assert clause.term == "fo*" 43 | assert clause.use_pipeline is False 44 | 45 | def test_multiple_terms(self): 46 | clauses = parse("foo bar") 47 | assert len(clauses) == 2 48 | assert clauses[0].term == "foo" 49 | assert clauses[1].term == "bar" 50 | 51 | def test_term_with_presence_required_adds_required_clause(self): 52 | clauses = parse("+foo") 53 | assert len(clauses) == 1 54 | assert clauses[0].term == "foo" 55 | assert clauses[0].boost == 1 56 | assert clauses[0].fields == ["title", "body"] 57 | assert clauses[0].presence == QueryPresence.REQUIRED 58 | 59 | def test_term_with_presence_required_adds_prohibited_clause(self): 60 | clauses = parse("-foo") 61 | assert len(clauses) == 1 62 | assert clauses[0].term == "foo" 63 | assert clauses[0].boost == 1 64 | assert clauses[0].fields == ["title", "body"] 65 | assert clauses[0].presence == QueryPresence.PROHIBITED 66 | 67 | def test_term_scoped_by_field_with_presence_required(self): 68 | clauses = parse("+title:foo") 69 | assert len(clauses) == 1 70 | assert clauses[0].term == "foo" 71 | assert clauses[0].boost == 1 72 | assert clauses[0].fields == ["title"] 73 | assert clauses[0].presence == QueryPresence.REQUIRED 74 | 75 | def test_term_scoped_by_field_with_presence_prohibited(self): 76 | clauses = parse("-title:foo") 77 | assert len(clauses) == 1 78 | assert clauses[0].term == "foo" 79 | assert clauses[0].boost == 1 80 | assert clauses[0].fields == ["title"] 81 | assert clauses[0].presence == QueryPresence.PROHIBITED 82 | 83 | def test_multiple_terms_with_presence_creates_two_clauses(self): 84 | clauses = parse("+foo +bar") 85 | assert len(clauses) == 2 86 | assert clauses[0].term == "foo" 87 | assert clauses[1].term == "bar" 88 | assert clauses[0].presence == QueryPresence.REQUIRED 89 | assert 
clauses[1].presence == QueryPresence.REQUIRED 90 | 91 | def test_unknown_field(self): 92 | with pytest.raises(QueryParseError): 93 | parse("unknown:foo") 94 | 95 | def test_field_without_a_term(self): 96 | with pytest.raises(QueryParseError): 97 | parse("title:") 98 | 99 | def test_field_twice(self): 100 | with pytest.raises(QueryParseError): 101 | parse("title:title:") 102 | 103 | def test_term_with_field(self): 104 | clauses = parse("title:foo") 105 | assert len(clauses) == 1 106 | assert clauses[0].fields == ["title"] 107 | 108 | def test_uppercase_field_with_uppercase_term(self): 109 | query = Query(["TITLE"]) 110 | parser = QueryParser("TITLE:FOO", query) 111 | 112 | parser.parse() 113 | clauses = query.clauses 114 | 115 | assert len(clauses) == 1 116 | assert clauses[0].term == "foo" 117 | assert clauses[0].fields == ["TITLE"] 118 | 119 | def test_multiple_terms_scoped_to_different_fields(self): 120 | clauses = parse("title:foo body:bar") 121 | 122 | assert len(clauses) == 2 123 | assert clauses[0].fields == ["title"] 124 | assert clauses[1].fields == ["body"] 125 | 126 | assert clauses[0].term == "foo" 127 | assert clauses[1].term == "bar" 128 | 129 | def test_single_term_with_edit_distance(self): 130 | clauses = parse("foo~2") 131 | 132 | assert len(clauses) == 1 133 | assert clauses[0].term == "foo" 134 | assert clauses[0].fields == ["title", "body"] 135 | assert clauses[0].edit_distance == 2 136 | 137 | def test_multiple_terms_with_edit_distance(self): 138 | clauses = parse("foo~2 bar~3") 139 | 140 | assert len(clauses) == 2 141 | assert clauses[0].fields == ["title", "body"] 142 | assert clauses[1].fields == ["title", "body"] 143 | 144 | assert clauses[0].term == "foo" 145 | assert clauses[1].term == "bar" 146 | 147 | assert clauses[0].edit_distance == 2 148 | assert clauses[1].edit_distance == 3 149 | 150 | def test_single_term_scoped_to_field_with_edit_distance(self): 151 | clauses = parse("title:foo~2") 152 | 153 | assert len(clauses) == 1 154 | assert clauses[0].term == "foo" 155 | assert clauses[0].fields == ["title"] 156 | assert clauses[0].edit_distance == 2 157 | 158 | def test_non_numeric_edit_distance(self): 159 | with pytest.raises(QueryParseError): 160 | parse("foo~a") 161 | 162 | def test_edit_distance_without_a_term(self): 163 | with pytest.raises(QueryParseError): 164 | parse("~2") 165 | 166 | def test_single_term_with_boost(self): 167 | clauses = parse("foo^2") 168 | 169 | assert len(clauses) == 1 170 | assert clauses[0].term == "foo" 171 | assert clauses[0].fields == ["title", "body"] 172 | assert clauses[0].boost == 2 173 | 174 | def test_non_numeric_boost(self): 175 | with pytest.raises(QueryParseError): 176 | parse("foo^a") 177 | 178 | def test_boost_without_a_term(self): 179 | with pytest.raises(QueryParseError): 180 | parse("^2") 181 | 182 | def test_multiple_terms_with_boost(self): 183 | clauses = parse("foo^2 bar^3") 184 | 185 | assert len(clauses) == 2 186 | assert clauses[0].fields == ["title", "body"] 187 | assert clauses[1].fields == ["title", "body"] 188 | 189 | assert clauses[0].term == "foo" 190 | assert clauses[1].term == "bar" 191 | 192 | assert clauses[0].boost == 2 193 | assert clauses[1].boost == 3 194 | 195 | def test_term_scoped_by_field_with_boost(self): 196 | clauses = parse("title:foo^2") 197 | 198 | assert len(clauses) == 1 199 | assert clauses[0].term == "foo" 200 | assert clauses[0].fields == ["title"] 201 | assert clauses[0].boost == 2 202 | 203 | def test_term_with_boost_and_edit_distance(self): 204 | clauses = parse("foo^2~3") 205 | 
206 | assert len(clauses) == 1 207 | assert clauses[0].term == "foo" 208 | assert clauses[0].fields == ["title", "body"] 209 | assert clauses[0].edit_distance == 3 210 | assert clauses[0].boost == 2 211 | 212 | def test_edit_distance_followed_by_presence(self): 213 | clauses = parse("foo~10 +bar") 214 | 215 | assert len(clauses) == 2 216 | 217 | assert clauses[0].fields == ["title", "body"] 218 | assert clauses[1].fields == ["title", "body"] 219 | 220 | assert clauses[0].term == "foo" 221 | assert clauses[1].term == "bar" 222 | 223 | assert clauses[0].edit_distance == 10 224 | assert clauses[1].edit_distance == 0 225 | 226 | assert clauses[0].presence == QueryPresence.OPTIONAL 227 | assert clauses[1].presence == QueryPresence.REQUIRED 228 | 229 | def test_boost_followed_by_presence(self): 230 | clauses = parse("foo^10 +bar") 231 | 232 | assert len(clauses) == 2 233 | 234 | assert clauses[0].fields == ["title", "body"] 235 | assert clauses[1].fields == ["title", "body"] 236 | 237 | assert clauses[0].term == "foo" 238 | assert clauses[1].term == "bar" 239 | 240 | assert clauses[0].boost == 10 241 | assert clauses[1].boost == 1 242 | 243 | assert clauses[0].presence == QueryPresence.OPTIONAL 244 | assert clauses[1].presence == QueryPresence.REQUIRED 245 | -------------------------------------------------------------------------------- /tests/test_pipeline.py: -------------------------------------------------------------------------------- 1 | from mock import patch 2 | 3 | import pytest 4 | 5 | from lunr.exceptions import BaseLunrException 6 | from lunr.pipeline import Pipeline 7 | 8 | 9 | def noop(*args, **kwargs): 10 | pass 11 | 12 | 13 | def fn(*args, **kwargs): 14 | pass 15 | 16 | 17 | class BaseTestPipeline: 18 | @pytest.fixture(autouse=True) 19 | def setup_mock_pipline(self, monkeypatch): 20 | monkeypatch.setattr(Pipeline, "registered_functions", {}) 21 | monkeypatch.setattr(Pipeline, "warn_if_function_not_registered", noop) 22 | self.pipeline = Pipeline() 23 | 24 | 25 | class TestAdd(BaseTestPipeline): 26 | def test_add_function_to_pipeline(self): 27 | self.pipeline.add(noop) 28 | assert len(self.pipeline) == 1 29 | 30 | def test_add_multiple_functions_to_pipeline(self): 31 | self.pipeline.add(noop, noop) 32 | assert len(self.pipeline) == 2 33 | 34 | def test_add_warns_if_function_not_registered(self, monkeypatch): 35 | monkeypatch.undo() 36 | with patch("lunr.pipeline.log") as mock_log: 37 | self.pipeline.add(lambda x: x) 38 | mock_log.warning.assert_called_once() 39 | 40 | 41 | class TestRemove(BaseTestPipeline): 42 | def test_remove_function_exists_in_pipeline(self): 43 | self.pipeline.add(noop) 44 | assert len(self.pipeline) == 1 45 | 46 | self.pipeline.remove(noop) 47 | assert len(self.pipeline) == 0 48 | 49 | def test_remove_function_does_not_exist_in_pipeline(self): 50 | 51 | self.pipeline.add(noop) 52 | assert len(self.pipeline) == 1 53 | 54 | self.pipeline.remove(fn) 55 | assert len(self.pipeline) == 1 56 | 57 | 58 | class TestBefore(BaseTestPipeline): 59 | def test_before_other_function_exists(self): 60 | self.pipeline.add(noop) 61 | self.pipeline.before(noop, fn) 62 | 63 | assert self.pipeline._stack == [fn, noop] 64 | 65 | def test_before_other_functions_does_not_exist(self): 66 | with pytest.raises(BaseLunrException): 67 | self.pipeline.before(noop, fn) 68 | 69 | assert len(self.pipeline) == 0 70 | 71 | 72 | class TestAfter(BaseTestPipeline): 73 | def test_after_other_function_exists(self): 74 | self.pipeline.add(noop) 75 | self.pipeline.after(noop, fn) 76 | 77 | assert 
self.pipeline._stack == [noop, fn] 78 | 79 | def test_after_other_function_does_not_exist(self): 80 | with pytest.raises(BaseLunrException): 81 | self.pipeline.after(noop, fn) 82 | 83 | assert len(self.pipeline) == 0 84 | 85 | 86 | class TestRun(BaseTestPipeline): 87 | def test_run_calling_each_function_for_each_token(self): 88 | count_1 = [] 89 | count_2 = [] 90 | 91 | def fn1(t, *args): 92 | count_1.append(1) 93 | return t 94 | 95 | def fn2(t, *args): 96 | count_2.append(1) 97 | return t 98 | 99 | self.pipeline.add(fn1, fn2) 100 | self.pipeline.run([1, 2, 3]) 101 | 102 | assert len(count_1) == 3 103 | assert len(count_2) == 3 104 | 105 | def test_run_passes_token_to_pipeline_function(self): 106 | def fn(token, *args): 107 | assert token == "foo" 108 | 109 | self.pipeline.add(fn) 110 | self.pipeline.run(["foo"]) 111 | 112 | def test_run_passes_index_to_pipeline_function(self): 113 | def fn(_, index, *args): 114 | assert index == 0 115 | 116 | self.pipeline.add(fn) 117 | self.pipeline.run(["foo"]) 118 | 119 | def test_run_passes_entire_token_list_to_pipeline_function(self): 120 | def fn(_, __, tokens): 121 | assert tokens == ["foo"] 122 | 123 | self.pipeline.add(fn) 124 | self.pipeline.run(["foo"]) 125 | 126 | def test_run_passes_output_of_one_function_as_input_to_the_next(self): 127 | def fn1(t, *args): 128 | return t.upper() 129 | 130 | def fn2(t, *args): 131 | assert t == "FOO" 132 | 133 | self.pipeline.add(fn1, fn2) 134 | self.pipeline.run(["foo"]) 135 | 136 | def test_run_returns_the_results_of_the_last_function(self): 137 | def fn(t, *args): 138 | return t.upper() 139 | 140 | self.pipeline.add(fn) 141 | 142 | assert self.pipeline.run(["foo"]) == ["FOO"] 143 | 144 | def test_run_filters_out_none_and_empty_string_values(self): 145 | tokens = [] 146 | 147 | def fn1(t, i, _): 148 | if i % 2: 149 | return t 150 | elif i == 5: 151 | return "" 152 | 153 | def fn2(t, *args): 154 | tokens.append(t) 155 | return t 156 | 157 | self.pipeline.add(fn1) 158 | self.pipeline.add(fn2) 159 | 160 | output = self.pipeline.run(list("abcde")) 161 | 162 | assert tokens == ["b", "d"] 163 | assert output == ["b", "d"] 164 | 165 | def test_expanding_tokens_passed_to_output(self): 166 | self.pipeline.add(lambda t, *args: [t, t.upper()]) 167 | 168 | assert self.pipeline.run(["foo"]) == ["foo", "FOO"] 169 | 170 | def test_expanding_tokens_not_passed_to_same_function(self): 171 | received = [] 172 | 173 | def fn(t, *args): 174 | received.append(t) 175 | return [t, t.upper()] 176 | 177 | self.pipeline.add(fn) 178 | self.pipeline.run(["foo"]) 179 | 180 | assert received == ["foo"] 181 | 182 | def test_expanding_tokens_passed_to_the_next_pipeline_function(self): 183 | received = [] 184 | 185 | def fn1(t, *args): 186 | return [t, t.upper()] 187 | 188 | def fn2(t, *args): 189 | received.append(t) 190 | 191 | self.pipeline.add(fn1) 192 | self.pipeline.add(fn2) 193 | self.pipeline.run(["foo"]) 194 | 195 | assert received == ["foo", "FOO"] 196 | 197 | def test_skip_pipeline_function(self) -> None: 198 | def upper(t, *args): 199 | return t.upper() 200 | 201 | def lower(t, *args): 202 | return t.lower() 203 | 204 | self.pipeline.add(upper) 205 | self.pipeline.skip(upper, ["no_upper", "nothing"]) 206 | assert self.pipeline.run(["Foo"]) == ["FOO"] 207 | 208 | self.pipeline.add(lower) 209 | self.pipeline.skip(lower, ["no_lower", "nothing"]) 210 | assert self.pipeline.run(["Foo"]) == ["foo"] 211 | 212 | assert self.pipeline.run(["Foo"], field_name="no_lower") == ["FOO"] 213 | assert self.pipeline.run(["Foo"], 
field_name="no_upper") == ["foo"] 214 | assert self.pipeline.run(["Foo"], field_name="nothing") == ["Foo"] 215 | 216 | 217 | class TestSerialize(BaseTestPipeline): 218 | def test_serialize_returns_array_of_registered_function_labels(self): 219 | Pipeline.register_function(fn, "fn") 220 | self.pipeline.add(fn) 221 | 222 | assert self.pipeline.serialize() == ["fn"] 223 | assert repr(self.pipeline) == '' 224 | 225 | 226 | class TestRegisterFunction(BaseTestPipeline): 227 | def setup_method(self, method): 228 | def fn(*args): 229 | pass 230 | 231 | self.fn = fn 232 | 233 | def test_register_function_adds_a_label_property_to_the_function(self): 234 | Pipeline.register_function(self.fn, "fn") 235 | 236 | assert self.fn.label == "fn" 237 | 238 | def test_register_function_adds_defaults_to_name_of_the_function(self): 239 | Pipeline.register_function(self.fn) 240 | 241 | assert self.fn.label == self.fn.__name__ 242 | 243 | def test_register_function_adds_function_to_list_of_registered_functions(self): 244 | Pipeline.register_function(self.fn, "fn") 245 | 246 | assert Pipeline.registered_functions["fn"] == self.fn 247 | 248 | def test_register_function_warns_when_adding_function_with_same_label(self): 249 | Pipeline.register_function(self.fn, "fn") 250 | with patch("lunr.pipeline.log") as mock_log: 251 | Pipeline.register_function(self.fn, "fn") 252 | 253 | mock_log.warning.assert_called_once() 254 | 255 | 256 | class TestLoad(BaseTestPipeline): 257 | def test_load_with_registered_functions(self): 258 | serialized_pipeline = ["fn"] 259 | Pipeline.register_function(fn, "fn") 260 | 261 | pipeline = Pipeline.load(serialized_pipeline) 262 | 263 | assert len(pipeline) == 1 264 | assert pipeline._stack[0] == fn 265 | 266 | def test_load_with_unregistered_functions(self): 267 | serialized_pipeline = ["fn"] 268 | with pytest.raises(BaseLunrException): 269 | Pipeline.load(serialized_pipeline) 270 | 271 | 272 | class TestReset(BaseTestPipeline): 273 | def test_reset_empties_the_stack(self): 274 | self.pipeline.add(noop) 275 | assert len(self.pipeline) == 1 276 | 277 | self.pipeline.reset() 278 | assert len(self.pipeline) == 0 279 | -------------------------------------------------------------------------------- /tests/test_query_lexer.py: -------------------------------------------------------------------------------- 1 | from lunr.query_lexer import QueryLexer 2 | 3 | 4 | def _lex(string): 5 | lexer = QueryLexer(string) 6 | lexer.run() 7 | return lexer 8 | 9 | 10 | class TestQueryLexer: 11 | def test_single_term_produces_one_lexeme(self): 12 | lexer = _lex("foo") 13 | assert len(lexer.lexemes) == 1 14 | lexeme = lexer.lexemes[0] 15 | assert lexeme["type"] == QueryLexer.TERM 16 | assert lexeme["string"] == "foo" 17 | assert lexeme["start"] == 0 18 | assert lexeme["end"] == 3 19 | 20 | def test_term_escape_character(self): 21 | lexer = _lex("foo\\:bar") 22 | assert len(lexer.lexemes) == 1 23 | lexeme = lexer.lexemes[0] 24 | assert lexeme["type"] == QueryLexer.TERM 25 | assert lexeme["string"] == "foo:bar" 26 | assert lexeme["start"] == 0 27 | assert lexeme["end"] == 8 28 | 29 | def test_multiple_terms(self): 30 | lexer = _lex("foo bar") 31 | assert len(lexer.lexemes) == 2 32 | foo_lexeme, bar_lexeme = lexer.lexemes 33 | assert foo_lexeme["type"] == bar_lexeme["type"] == QueryLexer.TERM 34 | assert foo_lexeme["string"] == "foo" 35 | assert bar_lexeme["string"] == "bar" 36 | assert foo_lexeme["start"] == 0 37 | assert bar_lexeme["start"] == 4 38 | assert foo_lexeme["end"] == 3 39 | assert bar_lexeme["end"] == 7 
40 | 41 | def test_separator_length_greater_than_one(self): 42 | lexer = _lex("foo    bar") 43 | assert len(lexer.lexemes) == 2 44 | foo_lexeme, bar_lexeme = lexer.lexemes 45 | assert foo_lexeme["type"] == bar_lexeme["type"] == QueryLexer.TERM 46 | assert foo_lexeme["string"] == "foo" 47 | assert bar_lexeme["string"] == "bar" 48 | assert foo_lexeme["start"] == 0 49 | assert bar_lexeme["start"] == 7 50 | assert foo_lexeme["end"] == 3 51 | assert bar_lexeme["end"] == 10 52 | 53 | def test_hyphen_is_considered_a_separator(self): 54 | lexer = _lex("foo-bar") 55 | assert len(lexer.lexemes) == 2 56 | 57 | def test_term_with_field(self): 58 | lexer = _lex("title:foo") 59 | assert len(lexer.lexemes) == 2 60 | field_lexeme, term_lexeme = lexer.lexemes 61 | assert field_lexeme["type"] == QueryLexer.FIELD 62 | assert term_lexeme["type"] == QueryLexer.TERM 63 | assert field_lexeme["string"] == "title" 64 | assert term_lexeme["string"] == "foo" 65 | assert field_lexeme["start"] == 0 66 | assert term_lexeme["start"] == 6 67 | assert field_lexeme["end"] == 5 68 | assert term_lexeme["end"] == 9 69 | 70 | def test_term_with_field_with_escape_character(self): 71 | lexer = _lex("ti\\:tle:foo") 72 | assert len(lexer.lexemes) == 2 73 | field_lexeme, term_lexeme = lexer.lexemes 74 | assert field_lexeme["type"] == QueryLexer.FIELD 75 | assert term_lexeme["type"] == QueryLexer.TERM 76 | assert field_lexeme["string"] == "ti:tle" 77 | assert term_lexeme["string"] == "foo" 78 | assert field_lexeme["start"] == 0 79 | assert term_lexeme["start"] == 8 80 | assert field_lexeme["end"] == 7 81 | assert term_lexeme["end"] == 11 82 | 83 | def test_term_with_edit_distance(self): 84 | lexer = _lex("foo~2") 85 | assert len(lexer.lexemes) == 2 86 | term_lexeme, edit_distance_lexeme = lexer.lexemes 87 | assert term_lexeme["type"] == QueryLexer.TERM 88 | assert edit_distance_lexeme["type"] == QueryLexer.EDIT_DISTANCE 89 | assert term_lexeme["string"] == "foo" 90 | assert edit_distance_lexeme["string"] == "2" 91 | assert term_lexeme["start"] == 0 92 | assert edit_distance_lexeme["start"] == 4 93 | assert term_lexeme["end"] == 3 94 | assert edit_distance_lexeme["end"] == 5 95 | 96 | def test_term_with_boost(self): 97 | lexer = _lex("foo^10") 98 | assert len(lexer.lexemes) == 2 99 | term_lexeme, boost_lexeme = lexer.lexemes 100 | assert term_lexeme["type"] == QueryLexer.TERM 101 | assert boost_lexeme["type"] == QueryLexer.BOOST 102 | assert term_lexeme["string"] == "foo" 103 | assert boost_lexeme["string"] == "10" 104 | assert term_lexeme["start"] == 0 105 | assert boost_lexeme["start"] == 4 106 | assert term_lexeme["end"] == 3 107 | assert boost_lexeme["end"] == 6 108 | 109 | def test_term_with_field_boost_and_edit_distance(self): 110 | lexer = _lex("title:foo^10~5") 111 | assert len(lexer.lexemes) == 4 112 | field_lexeme, term_lexeme, boost_lexeme, edit_distance_lexeme = lexer.lexemes 113 | assert field_lexeme["type"] == QueryLexer.FIELD 114 | assert term_lexeme["type"] == QueryLexer.TERM 115 | assert boost_lexeme["type"] == QueryLexer.BOOST 116 | assert edit_distance_lexeme["type"] == QueryLexer.EDIT_DISTANCE 117 | 118 | assert field_lexeme["string"] == "title" 119 | assert term_lexeme["string"] == "foo" 120 | assert boost_lexeme["string"] == "10" 121 | assert edit_distance_lexeme["string"] == "5" 122 | 123 | assert field_lexeme["start"] == 0 124 | assert term_lexeme["start"] == 6 125 | assert boost_lexeme["start"] == 10 126 | assert edit_distance_lexeme["start"] == 13 127 | 128 | assert field_lexeme["end"] == 5 129 | assert
term_lexeme["end"] == 9 130 | assert boost_lexeme["end"] == 12 131 | assert edit_distance_lexeme["end"] == 14 132 | 133 | def test_single_term_with_hyphen_produces_two_lexemes(self): 134 | """Embedded hyphens should not be confused with presence operators.""" 135 | lexer = _lex("foo-bar") 136 | assert len(lexer.lexemes) == 2 137 | foo_lexeme, bar_lexeme = lexer.lexemes 138 | 139 | assert foo_lexeme["type"] == QueryLexer.TERM 140 | assert foo_lexeme["string"] == "foo" 141 | assert foo_lexeme["start"] == 0 142 | assert foo_lexeme["end"] == 3 143 | 144 | assert bar_lexeme["type"] == QueryLexer.TERM 145 | assert bar_lexeme["string"] == "bar" 146 | assert bar_lexeme["start"] == 4 147 | assert bar_lexeme["end"] == 7 148 | 149 | def test_single_term_with_presence_produces_two_lexemes(self): 150 | lexer = _lex("+foo") 151 | assert len(lexer.lexemes) == 2 152 | presence_lexeme, term_lexeme = lexer.lexemes 153 | 154 | assert presence_lexeme["type"] == QueryLexer.PRESENCE 155 | assert presence_lexeme["string"] == "+" 156 | assert presence_lexeme["start"] == 0 157 | assert presence_lexeme["end"] == 1 158 | 159 | assert term_lexeme["type"] == QueryLexer.TERM 160 | assert term_lexeme["string"] == "foo" 161 | assert term_lexeme["start"] == 1 162 | assert term_lexeme["end"] == 4 163 | 164 | def test_multiple_terms_with_presence_produces_four_lexemes(self): 165 | lexer = _lex("+foo +bar") 166 | assert len(lexer.lexemes) == 4 167 | ( 168 | foo_presence_lexeme, 169 | foo_term_lexeme, 170 | bar_presence_lexeme, 171 | bar_term_lexeme, 172 | ) = lexer.lexemes 173 | 174 | assert foo_term_lexeme["type"] == QueryLexer.TERM 175 | assert foo_term_lexeme["string"] == "foo" 176 | assert foo_term_lexeme["start"] == 1 177 | assert foo_term_lexeme["end"] == 4 178 | 179 | assert foo_presence_lexeme["type"] == QueryLexer.PRESENCE 180 | assert foo_presence_lexeme["string"] == "+" 181 | assert foo_presence_lexeme["start"] == 0 182 | assert foo_presence_lexeme["end"] == 1 183 | 184 | assert bar_term_lexeme["type"] == QueryLexer.TERM 185 | assert bar_term_lexeme["string"] == "bar" 186 | assert bar_term_lexeme["start"] == 6 187 | assert bar_term_lexeme["end"] == 9 188 | 189 | assert bar_presence_lexeme["type"] == QueryLexer.PRESENCE 190 | assert bar_presence_lexeme["string"] == "+" 191 | assert bar_presence_lexeme["start"] == 5 192 | assert bar_presence_lexeme["end"] == 6 193 | 194 | def test_multiple_terms_with_presence_and_fuzz(self): 195 | lexer = _lex("+foo~1 +bar") 196 | assert len(lexer.lexemes) == 5 197 | 198 | ( 199 | foo_presence_lexeme, 200 | foo_term_lexeme, 201 | foo_fuzz_lexeme, 202 | bar_presence_lexeme, 203 | bar_term_lexeme, 204 | ) = lexer.lexemes 205 | 206 | assert foo_presence_lexeme["type"] == QueryLexer.PRESENCE 207 | assert foo_presence_lexeme["string"] == "+" 208 | assert foo_presence_lexeme["start"] == 0 209 | assert foo_presence_lexeme["end"] == 1 210 | 211 | assert foo_term_lexeme["type"] == QueryLexer.TERM 212 | assert foo_term_lexeme["string"] == "foo" 213 | assert foo_term_lexeme["start"] == 1 214 | assert foo_term_lexeme["end"] == 4 215 | 216 | assert foo_fuzz_lexeme["type"] == QueryLexer.EDIT_DISTANCE 217 | assert foo_fuzz_lexeme["string"] == "1" 218 | assert foo_fuzz_lexeme["start"] == 5 219 | assert foo_fuzz_lexeme["end"] == 6 220 | 221 | assert bar_presence_lexeme["type"] == QueryLexer.PRESENCE 222 | assert bar_presence_lexeme["string"] == "+" 223 | assert bar_presence_lexeme["start"] == 7 224 | assert bar_presence_lexeme["end"] == 8 225 | 226 | assert bar_term_lexeme["type"] == QueryLexer.TERM 
227 | assert bar_term_lexeme["string"] == "bar" 228 | assert bar_term_lexeme["start"] == 8 229 | assert bar_term_lexeme["end"] == 11 230 | -------------------------------------------------------------------------------- /tests/test_token_set.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from lunr.token_set import TokenSet 4 | from lunr.exceptions import BaseLunrException 5 | 6 | 7 | class TestTokenSetStr: 8 | def test_str_includes_node_finality(self): 9 | non_final = TokenSet() 10 | final = TokenSet() 11 | other_final = TokenSet() 12 | 13 | final.final = True 14 | other_final.final = True 15 | 16 | assert str(non_final) != str(final) 17 | assert str(other_final) == str(final) 18 | 19 | def test_str_includes_all_edges(self): 20 | zero_edges = TokenSet() 21 | one_edge = TokenSet() 22 | two_edges = TokenSet() 23 | 24 | one_edge.edges["a"] = 1 25 | two_edges.edges["a"] = 1 26 | two_edges.edges["b"] = 1 27 | 28 | assert str(zero_edges) != str(one_edge) 29 | assert str(two_edges) != str(one_edge) 30 | assert str(two_edges) != str(zero_edges) 31 | 32 | def test_str_includes_edge_id(self): 33 | child_a = TokenSet() 34 | child_b = TokenSet() 35 | parent_a = TokenSet() 36 | parent_b = TokenSet() 37 | parent_c = TokenSet() 38 | 39 | parent_a.edges["a"] = child_a 40 | parent_b.edges["a"] = child_b 41 | parent_c.edges["a"] = child_b 42 | 43 | assert str(parent_b) == str(parent_c) 44 | assert str(parent_a) != str(parent_c) 45 | assert str(parent_a) != str(parent_b) 46 | 47 | 48 | class TestTokenSetFromString: 49 | def test_from_string_without_wildcard(self): 50 | TokenSet._next_id = 1 51 | x = TokenSet.from_string("a") 52 | 53 | assert str(x) == "0a2" 54 | assert x.edges["a"].final 55 | 56 | def test_from_string_with_trailing_wildcard(self): 57 | x = TokenSet.from_string("a*") 58 | wild = x.edges["a"].edges["*"] 59 | 60 | assert wild == wild.edges["*"] 61 | assert wild.final 62 | 63 | 64 | class TestTokenSetFromList: 65 | def test_from_list_with_unsorted_list(self): 66 | with pytest.raises(BaseLunrException): 67 | TokenSet.from_list(["z", "a"]) 68 | 69 | def test_from_list_with_sorted_list(self): 70 | token_set = TokenSet.from_list(["a", "z"]) 71 | assert ["a", "z"] == sorted(token_set.to_list()) 72 | 73 | def test_from_list_is_minimal(self): 74 | token_set = TokenSet.from_list(["ac", "dc"]) 75 | ac_node = token_set.edges["a"].edges["c"] 76 | dc_node = token_set.edges["d"].edges["c"] 77 | 78 | assert ac_node == dc_node 79 | 80 | 81 | class TestTokenSetToList: 82 | def test_to_list_includes_all_words(self): 83 | words = ["bat", "cat"] 84 | token_set = TokenSet.from_list(words) 85 | 86 | assert set(words) == set(token_set.to_list()) 87 | 88 | def test_to_list_includes_single_words(self): 89 | word = "bat" 90 | token_set = TokenSet.from_string(word) 91 | 92 | assert {word} == set(token_set.to_list()) 93 | 94 | 95 | class TestTokenSetIntersect: 96 | def test_no_intersection(self): 97 | x = TokenSet.from_string("cat") 98 | y = TokenSet.from_string("bar") 99 | z = x.intersect(y) 100 | 101 | assert len(z.to_list()) == 0 102 | 103 | def test_simple_intersection(self): 104 | x = TokenSet.from_string("cat") 105 | y = TokenSet.from_string("cat") 106 | z = x.intersect(y) 107 | 108 | assert {"cat"} == set(z.to_list()) 109 | 110 | def test_trailing_wildcard_intersection(self): 111 | x = TokenSet.from_string("cat") 112 | y = TokenSet.from_string("c*") 113 | z = x.intersect(y) 114 | 115 | assert {"cat"} == set(z.to_list()) 116 | 117 | def 
test_trailing_wildcard_no_intersection(self): 118 | x = TokenSet.from_string("cat") 119 | y = TokenSet.from_string("b*") 120 | z = x.intersect(y) 121 | 122 | assert len(z.to_list()) == 0 123 | 124 | def test_leading_wildcard_intersection(self): 125 | x = TokenSet.from_string("cat") 126 | y = TokenSet.from_string("*t") 127 | z = x.intersect(y) 128 | 129 | assert {"cat"} == set(z.to_list()) 130 | 131 | def test_leading_wildcard_no_intersection(self): 132 | x = TokenSet.from_string("cat") 133 | y = TokenSet.from_string("*r") 134 | z = x.intersect(y) 135 | 136 | assert len(z.to_list()) == 0 137 | 138 | def test_contained_wildcard_intersection(self): 139 | x = TokenSet.from_string("foo") 140 | y = TokenSet.from_string("f*o") 141 | z = x.intersect(y) 142 | 143 | assert {"foo"} == set(z.to_list()) 144 | 145 | def test_contained_wildcard_no_intersection(self): 146 | x = TokenSet.from_string("foo") 147 | y = TokenSet.from_string("b*r") 148 | z = x.intersect(y) 149 | 150 | assert len(z.to_list()) == 0 151 | 152 | def test_wildcard_zero_or_more_characters(self): 153 | x = TokenSet.from_string("foo") 154 | y = TokenSet.from_string("foo*") 155 | z = x.intersect(y) 156 | 157 | assert {"foo"} == set(z.to_list()) 158 | 159 | def test_with_fuzzy_string_substitution(self): 160 | x1 = TokenSet.from_string("bar") 161 | x2 = TokenSet.from_string("cur") 162 | x3 = TokenSet.from_string("cat") 163 | x4 = TokenSet.from_string("car") 164 | x5 = TokenSet.from_string("foo") 165 | y = TokenSet.from_fuzzy_string("car", 1) 166 | 167 | assert x1.intersect(y).to_list() == ["bar"] 168 | assert x2.intersect(y).to_list() == ["cur"] 169 | assert x3.intersect(y).to_list() == ["cat"] 170 | assert x4.intersect(y).to_list() == ["car"] 171 | assert x5.intersect(y).to_list() == [] 172 | 173 | def test_with_fuzzy_string_deletion(self): 174 | x1 = TokenSet.from_string("ar") 175 | x2 = TokenSet.from_string("br") 176 | x3 = TokenSet.from_string("ba") 177 | x4 = TokenSet.from_string("bar") 178 | x5 = TokenSet.from_string("foo") 179 | y = TokenSet.from_fuzzy_string("bar", 1) 180 | 181 | assert x1.intersect(y).to_list() == ["ar"] 182 | assert x2.intersect(y).to_list() == ["br"] 183 | assert x3.intersect(y).to_list() == ["ba"] 184 | assert x4.intersect(y).to_list() == ["bar"] 185 | assert x5.intersect(y).to_list() == [] 186 | 187 | def test_with_fuzzy_string_insertion(self): 188 | x1 = TokenSet.from_string("bbar") 189 | x2 = TokenSet.from_string("baar") 190 | x3 = TokenSet.from_string("barr") 191 | x4 = TokenSet.from_string("bar") 192 | x5 = TokenSet.from_string("ba") 193 | x6 = TokenSet.from_string("foo") 194 | x7 = TokenSet.from_string("bara") 195 | y = TokenSet.from_fuzzy_string("bar", 1) 196 | 197 | assert x1.intersect(y).to_list() == ["bbar"] 198 | assert x2.intersect(y).to_list() == ["baar"] 199 | assert x3.intersect(y).to_list() == ["barr"] 200 | assert x4.intersect(y).to_list() == ["bar"] 201 | assert x5.intersect(y).to_list() == ["ba"] 202 | assert x6.intersect(y).to_list() == [] 203 | assert x7.intersect(y).to_list() == ["bara"] 204 | 205 | def test_with_fuzzy_string_transpose(self): 206 | x1 = TokenSet.from_string("abr") 207 | x2 = TokenSet.from_string("bra") 208 | x3 = TokenSet.from_string("foo") 209 | y = TokenSet.from_fuzzy_string("bar", 1) 210 | 211 | assert x1.intersect(y).to_list() == ["abr"] 212 | assert x2.intersect(y).to_list() == ["bra"] 213 | assert x3.intersect(y).to_list() == [] 214 | 215 | def test_fuzzy_string_insertion(self): 216 | x = TokenSet.from_string("abcxx") 217 | y = TokenSet.from_fuzzy_string("abc", 2) 
218 | 219 | assert x.intersect(y).to_list() == ["abcxx"] 220 | 221 | def test_fuzzy_string_substitution(self): 222 | x = TokenSet.from_string("axx") 223 | y = TokenSet.from_fuzzy_string("abc", 2) 224 | 225 | assert x.intersect(y).to_list() == ["axx"] 226 | 227 | def test_fuzzy_string_deletion(self): 228 | x = TokenSet.from_string("a") 229 | y = TokenSet.from_fuzzy_string("abc", 2) 230 | 231 | assert x.intersect(y).to_list() == ["a"] 232 | 233 | def test_fuzzy_string_transpose(self): 234 | x = TokenSet.from_string("bca") 235 | y = TokenSet.from_fuzzy_string("abc", 2) 236 | 237 | assert x.intersect(y).to_list() == ["bca"] 238 | 239 | def test_leading_wildcard_backtracking_intersection(self): 240 | x = TokenSet.from_string("aaacbab") 241 | y = TokenSet.from_string("*ab") 242 | 243 | assert x.intersect(y).to_list() == ["aaacbab"] 244 | 245 | def test_leading_wildcard_backtracking_no_intersection(self): 246 | x = TokenSet.from_string("aaacbab") 247 | y = TokenSet.from_string("*abc") 248 | 249 | assert x.intersect(y).to_list() == [] 250 | 251 | def test_contained_wildcard_backtracking_intersection(self): 252 | x = TokenSet.from_string("ababc") 253 | y = TokenSet.from_string("a*bc") 254 | 255 | assert x.intersect(y).to_list() == ["ababc"] 256 | 257 | def test_contained_wildcard_backtracking_no_intersection(self): 258 | x = TokenSet.from_string("ababc") 259 | y = TokenSet.from_string("a*ac") 260 | 261 | assert x.intersect(y).to_list() == [] 262 | 263 | @pytest.mark.timeout(2) 264 | def test_catastrophic_backtracking_with_leading_characters(self): 265 | x = TokenSet.from_string("f" * 100) 266 | y = TokenSet.from_string("*f") 267 | 268 | assert len(x.intersect(y).to_list()) == 1 269 | 270 | def test_leading_trailing_wildcard_backtracking_intersection(self): 271 | x = TokenSet.from_string("acbaabab") 272 | y = TokenSet.from_string("*ab*") 273 | 274 | assert x.intersect(y).to_list() == ["acbaabab"] 275 | 276 | def test_leading_atrailing_wildcard_backtracking_intersection(self): 277 | x = TokenSet.from_string("acbaabab") 278 | y = TokenSet.from_string("a*ba*b") 279 | 280 | assert x.intersect(y).to_list() == ["acbaabab"] 281 | -------------------------------------------------------------------------------- /lunr/token_set.py: -------------------------------------------------------------------------------- 1 | class TokenSet: 2 | """ 3 | A token set is used to store the unique list of all tokens 4 | within an index. Token sets are also used to represent an 5 | incoming query to the index, this query token set and index 6 | token set are then intersected to find which tokens to look 7 | up in the inverted index. 8 | 9 | A token set can hold multiple tokens, as in the case of the 10 | index token set, or it can hold a single token as in the 11 | case of a simple query token set. 12 | 13 | Additionally token sets are used to perform wildcard matching. 14 | Leading, contained and trailing wildcards are supported, and 15 | from this edit distance matching can also be provided. 16 | 17 | Token sets are implemented as a minimal finite state automata, 18 | where both common prefixes and suffixes are shared between tokens. 19 | This helps to reduce the space used for storing the token set. 
20 | 21 | TODO: consider https://github.com/glyph/automat 22 | """ 23 | 24 | _next_id = 1 25 | 26 | def __init__(self): 27 | self.final = False 28 | self.edges = {} 29 | self.id = self._next_id 30 | self.__class__._next_id += 1 31 | 32 | def __str__(self): 33 | try: 34 | return self._string 35 | except AttributeError: 36 | pass 37 | 38 | string = "1" if self.final else "0" 39 | for label in sorted(list(self.edges.keys())): 40 | node = self.edges[label] 41 | try: 42 | node_id = str(node.id) 43 | except AttributeError: 44 | # TODO: JS seems to rely on undefined for the id attribute? 45 | node_id = "" 46 | 47 | string = string + label + node_id 48 | 49 | return string 50 | 51 | def __repr__(self): 52 | return '<TokenSet "{}">'.format(str(self)) 53 | 54 | @classmethod 55 | def from_string(cls, string): 56 | """Creates a TokenSet from a string. 57 | 58 | The string may contain one or more wildcard characters (*) that will 59 | allow wildcard matching when intersecting with another TokenSet. 60 | """ 61 | node = TokenSet() 62 | root = node 63 | 64 | # Iterates through all characters in the passed string appending 65 | # a node for each character. 66 | # When a wildcard character is found then a self referencing edge 67 | # is introduced to continually match any number of characters 68 | for i, char in enumerate(string): 69 | final = i == len(string) - 1 70 | if char == "*": 71 | node.edges[char] = node 72 | node.final = final 73 | else: 74 | next_ = TokenSet() 75 | next_.final = final 76 | node.edges[char] = next_ 77 | node = next_ 78 | 79 | return root 80 | 81 | @classmethod 82 | def from_fuzzy_string(cls, string, edit_distance): 83 | """Creates a token set representing a single string with a specified 84 | edit distance. 85 | 86 | Insertions, deletions, substitutions and transpositions are each 87 | treated as an edit distance of 1. 88 | 89 | Increasing the allowed edit distance will have a dramatic impact 90 | on the performance of both creating and intersecting these TokenSets. 91 | It is advised to keep the edit distance less than 3.
92 | """ 93 | root = TokenSet() 94 | 95 | stack = [{"node": root, "edits_remaining": edit_distance, "string": string}] 96 | 97 | while stack: 98 | frame = stack.pop() 99 | # no edit 100 | if len(frame["string"]) > 0: 101 | char = frame["string"][0] 102 | no_edit_node = None 103 | if char in frame["node"].edges: 104 | no_edit_node = frame["node"].edges[char] 105 | else: 106 | no_edit_node = TokenSet() 107 | frame["node"].edges[char] = no_edit_node 108 | 109 | if len(frame["string"]) == 1: 110 | no_edit_node.final = True 111 | 112 | stack.append( 113 | { 114 | "node": no_edit_node, 115 | "edits_remaining": frame["edits_remaining"], 116 | "string": frame["string"][1:], 117 | } 118 | ) 119 | 120 | if frame["edits_remaining"] == 0: 121 | continue 122 | 123 | # insertion, can only do insertion if there are edits remaining 124 | if "*" in frame["node"].edges: 125 | insertion_node = frame["node"].edges["*"] 126 | else: 127 | insertion_node = TokenSet() 128 | frame["node"].edges["*"] = insertion_node 129 | 130 | if len(frame["string"]) == 0: 131 | insertion_node.final = True 132 | 133 | stack.append( 134 | { 135 | "node": insertion_node, 136 | "edits_remaining": frame["edits_remaining"] - 1, 137 | "string": frame["string"], 138 | } 139 | ) 140 | 141 | # deletion, can only do a deletion if we have enough edits 142 | # remaining and if there are characters left to delete in the string 143 | if len(frame["string"]) > 1: 144 | stack.append( 145 | { 146 | "node": frame["node"], 147 | "edits_remaining": frame["edits_remaining"] - 1, 148 | "string": frame["string"][1:], 149 | } 150 | ) 151 | 152 | # deletion, just removing the last character of the string 153 | if len(frame["string"]) == 1: 154 | frame["node"].final = True 155 | 156 | # substitution, can only do a substitution if we have enough edits 157 | # remaining and there are characters left to substitute 158 | if len(frame["string"]) >= 1: 159 | if "*" in frame["node"].edges: 160 | substitution_node = frame["node"].edges["*"] 161 | else: 162 | substitution_node = TokenSet() 163 | frame["node"].edges["*"] = substitution_node 164 | 165 | if len(frame["string"]) == 1: 166 | substitution_node.final = True 167 | 168 | stack.append( 169 | { 170 | "node": substitution_node, 171 | "edits_remaining": frame["edits_remaining"] - 1, 172 | "string": frame["string"][1:], 173 | } 174 | ) 175 | 176 | # transposition, can only do a transposition if there are edits 177 | # remaining and there are enough characters to transpose 178 | if frame["edits_remaining"] and len(frame["string"]) > 1: 179 | char_a = frame["string"][0] 180 | char_b = frame["string"][1] 181 | transpose_node = None 182 | 183 | if char_b in frame["node"].edges: 184 | transpose_node = frame["node"].edges[char_b] 185 | else: 186 | transpose_node = TokenSet() 187 | frame["node"].edges[char_b] = transpose_node 188 | 189 | if len(frame["string"]) == 1: 190 | transpose_node.final = True 191 | 192 | stack.append( 193 | { 194 | "node": transpose_node, 195 | "edits_remaining": frame["edits_remaining"] - 1, 196 | "string": char_a + frame["string"][2:], 197 | } 198 | ) 199 | 200 | return root 201 | 202 | @classmethod 203 | def from_list(cls, list_of_words): 204 | from lunr.token_set_builder import TokenSetBuilder 205 | 206 | builder = TokenSetBuilder() 207 | for word in list_of_words: 208 | builder.insert(word) 209 | 210 | builder.finish() 211 | return builder.root 212 | 213 | @classmethod 214 | def from_clause(cls, clause): 215 | if clause.edit_distance: 216 | return cls.from_fuzzy_string(clause.term, 
clause.edit_distance) 217 | else: 218 | return cls.from_string(clause.term) 219 | 220 | def to_list(self): 221 | words = [] 222 | stack = [{"prefix": "", "node": self}] 223 | 224 | while stack: 225 | frame = stack.pop() 226 | if frame["node"].final: 227 | words.append(frame["prefix"]) 228 | 229 | for edge in frame["node"].edges.keys(): 230 | stack.append( 231 | { 232 | "prefix": frame["prefix"] + str(edge), 233 | "node": frame["node"].edges[edge], 234 | } 235 | ) 236 | 237 | return words 238 | 239 | def intersect(self, other): 240 | """Returns a new TokenSet that is the intersection of this TokenSet 241 | and the passed TokenSet. 242 | 243 | This intersection will take into account any wildcards contained within 244 | the TokenSet. 245 | """ 246 | output = TokenSet() 247 | stack = [{"node": self, "q_node": other, "output": output}] 248 | 249 | while stack: 250 | frame = stack.pop() 251 | for q_edge in frame["q_node"].edges.keys(): 252 | for n_edge in frame["node"].edges.keys(): 253 | if n_edge == q_edge or q_edge == "*": 254 | node = frame["node"].edges[n_edge] 255 | q_node = frame["q_node"].edges[q_edge] 256 | final = node.final and q_node.final 257 | next_ = None 258 | 259 | if n_edge in frame["output"].edges: 260 | next_ = frame["output"].edges[n_edge] 261 | next_.final = next_.final or final 262 | else: 263 | next_ = TokenSet() 264 | next_.final = final 265 | frame["output"].edges[n_edge] = next_ 266 | 267 | stack.append({"node": node, "q_node": q_node, "output": next_}) 268 | 269 | return output 270 | --------------------------------------------------------------------------------
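A minimal usage sketch of the TokenSet API listed above; it is not a file in the repository, just an illustration of the wildcard and fuzzy matching described in the docstrings. The import path and the methods from_list, from_string, from_fuzzy_string, intersect and to_list are taken directly from lunr/token_set.py and tests/test_token_set.py; the expected outputs in the comments follow the semantics those tests exercise.

from lunr.token_set import TokenSet

# Index-side automaton; from_list requires its input to be sorted.
corpus = TokenSet.from_list(["car", "cart", "cat"])

# Query-side automaton with a trailing wildcard: the "*" node self-loops,
# so it matches zero or more additional characters.
query = TokenSet.from_string("car*")
print(sorted(query.intersect(corpus).to_list()))  # ['car', 'cart']

# Fuzzy query: accepts anything within one insertion, deletion,
# substitution or transposition of "cat".
fuzzy = TokenSet.from_fuzzy_string("cat", 1)
print(sorted(fuzzy.intersect(corpus).to_list()))  # ['car', 'cart', 'cat']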