├── tests ├── __init__.py ├── acceptance_tests │ ├── javascript │ │ ├── .eslintrc.js │ │ ├── mkdocs_serialization.js │ │ ├── mkdocs_query.js │ │ ├── package.json │ │ ├── language_serialize_index.js │ │ ├── mkdocs_load_serialized_index_and_search.js │ │ ├── language_query.js │ │ └── language_load_serialized_index_and_search.js │ ├── test_mkdocs.py │ ├── test_language_support.py │ └── fixtures │ │ ├── lang_es.json │ │ └── lang_es_en.json ├── test_complete_set.py ├── test_stemmer.py ├── conftest.py ├── test_trimmer.py ├── test_field_ref.py ├── test_token.py ├── test_serialization.py ├── utils.py ├── fixtures │ └── stemming_vocab.json ├── test_stop_word_filter.py ├── test_plugins.py ├── test_match_data.py ├── benchmarks.py ├── test_index.py ├── test_language_support.py ├── test_tokenizer.py ├── test_query.py ├── test_vector.py ├── test_builder.py ├── test_query_parser.py ├── test_pipeline.py ├── test_query_lexer.py └── test_token_set.py ├── setup.cfg ├── docs ├── changelog.md ├── Makefile ├── conf.py ├── languages.md ├── lunrjs-interop.md ├── index.md ├── customisation.md ├── usage.md └── indices.md ├── coverageio_token.txt ├── requirements ├── docs.txt ├── test.txt └── dev.txt ├── MANIFEST.in ├── lunr ├── exceptions.py ├── __init__.py ├── utils.py ├── trimmer.py ├── idf.py ├── languages │ ├── trimmer.py │ ├── stemmer.py │ └── __init__.py ├── token.py ├── field_ref.py ├── token_set_builder.py ├── tokenizer.py ├── match_data.py ├── __main__.py ├── stop_word_filter.py ├── query_lexer.py ├── query.py ├── vector.py ├── query_parser.py ├── pipeline.py └── token_set.py ├── .github ├── dependabot.yml └── workflows │ └── test-suite.yml ├── .gitignore ├── LICENSE ├── Makefile ├── tox.ini ├── setup.py ├── readme.rst ├── CHANGELOG.md └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [wheel] 2 | universal = 1 3 | -------------------------------------------------------------------------------- /docs/changelog.md: -------------------------------------------------------------------------------- 1 | ```{include} ../CHANGELOG.md 2 | ``` -------------------------------------------------------------------------------- /coverageio_token.txt: -------------------------------------------------------------------------------- 1 | b2c4c44b-baed-4d95-ae74-7f495bac7a35 -------------------------------------------------------------------------------- /requirements/docs.txt: -------------------------------------------------------------------------------- 1 | furo 2 | sphinx 3 | sphinx-autobuild 4 | myst-parser 5 | -------------------------------------------------------------------------------- /requirements/test.txt: -------------------------------------------------------------------------------- 1 | -e .[languages] 2 | pytest 3 | pytest-timeout 4 | mock 5 | tox 6 | coverage 7 | -------------------------------------------------------------------------------- /tests/acceptance_tests/javascript/.eslintrc.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | "extends": "standard" 3 | }; 4 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | recursive-exclude * 
__pycache__ 4 | recursive-exclude * *.py[co] -------------------------------------------------------------------------------- /lunr/exceptions.py: -------------------------------------------------------------------------------- 1 | class BaseLunrException(Exception): 2 | pass 3 | 4 | 5 | class QueryParseError(BaseLunrException): 6 | pass 7 | -------------------------------------------------------------------------------- /requirements/dev.txt: -------------------------------------------------------------------------------- 1 | -r test.txt 2 | -r docs.txt 3 | twine 4 | pytest-benchmark 5 | wheel 6 | mypy 7 | flake8 8 | black 9 | pdbpp 10 | ipython 11 | mypy 12 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: pip 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | open-pull-requests-limit: 10 8 | ignore: 9 | - dependency-name: nltk 10 | versions: 11 | - 3.6.1 12 | -------------------------------------------------------------------------------- /lunr/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from lunr.__main__ import lunr, get_default_builder 4 | 5 | __all__ = ("lunr", "get_default_builder") 6 | 7 | logging.basicConfig(format="%(levelname)-7s - %(message)s") 8 | 9 | __VERSION__ = "0.6.2" 10 | __TARGET_JS_VERSION__ = "2.3.9" 11 | -------------------------------------------------------------------------------- /lunr/utils.py: -------------------------------------------------------------------------------- 1 | def as_string(obj): 2 | return "" if not obj else str(obj) 3 | 4 | 5 | class CompleteSet(set): 6 | def union(self, other): 7 | return self 8 | 9 | def intersection(self, other): 10 | return set(other) 11 | 12 | def __contains__(self, y): 13 | return True 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.pyc 3 | *.egg-info/ 4 | .eggs/ 5 | .coverage 6 | coverage.xml 7 | htmlcov/ 8 | .tox/ 9 | .pytest_cache/ 10 | **/node_modules 11 | dist/ 12 | build/ 13 | .state 14 | .venv/ 15 | target/ 16 | site/ 17 | docs/_build 18 | 19 | .vscode/ 20 | *.code-workspace 21 | .python-version 22 | .DS_Store 23 | .benchmarks/ 24 | *TODO.md 25 | tests/profiles/ 26 | .mypy_cache/ 27 | .dev/ 28 | .direnv/ 29 | .envrc 30 | .tool-versions 31 | -------------------------------------------------------------------------------- /lunr/trimmer.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from lunr.pipeline import Pipeline 4 | 5 | full_re = re.compile(r"^\W*?([^\W]+)\W*?$") 6 | 7 | 8 | def trimmer(token, i=None, tokens=None): 9 | def trim(s, metadata=None): 10 | match = full_re.match(s) 11 | if match is None: 12 | return s 13 | return match.group(1) 14 | 15 | return token.update(trim) 16 | 17 | 18 | Pipeline.register_function(trimmer, "trimmer") 19 | -------------------------------------------------------------------------------- /tests/test_complete_set.py: -------------------------------------------------------------------------------- 1 | from lunr.utils import CompleteSet 2 | 3 | 4 | class TestCompleteSet: 5 | def test_always_contains_other_element(self): 6 | assert "foo" in CompleteSet() 7 | 8 | def 
test_intersection_returns_other(self): 9 | cs = CompleteSet({"bar"}) 10 | assert cs.intersection({"foo"}) == {"foo"} 11 | 12 | def test_union_returns_self(self): 13 | cs = CompleteSet({"bar"}) 14 | assert cs.union({"foo"}) == {"bar"} 15 | -------------------------------------------------------------------------------- /lunr/idf.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | 4 | def idf(posting, document_count): 5 | """A function to calculate the inverse document frequency for a posting. 6 | This is shared between the builder and the index. 7 | """ 8 | documents_with_term = 0 9 | for field_name in posting: 10 | if field_name == "_index": 11 | continue 12 | documents_with_term += len(posting[field_name].keys()) 13 | 14 | x = (document_count - documents_with_term + 0.5) / (documents_with_term + 0.5) 15 | return math.log(1 + abs(x)) 16 | -------------------------------------------------------------------------------- /tests/acceptance_tests/javascript/mkdocs_serialization.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs') 2 | const tmp = require('tmp') 3 | const lunr = require('lunr') 4 | 5 | const data = JSON.parse( 6 | fs.readFileSync(__dirname + '/../fixtures/mkdocs_index.json')) 7 | let documents = {} 8 | const idx = lunr(function () { 9 | this.field('title') 10 | this.field('text') 11 | this.ref('id') 12 | for (doc of data.docs) { 13 | this.add(doc) 14 | documents[doc.id] = doc 15 | } 16 | }) 17 | 18 | const tmpFile = tmp.fileSync({keep: true}) 19 | fs.writeFileSync(tmpFile.fd, JSON.stringify(idx)) 20 | process.stdout.write(tmpFile.name) 21 | -------------------------------------------------------------------------------- /tests/acceptance_tests/javascript/mkdocs_query.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs') 2 | const lunr = require('lunr') 3 | 4 | const data = JSON.parse( 5 | fs.readFileSync(__dirname + '/../fixtures/mkdocs_index.json')) 6 | let documents = {} 7 | const idx = lunr(function () { 8 | this.field('title') 9 | this.field('text') 10 | this.ref('id') 11 | for (doc of data.docs) { 12 | this.add(doc) 13 | documents[doc.id] = doc 14 | } 15 | }) 16 | 17 | let results = idx.search(process.argv[2]) 18 | for (result of results) { 19 | let doc = documents[result.ref] 20 | process.stdout.write(`${result.ref} "${doc.title}" [${result.score}]\n`) 21 | } 22 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /lunr/languages/trimmer.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def generate_trimmer(word_characters): 5 | """Returns a trimmer function from a string of word characters. 6 | 7 | TODO: lunr-languages ships with lists of word characters for each language 8 | I haven't found an equivalent in Python, we may need to copy it. 9 | """ 10 | full_re = re.compile(r"^[^{0}]*?([{0}]+)[^{0}]*?$".format(word_characters)) 11 | 12 | def trimmer(token, i=None, tokens=None): 13 | def trim(s, metadata=None): 14 | match = full_re.match(s) 15 | if match is None: 16 | return s 17 | return match.group(1) 18 | 19 | return token.update(trim) 20 | 21 | return trimmer 22 | -------------------------------------------------------------------------------- /tests/acceptance_tests/javascript/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "acceptance_tests", 3 | "version": "1.0.0", 4 | "description": "Acceptance tests for Lunr.py", 5 | "main": "test_mkdocs.js", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1" 8 | }, 9 | "author": "Yeray Diaz Diaz", 10 | "license": "MIT", 11 | "dependencies": { 12 | "lunr": "2.3.9", 13 | "lunr-languages": "1.0.0", 14 | "tmp": "0.0.33" 15 | }, 16 | "devDependencies": { 17 | "eslint": "^8.10.0", 18 | "eslint-config-standard": "^11.0.0", 19 | "eslint-plugin-import": "^2.11.0", 20 | "eslint-plugin-node": "^6.0.1", 21 | "eslint-plugin-promise": "^3.7.0", 22 | "eslint-plugin-standard": "^3.0.1" 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /tests/acceptance_tests/javascript/language_serialize_index.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs') 2 | const tmp = require('tmp') 3 | const lunr = require('lunr') 4 | require("lunr-languages/lunr.stemmer.support")(lunr) 5 | require("lunr-languages/lunr.es")(lunr) 6 | 7 | const data = JSON.parse( 8 | fs.readFileSync(__dirname + '/../fixtures/lang_es.json')) 9 | let documents = {} 10 | const idx = lunr(function () { 11 | this.use(lunr.es) 12 | this.field('title') 13 | this.field('text') 14 | this.ref('id') 15 | for (doc of data.docs) { 16 | this.add(doc) 17 | documents[doc.id] = doc 18 | } 19 | }) 20 | 21 | const tmpFile = tmp.fileSync({keep: true}) 22 | fs.writeFileSync(tmpFile.fd, JSON.stringify(idx)) 23 | process.stdout.write(tmpFile.name) 24 | -------------------------------------------------------------------------------- /tests/acceptance_tests/javascript/mkdocs_load_serialized_index_and_search.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs') 2 | const lunr = require('lunr') 3 | 4 | // Read the documents only to retrieve the title for the results 5 | const data = JSON.parse( 6 | fs.readFileSync(__dirname + '/../fixtures/mkdocs_index.json')) 7 | let documents = {} 8 | for (doc of data.docs) { 9 | documents[doc.id] = doc 10 | } 11 | 12 | // Load the index from the serialized path produced from Python 13 | const serializedIndex = JSON.parse(fs.readFileSync(process.argv[2])) 14 | let idx = lunr.Index.load(serializedIndex) 15 | let results = idx.search(process.argv[3]) 16 | for (result of results) { 17 | 
process.stdout.write(`${result.ref} "${documents[result.ref].title}" [${result.score}]\n`) 18 | } -------------------------------------------------------------------------------- /tests/acceptance_tests/javascript/language_query.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs') 2 | const lunr = require('lunr') 3 | require("lunr-languages/lunr.stemmer.support")(lunr); 4 | require("lunr-languages/lunr.es")(lunr); 5 | 6 | const data = JSON.parse( 7 | fs.readFileSync(__dirname + '/../fixtures/lang_es.json')) 8 | let documents = {} 9 | const idx = lunr(function () { 10 | this.use(lunr.es) 11 | this.field('title') 12 | this.field('text') 13 | this.ref('id') 14 | for (doc of data.docs) { 15 | this.add(doc) 16 | documents[doc.id] = doc 17 | } 18 | }) 19 | 20 | let results = idx.search(process.argv[2]) 21 | for (result of results) { 22 | let doc = documents[result.ref] 23 | process.stdout.write(`${result.ref} "${doc.title}" [${result.score}]\n`) 24 | } 25 | -------------------------------------------------------------------------------- /tests/test_stemmer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from lunr.token import Token 5 | from lunr.stemmer import stemmer 6 | from lunr.pipeline import Pipeline 7 | 8 | 9 | class TestStemmer: 10 | def test_reduces_words_to_their_stem(self): 11 | path = os.path.join( 12 | os.path.dirname(__file__), "fixtures", "stemming_vocab.json" 13 | ) 14 | with open(path) as f: 15 | data = json.loads(f.read()) 16 | 17 | for word, expected in data.items(): 18 | token = Token(word) 19 | result = str(stemmer(token)) 20 | 21 | assert result == expected 22 | 23 | def test_is_a_registered_pipeline_function(self): 24 | assert stemmer.label == "stemmer" 25 | assert Pipeline.registered_functions["stemmer"] == stemmer 26 | -------------------------------------------------------------------------------- /lunr/token.py: -------------------------------------------------------------------------------- 1 | class Token: 2 | def __init__(self, string="", metadata=None): 3 | self.string = string 4 | self.metadata = metadata or {} 5 | 6 | def __str__(self): 7 | return self.string 8 | 9 | def __repr__(self): 10 | return '<Token "{}">'.format(str(self)) 11 | 12 | def update(self, fn): 13 | """A token update function is used when updating or optionally 14 | when cloning a token.""" 15 | # TODO: we require functions to have two parameters, JS doesn't care 16 | self.string = fn(self.string, self.metadata) 17 | return self 18 | 19 | def clone(self, fn=None): 20 | """Applies the given function to the wrapped string token.""" 21 | fn = fn or (lambda s, m: s) 22 | return Token(fn(self.string, self.metadata), self.metadata) 23 | -------------------------------------------------------------------------------- /lunr/field_ref.py: -------------------------------------------------------------------------------- 1 | from lunr.exceptions import BaseLunrException 2 | 3 | 4 | class FieldRef: 5 | 6 | JOINER = "/" 7 | 8 | def __init__(self, doc_ref, field_name, string_value=None): 9 | self.doc_ref = doc_ref 10 | self.field_name = field_name 11 | self._string_value = string_value 12 | 13 | def __repr__(self): 14 | return '<FieldRef field="{}" ref="{}">'.format(self.field_name, self.doc_ref) 15 | 16 | @classmethod 17 | def from_string(cls, string): 18 | if cls.JOINER not in string: 19 | raise BaseLunrException("Malformed field ref string") 20 | field_ref, doc_ref = string.split(cls.JOINER, 1) 21 | return
cls(doc_ref, field_ref, string) 22 | 23 | def __str__(self): 24 | if self._string_value is None: 25 | self._string_value = self.field_name + self.JOINER + str(self.doc_ref) 26 | 27 | return self._string_value 28 | -------------------------------------------------------------------------------- /tests/acceptance_tests/javascript/language_load_serialized_index_and_search.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs') 2 | const lunr = require('lunr') 3 | require("lunr-languages/lunr.stemmer.support")(lunr) 4 | require("lunr-languages/lunr.es")(lunr) 5 | 6 | // Read the documents only to retrieve the title for the results 7 | const fixtureName = process.argv[4] || 'lang_es.json' 8 | const fixturePath = __dirname + '/../fixtures/' + fixtureName 9 | const data = JSON.parse(fs.readFileSync(fixturePath)) 10 | let documents = {} 11 | for (doc of data.docs) { 12 | documents[doc.id] = doc 13 | } 14 | 15 | // Load the index from the serialized path produced from Python 16 | const serializedIndex = JSON.parse(fs.readFileSync(process.argv[2])) 17 | let idx = lunr.Index.load(serializedIndex) 18 | let results = idx.search(process.argv[3]) 19 | for (result of results) { 20 | process.stdout.write(`${result.ref} "${documents[result.ref].title}" [${result.score}]\n`) 21 | } -------------------------------------------------------------------------------- /lunr/languages/stemmer.py: -------------------------------------------------------------------------------- 1 | def get_language_stemmer(language): 2 | """Retrieves the SnowballStemmer for a particular language. 3 | 4 | Args: 5 | language (str): ISO-639-1 code of the language. 6 | """ 7 | from lunr.languages import SUPPORTED_LANGUAGES 8 | from nltk.stem.snowball import SnowballStemmer # type: ignore 9 | 10 | return SnowballStemmer(SUPPORTED_LANGUAGES[language]) 11 | 12 | 13 | def nltk_stemmer(stemmer, token, i=None, tokens=None): 14 | """Wrapper around a NLTK SnowballStemmer, which includes stop words for 15 | each language. 16 | 17 | Args: 18 | stemmer (SnowballStemmer): Stemmer instance that performs the stemming. 19 | token (lunr.Token): The token to stem. 20 | i (int): The index of the token in a set. 21 | tokens (list): A list of tokens representing the set. 22 | """ 23 | 24 | def wrapped_stem(token, metadata=None): 25 | return stemmer.stem(token) 26 | 27 | return token.update(wrapped_stem) 28 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from lunr import lunr 4 | 5 | 6 | @pytest.fixture 7 | def documents(): 8 | return [ 9 | { 10 | "id": "a", 11 | "title": "Mr. Green kills Colonel Mustard", 12 | "body": """Mr. Green killed Colonel Mustard in the study with the 13 | candlestick. Mr. 
Green is not a very nice fellow.""", 14 | "word_count": 19, 15 | }, 16 | { 17 | "id": "b", 18 | "title": "Plumb waters plant", 19 | "body": "Professor Plumb has a green plant in his study", 20 | "word_count": 9, 21 | }, 22 | { 23 | "id": "c", 24 | "title": "Scarlett helps Professor", 25 | "body": """Miss Scarlett watered Professor Plumbs green plant 26 | while he was away from his office last week.""", 27 | "word_count": 16, 28 | }, 29 | ] 30 | 31 | 32 | @pytest.fixture 33 | def index(documents): 34 | return lunr(ref="id", fields=("title", "body"), documents=documents) 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright © 2019, Yeray Díaz Díaz. All rights reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
8 | -------------------------------------------------------------------------------- /tests/test_trimmer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from lunr.trimmer import trimmer 4 | from lunr.token import Token 5 | from lunr.pipeline import Pipeline 6 | 7 | 8 | class TestTrimmer: 9 | def test_latin_characters(self): 10 | token = Token("hello") 11 | assert str(trimmer(token)) == str(token) 12 | 13 | @pytest.mark.parametrize( 14 | "description, string, expected", 15 | [ 16 | ("full stop", "hello.", "hello"), 17 | ("inner apostrophe", "it's", "it's"), 18 | ("trailing apostrophe", "james'", "james"), 19 | ("exclamation mark", "stop!", "stop"), 20 | ("comma", "first,", "first"), 21 | ("brackets", "[tag]", "tag"), 22 | ], 23 | ) 24 | def test_punctuation(self, description, string, expected): 25 | token = Token(string) 26 | trimmed = str(trimmer(token)) 27 | 28 | assert trimmed == expected 29 | 30 | def test_is_a_registered_pipeline_function(self): 31 | assert trimmer.label == "trimmer" 32 | assert Pipeline.registered_functions["trimmer"] == trimmer 33 | -------------------------------------------------------------------------------- /tests/test_field_ref.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from lunr.field_ref import FieldRef 4 | from lunr.exceptions import BaseLunrException 5 | 6 | 7 | class TestFieldRef: 8 | def test_str_combines_document_ref_and_field_name(self): 9 | field_name = "title" 10 | document_ref = 123 11 | field_ref = FieldRef(document_ref, field_name) 12 | 13 | assert str(field_ref) == "title/123" 14 | assert repr(field_ref) == '<FieldRef field="title" ref="123">' 15 | 16 | def test_from_string_splits_string_into_parts(self): 17 | field_ref = FieldRef.from_string("title/123") 18 | 19 | assert field_ref.field_name == "title" 20 | assert field_ref.doc_ref == "123" 21 | 22 | def test_from_string_docref_contains_join_character(self): 23 | field_ref = FieldRef.from_string("title/http://example.com/123") 24 | 25 | assert field_ref.field_name == "title" 26 | assert field_ref.doc_ref == "http://example.com/123" 27 | 28 | def test_from_string_does_not_contain_join_character(self): 29 | string = "docRefOnly" 30 | 31 | with pytest.raises(BaseLunrException): 32 | FieldRef.from_string(string) 33 | -------------------------------------------------------------------------------- /tests/test_token.py: -------------------------------------------------------------------------------- 1 | from lunr.token import Token 2 | 3 | 4 | def test_str_repr(): 5 | token = Token("foo") 6 | assert str(token) == "foo" 7 | assert repr(token) == '<Token "foo">' 8 | 9 | 10 | class TestMetadata: 11 | def test_can_attach_arbitrary_metadata(self): 12 | token = Token("foo", {"length": 3}) 13 | assert token.metadata["length"] == 3 14 | 15 | def test_can_update_token_value(self): 16 | token = Token("foo", {"length": 3}) 17 | token.update(lambda s, m: s.upper()) 18 | 19 | assert str(token) == "FOO" 20 | 21 | def test_metadata_is_yielded_when_updating(self): 22 | # TODO: unsure what this test is asserting, a language feature?
23 | pass 24 | 25 | 26 | class TestClone: 27 | def setup_method(self, method): 28 | self.token = Token("foo", {"bar": True}) 29 | 30 | def test_clones_value(self): 31 | assert str(self.token) == str(self.token.clone()) 32 | 33 | def test_clones_metadata(self): 34 | assert self.token.metadata == self.token.clone().metadata 35 | 36 | def test_clone_and_modify(self): 37 | clone = self.token.clone(lambda s, m: s.upper()) 38 | 39 | assert str(clone) == "FOO" 40 | self.token.metadata == clone.metadata 41 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: tests tests-acceptance tests-full install-dev docs 2 | 3 | .state: 4 | mkdir .state 5 | 6 | .state/acceptance-npm: .state 7 | cd tests/acceptance_tests/javascript && \ 8 | npm install && \ 9 | cd ../../../ 10 | touch .state/acceptance-npm 11 | 12 | clean: 13 | rm .state/* 14 | 15 | install-dev: 16 | pip install -U pip wheel setuptools 17 | pip install -r requirements/dev.txt 18 | 19 | tests: 20 | coverage run -m pytest -m "not acceptance" 21 | coverage report 22 | 23 | tests-acceptance: .state/acceptance-npm 24 | pytest -m "acceptance" 25 | 26 | tests-full: tests tests-acceptance 27 | 28 | tests-benchmark: 29 | pytest tests/benchmarks.py --benchmark-warmup=on 30 | 31 | package: 32 | rm -fr dist/* 33 | python setup.py sdist 34 | python setup.py bdist_wheel --universal 35 | 36 | release-test: package 37 | @echo "Are you sure you want to release to test.pypi.org? [y/N]" && \ 38 | read ans && \ 39 | [ $${ans:-N} = y ] && \ 40 | twine upload --repository testpypi dist/* 41 | 42 | release-pypi: package 43 | @echo "Are you sure you want to release to pypi.org? [y/N]" && \ 44 | read ans && \ 45 | [ $${ans:-N} = y ] && \ 46 | twine upload dist/* 47 | 48 | lint: 49 | flake8 lunr tests 50 | black lunr tests 51 | mypy lunr 52 | 53 | docs: 54 | sphinx-build docs docs/_build/html 55 | 56 | docs-server: 57 | sphinx-autobuild docs docs/_build/html 58 | -------------------------------------------------------------------------------- /tests/test_serialization.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from lunr import lunr 4 | from lunr.index import Index 5 | 6 | 7 | class TestSerialization: 8 | def setup_method(self, method): 9 | documents = [ 10 | { 11 | "id": "a", 12 | "title": "Mr. Green kills Colonel Mustard", 13 | "body": """Mr. Green killed Colonel Mustard in the study with the 14 | candlestick. Mr. 
Green is not a very nice fellow.""", 15 | "word_count": 19, 16 | }, 17 | { 18 | "id": "b", 19 | "title": "Plumb waters plant", 20 | "body": "Professor Plumb has a green plant in his study", 21 | "word_count": 9, 22 | }, 23 | { 24 | "id": "c", 25 | "title": "Scarlett helps Professor", 26 | "body": """Miss Scarlett watered Professor Plumbs green plant 27 | while he was away from his office last week.""", 28 | "word_count": 16, 29 | }, 30 | ] 31 | 32 | self.idx = lunr(ref="id", fields=("title", "body"), documents=documents) 33 | 34 | def test_serialization(self): 35 | serialized_index = json.dumps(self.idx.serialize()) 36 | loaded_index = Index.load(json.loads(serialized_index)) 37 | 38 | assert self.idx.search("green") == loaded_index.search("green") 39 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | import subprocess 5 | 6 | import pytest 7 | 8 | PATTERN = r'([^\ ]+) "([^\"]+)" \[([\d\.]*)\]' 9 | DEFAULT_TOLERANCE = 1e-2 10 | 11 | 12 | def assert_field_vectors_equal(a, b, tol=DEFAULT_TOLERANCE): 13 | assert a[0] == b[0] 14 | for x, y in zip(a[1], b[1]): 15 | assert x == pytest.approx(y, rel=tol) 16 | 17 | 18 | def assert_vectors_equal(a, b, tol=DEFAULT_TOLERANCE): 19 | for x, y in zip(a, b): 20 | assert x == pytest.approx(y, rel=tol) 21 | 22 | 23 | def assert_results_match(results, js_results, tol=DEFAULT_TOLERANCE): 24 | assert len(results) == len(js_results) != 0 25 | for js_result, result in zip(js_results, results): 26 | id_, title, score = re.match(PATTERN, js_result).groups() 27 | assert result["ref"] == id_ 28 | assert result["score"] == pytest.approx(float(score), rel=tol) 29 | 30 | 31 | def read_json_fixture(filename): 32 | fixture_path = os.path.join( 33 | os.path.dirname(__file__), "acceptance_tests", "fixtures", filename 34 | ) 35 | with open(fixture_path) as f: 36 | return json.loads(f.read()) 37 | 38 | 39 | def run_node_script(filename, *args): 40 | js_path = os.path.join( 41 | os.path.dirname(__file__), "acceptance_tests", "javascript", filename 42 | ) 43 | js_output = subprocess.check_output(["node", js_path] + list(args)) 44 | return js_output.decode("utf-8").strip() 45 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py36,py37,py38,py39,py310,pypy3,flake8,black,mypy,docs 3 | 4 | [testenv] 5 | deps = -rrequirements/test.txt 6 | commands = 7 | coverage run -m pytest -m "not acceptance" 8 | coverage report 9 | coverage xml 10 | pytest -m "acceptance" 11 | 12 | [testenv:black] 13 | basepython = python3.8 14 | deps= 15 | black 16 | commands={envbindir}/black --check lunr tests 17 | 18 | [testenv:flake8] 19 | basepython = python3.8 20 | deps= 21 | flake8 22 | commands={envbindir}/flake8 lunr tests 23 | 24 | [testenv:docs] 25 | basepython = python3.8 26 | deps= 27 | furo 28 | sphinx 29 | sphinx-autobuild 30 | myst-parser 31 | commands={envbindir}/sphinx-build docs docs/_build/html 32 | 33 | [testenv:mypy] 34 | basepython = python3.8 35 | deps = mypy 36 | commands={envbindir}/mypy lunr 37 | 38 | [coverage:run] 39 | source=lunr 40 | branch=True 41 | 42 | [coverage:report] 43 | exclude_lines = 44 | if self.debug: 45 | pragma: no cover 46 | raise NotImplementedError 47 | if __name__ == .__main__.: 48 | ignore_errors = True 49 | omit = 50 | tests/* 51 | 
lunr/stemmer.py 52 | show_missing = True 53 | 54 | [flake8] 55 | exclude = lunr/stemmer.py 56 | max-line-length = 92 57 | ignore = E203 W503 58 | 59 | [pytest] 60 | markers = 61 | acceptance: mark test as an acceptance test 62 | 63 | [gh-actions] 64 | python = 65 | 3.6: py36 66 | 3.7: py37 67 | 3.8: py38,flake8,black,docs,mypy 68 | 3.9: py39 69 | 3.10: py310 70 | pypy3: pypy3 71 | -------------------------------------------------------------------------------- /tests/fixtures/stemming_vocab.json: -------------------------------------------------------------------------------- 1 | {"consign":"consign","consigned":"consign","consigning":"consign","consignment":"consign","consist":"consist","consisted":"consist","consistency":"consist","consistent":"consist","consistently":"consist","consisting":"consist","consists":"consist","consolation":"consol","consolations":"consol","consolatory":"consolatori","console":"consol","consoled":"consol","consoles":"consol","consolidate":"consolid","consolidated":"consolid","consolidating":"consolid","consoling":"consol","consols":"consol","consonant":"conson","consort":"consort","consorted":"consort","consorting":"consort","conspicuous":"conspicu","conspicuously":"conspicu","conspiracy":"conspiraci","conspirator":"conspir","conspirators":"conspir","conspire":"conspir","conspired":"conspir","conspiring":"conspir","constable":"constabl","constables":"constabl","constance":"constanc","constancy":"constanc","constant":"constant","knack":"knack","knackeries":"knackeri","knacks":"knack","knag":"knag","knave":"knave","knaves":"knave","knavish":"knavish","kneaded":"knead","kneading":"knead","knee":"knee","kneel":"kneel","kneeled":"kneel","kneeling":"kneel","kneels":"kneel","knees":"knee","knell":"knell","knelt":"knelt","knew":"knew","knick":"knick","knif":"knif","knife":"knife","knight":"knight","knights":"knight","knit":"knit","knits":"knit","knitted":"knit","knitting":"knit","knives":"knive","knob":"knob","knobs":"knob","knock":"knock","knocked":"knock","knocker":"knocker","knockers":"knocker","knocking":"knock","knocks":"knock","knopp":"knopp","knot":"knot","knots":"knot","lay":"lai","try":"try"} -------------------------------------------------------------------------------- /.github/workflows/test-suite.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: CI 3 | 4 | on: 5 | push: 6 | branches: ["master"] 7 | pull_request: 8 | branches: ["master"] 9 | 10 | jobs: 11 | tests: 12 | name: "Python ${{ matrix.python-version }}" 13 | runs-on: "ubuntu-latest" 14 | env: 15 | USING_COVERAGE: '3.8' 16 | 17 | strategy: 18 | matrix: 19 | python-version: ["3.6", "3.7", "3.8", "3.9", "3.10", "pypy3"] 20 | 21 | steps: 22 | - uses: "actions/checkout@v2" 23 | - uses: "actions/setup-python@v2" 24 | with: 25 | python-version: "${{ matrix.python-version }}" 26 | - uses: actions/setup-node@v1 27 | with: 28 | node-version: '14' 29 | 30 | - name: "Install dependencies" 31 | run: | 32 | set -xe 33 | python -VV 34 | python -m site 35 | python -m pip install --upgrade pip setuptools wheel 36 | python -m pip install --upgrade coverage[toml] virtualenv tox tox-gh-actions 37 | cd tests/acceptance_tests/javascript/ && npm install 38 | 39 | - name: "Run tox targets for ${{ matrix.python-version }}" 40 | run: "python -m tox" 41 | 42 | - name: "Convert coverage" 43 | if: "contains(env.USING_COVERAGE, matrix.python-version)" 44 | run: "python -m coverage xml" 45 | 46 | - name: "Upload coverage to Codecov" 47 | if: "contains(env.USING_COVERAGE, 
matrix.python-version)" 48 | uses: "codecov/codecov-action@v1" 49 | with: 50 | fail_ci_if_error: true 51 | 52 | -------------------------------------------------------------------------------- /tests/test_stop_word_filter.py: -------------------------------------------------------------------------------- 1 | from lunr.stop_word_filter import stop_word_filter, generate_stop_word_filter 2 | from lunr.pipeline import Pipeline 3 | 4 | STOP_WORDS = ["the", "and", "but", "than", "when"] 5 | 6 | 7 | class TestStopWordFilter: 8 | def test_filters_stop_words(self): 9 | for word in STOP_WORDS: 10 | assert stop_word_filter(word) is None 11 | 12 | def test_ignores_non_stop_words(self): 13 | non_stop_words = ["interesting", "words", "pass", "through"] 14 | for word in non_stop_words: 15 | assert stop_word_filter(word) == word 16 | 17 | def test_is_a_registered_pipeline_function(self): 18 | assert stop_word_filter.label == "stopWordFilter" 19 | assert Pipeline.registered_functions["stopWordFilter"] == stop_word_filter 20 | 21 | 22 | class TestGenerateStopWordFilter: 23 | def test_creates_correct_stop_words_filter(self): 24 | new_stop_word_filter = generate_stop_word_filter(STOP_WORDS) 25 | for word in STOP_WORDS: 26 | assert new_stop_word_filter(word) is None 27 | 28 | def test_registers_new_stop_words_filter(self): 29 | new_stop_word_filter = generate_stop_word_filter(STOP_WORDS) 30 | assert new_stop_word_filter.label == "stopWordFilter" 31 | assert Pipeline.registered_functions["stopWordFilter"] == new_stop_word_filter 32 | 33 | def test_passing_a_language_adds_to_registered_label(self): 34 | new_stop_word_filter = generate_stop_word_filter(STOP_WORDS, "es") 35 | assert new_stop_word_filter.label == "stopWordFilter-es" 36 | assert ( 37 | Pipeline.registered_functions["stopWordFilter-es"] == new_stop_word_filter 38 | ) 39 | -------------------------------------------------------------------------------- /tests/acceptance_tests/test_mkdocs.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tempfile 3 | 4 | import pytest 5 | 6 | from lunr import lunr 7 | from lunr.index import Index 8 | from tests.utils import read_json_fixture, run_node_script, assert_results_match 9 | 10 | 11 | @pytest.mark.acceptance 12 | def test_mkdocs_produces_same_results(): 13 | query_string = "plugins" 14 | js_results = run_node_script("mkdocs_query.js", query_string).split("\n") 15 | data = read_json_fixture("mkdocs_index.json") 16 | index = lunr(ref="id", fields=("title", "text"), documents=data["docs"]) 17 | results = index.search(query_string) 18 | assert_results_match(results, js_results) 19 | 20 | 21 | @pytest.mark.acceptance 22 | def test_js_serialized_index_can_be_loaded_and_produces_same_results(): 23 | json_path = run_node_script("mkdocs_serialization.js") 24 | with open(json_path) as fd: 25 | js_serialized_index = fd.read() 26 | 27 | index = Index.load(js_serialized_index) 28 | query_string = "plugins" 29 | results = index.search(query_string) 30 | js_results = run_node_script("mkdocs_query.js", query_string).split("\n") 31 | assert_results_match(results, js_results) 32 | 33 | 34 | @pytest.mark.acceptance 35 | def test_serialized_index_can_be_loaded_in_js_and_produces_same_results(): 36 | data = read_json_fixture("mkdocs_index.json") 37 | index = lunr(ref="id", fields=("title", "text"), documents=data["docs"]) 38 | query_string = "plugins" 39 | results = index.search(query_string) 40 | serialized_index = index.serialize() 41 | 42 | with 
tempfile.NamedTemporaryFile(delete=False) as fp: 43 | fp.write(json.dumps(serialized_index).encode()) 44 | 45 | js_results = run_node_script( 46 | "mkdocs_load_serialized_index_and_search.js", fp.name, query_string 47 | ).split("\n") 48 | assert_results_match(results, js_results) 49 | -------------------------------------------------------------------------------- /lunr/token_set_builder.py: -------------------------------------------------------------------------------- 1 | from lunr.token_set import TokenSet 2 | from lunr.exceptions import BaseLunrException 3 | 4 | 5 | class TokenSetBuilder: 6 | def __init__(self): 7 | self.previous_word = "" 8 | self.root = TokenSet() 9 | self.unchecked_nodes = [] 10 | self.minimized_nodes = {} 11 | 12 | def insert(self, word): 13 | if word < self.previous_word: 14 | raise BaseLunrException("Out of order word insertion") 15 | 16 | common_prefix = 0 17 | for i in range(min(len(word), len(self.previous_word))): 18 | if word[i] != self.previous_word[i]: 19 | break 20 | 21 | common_prefix += 1 22 | 23 | self.minimize(common_prefix) 24 | 25 | node = ( 26 | self.root if not self.unchecked_nodes else self.unchecked_nodes[-1]["child"] 27 | ) 28 | 29 | for i in range(common_prefix, len(word)): 30 | next_node = TokenSet() 31 | char = word[i] 32 | 33 | node.edges[char] = next_node 34 | 35 | self.unchecked_nodes.append( 36 | {"parent": node, "char": char, "child": next_node} 37 | ) 38 | 39 | node = next_node 40 | 41 | node.final = True 42 | self.previous_word = word 43 | 44 | def finish(self): 45 | self.minimize(0) 46 | 47 | def minimize(self, down_to): 48 | for i in range(len(self.unchecked_nodes) - 1, down_to - 1, -1): 49 | node = self.unchecked_nodes[i] 50 | child_key = str(node["child"]) 51 | 52 | if child_key in self.minimized_nodes: 53 | node["parent"].edges[node["char"]] = self.minimized_nodes[child_key] 54 | else: 55 | node["child"]._str = child_key 56 | self.minimized_nodes[child_key] = node["child"] 57 | 58 | self.unchecked_nodes.pop() 59 | -------------------------------------------------------------------------------- /tests/test_plugins.py: -------------------------------------------------------------------------------- 1 | from lunr import lunr, get_default_builder 2 | from lunr.pipeline import Pipeline 3 | from lunr.stemmer import stemmer 4 | from lunr.trimmer import trimmer 5 | from lunr.stop_word_filter import stop_word_filter 6 | 7 | documents = [ 8 | { 9 | "id": "a", 10 | "title": "Mr. Green kills Colonel Mustard", 11 | "body": """Mr. Green killed Colonel Mustard in the study with the 12 | candlestick. Mr. 
Green is not a very nice fellow.""", 13 | "word_count": 19, 14 | }, 15 | { 16 | "id": "b", 17 | "title": "Plumb waters plant", 18 | "body": "Professor Plumb has a green plant in his study", 19 | "word_count": 9, 20 | }, 21 | { 22 | "id": "c", 23 | "title": "Scarlett helps Professor", 24 | "body": """Miss Scarlett watered Professor Plumbs green plant 25 | while he was away from his office last week.""", 26 | "word_count": 16, 27 | }, 28 | ] 29 | 30 | 31 | def test_get_default_builder(): 32 | builder = get_default_builder() 33 | assert builder.pipeline._stack == [trimmer, stop_word_filter, stemmer] 34 | assert builder.search_pipeline._stack == [stemmer] 35 | 36 | 37 | def test_drop_pipeline_function(): 38 | builder = get_default_builder() 39 | builder.pipeline.remove(stemmer) 40 | 41 | idx = lunr("id", ("title", "body"), documents, builder=builder) 42 | 43 | assert idx.search("kill") == [] # no match because "killed" was not stemmed 44 | 45 | 46 | def test_add_token_metadata(): 47 | builder = get_default_builder() 48 | 49 | def token_length(token, i, tokens): 50 | token.metadata["token_length"] = len(str(token)) 51 | return token 52 | 53 | Pipeline.register_function(token_length) 54 | builder.pipeline.add(token_length) 55 | builder.metadata_whitelist.append("token_length") 56 | 57 | idx = lunr("id", ("title", "body"), documents, builder=builder) 58 | 59 | [result, _, _] = idx.search("green") 60 | assert result["match_data"].metadata["green"]["title"]["token_length"] == [5] 61 | assert result["match_data"].metadata["green"]["body"]["token_length"] == [5, 5] 62 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from setuptools import setup, find_packages 4 | 5 | 6 | PATH = os.path.abspath(os.path.dirname(__file__)) 7 | 8 | 9 | def read_file(filepath): 10 | with open(filepath, "r") as fd: 11 | return fd.read() 12 | 13 | 14 | def find_version(): 15 | version_path = os.path.join(PATH, "lunr", "__init__.py") 16 | contents = read_file(version_path) 17 | version_string = contents[contents.index("__VERSION__") :] 18 | try: 19 | return re.match(r'.*__VERSION__ = [\'"]([\d\w\.]+)[\'"]', version_string).group( 20 | 1 21 | ) 22 | except AttributeError: 23 | raise RuntimeError("Unable to find version string.") 24 | 25 | 26 | setup( 27 | name="lunr", 28 | version=find_version(), 29 | url="https://github.com/yeraydiazdiaz/lunr.py", 30 | project_urls={"Documentation": "https://lunr.readthedocs.io"}, 31 | license="MIT", 32 | description="A Python implementation of Lunr.js", 33 | long_description=read_file("README.md"), 34 | long_description_content_type="text/markdown", 35 | author="Yeray Diaz Diaz", 36 | author_email="yeraydiazdiaz@gmail.com", 37 | packages=find_packages(exclude=("tests",)), 38 | include_package_data=True, 39 | zip_safe=False, 40 | python_requires=">=3.6", 41 | install_requires=[], 42 | extras_require={ 43 | "languages": ["nltk"] 44 | }, 45 | keywords="lunr full text search", 46 | classifiers=[ 47 | "Development Status :: 4 - Beta", 48 | "Intended Audience :: Developers", 49 | "License :: OSI Approved :: MIT License", 50 | "Operating System :: OS Independent", 51 | "Programming Language :: Python", 52 | "Programming Language :: Python :: 3", 53 | "Programming Language :: Python :: 3.6", 54 | "Programming Language :: Python :: 3.7", 55 | "Programming Language :: Python :: 3.8", 56 | "Programming Language :: Python :: 3.9", 57 | 
"Programming Language :: Python :: 3.10", 58 | "Programming Language :: Python :: Implementation :: CPython", 59 | "Programming Language :: Python :: Implementation :: PyPy", 60 | "Topic :: Text Processing", 61 | ], 62 | ) 63 | -------------------------------------------------------------------------------- /lunr/tokenizer.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | from lunr.token import Token 4 | from lunr.utils import as_string 5 | 6 | SEPARATOR_CHARS = " \t\n\r\f\v\xa0-" 7 | 8 | 9 | def default_separator(char): 10 | return char and char in SEPARATOR_CHARS 11 | 12 | 13 | def Tokenizer(obj, metadata=None, separator=None): 14 | """Splits a string into tokens ready to be inserted into the search index. 15 | 16 | Args: 17 | metadata (dict): Optional metadata can be passed to the tokenizer, this 18 | metadata will be cloned and added as metadata to every token that is 19 | created from the object to be tokenized. 20 | separator (callable or compiled regex): This tokenizer will convert its 21 | parameter to a string by calling `str` and then will split this 22 | string on characters for which `separator` is True. Lists will have 23 | their elements converted to strings and wrapped in a lunr `Token`. 24 | 25 | Returns: 26 | List of Token instances. 27 | """ 28 | if obj is None: 29 | return [] 30 | 31 | metadata = metadata or {} 32 | 33 | if isinstance(obj, (list, tuple)): 34 | return [ 35 | Token(as_string(element).lower(), deepcopy(metadata)) for element in obj 36 | ] 37 | 38 | if separator is None: 39 | is_separator = default_separator 40 | elif callable(separator): 41 | is_separator = separator 42 | else: # must be a regex, remove when dropping support for 2.7 43 | is_separator = lambda c: separator.match(c) # noqa 44 | 45 | string = str(obj).lower() 46 | length = len(string) 47 | tokens = [] 48 | slice_start = 0 49 | for slice_end in range(length + 1): 50 | char = string[slice_end] if slice_end != length else "" 51 | slice_length = slice_end - slice_start 52 | if is_separator(char) or slice_end == length: 53 | if slice_length > 0: 54 | token_metadata = {} 55 | token_metadata["position"] = [slice_start, slice_length] 56 | token_metadata["index"] = len(tokens) 57 | token_metadata.update(metadata) 58 | 59 | sl = slice(slice_start, slice_end) 60 | tokens.append(Token(string[sl], token_metadata)) 61 | 62 | slice_start = slice_end + 1 63 | 64 | return tokens 65 | -------------------------------------------------------------------------------- /tests/test_match_data.py: -------------------------------------------------------------------------------- 1 | from lunr.match_data import MatchData 2 | 3 | 4 | class TestMatchData: 5 | def setup_method(self, method): 6 | self.match = MatchData("foo", "title", {"position": [1]}) 7 | self.match.combine(MatchData("bar", "title", {"position": [2]})) 8 | self.match.combine(MatchData("baz", "body", {"position": [3]})) 9 | self.match.combine(MatchData("baz", "body", {"position": [4]})) 10 | 11 | def test_repr(self): 12 | assert repr(self.match) == '' 13 | 14 | def test_create_empty_match_data(self): 15 | assert MatchData().metadata == {} 16 | 17 | def test_create_missing_field(self): 18 | assert MatchData("foo").metadata["foo"] == {} 19 | 20 | def test_create_missing_metadata(self): 21 | assert MatchData("foo", "title").metadata["foo"]["title"] == {} 22 | 23 | def test_combine_terms(self): 24 | assert sorted(list(self.match.metadata.keys())) == ["bar", "baz", "foo"] 25 | 26 | def 
test_combine_metadata(self): 27 | assert self.match.metadata["foo"]["title"]["position"] == [1] 28 | assert self.match.metadata["bar"]["title"]["position"] == [2] 29 | assert self.match.metadata["baz"]["body"]["position"] == [3, 4] 30 | 31 | def test_combine_does_not_mutate_source_data(self): 32 | metadata = {"foo": [1]} 33 | match_data1 = MatchData("foo", "title", metadata) 34 | match_data2 = MatchData("foo", "title", metadata) 35 | 36 | match_data1.combine(match_data2) 37 | 38 | assert metadata["foo"] == [1] 39 | 40 | def test_add_metadata_for_missing_term(self): 41 | self.match.add("spam", "title", {"position": [5]}) 42 | 43 | assert self.match.metadata["spam"]["title"]["position"] == [5] 44 | 45 | def test_add_metadata_for_missing_field(self): 46 | self.match.add("foo", "body", {"position": [6]}) 47 | 48 | assert self.match.metadata["foo"]["body"]["position"] == [6] 49 | 50 | def test_add_metadata_for_existing_term_field_and_metadata_key(self): 51 | self.match.add("foo", "title", {"position": [7]}) 52 | 53 | assert self.match.metadata["foo"]["title"]["position"] == [1, 7] 54 | 55 | def test_add_metadata_for_existing_term_and_field_and_missing_metadata_key(self): 56 | self.match.add("foo", "title", {"weight": [7]}) 57 | 58 | assert self.match.metadata["foo"]["title"] == {"position": [1], "weight": [7]} 59 | -------------------------------------------------------------------------------- /tests/benchmarks.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | import pytest 4 | 5 | from tests.utils import read_json_fixture 6 | 7 | from lunr import lunr 8 | from lunr.pipeline import Pipeline 9 | 10 | 11 | def get_mkdocs_index(): 12 | data = read_json_fixture("mkdocs_index.json") 13 | return lunr(ref="id", fields=("title", "text"), documents=data["docs"]) 14 | 15 | 16 | class TestSearchBenchmarks: 17 | @pytest.fixture(scope="session") 18 | def index(self): 19 | return get_mkdocs_index() 20 | 21 | def test_search(self, index, benchmark): 22 | benchmark(index.search, "styling") 23 | 24 | 25 | class TestPipelineBenchmarks: 26 | 27 | FEW_COUNT = 50 28 | MANY_COUNT = 1000 29 | 30 | @pytest.fixture(scope="session") 31 | def many_tokens(self): 32 | path = os.path.join(os.path.dirname(__file__), "fixtures/words.txt") 33 | with open(path) as words: 34 | self.many_tokens = [ 35 | words.readline().strip() for _ in range(self.MANY_COUNT) 36 | ] 37 | self.few_tokens = self.many_tokens[: self.FEW_COUNT] 38 | yield self.many_tokens 39 | 40 | @pytest.fixture(scope="session") 41 | def few_tokens(self, many_tokens): 42 | yield self.few_tokens 43 | 44 | @staticmethod 45 | def token_to_token(token, i, tokens): 46 | return token 47 | 48 | @staticmethod 49 | def token_to_token_array(token, i, tokens): 50 | return [token, token] 51 | 52 | def test_few_token_to_token(self, few_tokens, benchmark): 53 | token_to_token_pipeline = Pipeline() 54 | token_to_token_pipeline.add(self.token_to_token) 55 | benchmark(token_to_token_pipeline.run, few_tokens) 56 | 57 | def test_many_token_to_token(self, many_tokens, benchmark): 58 | token_to_token_pipeline = Pipeline() 59 | token_to_token_pipeline.add(self.token_to_token) 60 | benchmark(token_to_token_pipeline.run, many_tokens) 61 | 62 | def test_few_token_to_token_array(self, few_tokens, benchmark): 63 | token_to_token_array_pipeline = Pipeline() 64 | token_to_token_array_pipeline.add(self.token_to_token_array) 65 | benchmark(token_to_token_array_pipeline.run, few_tokens) 66 | 67 | def 
test_many_token_to_token_array(self, many_tokens, benchmark): 68 | token_to_token_array_pipeline = Pipeline() 69 | token_to_token_array_pipeline.add(self.token_to_token_array) 70 | benchmark(token_to_token_array_pipeline.run, many_tokens) 71 | 72 | 73 | if __name__ == "__main__": 74 | get_mkdocs_index() 75 | -------------------------------------------------------------------------------- /readme.rst: -------------------------------------------------------------------------------- 1 | |Build Status| |codecov| 2 | 3 | Lunr.py 4 | ======= 5 | 6 | A Python implementation of `Lunr.js <https://lunrjs.com>`__ by `Oliver 7 | Nightingale <https://github.com/olivernn>`__. 8 | 9 | A bit like Solr, but much smaller and not as bright. 10 | 11 | This Python version of Lunr.js aims to bring the simple and powerful 12 | full text search capabilities into Python, guaranteeing results as close 13 | to the original implementation as possible. 14 | 15 | Current state: 16 | -------------- 17 | 18 | Each version of lunr.py `targets a specific version of 19 | lunr.js `__ 20 | and produces the same results as it, in both Python 2.7 and 3, for 21 | a `non-trivial corpus of 22 | documents `__. 23 | 24 | Lunr.py also serializes ``Index`` instances respecting the 25 | ```lunr-schema`` `__, which are 26 | consumable by Lunr.js and vice versa. 27 | 28 | The API is in alpha stage and likely to change. 29 | 30 | Usage: 31 | ------ 32 | 33 | You’ll need a list of dicts representing the documents you want to 34 | search on. These documents must have a unique field which will serve as 35 | a reference and a series of fields you’d like to search on. 36 | 37 | Lunr provides a convenience ``lunr`` function to quickly index this set 38 | of documents: 39 | 40 | .. code:: python 41 | 42 | >>> from lunr import lunr 43 | >>> 44 | >>> documents = [{ 45 | ... 'id': 'a', 46 | ... 'title': 'Mr. Green kills Colonel Mustard', 47 | ... 'body': 'Mr. Green killed Colonel Mustard in the study with the candlestick.', 48 | ... }, { 49 | ... 'id': 'b', 50 | ... 'title': 'Plumb waters plant', 51 | ... 'body': 'Professor Plumb has a green plant in his study', 52 | ... }] 53 | >>> idx = lunr( 54 | ... ref='id', fields=('title', 'body'), documents=documents 55 | ... ) 56 | >>> idx.search('kill') 57 | [{'ref': 'a', 'score': 0.6931722372559913, 'match_data': <MatchData "kill">}] 58 | >>> idx.search('study') 59 | [{'ref': 'b', 'score': 0.23576799568081389, 'match_data': <MatchData "studi">}, {'ref': 'a', 'score': 0.2236629211724517, 'match_data': <MatchData "studi">}] 60 | 61 | .. |Build Status| image:: https://travis-ci.org/yeraydiazdiaz/lunr.py.svg?branch=master 62 | :target: https://travis-ci.org/yeraydiazdiaz/lunr.py 63 | .. |codecov| image:: https://codecov.io/gh/yeraydiazdiaz/lunr.py/branch/master/graph/badge.svg 64 | :target: https://codecov.io/gh/yeraydiazdiaz/lunr.py 65 | -------------------------------------------------------------------------------- /lunr/match_data.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | 4 | class MatchData: 5 | """Contains and collects metadata about a matching document. 6 | 7 | A single instance of lunr.MatchData is returned as part of every 8 | lunr.Index.Result.
9 | """ 10 | 11 | def __init__(self, term=None, field=None, metadata=None): 12 | self.metadata = {} 13 | if term is not None: 14 | self.metadata[term] = {} 15 | if field is not None: 16 | self.metadata[term][field] = ( 17 | deepcopy(metadata) if metadata is not None else {} 18 | ) 19 | 20 | def __repr__(self): 21 | return '<MatchData "{}">'.format(",".join(sorted(self.metadata.keys()))) 22 | 23 | def combine(self, other): 24 | """An instance of lunr.MatchData will be created for every term that 25 | matches a document. 26 | 27 | However only one instance is required in a lunr.Index~Result. This 28 | method combines metadata from another instance of MatchData with this 29 | object's metadata. 30 | """ 31 | for term in other.metadata.keys(): 32 | if term not in self.metadata: 33 | self.metadata[term] = {} 34 | 35 | fields = other.metadata[term].keys() 36 | for field in fields: 37 | if field not in self.metadata[term]: 38 | self.metadata[term][field] = {} 39 | 40 | keys = other.metadata[term][field].keys() 41 | for key in keys: 42 | if key not in self.metadata[term][field]: 43 | self.metadata[term][field][key] = other.metadata[term][field][ 44 | key 45 | ] 46 | else: 47 | self.metadata[term][field][key].extend( 48 | other.metadata[term][field][key] 49 | ) 50 | 51 | def add(self, term, field, metadata): 52 | """Add metadata for a term/field pair to this instance of match data""" 53 | if term not in self.metadata: 54 | self.metadata[term] = {field: metadata} 55 | return 56 | 57 | if field not in self.metadata[term]: 58 | self.metadata[term][field] = metadata 59 | return 60 | 61 | for key in metadata.keys(): 62 | if key in self.metadata[term][field]: 63 | self.metadata[term][field][key].extend(metadata[key]) 64 | else: 65 | self.metadata[term][field][key] = metadata[key] 66 | 67 | def __eq__(self, other): 68 | return self.metadata == other.metadata 69 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = "lunr.py" 21 | copyright = "2022, Yeray Diaz Diaz" 22 | author = "Yeray Diaz Diaz" 23 | 24 | 25 | # -- General configuration --------------------------------------------------- 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be 28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 29 | # ones. 30 | extensions = [ 31 | "sphinx.ext.autodoc", 32 | "sphinx.ext.extlinks", 33 | "sphinx.ext.intersphinx", 34 | "sphinx.ext.mathjax", 35 | "sphinx.ext.todo", 36 | "sphinx.ext.viewcode", 37 | "myst_parser", 38 | ] 39 | 40 | # Add any paths that contain templates here, relative to this directory.
41 | templates_path = ["_templates"] 42 | 43 | # List of patterns, relative to source directory, that match files and 44 | # directories to ignore when looking for source files. 45 | # This pattern also affects html_static_path and html_extra_path. 46 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 47 | 48 | 49 | # -- Options for HTML output ------------------------------------------------- 50 | 51 | # The theme to use for HTML and HTML Help pages. See the documentation for 52 | # a list of builtin themes. 53 | # 54 | html_theme = "furo" 55 | html_title = "Lunr.py" 56 | 57 | # Add any paths that contain custom static files (such as style sheets) here, 58 | # relative to this directory. They are copied after the builtin static files, 59 | # so a file named "default.css" will overwrite the builtin "default.css". 60 | html_static_path = ["_static"] 61 | 62 | # 63 | # -- Options for extlinks ---------------------------------------------------- 64 | # 65 | extlinks = {"pypi": ("https://pypi.org/project/%s/", "")} 66 | 67 | # 68 | # -- Options for intersphinx ------------------------------------------------- 69 | # 70 | intersphinx_mapping = { 71 | "python": ("https://docs.python.org/3", None), 72 | "sphinx": ("https://www.sphinx-doc.org/", None), 73 | } 74 | -------------------------------------------------------------------------------- /tests/test_index.py: -------------------------------------------------------------------------------- 1 | import json 2 | from mock import MagicMock, patch 3 | 4 | import pytest 5 | 6 | from lunr import __TARGET_JS_VERSION__ 7 | from lunr.index import Index 8 | from lunr.exceptions import BaseLunrException 9 | 10 | from tests.utils import assert_vectors_equal 11 | 12 | 13 | class TestIndex: 14 | def test_create_query_default_fields(self, index): 15 | query = index.create_query() 16 | assert query.all_fields == index.fields 17 | 18 | def test_create_query_subset_of_fields(self, index): 19 | query = index.create_query([index.fields[0]]) 20 | assert query.all_fields == [index.fields[0]] 21 | 22 | def test_create_query_non_contained_fields(self, index): 23 | with pytest.raises(BaseLunrException): 24 | index.create_query(["foo"]) 25 | 26 | def test_query_no_arguments_warns_and_returns_no_results(self, monkeypatch, index): 27 | from lunr.index import logger 28 | 29 | mock_logger = MagicMock() 30 | monkeypatch.setattr(logger, "warning", mock_logger) 31 | results = index.query() 32 | assert results == [] 33 | mock_logger.assert_called_once() 34 | 35 | def test_query_callback_argument_is_query_with_fields(self, index): 36 | def callback(query): 37 | assert query.all_fields == index.fields 38 | 39 | index.query(callback=callback) 40 | 41 | def test_query_callback_can_configure_query(self, index): 42 | def callback(query): 43 | query.clause("study") 44 | 45 | results = index.query(callback=callback) 46 | assert len(results) == 2 47 | assert results[0]["ref"] == "b" 48 | assert results[1]["ref"] == "a" 49 | 50 | 51 | class TestIndexSerialization: 52 | def test_serialization(self, index): 53 | serialized_index = index.serialize() 54 | assert serialized_index["version"] == __TARGET_JS_VERSION__ 55 | assert serialized_index["fields"] == index.fields 56 | for ref, vector in serialized_index["fieldVectors"]: 57 | assert ref in index.field_vectors 58 | assert_vectors_equal(vector, index.field_vectors[ref]) 59 | 60 | def test_json_deserialization(self, index): 61 | serialized_index = index.serialize() 62 | json_serialized_index = json.dumps(serialized_index) 63 | 64 | 
idx = Index.load(json_serialized_index) 65 | 66 | assert idx == index 67 | 68 | def test_load_warns_on_js_version_mismatch(self, index): 69 | serialized_index = index.serialize() 70 | serialized_index["version"] = "1.0.0" 71 | 72 | with patch("lunr.index.logger") as mock_log: 73 | Index.load(serialized_index) 74 | mock_log.warning.assert_called_once() 75 | -------------------------------------------------------------------------------- /tests/acceptance_tests/test_language_support.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tempfile 3 | 4 | import pytest 5 | 6 | from lunr import lunr 7 | from lunr.index import Index 8 | from tests.utils import read_json_fixture, run_node_script, assert_results_match 9 | 10 | 11 | @pytest.mark.acceptance 12 | def test_languages_query_results_match_javascript_results(): 13 | query_string = "resistencia" 14 | js_results = run_node_script("language_query.js", query_string).split("\n") 15 | data = read_json_fixture("lang_es.json") 16 | index = lunr( 17 | ref="id", fields=("title", "text"), documents=data["docs"], languages="es" 18 | ) 19 | results = index.search(query_string) 20 | assert_results_match(results, js_results, tol=0.1) 21 | 22 | 23 | @pytest.mark.acceptance 24 | def test_js_serialized_lang_index_can_be_loaded_and_produces_same_results(): 25 | json_path = run_node_script("language_serialize_index.js") 26 | with open(json_path) as fd: 27 | js_serialized_index = fd.read() 28 | 29 | index = Index.load(js_serialized_index) 30 | query_string = "imperio" 31 | results = index.search(query_string) 32 | js_results = run_node_script("language_query.js", query_string).split("\n") 33 | assert_results_match(results, js_results) 34 | 35 | 36 | @pytest.mark.acceptance 37 | def test_serialized_lang_index_can_be_loaded_in_js_and_produces_same_results(): 38 | data = read_json_fixture("lang_es.json") 39 | index = lunr( 40 | ref="id", fields=("title", "text"), documents=data["docs"], languages="es" 41 | ) 42 | query_string = "imperio" 43 | results = index.search(query_string) 44 | serialized_index = index.serialize() 45 | 46 | with tempfile.NamedTemporaryFile(delete=False) as fp: 47 | fp.write(json.dumps(serialized_index).encode()) 48 | 49 | js_results = run_node_script( 50 | "language_load_serialized_index_and_search.js", fp.name, query_string 51 | ).split("\n") 52 | assert_results_match(results, js_results) 53 | 54 | 55 | @pytest.mark.acceptance 56 | def test_serialized_multilang_index_can_be_loaded_in_js_and_results_equal(): 57 | data = read_json_fixture("lang_es_en.json") 58 | index = lunr( 59 | ref="id", 60 | fields=("title", "text"), 61 | documents=data["docs"], 62 | languages=["es", "en"], 63 | ) 64 | query_string = "taxation" 65 | results = index.search(query_string) 66 | serialized_index = index.serialize() 67 | 68 | with tempfile.NamedTemporaryFile(delete=False) as fp: 69 | fp.write(json.dumps(serialized_index).encode()) 70 | 71 | js_results = run_node_script( 72 | "language_load_serialized_index_and_search.js", 73 | fp.name, 74 | query_string, 75 | "lang_es_en.json", 76 | ).split("\n") 77 | assert_results_match(results, js_results) 78 | -------------------------------------------------------------------------------- /lunr/__main__.py: -------------------------------------------------------------------------------- 1 | from lunr import languages as lang 2 | from lunr.builder import Builder 3 | from lunr.stemmer import stemmer 4 | from lunr.trimmer import trimmer 5 | from lunr.stop_word_filter 
import stop_word_filter 6 | 7 | 8 | def lunr(ref, fields, documents, languages=None, builder=None): 9 | """A convenience function to configure and construct a lunr.Index. 10 | 11 | Args: 12 | ref (str): The key in the documents to be used a the reference. 13 | fields (list): A list of strings defining fields in the documents to 14 | index. Optionally a list of dictionaries with three keys: 15 | `field_name` defining the document's field, `boost` an integer 16 | defining a boost to be applied to the field, and `extractor` 17 | a callable taking the document as a single argument and returning 18 | a string located in the document in a particular way. 19 | documents (list): The list of dictonaries representing the documents 20 | to index. Optionally a 2-tuple of dicts, the first one being 21 | the document and the second the associated attributes to it. 22 | languages (str or list, optional): The languages to use if using 23 | NLTK language support, ignored if NLTK is not available. 24 | 25 | Returns: 26 | Index: The populated Index ready to search against. 27 | """ 28 | builder = builder or get_default_builder(languages) 29 | builder.ref(ref) 30 | for field in fields: 31 | if isinstance(field, dict): 32 | builder.field(**field) 33 | else: 34 | builder.field(field) 35 | 36 | for document in documents: 37 | if isinstance(document, (tuple, list)): 38 | builder.add(document[0], attributes=document[1]) 39 | else: 40 | builder.add(document) 41 | 42 | return builder.build() 43 | 44 | 45 | def get_default_builder(languages=None): 46 | """Creates a new pre-configured instance of Builder. 47 | 48 | Useful as a starting point to tweak the defaults. 49 | """ 50 | if languages is not None and lang.LANGUAGE_SUPPORT: 51 | if isinstance(languages, str): 52 | languages = [languages] 53 | 54 | unsupported_languages = set(languages) - set(lang.SUPPORTED_LANGUAGES) 55 | if unsupported_languages: 56 | raise RuntimeError( 57 | "The specified languages {} are not supported, " 58 | "please choose one of {}".format( 59 | ", ".join(unsupported_languages), 60 | ", ".join(lang.SUPPORTED_LANGUAGES.keys()), 61 | ) 62 | ) 63 | builder = lang.get_nltk_builder(languages) 64 | else: 65 | builder = Builder() 66 | builder.pipeline.add(trimmer, stop_word_filter, stemmer) 67 | builder.search_pipeline.add(stemmer) 68 | 69 | return builder 70 | -------------------------------------------------------------------------------- /lunr/stop_word_filter.py: -------------------------------------------------------------------------------- 1 | from lunr.pipeline import Pipeline 2 | 3 | WORDS = { 4 | "a", 5 | "able", 6 | "about", 7 | "across", 8 | "after", 9 | "all", 10 | "almost", 11 | "also", 12 | "am", 13 | "among", 14 | "an", 15 | "and", 16 | "any", 17 | "are", 18 | "as", 19 | "at", 20 | "be", 21 | "because", 22 | "been", 23 | "but", 24 | "by", 25 | "can", 26 | "cannot", 27 | "could", 28 | "dear", 29 | "did", 30 | "do", 31 | "does", 32 | "either", 33 | "else", 34 | "ever", 35 | "every", 36 | "for", 37 | "from", 38 | "get", 39 | "got", 40 | "had", 41 | "has", 42 | "have", 43 | "he", 44 | "her", 45 | "hers", 46 | "him", 47 | "his", 48 | "how", 49 | "however", 50 | "i", 51 | "if", 52 | "in", 53 | "into", 54 | "is", 55 | "it", 56 | "its", 57 | "just", 58 | "least", 59 | "let", 60 | "like", 61 | "likely", 62 | "may", 63 | "me", 64 | "might", 65 | "most", 66 | "must", 67 | "my", 68 | "neither", 69 | "no", 70 | "nor", 71 | "not", 72 | "of", 73 | "off", 74 | "often", 75 | "on", 76 | "only", 77 | "or", 78 | "other", 79 | "our", 80 | "own", 81 | 
"rather", 82 | "said", 83 | "say", 84 | "says", 85 | "she", 86 | "should", 87 | "since", 88 | "so", 89 | "some", 90 | "than", 91 | "that", 92 | "the", 93 | "their", 94 | "them", 95 | "then", 96 | "there", 97 | "these", 98 | "they", 99 | "this", 100 | "tis", 101 | "to", 102 | "too", 103 | "twas", 104 | "us", 105 | "wants", 106 | "was", 107 | "we", 108 | "were", 109 | "what", 110 | "when", 111 | "where", 112 | "which", 113 | "while", 114 | "who", 115 | "whom", 116 | "why", 117 | "will", 118 | "with", 119 | "would", 120 | "yet", 121 | "you", 122 | "your", 123 | } 124 | 125 | 126 | def generate_stop_word_filter(stop_words, language=None): 127 | """Builds a stopWordFilter function from the provided list of stop words. 128 | 129 | The built in `stop_word_filter` is built using this factory and can be used 130 | to generate custom `stop_word_filter` for applications or non English 131 | languages. 132 | """ 133 | 134 | def stop_word_filter(token, i=None, tokens=None): 135 | if token and str(token) not in stop_words: 136 | return token 137 | 138 | # camelCased for for compatibility with lunr.js 139 | label = ( 140 | "stopWordFilter-{}".format(language) 141 | if language is not None 142 | else "stopWordFilter" 143 | ) 144 | Pipeline.register_function(stop_word_filter, label) 145 | return stop_word_filter 146 | 147 | 148 | stop_word_filter = generate_stop_word_filter(WORDS) 149 | -------------------------------------------------------------------------------- /tests/test_language_support.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from lunr import lunr 4 | from lunr.languages import LANGUAGE_SUPPORT, SUPPORTED_LANGUAGES 5 | from lunr.pipeline import Pipeline 6 | 7 | documents = [ 8 | { 9 | "id": "a", 10 | "text": ( 11 | "Este es un ejemplo inventado de lo que sería un documento en el " 12 | "idioma que se más se habla en España." 13 | ), 14 | "title": "Ejemplo de documento en español", 15 | }, 16 | { 17 | "id": "b", 18 | "text": ( 19 | "Según un estudio que me acabo de inventar porque soy un experto en" 20 | "idiomas que se hablan en España." 
21 | ), 22 | "title": "Español es el tercer idioma más hablado del mundo", 23 | }, 24 | ] 25 | 26 | 27 | class TestLanguageSupport: 28 | @classmethod 29 | def setup_class(cls): 30 | assert ( 31 | LANGUAGE_SUPPORT is True 32 | ), "NLTK not found, please run `pip install -e .[languages]`" 33 | 34 | def test_lunr_function_raises_if_unsupported_language(self): 35 | with pytest.raises(RuntimeError): 36 | lunr("id", ["title", "text"], documents, "foo") 37 | 38 | def test_lunr_function_raises_if_any_unsupported_language_is_passed(self): 39 | with pytest.raises(RuntimeError): 40 | lunr("id", ["title", "text"], documents, ["es", "foo"]) 41 | 42 | def test_register_languages_in_pipeline_class(self): 43 | for lang in set(SUPPORTED_LANGUAGES) - {"en"}: 44 | assert "stemmer-{}".format(lang) in Pipeline.registered_functions 45 | 46 | def test_lunr_function_registers_nltk_stemmers_in_pipeline(self): 47 | idx = lunr("id", ["title", "text"], documents, ["es", "it"]) 48 | assert "stemmer-es" in repr(idx.pipeline) 49 | assert "stemmer-it" in repr(idx.pipeline) 50 | 51 | def test_lunr_registers_lun_stemmers_in_pipeline_if_language_is_en(self): 52 | idx = lunr("id", ["title", "text"], documents, ["en", "es"]) 53 | assert "stemmer,stemmer-es" in repr(idx.pipeline) 54 | 55 | def test_search_stems_search_terms(self): 56 | idx = lunr("id", ["title", "text"], documents, "es") 57 | results = idx.search("inventando") # stemmed to "invent" 58 | assert len(results) == 2 59 | 60 | def test_search_stems_search_terms_for_both_languages(self): 61 | italian_document = { 62 | "id": "c", 63 | "text": ( 64 | "Secondo uno studio che ho appena inventato perché sono un " 65 | "esperto di lingue parlate in Spagna." 66 | ), 67 | "title": "Lo spagnolo è la terza lingua più parlata al mondo", 68 | } 69 | idx = lunr( 70 | ref="id", 71 | fields=["title", "text"], 72 | documents=(documents + [italian_document]), 73 | languages=["es", "it"], 74 | ) 75 | results = idx.search("spagna") 76 | assert len(results) == 1 77 | 78 | results = idx.search("inventando") 79 | assert len(results) == 2 80 | -------------------------------------------------------------------------------- /docs/languages.md: -------------------------------------------------------------------------------- 1 | # Language support 2 | 3 | Lunr includes optional and experimental support for languages other than English via the [Natural Language Toolkit](http://www.nltk.org/). To install Lunr with this feature use `pip install lunr[languages]`. 4 | 5 | The currently supported languages are: 6 | 7 | - Arabic 8 | - Danish 9 | - Dutch 10 | - English 11 | - Finnish 12 | - French 13 | - German 14 | - Hungarian 15 | - Italian 16 | - Norwegian 17 | - Portuguese 18 | - Romanian 19 | - Russian 20 | - Spanish 21 | - Swedish 22 | 23 | ```python 24 | >>> documents = [ 25 | ... { 26 | ... "id": "a", 27 | ... "text": ( 28 | ... "Este es un ejemplo inventado de lo que sería un documento en el " 29 | ... "idioma que se más se habla en España."), 30 | ... "title": "Ejemplo de documento en español" 31 | ... }, 32 | ... { 33 | ... "id": "b", 34 | ... "text": ( 35 | ... "Según un estudio que me acabo de inventar porque soy un experto en" 36 | ... "idiomas que se hablan en España."), 37 | ... "title": "Español es el tercer idioma más hablado del mundo" 38 | ... }, 39 | ... 
] 40 | ``` 41 | 42 | > New in 0.5.1: the `lunr` function now accepts more than one language 43 | 44 | Simply specify one or more [ISO-639-1 codes](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) for the language(s) of your documents in the `languages` parameter to the `lunr` function. 45 | 46 | !!! Note 47 | In versions of Lunr prior to 0.5.0 the parameter's name was `language` and it accepted a single string. 48 | 49 | If you have a single language you can pass the language code in `languages`: 50 | 51 | ```python 52 | >>> from lunr import lunr 53 | >>> idx = lunr('id', ['title', 'text'], documents, languages='es') 54 | >>> idx.search('inventando') 55 | [{'ref': 'a', 'score': 0.130, 'match_data': }, 56 | {'ref': 'b', 'score': 0.089, 'match_data': }] 57 | ``` 58 | 59 | !!! Note 60 | In order to construct stemmers, trimmers and stop word filters Lunr imports corpus data from NLTK, which fetches data from GitHub and caches it in your home directory under `nltk_data` by default. You may see some logging indicating such activity during the creation of the index. 61 | 62 | If you have documents in multiple languages, pass a list of language codes: 63 | 64 | ```python 65 | >>> documents.append({ 66 | "id": "c", 67 | "text": "Let's say you also have documents written in English", 68 | "title": "A document in English" 69 | }) 70 | >>> idx = lunr('id', ['title', 'text'], documents, languages=['es', 'en']) 71 | >>> idx.search('english') 72 | [{'ref': 'c', 'score': 1.106, 'match_data': }] 73 | ``` 74 | 75 | ## Notes on language support 76 | 77 | - Using multiple languages means the terms will be stemmed once per language. This can yield unexpected results. 78 | - Compatibility with Lunr.js is ensured for languages that are supported by both platforms; however, results might differ slightly.
79 | + Languages supported by Lunr.js but not by Lunr.py: 80 | * Thai 81 | * Japanese 82 | * Turkish 83 | + Languages supported by Lunr.py but not Lunr.js: 84 | * Arabic 85 | - The usage of the language feature is subject to [NTLK corpus licensing clauses](https://github.com/nltk/nltk#redistributing) 86 | -------------------------------------------------------------------------------- /tests/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pytest 4 | 5 | from lunr.tokenizer import Tokenizer 6 | 7 | 8 | class TestTokenizer: 9 | def test_splitting_into_tokens(self): 10 | tokenizer = Tokenizer("foo bar baz") 11 | tokens = [str(token) for token in tokenizer] 12 | 13 | assert tokens == ["foo", "bar", "baz"] 14 | 15 | def test_run_downcases_tokens(self): 16 | tokenizer = Tokenizer("foo BAR BAZ") 17 | tokens = [str(token) for token in tokenizer] 18 | 19 | assert tokens == ["foo", "bar", "baz"] 20 | 21 | def test_array_of_strings(self): 22 | tokenizer = Tokenizer(["foo", "bar", "baz"]) 23 | tokens = [str(token) for token in tokenizer] 24 | 25 | assert tokens == ["foo", "bar", "baz"] 26 | 27 | def test_none_is_converted_to_empty_string(self): 28 | tokenizer = Tokenizer(["foo", None, "baz"]) 29 | tokens = [str(token) for token in tokenizer] 30 | 31 | assert tokens == ["foo", "", "baz"] 32 | 33 | def test_multiple_whitespace_is_stripped(self): 34 | tokenizer = Tokenizer(" foo bar baz ") 35 | tokens = [str(token) for token in tokenizer] 36 | 37 | assert tokens == ["foo", "bar", "baz"] 38 | 39 | def test_handling_null_like_arguments(self): 40 | assert len(Tokenizer(None)) == 0 41 | 42 | def test_converting_a_number_to_tokens(self): 43 | tokens = [str(token) for token in Tokenizer(41)] 44 | assert tokens == ["41"] 45 | 46 | def test_converting_a_boolean_to_tokens(self): 47 | tokens = [str(token) for token in Tokenizer(False)] 48 | assert tokens == ["false"] 49 | 50 | def test_converting_an_object_to_tokens(self): 51 | class Subject: 52 | def __str__(self): 53 | return "custom object" 54 | 55 | tokens = [str(token) for token in Tokenizer(Subject())] 56 | assert tokens == ["custom", "object"] 57 | 58 | def test_splits_strings_with_hyphens(self): 59 | tokens = [str(token) for token in Tokenizer("foo-bar")] 60 | assert tokens == ["foo", "bar"] 61 | 62 | def test_splits_strings_with_hyphens_and_spaces(self): 63 | tokens = [str(token) for token in Tokenizer("foo - bar")] 64 | assert tokens == ["foo", "bar"] 65 | 66 | def test_tracking_the_token_index(self): 67 | tokens = Tokenizer("foo bar") 68 | assert tokens[0].metadata["index"] == 0 69 | assert tokens[1].metadata["index"] == 1 70 | 71 | def test_tracking_the_token_position(self): 72 | tokens = Tokenizer("foo bar") 73 | assert tokens[0].metadata["position"] == [0, 3] 74 | assert tokens[1].metadata["position"] == [4, 3] 75 | 76 | def test_providing_additional_metadata(self): 77 | tokens = Tokenizer("foo bar", {"hurp": "durp"}) 78 | assert tokens[0].metadata["hurp"] == "durp" 79 | assert tokens[1].metadata["hurp"] == "durp" 80 | 81 | @pytest.mark.parametrize("separator", [re.compile(r"[_\-]+"), lambda c: c in "_-"]) 82 | def test_providing_separator(self, separator): 83 | tokens = [str(token) for token in Tokenizer("foo_bar-baz", separator=separator)] 84 | assert tokens == ["foo", "bar", "baz"] 85 | 86 | def test_tracking_token_position_with_left_hand_whitespace(self): 87 | tokens = Tokenizer(" foo bar") 88 | assert tokens[0].metadata["position"] == [1, 3] 89 | assert 
tokens[1].metadata["position"] == [5, 3] 90 | 91 | def test_tracking_token_position_with_right_hand_whitespace(self): 92 | tokens = Tokenizer("foo bar ") 93 | assert tokens[0].metadata["position"] == [0, 3] 94 | assert tokens[1].metadata["position"] == [4, 3] 95 | -------------------------------------------------------------------------------- /lunr/languages/__init__.py: -------------------------------------------------------------------------------- 1 | from itertools import chain 2 | from functools import partial 3 | 4 | import lunr 5 | from lunr.builder import Builder 6 | from lunr.languages.trimmer import generate_trimmer 7 | from lunr.languages.stemmer import nltk_stemmer, get_language_stemmer 8 | from lunr.pipeline import Pipeline 9 | from lunr.stop_word_filter import stop_word_filter, generate_stop_word_filter 10 | 11 | # map from ISO-639-1 codes to SnowballStemmer.languages 12 | # Languages not supported by nltk but by lunr.js: thai, japanese and turkish 13 | # Languages upported by nltk but not lunr.js: arabic 14 | 15 | SUPPORTED_LANGUAGES = { 16 | "ar": "arabic", 17 | "da": "danish", 18 | "nl": "dutch", 19 | "en": "english", 20 | "fi": "finnish", 21 | "fr": "french", 22 | "de": "german", 23 | "hu": "hungarian", 24 | "it": "italian", 25 | "no": "norwegian", 26 | "pt": "portuguese", 27 | "ro": "romanian", 28 | "ru": "russian", 29 | "es": "spanish", 30 | "sv": "swedish", 31 | } 32 | 33 | try: # pragma: no cover 34 | import nltk # type: ignore 35 | 36 | LANGUAGE_SUPPORT = True 37 | except ImportError: # pragma: no cover 38 | LANGUAGE_SUPPORT = False 39 | 40 | 41 | def _get_stopwords_and_word_characters(language): 42 | nltk.download("stopwords") 43 | verbose_language = SUPPORTED_LANGUAGES[language] 44 | stopwords = nltk.corpus.stopwords.words(verbose_language) 45 | # TODO: search for a more exhaustive list of word characters 46 | word_characters = {c for word in stopwords for c in word} 47 | return stopwords, word_characters 48 | 49 | 50 | def get_nltk_builder(languages): 51 | """Returns a builder with stemmers for all languages added to it. 52 | 53 | Args: 54 | languages (list): A list of supported languages. 
55 | """ 56 | all_stemmers = [] 57 | all_stopwords_filters = [] 58 | all_word_characters = set() 59 | 60 | for language in languages: 61 | if language == "en": 62 | # use Lunr's defaults 63 | all_stemmers.append(lunr.stemmer.stemmer) 64 | all_stopwords_filters.append(stop_word_filter) 65 | all_word_characters.update({r"\w"}) 66 | else: 67 | stopwords, word_characters = _get_stopwords_and_word_characters(language) 68 | all_stemmers.append( 69 | Pipeline.registered_functions["stemmer-{}".format(language)] 70 | ) 71 | all_stopwords_filters.append( 72 | generate_stop_word_filter(stopwords, language=language) 73 | ) 74 | all_word_characters.update(word_characters) 75 | 76 | builder = Builder() 77 | multi_trimmer = generate_trimmer("".join(sorted(all_word_characters))) 78 | Pipeline.register_function( 79 | multi_trimmer, "lunr-multi-trimmer-{}".format("-".join(languages)) 80 | ) 81 | builder.pipeline.reset() 82 | 83 | for fn in chain([multi_trimmer], all_stopwords_filters, all_stemmers): 84 | builder.pipeline.add(fn) 85 | for fn in all_stemmers: 86 | builder.search_pipeline.add(fn) 87 | 88 | return builder 89 | 90 | 91 | def register_languages(): 92 | """Register all supported languages to ensure compatibility.""" 93 | for language in set(SUPPORTED_LANGUAGES) - {"en"}: 94 | language_stemmer = partial(nltk_stemmer, get_language_stemmer(language)) 95 | Pipeline.register_function(language_stemmer, "stemmer-{}".format(language)) 96 | 97 | 98 | if LANGUAGE_SUPPORT: # pragma: no cover 99 | # TODO: registering all possible stemmers feels unnecessary but it solves 100 | # deserializing with arbitrary language functions. Ideally the schema would 101 | # provide the language(s) for the index and we could register the stemmers 102 | # as needed 103 | register_languages() 104 | -------------------------------------------------------------------------------- /docs/lunrjs-interop.md: -------------------------------------------------------------------------------- 1 | # Interoperability with Lunr.js 2 | 3 | A key goal of Lunr.py is interoperability with Lunr.js: building an index with 4 | Lunr.py and being able to read it using Lunr.js without having to build it 5 | on the client on each visit. 6 | 7 | The key step in this process is index serialization, which is possible thanks 8 | to [`lunr-schema`](https://github.com/olivernn/lunr-schema). 9 | 10 | The serialization process in Lunr.py consist on calling `Index.serialize`, 11 | here is a complete example with the data from the [introduction](index.md): 12 | 13 | ```python 14 | >>> import json 15 | >>> from lunr import lunr 16 | >>> documents = [{ 17 | ...: 'id': 'a', 18 | ...: 'title': 'Mr. Green kills Colonel Mustard', 19 | ...: 'body': """Mr. Green killed Colonel Mustard in the study with the 20 | ...: candlestick. Mr. 
Green is not a very nice fellow.""" 21 | ...: }, { 22 | ...: 'id': 'b', 23 | ...: 'title': 'Plumb waters plant', 24 | ...: 'body': 'Professor Plumb has a green and a yellow plant in his study', 25 | ...: }, { 26 | ...: 'id': 'c', 27 | ...: 'title': 'Scarlett helps Professor', 28 | ...: 'body': """Miss Scarlett watered Professor Plumbs green plant 29 | ...: while he was away on his murdering holiday.""", 30 | ...: }] 31 | >>> idx = lunr( 32 | ...: ref='id', 33 | ...: fields=[dict(field_name='title', boost=10), 'body'], 34 | ...: documents=documents 35 | ...: ) 36 | >>> serialized_idx = idx.serialize() 37 | >>> with open('idx.json', 'w') as fd: 38 | ...: json.dump(serialized_idx, fd) 39 | ``` 40 | 41 | As you can see `serialize` will produce a JSON friendly dict you can write to 42 | disk and read from Lunr.js. The following snippet shows how to read the index 43 | using Node.js: 44 | 45 | ```javascript 46 | > const fs = require('fs') 47 | > const lunr = require('lunr') 48 | > const serializedIndex = JSON.parse(fs.readFileSync('idx.json')) 49 | > let idx = lunr.Index.load(serializedIndex) 50 | > idx.search('plant') 51 | [ 52 | { 53 | ref: 'b', 54 | score: 1.599, 55 | matchData: { metadata: [Object: null prototype] } 56 | }, 57 | { 58 | ref: 'c', 59 | score: 0.13, 60 | matchData: { metadata: [Object: null prototype] } 61 | } 62 | ] 63 | ``` 64 | 65 | !!! Note 66 | The search will only the _references_ of the matching documents. 67 | It is up to you to keep mapping of the documents in memory to be able show richer 68 | results which means in a web environment you will need to serve _two_ files, 69 | one for the index and another the collection of documents. 70 | 71 | ## Loading a serialized index 72 | 73 | You can also do the reverse operation of reading a serialized index produced 74 | by Lunr.py or Lunr.js using the `Index.load` class method: 75 | 76 | ```python 77 | >>> import json 78 | >>> from lunr.index import Index 79 | >>> with open("idx.json") as fd: 80 | ... serialized_idx = json.loads(fd.read()) 81 | ... 82 | >>> idx = Index.load(serialized_idx) 83 | >>> idx.search("plant") 84 | [{'ref': 'b', 'score': 1.599, 'match_data': }, {'ref': 'c', 'score': 0.13, 'match_data': }] 85 | ``` 86 | 87 | ## Language support 88 | 89 | Lunr.js uses the 90 | [`lunr-languages`](https://lunrjs.com/guides/language_support.html) package, 91 | a community driven collection of stemmers and trimmers for many languages. 92 | 93 | Porting each of those into Python was not feasible so Lunr.py uses [NTLK](https://www.nltk.org/) 94 | for language support and will configure the serialized index as expected by Lunr.js 95 | to ensure compatibility. 96 | 97 | However, this produces differences in scoring when loading indices from Lunr.py 98 | into Lunr.js larger than those observed using the base english implementation, 99 | due to inherent differences in the implementation of said stemmers and trimmers. 
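To make the workflow concrete, here is a minimal sketch of exporting a language-aware index for Lunr.js; the documents and file name are illustrative, and the scoring caveat above still applies:

```python
import json
from lunr import lunr

documents = [
    {"id": "a", "title": "Episodio IV", "text": "La Princesa Leia vuela hacia su patria."},
    {"id": "b", "title": "Episodio V", "text": "Las tropas imperiales persiguen a los rebeldes."},
]

# Build a Spanish index; serialize() produces a lunr-schema dict that
# lunr.Index.load() on the JavaScript side can consume.
idx = lunr(ref="id", fields=("title", "text"), documents=documents, languages="es")

with open("idx_es.json", "w") as fd:
    json.dump(idx.serialize(), fd)
```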
100 | -------------------------------------------------------------------------------- /tests/test_query.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from lunr.query import Query, QueryPresence 4 | from lunr.token import Token 5 | from lunr.tokenizer import Tokenizer 6 | 7 | 8 | class BaseQuerySuite: 9 | ALL_FIELDS = ["title", "body"] 10 | 11 | def setup_method(self, method): 12 | self.query = Query(self.ALL_FIELDS) 13 | 14 | 15 | class TestQueryTerm(BaseQuerySuite): 16 | def test_single_string_term_adds_single_clause(self): 17 | self.query.clause(term="foo") 18 | 19 | assert len(self.query.clauses) == 1 20 | assert self.query.clauses[0].term == "foo" 21 | assert repr(self.query) == '' 22 | assert repr(self.query.clauses[0]) == '' 23 | 24 | def test_single_token_term_adds_single_clause(self): 25 | self.query.term(Token("foo")) 26 | 27 | assert len(self.query.clauses) == 1 28 | assert self.query.clauses[0].term == "foo" 29 | 30 | def test_multiple_string_terms_adds_multiple_clauses(self): 31 | self.query.term(["foo", "bar"]) 32 | 33 | assert len(self.query.clauses) == 2 34 | assert self.query.clauses[0].term == "foo" 35 | assert self.query.clauses[1].term == "bar" 36 | assert repr(self.query) == ('') 37 | 38 | def test_multiple_token_terms_adds_multiple_clauses(self): 39 | self.query.term(Tokenizer("foo bar")) 40 | 41 | assert len(self.query.clauses) == 2 42 | assert self.query.clauses[0].term == "foo" 43 | assert self.query.clauses[1].term == "bar" 44 | 45 | def test_multiple_string_terms_with_options(self): 46 | self.query.term(["foo", "bar"], use_pipeline=False) 47 | 48 | assert len(self.query.clauses) == 2 49 | assert self.query.clauses[0].term == "foo" 50 | assert self.query.clauses[1].term == "bar" 51 | 52 | 53 | class TestQueryClause(BaseQuerySuite): 54 | def test_clause_defaults(self): 55 | self.query.clause(term="foo") 56 | self.clause = self.query.clauses[0] 57 | 58 | assert self.clause.fields == self.ALL_FIELDS 59 | assert self.clause.boost == 1 60 | assert self.clause.use_pipeline is True 61 | 62 | def test_clause_specified(self): 63 | self.query.clause(term="foo", boost=10, fields=["title"], use_pipeline=False) 64 | self.clause = self.query.clauses[0] 65 | 66 | assert self.clause.fields == ["title"] 67 | assert self.clause.boost == 10 68 | assert self.clause.use_pipeline is False 69 | 70 | @pytest.mark.parametrize( 71 | "wildcard, expected_term", 72 | [ 73 | (Query.WILDCARD_NONE, "foo"), 74 | (Query.WILDCARD_LEADING, "*foo"), 75 | (Query.WILDCARD_TRAILING, "foo*"), 76 | (Query.WILDCARD_LEADING | Query.WILDCARD_TRAILING, "*foo*"), 77 | ], 78 | ) 79 | def test_clause_wildcard(self, wildcard, expected_term): 80 | self.query.clause(term="foo", wildcard=wildcard) 81 | self.clause = self.query.clauses[0] 82 | 83 | assert self.clause.term == expected_term 84 | 85 | def test_clause_wildcard_existing(self): 86 | self.query.clause( 87 | term="*foo*", wildcard=Query.WILDCARD_LEADING | Query.WILDCARD_TRAILING 88 | ) 89 | self.clause = self.query.clauses[0] 90 | 91 | assert self.clause.term == "*foo*" 92 | 93 | 94 | class TestQueryIsNegated(BaseQuerySuite): 95 | def test_all_prohibited(self): 96 | self.query.term("foo", presence=QueryPresence.PROHIBITED) 97 | self.query.term("bar", presence=QueryPresence.PROHIBITED) 98 | 99 | assert self.query.is_negated() is True 100 | 101 | def test_some_prohibited(self): 102 | self.query.term("foo", presence=QueryPresence.PROHIBITED) 103 | self.query.term("bar", 
presence=QueryPresence.REQUIRED) 104 | 105 | assert self.query.is_negated() is False 106 | 107 | def test_nome_prohibited(self): 108 | self.query.term("foo", presence=QueryPresence.OPTIONAL) 109 | self.query.term("bar", presence=QueryPresence.REQUIRED) 110 | 111 | assert self.query.is_negated() is False 112 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 0.6.2 (2022-02-27) 4 | 5 | - Add `Pipeline.skip` method to skip pipeline functions on specific fields 6 | @tristanlatr 7 | 8 | ### Deprecation warning 9 | 10 | - 0.6.2 will be the last release to support Python 3.6. 11 | 12 | ## 0.6.1 (2021-10-16) 13 | 14 | - Add support for Python 3.10. 15 | - Remove pin to NLTK < 3.5. 16 | 17 | ## 0.6.0 (2021-04-22) 18 | 19 | - Add index customisation, enabling build and search pipeline tweaks as well as 20 | meta-data whitelisting. 21 | 22 | ## 0.5.9 (2021-01-10) 23 | 24 | - Compatibility with Lunr.js 2.3.9: 25 | - Fix bug where clause matches are incorrectly initialized to a complete set. 26 | - Add support for Python 3.9 27 | - Drop support for Python 3.5 28 | 29 | ## 0.5.8 (2020-04-16) 30 | 31 | - Fix installing ntlk in 2.7 without `languages` extra. 32 | - Optimize regexes and avoid usage by default. 33 | 34 | ### Deprecation warning 35 | 36 | - 0.5.8 will be the last release to support Python 2.7. 37 | 38 | ## 0.5.7 (2020-04-14) 39 | 40 | - Prevent installing an unsupported version of NLTK in Python 2.7. 41 | 42 | ## 0.5.6 (2019-11-17) 43 | 44 | - Support for Python 3.8 45 | - Compatibility with Lunr.js 2.3.8: 46 | - Fix bug where leading white space would cause token position metadata to be reported incorrectly. 47 | 48 | ## 0.5.5 (2019-04-28) 49 | 50 | - Compatibility with Lunr.js 2.3.6: 51 | - Fix bug with fuzzy matching that meant deletions at the end of a word would not match. 52 | 53 | ## 0.5.4 (2018-11-10) 54 | 55 | - Compatibility with Lunr.js 2.3.5: 56 | - Fix bug on fuzzy matching ignoring matches on insertions at the end of the word. 57 | 58 | ## 0.5.3 (2018-09-08) 59 | 60 | - Performance improvements on indexing 61 | - Compatibility with Lunr.js 2.3.3: 62 | - Fixes catastrophic backtracking on leading wildcards 63 | 64 | ## 0.5.2 (2018-08-25) 65 | 66 | - Fix Python 2.7 support 67 | 68 | ## 0.5.1 (2018-08-25) 69 | 70 | - Added multilanguage support 71 | - Improved language support 72 | 73 | ### Deprecation warning 74 | 75 | - The `language` argument to the `lunr` has been renamed to `languages` to accomodate for multilanguage support. The `languages` argument accepts a string or an iterable of ISO-639-1 languages codes. If you're calling `lunr` with keyword arguments please update such calls accordingly. 76 | 77 | ## 0.4.3 (2018-08-18) 78 | 79 | - Target Lunr.js v2.3.2 80 | 81 | ## 0.4.2 (2018-07-28) 82 | 83 | - Target Lunr.js v2.3.1 84 | - Fix crash when using non-string document references. 85 | 86 | ## 0.4.1 (2018-07-07) 87 | 88 | - Added support for Python 3.7 89 | 90 | ## 0.4.0 (2018-06-25) 91 | 92 | - Compatibility with Lunr.js v2.3.0. Including: 93 | + Add support for build time field and document boosts. 94 | + Add support for indexing nested document fields using field extractors. 95 | + Prevent usage of problematic characters in field names 96 | 97 | ## 0.3.0 (2018-06-03) 98 | 99 | - Compatibility with Lunr.js v2.2.1. Including: 100 | + Add support for queries with term presence, e.g. required terms and prohibited terms. 
101 | + Add support for using the output of `lunr.Tokenizer` directly with `lunr.Query.term`. 102 | + Add field name metadata to tokens in build and search pipelines. 103 | 104 | ## 0.2.3 (2018-05-19) 105 | 106 | - Compatibility with Lunr.js v2.1.6 107 | 108 | ## 0.2.2 (2018-05-15) 109 | 110 | - Fix bug on whitelisting metadata in Builder. 111 | 112 | ## 0.2.1 (2018-04-21) 113 | 114 | - Refactor of multilanguage support. 115 | 116 | ## 0.2.0 (2018-04-15) 117 | 118 | - Experimental support for languages via NLTK, currently supported languages are arabic, danish, dutch, english, finnish, french, german, hungarian, italian, norwegian, portuguese, romanian, russian, spanish and swedish. Note compatibility with Lunr.js and lunr-languages is reduced. 119 | 120 | ## 0.1.2 (2018-03-17) 121 | 122 | - Add serialization tests passing serialized index from Python to JS and producing same results. 123 | - Added `Index.create_query` returning a preinitialized `Query` with the index's fields or a subset of them. 124 | - `Index.search` does not accept a callback function, instead expects a `Query` object the user should preconfigure first. 125 | - Various docstring and repr changes. 126 | 127 | ## 0.1.1a1 (2018-03-01) 128 | 129 | - Initial release -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | hide-toc: true 3 | --- 4 | 5 | # Lunr.py 🌖 6 | 7 | A Python implementation of [Lunr.js](https://lunrjs.com) by [Oliver Nightingale](https://github.com/olivernn). 8 | 9 | > A bit like Solr, but much smaller and not as bright. 10 | 11 | This Python version of Lunr.js aims to bring the simple and powerful full text search 12 | capabilities into Python guaranteeing results as close as the original 13 | implementation as possible. 14 | 15 | ## What does this even do? 16 | 17 | Lunr is a simple full text search solution for situations where deploying a full 18 | scale solution like Elasticsearch isn't possible, viable or you're simply prototyping. 19 | Lunr parses a set of documents and creates an inverted index for quick full text 20 | searches in the same way other more complicated solution. 21 | 22 | The trade-off is that Lunr keeps the inverted index in memory and requires you 23 | to recreate or read the index at the start of your application. 24 | 25 | ## Interoperability with Lunr.js 26 | 27 | A core objective of Lunr.py is to [provide interoperability with the JavaScript 28 | version](lunrjs-interop.md). 29 | 30 | An example can be found in the [MkDocs documentation library](http://www.mkdocs.org/). 31 | MkDocs produces a set of documents from the pages of the documentation and uses 32 | [Lunr.js](https://lunrjs.com) in the frontend to power its built-in searching 33 | engine. This set of documents is in the form of a JSON file which needs to be 34 | fetched and parsed by Lunr.js to create the inverted index at startup of your application. 35 | 36 | While this is not a problem for most sites, depending on the size of your document 37 | set, this can take some time. 38 | 39 | Lunr.py provides a backend solution, allowing you to parse the documents in Python 40 | of time and create a serialized Lunr.js index you can pass have the browser 41 | version read, minimizing start up time of your application. 
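A rough sketch of that pre-building workflow follows; the file names and the exact shape of the MkDocs search index (a `docs` list whose entries carry a unique `location` plus `title` and `text` fields) are assumptions for illustration:

```python
import json
from lunr import lunr

# Read the documents MkDocs generated for its client-side search (assumed layout).
with open("search_index.json") as fd:
    documents = json.load(fd)["docs"]

# Index ahead of time and ship the serialized index alongside the documents;
# the browser then only needs lunr.Index.load() instead of rebuilding the index.
idx = lunr(ref="location", fields=("title", "text"), documents=documents)

with open("prebuilt_index.json", "w") as fd:
    json.dump(idx.serialize(), fd)
```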
42 | 43 | Each version of lunr.py 44 | [targets a specific version of lunr.js](https://github.com/yeraydiazdiaz/lunr.py/blob/master/lunr/__init__.py#L12) 45 | and produces the same results as it both in Python 2.7 and 3 for 46 | [non-trivial corpus of documents](https://github.com/yeraydiazdiaz/lunr.py/blob/master/tests/acceptance_tests/fixtures/mkdocs_index.json). 47 | 48 | Lunr.py also serializes `Index` instances respecting the 49 | [`lunr-schema`](https://github.com/olivernn/lunr-schema) which are consumable by 50 | Lunr.js and viceversa. 51 | 52 | ## Installation 53 | 54 | `pip install lunr` 55 | 56 | An optional and experimental support for other languages thanks to the 57 | [Natural Language Toolkit](http://www.nltk.org/) stemmers is also available via 58 | `pip install lunr[languages]`. The usage of the language feature is subject to 59 | [NTLK corpus licensing clauses](https://github.com/nltk/nltk#redistributing). 60 | 61 | Please refer to the 62 | [documentation page on languages](https://lunr.readthedocs.io/en/latest/languages/) 63 | for more information. 64 | 65 | ## Usage 66 | 67 | First, you'll need a list of dicts representing the documents you want to search on. 68 | These documents must have a unique field which will serve as a reference and a 69 | series of fields you'd like to search on. 70 | 71 | Lunr provides a convenience `lunr` function to quickly index this set of documents: 72 | 73 | ```python 74 | >>> from lunr import lunr 75 | >>> 76 | >>> documents = [{ 77 | ... 'id': 'a', 78 | ... 'title': 'Mr. Green kills Colonel Mustard', 79 | ... 'body': 'Mr. Green killed Colonel Mustard in the study with the candlestick.', 80 | ... }, { 81 | ... 'id': 'b', 82 | ... 'title': 'Plumb waters plant', 83 | ... 'body': 'Professor Plumb has a green plant in his study', 84 | ... }] 85 | >>> idx = lunr( 86 | ... ref='id', fields=('title', 'body'), documents=documents 87 | ... ) 88 | >>> idx.search('kill') 89 | [{'ref': 'a', 'score': 0.6931722372559913, 'match_data': }] 90 | >>> idx.search('study') 91 | [{'ref': 'b', 'score': 0.23576799568081389, 'match_data': }, {'ref': 'a', 'score': 0.2236629211724517, 'match_data': }] 92 | ``` 93 | 94 | ```{toctree} 95 | :hidden: 96 | usage 97 | indices 98 | languages 99 | lunrjs-interop 100 | changelog 101 | customisation 102 | GitHub Repository 103 | ``` 104 | -------------------------------------------------------------------------------- /lunr/query_lexer.py: -------------------------------------------------------------------------------- 1 | from lunr.tokenizer import default_separator 2 | 3 | 4 | class QueryLexer: 5 | # TODO: use iteration protocol? 
6 | EOS = "EOS" 7 | FIELD = "FIELD" 8 | TERM = "TERM" 9 | EDIT_DISTANCE = "EDIT_DISTANCE" 10 | BOOST = "BOOST" 11 | PRESENCE = "PRESENCE" 12 | 13 | def __init__(self, string): 14 | self.lexemes = [] 15 | self.string = string 16 | self.length = len(string) 17 | self.pos = 0 18 | self.start = 0 19 | self.escape_char_positions = [] 20 | 21 | @property 22 | def width(self): 23 | return self.pos - self.start 24 | 25 | def ignore(self): 26 | if self.start == self.pos: 27 | self.pos += 1 28 | 29 | self.start = self.pos 30 | 31 | def backup(self): 32 | self.pos -= 1 33 | 34 | def accept_digit_run(self): 35 | char = self.next() 36 | while char != self.EOS and (47 < ord(char) < 58): 37 | char = self.next() 38 | 39 | if char != self.EOS: 40 | self.backup() 41 | 42 | def run(self): 43 | state = self.lex_text() 44 | while state: 45 | state = state() 46 | 47 | def slice_string(self): 48 | subslices = [] 49 | slice_start = self.start 50 | 51 | for escape_char_position in self.escape_char_positions: 52 | subslices.append(self.string[slice_start:escape_char_position]) 53 | slice_start = escape_char_position + 1 54 | 55 | subslices.append(self.string[slice_start : self.pos]) 56 | self.escape_char_positions = [] 57 | 58 | return "".join(subslices) 59 | 60 | def next(self): 61 | if self.pos >= self.length: 62 | return self.EOS 63 | 64 | char = self.string[self.pos] 65 | self.pos += 1 66 | return char 67 | 68 | def emit(self, type_): 69 | self.lexemes.append( 70 | { 71 | "type": type_, 72 | "string": self.slice_string(), 73 | "start": self.start, 74 | "end": self.pos, 75 | } 76 | ) 77 | self.start = self.pos 78 | 79 | def escape_character(self): 80 | self.escape_char_positions.append(self.pos - 1) 81 | self.pos += 1 82 | 83 | def lex_field(self): 84 | self.backup() 85 | self.emit(self.FIELD) 86 | self.ignore() 87 | return self.lex_text 88 | 89 | def lex_term(self): 90 | if self.width > 1: 91 | self.backup() 92 | self.emit(self.TERM) 93 | 94 | self.ignore() 95 | 96 | return self.lex_text 97 | 98 | def lex_edit_distance(self): 99 | self.ignore() 100 | self.accept_digit_run() 101 | self.emit(self.EDIT_DISTANCE) 102 | return self.lex_text 103 | 104 | def lex_boost(self): 105 | self.ignore() 106 | self.accept_digit_run() 107 | self.emit(self.BOOST) 108 | return self.lex_text 109 | 110 | def lex_EOS(self): 111 | if self.width > 0: 112 | self.emit(self.TERM) 113 | 114 | def lex_text(self): 115 | while True: 116 | char = self.next() 117 | if char == self.EOS: 118 | return self.lex_EOS 119 | 120 | if ord(char) == 92: # Escape character is '\' 121 | self.escape_character() 122 | continue 123 | 124 | if char == ":": 125 | return self.lex_field 126 | 127 | if char == "~": 128 | self.backup() 129 | if self.width > 0: 130 | self.emit(self.TERM) 131 | 132 | return self.lex_edit_distance 133 | 134 | if char == "^": 135 | self.backup() 136 | if self.width > 0: 137 | self.emit(self.TERM) 138 | 139 | return self.lex_boost 140 | 141 | # '+' indicates term presence is required, check for length to 142 | # ensure only a leading '+' is considered 143 | if char == "+" and self.width == 1: 144 | self.emit(self.PRESENCE) 145 | return self.lex_text 146 | 147 | # '-' indicates term presence is prohibited 148 | if char == "-" and self.width == 1: 149 | self.emit(self.PRESENCE) 150 | return self.lex_text 151 | 152 | if default_separator(char): 153 | return self.lex_term 154 | -------------------------------------------------------------------------------- /docs/customisation.md: 
-------------------------------------------------------------------------------- 1 | # Customisation 2 | 3 | Lunr.py ships with some sensible defaults to create indexes and search easily, 4 | but in some cases you may want to tweak how documents are indexed and searched. 5 | You can do that in lunr.py by passing your own `Builder` instance to the `lunr` 6 | function. 7 | 8 | ## Pipeline functions 9 | 10 | When the builder processes your documents it splits (tokenises) the text, and 11 | applies a series of functions to each token. These are called pipeline functions. 12 | 13 | The builder includes two pipelines, indexing and searching. 14 | 15 | If you want to change the way lunr.py indexes the documents you'll need to 16 | change the indexing pipeline. 17 | 18 | For example, say you wanted to support both the American and British spellings of 19 | certain words, you could use a normalisation pipeline function to force one 20 | token into the other: 21 | 22 | ```python 23 | from lunr import lunr, get_default_builder 24 | from lunr.pipeline import Pipeline 25 | 26 | documents = [...] 27 | 28 | builder = get_default_builder() 29 | def normalise_spelling(token, i, tokens): 30 | if str(token) == "gray": 31 | return token.update(lambda *_: "grey") 32 | else: 33 | return token 34 | 35 | Pipeline.register_function(normalise_spelling) 36 | builder.pipeline.add(normalise_spelling) 37 | 38 | idx = lunr(ref="id", fields=("title", "body"), documents=documents, builder=builder) 39 | ``` 40 | 41 | Note that pipeline functions take the token being processed, its position in the 42 | token list, and the token list itself. 43 | 44 | ## Skip a pipeline function for specific field names 45 | 46 | The `Pipeline.skip()` method allows you to skip a pipeline function for specific field names. 47 | This example skips the `stop_word_filter` pipeline function for the field `fullName`. 48 | 49 | ```python 50 | from lunr import lunr, get_default_builder, stop_word_filter 51 | 52 | documents = [...] 53 | 54 | builder = get_default_builder() 55 | 56 | builder.pipeline.skip(stop_word_filter.stop_word_filter, ["fullName"]) 57 | 58 | idx = lunr(ref="id", fields=("fullName", "body"), documents=documents, builder=builder) 59 | ``` 60 | 61 | ## Token meta-data 62 | 63 | Lunr.py `Token` instances include meta-data which can be used in 64 | pipeline functions. This meta-data is not stored in the index by default, but it 65 | can be by adding it to the builder's `metadata_whitelist` property. This will 66 | include the meta-data in the search results: 67 | 68 | ```python 69 | from lunr import lunr, get_default_builder 70 | from lunr.pipeline import Pipeline 71 | 72 | builder = get_default_builder() 73 | 74 | def token_length(token, i, tokens): 75 | token.metadata["token_length"] = len(str(token)) 76 | return token 77 | 78 | Pipeline.register_function(token_length) 79 | builder.pipeline.add(token_length) 80 | builder.metadata_whitelist.append("token_length") 81 | 82 | idx = lunr("id", ("title", "body"), documents, builder=builder) 83 | 84 | [result, _, _] = idx.search("green") 85 | assert result["match_data"].metadata["green"]["title"]["token_length"] == [5] 86 | assert result["match_data"].metadata["green"]["body"]["token_length"] == [5, 5] 87 | ``` 88 | 89 | ## Similarity tuning 90 | 91 | The algorithm used by Lunr to calculate similarity between a query and a document 92 | can be tuned using two parameters.
Lunr ships with sensible defaults, and these 93 | can be adjusted to provide the best results for a given collection of documents. 94 | 95 | - **b**: This parameter controls the importance given to the length of a 96 | document and its fields. This value must be between 0 and 1, and by default it 97 | has a value of 0.75. Reducing this value reduces the effect of different length 98 | documents on a term’s importance to that document. 99 | - **k1**: This controls how quickly the boost given by a common word reaches 100 | saturation. Increasing it will slow down the rate of saturation and lower values 101 | result in quicker saturation. The default value is 1.2. If the collection of 102 | documents being indexed have high occurrences of words that are not covered by 103 | a stop word filter, these words can quickly dominate any similarity calculation. 104 | In these cases, this value can be reduced to get more balanced results. 105 | 106 | These values can be changed in the builder: 107 | 108 | ```python 109 | from lunr import lunr, get_default_builder 110 | 111 | builder = get_default_builder() 112 | builder.k1(1.3) 113 | builder.b(0) 114 | 115 | idx = lunr("id", ("title", "body"), documents, builder=builder) 116 | ``` 117 | 118 | -------------------------------------------------------------------------------- /tests/test_vector.py: -------------------------------------------------------------------------------- 1 | from math import sqrt 2 | 3 | import pytest 4 | 5 | from lunr.vector import Vector 6 | from lunr.exceptions import BaseLunrException 7 | 8 | 9 | def _vector_from_args(*args): 10 | vector = Vector() 11 | for i, arg in enumerate(args): 12 | vector.insert(i, arg) 13 | return vector 14 | 15 | 16 | def test_vector_repr(): 17 | vector = _vector_from_args(1, 3, -5) 18 | assert repr(vector) == "".format(vector.magnitude) 19 | 20 | 21 | class TestVectorPositionForIndex: 22 | 23 | vector = Vector([1, "a", 2, "b", 4, "c", 7, "d", 11, "e"]) 24 | 25 | def test_position_for_index_at_the_beggining(self): 26 | assert self.vector.position_for_index(0) == 0 27 | 28 | def test_position_for_index_at_the_end(self): 29 | assert self.vector.position_for_index(20) == 10 30 | 31 | def test_position_for_index_consecutive(self): 32 | assert self.vector.position_for_index(3) == 4 33 | 34 | def test_position_for_index_non_consecutive_gap_after(self): 35 | assert self.vector.position_for_index(5) == 6 36 | 37 | def test_position_for_index_non_consecutive_gap_before(self): 38 | assert self.vector.position_for_index(6) == 6 39 | 40 | def test_position_for_index_non_consecutive_gap_before_and_after(self): 41 | assert self.vector.position_for_index(9) == 8 42 | 43 | def test_position_for_index_duplicate_at_the_beggining(self): 44 | assert self.vector.position_for_index(1) == 0 45 | 46 | def test_position_for_index_duplicate_at_the_end(self): 47 | assert self.vector.position_for_index(11) == 8 48 | 49 | def test_position_for_index_duplicate_consecutive(self): 50 | assert self.vector.position_for_index(4) == 4 51 | 52 | 53 | def test_magnitude_calculates_magnitude(): 54 | vector = _vector_from_args(4, 5, 6) 55 | assert sqrt(77) == vector.magnitude 56 | 57 | 58 | def test_dot_calculates_dot_product_of_two_vectors(): 59 | v1 = _vector_from_args(1, 3, -5) 60 | v2 = _vector_from_args(4, -2, -1) 61 | 62 | assert v1.dot(v2) == 3 63 | 64 | 65 | class TestSimilarity: 66 | def test_similarity_calculates_the_similarity_between_two_vectors(self): 67 | v1 = _vector_from_args(1, 3, -5) 68 | v2 = _vector_from_args(4, -2, -1) 
69 | 70 | assert v1.similarity(v2) == pytest.approx(0.5, 0.1) 71 | 72 | def test_empty_vector(self): 73 | v_empty = Vector() 74 | v1 = _vector_from_args(1) 75 | 76 | assert v1.similarity(v_empty) == 0 77 | assert v_empty.similarity(v1) == 0 78 | 79 | def test_non_overlapping_vector(self): 80 | v1 = Vector([1, 1]) 81 | v2 = Vector([2, 1]) 82 | 83 | assert v1.similarity(v2) == 0 84 | assert v2.similarity(v1) == 0 85 | 86 | 87 | class TestVectorInsert: 88 | def test_insert_invalidates_magnitude_cache(self): 89 | vector = _vector_from_args(4, 5, 6) 90 | assert sqrt(77) == vector.magnitude 91 | 92 | vector.insert(3, 7) 93 | 94 | assert sqrt(126) == vector.magnitude 95 | 96 | def test_insert_keeps_items_in_index_specified_order(self): 97 | vector = Vector() 98 | 99 | vector.insert(2, 4) 100 | vector.insert(1, 5) 101 | vector.insert(0, 6) 102 | 103 | assert vector.to_list() == [6, 5, 4] 104 | 105 | def test_insert_fails_when_duplicate_entry(self): 106 | vector = _vector_from_args(4, 5, 6) 107 | with pytest.raises(BaseLunrException): 108 | vector.insert(0, 44) 109 | 110 | 111 | class TestVectorUpsert: 112 | def test_upsert_invalidates_magnitude_cache(self): 113 | vector = _vector_from_args(4, 5, 6) 114 | assert vector.magnitude == sqrt(77) 115 | 116 | vector.upsert(3, 7) 117 | 118 | assert vector.magnitude == sqrt(126) 119 | 120 | def test_upsert_keeps_items_in_index_specified_order(self): 121 | vector = Vector() 122 | 123 | vector.upsert(2, 4) 124 | vector.upsert(1, 5) 125 | vector.upsert(0, 6) 126 | 127 | assert vector.to_list() == [6, 5, 4] 128 | 129 | def test_upsert_calls_fn_for_value_on_duplicate(self): 130 | vector = _vector_from_args(4, 5, 6) 131 | 132 | vector.upsert(0, 4, lambda current, passed: current + passed) 133 | 134 | assert vector.to_list() == [8, 5, 6] 135 | 136 | def test_upsert_defaults_to_passed_value_on_duplicate(self): 137 | vector = _vector_from_args(4, 5, 6) 138 | 139 | vector.upsert(0, 3) 140 | 141 | assert vector.to_list() == [3, 5, 6] 142 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://github.com/yeraydiazdiaz/lunr.py/workflows/CI/badge.svg?branch=master)](https://github.com/yeraydiazdiaz/lunr.py/actions?workflow=CI) 2 | [![codecov](https://codecov.io/gh/yeraydiazdiaz/lunr.py/branch/master/graph/badge.svg)](https://codecov.io/gh/yeraydiazdiaz/lunr.py) 3 | [![Supported Python Versions](https://img.shields.io/pypi/pyversions/lunr.svg)](https://pypi.org/project/lunr/) 4 | [![PyPI](https://img.shields.io/pypi/v/lunr.svg)](https://pypi.org/project/lunr/) 5 | [![Read the Docs](https://img.shields.io/readthedocs/lunr.svg)](http://lunr.readthedocs.io/en/latest/) 6 | [![Downloads](http://pepy.tech/badge/lunr)](http://pepy.tech/project/lunr) 7 | 8 | # Lunr.py 9 | 10 | A Python implementation of [Lunr.js](https://lunrjs.com) by [Oliver Nightingale](https://github.com/olivernn). 11 | 12 | > A bit like Solr, but much smaller and not as bright. 13 | 14 | This Python version of Lunr.js aims to bring the simple and powerful full text search 15 | capabilities into Python guaranteeing results as close as the original 16 | implementation as possible. 17 | 18 | - [Documentation](http://lunr.readthedocs.io/en/latest/) 19 | 20 | ## What does this even do? 21 | 22 | Lunr is a simple full text search solution for situations where deploying a full 23 | scale solution like Elasticsearch isn't possible, viable or you're simply prototyping. 
24 | Lunr parses a set of documents and creates an inverted index for quick full text 25 | searches in the same way other more complicated solution. 26 | 27 | The trade-off is that Lunr keeps the inverted index in memory and requires you 28 | to recreate or read the index at the start of your application. 29 | 30 | ## Interoperability with Lunr.js 31 | 32 | A core objective of Lunr.py is to provide 33 | [interoperability with the JavaScript version](https://lunr.readthedocs.io/en/latest/lunrjs-interop). 34 | 35 | An example can be found in the [MkDocs documentation library](http://www.mkdocs.org/). 36 | MkDocs produces a set of documents from the pages of the documentation and uses 37 | [Lunr.js](https://lunrjs.com) in the frontend to power its built-in searching 38 | engine. This set of documents is in the form of a JSON file which needs to be 39 | fetched and parsed by Lunr.js to create the inverted index at startup of your application. 40 | 41 | While this is not a problem for most sites, depending on the size of your document 42 | set, this can take some time. 43 | 44 | Lunr.py provides a backend solution, allowing you to parse the documents in Python 45 | of time and create a serialized Lunr.js index you can pass have the browser 46 | version read, minimizing start up time of your application. 47 | 48 | Each version of lunr.py 49 | [targets a specific version of lunr.js](https://github.com/yeraydiazdiaz/lunr.py/blob/master/lunr/__init__.py#L12) 50 | and produces the same results for a 51 | [non-trivial corpus of documents](https://github.com/yeraydiazdiaz/lunr.py/blob/master/tests/acceptance_tests/fixtures/mkdocs_index.json). 52 | 53 | ## Installation 54 | 55 | `pip install lunr` 56 | 57 | An optional and experimental support for other languages thanks to the 58 | [Natural Language Toolkit](http://www.nltk.org/) stemmers is also available via 59 | `pip install lunr[languages]`. The usage of the language feature is subject to 60 | [NTLK corpus licensing clauses](https://github.com/nltk/nltk#redistributing). 61 | 62 | Please refer to the 63 | [documentation page on languages](https://lunr.readthedocs.io/en/latest/languages.html) 64 | for more information. 65 | 66 | ## Usage 67 | 68 | First, you'll need a list of dicts representing the documents you want to search on. 69 | These documents must have a unique field which will serve as a reference and a 70 | series of fields you'd like to search on. 71 | 72 | Lunr provides a convenience `lunr` function to quickly index this set of documents: 73 | 74 | ```python 75 | >>> from lunr import lunr 76 | >>> 77 | >>> documents = [{ 78 | ... 'id': 'a', 79 | ... 'title': 'Mr. Green kills Colonel Mustard', 80 | ... 'body': 'Mr. Green killed Colonel Mustard in the study with the candlestick.', 81 | ... }, { 82 | ... 'id': 'b', 83 | ... 'title': 'Plumb waters plant', 84 | ... 'body': 'Professor Plumb has a green plant in his study', 85 | ... }] 86 | >>> idx = lunr( 87 | ... ref='id', fields=('title', 'body'), documents=documents 88 | ... ) 89 | >>> idx.search('kill') 90 | [{'ref': 'a', 'score': 0.6931722372559913, 'match_data': }] 91 | >>> idx.search('study') 92 | [{'ref': 'b', 'score': 0.23576799568081389, 'match_data': }, {'ref': 'a', 'score': 0.2236629211724517, 'match_data': }] 93 | ``` 94 | 95 | Please refer to the [documentation](http://lunr.readthedocs.io/en/latest/) 96 | for more usage examples. 
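As one more quick example, an index can be serialized to JSON and reloaded later without re-indexing; a minimal sketch continuing from the documents above:

```python
>>> import json
>>> from lunr.index import Index
>>> serialized = json.dumps(idx.serialize())
>>> restored = Index.load(serialized)
>>> [result['ref'] for result in restored.search('kill')]
['a']
```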
97 | -------------------------------------------------------------------------------- /tests/acceptance_tests/fixtures/lang_es.json: -------------------------------------------------------------------------------- 1 | { 2 | "docs": [ 3 | { 4 | "id": "a", 5 | "text": "La República Galáctica está sumida en el caos. Los impuestos de las rutas comerciales a los sistemas estelares exteriores están en disputa. Esperando resolver el asunto con un bloqueo de poderosas naves de guerra, la codiciosa Federación de Comercio ha detenido todos los envíos al pequeño planeta de Naboo. Mientras el Congreso de la República debate interminablemente esta alarmante cadena de acontecimientos, el Canciller Supremo ha enviado en secreto a dos Caballeros Jedi, guardianes de la paz y la justicia en la galaxia, para resolver el conflicto...", 6 | "title": "Episodio I: La Amenaza Fantasma" 7 | }, 8 | { 9 | "id": "b", 10 | "text": "En el Senado Galáctico reina la inquietud. Varios miles de sistemas solares han declarado su intención de abandonar la República. Este movimiento separatista, liderado por el misterioso Conde Dooku, ha provocado que al limitado número de Caballeros Jedi les resulte difícil mantener la paz y el orden en la galaxia. La senadora Amidala, la antigua reina de Naboo, regresa al Senado Galáctico para dar su voto en la crítica cuestión de crear un EJÉRCITO DE LA REPÚBLICA que ayude a los desbordados Jedi....", 11 | "title": "Episodio II: El Ataque de los Clones" 12 | }, 13 | { 14 | "id": "c", 15 | "text": "¡Guerra! La República se desmorona bajo los ataques del despiadado Lord Sith, el Conde Dooku. Hay héroes en ambos bandos. El mal está por doquier. En una contundente jugada, el diabólico líder droide, el General Grievous, ha irrumpido en la capital de la República y ha secuestrado al Canciller Palpatine, líder del Senado Galáctico. Mientras el ejército droide separatista trata de huir de la capital sitiada con su valioso rehén, dos Caballeros Jedi lideran una misión desesperada para rescatar al Canciller cautivo....", 16 | "title": "Episodio III: El Ataque de los Clones" 17 | }, 18 | { 19 | "id": "d", 20 | "text": "Nos encontramos en un periodo de guerra civil. Las naves espaciales rebeldes, atacando desde una base oculta, han logrado su primera victoria contra el malvado Imperio Galáctico. Durante la batalla, los espías rebeldes han conseguido apoderarse de los planos secretos del arma total y definitiva del Imperio, la ESTRELLA DE LA MUERTE, una estación espacial acorazada, llevando en sí potencia suficiente para destruir a un planeta entero. Perseguida por los siniestros agentes del Imperio, la Princesa Leia vuela hacia su patria, a bordo de su nave espacial, llevando consigo los planos robados, que pueden salvar a su pueblo y devolver la libertad a la galaxia....", 21 | "title": "Episodio IV: Una Nueva Esperanza" 22 | }, 23 | { 24 | "id": "e", 25 | "text": "Son tiempos adversos para la rebelión. Aunque la Estrella de la Muerte ha sido destruida, las tropas imperiales han hecho salir a las fuerzas rebeldes de sus bases ocultas y las han persiguen a través de la galaxia. Tras escapar de la terrible Flota Imperial, un grupo de guerreros de la libertad, encabezados por Luke Skywalker, ha establecido una nueva base secreta en el remoto mundo helado de Hoth. 
El malvado Lord Darth Vader, obsesionado por encontrar al joven skywalker, ha eviado miles de sondas espaciales hacía las infinitas distancias del espacio....", 26 | "title": "Episodio V: El Imperio Contraataca" 27 | }, 28 | { 29 | "id": "f", 30 | "text": "Luke Skywalker ha regresado a Tatooine, su planeta de origen, para intentar rescatar a su amigo Han Solo de las garras del malvado Jabba, el Hutt. Pero Luke ignora que el IMPERIO GALÁCTICO ha comenzado en secreto la construcción de una nueva estación espacial armada, más poderosa que la temida Estrella de la Muerte. Una vez terminada, este arma suprema significará la aniquilación del pequeño grupo de rebeldes que lucha para restaurar la libertad en la galaxia....", 31 | "title": "Episodio VI: El Retorno del Jedi" 32 | }, 33 | { 34 | "id": "g", 35 | "text": "Luke Skywalker ha desaparecido. En su ausencia, la siniestra PRIMERA ORDEN ha surgido de las cenizas del Imperio y no descansará hasta que Skywalker, el último Jedi, haya sido destruido. Con el apoyo de la REPÚBLICA, la General Leia Organa dirige una valiente RESISTENCIA. Desesperadamente busca a su hermano Luke con el fin de obtener su ayuda para restaurar la paz y la justicia en la galaxia. Leia ha enviado a su piloto más audaz en una misión secreta a Jakku, donde un viejo aliado ha descubierto una pista del paradero de Luke....", 36 | "title": "Episodio VII: El Despertar de la Fuerza" 37 | }, 38 | { 39 | "id": "h", 40 | "text": "La PRIMERA ORDEN impera. Luego de destruir a la pacífica República, el Líder Supremo Snoke ahora envía a sus despiadadas legiones a asumir el control militar de la galaxia. Sólo la general Leia Organa y su grupo de combatientes de la RESISTENCIA se oponen a la creciente tiranía, convencidos de que el Maestro Jedi Luke Skywalker regresará y restaurará la chispa de esperanza en la lucha. Pero la Resistencia ha sido expuesta. Mientras la Primera Orden se dirige hacia la base rebelde, los valientes héroes organizan un desesperado escape....", 41 | "title": "Episodio VIII: Los Últimos Jedi" 42 | } 43 | ] 44 | } 45 | -------------------------------------------------------------------------------- /lunr/query.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class QueryPresence(Enum): 5 | """Defines possible behaviours for the term's presence in a document.""" 6 | 7 | OPTIONAL = 1 # default 8 | REQUIRED = 2 9 | PROHIBITED = 3 # documents that contain this term will not be returned 10 | 11 | 12 | class Query: 13 | """A `lunr.Query` provides a programmatic way of defining queries to be 14 | performed against a `lunr.Index`. 15 | 16 | Prefer constructing a `lunr.Query` using `the lunr.Index.query` method 17 | so the query object is pre-initialized with the right index fields. 18 | """ 19 | 20 | # Constants for indicating what kind of automatic wildcard insertion will 21 | # be used when constructing a query clause. 22 | # This allows wildcards to be added to the beginning and end of a term 23 | # without having to manually do any string concatenation. 24 | # The wildcard constants can be bitwise combined to select both leading and 25 | # trailing wildcards. 
26 | WILDCARD = "*" 27 | WILDCARD_NONE = 0 28 | WILDCARD_LEADING = 1 29 | WILDCARD_TRAILING = 2 30 | 31 | def __init__(self, all_fields): 32 | self.clauses = [] 33 | self.all_fields = all_fields 34 | 35 | def __repr__(self): 36 | return ''.format( 37 | ",".join(self.all_fields), ",".join(c.term for c in self.clauses) 38 | ) 39 | 40 | def clause(self, *args, **kwargs): 41 | """Adds a `lunr.Clause` to this query. 42 | 43 | Unless the clause contains the fields to be matched all fields will be 44 | matched. In addition a default boost of 1 is applied to the clause. 45 | 46 | If the first argument is a `lunr.Clause` it will be mutated and added, 47 | otherwise args and kwargs will be used in the constructor. 48 | 49 | Returns: 50 | lunr.Query: The Query itself. 51 | """ 52 | if args and isinstance(args[0], Clause): 53 | clause = args[0] 54 | else: 55 | clause = Clause(*args, **kwargs) 56 | 57 | if not clause.fields: 58 | clause.fields = self.all_fields 59 | 60 | if (clause.wildcard & Query.WILDCARD_LEADING) and ( 61 | clause.term[0] != Query.WILDCARD 62 | ): 63 | clause.term = Query.WILDCARD + clause.term 64 | 65 | if (clause.wildcard & Query.WILDCARD_TRAILING) and ( 66 | clause.term[-1] != Query.WILDCARD 67 | ): 68 | clause.term = clause.term + Query.WILDCARD 69 | 70 | self.clauses.append(clause) 71 | return self 72 | 73 | def term(self, term, **kwargs): 74 | """Adds a term to the current query, creating a Clause and adds it to 75 | the list of clauses making up this Query. 76 | 77 | The term is not tokenized and used "as is". Any conversion to token 78 | or token-like strings should be performed before calling this method. 79 | 80 | For example: 81 | query.term(lunr.Tokenizer("foo bar")) 82 | 83 | Args: 84 | term (Token or iterable): Token or iterable of tokens to add. 85 | kwargs (dict): Additional properties to add to the Clause. 86 | """ 87 | if isinstance(term, (list, tuple)): 88 | for t in term: 89 | self.term(t, **kwargs) 90 | else: 91 | self.clause(str(term), **kwargs) 92 | 93 | return self 94 | 95 | def is_negated(self): 96 | """A negated query is one in which every clause has a presence of 97 | prohibited. These queries require some special processing to return 98 | the expected results. 99 | """ 100 | return all( 101 | clause.presence == QueryPresence.PROHIBITED for clause in self.clauses 102 | ) 103 | 104 | 105 | class Clause: 106 | """A single clause in a `lunr.Query` contains a term and details on 107 | how to match that term against a `lunr.Index` 108 | 109 | Args: 110 | term (str, optional): The term for the clause. 111 | field (iterable, optional): The fields for the term to be searched 112 | against. 113 | edit_distance (int, optional): The character distance to use, defaults 114 | to 0. 115 | use_pipeline (bool, optional): Whether the clause should be pre 116 | processed by the index's pipeline, default to True. 117 | boost (int, optional): Boost to apply to the clause, defaults to 1. 118 | wildcard (Query.WILDCARD_*, optional): Any of the Query.WILDCARD 119 | constants defining if a wildcard is to be used and how, defaults 120 | to Query.WILDCARD_NONE. 121 | presence (QueryPresence, optional): Behaviour for a terms presence 122 | in a document. 
123 | """ 124 | 125 | def __init__( 126 | self, 127 | term=None, 128 | fields=None, 129 | edit_distance=0, 130 | use_pipeline=True, 131 | boost=1, 132 | wildcard=Query.WILDCARD_NONE, 133 | presence=QueryPresence.OPTIONAL, 134 | ): 135 | super().__init__() 136 | self.term = term 137 | self.fields = fields or [] 138 | self.edit_distance = edit_distance 139 | self.use_pipeline = use_pipeline 140 | self.boost = boost 141 | self.wildcard = wildcard 142 | self.presence = presence 143 | 144 | def __repr__(self): 145 | return ''.format(self.term) 146 | -------------------------------------------------------------------------------- /lunr/vector.py: -------------------------------------------------------------------------------- 1 | from math import sqrt 2 | 3 | from lunr.exceptions import BaseLunrException 4 | 5 | 6 | class Vector: 7 | """A vector is used to construct the vector space of documents and queries. 8 | These vectors support operations to determine the similarity between two 9 | documents or a document and a query. 10 | 11 | Normally no parameters are required for initializing a vector, but in the 12 | case of loading a previously dumped vector the raw elements can be provided 13 | to the constructor. 14 | 15 | For performance reasons vectors are implemented with a flat array, where an 16 | elements index is immediately followed by its value. 17 | E.g. [index, value, index, value]. 18 | 19 | TODO: consider implemetation as 2-tuples. 20 | 21 | This allows the underlying array to be as sparse as possible and still 22 | offer decent performance when being used for vector calculations. 23 | """ 24 | 25 | def __init__(self, elements=None): 26 | self._magnitude = 0 27 | self.elements = elements or [] 28 | 29 | def __repr__(self): 30 | return "".format(self.magnitude) 31 | 32 | def __iter__(self): 33 | return iter(self.elements) 34 | 35 | def position_for_index(self, index): 36 | """Calculates the position within the vector to insert a given index. 37 | 38 | This is used internally by insert and upsert. If there are duplicate 39 | indexes then the position is returned as if the value for that index 40 | were to be updated, but it is the callers responsibility to check 41 | whether there is a duplicate at that index 42 | """ 43 | if not self.elements: 44 | return 0 45 | 46 | start = 0 47 | end = int(len(self.elements) / 2) 48 | slice_length = end - start 49 | pivot_point = int(slice_length / 2) 50 | pivot_index = self.elements[pivot_point * 2] 51 | 52 | while slice_length > 1: 53 | if pivot_index < index: 54 | start = pivot_point 55 | elif pivot_index > index: 56 | end = pivot_point 57 | else: 58 | break 59 | 60 | slice_length = end - start 61 | pivot_point = start + int(slice_length / 2) 62 | pivot_index = self.elements[pivot_point * 2] 63 | 64 | if pivot_index == index: 65 | return pivot_point * 2 66 | elif pivot_index > index: 67 | return pivot_point * 2 68 | else: 69 | return (pivot_point + 1) * 2 70 | 71 | def insert(self, insert_index, val): 72 | """Inserts an element at an index within the vector. 73 | 74 | Does not allow duplicates, will throw an error if there is already an 75 | entry for this index. 76 | """ 77 | 78 | def prevent_duplicates(index, val): 79 | raise BaseLunrException("Duplicate index") 80 | 81 | self.upsert(insert_index, val, prevent_duplicates) 82 | 83 | def upsert(self, insert_index, val, fn=None): 84 | """Inserts or updates an existing index within the vector. 85 | 86 | Args: 87 | - insert_index (int): The index at which the element should be 88 | inserted. 
89 | - val (int|float): The value to be inserted into the vector. 90 | - fn (callable, optional): An optional callable taking two 91 | arguments, the current value and the passed value to generate 92 | the final inserted value at the position in case of collision. 93 | """ 94 | fn = fn or (lambda current, passed: passed) 95 | self._magnitude = 0 96 | position = self.position_for_index(insert_index) 97 | if position < len(self.elements) and self.elements[position] == insert_index: 98 | self.elements[position + 1] = fn(self.elements[position + 1], val) 99 | else: 100 | self.elements.insert(position, val) 101 | self.elements.insert(position, insert_index) 102 | 103 | def to_list(self): 104 | """Converts the vector to an array of the elements within the vector""" 105 | output = [] 106 | for i in range(1, len(self.elements), 2): 107 | output.append(self.elements[i]) 108 | return output 109 | 110 | def serialize(self): 111 | # TODO: the JS version forces rounding on the elements upon insertion 112 | # to ensure symmetry upon serialization 113 | return [round(element, 3) for element in self.elements] 114 | 115 | @property 116 | def magnitude(self): 117 | if not self._magnitude: 118 | sum_of_squares = 0 119 | for i in range(1, len(self.elements), 2): 120 | value = self.elements[i] 121 | sum_of_squares += value * value 122 | 123 | self._magnitude = sqrt(sum_of_squares) 124 | 125 | return self._magnitude 126 | 127 | def dot(self, other): 128 | """Calculates the dot product of this vector and another vector.""" 129 | dot_product = 0 130 | a = self.elements 131 | b = other.elements 132 | a_len = len(a) 133 | b_len = len(b) 134 | i = j = 0 135 | 136 | while i < a_len and j < b_len: 137 | a_val = a[i] 138 | b_val = b[j] 139 | if a_val < b_val: 140 | i += 2 141 | elif a_val > b_val: 142 | j += 2 143 | else: 144 | dot_product += a[i + 1] * b[j + 1] 145 | i += 2 146 | j += 2 147 | 148 | return dot_product 149 | 150 | def similarity(self, other): 151 | """Calculates the cosine similarity between this vector and another 152 | vector.""" 153 | if self.magnitude == 0 or other.magnitude == 0: 154 | return 0 155 | 156 | return self.dot(other) / self.magnitude 157 | -------------------------------------------------------------------------------- /tests/acceptance_tests/fixtures/lang_es_en.json: -------------------------------------------------------------------------------- 1 | { 2 | "docs": [ 3 | { 4 | "id": "a", 5 | "text": "La República Galáctica está sumida en el caos. Los impuestos de las rutas comerciales a los sistemas estelares exteriores están en disputa. Esperando resolver el asunto con un bloqueo de poderosas naves de guerra, la codiciosa Federación de Comercio ha detenido todos los envíos al pequeño planeta de Naboo. Mientras el Congreso de la República debate interminablemente esta alarmante cadena de acontecimientos, el Canciller Supremo ha enviado en secreto a dos Caballeros Jedi, guardianes de la paz y la justicia en la galaxia, para resolver el conflicto...", 6 | "title": "Episodio I: La Amenaza Fantasma" 7 | }, 8 | { 9 | "id": "b", 10 | "text": "En el Senado Galáctico reina la inquietud. Varios miles de sistemas solares han declarado su intención de abandonar la República. Este movimiento separatista, liderado por el misterioso Conde Dooku, ha provocado que al limitado número de Caballeros Jedi les resulte difícil mantener la paz y el orden en la galaxia. 
La senadora Amidala, la antigua reina de Naboo, regresa al Senado Galáctico para dar su voto en la crítica cuestión de crear un EJÉRCITO DE LA REPÚBLICA que ayude a los desbordados Jedi....", 11 | "title": "Episodio II: El Ataque de los Clones" 12 | }, 13 | { 14 | "id": "c", 15 | "text": "¡Guerra! La República se desmorona bajo los ataques del despiadado Lord Sith, el Conde Dooku. Hay héroes en ambos bandos. El mal está por doquier. En una contundente jugada, el diabólico líder droide, el General Grievous, ha irrumpido en la capital de la República y ha secuestrado al Canciller Palpatine, líder del Senado Galáctico. Mientras el ejército droide separatista trata de huir de la capital sitiada con su valioso rehén, dos Caballeros Jedi lideran una misión desesperada para rescatar al Canciller cautivo....", 16 | "title": "Episodio III: El Ataque de los Clones" 17 | }, 18 | { 19 | "id": "d", 20 | "text": "Nos encontramos en un periodo de guerra civil. Las naves espaciales rebeldes, atacando desde una base oculta, han logrado su primera victoria contra el malvado Imperio Galáctico. Durante la batalla, los espías rebeldes han conseguido apoderarse de los planos secretos del arma total y definitiva del Imperio, la ESTRELLA DE LA MUERTE, una estación espacial acorazada, llevando en sí potencia suficiente para destruir a un planeta entero. Perseguida por los siniestros agentes del Imperio, la Princesa Leia vuela hacia su patria, a bordo de su nave espacial, llevando consigo los planos robados, que pueden salvar a su pueblo y devolver la libertad a la galaxia....", 21 | "title": "Episodio IV: Una Nueva Esperanza" 22 | }, 23 | { 24 | "id": "e", 25 | "text": "Son tiempos adversos para la rebelión. Aunque la Estrella de la Muerte ha sido destruida, las tropas imperiales han hecho salir a las fuerzas rebeldes de sus bases ocultas y las han persiguen a través de la galaxia. Tras escapar de la terrible Flota Imperial, un grupo de guerreros de la libertad, encabezados por Luke Skywalker, ha establecido una nueva base secreta en el remoto mundo helado de Hoth. El malvado Lord Darth Vader, obsesionado por encontrar al joven skywalker, ha eviado miles de sondas espaciales hacía las infinitas distancias del espacio....", 26 | "title": "Episodio V: El Imperio Contraataca" 27 | }, 28 | { 29 | "id": "f", 30 | "text": "Luke Skywalker ha regresado a Tatooine, su planeta de origen, para intentar rescatar a su amigo Han Solo de las garras del malvado Jabba, el Hutt. Pero Luke ignora que el IMPERIO GALÁCTICO ha comenzado en secreto la construcción de una nueva estación espacial armada, más poderosa que la temida Estrella de la Muerte. Una vez terminada, este arma suprema significará la aniquilación del pequeño grupo de rebeldes que lucha para restaurar la libertad en la galaxia....", 31 | "title": "Episodio VI: El Retorno del Jedi" 32 | }, 33 | { 34 | "id": "g", 35 | "text": "Luke Skywalker ha desaparecido. En su ausencia, la siniestra PRIMERA ORDEN ha surgido de las cenizas del Imperio y no descansará hasta que Skywalker, el último Jedi, haya sido destruido. Con el apoyo de la REPÚBLICA, la General Leia Organa dirige una valiente RESISTENCIA. Desesperadamente busca a su hermano Luke con el fin de obtener su ayuda para restaurar la paz y la justicia en la galaxia. 
Leia ha enviado a su piloto más audaz en una misión secreta a Jakku, donde un viejo aliado ha descubierto una pista del paradero de Luke....", 36 | "title": "Episodio VII: El Despertar de la Fuerza" 37 | }, 38 | { 39 | "id": "h", 40 | "text": "La PRIMERA ORDEN impera. Luego de destruir a la pacífica República, el Líder Supremo Snoke ahora envía a sus despiadadas legiones a asumir el control militar de la galaxia. Sólo la general Leia Organa y su grupo de combatientes de la RESISTENCIA se oponen a la creciente tiranía, convencidos de que el Maestro Jedi Luke Skywalker regresará y restaurará la chispa de esperanza en la lucha. Pero la Resistencia ha sido expuesta. Mientras la Primera Orden se dirige hacia la base rebelde, los valientes héroes organizan un desesperado escape....", 41 | "title": "Episodio VIII: Los Últimos Jedi" 42 | }, 43 | { 44 | "id": "i", 45 | "text": "Turmoil has engulfed the Galactic Republic. The taxation of trade routes to outlying star systems is in dispute. Hoping to resolve the matter with a blockade of deadly battleships, the greedy Trade Federation has stopped all shipping to the small planet of Naboo. While the Congress of the Republic endlessly debates this alarming chain of events, the Supreme Chancellor has secretly dispatched two Jedi Knights, the guardians of peace and justice in the galaxy, to settle the conflict.... ", 46 | "title": "Episode I: The Phantom Menace" 47 | } 48 | ] 49 | } 50 | -------------------------------------------------------------------------------- /lunr/query_parser.py: -------------------------------------------------------------------------------- 1 | from lunr.query_lexer import QueryLexer 2 | from lunr.query import Clause, QueryPresence 3 | from lunr.exceptions import QueryParseError 4 | 5 | 6 | class QueryParser: 7 | def __init__(self, string, query): 8 | self.lexer = QueryLexer(string) 9 | self.query = query 10 | self.current_clause = Clause() 11 | self.lexeme_idx = 0 12 | 13 | def parse(self): 14 | self.lexer.run() 15 | self.lexemes = self.lexer.lexemes 16 | 17 | state = self.__class__.parse_clause 18 | 19 | while state: 20 | state = state(self) 21 | 22 | return self.query 23 | 24 | def peek_lexeme(self): 25 | try: 26 | return self.lexemes[self.lexeme_idx] 27 | except IndexError: 28 | return None 29 | 30 | def consume_lexeme(self): 31 | lexeme = self.peek_lexeme() 32 | self.lexeme_idx += 1 33 | return lexeme 34 | 35 | def next_clause(self): 36 | self.query.clause(self.current_clause) 37 | self.current_clause = Clause() 38 | 39 | @classmethod 40 | def parse_clause(cls, parser): 41 | lexeme = parser.peek_lexeme() 42 | if lexeme is None: 43 | return 44 | 45 | if lexeme["type"] == QueryLexer.FIELD: 46 | return cls.parse_field 47 | elif lexeme["type"] == QueryLexer.TERM: 48 | return cls.parse_term 49 | elif lexeme["type"] == QueryLexer.PRESENCE: 50 | return cls.parse_presence 51 | else: 52 | raise QueryParseError( 53 | "Expected either a field or a term, found {}{}".format( 54 | lexeme["type"], 55 | 'with value "' + lexeme["string"] + '"' 56 | if len(lexeme["string"]) 57 | else "", 58 | ) 59 | ) 60 | 61 | @classmethod 62 | def parse_field(cls, parser): 63 | lexeme = parser.consume_lexeme() 64 | 65 | if lexeme["string"] not in parser.query.all_fields: 66 | raise QueryParseError( 67 | 'Unrecognized field "{}", possible fields {}'.format( 68 | lexeme["string"], ", ".join(parser.query.all_fields) 69 | ) 70 | ) 71 | 72 | parser.current_clause.fields = [lexeme["string"]] 73 | 74 | next_lexeme = parser.peek_lexeme() 75 | if next_lexeme is 
None: 76 | raise QueryParseError("Expected term, found nothing") 77 | 78 | if next_lexeme["type"] == QueryLexer.TERM: 79 | return cls.parse_term 80 | else: 81 | raise QueryParseError("Expected term, found {}".format(next_lexeme["type"])) 82 | 83 | @classmethod 84 | def parse_term(cls, parser): 85 | lexeme = parser.consume_lexeme() 86 | 87 | parser.current_clause.term = lexeme["string"].lower() 88 | if "*" in lexeme["string"]: 89 | parser.current_clause.use_pipeline = False 90 | 91 | return cls._peek_next_lexeme(parser) 92 | 93 | @classmethod 94 | def parse_presence(cls, parser): 95 | lexeme = parser.consume_lexeme() 96 | 97 | if lexeme is None: 98 | return 99 | 100 | if lexeme["string"] == "-": 101 | parser.current_clause.presence = QueryPresence.PROHIBITED 102 | elif lexeme["string"] == "+": 103 | parser.current_clause.presence = QueryPresence.REQUIRED 104 | else: 105 | raise QueryParseError( 106 | "Unrecognized parser operator: {}, expected `+` or `-`".format( 107 | lexeme.str 108 | ) 109 | ) 110 | 111 | next_lexeme = parser.peek_lexeme() 112 | if next_lexeme is None: 113 | raise QueryParseError("Expected either a field or a term, found nothing") 114 | 115 | if next_lexeme["type"] == QueryLexer.FIELD: 116 | return cls.parse_field 117 | elif next_lexeme["type"] == QueryLexer.TERM: 118 | return cls.parse_term 119 | else: 120 | raise QueryParseError( 121 | "Expected either a field or a term, found {}".format(lexeme["type"]) 122 | ) 123 | 124 | @classmethod 125 | def parse_edit_distance(cls, parser): 126 | lexeme = parser.consume_lexeme() 127 | 128 | try: 129 | edit_distance = int(lexeme["string"]) 130 | except ValueError as e: 131 | raise QueryParseError("Edit distance must be numeric") from e 132 | 133 | parser.current_clause.edit_distance = edit_distance 134 | 135 | return cls._peek_next_lexeme(parser) 136 | 137 | @classmethod 138 | def parse_boost(cls, parser): 139 | lexeme = parser.consume_lexeme() 140 | 141 | try: 142 | boost = int(lexeme["string"]) 143 | except ValueError as e: 144 | raise QueryParseError("Boost must be numeric") from e 145 | 146 | parser.current_clause.boost = boost 147 | 148 | return cls._peek_next_lexeme(parser) 149 | 150 | @classmethod 151 | def _peek_next_lexeme(cls, parser): 152 | next_lexeme = parser.peek_lexeme() 153 | if next_lexeme is None: 154 | parser.next_clause() 155 | return 156 | 157 | if next_lexeme["type"] == QueryLexer.TERM: 158 | parser.next_clause() 159 | return cls.parse_term 160 | elif next_lexeme["type"] == QueryLexer.FIELD: 161 | parser.next_clause() 162 | return cls.parse_field 163 | elif next_lexeme["type"] == QueryLexer.EDIT_DISTANCE: 164 | return cls.parse_edit_distance 165 | elif next_lexeme["type"] == QueryLexer.BOOST: 166 | return cls.parse_boost 167 | elif next_lexeme["type"] == QueryLexer.PRESENCE: 168 | parser.next_clause() 169 | return cls.parse_presence 170 | else: 171 | raise QueryParseError( 172 | "Unexpected lexeme type {}".format(next_lexeme["type"]) 173 | ) 174 | -------------------------------------------------------------------------------- /lunr/pipeline.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import logging 3 | from typing import Callable, Dict, List, Set 4 | 5 | from lunr.exceptions import BaseLunrException 6 | from lunr.token import Token 7 | 8 | log = logging.getLogger(__name__) 9 | 10 | 11 | class Pipeline: 12 | """lunr.Pipelines maintain a list of functions to be applied to all tokens 13 | in documents entering the search index 
and queries ran agains the index. 14 | 15 | """ 16 | 17 | registered_functions: Dict[str, Callable] = {} 18 | 19 | def __init__(self): 20 | self._stack: List[Callable] = [] 21 | self._skip: Dict[Callable, Set[str]] = defaultdict(set) 22 | 23 | def __len__(self): 24 | return len(self._stack) 25 | 26 | def __repr__(self): 27 | return ''.format(",".join(fn.label for fn in self._stack)) 28 | 29 | # TODO: add iterator methods? 30 | 31 | @classmethod 32 | def register_function(cls, fn, label=None): 33 | """Register a function with the pipeline.""" 34 | label = label or fn.__name__ 35 | if label in cls.registered_functions: 36 | log.warning("Overwriting existing registered function %s", label) 37 | 38 | fn.label = label 39 | cls.registered_functions[fn.label] = fn 40 | 41 | @classmethod 42 | def load(cls, serialised): 43 | """Loads a previously serialised pipeline.""" 44 | pipeline = cls() 45 | for fn_name in serialised: 46 | try: 47 | fn = cls.registered_functions[fn_name] 48 | except KeyError: 49 | raise BaseLunrException( 50 | "Cannot load unregistered function {}".format(fn_name) 51 | ) 52 | else: 53 | pipeline.add(fn) 54 | 55 | return pipeline 56 | 57 | def add(self, *args): 58 | """Adds new functions to the end of the pipeline. 59 | 60 | Functions must accept three arguments: 61 | - Token: A lunr.Token object which will be updated 62 | - i: The index of the token in the set 63 | - tokens: A list of tokens representing the set 64 | """ 65 | for fn in args: 66 | self.warn_if_function_not_registered(fn) 67 | self._stack.append(fn) 68 | 69 | def warn_if_function_not_registered(self, fn): 70 | try: 71 | return fn.label in self.registered_functions 72 | except AttributeError: 73 | log.warning( 74 | 'Function "{}" is not registered with pipeline. ' 75 | "This may cause problems when serialising the index.".format( 76 | getattr(fn, "label", fn) 77 | ) 78 | ) 79 | 80 | def after(self, existing_fn, new_fn): 81 | """Adds a single function after a function that already exists in the 82 | pipeline.""" 83 | self.warn_if_function_not_registered(new_fn) 84 | try: 85 | index = self._stack.index(existing_fn) 86 | self._stack.insert(index + 1, new_fn) 87 | except ValueError as e: 88 | raise BaseLunrException("Cannot find existing_fn") from e 89 | 90 | def before(self, existing_fn, new_fn): 91 | """Adds a single function before a function that already exists in the 92 | pipeline. 93 | 94 | """ 95 | self.warn_if_function_not_registered(new_fn) 96 | try: 97 | index = self._stack.index(existing_fn) 98 | self._stack.insert(index, new_fn) 99 | except ValueError as e: 100 | raise BaseLunrException("Cannot find existing_fn") from e 101 | 102 | def remove(self, fn): 103 | """Removes a function from the pipeline.""" 104 | try: 105 | self._stack.remove(fn) 106 | except ValueError: 107 | pass 108 | 109 | def skip(self, fn: Callable, field_names: List[str]): 110 | """ 111 | Make the pipeline skip the function based on field name we're processing. 112 | 113 | This relies on passing the field name to Pipeline.run(). 114 | """ 115 | self._skip[fn].update(field_names) 116 | 117 | def run(self, tokens, field_name=None): 118 | """ 119 | Runs the current list of functions that make up the pipeline against 120 | the passed tokens. 121 | 122 | :param tokens: The tokens to process. 123 | :param field_name: The name of the field these tokens belongs to, can be ommited. 124 | Used to skip some functions based on field names. 125 | """ 126 | for fn in self._stack: 127 | # Skip the function based on field name. 
128 | if field_name and field_name in self._skip[fn]: 129 | continue 130 | results = [] 131 | for i, token in enumerate(tokens): 132 | # JS ignores additional arguments to the functions but we 133 | # force pipeline functions to declare (token, i, tokens) 134 | # or *args 135 | result = fn(token, i, tokens) 136 | if not result: 137 | continue 138 | if isinstance(result, (list, tuple)): # simulate Array.concat 139 | results.extend(result) 140 | else: 141 | results.append(result) 142 | tokens = results 143 | 144 | return tokens 145 | 146 | def run_string(self, string, metadata=None): 147 | """Convenience method for passing a string through a pipeline and 148 | getting strings out. This method takes care of wrapping the passed 149 | string in a token and mapping the resulting tokens back to strings. 150 | 151 | .. note:: This ignores the skipped functions since we can't 152 | access field names from this context. 153 | """ 154 | token = Token(string, metadata) 155 | return [str(tkn) for tkn in self.run([token])] 156 | 157 | def reset(self): 158 | self._stack = [] 159 | 160 | def serialize(self): 161 | return [fn.label for fn in self._stack] 162 | -------------------------------------------------------------------------------- /tests/test_builder.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from lunr.builder import Builder 4 | from lunr.token_set import TokenSet 5 | from lunr.index import Index 6 | from lunr.vector import Vector 7 | 8 | 9 | def _assert_deep_keys(dict_, keys): 10 | d = dict_ 11 | for key in keys.split("."): 12 | d_keys_as_str = [str(k) for k in d] 13 | assert key in d_keys_as_str 14 | d = d[key] 15 | 16 | 17 | class TestBuilderBuild: 18 | def setup_method(self, method): 19 | self.builder = Builder() 20 | doc = {"id": "id", "title": "test", "body": "missing"} 21 | 22 | self.builder.ref("id") 23 | self.builder.field("title") 24 | self.builder.add(doc) 25 | self.index = self.builder.build() 26 | 27 | def test_adds_tokens_to_inverted_index(self): 28 | _assert_deep_keys(self.builder.inverted_index, "test.title.id") 29 | 30 | def test_builds_vector_space_of_the_document_fields(self): 31 | assert "title/id" in self.builder.field_vectors 32 | assert isinstance(self.builder.field_vectors["title/id"], Vector) 33 | 34 | def test_skips_fields_not_defined_for_indexing(self): 35 | assert "missing" not in self.builder.inverted_index 36 | 37 | def test_builds_a_token_set_for_the_corpus(self): 38 | needle = TokenSet.from_string("test") 39 | assert "test" in self.builder.token_set.intersect(needle).to_list() 40 | 41 | def test_calculates_document_count(self): 42 | assert self.builder.average_field_length["title"] == 1 43 | 44 | def test_index_is_returned(self): 45 | assert isinstance(self.index, Index) 46 | 47 | 48 | class TestBuilderAdd: 49 | def test_builder_casts_docrefs_to_strings(self): 50 | self.builder = Builder() 51 | self.builder.ref("id") 52 | self.builder.field("title") 53 | 54 | self.builder.add(dict(id=123, title="test", body="missing")) 55 | 56 | _assert_deep_keys(self.builder.inverted_index, "test.title.123") 57 | 58 | def test_builder_metadata_whitelist_includes_metadata_in_index(self): 59 | self.builder = Builder() 60 | self.builder.ref("id") 61 | self.builder.field("title") 62 | self.builder.metadata_whitelist = ["position"] 63 | 64 | self.builder.add(dict(id="a", title="test", body="missing")) 65 | self.builder.add(dict(id="b", title="another test", body="missing")) 66 | 67 | assert 
self.builder.inverted_index["test"]["title"]["a"] == { 68 | "position": [[0, 4]] 69 | } 70 | assert self.builder.inverted_index["test"]["title"]["b"] == { 71 | "position": [[8, 4]] 72 | } 73 | 74 | def test_builder_field_raises_if_contains_slash(self): 75 | self.builder = Builder() 76 | 77 | with pytest.raises(ValueError): 78 | self.builder.field("foo/bar") 79 | 80 | def test_builder_extracts_nested_properties_from_document(self): 81 | self.builder = Builder() 82 | self.builder.field("name", extractor=lambda d: d["person"]["name"]) 83 | 84 | self.builder.add({"id": "id", "person": {"name": "bob"}}) 85 | 86 | assert self.builder.inverted_index["bob"]["name"]["id"] == {} 87 | 88 | def test_builder_field_term_frequency_and_length(self): 89 | self.builder = Builder() 90 | self.builder.ref("id") 91 | self.builder.field("title") 92 | 93 | self.builder.add(dict(id="a", title="test a testing test", body="missing")) 94 | 95 | assert self.builder.field_term_frequencies == { 96 | "title/a": {"test": 2, "a": 1, "testing": 1} 97 | } 98 | assert self.builder.field_lengths == {"title/a": 4} 99 | 100 | 101 | class TestBuilderUse: 102 | def setup_method(self, method): 103 | self.builder = Builder() 104 | 105 | def test_calls_plugin_function(self): 106 | def plugin(*args): 107 | assert True 108 | 109 | self.builder.use(plugin) 110 | 111 | def test_plugin_is_called_with_builder_as_first_argument(self): 112 | def plugin(builder): 113 | assert builder is self.builder 114 | 115 | self.builder.use(plugin) 116 | 117 | def test_forwards_arguments_to_the_plugin(self): 118 | def plugin(builder, *args, **kwargs): 119 | assert args == (1, 2, 3) 120 | assert kwargs == {"foo": "bar"} 121 | 122 | self.builder.use(plugin, 1, 2, 3, foo="bar") 123 | 124 | 125 | class TestBuilderK1: 126 | def test_k1_default_value(self): 127 | builder = Builder() 128 | assert builder._k1 == 1.2 129 | 130 | def test_k1_can_be_set(self): 131 | builder = Builder() 132 | builder.k1(1.6) 133 | assert builder._k1 == 1.6 134 | 135 | 136 | class TestBuilderB: 137 | def test_b_default_value(self): 138 | builder = Builder() 139 | assert builder._b == 0.75 140 | 141 | def test_b_within_range(self): 142 | builder = Builder() 143 | builder.b(0.5) 144 | assert builder._b == 0.5 145 | 146 | def test_b_less_than_zero(self): 147 | builder = Builder() 148 | builder.b(-1) 149 | assert builder._b == 0 150 | 151 | def test_b_higher_than_one(self): 152 | builder = Builder() 153 | builder.b(1.5) 154 | assert builder._b == 1 155 | 156 | 157 | class TestBuilerRef: 158 | def test_default_reference(self): 159 | builder = Builder() 160 | assert builder._ref == "id" 161 | 162 | def test_defining_a_reference_field(self): 163 | builder = Builder() 164 | builder.ref("foo") 165 | assert builder._ref == "foo" 166 | 167 | 168 | class TestBuilderField: 169 | def test_define_fields_to_index(self): 170 | builder = Builder() 171 | builder.field("foo") 172 | assert len(builder._fields) == 1 173 | assert builder._fields["foo"].name == "foo" 174 | assert builder._fields["foo"].boost == 1 175 | assert builder._fields["foo"].extractor is None 176 | assert repr(builder._fields["foo"]) == '' 177 | assert hash(builder._fields["foo"]) == hash("foo") 178 | -------------------------------------------------------------------------------- /docs/usage.md: -------------------------------------------------------------------------------- 1 | # Quick start 2 | 3 | First, you'll need a list of dicts representing the documents you want to search on. 
These documents must have a unique field which will serve as a reference and a series of fields you'd like to search on. 4 | 5 | ```python 6 | >>> from lunr import lunr 7 | >>> 8 | >>> documents = [{ 9 | ...: 'id': 'a', 10 | ...: 'title': 'Mr. Green kills Colonel Mustard', 11 | ...: 'body': """Mr. Green killed Colonel Mustard in the study with the 12 | ...: candlestick. Mr. Green is not a very nice fellow.""" 13 | ...: }, { 14 | ...: 'id': 'b', 15 | ...: 'title': 'Plumb waters plant', 16 | ...: 'body': 'Professor Plumb has a green and a yellow plant in his study', 17 | ...: }, { 18 | ...: 'id': 'c', 19 | ...: 'title': 'Scarlett helps Professor', 20 | ...: 'body': """Miss Scarlett watered Professor Plumbs green plant 21 | ...: while he was away on his murdering holiday.""", 22 | ...: }] 23 | ``` 24 | 25 | Lunr provides a convenience `lunr` function to quickly index this set of documents: 26 | 27 | ```python 28 | >>> idx = lunr( 29 | ... ref='id', fields=('title', 'body'), documents=documents 30 | ... ) 31 | ``` 32 | 33 | For basic no-fuss searches just use the `search` method on the index: 34 | 35 | ```python 36 | >>> idx.search('kill') 37 | [{'ref': 'a', 'score': 0.6931722372559913, 'match_data': }] 38 | >>> idx.search('study') 39 | [{'ref': 'b', 'score': 0.23576799568081389, 'match_data': }, 40 | {'ref': 'a', 'score': 0.2236629211724517, 'match_data': }] 41 | ``` 42 | 43 | ## Using query strings 44 | 45 | The query string passed to `search` accepts multiple terms: 46 | 47 | ```python 48 | >>> idx.search('green plant') 49 | [{'ref': 'b', 'score': 0.5023294192217546, 'match_data': }, 50 | {'ref': 'a', 'score': 0.12544083739725947, 'match_data': }, 51 | {'ref': 'c', 'score': 0.07306110905506158, 'match_data': }] 52 | ``` 53 | 54 | The index will search for `green` OR `plant`; a few things to note about the results: 55 | 56 | - document `b` scores highest because `plant` appears in both fields and `green` appears in the body 57 | - document `a` is second; it includes only `green`, but in the title and twice in the body 58 | - document `c` includes both terms but only in one of the fields 59 | 60 | Query strings support a variety of modifiers: 61 | 62 | ### Wildcards 63 | 64 | You can use `*` as a wildcard anywhere in your query string: 65 | 66 | ```python 67 | >>> idx.search('pl*') 68 | [{'ref': 'b', 'score': 0.725901569004226, 'match_data': }, 69 | {'ref': 'c', 'score': 0.0816178155209697, 'match_data': }] 70 | >>> idx.search('*llow') 71 | [{'ref': 'b', 'score': 0.6210112024848421, 'match_data': }, 72 | {'ref': 'a', 'score': 0.30426104537491444, 'match_data': }] 73 | ``` 74 | 75 | Note that, when using wildcards, no stemming is performed on the search terms. 76 | 77 | ### Fields 78 | 79 | Prefixing a search term with a field name followed by `:` allows you to specify which field that particular term should be searched in: 80 | 81 | ```python 82 | >>> idx.search('title:green title:plant') 83 | [{'ref': 'b', 'score': 0.18604713274256787, 'match_data': }, 84 | {'ref': 'a', 'score': 0.07902963505882092, 'match_data': }] 85 | ``` 86 | 87 | Note the difference from the example above: document `c` is no longer in the results. 88 | 89 | Specifying an unindexed field will raise an exception: 90 | 91 | ```python 92 | >>> idx.search('foo:green') 93 | Traceback (most recent call last): 94 | ...
95 | lunr.exceptions.QueryParseError: Unrecognized field "foo", possible fields title, body 96 | ``` 97 | 98 | You can combine this with wildcards: 99 | 100 | ```python 101 | >>> idx.search('body:mu*') 102 | [{'ref': 'c', 'score': 0.3072276611029057, 'match_data': }, 103 | {'ref': 'a', 'score': 0.14581429988419872, 'match_data': }] 104 | ``` 105 | 106 | ### Boosts 107 | 108 | When searching for several terms you can use boosting to give more importance to some of the terms: 109 | 110 | ```python 111 | >>> idx.search('green plant^10') 112 | [{'ref': 'b', 'score': 0.831629678987025, 'match_data': }, 113 | {'ref': 'c', 'score': 0.06360184858161157, 'match_data': }, 114 | {'ref': 'a', 'score': 0.01756105367777591, 'match_data': }] 115 | ``` 116 | 117 | Note how document `c` now scores higher because of the boosting on the term `plant`. The `10` represents a multiplier on the relative score for the term and must be a positive integer. 118 | 119 | ### Fuzzy matches 120 | 121 | You can also use fuzzy matching for terms that are likely to be misspelled: 122 | 123 | ```python 124 | >>> idx.search('yellow~1') 125 | [{'ref': 'b', 'score': 0.621155860224936, 'match_data': }, 126 | {'ref': 'a', 'score': 0.3040972809936496, 'match_data': }] 127 | ``` 128 | 129 | The positive integer after `~` represents the edit distance, in this case 1 character, either by addition, removal or transposition. 130 | 131 | ### Term presence (new in 0.3.0) 132 | 133 | As mentioned above, Lunr defaults to searching for logical OR on terms, but it is possible to specify the presence of each term in matching documents. The default OR behaviour is represented by the term's presence being *optional* in a matching document. To specify that a term must be present in a matching document, prefix the term with `+`; to specify that a term must *not* be included in a matching document, prefix the term with `-`. 134 | 135 | The example below searches for documents that must contain "green", might contain "plant" and must not contain "study": 136 | 137 | ```python 138 | >>> idx.search("+green plant -study") 139 | [{'ref': 'c', 140 | 'score': 0.08090317236904906, 141 | 'match_data': }] 142 | ``` 143 | 144 | Contrast this with the default behaviour: 145 | 146 | ```python 147 | >>> idx.search('green plant study') 148 | [{'ref': 'b', 149 | 'score': 0.5178296383103647, 150 | 'match_data': }, 151 | {'ref': 'a', 152 | 'score': 0.22147889214939157, 153 | 'match_data': }, 154 | {'ref': 'c', 155 | 'score': 0.06605716362553504, 156 | 'match_data': }] 157 | ``` 158 | 159 | To simulate a logical AND search of "yellow AND plant" mark both terms as required: 160 | 161 | ```python 162 | >>> idx.search('+yellow +plant') 163 | [{'ref': 'b', 164 | 'score': 0.8915374700737615, 165 | 'match_data': }] 166 | ``` 167 | 168 | As opposed to the default: 169 | 170 | ```python 171 | >>> idx.search('yellow plant') 172 | [{'ref': 'b', 173 | 'score': 0.8915374700737615, 174 | 'match_data': }, 175 | {'ref': 'c', 176 | 'score': 0.045333674172311975, 177 | 'match_data': }] 178 | ``` 179 | 180 | Note that presence can also be combined with any of the other modifiers described above.
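For instance, a single query string can require a term in one field while prohibiting a wildcard term in another. A quick sketch against the example index above (assuming the same corpus; exact scores elided):

```python
>>> # documents must contain "plant" in the title and must not contain
>>> # any term starting with "mu" in the body, which should leave only "b"
>>> idx.search('+title:plant -body:mu*')
[{'ref': 'b', 'score': ..., 'match_data': ...}]
```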
181 | -------------------------------------------------------------------------------- /docs/indices.md: -------------------------------------------------------------------------------- 1 | # Building indices 2 | 3 | We briefly skimmed over creating indices in Lunr in the [searching](./usage.md) section, let's go into more detail around what we need to build a Lunr index. 4 | 5 | ## The `lunr` function 6 | 7 | The main entry point to Lunr is the `lunr` function. It provides a simple way to create an index, define fields we're interested in and start indexing a corpus of documents. 8 | 9 | We do that simply by providing: 10 | 11 | - A `ref` string specifying the field in the documents that should be used as a key for each document. 12 | - A `fields` list, which defines the fields in the documents that should be added to the index. 13 | - A `documents` list, including a set of dictionaries representing the documents we want to index. 14 | 15 | And that's it. The `lunr` function will create an index, configure it, add the documents and return the `lunr.Index` for you to start searching. 16 | 17 | ## Build time boosts 18 | 19 | > New in version 0.4.0 20 | 21 | Lunr also provides some very useful functionality for boosting at index building time. There are two types of boosts you can include: field boosts and document boosts. 22 | 23 | ### Field boosts 24 | 25 | Field boosts let Lunr know that, when searching, we care more about some fields than others, a typical example is adding a boost on the *title* of our documents so when searching for a term, if it is found in the title, the document will score higher. 26 | 27 | To include a field boost we use the `fields` argument of the `lunr` function, instead of passing a list of strings as usual, we pass a list of dictionaries with two keys: 28 | 29 | - `field_name` whose value will be the name of the field in the documents we want to index. 30 | - `boost` an integer to be multiplied to the score when a match is found on this field. 31 | 32 | For example: 33 | 34 | ```python 35 | >>> from lunr import lunr 36 | >>> documents = [{ 37 | ...: 'id': 'a', 38 | ...: 'title': 'Mr. Green kills Colonel Mustard', 39 | ...: 'body': """Mr. Green killed Colonel Mustard in the study with the 40 | ...: candlestick. Mr. Green is not a very nice fellow.""" 41 | ...: }, { 42 | ...: 'id': 'b', 43 | ...: 'title': 'Plumb waters plant', 44 | ...: 'body': 'Professor Plumb has a green and a yellow plant in his study', 45 | ...: }, { 46 | ...: 'id': 'c', 47 | ...: 'title': 'Scarlett helps Professor', 48 | ...: 'body': """Miss Scarlett watered Professor Plumbs green plant 49 | ...: while he was away on his murdering holiday.""", 50 | ...: }] 51 | >>> idx = lunr( 52 | ...: ref='id', 53 | ...: fields=[dict(field_name='title', boost=10), 'body'], 54 | ...: documents=documents 55 | ...: ) 56 | ``` 57 | 58 | Note how we're passing a dictionary only for `title`, `body` will have a neutral value for `boost`. 59 | 60 | 61 | ```python 62 | >>> idx.search('plumb') 63 | [{'match_data': , 'ref': 'b', 'score': 1.599}, 64 | {'match_data': , 'ref': 'c', 'score': 0.13}] 65 | ``` 66 | 67 | Note how the score for document `b` is much higher thanks to our field boost. 68 | 69 | ### Document boosts 70 | 71 | Document boosts let Lunr know that some documents are more important than others, for example we would like an FAQ page to show up higher in searches. 
72 | 73 | In Lunr we do this via the `documents` argument to the `lunr` function: instead of passing a list of dictionaries, we pass a 2-tuple (or list) with the document dictionary as the first item and another dictionary as the second item. This second dictionary must have a single `boost` key with an integer to be applied to any matches on this particular document. 74 | 75 | ```python 76 | documents = [ 77 | { 78 | 'id': 'a', 79 | 'title': 'Mr. Green kills Colonel Mustard', 80 | 'body': """Mr. Green killed Colonel Mustard in the study with the 81 | candlestick. Mr. Green is not a very nice fellow.""" 82 | }, { 83 | 'id': 'b', 84 | 'title': 'Plumb waters plant', 85 | 'body': 'Professor Plumb has a green and a yellow plant in his study', 86 | }, ( 87 | { 88 | 'id': 'c', 89 | 'title': 'Scarlett helps Professor', 90 | 'body': """Miss Scarlett watered Professor Plumbs green plant 91 | while he was away on his murdering holiday.""", 92 | }, { 93 | 'boost': 10 94 | } 95 | )] 96 | ``` 97 | 98 | Note how the third member of the list is a tuple; now if we pass these documents to the `lunr` function and perform a search: 99 | 100 | ```python 101 | >>> idx = lunr(ref='id', fields=('title', 'body'), documents=documents) 102 | >>> idx.search('plumb') 103 | [{'match_data': , 'ref': 'c', 'score': 1.297}, 104 | {'match_data': , 'ref': 'b', 'score': 0.3}] 105 | ``` 106 | 107 | The score for `c` is now higher than `b` even though there are fewer matches, thanks to our document boost. 108 | 109 | ## Field extractors 110 | 111 | Up until now we've been working with fairly simple documents, but what if you have large nested documents and only want to index parts of them? 112 | 113 | For this Lunr provides *field extractors*, which are simply callables that Lunr can use to fetch the field in the document you want to index. If you do not provide one, as we've been doing, Lunr assumes there's a key matching the field name, e.g. `title` or `body`. 114 | 115 | To pass a field extractor to Lunr we, once again, use the `fields` argument to the `lunr` function. Similarly to what we did to define field boosts, we pass a list of dictionaries, but this time we add an `extractor` key whose value is a callable with a single argument, the document being processed. Lunr will call the extractor when fetching the indexed field and will use its result in our index. 116 | 117 | Imagine our documents have a slightly different form where the reference is at the top level but our fields are nested under a `content` key: 118 | 119 | ```python 120 | documents = [{ 121 | 'id': 'a', 122 | 'content': { 123 | 'title': 'Mr. Green kills Colonel Mustard', 124 | 'body': """Mr. Green killed Colonel Mustard in the study with the 125 | candlestick. Mr. Green is not a very nice fellow.""" 126 | } 127 | }, { 128 | 'id': 'b', 129 | 'content': { 130 | 'title': 'Plumb waters plant', 131 | 'body': 'Professor Plumb has a green and a yellow plant in his study', 132 | } 133 | }, { 134 | 'id': 'c', 135 | 'content': { 136 | 'title': 'Scarlett helps Professor', 137 | 'body': """Miss Scarlett watered Professor Plumbs green plant 138 | while he was away on his murdering holiday.""", 139 | } 140 | }] 141 | ``` 142 | 143 | To work around this we simply need to add field extractors, callables that take a document as an argument and return the content of the field; in this case a simple `lambda` will do: 144 | 145 | ```python 146 | >>> idx = lunr( 147 | ... ref='id', 148 | ... fields=[ 149 | ...
dict(field_name='title', extractor=lambda d: d['content']['title']), 150 | ... dict(field_name='body', extractor=lambda d: d['content']['body']) 151 | ... ], 152 | ... documents=documents) 153 | ``` 154 | 155 | We can now search the index as usual: 156 | 157 | ```python 158 | >>> idx.search('plumb') 159 | [{'ref': 'b', 'score': 0.3, 'match_data': } 160 | {'ref': 'c', 'score': 0.13, 'match_data': }] 161 | ``` 162 | -------------------------------------------------------------------------------- /tests/test_query_parser.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from lunr.query import Query, QueryPresence 4 | from lunr.query_parser import QueryParser 5 | from lunr.exceptions import QueryParseError 6 | 7 | 8 | def parse(q): 9 | query = Query(["title", "body"]) 10 | parser = QueryParser(q, query) 11 | 12 | parser.parse() 13 | return query.clauses 14 | 15 | 16 | class TestQueryParser: 17 | def test_parse_empty_string(self): 18 | clauses = parse("") 19 | assert len(clauses) == 0 20 | 21 | def test_parse_single_term(self): 22 | clauses = parse("foo") 23 | assert len(clauses) == 1 24 | clause = clauses[0] 25 | assert clause.term == "foo" 26 | assert clause.fields == ["title", "body"] 27 | assert clause.use_pipeline is True 28 | assert clause.presence is QueryPresence.OPTIONAL 29 | 30 | def test_parse_single_term_uppercase(self): 31 | clauses = parse("FOO") 32 | assert len(clauses) == 1 33 | clause = clauses[0] 34 | assert clause.term == "foo" 35 | assert clause.fields == ["title", "body"] 36 | assert clause.use_pipeline is True 37 | 38 | def test_parse_single_term_with_wildcard(self): 39 | clauses = parse("fo*") 40 | assert len(clauses) == 1 41 | clause = clauses[0] 42 | assert clause.term == "fo*" 43 | assert clause.use_pipeline is False 44 | 45 | def test_multiple_terms(self): 46 | clauses = parse("foo bar") 47 | assert len(clauses) == 2 48 | assert clauses[0].term == "foo" 49 | assert clauses[1].term == "bar" 50 | 51 | def test_term_with_presence_required_adds_required_clause(self): 52 | clauses = parse("+foo") 53 | assert len(clauses) == 1 54 | assert clauses[0].term == "foo" 55 | assert clauses[0].boost == 1 56 | assert clauses[0].fields == ["title", "body"] 57 | assert clauses[0].presence == QueryPresence.REQUIRED 58 | 59 | def test_term_with_presence_required_adds_prohibited_clause(self): 60 | clauses = parse("-foo") 61 | assert len(clauses) == 1 62 | assert clauses[0].term == "foo" 63 | assert clauses[0].boost == 1 64 | assert clauses[0].fields == ["title", "body"] 65 | assert clauses[0].presence == QueryPresence.PROHIBITED 66 | 67 | def test_term_scoped_by_field_with_presence_required(self): 68 | clauses = parse("+title:foo") 69 | assert len(clauses) == 1 70 | assert clauses[0].term == "foo" 71 | assert clauses[0].boost == 1 72 | assert clauses[0].fields == ["title"] 73 | assert clauses[0].presence == QueryPresence.REQUIRED 74 | 75 | def test_term_scoped_by_field_with_presence_prohibited(self): 76 | clauses = parse("-title:foo") 77 | assert len(clauses) == 1 78 | assert clauses[0].term == "foo" 79 | assert clauses[0].boost == 1 80 | assert clauses[0].fields == ["title"] 81 | assert clauses[0].presence == QueryPresence.PROHIBITED 82 | 83 | def test_multiple_terms_with_presence_creates_two_clauses(self): 84 | clauses = parse("+foo +bar") 85 | assert len(clauses) == 2 86 | assert clauses[0].term == "foo" 87 | assert clauses[1].term == "bar" 88 | assert clauses[0].presence == QueryPresence.REQUIRED 89 | assert 
clauses[1].presence == QueryPresence.REQUIRED 90 | 91 | def test_unknown_field(self): 92 | with pytest.raises(QueryParseError): 93 | parse("unknown:foo") 94 | 95 | def test_field_without_a_term(self): 96 | with pytest.raises(QueryParseError): 97 | parse("title:") 98 | 99 | def test_field_twice(self): 100 | with pytest.raises(QueryParseError): 101 | parse("title:title:") 102 | 103 | def test_term_with_field(self): 104 | clauses = parse("title:foo") 105 | assert len(clauses) == 1 106 | assert clauses[0].fields == ["title"] 107 | 108 | def test_uppercase_field_with_uppercase_term(self): 109 | query = Query(["TITLE"]) 110 | parser = QueryParser("TITLE:FOO", query) 111 | 112 | parser.parse() 113 | clauses = query.clauses 114 | 115 | assert len(clauses) == 1 116 | assert clauses[0].term == "foo" 117 | assert clauses[0].fields == ["TITLE"] 118 | 119 | def test_multiple_terms_scoped_to_different_fields(self): 120 | clauses = parse("title:foo body:bar") 121 | 122 | assert len(clauses) == 2 123 | assert clauses[0].fields == ["title"] 124 | assert clauses[1].fields == ["body"] 125 | 126 | assert clauses[0].term == "foo" 127 | assert clauses[1].term == "bar" 128 | 129 | def test_single_term_with_edit_distance(self): 130 | clauses = parse("foo~2") 131 | 132 | assert len(clauses) == 1 133 | assert clauses[0].term == "foo" 134 | assert clauses[0].fields == ["title", "body"] 135 | assert clauses[0].edit_distance == 2 136 | 137 | def test_multiple_terms_with_edit_distance(self): 138 | clauses = parse("foo~2 bar~3") 139 | 140 | assert len(clauses) == 2 141 | assert clauses[0].fields == ["title", "body"] 142 | assert clauses[1].fields == ["title", "body"] 143 | 144 | assert clauses[0].term == "foo" 145 | assert clauses[1].term == "bar" 146 | 147 | assert clauses[0].edit_distance == 2 148 | assert clauses[1].edit_distance == 3 149 | 150 | def test_single_term_scoped_to_field_with_edit_distance(self): 151 | clauses = parse("title:foo~2") 152 | 153 | assert len(clauses) == 1 154 | assert clauses[0].term == "foo" 155 | assert clauses[0].fields == ["title"] 156 | assert clauses[0].edit_distance == 2 157 | 158 | def test_non_numeric_edit_distance(self): 159 | with pytest.raises(QueryParseError): 160 | parse("foo~a") 161 | 162 | def test_edit_distance_without_a_term(self): 163 | with pytest.raises(QueryParseError): 164 | parse("~2") 165 | 166 | def test_single_term_with_boost(self): 167 | clauses = parse("foo^2") 168 | 169 | assert len(clauses) == 1 170 | assert clauses[0].term == "foo" 171 | assert clauses[0].fields == ["title", "body"] 172 | assert clauses[0].boost == 2 173 | 174 | def test_non_numeric_boost(self): 175 | with pytest.raises(QueryParseError): 176 | parse("foo^a") 177 | 178 | def test_boost_without_a_term(self): 179 | with pytest.raises(QueryParseError): 180 | parse("^2") 181 | 182 | def test_multiple_terms_with_boost(self): 183 | clauses = parse("foo^2 bar^3") 184 | 185 | assert len(clauses) == 2 186 | assert clauses[0].fields == ["title", "body"] 187 | assert clauses[1].fields == ["title", "body"] 188 | 189 | assert clauses[0].term == "foo" 190 | assert clauses[1].term == "bar" 191 | 192 | assert clauses[0].boost == 2 193 | assert clauses[1].boost == 3 194 | 195 | def test_term_scoped_by_field_with_boost(self): 196 | clauses = parse("title:foo^2") 197 | 198 | assert len(clauses) == 1 199 | assert clauses[0].term == "foo" 200 | assert clauses[0].fields == ["title"] 201 | assert clauses[0].boost == 2 202 | 203 | def test_term_with_boost_and_edit_distance(self): 204 | clauses = parse("foo^2~3") 205 | 
206 | assert len(clauses) == 1 207 | assert clauses[0].term == "foo" 208 | assert clauses[0].fields == ["title", "body"] 209 | assert clauses[0].edit_distance == 3 210 | assert clauses[0].boost == 2 211 | 212 | def test_edit_distance_followed_by_presence(self): 213 | clauses = parse("foo~10 +bar") 214 | 215 | assert len(clauses) == 2 216 | 217 | assert clauses[0].fields == ["title", "body"] 218 | assert clauses[1].fields == ["title", "body"] 219 | 220 | assert clauses[0].term == "foo" 221 | assert clauses[1].term == "bar" 222 | 223 | assert clauses[0].edit_distance == 10 224 | assert clauses[1].edit_distance == 0 225 | 226 | assert clauses[0].presence == QueryPresence.OPTIONAL 227 | assert clauses[1].presence == QueryPresence.REQUIRED 228 | 229 | def test_boost_followed_by_presence(self): 230 | clauses = parse("foo^10 +bar") 231 | 232 | assert len(clauses) == 2 233 | 234 | assert clauses[0].fields == ["title", "body"] 235 | assert clauses[1].fields == ["title", "body"] 236 | 237 | assert clauses[0].term == "foo" 238 | assert clauses[1].term == "bar" 239 | 240 | assert clauses[0].boost == 10 241 | assert clauses[1].boost == 1 242 | 243 | assert clauses[0].presence == QueryPresence.OPTIONAL 244 | assert clauses[1].presence == QueryPresence.REQUIRED 245 | -------------------------------------------------------------------------------- /tests/test_pipeline.py: -------------------------------------------------------------------------------- 1 | from mock import patch 2 | 3 | import pytest 4 | 5 | from lunr.exceptions import BaseLunrException 6 | from lunr.pipeline import Pipeline 7 | 8 | 9 | def noop(*args, **kwargs): 10 | pass 11 | 12 | 13 | def fn(*args, **kwargs): 14 | pass 15 | 16 | 17 | class BaseTestPipeline: 18 | @pytest.fixture(autouse=True) 19 | def setup_mock_pipline(self, monkeypatch): 20 | monkeypatch.setattr(Pipeline, "registered_functions", {}) 21 | monkeypatch.setattr(Pipeline, "warn_if_function_not_registered", noop) 22 | self.pipeline = Pipeline() 23 | 24 | 25 | class TestAdd(BaseTestPipeline): 26 | def test_add_function_to_pipeline(self): 27 | self.pipeline.add(noop) 28 | assert len(self.pipeline) == 1 29 | 30 | def test_add_multiple_functions_to_pipeline(self): 31 | self.pipeline.add(noop, noop) 32 | assert len(self.pipeline) == 2 33 | 34 | def test_add_warns_if_function_not_registered(self, monkeypatch): 35 | monkeypatch.undo() 36 | with patch("lunr.pipeline.log") as mock_log: 37 | self.pipeline.add(lambda x: x) 38 | mock_log.warning.assert_called_once() 39 | 40 | 41 | class TestRemove(BaseTestPipeline): 42 | def test_remove_function_exists_in_pipeline(self): 43 | self.pipeline.add(noop) 44 | assert len(self.pipeline) == 1 45 | 46 | self.pipeline.remove(noop) 47 | assert len(self.pipeline) == 0 48 | 49 | def test_remove_function_does_not_exist_in_pipeline(self): 50 | 51 | self.pipeline.add(noop) 52 | assert len(self.pipeline) == 1 53 | 54 | self.pipeline.remove(fn) 55 | assert len(self.pipeline) == 1 56 | 57 | 58 | class TestBefore(BaseTestPipeline): 59 | def test_before_other_function_exists(self): 60 | self.pipeline.add(noop) 61 | self.pipeline.before(noop, fn) 62 | 63 | assert self.pipeline._stack == [fn, noop] 64 | 65 | def test_before_other_functions_does_not_exist(self): 66 | with pytest.raises(BaseLunrException): 67 | self.pipeline.before(noop, fn) 68 | 69 | assert len(self.pipeline) == 0 70 | 71 | 72 | class TestAfter(BaseTestPipeline): 73 | def test_after_other_function_exists(self): 74 | self.pipeline.add(noop) 75 | self.pipeline.after(noop, fn) 76 | 77 | assert 
self.pipeline._stack == [noop, fn] 78 | 79 | def test_after_other_function_does_not_exist(self): 80 | with pytest.raises(BaseLunrException): 81 | self.pipeline.after(noop, fn) 82 | 83 | assert len(self.pipeline) == 0 84 | 85 | 86 | class TestRun(BaseTestPipeline): 87 | def test_run_calling_each_function_for_each_token(self): 88 | count_1 = [] 89 | count_2 = [] 90 | 91 | def fn1(t, *args): 92 | count_1.append(1) 93 | return t 94 | 95 | def fn2(t, *args): 96 | count_2.append(1) 97 | return t 98 | 99 | self.pipeline.add(fn1, fn2) 100 | self.pipeline.run([1, 2, 3]) 101 | 102 | assert len(count_1) == 3 103 | assert len(count_2) == 3 104 | 105 | def test_run_passes_token_to_pipeline_function(self): 106 | def fn(token, *args): 107 | assert token == "foo" 108 | 109 | self.pipeline.add(fn) 110 | self.pipeline.run(["foo"]) 111 | 112 | def test_run_passes_index_to_pipeline_function(self): 113 | def fn(_, index, *args): 114 | assert index == 0 115 | 116 | self.pipeline.add(fn) 117 | self.pipeline.run(["foo"]) 118 | 119 | def test_run_passes_entire_token_list_to_pipeline_function(self): 120 | def fn(_, __, tokens): 121 | assert tokens == ["foo"] 122 | 123 | self.pipeline.add(fn) 124 | self.pipeline.run(["foo"]) 125 | 126 | def test_run_passes_output_of_one_function_as_input_to_the_next(self): 127 | def fn1(t, *args): 128 | return t.upper() 129 | 130 | def fn2(t, *args): 131 | assert t == "FOO" 132 | 133 | self.pipeline.add(fn1, fn2) 134 | self.pipeline.run(["foo"]) 135 | 136 | def test_run_returns_the_results_of_the_last_function(self): 137 | def fn(t, *args): 138 | return t.upper() 139 | 140 | self.pipeline.add(fn) 141 | 142 | assert self.pipeline.run(["foo"]) == ["FOO"] 143 | 144 | def test_run_filters_out_none_and_empty_string_values(self): 145 | tokens = [] 146 | 147 | def fn1(t, i, _): 148 | if i % 2: 149 | return t 150 | elif i == 5: 151 | return "" 152 | 153 | def fn2(t, *args): 154 | tokens.append(t) 155 | return t 156 | 157 | self.pipeline.add(fn1) 158 | self.pipeline.add(fn2) 159 | 160 | output = self.pipeline.run(list("abcde")) 161 | 162 | assert tokens == ["b", "d"] 163 | assert output == ["b", "d"] 164 | 165 | def test_expanding_tokens_passed_to_output(self): 166 | self.pipeline.add(lambda t, *args: [t, t.upper()]) 167 | 168 | assert self.pipeline.run(["foo"]) == ["foo", "FOO"] 169 | 170 | def test_expanding_tokens_not_passed_to_same_function(self): 171 | received = [] 172 | 173 | def fn(t, *args): 174 | received.append(t) 175 | return [t, t.upper()] 176 | 177 | self.pipeline.add(fn) 178 | self.pipeline.run(["foo"]) 179 | 180 | assert received == ["foo"] 181 | 182 | def test_expanding_tokens_passed_to_the_next_pipeline_function(self): 183 | received = [] 184 | 185 | def fn1(t, *args): 186 | return [t, t.upper()] 187 | 188 | def fn2(t, *args): 189 | received.append(t) 190 | 191 | self.pipeline.add(fn1) 192 | self.pipeline.add(fn2) 193 | self.pipeline.run(["foo"]) 194 | 195 | assert received == ["foo", "FOO"] 196 | 197 | def test_skip_pipeline_function(self) -> None: 198 | def upper(t, *args): 199 | return t.upper() 200 | 201 | def lower(t, *args): 202 | return t.lower() 203 | 204 | self.pipeline.add(upper) 205 | self.pipeline.skip(upper, ["no_upper", "nothing"]) 206 | assert self.pipeline.run(["Foo"]) == ["FOO"] 207 | 208 | self.pipeline.add(lower) 209 | self.pipeline.skip(lower, ["no_lower", "nothing"]) 210 | assert self.pipeline.run(["Foo"]) == ["foo"] 211 | 212 | assert self.pipeline.run(["Foo"], field_name="no_lower") == ["FOO"] 213 | assert self.pipeline.run(["Foo"], 
field_name="no_upper") == ["foo"] 214 | assert self.pipeline.run(["Foo"], field_name="nothing") == ["Foo"] 215 | 216 | 217 | class TestSerialize(BaseTestPipeline): 218 | def test_serialize_returns_array_of_registered_function_labels(self): 219 | Pipeline.register_function(fn, "fn") 220 | self.pipeline.add(fn) 221 | 222 | assert self.pipeline.serialize() == ["fn"] 223 | assert repr(self.pipeline) == '' 224 | 225 | 226 | class TestRegisterFunction(BaseTestPipeline): 227 | def setup_method(self, method): 228 | def fn(*args): 229 | pass 230 | 231 | self.fn = fn 232 | 233 | def test_register_function_adds_a_label_property_to_the_function(self): 234 | Pipeline.register_function(self.fn, "fn") 235 | 236 | assert self.fn.label == "fn" 237 | 238 | def test_register_function_adds_defaults_to_name_of_the_function(self): 239 | Pipeline.register_function(self.fn) 240 | 241 | assert self.fn.label == self.fn.__name__ 242 | 243 | def test_register_function_adds_function_to_list_of_registered_functions(self): 244 | Pipeline.register_function(self.fn, "fn") 245 | 246 | assert Pipeline.registered_functions["fn"] == self.fn 247 | 248 | def test_register_function_warns_when_adding_function_with_same_label(self): 249 | Pipeline.register_function(self.fn, "fn") 250 | with patch("lunr.pipeline.log") as mock_log: 251 | Pipeline.register_function(self.fn, "fn") 252 | 253 | mock_log.warning.assert_called_once() 254 | 255 | 256 | class TestLoad(BaseTestPipeline): 257 | def test_load_with_registered_functions(self): 258 | serialized_pipeline = ["fn"] 259 | Pipeline.register_function(fn, "fn") 260 | 261 | pipeline = Pipeline.load(serialized_pipeline) 262 | 263 | assert len(pipeline) == 1 264 | assert pipeline._stack[0] == fn 265 | 266 | def test_load_with_unregistered_functions(self): 267 | serialized_pipeline = ["fn"] 268 | with pytest.raises(BaseLunrException): 269 | Pipeline.load(serialized_pipeline) 270 | 271 | 272 | class TestReset(BaseTestPipeline): 273 | def test_reset_empties_the_stack(self): 274 | self.pipeline.add(noop) 275 | assert len(self.pipeline) == 1 276 | 277 | self.pipeline.reset() 278 | assert len(self.pipeline) == 0 279 | -------------------------------------------------------------------------------- /tests/test_query_lexer.py: -------------------------------------------------------------------------------- 1 | from lunr.query_lexer import QueryLexer 2 | 3 | 4 | def _lex(string): 5 | lexer = QueryLexer(string) 6 | lexer.run() 7 | return lexer 8 | 9 | 10 | class TestQueryLexer: 11 | def test_single_term_produces_one_lexeme(self): 12 | lexer = _lex("foo") 13 | assert len(lexer.lexemes) == 1 14 | lexeme = lexer.lexemes[0] 15 | assert lexeme["type"] == QueryLexer.TERM 16 | assert lexeme["string"] == "foo" 17 | assert lexeme["start"] == 0 18 | assert lexeme["end"] == 3 19 | 20 | def test_term_escape_character(self): 21 | lexer = _lex("foo\\:bar") 22 | assert len(lexer.lexemes) == 1 23 | lexeme = lexer.lexemes[0] 24 | assert lexeme["type"] == QueryLexer.TERM 25 | assert lexeme["string"] == "foo:bar" 26 | assert lexeme["start"] == 0 27 | assert lexeme["end"] == 8 28 | 29 | def test_multiple_terms(self): 30 | lexer = _lex("foo bar") 31 | assert len(lexer.lexemes) == 2 32 | foo_lexeme, bar_lexeme = lexer.lexemes 33 | assert foo_lexeme["type"] == bar_lexeme["type"] == QueryLexer.TERM 34 | assert foo_lexeme["string"] == "foo" 35 | assert bar_lexeme["string"] == "bar" 36 | assert foo_lexeme["start"] == 0 37 | assert bar_lexeme["start"] == 4 38 | assert foo_lexeme["end"] == 3 39 | assert bar_lexeme["end"] == 7 
40 | 41 | def test_separator_length_greater_than_one(self): 42 | lexer = _lex("foo    bar") 43 | assert len(lexer.lexemes) == 2 44 | foo_lexeme, bar_lexeme = lexer.lexemes 45 | assert foo_lexeme["type"] == bar_lexeme["type"] == QueryLexer.TERM 46 | assert foo_lexeme["string"] == "foo" 47 | assert bar_lexeme["string"] == "bar" 48 | assert foo_lexeme["start"] == 0 49 | assert bar_lexeme["start"] == 7 50 | assert foo_lexeme["end"] == 3 51 | assert bar_lexeme["end"] == 10 52 | 53 | def test_hyphen_is_considered_a_separator(self): 54 | lexer = _lex("foo-bar") 55 | assert len(lexer.lexemes) == 2 56 | 57 | def test_term_with_field(self): 58 | lexer = _lex("title:foo") 59 | assert len(lexer.lexemes) == 2 60 | field_lexeme, term_lexeme = lexer.lexemes 61 | assert field_lexeme["type"] == QueryLexer.FIELD 62 | assert term_lexeme["type"] == QueryLexer.TERM 63 | assert field_lexeme["string"] == "title" 64 | assert term_lexeme["string"] == "foo" 65 | assert field_lexeme["start"] == 0 66 | assert term_lexeme["start"] == 6 67 | assert field_lexeme["end"] == 5 68 | assert term_lexeme["end"] == 9 69 | 70 | def test_term_with_field_with_escape_character(self): 71 | lexer = _lex("ti\\:tle:foo") 72 | assert len(lexer.lexemes) == 2 73 | field_lexeme, term_lexeme = lexer.lexemes 74 | assert field_lexeme["type"] == QueryLexer.FIELD 75 | assert term_lexeme["type"] == QueryLexer.TERM 76 | assert field_lexeme["string"] == "ti:tle" 77 | assert term_lexeme["string"] == "foo" 78 | assert field_lexeme["start"] == 0 79 | assert term_lexeme["start"] == 8 80 | assert field_lexeme["end"] == 7 81 | assert term_lexeme["end"] == 11 82 | 83 | def test_term_with_edit_distance(self): 84 | lexer = _lex("foo~2") 85 | assert len(lexer.lexemes) == 2 86 | term_lexeme, edit_distance_lexeme = lexer.lexemes 87 | assert term_lexeme["type"] == QueryLexer.TERM 88 | assert edit_distance_lexeme["type"] == QueryLexer.EDIT_DISTANCE 89 | assert term_lexeme["string"] == "foo" 90 | assert edit_distance_lexeme["string"] == "2" 91 | assert term_lexeme["start"] == 0 92 | assert edit_distance_lexeme["start"] == 4 93 | assert term_lexeme["end"] == 3 94 | assert edit_distance_lexeme["end"] == 5 95 | 96 | def test_term_with_boost(self): 97 | lexer = _lex("foo^10") 98 | assert len(lexer.lexemes) == 2 99 | term_lexeme, boost_lexeme = lexer.lexemes 100 | assert term_lexeme["type"] == QueryLexer.TERM 101 | assert boost_lexeme["type"] == QueryLexer.BOOST 102 | assert term_lexeme["string"] == "foo" 103 | assert boost_lexeme["string"] == "10" 104 | assert term_lexeme["start"] == 0 105 | assert boost_lexeme["start"] == 4 106 | assert term_lexeme["end"] == 3 107 | assert boost_lexeme["end"] == 6 108 | 109 | def test_term_with_field_boost_and_edit_distance(self): 110 | lexer = _lex("title:foo^10~5") 111 | assert len(lexer.lexemes) == 4 112 | field_lexeme, term_lexeme, boost_lexeme, edit_distance_lexeme = lexer.lexemes 113 | assert field_lexeme["type"] == QueryLexer.FIELD 114 | assert term_lexeme["type"] == QueryLexer.TERM 115 | assert boost_lexeme["type"] == QueryLexer.BOOST 116 | assert edit_distance_lexeme["type"] == QueryLexer.EDIT_DISTANCE 117 | 118 | assert field_lexeme["string"] == "title" 119 | assert term_lexeme["string"] == "foo" 120 | assert boost_lexeme["string"] == "10" 121 | assert edit_distance_lexeme["string"] == "5" 122 | 123 | assert field_lexeme["start"] == 0 124 | assert term_lexeme["start"] == 6 125 | assert boost_lexeme["start"] == 10 126 | assert edit_distance_lexeme["start"] == 13 127 | 128 | assert field_lexeme["end"] == 5 129 | assert
term_lexeme["end"] == 9 130 | assert boost_lexeme["end"] == 12 131 | assert edit_distance_lexeme["end"] == 14 132 | 133 | def test_single_term_with_hyphen_produces_two_lexemes(self): 134 | """Embedded hyphens should not be confused with presence operators.""" 135 | lexer = _lex("foo-bar") 136 | assert len(lexer.lexemes) == 2 137 | foo_lexeme, bar_lexeme = lexer.lexemes 138 | 139 | assert foo_lexeme["type"] == QueryLexer.TERM 140 | assert foo_lexeme["string"] == "foo" 141 | assert foo_lexeme["start"] == 0 142 | assert foo_lexeme["end"] == 3 143 | 144 | assert bar_lexeme["type"] == QueryLexer.TERM 145 | assert bar_lexeme["string"] == "bar" 146 | assert bar_lexeme["start"] == 4 147 | assert bar_lexeme["end"] == 7 148 | 149 | def test_single_term_with_presence_produces_two_lexemes(self): 150 | lexer = _lex("+foo") 151 | assert len(lexer.lexemes) == 2 152 | presence_lexeme, term_lexeme = lexer.lexemes 153 | 154 | assert presence_lexeme["type"] == QueryLexer.PRESENCE 155 | assert presence_lexeme["string"] == "+" 156 | assert presence_lexeme["start"] == 0 157 | assert presence_lexeme["end"] == 1 158 | 159 | assert term_lexeme["type"] == QueryLexer.TERM 160 | assert term_lexeme["string"] == "foo" 161 | assert term_lexeme["start"] == 1 162 | assert term_lexeme["end"] == 4 163 | 164 | def test_multiple_terms_with_presence_produces_four_lexemes(self): 165 | lexer = _lex("+foo +bar") 166 | assert len(lexer.lexemes) == 4 167 | ( 168 | foo_presence_lexeme, 169 | foo_term_lexeme, 170 | bar_presence_lexeme, 171 | bar_term_lexeme, 172 | ) = lexer.lexemes 173 | 174 | assert foo_term_lexeme["type"] == QueryLexer.TERM 175 | assert foo_term_lexeme["string"] == "foo" 176 | assert foo_term_lexeme["start"] == 1 177 | assert foo_term_lexeme["end"] == 4 178 | 179 | assert foo_presence_lexeme["type"] == QueryLexer.PRESENCE 180 | assert foo_presence_lexeme["string"] == "+" 181 | assert foo_presence_lexeme["start"] == 0 182 | assert foo_presence_lexeme["end"] == 1 183 | 184 | assert bar_term_lexeme["type"] == QueryLexer.TERM 185 | assert bar_term_lexeme["string"] == "bar" 186 | assert bar_term_lexeme["start"] == 6 187 | assert bar_term_lexeme["end"] == 9 188 | 189 | assert bar_presence_lexeme["type"] == QueryLexer.PRESENCE 190 | assert bar_presence_lexeme["string"] == "+" 191 | assert bar_presence_lexeme["start"] == 5 192 | assert bar_presence_lexeme["end"] == 6 193 | 194 | def test_multiple_terms_with_presence_and_fuzz(self): 195 | lexer = _lex("+foo~1 +bar") 196 | assert len(lexer.lexemes) == 5 197 | 198 | ( 199 | foo_presence_lexeme, 200 | foo_term_lexeme, 201 | foo_fuzz_lexeme, 202 | bar_presence_lexeme, 203 | bar_term_lexeme, 204 | ) = lexer.lexemes 205 | 206 | assert foo_presence_lexeme["type"] == QueryLexer.PRESENCE 207 | assert foo_presence_lexeme["string"] == "+" 208 | assert foo_presence_lexeme["start"] == 0 209 | assert foo_presence_lexeme["end"] == 1 210 | 211 | assert foo_term_lexeme["type"] == QueryLexer.TERM 212 | assert foo_term_lexeme["string"] == "foo" 213 | assert foo_term_lexeme["start"] == 1 214 | assert foo_term_lexeme["end"] == 4 215 | 216 | assert foo_fuzz_lexeme["type"] == QueryLexer.EDIT_DISTANCE 217 | assert foo_fuzz_lexeme["string"] == "1" 218 | assert foo_fuzz_lexeme["start"] == 5 219 | assert foo_fuzz_lexeme["end"] == 6 220 | 221 | assert bar_presence_lexeme["type"] == QueryLexer.PRESENCE 222 | assert bar_presence_lexeme["string"] == "+" 223 | assert bar_presence_lexeme["start"] == 7 224 | assert bar_presence_lexeme["end"] == 8 225 | 226 | assert bar_term_lexeme["type"] == QueryLexer.TERM 
227 | assert bar_term_lexeme["string"] == "bar" 228 | assert bar_term_lexeme["start"] == 8 229 | assert bar_term_lexeme["end"] == 11 230 | -------------------------------------------------------------------------------- /tests/test_token_set.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from lunr.token_set import TokenSet 4 | from lunr.exceptions import BaseLunrException 5 | 6 | 7 | class TestTokenSetStr: 8 | def test_str_includes_node_finality(self): 9 | non_final = TokenSet() 10 | final = TokenSet() 11 | other_final = TokenSet() 12 | 13 | final.final = True 14 | other_final.final = True 15 | 16 | assert str(non_final) != str(final) 17 | assert str(other_final) == str(final) 18 | 19 | def test_str_includes_all_edges(self): 20 | zero_edges = TokenSet() 21 | one_edge = TokenSet() 22 | two_edges = TokenSet() 23 | 24 | one_edge.edges["a"] = 1 25 | two_edges.edges["a"] = 1 26 | two_edges.edges["b"] = 1 27 | 28 | assert str(zero_edges) != str(one_edge) 29 | assert str(two_edges) != str(one_edge) 30 | assert str(two_edges) != str(zero_edges) 31 | 32 | def test_str_includes_edge_id(self): 33 | child_a = TokenSet() 34 | child_b = TokenSet() 35 | parent_a = TokenSet() 36 | parent_b = TokenSet() 37 | parent_c = TokenSet() 38 | 39 | parent_a.edges["a"] = child_a 40 | parent_b.edges["a"] = child_b 41 | parent_c.edges["a"] = child_b 42 | 43 | assert str(parent_b) == str(parent_c) 44 | assert str(parent_a) != str(parent_c) 45 | assert str(parent_a) != str(parent_b) 46 | 47 | 48 | class TestTokenSetFromString: 49 | def test_from_string_without_wildcard(self): 50 | TokenSet._next_id = 1 51 | x = TokenSet.from_string("a") 52 | 53 | assert str(x) == "0a2" 54 | assert x.edges["a"].final 55 | 56 | def test_from_string_with_trailing_wildcard(self): 57 | x = TokenSet.from_string("a*") 58 | wild = x.edges["a"].edges["*"] 59 | 60 | assert wild == wild.edges["*"] 61 | assert wild.final 62 | 63 | 64 | class TestTokenSetFromList: 65 | def test_from_list_with_unsorted_list(self): 66 | with pytest.raises(BaseLunrException): 67 | TokenSet.from_list(["z", "a"]) 68 | 69 | def test_from_list_with_sorted_list(self): 70 | token_set = TokenSet.from_list(["a", "z"]) 71 | assert ["a", "z"] == sorted(token_set.to_list()) 72 | 73 | def test_from_list_is_minimal(self): 74 | token_set = TokenSet.from_list(["ac", "dc"]) 75 | ac_node = token_set.edges["a"].edges["c"] 76 | dc_node = token_set.edges["d"].edges["c"] 77 | 78 | assert ac_node == dc_node 79 | 80 | 81 | class TestTokenSetToList: 82 | def test_to_list_includes_all_words(self): 83 | words = ["bat", "cat"] 84 | token_set = TokenSet.from_list(words) 85 | 86 | assert set(words) == set(token_set.to_list()) 87 | 88 | def test_to_list_includes_single_words(self): 89 | word = "bat" 90 | token_set = TokenSet.from_string(word) 91 | 92 | assert {word} == set(token_set.to_list()) 93 | 94 | 95 | class TestTokenSetIntersect: 96 | def test_no_intersection(self): 97 | x = TokenSet.from_string("cat") 98 | y = TokenSet.from_string("bar") 99 | z = x.intersect(y) 100 | 101 | assert len(z.to_list()) == 0 102 | 103 | def test_simple_intersection(self): 104 | x = TokenSet.from_string("cat") 105 | y = TokenSet.from_string("cat") 106 | z = x.intersect(y) 107 | 108 | assert {"cat"} == set(z.to_list()) 109 | 110 | def test_trailing_wildcard_intersection(self): 111 | x = TokenSet.from_string("cat") 112 | y = TokenSet.from_string("c*") 113 | z = x.intersect(y) 114 | 115 | assert {"cat"} == set(z.to_list()) 116 | 117 | def 
test_trailing_wildcard_no_intersection(self): 118 | x = TokenSet.from_string("cat") 119 | y = TokenSet.from_string("b*") 120 | z = x.intersect(y) 121 | 122 | assert len(z.to_list()) == 0 123 | 124 | def test_leading_wildcard_intersection(self): 125 | x = TokenSet.from_string("cat") 126 | y = TokenSet.from_string("*t") 127 | z = x.intersect(y) 128 | 129 | assert {"cat"} == set(z.to_list()) 130 | 131 | def test_leading_wildcard_no_intersection(self): 132 | x = TokenSet.from_string("cat") 133 | y = TokenSet.from_string("*r") 134 | z = x.intersect(y) 135 | 136 | assert len(z.to_list()) == 0 137 | 138 | def test_contained_wildcard_intersection(self): 139 | x = TokenSet.from_string("foo") 140 | y = TokenSet.from_string("f*o") 141 | z = x.intersect(y) 142 | 143 | assert {"foo"} == set(z.to_list()) 144 | 145 | def test_contained_wildcard_no_intersection(self): 146 | x = TokenSet.from_string("foo") 147 | y = TokenSet.from_string("b*r") 148 | z = x.intersect(y) 149 | 150 | assert len(z.to_list()) == 0 151 | 152 | def test_wildcard_zero_or_more_characters(self): 153 | x = TokenSet.from_string("foo") 154 | y = TokenSet.from_string("foo*") 155 | z = x.intersect(y) 156 | 157 | assert {"foo"} == set(z.to_list()) 158 | 159 | def test_with_fuzzy_string_substitution(self): 160 | x1 = TokenSet.from_string("bar") 161 | x2 = TokenSet.from_string("cur") 162 | x3 = TokenSet.from_string("cat") 163 | x4 = TokenSet.from_string("car") 164 | x5 = TokenSet.from_string("foo") 165 | y = TokenSet.from_fuzzy_string("car", 1) 166 | 167 | assert x1.intersect(y).to_list() == ["bar"] 168 | assert x2.intersect(y).to_list() == ["cur"] 169 | assert x3.intersect(y).to_list() == ["cat"] 170 | assert x4.intersect(y).to_list() == ["car"] 171 | assert x5.intersect(y).to_list() == [] 172 | 173 | def test_with_fuzzy_string_deletion(self): 174 | x1 = TokenSet.from_string("ar") 175 | x2 = TokenSet.from_string("br") 176 | x3 = TokenSet.from_string("ba") 177 | x4 = TokenSet.from_string("bar") 178 | x5 = TokenSet.from_string("foo") 179 | y = TokenSet.from_fuzzy_string("bar", 1) 180 | 181 | assert x1.intersect(y).to_list() == ["ar"] 182 | assert x2.intersect(y).to_list() == ["br"] 183 | assert x3.intersect(y).to_list() == ["ba"] 184 | assert x4.intersect(y).to_list() == ["bar"] 185 | assert x5.intersect(y).to_list() == [] 186 | 187 | def test_with_fuzzy_string_insertion(self): 188 | x1 = TokenSet.from_string("bbar") 189 | x2 = TokenSet.from_string("baar") 190 | x3 = TokenSet.from_string("barr") 191 | x4 = TokenSet.from_string("bar") 192 | x5 = TokenSet.from_string("ba") 193 | x6 = TokenSet.from_string("foo") 194 | x7 = TokenSet.from_string("bara") 195 | y = TokenSet.from_fuzzy_string("bar", 1) 196 | 197 | assert x1.intersect(y).to_list() == ["bbar"] 198 | assert x2.intersect(y).to_list() == ["baar"] 199 | assert x3.intersect(y).to_list() == ["barr"] 200 | assert x4.intersect(y).to_list() == ["bar"] 201 | assert x5.intersect(y).to_list() == ["ba"] 202 | assert x6.intersect(y).to_list() == [] 203 | assert x7.intersect(y).to_list() == ["bara"] 204 | 205 | def test_with_fuzzy_string_transpose(self): 206 | x1 = TokenSet.from_string("abr") 207 | x2 = TokenSet.from_string("bra") 208 | x3 = TokenSet.from_string("foo") 209 | y = TokenSet.from_fuzzy_string("bar", 1) 210 | 211 | assert x1.intersect(y).to_list() == ["abr"] 212 | assert x2.intersect(y).to_list() == ["bra"] 213 | assert x3.intersect(y).to_list() == [] 214 | 215 | def test_fuzzy_string_insertion(self): 216 | x = TokenSet.from_string("abcxx") 217 | y = TokenSet.from_fuzzy_string("abc", 2) 
218 | 219 | assert x.intersect(y).to_list() == ["abcxx"] 220 | 221 | def test_fuzzy_string_substitution(self): 222 | x = TokenSet.from_string("axx") 223 | y = TokenSet.from_fuzzy_string("abc", 2) 224 | 225 | assert x.intersect(y).to_list() == ["axx"] 226 | 227 | def test_fuzzy_string_deletion(self): 228 | x = TokenSet.from_string("a") 229 | y = TokenSet.from_fuzzy_string("abc", 2) 230 | 231 | assert x.intersect(y).to_list() == ["a"] 232 | 233 | def test_fuzzy_string_transpose(self): 234 | x = TokenSet.from_string("bca") 235 | y = TokenSet.from_fuzzy_string("abc", 2) 236 | 237 | assert x.intersect(y).to_list() == ["bca"] 238 | 239 | def test_leading_wildcard_backtracking_intersection(self): 240 | x = TokenSet.from_string("aaacbab") 241 | y = TokenSet.from_string("*ab") 242 | 243 | assert x.intersect(y).to_list() == ["aaacbab"] 244 | 245 | def test_leading_wildcard_backtracking_no_intersection(self): 246 | x = TokenSet.from_string("aaacbab") 247 | y = TokenSet.from_string("*abc") 248 | 249 | assert x.intersect(y).to_list() == [] 250 | 251 | def test_contained_wildcard_backtracking_intersection(self): 252 | x = TokenSet.from_string("ababc") 253 | y = TokenSet.from_string("a*bc") 254 | 255 | assert x.intersect(y).to_list() == ["ababc"] 256 | 257 | def test_contained_wildcard_backtracking_no_intersection(self): 258 | x = TokenSet.from_string("ababc") 259 | y = TokenSet.from_string("a*ac") 260 | 261 | assert x.intersect(y).to_list() == [] 262 | 263 | @pytest.mark.timeout(2) 264 | def test_catastrophic_backtracking_with_leading_characters(self): 265 | x = TokenSet.from_string("f" * 100) 266 | y = TokenSet.from_string("*f") 267 | 268 | assert len(x.intersect(y).to_list()) == 1 269 | 270 | def test_leading_trailing_wildcard_backtracking_intersection(self): 271 | x = TokenSet.from_string("acbaabab") 272 | y = TokenSet.from_string("*ab*") 273 | 274 | assert x.intersect(y).to_list() == ["acbaabab"] 275 | 276 | def test_leading_atrailing_wildcard_backtracking_intersection(self): 277 | x = TokenSet.from_string("acbaabab") 278 | y = TokenSet.from_string("a*ba*b") 279 | 280 | assert x.intersect(y).to_list() == ["acbaabab"] 281 | -------------------------------------------------------------------------------- /lunr/token_set.py: -------------------------------------------------------------------------------- 1 | class TokenSet: 2 | """ 3 | A token set is used to store the unique list of all tokens 4 | within an index. Token sets are also used to represent an 5 | incoming query to the index, this query token set and index 6 | token set are then intersected to find which tokens to look 7 | up in the inverted index. 8 | 9 | A token set can hold multiple tokens, as in the case of the 10 | index token set, or it can hold a single token as in the 11 | case of a simple query token set. 12 | 13 | Additionally token sets are used to perform wildcard matching. 14 | Leading, contained and trailing wildcards are supported, and 15 | from this edit distance matching can also be provided. 16 | 17 | Token sets are implemented as a minimal finite state automata, 18 | where both common prefixes and suffixes are shared between tokens. 19 | This helps to reduce the space used for storing the token set. 
20 | 21 | TODO: consider https://github.com/glyph/automat 22 | """ 23 | 24 | _next_id = 1 25 | 26 | def __init__(self): 27 | self.final = False 28 | self.edges = {} 29 | self.id = self._next_id 30 | self.__class__._next_id += 1 31 | 32 | def __str__(self): 33 | try: 34 | return self._string 35 | except AttributeError: 36 | pass 37 | 38 | string = "1" if self.final else "0" 39 | for label in sorted(list(self.edges.keys())): 40 | node = self.edges[label] 41 | try: 42 | node_id = str(node.id) 43 | except AttributeError: 44 | # TODO: JS seems to rely on undefined for the id attribute? 45 | node_id = "" 46 | 47 | string = string + label + node_id 48 | 49 | return string 50 | 51 | def __repr__(self): 52 | return '<TokenSet "{}">'.format(str(self)) 53 | 54 | @classmethod 55 | def from_string(cls, string): 56 | """Creates a TokenSet from a string. 57 | 58 | The string may contain one or more wildcard characters (*) that will 59 | allow wildcard matching when intersecting with another TokenSet. 60 | """ 61 | node = TokenSet() 62 | root = node 63 | 64 | # Iterates through all characters in the passed string appending 65 | # a node for each character. 66 | # When a wildcard character is found then a self referencing edge 67 | # is introduced to continually match any number of characters 68 | for i, char in enumerate(string): 69 | final = i == len(string) - 1 70 | if char == "*": 71 | node.edges[char] = node 72 | node.final = final 73 | else: 74 | next_ = TokenSet() 75 | next_.final = final 76 | node.edges[char] = next_ 77 | node = next_ 78 | 79 | return root 80 | 81 | @classmethod 82 | def from_fuzzy_string(cls, string, edit_distance): 83 | """Creates a token set representing a single string with a specified 84 | edit distance. 85 | 86 | Insertions, deletions, substitutions and transpositions are each 87 | treated as an edit distance of 1. 88 | 89 | Increasing the allowed edit distance will have a dramatic impact 90 | on the performance of both creating and intersecting these TokenSets. 91 | It is advised to keep the edit distance less than 3.
92 | """ 93 | root = TokenSet() 94 | 95 | stack = [{"node": root, "edits_remaining": edit_distance, "string": string}] 96 | 97 | while stack: 98 | frame = stack.pop() 99 | # no edit 100 | if len(frame["string"]) > 0: 101 | char = frame["string"][0] 102 | no_edit_node = None 103 | if char in frame["node"].edges: 104 | no_edit_node = frame["node"].edges[char] 105 | else: 106 | no_edit_node = TokenSet() 107 | frame["node"].edges[char] = no_edit_node 108 | 109 | if len(frame["string"]) == 1: 110 | no_edit_node.final = True 111 | 112 | stack.append( 113 | { 114 | "node": no_edit_node, 115 | "edits_remaining": frame["edits_remaining"], 116 | "string": frame["string"][1:], 117 | } 118 | ) 119 | 120 | if frame["edits_remaining"] == 0: 121 | continue 122 | 123 | # insertion, can only do insertion if there are edits remaining 124 | if "*" in frame["node"].edges: 125 | insertion_node = frame["node"].edges["*"] 126 | else: 127 | insertion_node = TokenSet() 128 | frame["node"].edges["*"] = insertion_node 129 | 130 | if len(frame["string"]) == 0: 131 | insertion_node.final = True 132 | 133 | stack.append( 134 | { 135 | "node": insertion_node, 136 | "edits_remaining": frame["edits_remaining"] - 1, 137 | "string": frame["string"], 138 | } 139 | ) 140 | 141 | # deletion, can only do a deletion if we have enough edits 142 | # remaining and if there are characters left to delete in the string 143 | if len(frame["string"]) > 1: 144 | stack.append( 145 | { 146 | "node": frame["node"], 147 | "edits_remaining": frame["edits_remaining"] - 1, 148 | "string": frame["string"][1:], 149 | } 150 | ) 151 | 152 | # deletion, just removing the last character of the string 153 | if len(frame["string"]) == 1: 154 | frame["node"].final = True 155 | 156 | # substitution, can only do a substitution if we have enough edits 157 | # remaining and there are characters left to substitute 158 | if len(frame["string"]) >= 1: 159 | if "*" in frame["node"].edges: 160 | substitution_node = frame["node"].edges["*"] 161 | else: 162 | substitution_node = TokenSet() 163 | frame["node"].edges["*"] = substitution_node 164 | 165 | if len(frame["string"]) == 1: 166 | substitution_node.final = True 167 | 168 | stack.append( 169 | { 170 | "node": substitution_node, 171 | "edits_remaining": frame["edits_remaining"] - 1, 172 | "string": frame["string"][1:], 173 | } 174 | ) 175 | 176 | # transposition, can only do a transposition if there are edits 177 | # remaining and there are enough characters to transpose 178 | if frame["edits_remaining"] and len(frame["string"]) > 1: 179 | char_a = frame["string"][0] 180 | char_b = frame["string"][1] 181 | transpose_node = None 182 | 183 | if char_b in frame["node"].edges: 184 | transpose_node = frame["node"].edges[char_b] 185 | else: 186 | transpose_node = TokenSet() 187 | frame["node"].edges[char_b] = transpose_node 188 | 189 | if len(frame["string"]) == 1: 190 | transpose_node.final = True 191 | 192 | stack.append( 193 | { 194 | "node": transpose_node, 195 | "edits_remaining": frame["edits_remaining"] - 1, 196 | "string": char_a + frame["string"][2:], 197 | } 198 | ) 199 | 200 | return root 201 | 202 | @classmethod 203 | def from_list(cls, list_of_words): 204 | from lunr.token_set_builder import TokenSetBuilder 205 | 206 | builder = TokenSetBuilder() 207 | for word in list_of_words: 208 | builder.insert(word) 209 | 210 | builder.finish() 211 | return builder.root 212 | 213 | @classmethod 214 | def from_clause(cls, clause): 215 | if clause.edit_distance: 216 | return cls.from_fuzzy_string(clause.term, 
clause.edit_distance) 217 | else: 218 | return cls.from_string(clause.term) 219 | 220 | def to_list(self): 221 | words = [] 222 | stack = [{"prefix": "", "node": self}] 223 | 224 | while stack: 225 | frame = stack.pop() 226 | if frame["node"].final: 227 | words.append(frame["prefix"]) 228 | 229 | for edge in frame["node"].edges.keys(): 230 | stack.append( 231 | { 232 | "prefix": frame["prefix"] + str(edge), 233 | "node": frame["node"].edges[edge], 234 | } 235 | ) 236 | 237 | return words 238 | 239 | def intersect(self, other): 240 | """Returns a new TokenSet that is the intersection of this TokenSet 241 | and the passed TokenSet. 242 | 243 | This intersection will take into account any wildcards contained within 244 | the TokenSet. 245 | """ 246 | output = TokenSet() 247 | stack = [{"node": self, "q_node": other, "output": output}] 248 | 249 | while stack: 250 | frame = stack.pop() 251 | for q_edge in frame["q_node"].edges.keys(): 252 | for n_edge in frame["node"].edges.keys(): 253 | if n_edge == q_edge or q_edge == "*": 254 | node = frame["node"].edges[n_edge] 255 | q_node = frame["q_node"].edges[q_edge] 256 | final = node.final and q_node.final 257 | next_ = None 258 | 259 | if n_edge in frame["output"].edges: 260 | next_ = frame["output"].edges[n_edge] 261 | next_.final = next_.final or final 262 | else: 263 | next_ = TokenSet() 264 | next_.final = final 265 | frame["output"].edges[n_edge] = next_ 266 | 267 | stack.append({"node": node, "q_node": q_node, "output": next_}) 268 | 269 | return output 270 | --------------------------------------------------------------------------------
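A minimal usage sketch of the TokenSet API listed above; it is not a file in the repository, just an illustration of the wildcard and fuzzy matching described in the docstrings. The import path and the methods from_list, from_string, from_fuzzy_string, intersect and to_list are taken directly from lunr/token_set.py and tests/test_token_set.py; the expected outputs in the comments follow the semantics those tests exercise.

from lunr.token_set import TokenSet

# Index-side automaton; from_list requires its input to be sorted.
corpus = TokenSet.from_list(["car", "cart", "cat"])

# Query-side automaton with a trailing wildcard: the "*" node self-loops,
# so it matches zero or more additional characters.
query = TokenSet.from_string("car*")
print(sorted(query.intersect(corpus).to_list()))  # ['car', 'cart']

# Fuzzy query: accepts anything within one insertion, deletion,
# substitution or transposition of "cat".
fuzzy = TokenSet.from_fuzzy_string("cat", 1)
print(sorted(fuzzy.intersect(corpus).to_list()))  # ['car', 'cart', 'cat']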