├── requirements.txt ├── docs ├── .DS_Store ├── config.py ├── pages │ └── templates │ │ ├── range.md │ │ ├── group-by.md │ │ ├── comparison.md │ │ ├── add.md │ │ ├── update.md │ │ ├── highlight.md │ │ ├── code-search.md │ │ ├── aggregate-metrics.md │ │ ├── delete.md │ │ ├── quickstart.md │ │ ├── ranking.md │ │ ├── index.md │ │ ├── search.html │ │ ├── autosuggest.md │ │ ├── spelling-correction.md │ │ ├── conditions │ │ └── operators.md │ │ ├── string-queries.md │ │ ├── matching.md │ │ ├── script-scores.md │ │ ├── storage-and-consistency.md │ │ ├── create.md │ │ └── search.md ├── hooks.py └── _site │ ├── comparison.md │ └── index.html │ └── aggregate-metrics.md │ └── index.html ├── jamesql ├── __init__.py ├── query_simplifier.py ├── script_lang.py └── rewriter.py ├── assets ├── .DS_Store └── screenshot.png ├── tests ├── fixtures │ ├── example_stub_and_query.json │ ├── documents.json │ ├── documents_with_numeric_values.json │ ├── documents_with_categorical_values.json │ ├── documents_with_categorical_and_numeric_values.json │ ├── documents_with_varied_data_types.json │ └── code │ │ ├── simplifier.py │ │ └── simplifier_demo.py ├── conftest.py ├── save_and_load.py ├── concurrency.py ├── autosuggest.py ├── spelling_correction.py ├── gsi_type_inference.py ├── script_lang.py ├── code_search.py ├── highlight.py ├── data_types.py ├── aggregation.py ├── range_queries.py ├── query_simplification.py ├── group_by.py ├── sort_by.py ├── string_queries_categorical_and_range.py ├── string_query.py └── test.py ├── CITATION.cff ├── .github └── workflows │ ├── welcome.yml │ ├── test.yml │ ├── release.yml │ ├── benchmark.yml │ ├── windows.yml │ └── documentation.yml ├── LICENSE ├── setup.py ├── web ├── landing.html ├── templates │ ├── index.html │ └── search.html └── web.py ├── schema.py └── .gitignore /requirements.txt: -------------------------------------------------------------------------------- 1 | pybmoore 2 | 
-------------------------------------------------------------------------------- /docs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/capjamesg/jamesql/HEAD/docs/.DS_Store -------------------------------------------------------------------------------- /jamesql/__init__.py: -------------------------------------------------------------------------------- 1 | from .index import JameSQL 2 | 3 | __version__ = "0.3.0" 4 | -------------------------------------------------------------------------------- /assets/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/capjamesg/jamesql/HEAD/assets/.DS_Store -------------------------------------------------------------------------------- /assets/screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/capjamesg/jamesql/HEAD/assets/screenshot.png -------------------------------------------------------------------------------- /tests/fixtures/example_stub_and_query.json: -------------------------------------------------------------------------------- 1 | { 2 | "query": { 3 | "and": [] 4 | }, 5 | "limit": 10, 6 | "sort_by": "title" 7 | } -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 
3 | authors: 4 | - given-names: James (capjamesg) 5 | title: "JameSQL" 6 | version: 0.1.0 7 | date-released: 2024-10-16 8 | -------------------------------------------------------------------------------- /tests/fixtures/documents.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"title": "tolerate it", "lyric": "I made you my temple, my mural my sky"}, 3 | { 4 | "title": "my tears ricochet", 5 | "lyric": "And I still talk to you when I'm screaming at the sky" 6 | }, 7 | {"title": "The Bolter", "lyric": "Started with a kiss"} 8 | ] -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def pytest_addoption(parser): 5 | parser.addoption( 6 | "--benchmark", action="store_true", default=False, help="Enable benchmarking" 7 | ) 8 | parser.addoption( 9 | "--long-benchmark", 10 | action="store_true", 11 | default=False, 12 | help="Enable long benchmark", 13 | ) 14 | -------------------------------------------------------------------------------- /tests/fixtures/documents_with_numeric_values.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"title": "tolerate it", "lyric": "I made you my temple, my mural, my sky", "listens": 100}, 3 | { 4 | "title": "my tears ricochet", 5 | "lyric": "And I still talk to you when I'm screaming at the sky", 6 | "listens": 200 7 | }, 8 | {"title": "The Bolter", "lyric": "Started with a kiss", "listens": 300} 9 | ] -------------------------------------------------------------------------------- /tests/fixtures/documents_with_categorical_values.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"title": "tolerate it", "lyric": "I made you my temple, my mural, my sky", "category": ["pop"]}, 3 | { 4 | "title": "my tears ricochet", 5 | "lyric": "And I 
still talk to you when I'm screaming at the sky", 6 | "category": ["pop"] 7 | }, 8 | {"title": "The Bolter", "lyric": "Started with a kiss", "category": ["pop", "acoustic"]} 9 | ] -------------------------------------------------------------------------------- /tests/fixtures/documents_with_categorical_and_numeric_values.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"title": "tolerate it", "lyric": "I made you my temple, my mural, my sky", "listens": 100, "category": ["pop"] }, 3 | { 4 | "title": "my tears ricochet", 5 | "lyric": "And I still talk to you when I'm screaming at the sky", 6 | "listens": 200, 7 | "category": ["pop", "acoustic"] 8 | }, 9 | {"title": "The Bolter", "lyric": "Started with a kiss", "listens": 300, "category": ["acoustic"]} 10 | ] -------------------------------------------------------------------------------- /docs/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | BASE_URLS = { 4 | "local": os.getcwd(), 5 | "production": "https://jamesg.blog/jamesql", 6 | } 7 | 8 | SITE_ENV = os.environ.get("SITE_ENV", "local") 9 | BASE_URL = BASE_URLS[SITE_ENV] 10 | ROOT_DIR = "pages" 11 | LAYOUTS_BASE_DIR = "_layouts" 12 | SITE_DIR = "_site" 13 | HOOKS = { 14 | "post_template_generation": {"hooks": ["highlight_code"]}, 15 | "pre_template_generation": {"hooks": ["generate_table_of_contents"]} 16 | } 17 | SITE_STATE = {} 18 | 19 | BASE_URL = BASE_URLS[SITE_ENV] -------------------------------------------------------------------------------- /docs/pages/templates/range.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Range Queries 4 | permalink: /range/ 5 | --- 6 | 7 | You can find values in a numeric range with a range query. 
Here is an example of a query that looks for documents where the `year` field is between `2010` and `2020`: 8 | 9 | ```python 10 | query = { 11 | "query": { 12 | "year": { 13 | "range": [2010, 2020] 14 | } 15 | } 16 | } 17 | ``` 18 | 19 | The first value in the range is the lower bound to use in the search, and the second value is the upper bound. -------------------------------------------------------------------------------- /docs/pages/templates/group-by.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Group By 4 | permalink: /group-by/ 5 | --- 6 | 7 | You can group results by a single key. This is useful for presenting aggregate views of data. 8 | 9 | To group results by a key, use the following code: 10 | 11 | ```python 12 | query = { 13 | "query": { 14 | "lyric": { 15 | "contains": "sky" 16 | } 17 | }, 18 | "group_by": "title" 19 | } 20 | ``` 21 | 22 | This query will search for all `lyric` fields that contain the term "sky" and group the results by the `title` field. -------------------------------------------------------------------------------- /.github/workflows/welcome.yml: -------------------------------------------------------------------------------- 1 | name: Welcome 2 | 3 | on: 4 | issues: 5 | types: [opened] 6 | pull_request_target: 7 | types: [opened] 8 | 9 | jobs: 10 | build: 11 | name: 👋 Welcome 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/first-interaction@v1.3.0 15 | with: 16 | repo-token: ${{ secrets.GITHUB_TOKEN }} 17 | issue-message: "Thank you for creating an Issue on this repository! 🙌 We will get back to you shortly." 18 | pr-message: "Thank you for creating an PR on this repository! 🙌 We will get back to you shortly." 
19 | -------------------------------------------------------------------------------- /docs/pages/templates/comparison.md: -------------------------------------------------------------------------------- 1 | You can find documents where a field is less than, greater than, less than or equal to, or greater than or equal to a value with a range query. Here is an example of a query that looks for documents where the `year` field is greater than `2010`: 2 | 3 |

 4 | query = {
 5 |     "query": {
 6 |         "year": {
 7 |             "greater_than": 2010
 8 |         }
 9 |     }
10 | }
11 | 
12 | 13 | The following operators are supported: 14 | 15 | - `greater_than` 16 | - `less_than` 17 | - `greater_than_or_equal` 18 | - `less_than_or_equal` -------------------------------------------------------------------------------- /tests/save_and_load.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from contextlib import ExitStack as DoesNotRaise 4 | 5 | import pytest 6 | 7 | from jamesql import JameSQL 8 | from jamesql.index import GSI_INDEX_STRATEGIES, INDEX_STORE 9 | 10 | 11 | @pytest.mark.skip 12 | def test_load_from_local_index(): 13 | with open("tests/fixtures/documents.json") as f: 14 | documents = json.load(f) 15 | 16 | index = JameSQL.load() 17 | 18 | assert len(index.global_index) == len(documents) 19 | assert index.global_index 20 | assert len(index.gsis) == 2 # indexing two fields 21 | assert index.gsis["title"] 22 | assert len(index.uuids_to_position_in_global_index) == len(documents) 23 | -------------------------------------------------------------------------------- /tests/fixtures/documents_with_varied_data_types.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"title": "tolerate it", "lyric": "I made you my temple, my mural, my sky", "listens": 100, "album_in_stock": true, "rating": 4.7, "metadata": {"version": 0}, "record_last_updated": "2023-01-01"}, 3 | { 4 | "title": "my tears ricochet", 5 | "lyric": "And I still talk to you when I'm screaming at the sky", 6 | "listens": 200, 7 | "album_in_stock": true, 8 | "rating": 4.7, "metadata": {"version": 0}, "record_last_updated": "2024-01-01" 9 | }, 10 | {"title": "The Bolter", "lyric": "Started with a kiss", "listens": 300, "album_in_stock": false, "rating": 4.9, "metadata": {"version": 0}, "record_last_updated": "2024-04-01"} 11 | ] -------------------------------------------------------------------------------- /docs/pages/templates/add.md: 
-------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Add a Document 4 | permalink: /add/ 5 | --- 6 | 7 | To add documents to a database, use the following code: 8 | 9 |

10 | index.add({"title": "tolerate it", "artist": "Taylor Swift"})
11 | index.add({"title": "betty", "artist": "Taylor Swift"})
12 | 
13 | 14 | Values within documents can have the following data types: 15 | 16 | - String 17 | - Integer 18 | - Float 19 | - List 20 | - Dictionary 21 | 22 | When documents are added, a `uuid` key is added for use in uniquely identifying the document. 23 | 24 |
25 | Dictionaries are not indexable. You can store dictionaries and they will be returned in payloads, but you cannot run search operations on them. 26 |
-------------------------------------------------------------------------------- /docs/pages/templates/update.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Update a Document 4 | permalink: /update/ 5 | --- 6 | 7 | 8 | You need a document UUID to update a document. You can retrieve a UUID by searching for a document. 9 | 10 | Here is an example showing how to update a document: 11 | 12 | ```python 13 | response = index.search( 14 | { 15 | "query": {"title": {"equals": "tolerate it"}}, 16 | "limit": 10, 17 | "sort_by": "title", 18 | } 19 | ) 20 | 21 | uuid = response["documents"][0]["uuid"] 22 | 23 | index.update(uuid, {"title": "tolerate it (folklore)", "artist": "Taylor Swift"}) 24 | ``` 25 | 26 | `update` is an override operation. This means you must provide the full document that you want to save, instead of only the fields you want to update. 27 | -------------------------------------------------------------------------------- /docs/pages/templates/highlight.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Highlight Results 4 | permalink: /highlight/ 5 | --- 6 | 7 | You can extract context around results. This data can be used to show a snippet of the document that contains the query term. 8 | 9 | Here is an example of a query that highlights context around all instances of the term "sky" in the `lyric` field: 10 | 11 | ```python 12 | query = { 13 | "query": { 14 | "lyric": { 15 | "contains": "sky", 16 | "highlight": True, 17 | "highlight_stride": 3 18 | } 19 | } 20 | } 21 | ``` 22 | 23 | `highlight_stride` states how many words to retrieve before and after the match. 24 | 25 | All documents returned by this query will have a `_context` key that contains the context around all instances of the term "sky". 
-------------------------------------------------------------------------------- /docs/pages/templates/code-search.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Code Search 4 | permalink: /code-search/ 5 | --- 6 | 7 | You can use JameSQL to efficiently search through code. 8 | 9 | To do so, first create a `TRIGRAM_CODE` index on the field you want to search. 10 | 11 | When you add documents, include at least the following two fields: 12 | 13 | - `file_name`: The name of the file the code is in. 14 | - `code`: The code you want to index. 15 | 16 | When you search for code, all matching documents will have a `_context` key with the following structure: 17 | 18 |

19 | {
20 |     "line": "1",
21 |     "code": "..."
22 | }
23 | 
24 | 25 | This tells you on what line your search matched, and the code that matched. This information is ideal to highlight specific lines relevant to your query. -------------------------------------------------------------------------------- /docs/pages/templates/aggregate-metrics.md: -------------------------------------------------------------------------------- 1 | 2 | You can find the total number of unique values for the fields returned by a query using an `aggregate` query. This is useful for presenting the total number of options available in a search space to a user. 3 | 4 | You can use the following query to find the total number of unique values for all fields whose `lyric` field contains the term "sky": 5 | 6 |

 7 | query = {
 8 |     "query": {
 9 |         "lyric": {
10 |             "contains": "sky"
11 |         }
12 |     },
13 |     "metrics": ["aggregate"]
14 | }
15 | 
16 | 17 | The aggregate results are presented in an `unique_record_values` key with the following structure: 18 | 19 |

20 | {
21 |     "documents": [...],
22 |     "query_time": 0.0001,
23 |     "unique_record_values": {"title": 2, "lyric": 2, "listens": 2, "categories": 3}
24 | }
25 | 
-------------------------------------------------------------------------------- /docs/pages/templates/delete.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Delete a Document 4 | permalink: /delete/ 5 | --- 6 | 7 | You need a document UUID to delete a document. You can retrieve a UUID by searching for a document. 8 | 9 | Here is an example showing how to delete a document: 10 | 11 |

12 | response = index.search(
13 |     {
14 |         "query": {"title": {"equals": "tolerate it"}},
15 |         "limit": 10,
16 |         "sort_by": "title",
17 |     }
18 | )
19 | 
20 | uuid = response["documents"][0]["uuid"]
21 | 
22 | index.remove(uuid)
23 | 
24 | 25 | You can validate the document has been deleted using this code: 26 | 27 |

28 | response = index.search(
29 |     {
30 |         "query": {"title": {"equals": "tolerate it"}},
31 |         "limit": 10,
32 |         "sort_by": "title",
33 |     }
34 | )
35 | 
36 | assert len(response["documents"]) == 0
37 | 
-------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: JameSQL Test Workflow (macOS and Ubuntu) 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | push: 7 | branches: [main] 8 | 9 | jobs: 10 | build-dev-test: 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | os: ["ubuntu-latest", "macos-latest"] 15 | python-version: ["3.10", "3.11", "3.12", "3.13"] 16 | steps: 17 | - name: 🛎️ Checkout 18 | uses: actions/checkout@v4 19 | - name: 🐍 Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v5 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | check-latest: true 24 | 25 | - name: 📦 Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install -e . 29 | pip install -e .[dev] 30 | - name: 🧪 Test 31 | run: | 32 | python -m pytest tests/*.py 33 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Publish Workflow 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | strategy: 11 | matrix: 12 | python-version: [3.12] 13 | steps: 14 | - name: 🛎️ Checkout 15 | uses: actions/checkout@v4 16 | with: 17 | ref: ${{ github.head_ref }} 18 | - name: 🐍 Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v5 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | - name: 🦾 Install dependencies 23 | run: | 24 | python -m pip install --upgrade pip twine wheel 25 | - name: 🚀 Publish to PyPi 26 | env: 27 | PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }} 28 | PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 29 | run: | 30 | python setup.py sdist bdist_wheel 31 | twine check dist/* 32 | twine upload dist/* -u ${PYPI_USERNAME} -p ${PYPI_PASSWORD} 
--verbose 33 | -------------------------------------------------------------------------------- /.github/workflows/benchmark.yml: -------------------------------------------------------------------------------- 1 | name: JameSQL Benchmark Workflow 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | push: 7 | branches: [main] 8 | 9 | jobs: 10 | build-dev-test: 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | os: ["ubuntu-latest", "macos-latest"] 15 | python-version: ["3.10", "3.11", "3.12", "3.13"] 16 | steps: 17 | - name: 🛎️ Checkout 18 | uses: actions/checkout@v4 19 | - name: 🐍 Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v5 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | check-latest: true 24 | 25 | - name: 📦 Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install -e . 29 | pip install -e .[dev] 30 | 31 | - name: 🧪 Run benchmark stress test 32 | env: 33 | SITE_ENV: production 34 | run: | 35 | python -m pytest ./tests/*.py --benchmark 36 | python -m pytest ./tests/*.py --long-benchmark 37 | 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 James 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.github/workflows/windows.yml: -------------------------------------------------------------------------------- 1 | name: JameSQL Test Workflow (Windows) 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | push: 7 | branches: [main] 8 | 9 | jobs: 10 | build-dev-test: 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | os: ["windows-latest"] 15 | python-version: ["3.10", "3.11", "3.12", "3.13"] 16 | steps: 17 | - name: 🛎️ Checkout 18 | uses: actions/checkout@v4 19 | - name: 🐍 Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v5 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | check-latest: true 24 | 25 | - name: 📦 Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install -e . 
29 | pip install -e .[dev] 30 | - name: 🧪 Test 31 | run: | 32 | python -m pytest tests/aggregation.py tests/data_types.py tests/group_by.py tests/highlight.py tests/range_queries.py tests/save_and_load.py tests/script_lang.py tests/string_queries_categorical_and_range.py tests/string_query.py tests/test.py 33 | -------------------------------------------------------------------------------- /.github/workflows/documentation.yml: -------------------------------------------------------------------------------- 1 | name: Publish Documentation 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - name: Checkout code 14 | uses: actions/checkout@v4 15 | 16 | - name: Set up Python 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: '3.13' 20 | check-latest: true 21 | 22 | - name: Install dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | python -m pip install pygments bs4 lxml 26 | python -m pip install git+https://github.com/capjamesg/aurora 27 | cd docs 28 | - name: Build main site 29 | env: 30 | SITE_ENV: ${{ secrets.SITE_ENV }} 31 | run: | 32 | cd docs 33 | aurora build 34 | - name: rsync deployments 35 | uses: burnett01/rsync-deployments@7.0.1 36 | with: 37 | switches: -avzr 38 | path: "docs/_site/*" 39 | remote_path: ${{ secrets.REMOTE_PATH }} 40 | remote_host: ${{ secrets.SERVER_HOST }} 41 | remote_user: ${{ secrets.SERVER_USERNAME }} 42 | remote_key: ${{ secrets.KEY }} 43 | -------------------------------------------------------------------------------- /tests/concurrency.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import threading 4 | 5 | from jamesql import JameSQL 6 | from jamesql.index import GSI_INDEX_STRATEGIES 7 | 8 | 9 | def test_threading(): 10 | with open("tests/fixtures/documents.json") as f: 11 | documents = json.load(f) 12 | 13 | index = JameSQL() 14 | 15 | 
index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS) 16 | index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS) 17 | 18 | for document in documents * 100: 19 | document = document.copy() 20 | index.add(document, doc_id=str(random.randint(0, 1000000))) 21 | 22 | def query(i): 23 | if i == 0: 24 | document = documents[0].copy() 25 | document["title"] = "teal" 26 | index.add(document, "xyz") 27 | index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS) 28 | 29 | assert len(index.string_query_search("teal")["documents"]) == 1 30 | 31 | threads = [] 32 | 33 | for i in range(2500): 34 | t = threading.Thread(target=query, args=(i,)) 35 | threads.append(t) 36 | t.start() 37 | 38 | for t in threads: 39 | t.join() 40 | 41 | assert len(index.global_index) == 301 42 | assert index.global_index["xyz"]["title"] == "teal" 43 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | from setuptools import find_packages 3 | import re 4 | 5 | with open("./jamesql/__init__.py", 'r') as f: 6 | content = f.read() 7 | version = re.search(r'__version__\s*=\s*[\'"]([^\'"]*)[\'"]', content).group(1) 8 | 9 | with open("README.md", "r") as fh: 10 | long_description = fh.read() 11 | 12 | setuptools.setup( 13 | name="jamesql", 14 | version=version, 15 | author="capjamesg", 16 | author_email="jamesg@jamesg.blog", 17 | description="A JameSQL database implemented in Python.", 18 | long_description=long_description, 19 | long_description_content_type="text/markdown", 20 | url="https://github.com/capjamesg/jamesql", 21 | install_requires=[ 22 | "pybmoore", 23 | "pygtrie", 24 | "lark", 25 | "btrees", 26 | "nltk", 27 | "sortedcontainers" 28 | ], 29 | packages=find_packages(exclude=("tests",)), 30 | extras_require={ 31 | "dev": ["flake8", "black==22.3.0", "isort", "twine", "pytest", "wheel", "flask", "orjson", "tqdm", 
"deepdiff"], 32 | }, 33 | classifiers=[ 34 | "Programming Language :: Python :: 3", 35 | "License :: OSI Approved :: MIT License", 36 | "Operating System :: OS Independent", 37 | ], 38 | python_requires=">=3.7", 39 | ) 40 | -------------------------------------------------------------------------------- /web/landing.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Document 7 | 32 | 33 | 34 |
35 |

JameSQL

36 |

Fast, in-memory search.

37 |
38 | 39 | -------------------------------------------------------------------------------- /jamesql/query_simplifier.py: -------------------------------------------------------------------------------- 1 | def normalize_operator_query(t): 2 | if isinstance(t, str): 3 | return t 4 | 5 | return "_".join(t) 6 | 7 | 8 | def simplifier(terms): 9 | new_terms = [] 10 | outer_terms = set() 11 | to_remove = set() 12 | 13 | for i, t in enumerate(terms): 14 | if isinstance(t, str) and t not in outer_terms: 15 | outer_terms.add(t) 16 | new_terms.append(t) 17 | 18 | for i, t in enumerate(terms): 19 | normalized_terms = normalize_operator_query(t) 20 | if isinstance(t, list) and t[1] == "OR": 21 | for inner_term in t: 22 | if inner_term == "OR": 23 | continue 24 | 25 | if inner_term not in outer_terms: 26 | outer_terms.add(inner_term) 27 | new_terms.append(inner_term) 28 | elif ( 29 | isinstance(t, list) 30 | and t[1] == "AND" 31 | and normalized_terms not in outer_terms 32 | ): 33 | new_terms.append(t[0]) 34 | new_terms.append("AND") 35 | new_terms.append(t[2]) 36 | elif ( 37 | isinstance(t, list) 38 | and t[0] == "NOT" 39 | and normalized_terms not in outer_terms 40 | ): 41 | new_terms.append("-" + t[1]) 42 | 43 | if t[1] in outer_terms: 44 | to_remove.add(t[1]) 45 | to_remove.add("-" + t[1]) 46 | 47 | return [i for i in new_terms if normalize_operator_query(i) not in to_remove] 48 | -------------------------------------------------------------------------------- /docs/pages/templates/quickstart.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Quickstart 4 | permalink: /quickstart/ 5 | --- 6 | 7 |

You can create a JameSQL database in five lines of code.

8 | 9 |

Install JameSQL

10 | 11 | First, install JameSQL: 12 | 13 |
14 | pip install jamesql
15 | 
16 | 17 |

Insert Records

18 | 19 | Then, create a new Python file and add the following code: 20 | 21 |

22 | from jamesql import JameSQL, GSI_INDEX_STRATEGIES
23 | 
24 | index = JameSQL.load()
25 | 
26 | index.add({"title": "tolerate it", "lyric": "Use my best colors for your portrait"})
27 | 
28 | 29 |

Create an Index

30 | 31 | For efficient data retrieval for longer pieces of text in the `lyric` key, we are going to use the `CONTAINS` index type. This creates a reverse index for each word in the text. 32 | 33 |

34 | index.create_gsi("lyric", GSI_INDEX_STRATEGIES.CONTAINS)
35 | 
36 | 37 |

Search the Database

38 | 39 | We can search the database using the following code: 40 | 41 |

42 | results = index.string_query_search("title:'tolerate it' colors")
43 | 
44 | print(results)
45 | 
46 | 47 | Our code returns: 48 | 49 |

50 | {"documents": [{"title": "tolerate it", "lyric": "Use my best colors for your portrait" …}]}
51 | 
52 | 53 | We have successfully built a database! 54 | 55 | -------------------------------------------------------------------------------- /tests/fixtures/code/simplifier.py: -------------------------------------------------------------------------------- 1 | def normalize_operator_query(t): 2 | if isinstance(t, str): 3 | return t 4 | 5 | return "_".join(t) 6 | 7 | 8 | def simplifier(terms): 9 | new_terms = [] 10 | outer_terms = set() 11 | to_remove = set() 12 | 13 | for i, t in enumerate(terms): 14 | if isinstance(t, str) and t not in outer_terms: 15 | outer_terms.add(t) 16 | new_terms.append(t) 17 | 18 | for i, t in enumerate(terms): 19 | normalized_terms = normalize_operator_query(t) 20 | if isinstance(t, list) and t[1] == "OR": 21 | for inner_term in t: 22 | if inner_term == "OR": 23 | continue 24 | 25 | if inner_term not in outer_terms: 26 | outer_terms.add(inner_term) 27 | new_terms.append(inner_term) 28 | elif ( 29 | isinstance(t, list) 30 | and t[1] == "AND" 31 | and normalized_terms not in outer_terms 32 | ): 33 | new_terms.append(t) 34 | outer_terms.add(normalized_terms) 35 | if t[0] in outer_terms: 36 | to_remove.add(t[0]) 37 | if t[2] in outer_terms: 38 | to_remove.add(t[2]) 39 | elif ( 40 | isinstance(t, list) 41 | and t[0] == "NOT" 42 | and normalized_terms not in outer_terms 43 | ): 44 | if t[1] in outer_terms: 45 | to_remove.add(t[1]) 46 | 47 | return [i for i in new_terms if normalize_operator_query(i) not in to_remove] 48 | -------------------------------------------------------------------------------- /docs/pages/templates/ranking.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | permalink: /ranking/ 4 | title: Document Ranking 5 | --- 6 | 7 | By default, documents are ranked in no order. If you provide a `sort_by` field, documents are sorted by that field. 8 | 9 | For more advanced ranking, you can use the `boost` feature. 
This feature lets you boost the value of a field in a document to calculate a final score. 10 | 11 | The default score for each field is `1`. 12 | 13 | To use this feature, you must use `boost` on fields that have an index. 14 | 15 | Here is an example of a query that uses the `boost` feature: 16 | 17 | ```python 18 | { 19 | "query": { 20 | "or": { 21 | "post": { 22 | "contains": "taylor swift", 23 | "strict": False, 24 | "boost": 1 25 | }, 26 | "title": { 27 | "contains": "desk", 28 | "strict": True, 29 | "boost": 25 30 | } 31 | } 32 | }, 33 | "limit": 4, 34 | "sort_by": "_score", 35 | } 36 | ``` 37 | 38 | This query would search for documents whose `post` field contains `taylor swift` or whose `title` field contains `desk`. The `title` field is boosted by 25, so documents that match the `title` field are ranked higher. 39 | 40 | The score for each document before boosting is equal to the number of times the query condition is satisfied. For example, if a post contains `taylor swift` twice, the score for that document is `2`; if a title contains `desk` once, the score for that document is `1`. 41 | 42 | Documents are then ranked in decreasing order of score. -------------------------------------------------------------------------------- /docs/pages/templates/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: JameSQL 4 | permalink: / 5 | --- 6 | 7 | An in-memory, NoSQL database implemented in Python, with support for building custom ranking algorithms. 8 | 9 | You can run full text search queries on thousands of documents with multiple fields in < 1ms. 10 | 11 | ## Demo 12 | 13 | [Try a site search engine built with JameSQL](https://jamesg.blog/search-pages/). 14 | 15 | 18 | 19 | ## Ideal use case 20 | 21 | JameSQL is designed for small-scale search projects where objects can easily be loaded into and stored in memory. 
"""Demo: a toy trigram-based line search over markdown posts.

Builds an inverted index from character trigrams to (file, line) postings,
then intersects the posting lists for every trigram of the query to find
candidate lines, printing each hit with optional surrounding context.
"""

import math  # NOTE(review): imported but unused in this script
import os
from collections import defaultdict


def get_trigrams(line):
    # All overlapping 3-character windows; empty for strings shorter than 3.
    return [line[i : i + 3] for i in range(len(line) - 2)]


# trigram -> list of (file object, line number) postings
index = defaultdict(list)

# read all python files in .
DIR = "./pages/posts/"
id2line = {}  # "filename:line_num" -> line text
doc_lengths = {}  # filename -> total number of lines

for root, dirs, files in os.walk(DIR):
    for file in files:
        if file.endswith(".md"):
            # NOTE(review): `file` (the filename string from os.walk) is
            # shadowed by the open file handle here; every later use of
            # `file.name` relies on the handle — confirm this shadowing
            # is intentional before refactoring.
            with open(os.path.join(root, file), "r") as file:
                code = file.read()

            code_lines = code.split("\n")
            total_lines = len(code_lines)

            for line_num, line in enumerate(code_lines):
                trigrams = get_trigrams(line)

                # Lines too short to yield a trigram are still recorded so
                # the context printer below can look them up by line number.
                if len(trigrams) == 0:
                    id2line[f"{file.name}:{line_num}"] = line

                for trigram in trigrams:
                    index[trigram].append((file, line_num))
                    id2line[f"{file.name}:{line_num}"] = line

            doc_lengths[file.name] = total_lines

query = "coffee"
context = 0  # number of lines of context to print around each match

trigrams = get_trigrams(query)

# Seed candidates with the first trigram's postings, then narrow the set by
# intersecting with the postings of every remaining trigram.
# NOTE(review): a query shorter than 3 characters yields no trigrams and
# `trigrams[0]` would raise IndexError — presumably queries are always
# at least 3 characters; confirm.
candidates = set(index[trigrams[0]])
# print([file.name + ":" + str(line_num) for file, line_num in candidates])
for trigram in trigrams:
    candidates = candidates.intersection(set(index[trigram]))

for file, line_num in candidates:
    print(f"{file.name}:{line_num}")
    # Print the matching line plus up to `context` lines either side,
    # clamped to the document's bounds.
    for i in range(
        max(0, line_num - context), min(doc_lengths[file.name], line_num + context + 1)
    ):
        line = id2line[f"{file.name}:{i}"]
        print(f"{i}: {line}")

    print()
-------------------------------------------------------------------------------- /docs/pages/templates/search.html: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | permalink: /search-pages/ 4 | title: Search 5 | notoc: true 6 | --- 7 | 8 |

search results for ""

"""Pydantic schema for JameSQL query documents, plus a small demo."""

from __future__ import annotations

from enum import Enum
from typing import Dict, Optional

from pydantic import BaseModel, ConfigDict, model_validator

# Plain-string counterparts of the enums below, for membership checks.
VALID_QUERY_TYPES = ["contains", "equals", "starts_with"]
VALID_OPERATOR_QUERY_TYPES = ["or", "and"]


class QueryType(str, Enum):
    """Leaf comparison operators accepted inside a query item."""

    contains = "contains"
    equals = "equals"
    starts_with = "starts_with"


class AndOperatorQueryType(str, Enum):
    """The "and" combinator key."""

    and_ = "and"


class OrOperatorQueryType(str, Enum):
    """The "or" combinator key."""

    or_ = "or"


class QueryItem(BaseModel):
    """One field condition: at most one of contains/equals/starts_with.

    ``extra="forbid"`` rejects unknown keys so typos in a query fail
    loudly instead of being silently ignored.
    """

    model_config = ConfigDict(extra="forbid")

    contains: Optional[str] = None
    equals: Optional[str] = None
    starts_with: Optional[str] = None

    # strict: whether terms must appear as a contiguous phrase.
    strict: Optional[bool] = False
    # boost: score multiplier for this condition.
    boost: Optional[int] = 1

    # ensure that only one of the query types is used
    # NOTE(review): with mode="after" this validator receives the model
    # instance; the (cls, v) signature relies on pydantic's implicit
    # classmethod handling — confirm it behaves as intended on the
    # pinned pydantic version.
    @model_validator(mode="after")
    def validate_query_type(cls, v):
        query_types = [v.contains, v.equals, v.starts_with]

        if len([qt for qt in query_types if qt]) > 1:
            raise ValueError("Only one query type can be used")

        return v


class RootQuery(BaseModel):
    """Top-level query envelope: the query tree plus paging/sorting."""

    # Either a single "and"/"or" combinator mapping to field conditions,
    # or a bare mapping of field name -> condition.
    query: (
        Dict[AndOperatorQueryType, Dict[str, QueryItem]]
        | Dict[OrOperatorQueryType, Dict[str, QueryItem]]
        | Dict[str, QueryItem]
    )
    limit: Optional[int] = 10
    # NOTE(review): other docs in this repo sort on "_score"; confirm
    # whether the "score" default here is intentional.
    sort_by: Optional[str] = "score"


# Demo query validated at import time (this module prints when imported).
query = {
    "query": {
        "or": {
            "post": {"contains": "taylor swift", "strict": False, "boost": 1},
            "title": {"contains": "my desk", "strict": True, "boost": 25},
        }
    },
    "limit": 4,
    "sort_by": "score",
}

# validate query
print(RootQuery(**query))
import datetime
import math

from lark import Transformer

# Grammar for the JameSQL script-score mini-language.
#
# Declared as a raw string so the regex escape `\.` in FLOAT reaches lark
# verbatim: in a plain string literal `\.` is an invalid escape sequence
# (SyntaxWarning on Python 3.12+, scheduled to become a SyntaxError).
# The resulting string value is identical to the previous non-raw form.
grammar = r"""
start: query

query: decay | "(" query OPERATOR query ")" | logarithm | FLOAT | WORD
logarithm: LOGARITHM "(" query ")"
OPERATOR: "+" | "-" | "*" | "/"
LOGARITHM: "log"
decay: "decay" WORD

WORD: /[a-zA-Z0-9_]+/
FLOAT: /[0-9]+(\.[0-9]+)?/

%import common.WS
%ignore WS
"""

# Dispatch table mapping each operator lexeme to its implementation.
OPERATOR_METHODS = {
    "+": lambda x, y: x + y,
    "-": lambda x, y: x - y,
    "*": lambda x, y: x * y,
    "/": lambda x, y: x / y,
}


class JameSQLScriptTransformer(Transformer):
    """Reduce a parsed script-score expression to a number for one document.

    WORD tokens are resolved as field names in ``document``; arithmetic
    nodes are folded bottom-up by lark's transformer machinery.
    """

    def __init__(self, document):
        # Initialise lark's Transformer state before adding our own.
        super().__init__()
        self.document = document

    def query(self, items):
        """Evaluate a query node.

        A single child passes through unchanged; three children are the
        ``(left OPERATOR right)`` form and are combined arithmetically.
        """
        if len(items) == 1:
            return items[0]

        left, operator, right = items

        return OPERATOR_METHODS[operator](left, right)

    def logarithm(self, items):
        # items[0] is the "log" keyword token; items[1] is the operand.
        # + 0.1 removes the possibility of log(0),
        # which would return a math domain error.
        return math.log(items[1] + 0.1)

    def start(self, items):
        """Unwrap the root node to the final numeric score."""
        return items[0]

    def decay(self, items):
        """Scale the score according to the age of the document.

        ``items[0]`` is the already-resolved field value and must be an
        ISO-8601 timestamp string (``%Y-%m-%dT%H:%M:%S``).

        NOTE(review): 1.1 ** (days / 30) *grows* with document age,
        whereas the project docs describe a 0.9 ** (days / 30) decay —
        confirm which behaviour is intended before changing it.
        """
        days_since_post = (
            datetime.datetime.now()
            - datetime.datetime.strptime(items[0], "%Y-%m-%dT%H:%M:%S")
        ).days

        return 1.1 ** (days_since_post / 30)

    def WORD(self, items):
        """Resolve a bare word: all-digit lexemes become floats; anything
        else is treated as a field name and looked up in the document."""
        if items.value.isdigit():
            return float(items.value)

        return self.document[items.value]

    def FLOAT(self, items):
        """Convert a FLOAT token to a Python float."""
        return float(items.value)

    def OPERATOR(self, items):
        """Pass the operator lexeme through for the dispatch table."""
        return items.value
-------------------------------------------------------------------------------- /docs/pages/templates/autosuggest.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | permalink: /autosuggest/ 4 | title: Autosuggest 5 | --- 6 | 7 | You can enable autosuggest using one or more fields in an index. This can be used to efficiently find records that start with a given prefix. 8 | 9 | To enable autosuggest on an index, run: 10 | 11 |

12 | index = JameSQL()
13 | 
14 | index.enable_autosuggest("field")
15 | 
16 | 17 | Where `field` is the name of the field on which you want to enable autosuggest. 18 | 19 | You can enable autosuggest on multiple fields: 20 | 21 |

22 | index.enable_autosuggest("field1")
23 | index.enable_autosuggest("field2")
24 | 
25 | 26 | When you enable autosuggest on a field, JameSQL will create a trie index for that field. This index is used to efficiently find records that start with a given prefix. 27 | 28 | To run an autosuggest query, use the following code: 29 | 30 |

31 | suggestions = index.autosuggest("started", match_full_record=True, limit = 1)
32 | 
33 | 34 | This will automatically return records that start with the prefix `started`. 35 | 36 | The `match_full_record` parameter indicates whether to return full record names, or any records starting with a term. 37 | 38 | `match_full_record=True` means that the full record name will be returned. This is ideal to enable selection between full records. 39 | 40 | `match_full_record=False` means that any records starting with the term will be returned. This is ideal for autosuggesting single words. 41 | 42 | For example, given the query `start`, matching against full records with `match_full_record=True` would return: 43 | 44 | - `Started with a kiss` 45 | 46 | This is the content of a full document. 47 | 48 | `match_full_record=False`, on the other hand, would return: 49 | 50 | - `started` 51 | - `started with a kiss` 52 | 53 | This contains both a root word starting with `start` and full documents starting with `start`. 54 | 55 | This feature is case insensitive. 56 | 57 | The `limit` argument limits the number of results returned. -------------------------------------------------------------------------------- /docs/pages/templates/spelling-correction.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Spelling Correction 4 | permalink: /spelling-correction/ 5 | --- 6 | 7 | It is recommended that you check the spelling of words before you run a query. 8 | 9 | This is because correcting the spelling of a word can improve the accuracy of your search results. 10 | 11 | ### Correcting the spelling of a single word 12 | 13 | To recommend a spelling correction for a query, use the following code: 14 | 15 | ```python 16 | index = ... 17 | 18 | suggestion = index.spelling_correction("taylr swift") 19 | ``` 20 | 21 | This will return a single suggestion. The suggestion will be the word that is most likely to be the correct spelling of the word you provided. 
22 | 23 | Spelling correction first generates segmentations of a word, like: 24 | 25 | - `t aylorswift` 26 | - `ta ylorswift` 27 | 28 | If a segmentation is valid, it is returned. 29 | 30 | For example, if the user types in `taylorswift`, one segmentation would be `taylor swift`. If `taylor swift` is common in the index, `taylor swift` will be returned as the suggestion. 31 | 32 | Spelling correction works by transforming the input query by inserting, deleting, and substituting one character in every position in a string. The transformed strings are then looked up in the index to find if they are present and, if so, how common they are. 33 | 34 | The most common suggestion is then returned. 35 | 36 | For example, if you provide the word `tayloi` and `taylor` is common in the index, the suggestion will be `taylor`. 37 | 38 | If correction was not possible after transforming one character, correction will be attempted with two transformations given the input string. 39 | 40 | If the word you provided is already spelled correctly, the suggestion will be the word you provided. If spelling correction is not possible (i.e. the word is too distant from any word in the index), the suggestion will be `None`. 41 | 42 | ### Correcting a string query 43 | 44 | If you are correcting a string query submitted with the `string_query_search()` function, spelling will be automatically corrected using the algorithm above. No configuration is required. 
-------------------------------------------------------------------------------- /docs/pages/templates/conditions/operators.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Operators 3 | layout: default 4 | permalink: /conditions/operators 5 | --- 6 | 7 | There are three operators you can use for condition matching: 8 | 9 | - `equals` 10 | - `contains` 11 | - `starts_with` 12 | 13 | Here is an example of a query that searches for documents that have the `artist` field set to `Taylor Swift`: 14 | 15 | ```python 16 | query = { 17 | "query": { 18 | "artist": { 19 | "equals": "Taylor Swift" 20 | } 21 | } 22 | } 23 | ``` 24 | 25 | These operators can be used with three query types: 26 | 27 | - `and` 28 | - `or` 29 | - `not` 30 | 31 | ### and 32 | 33 | You can also search for documents that have the `artist` field set to `Taylor Swift` and the `title` field set to `tolerate it`: 34 | 35 | ```python 36 | query = { 37 | "query": { 38 | "and": [ 39 | { 40 | "artist": { 41 | "equals": "Taylor Swift" 42 | } 43 | }, 44 | { 45 | "title": { 46 | "equals": "tolerate it" 47 | } 48 | } 49 | ] 50 | } 51 | } 52 | ``` 53 | 54 | ### or 55 | 56 | You can nest conditions to create complex queries, like: 57 | 58 | ```python 59 | query = { 60 | "query": { 61 | "or": { 62 | "and": [ 63 | {"title": {"starts_with": "tolerate"}}, 64 | {"title": {"contains": "it"}}, 65 | ], 66 | "lyric": {"contains": "kiss"}, 67 | } 68 | }, 69 | "limit": 2, 70 | "sort_by": "title", 71 | } 72 | ``` 73 | 74 | This will return a list of documents that match the query. 75 | 76 | ### not 77 | 78 | You can search for documents that do not match a query by using the `not` operator. 
Here is an example of a query that searches for lyrics that contain `sky` but not `kiss`: 79 | 80 | ```python 81 | query = { 82 | "query": { 83 | "and": { 84 | "or": [ 85 | {"lyric": {"contains": "sky", "boost": 3}}, 86 | ], 87 | "not": {"lyric": {"contains": "kiss"}}, 88 | } 89 | }, 90 | "limit": 10, 91 | "sort_by": "title", 92 | } 93 | ``` -------------------------------------------------------------------------------- /docs/pages/templates/string-queries.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: String Queries 4 | permalink: /string-query/ 5 | --- 6 | 7 | JameSQL supports string queries. String queries are single strings that use special syntax to assert the meaning of parts of a string. 8 | 9 | For example, you could use the following query to find documents where the `title` field contains `tolerate it` and any field contains `mural`: 10 | 11 |
12 | title:"tolerate it" mural
13 | 
14 | 15 | The following operators are supported: 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 |
OperatorDescription
-termSearch for documents that do not contain term.
termSearch for documents that contain term.
term1 term2Search for documents that contain term1 and term2.
'term1 term2'Search for the literal phrase term1 term2 in documents.
field:'term'Search for documents where the field field contains term (i.e. title:"tolerate it").
field^2 termBoost the score of documents where the field field matches the query term by 2.
"""Site-build hooks: table-of-contents generation and code highlighting."""

from pygments import highlight
from pygments.lexers import PythonLexer, HtmlLexer
from pygments.formatters import HtmlFormatter
from bs4 import BeautifulSoup

# Maps the fenced-code language label to a pygments lexer; "text" falls
# back to the HTML lexer.
languages = {
    "python": PythonLexer(),
    "html": HtmlLexer(),
    "text": HtmlLexer(),
}

def generate_table_of_contents(file_name, page_state, site_state):
    """Build a two-level TOC (h2 -> h3) and attach it to the page object.

    Mutates ``page_state["page"].toc`` and returns the updated
    ``page_state``.
    """
    page = BeautifulSoup(page_state["page"].contents, 'html.parser')
    h2s = page.find_all('h2')
    toc = []
    for h2 in h2s:
        toc.append({
            "text": h2.text,
            "id": h2.text.lower().replace(" ", "-"),
            "children": []
        })
        h3s = h2.find_next_siblings('h3')
        for h3 in h3s:
            # if h3 is a child of another h3, skip it
            # (i.e. only keep h3s whose nearest preceding h2 is this one)
            if h3.find_previous_sibling('h2') != h2:
                continue
            toc[-1]["children"].append({
                "text": h3.text,
                "id": h3.text.lower().replace(" ", "-"),
            })
    page_state["page"].toc = toc

    return page_state

def highlight_code(file_name, page_state, _, page_contents):
    """Syntax-highlight fenced code blocks and add heading anchors.

    Returns the transformed HTML string, the original contents for
    .txt/.xml files, or "" for pages with no <body> (bookmarks).
    """
    print(f"Checking {file_name}")
    if ".txt" in file_name or ".xml" in file_name:
        return page_contents
    print(f"Highlighting code in {file_name}")
    soup = BeautifulSoup(page_contents, 'lxml')

    for pre in soup.find_all('pre'):
        code = pre.find('code')
        # NOTE(review): the bare except silently skips blocks with no
        # <code> child, no class attribute, or an unknown language —
        # presumably intentional best-effort behaviour; confirm.
        try:
            language = code['class'][0].split("language-")[1]
            code = highlight(code.text, languages[language], HtmlFormatter())
        except:
            continue

        pre.replace_with(BeautifulSoup(code, 'html.parser'))

    css = HtmlFormatter().get_style_defs('.highlight')
    # NOTE(review): this assignment discards the generated CSS — the
    # f-string looks like it once wrapped `css` in <style> tags that were
    # stripped during extraction; verify against the repository.
    css = f""

    # this happens for bookmarks
    if not soup.find("body"):
        return ""

    body = soup.find('body')
    body.insert(0, BeautifulSoup(css, 'html.parser'))

    # get every h2 and add id= to it (anchor targets for the TOC)
    for h2 in soup.find_all('h2'):
        h2['id'] = h2.text.lower().replace(" ", "-")
    for h3 in soup.find_all('h3'):
        h3['id'] = h3.text.lower().replace(" ", "-")

    return str(soup)
searching for `tolerate ip` instead of `tolerate it`), no documents will be returned. 31 | 32 | JameSQL implements a limited form of fuzzy matching. This means that if a query contains a typo, JameSQL will still return documents that match the query. 33 | 34 | The fuzzy matching feature matches documents that contain one typo. If a document contains more than one typo, it will not be returned. A typo is an incorrectly typed character. JameSQL does not support fuzzy matching that accounts for missing or additional characters (i.e. `tolerate itt` will not match `tolerate it`). 35 | 36 | You can enable fuzzy matching by setting the `fuzzy` key to `True` in the query. Here is an example of a query that uses fuzzy matching: 37 | 38 | ```python 39 | query = { 40 | "query": { 41 | "title": { 42 | "contains": "tolerate ip", 43 | "fuzzy": True 44 | } 45 | } 46 | } 47 | ``` 48 | 49 | ### Wildcard matching 50 | 51 | You can match documents using a single wildcard character. This character is represented by an asterisk `*`. 52 | 53 | ```python 54 | query = { 55 | "query": { 56 | "title": { 57 | "contains": "tolerat* it", 58 | "fuzzy": True 59 | } 60 | } 61 | } 62 | ``` 63 | 64 | This query will look for all words that match the pattern `tolerat* it`, where the `*` character can be any single character. -------------------------------------------------------------------------------- /docs/pages/templates/script-scores.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | permalink: /script-scores/ 4 | title: Script Scores 5 | --- 6 | 7 | The script score feature lets you write custom scripts to calculate the score for each document. This is useful if you want to calculate a score based on multiple fields, including numeric fields. 8 | 9 | Script scores are applied after all documents are retrieved. 
10 | 11 | The script score feature supports the following mathematical operations: 12 | 13 | - `+` (addition) 14 | - `-` (subtraction) 15 | - `*` (multiplication) 16 | - `/` (division) 17 | - `log` (logarithm) 18 | - `decay` (timeseries decay) 19 | 20 | You can apply a script score at the top level of your query: 21 | 22 | ```python 23 | { 24 | "query": { 25 | "or": { 26 | "post": { 27 | "contains": "taylor swift", 28 | "strict": False, 29 | "boost": 1 30 | }, 31 | "title": { 32 | "contains": "desk", 33 | "strict": True, 34 | "boost": 25 35 | } 36 | } 37 | }, 38 | "limit": 4, 39 | "sort_by": "_score", 40 | "script_score": "((post + title) * 2)" 41 | } 42 | ``` 43 | 44 | The above example will calculate the score of documents by adding the score of the `post` field and the `title` field, then multiplying the result by `2`. 45 | 46 | A script score is made up of terms. A term is a field name or number (float or int), followed by an operator, followed by another term or number. Terms can be nested. 47 | 48 | All terms must be enclosed within parentheses. 49 | 50 | To compute a score that adds the `post` score to `title` and multiplies the result by `2`, use the following code: 51 | 52 | ```text 53 | ((post + title) * 2) 54 | ``` 55 | 56 | Invalid forms of this query include: 57 | 58 | - `post + title * 2` (missing parentheses) 59 | - `(post + title * 2)` (terms can only include one operator) 60 | 61 | The `decay` function lets you decay a value by `0.9 ** days_since_post / 30`. This is useful for gradually decreasing the rank for older documents as time passes. This may be particularly useful if you are working with data where you want more recent documents to be ranked higher. `decay` only works with timeseries. 62 | 63 | Here is an example of `decay` in use: 64 | 65 | ``` 66 | (_score * decay published) 67 | ``` 68 | 69 | This will apply the `decay` function to the `published` field. 
"""Tests for the autosuggest (trie prefix) feature of JameSQL."""

import json
import sys
from contextlib import ExitStack as DoesNotRaise
from typing import Optional, Tuple

import pytest
from deepdiff import DeepDiff

from jamesql import JameSQL
from jamesql.index import GSI_INDEX_STRATEGIES


# NOTE(review): pytest only honours pytest_addoption in conftest.py /
# plugins, not in test modules — this duplicate is presumably shadowed by
# tests/conftest.py; confirm and consider removing.
def pytest_addoption(parser):
    parser.addoption("--benchmark", action="store")


@pytest.fixture(scope="session")
def create_indices(request) -> Tuple[JameSQL, Optional[JameSQL]]:
    """Build a small index (always) and a large benchmark index (opt-in).

    Returns (index, large_index); large_index is None unless the
    --benchmark or --long-benchmark option is set.
    NOTE(review): --long-benchmark is queried but not registered above —
    presumably registered in tests/conftest.py; confirm.
    """
    with open("tests/fixtures/documents.json") as f:
        documents = json.load(f)

    index = JameSQL()

    for document in documents:
        index.add(document)

    index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
    index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS)

    index.enable_autosuggest("title")

    # Reload the fixture so the large index starts from pristine documents.
    with open("tests/fixtures/documents.json") as f:
        documents = json.load(f)

    if request.config.getoption("--benchmark") or request.config.getoption(
        "--long-benchmark"
    ):
        large_index = JameSQL()

        # 100,000x duplication; --long-benchmark additionally inflates
        # each title tenfold to stress longer documents.
        for document in documents * 100000:
            if request.config.getoption("--long-benchmark"):
                document = document.copy()
                document["title"] = "".join(
                    [
                        word + " "
                        for word in document["title"].split()
                        for _ in range(10)
                    ]
                )
            large_index.add(document)

        large_index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
        large_index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS)

        large_index.enable_autosuggest("title")
    else:
        large_index = None

    return index, large_index


@pytest.mark.parametrize(
    "query, suggestion",
    [
        ("tolerat", "tolerate"),
        ("toler", "tolerate"),
        ("th", "the"),
        ("b", "bolter"),
        ("he", ""),  # not in index; part of another word
        ("cod", ""),  # not in index
    ],
)
def test_autosuggest(create_indices, query, suggestion):
    """The first autosuggest result for each prefix matches expectations.

    An empty expected suggestion means "no assertion is made" for that
    prefix (the query should find nothing useful).
    """
    index = create_indices[0]
    large_index = create_indices[1]

    if suggestion != "":
        assert index.autosuggest(query)[0] == suggestion

    if large_index and suggestion != "":
        assert large_index.autosuggest(query)[0] == suggestion
"""Tests that JameSQL infers the right GSI strategy for each data type."""

import json

import pytest

from jamesql import JameSQL
from jamesql.index import GSI_INDEX_STRATEGIES


# NOTE(review): pytest only honours pytest_addoption in conftest.py /
# plugins, not in test modules — presumably shadowed by tests/conftest.py;
# confirm and consider removing.
def pytest_addoption(parser):
    parser.addoption("--benchmark", action="store")


@pytest.mark.timeout(20)
def test_gsi_type_inference(request):
    """Adding varied-type documents auto-creates GSIs with inferred strategies.

    Text fields -> CONTAINS, numbers -> NUMERIC, booleans -> FLAT,
    nested objects -> NOT_INDEXABLE, timestamps -> DATE. The same checks
    are repeated on a 100,000x index when --benchmark/--long-benchmark
    is set (NOTE(review): --long-benchmark is not registered above —
    presumably registered in tests/conftest.py; confirm).
    """
    with open("tests/fixtures/documents_with_varied_data_types.json") as f:
        documents = json.load(f)

    index = JameSQL()

    for document in documents:
        index.add(document)

    # check gsi type
    assert index.gsis["title"]["strategy"] == GSI_INDEX_STRATEGIES.CONTAINS.name
    assert index.gsis["lyric"]["strategy"] == GSI_INDEX_STRATEGIES.CONTAINS.name
    assert index.gsis["listens"]["strategy"] == GSI_INDEX_STRATEGIES.NUMERIC.name
    assert index.gsis["album_in_stock"]["strategy"] == GSI_INDEX_STRATEGIES.FLAT.name
    assert index.gsis["rating"]["strategy"] == GSI_INDEX_STRATEGIES.NUMERIC.name
    assert index.gsis["metadata"]["strategy"] == GSI_INDEX_STRATEGIES.NOT_INDEXABLE.name
    assert (
        index.gsis["record_last_updated"]["strategy"] == GSI_INDEX_STRATEGIES.DATE.name
    )

    # Reload the fixture so the large index starts from pristine documents.
    with open("tests/fixtures/documents_with_varied_data_types.json") as f:
        documents = json.load(f)

    if request.config.getoption("--benchmark") or request.config.getoption(
        "--long-benchmark"
    ):
        large_index = JameSQL()

        # 100,000x duplication; --long-benchmark additionally inflates
        # each title tenfold to stress longer documents.
        for document in documents * 100000:
            if request.config.getoption("--long-benchmark"):
                document = document.copy()
                document["title"] = "".join(
                    [
                        word + " "
                        for word in document["title"].split()
                        for _ in range(10)
                    ]
                )
            large_index.add(document)

        assert (
            large_index.gsis["title"]["strategy"] == GSI_INDEX_STRATEGIES.CONTAINS.name
        )
        assert (
            large_index.gsis["lyric"]["strategy"] == GSI_INDEX_STRATEGIES.CONTAINS.name
        )
        assert (
            large_index.gsis["listens"]["strategy"] == GSI_INDEX_STRATEGIES.NUMERIC.name
        )
        assert (
            large_index.gsis["album_in_stock"]["strategy"]
            == GSI_INDEX_STRATEGIES.FLAT.name
        )
        assert (
            large_index.gsis["rating"]["strategy"] == GSI_INDEX_STRATEGIES.NUMERIC.name
        )
        assert (
            large_index.gsis["metadata"]["strategy"]
            == GSI_INDEX_STRATEGIES.NOT_INDEXABLE.name
        )
        assert (
            large_index.gsis["record_last_updated"]["strategy"]
            == GSI_INDEX_STRATEGIES.DATE.name
        )
memory and on disk. 8 | 9 | When you call the `add()` method, the document is appended to an `index.jamesql` file in the directory in which your program is running. This file is serialized as JSONL. 10 | 11 | When you load an index, all entries in the `index.jamesql` file will be read back into memory. 12 | 13 | _Note: You will need to manually reconstruct your indices using the `create_gsi()` method after loading an index._ 14 | 15 | ## Data Consistency 16 | 17 | When you call `add()`, a `journal.jamesql` file is created. This is used to store the contents of the `add()` operation you are executing. If JameSQL terminates during an `add()` call for any reason (i.e. system crash, program termination), this journal will be used to reconcile the database. 18 | 19 | Next time you initialize a JameSQL instance, your documents in `index.jamesql` will be read into memory. Then, the transactions in `journal.jamesql` will be replayed to ensure the index is consistent. Finally, the `journal.jamesql` file will be deleted. 20 | 21 | You can access the JSON of the last transaction issued, sans the `uuid`, by calling `index.last_transaction`. 22 | 23 | If you were in the middle of ingesting data, this could be used to resume the ingestion process from where you left off by allowing you to skip records that were already ingested. 24 | 25 | ## Reducing Precision for Large Results Pages 26 | 27 | By default, JameSQL assigns scores to the top 1,000 documents in each clause in a query. Consider the following query; 28 | 29 |

30 | query = {
31 |     "query": {
32 |         "and": [
33 |             {
34 |                 "artist": {
35 |                     "equals": "Taylor Swift"
36 |                 }
37 |             },
38 |             {
39 |                 "title": {
40 |                     "equals": "tolerate it"
41 |                 }
42 |             }
43 |         ]
44 |     },
45 |     "limit": 10
46 | }
47 | 
48 | 49 | The `{ "artist": { "equals": "Taylor Swift" } }` clause will return the top 1,000 documents that match the query. The `{ "title": { "equals": "tolerate it" } }` clause will return the top 1,000 documents that match the query. 50 | 51 | These will then be combine and sorted to return the 10 documents of the 2,000 processed that have the highest score. 52 | 53 | This means that if you have a large number of documents that match a query, you may not get precisely the most relevant documents in the top 10 results, rather an approximation of the most relevant documents. 54 | 55 | You can override the number of documents to consider with: 56 | 57 |

58 | index.match_limit_for_large_result_pages = 10_000
59 | 
60 | 61 | The higher this number, the longer it will take to process results with a large number of matching documents. -------------------------------------------------------------------------------- /tests/script_lang.py: -------------------------------------------------------------------------------- 1 | import json 2 | from contextlib import ExitStack as DoesNotRaise 3 | 4 | import pytest 5 | from lark import Lark 6 | from pytest import raises 7 | 8 | from jamesql import JameSQL 9 | from jamesql.script_lang import JameSQLScriptTransformer, grammar 10 | 11 | 12 | @pytest.fixture 13 | def document_to_test(): 14 | with open("tests/fixtures/documents.json") as f: 15 | documents = json.load(f) 16 | 17 | documents[0]["_score"] = 7.52 18 | documents[0]["listens"] = 2000 19 | 20 | return documents[0] 21 | 22 | 23 | @pytest.fixture 24 | def script_score_parser(): 25 | return Lark(grammar) 26 | 27 | 28 | @pytest.mark.parametrize( 29 | "query, result, raises_exception", 30 | [ 31 | ( 32 | "(_score + 1)", 33 | 8.52, 34 | DoesNotRaise(), 35 | ), 36 | ( 37 | "(_score * 2)", 38 | 15.04, 39 | DoesNotRaise(), 40 | ), 41 | ( 42 | "(_score / 2)", 43 | 3.76, 44 | DoesNotRaise(), 45 | ), 46 | ( 47 | "(_score - 2)", 48 | 5.52, 49 | DoesNotRaise(), 50 | ), 51 | ( 52 | "((_score + 1) * 2)", 53 | 17.04, 54 | DoesNotRaise(), 55 | ), 56 | ( 57 | "(((_score + 1) * 2) + 1)", 58 | 18.04, 59 | DoesNotRaise(), 60 | ), 61 | ( 62 | "(_score + _score)", 63 | 15.04, 64 | DoesNotRaise(), 65 | ), 66 | ( 67 | "((_score + _score) + _score)", 68 | 22.56, 69 | DoesNotRaise(), 70 | ), 71 | ( 72 | "(_score * listens)", 73 | 15040, 74 | DoesNotRaise(), 75 | ), 76 | ( 77 | "log ((_score * listens))", 78 | 9.618475246417898, 79 | DoesNotRaise(), 80 | ), 81 | ( 82 | "log (((_score * listens) + 1))", 83 | 9.618541733127229, 84 | DoesNotRaise(), 85 | ), 86 | ( 87 | "_score + 1", 88 | 0, 89 | raises(Exception), # missing parenthesis 90 | ), 91 | ( 92 | "(_score + 1", 93 | 0, 94 | raises(Exception), # missing 
closing parenthesis 95 | ), 96 | ( 97 | "(_score + 1))", 98 | 0, 99 | raises(Exception), # additional closing parenthesis 100 | ), 101 | ], 102 | ) 103 | def test_script_score( 104 | document_to_test, script_score_parser, query, result, raises_exception 105 | ): 106 | with raises_exception: 107 | tree = script_score_parser.parse(query) 108 | 109 | transformer = JameSQLScriptTransformer(document_to_test) 110 | 111 | assert transformer.transform(tree) == result 112 | -------------------------------------------------------------------------------- /docs/pages/templates/create.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | permalink: /create 4 | title: Create an Index 5 | --- 6 | 7 | JameSQL supports several index types. 8 | 9 | To achieve the best performance, you should carefully choose the index type to use for each field in your data. 10 | 11 | If you don't choose an index, JameSQL will automatically create an index for you when you run a query on a field for the first time. This is inferred from the types of data in the first record you add. 12 | 13 | ## Set an Index Strategy 14 | 15 | To create an index, use the following code: 16 | 17 |

18 | index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.PREFIX)
19 | 
20 | 21 | See the table below for a list of available index strategies. 22 | 23 | ## Indexing strategies 24 | 25 | The following index strategies are available: 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 39 | 42 | 43 | 44 | 47 | 50 | 51 | 52 | 55 | 58 | 59 | 60 | 63 | 66 | 67 | 68 | 71 | 74 | 75 | 76 | 79 | 82 | 83 | 84 |
Index StrategyDescription
37 | GSI_INDEX_STRATEGIES.CONTAINS 38 | 40 | Creates a reverse index for the field. This is useful for fields that contain longer strings (i.e. body text in a blog post). TF-IDF is used to search fields structured with the CONTAINS type. 41 |
45 | GSI_INDEX_STRATEGIES.NUMERIC 46 | 48 | Creates several buckets to allow for efficient search of numeric values, especially values with high cardinality. 49 |
53 | GSI_INDEX_STRATEGIES.FLAT 54 | 56 | Stores the field as the data type it is. A flat index is created of values that are not strings or numbers. This is the default. For example, if you are indexing document titles and don't need to do a starts_with query, you may choose a flat index to allow for efficient equals and contains queries. 57 |
61 | GSI_INDEX_STRATEGIES.PREFIX 62 | 64 | Creates a trie index for the field. This is useful for fields that contain short strings (i.e. titles). 65 |
69 | GSI_INDEX_STRATEGIES.CATEGORICAL 70 | 72 | Creates a categorical index for the field. This is useful for fields that contain specific categories (i.e. genres). 73 |
77 | GSI_INDEX_STRATEGIES.TRIGRAM_CODE 78 | 80 | Creates a character-level trigram index for the field. This is useful for efficient code search. See the "Code Search" documentation later in this README for more information about using code search with JameSQL. 81 |
-------------------------------------------------------------------------------- /docs/pages/templates/search.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: Search for Documents 4 | permalink: /search/ 5 | --- 6 | 7 | There are two ways you can run a search: 8 | 9 | - Using a natural language query with JameSQL operators, or; 10 | - Using a JSON DSL. 11 | 12 | ## Using the JSON DSL 13 | 14 | A query has the following format: 15 | 16 |

 17 | {
 18 |     "query": {
 19 |         "field": "value"
 20 |     },
 21 |     "limit": 10,
 22 |     "sort_by": "field",
 23 |     "skip": 0
 24 | }
 25 | 
26 | 27 | - `query` is a dictionary that contains the fields to search for. 28 | - `limit` is the maximum number of documents to return. (default 10) 29 | - `sort_by` is the field to sort by. (default None) 30 | - `skip` is the number of documents to skip. This is useful for implementing pagination. (default 0) 31 | 32 | `limit`, `sort_by`, and `skip` are optional. 33 | 34 | Within the `query` key you can query for documents that match one or more conditions. 35 | 36 | An empty query returns no documents. 37 | 38 | ### Running a search 39 | 40 | To search for documents that match a query, use the following code: 41 | 42 |

 43 | result = index.search(query)
 44 | 
45 | 46 | This returns a JSON payload with the following structure: 47 |

 48 | 
 49 | {
 50 |     "documents": [
 51 |         {"uuid": "1", ...}
 52 |         {"uuid": "2", ...}
 53 |         ...
 54 |     ],
 55 |     "query_time": 0.0001,
 56 |     "total_results": 200
 57 | }
 58 | 
59 | 60 | You can search through multiple pages with the `scroll()` method: 61 | 62 |

 63 | result = index.scroll(query)
 64 | 
65 | 66 | `scroll()` returns a generator that yields documents in the same format as `search()`. 67 | 68 | ## Retrieve All Documents 69 | 70 | You can retrieve all documents by using a catch-all query, which uses the following syntax: 71 | 72 |

 73 | {
 74 |     "query": "*",
 75 |     "limit": 2,
 76 |     "sort_by": "song",
 77 |     "skip": 1
 78 | }
 79 | 
80 | 81 | This is useful if you want to page through documents. You should supply a `sort_by` field to ensure the order of documents is consistent. 82 | 83 | ### Response 84 | 85 | All valid queries return responses in the following form: 86 | 87 |

 88 | {
 89 |     "documents": [
 90 |         {"uuid": "1", "title": "test", "artist": "..."},
 91 |         {"uuid": "2", "title": "test", "artist": "..."},
 92 |         ...
 93 |     ],
 94 |     "query_time": 0.0001,
 95 |     "total_results": 200
 96 | }
 97 | 
98 | 99 | `documents` is a list of documents that match the query. `query_time` is the amount of time it took to execute the query. `total_results` is the total number of documents that match the query before applying any `limit`. 100 | 101 | `total_results` is useful for implementing pagination. 102 | 103 | If an error was encountered, the response will be in the following form: 104 | 105 |

106 | {
107 |     "documents": [],
108 |     "query_time": 0.0001,
109 |     "error": "Invalid query"
110 | }
111 | 
112 | 113 | The `error` key contains a message describing the exact error encountered. -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | -------------------------------------------------------------------------------- /tests/code_search.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | from contextlib import ExitStack 
as DoesNotRaise 5 | 6 | import pytest 7 | from deepdiff import DeepDiff 8 | 9 | from jamesql import JameSQL 10 | from jamesql.index import GSI_INDEX_STRATEGIES 11 | 12 | CODE_BASE_DIR = "tests/fixtures/code" 13 | 14 | 15 | def pytest_addoption(parser): 16 | parser.addoption("--benchmark", action="store") 17 | 18 | 19 | @pytest.fixture(scope="session") 20 | def create_indices(request): 21 | # open all files in code/* 22 | documents = [] 23 | 24 | for file in os.listdir("tests/fixtures/code"): 25 | with open(os.path.join("tests/fixtures/code", file)) as f: 26 | documents.append({"file_name": file, "code": f.read()}) 27 | 28 | index = JameSQL() 29 | 30 | index.create_gsi("file_name", strategy=GSI_INDEX_STRATEGIES.PREFIX) 31 | index.create_gsi("code", strategy=GSI_INDEX_STRATEGIES.TRIGRAM_CODE) 32 | 33 | for document in documents: 34 | index.add(document) 35 | 36 | if request.config.getoption("--benchmark") or request.config.getoption( 37 | "--long-benchmark" 38 | ): 39 | large_index = JameSQL() 40 | 41 | for document in documents * 100000: 42 | if request.config.getoption("--long-benchmark"): 43 | document = document.copy() 44 | 45 | large_index.add(document) 46 | 47 | large_index.create_gsi("file_name", strategy=GSI_INDEX_STRATEGIES.PREFIX) 48 | large_index.create_gsi("code", strategy=GSI_INDEX_STRATEGIES.TRIGRAM_CODE) 49 | else: 50 | large_index = None 51 | 52 | return index, large_index 53 | 54 | 55 | @pytest.mark.parametrize( 56 | "query, number_of_documents_expected, top_result_value, raises_exception", 57 | [ 58 | ( 59 | {"query": {"and": [{"code": {"contains": "def"}}]}, "limit": 10}, 60 | 3, 61 | "index.py", 62 | DoesNotRaise(), 63 | ), # test code search for valid query 64 | ( 65 | {"query": {"and": [{"code": {"contains": "ef "}}]}, "limit": 10}, 66 | 3, 67 | "index.py", 68 | DoesNotRaise(), 69 | ), # test code search for valid query with space 70 | ( 71 | {"query": {"and": [{"code": {"contains": "banana"}}]}, "limit": 10}, 72 | 0, 73 | "", 74 | 
DoesNotRaise(), 75 | ), # test code search with toekn not in documents 76 | ( 77 | {"query": {"and": [{"code": {"contains": "return "}}]}, "limit": 10}, 78 | 3, 79 | "index.py", 80 | DoesNotRaise(), 81 | ), # test code search with > 3 char token 82 | ], 83 | ) 84 | @pytest.mark.timeout(20) 85 | def test_code_search( 86 | create_indices, 87 | query, 88 | number_of_documents_expected, 89 | top_result_value, 90 | raises_exception, 91 | ): 92 | with raises_exception: 93 | index, large_index = create_indices 94 | 95 | response = index.search(query) 96 | 97 | # sort response by documents[0]["title"] to make it easier to compare 98 | response["documents"] = sorted( 99 | response["documents"], key=lambda x: x["file_name"] 100 | ) 101 | 102 | assert len(response["documents"]) == number_of_documents_expected 103 | 104 | if number_of_documents_expected > 0: 105 | assert response["documents"][0]["file_name"] == top_result_value 106 | 107 | assert float(response["query_time"]) < 0.06 108 | 109 | # run if --benchmark is passed 110 | if "--benchmark" in sys.argv: 111 | response = large_index.search(query) 112 | 113 | assert float(response["query_time"]) < 0.06 114 | -------------------------------------------------------------------------------- /tests/highlight.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | from contextlib import ExitStack as DoesNotRaise 4 | 5 | import pytest 6 | from deepdiff import DeepDiff 7 | 8 | from jamesql import JameSQL 9 | from jamesql.index import GSI_INDEX_STRATEGIES 10 | 11 | 12 | def pytest_addoption(parser): 13 | parser.addoption("--benchmark", action="store") 14 | 15 | 16 | @pytest.fixture(scope="session") 17 | def create_indices(request): 18 | with open("tests/fixtures/documents_with_numeric_values.json") as f: 19 | documents = json.load(f) 20 | 21 | index = JameSQL() 22 | 23 | for document in documents: 24 | index.add(document) 25 | 26 | with 
open("tests/fixtures/documents_with_numeric_values.json") as f: 27 | documents = json.load(f) 28 | 29 | index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS) 30 | index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS) 31 | index.create_gsi("listens", strategy=GSI_INDEX_STRATEGIES.NUMERIC) 32 | 33 | if request.config.getoption("--benchmark") or request.config.getoption( 34 | "--long-benchmark" 35 | ): 36 | large_index = JameSQL() 37 | 38 | for document in documents * 100000: 39 | if request.config.getoption("--long-benchmark"): 40 | document = document.copy() 41 | document["title"] = "".join( 42 | [ 43 | word + " " 44 | for word in document["title"].split() 45 | for _ in range(10) 46 | ] 47 | ) 48 | large_index.add(document) 49 | 50 | large_index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS) 51 | large_index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS) 52 | large_index.create_gsi("listens", strategy=GSI_INDEX_STRATEGIES.NUMERIC) 53 | else: 54 | large_index = None 55 | 56 | return index, large_index 57 | 58 | 59 | @pytest.mark.parametrize( 60 | "query, highlights, number_of_documents_expected, top_result_value, raises_exception", 61 | [ 62 | ( 63 | { 64 | "query": { 65 | "and": [ 66 | { 67 | "lyric": { 68 | "contains": "kiss", 69 | "highlight": "lyric", 70 | "strict": True, 71 | } 72 | } 73 | ] 74 | }, 75 | "limit": 10, 76 | "sort_by": "title", 77 | }, 78 | [["Started with a kiss"]], 79 | 1, 80 | "The Bolter", 81 | DoesNotRaise(), 82 | ), # test range query 83 | ], 84 | ) 85 | @pytest.mark.timeout(20) 86 | def test_search( 87 | create_indices, 88 | query, 89 | highlights, 90 | number_of_documents_expected, 91 | top_result_value, 92 | raises_exception, 93 | ): 94 | with raises_exception: 95 | index, large_index = create_indices 96 | 97 | response = index.search(query) 98 | 99 | assert len(response["documents"]) == number_of_documents_expected 100 | 101 | for actual_context, expected_context in 
zip(response["documents"], highlights): 102 | assert actual_context["_context"] == expected_context 103 | 104 | if number_of_documents_expected > 0: 105 | assert response["documents"][0]["title"] == top_result_value 106 | 107 | assert float(response["query_time"]) < 0.06 108 | 109 | # run if --benchmark is passed 110 | if "--benchmark" in sys.argv: 111 | response = large_index.search(query) 112 | 113 | assert float(response["query_time"]) < 0.06 114 | -------------------------------------------------------------------------------- /web/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | JameSQL Preview 7 | 8 | 9 | 10 | 71 | 72 | 73 |
74 |
75 |
76 |

JameSQL

77 | 78 |
79 |
{}
80 |
81 |
82 |
83 |

 

84 | 85 |
86 |
{}
87 |
88 | 117 |
118 | 119 | -------------------------------------------------------------------------------- /tests/data_types.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | from contextlib import ExitStack as DoesNotRaise 4 | 5 | import pytest 6 | from deepdiff import DeepDiff 7 | 8 | from jamesql import JameSQL 9 | from jamesql.index import GSI_INDEX_STRATEGIES 10 | 11 | 12 | def pytest_addoption(parser): 13 | parser.addoption("--benchmark", action="store") 14 | 15 | 16 | @pytest.fixture(scope="session") 17 | def create_indices(request): 18 | with open("tests/fixtures/documents_with_varied_data_types.json") as f: 19 | documents = json.load(f) 20 | 21 | index = JameSQL() 22 | 23 | for document in documents: 24 | index.add(document) 25 | 26 | with open("tests/fixtures/documents_with_varied_data_types.json") as f: 27 | documents = json.load(f) 28 | 29 | if request.config.getoption("--benchmark") or request.config.getoption( 30 | "--long-benchmark" 31 | ): 32 | large_index = JameSQL() 33 | 34 | for document in documents * 100000: 35 | if request.config.getoption("--long-benchmark"): 36 | document = document.copy() 37 | document["title"] = "".join( 38 | [ 39 | word + " " 40 | for word in document["title"].split() 41 | for _ in range(10) 42 | ] 43 | ) 44 | large_index.add(document) 45 | else: 46 | large_index = None 47 | 48 | return index, large_index 49 | 50 | 51 | @pytest.mark.parametrize( 52 | "query, number_of_documents_expected, top_result_value, raises_exception", 53 | [ 54 | ( 55 | { 56 | "query": { 57 | "album_in_stock": {"equals": True}, 58 | }, 59 | "limit": 2, 60 | "sort_by": "title", 61 | }, 62 | 2, 63 | "tolerate it", 64 | DoesNotRaise(), 65 | ), # test equals with boolean 66 | ( 67 | { 68 | "query": { 69 | "rating": {"greater_than": 4.8}, 70 | }, 71 | "limit": 2, 72 | "sort_by": "title", 73 | }, 74 | 1, 75 | "The Bolter", 76 | DoesNotRaise(), 77 | ), # test greater than with floating point 78 | ( 79 
| { 80 | "query": { 81 | "metadata": {"contains": "version"}, 82 | }, 83 | "limit": 2, 84 | "sort_by": "title", 85 | }, 86 | 0, 87 | "", 88 | DoesNotRaise(), 89 | ), # dictionaries are not indexable, so this will return a 0 result 90 | ( 91 | { 92 | "query": { 93 | "record_last_updated": {"greater_than": "2024-03-01"}, 94 | }, 95 | "limit": 2, 96 | "sort_by": "title", 97 | }, 98 | 1, 99 | "The Bolter", 100 | DoesNotRaise(), 101 | ), # test greater than with date 102 | ( 103 | { 104 | "query": { 105 | "record_last_updated": {"less_than": "2024-03-01"}, 106 | }, 107 | "limit": 2, 108 | "sort_by": "title", 109 | }, 110 | 2, 111 | "tolerate it", 112 | DoesNotRaise(), 113 | ), # test greater than with date 114 | ], 115 | ) 116 | @pytest.mark.timeout(20) 117 | def test_search( 118 | create_indices, 119 | query, 120 | number_of_documents_expected, 121 | top_result_value, 122 | raises_exception, 123 | ): 124 | with raises_exception: 125 | index, large_index = create_indices 126 | 127 | response = index.search(query) 128 | 129 | assert len(response["documents"]) == number_of_documents_expected 130 | 131 | if number_of_documents_expected > 0: 132 | assert response["documents"][0]["title"] == top_result_value 133 | 134 | assert float(response["query_time"]) < 0.06 135 | 136 | # run if --benchmark is passed 137 | if "--benchmark" in sys.argv: 138 | response = large_index.search(query) 139 | 140 | assert float(response["query_time"]) < 0.06 141 | -------------------------------------------------------------------------------- /web/web.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request, render_template, send_from_directory 2 | from jamesql import JameSQL 3 | from jamesql.index import GSI_INDEX_STRATEGIES 4 | import json 5 | from tqdm import tqdm 6 | from datetime import datetime 7 | import requests 8 | import time 9 | 10 | import os 11 | import pyromark 12 | import frontmatter 13 | from bs4 import 
BeautifulSoup 14 | 15 | app = Flask(__name__) 16 | 17 | index = JameSQL() 18 | 19 | link_graph = {} 20 | records = [] 21 | 22 | blog_posts = os.listdir("../../pages/posts") 23 | 24 | for post_name in blog_posts: 25 | with open(f"../../pages/posts/{post_name}") as f: 26 | post = frontmatter.load(f) 27 | category = post.get("categories", [])[0] 28 | description = "
".join(post.content.split("\n")[:2]) 29 | post["description"] = description 30 | 31 | post[ 32 | "published" 33 | ] = f"{post_name.split('-')[0]}-{post_name.split('-')[1]}-{post_name.split('-')[2]}" 34 | post[ 35 | "url" 36 | ] = f"https://jamesg.blog/{post_name.split('-')[0]}/{post_name.split('-')[1]}/{post_name.split('-')[2]}/{'-'.join(post_name.split('-')[3:]).replace('.md', '').strip('/')}" 37 | 38 | # stem the content 39 | # post.content = " ".join([stemmer.stem(word) for word in post.content.split()]) 40 | # post["title"] = " ".join([stemmer.stem(word) for word in post["title"].split()]) 41 | # parse markdown 42 | # post.content = pyromark.markdown(post.content) 43 | # exit() 44 | links = BeautifulSoup(pyromark.markdown(post.content), "html.parser").find_all( 45 | "a" 46 | ) 47 | 48 | links = [link.get("href") for link in links] 49 | 50 | for link in links: 51 | if not link: 52 | continue 53 | 54 | # if link starts with /, add jamesg.blog 55 | if link.startswith("/"): 56 | link = f"https://jamesg.blog{link}" 57 | 58 | link = link.rstrip("/") 59 | 60 | if link not in link_graph: 61 | link_graph[link] = [] 62 | 63 | link_graph[link].append(post["url"].strip("/")) 64 | 65 | html = pyromark.markdown(post["description"]) 66 | 67 | if post.content and post["title"]: 68 | records.append( 69 | { 70 | "title": post["title"], 71 | "title_lower": post["title"].lower(), 72 | "post": post.content.lower(), 73 | "category": category, 74 | "description": html, 75 | "published": datetime.strptime(post["published"], "%Y-%m-%d"), 76 | "url": post["url"], 77 | "type": "blog", 78 | } 79 | ) 80 | 81 | for record in records: 82 | record["inlinks"] = len(link_graph.get(record["url"], [])) 83 | index.add(record) 84 | 85 | index.create_gsi("title_lower", strategy=GSI_INDEX_STRATEGIES.CONTAINS) 86 | index.create_gsi("post", strategy=GSI_INDEX_STRATEGIES.CONTAINS) 87 | 88 | 89 | @app.route("/", methods=["GET", "POST"]) 90 | def search(): 91 | field_names = index.gsis 92 | 93 | 
field_names_to_index_types = { 94 | name: index.gsis[name]["strategy"] for name in field_names.keys() 95 | } 96 | if request.method == "POST": 97 | query = request.json 98 | if query["type"] == "string_query": 99 | query_parsed = index._compute_string_query( 100 | query["raw_query"], query_keys=query["fields"], boosts=query["boosts"] 101 | ) 102 | query_parsed["query_score"] = query["query_score"] 103 | query_parsed["sort_by"] = "_score" 104 | result = index.search(query_parsed) 105 | else: 106 | result = index.search(query) 107 | return result 108 | 109 | return render_template("search.html", field_names=field_names_to_index_types) 110 | 111 | 112 | @app.route("/json", methods=["GET", "POST"]) 113 | def json_search(): 114 | if request.method == "POST": 115 | query = request.json 116 | result = index.search(query) 117 | return result 118 | 119 | return render_template("index.html") 120 | 121 | 122 | # serve ./ace-builds 123 | @app.route("/ace-builds/") 124 | def ace(path): 125 | return send_from_directory("ace-builds", path) 126 | 127 | 128 | if __name__ == "__main__": 129 | app.run(debug=True) 130 | -------------------------------------------------------------------------------- /tests/aggregation.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | from contextlib import ExitStack as DoesNotRaise 4 | 5 | import pytest 6 | from deepdiff import DeepDiff 7 | 8 | from jamesql import JameSQL 9 | from jamesql.index import GSI_INDEX_STRATEGIES 10 | 11 | 12 | def pytest_addoption(parser): 13 | parser.addoption("--benchmark", action="store") 14 | 15 | 16 | @pytest.fixture(scope="session") 17 | def create_indices(request): 18 | with open("tests/fixtures/documents.json") as f: 19 | documents = json.load(f) 20 | 21 | index = JameSQL() 22 | 23 | for document in documents: 24 | index.add(document) 25 | 26 | index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS) 27 | index.create_gsi("lyric", 
strategy=GSI_INDEX_STRATEGIES.CONTAINS) 28 | 29 | with open("tests/fixtures/documents.json") as f: 30 | documents = json.load(f) 31 | 32 | if request.config.getoption("--benchmark") or request.config.getoption( 33 | "--long-benchmark" 34 | ): 35 | large_index = JameSQL() 36 | 37 | for document in documents * 100000: 38 | if request.config.getoption("--long-benchmark"): 39 | document = document.copy() 40 | document["title"] = "".join( 41 | [ 42 | word + " " 43 | for word in document["title"].split() 44 | for _ in range(10) 45 | ] 46 | ) 47 | large_index.add(document) 48 | 49 | large_index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS) 50 | large_index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS) 51 | else: 52 | large_index = None 53 | 54 | return index, large_index 55 | 56 | 57 | @pytest.mark.parametrize( 58 | "query, introspection_results, number_of_documents_expected, top_result_value, raises_exception", 59 | [ 60 | ( 61 | { 62 | "query": { 63 | "and": [ 64 | { 65 | "lyric": { 66 | "contains": "my", 67 | } 68 | }, 69 | ] 70 | }, 71 | "metrics": ["aggregate"], 72 | "limit": 10, 73 | "sort_by": "title", 74 | }, 75 | {"unique_record_values": {"title": 1, "lyric": 1}}, 76 | 1, 77 | "tolerate it", 78 | DoesNotRaise(), 79 | ), # test query with introspection 80 | ( 81 | { 82 | "query": {}, 83 | "metrics": ["aggregate"], 84 | "limit": 10, 85 | "sort_by": "title", 86 | }, 87 | {}, 88 | 0, 89 | "", 90 | DoesNotRaise(), 91 | ), # test blank query with introspection 92 | ( 93 | { 94 | "query": "*", 95 | "metrics": ["aggregate"], 96 | "limit": 10, 97 | "sort_by": "title", 98 | }, 99 | {"unique_record_values": {"title": 3, "lyric": 3}}, 100 | 3, 101 | "tolerate it", 102 | DoesNotRaise(), 103 | ), # test all (*) query with introspection 104 | ], 105 | ) 106 | @pytest.mark.timeout(20) 107 | def test_search( 108 | create_indices, 109 | query, 110 | introspection_results, 111 | number_of_documents_expected, 112 | top_result_value, 113 | 
raises_exception, 114 | ): 115 | with raises_exception: 116 | index, large_index = create_indices 117 | 118 | response = index.search(query) 119 | 120 | assert len(response["documents"]) == number_of_documents_expected 121 | 122 | # allow items to be in different orders; order doesn't matter 123 | result = DeepDiff( 124 | response.get("metrics", {}), introspection_results, ignore_order=True 125 | ) 126 | 127 | assert result == {} 128 | 129 | if number_of_documents_expected > 0: 130 | assert response["documents"][0]["title"] == top_result_value 131 | 132 | assert float(response["query_time"]) < 0.06 133 | 134 | # run if --benchmark is passed 135 | if "--benchmark" in sys.argv: 136 | response = large_index.search(query) 137 | 138 | assert float(response["query_time"]) < 0.06 139 | -------------------------------------------------------------------------------- /tests/range_queries.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | from contextlib import ExitStack as DoesNotRaise 4 | 5 | import pytest 6 | 7 | from jamesql import JameSQL 8 | from jamesql.index import GSI_INDEX_STRATEGIES 9 | 10 | 11 | def pytest_addoption(parser): 12 | parser.addoption("--benchmark", action="store") 13 | 14 | 15 | @pytest.fixture(scope="session") 16 | def create_indices(request): 17 | with open("tests/fixtures/documents_with_numeric_values.json") as f: 18 | documents = json.load(f) 19 | 20 | index = JameSQL() 21 | 22 | for document in documents: 23 | index.add(document) 24 | 25 | with open("tests/fixtures/documents_with_numeric_values.json") as f: 26 | documents = json.load(f) 27 | 28 | index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS) 29 | index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS) 30 | index.create_gsi("listens", strategy=GSI_INDEX_STRATEGIES.NUMERIC) 31 | 32 | if request.config.getoption("--benchmark") or request.config.getoption( 33 | "--long-benchmark" 34 | ): 35 | 
@pytest.mark.parametrize(
    "query, number_of_documents_expected, top_result_value, raises_exception",
    [
        (
            {
                "query": {"and": [{"listens": {"range": [200, 300]}}]},
                "limit": 10,
                "sort_by": "title",
            },
            2,
            "my tears ricochet",
            DoesNotRaise(),
        ),  # range fully inside the data
        (
            {
                "query": {"and": [{"listens": {"range": [0, 300]}}]},
                "limit": 10,
                "sort_by": "title",
            },
            3,
            "tolerate it",
            DoesNotRaise(),
        ),  # range covering every document
        (
            {
                "query": {"and": [{"listens": {"range": [300, 300]}}]},
                "limit": 10,
                "sort_by": "title",
            },
            1,
            "The Bolter",
            DoesNotRaise(),
        ),  # degenerate range matching a single value
        (
            {
                "query": {"and": [{"listens": {"range": [0, 0]}}]},
                "limit": 10,
                "sort_by": "title",
            },
            0,
            "",
            DoesNotRaise(),
        ),  # range matching nothing
    ],
)
@pytest.mark.timeout(20)
def test_search(
    create_indices,
    query,
    number_of_documents_expected,
    top_result_value,
    raises_exception,
):
    """Run each range query against the small index and check the hits.

    When ``--benchmark`` is passed, the same query is repeated against the
    much larger index and only its query time is checked.
    """
    with raises_exception:
        index, large_index = create_indices

        result = index.search(query)
        documents = result["documents"]

        assert len(documents) == number_of_documents_expected

        if number_of_documents_expected > 0:
            assert documents[0]["title"] == top_result_value

        assert float(result["query_time"]) < 0.06

        # benchmark mode: repeat against the large index
        if "--benchmark" in sys.argv:
            result = large_index.search(query)

            assert float(result["query_time"]) < 0.06
@pytest.mark.parametrize(
    "query, simplified_form, number_of_documents_expected, top_result_value, raises_exception",
    [
        (
            "sky -sky",
            "",
            0,
            "",
            DoesNotRaise(),
        ),  # negation cancels the term entirely
        (
            "100 100",
            "100",
            0,
            "",
            DoesNotRaise(),
        ),  # duplicate numeric term collapses
        (
            "screaming -sky",
            "screaming -sky",
            0,
            "",
            DoesNotRaise(),
        ),  # nothing to simplify
        (
            "sky sky",
            "sky",
            2,
            ["my tears ricochet", "tolerate it"],
            DoesNotRaise(),
        ),  # duplicate single-word term collapses
        (
            "sky OR mural sky",
            "sky mural",
            2,
            "tolerate it",
            DoesNotRaise(),
        ),  # redundant single term inside an OR
        (
            "sky OR sky OR sky",
            "sky",
            2,
            ["my tears ricochet", "tolerate it"],
            DoesNotRaise(),
        ),  # repeated ORs collapse to one term
        (
            "-lyric:sky lyric:sky",
            "",
            0,
            "",
            DoesNotRaise(),
        ),  # field clause negated by itself
    ],
)
@pytest.mark.timeout(20)
def test_simplification_then_search(
    create_indices,
    query,
    simplified_form,
    number_of_documents_expected,
    top_result_value,
    raises_exception,
):
    """Simplify each string query, then run it and check the results.

    ``top_result_value`` may be a list when several titles tie for the top
    spot; membership is checked instead of equality in that case.
    """
    with raises_exception:
        lark_parser = Lark(grammar)
        index, large_index = create_indices

        simplified_query, _ = simplify_string_query(lark_parser, query)

        assert simplified_query == simplified_form

        result = index.string_query_search(query)
        documents = result["documents"]

        assert len(documents) == number_of_documents_expected

        if number_of_documents_expected > 0:
            top_title = documents[0]["title"]
            if isinstance(top_result_value, list):
                assert top_title in top_result_value
            else:
                assert top_title == top_result_value

        assert float(result["query_time"]) < 0.06

        # benchmark mode: repeat against the large index
        if "--benchmark" in sys.argv:
            result = large_index.string_query_search(query)

            assert float(result["query_time"]) < 0.06
@pytest.mark.parametrize(
    "query, group_by_result, number_of_documents_expected, top_result_value, raises_exception",
    [
        (
            {
                "query": {"and": [{"lyric": {"contains": "with"}}]},
                "limit": 10,
                "group_by": "title",
                "sort_by": "title",
            },
            {
                "The Bolter": [
                    {
                        "title": "The Bolter",
                        "lyric": "Started with a kiss",
                        "category": ["pop", "acoustic"],
                        "uuid": "18fbe44e19a24153b0a22841261db61c",
                        "_score": 1,
                    }
                ]
            },
            1,
            "The Bolter",
            DoesNotRaise(),
        ),  # test group by on string field
        (
            {
                "query": {"and": [{"lyric": {"contains": "kiss"}}]},
                "group_by": "category",
                "limit": 10,
                "sort_by": "title",
            },
            {
                "pop": [
                    {
                        "title": "The Bolter",
                        "lyric": "Started with a kiss",
                        "category": ["pop", "acoustic"],
                        "uuid": "eb11180b16e34467a5d457f7115fda38",
                        "_score": 1,
                    }
                ],
                "acoustic": [
                    {
                        "title": "The Bolter",
                        "lyric": "Started with a kiss",
                        "category": ["pop", "acoustic"],
                        "uuid": "eb11180b16e34467a5d457f7115fda38",
                        "_score": 1,
                    }
                ],
            },
            1,
            "The Bolter",
            DoesNotRaise(),
        ),  # test group by on categorical field
    ],
)
@pytest.mark.timeout(20)
def test_search(
    create_indices,
    query,
    group_by_result,
    number_of_documents_expected,
    top_result_value,
    raises_exception,
):
    """Check that group_by queries return the expected groups and documents."""
    with raises_exception:
        index, large_index = create_indices

        response = index.search(query)

        assert len(response["documents"]) == number_of_documents_expected

        # uuids are randomly assigned on indexing in this configuration and
        # scores may vary, so both are excluded from the comparison. Raw
        # strings are used for the regexes: the original plain strings
        # contained invalid escape sequences ("\["), which raise a
        # SyntaxWarning on Python 3.12+.
        assert (
            DeepDiff(
                dict(response["groups"]),
                group_by_result,
                ignore_order=True,
                exclude_regex_paths=[
                    r"root\[.*\]\['uuid'\]",
                    r"root\[.*\]\['_score'\]",
                ],
            )
            == {}
        )

        if number_of_documents_expected > 0:
            assert response["documents"][0]["title"] == top_result_value

        assert float(response["query_time"]) < 0.06

        # run if --benchmark is passed
        if "--benchmark" in sys.argv:
            response = large_index.search(query)

            assert float(response["query_time"]) < 0.06
| "--long-benchmark" 34 | ): 35 | large_index = JameSQL() 36 | 37 | for document in documents * 100000: 38 | if request.config.getoption("--long-benchmark"): 39 | document = document.copy() 40 | document["title"] = "".join( 41 | [ 42 | word + " " 43 | for word in document["title"].split() 44 | for _ in range(10) 45 | ] 46 | ) 47 | large_index.add(document) 48 | 49 | large_index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS) 50 | large_index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS) 51 | else: 52 | large_index = None 53 | 54 | return index, large_index 55 | 56 | 57 | @pytest.mark.parametrize( 58 | "query, top_result_title, number_of_documents_expected, raises_exception", 59 | [ 60 | ( 61 | { 62 | "query": { 63 | "or": [ 64 | { 65 | "lyric": { 66 | "contains": "kiss", 67 | } 68 | }, 69 | { 70 | "lyric": { 71 | "contains": "sky", 72 | } 73 | }, 74 | ] 75 | }, 76 | "limit": 10, 77 | "sort_by": "title", 78 | }, 79 | "tolerate it", 80 | 3, 81 | DoesNotRaise(), 82 | ), # test with text field sort 83 | ( 84 | { 85 | "query": { 86 | "or": [ 87 | { 88 | "lyric": { 89 | "contains": "kiss", 90 | } 91 | }, 92 | { 93 | "lyric": { 94 | "contains": "sky", 95 | } 96 | }, 97 | ] 98 | }, 99 | "limit": 10, 100 | "sort_by": "_score", 101 | }, 102 | "The Bolter", 103 | 3, 104 | DoesNotRaise(), 105 | ), # test with text field score sort 106 | ( 107 | { 108 | "query": { 109 | "or": [ 110 | { 111 | "lyric": { 112 | "contains": "kiss", 113 | } 114 | }, 115 | { 116 | "lyric": { 117 | "contains": "sky", 118 | } 119 | }, 120 | ] 121 | }, 122 | "limit": 10, 123 | "sort_by": "_score", 124 | "sort_order": "asc", 125 | }, 126 | "my tears ricochet", 127 | 3, 128 | DoesNotRaise(), 129 | ), # test with text field score sort 130 | ( 131 | { 132 | "query": { 133 | "or": [ 134 | { 135 | "lyric": { 136 | "contains": "kiss", 137 | } 138 | }, 139 | { 140 | "lyric": { 141 | "contains": "sky", 142 | } 143 | }, 144 | ] 145 | }, 146 | "limit": 10, 147 | "sort_by": "_score", 148 | 
"sort_order": "desc", 149 | }, 150 | "The Bolter", 151 | 3, 152 | DoesNotRaise(), 153 | ), # test with text field score sort 154 | ], 155 | ) 156 | @pytest.mark.timeout(20) 157 | def test_search( 158 | create_indices, 159 | query, 160 | top_result_title, 161 | number_of_documents_expected, 162 | raises_exception, 163 | ): 164 | with raises_exception: 165 | index, large_index = create_indices 166 | 167 | response = index.search(query) 168 | 169 | # print(response) 170 | 171 | # assert False 172 | 173 | assert len(response["documents"]) == number_of_documents_expected 174 | assert response["documents"][0]["title"] == top_result_title 175 | 176 | if number_of_documents_expected > 0: 177 | assert response["documents"][0]["title"] == top_result_title 178 | 179 | assert float(response["query_time"]) < 0.06 180 | 181 | # run if --benchmark is passed 182 | if "--benchmark" in sys.argv: 183 | response = large_index.search(query) 184 | 185 | assert float(response["query_time"]) < 0.06 186 | -------------------------------------------------------------------------------- /docs/_site/comparison.md/index.html: -------------------------------------------------------------------------------- 1 |

You can find documents where a field is less than, greater than, less than or equal to, or greater than or equal to a value with a comparison query. Here is an example of a query that looks for documents where the year field is greater than 2010:

76 |
query = {
77 |     "query": {
78 |         "year": {
79 |             "greater_than": 2010
80 |         }
81 |     }
82 | }
83 | 
84 | 85 |

The following operators are supported:

86 | 92 | -------------------------------------------------------------------------------- /docs/_site/aggregate-metrics.md/index.html: -------------------------------------------------------------------------------- 1 |

You can find the total number of unique values for the fields returned by a query using an aggregate query. This is useful for presenting the total number of options available in a search space to a user.

76 |

You can use the following query to find the total number of unique values across all documents whose lyric field contains the term “sky”:

77 |
query = {
78 |     "query": {
79 |         "lyric": {
80 |             "contains": "sky"
81 |         }
82 |     },
83 |     "metrics": ["aggregate"]
84 | }
85 | 
86 | 87 |

The aggregate results are presented in a unique_record_values key with the following structure:

88 |
{
89 |     "documents": [...],
90 |     "query_time": 0.0001,
91 |     {'unique_record_values': {'title': 2, 'lyric': 2, 'listens': 2, 'categories': 3}}
92 | }
93 | 
94 | -------------------------------------------------------------------------------- /tests/string_queries_categorical_and_range.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | from contextlib import ExitStack as DoesNotRaise 4 | 5 | import pytest 6 | from deepdiff import DeepDiff 7 | 8 | from jamesql import JameSQL 9 | from jamesql.index import GSI_INDEX_STRATEGIES 10 | 11 | 12 | def pytest_addoption(parser): 13 | parser.addoption("--benchmark", action="store") 14 | 15 | 16 | @pytest.fixture(scope="session") 17 | def create_indices(request): 18 | with open("tests/fixtures/documents_with_categorical_and_numeric_values.json") as f: 19 | documents = json.load(f) 20 | 21 | index = JameSQL() 22 | 23 | for document in documents: 24 | index.add(document) 25 | 26 | index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.PREFIX) 27 | index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS) 28 | index.create_gsi("category", strategy=GSI_INDEX_STRATEGIES.FLAT) 29 | index.create_gsi("listens", strategy=GSI_INDEX_STRATEGIES.NUMERIC) 30 | 31 | with open("tests/fixtures/documents_with_categorical_and_numeric_values.json") as f: 32 | documents = json.load(f) 33 | 34 | if request.config.getoption("--benchmark") or request.config.getoption( 35 | "--long-benchmark" 36 | ): 37 | large_index = JameSQL() 38 | 39 | for document in documents * 100000: 40 | if request.config.getoption("--long-benchmark"): 41 | document = document.copy() 42 | document["title"] = "".join( 43 | [ 44 | word + " " 45 | for word in document["title"].split() 46 | for _ in range(10) 47 | ] 48 | ) 49 | large_index.add(document) 50 | 51 | large_index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.PREFIX) 52 | large_index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS) 53 | large_index.create_gsi("category", strategy=GSI_INDEX_STRATEGIES.FLAT) 54 | large_index.create_gsi("listens", strategy=GSI_INDEX_STRATEGIES.NUMERIC) 
@pytest.mark.parametrize(
    "query, rewritten_query, number_of_documents_expected, top_result_value, raises_exception",
    [
        (
            "listens>100",
            {"query": {"and": [{"listens": {"greater_than": 100}}]}, "limit": 10},
            2,
            "The Bolter",
            DoesNotRaise(),
        ),  # test > operator
        (
            "listens<101",
            {"query": {"and": [{"listens": {"less_than": 101}}]}, "limit": 10},
            1,
            "tolerate it",
            DoesNotRaise(),
        ),  # test < operator
        (
            "listens<=101",
            {"query": {"and": [{"listens": {"less_than_or_equal": 101}}]}, "limit": 10},
            1,
            "tolerate it",
            DoesNotRaise(),
        ),  # test <= operator
        (
            "listens>=101",
            {
                "query": {"and": [{"listens": {"greater_than_or_equal": 101}}]},
                "limit": 10,
            },
            2,
            "The Bolter",
            DoesNotRaise(),
        ),  # test >= operator
        (
            "listens[200, 300] category:'pop'",
            {
                "query": {
                    "and": [
                        {"listens": {"range": [200, 300]}},
                        {"category": {"contains": "pop"}},
                    ]
                },
                "limit": 10,
            },
            1,
            "my tears ricochet",
            DoesNotRaise(),
        ),  # test range operator with a single categorical data query
        (
            "listens[200, 300]",
            {"query": {"and": [{"listens": {"range": [200, 300]}}]}, "limit": 10},
            2,
            "The Bolter",
            DoesNotRaise(),
        ),  # test range operator
        (
            "listens>=101 sky",
            {
                "query": {
                    "and": [
                        {"listens": {"greater_than_or_equal": 101}},
                        {
                            "or": [
                                {"title": {"contains": "sky"}},
                                {"lyric": {"contains": "sky"}},
                                {"category": {"contains": "sky"}},
                            ]
                        },
                    ]
                },
                "limit": 10,
            },
            1,
            "my tears ricochet",
            DoesNotRaise(),
        ),  # test >= operator with a single word query
        (
            "category:'pop' sky",
            {
                "query": {
                    "and": [
                        {"category": {"contains": "pop"}},
                        {
                            "or": [
                                {"title": {"contains": "sky"}},
                                {"lyric": {"contains": "sky"}},
                                {"category": {"contains": "sky"}},
                            ]
                        },
                    ]
                },
                "limit": 10,
            },
            2,
            "my tears ricochet",
            DoesNotRaise(),
        ),  # test a single categorical data query with a single word query
        (
            "category:'pop' category:'acoustic'",
            {
                "query": {
                    "and": [
                        {"category": {"contains": "pop"}},
                        {"category": {"contains": "acoustic"}},
                    ]
                },
                "limit": 10,
            },
            1,
            "my tears ricochet",
            DoesNotRaise(),
        ),  # test two categorical data queries
    ],
)
@pytest.mark.timeout(20)
def test_search(
    create_indices,
    query,
    rewritten_query,
    number_of_documents_expected,
    top_result_value,
    raises_exception,
):
    """Rewrite each string query, run it, and verify both the rewritten
    internal form and the search results."""
    with raises_exception:
        index, large_index = create_indices

        internal_query, _ = index._compute_string_query(query)
        response = index.string_query_search(query)

        # sort alphabetically by title so the expected top result is stable
        # (the original sorted the same list twice; once is sufficient)
        response["documents"] = sorted(response["documents"], key=lambda x: x["title"])

        assert len(response["documents"]) == number_of_documents_expected

        # allow clauses to appear in different orders; order doesn't matter
        result = DeepDiff(internal_query, rewritten_query, ignore_order=True)

        assert result == {}

        if number_of_documents_expected > 0:
            assert response["documents"][0]["title"] == top_result_value

        assert float(response["query_time"]) < 0.06

        # run if --benchmark is passed
        if "--benchmark" in sys.argv:
            response = large_index.string_query_search(query)

            assert float(response["query_time"]) < 0.06
"tolerate"}}, 68 | {"lyric": {"contains": "tolerate"}}, 69 | ] 70 | }, 71 | { 72 | "or": [ 73 | {"title": {"contains": "it"}}, 74 | {"lyric": {"contains": "it"}}, 75 | ] 76 | }, 77 | ] 78 | }, 79 | "limit": 10, 80 | }, 81 | 1, 82 | "tolerate it", 83 | DoesNotRaise(), 84 | ), # test query with no special operators 85 | ( 86 | "title:tolerate", 87 | {"query": {"and": [{"title": {"contains": "tolerate"}}]}, "limit": 10}, 88 | 1, 89 | "tolerate it", 90 | DoesNotRaise(), 91 | ), # test one word field search 92 | ( 93 | "title:'tolerate it'", 94 | {"query": {"and": [{"title": {"contains": "tolerate it"}}]}, "limit": 10}, 95 | 1, 96 | "tolerate it", 97 | DoesNotRaise(), 98 | ), # test multi-word field search 99 | ( 100 | "'tolerate'", 101 | { 102 | "query": { 103 | "or": [ 104 | { 105 | "or": { 106 | "lyric": {"contains": "tolerate", "strict": True}, 107 | "title": {"contains": "tolerate", "strict": True}, 108 | } 109 | } 110 | ] 111 | }, 112 | "limit": 10, 113 | }, 114 | 1, 115 | "tolerate it", 116 | DoesNotRaise(), 117 | ), # test multi-word search 118 | ( 119 | "St*rted", 120 | { 121 | "query": { 122 | "or": [ 123 | { 124 | "or": [ 125 | {"title": {"wildcard": "St*rted"}}, 126 | {"lyric": {"wildcard": "St*rted"}}, 127 | ] 128 | } 129 | ] 130 | }, 131 | "limit": 10, 132 | }, 133 | 1, 134 | "The Bolter", 135 | DoesNotRaise(), 136 | ), # test multi-word search 137 | ( 138 | "-started -with mural", 139 | { 140 | "query": { 141 | "and": [ 142 | { 143 | "not": { 144 | "or": [ 145 | {"title": {"contains": "started"}}, 146 | {"lyric": {"contains": "started"}}, 147 | ] 148 | } 149 | }, 150 | { 151 | "not": { 152 | "or": [ 153 | {"title": {"contains": "with"}}, 154 | {"lyric": {"contains": "with"}}, 155 | ] 156 | } 157 | }, 158 | { 159 | "or": [ 160 | {"title": {"contains": "mural"}}, 161 | {"lyric": {"contains": "mural"}}, 162 | ] 163 | }, 164 | ] 165 | }, 166 | "limit": 10, 167 | }, 168 | 1, 169 | "tolerate it", 170 | DoesNotRaise(), 171 | ), # two negation queries 172 | ( 173 
| "title:tolerate lyric:I", 174 | { 175 | "query": { 176 | "and": [ 177 | {"title": {"contains": "tolerate"}}, 178 | {"lyric": {"contains": "I"}}, 179 | ] 180 | }, 181 | "limit": 10, 182 | }, 183 | 1, 184 | "tolerate it", 185 | DoesNotRaise(), 186 | ), # two field queries 187 | ( 188 | "", 189 | {"query": {}}, 190 | 0, 191 | "", 192 | DoesNotRaise(), 193 | ), # blank query 194 | ( 195 | "Started sky", 196 | { 197 | "query": { 198 | "or": [ 199 | { 200 | "or": [ 201 | {"title": {"contains": "Started"}}, 202 | {"lyric": {"contains": "Started"}}, 203 | ] 204 | }, 205 | { 206 | "or": [ 207 | {"title": {"contains": "sky"}}, 208 | {"lyric": {"contains": "sky"}}, 209 | ] 210 | }, 211 | ] 212 | }, 213 | "limit": 10, 214 | }, 215 | 3, 216 | "The Bolter", 217 | DoesNotRaise(), 218 | ), # test OR argument 219 | ( 220 | "I -still", 221 | { 222 | "query": { 223 | "and": [ 224 | { 225 | "or": [ 226 | {"lyric": {"contains": "I"}}, 227 | {"title": {"contains": "I"}}, 228 | ] 229 | }, 230 | { 231 | "not": { 232 | "or": [ 233 | {"lyric": {"contains": "still"}}, 234 | {"title": {"contains": "still"}}, 235 | ] 236 | } 237 | }, 238 | ] 239 | }, 240 | "limit": 10, 241 | }, 242 | 1, 243 | "tolerate it", 244 | DoesNotRaise(), 245 | ), # test negation argument 246 | ( 247 | "-started -mural -title:'The'", 248 | { 249 | "query": { 250 | "and": [ 251 | { 252 | "not": { 253 | "or": [ 254 | {"title": {"contains": "started"}}, 255 | {"lyric": {"contains": "started"}}, 256 | ] 257 | } 258 | }, 259 | { 260 | "not": { 261 | "or": [ 262 | {"title": {"contains": "mural"}}, 263 | {"lyric": {"contains": "mural"}}, 264 | ] 265 | } 266 | }, 267 | {"not": {"title": {"contains": "The"}}}, 268 | ] 269 | }, 270 | "limit": 10, 271 | }, 272 | 1, 273 | "my tears ricochet", 274 | DoesNotRaise(), 275 | ), # test negation on field 276 | ], 277 | ) 278 | @pytest.mark.timeout(20) 279 | def test_search( 280 | create_indices, 281 | query, 282 | rewritten_query, 283 | number_of_documents_expected, 284 | 
top_result_value, 285 | raises_exception, 286 | ): 287 | with raises_exception: 288 | index, large_index = create_indices 289 | 290 | internal_query, _ = index._compute_string_query(query) 291 | response = index.string_query_search(query) 292 | 293 | print(internal_query, response) 294 | 295 | assert len(response["documents"]) == number_of_documents_expected 296 | 297 | # allow items to be in different orders; order doesn't matter, ignore sort_by 298 | result = DeepDiff( 299 | internal_query, 300 | rewritten_query, 301 | ignore_order=True, 302 | exclude_regex_paths=["root\['sort_by'\]"], 303 | ) 304 | 305 | print(result) 306 | 307 | assert result == {} 308 | 309 | if number_of_documents_expected > 0: 310 | assert response["documents"][0]["title"] == top_result_value 311 | 312 | if response.get("query_time"): 313 | assert float(response["query_time"]) < 0.06 314 | 315 | # run if --benchmark is passed 316 | if "--benchmark" in sys.argv: 317 | response = large_index.string_query_search(query) 318 | 319 | assert float(response["query_time"]) < 0.06 320 | -------------------------------------------------------------------------------- /jamesql/rewriter.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from lark import Lark, Transformer 4 | from lark.visitors import Interpreter, Visitor 5 | 6 | from .query_simplifier import simplifier 7 | 8 | grammar = """ 9 | start: (query)+ sort_component? 10 | 11 | or_query: (query ("OR " | "or ") query)* 12 | and_query: (query ("AND " | "and ") query)* 13 | query: and_query | or_query | query_component 14 | query_component: (negate_query | range_query | strict_search_query | word_query | field_query | comparison)+ 15 | 16 | sort_component: "sort:" TERM (ORDER)? 17 | strict_search_query: "'" MULTI_WORD "'" 18 | comparison: TERM OPERATOR WORD 19 | range_query: TERM "[" WORD "," WORD "]" 20 | word_query: WORD ("^" FLOAT)? 
# Maps string-query comparison operators to the structured-query key names
# understood by the search engine.
OPERATOR_MAP = {
    ">": "greater_than",
    "<": "less_than",
    ">=": "greater_than_or_equal",
    "<=": "less_than_or_equal",
}


class QuerySimplifier(Transformer):
    """Walk a parsed string query and collect its terms into ``self.terms``.

    Each grammar rule records the term (or an operator triple such as
    ``[lhs, "OR", rhs]``) it represents; the resulting flat list is later
    fed to the simplifier to deduplicate and cancel terms.
    """

    def __init__(self):
        super().__init__()  # initialise the Lark Transformer base
        self.terms = []

    def WORD(self, items):
        return items.value

    def FLOAT(self, items):
        return items.value

    def word_query(self, items):
        # join a word with its optional boost, e.g. "sky^2"
        term = "^".join(items)
        self.terms.append(term)
        return term

    def query_component(self, items):
        return items[0]

    def query(self, items):
        return items[0]

    def or_query(self, items):
        self.terms.append([items[0], "OR", items[1]])
        return items[0]

    def and_query(self, items):
        self.terms.append([items[0], "AND", items[1]])
        return items[0]

    def start(self, items):
        return items[0]

    def field_query(self, items):
        # NOTE: the original class defined field_query twice; the first
        # definition (which did not record the term) was dead code silently
        # shadowed by this one, and has been removed.
        term = items[0] + ":" + "'" + items[1] + "'"
        self.terms.append(term)
        return term

    def TERM(self, items):
        return items.value

    def negate_query(self, items):
        # a negated term cancels a previously-seen positive occurrence
        if items[0] in self.terms:
            self.terms.remove(items[0])

        self.terms.append(["NOT", items[0]])

        return items[0]

    def range_query(self, items):
        term = items[0] + "[" + items[1] + "," + items[2] + "]"
        self.terms.append(term)
        return term

    def comparison(self, items):
        term = items[0] + items[1] + items[2]
        self.terms.append(term)
        return term

    def strict_search_query(self, items):
        term = "'" + items[0] + "'"
        self.terms.append(term)
        return term

    def MULTI_WORD(self, items):
        return items.value
response["sort_by"] = items["sort_by"] 177 | response["sort_order"] = items.get("sort_order", "asc") 178 | del items["sort_by"] 179 | del items["sort_order"] 180 | 181 | return response 182 | 183 | def OPERATOR(self, items): 184 | return items.value 185 | 186 | def strict_search_query(self, items): 187 | return { 188 | "or": [ 189 | {field: { 190 | self.get_query_strategy(value=items[0]): items[0], 191 | "strict": True, 192 | }} 193 | for field in self.query_keys 194 | if self.indexing_strategies.get(field) not in {"NUMERIC", "DATE"} 195 | ] 196 | } 197 | 198 | def TERM(self, items): 199 | return items.value 200 | 201 | def MULTI_WORD(self, items): 202 | return items.value 203 | 204 | def comparison(self, items): 205 | field = items[0] 206 | operator = items[1] 207 | value = items[2] 208 | 209 | if field not in self.query_keys: 210 | return {} 211 | 212 | return {field: {OPERATOR_MAP[operator]: value}} 213 | 214 | def range_query(self, items): 215 | field = items[0] 216 | start = items[1] 217 | end = items[2] 218 | 219 | if field not in self.query_keys: 220 | return {} 221 | 222 | return {field: {"range": [start, end]}} 223 | 224 | def word_query(self, items): 225 | result = [] 226 | 227 | for key in self.query_keys: 228 | field = key 229 | value = items[0] 230 | if len(items) > 1: 231 | boost = items[1] 232 | else: 233 | boost = 1 234 | 235 | if self.indexing_strategies.get(field) == "NUMERIC": 236 | continue 237 | 238 | if self.get_query_strategy(field, value) == "contains": 239 | # if value is float, convert to int 240 | # this is because text queries can't be floats 241 | if isinstance(value, float): 242 | value = int(value) 243 | 244 | value = str(value) 245 | 246 | results = { 247 | field: { 248 | self.get_query_strategy(field, value): value, 249 | } 250 | } 251 | 252 | if self.boosts.get(field): 253 | results[field]["boost"] = self.boosts.get(field, boost) 254 | 255 | if self.fuzzy: 256 | results[field]["fuzzy"] = ( 257 | self.fuzzy 258 | if 
self.get_query_strategy(field, value) == "contains" 259 | else False 260 | ) 261 | 262 | if field in self.highlight_keys: 263 | results[field]["highlight"] = True 264 | 265 | result.append(results) 266 | 267 | return {"or": result} 268 | 269 | def field_query(self, items): 270 | # remove negation 271 | field = items[0].lstrip("-") 272 | value = items[1] 273 | 274 | if field not in self.query_keys: 275 | return {} 276 | 277 | return {field: {self.get_query_strategy(field, value): value}} 278 | 279 | def WORD(self, items): 280 | if items.value.isdigit(): 281 | return float(items.value) 282 | 283 | return items.value 284 | 285 | 286 | def simplify_string_query(parser, query, correct_spelling_index=None): 287 | # remove punctuation not in grammar 288 | query = re.sub(r"[^a-zA-Z0-9_,!?^*:\-.'<>=\[\] ]", "", query) 289 | 290 | tree = parser.parse(query) 291 | 292 | result = QuerySimplifier() 293 | result.transform(tree.copy()) 294 | 295 | # query = simplifier(result.terms) 296 | # query = " ".join(query).strip() 297 | 298 | if len(query.strip()) == 0: 299 | return query, {} 300 | 301 | spelling_substitutions = {} 302 | 303 | if correct_spelling_index is not None: 304 | final_query = "" 305 | 306 | for word in query.split(): 307 | # if word starts with -, skip 308 | # ' and " are used to indicate strict strings, so we need to skip words that start or end with the character 309 | first_char = word[0] if len(word) > 0 else "" 310 | last_char = word[-1] if len(word) > 0 else "" 311 | 312 | if ( 313 | first_char == "-" 314 | or first_char == "'" 315 | or first_char == '"' 316 | or last_char == "'" 317 | or last_char == '"' 318 | or correct_spelling_index.word_counts.get(word) 319 | or "*" in word 320 | ): 321 | final_query += word + " " 322 | continue 323 | 324 | final_query += correct_spelling_index.spelling_correction(word) + " " 325 | 326 | spelling_substitutions = { 327 | word: correct_spelling_index.spelling_correction(word) 328 | for word in query.split() 329 | if word 
!= correct_spelling_index.spelling_correction(word) 330 | } 331 | 332 | query = final_query.strip() 333 | 334 | return query, spelling_substitutions 335 | 336 | 337 | def string_query_to_jamesql( 338 | parser, 339 | query, 340 | query_keys, 341 | default_strategies={}, 342 | boosts={}, 343 | fuzzy=False, 344 | correct_spelling_index=None, 345 | highlight_keys=False, 346 | ): 347 | query, spelling_substitutions = simplify_string_query( 348 | parser, query, correct_spelling_index 349 | ) 350 | 351 | if query.strip() == "": 352 | return {"query": {}}, [] 353 | 354 | tree = parser.parse(query) 355 | 356 | rewritten_query = QueryRewriter( 357 | default_strategies=default_strategies, 358 | query_keys=query_keys, 359 | boosts=boosts, 360 | fuzzy=fuzzy, 361 | highlight_keys=highlight_keys, 362 | ).transform(tree) 363 | 364 | return rewritten_query, spelling_substitutions 365 | -------------------------------------------------------------------------------- /tests/test.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | from contextlib import ExitStack as DoesNotRaise 4 | 5 | import pytest 6 | 7 | from jamesql import JameSQL 8 | from jamesql.index import GSI_INDEX_STRATEGIES 9 | 10 | 11 | def pytest_addoption(parser): 12 | parser.addoption("--benchmark", action="store") 13 | 14 | 15 | @pytest.fixture 16 | def example_stub_and_query(): 17 | with open("tests/fixtures/example_stub_and_query.json") as f: 18 | query = json.load(f) 19 | 20 | return query 21 | 22 | 23 | @pytest.fixture(scope="session") 24 | def create_indices(request): 25 | with open("tests/fixtures/documents.json") as f: 26 | documents = json.load(f) 27 | 28 | index = JameSQL() 29 | 30 | for document in documents: 31 | index.add(document) 32 | 33 | with open("tests/fixtures/documents.json") as f: 34 | documents = json.load(f) 35 | 36 | index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS) 37 | index.create_gsi("lyric", 
strategy=GSI_INDEX_STRATEGIES.CONTAINS) 38 | 39 | if request.config.getoption("--benchmark") or request.config.getoption( 40 | "--long-benchmark" 41 | ): 42 | large_index = JameSQL() 43 | 44 | for document in documents * 100000: 45 | if request.config.getoption("--long-benchmark"): 46 | document = document.copy() 47 | document["title"] = "".join( 48 | [ 49 | word + " " 50 | for word in document["title"].split() 51 | for _ in range(10) 52 | ] 53 | ) 54 | large_index.add(document) 55 | 56 | large_index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS) 57 | large_index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS) 58 | else: 59 | large_index = None 60 | 61 | return index, large_index 62 | 63 | 64 | @pytest.mark.parametrize( 65 | "query, number_of_documents_expected, top_result_value, raises_exception", 66 | [ 67 | ( 68 | { 69 | "query": {"title": {"contains": "tolerate"}}, 70 | "limit": 10, 71 | "sort_by": "title", 72 | }, 73 | 1, 74 | "tolerate it", 75 | DoesNotRaise(), 76 | ), # test contains 77 | ( 78 | { 79 | "query": {"title": {"contains": "tolerats"}}, 80 | "limit": 10, 81 | "sort_by": "title", 82 | }, 83 | 0, 84 | "", 85 | DoesNotRaise(), 86 | ), # test contains 87 | ( 88 | { 89 | "query": {"title": {"equals": "tolerate it"}}, 90 | "limit": 10, 91 | "sort_by": "title", 92 | }, 93 | 1, 94 | "tolerate it", 95 | DoesNotRaise(), 96 | ), # test equals 97 | ( 98 | { 99 | "query": {"title": {"equals": "tolerate it"}}, 100 | "limit": 0, 101 | "sort_by": "title", 102 | }, 103 | 0, 104 | "", 105 | DoesNotRaise(), 106 | ), # test limit 107 | ( 108 | { 109 | "query": {"lyric": {"contains": "my mural", "strict": True}}, 110 | "limit": 1, 111 | "sort_by": "title", 112 | }, 113 | 1, 114 | "tolerate it", 115 | DoesNotRaise(), 116 | ), # test strict 117 | ( 118 | { 119 | "query": { 120 | "lyric": {"contains": "my murap", "strict": True, "fuzzy": True} 121 | }, 122 | "limit": 1, 123 | "sort_by": "title", 124 | }, 125 | 1, 126 | "tolerate it", 127 | 
DoesNotRaise(), 128 | ), 129 | ( 130 | { 131 | "query": {"title": {"wildcard": "tolerat*"}}, 132 | "limit": 1, 133 | "sort_by": "title", 134 | }, 135 | 1, 136 | "tolerate it", 137 | DoesNotRaise(), 138 | ), # test wildcard 139 | ( 140 | { 141 | "query": {"lyric": {"wildcard": "my mura*", "strict": True}}, 142 | "limit": 1, 143 | "sort_by": "title", 144 | }, 145 | 1, 146 | "tolerate it", 147 | DoesNotRaise(), 148 | ), # test wildcard and strict; wildcard overrides strict 149 | ( 150 | { 151 | "query": {"title": {"contains": "it tolerate", "strict": True}}, 152 | "limit": 10, 153 | "sort_by": "title", 154 | }, 155 | 0, 156 | "", 157 | DoesNotRaise(), 158 | ), # test an invalid query 159 | ( 160 | { 161 | "query": {"title": {"starts_with": "toler"}}, 162 | "limit": 10, 163 | "sort_by": "title", 164 | }, 165 | 1, 166 | "tolerate it", 167 | DoesNotRaise(), 168 | ), # test starts_with on an index with CONTAINS type 169 | # this will return results but slowly 170 | ( 171 | { 172 | "query": {"lyric": {"starts_with": "Started with"}}, 173 | "limit": 10, 174 | "sort_by": "title", 175 | }, 176 | 1, 177 | "The Bolter", 178 | DoesNotRaise(), 179 | ), # test starts_with 180 | ( 181 | { 182 | "query": {"lyric": {"contains": "Startee with", "fuzzy": True}}, 183 | "limit": 10, 184 | "sort_by": "title", 185 | }, 186 | 1, 187 | "The Bolter", 188 | DoesNotRaise(), 189 | ), # test fuzzy on contains 190 | ( 191 | { 192 | "query": { 193 | "lyric": { 194 | "starts_with": "Startee with", 195 | "fuzzy": True, 196 | "strict": True, 197 | } 198 | }, 199 | "limit": 10, 200 | "sort_by": "title", 201 | }, 202 | 1, 203 | "The Bolter", 204 | DoesNotRaise(), 205 | ), # test fuzzy on starts_with 206 | ( 207 | { 208 | "query": {"lyric": {"equals": "Startee with", "fuzzy": True}}, 209 | "limit": 10, 210 | "sort_by": "title", 211 | }, 212 | 0, 213 | "", 214 | DoesNotRaise(), 215 | ), # fuzzy doesn't work on equals 216 | ( 217 | { 218 | "query": {"lyric": {"contains": "sky"}}, 219 | "limit": 10, 220 | 
"sort_by": "lyric", 221 | }, 222 | 2, 223 | "tolerate it", 224 | DoesNotRaise(), 225 | ), # test starts_with 226 | ( 227 | { 228 | "query": {"lyric": {"contains": "100"}}, 229 | "limit": 10, 230 | "sort_by": "lyric" 231 | }, 232 | 0, 233 | "", 234 | DoesNotRaise(), 235 | ), # test numeric query 236 | ( 237 | { 238 | "query": {"lyric": {"contains": 100}}, 239 | "limit": 10, 240 | "sort_by": "lyric" 241 | }, 242 | 0, 243 | "", 244 | DoesNotRaise(), 245 | ), # test numeric query cast as int 246 | ( 247 | { 248 | "query": {"lyric": {"contains": 100.001}}, 249 | "limit": 10, 250 | "sort_by": "lyric" 251 | }, 252 | 0, 253 | "", 254 | DoesNotRaise(), 255 | ), # test numeric query cast as float 256 | ( 257 | { 258 | "query": {"lyric": {"starts_with": "started with"}}, 259 | "limit": 10, 260 | "sort_by": "title", 261 | }, 262 | 0, 263 | "", 264 | DoesNotRaise(), 265 | ), # the query is case-sensitive 266 | ( 267 | { 268 | "query": {"lyric": {"starts_with": "started with"}}, 269 | "limit": 10, 270 | "sort_by": "title", 271 | }, 272 | 0, 273 | "", 274 | DoesNotRaise(), 275 | ), # the query contains a key that doesn't exist; this shouldn't fail 276 | ( 277 | {"lyric": {"starts_with": "started with"}, "limit": 10, "sort_by": "title"}, 278 | 0, 279 | "", 280 | DoesNotRaise(), 281 | ), # the query is missing the query key; this returns an "error" key but doesn't raise an error 282 | ( 283 | { 284 | "query": { 285 | "and": [ 286 | {"title": {"starts_with": "tolerate"}}, 287 | {"title": {"contains": "it"}}, 288 | ] 289 | }, 290 | "limit": 2, 291 | "sort_by": "title", 292 | }, 293 | 1, 294 | "tolerate it", 295 | DoesNotRaise(), 296 | ), # test complex query with single query 297 | ( 298 | { 299 | "query": { 300 | "or": { 301 | "and": [ 302 | {"title": {"starts_with": "tolerate"}}, 303 | {"title": {"contains": "it"}}, 304 | ], 305 | "lyric": {"contains": "kiss"}, 306 | } 307 | }, 308 | "limit": 2, 309 | "sort_by": "title", 310 | }, 311 | 2, 312 | "tolerate it", 313 | DoesNotRaise(), 
314 | ), # test complex query with multiple queries 315 | ( 316 | { 317 | "query": {}, 318 | "limit": 10, 319 | "sort_by": "title", 320 | }, 321 | 0, 322 | "", 323 | DoesNotRaise(), 324 | ), # test empty query 325 | ( 326 | { 327 | "query": "*", 328 | "skip": 2, 329 | "limit": 1, 330 | "sort_by": "title", 331 | }, 332 | 1, 333 | "The Bolter", 334 | DoesNotRaise(), 335 | ), # test start query 336 | ( 337 | { 338 | "query": "*", 339 | "limit": 10, 340 | "sort_by": "title", 341 | }, 342 | 3, 343 | "tolerate it", 344 | DoesNotRaise(), 345 | ), # test all query 346 | ( 347 | { 348 | "query": { 349 | "not": {"lyric": {"contains": "kiss"}}, 350 | }, 351 | "limit": 10, 352 | "sort_by": "title", 353 | }, 354 | 2, 355 | "tolerate it", 356 | DoesNotRaise(), 357 | ), # test not with no and query 358 | ( 359 | { 360 | "query": { 361 | "and": { 362 | "or": [ 363 | {"lyric": {"contains": "sky", "boost": 3}}, 364 | {"lyric": {"contains": "kiss", "boost": 3}}, 365 | ], 366 | "not": {"lyric": {"contains": "kiss"}}, 367 | } 368 | }, 369 | "limit": 10, 370 | "sort_by": "title", 371 | }, 372 | 2, 373 | "tolerate it", 374 | DoesNotRaise(), 375 | ), # test not query within an and query 376 | ], 377 | ) 378 | @pytest.mark.timeout(30) 379 | def test_search( 380 | create_indices, 381 | query, 382 | number_of_documents_expected, 383 | top_result_value, 384 | raises_exception, 385 | ): 386 | with raises_exception: 387 | index, large_index = create_indices 388 | 389 | response = index.search(query) 390 | 391 | print(response, query) 392 | 393 | assert len(response["documents"]) == number_of_documents_expected 394 | 395 | if number_of_documents_expected > 0: 396 | assert response["documents"][0]["title"] == top_result_value 397 | 398 | assert float(response["query_time"]) < 0.06 399 | 400 | # run if --benchmark is passed 401 | if "--benchmark" in sys.argv: 402 | response = large_index.search(query) 403 | 404 | assert float(response["query_time"]) < 0.06 405 | 406 | 407 | # TODO: TF/IDF needs to 
# be calculated after all documents have been inserted
# Otherwise TF/IDF score will vary on document insertion order
# which we don't want
@pytest.mark.parametrize(
    "query, top_document_name, top_document_score, raises_exception",
    [
        (
            {
                "query": {"title": {"contains": "tolerate"}},
                "limit": 2,
                "query_score": "(_score + 2)",
            },
            "tolerate it",
            2.0,
            DoesNotRaise(),
        ),
        (
            {
                "query": {"title": {"contains": "tolerate"}},
                "limit": 2,
                "query_score": "(_score * 2)",
                "sort_by": "_score",
            },
            "tolerate it",
            0.09010335735736986,
            DoesNotRaise(),
        ),
        (
            {
                "query": {"lyric": {"contains": "sky", "boost": 56}},
                "limit": 10,
                "sort_by": "title",
            },
            "tolerate it",
            2.5228940060063563,
            DoesNotRaise(),  # test searching TF/IDF indexed field
        ),
    ],
)
def test_query_score_and_boost(
    create_indices,
    query,
    top_document_name,
    top_document_score,
    raises_exception,
):
    """Check that query_score expressions and per-clause boosts produce the
    expected top document and score."""
    with raises_exception:
        index, large_index = create_indices
        response = index.search(query)

        # NOTE(review): rebuilding the GSIs here mutates the session-scoped
        # fixture after the search has run — presumably to reset scoring
        # state for later tests; confirm this ordering is intentional.
        index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
        index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS)

        assert response["documents"][0]["title"] == top_document_name
        # FIX: exact == on floating-point scores is brittle across platforms;
        # compare within pytest.approx's default tight tolerance instead.
        assert response["documents"][0]["_score"] == pytest.approx(
            top_document_score
        )


def test_add_item(
    create_indices,
):
    """A document added after index creation is searchable once the GSI is
    rebuilt."""
    index, _ = create_indices

    index.add({"title": "shake it off", "lyric": "I stay out too late"})

    index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS)

    response = index.search(
        {
            "query": {"title": {"equals": "shake it off"}},
            "limit": 10,
            "sort_by": "title",
        }
    )

    assert len(response["documents"]) == 1

def test_remove_item(
    create_indices,
):
    """Removing a document by uuid makes it unsearchable.

    NOTE(review): this mutates the session-scoped index, so tests running
    afterwards see one fewer document — verify ordering assumptions.
    """
    index, large_index = create_indices

    search_for_tolerate = {
        "query": {"title": {"contains": "tolerate"}},
        "limit": 10,
        "sort_by": "title",
    }

    # Look the document up first so we can remove it by its uuid.
    response = index.search(search_for_tolerate)
    uuid = response["documents"][0]["uuid"]

    index.remove(uuid)

    # The same search should now come back empty.
    response = index.search(search_for_tolerate)

    assert len(response["documents"]) == 0


def test_query_exceeding_maximum_subqueries(example_stub_and_query, create_indices):
    """A query with more than the permitted number of subqueries yields an
    error response rather than raising."""
    index, large_index = create_indices

    conditions = example_stub_and_query["query"]["and"]
    for i in range(25):
        conditions.append({f"lyric{i}": {"contains": "kiss"}})

    response = index.search(example_stub_and_query)

    assert len(response["documents"]) == 0
    assert response["error"].startswith("Too many query conditions.")
223 | 228 |
229 |

Results

230 |
    231 |
    232 |
    233 | 506 | 507 | --------------------------------------------------------------------------------