├── requirements.txt
├── docs
    ├── .DS_Store
    ├── config.py
    ├── pages
    │   └── templates
    │   │   ├── range.md
    │   │   ├── group-by.md
    │   │   ├── comparison.md
    │   │   ├── add.md
    │   │   ├── update.md
    │   │   ├── highlight.md
    │   │   ├── code-search.md
    │   │   ├── aggregate-metrics.md
    │   │   ├── delete.md
    │   │   ├── quickstart.md
    │   │   ├── ranking.md
    │   │   ├── index.md
    │   │   ├── search.html
    │   │   ├── autosuggest.md
    │   │   ├── spelling-correction.md
    │   │   ├── conditions
    │   │       └── operators.md
    │   │   ├── string-queries.md
    │   │   ├── matching.md
    │   │   ├── script-scores.md
    │   │   ├── storage-and-consistency.md
    │   │   ├── create.md
    │   │   └── search.md
    ├── hooks.py
    └── _site
    │   ├── comparison.md
    │       └── index.html
    │   └── aggregate-metrics.md
    │       └── index.html
├── jamesql
    ├── __init__.py
    ├── query_simplifier.py
    ├── script_lang.py
    └── rewriter.py
├── assets
    ├── .DS_Store
    └── screenshot.png
├── tests
    ├── fixtures
    │   ├── example_stub_and_query.json
    │   ├── documents.json
    │   ├── documents_with_numeric_values.json
    │   ├── documents_with_categorical_values.json
    │   ├── documents_with_categorical_and_numeric_values.json
    │   ├── documents_with_varied_data_types.json
    │   └── code
    │   │   ├── simplifier.py
    │   │   └── simplifier_demo.py
    ├── conftest.py
    ├── save_and_load.py
    ├── concurrency.py
    ├── autosuggest.py
    ├── spelling_correction.py
    ├── gsi_type_inference.py
    ├── script_lang.py
    ├── code_search.py
    ├── highlight.py
    ├── data_types.py
    ├── aggregation.py
    ├── range_queries.py
    ├── query_simplification.py
    ├── group_by.py
    ├── sort_by.py
    ├── string_queries_categorical_and_range.py
    ├── string_query.py
    └── test.py
├── CITATION.cff
├── .github
    └── workflows
    │   ├── welcome.yml
    │   ├── test.yml
    │   ├── release.yml
    │   ├── benchmark.yml
    │   ├── windows.yml
    │   └── documentation.yml
├── LICENSE
├── setup.py
├── web
    ├── landing.html
    ├── templates
    │   ├── index.html
    │   └── search.html
    └── web.py
├── schema.py
└── .gitignore


/requirements.txt:
--------------------------------------------------------------------------------
1 | pybmoore
2 | 


--------------------------------------------------------------------------------
/docs/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/capjamesg/jamesql/HEAD/docs/.DS_Store


--------------------------------------------------------------------------------
/jamesql/__init__.py:
--------------------------------------------------------------------------------
1 | from .index import JameSQL
2 | 
3 | __version__ = "0.3.0"
4 | 


--------------------------------------------------------------------------------
/assets/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/capjamesg/jamesql/HEAD/assets/.DS_Store


--------------------------------------------------------------------------------
/assets/screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/capjamesg/jamesql/HEAD/assets/screenshot.png


--------------------------------------------------------------------------------
/tests/fixtures/example_stub_and_query.json:
--------------------------------------------------------------------------------
1 | {
2 |     "query": {
3 |         "and": []
4 |     },
5 |     "limit": 10,
6 |     "sort_by": "title"
7 | }


--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | cff-version: 1.2.0
2 | message: "If you use this software, please cite it as below."
3 | authors:
4 |   - given-names: James (capjamesg)
5 | title: "JameSQL"
6 | version: 0.1.0
7 | date-released: 2024-10-16
8 | 


--------------------------------------------------------------------------------
/tests/fixtures/documents.json:
--------------------------------------------------------------------------------
1 | [
2 |     {"title": "tolerate it", "lyric": "I made you my temple, my mural my sky"},
3 |     {
4 |         "title": "my tears ricochet",
5 |         "lyric": "And I still talk to you when I'm screaming at the sky"
6 |     },
7 |     {"title": "The Bolter", "lyric": "Started with a kiss"}
8 | ]


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | 
 4 | def pytest_addoption(parser):
 5 |     parser.addoption(
 6 |         "--benchmark", action="store_true", default=False, help="Enable benchmarking"
 7 |     )
 8 |     parser.addoption(
 9 |         "--long-benchmark",
10 |         action="store_true",
11 |         default=False,
12 |         help="Enable long benchmark",
13 |     )
14 | 


--------------------------------------------------------------------------------
/tests/fixtures/documents_with_numeric_values.json:
--------------------------------------------------------------------------------
1 | [
2 |     {"title": "tolerate it", "lyric": "I made you my temple, my mural, my sky", "listens": 100},
3 |     {
4 |         "title": "my tears ricochet",
5 |         "lyric": "And I still talk to you when I'm screaming at the sky",
6 |         "listens": 200
7 |     },
8 |     {"title": "The Bolter", "lyric": "Started with a kiss", "listens": 300}
9 | ]


--------------------------------------------------------------------------------
/tests/fixtures/documents_with_categorical_values.json:
--------------------------------------------------------------------------------
1 | [
2 |     {"title": "tolerate it", "lyric": "I made you my temple, my mural, my sky", "category": ["pop"]},
3 |     {
4 |         "title": "my tears ricochet",
5 |         "lyric": "And I still talk to you when I'm screaming at the sky",
6 |         "category": ["pop"]
7 |     },
8 |     {"title": "The Bolter", "lyric": "Started with a kiss", "category": ["pop", "acoustic"]}
9 | ]


--------------------------------------------------------------------------------
/tests/fixtures/documents_with_categorical_and_numeric_values.json:
--------------------------------------------------------------------------------
 1 | [
 2 |     {"title": "tolerate it", "lyric": "I made you my temple, my mural, my sky", "listens": 100, "category": ["pop"] },
 3 |     {
 4 |         "title": "my tears ricochet",
 5 |         "lyric": "And I still talk to you when I'm screaming at the sky",
 6 |         "listens": 200,
 7 |         "category": ["pop", "acoustic"]
 8 |     },
 9 |     {"title": "The Bolter", "lyric": "Started with a kiss", "listens": 300, "category": ["acoustic"]}
10 | ]


--------------------------------------------------------------------------------
/docs/config.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | BASE_URLS = {
 4 |     "local": os.getcwd(),
 5 |     "production": "https://jamesg.blog/jamesql",
 6 | }
 7 | 
 8 | SITE_ENV = os.environ.get("SITE_ENV", "local")
 9 | BASE_URL = BASE_URLS[SITE_ENV]
10 | ROOT_DIR = "pages"
11 | LAYOUTS_BASE_DIR = "_layouts"
12 | SITE_DIR = "_site"
13 | HOOKS = {
14 |     "post_template_generation": {"hooks": ["highlight_code"]},
15 |     "pre_template_generation": {"hooks": ["generate_table_of_contents"]}
16 | }
17 | SITE_STATE = {}
18 | 
19 | BASE_URL = BASE_URLS[SITE_ENV]


--------------------------------------------------------------------------------
/docs/pages/templates/range.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | title: Range Queries
 4 | permalink: /range/
 5 | ---
 6 | 
 7 | You can find values in a numeric range with a range query. Here is an example of a query that looks for documents where the `year` field is between `2010` and `2020`:
 8 | 
 9 | ```python
10 | query = {
11 |     "query": {
12 |         "year": {
13 |             "range": [2010, 2020]
14 |         }
15 |     }
16 | }
17 | ```
18 | 
19 | The first value in the range is the lower bound to use in the search, and the second value is the upper bound.


--------------------------------------------------------------------------------
/docs/pages/templates/group-by.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | title: Group By
 4 | permalink: /group-by/
 5 | ---
 6 | 
 7 | You can group results by a single key. This is useful for presenting aggregate views of data.
 8 | 
 9 | To group results by a key, use the following code:
10 | 
11 | ```python
12 | query = {
13 |     "query": {
14 |         "lyric": {
15 |             "contains": "sky"
16 |         }
17 |     },
18 |     "group_by": "title"
19 | }
20 | ```
21 | 
22 | This query will search for all `lyric` fields that contain the term "sky" and group the results by the `title` field.


--------------------------------------------------------------------------------
/.github/workflows/welcome.yml:
--------------------------------------------------------------------------------
 1 | name: Welcome
 2 | 
 3 | on:
 4 |   issues:
 5 |     types: [opened]
 6 |   pull_request_target:
 7 |     types: [opened]
 8 | 
 9 | jobs:
10 |   build:
11 |     name: 👋 Welcome
12 |     runs-on: ubuntu-latest
13 |     steps:
14 |       - uses: actions/first-interaction@v1.3.0
15 |         with:
16 |           repo-token: ${{ secrets.GITHUB_TOKEN }}
17 |           issue-message: "Thank you for creating an Issue on this repository! 🙌 We will get back to you shortly."
18 |           pr-message: "Thank you for creating a PR on this repository! 🙌 We will get back to you shortly."
19 | 


--------------------------------------------------------------------------------
/docs/pages/templates/comparison.md:
--------------------------------------------------------------------------------
 1 | You can find documents where a field is less than, greater than, less than or equal to, or greater than or equal to a value with a comparison query. Here is an example of a query that looks for documents where the `year` field is greater than `2010`:
 2 | 
 3 | <pre><code class="language-python">
 4 | query = {
 5 |     "query": {
 6 |         "year": {
 7 |             "greater_than": 2010
 8 |         }
 9 |     }
10 | }
11 | </code></pre>
12 | 
13 | The following operators are supported:
14 | 
15 | - `greater_than`
16 | - `less_than`
17 | - `greater_than_or_equal`
18 | - `less_than_or_equal`


--------------------------------------------------------------------------------
/tests/save_and_load.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | from contextlib import ExitStack as DoesNotRaise
 4 | 
 5 | import pytest
 6 | 
 7 | from jamesql import JameSQL
 8 | from jamesql.index import GSI_INDEX_STRATEGIES, INDEX_STORE
 9 | 
10 | 
@pytest.mark.skip
def test_load_from_local_index():
    """Check that a loaded index matches the documents fixture.

    The test is currently skipped. It compares the index returned by
    ``JameSQL.load()`` against the number of documents in
    ``tests/fixtures/documents.json``.
    """
    with open("tests/fixtures/documents.json") as fixture:
        documents = json.load(fixture)

    expected_count = len(documents)
    index = JameSQL.load()

    assert len(index.global_index) == expected_count
    assert index.global_index
    assert len(index.gsis) == 2  # indexing two fields
    assert index.gsis["title"]
    assert len(index.uuids_to_position_in_global_index) == expected_count
23 | 


--------------------------------------------------------------------------------
/tests/fixtures/documents_with_varied_data_types.json:
--------------------------------------------------------------------------------
 1 | [
 2 |     {"title": "tolerate it", "lyric": "I made you my temple, my mural, my sky", "listens": 100, "album_in_stock": true, "rating": 4.7, "metadata": {"version": 0}, "record_last_updated": "2023-01-01"},
 3 |     {
 4 |         "title": "my tears ricochet",
 5 |         "lyric": "And I still talk to you when I'm screaming at the sky",
 6 |         "listens": 200,
 7 |         "album_in_stock": true,
 8 |         "rating": 4.7, "metadata": {"version": 0}, "record_last_updated": "2024-01-01"
 9 |     },
10 |     {"title": "The Bolter", "lyric": "Started with a kiss", "listens": 300, "album_in_stock": false, "rating": 4.9, "metadata": {"version": 0}, "record_last_updated": "2024-04-01"}
11 | ]


--------------------------------------------------------------------------------
/docs/pages/templates/add.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | title: Add a Document
 4 | permalink: /add/
 5 | ---
 6 | 
 7 | To add documents to a database, use the following code:
 8 | 
 9 | <pre><code class="language-python">
10 | index.add({"title": "tolerate it", "artist": "Taylor Swift"})
11 | index.add({"title": "betty", "artist": "Taylor Swift"})
12 | </code></pre>
13 | 
14 | Values within documents can have the following data types:
15 | 
16 | - String
17 | - Integer
18 | - Float
19 | - List
20 | - Dictionary
21 | 
22 | When documents are added, a `uuid` key is added for use in uniquely identifying the document.
23 | 
24 | <div class="warning">
25 |     Dictionaries are not indexable. You can store dictionaries and they will be returned in payloads, but you cannot run search operations on them.
26 | </div>


--------------------------------------------------------------------------------
/docs/pages/templates/update.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | title: Update a Document
 4 | permalink: /update/
 5 | ---
 6 | 
 7 | 
 8 | You need a document UUID to update a document. You can retrieve a UUID by searching for a document.
 9 | 
10 | Here is an example showing how to update a document:
11 | 
12 | ```python
13 | response = index.search(
14 |     {
15 |         "query": {"title": {"equals": "tolerate it"}},
16 |         "limit": 10,
17 |         "sort_by": "title",
18 |     }
19 | )
20 | 
21 | uuid = response["documents"][0]["uuid"]
22 | 
23 | index.update(uuid, {"title": "tolerate it (folklore)", "artist": "Taylor Swift"})
24 | ```
25 | 
26 | `update` is an override operation. This means you must provide the full document that you want to save, instead of only the fields you want to update.
27 | 


--------------------------------------------------------------------------------
/docs/pages/templates/highlight.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | title: Highlight Results
 4 | permalink: /highlight/
 5 | ---
 6 | 
 7 | You can extract context around results. This data can be used to show a snippet of the document that contains the query term.
 8 | 
 9 | Here is an example of a query that highlights context around all instances of the term "sky" in the `lyric` field:
10 | 
11 | ```python
12 | query = {
13 |     "query": {
14 |         "lyric": {
15 |             "contains": "sky",
16 |             "highlight": True,
17 |             "highlight_stride": 3
18 |         }
19 |     }
20 | }
21 | ```
22 | 
23 | `highlight_stride` states how many words to retrieve before and after the match.
24 | 
25 | All documents returned by this query will have a `_context` key that contains the context around all instances of the term "sky".


--------------------------------------------------------------------------------
/docs/pages/templates/code-search.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | title: Code Search
 4 | permalink: /code-search/
 5 | ---
 6 | 
 7 | You can use JameSQL to efficiently search through code.
 8 | 
 9 | To do so, first create a `TRIGRAM_CODE` index on the field you want to search.
10 | 
11 | When you add documents, include at least the following two fields:
12 | 
13 | - `file_name`: The name of the file the code is in.
14 | - `code`: The code you want to index.
15 | 
16 | When you search for code, all matching documents will have a `_context` key with the following structure:
17 | 
18 | <pre><code class="language-python">
19 | {
20 |     "line": "1",
21 |     "code": "..."
22 | }
23 | </code></pre>
24 | 
25 | This tells you on what line your search matched, and the code that matched. This information is ideal to highlight specific lines relevant to your query.


--------------------------------------------------------------------------------
/docs/pages/templates/aggregate-metrics.md:
--------------------------------------------------------------------------------
 1 | 
 2 | You can find the total number of unique values for the fields returned by a query using an `aggregate` query. This is useful for presenting the total number of options available in a search space to a user.
 3 | 
 4 | You can use the following query to find the total number of unique values for all fields in documents whose `lyric` field contains the term "sky":
 5 | 
 6 | <pre><code class="language-python">
 7 | query = {
 8 |     "query": {
 9 |         "lyric": {
10 |             "contains": "sky"
11 |         }
12 |     },
13 |     "metrics": ["aggregate"]
14 | }
15 | </code></pre>
16 | 
17 | The aggregate results are presented in a `unique_record_values` key with the following structure:
18 | 
19 | <pre><code class="language-python">
20 | {
21 |     "documents": [...],
22 |     "query_time": 0.0001,
23 |     "unique_record_values": {"title": 2, "lyric": 2, "listens": 2, "categories": 3}
24 | }
25 | </code></pre>


--------------------------------------------------------------------------------
/docs/pages/templates/delete.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | title: Delete a Document
 4 | permalink: /delete/
 5 | ---
 6 | 
 7 | You need a document UUID to delete a document. You can retrieve a UUID by searching for a document.
 8 | 
 9 | Here is an example showing how to delete a document:
10 | 
11 | <pre><code class="language-python">
12 | response = index.search(
13 |     {
14 |         "query": {"title": {"equals": "tolerate it"}},
15 |         "limit": 10,
16 |         "sort_by": "title",
17 |     }
18 | )
19 | 
20 | uuid = response["documents"][0]["uuid"]
21 | 
22 | index.remove(uuid)
23 | </code></pre>
24 | 
25 | You can validate the document has been deleted using this code:
26 | 
27 | <pre><code class="language-python">
28 | response = index.search(
29 |     {
30 |         "query": {"title": {"equals": "tolerate it"}},
31 |         "limit": 10,
32 |         "sort_by": "title",
33 |     }
34 | )
35 | 
36 | assert len(response["documents"]) == 0
37 | </code></pre>


--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | name: JameSQL Test Workflow (macOS and Ubuntu)
 2 | 
 3 | on:
 4 |   pull_request:
 5 |     branches: [main]
 6 |   push:
 7 |     branches: [main]
 8 | 
 9 | jobs:
10 |   build-dev-test:
11 |     runs-on: ${{ matrix.os }}
12 |     strategy:
13 |       matrix:
14 |         os: ["ubuntu-latest", "macos-latest"]
15 |         python-version: ["3.10", "3.11", "3.12", "3.13"]
16 |     steps:
17 |       - name: 🛎️ Checkout
18 |         uses: actions/checkout@v4
19 |       - name: 🐍 Set up Python ${{ matrix.python-version }}
20 |         uses: actions/setup-python@v5
21 |         with:
22 |           python-version: ${{ matrix.python-version }}
23 |           check-latest: true
24 | 
25 |       - name: 📦 Install dependencies
26 |         run: |
27 |           python -m pip install --upgrade pip
28 |           pip install -e .
29 |           pip install -e .[dev]
30 |       - name: 🧪 Test
31 |         run: |
32 |           python -m pytest tests/*.py
33 | 


--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
 1 | name: Publish Workflow
 2 | 
 3 | on:
 4 |   release:
 5 |     types: [created]
 6 | 
 7 | jobs:
 8 |   build:
 9 |     runs-on: ubuntu-latest
10 |     strategy:
11 |       matrix:
12 |         python-version: [3.12]
13 |     steps:
14 |       - name: 🛎️ Checkout
15 |         uses: actions/checkout@v4
16 |         with:
17 |           ref: ${{ github.head_ref }}
18 |       - name: 🐍 Set up Python ${{ matrix.python-version }}
19 |         uses: actions/setup-python@v5
20 |         with:
21 |           python-version: ${{ matrix.python-version }}
22 |       - name: 🦾 Install dependencies
23 |         run: |
24 |           python -m pip install --upgrade pip twine wheel
25 |       - name: 🚀 Publish to PyPi
26 |         env:
27 |           PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }}
28 |           PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
29 |         run: |
30 |           python setup.py sdist bdist_wheel
31 |           twine check dist/*
32 |           twine upload dist/* -u ${PYPI_USERNAME} -p ${PYPI_PASSWORD} --verbose
33 | 


--------------------------------------------------------------------------------
/.github/workflows/benchmark.yml:
--------------------------------------------------------------------------------
 1 | name: JameSQL Benchmark Workflow
 2 | 
 3 | on:
 4 |   pull_request:
 5 |     branches: [main]
 6 |   push:
 7 |     branches: [main]
 8 | 
 9 | jobs:
10 |   build-dev-test:
11 |     runs-on: ${{ matrix.os }}
12 |     strategy:
13 |       matrix:
14 |         os: ["ubuntu-latest", "macos-latest"]
15 |         python-version: ["3.10", "3.11", "3.12", "3.13"]
16 |     steps:
17 |       - name: 🛎️ Checkout
18 |         uses: actions/checkout@v4
19 |       - name: 🐍 Set up Python ${{ matrix.python-version }}
20 |         uses: actions/setup-python@v5
21 |         with:
22 |           python-version: ${{ matrix.python-version }}
23 |           check-latest: true
24 | 
25 |       - name: 📦 Install dependencies
26 |         run: |
27 |           python -m pip install --upgrade pip
28 |           pip install -e .
29 |           pip install -e .[dev]
30 | 
31 |       - name: 🧪 Run benchmark stress test
32 |         env:
33 |           SITE_ENV: production
34 |         run: |
35 |           python -m pytest ./tests/*.py --benchmark
36 |           python -m pytest ./tests/*.py --long-benchmark
37 |           
38 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 James
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/.github/workflows/windows.yml:
--------------------------------------------------------------------------------
 1 | name: JameSQL Test Workflow (Windows)
 2 | 
 3 | on:
 4 |   pull_request:
 5 |     branches: [main]
 6 |   push:
 7 |     branches: [main]
 8 | 
 9 | jobs:
10 |   build-dev-test:
11 |     runs-on: ${{ matrix.os }}
12 |     strategy:
13 |       matrix:
14 |         os: ["windows-latest"]
15 |         python-version: ["3.10", "3.11", "3.12", "3.13"]
16 |     steps:
17 |       - name: 🛎️ Checkout
18 |         uses: actions/checkout@v4
19 |       - name: 🐍 Set up Python ${{ matrix.python-version }}
20 |         uses: actions/setup-python@v5
21 |         with:
22 |           python-version: ${{ matrix.python-version }}
23 |           check-latest: true
24 | 
25 |       - name: 📦 Install dependencies
26 |         run: |
27 |           python -m pip install --upgrade pip
28 |           pip install -e .
29 |           pip install -e .[dev]
30 |       - name: 🧪 Test
31 |         run: |
32 |           python -m pytest tests/aggregation.py tests/data_types.py tests/group_by.py tests/highlight.py tests/range_queries.py tests/save_and_load.py tests/script_lang.py tests/string_queries_categorical_and_range.py tests/string_query.py tests/test.py
33 | 


--------------------------------------------------------------------------------
/.github/workflows/documentation.yml:
--------------------------------------------------------------------------------
 1 | name: Publish Documentation
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - main
 7 | 
 8 | jobs:
 9 |   build:
10 |     runs-on: ubuntu-latest
11 | 
12 |     steps:
13 |       - name: Checkout code
14 |         uses: actions/checkout@v4
15 | 
16 |       - name: Set up Python
17 |         uses: actions/setup-python@v5
18 |         with:
19 |           python-version: '3.13'
20 |           check-latest: true
21 |           
22 |       - name: Install dependencies
23 |         run: |
24 |           python -m pip install --upgrade pip
25 |           python -m pip install pygments bs4 lxml
26 |           python -m pip install git+https://github.com/capjamesg/aurora
27 |           cd docs
28 |       - name: Build main site
29 |         env:  
30 |           SITE_ENV: ${{ secrets.SITE_ENV }}  
31 |         run: |
32 |           cd docs
33 |           aurora build
34 |       - name: rsync deployments
35 |         uses: burnett01/rsync-deployments@7.0.1
36 |         with:
37 |           switches: -avzr
38 |           path: "docs/_site/*"
39 |           remote_path: ${{ secrets.REMOTE_PATH }}
40 |           remote_host: ${{ secrets.SERVER_HOST }}
41 |           remote_user: ${{ secrets.SERVER_USERNAME }}
42 |           remote_key: ${{ secrets.KEY }}
43 | 


--------------------------------------------------------------------------------
/tests/concurrency.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import random
 3 | import threading
 4 | 
 5 | from jamesql import JameSQL
 6 | from jamesql.index import GSI_INDEX_STRATEGIES
 7 | 
 8 | 
 9 | def test_threading():
10 |     with open("tests/fixtures/documents.json") as f:
11 |         documents = json.load(f)
12 | 
13 |     index = JameSQL()
14 | 
15 |     index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
16 |     index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
17 | 
18 |     for document in documents * 100:
19 |         document = document.copy()
20 |         index.add(document, doc_id=str(random.randint(0, 1000000)))
21 | 
22 |     def query(i):
23 |         if i == 0:
24 |             document = documents[0].copy()
25 |             document["title"] = "teal"
26 |             index.add(document, "xyz")
27 |             index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
28 | 
29 |         assert len(index.string_query_search("teal")["documents"]) == 1
30 | 
31 |     threads = []
32 | 
33 |     for i in range(2500):
34 |         t = threading.Thread(target=query, args=(i,))
35 |         threads.append(t)
36 |         t.start()
37 | 
38 |     for t in threads:
39 |         t.join()
40 | 
41 |     assert len(index.global_index) == 301
42 |     assert index.global_index["xyz"]["title"] == "teal"
43 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import setuptools
 2 | from setuptools import find_packages
 3 | import re
 4 | 
 5 | with open("./jamesql/__init__.py", 'r') as f:
 6 |     content = f.read()
 7 |     version = re.search(r'__version__\s*=\s*[\'"]([^\'"]*)[\'"]', content).group(1)
 8 |     
 9 | with open("README.md", "r") as fh:
10 |     long_description = fh.read()
11 | 
12 | setuptools.setup(
13 |     name="jamesql",
14 |     version=version,
15 |     author="capjamesg",
16 |     author_email="jamesg@jamesg.blog",
17 |     description="A JameSQL database implemented in Python.",
18 |     long_description=long_description,
19 |     long_description_content_type="text/markdown",
20 |     url="https://github.com/capjamesg/jamesql",
21 |     install_requires=[
22 |         "pybmoore",
23 |         "pygtrie",
24 |         "lark",
25 |         "btrees",
26 |         "nltk",
27 |         "sortedcontainers"
28 |     ],
29 |     packages=find_packages(exclude=("tests",)),
30 |     extras_require={
31 |         "dev": ["flake8", "black==22.3.0", "isort", "twine", "pytest", "wheel", "flask", "orjson", "tqdm", "deepdiff"],
32 |     },
33 |     classifiers=[
34 |         "Programming Language :: Python :: 3",
35 |         "License :: OSI Approved :: MIT License",
36 |         "Operating System :: OS Independent",
37 |     ],
38 |     python_requires=">=3.7",
39 | )
40 | 


--------------------------------------------------------------------------------
/web/landing.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en">
 3 | <head>
 4 |     <meta charset="UTF-8">
 5 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
 6 |     <title>Document</title>
 7 |     <style>
 8 |         :root {
 9 |             --step--2: clamp(0.6944rem, 0.6856rem + 0.0444vw, 0.72rem);
10 |             --step--1: clamp(0.8333rem, 0.8101rem + 0.1159vw, 0.9rem);
11 |             --step-0: clamp(1rem, 0.9565rem + 0.2174vw, 1.125rem);
12 |             --step-1: clamp(1.2rem, 1.1283rem + 0.3587vw, 1.4063rem);
13 |             --step-2: clamp(1.44rem, 1.3295rem + 0.5527vw, 1.7578rem);
14 |             --step-3: clamp(1.728rem, 1.5648rem + 0.8161vw, 2.1973rem);
15 |             --step-4: clamp(2.0736rem, 1.8395rem + 1.1704vw, 2.7466rem);
16 |             --step-5: clamp(2.4883rem, 2.1597rem + 1.6433vw, 3.4332rem);
17 |             --primary-color: #ff9950;
18 |         }
19 |         body, html {
20 |             padding: 0;
21 |             font-size: var(--step-0);
22 |             font-family: 'Helvetica', sans-serif;
23 |         }
24 |         html {
25 |             border-top: 0.25em solid var(--primary-color);
26 |         }
27 |         main {
28 |             margin: auto;
29 |             max-width: 40em;
30 |         }
31 |     </style>
32 | </head>
33 | <body>
34 |     <main>
35 |         <h1>JameSQL</h1>
36 |         <p>Fast, in-memory search.</p>
37 |     </main>
38 | </body>
39 | </html>


--------------------------------------------------------------------------------
/jamesql/query_simplifier.py:
--------------------------------------------------------------------------------
 1 | def normalize_operator_query(t):
 2 |     if isinstance(t, str):
 3 |         return t
 4 | 
 5 |     return "_".join(t)
 6 | 
 7 | 
 8 | def simplifier(terms):
 9 |     new_terms = []
10 |     outer_terms = set()
11 |     to_remove = set()
12 | 
13 |     for i, t in enumerate(terms):
14 |         if isinstance(t, str) and t not in outer_terms:
15 |             outer_terms.add(t)
16 |             new_terms.append(t)
17 | 
18 |     for i, t in enumerate(terms):
19 |         normalized_terms = normalize_operator_query(t)
20 |         if isinstance(t, list) and t[1] == "OR":
21 |             for inner_term in t:
22 |                 if inner_term == "OR":
23 |                     continue
24 | 
25 |                 if inner_term not in outer_terms:
26 |                     outer_terms.add(inner_term)
27 |                     new_terms.append(inner_term)
28 |         elif (
29 |             isinstance(t, list)
30 |             and t[1] == "AND"
31 |             and normalized_terms not in outer_terms
32 |         ):
33 |             new_terms.append(t[0])
34 |             new_terms.append("AND")
35 |             new_terms.append(t[2])
36 |         elif (
37 |             isinstance(t, list)
38 |             and t[0] == "NOT"
39 |             and normalized_terms not in outer_terms
40 |         ):
41 |             new_terms.append("-" + t[1])
42 | 
43 |             if t[1] in outer_terms:
44 |                 to_remove.add(t[1])
45 |                 to_remove.add("-" + t[1])
46 | 
47 |     return [i for i in new_terms if normalize_operator_query(i) not in to_remove]
48 | 


--------------------------------------------------------------------------------
/docs/pages/templates/quickstart.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | title: Quickstart
 4 | permalink: /quickstart/
 5 | ---
 6 | 
 7 | <p>You can create a JameSQL database in five lines of code.</p>
 8 | 
 9 | <h2>Install JameSQL</h2>
10 | 
11 | First, install JameSQL:
12 | 
13 | <pre>
14 | pip install jamesql
15 | </pre>
16 | 
17 | <h2>Insert Records</h2>
18 | 
19 | Then, create a new Python file and add the following code:
20 | 
21 | <pre><code class="language-python">
22 | from jamesql import JameSQL, GSI_INDEX_STRATEGIES
23 | 
24 | index = JameSQL.load()
25 | 
26 | index.add({"title": "tolerate it", "lyric": "Use my best colors for your portrait"})
27 | </code></pre>
28 | 
29 | <h2>Create an Index</h2>
30 | 
31 | For efficient data retrieval for longer pieces of text in the `lyric` key, we are going to use the `CONTAINS` index type. This creates a reverse index for each word in the text.
32 | 
33 | <pre><code class="language-python">
34 | index.create_gsi("lyric", GSI_INDEX_STRATEGIES.CONTAINS)
35 | </code></pre>
36 | 
37 | <h2>Search the Database</h2>
38 | 
39 | We can search the database using the following code:
40 | 
41 | <pre><code class="language-python">
42 | results = index.string_query_search("title:'tolerate it' colors")
43 | 
44 | print(results)
45 | </code></pre>
46 | 
47 | Our code returns:
48 | 
49 | <pre><code class="language-python">
50 | {"documents": [{"title": "tolerate it", "lyric": "Use my best colors for your portrait" …}]}
51 | </code></pre>
52 | 
53 | We have successfully built a database!
54 | 
55 | <footer>
56 | Next up: <a href="{{ site.root_url }}/index/">Learn how to create indices →</a>
57 | </footer>


--------------------------------------------------------------------------------
/tests/fixtures/code/simplifier.py:
--------------------------------------------------------------------------------
 1 | def normalize_operator_query(t):
 2 |     if isinstance(t, str):
 3 |         return t
 4 | 
 5 |     return "_".join(t)
 6 | 
 7 | 
 8 | def simplifier(terms):
 9 |     new_terms = []
10 |     outer_terms = set()
11 |     to_remove = set()
12 | 
13 |     for i, t in enumerate(terms):
14 |         if isinstance(t, str) and t not in outer_terms:
15 |             outer_terms.add(t)
16 |             new_terms.append(t)
17 | 
18 |     for i, t in enumerate(terms):
19 |         normalized_terms = normalize_operator_query(t)
20 |         if isinstance(t, list) and t[1] == "OR":
21 |             for inner_term in t:
22 |                 if inner_term == "OR":
23 |                     continue
24 | 
25 |                 if inner_term not in outer_terms:
26 |                     outer_terms.add(inner_term)
27 |                     new_terms.append(inner_term)
28 |         elif (
29 |             isinstance(t, list)
30 |             and t[1] == "AND"
31 |             and normalized_terms not in outer_terms
32 |         ):
33 |             new_terms.append(t)
34 |             outer_terms.add(normalized_terms)
35 |             if t[0] in outer_terms:
36 |                 to_remove.add(t[0])
37 |             if t[2] in outer_terms:
38 |                 to_remove.add(t[2])
39 |         elif (
40 |             isinstance(t, list)
41 |             and t[0] == "NOT"
42 |             and normalized_terms not in outer_terms
43 |         ):
44 |             if t[1] in outer_terms:
45 |                 to_remove.add(t[1])
46 | 
47 |     return [i for i in new_terms if normalize_operator_query(i) not in to_remove]
48 | 


--------------------------------------------------------------------------------
/docs/pages/templates/ranking.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | permalink: /ranking/
 4 | title: Document Ranking
 5 | ---
 6 | 
 7 | By default, documents are returned in no particular order. If you provide a `sort_by` field, documents are sorted by that field.
 8 | 
 9 | For more advanced ranking, you can use the `boost` feature. This feature lets you boost the value of a field in a document to calculate a final score.
10 | 
11 | The default score for each field is `1`.
12 | 
13 | To use this feature, you must use `boost` on fields that have an index.
14 | 
15 | Here is an example of a query that uses the `boost` feature:
16 | 
17 | ```python
18 | {
19 |     "query": {
20 |         "or": {
21 |             "post": {
22 |                 "contains": "taylor swift",
23 |                 "strict": False,
24 |                 "boost": 1
25 |             },
26 |             "title": {
27 |                 "contains": "desk",
28 |                 "strict": True,
29 |                 "boost": 25
30 |             }
31 |         }
32 |     },
33 |     "limit": 4,
34 |     "sort_by": "_score",
35 | }
36 | ```
37 | 
38 | This query would search for documents whose `post` field contains `taylor swift` or whose `title` field contains `desk`. The `title` field is boosted by 25, so documents that match the `title` field are ranked higher.
39 | 
40 | The score for each document before boosting is equal to the number of times the query condition is satisfied. For example, if a post contains `taylor swift` twice, the score for that document is `2`; if a title contains `desk` once, the score for that document is `1`.
41 | 
42 | Documents are then ranked in decreasing order of score.


--------------------------------------------------------------------------------
/docs/pages/templates/index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | title: JameSQL
 4 | permalink: /
 5 | ---
 6 | 
 7 | An in-memory, NoSQL database implemented in Python, with support for building custom ranking algorithms.
 8 | 
 9 | You can run full text search queries on thousands of documents with multiple fields in < 1ms.
10 | 
11 | ## Demo
12 | 
13 | [Try a site search engine built with JameSQL](https://jamesg.blog/search-pages/).
14 | 
15 | <video autoplay loop muted playsinline>
16 |     <source src="https://private-user-images.githubusercontent.com/37276661/377151826-f1bf931d-6601-4fc8-b43c-d284853bce8f.mov?jwt=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbSIsImtleSI6ImtleTUiLCJleHAiOjE3MzA4MTA0MTUsIm5iZiI6MTczMDgxMDExNSwicGF0aCI6Ii8zNzI3NjY2MS8zNzcxNTE4MjYtZjFiZjkzMWQtNjYwMS00ZmM4LWI0M2MtZDI4NDg1M2JjZThmLm1vdj9YLUFtei1BbGdvcml0aG09QVdTNC1ITUFDLVNIQTI1NiZYLUFtei1DcmVkZW50aWFsPUFLSUFWQ09EWUxTQTUzUFFLNFpBJTJGMjAyNDExMDUlMkZ1cy1lYXN0LTElMkZzMyUyRmF3czRfcmVxdWVzdCZYLUFtei1EYXRlPTIwMjQxMTA1VDEyMzUxNVomWC1BbXotRXhwaXJlcz0zMDAmWC1BbXotU2lnbmF0dXJlPWQxOTU1ZThlNjhjYjVmNTYwYmUyODdjOTQ3MzU5OGFiOGI1MWU1ODE0OWRlMDRmOTM1M2I5YzJmMTQxZWI5ZmUmWC1BbXotU2lnbmVkSGVhZGVycz1ob3N0In0.2GBymAKR-6lGJskHKo7CslvuiR8jaDfR4hn2EA56MVQ" type="video/mp4">
17 | </video>
18 | 
19 | ## Ideal use case
20 | 
21 | JameSQL is designed for small-scale search projects where objects can easily be loaded into and stored in memory.
22 | 
23 | James uses it for his [personal website search engine](https://jamesg.blog/search-pages/), which indexes 1,000+ documents (500,000+ words).
24 | 
25 | On James' search engine, results are computed in < 10ms and returned to a client in < 70ms.


--------------------------------------------------------------------------------
/tests/fixtures/code/simplifier_demo.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | import os
 3 | from collections import defaultdict
 4 | 
 5 | 
 6 | def get_trigrams(line):
 7 |     return [line[i : i + 3] for i in range(len(line) - 2)]
 8 | 
 9 | 
# Trigram -> list of (file path, line number) postings.
index = defaultdict(list)

# Index every Markdown file found under DIR, line by line.
DIR = "./pages/posts/"
id2line = {}
doc_lengths = {}

for root, dirs, files in os.walk(DIR):
    for file_name in files:
        if not file_name.endswith(".md"):
            continue

        # Postings are keyed by the path string rather than the (closed) file
        # handle, so they are hashable by value and safe to sort and print.
        path = os.path.join(root, file_name)
        with open(path, "r") as handle:
            code = handle.read()

        code_lines = code.split("\n")

        for line_num, line in enumerate(code_lines):
            # Record every line, including ones too short to yield a trigram,
            # so context lines can always be looked up when printing.
            id2line[f"{path}:{line_num}"] = line

            for trigram in get_trigrams(line):
                index[trigram].append((path, line_num))

        doc_lengths[path] = len(code_lines)

query = "coffee"
context = 0

trigrams = get_trigrams(query)

# A line is a candidate when it contains every trigram of the query. Queries
# shorter than three characters have no trigrams and therefore match nothing.
if trigrams:
    candidates = set(index.get(trigrams[0], ()))
    for trigram in trigrams[1:]:
        candidates &= set(index.get(trigram, ()))
else:
    candidates = set()

# Sorted so the output order is the same on every run.
for path, line_num in sorted(candidates):
    print(f"{path}:{line_num}")
    for i in range(
        max(0, line_num - context), min(doc_lengths[path], line_num + context + 1)
    ):
        line = id2line[f"{path}:{i}"]
        print(f"{i}: {line}")

    print()
57 | 


--------------------------------------------------------------------------------
/docs/pages/templates/search.html:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | permalink: /search-pages/
 4 | title: Search
 5 | notoc: true
 6 | ---
 7 | 
 8 | <p><span id="results-count"></span> search results for "<span id="query"></span>"</p>
 9 | 
10 | <ul class="search">
11 | </ul>
12 | 
13 | <script>
14 |     // get ?query= arg and make search
15 |     if (window.location.search.includes("?q=")) {
16 |         var query = window.location.search.split("?q=")[1];
17 |         var queryElement = document.getElementById("query");
18 |         var searchElement = document.getElementById("search-input");
19 |         searchElement.value = query;
20 |         queryElement.innerText = query;
21 |         var search = document.getElementsByClassName("search")[0];
22 |         // make request to localhost:5000/search?q=query
23 |         fetch("/search?q=" + query)
24 |             .then(response => response.json())
25 |             .then(data => {
26 |                 var resultsCount = document.getElementById("results-count");
27 |                 resultsCount.innerText = data.documents.length;
28 |                 data.documents.forEach(result => {
29 |                     var li = document.createElement("li");
30 |                     var a = document.createElement("a");
31 |                     var h3 = document.createElement("h2");
32 |                     var p = document.createElement("p");
33 |                     a.href = result.url;
34 |                     h3.innerText = result.title;
35 |                     p.innerHTML = result.raw_content;
36 |                     a.appendChild(h3);
37 |                     a.appendChild(p);
38 |                     li.appendChild(a);
39 |                     search.appendChild(li);
40 |                 });
41 |             });
42 |     }
43 | </script>


--------------------------------------------------------------------------------
/schema.py:
--------------------------------------------------------------------------------
from __future__ import annotations

from enum import Enum
from typing import Dict, Optional, Union

from pydantic import BaseModel, ConfigDict, model_validator
 7 | 
 8 | VALID_QUERY_TYPES = ["contains", "equals", "starts_with"]
 9 | VALID_OPERATOR_QUERY_TYPES = ["or", "and"]
10 | 
11 | 
class QueryType(str, Enum):
    """Names of the condition keys a field query can use; each member is also a ``str``."""

    contains = "contains"
    equals = "equals"
    starts_with = "starts_with"
16 | 
17 | 
class AndOperatorQueryType(str, Enum):
    """Single-member enum whose only value is the ``"and"`` operator key."""

    and_ = "and"
20 | 
21 | 
class OrOperatorQueryType(str, Enum):
    """Single-member enum whose only value is the ``"or"`` operator key."""

    or_ = "or"
24 | 
25 | 
class QueryItem(BaseModel):
    """Condition applied to one field, together with its matching options.

    At most one of ``contains``, ``equals`` and ``starts_with`` may be set.
    Keys other than the fields declared here are rejected
    (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")

    contains: Optional[str] = None
    equals: Optional[str] = None
    starts_with: Optional[str] = None

    strict: Optional[bool] = False
    boost: Optional[int] = 1

    # ensure that only one of the query types is used
    @model_validator(mode="after")
    def validate_query_type(self):
        """Raise ``ValueError`` when more than one query type is provided."""
        query_types = [self.contains, self.equals, self.starts_with]

        # Compare against None rather than truthiness so that an empty string
        # still counts as a provided query type.
        if sum(qt is not None for qt in query_types) > 1:
            raise ValueError("Only one query type can be used")

        return self
45 | 
46 | 
class RootQuery(BaseModel):
    """Top-level search request: the query itself plus result options."""

    # Accepted shapes, tried in this order:
    #   {"and": {field: QueryItem}}, {"or": {field: QueryItem}}, {field: QueryItem}
    # Spelled with Union rather than "|" so the annotation can still be
    # evaluated at runtime on Python versions older than 3.10.
    query: Union[
        Dict[AndOperatorQueryType, Dict[str, QueryItem]],
        Dict[OrOperatorQueryType, Dict[str, QueryItem]],
        Dict[str, QueryItem],
    ]
    limit: Optional[int] = 10
    sort_by: Optional[str] = "score"
55 | 
56 | 
# Example request: match either field, with a larger boost on the title.
query = dict(
    query={
        "or": dict(
            post=dict(contains="taylor swift", strict=False, boost=1),
            title=dict(contains="my desk", strict=True, boost=25),
        )
    },
    limit=4,
    sort_by="score",
)

# Validate the example request and print the parsed model.
print(RootQuery(**query))
70 | 


--------------------------------------------------------------------------------
/jamesql/script_lang.py:
--------------------------------------------------------------------------------
 1 | import datetime
 2 | import math
 3 | 
 4 | from lark import Transformer
 5 | 
 6 | grammar = """
 7 | start: query
 8 | 
 9 | query: decay | "(" query OPERATOR query ")" | logarithm | FLOAT | WORD
10 | logarithm: LOGARITHM "(" query ")"
11 | OPERATOR: "+" | "-" | "*" | "/"
12 | LOGARITHM: "log"
13 | decay: "decay" WORD
14 | 
15 | WORD: /[a-zA-Z0-9_]+/
16 | FLOAT: /[0-9]+(\.[0-9]+)?/
17 | 
18 | %import common.WS
19 | %ignore WS
20 | """
21 | 
def _add(x, y):
    return x + y


def _sub(x, y):
    return x - y


def _mul(x, y):
    return x * y


def _div(x, y):
    return x / y


# Binary function to apply for each OPERATOR symbol in the grammar.
OPERATOR_METHODS = {
    "+": _add,
    "-": _sub,
    "*": _mul,
    "/": _div,
}
28 | 
29 | 
class JameSQLScriptTransformer(Transformer):
    """Evaluate a parsed script-score expression against one document.

    Each method is a Lark callback named after a rule or terminal in
    ``grammar``. The value produced for ``start`` is the result of the script.
    """

    def __init__(self, document):
        # Let the Lark base class set up its own state first.
        super().__init__()
        # Looked up by WORD to turn a field name into that field's value.
        self.document = document

    def query(self, items):
        """Return a lone value as-is, or apply ``left OPERATOR right``."""
        if len(items) == 1:
            return items[0]

        left = items[0]
        operator = items[1]
        right = items[2]

        operator_command = OPERATOR_METHODS[operator]

        return operator_command(left, right)

    def logarithm(self, items):
        """Return the natural log of the operand, offset by 0.1."""
        # items[0] is the LOGARITHM token; items[1] is the evaluated operand.
        # + 0.1 removes the possibility of log(0)
        # which would return a math domain error
        return math.log(items[1] + 0.1)

    def start(self, items):
        """Return the value of the top-level query."""
        return items[0]

    def decay(self, items):
        """Return ``1.1 ** (days_since_post / 30)`` for a timestamp value.

        ``items[0]`` is the value WORD resolved from the document; it is
        parsed with the format ``%Y-%m-%dT%H:%M:%S``.
        """
        # NOTE(review): despite the name, this factor grows with age (x1.1 for
        # every 30 days elapsed) rather than shrinking. Confirm how callers
        # combine it with the score before changing the base.
        days_since_post = (
            datetime.datetime.now()
            - datetime.datetime.strptime(items[0], "%Y-%m-%dT%H:%M:%S")
        ).days

        return 1.1 ** (days_since_post / 30)

    def WORD(self, items):
        """Return digit-only tokens as floats; otherwise look the name up in the document."""
        if items.value.isdigit():
            return float(items.value)

        return self.document[items.value]

    def FLOAT(self, items):
        """Return the token's text converted to ``float``."""
        return float(items.value)

    def OPERATOR(self, items):
        """Return the operator symbol as a plain string."""
        return items.value
75 | 


--------------------------------------------------------------------------------
/docs/pages/templates/autosuggest.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | permalink: /autosuggest/
 4 | title: Autosuggest
 5 | ---
 6 | 
 7 | You can enable autosuggest using one or more fields in an index. This can be used to efficiently find records that start with a given prefix.
 8 | 
 9 | To enable autosuggest on an index, run:
10 | 
11 | <pre><code class="language-python">
12 | index = JameSQL()
13 | 
14 | index.enable_autosuggest("field")
15 | </code></pre>
16 | 
17 | Where `field` is the name of the field on which you want to enable autosuggest.
18 | 
19 | You can enable autosuggest on multiple fields:
20 | 
21 | <pre><code class="language-python">
22 | index.enable_autosuggest("field1")
23 | index.enable_autosuggest("field2")
24 | </code></pre>
25 | 
26 | When you enable autosuggest on a field, JameSQL will create a trie index for that field. This index is used to efficiently find records that start with a given prefix.
27 | 
28 | To run an autosuggest query, use the following code:
29 | 
30 | <pre><code class="language-python">
31 | suggestions = index.autosuggest("started", match_full_record=True, limit = 1)
32 | </code></pre>
33 | 
34 | This will automatically return records that start with the prefix `started`.
35 | 
36 | The `match_full_record` parameter indicates whether to return full record names, or any records starting with a term.
37 | 
38 | `match_full_record=True` means that the full record name will be returned. This is ideal to enable selection between full records.
39 | 
40 | `match_full_record=False` means that any records starting with the term will be returned. This is ideal for autosuggesting single words.
41 | 
42 | For example, given the query `start`, matching against full records with `match_full_record=True` would return:
43 | 
44 | - `Started with a kiss`
45 | 
46 | This is the content of a full document.
47 | 
48 | `match_full_record=False`, on the other hand, would return:
49 | 
50 | - `started`
51 | - `started with a kiss`
52 | 
53 | This contains both a root word starting with `start` and full documents starting with `start`.
54 | 
55 | This feature is case insensitive.
56 | 
57 | The `limit` argument limits the number of results returned.


--------------------------------------------------------------------------------
/docs/pages/templates/spelling-correction.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | title: Spelling Correction
 4 | permalink: /spelling-correction/
 5 | ---
 6 | 
 7 | It is recommended that you check the spelling of words before you run a query. 
 8 | 
 9 | This is because correcting the spelling of a word can improve the accuracy of your search results.
10 | 
11 | ### Correcting the spelling of a single word
12 | 
13 | To recommend a spelling correction for a query, use the following code:
14 | 
15 | ```python
16 | index = ...
17 | 
18 | suggestion = index.spelling_correction("taylr swift")
19 | ```
20 | 
21 | This will return a single suggestion. The suggestion will be the word that is most likely to be the correct spelling of the word you provided.
22 | 
23 | Spelling correction first generates segmentations of a word, like:
24 | 
25 | - `t aylorswift`
26 | - `ta ylorswift`
27 | 
28 | If a segmentation is valid, it is returned.
29 | 
30 | For example, if the user types in `taylorswift`, one permutation would be segmented into `taylor swift`. If `taylor swift` is common in the index, `taylor swift` will be returned as the suggestion.
31 | 
32 | Spelling correction works by transforming the input query by inserting, deleting, and transforming one character in every position in a string. The transformed strings are then looked up in the index to find if they are present and, if so, how common they are.
33 | 
34 | The most common suggestion is then returned.
35 | 
36 | For example, if you provide the word `tayloi` and `taylor` is common in the index, the suggestion will be `taylor`.
37 | 
38 | If correction was not possible after transforming one character, correction will be attempted with two transformations given the input string.
39 | 
40 | If the word you provided is already spelled correctly, the suggestion will be the word you provided. If spelling correction is not possible (i.e. the word is too distant from any word in the index), the suggestion will be `None`.
41 | 
42 | ### Correcting a string query
43 | 
44 | If you are correcting a string query submitted with the `string_query_search()` function, spelling will be automatically corrected using the algorithm above. No configuration is required.


--------------------------------------------------------------------------------
/docs/pages/templates/conditions/operators.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: Operators
 3 | layout: default
 4 | permalink: /conditions/operators
 5 | ---
 6 | 
 7 | There are three operators you can use for condition matching:
 8 | 
 9 | - `equals`
10 | - `contains`
11 | - `starts_with`
12 | 
13 | Here is an example of a query that searches for documents that have the `artist` field set to `Taylor Swift`:
14 | 
15 | ```python
16 | query = {
17 |     "query": {
18 |         "artist": {
19 |             "equals": "Taylor Swift"
20 |         }
21 |     }
22 | }
23 | ```
24 | 
25 | These operators can be used with three query types:
26 | 
27 | - `and`
28 | - `or`
29 | - `not`
30 | 
31 | ### and
32 | 
33 | You can also search for documents that have the `artist` field set to `Taylor Swift` and the `title` field set to `tolerate it`:
34 | 
35 | ```python
36 | query = {
37 |     "query": {
38 |         "and": [
39 |             {
40 |                 "artist": {
41 |                     "equals": "Taylor Swift"
42 |                 }
43 |             },
44 |             {
45 |                 "title": {
46 |                     "equals": "tolerate it"
47 |                 }
48 |             }
49 |         ]
50 |     }
51 | }
52 | ```
53 | 
54 | ### or
55 | 
56 | Use `or` to match documents that satisfy at least one of the given conditions. You can also nest conditions to create complex queries, like:
57 | 
58 | ```python
59 | query = {
60 |     "query": {
61 |         "or": {
62 |             "and": [
63 |                 {"title": {"starts_with": "tolerate"}},
64 |                 {"title": {"contains": "it"}},
65 |             ],
66 |             "lyric": {"contains": "kiss"},
67 |         }
68 |     },
69 |     "limit": 2,
70 |     "sort_by": "title",
71 | }
72 | ```
73 | 
74 | This will return a list of documents that match the query.
75 | 
76 | ### not
77 | 
78 | You can search for documents that do not match a query by using the `not` operator. Here is an example of a query that searches for lyrics that contain `sky` but not `kiss`:
79 | 
80 | ```python
81 | query = {
82 |     "query": {
83 |         "and": {
84 |             "or": [
85 |                 {"lyric": {"contains": "sky", "boost": 3}},
86 |             ],
87 |             "not": {"lyric": {"contains": "kiss"}},
88 |         }
89 |     },
90 |     "limit": 10,
91 |     "sort_by": "title",
92 | }
93 | ```


--------------------------------------------------------------------------------
/docs/pages/templates/string-queries.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | title: String Queries
 4 | permalink: /string-query/
 5 | ---
 6 | 
 7 | JameSQL supports string queries. String queries are single strings that use special syntax to assert the meaning of parts of a string.
 8 | 
 9 | For example, you could use the following query to find documents where the `title` field contains `tolerate it` and any field contains `mural`:
10 | 
11 | <pre>
12 | title:"tolerate it" mural
13 | </pre>
14 | 
15 | The following operators are supported:
16 | 
17 | <table>
18 |     <thead>
19 |         <tr>
20 |             <th>Operator</th>
21 |             <th>Description</th>
22 |         </tr>
23 |     </thead>
24 |     <tbody>
25 |         <tr>
26 |             <td><code>-term</code></td>
27 |             <td>Search for documents that do not contain <code>term</code>.</td>
28 |         </tr>
29 |         <tr>
30 |             <td><code>term</code></td>
31 |             <td>Search for documents that contain <code>term</code>.</td>
32 |         </tr>
33 |         <tr>
34 |             <td><code>term1 term2</code></td>
35 |             <td>Search for documents that contain <code>term1</code> and <code>term2</code>.</td>
36 |         </tr>
37 |         <tr>
38 |             <td><code>'term1 term2'</code></td>
39 |             <td>Search for the literal phrase <code>term1 term2</code> in documents.</td>
40 |         </tr>
41 |         <tr>
42 |             <td><code>field:'term'</code></td>
43 |             <td>Search for documents where the <code>field</code> field contains <code>term</code> (e.g. <code>title:"tolerate it"</code>).</td>
44 |         </tr>
45 |         <tr>
46 |             <td><code>field^2 term</code></td>
47 |             <td>Boost the score of documents where the <code>field</code> field matches the query <code>term</code> by <code>2</code>.</td>
48 |         </tr>
49 |     </tbody>
50 | </table>
51 | 
52 | This feature turns a string query into a JameSQL query, which is then executed and the results returned.
53 | 
54 | To run a string query, use the following code:
55 | 
56 | ```python
57 | results = index.string_query_search("title:'tolerate it' mural")
58 | ```
59 | 
60 | When you run a string query, JameSQL will attempt to simplify the query to make it more efficient. For example, if you search for `-sky sky mural`, the query will be simplified to `mural` because `-sky` cancels out the `sky` term.
61 | 


--------------------------------------------------------------------------------
/docs/hooks.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from pygments import highlight
 3 | from pygments.lexers import PythonLexer, HtmlLexer
 4 | from pygments.formatters import HtmlFormatter
 5 | from bs4 import BeautifulSoup
 6 | 
 7 | languages = {
 8 |     "python": PythonLexer(),
 9 |     "html": HtmlLexer(),
10 |     "text": HtmlLexer(),
11 | }
12 | 
def generate_table_of_contents(file_name, page_state, site_state):
    """Build a two-level table of contents and store it on the page.

    Every ``<h2>`` in ``page_state["page"].contents`` becomes a top-level
    entry. Each ``<h3>`` sibling that follows it, up to the next ``<h2>``,
    becomes one of its children. Entry ids are the heading text lower-cased
    with spaces replaced by hyphens.

    The result is assigned to ``page_state["page"].toc``. ``file_name`` and
    ``site_state`` are accepted but not used here.

    Returns ``page_state``.
    """
    page = BeautifulSoup(page_state["page"].contents, 'html.parser')
    toc = []
    for h2 in page.find_all('h2'):
        entry = {
            "text": h2.text,
            "id": h2.text.lower().replace(" ", "-"),
            "children": []
        }
        for h3 in h2.find_next_siblings('h3'):
            # Skip h3s that belong to a later h2. Compare by identity: tag
            # equality is structural, so two headings with identical markup
            # would otherwise be treated as the same section.
            if h3.find_previous_sibling('h2') is not h2:
                continue
            entry["children"].append({
                "text": h3.text,
                "id": h3.text.lower().replace(" ", "-"),
            })
        toc.append(entry)
    page_state["page"].toc = toc

    return page_state
35 | 
def highlight_code(file_name, page_state, _, page_contents):
    """Syntax-highlight ``<pre><code class="language-…">`` blocks in a page.

    Each ``<pre>`` whose ``<code>`` child names a language listed in
    ``languages`` is replaced by Pygments HTML output. The Pygments stylesheet
    is inserted at the top of ``<body>``, and every ``<h2>``/``<h3>`` gets an
    ``id`` made from its text (lower-cased, spaces replaced by hyphens).

    Returns ``page_contents`` unchanged when ``file_name`` contains ``.txt``
    or ``.xml``, an empty string when the parsed document has no ``<body>``,
    and the serialised document otherwise. ``page_state`` and the third
    argument are not used.
    """
    print(f"Checking {file_name}")
    if ".txt" in file_name or ".xml" in file_name:
        return page_contents
    print(f"Highlighting code in {file_name}")
    soup = BeautifulSoup(page_contents, 'lxml')
    formatter = HtmlFormatter()

    for pre in soup.find_all('pre'):
        code = pre.find('code')
        try:
            language = code['class'][0].split("language-")[1]
            lexer = languages[language]
        except (TypeError, KeyError, IndexError):
            # No <code> child, no class, a class without the "language-"
            # prefix, or a language with no lexer: leave the block untouched.
            continue

        highlighted = highlight(code.text, lexer, formatter)
        pre.replace_with(BeautifulSoup(highlighted, 'html.parser'))

    css = formatter.get_style_defs('.highlight')
    css = f"<style>{css}</style>"

    # this happens for bookmarks
    body = soup.find('body')
    if not body:
        return ""

    body.insert(0, BeautifulSoup(css, 'html.parser'))

    # get every h2 and h3 and add id= to it
    for h2 in soup.find_all('h2'):
        h2['id'] = h2.text.lower().replace(" ", "-")
    for h3 in soup.find_all('h3'):
        h3['id'] = h3.text.lower().replace(" ", "-")

    return str(soup)


--------------------------------------------------------------------------------
/docs/pages/templates/matching.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | title: Search Matching
 4 | permalink: /matching/
 5 | ---
 6 | 
 7 | ### Strict matching
 8 | 
 9 | By default, a search query on a text field will find any document where the field contains any word in the query string. For example, a query for `tolerate it` on a `title` field will match any document whose `title` contains `tolerate` or `it`. This is called a non-strict match.
10 | 
11 | Non-strict matches are the default because they are faster to compute than strict matches.
12 | 
13 | If you want to find documents where terms appear next to each other in a field, you can do so with a strict match. Here is an example of a strict match:
14 | 
15 | ```python
16 | query = {
17 |     "query": {
18 |         "title": {
19 |             "contains": "tolerate it",
20 |             "strict": True
21 |         }
22 |     }
23 | }
24 | ```
25 | 
26 | This will return documents whose title contains `tolerate it` as a single phrase.
27 | 
28 | ### Fuzzy matching
29 | 
30 | By default, search queries look for the exact string provided. This means that if a query contains a typo (e.g. searching for `tolerate ip` instead of `tolerate it`), no documents will be returned.
31 | 
32 | JameSQL implements a limited form of fuzzy matching. This means that if a query contains a typo, JameSQL will still return documents that match the query.
33 | 
34 | The fuzzy matching feature matches documents that contain one typo. If a document contains more than one typo, it will not be returned. A typo is an incorrectly typed character. JameSQL does not support fuzzy matching that accounts for missing or additional characters (e.g. `tolerate itt` will not match `tolerate it`).
35 | 
36 | You can enable fuzzy matching by setting the `fuzzy` key to `True` in the query. Here is an example of a query that uses fuzzy matching:
37 | 
38 | ```python
39 | query = {
40 |     "query": {
41 |         "title": {
42 |             "contains": "tolerate ip",
43 |             "fuzzy": True
44 |         }
45 |     }
46 | }
47 | ```
48 | 
49 | ### Wildcard matching
50 | 
51 | You can match documents using a single wildcard character. This character is represented by an asterisk `*`.
52 | 
53 | ```python
54 | query = {
55 |     "query": {
56 |         "title": {
57 |             "contains": "tolerat* it",
58 |             "fuzzy": True
59 |         }
60 |     }
61 | }
62 | ```
63 | 
64 | This query will look for all words that match the pattern `tolerat* it`, where the `*` character can be any single character.


--------------------------------------------------------------------------------
/docs/pages/templates/script-scores.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | permalink: /script-scores/
 4 | title: Script Scores
 5 | ---
 6 | 
 7 | The script score feature lets you write custom scripts to calculate the score for each document. This is useful if you want to calculate a score based on multiple fields, including numeric fields.
 8 | 
 9 | Script scores are applied after all documents are retrieved.
10 | 
11 | The script score feature supports the following mathematical operations:
12 | 
13 | - `+` (addition)
14 | - `-` (subtraction)
15 | - `*` (multiplication)
16 | - `/` (division)
17 | - `log` (logarithm)
18 | - `decay` (timeseries decay)
19 | 
20 | You can apply a script score at the top level of your query:
21 | 
22 | ```python
23 | {
24 |     "query": {
25 |         "or": {
26 |             "post": {
27 |                 "contains": "taylor swift",
28 |                 "strict": False,
29 |                 "boost": 1
30 |             },
31 |             "title": {
32 |                 "contains": "desk",
33 |                 "strict": True,
34 |                 "boost": 25
35 |             }
36 |         }
37 |     },
38 |     "limit": 4,
39 |     "sort_by": "_score",
40 |     "script_score": "((post + title) * 2)"
41 | }
42 | ```
43 | 
44 | The above example will calculate the score of documents by adding the score of the `post` field and the `title` field, then multiplying the result by `2`.
45 | 
46 | A script score is made up of terms. A term is a field name or number (float or int), followed by an operator, followed by another term or number. Terms can be nested.
47 | 
48 | All terms must be enclosed within parentheses.
49 | 
50 | To compute a score that adds the `post` score to `title` and multiplies the result by `2`, use the following code:
51 | 
52 | ```text
53 | ((post + title) * 2)
54 | ```
55 | 
56 | Invalid forms of this query include:
57 | 
58 | - `post + title * 2` (missing parentheses)
59 | - `(post + title * 2)` (terms can only include one operator)
60 | 
61 | The `decay` function lets you decay a value by `0.9 ** days_since_post / 30`. This is useful for gradually decreasing the rank for older documents as time passes. This may be particularly useful if you are working with data where you want more recent documents to be ranked higher. `decay` only works with timeseries.
62 | 
63 | Here is an example of `decay` in use:
64 | 
65 | ```text
66 | (_score * decay published)
67 | ```
68 | 
69 | This will apply the `decay` function to the `published` field.
70 | 
71 | Data must be stored as a Python `datetime` object for the `decay` function to work.


--------------------------------------------------------------------------------
/tests/autosuggest.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import sys
 3 | from contextlib import ExitStack as DoesNotRaise
 4 | 
 5 | import pytest
 6 | from deepdiff import DeepDiff
 7 | 
 8 | from jamesql import JameSQL
 9 | from jamesql.index import GSI_INDEX_STRATEGIES
10 | 
11 | 
12 | def pytest_addoption(parser):  # Hook that registers the --benchmark option on pytest's option parser.
13 |     parser.addoption("--benchmark", action="store")  # NOTE(review): only --benchmark is registered here, yet create_indices also reads --long-benchmark; pytest documents this hook for plugins and initial conftest.py files, so confirm it is honoured in a test module.
14 | 
15 | 
16 | @pytest.fixture(scope="session")
17 | def create_indices(request):  # Session-scoped fixture returning (index, large_index); large_index is None unless a benchmark option is set.
18 |     with open("tests/fixtures/documents.json") as f:
19 |         documents = json.load(f)
20 |     # Small index: add every fixture document.
21 |     index = JameSQL()
22 | 
23 |     for document in documents:
24 |         index.add(document)
25 |     # Index both text fields with the CONTAINS strategy.
26 |     index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
27 |     index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
28 | 
29 |     index.enable_autosuggest("title")
30 |     # The fixture file is read a second time here; presumably so the large index starts from dicts untouched by index.add() -- TODO confirm.
31 |     with open("tests/fixtures/documents.json") as f:
32 |         documents = json.load(f)
33 |     # Build the large index only when --benchmark or --long-benchmark is truthy.
34 |     if request.config.getoption("--benchmark") or request.config.getoption(
35 |         "--long-benchmark"
36 |     ):
37 |         large_index = JameSQL()
38 |         # documents * 100000 repeats the fixture list 100,000 times (the same dict objects unless copied below).
39 |         for document in documents * 100000:
40 |             if request.config.getoption("--long-benchmark"):
41 |                 document = document.copy()
42 |                 document["title"] = "".join(  # long benchmark: every title word is emitted 10 times, each followed by a space
43 |                     [
44 |                         word + " "
45 |                         for word in document["title"].split()
46 |                         for _ in range(10)
47 |                     ]
48 |                 )
49 |             large_index.add(document)
50 |         # Same indexing and autosuggest setup as the small index, applied after all documents are added.
51 |         large_index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
52 |         large_index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
53 | 
54 |         large_index.enable_autosuggest("title")
55 |     else:
56 |         large_index = None
57 | 
58 |     return index, large_index
59 | 
60 | 
61 | @pytest.mark.parametrize(
62 |     "query, suggestion",
63 |     [
64 |         ("tolerat", "tolerate"),
65 |         ("toler", "tolerate"),
66 |         ("th", "the"),
67 |         ("b", "bolter"),
68 |         ("he", ""),  # not in index; part of another word
69 |         ("cod", ""),  # not in index
70 |     ],
71 | )
72 | def test_autosuggest(create_indices, query, suggestion):  # Checks that the first autosuggest result for each query prefix equals the expected suggestion.
73 |     index = create_indices[0]
74 |     large_index = create_indices[1]
75 |     # NOTE(review): when suggestion is "" neither branch below runs, so the "not in index" cases assert nothing.
76 |     if suggestion != "":
77 |         assert index.autosuggest(query)[0] == suggestion
78 |     # The same check runs on the large index when the fixture built one.
79 |     if large_index and suggestion != "":
80 |         assert large_index.autosuggest(query)[0] == suggestion
81 | 


--------------------------------------------------------------------------------
/tests/spelling_correction.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import sys
 3 | from contextlib import ExitStack as DoesNotRaise
 4 | 
 5 | import pytest
 6 | from deepdiff import DeepDiff
 7 | 
 8 | from jamesql import JameSQL
 9 | from jamesql.index import GSI_INDEX_STRATEGIES
10 | 
11 | 
12 | def pytest_addoption(parser):  # Hook that registers the --benchmark option on pytest's option parser.
13 |     parser.addoption("--benchmark", action="store")  # NOTE(review): only --benchmark is registered here, yet create_indices also reads --long-benchmark; pytest documents this hook for plugins and initial conftest.py files, so confirm it is honoured in a test module.
14 | 
15 | 
16 | @pytest.fixture(scope="session")
17 | def create_indices(request):  # Session-scoped fixture returning (index, large_index); large_index is None unless a benchmark option is set.
18 |     with open("tests/fixtures/documents.json") as f:
19 |         documents = json.load(f)
20 |     # Small index: add every fixture document.
21 |     index = JameSQL()
22 | 
23 |     for document in documents:
24 |         index.add(document)
25 |     # Index both text fields with the CONTAINS strategy.
26 |     index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
27 |     index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
28 |     # The fixture file is read a second time here; presumably so the large index starts from dicts untouched by index.add() -- TODO confirm.
29 |     with open("tests/fixtures/documents.json") as f:
30 |         documents = json.load(f)
31 |     # Build the large index only when --benchmark or --long-benchmark is truthy.
32 |     if request.config.getoption("--benchmark") or request.config.getoption(
33 |         "--long-benchmark"
34 |     ):
35 |         large_index = JameSQL()
36 |         # documents * 100000 repeats the fixture list 100,000 times (the same dict objects unless copied below).
37 |         for document in documents * 100000:
38 |             if request.config.getoption("--long-benchmark"):
39 |                 document = document.copy()
40 |                 document["title"] = "".join(  # long benchmark: every title word is emitted 10 times, each followed by a space
41 |                     [
42 |                         word + " "
43 |                         for word in document["title"].split()
44 |                         for _ in range(10)
45 |                     ]
46 |                 )
47 |             large_index.add(document)
48 |         # Same CONTAINS indices as the small index, created after all documents are added.
49 |         large_index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
50 |         large_index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
51 |     else:
52 |         large_index = None
53 | 
54 |     return index, large_index
55 | 
56 | 
57 | @pytest.mark.parametrize(
58 |     "query, corrected_query",
59 |     [
60 |         ("tolerat", "tolerate"),
61 |         ("tolerateit", "tolerate it"),  # test segmentation
62 |         (
63 |             "startedwith",
64 |             "started with",
65 |         ),  # query word that appears uppercase in corpus of text
66 |         ("toleratt", "tolerate"),
67 |         ("toleratt", "tolerate"),  # NOTE(review): exact duplicate of the previous case
68 |         ("tolerate", "tolerate"),
69 |         ("toler", "toler"),  # not in index
70 |         ("cod", "cod"),  # not in index
71 |     ],
72 | )
73 | def test_spelling_correction(create_indices, query, corrected_query):  # Checks that spelling_correction(query) returns the expected corrected string.
74 |     index = create_indices[0]
75 |     large_index = create_indices[1]
76 |     # Queries expected to come back unchanged use the same string for both parameters.
77 |     assert index.spelling_correction(query) == corrected_query
78 |     # The same check runs on the large index when the fixture built one.
79 |     if large_index:
80 |         assert large_index.spelling_correction(query) == corrected_query
81 | 


--------------------------------------------------------------------------------
/tests/gsi_type_inference.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | 
 3 | import pytest
 4 | 
 5 | from jamesql import JameSQL
 6 | from jamesql.index import GSI_INDEX_STRATEGIES
 7 | 
 8 | 
 9 | def pytest_addoption(parser):  # Hook that registers the --benchmark option on pytest's option parser.
10 |     parser.addoption("--benchmark", action="store")  # NOTE(review): only --benchmark is registered here, yet test_gsi_type_inference also reads --long-benchmark; pytest documents this hook for plugins and initial conftest.py files, so confirm it is honoured in a test module.
11 | 
12 | 
13 | @pytest.mark.timeout(20)
14 | def test_gsi_type_inference(request):  # Adds documents without calling create_gsi() and checks the strategy recorded in index.gsis for each field.
15 |     with open("tests/fixtures/documents_with_varied_data_types.json") as f:
16 |         documents = json.load(f)
17 | 
18 |     index = JameSQL()
19 |     # No create_gsi() call is made anywhere in this test; documents are only added.
20 |     for document in documents:
21 |         index.add(document)
22 | 
23 |     # check the strategy name stored for each field after the documents were added
24 |     assert index.gsis["title"]["strategy"] == GSI_INDEX_STRATEGIES.CONTAINS.name
25 |     assert index.gsis["lyric"]["strategy"] == GSI_INDEX_STRATEGIES.CONTAINS.name
26 |     assert index.gsis["listens"]["strategy"] == GSI_INDEX_STRATEGIES.NUMERIC.name
27 |     assert index.gsis["album_in_stock"]["strategy"] == GSI_INDEX_STRATEGIES.FLAT.name
28 |     assert index.gsis["rating"]["strategy"] == GSI_INDEX_STRATEGIES.NUMERIC.name
29 |     assert index.gsis["metadata"]["strategy"] == GSI_INDEX_STRATEGIES.NOT_INDEXABLE.name
30 |     assert (
31 |         index.gsis["record_last_updated"]["strategy"] == GSI_INDEX_STRATEGIES.DATE.name
32 |     )
33 |     # The fixture file is read a second time before the optional large index is built.
34 |     with open("tests/fixtures/documents_with_varied_data_types.json") as f:
35 |         documents = json.load(f)
36 |     # The large-index checks only run when --benchmark or --long-benchmark is truthy.
37 |     if request.config.getoption("--benchmark") or request.config.getoption(
38 |         "--long-benchmark"
39 |     ):
40 |         large_index = JameSQL()
41 |         # documents * 100000 repeats the fixture list 100,000 times.
42 |         for document in documents * 100000:
43 |             if request.config.getoption("--long-benchmark"):
44 |                 document = document.copy()
45 |                 document["title"] = "".join(  # long benchmark: every title word is emitted 10 times, each followed by a space
46 |                     [
47 |                         word + " "
48 |                         for word in document["title"].split()
49 |                         for _ in range(10)
50 |                     ]
51 |                 )
52 |             large_index.add(document)
53 |         # Same per-field strategy assertions as for the small index.
54 |         assert (
55 |             large_index.gsis["title"]["strategy"] == GSI_INDEX_STRATEGIES.CONTAINS.name
56 |         )
57 |         assert (
58 |             large_index.gsis["lyric"]["strategy"] == GSI_INDEX_STRATEGIES.CONTAINS.name
59 |         )
60 |         assert (
61 |             large_index.gsis["listens"]["strategy"] == GSI_INDEX_STRATEGIES.NUMERIC.name
62 |         )
63 |         assert (
64 |             large_index.gsis["album_in_stock"]["strategy"]
65 |             == GSI_INDEX_STRATEGIES.FLAT.name
66 |         )
67 |         assert (
68 |             large_index.gsis["rating"]["strategy"] == GSI_INDEX_STRATEGIES.NUMERIC.name
69 |         )
70 |         assert (
71 |             large_index.gsis["metadata"]["strategy"]
72 |             == GSI_INDEX_STRATEGIES.NOT_INDEXABLE.name
73 |         )
74 |         assert (
75 |             large_index.gsis["record_last_updated"]["strategy"]
76 |             == GSI_INDEX_STRATEGIES.DATE.name
77 |         )
78 | 


--------------------------------------------------------------------------------
/docs/pages/templates/storage-and-consistency.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | title: Data Storage and Consistency
 4 | permalink: /storage-and-consistency/
 5 | ---
 6 | 
 7 | JameSQL indices are stored in memory and on disk.
 8 | 
 9 | When you call the `add()` method, the document is appended to an `index.jamesql` file in the directory in which your program is running. This file is serialized as JSONL.
10 | 
11 | When you load an index, all entries in the `index.jamesql` file will be read back into memory.
12 | 
13 | _Note: You will need to manually reconstruct your indices using the `create_gsi()` method after loading an index._
14 | 
15 | ## Data Consistency
16 | 
17 | When you call `add()`, a `journal.jamesql` file is created. This is used to store the contents of the `add()` operation you are executing. If JameSQL terminates during an `add()` call for any reason (e.g. system crash, program termination), this journal will be used to reconcile the database.
18 | 
19 | Next time you initialize a JameSQL instance, your documents in `index.jamesql` will be read into memory. Then, the transactions in `journal.jamesql` will be replayed to ensure the index is consistent. Finally, the `journal.jamesql` file will be deleted.
20 | 
21 | You can access the JSON of the last transaction issued, sans the `uuid`, by calling `index.last_transaction`.
22 | 
23 | If you were in the middle of ingesting data, this could be used to resume the ingestion process from where you left off by allowing you to skip records that were already ingested.
24 | 
25 | ## Reducing Precision for Large Results Pages
26 | 
27 | By default, JameSQL assigns scores to the top 1,000 documents in each clause in a query. Consider the following query:
28 | 
29 | <pre><code class="language-python">
30 | query = {
31 |     "query": {
32 |         "and": [
33 |             {
34 |                 "artist": {
35 |                     "equals": "Taylor Swift"
36 |                 }
37 |             },
38 |             {
39 |                 "title": {
40 |                     "equals": "tolerate it"
41 |                 }
42 |             }
43 |         ]
44 |     },
45 |     "limit": 10
46 | }
47 | </code></pre>
48 | 
49 | The `{ "artist": { "equals": "Taylor Swift" } }` clause will return the top 1,000 documents that match the query. The `{ "title": { "equals": "tolerate it" } }` clause will return the top 1,000 documents that match the query.
50 | 
51 | These will then be combined and sorted to return the 10 documents, of the 2,000 processed, that have the highest score.
52 | 
53 | This means that if you have a large number of documents that match a query, you may not get precisely the most relevant documents in the top 10 results, rather an approximation of the most relevant documents.
54 | 
55 | You can override the number of documents to consider with:
56 | 
57 | <pre><code class="language-python">
58 | index.match_limit_for_large_result_pages = 10_000
59 | </code></pre>
60 | 
61 | The higher this number, the longer it will take to process results with a large number of matching documents.


--------------------------------------------------------------------------------
/tests/script_lang.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | from contextlib import ExitStack as DoesNotRaise
  3 | 
  4 | import pytest
  5 | from lark import Lark
  6 | from pytest import raises
  7 | 
  8 | from jamesql import JameSQL
  9 | from jamesql.script_lang import JameSQLScriptTransformer, grammar
 10 | 
 11 | 
 12 | @pytest.fixture
 13 | def document_to_test():  # Fixture returning the first fixture document with fixed "_score" and "listens" values for the script tests.
 14 |     with open("tests/fixtures/documents.json") as f:
 15 |         documents = json.load(f)
 16 |     # Set the two values the parametrized script expressions reference.
 17 |     documents[0]["_score"] = 7.52
 18 |     documents[0]["listens"] = 2000
 19 | 
 20 |     return documents[0]
 21 | 
 22 | 
 23 | @pytest.fixture
 24 | def script_score_parser():  # Fixture returning a Lark parser built from the grammar imported from jamesql.script_lang.
 25 |     return Lark(grammar)
 26 | 
 27 | 
 28 | @pytest.mark.parametrize(
 29 |     "query, result, raises_exception",
 30 |     [
 31 |         (
 32 |             "(_score + 1)",
 33 |             8.52,
 34 |             DoesNotRaise(),
 35 |         ),
 36 |         (
 37 |             "(_score * 2)",
 38 |             15.04,
 39 |             DoesNotRaise(),
 40 |         ),
 41 |         (
 42 |             "(_score / 2)",
 43 |             3.76,
 44 |             DoesNotRaise(),
 45 |         ),
 46 |         (
 47 |             "(_score - 2)",
 48 |             5.52,
 49 |             DoesNotRaise(),
 50 |         ),
 51 |         (
 52 |             "((_score + 1) * 2)",
 53 |             17.04,
 54 |             DoesNotRaise(),
 55 |         ),
 56 |         (
 57 |             "(((_score + 1) * 2) + 1)",
 58 |             18.04,
 59 |             DoesNotRaise(),
 60 |         ),
 61 |         (
 62 |             "(_score + _score)",
 63 |             15.04,
 64 |             DoesNotRaise(),
 65 |         ),
 66 |         (
 67 |             "((_score + _score) + _score)",
 68 |             22.56,
 69 |             DoesNotRaise(),
 70 |         ),
 71 |         (
 72 |             "(_score * listens)",
 73 |             15040,
 74 |             DoesNotRaise(),
 75 |         ),
 76 |         (
 77 |             "log ((_score * listens))",
 78 |             9.618475246417898,
 79 |             DoesNotRaise(),
 80 |         ),
 81 |         (
 82 |             "log (((_score * listens) + 1))",
 83 |             9.618541733127229,
 84 |             DoesNotRaise(),
 85 |         ),
 86 |         (
 87 |             "_score + 1",
 88 |             0,
 89 |             raises(Exception),  # missing parenthesis
 90 |         ),
 91 |         (
 92 |             "(_score + 1",
 93 |             0,
 94 |             raises(Exception),  # missing closing parenthesis
 95 |         ),
 96 |         (
 97 |             "(_score + 1))",
 98 |             0,
 99 |             raises(Exception),  # additional closing parenthesis
100 |         ),
101 |     ],
102 | )
103 | def test_script_score(  # Parses each script expression, evaluates it against the fixture document and compares with the expected result.
104 |     document_to_test, script_score_parser, query, result, raises_exception
105 | ):
106 |     with raises_exception:  # DoesNotRaise() for valid scripts, raises(Exception) for malformed ones
107 |         tree = script_score_parser.parse(query)
108 |         # For the malformed cases the expected result is 0; presumably a placeholder, since parse() is expected to raise first -- TODO confirm.
109 |         transformer = JameSQLScriptTransformer(document_to_test)
110 |         # NOTE(review): the comparison below is exact equality, including for the float expectations.
111 |         assert transformer.transform(tree) == result
112 | 


--------------------------------------------------------------------------------
/docs/pages/templates/create.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | permalink: /create
 4 | title: Create an Index
 5 | ---
 6 | 
 7 | JameSQL supports several index types.
 8 | 
 9 | To achieve the best performance, you should carefully choose the index type to use for each field in your data.
10 | 
11 | If you don't choose an index, JameSQL will automatically create an index for you when you run a query on a field for the first time. The index type is inferred from the types of data in the first record you add.
12 | 
13 | ## Set an Index Strategy
14 | 
15 | To create an index, use the following code:
16 | 
17 | <pre><code class="language-python">
18 | index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.PREFIX)
19 | </code></pre>
20 | 
21 | See the table below for a list of available index strategies.
22 | 
23 | ## Indexing strategies
24 | 
25 | The following index strategies are available:
26 | 
27 | <table>
28 |     <thead>
29 |         <tr>
30 |             <th>Index Strategy</th>
31 |             <th>Description</th>
32 |         </tr>
33 |     </thead>
34 |     <tbody>
35 |         <tr>
36 |             <td>
37 |                 <code>GSI_INDEX_STRATEGIES.CONTAINS</code>
38 |             </td>
39 |             <td>
40 |                 Creates a reverse index for the field. This is useful for fields that contain longer strings (e.g. body text in a blog post). TF-IDF is used to search fields structured with the <code>CONTAINS</code> type.
41 |             </td>
42 |         </tr>
43 |         <tr>
44 |             <td>
45 |                 <code>GSI_INDEX_STRATEGIES.NUMERIC</code>
46 |             </td>
47 |             <td>
48 |                 Creates several buckets to allow for efficient search of numeric values, especially values with high cardinality.
49 |             </td>
50 |         </tr>
51 |         <tr>
52 |             <td>
53 |                 <code>GSI_INDEX_STRATEGIES.FLAT</code>
54 |             </td>
55 |             <td>
56 |                 Stores the field as the data type it is. A flat index is created for values that are not strings or numbers. This is the default. For example, if you are indexing document titles and don't need to do a <code>starts_with</code> query, you may choose a flat index to allow for efficient <code>equals</code> and <code>contains</code> queries.
57 |             </td>
58 |         </tr>
59 |         <tr>
60 |             <td>
61 |                 <code>GSI_INDEX_STRATEGIES.PREFIX</code>
62 |             </td>
63 |             <td>
64 |                 Creates a trie index for the field. This is useful for fields that contain short strings (e.g. titles).
65 |             </td>
66 |         </tr>
67 |         <tr>
68 |             <td>
69 |                 <code>GSI_INDEX_STRATEGIES.CATEGORICAL</code>
70 |             </td>
71 |             <td>
72 |                 Creates a categorical index for the field. This is useful for fields that contain specific categories (e.g. genres).
73 |             </td>
74 |         </tr>
75 |         <tr>
76 |             <td>
77 |                 <code>GSI_INDEX_STRATEGIES.TRIGRAM_CODE</code>
78 |             </td>
79 |             <td>
80 |                 Creates a character-level trigram index for the field. This is useful for efficient code search. See the "Code Search" documentation for more information about using code search with JameSQL.
81 |             </td>
82 |         </tr>
83 |     </tbody>
84 | </table>


--------------------------------------------------------------------------------
/docs/pages/templates/search.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | layout: default
  3 | title: Search for Documents
  4 | permalink: /search/
  5 | ---
  6 | 
  7 | There are two ways you can run a search:
  8 | 
  9 | - Using a natural language query with JameSQL operators; or
 10 | - Using a JSON DSL.
 11 | 
 12 | ## Using the JSON DSL
 13 | 
 14 | A query has the following format:
 15 | 
 16 | <pre><code class="language-python">
 17 | {
 18 |     "query": {
 19 |         "field": "value"
 20 |     },
 21 |     "limit": 10,
 22 |     "sort_by": "field",
 23 |     "skip": 0
 24 | }
 25 | </code></pre>
 26 | 
 27 | - `query` is a dictionary that contains the fields to search for.
 28 | - `limit` is the maximum number of documents to return. (default 10)
 29 | - `sort_by` is the field to sort by. (default None)
 30 | - `skip` is the number of documents to skip. This is useful for implementing pagination. (default 0)
 31 | 
 32 | `limit`, `sort_by`, and `skip` are optional.
 33 | 
 34 | Within the `query` key you can query for documents that match one or more conditions.
 35 | 
 36 | An empty query returns no documents.
 37 | 
 38 | ### Running a search
 39 | 
 40 | To search for documents that match a query, use the following code:
 41 | 
 42 | <pre><code class="language-python">
 43 | result = index.search(query)
 44 | </code></pre>
 45 | 
 46 | This returns a JSON payload with the following structure:
 47 | <pre><code class="language-python">
 48 | 
 49 | {
 50 |     "documents": [
 51 |         {"uuid": "1", ...},
 52 |         {"uuid": "2", ...},
 53 |         ...
 54 |     ],
 55 |     "query_time": 0.0001,
 56 |     "total_results": 200
 57 | }
 58 | </code></pre>
 59 | 
 60 | You can search through multiple pages with the `scroll()` method:
 61 | 
 62 | <pre><code class="language-python">
 63 | result = index.scroll(query)
 64 | </code></pre>
 65 | 
 66 | `scroll()` returns a generator that yields documents in the same format as `search()`.
 67 | 
 68 | ## Retrieve All Documents
 69 | 
 70 | You can retrieve all documents by using a catch-all query, which uses the following syntax:
 71 | 
 72 | <pre><code class="language-python">
 73 | {
 74 |     "query": "*",
 75 |     "limit": 2,
 76 |     "sort_by": "song",
 77 |     "skip": 1
 78 | }
 79 | </code></pre>
 80 | 
 81 | This is useful if you want to page through documents. You should supply a `sort_by` field to ensure the order of documents is consistent.
 82 | 
 83 | ### Response
 84 | 
 85 | All valid queries return responses in the following form:
 86 | 
 87 | <pre><code class="language-python">
 88 | {
 89 |     "documents": [
 90 |         {"uuid": "1", "title": "test", "artist": "..."},
 91 |         {"uuid": "2", "title": "test", "artist": "..."},
 92 |         ...
 93 |     ],
 94 |     "query_time": 0.0001,
 95 |     "total_results": 200
 96 | }
 97 | </code></pre>
 98 | 
 99 | `documents` is a list of documents that match the query. `query_time` is the amount of time it took to execute the query. `total_results` is the total number of documents that match the query before applying any `limit`.
100 | 
101 | `total_results` is useful for implementing pagination.
102 | 
103 | If an error was encountered, the response will be in the following form:
104 | 
105 | <pre><code class="language-python">
106 | {
107 |     "documents": [],
108 |     "query_time": 0.0001,
109 |     "error": "Invalid query"
110 | }
111 | </code></pre>
112 | 
113 | The `error` key contains a message describing the exact error encountered.


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 


--------------------------------------------------------------------------------
/tests/code_search.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import os
  3 | import sys
  4 | from contextlib import ExitStack as DoesNotRaise
  5 | 
  6 | import pytest
  7 | from deepdiff import DeepDiff
  8 | 
  9 | from jamesql import JameSQL
 10 | from jamesql.index import GSI_INDEX_STRATEGIES
 11 | 
 12 | CODE_BASE_DIR = "tests/fixtures/code"
 13 | 
 14 | 
 15 | def pytest_addoption(parser):  # Hook that registers the --benchmark option on pytest's option parser.
 16 |     parser.addoption("--benchmark", action="store")  # NOTE(review): only --benchmark is registered here, yet create_indices also reads --long-benchmark; pytest documents this hook for plugins and initial conftest.py files, so confirm it is honoured in a test module.
 17 | 
 18 | 
 19 | @pytest.fixture(scope="session")
 20 | def create_indices(request):
 21 |     # open all files in code/*
 22 |     documents = []
 23 | 
 24 |     for file in os.listdir("tests/fixtures/code"):
 25 |         with open(os.path.join("tests/fixtures/code", file)) as f:
 26 |             documents.append({"file_name": file, "code": f.read()})
 27 | 
 28 |     index = JameSQL()
 29 | 
 30 |     index.create_gsi("file_name", strategy=GSI_INDEX_STRATEGIES.PREFIX)
 31 |     index.create_gsi("code", strategy=GSI_INDEX_STRATEGIES.TRIGRAM_CODE)
 32 | 
 33 |     for document in documents:
 34 |         index.add(document)
 35 | 
 36 |     if request.config.getoption("--benchmark") or request.config.getoption(
 37 |         "--long-benchmark"
 38 |     ):
 39 |         large_index = JameSQL()
 40 | 
 41 |         for document in documents * 100000:
 42 |             if request.config.getoption("--long-benchmark"):
 43 |                 document = document.copy()
 44 | 
 45 |             large_index.add(document)
 46 | 
 47 |         large_index.create_gsi("file_name", strategy=GSI_INDEX_STRATEGIES.PREFIX)
 48 |         large_index.create_gsi("code", strategy=GSI_INDEX_STRATEGIES.TRIGRAM_CODE)
 49 |     else:
 50 |         large_index = None
 51 | 
 52 |     return index, large_index
 53 | 
 54 | 
 55 | @pytest.mark.parametrize(
 56 |     "query, number_of_documents_expected, top_result_value, raises_exception",
 57 |     [
 58 |         (
 59 |             {"query": {"and": [{"code": {"contains": "def"}}]}, "limit": 10},
 60 |             3,
 61 |             "index.py",
 62 |             DoesNotRaise(),
 63 |         ),  # test code search for valid query
 64 |         (
 65 |             {"query": {"and": [{"code": {"contains": "ef "}}]}, "limit": 10},
 66 |             3,
 67 |             "index.py",
 68 |             DoesNotRaise(),
 69 |         ),  # test code search for valid query with space
 70 |         (
 71 |             {"query": {"and": [{"code": {"contains": "banana"}}]}, "limit": 10},
 72 |             0,
 73 |             "",
 74 |             DoesNotRaise(),
  75 |         ),  # test code search with token not in documents
 76 |         (
 77 |             {"query": {"and": [{"code": {"contains": "return "}}]}, "limit": 10},
 78 |             3,
 79 |             "index.py",
 80 |             DoesNotRaise(),
 81 |         ),  # test code search with > 3 char token
 82 |     ],
 83 | )
 84 | @pytest.mark.timeout(20)
  85 | def test_code_search(
  86 |     create_indices,
  87 |     query,
  88 |     number_of_documents_expected,
  89 |     top_result_value,
  90 |     raises_exception,
  91 | ):  # checks hit count, first file name and query_time for one parametrized query
  92 |     with raises_exception:
  93 |         index, large_index = create_indices
  94 | 
  95 |         response = index.search(query)
  96 | 
  97 |         # sort hits by file_name so the first-result assertion does not depend on result order
  98 |         response["documents"] = sorted(
  99 |             response["documents"], key=lambda x: x["file_name"]
 100 |         )
 101 | 
 102 |         assert len(response["documents"]) == number_of_documents_expected
 103 | 
 104 |         if number_of_documents_expected > 0:
 105 |             assert response["documents"][0]["file_name"] == top_result_value
 106 | 
 107 |         assert float(response["query_time"]) < 0.06
 108 | 
 109 |         # runs only when the literal token "--benchmark" is in sys.argv (not --benchmark=VALUE or --long-benchmark)
 110 |         if "--benchmark" in sys.argv:
 111 |             response = large_index.search(query)
 112 | 
 113 |             assert float(response["query_time"]) < 0.06
114 | 


--------------------------------------------------------------------------------
/tests/highlight.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import sys
  3 | from contextlib import ExitStack as DoesNotRaise
  4 | 
  5 | import pytest
  6 | from deepdiff import DeepDiff
  7 | 
  8 | from jamesql import JameSQL
  9 | from jamesql.index import GSI_INDEX_STRATEGIES
 10 | 
 11 | 
  12 | def pytest_addoption(parser):  # declares --benchmark (takes a value); --long-benchmark, read by the fixture below, is not declared here
  13 |     parser.addoption("--benchmark", action="store")  # NOTE(review): pytest documents this hook for conftest.py/plugins only — confirm tests/conftest.py registers it
 14 | 
 15 | 
 16 | @pytest.fixture(scope="session")
 17 | def create_indices(request):
 18 |     with open("tests/fixtures/documents_with_numeric_values.json") as f:
 19 |         documents = json.load(f)
 20 | 
 21 |     index = JameSQL()
 22 | 
 23 |     for document in documents:
 24 |         index.add(document)
 25 | 
 26 |     with open("tests/fixtures/documents_with_numeric_values.json") as f:
 27 |         documents = json.load(f)
 28 | 
 29 |     index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 30 |     index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 31 |     index.create_gsi("listens", strategy=GSI_INDEX_STRATEGIES.NUMERIC)
 32 | 
 33 |     if request.config.getoption("--benchmark") or request.config.getoption(
 34 |         "--long-benchmark"
 35 |     ):
 36 |         large_index = JameSQL()
 37 | 
 38 |         for document in documents * 100000:
 39 |             if request.config.getoption("--long-benchmark"):
 40 |                 document = document.copy()
 41 |                 document["title"] = "".join(
 42 |                     [
 43 |                         word + " "
 44 |                         for word in document["title"].split()
 45 |                         for _ in range(10)
 46 |                     ]
 47 |                 )
 48 |             large_index.add(document)
 49 | 
 50 |         large_index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 51 |         large_index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 52 |         large_index.create_gsi("listens", strategy=GSI_INDEX_STRATEGIES.NUMERIC)
 53 |     else:
 54 |         large_index = None
 55 | 
 56 |     return index, large_index
 57 | 
 58 | 
 59 | @pytest.mark.parametrize(
 60 |     "query, highlights, number_of_documents_expected, top_result_value, raises_exception",
 61 |     [
 62 |         (
 63 |             {
 64 |                 "query": {
 65 |                     "and": [
 66 |                         {
 67 |                             "lyric": {
 68 |                                 "contains": "kiss",
 69 |                                 "highlight": "lyric",
 70 |                                 "strict": True,
 71 |                             }
 72 |                         }
 73 |                     ]
 74 |                 },
 75 |                 "limit": 10,
 76 |                 "sort_by": "title",
 77 |             },
 78 |             [["Started with a kiss"]],
 79 |             1,
 80 |             "The Bolter",
 81 |             DoesNotRaise(),
  82 |         ),  # test highlight context on a strict contains query
 83 |     ],
 84 | )
 85 | @pytest.mark.timeout(20)
  86 | def test_search(
  87 |     create_indices,
  88 |     query,
  89 |     highlights,
  90 |     number_of_documents_expected,
  91 |     top_result_value,
  92 |     raises_exception,
  93 | ):  # checks hit count, per-document "_context" highlights, first title and query_time
  94 |     with raises_exception:
  95 |         index, large_index = create_indices
  96 | 
  97 |         response = index.search(query)
  98 | 
  99 |         assert len(response["documents"]) == number_of_documents_expected
 100 |         # zip stops at the shorter sequence, so surplus documents or highlights are not compared
 101 |         for actual_context, expected_context in zip(response["documents"], highlights):
 102 |             assert actual_context["_context"] == expected_context
 103 | 
 104 |         if number_of_documents_expected > 0:
 105 |             assert response["documents"][0]["title"] == top_result_value
 106 | 
 107 |         assert float(response["query_time"]) < 0.06
 108 | 
 109 |         # runs only when the literal token "--benchmark" is in sys.argv (not --benchmark=VALUE or --long-benchmark)
 110 |         if "--benchmark" in sys.argv:
 111 |             response = large_index.search(query)
 112 | 
 113 |             assert float(response["query_time"]) < 0.06
114 | 


--------------------------------------------------------------------------------
/web/templates/index.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html lang="en">
  3 | <head>
  4 |     <meta charset="UTF-8">
  5 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
  6 |     <title>JameSQL Preview</title>
  7 | 
  8 |     <script src="/ace-builds/src-noconflict/ace.js" type="text/javascript" charset="utf-8"></script>
  9 | 
 10 |     <style>
 11 |         :root {
 12 |             --primary-color: royalblue;
 13 |         }
 14 |         body {
 15 |             margin: 0;
 16 |             padding: 0;
 17 |             font-family: Helvetica, sans-serif;
 18 |             border-top: 0.5em solid var(--primary-color);
 19 |         }
 20 |         main {
 21 |             display: grid;
 22 |             grid-template-columns: 1fr 1fr;
 23 |             grid-gap: 1rem;
 24 |             padding: 1em;
 25 |             padding-top: 0;
 26 |         }
 27 |         section {
 28 |             min-height: calc(100vh - 5em);
 29 |         }
 30 |         #editor, #preview {
 31 |             width: 100%;
 32 |             height: 100%;
 33 |             border: 1px solid #ccc;
 34 |             margin: 0;
 35 |         }
 36 |         .header {
 37 |             display: flex;
 38 |             justify-content: space-between;
 39 |             align-items: center;
 40 |         }
 41 |         button {
 42 |             background-color: var(--primary-color);
 43 |             color: white;
 44 |             border: none;
 45 |             padding: 0.5em 1em;
 46 |             border-radius: 5px;
 47 |             cursor: pointer;
 48 |         }
 49 |         button:hover {
 50 |             background-color: #0069d9;
 51 |         }
 52 |         button:focus {
 53 |             background-color: yellow;
 54 |             color: black;
 55 |         }
 56 |         .show-on-mobile {
 57 |             display: none;
 58 |         }
 59 |         @media (max-width: 768px) {
 60 |             main {
 61 |                 grid-template-columns: 1fr;
 62 |             }
 63 |             section {
 64 |                 min-height: 25vh;
 65 |             }
 66 |             .show-on-mobile {
 67 |                 display: block;
 68 |             }
 69 |         }
 70 |     </style>
 71 | </head>
 72 | <body>
 73 |     <main>
 74 |         <section>
 75 |             <div class="header">
 76 |                 <h1>JameSQL</h1>
 77 |                 <button onclick="submit()" class="show-on-mobile">Run Query</button>
 78 |             </div>
 79 |             <div id="editor">{}</div>
 80 |         </section>
 81 |         <section>
 82 |             <div class="header">
 83 |                 <h1>&nbsp;</h1>
 84 |                 <button onclick="submit()">Run Query</button>
 85 |             </div>
 86 |             <pre id="preview">{}</pre>
 87 |         </section>
 88 |         <script>
 89 |             var editor = ace.edit("editor");
 90 |             editor.session.setMode("ace/mode/json");
 91 |             // persist value on refresh
 92 |             editor.setValue(localStorage.getItem('editor') || '{}');
 93 |             editor.gotoLine(0, 0, true);
 94 | 
 95 |             var preview = ace.edit("preview");
 96 |             preview.session.setMode("ace/mode/json");
 97 | 
 98 |             function submit() {
 99 |                 localStorage.setItem('editor', editor.getValue());
100 |                 
101 |                 var data = editor.getValue();
102 |                 
103 |                 fetch('http://localhost:5000', {
104 |                     method: 'POST',
105 |                     headers: {
106 |                         'Content-Type': 'application/json'
107 |                     },
108 |                     body: data
109 |                 }).then(response => response.json())
110 |                 .then(data => {
111 |                     preview.setValue(JSON.stringify(data, null, 2));
112 | 
113 |                     preview.gotoLine(0, 0, true);
114 |                 });
115 |             }
116 |         </script>
117 |     </main>
118 | </body>
119 | </html>


--------------------------------------------------------------------------------
/tests/data_types.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import sys
  3 | from contextlib import ExitStack as DoesNotRaise
  4 | 
  5 | import pytest
  6 | from deepdiff import DeepDiff
  7 | 
  8 | from jamesql import JameSQL
  9 | from jamesql.index import GSI_INDEX_STRATEGIES
 10 | 
 11 | 
  12 | def pytest_addoption(parser):  # declares --benchmark (takes a value); --long-benchmark, read by the fixture below, is not declared here
  13 |     parser.addoption("--benchmark", action="store")  # NOTE(review): pytest documents this hook for conftest.py/plugins only — confirm tests/conftest.py registers it
 14 | 
 15 | 
 16 | @pytest.fixture(scope="session")
 17 | def create_indices(request):
 18 |     with open("tests/fixtures/documents_with_varied_data_types.json") as f:
 19 |         documents = json.load(f)
 20 | 
 21 |     index = JameSQL()
 22 | 
 23 |     for document in documents:
 24 |         index.add(document)
 25 | 
 26 |     with open("tests/fixtures/documents_with_varied_data_types.json") as f:
 27 |         documents = json.load(f)
 28 | 
 29 |     if request.config.getoption("--benchmark") or request.config.getoption(
 30 |         "--long-benchmark"
 31 |     ):
 32 |         large_index = JameSQL()
 33 | 
 34 |         for document in documents * 100000:
 35 |             if request.config.getoption("--long-benchmark"):
 36 |                 document = document.copy()
 37 |                 document["title"] = "".join(
 38 |                     [
 39 |                         word + " "
 40 |                         for word in document["title"].split()
 41 |                         for _ in range(10)
 42 |                     ]
 43 |                 )
 44 |             large_index.add(document)
 45 |     else:
 46 |         large_index = None
 47 | 
 48 |     return index, large_index
 49 | 
 50 | 
 51 | @pytest.mark.parametrize(
 52 |     "query, number_of_documents_expected, top_result_value, raises_exception",
 53 |     [
 54 |         (
 55 |             {
 56 |                 "query": {
 57 |                     "album_in_stock": {"equals": True},
 58 |                 },
 59 |                 "limit": 2,
 60 |                 "sort_by": "title",
 61 |             },
 62 |             2,
 63 |             "tolerate it",
 64 |             DoesNotRaise(),
 65 |         ),  # test equals with boolean
 66 |         (
 67 |             {
 68 |                 "query": {
 69 |                     "rating": {"greater_than": 4.8},
 70 |                 },
 71 |                 "limit": 2,
 72 |                 "sort_by": "title",
 73 |             },
 74 |             1,
 75 |             "The Bolter",
 76 |             DoesNotRaise(),
 77 |         ),  # test greater than with floating point
 78 |         (
 79 |             {
 80 |                 "query": {
 81 |                     "metadata": {"contains": "version"},
 82 |                 },
 83 |                 "limit": 2,
 84 |                 "sort_by": "title",
 85 |             },
 86 |             0,
 87 |             "",
 88 |             DoesNotRaise(),
 89 |         ),  # dictionaries are not indexable, so this will return a 0 result
 90 |         (
 91 |             {
 92 |                 "query": {
 93 |                     "record_last_updated": {"greater_than": "2024-03-01"},
 94 |                 },
 95 |                 "limit": 2,
 96 |                 "sort_by": "title",
 97 |             },
 98 |             1,
 99 |             "The Bolter",
100 |             DoesNotRaise(),
101 |         ),  # test greater than with date
102 |         (
103 |             {
104 |                 "query": {
105 |                     "record_last_updated": {"less_than": "2024-03-01"},
106 |                 },
107 |                 "limit": 2,
108 |                 "sort_by": "title",
109 |             },
110 |             2,
111 |             "tolerate it",
112 |             DoesNotRaise(),
 113 |         ),  # test less than with date
114 |     ],
115 | )
116 | @pytest.mark.timeout(20)
 117 | def test_search(
 118 |     create_indices,
 119 |     query,
 120 |     number_of_documents_expected,
 121 |     top_result_value,
 122 |     raises_exception,
 123 | ):  # checks hit count, first title and query_time for one parametrized query
 124 |     with raises_exception:
 125 |         index, large_index = create_indices
 126 | 
 127 |         response = index.search(query)
 128 | 
 129 |         assert len(response["documents"]) == number_of_documents_expected
 130 | 
 131 |         if number_of_documents_expected > 0:
 132 |             assert response["documents"][0]["title"] == top_result_value
 133 | 
 134 |         assert float(response["query_time"]) < 0.06
 135 | 
 136 |         # runs only when the literal token "--benchmark" is in sys.argv (not --benchmark=VALUE or --long-benchmark)
 137 |         if "--benchmark" in sys.argv:
 138 |             response = large_index.search(query)
 139 | 
 140 |             assert float(response["query_time"]) < 0.06
141 | 


--------------------------------------------------------------------------------
/web/web.py:
--------------------------------------------------------------------------------
  1 | from flask import Flask, request, render_template, send_from_directory
  2 | from jamesql import JameSQL
  3 | from jamesql.index import GSI_INDEX_STRATEGIES
  4 | import json
  5 | from tqdm import tqdm
  6 | from datetime import datetime
  7 | import requests
  8 | import time
  9 | 
 10 | import os
 11 | import pyromark
 12 | import frontmatter
 13 | from bs4 import BeautifulSoup
 14 | 
 15 | app = Flask(__name__)
 16 | 
 17 | index = JameSQL()
 18 | 
 19 | link_graph = {}
 20 | records = []
 21 | 
 22 | blog_posts = os.listdir("../../pages/posts")
 23 | 
 24 | for post_name in blog_posts:
 25 |     with open(f"../../pages/posts/{post_name}") as f:
 26 |         post = frontmatter.load(f)
 27 |         category = post.get("categories", [])[0]
 28 |         description = "<br>".join(post.content.split("\n")[:2])
 29 |         post["description"] = description
 30 | 
 31 |         post[
 32 |             "published"
 33 |         ] = f"{post_name.split('-')[0]}-{post_name.split('-')[1]}-{post_name.split('-')[2]}"
 34 |         post[
 35 |             "url"
 36 |         ] = f"https://jamesg.blog/{post_name.split('-')[0]}/{post_name.split('-')[1]}/{post_name.split('-')[2]}/{'-'.join(post_name.split('-')[3:]).replace('.md', '').strip('/')}"
 37 | 
 38 |         # stem the content
 39 |         # post.content = " ".join([stemmer.stem(word) for word in post.content.split()])
 40 |         # post["title"] = " ".join([stemmer.stem(word) for word in post["title"].split()])
 41 |         # parse markdown
 42 |         # post.content = pyromark.markdown(post.content)
 43 |         # exit()
 44 |         links = BeautifulSoup(pyromark.markdown(post.content), "html.parser").find_all(
 45 |             "a"
 46 |         )
 47 | 
 48 |         links = [link.get("href") for link in links]
 49 | 
 50 |         for link in links:
 51 |             if not link:
 52 |                 continue
 53 | 
 54 |             # if link starts with /, add jamesg.blog
 55 |             if link.startswith("/"):
 56 |                 link = f"https://jamesg.blog{link}"
 57 | 
 58 |             link = link.rstrip("/")
 59 | 
 60 |             if link not in link_graph:
 61 |                 link_graph[link] = []
 62 | 
 63 |             link_graph[link].append(post["url"].strip("/"))
 64 | 
 65 |         html = pyromark.markdown(post["description"])
 66 | 
 67 |         if post.content and post["title"]:
 68 |             records.append(
 69 |                 {
 70 |                     "title": post["title"],
 71 |                     "title_lower": post["title"].lower(),
 72 |                     "post": post.content.lower(),
 73 |                     "category": category,
 74 |                     "description": html,
 75 |                     "published": datetime.strptime(post["published"], "%Y-%m-%d"),
 76 |                     "url": post["url"],
 77 |                     "type": "blog",
 78 |                 }
 79 |             )
 80 | 
 81 | for record in records:
 82 |     record["inlinks"] = len(link_graph.get(record["url"], []))
 83 |     index.add(record)
 84 | 
 85 | index.create_gsi("title_lower", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 86 | index.create_gsi("post", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 87 | 
 88 | 
 89 | @app.route("/", methods=["GET", "POST"])
 90 | def search():
 91 |     field_names = index.gsis
 92 | 
 93 |     field_names_to_index_types = {
 94 |         name: index.gsis[name]["strategy"] for name in field_names.keys()
 95 |     }
 96 |     if request.method == "POST":
 97 |         query = request.json
 98 |         if query["type"] == "string_query":
 99 |             query_parsed = index._compute_string_query(
100 |                 query["raw_query"], query_keys=query["fields"], boosts=query["boosts"]
101 |             )
102 |             query_parsed["query_score"] = query["query_score"]
103 |             query_parsed["sort_by"] = "_score"
104 |             result = index.search(query_parsed)
105 |         else:
106 |             result = index.search(query)
107 |         return result
108 | 
109 |     return render_template("search.html", field_names=field_names_to_index_types)
110 | 
111 | 
112 | @app.route("/json", methods=["GET", "POST"])
113 | def json_search():
114 |     if request.method == "POST":
115 |         query = request.json
116 |         result = index.search(query)
117 |         return result
118 | 
119 |     return render_template("index.html")
120 | 
121 | 
 122 | # Serve the Ace editor assets requested by index.html from the ace-builds directory.
 123 | @app.route("/ace-builds/<path:path>")
 124 | def ace(path):
 125 |     return send_from_directory("ace-builds", path)
126 | 
127 | 
 128 | if __name__ == "__main__":  # only when run as a script, not when imported
 129 |     app.run(debug=True)  # NOTE(review): debug=True turns on Flask's debug mode — intended for local development only
130 | 


--------------------------------------------------------------------------------
/tests/aggregation.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import sys
  3 | from contextlib import ExitStack as DoesNotRaise
  4 | 
  5 | import pytest
  6 | from deepdiff import DeepDiff
  7 | 
  8 | from jamesql import JameSQL
  9 | from jamesql.index import GSI_INDEX_STRATEGIES
 10 | 
 11 | 
  12 | def pytest_addoption(parser):  # declares --benchmark (takes a value); --long-benchmark, read by the fixture below, is not declared here
  13 |     parser.addoption("--benchmark", action="store")  # NOTE(review): pytest documents this hook for conftest.py/plugins only — confirm tests/conftest.py registers it
 14 | 
 15 | 
 16 | @pytest.fixture(scope="session")
 17 | def create_indices(request):
 18 |     with open("tests/fixtures/documents.json") as f:
 19 |         documents = json.load(f)
 20 | 
 21 |     index = JameSQL()
 22 | 
 23 |     for document in documents:
 24 |         index.add(document)
 25 | 
 26 |     index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 27 |     index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 28 | 
 29 |     with open("tests/fixtures/documents.json") as f:
 30 |         documents = json.load(f)
 31 | 
 32 |     if request.config.getoption("--benchmark") or request.config.getoption(
 33 |         "--long-benchmark"
 34 |     ):
 35 |         large_index = JameSQL()
 36 | 
 37 |         for document in documents * 100000:
 38 |             if request.config.getoption("--long-benchmark"):
 39 |                 document = document.copy()
 40 |                 document["title"] = "".join(
 41 |                     [
 42 |                         word + " "
 43 |                         for word in document["title"].split()
 44 |                         for _ in range(10)
 45 |                     ]
 46 |                 )
 47 |             large_index.add(document)
 48 | 
 49 |         large_index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 50 |         large_index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 51 |     else:
 52 |         large_index = None
 53 | 
 54 |     return index, large_index
 55 | 
 56 | 
 57 | @pytest.mark.parametrize(
 58 |     "query, introspection_results, number_of_documents_expected, top_result_value, raises_exception",
 59 |     [
 60 |         (
 61 |             {
 62 |                 "query": {
 63 |                     "and": [
 64 |                         {
 65 |                             "lyric": {
 66 |                                 "contains": "my",
 67 |                             }
 68 |                         },
 69 |                     ]
 70 |                 },
 71 |                 "metrics": ["aggregate"],
 72 |                 "limit": 10,
 73 |                 "sort_by": "title",
 74 |             },
 75 |             {"unique_record_values": {"title": 1, "lyric": 1}},
 76 |             1,
 77 |             "tolerate it",
 78 |             DoesNotRaise(),
 79 |         ),  # test query with introspection
 80 |         (
 81 |             {
 82 |                 "query": {},
 83 |                 "metrics": ["aggregate"],
 84 |                 "limit": 10,
 85 |                 "sort_by": "title",
 86 |             },
 87 |             {},
 88 |             0,
 89 |             "",
 90 |             DoesNotRaise(),
 91 |         ),  # test blank query with introspection
 92 |         (
 93 |             {
 94 |                 "query": "*",
 95 |                 "metrics": ["aggregate"],
 96 |                 "limit": 10,
 97 |                 "sort_by": "title",
 98 |             },
 99 |             {"unique_record_values": {"title": 3, "lyric": 3}},
100 |             3,
101 |             "tolerate it",
102 |             DoesNotRaise(),
103 |         ),  # test all (*) query with introspection
104 |     ],
105 | )
106 | @pytest.mark.timeout(20)
 107 | def test_search(
 108 |     create_indices,
 109 |     query,
 110 |     introspection_results,
 111 |     number_of_documents_expected,
 112 |     top_result_value,
 113 |     raises_exception,
 114 | ):  # checks hit count, the response "metrics" payload, first title and query_time
 115 |     with raises_exception:
 116 |         index, large_index = create_indices
 117 | 
 118 |         response = index.search(query)
 119 | 
 120 |         assert len(response["documents"]) == number_of_documents_expected
 121 | 
 122 |         # compare metrics ignoring ordering; a response without "metrics" is treated as {}
 123 |         result = DeepDiff(
 124 |             response.get("metrics", {}), introspection_results, ignore_order=True
 125 |         )
 126 | 
 127 |         assert result == {}
 128 | 
 129 |         if number_of_documents_expected > 0:
 130 |             assert response["documents"][0]["title"] == top_result_value
 131 | 
 132 |         assert float(response["query_time"]) < 0.06
 133 | 
 134 |         # runs only when the literal token "--benchmark" is in sys.argv (not --benchmark=VALUE or --long-benchmark)
 135 |         if "--benchmark" in sys.argv:
 136 |             response = large_index.search(query)
 137 | 
 138 |             assert float(response["query_time"]) < 0.06
139 | 


--------------------------------------------------------------------------------
/tests/range_queries.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import sys
  3 | from contextlib import ExitStack as DoesNotRaise
  4 | 
  5 | import pytest
  6 | 
  7 | from jamesql import JameSQL
  8 | from jamesql.index import GSI_INDEX_STRATEGIES
  9 | 
 10 | 
  11 | def pytest_addoption(parser):  # declares --benchmark (takes a value); --long-benchmark, read by the fixture below, is not declared here
  12 |     parser.addoption("--benchmark", action="store")  # NOTE(review): pytest documents this hook for conftest.py/plugins only — confirm tests/conftest.py registers it
 13 | 
 14 | 
 15 | @pytest.fixture(scope="session")
 16 | def create_indices(request):
 17 |     with open("tests/fixtures/documents_with_numeric_values.json") as f:
 18 |         documents = json.load(f)
 19 | 
 20 |     index = JameSQL()
 21 | 
 22 |     for document in documents:
 23 |         index.add(document)
 24 | 
 25 |     with open("tests/fixtures/documents_with_numeric_values.json") as f:
 26 |         documents = json.load(f)
 27 | 
 28 |     index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 29 |     index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 30 |     index.create_gsi("listens", strategy=GSI_INDEX_STRATEGIES.NUMERIC)
 31 | 
 32 |     if request.config.getoption("--benchmark") or request.config.getoption(
 33 |         "--long-benchmark"
 34 |     ):
 35 |         large_index = JameSQL()
 36 | 
 37 |         for document in documents * 100000:
 38 |             if request.config.getoption("--long-benchmark"):
 39 |                 document = document.copy()
 40 |                 document["title"] = "".join(
 41 |                     [
 42 |                         word + " "
 43 |                         for word in document["title"].split()
 44 |                         for _ in range(10)
 45 |                     ]
 46 |                 )
 47 |             large_index.add(document)
 48 | 
 49 |         large_index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 50 |         large_index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 51 |         large_index.create_gsi("listens", strategy=GSI_INDEX_STRATEGIES.NUMERIC)
 52 |     else:
 53 |         large_index = None
 54 | 
 55 |     return index, large_index
 56 | 
 57 | 
 58 | @pytest.mark.parametrize(
 59 |     "query, number_of_documents_expected, top_result_value, raises_exception",
 60 |     [
 61 |         (
 62 |             {
 63 |                 "query": {
 64 |                     "and": [
 65 |                         {"listens": {"range": [200, 300]}},
 66 |                     ]
 67 |                 },
 68 |                 "limit": 10,
 69 |                 "sort_by": "title",
 70 |             },
 71 |             2,
 72 |             "my tears ricochet",
 73 |             DoesNotRaise(),
 74 |         ),  # test range query
 75 |         (
 76 |             {
 77 |                 "query": {
 78 |                     "and": [
 79 |                         {"listens": {"range": [0, 300]}},
 80 |                     ]
 81 |                 },
 82 |                 "limit": 10,
 83 |                 "sort_by": "title",
 84 |             },
 85 |             3,
 86 |             "tolerate it",
 87 |             DoesNotRaise(),
 88 |         ),  # test range query
 89 |         (
 90 |             {
 91 |                 "query": {
 92 |                     "and": [
 93 |                         {"listens": {"range": [300, 300]}},
 94 |                     ]
 95 |                 },
 96 |                 "limit": 10,
 97 |                 "sort_by": "title",
 98 |             },
 99 |             1,
100 |             "The Bolter",
101 |             DoesNotRaise(),
102 |         ),  # test range query
103 |         (
104 |             {
105 |                 "query": {
106 |                     "and": [
107 |                         {"listens": {"range": [0, 0]}},
108 |                     ]
109 |                 },
110 |                 "limit": 10,
111 |                 "sort_by": "title",
112 |             },
113 |             0,
114 |             "",
115 |             DoesNotRaise(),
116 |         ),  # test range query
117 |     ],
118 | )
119 | @pytest.mark.timeout(20)
 120 | def test_search(
 121 |     create_indices,
 122 |     query,
 123 |     number_of_documents_expected,
 124 |     top_result_value,
 125 |     raises_exception,
 126 | ):  # checks hit count, first title and query_time for one parametrized range query
 127 |     with raises_exception:
 128 |         index, large_index = create_indices
 129 | 
 130 |         response = index.search(query)
 131 | 
 132 |         assert len(response["documents"]) == number_of_documents_expected
 133 | 
 134 |         if number_of_documents_expected > 0:
 135 |             assert response["documents"][0]["title"] == top_result_value
 136 | 
 137 |         assert float(response["query_time"]) < 0.06
 138 | 
 139 |         # runs only when the literal token "--benchmark" is in sys.argv (not --benchmark=VALUE or --long-benchmark)
 140 |         if "--benchmark" in sys.argv:
 141 |             response = large_index.search(query)
 142 | 
 143 |             assert float(response["query_time"]) < 0.06
144 | 


--------------------------------------------------------------------------------
/tests/query_simplification.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import sys
  3 | from contextlib import ExitStack as DoesNotRaise
  4 | 
  5 | import pytest
  6 | from lark import Lark
  7 | 
  8 | from jamesql import JameSQL
  9 | from jamesql.index import GSI_INDEX_STRATEGIES
 10 | from jamesql.rewriter import grammar, simplify_string_query
 11 | 
 12 | 
  13 | def pytest_addoption(parser):  # declares --benchmark (takes a value); --long-benchmark, read by the fixture below, is not declared here
  14 |     parser.addoption("--benchmark", action="store")  # NOTE(review): pytest documents this hook for conftest.py/plugins only — confirm tests/conftest.py registers it
 15 | 
 16 | 
 17 | @pytest.fixture(scope="session")
 18 | def create_indices(request):
 19 |     with open("tests/fixtures/documents.json") as f:
 20 |         documents = json.load(f)
 21 | 
 22 |     index = JameSQL()
 23 | 
 24 |     for document in documents:
 25 |         index.add(document)
 26 | 
 27 |     with open("tests/fixtures/documents.json") as f:
 28 |         documents = json.load(f)
 29 | 
 30 |     index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 31 |     index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 32 |     index.create_gsi("listens", strategy=GSI_INDEX_STRATEGIES.NUMERIC)
 33 | 
 34 |     if request.config.getoption("--benchmark") or request.config.getoption(
 35 |         "--long-benchmark"
 36 |     ):
 37 |         large_index = JameSQL()
 38 | 
 39 |         for document in documents * 100000:
 40 |             if request.config.getoption("--long-benchmark"):
 41 |                 document = document.copy()
 42 |                 document["title"] = "".join(
 43 |                     [
 44 |                         word + " "
 45 |                         for word in document["title"].split()
 46 |                         for _ in range(10)
 47 |                     ]
 48 |                 )
 49 |             large_index.add(document)
 50 | 
 51 |         large_index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 52 |         large_index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 53 |         large_index.create_gsi("listens", strategy=GSI_INDEX_STRATEGIES.NUMERIC)
 54 |     else:
 55 |         large_index = None
 56 | 
 57 |     return index, large_index
 58 | 
 59 | 
 60 | @pytest.mark.parametrize(
 61 |     "query, simplified_form, number_of_documents_expected, top_result_value, raises_exception",
 62 |     [
 63 |         (
 64 |             "sky -sky",
 65 |             "",
 66 |             0,
 67 |             "",
 68 |             DoesNotRaise(),
 69 |         ),  # test negation simplification with empty string result
 70 |         (
 71 |             "100 100",
 72 |             "100",
 73 |             0,
 74 |             "",
 75 |             DoesNotRaise(),
 76 |         ),  # test numeric query simplification
 77 |         (
 78 |             "screaming -sky",
 79 |             "screaming -sky",
 80 |             0,
 81 |             "",
 82 |             DoesNotRaise(),
 83 |         ),  # test negation with no simplification required
 84 |         (
 85 |             "sky sky",
 86 |             "sky",
 87 |             2,
 88 |             ["my tears ricochet", "tolerate it"],
 89 |             DoesNotRaise(),
 90 |         ),  # test duplication of single word term simplification
 91 |         (
 92 |             "sky OR mural sky",
 93 |             "sky mural",
 94 |             2,
 95 |             "tolerate it",
 96 |             DoesNotRaise(),
 97 |         ),  # test redundant single term in or query simplification
 98 |         (
 99 |             "sky OR sky OR sky",
100 |             "sky",
101 |             2,
102 |             ["my tears ricochet", "tolerate it"],
103 |             DoesNotRaise(),
104 |         ),  # test redundant term in multiple ORs
105 |         (
106 |             "-lyric:sky lyric:sky",
107 |             "",
108 |             0,
109 |             "",
110 |             DoesNotRaise(),
111 |         ),  # test double negation of in clause
112 |     ],
113 | )
114 | @pytest.mark.timeout(20)
115 | def test_simplification_then_search(
116 |     create_indices,
117 |     query,
118 |     simplified_form,
119 |     number_of_documents_expected,
120 |     top_result_value,
121 |     raises_exception,
122 | ):
123 |     with raises_exception:
124 |         parser = Lark(grammar)
125 |         index, large_index = create_indices
126 | 
127 |         simplified_query, _ = simplify_string_query(parser, query)
128 | 
129 |         assert simplified_query == simplified_form
130 | 
131 |         response = index.string_query_search(query)
132 | 
133 |         assert len(response["documents"]) == number_of_documents_expected
134 | 
135 |         if number_of_documents_expected > 0:
136 |             if isinstance(top_result_value, list):
137 |                 assert response["documents"][0]["title"] in top_result_value
138 |             else:
139 |                 assert response["documents"][0]["title"] == top_result_value
140 | 
141 |         assert float(response["query_time"]) < 0.06
142 | 
143 |         # run if --benchmark is passed
144 |         if "--benchmark" in sys.argv:
145 |             response = large_index.string_query_search(query)
146 | 
147 |             assert float(response["query_time"]) < 0.06
148 | 


--------------------------------------------------------------------------------
/tests/group_by.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import sys
  3 | from contextlib import ExitStack as DoesNotRaise
  4 | 
  5 | import pytest
  6 | from deepdiff import DeepDiff
  7 | 
  8 | from jamesql import JameSQL
  9 | from jamesql.index import GSI_INDEX_STRATEGIES
 10 | 
 11 | 
 12 | def pytest_addoption(parser):  # adds --benchmark; --long-benchmark is not added here
 13 |     parser.addoption("--benchmark", action="store")  # "store": option takes a value
 14 | 
 15 | 
 16 | @pytest.fixture(scope="session")
 17 | def create_indices(request):
 18 |     with open("tests/fixtures/documents_with_categorical_values.json") as f:
 19 |         documents = json.load(f)
 20 | 
 21 |     index = JameSQL()
 22 | 
 23 |     for document in documents:
 24 |         index.add(document)
 25 | 
 26 |     with open("tests/fixtures/documents_with_categorical_values.json") as f:
 27 |         documents = json.load(f)
 28 | 
 29 |     index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 30 |     index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 31 |     index.create_gsi("listens", strategy=GSI_INDEX_STRATEGIES.NUMERIC)
 32 | 
 33 |     if request.config.getoption("--benchmark") or request.config.getoption(
 34 |         "--long-benchmark"
 35 |     ):
 36 |         large_index = JameSQL()
 37 | 
 38 |         for document in documents * 100000:
 39 |             if request.config.getoption("--long-benchmark"):
 40 |                 document = document.copy()
 41 |                 document["title"] = "".join(
 42 |                     [
 43 |                         word + " "
 44 |                         for word in document["title"].split()
 45 |                         for _ in range(10)
 46 |                     ]
 47 |                 )
 48 |             large_index.add(document)
 49 | 
 50 |         large_index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 51 |         large_index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 52 |         large_index.create_gsi("listens", strategy=GSI_INDEX_STRATEGIES.NUMERIC)
 53 |     else:
 54 |         large_index = None
 55 | 
 56 |     return index, large_index
 57 | 
 58 | 
 59 | @pytest.mark.parametrize(
 60 |     "query, group_by_result, number_of_documents_expected, top_result_value, raises_exception",
 61 |     [
 62 |         (
 63 |             {
 64 |                 "query": {"and": [{"lyric": {"contains": "with"}}]},
 65 |                 "limit": 10,
 66 |                 "group_by": "title",
 67 |                 "sort_by": "title",
 68 |             },
 69 |             {
 70 |                 "The Bolter": [
 71 |                     {
 72 |                         "title": "The Bolter",
 73 |                         "lyric": "Started with a kiss",
 74 |                         "category": ["pop", "acoustic"],
 75 |                         "uuid": "18fbe44e19a24153b0a22841261db61c",
 76 |                         "_score": 1,
 77 |                     }
 78 |                 ]
 79 |             },
 80 |             1,
 81 |             "The Bolter",
 82 |             DoesNotRaise(),
 83 |         ),  # test group by on string field
 84 |         (
 85 |             {
 86 |                 "query": {"and": [{"lyric": {"contains": "kiss"}}]},
 87 |                 "group_by": "category",
 88 |                 "limit": 10,
 89 |                 "sort_by": "title",
 90 |             },
 91 |             {
 92 |                 "pop": [
 93 |                     {
 94 |                         "title": "The Bolter",
 95 |                         "lyric": "Started with a kiss",
 96 |                         "category": ["pop", "acoustic"],
 97 |                         "uuid": "eb11180b16e34467a5d457f7115fda38",
 98 |                         "_score": 1,
 99 |                     }
100 |                 ],
101 |                 "acoustic": [
102 |                     {
103 |                         "title": "The Bolter",
104 |                         "lyric": "Started with a kiss",
105 |                         "category": ["pop", "acoustic"],
106 |                         "uuid": "eb11180b16e34467a5d457f7115fda38",
107 |                         "_score": 1,
108 |                     }
109 |                 ],
110 |             },
111 |             1,
112 |             "The Bolter",
113 |             DoesNotRaise(),
114 |         ),  # test group by on categorical field
115 |     ],
116 | )
117 | @pytest.mark.timeout(20)
118 | def test_search(
119 |     create_indices,
120 |     query,
121 |     group_by_result,
122 |     number_of_documents_expected,
123 |     top_result_value,
124 |     raises_exception,
125 | ):
126 |     with raises_exception:
127 |         index, large_index = create_indices
128 | 
129 |         response = index.search(query)
130 | 
131 |         assert len(response["documents"]) == number_of_documents_expected
132 | 
133 |         # exclude uuids since they are randomly assigned on indexing in this configuration
134 | 
135 |         assert (
136 |             DeepDiff(
137 |                 dict(response["groups"]),
138 |                 group_by_result,
139 |                 ignore_order=True,
140 |                 # ignore "score"
141 |                 exclude_regex_paths=["root\[.*\]\['uuid'\]", "root\[.*\]\['_score'\]"],
142 |             )
143 |             == {}
144 |         )
145 | 
146 |         if number_of_documents_expected > 0:
147 |             assert response["documents"][0]["title"] == top_result_value
148 | 
149 |         assert float(response["query_time"]) < 0.06
150 | 
151 |         # run if --benchmark is passed
152 |         if "--benchmark" in sys.argv:
153 |             response = large_index.search(query)
154 | 
155 |             assert float(response["query_time"]) < 0.06
156 | 


--------------------------------------------------------------------------------
/tests/sort_by.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import sys
  3 | from contextlib import ExitStack as DoesNotRaise
  4 | 
  5 | import pytest
  6 | from deepdiff import DeepDiff
  7 | 
  8 | from jamesql import JameSQL
  9 | from jamesql.index import GSI_INDEX_STRATEGIES
 10 | 
 11 | 
 12 | def pytest_addoption(parser):  # adds --benchmark; --long-benchmark is not added here
 13 |     parser.addoption("--benchmark", action="store")  # "store": option takes a value
 14 | 
 15 | 
 16 | @pytest.fixture(scope="session")
 17 | def create_indices(request):
 18 |     with open("tests/fixtures/documents.json") as f:
 19 |         documents = json.load(f)
 20 | 
 21 |     index = JameSQL()
 22 | 
 23 |     for document in documents:
 24 |         index.add(document)
 25 | 
 26 |     index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 27 |     index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 28 | 
 29 |     with open("tests/fixtures/documents.json") as f:
 30 |         documents = json.load(f)
 31 | 
 32 |     if request.config.getoption("--benchmark") or request.config.getoption(
 33 |         "--long-benchmark"
 34 |     ):
 35 |         large_index = JameSQL()
 36 | 
 37 |         for document in documents * 100000:
 38 |             if request.config.getoption("--long-benchmark"):
 39 |                 document = document.copy()
 40 |                 document["title"] = "".join(
 41 |                     [
 42 |                         word + " "
 43 |                         for word in document["title"].split()
 44 |                         for _ in range(10)
 45 |                     ]
 46 |                 )
 47 |             large_index.add(document)
 48 | 
 49 |         large_index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 50 |         large_index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 51 |     else:
 52 |         large_index = None
 53 | 
 54 |     return index, large_index
 55 | 
 56 | 
 57 | @pytest.mark.parametrize(
 58 |     "query, top_result_title, number_of_documents_expected, raises_exception",
 59 |     [
 60 |         (
 61 |             {
 62 |                 "query": {
 63 |                     "or": [
 64 |                         {
 65 |                             "lyric": {
 66 |                                 "contains": "kiss",
 67 |                             }
 68 |                         },
 69 |                         {
 70 |                             "lyric": {
 71 |                                 "contains": "sky",
 72 |                             }
 73 |                         },
 74 |                     ]
 75 |                 },
 76 |                 "limit": 10,
 77 |                 "sort_by": "title",
 78 |             },
 79 |             "tolerate it",
 80 |             3,
 81 |             DoesNotRaise(),
 82 |         ),  # test with text field sort
 83 |         (
 84 |             {
 85 |                 "query": {
 86 |                     "or": [
 87 |                         {
 88 |                             "lyric": {
 89 |                                 "contains": "kiss",
 90 |                             }
 91 |                         },
 92 |                         {
 93 |                             "lyric": {
 94 |                                 "contains": "sky",
 95 |                             }
 96 |                         },
 97 |                     ]
 98 |                 },
 99 |                 "limit": 10,
100 |                 "sort_by": "_score",
101 |             },
102 |             "The Bolter",
103 |             3,
104 |             DoesNotRaise(),
105 |         ),  # test with text field score sort
106 |         (
107 |             {
108 |                 "query": {
109 |                     "or": [
110 |                         {
111 |                             "lyric": {
112 |                                 "contains": "kiss",
113 |                             }
114 |                         },
115 |                         {
116 |                             "lyric": {
117 |                                 "contains": "sky",
118 |                             }
119 |                         },
120 |                     ]
121 |                 },
122 |                 "limit": 10,
123 |                 "sort_by": "_score",
124 |                 "sort_order": "asc",
125 |             },
126 |             "my tears ricochet",
127 |             3,
128 |             DoesNotRaise(),
129 |         ),  # test with text field score sort
130 |         (
131 |             {
132 |                 "query": {
133 |                     "or": [
134 |                         {
135 |                             "lyric": {
136 |                                 "contains": "kiss",
137 |                             }
138 |                         },
139 |                         {
140 |                             "lyric": {
141 |                                 "contains": "sky",
142 |                             }
143 |                         },
144 |                     ]
145 |                 },
146 |                 "limit": 10,
147 |                 "sort_by": "_score",
148 |                 "sort_order": "desc",
149 |             },
150 |             "The Bolter",
151 |             3,
152 |             DoesNotRaise(),
153 |         ),  # test with text field score sort
154 |     ],
155 | )
156 | @pytest.mark.timeout(20)
157 | def test_search(
158 |     create_indices,
159 |     query,
160 |     top_result_title,
161 |     number_of_documents_expected,
162 |     raises_exception,
163 | ):
164 |     with raises_exception:
165 |         index, large_index = create_indices
166 | 
167 |         response = index.search(query)
168 | 
169 |         # print(response)
170 | 
171 |         # assert False
172 | 
173 |         assert len(response["documents"]) == number_of_documents_expected
174 |         assert response["documents"][0]["title"] == top_result_title
175 | 
176 |         if number_of_documents_expected > 0:
177 |             assert response["documents"][0]["title"] == top_result_title
178 | 
179 |         assert float(response["query_time"]) < 0.06
180 | 
181 |         # run if --benchmark is passed
182 |         if "--benchmark" in sys.argv:
183 |             response = large_index.search(query)
184 | 
185 |             assert float(response["query_time"]) < 0.06
186 | 


--------------------------------------------------------------------------------
/docs/_site/comparison.md/index.html:
--------------------------------------------------------------------------------
 1 | <html><body><style>pre { line-height: 125%; }
 2 | td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; }
 3 | span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; }
 4 | td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; }
 5 | span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; }
 6 | .highlight .hll { background-color: #ffffcc }
 7 | .highlight { background: #f8f8f8; }
 8 | .highlight .c { color: #3D7B7B; font-style: italic } /* Comment */
 9 | .highlight .err { border: 1px solid #FF0000 } /* Error */
10 | .highlight .k { color: #008000; font-weight: bold } /* Keyword */
11 | .highlight .o { color: #666666 } /* Operator */
12 | .highlight .ch { color: #3D7B7B; font-style: italic } /* Comment.Hashbang */
13 | .highlight .cm { color: #3D7B7B; font-style: italic } /* Comment.Multiline */
14 | .highlight .cp { color: #9C6500 } /* Comment.Preproc */
15 | .highlight .cpf { color: #3D7B7B; font-style: italic } /* Comment.PreprocFile */
16 | .highlight .c1 { color: #3D7B7B; font-style: italic } /* Comment.Single */
17 | .highlight .cs { color: #3D7B7B; font-style: italic } /* Comment.Special */
18 | .highlight .gd { color: #A00000 } /* Generic.Deleted */
19 | .highlight .ge { font-style: italic } /* Generic.Emph */
20 | .highlight .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */
21 | .highlight .gr { color: #E40000 } /* Generic.Error */
22 | .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */
23 | .highlight .gi { color: #008400 } /* Generic.Inserted */
24 | .highlight .go { color: #717171 } /* Generic.Output */
25 | .highlight .gp { color: #000080; font-weight: bold } /* Generic.Prompt */
26 | .highlight .gs { font-weight: bold } /* Generic.Strong */
27 | .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */
28 | .highlight .gt { color: #0044DD } /* Generic.Traceback */
29 | .highlight .kc { color: #008000; font-weight: bold } /* Keyword.Constant */
30 | .highlight .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */
31 | .highlight .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */
32 | .highlight .kp { color: #008000 } /* Keyword.Pseudo */
33 | .highlight .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */
34 | .highlight .kt { color: #B00040 } /* Keyword.Type */
35 | .highlight .m { color: #666666 } /* Literal.Number */
36 | .highlight .s { color: #BA2121 } /* Literal.String */
37 | .highlight .na { color: #687822 } /* Name.Attribute */
38 | .highlight .nb { color: #008000 } /* Name.Builtin */
39 | .highlight .nc { color: #0000FF; font-weight: bold } /* Name.Class */
40 | .highlight .no { color: #880000 } /* Name.Constant */
41 | .highlight .nd { color: #AA22FF } /* Name.Decorator */
42 | .highlight .ni { color: #717171; font-weight: bold } /* Name.Entity */
43 | .highlight .ne { color: #CB3F38; font-weight: bold } /* Name.Exception */
44 | .highlight .nf { color: #0000FF } /* Name.Function */
45 | .highlight .nl { color: #767600 } /* Name.Label */
46 | .highlight .nn { color: #0000FF; font-weight: bold } /* Name.Namespace */
47 | .highlight .nt { color: #008000; font-weight: bold } /* Name.Tag */
48 | .highlight .nv { color: #19177C } /* Name.Variable */
49 | .highlight .ow { color: #AA22FF; font-weight: bold } /* Operator.Word */
50 | .highlight .w { color: #bbbbbb } /* Text.Whitespace */
51 | .highlight .mb { color: #666666 } /* Literal.Number.Bin */
52 | .highlight .mf { color: #666666 } /* Literal.Number.Float */
53 | .highlight .mh { color: #666666 } /* Literal.Number.Hex */
54 | .highlight .mi { color: #666666 } /* Literal.Number.Integer */
55 | .highlight .mo { color: #666666 } /* Literal.Number.Oct */
56 | .highlight .sa { color: #BA2121 } /* Literal.String.Affix */
57 | .highlight .sb { color: #BA2121 } /* Literal.String.Backtick */
58 | .highlight .sc { color: #BA2121 } /* Literal.String.Char */
59 | .highlight .dl { color: #BA2121 } /* Literal.String.Delimiter */
60 | .highlight .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */
61 | .highlight .s2 { color: #BA2121 } /* Literal.String.Double */
62 | .highlight .se { color: #AA5D1F; font-weight: bold } /* Literal.String.Escape */
63 | .highlight .sh { color: #BA2121 } /* Literal.String.Heredoc */
64 | .highlight .si { color: #A45A77; font-weight: bold } /* Literal.String.Interpol */
65 | .highlight .sx { color: #008000 } /* Literal.String.Other */
66 | .highlight .sr { color: #A45A77 } /* Literal.String.Regex */
67 | .highlight .s1 { color: #BA2121 } /* Literal.String.Single */
68 | .highlight .ss { color: #19177C } /* Literal.String.Symbol */
69 | .highlight .bp { color: #008000 } /* Name.Builtin.Pseudo */
70 | .highlight .fm { color: #0000FF } /* Name.Function.Magic */
71 | .highlight .vc { color: #19177C } /* Name.Variable.Class */
72 | .highlight .vg { color: #19177C } /* Name.Variable.Global */
73 | .highlight .vi { color: #19177C } /* Name.Variable.Instance */
74 | .highlight .vm { color: #19177C } /* Name.Variable.Magic */
75 | .highlight .il { color: #666666 } /* Literal.Number.Integer.Long */</style><p>You can find documents where a field is less than, greater than, less than or equal to, or greater than or equal to a value with a range query. Here is an example of a query that looks for documents where the <code>year</code> field is greater than <code>2010</code>:</p>
76 | <div class="highlight"><pre><span></span><span class="n">query</span> <span class="o">=</span> <span class="p">{</span>
77 |     <span class="s2">"query"</span><span class="p">:</span> <span class="p">{</span>
78 |         <span class="s2">"year"</span><span class="p">:</span> <span class="p">{</span>
79 |             <span class="s2">"greater_than"</span><span class="p">:</span> <span class="mi">2010</span>
80 |         <span class="p">}</span>
81 |     <span class="p">}</span>
82 | <span class="p">}</span>
83 | </pre></div>
84 | 
85 | <p>The following operators are supported:</p>
86 | <ul>
87 | <li><code>greater_than</code></li>
88 | <li><code>less_than</code></li>
89 | <li><code>greater_than_or_equal</code></li>
90 | <li><code>less_than_or_equal</code></li>
91 | </ul>
92 | </body></html>


--------------------------------------------------------------------------------
/docs/_site/aggregate-metrics.md/index.html:
--------------------------------------------------------------------------------
 1 | <html><body><style>pre { line-height: 125%; }
 2 | td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; }
 3 | span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; }
 4 | td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; }
 5 | span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; }
 6 | .highlight .hll { background-color: #ffffcc }
 7 | .highlight { background: #f8f8f8; }
 8 | .highlight .c { color: #3D7B7B; font-style: italic } /* Comment */
 9 | .highlight .err { border: 1px solid #FF0000 } /* Error */
10 | .highlight .k { color: #008000; font-weight: bold } /* Keyword */
11 | .highlight .o { color: #666666 } /* Operator */
12 | .highlight .ch { color: #3D7B7B; font-style: italic } /* Comment.Hashbang */
13 | .highlight .cm { color: #3D7B7B; font-style: italic } /* Comment.Multiline */
14 | .highlight .cp { color: #9C6500 } /* Comment.Preproc */
15 | .highlight .cpf { color: #3D7B7B; font-style: italic } /* Comment.PreprocFile */
16 | .highlight .c1 { color: #3D7B7B; font-style: italic } /* Comment.Single */
17 | .highlight .cs { color: #3D7B7B; font-style: italic } /* Comment.Special */
18 | .highlight .gd { color: #A00000 } /* Generic.Deleted */
19 | .highlight .ge { font-style: italic } /* Generic.Emph */
20 | .highlight .ges { font-weight: bold; font-style: italic } /* Generic.EmphStrong */
21 | .highlight .gr { color: #E40000 } /* Generic.Error */
22 | .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */
23 | .highlight .gi { color: #008400 } /* Generic.Inserted */
24 | .highlight .go { color: #717171 } /* Generic.Output */
25 | .highlight .gp { color: #000080; font-weight: bold } /* Generic.Prompt */
26 | .highlight .gs { font-weight: bold } /* Generic.Strong */
27 | .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */
28 | .highlight .gt { color: #0044DD } /* Generic.Traceback */
29 | .highlight .kc { color: #008000; font-weight: bold } /* Keyword.Constant */
30 | .highlight .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */
31 | .highlight .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */
32 | .highlight .kp { color: #008000 } /* Keyword.Pseudo */
33 | .highlight .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */
34 | .highlight .kt { color: #B00040 } /* Keyword.Type */
35 | .highlight .m { color: #666666 } /* Literal.Number */
36 | .highlight .s { color: #BA2121 } /* Literal.String */
37 | .highlight .na { color: #687822 } /* Name.Attribute */
38 | .highlight .nb { color: #008000 } /* Name.Builtin */
39 | .highlight .nc { color: #0000FF; font-weight: bold } /* Name.Class */
40 | .highlight .no { color: #880000 } /* Name.Constant */
41 | .highlight .nd { color: #AA22FF } /* Name.Decorator */
42 | .highlight .ni { color: #717171; font-weight: bold } /* Name.Entity */
43 | .highlight .ne { color: #CB3F38; font-weight: bold } /* Name.Exception */
44 | .highlight .nf { color: #0000FF } /* Name.Function */
45 | .highlight .nl { color: #767600 } /* Name.Label */
46 | .highlight .nn { color: #0000FF; font-weight: bold } /* Name.Namespace */
47 | .highlight .nt { color: #008000; font-weight: bold } /* Name.Tag */
48 | .highlight .nv { color: #19177C } /* Name.Variable */
49 | .highlight .ow { color: #AA22FF; font-weight: bold } /* Operator.Word */
50 | .highlight .w { color: #bbbbbb } /* Text.Whitespace */
51 | .highlight .mb { color: #666666 } /* Literal.Number.Bin */
52 | .highlight .mf { color: #666666 } /* Literal.Number.Float */
53 | .highlight .mh { color: #666666 } /* Literal.Number.Hex */
54 | .highlight .mi { color: #666666 } /* Literal.Number.Integer */
55 | .highlight .mo { color: #666666 } /* Literal.Number.Oct */
56 | .highlight .sa { color: #BA2121 } /* Literal.String.Affix */
57 | .highlight .sb { color: #BA2121 } /* Literal.String.Backtick */
58 | .highlight .sc { color: #BA2121 } /* Literal.String.Char */
59 | .highlight .dl { color: #BA2121 } /* Literal.String.Delimiter */
60 | .highlight .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */
61 | .highlight .s2 { color: #BA2121 } /* Literal.String.Double */
62 | .highlight .se { color: #AA5D1F; font-weight: bold } /* Literal.String.Escape */
63 | .highlight .sh { color: #BA2121 } /* Literal.String.Heredoc */
64 | .highlight .si { color: #A45A77; font-weight: bold } /* Literal.String.Interpol */
65 | .highlight .sx { color: #008000 } /* Literal.String.Other */
66 | .highlight .sr { color: #A45A77 } /* Literal.String.Regex */
67 | .highlight .s1 { color: #BA2121 } /* Literal.String.Single */
68 | .highlight .ss { color: #19177C } /* Literal.String.Symbol */
69 | .highlight .bp { color: #008000 } /* Name.Builtin.Pseudo */
70 | .highlight .fm { color: #0000FF } /* Name.Function.Magic */
71 | .highlight .vc { color: #19177C } /* Name.Variable.Class */
72 | .highlight .vg { color: #19177C } /* Name.Variable.Global */
73 | .highlight .vi { color: #19177C } /* Name.Variable.Instance */
74 | .highlight .vm { color: #19177C } /* Name.Variable.Magic */
75 | .highlight .il { color: #666666 } /* Literal.Number.Integer.Long */</style><p>You can find the total number of unique values for the fields returned by a query using an <code>aggregate</code> query. This is useful for presenting the total number of options available in a search space to a user.</p>
76 | <p>You can use the following query to find the number of unique values in each field across all documents whose <code>lyric</code> field contains the term “sky”:</p>
77 | <div class="highlight"><pre><span></span><span class="n">query</span> <span class="o">=</span> <span class="p">{</span>
78 |     <span class="s2">"query"</span><span class="p">:</span> <span class="p">{</span>
79 |         <span class="s2">"lyric"</span><span class="p">:</span> <span class="p">{</span>
80 |             <span class="s2">"contains"</span><span class="p">:</span> <span class="s2">"sky"</span>
81 |         <span class="p">}</span>
82 |     <span class="p">},</span>
83 |     <span class="s2">"metrics"</span><span class="p">:</span> <span class="p">[</span><span class="s2">"aggregate"</span><span class="p">]</span>
84 | <span class="p">}</span>
85 | </pre></div>
86 | 
87 | <p>The aggregate results are presented in a <code>unique_record_values</code> key with the following structure:</p>
88 | <div class="highlight"><pre><span></span><span class="p">{</span>
89 |     <span class="s2">"documents"</span><span class="p">:</span> <span class="p">[</span><span class="o">...</span><span class="p">],</span>
90 |     <span class="s2">"query_time"</span><span class="p">:</span> <span class="mf">0.0001</span><span class="p">,</span>
91 |     <span class="s1">'unique_record_values'</span><span class="p">:</span> <span class="p">{</span><span class="s1">'title'</span><span class="p">:</span> <span class="mi">2</span><span class="p">,</span> <span class="s1">'lyric'</span><span class="p">:</span> <span class="mi">2</span><span class="p">,</span> <span class="s1">'listens'</span><span class="p">:</span> <span class="mi">2</span><span class="p">,</span> <span class="s1">'categories'</span><span class="p">:</span> <span class="mi">3</span><span class="p">}</span>
92 | <span class="p">}</span>
93 | </pre></div>
94 | </body></html>


--------------------------------------------------------------------------------
/tests/string_queries_categorical_and_range.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import sys
  3 | from contextlib import ExitStack as DoesNotRaise
  4 | 
  5 | import pytest
  6 | from deepdiff import DeepDiff
  7 | 
  8 | from jamesql import JameSQL
  9 | from jamesql.index import GSI_INDEX_STRATEGIES
 10 | 
 11 | 
 12 | def pytest_addoption(parser):  # adds --benchmark; --long-benchmark is not added here
 13 |     parser.addoption("--benchmark", action="store")  # "store": option takes a value
 14 | 
 15 | 
 16 | @pytest.fixture(scope="session")
 17 | def create_indices(request):
 18 |     with open("tests/fixtures/documents_with_categorical_and_numeric_values.json") as f:
 19 |         documents = json.load(f)
 20 | 
 21 |     index = JameSQL()
 22 | 
 23 |     for document in documents:
 24 |         index.add(document)
 25 | 
 26 |     index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.PREFIX)
 27 |     index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 28 |     index.create_gsi("category", strategy=GSI_INDEX_STRATEGIES.FLAT)
 29 |     index.create_gsi("listens", strategy=GSI_INDEX_STRATEGIES.NUMERIC)
 30 | 
 31 |     with open("tests/fixtures/documents_with_categorical_and_numeric_values.json") as f:
 32 |         documents = json.load(f)
 33 | 
 34 |     if request.config.getoption("--benchmark") or request.config.getoption(
 35 |         "--long-benchmark"
 36 |     ):
 37 |         large_index = JameSQL()
 38 | 
 39 |         for document in documents * 100000:
 40 |             if request.config.getoption("--long-benchmark"):
 41 |                 document = document.copy()
 42 |                 document["title"] = "".join(
 43 |                     [
 44 |                         word + " "
 45 |                         for word in document["title"].split()
 46 |                         for _ in range(10)
 47 |                     ]
 48 |                 )
 49 |             large_index.add(document)
 50 | 
 51 |         large_index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.PREFIX)
 52 |         large_index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 53 |         large_index.create_gsi("category", strategy=GSI_INDEX_STRATEGIES.FLAT)
 54 |         large_index.create_gsi("listens", strategy=GSI_INDEX_STRATEGIES.NUMERIC)
 55 |     else:
 56 |         large_index = None
 57 | 
 58 |     return index, large_index
 59 | 
 60 | 
 61 | @pytest.mark.parametrize(
 62 |     "query, rewritten_query, number_of_documents_expected, top_result_value, raises_exception",
 63 |     [
 64 |         (
 65 |             "listens>100",
 66 |             {"query": {"and": [{"listens": {"greater_than": 100}}]}, "limit": 10},
 67 |             2,
 68 |             "The Bolter",
 69 |             DoesNotRaise(),
 70 |         ),  # test > operator
 71 |         (
 72 |             "listens<101",
 73 |             {"query": {"and": [{"listens": {"less_than": 101}}]}, "limit": 10},
 74 |             1,
 75 |             "tolerate it",
 76 |             DoesNotRaise(),
 77 |         ),  # test < operator
 78 |         (
 79 |             "listens<=101",
 80 |             {"query": {"and": [{"listens": {"less_than_or_equal": 101}}]}, "limit": 10},
 81 |             1,
 82 |             "tolerate it",
 83 |             DoesNotRaise(),
 84 |         ),  # test <= operator
 85 |         (
 86 |             "listens>=101",
 87 |             {
 88 |                 "query": {"and": [{"listens": {"greater_than_or_equal": 101}}]},
 89 |                 "limit": 10,
 90 |             },
 91 |             2,
 92 |             "The Bolter",
 93 |             DoesNotRaise(),
 94 |         ),  # test >= operator
 95 |         (
 96 |             "listens[200, 300] category:'pop'",
 97 |             {
 98 |                 "query": {
 99 |                     "and": [
100 |                         {"listens": {"range": [200, 300]}},
101 |                         {"category": {"contains": "pop"}},
102 |                     ]
103 |                 },
104 |                 "limit": 10,
105 |             },
106 |             1,
107 |             "my tears ricochet",
108 |             DoesNotRaise(),
109 |         ),  # test range operator with a single categorical data query
110 |         (
111 |             "listens[200, 300]",
112 |             {"query": {"and": [{"listens": {"range": [200, 300]}}]}, "limit": 10},
113 |             2,
114 |             "The Bolter",
115 |             DoesNotRaise(),
116 |         ),  # test range operator
117 |         (
118 |             "listens>=101 sky",
119 |             {
120 |                 "query": {
121 |                     "and": [
122 |                         {"listens": {"greater_than_or_equal": 101}},
123 |                         {
124 |                             "or": [
125 |                                 {"title": {"contains": "sky"}},
126 |                                 {"lyric": {"contains": "sky"}},
127 |                                 {"category": {"contains": "sky"}},
128 |                             ]
129 |                         },
130 |                     ]
131 |                 },
132 |                 "limit": 10,
133 |             },
134 |             1,
135 |             "my tears ricochet",
136 |             DoesNotRaise(),
137 |         ),  # test >= operator with a single word query
138 |         (
139 |             "category:'pop' sky",
140 |             {
141 |                 "query": {
142 |                     "and": [
143 |                         {"category": {"contains": "pop"}},
144 |                         {
145 |                             "or": [
146 |                                 {"title": {"contains": "sky"}},
147 |                                 {"lyric": {"contains": "sky"}},
148 |                                 {"category": {"contains": "sky"}},
149 |                             ]
150 |                         },
151 |                     ]
152 |                 },
153 |                 "limit": 10,
154 |             },
155 |             2,
156 |             "my tears ricochet",
157 |             DoesNotRaise(),
158 |         ),  # test a single categorical data query with a single word query
159 |         (
160 |             "category:'pop' category:'acoustic'",
161 |             {
162 |                 "query": {
163 |                     "and": [
164 |                         {"category": {"contains": "pop"}},
165 |                         {"category": {"contains": "acoustic"}},
166 |                     ]
167 |                 },
168 |                 "limit": 10,
169 |             },
170 |             1,
171 |             "my tears ricochet",
172 |             DoesNotRaise(),
173 |         ),  # test two categorical data queries
174 |     ],
175 | )
176 | @pytest.mark.timeout(20)
177 | def test_search(
178 |     create_indices,
179 |     query,
180 |     rewritten_query,
181 |     number_of_documents_expected,
182 |     top_result_value,
183 |     raises_exception,
184 | ):
185 |     with raises_exception:
186 |         index, large_index = create_indices
187 | 
188 |         internal_query, _ = index._compute_string_query(query)
189 |         response = index.string_query_search(query)
190 | 
191 |         # sort response by documents[0]["title"] to make it easier to compare
192 |         response["documents"] = sorted(response["documents"], key=lambda x: x["title"])
193 | 
194 |         assert len(response["documents"]) == number_of_documents_expected
195 | 
196 |         # allow items to be in different orders; order doesn't matter
197 |         result = DeepDiff(internal_query, rewritten_query, ignore_order=True)
198 | 
199 |         assert result == {}
200 | 
201 |         # order documents alphabetically by title
202 | 
203 |         response["documents"] = sorted(response["documents"], key=lambda x: x["title"])
204 | 
205 |         if number_of_documents_expected > 0:
206 |             assert response["documents"][0]["title"] == top_result_value
207 | 
208 |         assert float(response["query_time"]) < 0.06
209 | 
210 |         # run if --benchmark is passed
211 |         if "--benchmark" in sys.argv:
212 |             response = large_index.string_query_search(query)
213 | 
214 |             assert float(response["query_time"]) < 0.06
215 | 


--------------------------------------------------------------------------------
/tests/string_query.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import sys
  3 | from contextlib import ExitStack as DoesNotRaise
  4 | 
  5 | import pytest
  6 | from deepdiff import DeepDiff
  7 | 
  8 | from jamesql import JameSQL
  9 | from jamesql.index import GSI_INDEX_STRATEGIES
 10 | 
 11 | 
 12 | def pytest_addoption(parser):
 13 |     parser.addoption("--benchmark", action="store")
 14 | 
 15 | 
 16 | @pytest.fixture(scope="session")
 17 | def create_indices(request):
 18 |     with open("tests/fixtures/documents.json") as f:
 19 |         documents = json.load(f)
 20 | 
 21 |     index = JameSQL()
 22 | 
 23 |     for document in documents:
 24 |         index.add(document)
 25 | 
 26 |     index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 27 |     index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 28 | 
 29 |     with open("tests/fixtures/documents.json") as f:
 30 |         documents = json.load(f)
 31 | 
 32 |     if request.config.getoption("--benchmark") or request.config.getoption(
 33 |         "--long-benchmark"
 34 |     ):
 35 |         large_index = JameSQL()
 36 | 
 37 |         for document in documents * 100000:
 38 |             if request.config.getoption("--long-benchmark"):
 39 |                 document = document.copy()
 40 |                 document["title"] = "".join(
 41 |                     [
 42 |                         word + " "
 43 |                         for word in document["title"].split()
 44 |                         for _ in range(10)
 45 |                     ]
 46 |                 )
 47 |             large_index.add(document)
 48 | 
 49 |         large_index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 50 |         large_index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 51 |     else:
 52 |         large_index = None
 53 | 
 54 |     return index, large_index
 55 | 
 56 | 
 57 | @pytest.mark.parametrize(
 58 |     "query, rewritten_query, number_of_documents_expected, top_result_value, raises_exception",
 59 |     [
 60 |         (
 61 |             "tolerate it",
 62 |             {
 63 |                 "query": {
 64 |                     "or": [
 65 |                         {
 66 |                             "or": [
 67 |                                 {"title": {"contains": "tolerate"}},
 68 |                                 {"lyric": {"contains": "tolerate"}},
 69 |                             ]
 70 |                         },
 71 |                         {
 72 |                             "or": [
 73 |                                 {"title": {"contains": "it"}},
 74 |                                 {"lyric": {"contains": "it"}},
 75 |                             ]
 76 |                         },
 77 |                     ]
 78 |                 },
 79 |                 "limit": 10,
 80 |             },
 81 |             1,
 82 |             "tolerate it",
 83 |             DoesNotRaise(),
 84 |         ),  # test query with no special operators
 85 |         (
 86 |             "title:tolerate",
 87 |             {"query": {"and": [{"title": {"contains": "tolerate"}}]}, "limit": 10},
 88 |             1,
 89 |             "tolerate it",
 90 |             DoesNotRaise(),
 91 |         ),  # test one word field search
 92 |         (
 93 |             "title:'tolerate it'",
 94 |             {"query": {"and": [{"title": {"contains": "tolerate it"}}]}, "limit": 10},
 95 |             1,
 96 |             "tolerate it",
 97 |             DoesNotRaise(),
 98 |         ),  # test multi-word field search
 99 |         (
100 |             "'tolerate'",
101 |             {
102 |                 "query": {
103 |                     "or": [
104 |                         {
105 |                             "or": {
106 |                                 "lyric": {"contains": "tolerate", "strict": True},
107 |                                 "title": {"contains": "tolerate", "strict": True},
108 |                             }
109 |                         }
110 |                     ]
111 |                 },
112 |                 "limit": 10,
113 |             },
114 |             1,
115 |             "tolerate it",
116 |             DoesNotRaise(),
117 |         ),  # test multi-word search
118 |         (
119 |             "St*rted",
120 |             {
121 |                 "query": {
122 |                     "or": [
123 |                         {
124 |                             "or": [
125 |                                 {"title": {"wildcard": "St*rted"}},
126 |                                 {"lyric": {"wildcard": "St*rted"}},
127 |                             ]
128 |                         }
129 |                     ]
130 |                 },
131 |                 "limit": 10,
132 |             },
133 |             1,
134 |             "The Bolter",
135 |             DoesNotRaise(),
136 |         ),  # test multi-word search
137 |         (
138 |             "-started -with mural",
139 |             {
140 |                 "query": {
141 |                     "and": [
142 |                         {
143 |                             "not": {
144 |                                 "or": [
145 |                                     {"title": {"contains": "started"}},
146 |                                     {"lyric": {"contains": "started"}},
147 |                                 ]
148 |                             }
149 |                         },
150 |                         {
151 |                             "not": {
152 |                                 "or": [
153 |                                     {"title": {"contains": "with"}},
154 |                                     {"lyric": {"contains": "with"}},
155 |                                 ]
156 |                             }
157 |                         },
158 |                         {
159 |                             "or": [
160 |                                 {"title": {"contains": "mural"}},
161 |                                 {"lyric": {"contains": "mural"}},
162 |                             ]
163 |                         },
164 |                     ]
165 |                 },
166 |                 "limit": 10,
167 |             },
168 |             1,
169 |             "tolerate it",
170 |             DoesNotRaise(),
171 |         ),  # two negation queries
172 |         (
173 |             "title:tolerate lyric:I",
174 |             {
175 |                 "query": {
176 |                     "and": [
177 |                         {"title": {"contains": "tolerate"}},
178 |                         {"lyric": {"contains": "I"}},
179 |                     ]
180 |                 },
181 |                 "limit": 10,
182 |             },
183 |             1,
184 |             "tolerate it",
185 |             DoesNotRaise(),
186 |         ),  # two field queries
187 |         (
188 |             "",
189 |             {"query": {}},
190 |             0,
191 |             "",
192 |             DoesNotRaise(),
193 |         ),  # blank query
194 |         (
195 |             "Started sky",
196 |             {
197 |                 "query": {
198 |                     "or": [
199 |                         {
200 |                             "or": [
201 |                                 {"title": {"contains": "Started"}},
202 |                                 {"lyric": {"contains": "Started"}},
203 |                             ]
204 |                         },
205 |                         {
206 |                             "or": [
207 |                                 {"title": {"contains": "sky"}},
208 |                                 {"lyric": {"contains": "sky"}},
209 |                             ]
210 |                         },
211 |                     ]
212 |                 },
213 |                 "limit": 10,
214 |             },
215 |             3,
216 |             "The Bolter",
217 |             DoesNotRaise(),
218 |         ),  # test OR argument
219 |         (
220 |             "I -still",
221 |             {
222 |                 "query": {
223 |                     "and": [
224 |                         {
225 |                             "or": [
226 |                                 {"lyric": {"contains": "I"}},
227 |                                 {"title": {"contains": "I"}},
228 |                             ]
229 |                         },
230 |                         {
231 |                             "not": {
232 |                                 "or": [
233 |                                     {"lyric": {"contains": "still"}},
234 |                                     {"title": {"contains": "still"}},
235 |                                 ]
236 |                             }
237 |                         },
238 |                     ]
239 |                 },
240 |                 "limit": 10,
241 |             },
242 |             1,
243 |             "tolerate it",
244 |             DoesNotRaise(),
245 |         ),  # test negation argument
246 |         (
247 |             "-started -mural -title:'The'",
248 |             {
249 |                 "query": {
250 |                     "and": [
251 |                         {
252 |                             "not": {
253 |                                 "or": [
254 |                                     {"title": {"contains": "started"}},
255 |                                     {"lyric": {"contains": "started"}},
256 |                                 ]
257 |                             }
258 |                         },
259 |                         {
260 |                             "not": {
261 |                                 "or": [
262 |                                     {"title": {"contains": "mural"}},
263 |                                     {"lyric": {"contains": "mural"}},
264 |                                 ]
265 |                             }
266 |                         },
267 |                         {"not": {"title": {"contains": "The"}}},
268 |                     ]
269 |                 },
270 |                 "limit": 10,
271 |             },
272 |             1,
273 |             "my tears ricochet",
274 |             DoesNotRaise(),
275 |         ),  # test negation on field
276 |     ],
277 | )
278 | @pytest.mark.timeout(20)
279 | def test_search(
280 |     create_indices,
281 |     query,
282 |     rewritten_query,
283 |     number_of_documents_expected,
284 |     top_result_value,
285 |     raises_exception,
286 | ):
287 |     with raises_exception:
288 |         index, large_index = create_indices
289 | 
290 |         internal_query, _ = index._compute_string_query(query)
291 |         response = index.string_query_search(query)
292 | 
293 |         print(internal_query, response)
294 | 
295 |         assert len(response["documents"]) == number_of_documents_expected
296 | 
297 |         # allow items to be in different orders; order doesn't matter, ignore sort_by
298 |         result = DeepDiff(
299 |             internal_query,
300 |             rewritten_query,
301 |             ignore_order=True,
302 |             exclude_regex_paths=["root\['sort_by'\]"],
303 |         )
304 | 
305 |         print(result)
306 | 
307 |         assert result == {}
308 | 
309 |         if number_of_documents_expected > 0:
310 |             assert response["documents"][0]["title"] == top_result_value
311 | 
312 |         if response.get("query_time"):
313 |             assert float(response["query_time"]) < 0.06
314 | 
315 |             # run if --benchmark is passed
316 |             if "--benchmark" in sys.argv:
317 |                 response = large_index.string_query_search(query)
318 | 
319 |                 assert float(response["query_time"]) < 0.06
320 | 


--------------------------------------------------------------------------------
/jamesql/rewriter.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | 
  3 | from lark import Lark, Transformer
  4 | from lark.visitors import Interpreter, Visitor
  5 | 
  6 | from .query_simplifier import simplifier
  7 | 
  8 | grammar = """
  9 | start: (query)+ sort_component?
 10 | 
 11 | or_query: (query ("OR " | "or ") query)*
 12 | and_query: (query ("AND " | "and ") query)*
 13 | query: and_query | or_query | query_component
 14 | query_component: (negate_query | range_query | strict_search_query | word_query | field_query | comparison)+
 15 | 
 16 | sort_component: "sort:" TERM (ORDER)?
 17 | strict_search_query: "'" MULTI_WORD "'"
 18 | comparison: TERM OPERATOR WORD
 19 | range_query: TERM "[" WORD "," WORD "]"
 20 | word_query: WORD ("^" FLOAT)?
 21 | field_query: TERM ":" ("'" MULTI_WORD "'" | WORD | DOUBLE_QUOTE MULTI_WORD DOUBLE_QUOTE)
 22 | negate_query: "-" (strict_search_query | word_query | field_query | comparison | range_query)
 23 | OPERATOR: ">" | "<" | ">=" | "<="
 24 | DOUBLE_QUOTE: "\\""
 25 | WORD: /[a-zA-Z0-9_.!?*-]+/
 26 | FLOAT: /[0-9]+(\.[0-9]+)?/
 27 | MULTI_WORD: /[a-zA-Z0-9 ]+/
 28 | TERM: /[a-zA-Z0-9_]+/
 29 | ORDER: "ASC" | "DESC" | "asc" | "desc"
 30 | 
 31 | %import common.WS
 32 | %ignore WS
 33 | """
 34 | 
 35 | OPERATOR_MAP = {
 36 |     ">": "greater_than",
 37 |     "<": "less_than",
 38 |     ">=": "greater_than_or_equal",
 39 |     "<=": "less_than_or_equal",
 40 | }
 41 | 
 42 | 
 43 | class QuerySimplifier(Transformer):
 44 |     def __init__(self):
 45 |         self.terms = []
 46 | 
 47 |     def WORD(self, items):
 48 |         return items.value
 49 | 
 50 |     def FLOAT(self, items):
 51 |         return items.value
 52 | 
 53 |     def word_query(self, items):
 54 |         self.terms.append("^".join(items))
 55 |         return "^".join(items)
 56 | 
 57 |     def field_query(self, items):
 58 |         return items[0]
 59 | 
 60 |     def query_component(self, items):
 61 |         return items[0]
 62 | 
 63 |     def query(self, items):
 64 |         return items[0]
 65 | 
 66 |     def or_query(self, items):
 67 |         self.terms.append([items[0], "OR", items[1]])
 68 |         return items[0]
 69 |     
 70 |     def and_query(self, items):
 71 |         self.terms.append([items[0], "AND", items[1]])
 72 |         return items[0]
 73 | 
 74 |     def start(self, items):
 75 |         return items[0]
 76 | 
 77 |     def field_query(self, items):
 78 |         self.terms.append(items[0] + ":" + "'" + items[1] + "'")
 79 |         return items[0] + ":" + "'" + items[1] + "'"
 80 | 
 81 |     def TERM(self, items):
 82 |         return items.value
 83 | 
 84 |     def negate_query(self, items):
 85 |         if items[0] in self.terms:
 86 |             self.terms.remove(items[0])
 87 | 
 88 |         self.terms.append(["NOT", items[0]])
 89 | 
 90 |         return items[0]
 91 | 
 92 |     def range_query(self, items):
 93 |         self.terms.append(items[0] + "[" + items[1] + "," + items[2] + "]")
 94 |         return items[0] + "[" + items[1] + "," + items[2] + "]"
 95 | 
 96 |     def comparison(self, items):
 97 |         self.terms.append(items[0] + items[1] + items[2])
 98 |         return items[0] + items[1] + items[2]
 99 | 
100 |     def strict_search_query(self, items):
101 |         self.terms.append("'" + items[0] + "'")
102 |         return "'" + items[0] + "'"
103 | 
104 |     def MULTI_WORD(self, items):
105 |         return items.value
106 | 
107 | 
class QueryRewriter(Transformer):
    """Lark transformer that rewrites a parsed string query as a JameSQL query dict.

    Args:
        default_strategies: mapping of field name to indexing strategy name;
            fields mapped to ``"NUMERIC"`` (and, for strict searches,
            ``"DATE"``) are skipped when expanding free-text terms.
        query_keys: field names that free-text terms are searched in and that
            field/comparison/range queries are allowed to reference.
        boosts: mapping of field name to the boost attached to word queries
            on that field.
        fuzzy: value stored under ``"fuzzy"`` for ``contains`` word queries.
        highlight_keys: field names whose word queries get ``"highlight": True``.
    """

    def __init__(
        self,
        default_strategies=None,
        query_keys=None,
        boosts=None,
        fuzzy=False,
        highlight_keys=None,
    ):
        super().__init__()
        # Normalise omitted arguments so the rule callbacks can always call
        # .get() / iterate / use `in` without special-casing None.
        self.indexing_strategies = {} if default_strategies is None else default_strategies
        self.query_keys = [] if query_keys is None else query_keys
        self.boosts = {} if boosts is None else boosts
        self.fuzzy = fuzzy
        # Falsy values (None, False) mean "highlight nothing".
        self.highlight_keys = highlight_keys or []

    def get_query_strategy(self, key="", value=""):
        """Return ``"wildcard"`` for string values containing ``*``, else ``"contains"``."""
        if isinstance(value, str) and "*" in value:
            return "wildcard"

        return "contains"

    # --- terminals ---

    def ORDER(self, items):
        return items.value

    def FLOAT(self, items):
        return items.value

    def OPERATOR(self, items):
        return items.value

    def TERM(self, items):
        return items.value

    def MULTI_WORD(self, items):
        return items.value

    def WORD(self, items):
        # Purely numeric words become floats so comparisons and ranges get
        # numeric operands.
        if items.value.isdigit():
            return float(items.value)

        return items.value

    # --- boolean structure ---

    def or_query(self, items):
        return {"or": items}

    def and_query(self, items):
        return {"and": items}

    def negate_query(self, items):
        return {"not": items[0]}

    def query(self, items):
        return items[0]

    def query_component(self, items):
        # If every child is itself an "or" clause, combine them with "or";
        # otherwise the children are combined with "and".
        all_are_or = bool(items) and all(
            isinstance(item, dict) and "or" in item for item in items
        )

        return {"or": items} if all_are_or else {"and": items}

    def sort_component(self, items):
        result = {"sort_by": items[0]}

        if len(items) > 1:
            result["sort_order"] = items[1]

        return result

    def start(self, items):
        """Merge the top-level clauses into ``{"query": ..., "limit": 10}``.

        Sort settings produced by ``sort_component`` are moved out of the
        query body to the top level of the response; the order defaults to
        ``"asc"`` when the query names a sort field without an order.
        """
        merged = {k: v for item in items for k, v in item.items()}

        response = {"query": merged, "limit": 10}

        if "sort_by" in merged:
            response["sort_by"] = merged.pop("sort_by")
            # "sort_order" is only present when the query spelled it out, so
            # pop with a default instead of deleting unconditionally.
            response["sort_order"] = merged.pop("sort_order", "asc")

        return response

    # --- leaf queries ---

    def strict_search_query(self, items):
        phrase = items[0]

        return {
            "or": [
                {field: {
                    self.get_query_strategy(value=phrase): phrase,
                    "strict": True,
                }}
                for field in self.query_keys
                if self.indexing_strategies.get(field) not in {"NUMERIC", "DATE"}
            ]
        }

    def comparison(self, items):
        field, operator, value = items[0], items[1], items[2]

        if field not in self.query_keys:
            return {}

        return {field: {OPERATOR_MAP[operator]: value}}

    def range_query(self, items):
        field, start, end = items[0], items[1], items[2]

        if field not in self.query_keys:
            return {}

        return {field: {"range": [start, end]}}

    def word_query(self, items):
        """Expand a bare word into an ``or`` over every non-numeric query key."""
        value = items[0]
        strategy = self.get_query_strategy(value=value)

        if strategy == "contains":
            # if value is float, convert to int
            # this is because text queries can't be floats
            if isinstance(value, float):
                value = int(value)

            value = str(value)

        clauses = []

        for field in self.query_keys:
            if self.indexing_strategies.get(field) == "NUMERIC":
                continue

            clause = {field: {strategy: value}}

            # NOTE: only boosts configured per field are applied; a "^boost"
            # suffix parsed from the query (items[1]) is not used here.
            field_boost = self.boosts.get(field)
            if field_boost:
                clause[field]["boost"] = field_boost

            if self.fuzzy:
                clause[field]["fuzzy"] = self.fuzzy if strategy == "contains" else False

            if field in self.highlight_keys:
                clause[field]["highlight"] = True

            clauses.append(clause)

        return {"or": clauses}

    def field_query(self, items):
        # remove negation
        field = items[0].lstrip("-")
        value = items[1]

        if field not in self.query_keys:
            return {}

        return {field: {self.get_query_strategy(field, value): value}}
284 | 
285 | 
def simplify_string_query(parser, query, correct_spelling_index=None):
    """Sanitise a string query and optionally spell-correct its words.

    Args:
        parser: object with a ``parse(text)`` method; used to validate the
            sanitised query before any spelling correction is attempted.
        query: the raw query string.
        correct_spelling_index: optional object exposing a ``word_counts``
            mapping and a ``spelling_correction(word)`` method. When ``None``
            no spelling correction is performed.

    Returns:
        A ``(query, spelling_substitutions)`` tuple, where
        ``spelling_substitutions`` maps each word that was actually replaced
        to its replacement.
    """
    # remove punctuation not in grammar
    query = re.sub(r"[^a-zA-Z0-9_,!?^*:\-.'<>=\[\] ]", "", query)

    # Nothing to parse or correct in a blank query.
    if not query.strip():
        return query, {}

    # Fail early on queries the grammar rejects. The parse tree itself is not
    # needed here (term simplification is currently disabled).
    parser.parse(query)

    if correct_spelling_index is None:
        return query, {}

    corrected_words = []
    spelling_substitutions = {}

    for word in query.split():
        # Leave these words untouched:
        #  - negated terms (leading "-"),
        #  - words that open or close a quoted (strict) string,
        #  - words already known to the spelling index,
        #  - wildcard patterns.
        if (
            word.startswith(("-", "'", '"'))
            or word.endswith(("'", '"'))
            or correct_spelling_index.word_counts.get(word)
            or "*" in word
        ):
            corrected_words.append(word)
            continue

        # Compute the correction once and report it only if it was applied.
        correction = correct_spelling_index.spelling_correction(word)
        corrected_words.append(correction)

        if correction != word:
            spelling_substitutions[word] = correction

    return " ".join(corrected_words), spelling_substitutions
335 | 
336 | 
def string_query_to_jamesql(
    parser,
    query,
    query_keys,
    default_strategies=None,
    boosts=None,
    fuzzy=False,
    correct_spelling_index=None,
    highlight_keys=False,
):
    """Convert a string query into a JameSQL query dict.

    Args:
        parser: parser with a ``parse(text)`` method for the query grammar.
        query: the raw query string.
        query_keys: field names the query may search.
        default_strategies: mapping of field name to indexing strategy name.
        boosts: mapping of field name to boost value.
        fuzzy: fuzzy setting forwarded to the rewriter.
        correct_spelling_index: optional spelling index forwarded to
            ``simplify_string_query``.
        highlight_keys: field names to highlight; any falsy value (including
            the default ``False``) means no highlighting.

    Returns:
        A ``(rewritten_query, spelling_substitutions)`` tuple. For a blank
        query this is ``({"query": {}}, [])``.
    """
    query, spelling_substitutions = simplify_string_query(
        parser, query, correct_spelling_index
    )

    # NOTE: this branch returns a list for the substitutions, while
    # simplify_string_query returns a dict; kept as-is for existing callers.
    if query.strip() == "":
        return {"query": {}}, []

    tree = parser.parse(query)

    rewritten_query = QueryRewriter(
        # Pass concrete containers: the rewriter calls .get() on the
        # strategies/boosts and uses `in` on the highlight keys.
        default_strategies={} if default_strategies is None else default_strategies,
        query_keys=query_keys,
        boosts={} if boosts is None else boosts,
        fuzzy=fuzzy,
        highlight_keys=highlight_keys or [],
    ).transform(tree)

    return rewritten_query, spelling_substitutions
365 | 


--------------------------------------------------------------------------------
/tests/test.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import sys
  3 | from contextlib import ExitStack as DoesNotRaise
  4 | 
  5 | import pytest
  6 | 
  7 | from jamesql import JameSQL
  8 | from jamesql.index import GSI_INDEX_STRATEGIES
  9 | 
 10 | 
 11 | def pytest_addoption(parser):
 12 |     parser.addoption("--benchmark", action="store")
 13 | 
 14 | 
 15 | @pytest.fixture
 16 | def example_stub_and_query():
 17 |     with open("tests/fixtures/example_stub_and_query.json") as f:
 18 |         query = json.load(f)
 19 | 
 20 |     return query
 21 | 
 22 | 
 23 | @pytest.fixture(scope="session")
 24 | def create_indices(request):
 25 |     with open("tests/fixtures/documents.json") as f:
 26 |         documents = json.load(f)
 27 | 
 28 |     index = JameSQL()
 29 | 
 30 |     for document in documents:
 31 |         index.add(document)
 32 | 
 33 |     with open("tests/fixtures/documents.json") as f:
 34 |         documents = json.load(f)
 35 | 
 36 |     index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 37 |     index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 38 | 
 39 |     if request.config.getoption("--benchmark") or request.config.getoption(
 40 |         "--long-benchmark"
 41 |     ):
 42 |         large_index = JameSQL()
 43 | 
 44 |         for document in documents * 100000:
 45 |             if request.config.getoption("--long-benchmark"):
 46 |                 document = document.copy()
 47 |                 document["title"] = "".join(
 48 |                     [
 49 |                         word + " "
 50 |                         for word in document["title"].split()
 51 |                         for _ in range(10)
 52 |                     ]
 53 |                 )
 54 |             large_index.add(document)
 55 | 
 56 |         large_index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 57 |         large_index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
 58 |     else:
 59 |         large_index = None
 60 | 
 61 |     return index, large_index
 62 | 
 63 | 
@pytest.mark.parametrize(
    "query, number_of_documents_expected, top_result_value, raises_exception",
    [
        (
            {
                "query": {"title": {"contains": "tolerate"}},
                "limit": 10,
                "sort_by": "title",
            },
            1,
            "tolerate it",
            DoesNotRaise(),
        ),  # test contains
        (
            {
                "query": {"title": {"contains": "tolerats"}},
                "limit": 10,
                "sort_by": "title",
            },
            0,
            "",
            DoesNotRaise(),
        ),  # test contains with a misspelled term and no fuzzy flag; expects no results
        (
            {
                "query": {"title": {"equals": "tolerate it"}},
                "limit": 10,
                "sort_by": "title",
            },
            1,
            "tolerate it",
            DoesNotRaise(),
        ),  # test equals
        (
            {
                "query": {"title": {"equals": "tolerate it"}},
                "limit": 0,
                "sort_by": "title",
            },
            0,
            "",
            DoesNotRaise(),
        ),  # test limit
        (
            {
                "query": {"lyric": {"contains": "my mural", "strict": True}},
                "limit": 1,
                "sort_by": "title",
            },
            1,
            "tolerate it",
            DoesNotRaise(),
        ),  # test strict
        (
            {
                "query": {
                    "lyric": {"contains": "my murap", "strict": True, "fuzzy": True}
                },
                "limit": 1,
                "sort_by": "title",
            },
            1,
            "tolerate it",
            DoesNotRaise(),
        ),  # test strict combined with fuzzy on a misspelled phrase
        (
            {
                "query": {"title": {"wildcard": "tolerat*"}},
                "limit": 1,
                "sort_by": "title",
            },
            1,
            "tolerate it",
            DoesNotRaise(),
        ),  # test wildcard
        (
            {
                "query": {"lyric": {"wildcard": "my mura*", "strict": True}},
                "limit": 1,
                "sort_by": "title",
            },
            1,
            "tolerate it",
            DoesNotRaise(),
        ),  # test wildcard and strict; wildcard overrides strict
        (
            {
                "query": {"title": {"contains": "it tolerate", "strict": True}},
                "limit": 10,
                "sort_by": "title",
            },
            0,
            "",
            DoesNotRaise(),
        ),  # test a strict phrase whose words are in the wrong order; expects no results
        (
            {
                "query": {"title": {"starts_with": "toler"}},
                "limit": 10,
                "sort_by": "title",
            },
            1,
            "tolerate it",
            DoesNotRaise(),
        ),  # test starts_with on an index with CONTAINS type
        # this will return results but slowly
        (
            {
                "query": {"lyric": {"starts_with": "Started with"}},
                "limit": 10,
                "sort_by": "title",
            },
            1,
            "The Bolter",
            DoesNotRaise(),
        ),  # test starts_with
        (
            {
                "query": {"lyric": {"contains": "Startee with", "fuzzy": True}},
                "limit": 10,
                "sort_by": "title",
            },
            1,
            "The Bolter",
            DoesNotRaise(),
        ),  # test fuzzy on contains
        (
            {
                "query": {
                    "lyric": {
                        "starts_with": "Startee with",
                        "fuzzy": True,
                        "strict": True,
                    }
                },
                "limit": 10,
                "sort_by": "title",
            },
            1,
            "The Bolter",
            DoesNotRaise(),
        ),  # test fuzzy on starts_with
        (
            {
                "query": {"lyric": {"equals": "Startee with", "fuzzy": True}},
                "limit": 10,
                "sort_by": "title",
            },
            0,
            "",
            DoesNotRaise(),
        ),  # fuzzy doesn't work on equals
        (
            {
                "query": {"lyric": {"contains": "sky"}},
                "limit": 10,
                "sort_by": "lyric",
            },
            2,
            "tolerate it",
            DoesNotRaise(),
        ),  # test contains with results sorted by lyric
        (
            {
                "query": {"lyric": {"contains": "100"}},
                "limit": 10,
                "sort_by": "lyric"
            },
            0,
            "",
            DoesNotRaise(),
        ),  # test numeric query
        (
            {
                "query": {"lyric": {"contains": 100}},
                "limit": 10,
                "sort_by": "lyric"
            },
            0,
            "",
            DoesNotRaise(),
        ),  # test numeric query cast as int
        (
            {
                "query": {"lyric": {"contains": 100.001}},
                "limit": 10,
                "sort_by": "lyric"
            },
            0,
            "",
            DoesNotRaise(),
        ),  # test numeric query cast as float
        (
            {
                "query": {"lyric": {"starts_with": "started with"}},
                "limit": 10,
                "sort_by": "title",
            },
            0,
            "",
            DoesNotRaise(),
        ),  # the query is case-sensitive
        (
            {
                "query": {"lyric": {"starts_with": "started with"}},
                "limit": 10,
                "sort_by": "title",
            },
            0,
            "",
            DoesNotRaise(),
        ),  # intended: the query contains a key that doesn't exist; this shouldn't fail
        # NOTE(review): this case is identical to the case-sensitivity case above
        # and uses the existing "lyric" key -- confirm the intended query
        (
            {"lyric": {"starts_with": "started with"}, "limit": 10, "sort_by": "title"},
            0,
            "",
            DoesNotRaise(),
        ),  # the query is missing the query key; this returns an "error" key but doesn't raise an error
        (
            {
                "query": {
                    "and": [
                        {"title": {"starts_with": "tolerate"}},
                        {"title": {"contains": "it"}},
                    ]
                },
                "limit": 2,
                "sort_by": "title",
            },
            1,
            "tolerate it",
            DoesNotRaise(),
        ),  # test complex query with single query
        (
            {
                "query": {
                    "or": {
                        "and": [
                            {"title": {"starts_with": "tolerate"}},
                            {"title": {"contains": "it"}},
                        ],
                        "lyric": {"contains": "kiss"},
                    }
                },
                "limit": 2,
                "sort_by": "title",
            },
            2,
            "tolerate it",
            DoesNotRaise(),
        ),  # test complex query with multiple queries
        (
            {
                "query": {},
                "limit": 10,
                "sort_by": "title",
            },
            0,
            "",
            DoesNotRaise(),
        ),  # test empty query
        (
            {
                "query": "*",
                "skip": 2,
                "limit": 1,
                "sort_by": "title",
            },
            1,
            "The Bolter",
            DoesNotRaise(),
        ),  # test skip combined with the match-all ("*") query
        (
            {
                "query": "*",
                "limit": 10,
                "sort_by": "title",
            },
            3,
            "tolerate it",
            DoesNotRaise(),
        ),  # test all query
        (
            {
                "query": {
                    "not": {"lyric": {"contains": "kiss"}},
                },
                "limit": 10,
                "sort_by": "title",
            },
            2,
            "tolerate it",
            DoesNotRaise(),
        ),  # test not with no and query
        (
            {
                "query": {
                    "and": {
                        "or": [
                            {"lyric": {"contains": "sky", "boost": 3}},
                            {"lyric": {"contains": "kiss", "boost": 3}},
                        ],
                        "not": {"lyric": {"contains": "kiss"}},
                    }
                },
                "limit": 10,
                "sort_by": "title",
            },
            2,
            "tolerate it",
            DoesNotRaise(),
        ),  # test not query within an and query
    ],
)
@pytest.mark.timeout(30)
def test_search(
    create_indices,
    query,
    number_of_documents_expected,
    top_result_value,
    raises_exception,
):
    """Run one parametrized query against the small index and check the response.

    Args:
        create_indices: Session fixture yielding ``(index, large_index)``.
        query: The query dict passed to ``index.search``.
        number_of_documents_expected: Expected length of
            ``response["documents"]``.
        top_result_value: Expected ``title`` of the first document; only
            checked when at least one document is expected.
        raises_exception: Context manager wrapped around the whole test body
            (``DoesNotRaise()`` for every case in the table above).
    """
    with raises_exception:
        index, large_index = create_indices

        response = index.search(query)

        # Printed so a failing case shows the response and the query that produced it.
        print(response, query)

        assert len(response["documents"]) == number_of_documents_expected

        if number_of_documents_expected > 0:
            assert response["documents"][0]["title"] == top_result_value

        # Upper bound on the reported query time (presumably seconds -- confirm
        # against the unit JameSQL uses for "query_time").
        assert float(response["query_time"]) < 0.06

        # run if --benchmark is passed
        # The flag is detected from sys.argv as a bare "--benchmark" token.
        if "--benchmark" in sys.argv:
            response = large_index.search(query)

            assert float(response["query_time"]) < 0.06
405 | 
406 | 
407 | # TODO: TF/IDF needs to be calculated after all documents have been inserted
408 | # Otherwise TF/IDF score will vary on document insertion order
409 | # which we don't want
@pytest.mark.parametrize(
    "query, top_document_name, top_document_score, raises_exception",
    [
        (
            {
                "query": {"title": {"contains": "tolerate"}},
                "limit": 2,
                "query_score": "(_score + 2)",
            },
            "tolerate it",
            2.0,
            DoesNotRaise(),
        ),  # test a custom score expression that adds a constant
        (
            {
                "query": {"title": {"contains": "tolerate"}},
                "limit": 2,
                "query_score": "(_score * 2)",
                "sort_by": "_score",
            },
            "tolerate it",
            0.09010335735736986,
            DoesNotRaise(),
        ),  # test a custom score expression that multiplies, sorted by _score
        (
            {
                "query": {"lyric": {"contains": "sky", "boost": 56}},
                "limit": 10,
                "sort_by": "title",
            },
            "tolerate it",
            2.5228940060063563,
            DoesNotRaise(),  # test searching TF/IDF indexed field
        ),
    ],
)
def test_query_score_and_boost(
    create_indices,
    query,
    top_document_name,
    top_document_score,
    raises_exception,
):
    """Check the title and ``_score`` of the top document for scored queries.

    Args:
        create_indices: Session fixture yielding ``(index, large_index)``.
        query: The query dict passed to ``index.search``; uses
            ``query_score`` expressions and per-field ``boost`` values.
        top_document_name: Expected ``title`` of the first document.
        top_document_score: Expected ``_score`` of the first document.
        raises_exception: Context manager wrapped around the whole test body.
    """
    with raises_exception:
        index, _ = create_indices
        response = index.search(query)

        # NOTE(review): the GSIs are recreated after the search and before the
        # assertions, exactly as before -- presumably to refresh the shared
        # session-scoped index for later tests; confirm.
        index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS)
        index.create_gsi("lyric", strategy=GSI_INDEX_STRATEGIES.CONTAINS)

        top_document = response["documents"][0]

        assert top_document["title"] == top_document_name
        # Scores are floats produced by arithmetic; compare with a tolerance
        # rather than bit-for-bit equality so harmless rounding differences
        # do not fail the test.
        assert top_document["_score"] == pytest.approx(top_document_score)
462 | 
463 | 
def test_add_item(
    create_indices,
):
    """Add a document to the shared index and find it with an ``equals`` query."""
    small_index, _unused_large_index = create_indices

    new_document = {"title": "shake it off", "lyric": "I stay out too late"}
    small_index.add(new_document)

    # Rebuild the title GSI after the insert, then search on it.
    small_index.create_gsi("title", strategy=GSI_INDEX_STRATEGIES.CONTAINS)

    search_request = {
        "query": {"title": {"equals": "shake it off"}},
        "limit": 10,
        "sort_by": "title",
    }
    matching_documents = small_index.search(search_request)["documents"]

    assert len(matching_documents) == 1
482 | 
483 | 
def test_remove_item(
    create_indices,
):
    """Remove the document matching "tolerate" and check it is no longer found."""
    small_index, _unused_large_index = create_indices

    # The same request is issued before and after the removal.
    tolerate_request = {
        "query": {"title": {"contains": "tolerate"}},
        "limit": 10,
        "sort_by": "title",
    }

    before_removal = small_index.search(tolerate_request)
    document_id = before_removal["documents"][0]["uuid"]

    small_index.remove(document_id)

    after_removal = small_index.search(
        {
            "query": {"title": {"contains": "tolerate"}},
            "limit": 10,
            "sort_by": "title",
        }
    )

    assert len(after_removal["documents"]) == 0
510 | 
511 | 
def test_query_exceeding_maximum_subqueries(example_stub_and_query, create_indices):
    """Check that a query with too many conditions is rejected with an error.

    Twenty-five extra ``contains`` conditions (on keys ``lyric0`` to
    ``lyric24``) are appended to the stub query's ``and`` list before it is
    run; the response must contain no documents and an ``error`` message.
    """
    extra_conditions = [
        {f"lyric{position}": {"contains": "kiss"}} for position in range(25)
    ]
    example_stub_and_query["query"]["and"].extend(extra_conditions)

    small_index, _unused_large_index = create_indices

    result = small_index.search(example_stub_and_query)
    returned_count = len(result["documents"])

    assert returned_count == 0
    assert result["error"].startswith("Too many query conditions.")
524 | 


--------------------------------------------------------------------------------
/web/templates/search.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html lang="en">
  3 | <head>
  4 |     <meta charset="UTF-8">
  5 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
  6 |     <title>JameSQL Query Experimentation Tool</title>
  7 | 
  8 |     <script src="/ace-builds/src-noconflict/ace.js" type="text/javascript" charset="utf-8"></script>
  9 | 
 10 |     <style>
 11 |         :root {
 12 |             --primary-color: royalblue;
 13 |         }
 14 |         html {
 15 |             border-top: 0.5em solid var(--primary-color);
 16 |         }
 17 |         body {
 18 |             font-family: Helvetica, sans-serif;
 19 |             display: grid;
 20 |             grid-template-columns: 2fr 7fr;
 21 |             grid-gap: 2em;
 22 |             grid-template-areas: "aside main";
 23 |             padding-top: 0;
 24 | 
 25 |         }
 26 |         aside {
 27 |             padding-left: 1em;
 28 |             padding-right: 1em;
 29 |             border-right: 1px solid #ccc;
 30 |             height: 100vh;
 31 |             overflow-y: auto;
 32 |         }
 33 |         textarea, input {
 34 |             box-sizing: border-box;
 35 |         }
 36 |         pre {
 37 |             white-space: pre-wrap;
 38 |             word-wrap: break-word;
 39 |             max-width: 50em;
 40 |             overflow-x: auto;
 41 |             background-color: #f8f9fa;
 42 |             padding: 1em;
 43 |             border-radius: 0.5em;
 44 |         }
 45 |         a {
 46 |             color: var(--primary-color);
 47 |             text-decoration: none;
 48 |         }
 49 |         #editor, #preview {
 50 |             width: 100%;
 51 |             height: 100%;
 52 |             border: 1px solid #ccc;
 53 |             margin: 0;
 54 |         }
 55 |         .header {
 56 |             display: flex;
 57 |             justify-content: space-between;
 58 |             align-items: center;
 59 |         }
 60 |         button {
 61 |             background-color: var(--primary-color);
 62 |             color: white;
 63 |             border: none;
 64 |             padding: 0.5em 1em;
 65 |             border-radius: 5px;
 66 |             cursor: pointer;
 67 |         }
 68 |         button:hover {
 69 |             background-color: #0069d9;
 70 |         }
 71 |         button:focus {
 72 |             background-color: yellow;
 73 |             color: black;
 74 |         }
 75 |         .show-on-mobile {
 76 |             display: none;
 77 |         }
 78 |         @media (max-width: 768px) {
 79 |             main {
 80 |                 grid-template-columns: 1fr;
 81 |             }
 82 |             section {
 83 |                 min-height: 25vh;
 84 |             }
 85 |             .show-on-mobile {
 86 |                 display: block;
 87 |             }
 88 |         }
 89 |         label {
 90 |             font-weight: 600;
 91 |             margin-bottom: 0.5em;
 92 |             margin-top: 1em;
 93 |             display: block;
 94 |         }
 95 |         .search-box {
 96 |             display: grid;
 97 |             grid-template-columns: 3fr 0.01fr 1fr;
 98 |             grid-gap: 1em;
 99 |             margin-top: 1em;
100 |         }
101 |         input[type="text"] {
102 |             width: 100%;
103 |             font-size: 1em;
104 |             border-radius: 0.5em;
105 |             border: 0.1em solid lightgrey;
106 |             padding: 0.5em;
107 |         }
108 |         button {
109 |             border-radius: 0.5em;
110 |             cursor: pointer;
111 |             outline: none;
112 |             font-size: 1em;
113 |             border: 0.1em solid lightgrey;
114 |         }
115 |         button:hover {
116 |             background-color: rgb(208, 208, 208);
117 |             color: black;
118 |         }
119 |         button:focus {
120 |             background-color: rgb(255, 225, 116);
121 |             color: black;
122 |             font-weight: 600;
123 |             outline: none;
124 |         }
125 |         input[type="text"]:focus {
126 |             background-color: rgb(255, 225, 116);
127 |             border: 0.1em solid lightgrey;
128 |             outline: none;
129 |         }
130 |         ul {
131 |             padding: 0;
132 |         }
133 |         ul li {
134 |             list-style-type: none;
135 |             margin-bottom: 1em;
136 |         }
137 |         select {
138 |             width: 100%;
139 |             font-size: 1em;
140 |             border-radius: 0.5em;
141 |             border: 0.1em solid lightgrey;
142 |             padding: 0.5em;
143 |         }
144 |         textarea {
145 |             width: 100%;
146 |             font-size: 1em;
147 |             border-radius: 0.5em;
148 |             border: 0.1em solid lightgrey;
149 |             padding: 0.5em;
150 |             height: 5em;
151 |         }
152 |         @media (max-width: 768px) {
153 |             body {
154 |                 grid-template-areas: "main" "aside";
155 |                 grid-template-columns: 1fr;
156 |             }
157 |             aside {
158 |                 order: 2;
159 |                 border-top: 1px solid #ccc;
160 |                 border-right: none;
161 |                 padding-top: 1em;
162 |                 padding-left: 1em;
163 |                 padding-right: 1em;
164 |             }
165 |             
166 |         }
167 |     </style>
168 | 
169 |     <link rel="icon" href="https://jamesg.blog/favicon.ico" type="image/x-icon">
170 | </head>
171 | <body>
172 |     <aside>
173 |         <section>
174 |             <h1><img src="https://jamesg.blog/assets/mascot.svg" height="25" style="margin-bottom: -5px; margin-right: 0.5em;" /> JameSQL</h1>
175 |             <form>
176 |                 <label>Ranking algorithm:</label>
177 |                 <textarea id="ranking">(_score + log (inlinks))</textarea>
178 |                 <br>
179 |                 <label>Query type:</label>
180 |                 <select id="query_type">
181 |                     <option value="json_query">JSON</option>
182 |                     <option value="string_query">String</option>
183 |                 </select>
184 |                 <label>Fields to search:</label>
185 |                 <input type="text" id="fields" value="{% for field in field_names %}{{ field }}{% if not loop.last %}, {% endif %}{% endfor %}">
186 |                 <br>
187 |                 <label>Keyword operator:</label>
188 |                 <select id="operator">
189 |                     <option value="and">AND</option>
190 |                     <option value="or">OR</option>
191 |                 </select>
192 |                 <label>Sort by:</label>
193 |                 <select id="sort_by">
194 |                     <option value="_score">_score</option>
195 |                     {% for field in field_names %}
196 |                         <option value="{{ field }}">{{ field }}</option>
197 |                     {% endfor %}
198 |                 </select>
199 |                 <label>Highlight matching terms (strict search):</label>
200 |                 <input type="checkbox" id="highlight_matching_terms" value="false">
201 |                 <label>Show all data:</label>
202 |                 <input type="checkbox" id="show_all_data" value="true">
203 |                 <br><br>
204 |                 <details>
205 |                     <summary>Advanced</summary>
206 |                     <h3>Boosts</h3>
207 |                     {% for field in field_names %}
208 |                         <label>{{ field }}</label>
209 |                         <input type="text" id="boost_{{ field }}" value="1">
210 |                     {% endfor %}
211 |                     <h3>Indexed Fields</h3>
212 |                     <ul>
213 |                         {% for field in field_names %}
214 |                             <li>{{ field }}: {{ field_names[field] }}</li>
215 |                         {% endfor %}
216 |                     </ul>
217 |                     <div id="computed_query"></div>
218 |                 </details>
219 |             </form>
220 |         </section>
221 |     </aside>
222 |     <section>
223 |         <div class="search-box">
224 |             <input autocomplete="off" id="q" name="q" placeholder="Search..." type="text">
225 |             <span></span>
226 |             <button type="submit">Search</button>
227 |         </div>
228 |         <section>
229 |             <h2>Results</h2>
230 |             <ul id="results"></ul>
231 |         </section>
232 |     </section>
233 |     <script>
234 |         // if ?q= is in the URL, set the search input value to the query
235 |         var urlParams = new URLSearchParams(window.location.search);
236 |         if (urlParams.has('q')) {
237 |             document.querySelector('input[name="q"]').value = urlParams.get('q');
238 |             submit();
239 |         }
240 |         // add listener
241 |         document.querySelector('button[type="submit"]').addEventListener('click', submit);
242 |     
243 |         // if enter is pressed, submit the form
244 |         document.querySelector('input[name="q"]').addEventListener('keypress', function (e) {
245 |             if (e.key === 'Enter') {
246 |                 submit();
247 |             }
248 |         });
249 |     
250 |         // if enter pressed on button
251 |         document.querySelector('button[type="submit"]').addEventListener('keypress', function (e) {
252 |             if (e.key === 'Enter') {
253 |                 submit();
254 |             }
255 |         });
256 | 
257 |         // if show all data is clicked, resubmit query
258 |         document.querySelector('#show_all_data').addEventListener('click', function () {
259 |             submit();
260 |         });
261 | 
262 |         // if highlight matching terms is clicked, resubmit query
263 |         document.querySelector('#highlight_matching_terms').addEventListener('click', function () {
264 |             submit();
265 |         });
266 | 
267 |         // if query type changed to string, hide keyword operator, boosts, and highlights
268 |         document.querySelector('#query_type').addEventListener('change', function () {
269 |             if (document.querySelector('#query_type').value === 'string_query') {
270 |                 document.querySelector('#operator').style.display = 'none';
271 |                 document.querySelector('#highlight_matching_terms').style.display = 'none';
272 |                 document.querySelector('#boost').style.display = 'none';
273 |             } else {
274 |                 document.querySelector('#operator').style.display = 'block';
275 |                 document.querySelector('#highlight_matching_terms').style.display = 'block';
276 |                 document.querySelector('#boost').style.display = 'block';
277 |             }
278 |         });
279 |         
280 |         function submit() {
281 |             var query = document.querySelector('input[name="q"]').value;
282 |             var start = 0;
283 |     
284 |             if (urlParams.has('page')) {
285 |                 start = (urlParams.get('page') - 1) * 10;
286 |             }
287 |     
288 |             function toTitleCase(str) {
289 |               return str.replace(
290 |                 /\w\S*/g,
291 |                 text => text.charAt(0).toUpperCase() + text.substring(1).toLowerCase()
292 |               );
293 |             }
294 |     
295 |             // update page url to ?q={query}&page={page}, without reloading the page or adding to history
296 |             var newUrl = window.location.protocol + '//' + window.location.host + window.location.pathname + '?q=' + query + '&page=' + (start / 10 + 1);
297 |             window.history.pushState({ path: newUrl }, '', newUrl);
298 | 
299 |             var query_json = [];
300 | 
301 |             for (var field of document.querySelector('#fields').value.split(',')) {
302 |                 var record = {[field.trim()]: {"contains": query}};
303 | 
304 |                 if (document.querySelector('#boost_' + field.trim()).value !== '1') {
305 |                     record[field.trim()]['boost'] = document.querySelector('#boost_' + field.trim()).value;
306 |                 }
307 | 
308 |                 if (document.querySelector('#highlight_matching_terms').checked) {
309 |                     record[field.trim()]['highlight'] = true;
310 |                     record[field.trim()]['strict'] = true;
311 |                 }
312 |                 query_json.push(record);
313 |             }
314 | 
315 |             // if press /, focus on search field
316 |             document.addEventListener('keydown', function (e) {
317 |                 if (e.key === '/') {
318 |                     document.querySelector('input[name="q"]').focus();
319 |                     // clear the input
320 |                     document.querySelector('input[name="q"]').value = '';
321 |                     e.preventDefault();
322 |                 }
323 |             });
324 | 
325 |             var boosts = {};
326 | 
327 |             for (var field of document.querySelector('#fields').value.split(',')) {
328 |                 boosts[field.trim()] = document.querySelector('#boost_' + field.trim()).value;
329 |             }
330 |     
331 |             fetch('http://127.0.0.1:5000', {
332 |                 method: 'POST',
333 |                 headers: {
334 |                     'Content-Type': 'application/json'
335 |                 },
336 |                 body: JSON.stringify({
337 |                     query: {
338 |                         [document.querySelector('#operator').value]: query_json
339 |                     },
340 |                     raw_query: document.querySelector('input[name="q"]').value,
341 |                     start: start,
342 |                     query_score: document.querySelector('#ranking').value,
343 |                     sort_by: document.querySelector('#sort_by').value,
344 |                     highlight: true,
345 |                     highlight_stride: 10,
346 |                     type: document.querySelector('#query_type').value,
347 |                     fields: document.querySelector('#fields').value.split(',').map(field => field.trim()),
348 |                     boosts: boosts
349 |                 })
350 |             }).then(response => response.json())
351 |             .then(data => {
352 |                 // add query json to pre in computed query
353 |                 var computed_query = document.querySelector('#computed_query');
354 |                 computed_query.innerHTML = '<h3>Computed Query</h3>';
355 |                 var pre = document.createElement('pre');
356 |                 pre.textContent = JSON.stringify({
357 |                     query: {
358 |                         [document.querySelector('#operator').value]: query_json
359 |                     },
360 |                     start: start,
361 |                     query_score: document.querySelector('#ranking').value,
362 |                     sort_by: document.querySelector('#sort_by').value,
363 |                 }, null, 2);
364 |                 computed_query.appendChild(pre);
365 | 
366 |                 var list = document.querySelector('#results');
367 |                 list.innerHTML = '';
368 |     
369 |                 if (data.documents.length === 0) {
370 |                     var li = document.createElement('li');
371 |                     li.textContent = 'No results found.';
372 |                     list.appendChild(li);
373 |                     return;
374 |                 }
375 |                 
376 |                 var results_count = data.total_results;
377 |                 var query_time = data.query_time;
378 |     
379 |                 var li = document.createElement('li');
380 |     
381 |                 li.textContent = `Found ${results_count} result${results_count === 1 ? '' : 's'}. Viewing page ${start / 10 + 1} of ${Math.ceil(results_count / 10)}. Query took ${query_time}ms.`;
382 |     
383 |                 li.style.marginBottom = '1em';
384 |                 list.appendChild(li);
385 | 
386 |                 // if result.answer, add text
387 |                 if (data.answer) {
388 |                     var li = document.createElement('li');
389 |                     li.innerHTML = `<p><strong>Answer:</strong> ${data.answer}</p>`;
390 |                     list.appendChild(li);
391 |                 }
392 |     
393 |                 data.documents.forEach(article => {
394 |                     var li = document.createElement('li');
395 |                     li.classList.add('h-entry', 'list_entry');
396 |     
397 |                     var h3 = document.createElement('h3');
398 |                     h3.classList.add('p-name');
399 |                     var a = document.createElement('a');
400 |                     a.href = article.url;
401 | 
402 |                     a.textContent = article.title + ' - ' + article._score.toFixed(3);
403 |                     h3.appendChild(a);
404 | 
405 |                     // add pre with full json
406 |                     var articleElement = document.createElement('pre');
407 |                     // iterate over all keys; if len > 20 words, add ...
408 |                     var keys = Object.keys(article);
409 |                     keys.forEach(key => {
410 |                         var value = article[key];
411 |                         if (typeof value === 'string') {
412 |                             if (value.split(' ').length > 20) {
413 |                                 article[key] = value.split(' ').slice(0, 20).join(' ') + '...';
414 |                             }
415 |                         }
416 |                     });
417 |                     
418 |                     articleElement.textContent = JSON.stringify(article, null, 2);
419 |     
420 |                     li.appendChild(h3);
421 |                     // if highlight, add list
422 |                     if (article._context && article._context.length > 0) {
423 |                         var ul = document.createElement('ul');
424 |                         for (var key in article._context.slice(0, 3)) {
425 |                             var nli = document.createElement('li');
426 |                             nli.style.marginBottom = '1em';
427 |                             nli.innerHTML = article._context[key];
428 |                             nli.style.listStyleType = 'disc';
429 |                             nli.style.marginLeft = '1em';
430 |                             ul.appendChild(nli);
431 | 
432 |                             // surround matching words with mark regex
433 |                             console.log(query);
434 |                             var regex = new RegExp(query, 'gi');
435 |                             var text = nli.textContent;
436 |                             nli.innerHTML = text.replace(regex, '<mark>$&</mark>');
437 |                         }
438 |                         li.appendChild(ul);
439 | 
440 |                         // add ... if needed
441 |                         if (article._context.length > 3) {
442 |                             var span = document.createElement('span');
443 |                             span.textContent = '...';
444 |                             span.style.marginRight = '1em';
445 |                             li.appendChild(span);
446 |                         }
447 |                     }
448 | 
449 |                     if (document.querySelector('#show_all_data').checked) {
450 |                         li.appendChild(articleElement);
451 |                     }
452 |     
453 |                     list.appendChild(li);
454 |                 });
455 |                 
456 |                 if (results_count > 10) {
457 |                     var li = document.createElement('li');
458 |                     var pages = Math.ceil(results_count / 10);
459 |                     var ellipsis = false;
460 |                     for (var i = 1; i <= pages; i++) {
461 |                         // if i == 1 or i == len(pages), always show
462 |                         // otherwise, only show next and previous 2 pages
463 |                         if (i != 1 && i != pages) {
464 |                             if (i < start / 10 - 2 || i > start / 10 + 2) {
465 |                                 ellipsis = true;
466 |                                 continue;
467 |                             }
468 |                         }
469 |                         if (ellipsis) {
470 |                             var span = document.createElement('span');
471 |                             span.textContent = '...';
472 |                             span.style.marginRight = '1em';
473 |                             li.appendChild(span);
474 |                             ellipsis = false;
475 |                         }
476 |                         var a = document.createElement('a');
477 |                         a.href = `?q=${query}&page=${i}`;
478 |                         // if ?page= is in the URL, set the search input value to the query
479 |                         if (urlParams.has('page')) {
480 |                             if (urlParams.get('page') == i) {
481 |                                 a.style.fontWeight = '600';
482 |                             }
483 |                         }
484 |                         a.textContent = `Page ${i}`;
485 |                         a.style.marginRight = '1em';
486 |                         li.appendChild(a);
487 |                     }
488 |                     list.appendChild(li);
489 |                 }
490 |     
491 |                 // if page > 1 and results count == 0, redirect to page 1
492 |                 if (results_count == 0 && start > 0) {
493 |                     window.location.href = `?q=${query}&page=1`;
494 |                 }
495 |             })
496 |             .catch(error => {
497 |                 console.error('Error:', error);
498 |                 var list = document.querySelector('#results');
499 |                 list.innerHTML = '';
500 |                 var li = document.createElement('li');
501 |                 li.innerHTML = `<p>There was an error searching for "${query}". If this error persists, please email <a href="mailto:readers@jamesg.blog">readers@jamesg.blog</a>.</p>`;
502 |                 list.appendChild(li);
503 |             });
504 |         }
505 |     </script>
506 | </body>
507 | </html>


--------------------------------------------------------------------------------