├── .codiumai.toml ├── .coveragerc ├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ └── sweep-template.yml ├── PULL_REQUEST_TEMPLATE.md ├── dependabot.yml ├── labeler.yml └── workflows │ ├── auto-merge.yml │ ├── deploy-github-pages.yml │ ├── first-interaction.yml │ ├── issue_labeler.yml │ ├── python-publish.yml │ ├── stale_bot.yaml │ ├── test.yml │ └── tox.yml ├── .gitignore ├── .hgignore ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── .sonarcloud.properties ├── LICENSE.txt ├── MANIFEST.in ├── README.md ├── benchmark ├── dcvgr10.txt.gz ├── dictionary.py ├── enron.py ├── marc21.py ├── reuters.py └── reuters21578.txt.gz ├── codecov.yml ├── docs ├── Makefile ├── make.bat ├── requirements.txt └── source │ ├── analysis.rst │ ├── api │ ├── analysis.rst │ ├── api.rst │ ├── codec │ │ └── base.rst │ ├── collectors.rst │ ├── columns.rst │ ├── fields.rst │ ├── filedb │ │ ├── filestore.rst │ │ ├── filetables.rst │ │ └── structfile.rst │ ├── formats.rst │ ├── highlight.rst │ ├── idsets.rst │ ├── index.rst │ ├── lang │ │ ├── morph_en.rst │ │ ├── porter.rst │ │ └── wordnet.rst │ ├── matching.rst │ ├── qparser.rst │ ├── query.rst │ ├── reading.rst │ ├── scoring.rst │ ├── searching.rst │ ├── sorting.rst │ ├── spelling.rst │ ├── support │ │ ├── charset.rst │ │ └── levenshtein.rst │ ├── util.rst │ └── writing.rst │ ├── batch.rst │ ├── conf.py │ ├── dates.rst │ ├── facets.rst │ ├── fieldcaches.rst │ ├── glossary.rst │ ├── highlight.rst │ ├── index.rst │ ├── indexing.rst │ ├── intro.rst │ ├── keywords.rst │ ├── nested.rst │ ├── ngrams.rst │ ├── parsing.rst │ ├── query.rst │ ├── querylang.rst │ ├── quickstart.rst │ ├── recipes.rst │ ├── releases │ ├── 0_3.rst │ ├── 1_0.rst │ ├── 2_0.rst │ └── index.rst │ ├── schema.rst │ ├── searching.rst │ ├── spelling.rst │ ├── stemming.rst │ ├── tech │ ├── backend.rst │ ├── filedb.rst │ └── index.rst │ └── threads.rst ├── files ├── whoosh.svg ├── whoosh_16.png ├── whoosh_35.png ├── whoosh_64.png └── whoosh_small.svg ├── pyproject.toml ├── requirements-dev.txt ├── requirements.txt ├── scripts ├── make_checkpoint.py ├── pylint.ini └── read_checkpoint.py ├── setup.cfg ├── setup.py ├── src └── whoosh │ ├── __init__.py │ ├── analysis │ ├── __init__.py │ ├── acore.py │ ├── analyzers.py │ ├── filters.py │ ├── intraword.py │ ├── morph.py │ ├── ngrams.py │ └── tokenizers.py │ ├── automata │ ├── __init__.py │ ├── fsa.py │ ├── fst.py │ ├── glob.py │ ├── lev.py │ └── reg.py │ ├── classify.py │ ├── codec │ ├── __init__.py │ ├── base.py │ ├── memory.py │ ├── plaintext.py │ ├── whoosh2.py │ └── whoosh3.py │ ├── collectors.py │ ├── columns.py │ ├── externalsort.py │ ├── fields.py │ ├── filedb │ ├── __init__.py │ ├── compound.py │ ├── fileindex.py │ ├── filepostings.py │ ├── filereading.py │ ├── filestore.py │ ├── filetables.py │ ├── filewriting.py │ ├── gae.py │ ├── misc.py │ ├── pools.py │ └── structfile.py │ ├── formats.py │ ├── highlight.py │ ├── idsets.py │ ├── index.py │ ├── lang │ ├── __init__.py │ ├── dmetaphone.py │ ├── isri.py │ ├── lovins.py │ ├── morph_en.py │ ├── paicehusk.py │ ├── phonetic.py │ ├── porter.py │ ├── porter2.py │ ├── snowball │ │ ├── LICENSE.txt │ │ ├── __init__.py │ │ ├── bases.py │ │ ├── danish.py │ │ ├── dutch.py │ │ ├── english.py │ │ ├── finnish.py │ │ ├── french.py │ │ ├── german.py │ │ ├── hungarian.py │ │ ├── italian.py │ │ ├── norwegian.py │ │ ├── portugese.py │ │ ├── romanian.py │ │ ├── russian.py │ │ ├── spanish.py │ │ └── swedish.py │ ├── stopwords.py │ └── wordnet.py │ ├── legacy.py │ ├── matching │ ├── __init__.py │ ├── binary.py │ ├── combo.py 
│ ├── mcore.py │ └── wrappers.py │ ├── multiproc.py │ ├── qparser │ ├── __init__.py │ ├── common.py │ ├── dateparse.py │ ├── default.py │ ├── plugins.py │ ├── syntax.py │ └── taggers.py │ ├── query │ ├── __init__.py │ ├── compound.py │ ├── nested.py │ ├── positional.py │ ├── qcolumns.py │ ├── qcore.py │ ├── ranges.py │ ├── spans.py │ ├── terms.py │ └── wrappers.py │ ├── reading.py │ ├── scoring.py │ ├── searching.py │ ├── sorting.py │ ├── spelling.py │ ├── support │ ├── __init__.py │ ├── base85.py │ ├── bench.py │ ├── bitstream.py │ ├── bitvector.py │ ├── charset.py │ ├── levenshtein.py │ ├── pyparsing.py │ ├── relativedelta.py │ └── unicode.py │ ├── system.py │ ├── util │ ├── __init__.py │ ├── cache.py │ ├── filelock.py │ ├── loading.py │ ├── numeric.py │ ├── numlists.py │ ├── testing.py │ ├── text.py │ ├── times.py │ ├── varints.py │ └── versions.py │ └── writing.py ├── stress ├── test_bigfacet.py ├── test_bigindex.py ├── test_bigsort.py ├── test_bigtable.py ├── test_hugeindex.py ├── test_threading.py └── test_update.py ├── sweep.yaml ├── tests ├── english-words.10.gz ├── test_analysis.py ├── test_automata.py ├── test_bits.py ├── test_classify.py ├── test_codecs.py ├── test_collector.py ├── test_columns.py ├── test_compound.py ├── test_dateparse.py ├── test_fields.py ├── test_flexible.py ├── test_highlighting.py ├── test_indexing.py ├── test_matching.py ├── test_misc.py ├── test_mpwriter.py ├── test_nested.py ├── test_parse_plugins.py ├── test_parsing.py ├── test_postings.py ├── test_quality.py ├── test_queries.py ├── test_reading.py ├── test_results.py ├── test_searching.py ├── test_sorting.py ├── test_spans.py ├── test_spelling.py ├── test_stem.py ├── test_tables.py ├── test_vectors.py ├── test_weightings.py └── test_writing.py └── tox.ini /.codiumai.toml: -------------------------------------------------------------------------------- 1 | #.codiumai.toml 2 | [tests] 3 | 4 | ## Testing framework to use - this can affect the content of the generated tests 5 | ## as well as the test run command. 6 | ## Possible values are: 7 | ## Python: Pytest, Unittest 8 | framework = "Pytest" 9 | 10 | ## A hint to the test generator about whether to use mocks or not. Possible values are true or false. 11 | # use_mocks = false 12 | 13 | ## How many tests should be generated by default. Fewer tests is faster. 14 | ## Does not apply at the moment to extend-suite tests. 15 | num_desired_tests = 20 16 | 17 | ## A multiline string, delimited with triple-quotes (""") serving as an extra instruction 18 | ## that the AI model will take into consideration. 19 | ## This will appear as "General instructions" in the 20 | ## configuration section in the tests panel. 21 | # plan_instructions = """ 22 | # Each line should have a comment explaining it. 23 | # Each comment should start with the comment number (1., 2. etc.) 24 | # """ 25 | 26 | ## A multiline string, delimited with triple-quotes (""") serving as an example test that represents 27 | ## what you would like the generated tests to look like in terms of style, setup, etc. 
28 | # example_test = """ 29 | # describe("something", () => { 30 | # it("says 'bar'", () => { 31 | # // given 32 | # 33 | # // when 34 | # const res = something.say(); 35 | # 36 | # // Then 37 | # expect(res).to.equal("bar"); 38 | # }); 39 | # }); 40 | # """ 41 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | omit = 4 | # Autogenerated missed code handles other VCSes. 5 | devito/_version.py 6 | examples/*__init__* 7 | concurrency = multiprocessing 8 | parallel = True 9 | 10 | [report] 11 | # Regexes for lines to exclude from consideration 12 | exclude_lines = 13 | # Don't complain about missing debug-only code: 14 | def __repr__ 15 | 16 | # Don't complain if tests don't hit defensive assertion code: 17 | raise NotImplementedError 18 | raise ValueError 19 | raise TypeError 20 | raise RuntimeError 21 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: [ZeroCool940711] 2 | patreon: zerocool94 3 | ko_fi: zerocool94 4 | open_collective: sygil_dev 5 | custom: ["https://paypal.me/zerocool94"] 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/sweep-template.yml: -------------------------------------------------------------------------------- 1 | name: Sweep Issue 2 | title: 'Sweep: ' 3 | description: For small bugs, features, refactors, and tests to be handled by Sweep, an AI-powered junior developer. 4 | labels: sweep 5 | body: 6 | - type: textarea 7 | id: description 8 | attributes: 9 | label: Details 10 | description: Tell Sweep where and what to edit and provide enough context for a new developer to the codebase 11 | placeholder: | 12 | Unit Tests: Write unit tests for <FILE>. Test each function in the file. Make sure to test edge cases. 13 | Bugs: The bug might be in <FILE>. Here are the logs: ... 14 | Features: the new endpoint should use the ... class from <FILE> because it contains ... logic. 15 | Refactors: We are migrating this function to ... version because ... 16 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | # Description 2 | 3 | Please include: 4 | * relevant motivation 5 | * a summary of the change 6 | * which issue is fixed 7 | * any additional dependencies that are required for this change 8 | 9 | Closes: # (issue) 10 | 11 | # Checklist: 12 | 13 | - [ ] I have performed a self-review of my own code 14 | - [ ] I have commented my code in hard-to-understand areas 15 | - [ ] I have made corresponding changes to the documentation 16 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located.
3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | # Enable version updates for pip 9 | - package-ecosystem: 'pip' 10 | directory: '/' 11 | # Check the PyPI registry for updates once a day 12 | schedule: 13 | interval: 'daily' 14 | 15 | - package-ecosystem: 'github-actions' 16 | directory: '/' 17 | schedule: 18 | interval: 'daily' 19 | -------------------------------------------------------------------------------- /.github/workflows/auto-merge.yml: -------------------------------------------------------------------------------- 1 | name: Dependabot auto-merge 2 | on: pull_request 3 | 4 | permissions: 5 | contents: write 6 | pull-requests: write 7 | 8 | jobs: 9 | dependabot: 10 | runs-on: ubuntu-latest 11 | if: github.actor == 'dependabot[bot]' 12 | steps: 13 | - name: Dependabot metadata 14 | id: metadata 15 | uses: dependabot/fetch-metadata@v2 16 | with: 17 | github-token: "${{ secrets.GITHUB_TOKEN }}" 18 | - name: Enable auto-merge for Dependabot PRs 19 | run: gh pr merge --auto --merge "$PR_URL" 20 | env: 21 | PR_URL: ${{github.event.pull_request.html_url}} 22 | GH_TOKEN: ${{secrets.GITHUB_TOKEN}} 23 | -------------------------------------------------------------------------------- /.github/workflows/deploy-github-pages.yml: -------------------------------------------------------------------------------- 1 | name: Deploy to GitHub Pages 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | # Review gh actions docs if you want to further define triggers, paths, etc. 8 | # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#on 9 | 10 | workflow_dispatch: # This line allows manual triggering 11 | 12 | jobs: 13 | deploy: 14 | name: Deploy to GitHub Pages 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v4.1.1 18 | - uses: actions/setup-python@v5.0.0 19 | with: 20 | python-version: '3.7' 21 | 22 | - name: Install dependencies 23 | run: pip install -r requirements.txt && pip install -r docs/requirements.txt 24 | - name: Build website 25 | run: cd docs && make html 26 | 27 | # Popular action to deploy to GitHub Pages: 28 | # Docs: https://github.com/peaceiris/actions-gh-pages#%EF%B8%8F-docusaurus 29 | - name: Deploy to GitHub Pages 30 | uses: peaceiris/actions-gh-pages@v4.0.0 31 | with: 32 | github_token: ${{ secrets.GITHUB_TOKEN }} 33 | # Build output to publish to the `gh-pages` branch: 34 | publish_dir: ./docs/build/html 35 | # The following lines assign commit authorship to the official 36 | # GH-Actions bot for deploys to `gh-pages` branch: 37 | # https://github.com/actions/checkout/issues/13#issuecomment-724415212 38 | # The GH actions bot is used by default if you didn't specify the two fields. 39 | # You can swap them out with your own user credentials.
40 | user_name: github-actions[bot] 41 | user_email: 41898282+github-actions[bot]@users.noreply.github.com 42 | -------------------------------------------------------------------------------- /.github/workflows/first-interaction.yml: -------------------------------------------------------------------------------- 1 | name: first-interaction 2 | 3 | on: 4 | issues: 5 | types: [opened] 6 | pull_request: 7 | branches: [main] 8 | types: [opened] 9 | 10 | jobs: 11 | check_for_first_interaction: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v4 15 | - uses: actions/first-interaction@main 16 | with: 17 | repo-token: ${{ secrets.GITHUB_TOKEN }} 18 | issue-message: | 19 | Hello! Thank you for filing an issue. 20 | 21 | If this is a bug report, please include relevant logs to help us debug the problem. 22 | pr-message: | 23 | Hello! Thank you for your contribution. 24 | 25 | If you are fixing a bug, please reference the issue number in the description. 26 | 27 | If you are implementing a feature request, please check with the maintainers that the feature will be accepted first. 28 | -------------------------------------------------------------------------------- /.github/workflows/issue_labeler.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | name: Issue labeler 4 | on: 5 | issues: 6 | types: [ opened ] 7 | 8 | permissions: 9 | contents: read 10 | 11 | jobs: 12 | label-component: 13 | runs-on: ubuntu-22.04 14 | 15 | permissions: 16 | issues: write 17 | 18 | strategy: 19 | matrix: 20 | template: [ bug_report.yml, feature_request.yml ] 21 | 22 | steps: 23 | - uses: actions/checkout@v4 24 | 25 | - name: Parse issue form 26 | uses: stefanbuck/github-issue-parser@c1a559d78bfb8dd05216dab9ffd2b91082ff5324 27 | id: issue-parser 28 | with: 29 | template-path: .github/ISSUE_TEMPLATE/${{ matrix.template }} 30 | 31 | - name: Set labels based on component field 32 | uses: redhat-plumbers-in-action/advanced-issue-labeler@d498805e5c7c0658e336948b3363480bcfd68da6 33 | with: 34 | issue-form: ${{ steps.issue-parser.outputs.jsonString }} 35 | template: ${{ matrix.template }} 36 | token: ${{ secrets.GITHUB_TOKEN }} 37 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 
8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | workflow_dispatch: # This line allows manual triggering 16 | 17 | #push: 18 | # branches: 19 | # - master 20 | 21 | permissions: 22 | contents: read 23 | 24 | jobs: 25 | deploy: 26 | 27 | runs-on: ubuntu-latest 28 | 29 | steps: 30 | - uses: actions/checkout@v4 31 | - name: Set up Python 32 | uses: actions/setup-python@v5 33 | with: 34 | python-version: '3.9' 35 | - name: Install dependencies 36 | run: | 37 | python -m pip install --upgrade pip 38 | pip install build 39 | - name: Build package 40 | run: python -m build 41 | - name: Publish package 42 | uses: pypa/gh-action-pypi-publish@v1.8.14 43 | with: 44 | user: __token__ 45 | password: ${{ secrets.PYPI_API_TOKEN }} 46 | -------------------------------------------------------------------------------- /.github/workflows/stale_bot.yaml: -------------------------------------------------------------------------------- 1 | name: Stale bot 2 | 3 | on: 4 | schedule: 5 | - cron: '0 0 * * *' 6 | 7 | permissions: 8 | pull-requests: write 9 | 10 | jobs: 11 | stale: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Mark and close stale PRs 15 | uses: actions/stale@v9 16 | with: 17 | stale-pr-message: "This PR is stale because it has been 60 days with no activity. This PR will be automatically closed within 7 days if there is no further activity." 18 | close-pr-message: "This PR was closed because it has been stalled for some time with no activity." 19 | days-before-stale: -1 # avoid marking issues 20 | days-before-pr-stale: 60 21 | days-before-close: -1 # avoid closing issues 22 | days-before-pr-close: 7 23 | exempt-all-pr-assignees: true # avoid stale for all PR with assignees 24 | exempt-all-pr-milestones: true # avoid stale for all PR with milestones 25 | operations-per-run: 200 26 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Test 3 | 4 | on: [pull_request, push, workflow_dispatch] 5 | 6 | jobs: 7 | codespell_and_ruff: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v4 11 | - run: pip install --user codespell[toml] ruff 12 | # TODO: Fix lint issues and remove `--exit-zero` from the line below. 13 | - run: ruff --exit-zero --output-format=github --target-version=py38 . 14 | test: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: [3.8, 3.9, "3.10", 3.11, 3.12] 19 | steps: 20 | - uses: actions/checkout@v4 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip setuptools wheel 28 | pip install pytest pytest-cov pytest-sugar coverage cached-property 29 | - name: Install Whoosh 30 | run: | 31 | pip install -e . 
32 | - name: Run tests 33 | run: | 34 | pytest --cov=./ --cov-report=xml --cov-report=html 35 | - name: Upload HTML coverage report 36 | uses: actions/upload-artifact@v4 37 | with: 38 | name: "HTML Coverage ${{ matrix.python-version }}" 39 | path: "htmlcov" 40 | retention-days: 7 41 | 42 | - name: Upload Coverage to Codecov 43 | uses: codecov/codecov-action@v4.4.1 44 | with: 45 | directory: ./ 46 | env_vars: OS,PYTHON 47 | fail_ci_if_error: true 48 | files: ./coverage.xml 49 | handle_no_reports_found: true 50 | token: ${{ secrets.CODECOV_TOKEN }} # required 51 | verbose: true # optional (default = false) 52 | -------------------------------------------------------------------------------- /.github/workflows/tox.yml: -------------------------------------------------------------------------------- 1 | name: tox 2 | on: [push, pull_request, workflow_dispatch] 3 | jobs: 4 | tox: 5 | strategy: 6 | fail-fast: false 7 | max-parallel: 5 8 | matrix: 9 | os: [ubuntu-latest] # [macos-latest, ubuntu-latest, windows-latest] 10 | python: ['3.8', '3.9', '3.10', '3.11', '3.12'] 11 | runs-on: ${{ matrix.os }} 12 | steps: 13 | - uses: actions/checkout@v4 14 | - uses: actions/setup-python@v5 15 | with: 16 | python-version: ${{ matrix.python }} 17 | - run: pip install --upgrade pip 18 | - run: pip install tox 19 | - run: tox -e py 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | __pycache__/ 3 | .tox/ 4 | env/ 5 | build/ 6 | dist/ 7 | eggs/ 8 | .eggs/ 9 | *.egg 10 | *.egg-info/ 11 | /test.py 12 | /.vscode/settings.json 13 | /.coverage 14 | /whoosh-reloaded.code-workspace 15 | /.vscode/launch.json 16 | *.coverage.DESKTOP-* 17 | /coverage.xml 18 | /lcov.info 19 | /.codiumai.local.toml 20 | -------------------------------------------------------------------------------- /.hgignore: -------------------------------------------------------------------------------- 1 | syntax: glob 2 | *.pyc 3 | *~ 4 | *.DS_Store 5 | 6 | .idea 7 | .settings 8 | .coverage 9 | .tox 10 | .cache 11 | nosetests.xml 12 | 13 | build 14 | dist 15 | docs/build 16 | src/Whoosh.egg-info 17 | 18 | bmark 19 | *testindex 20 | benchmark/enron_index* 21 | benchmark/reuters_index* 22 | benchmark/dictionary_index* 23 | benchmark/enron_cache.pickle 24 | benchmark/enron_mail_082109.tar.gz 25 | 26 | tmp/* 27 | tests/tmp/* 28 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.5.0 4 | hooks: 5 | - id: check-ast 6 | - id: check-builtin-literals 7 | - id: check-merge-conflict 8 | - id: check-toml 9 | - id: check-yaml 10 | - id: detect-private-key 11 | - id: end-of-file-fixer 12 | - id: mixed-line-ending 13 | - id: trailing-whitespace 14 | 15 | - repo: https://github.com/charliermarsh/ruff-pre-commit 16 | rev: v0.2.1 17 | hooks: 18 | - id: ruff 19 | args: [ --fix ] 20 | - id: ruff-format 21 | 22 | - repo: https://github.com/ikamensh/flynt/ 23 | rev: '1.0.1' 24 | hooks: 25 | - id: flynt 26 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html 
for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the OS, Python version and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.7" 13 | 14 | # Build documentation in the "docs/" directory with Sphinx 15 | sphinx: 16 | configuration: docs/source/conf.py 17 | 18 | # Optionally build your docs in additional formats such as PDF and ePub 19 | # formats: 20 | # - pdf 21 | # - epub 22 | 23 | # Optional but recommended, declare the Python requirements required 24 | # to build your documentation 25 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 26 | python: 27 | install: 28 | # - requirements: requirements.txt 29 | - requirements: docs/requirements.txt 30 | -------------------------------------------------------------------------------- /.sonarcloud.properties: -------------------------------------------------------------------------------- 1 | sonar.python.version=3.8, 3.9, 3.10, 3.11, 3.12 2 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2011 Matt Chaput. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are met: 5 | 6 | 1. Redistributions of source code must retain the above copyright notice, 7 | this list of conditions and the following disclaimer. 8 | 9 | 2. Redistributions in binary form must reproduce the above copyright 10 | notice, this list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | 24 | The views and conclusions contained in the software and documentation are 25 | those of the authors and should not be interpreted as representing official 26 | policies, either expressed or implied, of Matt Chaput. 
27 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | include benchmark/dcvgr10.txt.gz 3 | include benchmark/reuters21578.txt.gz 4 | include tests/english-words.10.gz 5 | recursive-include tests *.txt *.py 6 | recursive-include benchmark *.txt *.py 7 | recursive-include docs *.txt *.py *.rst 8 | recursive-include files *.txt *.py *.png *.jpg *.svg 9 | -------------------------------------------------------------------------------- /benchmark/dcvgr10.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sygil-Dev/whoosh-reloaded/ab775ca5bfb443c2b3598b6a2d6189e595f7c377/benchmark/dcvgr10.txt.gz -------------------------------------------------------------------------------- /benchmark/dictionary.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import os 3 | 4 | from whoosh import analysis, fields 5 | from whoosh.support.bench import Bench, Spec 6 | 7 | 8 | class VulgarTongue(Spec): 9 | name = "dictionary" 10 | filename = "dcvgr10.txt.gz" 11 | headline_field = "head" 12 | 13 | def documents(self): 14 | path = os.path.join(self.options.dir, self.filename) 15 | f = gzip.GzipFile(path) 16 | 17 | head = body = None 18 | for line in f: 19 | line = line.decode("latin1") 20 | if line[0].isalpha(): 21 | if head: 22 | yield {"head": head, "body": head + body} 23 | head, body = line.split(".", 1) 24 | else: 25 | body += line 26 | 27 | if head: 28 | yield {"head": head, "body": head + body} 29 | 30 | def whoosh_schema(self): 31 | ana = analysis.StemmingAnalyzer() 32 | 33 | schema = fields.Schema( 34 | head=fields.ID(stored=True), body=fields.TEXT(analyzer=ana, stored=True) 35 | ) 36 | return schema 37 | 38 | def zcatalog_setup(self, cat): 39 | from zcatalog import indexes # type: ignore @UnresolvedImport 40 | 41 | cat["head"] = indexes.FieldIndex(field_name="head") 42 | cat["body"] = indexes.TextIndex(field_name="body") 43 | 44 | 45 | if __name__ == "__main__": 46 | Bench().run(VulgarTongue) 47 | -------------------------------------------------------------------------------- /benchmark/reuters.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import os.path 3 | 4 | from whoosh import analysis, fields, index, qparser, query 5 | from whoosh.support.bench import Bench, Spec 6 | from whoosh.util import now 7 | 8 | 9 | class Reuters(Spec): 10 | name = "reuters" 11 | filename = "reuters21578.txt.gz" 12 | main_field = "text" 13 | headline_text = "headline" 14 | 15 | def whoosh_schema(self): 16 | # ana = analysis.StemmingAnalyzer() 17 | ana = analysis.StandardAnalyzer() 18 | schema = fields.Schema( 19 | id=fields.ID(stored=True), 20 | headline=fields.STORED, 21 | text=fields.TEXT(analyzer=ana, stored=True), 22 | ) 23 | return schema 24 | 25 | def zcatalog_setup(self, cat): 26 | from zcatalog import indexes # type: ignore @UnresolvedImport 27 | 28 | cat["id"] = indexes.FieldIndex(field_name="id") 29 | cat["headline"] = indexes.TextIndex(field_name="headline") 30 | cat["body"] = indexes.TextIndex(field_name="text") 31 | 32 | def documents(self): 33 | path = os.path.join(self.options.dir, self.filename) 34 | f = gzip.GzipFile(path) 35 | 36 | for line in f: 37 | id, text = line.decode("latin1").split("\t") 38 | yield {"id": id, "text": text, "headline": text[:70]} 39 | 40 | 41 | if __name__ 
== "__main__": 42 | Bench().run(Reuters) 43 | -------------------------------------------------------------------------------- /benchmark/reuters21578.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sygil-Dev/whoosh-reloaded/ab775ca5bfb443c2b3598b6a2d6189e595f7c377/benchmark/reuters21578.txt.gz -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | codecov: 2 | require_ci_to_pass: yes 3 | notify: 4 | wait_for_ci: yes 5 | 6 | coverage: 7 | precision: 2 8 | round: down 9 | range: 80...90 10 | 11 | status: 12 | # Learn more at http://docs.codecov.io/docs/codecov-yaml 13 | project: 14 | default: 15 | enabled: yes 16 | target: 1 17 | threshold: 0.1 18 | patch: 19 | default: 20 | enabled: off 21 | 22 | ignore: 23 | - "**/*.ipynb" 24 | - docs 25 | - docker 26 | - binder 27 | - .github 28 | - .git 29 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | sphinx_rtd_theme 3 | sphinx-jsonschema 4 | -------------------------------------------------------------------------------- /docs/source/api/analysis.rst: -------------------------------------------------------------------------------- 1 | =================== 2 | ``analysis`` module 3 | =================== 4 | 5 | .. automodule:: whoosh.analysis 6 | 7 | Analyzers 8 | ========= 9 | 10 | .. autofunction:: IDAnalyzer 11 | .. autofunction:: KeywordAnalyzer 12 | .. autofunction:: RegexAnalyzer 13 | .. autofunction:: SimpleAnalyzer 14 | .. autofunction:: StandardAnalyzer 15 | .. autofunction:: StemmingAnalyzer 16 | .. autofunction:: FancyAnalyzer 17 | .. autofunction:: NgramAnalyzer 18 | .. autofunction:: NgramWordAnalyzer 19 | .. autofunction:: LanguageAnalyzer 20 | 21 | 22 | Tokenizers 23 | ========== 24 | 25 | .. autoclass:: IDTokenizer 26 | .. autoclass:: RegexTokenizer 27 | .. autoclass:: CharsetTokenizer 28 | .. autofunction:: SpaceSeparatedTokenizer 29 | .. autofunction:: CommaSeparatedTokenizer 30 | .. autoclass:: NgramTokenizer 31 | .. autoclass:: PathTokenizer 32 | 33 | 34 | Filters 35 | ======= 36 | 37 | .. autoclass:: PassFilter 38 | .. autoclass:: LoggingFilter 39 | .. autoclass:: MultiFilter 40 | .. autoclass:: TeeFilter 41 | .. autoclass:: ReverseTextFilter 42 | .. autoclass:: LowercaseFilter 43 | .. autoclass:: StripFilter 44 | .. autoclass:: StopFilter 45 | .. autoclass:: StemFilter 46 | .. autoclass:: CharsetFilter 47 | .. autoclass:: NgramFilter 48 | .. autoclass:: IntraWordFilter 49 | .. autoclass:: CompoundWordFilter 50 | .. autoclass:: BiWordFilter 51 | .. autoclass:: ShingleFilter 52 | .. autoclass:: DelimitedAttributeFilter 53 | .. autoclass:: DoubleMetaphoneFilter 54 | .. autoclass:: SubstitutionFilter 55 | 56 | 57 | Token classes and functions 58 | =========================== 59 | 60 | .. autoclass:: Token 61 | .. autofunction:: unstopped 62 | -------------------------------------------------------------------------------- /docs/source/api/api.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Whoosh API 3 | ========== 4 | 5 | .. 
toctree:: 6 | :glob: 7 | :maxdepth: 1 8 | 9 | ** 10 | -------------------------------------------------------------------------------- /docs/source/api/codec/base.rst: -------------------------------------------------------------------------------- 1 | ===================== 2 | ``codec.base`` module 3 | ===================== 4 | 5 | .. automodule:: whoosh.codec.base 6 | 7 | 8 | Classes 9 | ======= 10 | 11 | .. autoclass:: Codec 12 | :members: 13 | 14 | .. autoclass:: PerDocumentWriter 15 | :members: 16 | 17 | .. autoclass:: FieldWriter 18 | :members: 19 | 20 | .. autoclass:: PostingsWriter 21 | :members: 22 | 23 | .. autoclass:: TermsReader 24 | :members: 25 | 26 | .. autoclass:: PerDocumentReader 27 | :members: 28 | 29 | .. autoclass:: Segment 30 | :members: 31 | -------------------------------------------------------------------------------- /docs/source/api/collectors.rst: -------------------------------------------------------------------------------- 1 | ===================== 2 | ``collectors`` module 3 | ===================== 4 | 5 | .. automodule:: whoosh.collectors 6 | 7 | 8 | Base classes 9 | ============ 10 | 11 | .. autoclass:: Collector 12 | :members: 13 | 14 | .. autoclass:: ScoredCollector 15 | :members: 16 | 17 | .. autoclass:: WrappingCollector 18 | :members: 19 | 20 | 21 | Basic collectors 22 | ================ 23 | 24 | .. autoclass:: TopCollector 25 | 26 | .. autoclass:: UnlimitedCollector 27 | 28 | .. autoclass:: SortingCollector 29 | 30 | 31 | Wrappers 32 | ======== 33 | 34 | .. autoclass:: FilterCollector 35 | 36 | .. autoclass:: FacetCollector 37 | 38 | .. autoclass:: CollapseCollector 39 | 40 | .. autoclass:: TimeLimitCollector 41 | 42 | .. autoclass:: TermsCollector 43 | -------------------------------------------------------------------------------- /docs/source/api/columns.rst: -------------------------------------------------------------------------------- 1 | ===================== 2 | ``columns`` module 3 | ===================== 4 | 5 | .. automodule:: whoosh.columns 6 | 7 | 8 | Base classes 9 | ============ 10 | 11 | .. autoclass:: Column 12 | :members: 13 | 14 | .. autoclass:: ColumnWriter 15 | :members: 16 | 17 | .. autoclass:: ColumnReader 18 | :members: 19 | 20 | 21 | Basic columns 22 | ============= 23 | 24 | .. autoclass:: VarBytesColumn 25 | 26 | .. autoclass:: FixedBytesColumn 27 | 28 | .. autoclass:: RefBytesColumn 29 | 30 | .. autoclass:: NumericColumn 31 | 32 | 33 | Technical columns 34 | ================= 35 | 36 | .. autoclass:: BitColumn 37 | 38 | .. autoclass:: CompressedBytesColumn 39 | 40 | .. autoclass:: StructColumn 41 | 42 | .. autoclass:: PickleColumn 43 | 44 | 45 | Experimental columns 46 | ==================== 47 | 48 | .. autoclass:: ClampedNumericColumn 49 | -------------------------------------------------------------------------------- /docs/source/api/fields.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | ``fields`` module 3 | ================= 4 | 5 | .. automodule:: whoosh.fields 6 | 7 | Schema class 8 | ============ 9 | 10 | .. autoclass:: Schema 11 | :members: 12 | 13 | .. autoclass:: SchemaClass 14 | 15 | FieldType base class 16 | ==================== 17 | 18 | .. autoclass:: FieldType 19 | :members: 20 | 21 | 22 | Pre-made field types 23 | ==================== 24 | 25 | .. autoclass:: ID 26 | .. autoclass:: IDLIST 27 | .. autoclass:: STORED 28 | .. autoclass:: KEYWORD 29 | .. autoclass:: TEXT 30 | .. autoclass:: NUMERIC 31 | .. autoclass:: DATETIME 32 | .. 
autoclass:: BOOLEAN 33 | .. autoclass:: NGRAM 34 | .. autoclass:: NGRAMWORDS 35 | 36 | 37 | Exceptions 38 | ========== 39 | 40 | .. autoexception:: FieldConfigurationError 41 | .. autoexception:: UnknownFieldError 42 | -------------------------------------------------------------------------------- /docs/source/api/filedb/filestore.rst: -------------------------------------------------------------------------------- 1 | =========================== 2 | ``filedb.filestore`` module 3 | =========================== 4 | 5 | .. automodule:: whoosh.filedb.filestore 6 | 7 | Base class 8 | ========== 9 | 10 | .. autoclass:: Storage 11 | :members: 12 | 13 | 14 | Implementation classes 15 | ====================== 16 | 17 | .. autoclass:: FileStorage 18 | .. autoclass:: RamStorage 19 | 20 | 21 | Helper functions 22 | ================ 23 | 24 | .. autofunction:: copy_storage 25 | .. autofunction:: copy_to_ram 26 | 27 | 28 | Exceptions 29 | ========== 30 | 31 | .. autoexception:: ReadOnlyError 32 | -------------------------------------------------------------------------------- /docs/source/api/filedb/filetables.rst: -------------------------------------------------------------------------------- 1 | ============================ 2 | ``filedb.filetables`` module 3 | ============================ 4 | 5 | .. automodule:: whoosh.filedb.filetables 6 | 7 | 8 | Hash file 9 | ========= 10 | 11 | .. autoclass:: HashWriter 12 | :members: 13 | 14 | .. autoclass:: HashReader 15 | :members: 16 | 17 | 18 | Ordered Hash file 19 | ================= 20 | 21 | .. autoclass:: OrderedHashWriter 22 | .. autoclass:: OrderedHashReader 23 | -------------------------------------------------------------------------------- /docs/source/api/filedb/structfile.rst: -------------------------------------------------------------------------------- 1 | ============================ 2 | ``filedb.structfile`` module 3 | ============================ 4 | 5 | .. automodule:: whoosh.filedb.structfile 6 | 7 | Classes 8 | ======= 9 | 10 | .. autoclass:: StructFile 11 | :members: 12 | 13 | .. autoclass:: BufferFile 14 | .. autoclass:: ChecksumFile 15 | -------------------------------------------------------------------------------- /docs/source/api/formats.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | ``formats`` module 3 | ================== 4 | 5 | .. automodule:: whoosh.formats 6 | 7 | Base class 8 | ========== 9 | 10 | .. autoclass:: Format 11 | :members: 12 | 13 | 14 | Formats 15 | ======= 16 | 17 | .. autoclass:: Existence 18 | .. autoclass:: Frequency 19 | .. autoclass:: Positions 20 | .. autoclass:: Characters 21 | .. autoclass:: PositionBoosts 22 | .. autoclass:: CharacterBoosts 23 | -------------------------------------------------------------------------------- /docs/source/api/highlight.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | ``highlight`` module 3 | ==================== 4 | 5 | .. automodule:: whoosh.highlight 6 | 7 | See :doc:`how to highlight terms in search results `. 8 | 9 | 10 | Manual highlighting 11 | =================== 12 | 13 | .. autoclass:: Highlighter 14 | :members: 15 | 16 | .. autofunction:: highlight 17 | 18 | 19 | Fragmenters 20 | =========== 21 | 22 | .. autoclass:: Fragmenter 23 | :members: 24 | 25 | .. autoclass:: WholeFragmenter 26 | .. autoclass:: SentenceFragmenter 27 | .. autoclass:: ContextFragmenter 28 | .. 
autoclass:: PinpointFragmenter 29 | 30 | 31 | Scorers 32 | ======= 33 | 34 | .. autoclass:: FragmentScorer 35 | .. autoclass:: BasicFragmentScorer 36 | 37 | 38 | Formatters 39 | ========== 40 | 41 | .. autoclass:: UppercaseFormatter 42 | .. autoclass:: HtmlFormatter 43 | .. autoclass:: GenshiFormatter 44 | 45 | 46 | Utility classes 47 | =============== 48 | 49 | .. autoclass:: Fragment 50 | :members: 51 | -------------------------------------------------------------------------------- /docs/source/api/idsets.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | ``idsets`` module 3 | ================= 4 | 5 | .. automodule:: whoosh.idsets 6 | 7 | 8 | Base classes 9 | ============ 10 | 11 | .. autoclass:: DocIdSet 12 | :members: 13 | 14 | .. autoclass:: BaseBitSet 15 | 16 | 17 | Implementation classes 18 | ====================== 19 | 20 | .. autoclass:: BitSet 21 | .. autoclass:: OnDiskBitSet 22 | .. autoclass:: SortedIntSet 23 | .. autoclass:: MultiIdSet 24 | -------------------------------------------------------------------------------- /docs/source/api/index.rst: -------------------------------------------------------------------------------- 1 | ================ 2 | ``index`` module 3 | ================ 4 | 5 | .. automodule:: whoosh.index 6 | 7 | 8 | Functions 9 | ========= 10 | 11 | .. autofunction:: create_in 12 | .. autofunction:: open_dir 13 | .. autofunction:: exists_in 14 | .. autofunction:: exists 15 | .. autofunction:: version_in 16 | .. autofunction:: version 17 | 18 | 19 | Base class 20 | ========== 21 | 22 | .. autoclass:: Index 23 | :members: 24 | 25 | 26 | Implementation 27 | ============== 28 | 29 | .. autoclass:: FileIndex 30 | 31 | 32 | Exceptions 33 | ========== 34 | 35 | .. autoexception:: LockError 36 | .. autoexception:: IndexError 37 | .. autoexception:: IndexVersionError 38 | .. autoexception:: OutOfDateError 39 | .. autoexception:: EmptyIndexError 40 | -------------------------------------------------------------------------------- /docs/source/api/lang/morph_en.rst: -------------------------------------------------------------------------------- 1 | ======================== 2 | ``lang.morph_en`` module 3 | ======================== 4 | 5 | .. automodule:: whoosh.lang.morph_en 6 | 7 | .. autofunction:: variations 8 | -------------------------------------------------------------------------------- /docs/source/api/lang/porter.rst: -------------------------------------------------------------------------------- 1 | ====================== 2 | ``lang.porter`` module 3 | ====================== 4 | 5 | .. automodule:: whoosh.lang.porter 6 | 7 | .. autofunction:: stem 8 | -------------------------------------------------------------------------------- /docs/source/api/lang/wordnet.rst: -------------------------------------------------------------------------------- 1 | ======================== 2 | ``lang.wordnet`` module 3 | ======================== 4 | 5 | .. automodule:: whoosh.lang.wordnet 6 | 7 | Thesaurus 8 | ========= 9 | 10 | .. autoclass:: Thesaurus 11 | :members: 12 | 13 | 14 | Low-level functions 15 | =================== 16 | 17 | .. autofunction:: parse_file 18 | .. autofunction:: synonyms 19 | ..
autofunction:: make_index 20 | -------------------------------------------------------------------------------- /docs/source/api/matching.rst: -------------------------------------------------------------------------------- 1 | =================== 2 | ``matching`` module 3 | =================== 4 | 5 | .. automodule:: whoosh.matching 6 | 7 | Matchers 8 | ======== 9 | 10 | .. autoclass:: Matcher 11 | :members: 12 | 13 | .. autoclass:: NullMatcher 14 | .. autoclass:: ListMatcher 15 | .. autoclass:: WrappingMatcher 16 | .. autoclass:: MultiMatcher 17 | .. autoclass:: FilterMatcher 18 | .. autoclass:: BiMatcher 19 | .. autoclass:: AdditiveBiMatcher 20 | .. autoclass:: UnionMatcher 21 | .. autoclass:: DisjunctionMaxMatcher 22 | .. autoclass:: IntersectionMatcher 23 | .. autoclass:: AndNotMatcher 24 | .. autoclass:: InverseMatcher 25 | .. autoclass:: RequireMatcher 26 | .. autoclass:: AndMaybeMatcher 27 | .. autoclass:: ConstantScoreMatcher 28 | 29 | 30 | Exceptions 31 | ========== 32 | 33 | .. autoexception:: ReadTooFar 34 | .. autoexception:: NoQualityAvailable 35 | -------------------------------------------------------------------------------- /docs/source/api/qparser.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | ``qparser`` module 3 | ================== 4 | 5 | .. automodule:: whoosh.qparser 6 | 7 | Parser object 8 | ============= 9 | 10 | .. autoclass:: QueryParser 11 | :members: 12 | 13 | Pre-made configurations 14 | ----------------------- 15 | 16 | The following functions return pre-configured QueryParser objects. 17 | 18 | .. autofunction:: MultifieldParser 19 | 20 | .. autofunction:: SimpleParser 21 | 22 | .. autofunction:: DisMaxParser 23 | 24 | 25 | Plug-ins 26 | ======== 27 | 28 | .. autoclass:: Plugin 29 | :members: 30 | 31 | .. autoclass:: SingleQuotePlugin 32 | .. autoclass:: PrefixPlugin 33 | .. autoclass:: WildcardPlugin 34 | .. autoclass:: RegexPlugin 35 | .. autoclass:: BoostPlugin 36 | .. autoclass:: GroupPlugin 37 | .. autoclass:: EveryPlugin 38 | .. autoclass:: FieldsPlugin 39 | .. autoclass:: PhrasePlugin 40 | .. autoclass:: RangePlugin 41 | .. autoclass:: OperatorsPlugin 42 | .. autoclass:: PlusMinusPlugin 43 | .. autoclass:: GtLtPlugin 44 | .. autoclass:: MultifieldPlugin 45 | .. autoclass:: FieldAliasPlugin 46 | .. autoclass:: CopyFieldPlugin 47 | 48 | 49 | Syntax node objects 50 | =================== 51 | 52 | Base nodes 53 | ---------- 54 | 55 | .. autoclass:: SyntaxNode 56 | :members: 57 | 58 | 59 | Nodes 60 | ----- 61 | 62 | .. autoclass:: FieldnameNode 63 | .. autoclass:: TextNode 64 | .. autoclass:: WordNode 65 | .. autoclass:: RangeNode 66 | .. autoclass:: MarkerNode 67 | 68 | 69 | Group nodes 70 | ----------- 71 | 72 | .. autoclass:: GroupNode 73 | .. autoclass:: BinaryGroup 74 | .. autoclass:: ErrorNode 75 | .. autoclass:: AndGroup 76 | .. autoclass:: OrGroup 77 | .. autoclass:: AndNotGroup 78 | .. autoclass:: AndMaybeGroup 79 | .. autoclass:: DisMaxGroup 80 | .. autoclass:: RequireGroup 81 | .. autoclass:: NotGroup 82 | 83 | 84 | Operators 85 | --------- 86 | 87 | .. autoclass:: Operator 88 | .. autoclass:: PrefixOperator 89 | .. autoclass:: PostfixOperator 90 | .. autoclass:: InfixOperator 91 | -------------------------------------------------------------------------------- /docs/source/api/query.rst: -------------------------------------------------------------------------------- 1 | ================ 2 | ``query`` module 3 | ================ 4 | 5 | .. 
automodule:: whoosh.query 6 | 7 | See also :mod:`whoosh.qparser` which contains code for parsing user queries 8 | into query objects. 9 | 10 | Base classes 11 | ============ 12 | 13 | The following abstract base classes are subclassed to create the "real" 14 | query operations. 15 | 16 | .. autoclass:: Query 17 | :members: 18 | 19 | .. autoclass:: CompoundQuery 20 | .. autoclass:: MultiTerm 21 | .. autoclass:: ExpandingTerm 22 | .. autoclass:: WrappingQuery 23 | 24 | 25 | Query classes 26 | ============= 27 | 28 | .. autoclass:: Term 29 | .. autoclass:: Variations 30 | .. autoclass:: FuzzyTerm 31 | .. autoclass:: Phrase 32 | .. autoclass:: And 33 | .. autoclass:: Or 34 | .. autoclass:: DisjunctionMax 35 | .. autoclass:: Not 36 | .. autoclass:: Prefix 37 | .. autoclass:: Wildcard 38 | .. autoclass:: Regex 39 | .. autoclass:: TermRange 40 | .. autoclass:: NumericRange 41 | .. autoclass:: DateRange 42 | .. autoclass:: Every 43 | .. autoclass:: NullQuery 44 | 45 | 46 | Binary queries 47 | ============== 48 | 49 | .. autoclass:: Require 50 | .. autoclass:: AndMaybe 51 | .. autoclass:: AndNot 52 | .. autoclass:: Otherwise 53 | 54 | 55 | Span queries 56 | ============ 57 | 58 | .. autoclass:: Span 59 | :members: 60 | 61 | .. autoclass:: SpanQuery 62 | .. autoclass:: SpanFirst 63 | .. autoclass:: SpanNear 64 | .. autoclass:: SpanNear2 65 | .. autoclass:: SpanNot 66 | .. autoclass:: SpanOr 67 | .. autoclass:: SpanContains 68 | .. autoclass:: SpanBefore 69 | .. autoclass:: SpanCondition 70 | 71 | 72 | Special queries 73 | =============== 74 | 75 | .. autoclass:: NestedParent 76 | .. autoclass:: NestedChildren 77 | .. autoclass:: ConstantScoreQuery 78 | 79 | 80 | Exceptions 81 | ========== 82 | 83 | .. autoexception:: QueryError 84 | -------------------------------------------------------------------------------- /docs/source/api/reading.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | ``reading`` module 3 | ================== 4 | 5 | .. automodule:: whoosh.reading 6 | 7 | Classes 8 | ======= 9 | 10 | .. autoclass:: IndexReader 11 | :members: 12 | 13 | .. autoclass:: MultiReader 14 | 15 | .. autoclass:: TermInfo 16 | :members: 17 | 18 | Exceptions 19 | ========== 20 | 21 | .. autoexception:: TermNotFound 22 | -------------------------------------------------------------------------------- /docs/source/api/scoring.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | ``scoring`` module 3 | ================== 4 | 5 | .. automodule:: whoosh.scoring 6 | 7 | 8 | Base classes 9 | ============ 10 | 11 | .. autoclass:: WeightingModel 12 | :members: 13 | 14 | .. autoclass:: BaseScorer 15 | :members: 16 | 17 | .. autoclass:: WeightScorer 18 | .. autoclass:: WeightLengthScorer 19 | 20 | 21 | Scoring algorithm classes 22 | ========================= 23 | 24 | .. autoclass:: BM25F 25 | 26 | .. autoclass:: TF_IDF 27 | 28 | .. autoclass:: Frequency 29 | 30 | 31 | Scoring utility classes 32 | ======================= 33 | 34 | .. autoclass:: FunctionWeighting 35 | 36 | .. autoclass:: MultiWeighting 37 | 38 | .. autoclass:: ReverseWeighting 39 | -------------------------------------------------------------------------------- /docs/source/api/searching.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | ``searching`` module 3 | ==================== 4 | 5 | ..
automodule:: whoosh.searching 6 | 7 | 8 | Searching classes 9 | ================= 10 | 11 | .. autoclass:: Searcher 12 | :members: 13 | 14 | 15 | Results classes 16 | =============== 17 | 18 | .. autoclass:: Results 19 | :members: 20 | 21 | .. autoclass:: Hit 22 | :members: 23 | 24 | .. autoclass:: ResultsPage 25 | :members: 26 | 27 | 28 | Exceptions 29 | ========== 30 | 31 | .. autoexception:: NoTermsException 32 | .. autoexception:: TimeLimit 33 | -------------------------------------------------------------------------------- /docs/source/api/sorting.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | ``sorting`` module 3 | ================== 4 | 5 | .. automodule:: whoosh.sorting 6 | 7 | 8 | Base types 9 | ========== 10 | 11 | .. autoclass:: FacetType 12 | :members: 13 | 14 | .. autoclass:: Categorizer 15 | :members: 16 | 17 | 18 | Facet types 19 | =========== 20 | 21 | .. autoclass:: FieldFacet 22 | .. autoclass:: QueryFacet 23 | .. autoclass:: RangeFacet 24 | .. autoclass:: DateRangeFacet 25 | .. autoclass:: ScoreFacet 26 | .. autoclass:: FunctionFacet 27 | .. autoclass:: MultiFacet 28 | .. autoclass:: StoredFieldFacet 29 | 30 | 31 | Facets object 32 | ============= 33 | 34 | .. autoclass:: Facets 35 | :members: 36 | 37 | 38 | FacetMap objects 39 | ================ 40 | 41 | .. autoclass:: FacetMap 42 | :members: 43 | .. autoclass:: OrderedList 44 | .. autoclass:: UnorderedList 45 | .. autoclass:: Count 46 | .. autoclass:: Best 47 | -------------------------------------------------------------------------------- /docs/source/api/spelling.rst: -------------------------------------------------------------------------------- 1 | =================== 2 | ``spelling`` module 3 | =================== 4 | 5 | See :doc:`correcting errors in user queries <../spelling>`. 6 | 7 | .. automodule:: whoosh.spelling 8 | 9 | 10 | Corrector objects 11 | ================= 12 | 13 | .. autoclass:: Corrector 14 | :members: 15 | 16 | .. autoclass:: ReaderCorrector 17 | 18 | .. autoclass:: MultiCorrector 19 | 20 | 21 | QueryCorrector objects 22 | ====================== 23 | 24 | .. autoclass:: QueryCorrector 25 | :members: 26 | 27 | .. autoclass:: SimpleQueryCorrector 28 | 29 | .. autoclass:: Correction 30 | -------------------------------------------------------------------------------- /docs/source/api/support/charset.rst: -------------------------------------------------------------------------------- 1 | ========================== 2 | ``support.charset`` module 3 | ========================== 4 | 5 | .. automodule:: whoosh.support.charset 6 | 7 | .. data:: default_charset 8 | 9 | An extensive case- and accent-folding charset table. 10 | Taken from http://speeple.com/unicode-maps.txt 11 | 12 | .. autofunction:: charset_table_to_dict 13 | -------------------------------------------------------------------------------- /docs/source/api/support/levenshtein.rst: -------------------------------------------------------------------------------- 1 | ============================== 2 | ``support.levenshtein`` module 3 | ============================== 4 | 5 | .. automodule:: whoosh.support.levenshtein 6 | 7 | .. autofunction:: relative 8 | 9 | .. autofunction:: distance 10 | -------------------------------------------------------------------------------- /docs/source/api/util.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | ``util`` module 3 | =============== 4 | 5 | ..
automodule:: whoosh.util 6 | :members: 7 | -------------------------------------------------------------------------------- /docs/source/api/writing.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | ``writing`` module 3 | ================== 4 | 5 | .. automodule:: whoosh.writing 6 | 7 | 8 | Writer 9 | ====== 10 | 11 | .. autoclass:: IndexWriter 12 | :members: 13 | 14 | 15 | Utility writers 16 | =============== 17 | 18 | .. autoclass:: BufferedWriter 19 | :members: 20 | 21 | .. autoclass:: AsyncWriter 22 | :members: 23 | 24 | 25 | Exceptions 26 | ========== 27 | 28 | .. autoexception:: IndexingError 29 | -------------------------------------------------------------------------------- /docs/source/batch.rst: -------------------------------------------------------------------------------- 1 | =================================== 2 | Tips for speeding up batch indexing 3 | =================================== 4 | 5 | 6 | Overview 7 | ======== 8 | 9 | Indexing documents tends to fall into two general patterns: adding documents 10 | one at a time as they are created (as in a web application), and adding a bunch 11 | of documents at once (batch indexing). 12 | 13 | The following settings and alternate workflows can make batch indexing faster. 14 | 15 | 16 | StemmingAnalyzer cache 17 | ====================== 18 | 19 | The stemming analyzer by default uses a least-recently-used (LRU) cache to limit 20 | the amount of memory it uses, to prevent the cache from growing very large if 21 | the analyzer is reused for a long period of time. However, the LRU cache can 22 | slow down indexing by almost 200% compared to a stemming analyzer with an 23 | "unbounded" cache. 24 | 25 | When you're indexing in large batches with a one-shot instance of the 26 | analyzer, consider using an unbounded cache:: 27 | 28 | w = myindex.writer() 29 | # Get the analyzer object from a text field 30 | stem_ana = w.schema["content"].format.analyzer 31 | # Set the cachesize to -1 to indicate unbounded caching 32 | stem_ana.cachesize = -1 33 | # Reset the analyzer to pick up the changed attribute 34 | stem_ana.clear() 35 | 36 | # Use the writer to index documents... 37 | 38 | 39 | The ``limitmb`` parameter 40 | ========================= 41 | 42 | The ``limitmb`` parameter to :meth:`whoosh.index.Index.writer` controls the 43 | *maximum* memory (in megabytes) the writer will use for the indexing pool. The 44 | higher the number, the faster indexing will be. 45 | 46 | The default value of ``128`` is actually somewhat low, considering many people 47 | have multiple gigabytes of RAM these days. Setting it higher can speed up 48 | indexing considerably:: 49 | 50 | from whoosh import index 51 | 52 | ix = index.open_dir("indexdir") 53 | writer = ix.writer(limitmb=256) 54 | 55 | .. note:: 56 | The actual memory used will be higher than this value because of interpreter 57 | overhead (up to twice as much!). It is very useful as a tuning parameter, 58 | but not for trying to exactly control the memory usage of Whoosh. 
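Putting the stemming-cache and ``limitmb`` tips together, a minimal
batch-indexing sketch might look like this (assuming a schema whose
``content`` field uses a stemming analyzer; ``iter_documents()`` is a
hypothetical stand-in for whatever yields your field dicts)::

    from whoosh import index

    ix = index.open_dir("indexdir")
    # A larger indexing pool (in megabytes) for this one-shot batch writer
    writer = ix.writer(limitmb=256)

    # Switch the stemming analyzer to an unbounded cache for the batch run
    stem_ana = writer.schema["content"].format.analyzer
    stem_ana.cachesize = -1
    stem_ana.clear()

    for doc in iter_documents():  # hypothetical source of document dicts
        writer.add_document(**doc)
    writer.commit()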
59 | 60 | 61 | The ``procs`` parameter 62 | ======================= 63 | 64 | The ``procs`` parameter to :meth:`whoosh.index.Index.writer` controls the 65 | number of processors the writer will use for indexing (via the 66 | ``multiprocessing`` module):: 67 | 68 | from whoosh import index 69 | 70 | ix = index.open_dir("indexdir") 71 | writer = ix.writer(procs=4) 72 | 73 | Note that when you use multiprocessing, the ``limitmb`` parameter controls the 74 | amount of memory used by *each process*, so the actual memory used will be 75 | ``limitmb * procs``:: 76 | 77 | # Each process will use a limit of 128, for a total of 512 78 | writer = ix.writer(procs=4, limitmb=128) 79 | 80 | 81 | The ``multisegment`` parameter 82 | ============================== 83 | 84 | The ``procs`` parameter causes the default writer to use multiple processors to 85 | do much of the indexing, but then still uses a single process to merge the pool 86 | of each sub-writer into a single segment. 87 | 88 | You can get much better indexing speed by also using the ``multisegment=True`` 89 | keyword argument, which instead of merging the results of each sub-writer, 90 | simply has them each just write out a new segment:: 91 | 92 | from whoosh import index 93 | 94 | ix = index.open_dir("indexdir") 95 | writer = ix.writer(procs=4, multisegment=True) 96 | 97 | The drawback is that instead 98 | of creating a single new segment, this option creates a number of new segments 99 | **at least** equal to the number of processes you use. 100 | 101 | For example, if you use ``procs=4``, the writer will create four new segments. 102 | (If you merge old segments or call ``add_reader`` on the parent writer, the 103 | parent writer will also write a segment, meaning you'll get five new segments.) 104 | 105 | So, while ``multisegment=True`` is much faster than a normal writer, you should 106 | only use it for large batch indexing jobs (or perhaps only for indexing from 107 | scratch). It should not be the only method you use for indexing, because 108 | otherwise the number of segments will tend to increase forever! 109 | -------------------------------------------------------------------------------- /docs/source/fieldcaches.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Field caches 3 | ============ 4 | 5 | The default (``filedb``) backend uses *field caches* in certain circumstances. 6 | The field cache basically pre-computes the order of documents in the index to 7 | speed up sorting and faceting. 8 | 9 | Generating field caches can take time the first time you sort/facet on a large 10 | index. The field cache is kept in memory (and by default written to disk when it 11 | is generated) so subsequent sorted/faceted searches should be faster. 12 | 13 | The default caching policy never expires field caches, so reused searchers and/or 14 | sorting a lot of different fields could use up quite a bit of memory with large 15 | indexes. 16 | 17 | 18 | Customizing cache behaviour 19 | =========================== 20 | 21 | (The following API examples refer to the default ``filedb`` backend.) 22 | 23 | *By default*, Whoosh saves field caches to disk. To prevent a reader or searcher 24 | from writing out field caches, do this before you start using it:: 25 | 26 | searcher.set_caching_policy(save=False) 27 | 28 | By default, if caches are written to disk they are saved in the index directory. 
29 | To tell a reader or searcher to save cache files to a different location, create 30 | a storage object and pass it to the ``storage`` keyword argument:: 31 | 32 | from whoosh.filedb.filestore import FileStorage 33 | 34 | mystorage = FileStorage("path/to/cachedir") 35 | reader.set_caching_policy(storage=mystorage) 36 | 37 | 38 | Creating a custom caching policy 39 | ================================ 40 | 41 | Expert users who want to implement a custom caching policy (for example, to add 42 | cache expiration) should subclass :class:`whoosh.filedb.fieldcache.FieldCachingPolicy`. 43 | Then you can pass an instance of your policy object to the ``set_caching_policy`` 44 | method:: 45 | 46 | searcher.set_caching_policy(MyPolicy()) 47 | -------------------------------------------------------------------------------- /docs/source/glossary.rst: -------------------------------------------------------------------------------- 1 | .. _glossary: 2 | 3 | ======== 4 | Glossary 5 | ======== 6 | 7 | .. glossary:: 8 | 9 | Analysis 10 | The process of breaking the text of a field into individual *terms* 11 | to be indexed. This consists of tokenizing the text into terms, and then optionally 12 | filtering the tokenized terms (for example, lowercasing and removing *stop words*). 13 | Whoosh includes several different analyzers. 14 | 15 | Corpus 16 | The set of documents you are indexing. 17 | 18 | Documents 19 | The individual pieces of content you want to make searchable. 20 | The word "documents" might imply files, but the data source could really be 21 | anything -- articles in a content management system, blog posts in a blogging 22 | system, chunks of a very large file, rows returned from an SQL query, individual 23 | email messages from a mailbox file, or whatever. When you get search results 24 | from Whoosh, the results are a list of documents, whatever "documents" means in 25 | your search engine. 26 | 27 | Fields 28 | Each document contains a set of fields. Typical fields might be "title", "content", 29 | "url", "keywords", "status", "date", etc. Fields can be indexed (so they're 30 | searchable) and/or stored with the document. Storing the field makes it available 31 | in search results. For example, you typically want to store the "title" field so 32 | your search results can display it. 33 | 34 | Forward index 35 | A table listing every document and the words that appear in the document. 36 | Whoosh lets you store *term vectors* that are a kind of forward index. 37 | 38 | Indexing 39 | The process of examining documents in the corpus and adding them to the 40 | *reverse index*. 41 | 42 | Postings 43 | The *reverse index* lists every word in the corpus, and for each word, a list 44 | of documents in which that word appears, along with some optional information 45 | (such as the number of times the word appears in that document). These items 46 | in the list, containing a document number and any extra information, are 47 | called *postings*. In Whoosh the information stored in postings is customizable 48 | for each *field*. 49 | 50 | Reverse index 51 | Basically a table listing every word in the corpus, and for each word, the 52 | list of documents in which it appears. It can be more complicated (the index can 53 | also list how many times the word appears in each document, the positions at which 54 | it appears, etc.) but that's how it basically works. 55 | 56 | Schema 57 | Whoosh requires that you specify the *fields* of the index before you begin 58 | indexing. 
The Schema associates field names with metadata about the field, such 59 | as the format of the *postings* and whether the contents of the field are stored 60 | in the index. 61 | 62 | Term vector 63 | A *forward index* for a certain field in a certain document. You can specify 64 | in the Schema that a given field should store term vectors. 65 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | ============================== 2 | Whoosh |release| documentation 3 | ============================== 4 | 5 | Whoosh was created by `Matt Chaput `_. 6 | You can view outstanding issues on the 7 | `Whoosh Bitbucket page `_ 8 | and get help on the `Whoosh mailing list `_. 9 | 10 | 11 | Contents 12 | ======== 13 | 14 | .. toctree:: 15 | :maxdepth: 2 16 | 17 | releases/index 18 | quickstart 19 | intro 20 | glossary 21 | schema 22 | indexing 23 | searching 24 | parsing 25 | querylang 26 | dates 27 | query 28 | analysis 29 | stemming 30 | ngrams 31 | facets 32 | highlight 33 | keywords 34 | spelling 35 | fieldcaches 36 | batch 37 | threads 38 | nested 39 | recipes 40 | api/api 41 | tech/index 42 | 43 | 44 | Indices and tables 45 | ================== 46 | 47 | * :ref:`genindex` 48 | * :ref:`modindex` 49 | * :ref:`search` 50 | -------------------------------------------------------------------------------- /docs/source/intro.rst: -------------------------------------------------------------------------------- 1 | ====================== 2 | Introduction to Whoosh 3 | ====================== 4 | 5 | About Whoosh 6 | ------------ 7 | 8 | Whoosh was created by `Matt Chaput `_. It started as a quick and dirty 9 | search server for the online documentation of the `Houdini `_ 10 | 3D animation software package. Side Effects Software generously allowed Matt to open source 11 | the code in case it might be useful to anyone else who needs a very flexible or pure-Python 12 | search engine (or both!). 13 | 14 | * Whoosh is fast, but uses only pure Python, so it will run anywhere Python runs, 15 | without requiring a compiler. 16 | 17 | * By default, Whoosh uses the `Okapi BM25F `_ ranking 18 | function, but like most things the ranking function can be easily customized. 19 | 20 | * Whoosh creates fairly small indexes compared to many other search libraries. 21 | 22 | * All indexed text in Whoosh must be *unicode*. 23 | 24 | * Whoosh lets you store arbitrary Python objects with indexed documents. 25 | 26 | 27 | What is Whoosh? 28 | --------------- 29 | 30 | Whoosh is a fast, pure Python search engine library. 31 | 32 | The primary design impetus of Whoosh is that it is pure Python. You should be able to 33 | use Whoosh anywhere you can use Python, no compiler or Java required. 34 | 35 | Like one of its ancestors, Lucene, Whoosh is not really a search engine, it's a programmer 36 | library for creating a search engine [1]_. 37 | 38 | Practically no important behavior of Whoosh is hard-coded. Indexing 39 | of text, the level of information stored for each term in each field, parsing of search queries, 40 | the types of queries allowed, scoring algorithms, etc. are all customizable, replaceable, and 41 | extensible. 42 | 43 | 44 | .. [1] It would of course be possible to build a turnkey search engine on top of Whoosh, 45 | like Nutch and Solr use Lucene. 46 | 47 | 48 | What can Whoosh do for you? 
49 | --------------------------- 50 | 51 | Whoosh lets you index free-form or structured text and then quickly find matching 52 | documents based on simple or complex search criteria. 53 | 54 | 55 | Getting help with Whoosh 56 | ------------------------ 57 | 58 | You can view outstanding issues on the 59 | `Whoosh GitHub page `_ 60 | and get help on the `Whoosh mailing list `_. 61 | -------------------------------------------------------------------------------- /docs/source/keywords.rst: -------------------------------------------------------------------------------- 1 | ======================================= 2 | Query expansion and keyword extraction 3 | ======================================= 4 | 5 | Overview 6 | ======== 7 | 8 | Whoosh provides methods for computing the "key terms" of a set of documents. For 9 | these methods, "key terms" basically means terms that are frequent in the given 10 | documents, but relatively infrequent in the indexed collection as a whole. 11 | 12 | Because this is a purely statistical operation, not a natural language 13 | processing or AI function, the quality of the results will vary based on the 14 | content, the size of the document collection, and the number of documents for 15 | which you extract keywords. 16 | 17 | These methods can be useful for providing the following features to users: 18 | 19 | * Search term expansion. You can extract key terms for the top N results from a 20 | query and suggest them to the user as additional/alternate query terms to try. 21 | 22 | * Tag suggestion. Extracting the key terms for a single document may yield 23 | useful suggestions for tagging the document. 24 | 25 | * "More like this". You can extract key terms for the top ten or so results from 26 | a query (removing the original query terms), and use those keywords as 27 | the basis for another query that may find more documents using terms the user 28 | didn't think of. 29 | 30 | Usage 31 | ===== 32 | 33 | * Get more documents like a certain search hit. *This requires that the field 34 | you want to match on is vectored or stored, or that you have access to the 35 | original text (such as from a database)*. 36 | 37 | Use :meth:`~whoosh.searching.Hit.more_like_this`:: 38 | 39 | results = mysearcher.search(myquery) 40 | first_hit = results[0] 41 | more_results = first_hit.more_like_this("content") 42 | 43 | * Extract keywords for the top N documents in a 44 | :class:`whoosh.searching.Results` object. *This requires that the field is 45 | either vectored or stored*. 46 | 47 | Use the :meth:`~whoosh.searching.Results.key_terms` method of the 48 | :class:`whoosh.searching.Results` object to extract keywords from the top N 49 | documents of the result set. 50 | 51 | For example, to extract *five* key terms from the ``content`` field of the top 52 | *ten* documents of a results object:: 53 | 54 | keywords = [keyword for keyword, score 55 | in results.key_terms("content", docs=10, numterms=5)] 56 | 57 | * Extract keywords for an arbitrary set of documents. *This requires that the 58 | field is either vectored or stored*. 59 | 60 | Use the :meth:`~whoosh.searching.Searcher.document_number` or 61 | :meth:`~whoosh.searching.Searcher.document_numbers` methods of the 62 | :class:`whoosh.searching.Searcher` object to get the document numbers for the 63 | document(s) you want to extract keywords from.
64 | 65 | Use the :meth:`~whoosh.searching.Searcher.key_terms` method of a 66 | :class:`whoosh.searching.Searcher` to extract the keywords, given the list of 67 | document numbers. 68 | 69 | For example, let's say you have an index of emails. To extract key terms from 70 | the ``body`` field of emails whose ``emailto`` field contains 71 | ``matt@whoosh.ca``:: 72 | 73 | with email_index.searcher() as s: 74 | docnums = s.document_numbers(emailto=u"matt@whoosh.ca") 75 | keywords = [keyword for keyword, score 76 | in s.key_terms(docnums, "body")] 77 | 78 | * Extract keywords from arbitrary text not in the index. 79 | 80 | Use the :meth:`~whoosh.searching.Searcher.key_terms_from_text` method of a 81 | :class:`whoosh.searching.Searcher` to extract the keywords, given the text:: 82 | 83 | with email_index.searcher() as s: 84 | keywords = [keyword for keyword, score 85 | in s.key_terms_from_text("body", mytext)] 86 | 87 | 88 | Expansion models 89 | ================ 90 | 91 | The ``ExpansionModel`` subclasses in the :mod:`whoosh.classify` module implement 92 | different weighting functions for keywords. These models are translated into 93 | Python from the original Java implementations in Terrier. 94 | -------------------------------------------------------------------------------- /docs/source/ngrams.rst: -------------------------------------------------------------------------------- 1 | ============================== 2 | Indexing and searching N-grams 3 | ============================== 4 | 5 | Overview 6 | ======== 7 | 8 | N-gram indexing is a powerful method for getting fast, "search as you type" 9 | functionality like iTunes. It is also useful for quick and effective indexing 10 | of languages such as Chinese and Japanese without word breaks. 11 | 12 | "N-grams" are groups of N characters: bigrams are groups of two 13 | characters, trigrams are groups of three characters, and so on. 14 | 15 | Whoosh includes two methods for analyzing N-gram fields: an N-gram tokenizer, 16 | and a filter that breaks tokens into N-grams. 17 | 18 | :class:`whoosh.analysis.NgramTokenizer` tokenizes the entire field into N-grams. 19 | This is more useful for Chinese/Japanese/Korean languages, where it's useful 20 | to index bigrams of characters rather than individual characters. Using this 21 | tokenizer with Roman-alphabet languages leads to spaces in the tokens. 22 | 23 | :: 24 | 25 | >>> ngt = NgramTokenizer(minsize=2, maxsize=4) 26 | >>> [token.text for token in ngt(u"hi there")] 27 | [u'hi', u'hi ', u'hi t', u'i ', u'i t', u'i th', u' t', u' th', u' the', u'th', 28 | u'the', u'ther', u'he', u'her', u'here', u'er', u'ere', u're'] 29 | 30 | :class:`whoosh.analysis.NgramFilter` breaks individual tokens into N-grams as 31 | part of an analysis pipeline. This is more useful for languages with word 32 | separation. 33 | 34 | :: 35 | 36 | >>> my_analyzer = StandardAnalyzer() | NgramFilter(minsize=2, maxsize=4) 37 | >>> [token.text for token in my_analyzer(u"rendering shaders")] 38 | [u'ren', u'rend', u'end', u'ende', u'nde', u'nder', u'der', u'deri', u'eri', 39 | u'erin', u'rin', u'ring', u'ing', u'sha', u'shad', u'had', u'hade', u'ade', 40 | u'ader', u'der', u'ders', u'ers'] 41 | 42 | Whoosh includes two pre-configured field types for N-grams: 43 | :class:`whoosh.fields.NGRAM` and :class:`whoosh.fields.NGRAMWORDS`.
The only 44 | difference is that ``NGRAM`` runs all text through the N-gram filter, including 45 | whitespace and punctuation, while ``NGRAMWORDS`` extracts words from the text 46 | using a tokenizer, then runs each word through the N-gram filter. 47 | 48 | TBD. 49 | -------------------------------------------------------------------------------- /docs/source/query.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | Query objects 3 | ============= 4 | 5 | The classes in the :mod:`whoosh.query` module implement *queries* you can run against the index. 6 | 7 | TBD. 8 | 9 | See :doc:`searching` for how to search the index using query objects. 10 | -------------------------------------------------------------------------------- /docs/source/releases/0_3.rst: -------------------------------------------------------------------------------- 1 | ======================== 2 | Whoosh 0.3 release notes 3 | ======================== 4 | 5 | * Major improvements to reading/writing of postings and query performance. 6 | 7 | * Changed default post limit (run size) from 4 MB to 32 MB. 8 | 9 | * Finished migrating backend-specific code into the ``whoosh.filedb`` package. 10 | 11 | * Moved formats from the ``whoosh.fields`` module into the new ``whoosh.formats`` module. 12 | 13 | * DocReader and TermReader classes combined into new IndexReader interface. 14 | You can get an IndexReader implementation by calling Index.reader(). 15 | Searcher is now a wrapper around an IndexReader. 16 | 17 | * Range query object changed, with new signature and new syntax in the default 18 | query parser. Now you can use ``[start TO end]`` in the query parser for an 19 | inclusive range, and ``{start TO end}`` for an exclusive range. You can also 20 | mix the delimiters, for example ``[start TO end}`` for a range with an 21 | inclusive start but exclusive end term. 22 | 23 | * Added an experimental DATETIME field type that lets you pass a 24 | ``datetime.datetime`` object as a field value to ``add_document``:: 25 | 26 | from whoosh.fields import Schema, ID, DATETIME 27 | from whoosh.filedb.filestore import RamStorage 28 | from datetime import datetime 29 | 30 | schema = Schema(id=ID, date=DATETIME) 31 | storage = RamStorage() 32 | ix = storage.create_index(schema) 33 | w = ix.writer() 34 | w.add_document(id=u"A", date=datetime.now()) 35 | w.close() 36 | 37 | Internally, the DATETIME field indexes the datetime object as text using 38 | the format (4 digit year + 2 digit month + 2 digit day + 'T' + 2 digit hour + 39 | 2 digit minute + 2 digit second + 6 digit microsecond), for example 40 | ``20090817T160203109000``. 41 | 42 | * The default query parser now lets you use quoted strings in prefix and range 43 | queries, e.g. ``["2009-05" TO "2009-12"]``, ``"alfa/bravo"*``, making it 44 | easier to work with terms containing special characters. 45 | 46 | * ``DocReader.vector_as(docnum, fieldid, astype)`` is now 47 | ``IndexReader.vector_as(astype, docnum, fieldid)`` (i.e. the astype argument 48 | has moved from the last to the first argument), e.g. 49 | ``v = ixreader.vector_as("frequency", 102, "content")``. 50 | 51 | * Added ``whoosh.support.charset`` for translating Sphinx charset table files. 52 | 53 | * Added ``whoosh.analysis.CharsetTokenizer`` and ``CharsetFilter`` to enable case and 54 | accent folding. 55 | 56 | * Added an experimental ``whoosh.ramdb`` in-memory backend. 57 | 58 | * Added an experimental ``whoosh.query.FuzzyTerm`` query type.
59 | 60 | * Added the ``whoosh.lang.wordnet`` module containing a ``Thesaurus`` object for using 61 | the WordNet synonym database. 62 | -------------------------------------------------------------------------------- /docs/source/releases/index.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | Release notes 3 | ============= 4 | 5 | .. toctree:: 6 | :maxdepth: 2 7 | 8 | 2_0 9 | 1_0 10 | 0_3 11 | -------------------------------------------------------------------------------- /docs/source/spelling.rst: -------------------------------------------------------------------------------- 1 | ===================================================== 2 | "Did you mean... ?" Correcting errors in user queries 3 | ===================================================== 4 | 5 | Overview 6 | ======== 7 | 8 | Whoosh can quickly suggest replacements for mistyped words by returning 9 | a list of words from the index (or a dictionary) that are close to the 10 | mistyped word:: 11 | 12 | with ix.searcher() as s: 13 | corrector = s.corrector("text") 14 | for mistyped_word in mistyped_words: 15 | print(corrector.suggest(mistyped_word, limit=3)) 16 | 17 | See the :meth:`whoosh.spelling.Corrector.suggest` method documentation 18 | for information on the arguments. 19 | 20 | Currently the suggestion engine is more like a "typo corrector" than a 21 | real "spell checker", since it doesn't do the kind of sophisticated 22 | phonetic matching or semantic/contextual analysis a good spell checker 23 | might. However, it is still very useful. 24 | 25 | There are two main strategies for correcting words: 26 | 27 | * Use the terms from an index field. 28 | 29 | * Use words from a word list. 30 | 31 | 32 | Pulling suggestions from an indexed field 33 | ========================================= 34 | 35 | In Whoosh 2.7 and later, spelling suggestions are available on all fields. 36 | However, if you have an analyzer that modifies the indexed words (such as 37 | stemming), you can add ``spelling=True`` to a field to have it store separate 38 | unmodified versions of the terms for spelling suggestions:: 39 | 40 | ana = analysis.StemmingAnalyzer() 41 | schema = fields.Schema(text=fields.TEXT(analyzer=ana, spelling=True)) 42 | 43 | You can then use the :meth:`whoosh.searching.Searcher.corrector` method 44 | to get a corrector for a field:: 45 | 46 | corrector = searcher.corrector("content") 47 | 48 | The advantage of using the contents of an index field is that when you 49 | are spell checking queries on that index, the suggestions are tailored 50 | to the contents of the index. The disadvantage is that if the indexed 51 | documents contain spelling errors, then the spelling suggestions will 52 | also be erroneous. 53 | 54 | 55 | Pulling suggestions from a word list 56 | ==================================== 57 | 58 | There are plenty of word lists available on the internet that you can use to 59 | populate the spelling dictionary. 60 | 61 | (In the following examples, ``word_list`` can be a list of unicode 62 | strings, or a file object with one word on each line.)
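For example, a minimal sketch of turning a word file into the sorted list the correctors below expect (``words.txt`` is an assumed path, one word per line)::

    with open("words.txt", encoding="utf-8") as f:
        word_list = sorted(set(line.strip() for line in f if line.strip()))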
63 | 64 | To create a :class:`whoosh.spelling.Corrector` object from a sorted word list:: 65 | 66 | from whoosh.spelling import ListCorrector 67 | 68 | # word_list must be a sorted list of unicode strings 69 | corrector = ListCorrector(word_list) 70 | 71 | 72 | Merging two or more correctors 73 | ============================== 74 | 75 | You can combine suggestions from two sources (for example, the contents 76 | of an index field and a word list) using a 77 | :class:`whoosh.spelling.MultiCorrector`:: 78 | 79 | c1 = searcher.corrector("content") 80 | c2 = spelling.ListCorrector(word_list) 81 | corrector = spelling.MultiCorrector([c1, c2]) 82 | 83 | 84 | Correcting user queries 85 | ======================= 86 | 87 | You can spell-check a user query using the 88 | :meth:`whoosh.searching.Searcher.correct_query` method:: 89 | 90 | from whoosh import qparser 91 | 92 | # Parse the user query string 93 | qp = qparser.QueryParser("content", myindex.schema) 94 | q = qp.parse(qstring) 95 | 96 | # Try correcting the query 97 | with myindex.searcher() as s: 98 | corrected = s.correct_query(q, qstring) 99 | if corrected.query != q: 100 | print("Did you mean:", corrected.string) 101 | 102 | The ``correct_query`` method returns an object with the following 103 | attributes: 104 | 105 | ``query`` 106 | A corrected :class:`whoosh.query.Query` tree. You can test 107 | whether this is equal (``==``) to the original parsed query to 108 | check if the corrector actually changed anything. 109 | 110 | ``string`` 111 | A corrected version of the user's query string. 112 | 113 | ``tokens`` 114 | A list of corrected token objects representing the corrected 115 | terms. You can use this to reformat the user query (see below). 116 | 117 | 118 | You can use a :class:`whoosh.highlight.Formatter` object to format the 119 | corrected query string. For example, use the 120 | :class:`~whoosh.highlight.HtmlFormatter` to format the corrected string 121 | as HTML:: 122 | 123 | from whoosh import highlight 124 | 125 | hf = highlight.HtmlFormatter() 126 | corrected = s.correct_query(q, qstring, formatter=hf) 127 | 128 | See the documentation for 129 | :meth:`whoosh.searching.Searcher.correct_query` for information on the 130 | defaults and arguments. 131 | -------------------------------------------------------------------------------- /docs/source/tech/filedb.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | filedb notes 3 | ============ 4 | 5 | TBD. 6 | 7 | Files created 8 | ============= 9 | 10 | .toc 11 | The "master" file containing information about the index and its segments. 12 | 13 | The index directory will contain a set of files for each segment. A segment is like a mini-index -- when you add documents to the index, Whoosh creates a new segment and then searches the old segment(s) and the new segment to avoid having to do a big merge every time you add a document. When you get enough small segments, Whoosh will merge them into larger segments or a single segment. 14 | 15 | .dci 16 | Contains per-document information (e.g. field lengths). This will grow linearly with the number of documents. 17 | 18 | .dcz 19 | Contains the stored fields for each document. 20 | 21 | .tiz 22 | Contains per-term information. The size of this file will vary based on the number of unique terms. 23 | 24 | .pst 25 | Contains per-term postings. The size of this file depends on the size of the collection and the formats used for each field (e.g.
storing term positions takes more space than storing frequency only). 26 | 27 | .fvz 28 | Contains term vectors (forward indexes) for each document. This file is only created if at least one field in the schema stores term vectors. The size will vary based on the number of documents, field length, the formats used for each vector (e.g. storing term positions takes more space than storing frequency only), etc. 29 | -------------------------------------------------------------------------------- /docs/source/tech/index.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | Technical notes 3 | =============== 4 | 5 | .. toctree:: 6 | :glob: 7 | :maxdepth: 2 8 | 9 | * 10 | -------------------------------------------------------------------------------- /docs/source/threads.rst: -------------------------------------------------------------------------------- 1 | ==================================== 2 | Concurrency, locking, and versioning 3 | ==================================== 4 | 5 | Concurrency 6 | =========== 7 | 8 | The ``FileIndex`` object is "stateless" and should be shareable between 9 | threads. 10 | 11 | A ``Reader`` object (which underlies the ``Searcher`` object) wraps open files, and 12 | individual methods often rely on consistent file cursor positions (e.g. they do two 13 | ``file.read()``\ s in a row, so if another thread moves the cursor between the two 14 | read calls, Bad Things would happen). You should use one Reader/Searcher per 15 | thread in your code. 16 | 17 | Readers/Searchers tend to cache information (such as field caches for sorting), 18 | so if you can share one across multiple search requests, it's a big performance 19 | win. 20 | 21 | 22 | Locking 23 | ======= 24 | 25 | Only one thread/process can write to an index at a time. When you open a writer, 26 | it locks the index. If you try to open a writer on the same index in another 27 | thread/process, it will raise ``whoosh.store.LockError``. 28 | 29 | In a multi-threaded or multi-process environment, your code needs to be aware 30 | that opening a writer may raise this exception if a writer is already open. 31 | Whoosh includes a couple of example implementations 32 | (:class:`whoosh.writing.AsyncWriter` and :class:`whoosh.writing.BufferedWriter`) 33 | of ways to work around the write lock. 34 | 35 | While the writer is open and during the commit, **the index is still available 36 | for reading**. Existing readers are unaffected and new readers can open the 37 | current index normally. 38 | 39 | 40 | Lock files 41 | ---------- 42 | 43 | Locking the index is accomplished by acquiring an exclusive file lock on the 44 | ``_WRITELOCK`` file in the index directory. The file is not deleted 45 | after the file lock is released, so the fact that the file exists **does not** 46 | mean the index is locked. 47 | 48 | 49 | Versioning 50 | ========== 51 | 52 | When you open a reader/searcher, the reader represents a view of the **current 53 | version** of the index. If someone writes changes to the index, any readers 54 | that are already open **will not** pick up the changes automatically. A reader 55 | always sees the index as it existed when the reader was opened. 56 | 57 | If you are reusing a Searcher across multiple search requests, you can check 58 | whether the Searcher is a view of the latest version of the index using 59 | :meth:`whoosh.searching.Searcher.up_to_date`.
If the searcher is not up to date, 60 | you can get an up-to-date copy of the searcher using 61 | :meth:`whoosh.searching.Searcher.refresh`:: 62 | 63 | # If 'searcher' is not up-to-date, replace it 64 | searcher = searcher.refresh() 65 | 66 | (If the searcher has the latest version of the index, ``refresh()`` simply 67 | returns it.) 68 | 69 | Calling ``Searcher.refresh()`` is more efficient than closing the searcher and 70 | opening a new one, since it will reuse any underlying readers and caches that 71 | haven't changed. 72 | -------------------------------------------------------------------------------- /files/whoosh_16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sygil-Dev/whoosh-reloaded/ab775ca5bfb443c2b3598b6a2d6189e595f7c377/files/whoosh_16.png -------------------------------------------------------------------------------- /files/whoosh_35.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sygil-Dev/whoosh-reloaded/ab775ca5bfb443c2b3598b6a2d6189e595f7c377/files/whoosh_35.png -------------------------------------------------------------------------------- /files/whoosh_64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sygil-Dev/whoosh-reloaded/ab775ca5bfb443c2b3598b6a2d6189e595f7c377/files/whoosh_64.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.ruff] 2 | target-version = "py38" 3 | 4 | [tool.ruff.lint] 5 | select = [ 6 | "AIR", # Airflow 7 | "ASYNC", # flake8-async 8 | "BLE", # flake8-blind-except 9 | "C4", # flake8-comprehensions 10 | "C90", # McCabe cyclomatic complexity 11 | "DJ", # flake8-django 12 | "DTZ", # flake8-datetimez 13 | "EXE", # flake8-executable 14 | "F", # Pyflakes 15 | "FA", # flake8-future-annotations 16 | "G", # flake8-logging-format 17 | "I", # isort 18 | "ICN", # flake8-import-conventions 19 | "INT", # flake8-gettext 20 | "LOG", # flake8-logging 21 | "NPY", # NumPy-specific rules 22 | "PERF", # Perflint 23 | "PLC", # Pylint conventions 24 | "PLE", # Pylint errors 25 | "PLR091", # Pylint Refactor just for max-args, max-branches, etc.
26 | "PYI", # flake8-pyi 27 | "Q", # flake8-quotes 28 | "SLOT", # flake8-slots 29 | "TCH", # flake8-type-checking 30 | "TID", # flake8-tidy-imports 31 | "TRIO", # flake8-trio 32 | "UP", # pyupgrade 33 | "W", # pycodestyle 34 | "YTT", # flake8-2020 35 | # "A", # flake8-builtins 36 | # "ANN", # flake8-annotations 37 | # "ARG", # flake8-unused-arguments 38 | # "B", # flake8-bugbear 39 | # "COM", # flake8-commas 40 | # "CPY", # flake8-copyright 41 | # "D", # pydocstyle 42 | # "E", # pycodestyle 43 | # "EM", # flake8-errmsg 44 | # "ERA", # eradicate 45 | # "FBT", # flake8-boolean-trap 46 | # "FIX", # flake8-fixme 47 | # "FLY", # flynt 48 | # "FURB", # refurb 49 | # "INP", # flake8-no-pep420 50 | # "ISC", # flake8-implicit-str-concat 51 | # "N", # pep8-naming 52 | # "PD", # pandas-vet 53 | # "PGH", # pygrep-hooks 54 | # "PIE", # flake8-pie 55 | # "PL", # Pylint 56 | # "PT", # flake8-pytest-style 57 | # "PTH", # flake8-use-pathlib 58 | # "RET", # flake8-return 59 | # "RSE", # flake8-raise 60 | # "RUF", # Ruff-specific rules 61 | # "S", # flake8-bandit 62 | # "SIM", # flake8-simplify 63 | # "SLF", # flake8-self 64 | # "T10", # flake8-debugger 65 | # "T20", # flake8-print 66 | # "TD", # flake8-todos 67 | # "TRY", # tryceratops 68 | ] 69 | ignore = [ 70 | "EXE001", 71 | "F401", 72 | "F811", 73 | "F841", 74 | "PERF203", 75 | "UP031", 76 | ] 77 | 78 | [tool.ruff.lint.mccabe] 79 | max-complexity = 45 # Default is 10 80 | 81 | [tool.ruff.lint.per-file-ignores] 82 | "src/whoosh/compat.py" = ["F821"] 83 | "src/whoosh/filedb/filestore.py" = ["UP024"] 84 | "src/whoosh/util/__init__.py" = ["F821"] 85 | 86 | [tool.ruff.lint.pylint] 87 | max-args = 22 # Default is 5 88 | max-branches = 79 # Default is 12 89 | max-returns = 16 # Default is 6 90 | max-statements = 256 # Default is 50 91 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pythomata 3 | versioneer 4 | -e . 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | . 
2 | -------------------------------------------------------------------------------- /scripts/make_checkpoint.py: -------------------------------------------------------------------------------- 1 | #!python 2 | 3 | # Make a "checkpoint" index, capturing the index format created by a certain 4 | # version of Whoosh 5 | 6 | 7 | import os.path 8 | import random 9 | import sys 10 | from datetime import datetime, timezone 11 | 12 | from whoosh import fields, index 13 | 14 | if len(sys.argv) < 2: 15 | print("USAGE: make_checkpoint.py ") 16 | sys.exit(1) 17 | indexdir = sys.argv[1] 18 | print("Creating checkpoint index in", indexdir) 19 | 20 | schema = fields.Schema( 21 | path=fields.ID(stored=True, unique=True), 22 | num=fields.NUMERIC(int, stored=True), 23 | frac=fields.NUMERIC(float, stored=True), 24 | dt=fields.DATETIME(stored=True), 25 | tag=fields.KEYWORD, 26 | title=fields.TEXT(stored=True), 27 | ngrams=fields.NGRAMWORDS, 28 | ) 29 | 30 | # Note the trailing spaces inside the string literals: without them the
implicit concatenation would fuse "india"+"juliet" and "romeo"+"sierra" 31 | words = ( 32 | "alfa bravo charlie delta echo foxtrot golf hotel india " 33 | "juliet kilo lima mike november oskar papa quebec romeo " 34 | "sierra tango" 35 | ).split() 36 | 37 | if not os.path.exists(indexdir): 38 | os.makedirs(indexdir) 39 | 40 | ix = index.create_in(indexdir, schema) 41 | counter = 0 42 | frac = 0.0 43 | for segnum in range(3): 44 | with ix.writer() as w: 45 | for num in range(100): 46 | frac += 0.15 47 | path = f"{segnum}/{num}" 48 | title = " ".join(random.choice(words) for _ in range(100)) 49 | dt = datetime( 50 | year=2000 + counter, 51 | month=(counter % 12) + 1, 52 | day=15, 53 | tzinfo=timezone.utc, 54 | ) 55 | 56 | w.add_document( 57 | path=path, 58 | num=counter, 59 | frac=frac, 60 | dt=dt, 61 | tag=words[counter % len(words)], 62 | title=title, 63 | ngrams=title, 64 | ) 65 | counter += 1 66 | 67 | with ix.writer() as w: 68 | for path in ("0/42", "1/6", "2/80"): 69 | print("Deleted", path, w.delete_by_term("path", path)) 70 | 71 | print(counter, ix.doc_count()) 72 | -------------------------------------------------------------------------------- /scripts/read_checkpoint.py: -------------------------------------------------------------------------------- 1 | #!python 2 | 3 | # Read a "checkpoint" index, to check backwards compatibility 4 | 5 | 6 | import sys 7 | 8 | from whoosh import index, query 9 | 10 | if len(sys.argv) < 2: 11 | print("USAGE: read_checkpoint.py ") 12 | sys.exit(1) 13 | indexdir = sys.argv[1] 14 | print("Reading checkpoint index in", indexdir) 15 | 16 | # Must match the word list in make_checkpoint.py (note the trailing spaces) 17 | words = ( 18 | "alfa bravo charlie delta echo foxtrot golf hotel india " 19 | "juliet kilo lima mike november oskar papa quebec romeo " 20 | "sierra tango" 21 | ).split() 22 | 23 | deleted = ("0/42", "1/6", "2/80") 24 | 25 | ix = index.open_dir(indexdir) 26 | with ix.searcher() as s: 27 | dtfield = ix.schema["dt"] 28 | for sf in s.all_stored_fields(): 29 | if sf["path"] in deleted: 30 | continue 31 | 32 | num = sf["num"] 33 | r = s.search(query.Term("num", num), limit=None) 34 | assert len(r) == 1 35 | assert r[0]["num"] == num 36 | 37 | frac = sf["frac"] 38 | r = s.search(query.Term("frac", frac), limit=None) 39 | assert len(r) == 1 40 | assert r[0]["frac"] == frac 41 | 42 | dt = sf["dt"] 43 | q = query.Term("dt", dt) 44 | r = s.search(q, limit=None) 45 | if len(r) > 1: 46 | for hit in r: 47 | print(hit.fields()) 48 | assert len(r) == 1, len(r) 49 | assert r[0]["dt"] == dt 50 | 51 | print("Done") 52 | -------------------------------------------------------------------------------- /setup.cfg:
-------------------------------------------------------------------------------- 1 | [wheel] 2 | universal = True 3 | 4 | [build_sphinx] 5 | build-dir = docs/build 6 | source-dir = docs/source 7 | 8 | [upload_sphinx] 9 | upload-dir = docs/build/html 10 | 11 | [sdist] 12 | formats = zip,gztar 13 | 14 | [aliases] 15 | push = sdist bdist_wheel twine upload 16 | pushdocs = build_sphinx upload_sphinx 17 | 18 | [tool:pytest] 19 | ; --tb= traceback print mode (long/short/line/native/no) 20 | addopts = -rs --tb=short 21 | 22 | norecursedirs = .hg .tox _build tmp* env* benchmark stress 23 | minversion = 3.0 24 | python_files = test_*.py 25 | 26 | [tool.coverage.run] 27 | source = ["src/whoosh"] 28 | 29 | [tool.pytest.ini_options] 30 | addopts = "--cov --cov-report=lcov:lcov.info --cov-report=term" 31 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!python 2 | 3 | import os.path 4 | import sys 5 | 6 | from setuptools import find_packages, setup 7 | from setuptools.command.test import test as TestCommand 8 | 9 | try: 10 | import pytest 11 | except ImportError: 12 | pytest = None 13 | 14 | sys.path.insert(0, os.path.abspath("src")) 15 | from whoosh import versionstring 16 | 17 | 18 | class PyTest(TestCommand): 19 | def finalize_options(self): 20 | TestCommand.finalize_options(self) 21 | self.test_args = [] 22 | self.test_suite = True 23 | 24 | def run_tests(self): 25 | # import here, cause outside the eggs aren't loaded 26 | import pytest 27 | 28 | pytest.main(self.test_args) 29 | 30 | 31 | if __name__ == "__main__": 32 | setup( 33 | name="Whoosh-Reloaded", 34 | version=versionstring(), 35 | package_dir={"": "src"}, 36 | packages=find_packages("src"), 37 | author="Matt Chaput", 38 | author_email="matt@whoosh.ca", 39 | maintainer="Sygil-Dev", 40 | description="Fast, pure-Python full text indexing, search, and spell checking library.", 41 | long_description=open("README.md").read(), 42 | long_description_content_type="text/markdown", 43 | license="Two-clause BSD license", 44 | keywords="index search text spell", 45 | url="https://github.com/Sygil-Dev/whoosh-reloaded", 46 | zip_safe=True, 47 | install_requires=[ 48 | "cached-property==2.0.1", 49 | "loguru==0.7.3", 50 | ], 51 | tests_require=[ 52 | "pytest==8.4.0", 53 | "nose==1.3.7", 54 | "pre-commit==4.2.0", 55 | ], 56 | cmdclass={"test": PyTest}, 57 | classifiers=[ 58 | "Programming Language :: Python :: 3", 59 | "Development Status :: 5 - Production/Stable", 60 | "Intended Audience :: Developers", 61 | "License :: OSI Approved :: BSD License", 62 | "Natural Language :: English", 63 | "Operating System :: OS Independent", 64 | "Programming Language :: Python :: 3.8", 65 | "Programming Language :: Python :: 3.9", 66 | "Programming Language :: Python :: 3.10", 67 | "Programming Language :: Python :: 3.11", 68 | "Programming Language :: Python :: 3.12", 69 | "Topic :: Software Development :: Libraries :: Python Modules", 70 | "Topic :: Text Processing :: Indexing", 71 | ], 72 | ) 73 | -------------------------------------------------------------------------------- /src/whoosh/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2008 Matt Chaput. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. 
Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # 13 | # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | # 24 | # The views and conclusions contained in the software and documentation are 25 | # those of the authors and should not be interpreted as representing official 26 | # policies, either expressed or implied, of Matt Chaput. 27 | 28 | __version__ = (3, 0, 0) 29 | 30 | 31 | def versionstring(build=True, extra=True): 32 | """Returns the version number of Whoosh as a string. 33 | 34 | :param build: Whether to include the build number in the string. 35 | :param extra: Whether to include alpha/beta/rc etc. tags. Only 36 | checked if build is True. 37 | :rtype: str 38 | """ 39 | 40 | if build: 41 | first = 3 42 | else: 43 | first = 2 44 | 45 | s = ".".join(str(n) for n in __version__[:first]) 46 | if build and extra: 47 | s += "".join(str(n) for n in __version__[3:]) 48 | 49 | return s 50 | -------------------------------------------------------------------------------- /src/whoosh/analysis/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2007 Matt Chaput. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # 13 | # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
23 | # 24 | # The views and conclusions contained in the software and documentation are 25 | # those of the authors and should not be interpreted as representing official 26 | # policies, either expressed or implied, of Matt Chaput. 27 | 28 | """Classes and functions for turning a piece of text into an indexable stream 29 | of "tokens" (usually equivalent to words). There are three general classes 30 | involved in analysis: 31 | 32 | * Tokenizers are always at the start of the text processing pipeline. They take 33 | a string and yield Token objects (actually, the same token object over and 34 | over, for performance reasons) corresponding to the tokens (words) in the 35 | text. 36 | 37 | Every tokenizer is a callable that takes a string and returns an iterator of 38 | tokens. 39 | 40 | * Filters take the tokens from the tokenizer and perform various 41 | transformations on them. For example, the LowercaseFilter converts all tokens 42 | to lowercase, which is usually necessary when indexing regular English text. 43 | 44 | Every filter is a callable that takes a token generator and returns a token 45 | generator. 46 | 47 | * Analyzers are convenience functions/classes that "package up" a tokenizer and 48 | zero or more filters into a single unit. For example, the StandardAnalyzer 49 | combines a RegexTokenizer, LowercaseFilter, and StopFilter. 50 | 51 | Every analyzer is a callable that takes a string and returns a token 52 | iterator. (So Tokenizers can be used as Analyzers if you don't need any 53 | filtering). 54 | 55 | You can compose tokenizers and filters together using the ``|`` character:: 56 | 57 | my_analyzer = RegexTokenizer() | LowercaseFilter() | StopFilter() 58 | 59 | The first item must be a tokenizer and the rest must be filters (you can't put 60 | a filter first or a tokenizer after the first item). 
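For example, a minimal sketch of composing and calling an analyzer (the exact
tokens produced depend on the tokenizer pattern and the filters used)::

    my_analyzer = RegexTokenizer() | LowercaseFilter()
    print([t.text for t in my_analyzer("Hello There")])
    # -> ["hello", "there"]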
61 | """ 62 | 63 | from whoosh.analysis.acore import ( 64 | Composable, 65 | CompositionError, 66 | Token, 67 | entoken, 68 | unstopped, 69 | ) 70 | from whoosh.analysis.analyzers import ( 71 | Analyzer, 72 | FancyAnalyzer, 73 | IDAnalyzer, 74 | KeywordAnalyzer, 75 | LanguageAnalyzer, 76 | RegexAnalyzer, 77 | SimpleAnalyzer, 78 | StandardAnalyzer, 79 | StemmingAnalyzer, 80 | ) 81 | from whoosh.analysis.filters import ( 82 | STOP_WORDS, 83 | CharsetFilter, 84 | Composable, 85 | DelimitedAttributeFilter, 86 | Filter, 87 | LoggingFilter, 88 | LowercaseFilter, 89 | MultiFilter, 90 | PassFilter, 91 | ReverseTextFilter, 92 | StopFilter, 93 | StripFilter, 94 | SubstitutionFilter, 95 | TeeFilter, 96 | url_pattern, 97 | ) 98 | from whoosh.analysis.intraword import ( 99 | BiWordFilter, 100 | CompoundWordFilter, 101 | IntraWordFilter, 102 | ShingleFilter, 103 | ) 104 | from whoosh.analysis.morph import DoubleMetaphoneFilter, PyStemmerFilter, StemFilter 105 | from whoosh.analysis.ngrams import ( 106 | NgramAnalyzer, 107 | NgramFilter, 108 | NgramTokenizer, 109 | NgramWordAnalyzer, 110 | ) 111 | from whoosh.analysis.tokenizers import ( 112 | CharsetTokenizer, 113 | CommaSeparatedTokenizer, 114 | IDTokenizer, 115 | PathTokenizer, 116 | RegexTokenizer, 117 | SpaceSeparatedTokenizer, 118 | Tokenizer, 119 | ) 120 | -------------------------------------------------------------------------------- /src/whoosh/automata/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sygil-Dev/whoosh-reloaded/ab775ca5bfb443c2b3598b6a2d6189e595f7c377/src/whoosh/automata/__init__.py -------------------------------------------------------------------------------- /src/whoosh/automata/glob.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012 Matt Chaput. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # 13 | # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | # 24 | # The views and conclusions contained in the software and documentation are 25 | # those of the authors and should not be interpreted as representing official 26 | # policies, either expressed or implied, of Matt Chaput. 
27 | 28 | from whoosh.automata.fsa import ANY, EPSILON, NFA 29 | 30 | # Constants for glob 31 | _LIT = 0 32 | _STAR = 1 33 | _PLUS = 2 34 | _QUEST = 3 35 | _RANGE = 4 36 | 37 | 38 | def parse_glob( 39 | pattern, _glob_multi="*", _glob_single="?", _glob_range1="[", _glob_range2="]" 40 | ): 41 | pos = 0 42 | last = None 43 | while pos < len(pattern): 44 | char = pattern[pos] 45 | pos += 1 46 | if char == _glob_multi: # * 47 | # (Ignore more than one star in a row) 48 | if last is not _STAR: 49 | yield _STAR, None 50 | last = _STAR 51 | elif char == _glob_single: # ? 52 | # (Ignore ? after a star) 53 | if last is not _STAR: 54 | yield _QUEST, None 55 | last = _QUEST 56 | elif char == _glob_range1: # [ 57 | chars = set() 58 | negate = False 59 | # Take the char range specification until the ] 60 | while pos < len(pattern): 61 | char = pattern[pos] 62 | pos += 1 63 | if char == _glob_range2: 64 | break 65 | chars.add(char) 66 | if chars: 67 | yield _RANGE, (chars, negate) 68 | last = _RANGE 69 | else: 70 | yield _LIT, char 71 | last = _LIT 72 | 73 | 74 | def glob_automaton(pattern): 75 | nfa = NFA(0) 76 | i = -1 77 | for i, (op, arg) in enumerate(parse_glob(pattern)): 78 | if op is _LIT: 79 | nfa.add_transition(i, arg, i + 1) 80 | elif op is _STAR: 81 | nfa.add_transition(i, ANY, i + 1) 82 | nfa.add_transition(i, EPSILON, i + 1) 83 | nfa.add_transition(i + 1, EPSILON, i) 84 | elif op is _QUEST: 85 | nfa.add_transition(i, ANY, i + 1) 86 | elif op is _RANGE: 87 | for char in arg[0]: 88 | nfa.add_transition(i, char, i + 1) 89 | nfa.add_final_state(i + 1) 90 | return nfa 91 | -------------------------------------------------------------------------------- /src/whoosh/automata/lev.py: -------------------------------------------------------------------------------- 1 | from whoosh.automata.fsa import ANY, EPSILON, NFA 2 | 3 | 4 | def levenshtein_automaton(term, k, prefix=0): 5 | nfa = NFA((0, 0)) 6 | if prefix: 7 | for i in range(prefix): 8 | c = term[i] 9 | nfa.add_transition((i, 0), c, (i + 1, 0)) 10 | 11 | for i in range(prefix, len(term)): 12 | c = term[i] 13 | for e in range(k + 1): 14 | # Correct character 15 | nfa.add_transition((i, e), c, (i + 1, e)) 16 | if e < k: 17 | # Deletion 18 | nfa.add_transition((i, e), ANY, (i, e + 1)) 19 | # Insertion 20 | nfa.add_transition((i, e), EPSILON, (i + 1, e + 1)) 21 | # Substitution 22 | nfa.add_transition((i, e), ANY, (i + 1, e + 1)) 23 | for e in range(k + 1): 24 | if e < k: 25 | nfa.add_transition((len(term), e), ANY, (len(term), e + 1)) 26 | nfa.add_final_state((len(term), e)) 27 | return nfa 28 | -------------------------------------------------------------------------------- /src/whoosh/automata/reg.py: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Matt Chaput. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 
12 | # 13 | # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | # 24 | # The views and conclusions contained in the software and documentation are 25 | # those of the authors and should not be interpreted as representing official 26 | # policies, either expressed or implied, of Matt Chaput. 27 | 28 | from whoosh.automata.fsa import ANY, EPSILON, NFA 29 | 30 | # Operator precedence 31 | CHOICE = ("|",) 32 | ops = () 33 | 34 | 35 | def parse(pattern):  # incomplete stub: the regex parser is not implemented yet 36 | stack = [] 37 | ops = [] 38 | 39 | 40 | class RegexBuilder: 41 | def __init__(self): 42 | self.statenum = 1 43 | 44 | def new_state(self): 45 | self.statenum += 1 46 | return self.statenum 47 | 48 | def epsilon(self): 49 | s = self.new_state() 50 | e = self.new_state() 51 | nfa = NFA(s) 52 | nfa.add_transition(s, EPSILON, e) 53 | nfa.add_final_state(e) 54 | return nfa 55 | 56 | def char(self, label): 57 | s = self.new_state() 58 | e = self.new_state() 59 | nfa = NFA(s) 60 | nfa.add_transition(s, label, e) 61 | nfa.add_final_state(e) 62 | return nfa 63 | 64 | def charset(self, chars): 65 | s = self.new_state() 66 | e = self.new_state() 67 | nfa = NFA(s) 68 | for char in chars: 69 | nfa.add_transition(s, char, e) 70 | nfa.add_final_state(e) 71 | return nfa  # return the automaton itself, consistent with the other builders 72 | 73 | def dot(self): 74 | s = self.new_state() 75 | e = self.new_state() 76 | nfa = NFA(s) 77 | nfa.add_transition(s, ANY, e) 78 | nfa.add_final_state(e) 79 | return nfa 80 | 81 | def choice(self, n1, n2): 82 | s = self.new_state() 83 | s1 = self.new_state() 84 | s2 = self.new_state() 85 | e1 = self.new_state() 86 | e2 = self.new_state() 87 | e = self.new_state() 88 | nfa = NFA(s) 89 | nfa.add_transition(s, EPSILON, s1) 90 | nfa.add_transition(s, EPSILON, s2) 91 | nfa.insert(s1, n1, e1) 92 | nfa.insert(s2, n2, e2) 93 | nfa.add_transition(e1, EPSILON, e) 94 | nfa.add_transition(e2, EPSILON, e) 95 | nfa.add_final_state(e) 96 | return nfa 97 | 98 | def concat(self, n1, n2): 99 | s = self.new_state() 100 | m = self.new_state() 101 | e = self.new_state() 102 | nfa = NFA(s) 103 | nfa.insert(s, n1, m) 104 | nfa.insert(m, n2, e) 105 | nfa.add_final_state(e) 106 | return nfa 107 | 108 | def star(self, n): 109 | s = self.new_state() 110 | m1 = self.new_state() 111 | m2 = self.new_state() 112 | e = self.new_state() 113 | nfa = NFA(s) 114 | nfa.add_transition(s, EPSILON, m1) 115 | nfa.add_transition(s, EPSILON, e) 116 | nfa.insert(m1, n, m2) 117 | nfa.add_transition(m2, EPSILON, m1) 118 | nfa.add_transition(m2, EPSILON, e) 119 | nfa.add_final_state(e) 120 | return nfa 121 | 122 | def plus(self, n): 123 | return self.concat(n, self.star(n)) 124 | 125 | def question(self, n): 126 | return self.choice(n, self.epsilon()) 127 | -------------------------------------------------------------------------------- /src/whoosh/codec/__init__.py:
-------------------------------------------------------------------------------- 1 | # Copyright 2012 Matt Chaput. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # 13 | # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | # 24 | # The views and conclusions contained in the software and documentation are 25 | # those of the authors and should not be interpreted as representing official 26 | # policies, either expressed or implied, of Matt Chaput. 27 | 28 | 29 | def default_codec(*args, **kwargs): 30 | from whoosh.codec.whoosh3 import W3Codec 31 | 32 | return W3Codec(*args, **kwargs) 33 | -------------------------------------------------------------------------------- /src/whoosh/filedb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sygil-Dev/whoosh-reloaded/ab775ca5bfb443c2b3598b6a2d6189e595f7c377/src/whoosh/filedb/__init__.py -------------------------------------------------------------------------------- /src/whoosh/filedb/misc.py: -------------------------------------------------------------------------------- 1 | # =============================================================================== 2 | # Copyright 2010 Matt Chaput 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # =============================================================================== 16 | 17 | from marshal import dumps as mdumps 18 | from marshal import loads as mloads 19 | from pickle import dumps, loads 20 | from struct import Struct 21 | 22 | from whoosh.system import ( 23 | _SHORT_SIZE, 24 | pack_uint, 25 | pack_ushort, 26 | unpack_uint, 27 | unpack_ushort, 28 | ) 29 | from whoosh.util import utf8decode, utf8encode 30 | 31 | 32 | def encode_termkey(term): 33 | fieldnum, text = term 34 | return pack_ushort(fieldnum) + utf8encode(text)[0] 35 | 36 | 37 | def decode_termkey(key): 38 | return (unpack_ushort(key[:_SHORT_SIZE])[0], utf8decode(key[_SHORT_SIZE:])[0]) 39 | 40 | 41 | _terminfo_struct = Struct("!III") # frequency, offset, postcount 42 | _pack_terminfo = _terminfo_struct.pack 43 | encode_terminfo = lambda cf_offset_df: _pack_terminfo(*cf_offset_df) 44 | decode_terminfo = _terminfo_struct.unpack 45 | 46 | encode_docnum = pack_uint 47 | decode_docnum = lambda x: unpack_uint(x)[0] 48 | 49 | enpickle = lambda data: dumps(data, -1) 50 | depickle = loads 51 | 52 | enmarshal = mdumps 53 | demarshal = mloads 54 | -------------------------------------------------------------------------------- /src/whoosh/lang/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012 Matt Chaput. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # 13 | # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | # 24 | # The views and conclusions contained in the software and documentation are 25 | # those of the authors and should not be interpreted as representing official 26 | # policies, either expressed or implied, of Matt Chaput. 
# Exceptions


class NoStemmer(Exception):
    pass


class NoStopWords(Exception):
    pass


# Data and functions for language names

languages = (
    "ar",
    "da",
    "nl",
    "en",
    "fi",
    "fr",
    "de",
    "hu",
    "it",
    "no",
    "pt",
    "ro",
    "ru",
    "es",
    "sv",
    "tr",
)

aliases = {
    # By ISO 639-2 three-letter codes (the two-letter codes above are ISO 639-1)
    "ara": "ar",
    "dan": "da",
    "nld": "nl",
    "eng": "en",
    "fin": "fi",
    "fra": "fr",
    "deu": "de",
    "hun": "hu",
    "ita": "it",
    "nor": "no",
    "por": "pt",
    "ron": "ro",
    "rus": "ru",
    "spa": "es",
    "swe": "sv",
    "tur": "tr",
    # By name in English
    "arabic": "ar",
    "danish": "da",
    "dutch": "nl",
    "english": "en",
    "finnish": "fi",
    "french": "fr",
    "german": "de",
    "hungarian": "hu",
    "italian": "it",
    "norwegian": "no",
    "portuguese": "pt",
    "romanian": "ro",
    "russian": "ru",
    "spanish": "es",
    "swedish": "sv",
    "turkish": "tr",
    # By name in own language
    "العربية": "ar",
    "dansk": "da",
    "nederlands": "nl",
    "suomi": "fi",
    "français": "fr",
    "deutsch": "de",
    "magyar": "hu",
    "italiano": "it",
    "norsk": "no",
    "português": "pt",
    "русский язык": "ru",
    "español": "es",
    "svenska": "sv",
    "türkçe": "tr",
}


def two_letter_code(name):
    if name in languages:
        return name
    if name in aliases:
        return aliases[name]
    return None


# Getter functions


def has_stemmer(lang):
    try:
        return bool(stemmer_for_language(lang))
    except NoStemmer:
        return False


def has_stopwords(lang):
    try:
        return bool(stopwords_for_language(lang))
    except NoStopWords:
        return False


def stemmer_for_language(lang):
    if lang == "en_porter":
        # Original porter stemming algorithm is several times faster than the
        # more correct porter2 algorithm in snowball package
        from .porter import stem as porter_stem

        return porter_stem

    tlc = two_letter_code(lang)

    if tlc == "ar":
        from .isri import ISRIStemmer

        return ISRIStemmer().stem

    from .snowball import classes as snowball_classes

    if tlc in snowball_classes:
        return snowball_classes[tlc]().stem

    raise NoStemmer(f"No stemmer available for {lang!r}")


def stopwords_for_language(lang):
    from .stopwords import stoplists

    tlc = two_letter_code(lang)
    if tlc in stoplists:
        return stoplists[tlc]

    raise NoStopWords(f"No stop-word list available for {lang!r}")
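# A quick illustrative sketch (not from the source) of the helpers above:
#
#     >>> two_letter_code("german")
#     'de'
#     >>> two_letter_code("deu")
#     'de'
#     >>> stem = stemmer_for_language("en_porter")
#     >>> stem("relational")
#     'relat'
-------------------------------------------------------------------------------- /src/whoosh/lang/phonetic.py: --------------------------------------------------------------------------------
"""
This module contains quasi-phonetic encoders for words in different languages.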
"""

import re

# This soundex implementation is adapted from the recipe here:
# http://code.activestate.com/recipes/52213/

english_codes = "01230120022455012623010202"


def soundex_en(word):
    # english_codes (above) maps each letter a-z to its soundex digit
    r = ""
    if word:
        # Remember the first character
        fc = None
        prevcode = None
        for char in word.lower():
            c = ord(char)
            if c >= 97 and c <= 122:  # a-z
                if not fc:
                    fc = char
                code = english_codes[c - 97]
                # Don't append the code if it's the same as the previous
                if code != prevcode:
                    r += code
                prevcode = code

        # Replace the first digit with the first alphabetic character; the
        # guard avoids a TypeError for words with no a-z characters at all
        if fc:
            r = fc + r[1:]

    return r


# Quasi-phonetic coder for Spanish, translated to Python from Sebastian
# Ferreyra's version here:
# http://www.javalobby.org/java/forums/t16936.html

_esp_codes = (
    ("\\Aw?[uh]?([aeiou])", ""),
    ("c[eiéí]|z|ll|sh|ch|sch|cc|y[aeiouáéíóú]|ps|bs|x|j|g[eiéí]", "s"),
    ("[aeiouhwáéíóúü]+", ""),
    ("y", ""),
    ("ñ|gn", "n"),
    ("[dpc]t", "t"),
    ("c[aouáóú]|ck|q", "k"),
    ("v", "b"),
    ("d$", "t"),  # Change a trailing d to a t
)
_esp_codes = tuple((re.compile(pat), repl) for pat, repl in _esp_codes)


def soundex_esp(word):
    word = word.lower()
    r = ""

    prevcode = None
    i = 0
    while i < len(word):
        code = None
        for expr, ecode in _esp_codes:
            match = expr.match(word, i)
            if match:
                i = match.end()
                code = ecode
                break

        if code is None:
            code = word[i]
            i += 1

        if code != prevcode:
            r += code
        prevcode = code

    return r


# This version of soundex for Arabic is translated to Python from Tammam
# Koujan's C# version here:
# http://www.codeproject.com/KB/recipes/ArabicSoundex.aspx

# Create a dictionary mapping arabic characters to digits
_arabic_codes = {}
for chars, code in {
    "\u0627\u0623\u0625\u0622\u062d\u062e\u0647\u0639\u063a\u0634\u0648\u064a": "0",
    "\u0641\u0628": "1",
    "\u062c\u0632\u0633\u0635\u0638\u0642\u0643": "2",
    "\u062a\u062b\u062f\u0630\u0636\u0637": "3",
    "\u0644": "4",
    "\u0645\u0646": "5",
    "\u0631": "6",
}.items():
    for char in chars:
        _arabic_codes[char] = code


def soundex_ar(word):
    if word[0] in "\u0627\u0623\u0625\u0622":
        word = word[1:]

    r = "0"
    prevcode = "0"
    if len(word) > 1:
        # Discard the first character
        for char in word[1:]:
            if char in _arabic_codes:
                code = _arabic_codes.get(char, "0")
                # Don't append the code if it's the same as the previous
                if code != prevcode:
                    # If the code is a 0 (vowel), don't process it
                    if code != "0":
                        r += code
                prevcode = code
    return r
-------------------------------------------------------------------------------- /src/whoosh/lang/porter.py: --------------------------------------------------------------------------------
"""
Reimplementation of the
`Porter stemming algorithm <https://tartarus.org/martin/PorterStemmer/>`_
in Python.

In my quick tests, this implementation is about 3.5 times faster than the
seriously weird Python linked from the official page.
8 | """ 9 | 10 | import re 11 | 12 | # Suffix replacement lists 13 | 14 | _step2list = { 15 | "ational": "ate", 16 | "tional": "tion", 17 | "enci": "ence", 18 | "anci": "ance", 19 | "izer": "ize", 20 | "bli": "ble", 21 | "alli": "al", 22 | "entli": "ent", 23 | "eli": "e", 24 | "ousli": "ous", 25 | "ization": "ize", 26 | "ation": "ate", 27 | "ator": "ate", 28 | "alism": "al", 29 | "iveness": "ive", 30 | "fulness": "ful", 31 | "ousness": "ous", 32 | "aliti": "al", 33 | "iviti": "ive", 34 | "biliti": "ble", 35 | "logi": "log", 36 | } 37 | 38 | _step3list = { 39 | "icate": "ic", 40 | "ative": "", 41 | "alize": "al", 42 | "iciti": "ic", 43 | "ical": "ic", 44 | "ful": "", 45 | "ness": "", 46 | } 47 | 48 | 49 | _cons = "[^aeiou]" 50 | _vowel = "[aeiouy]" 51 | _cons_seq = "[^aeiouy]+" 52 | _vowel_seq = "[aeiou]+" 53 | 54 | # m > 0 55 | _mgr0 = re.compile("^(" + _cons_seq + ")?" + _vowel_seq + _cons_seq) 56 | # m == 0 57 | _meq1 = re.compile( 58 | "^(" + _cons_seq + ")?" + _vowel_seq + _cons_seq + "(" + _vowel_seq + ")?$" 59 | ) 60 | # m > 1 61 | _mgr1 = re.compile( 62 | "^(" + _cons_seq + ")?" + _vowel_seq + _cons_seq + _vowel_seq + _cons_seq 63 | ) 64 | # vowel in stem 65 | _s_v = re.compile("^(" + _cons_seq + ")?" + _vowel) 66 | # ??? 67 | _c_v = re.compile("^" + _cons_seq + _vowel + "[^aeiouwxy]$") 68 | 69 | # Patterns used in the rules 70 | 71 | _ed_ing = re.compile("^(.*)(ed|ing)$") 72 | _at_bl_iz = re.compile("(at|bl|iz)$") 73 | _step1b = re.compile("([^aeiouylsz])\\1$") 74 | _step2 = re.compile( 75 | "^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$" 76 | ) 77 | _step3 = re.compile("^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$") 78 | _step4_1 = re.compile( 79 | "^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$" 80 | ) 81 | _step4_2 = re.compile("^(.+?)(s|t)(ion)$") 82 | _step5 = re.compile("^(.+?)e$") 83 | 84 | 85 | # Stemming function 86 | 87 | 88 | def stem(w): 89 | """Uses the Porter stemming algorithm to remove suffixes from English 90 | words. 
91 | 92 | >>> stem("fundamentally") 93 | "fundament" 94 | """ 95 | 96 | if len(w) < 3: 97 | return w 98 | 99 | first_is_y = w[0] == "y" 100 | if first_is_y: 101 | w = "Y" + w[1:] 102 | 103 | # Step 1a 104 | if w.endswith("s"): 105 | if w.endswith("sses"): 106 | w = w[:-2] 107 | elif w.endswith("ies"): 108 | w = w[:-2] 109 | elif w[-2] != "s": 110 | w = w[:-1] 111 | 112 | # Step 1b 113 | 114 | if w.endswith("eed"): 115 | s = w[:-3] 116 | if _mgr0.match(s): 117 | w = w[:-1] 118 | else: 119 | m = _ed_ing.match(w) 120 | if m: 121 | stem = m.group(1) 122 | if _s_v.match(stem): 123 | w = stem 124 | if _at_bl_iz.match(w): 125 | w += "e" 126 | elif _step1b.match(w): 127 | w = w[:-1] 128 | elif _c_v.match(w): 129 | w += "e" 130 | 131 | # Step 1c 132 | 133 | if w.endswith("y"): 134 | stem = w[:-1] 135 | if _s_v.match(stem): 136 | w = stem + "i" 137 | 138 | # Step 2 139 | 140 | m = _step2.match(w) 141 | if m: 142 | stem = m.group(1) 143 | suffix = m.group(2) 144 | if _mgr0.match(stem): 145 | w = stem + _step2list[suffix] 146 | 147 | # Step 3 148 | 149 | m = _step3.match(w) 150 | if m: 151 | stem = m.group(1) 152 | suffix = m.group(2) 153 | if _mgr0.match(stem): 154 | w = stem + _step3list[suffix] 155 | 156 | # Step 4 157 | 158 | m = _step4_1.match(w) 159 | if m: 160 | stem = m.group(1) 161 | if _mgr1.match(stem): 162 | w = stem 163 | else: 164 | m = _step4_2.match(w) 165 | if m: 166 | stem = m.group(1) + m.group(2) 167 | if _mgr1.match(stem): 168 | w = stem 169 | 170 | # Step 5 171 | 172 | m = _step5.match(w) 173 | if m: 174 | stem = m.group(1) 175 | if _mgr1.match(stem) or (_meq1.match(stem) and not _c_v.match(stem)): 176 | w = stem 177 | 178 | if w.endswith("ll") and _mgr1.match(w): 179 | w = w[:-1] 180 | 181 | if first_is_y: 182 | w = "y" + w[1:] 183 | 184 | return w 185 | -------------------------------------------------------------------------------- /src/whoosh/lang/snowball/LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (C) 2001-2012 NLTK Project 2 | 3 | Licensed under the Apache License, Version 2.0 (the 'License'); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an 'AS IS' BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /src/whoosh/lang/snowball/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2001-2012 NLTK Project 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the 'License'); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an 'AS IS' BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
# Natural Language Toolkit: Snowball Stemmer
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Peter Michael Stahl
#         Peter Ljunglof (revisions)
# Algorithms: Dr Martin Porter
# URL: http://www.nltk.org/
# For license information, see LICENSE.txt

# HJ 2012/07/19 adapted from https://github.com/kmike/nltk.git (branch 2and3)
# 2.0.1rc4-256-g45768f8

"""
This module provides a port of the Snowball stemmers developed by Martin
Porter.

At the moment, this port is able to stem words from fourteen languages: Danish,
Dutch, English, Finnish, French, German, Hungarian, Italian, Norwegian,
Portuguese, Romanian, Russian, Spanish and Swedish.

The algorithms have been developed by Martin Porter. These stemmers are called
Snowball, because he invented a programming language with this name for
creating new stemming algorithms. There is more information available at
http://snowball.tartarus.org/
"""

from .danish import DanishStemmer
from .dutch import DutchStemmer
from .english import EnglishStemmer
from .finnish import FinnishStemmer
from .french import FrenchStemmer
from .german import GermanStemmer
from .hungarian import HungarianStemmer
from .italian import ItalianStemmer
from .norwegian import NorwegianStemmer
from .portugese import PortugueseStemmer
from .romanian import RomanianStemmer
from .russian import RussianStemmer
from .spanish import SpanishStemmer
from .swedish import SwedishStemmer

# Map two-letter codes to stemming classes

classes = {
    "da": DanishStemmer,
    "nl": DutchStemmer,
    "en": EnglishStemmer,
    "fi": FinnishStemmer,
    "fr": FrenchStemmer,
    "de": GermanStemmer,
    "hu": HungarianStemmer,
    "it": ItalianStemmer,
    "no": NorwegianStemmer,
    "pt": PortugueseStemmer,
    "ro": RomanianStemmer,
    "ru": RussianStemmer,
    "es": SpanishStemmer,
    "sv": SwedishStemmer,
}
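# An illustrative sketch (not from the source) of driving the classes map
# above; the sample word is an arbitrary choice:
#
#     from whoosh.lang.snowball import classes
#     stemmer = classes["de"]()      # a GermanStemmer instance
#     stemmer.stem("kategorien")     # stems a German word
-------------------------------------------------------------------------------- /src/whoosh/lang/snowball/norwegian.py: --------------------------------------------------------------------------------
from .bases import _ScandinavianStemmer


class NorwegianStemmer(_ScandinavianStemmer):

    """
    The Norwegian Snowball stemmer.

    :cvar __vowels: The Norwegian vowels.
    :type __vowels: unicode
    :cvar __s_ending: Letters that may directly appear before a word final 's'.
    :type __s_ending: unicode
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.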
18 | :type __step3_suffixes: tuple 19 | :note: A detailed description of the Norwegian 20 | stemming algorithm can be found under 21 | http://snowball.tartarus.org/algorithms/norwegian/stemmer.html 22 | 23 | """ 24 | 25 | __vowels = "aeiouy\xE6\xE5\xF8" 26 | __s_ending = "bcdfghjlmnoprtvyz" 27 | __step1_suffixes = ( 28 | "hetenes", 29 | "hetene", 30 | "hetens", 31 | "heter", 32 | "heten", 33 | "endes", 34 | "ande", 35 | "ende", 36 | "edes", 37 | "enes", 38 | "erte", 39 | "ede", 40 | "ane", 41 | "ene", 42 | "ens", 43 | "ers", 44 | "ets", 45 | "het", 46 | "ast", 47 | "ert", 48 | "en", 49 | "ar", 50 | "er", 51 | "as", 52 | "es", 53 | "et", 54 | "a", 55 | "e", 56 | "s", 57 | ) 58 | 59 | __step2_suffixes = ("dt", "vt") 60 | 61 | __step3_suffixes = ( 62 | "hetslov", 63 | "eleg", 64 | "elig", 65 | "elov", 66 | "slov", 67 | "leg", 68 | "eig", 69 | "lig", 70 | "els", 71 | "lov", 72 | "ig", 73 | ) 74 | 75 | def stem(self, word): 76 | """ 77 | Stem a Norwegian word and return the stemmed form. 78 | 79 | :param word: The word that is stemmed. 80 | :type word: str or unicode 81 | :return: The stemmed form. 82 | :rtype: unicode 83 | 84 | """ 85 | word = word.lower() 86 | 87 | r1 = self._r1_scandinavian(word, self.__vowels) 88 | 89 | # STEP 1 90 | for suffix in self.__step1_suffixes: 91 | if r1.endswith(suffix): 92 | if suffix in ("erte", "ert"): 93 | word = "".join((word[: -len(suffix)], "er")) 94 | r1 = "".join((r1[: -len(suffix)], "er")) 95 | 96 | elif suffix == "s": 97 | if word[-2] in self.__s_ending or ( 98 | word[-2] == "k" and word[-3] not in self.__vowels 99 | ): 100 | word = word[:-1] 101 | r1 = r1[:-1] 102 | else: 103 | word = word[: -len(suffix)] 104 | r1 = r1[: -len(suffix)] 105 | break 106 | 107 | # STEP 2 108 | for suffix in self.__step2_suffixes: 109 | if r1.endswith(suffix): 110 | word = word[:-1] 111 | r1 = r1[:-1] 112 | break 113 | 114 | # STEP 3 115 | for suffix in self.__step3_suffixes: 116 | if r1.endswith(suffix): 117 | word = word[: -len(suffix)] 118 | break 119 | 120 | return word 121 | -------------------------------------------------------------------------------- /src/whoosh/lang/snowball/swedish.py: -------------------------------------------------------------------------------- 1 | from .bases import _ScandinavianStemmer 2 | 3 | 4 | class SwedishStemmer(_ScandinavianStemmer): 5 | 6 | """ 7 | The Swedish Snowball stemmer. 8 | 9 | :cvar __vowels: The Swedish vowels. 10 | :type __vowels: unicode 11 | :cvar __s_ending: Letters that may directly appear before a word final 's'. 12 | :type __s_ending: unicode 13 | :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. 14 | :type __step1_suffixes: tuple 15 | :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. 16 | :type __step2_suffixes: tuple 17 | :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. 
18 | :type __step3_suffixes: tuple 19 | :note: A detailed description of the Swedish 20 | stemming algorithm can be found under 21 | http://snowball.tartarus.org/algorithms/swedish/stemmer.html 22 | """ 23 | 24 | __vowels = "aeiouy\xE4\xE5\xF6" 25 | __s_ending = "bcdfghjklmnoprtvy" 26 | __step1_suffixes = ( 27 | "heterna", 28 | "hetens", 29 | "heter", 30 | "heten", 31 | "anden", 32 | "arnas", 33 | "ernas", 34 | "ornas", 35 | "andes", 36 | "andet", 37 | "arens", 38 | "arna", 39 | "erna", 40 | "orna", 41 | "ande", 42 | "arne", 43 | "aste", 44 | "aren", 45 | "ades", 46 | "erns", 47 | "ade", 48 | "are", 49 | "ern", 50 | "ens", 51 | "het", 52 | "ast", 53 | "ad", 54 | "en", 55 | "ar", 56 | "er", 57 | "or", 58 | "as", 59 | "es", 60 | "at", 61 | "a", 62 | "e", 63 | "s", 64 | ) 65 | __step2_suffixes = ("dd", "gd", "nn", "dt", "gt", "kt", "tt") 66 | __step3_suffixes = ("fullt", "l\xF6st", "els", "lig", "ig") 67 | 68 | def stem(self, word): 69 | """ 70 | Stem a Swedish word and return the stemmed form. 71 | 72 | :param word: The word that is stemmed. 73 | :type word: str or unicode 74 | :return: The stemmed form. 75 | :rtype: unicode 76 | 77 | """ 78 | word = word.lower() 79 | 80 | r1 = self._r1_scandinavian(word, self.__vowels) 81 | 82 | # STEP 1 83 | for suffix in self.__step1_suffixes: 84 | if r1.endswith(suffix): 85 | if suffix == "s": 86 | if word[-2] in self.__s_ending: 87 | word = word[:-1] 88 | r1 = r1[:-1] 89 | else: 90 | word = word[: -len(suffix)] 91 | r1 = r1[: -len(suffix)] 92 | break 93 | 94 | # STEP 2 95 | for suffix in self.__step2_suffixes: 96 | if r1.endswith(suffix): 97 | word = word[:-1] 98 | r1 = r1[:-1] 99 | break 100 | 101 | # STEP 3 102 | for suffix in self.__step3_suffixes: 103 | if r1.endswith(suffix): 104 | if suffix in ("els", "lig", "ig"): 105 | word = word[: -len(suffix)] 106 | elif suffix in ("fullt", "l\xF6st"): 107 | word = word[:-1] 108 | break 109 | 110 | return word 111 | -------------------------------------------------------------------------------- /src/whoosh/legacy.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012 Matt Chaput. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # 13 | # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
23 | # 24 | # The views and conclusions contained in the software and documentation are 25 | # those of the authors and should not be interpreted as representing official 26 | # policies, either expressed or implied, of Matt Chaput. 27 | 28 | """ 29 | This module contains code for maintaining backwards compatibility with old 30 | index formats. 31 | """ 32 | 33 | from whoosh.util.loading import RenamingUnpickler 34 | 35 | 36 | def load_110_toc(stream, gen, schema, version): 37 | # Between version -110 and version -111, I reorganized the modules and 38 | # changed the implementation of the NUMERIC field, so we have to change the 39 | # classes the unpickler tries to load if we need to read an old schema 40 | 41 | # Read the length of the pickled schema 42 | picklen = stream.read_varint() 43 | if schema: 44 | # If the user passed us a schema, use it and skip the one on disk 45 | stream.seek(picklen, 1) 46 | else: 47 | # Remap the old classes and functions to their moved versions as we 48 | # unpickle the schema 49 | scuts = { 50 | "wf": "whoosh.fields", 51 | "wsn": "whoosh.support.numeric", 52 | "wcw2": "whoosh.codec.whoosh2", 53 | } 54 | objmap = { 55 | "%(wf)s.NUMERIC": "%(wcw2)s.OLD_NUMERIC", 56 | "%(wf)s.DATETIME": "%(wcw2)s.OLD_DATETIME", 57 | "%(wsn)s.int_to_text": "%(wcw2)s.int_to_text", 58 | "%(wsn)s.text_to_int": "%(wcw2)s.text_to_int", 59 | "%(wsn)s.long_to_text": "%(wcw2)s.long_to_text", 60 | "%(wsn)s.text_to_long": "%(wcw2)s.text_to_long", 61 | "%(wsn)s.float_to_text": "%(wcw2)s.float_to_text", 62 | "%(wsn)s.text_to_float": "%(wcw2)s.text_to_float", 63 | } 64 | ru = RenamingUnpickler(stream, objmap, shortcuts=scuts) 65 | schema = ru.load() 66 | # Read the generation number 67 | index_gen = stream.read_int() 68 | assert gen == index_gen 69 | # Unused number 70 | _ = stream.read_int() 71 | # Unpickle the list of segment objects 72 | segments = stream.read_pickle() 73 | return schema, segments 74 | 75 | 76 | # Map TOC version numbers to functions to load that version 77 | toc_loaders = {-110: load_110_toc} 78 | 79 | 80 | # Map segment class names to functions to load the segment 81 | segment_loaders = {} 82 | -------------------------------------------------------------------------------- /src/whoosh/matching/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012 Matt Chaput. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # 13 | # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO 16 | # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | # 24 | # The views and conclusions contained in the software and documentation are 25 | # those of the authors and should not be interpreted as representing official 26 | # policies, either expressed or implied, of Matt Chaput. 27 | 28 | from whoosh.matching.binary import ( 29 | AdditiveBiMatcher, 30 | AndMaybeMatcher, 31 | AndNotMatcher, 32 | BiMatcher, 33 | DisjunctionMaxMatcher, 34 | IntersectionMatcher, 35 | UnionMatcher, 36 | ) 37 | from whoosh.matching.combo import ( 38 | ArrayUnionMatcher, 39 | CombinationMatcher, 40 | PreloadedUnionMatcher, 41 | ) 42 | from whoosh.matching.mcore import ( 43 | ConstantScoreMatcher, 44 | LeafMatcher, 45 | ListMatcher, 46 | Matcher, 47 | NoQualityAvailable, 48 | NullMatcher, 49 | NullMatcherClass, 50 | ReadTooFar, 51 | ) 52 | from whoosh.matching.wrappers import ( 53 | ConstantScoreWrapperMatcher, 54 | CoordMatcher, 55 | ExcludeMatcher, 56 | FilterMatcher, 57 | InverseMatcher, 58 | MultiMatcher, 59 | RequireMatcher, 60 | SingleTermMatcher, 61 | WrappingMatcher, 62 | ) 63 | -------------------------------------------------------------------------------- /src/whoosh/qparser/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2010 Matt Chaput. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # 13 | # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | # 24 | # The views and conclusions contained in the software and documentation are 25 | # those of the authors and should not be interpreted as representing official 26 | # policies, either expressed or implied, of Matt Chaput. 
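# A sketch (illustrative, not from the source) of combining the matchers
# re-exported by whoosh.matching above; it assumes ListMatcher accepts a
# sorted list of document ids and that UnionMatcher merges two sub-matchers:
#
#     from whoosh.matching import ListMatcher, UnionMatcher
#     m = UnionMatcher(ListMatcher([1, 3, 5]), ListMatcher([2, 3, 8]))
#     while m.is_active():
#         print(m.id())   # 1, 2, 3, 5, 8 -- each matching doc id once
#         m.next()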
27 | 28 | from whoosh.qparser.default import ( 29 | DisMaxParser, 30 | MultifieldParser, 31 | QueryParser, 32 | SimpleParser, 33 | ) 34 | from whoosh.qparser.plugins import ( 35 | BoostPlugin, 36 | CopyFieldPlugin, 37 | EveryPlugin, 38 | FieldAliasPlugin, 39 | FieldsPlugin, 40 | FunctionPlugin, 41 | FuzzyTermPlugin, 42 | GroupPlugin, 43 | GtLtPlugin, 44 | MultifieldPlugin, 45 | OperatorsPlugin, 46 | PhrasePlugin, 47 | Plugin, 48 | PlusMinusPlugin, 49 | PrefixPlugin, 50 | PseudoFieldPlugin, 51 | RangePlugin, 52 | RegexPlugin, 53 | RegexTagger, 54 | SequencePlugin, 55 | SingleQuotePlugin, 56 | TaggingPlugin, 57 | WhitespacePlugin, 58 | WildcardPlugin, 59 | ) 60 | from whoosh.qparser.syntax import ( 61 | AndGroup, 62 | AndMaybeGroup, 63 | AndNotGroup, 64 | BinaryGroup, 65 | DisMaxGroup, 66 | ErrorNode, 67 | FieldnameNode, 68 | GroupNode, 69 | InfixOperator, 70 | MarkerNode, 71 | NotGroup, 72 | Operator, 73 | OrderedGroup, 74 | OrGroup, 75 | PostfixOperator, 76 | PrefixOperator, 77 | RequireGroup, 78 | SyntaxNode, 79 | TextNode, 80 | Whitespace, 81 | WordNode, 82 | Wrapper, 83 | ) 84 | -------------------------------------------------------------------------------- /src/whoosh/qparser/common.py: -------------------------------------------------------------------------------- 1 | # Copyright 2010 Matt Chaput. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # 13 | # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | # 24 | # The views and conclusions contained in the software and documentation are 25 | # those of the authors and should not be interpreted as representing official 26 | # policies, either expressed or implied, of Matt Chaput. 27 | 28 | """ 29 | This module contains common utility objects/functions for the other query 30 | parser modules. 
"""

import sys


class QueryParserError(Exception):
    def __init__(self, cause, msg=None):
        super().__init__(str(cause))
        self.cause = cause
        self.msg = msg  # keep the optional message instead of discarding it


def get_single_text(field, text, **kwargs):
    """Returns the first token from an analyzer's output."""

    for t in field.process_text(text, mode="query", **kwargs):
        return t


def attach(q, stxnode):
    if q:
        try:
            q.startchar = stxnode.startchar
            q.endchar = stxnode.endchar
        except AttributeError:
            raise AttributeError(f"Can't set attribute on {q.__class__.__name__}")
    return q


def print_debug(level, msg, out=sys.stderr):
    if level:
        out.write(f"{' ' * (level - 1)}{msg}\n")
-------------------------------------------------------------------------------- /src/whoosh/qparser/taggers.py: --------------------------------------------------------------------------------
# Copyright 2011 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from whoosh.util.text import rcompile

# Tagger objects


class Tagger:
    """Base class for taggers, objects which match syntax in the query string
    and translate it into a :class:`whoosh.qparser.syntax.SyntaxNode` object.
    """

    def match(self, parser, text, pos):
        """This method should see if this tagger matches the query string at
        the given position. If it matches, it should return a
        :class:`whoosh.qparser.syntax.SyntaxNode` object; if it doesn't match,
        it should return None.

        :param parser: the :class:`whoosh.qparser.default.QueryParser` object.
        :param text: the text being parsed.
        :param pos: the position in the text at which the tagger should try to
            match.
        """

        raise NotImplementedError


class RegexTagger(Tagger):
    """Tagger class that uses regular expressions to match the query string.
    Subclasses should override ``create()`` instead of ``match()``.
54 | """ 55 | 56 | def __init__(self, expr): 57 | self.expr = rcompile(expr) 58 | 59 | def match(self, parser, text, pos): 60 | match = self.expr.match(text, pos) 61 | if match: 62 | node = self.create(parser, match) 63 | if node is not None: 64 | node = node.set_range(match.start(), match.end()) 65 | return node 66 | else: 67 | return None 68 | 69 | def create(self, parser, match): 70 | """When the regular expression matches, this method is called to 71 | translate the regex match object into a syntax node. 72 | 73 | :param parser: the :class:`whoosh.qparser.default.QueryParser` object. 74 | :param match: the regex match object. 75 | """ 76 | 77 | raise NotImplementedError 78 | 79 | 80 | class FnTagger(RegexTagger): 81 | """Tagger that takes a regular expression and a class or function, and for 82 | matches calls the class/function with the regex match's named groups as 83 | keyword arguments. 84 | """ 85 | 86 | def __init__(self, expr, fn, memo=""): 87 | RegexTagger.__init__(self, expr) 88 | self.fn = fn 89 | self.memo = memo 90 | 91 | def __repr__(self): 92 | return f"<{self.__class__.__name__} {self.expr!r} ({self.memo})>" 93 | 94 | def create(self, parser, match): 95 | return self.fn(**match.groupdict()) 96 | -------------------------------------------------------------------------------- /src/whoosh/query/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012 Matt Chaput. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # 13 | # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | # 24 | # The views and conclusions contained in the software and documentation are 25 | # those of the authors and should not be interpreted as representing official 26 | # policies, either expressed or implied, of Matt Chaput. 
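# A sketch (illustrative, not from the source) of how the qparser package
# above produces the query objects re-exported below; "title" is an assumed
# field name, and the schema is omitted (QueryParser accepts schema=None):
#
#     from whoosh.qparser import QueryParser, FuzzyTermPlugin
#     parser = QueryParser("title", schema=None)
#     parser.add_plugin(FuzzyTermPlugin())
#     q = parser.parse("render table~2")  # an And of Term/FuzzyTerm queries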
27 | 28 | 29 | from whoosh.query.compound import ( 30 | And, 31 | AndMaybe, 32 | AndNot, 33 | BinaryQuery, 34 | BooleanQuery, 35 | CompoundQuery, 36 | DefaultOr, 37 | DisjunctionMax, 38 | Or, 39 | Otherwise, 40 | PreloadedOr, 41 | Require, 42 | SplitOr, 43 | ) 44 | from whoosh.query.nested import NestedChildren, NestedParent 45 | from whoosh.query.positional import Ordered, Phrase, Sequence 46 | from whoosh.query.qcolumns import ColumnMatcher, ColumnQuery 47 | from whoosh.query.qcore import ( 48 | Every, 49 | Highest, 50 | Lowest, 51 | NullQuery, 52 | Query, 53 | QueryError, 54 | _NullQuery, 55 | error_query, 56 | token_lists, 57 | ) 58 | from whoosh.query.ranges import DateRange, NumericRange, RangeMixin, TermRange 59 | from whoosh.query.spans import ( 60 | Span, 61 | SpanBefore, 62 | SpanBiMatcher, 63 | SpanBiQuery, 64 | SpanCondition, 65 | SpanContains, 66 | SpanFirst, 67 | SpanNear, 68 | SpanNear2, 69 | SpanNot, 70 | SpanOr, 71 | SpanQuery, 72 | SpanWrappingMatcher, 73 | WrappingSpan, 74 | ) 75 | from whoosh.query.terms import ( 76 | ExpandingTerm, 77 | FuzzyTerm, 78 | MultiTerm, 79 | PatternQuery, 80 | Prefix, 81 | Regex, 82 | Term, 83 | Variations, 84 | Wildcard, 85 | ) 86 | from whoosh.query.wrappers import ConstantScoreQuery, Not, WeightingQuery, WrappingQuery 87 | -------------------------------------------------------------------------------- /src/whoosh/query/qcolumns.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012 Matt Chaput. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # 13 | # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | # 24 | # The views and conclusions contained in the software and documentation are 25 | # those of the authors and should not be interpreted as representing official 26 | # policies, either expressed or implied, of Matt Chaput. 27 | 28 | from whoosh.matching import ConstantScoreMatcher, NullMatcher, ReadTooFar 29 | from whoosh.query.qcore import Query 30 | 31 | 32 | class ColumnQuery(Query): 33 | """A query that matches per-document values stored in a column rather than 34 | terms in the inverted index. 35 | 36 | This may be useful in special circumstances, but note that this is MUCH 37 | SLOWER than searching an indexed field. 
"""

    def __init__(self, fieldname, condition):
        """
        :param fieldname: the name of the field to look in. If the field does
            not have a column, this query will not match anything.
        :param condition: if this is a callable, it is called on each value
            in the column, and documents for which ``condition(docvalue)``
            returns True are returned as matching documents. If it is not a
            callable, the document values are compared to it (using ``==``).
        """

        self.fieldname = fieldname
        self.condition = condition

    def is_leaf(self):
        return True

    def matcher(self, searcher, context=None):
        fieldname = self.fieldname
        condition = self.condition
        if callable(condition):
            comp = condition
        else:

            def comp(v):
                # Made this a function instead of a lambda so I could put
                # debug prints here if necessary ;)
                return v == condition

        reader = searcher.reader()
        if not reader.has_column(fieldname):
            return NullMatcher()

        creader = reader.column_reader(fieldname)
        return ColumnMatcher(creader, comp)


class ColumnMatcher(ConstantScoreMatcher):
    def __init__(self, creader, condition):
        # Initialize the base class so self._score exists for
        # skip_to_quality() below (the original skipped this call)
        super().__init__()
        self.creader = creader
        self.condition = condition
        self._i = 0
        self._find_next()

    def _find_next(self):
        condition = self.condition
        creader = self.creader

        while self._i < len(creader) and not condition(creader[self._i]):
            self._i += 1

    def is_active(self):
        return self._i < len(self.creader)

    def next(self):
        if not self.is_active():
            raise ReadTooFar
        self._i += 1
        self._find_next()

    def reset(self):
        self._i = 0
        self._find_next()

    def id(self):
        return self._i

    def all_ids(self):
        condition = self.condition
        for docnum, v in enumerate(self.creader):
            if condition(v):
                yield docnum

    def supports(self, astype):
        return False

    def skip_to_quality(self, minquality):
        if self._score <= minquality:
            self._i = len(self.creader)
            return True
-------------------------------------------------------------------------------- /src/whoosh/support/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sygil-Dev/whoosh-reloaded/ab775ca5bfb443c2b3598b6a2d6189e595f7c377/src/whoosh/support/__init__.py -------------------------------------------------------------------------------- /src/whoosh/support/base85.py: --------------------------------------------------------------------------------
"""
This module contains generic base85 encoding and decoding functions. The
whoosh.util.numeric module contains faster variants for encoding and
decoding integers.

Modified from:
http://paste.lisp.org/display/72815
"""

import struct

# Instead of using the character set from the ascii85 algorithm, I put the
# characters in order so that the encoded text sorts properly (my life would be
# a lot easier if they had just done that from the start)
b85chars = (
    "!$%&*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "^_abcdefghijklmnopqrstuvwxyz{|}~"
)
b85dec = {}
for i in range(len(b85chars)):
    b85dec[b85chars[i]] = i


# Integer encoding and decoding functions


def to_base85(x, islong=False):
    "Encodes the given integer using base 85."

    size = 10 if islong else 5
    rems = ""
    for i in range(size):
        rems = b85chars[x % 85] + rems
        x //= 85
    return rems


def from_base85(text):
    "Decodes the given base 85 text into an integer."

    acc = 0
    for c in text:
        acc = acc * 85 + b85dec[c]
    return acc


# Bytes encoding and decoding functions


def b85encode(text, pad=False):
    # "text" is expected to be a bytes object; the encoded result is a str
    l = len(text)
    r = l % 4
    if r:
        text += b"\x00" * (4 - r)  # pad to a multiple of 4 bytes
    longs = len(text) >> 2
    out = []
    words = struct.unpack(">" + "L" * longs, text[0 : longs * 4])
    for word in words:
        rems = [0, 0, 0, 0, 0]
        for i in range(4, -1, -1):
            rems[i] = b85chars[word % 85]
            word //= 85  # floor division (the original Python 2 "/=" breaks here)
        out.extend(rems)

    out = "".join(out)
    if pad:
        return out

    # Trim padding
    olen = l % 4
    if olen:
        olen += 1
    olen += l // 4 * 5  # floor division so the slice index stays an integer
    return out[0:olen]


def b85decode(text):
    l = len(text)
    out = []
    for i in range(0, len(text), 5):
        chunk = text[i : i + 5]
        acc = 0
        for j in range(len(chunk)):
            try:
                acc = acc * 85 + b85dec[chunk[j]]
            except KeyError:
                raise TypeError("Bad base85 character at byte %d" % (i + j))
        if acc > 4294967295:
            raise OverflowError("Base85 overflow in hunk starting at byte %d" % i)
        out.append(acc)

    # Pad final chunk if necessary
    cl = l % 5
    if cl:
        acc *= 85 ** (5 - cl)
        if cl > 1:
            acc += 0xFFFFFF >> (cl - 2) * 8
        out[-1] = acc

    out = struct.pack(">" + "L" * ((l + 4) // 5), *out)  # floor division, as above
    if cl:
        out = out[: -(5 - cl)]

    return out
-------------------------------------------------------------------------------- /src/whoosh/support/bitstream.py: --------------------------------------------------------------------------------
"""

From a post by Patrick Maupin on the Python mailing list:
http://mail.python.org/pipermail/python-list/2003-November/237481.html
"""

from array import array

from whoosh.system import _LONG_SIZE

_bitsperlong = _LONG_SIZE * 8


class BitStreamReader:
    def __init__(self, source):
        self._totalbits = len(source) * _bitsperlong
        self._position = 0

        # Pad to longword boundary, then make an array; "source" is expected
        # to be a bytes object
        source += b"\x00" * (-len(source) % _LONG_SIZE)
        bits = array("L")
        bits.frombytes(source)  # array.fromstring() no longer exists in Python 3
        self._bitstream = bits

    def seek(self, offset):
        self._position = offset

    def tell(self):
        return self._position

    def read(self, numbits):
        position = self._position

        if position < 0 or position + numbits > self._totalbits:
            # the original "raise (IndexError, ...)" raised a tuple, which is
            # a TypeError on Python 3
            raise IndexError("Invalid bitarray._position/numbits")

        longaddress, bitoffset = divmod(position,
_bitsperlong) 39 | 40 | # We may read bits in the final word after ones we care 41 | # about, so create a mask to remove them later. 42 | 43 | finalmask = (1 << numbits) - 1 44 | 45 | # We may read bits in the first word before the ones we 46 | # care about, so bump the total bits to read by this 47 | # amount, so we read enough higher-order bits. 48 | 49 | numbits += bitoffset 50 | 51 | # Read and concatenate every long containing a bit we need 52 | 53 | outval, outshift = 0, 0 54 | while numbits > 0: 55 | outval += self._bitstream[longaddress] << outshift 56 | longaddress += 1 57 | outshift += _bitsperlong 58 | numbits -= _bitsperlong 59 | 60 | # numbits is now basically a negative number which tells us 61 | # how many bits to back up from our current position. 62 | 63 | self._position = longaddress * _bitsperlong + numbits 64 | 65 | # Shift right to strip off the low-order bits we 66 | # don't want, then 'and' with the mask to strip 67 | # off the high-order bits we don't want. 68 | 69 | return (outval >> bitoffset) & finalmask 70 | -------------------------------------------------------------------------------- /src/whoosh/support/levenshtein.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains functions implementing edit distance algorithms. 3 | """ 4 | 5 | 6 | def levenshtein(seq1, seq2, limit=None): 7 | """Returns the Levenshtein edit distance between two strings.""" 8 | 9 | oneago = None 10 | thisrow = list(range(1, len(seq2) + 1)) + [0] 11 | for x in range(len(seq1)): 12 | # Python lists wrap around for negative indices, so put the 13 | # leftmost column at the *end* of the list. This matches with 14 | # the zero-indexed strings and saves extra calculation. 15 | oneago, thisrow = thisrow, [0] * len(seq2) + [x + 1] 16 | for y in range(len(seq2)): 17 | delcost = oneago[y] + 1 18 | addcost = thisrow[y - 1] + 1 19 | subcost = oneago[y - 1] + (seq1[x] != seq2[y]) 20 | thisrow[y] = min(delcost, addcost, subcost) 21 | 22 | if limit and x > limit and min(thisrow) > limit: 23 | return limit + 1 24 | 25 | return thisrow[len(seq2) - 1] 26 | 27 | 28 | def damerau_levenshtein(seq1, seq2, limit=None): 29 | """Returns the Damerau-Levenshtein edit distance between two strings.""" 30 | 31 | oneago = None 32 | thisrow = list(range(1, len(seq2) + 1)) + [0] 33 | for x in range(len(seq1)): 34 | # Python lists wrap around for negative indices, so put the 35 | # leftmost column at the *end* of the list. This matches with 36 | # the zero-indexed strings and saves extra calculation. 37 | twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1] 38 | for y in range(len(seq2)): 39 | delcost = oneago[y] + 1 40 | addcost = thisrow[y - 1] + 1 41 | subcost = oneago[y - 1] + (seq1[x] != seq2[y]) 42 | thisrow[y] = min(delcost, addcost, subcost) 43 | # This block deals with transpositions 44 | if ( 45 | x > 0 46 | and y > 0 47 | and seq1[x] == seq2[y - 1] 48 | and seq1[x - 1] == seq2[y] 49 | and seq1[x] != seq2[y] 50 | ): 51 | thisrow[y] = min(thisrow[y], twoago[y - 2] + 1) 52 | 53 | if limit and x > limit and min(thisrow) > limit: 54 | return limit + 1 55 | 56 | return thisrow[len(seq2) - 1] 57 | 58 | 59 | def relative(a, b): 60 | """Returns the relative distance between two strings, in the range 61 | [0-1] where 1 means total equality. 
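    For instance (an illustrative doctest, not from the source; it assumes
    ``distance`` is the Damerau-Levenshtein function aliased below):

    >>> relative("whoosh", "whoosh")
    1.0
    >>> round(relative("whoosh", "whosh"), 3)
    0.694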
"""

    d = distance(a, b)
    longer = float(max((len(a), len(b))))
    shorter = float(min((len(a), len(b))))
    r = ((longer - d) / longer) * (shorter / longer)
    return r


distance = damerau_levenshtein
-------------------------------------------------------------------------------- /src/whoosh/system.py: --------------------------------------------------------------------------------
# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

import sys
from struct import Struct, calcsize

IS_LITTLE = sys.byteorder == "little"

_INT_SIZE = calcsize("!i")
_SHORT_SIZE = calcsize("!H")
_LONG_SIZE = calcsize("!Q")
_FLOAT_SIZE = calcsize("!f")
_DOUBLE_SIZE = calcsize("!d")

_byte_struct = Struct("!B")
_sbyte_struct = Struct("!b")
_ushort_struct = Struct("!H")
_int_struct = Struct("!i")
_uint_struct = Struct("!I")
_long_struct = Struct("!q")
_ulong_struct = Struct("!Q")
_float_struct = Struct("!f")
_double_struct = Struct("!d")
_ushort_le_struct = Struct("<H")

[... the remainder of system.py -- the pack_*/unpack_* aliases built from the
structs above (e.g. pack_ushort = _ushort_struct.pack, unpack_uint =
_uint_struct.unpack), which other modules import from whoosh.system -- was
garbled away by angle-bracket stripping during extraction ...]
-------------------------------------------------------------------------------- /src/whoosh/util/__init__.py: --------------------------------------------------------------------------------
[... license header and imports (including "from bisect import insort" and
"from functools import wraps", both used below) lost in the same stripping ...]


def make_binary_tree(fn, args, **kwargs):
    """Takes a function/class that takes two positional arguments and a list
    of arguments, and returns a binary tree of results/instances.

    >>> make_binary_tree(UnionMatcher, [matcher1, matcher2, matcher3])
    UnionMatcher(matcher1, UnionMatcher(matcher2, matcher3))

    Any keyword arguments given to this function are passed to the class
    initializer.
    """

    count = len(args)
    if not count:
        raise ValueError("Called make_binary_tree with empty list")
    elif count == 1:
        return args[0]

    half = count // 2
    return fn(
        make_binary_tree(fn, args[:half], **kwargs),
        make_binary_tree(fn, args[half:], **kwargs),
        **kwargs,
    )


def make_weighted_tree(fn, ls, **kwargs):
    """Takes a function/class that takes two positional arguments and a list of
    (weight, argument) tuples and returns a huffman-like weighted tree of
    results/instances.
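    For instance (an illustrative doctest, not from the source), combining the
    two lightest entries first, huffman-style:

    >>> make_weighted_tree(lambda a, b: (a, b), [(1, "a"), (2, "b"), (4, "c")])
    (('a', 'b'), 'c')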
84 | """ 85 | 86 | if not ls: 87 | raise ValueError("Called make_weighted_tree with empty list") 88 | 89 | ls.sort() 90 | while len(ls) > 1: 91 | a = ls.pop(0) 92 | b = ls.pop(0) 93 | insort(ls, (a[0] + b[0], fn(a[1], b[1]))) 94 | return ls[0][1] 95 | 96 | 97 | # Fibonacci function 98 | 99 | _fib_cache = {} 100 | 101 | 102 | def fib(n): 103 | """Returns the nth value in the Fibonacci sequence.""" 104 | 105 | if n <= 2: 106 | return n 107 | if n in _fib_cache: 108 | return _fib_cache[n] 109 | result = fib(n - 1) + fib(n - 2) 110 | _fib_cache[n] = result 111 | return result 112 | 113 | 114 | # Decorators 115 | 116 | 117 | def synchronized(func): 118 | """Decorator for storage-access methods, which synchronizes on a threading 119 | lock. The parent object must have 'is_closed' and '_sync_lock' attributes. 120 | """ 121 | 122 | @wraps(func) 123 | def synchronized_wrapper(self, *args, **kwargs): 124 | with self._sync_lock: 125 | return func(self, *args, **kwargs) 126 | 127 | return synchronized_wrapper 128 | 129 | 130 | def unclosed(method): 131 | """ 132 | Decorator to check if the object is closed. 133 | """ 134 | 135 | @wraps(method) 136 | def unclosed_wrapper(self, *args, **kwargs): 137 | if self.closed: 138 | raise ValueError("Operation on a closed object") 139 | return method(self, *args, **kwargs) 140 | 141 | return unclosed_wrapper 142 | -------------------------------------------------------------------------------- /src/whoosh/util/cache.py: -------------------------------------------------------------------------------- 1 | # Copyright 2007 Matt Chaput. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # 13 | # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | # 24 | # The views and conclusions contained in the software and documentation are 25 | # those of the authors and should not be interpreted as representing official 26 | # policies, either expressed or implied, of Matt Chaput. 
27 | 28 | 29 | import functools 30 | from collections import Counter 31 | from heapq import nsmallest 32 | from operator import itemgetter 33 | 34 | 35 | def unbound_cache(func): 36 | """Caching decorator with an unbounded cache size.""" 37 | 38 | cache = {} 39 | 40 | @functools.wraps(func) 41 | def caching_wrapper(*args): 42 | try: 43 | return cache[args] 44 | except KeyError: 45 | result = func(*args) 46 | cache[args] = result 47 | return result 48 | 49 | return caching_wrapper 50 | 51 | 52 | def lfu_cache(maxsize=100): 53 | """A simple cache that, when the cache is full, deletes the least frequently 54 | used 10% of the cached values. 55 | 56 | This function duplicates (more-or-less) the protocol of the 57 | ``functools.lru_cache`` decorator in the Python 3.2 standard library. 58 | 59 | Arguments to the cached function must be hashable. 60 | 61 | View the cache statistics tuple ``(hits, misses, maxsize, currsize)`` 62 | with f.cache_info(). Clear the cache and statistics with f.cache_clear(). 63 | Access the underlying function with f.__wrapped__. 64 | """ 65 | 66 | def decorating_function(user_function): 67 | stats = [0, 0] # Hits, misses 68 | data = {} 69 | usecount = Counter() 70 | 71 | @functools.wraps(user_function) 72 | def wrapper(*args): 73 | try: 74 | result = data[args] 75 | stats[0] += 1 # Hit 76 | except KeyError: 77 | stats[1] += 1 # Miss 78 | if len(data) == maxsize: 79 | for k, _ in nsmallest( 80 | maxsize // 10 or 1, usecount.items(), key=itemgetter(1) 81 | ): 82 | del data[k] 83 | del usecount[k] 84 | data[args] = user_function(*args) 85 | result = data[args] 86 | finally: 87 | usecount[args] += 1 88 | return result 89 | 90 | def cache_info(): 91 | return stats[0], stats[1], maxsize, len(data) 92 | 93 | def cache_clear(): 94 | data.clear() 95 | usecount.clear() 96 | 97 | wrapper.cache_info = cache_info 98 | wrapper.cache_clear = cache_clear 99 | return wrapper 100 | 101 | return decorating_function 102 | -------------------------------------------------------------------------------- /src/whoosh/util/loading.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012 Matt Chaput. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # 13 | # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
23 | # 24 | # The views and conclusions contained in the software and documentation are 25 | # those of the authors and should not be interpreted as representing official 26 | # policies, either expressed or implied, of Matt Chaput. 27 | 28 | import pickle 29 | 30 | 31 | class RenamingUnpickler(pickle.Unpickler): 32 | """Subclasses ``pickle.Unpickler`` to allow remapping of class names before 33 | loading them. 34 | """ 35 | 36 | def __init__(self, f, objmap, shortcuts=None): 37 | pickle.Unpickler.__init__(self, f) 38 | 39 | if shortcuts: 40 | objmap = {k % shortcuts: v % shortcuts for k, v in objmap.items()} 41 | self._objmap = objmap 42 | 43 | def find_class(self, modulename, objname): 44 | fqname = f"{modulename}.{objname}" 45 | if fqname in self._objmap: 46 | fqname = self._objmap[fqname] 47 | try: 48 | obj = find_object(fqname) 49 | except ImportError: 50 | raise ImportError(f"Couldn't find {fqname!r}") 51 | return obj 52 | 53 | 54 | def find_object(name, blacklist=None, whitelist=None): 55 | """Imports and returns an object given a fully qualified name. 56 | 57 | >>> find_object("whoosh.analysis.StopFilter") 58 | 59 | """ 60 | 61 | if blacklist: 62 | for pre in blacklist: 63 | if name.startswith(pre): 64 | raise TypeError( 65 | f"{name!r}: can't instantiate names starting with {pre!r}" 66 | ) 67 | if whitelist: 68 | passes = False 69 | for pre in whitelist: 70 | if name.startswith(pre): 71 | passes = True 72 | break 73 | if not passes: 74 | raise TypeError(f"Can't instantiate {name!r}") 75 | 76 | lastdot = name.rfind(".") 77 | 78 | assert lastdot > -1, f"Name {name!r} must be fully qualified" 79 | modname = name[:lastdot] 80 | clsname = name[lastdot + 1 :] 81 | 82 | mod = __import__(modname, fromlist=[clsname]) 83 | cls = getattr(mod, clsname) 84 | return cls 85 | -------------------------------------------------------------------------------- /src/whoosh/util/text.py: -------------------------------------------------------------------------------- 1 | # Copyright 2007 Matt Chaput. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # 13 | # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
23 | # 24 | # The views and conclusions contained in the software and documentation are 25 | # those of the authors and should not be interpreted as representing official 26 | # policies, either expressed or implied, of Matt Chaput. 27 | 28 | import codecs 29 | import re 30 | 31 | # Note: these functions return a tuple of (text, length), so when you call 32 | # them, you have to add [0] on the end, e.g. str = utf8encode(unicode)[0] 33 | 34 | utf8encode = codecs.getencoder("utf-8") 35 | utf8decode = codecs.getdecoder("utf-8") 36 | 37 | 38 | # Prefix encoding functions 39 | 40 | 41 | def byte(num): 42 | return bytes((num,)) 43 | 44 | 45 | def first_diff(a, b): 46 | """ 47 | Returns the position of the first differing character in the sequences a 48 | and b. For example, first_diff('render', 'rending') == 4. This function 49 | limits the return value to 255 so the difference can be encoded in a single 50 | byte. 51 | """ 52 | 53 | i = 0 54 | while i < 255 and i < len(a) and i < len(b) and a[i] == b[i]: 55 | i += 1 56 | return i 57 | 58 | 59 | def prefix_encode(a, b): 60 | """ 61 | Compresses bytestring b as a byte representing the prefix it shares with a, 62 | followed by the suffix bytes. 63 | """ 64 | 65 | i = first_diff(a, b) 66 | return byte(i) + b[i:] 67 | 68 | 69 | def prefix_encode_all(ls): 70 | """Compresses the given list of (unicode) strings by storing each string 71 | as a byte giving the length of the prefix it shares with its predecessor 72 | (zero for the first string), followed by the suffix encoded 73 | as UTF-8. 74 | """ 75 | 76 | last = "" 77 | for w in ls: 78 | i = first_diff(last, w) 79 | yield byte(i) + w[i:].encode("utf-8")  # byte(), not chr(): the output must be bytes 80 | last = w 81 | 82 | 83 | def prefix_decode_all(ls): 84 | """Decompresses a sequence of bytestrings compressed by prefix_encode_all().""" 85 | 86 | last = "" 87 | for w in ls: 88 | i = w[0]  # indexing a bytestring yields an int on Python 3 89 | decoded = last[:i] + w[1:].decode("utf-8") 90 | yield decoded 91 | last = decoded 92 | 93 | 94 | # Natural key sorting function 95 | 96 | _nkre = re.compile(r"\D+|\d+", re.UNICODE) 97 | 98 | 99 | def _nkconv(i): 100 | try: 101 | return int(i) 102 | except ValueError: 103 | return i.lower() 104 | 105 | 106 | def natural_key(s): 107 | """Converts string ``s`` into a tuple that will sort "naturally" (i.e., 108 | ``name5`` will come before ``name10`` and ``1`` will come before ``A``). 109 | This function is designed to be used as the ``key`` argument to sorting 110 | functions. 111 | 112 | :param s: the str/unicode string to convert. 113 | :rtype: tuple 114 | """ 115 | 116 | # Use _nkre to split the input string into a sequence of 117 | # digit runs and non-digit runs. Then use _nkconv() to convert 118 | # the digit runs into ints and the non-digit runs to lowercase. 119 | return tuple(_nkconv(m) for m in _nkre.findall(s)) 120 | 121 | 122 | # Regular expression functions 123 | 124 | 125 | def rcompile(pattern, flags=0, verbose=False): 126 | """A wrapper for re.compile that checks whether "pattern" is a regex object 127 | or a string to be compiled, and automatically adds the re.UNICODE flag. 128 | """ 129 | 130 | if not isinstance(pattern, str): 131 | # If it's not a string, assume it's already a compiled pattern 132 | return pattern 133 | if verbose: 134 | flags |= re.VERBOSE 135 | return re.compile(pattern, re.UNICODE | flags) 136 | -------------------------------------------------------------------------------- /src/whoosh/util/varints.py: -------------------------------------------------------------------------------- 1 | # Copyright 2007 Matt Chaput.
All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # 13 | # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | # 24 | # The views and conclusions contained in the software and documentation are 25 | # those of the authors and should not be interpreted as representing official 26 | # policies, either expressed or implied, of Matt Chaput. 27 | 28 | from array import array 29 | 30 | # Varint cache 31 | 32 | # Build a cache of the varint byte sequences for the first N integers, so we 33 | # don't have to constantly recalculate them on the fly. This makes a small but 34 | # noticeable difference. 35 | 36 | 37 | def _varint(i): 38 | a = array("B") 39 | while (i & ~0x7F) != 0: 40 | a.append((i & 0x7F) | 0x80) 41 | i = i >> 7 42 | a.append(i) 43 | return a.tobytes() 44 | 45 | 46 | _varint_cache_size = 512 47 | _varint_cache = tuple([_varint(i) for i in range(_varint_cache_size)]) 48 | 49 | 50 | def varint(i): 51 | """Encodes the given integer into a bytestring of the minimum number of bytes.""" 52 | if i < len(_varint_cache): 53 | return _varint_cache[i] 54 | return _varint(i) 55 | 56 | 57 | def varint_to_int(vi): 58 | b = vi[0]  # indexing a bytestring yields an int on Python 3, so no ord() 59 | p = 1 60 | i = b & 0x7F 61 | shift = 7 62 | while b & 0x80 != 0: 63 | b = vi[p] 64 | p += 1 65 | i |= (b & 0x7F) << shift 66 | shift += 7 67 | return i 68 | 69 | 70 | def signed_varint(i): 71 | """Zig-zag encodes a signed integer into a varint.""" 72 | 73 | if i >= 0: 74 | return varint(i << 1) 75 | return varint((i << 1) ^ (~0)) 76 | 77 | 78 | def decode_signed_varint(i): 79 | """Zig-zag decodes an integer value.""" 80 | 81 | if not i & 1: 82 | return i >> 1 83 | return (i >> 1) ^ (~0) 84 | 85 | 86 | def read_varint(readfn): 87 | """ 88 | Reads a variable-length encoded integer. 89 | 90 | :param readfn: a callable that reads a given number of bytes, 91 | like file.read().
92 | """ 93 | 94 | b = ord(readfn(1)) 95 | i = b & 0x7F 96 | 97 | shift = 7 98 | while b & 0x80 != 0: 99 | b = ord(readfn(1)) 100 | i |= (b & 0x7F) << shift 101 | shift += 7 102 | return i 103 | -------------------------------------------------------------------------------- /stress/test_bigfacet.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import random 3 | import string 4 | 5 | from whoosh import fields, formats, index, query, sorting 6 | from whoosh.util import now 7 | 8 | tagcount = 100 9 | doccount = 500000 10 | dirname = "testindex" 11 | 12 | schema = fields.Schema(tags=fields.KEYWORD(stored=True, vector=formats.Existence())) 13 | 14 | if not os.path.exists(dirname): 15 | os.mkdir(dirname) 16 | 17 | reindex = False 18 | if reindex or not index.exists_in(dirname): 19 | tags = [] 20 | for _ in range(tagcount): 21 | tag = "".join(random.choice(string.ascii_lowercase) for _ in range(5)) 22 | tags.append(tag) 23 | 24 | ix = index.create_in(dirname, schema) 25 | t = now() 26 | with ix.writer() as w: 27 | for i in range(doccount): 28 | doc = " ".join(random.sample(tags, random.randint(10, 20))) 29 | w.add_document(tags=doc) 30 | if not i % 10000: 31 | print(i) 32 | print(now() - t) 33 | 34 | 35 | ix = index.open_dir(dirname) 36 | with ix.searcher() as s: 37 | tags = list(s.lexicon("tags")) 38 | facet = sorting.FieldFacet("tags", allow_overlap=True) 39 | qtag = random.choice(tags) 40 | print("tag=", qtag) 41 | q = query.Term("tags", qtag) 42 | r = s.search(q, groupedby={"tags": facet}) 43 | print(r.runtime) 44 | 45 | facet = sorting.StoredFieldFacet("tags", allow_overlap=True) 46 | r = s.search(q, groupedby={"tags": facet}) 47 | print(r.runtime) 48 | -------------------------------------------------------------------------------- /stress/test_bigindex.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from whoosh import fields 4 | from whoosh.util import now 5 | from whoosh.util.testing import TempIndex 6 | 7 | 8 | def test_20000_single(): 9 | sc = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT) 10 | with TempIndex(sc, "20000single") as ix: 11 | domain = [ 12 | "alfa", 13 | "bravo", 14 | "charlie", 15 | "delta", 16 | "echo", 17 | "foxtrot", 18 | "golf", 19 | "hotel", 20 | "india", 21 | "juliet", 22 | "kilo", 23 | "lima", 24 | ] 25 | 26 | t = now() 27 | for i in range(20000): 28 | w = ix.writer() 29 | w.add_document(id=str(i), text=" ".join(random.sample(domain, 5))) 30 | w.commit() 31 | print("Write single:", now() - t) 32 | 33 | t = now() 34 | ix.optimize() 35 | print("Optimize single:", now() - t) 36 | 37 | 38 | def test_20000_buffered(): 39 | from whoosh.writing import BufferedWriter 40 | 41 | sc = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT) 42 | with TempIndex(sc, "20000buffered") as ix: 43 | domain = [ 44 | "alfa", 45 | "bravo", 46 | "charlie", 47 | "delta", 48 | "echo", 49 | "foxtrot", 50 | "golf", 51 | "hotel", 52 | "india", 53 | "juliet", 54 | "kilo", 55 | "lima", 56 | ] 57 | 58 | t = now() 59 | w = BufferedWriter(ix, limit=100, period=None) 60 | for i in range(20000): 61 | w.add_document(id=str(i), text=" ".join(random.sample(domain, 5))) 62 | w.close() 63 | print("Write buffered:", now() - t) 64 | 65 | t = now() 66 | ix.optimize() 67 | print("Optimize buffered:", now() - t) 68 | 69 | 70 | def test_20000_batch(): 71 | sc = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT) 72 | with TempIndex(sc, "20000batch") as ix: 73 | domain 
= [ 74 | "alfa", 75 | "bravo", 76 | "charlie", 77 | "delta", 78 | "echo", 79 | "foxtrot", 80 | "golf", 81 | "hotel", 82 | "india", 83 | "juliet", 84 | "kilo", 85 | "lima", 86 | ] 87 | 88 | t = now() 89 | w = ix.writer() 90 | for i in range(20000): 91 | w.add_document(id=str(i), text=" ".join(random.sample(domain, 5))) 92 | if not i % 100: 93 | w.commit() 94 | w = ix.writer() 95 | w.commit() 96 | print("Write batch:", now() - t) 97 | 98 | t = now() 99 | ix.optimize() 100 | print("Optimize batch:", now() - t) 101 | -------------------------------------------------------------------------------- /stress/test_bigsort.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import random 3 | import shutil 4 | from datetime import datetime, timezone 5 | 6 | from whoosh import fields, index, query 7 | from whoosh.util import now 8 | 9 | 10 | def test_bigsort(): 11 | times = 30000 12 | dirname = "testindex" 13 | 14 | df = fields.DATETIME(stored=True) 15 | schema = fields.Schema(id=fields.ID(stored=True), date=df) 16 | 17 | if os.path.exists(dirname): 18 | shutil.rmtree(dirname) 19 | os.mkdir(dirname) 20 | ix = index.create_in(dirname, schema) 21 | 22 | print("Writing...") 23 | t = now() 24 | w = ix.writer(limitmb=512) 25 | for i in range(times): 26 | dt = datetime.fromtimestamp( 27 | random.randint(15839593, 1294102139), tz=timezone.utc 28 | ) 29 | w.add_document(id=str(i), date=dt) 30 | w.commit() 31 | print("Writing took ", now() - t) 32 | 33 | ix = index.open_dir(dirname) 34 | s = ix.searcher() 35 | q = query.Wildcard("id", "1?2*") 36 | 37 | t = now() 38 | x = list(df.sortable_terms(s.reader(), "date")) 39 | print(now() - t, len(x)) 40 | 41 | t = now() 42 | for y in x: 43 | p = list(s.postings("date", y).all_ids()) 44 | print(now() - t) 45 | 46 | t = now() 47 | r = s.search(q, limit=25, sortedby="date", reverse=True) 48 | print("Search 1 took", now() - t) 49 | print("len=", r.scored_length()) 50 | 51 | t = now() 52 | r = s.search(q, limit=25, sortedby="date") 53 | print("Search 2 took", now() - t) 54 | 55 | t = now() 56 | r = s.search(q, limit=25, sortedby="date") 57 | print("Search 2 took", now() - t) 58 | 59 | from heapq import nlargest 60 | 61 | t = now() 62 | sf = s.stored_fields 63 | gen = ((sf(n)["date"], n) for n in q.docs(s)) 64 | r = nlargest(25, gen) 65 | print(now() - t) 66 | -------------------------------------------------------------------------------- /stress/test_bigtable.py: -------------------------------------------------------------------------------- 1 | from random import randint, shuffle 2 | 3 | from nose.tools import assert_equal # type: ignore @UnresolvedImport 4 | from whoosh.filedb.filetables import HashReader, HashWriter 5 | from whoosh.util.testing import TempStorage 6 | 7 | 8 | def test_bigtable(): 9 | with TempStorage("bigtable") as st: 10 | 11 | def randstring(min, max): 12 | return "".join(chr(randint(1, 255)) for _ in range(randint(min, max))) 13 | 14 | count = 100000 15 | samp = {randstring(1, 50): randstring(1, 50) for _ in range(count)} 16 | 17 | fhw = HashWriter(st.create_file("big.hsh")) 18 | fhw.add_all(samp.items()) 19 | fhw.close() 20 | 21 | fhr = HashReader(st.open_file("big.hsh")) 22 | keys = list(samp.keys()) 23 | shuffle(keys) 24 | for key in keys: 25 | assert_equal(samp[key], fhr[key]) 26 | 27 | set1 = set(samp.items()) 28 | set2 = set(fhr.items()) 29 | assert_equal(set1, set2) 30 | 31 | fhr.close() 32 | -------------------------------------------------------------------------------- 
/stress/test_hugeindex.py: -------------------------------------------------------------------------------- 1 | import struct 2 | 3 | from nose.tools import assert_equal  # type: ignore @UnresolvedImport 4 | from whoosh import formats 5 | from whoosh.filedb.filepostings import FilePostingReader, FilePostingWriter 6 | from whoosh.util.testing import TempStorage 7 | 8 | 9 | def test_huge_postfile(): 10 | with TempStorage("hugeindex") as st: 11 | pf = st.create_file("test.pst") 12 | 13 | gb5 = 5 * 1024 * 1024 * 1024 14 | pf.seek(gb5) 15 | pf.write(b"\x00\x00\x00\x00")  # the file is binary, so write bytes 16 | assert_equal(pf.tell(), gb5 + 4) 17 | 18 | fpw = FilePostingWriter(pf) 19 | f = formats.Frequency(None) 20 | offset = fpw.start(f) 21 | for i in range(10): 22 | fpw.write(i, float(i), struct.pack("!I", i), 10) 23 | posttotal = fpw.finish() 24 | assert_equal(posttotal, 10) 25 | fpw.close() 26 | 27 | pf = st.open_file("test.pst") 28 | pfr = FilePostingReader(pf, offset, f) 29 | i = 0 30 | while pfr.is_active(): 31 | assert_equal(pfr.id(), i) 32 | assert_equal(pfr.weight(), float(i)) 33 | assert_equal(pfr.value(), struct.pack("!I", i)) 34 | pfr.next() 35 | i += 1 36 | pf.close() 37 | -------------------------------------------------------------------------------- /stress/test_threading.py: -------------------------------------------------------------------------------- 1 | import random 2 | import threading 3 | import time 4 | 5 | from whoosh import fields, query 6 | from whoosh.util.testing import TempStorage 7 | 8 | 9 | def test_readwrite(): 10 | schema = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT) 11 | with TempStorage("threading") as st: 12 | domain = ( 13 | "alfa", 14 | "bravo", 15 | "charlie", 16 | "delta", 17 | "echo", 18 | "foxtrot", 19 | "golf", 20 | "hotel", 21 | "india", 22 | "juliet", 23 | "kilo", 24 | "lima", 25 | "mike", 26 | "november", 27 | "oscar", 28 | "papa", 29 | "quebec", 30 | "romeo", 31 | "sierra", 32 | "tango", 33 | "uniform", 34 | "victor", 35 | "whiskey", 36 | "xray", 37 | "yankee", 38 | "zulu", 39 | ) 40 | 41 | class WriterThread(threading.Thread): 42 | def run(self): 43 | ix = st.create_index(schema)  # create_index() takes just the schema 44 | num = 0 45 | 46 | for i in range(50): 47 | print(i) 48 | w = ix.writer() 49 | for _ in range(random.randint(1, 100)): 50 | content = " ".join(random.sample(domain, random.randint(5, 20))) 51 | w.add_document(id=str(num), content=content) 52 | num += 1 53 | w.commit() 54 | 55 | time.sleep(0.1) 56 | 57 | class SearcherThread(threading.Thread): 58 | def run(self): 59 | print(self.name + " starting") 60 | for _ in range(10): 61 | ix = st.open_index() 62 | s = ix.searcher() 63 | q = query.Term("content", random.choice(domain)) 64 | s.search(q, limit=10) 65 | s.close() 66 | ix.close() 67 | time.sleep(0.1) 68 | print(self.name + " done") 69 | 70 | wt = WriterThread() 71 | wt.start() 72 | time.sleep(0.5) 73 | for _ in range(20): 74 | SearcherThread().start() 75 | time.sleep(0.5) 76 | wt.join() 77 | -------------------------------------------------------------------------------- /stress/test_update.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from nose.tools import assert_equal 4 | from whoosh import fields, query 5 | from whoosh.util.testing import TempIndex 6 | 7 | 8 | def test_many_updates(): 9 | schema = fields.Schema(key=fields.ID(unique=True, stored=True)) 10 | with TempIndex(schema, "manyupdates") as ix: 11 | for _ in range(10000): 12 | num = random.randint(0, 5000) 13 | w = ix.writer()
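# update_document() uses the schema's unique "key" field to delete any
# existing document with the same key before adding the replacement.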
14 | w.update_document(key=str(num)) 15 | w.commit() 16 | 17 | with ix.searcher() as s: 18 | result = [d["key"] for d in s.search(query.Every())] 19 | assert_equal(len(result), len(set(result))) 20 | -------------------------------------------------------------------------------- /sweep.yaml: -------------------------------------------------------------------------------- 1 | # Sweep AI turns bugs & feature requests into code changes (https://sweep.dev) 2 | # For details on our config file, check out our docs at https://docs.sweep.dev/usage/config 3 | 4 | # This setting contains a list of rules that Sweep will check for. If any of these rules are broken in a new commit, Sweep will create a pull request to fix the broken rule. 5 | rules: 6 | - "All new business logic should have corresponding unit tests." 7 | - "Refactor large functions to be more modular." 8 | - "Add docstrings to all functions and file headers." 9 | 10 | # This is the branch that Sweep will develop from and make pull requests to. Most people use 'main' or 'master' but some users also use 'dev' or 'staging'. 11 | branch: 'main' 12 | 13 | # By default Sweep will read the logs and outputs from your existing GitHub Actions. To disable this, set this to false. 14 | gha_enabled: True 15 | 16 | # This is the description of your project. It will be used by Sweep when creating PRs. You can tell Sweep what's unique about your project, what frameworks you use, or anything else you want. 17 | # 18 | # Example: 19 | # 20 | # description: sweepai/sweep is a python project. The main api endpoints are in sweepai/api.py. Write code that adheres to PEP8. 21 | description: '' 22 | 23 | # This sets whether to create pull requests as drafts. If this is set to True, then all pull requests will be created as drafts and GitHub Actions will not be triggered. 24 | draft: False 25 | 26 | # This is a list of directories that Sweep will not be able to edit.
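# Example:
#
# blocked_dirs: ["benchmark/", "files/"]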
27 | blocked_dirs: [] 28 | -------------------------------------------------------------------------------- /tests/english-words.10.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sygil-Dev/whoosh-reloaded/ab775ca5bfb443c2b3598b6a2d6189e595f7c377/tests/english-words.10.gz -------------------------------------------------------------------------------- /tests/test_compound.py: -------------------------------------------------------------------------------- 1 | from whoosh.filedb.compound import CompoundStorage 2 | from whoosh.filedb.filestore import RamStorage 3 | from whoosh.util.testing import TempStorage 4 | 5 | 6 | def _test_simple_compound(st): 7 | alist = [1, 2, 3, 5, -5, -4, -3, -2] 8 | blist = [1, 12, 67, 8, 2, 1023] 9 | clist = [100, -100, 200, -200] 10 | 11 | with st.create_file("a") as af: 12 | for x in alist: 13 | af.write_int(x) 14 | with st.create_file("b") as bf: 15 | for x in blist: 16 | bf.write_varint(x) 17 | with st.create_file("c") as cf: 18 | for x in clist: 19 | cf.write_int(x) 20 | 21 | f = st.create_file("f") 22 | CompoundStorage.assemble(f, st, ["a", "b", "c"]) 23 | 24 | f = CompoundStorage(st.open_file("f")) 25 | with f.open_file("a") as af: 26 | for x in alist: 27 | assert x == af.read_int() 28 | assert af.read() == b"" 29 | 30 | with f.open_file("b") as bf: 31 | for x in blist: 32 | assert x == bf.read_varint() 33 | assert bf.read() == b"" 34 | 35 | with f.open_file("c") as cf: 36 | for x in clist: 37 | assert x == cf.read_int() 38 | assert cf.read() == b"" 39 | 40 | 41 | def test_simple_compound_mmap(): 42 | with TempStorage("compound") as st: 43 | assert st.supports_mmap 44 | _test_simple_compound(st) 45 | 46 | 47 | def test_simple_compound_nomap(): 48 | st = RamStorage() 49 | _test_simple_compound(st) 50 | 51 | 52 | # def test_unclosed_mmap(): 53 | # with TempStorage("unclosed") as st: 54 | # assert st.supports_mmap 55 | # with st.create_file("a") as af: 56 | # af.write("alfa") 57 | # with st.create_file("b") as bf: 58 | # bf.write("bravo") 59 | # f = st.create_file("f") 60 | # CompoundStorage.assemble(f, st, ["a", "b"]) 61 | # 62 | # f = CompoundStorage(st, "f") 63 | -------------------------------------------------------------------------------- /tests/test_flexible.py: -------------------------------------------------------------------------------- 1 | from whoosh import fields 2 | from whoosh.util.testing import TempIndex 3 | 4 | 5 | def test_addfield(): 6 | schema = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT) 7 | with TempIndex(schema, "addfield") as ix: 8 | w = ix.writer() 9 | w.add_document(id="a", content="alfa") 10 | w.add_document(id="b", content="bravo") 11 | w.add_document(id="c", content="charlie") 12 | w.commit() 13 | 14 | ix.add_field("added", fields.KEYWORD(stored=True)) 15 | 16 | w = ix.writer() 17 | w.add_document(id="d", content="delta", added="fourth") 18 | w.add_document(id="e", content="echo", added="fifth") 19 | w.commit(merge=False) 20 | 21 | with ix.searcher() as s: 22 | assert ("id", "d") in s.reader() 23 | assert s.document(id="d") == {"id": "d", "added": "fourth"} 24 | assert s.document(id="b") == {"id": "b"} 25 | 26 | 27 | def test_addfield_spelling(): 28 | schema = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT) 29 | with TempIndex(schema, "addfield") as ix: 30 | w = ix.writer() 31 | w.add_document(id="a", content="alfa") 32 | w.add_document(id="b", content="bravo") 33 | w.add_document(id="c", content="charlie") 34 | w.commit() 
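# add_field() commits the schema change to the index; documents written
# before this point simply have no value for the new field.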
35 | 36 | ix.add_field("added", fields.KEYWORD(stored=True)) 37 | 38 | w = ix.writer() 39 | w.add_document(id="d", content="delta", added="fourth") 40 | w.add_document(id="e", content="echo", added="fifth") 41 | w.commit(merge=False) 42 | 43 | with ix.searcher() as s: 44 | assert s.document(id="d") == {"id": "d", "added": "fourth"} 45 | assert s.document(id="b") == {"id": "b"} 46 | 47 | 48 | def test_removefield(): 49 | schema = fields.Schema( 50 | id=fields.ID(stored=True), content=fields.TEXT, city=fields.KEYWORD(stored=True) 51 | ) 52 | with TempIndex(schema, "removefield") as ix: 53 | w = ix.writer() 54 | w.add_document(id="b", content="bravo", city="baghdad") 55 | w.add_document(id="c", content="charlie", city="cairo") 56 | w.add_document(id="d", content="delta", city="dakar") 57 | w.commit() 58 | 59 | with ix.searcher() as s: 60 | assert s.document(id="c") == {"id": "c", "city": "cairo"} 61 | 62 | w = ix.writer() 63 | w.remove_field("content") 64 | w.remove_field("city") 65 | w.commit() 66 | 67 | ixschema = ix._current_schema() 68 | assert ixschema.names() == ["id"] 69 | assert ixschema.stored_names() == ["id"] 70 | 71 | with ix.searcher() as s: 72 | assert ("content", b"charlie") not in s.reader() 73 | assert s.document(id="c") == {"id": "c"} 74 | 75 | 76 | def test_optimize_away(): 77 | schema = fields.Schema( 78 | id=fields.ID(stored=True), content=fields.TEXT, city=fields.KEYWORD(stored=True) 79 | ) 80 | with TempIndex(schema, "optimizeaway") as ix: 81 | w = ix.writer() 82 | w.add_document(id="b", content="bravo", city="baghdad") 83 | w.add_document(id="c", content="charlie", city="cairo") 84 | w.add_document(id="d", content="delta", city="dakar") 85 | w.commit() 86 | 87 | with ix.searcher() as s: 88 | assert s.document(id="c") == {"id": "c", "city": "cairo"} 89 | 90 | w = ix.writer() 91 | w.remove_field("content") 92 | w.remove_field("city") 93 | w.commit(optimize=True) 94 | 95 | with ix.searcher() as s: 96 | assert ("content", "charlie") not in s.reader() 97 | assert s.document(id="c") == {"id": "c"} 98 | 99 | 100 | if __name__ == "__main__": 101 | test_addfield() 102 | -------------------------------------------------------------------------------- /tests/test_misc.py: -------------------------------------------------------------------------------- 1 | import os 2 | import threading 3 | import time 4 | 5 | from whoosh.util.filelock import try_for 6 | from whoosh.util.numeric import byte_to_length, length_to_byte 7 | from whoosh.util.testing import TempStorage 8 | 9 | 10 | def test_now(): 11 | from whoosh.util import now 12 | 13 | t1 = now() 14 | t2 = now() 15 | assert t1 <= t2 16 | 17 | 18 | def test_storage_creation(): 19 | import tempfile 20 | import uuid 21 | 22 | from whoosh import fields 23 | from whoosh.filedb.filestore import FileStorage 24 | 25 | schema = fields.Schema(text=fields.TEXT) 26 | uid = uuid.uuid4() 27 | dirpath = os.path.join(tempfile.gettempdir(), str(uid)) 28 | assert not os.path.exists(dirpath) 29 | 30 | st = FileStorage(dirpath) 31 | st.create() 32 | assert os.path.exists(dirpath) 33 | 34 | ix = st.create_index(schema) 35 | with ix.writer() as w: 36 | w.add_document(text="alfa bravo") 37 | w.add_document(text="bracho charlie") 38 | 39 | st.destroy() 40 | assert not os.path.exists(dirpath) 41 | 42 | 43 | def test_ramstorage(): 44 | from whoosh.filedb.filestore import RamStorage 45 | 46 | st = RamStorage() 47 | lock = st.lock("test") 48 | lock.acquire() 49 | lock.release() 50 | 51 | 52 | def test_filelock_simple(): 53 | with TempStorage("simplefilelock") 
as st: 54 | lock1 = st.lock("testlock") 55 | lock2 = st.lock("testlock") 56 | assert lock1 is not lock2 57 | 58 | assert lock1.acquire() 59 | assert st.file_exists("testlock") 60 | assert not lock2.acquire() 61 | lock1.release() 62 | assert lock2.acquire() 63 | assert not lock1.acquire() 64 | lock2.release() 65 | 66 | 67 | def test_threaded_filelock(): 68 | with TempStorage("threadedfilelock") as st: 69 | lock1 = st.lock("testlock") 70 | result = [] 71 | 72 | # The thread function tries to acquire the lock and then quits 73 | def fn(): 74 | lock2 = st.lock("testlock") 75 | gotit = try_for(lock2.acquire, 1.0, 0.1) 76 | if gotit: 77 | result.append(True) 78 | lock2.release() 79 | 80 | t = threading.Thread(target=fn) 81 | 82 | # Acquire the lock in this thread 83 | lock1.acquire() 84 | # Start the other thread trying to acquire the lock 85 | t.start() 86 | # Wait for a bit 87 | time.sleep(0.15) 88 | # Release the lock 89 | lock1.release() 90 | # Wait for the other thread to finish 91 | t.join() 92 | # If the other thread got the lock, it should have appended True to the 93 | # "results" list. 94 | assert result == [True] 95 | 96 | 97 | def test_length_byte(): 98 | source = list(range(11)) 99 | xform = [length_to_byte(n) for n in source] 100 | result = [byte_to_length(n) for n in xform] 101 | assert source == result 102 | 103 | 104 | def test_version_object(): 105 | from whoosh.util.versions import SimpleVersion as sv 106 | 107 | assert sv.parse("1") == sv(1) 108 | assert sv.parse("1.2") == sv(1, 2) 109 | assert sv.parse("1.2b") == sv(1, 2, ex="b") 110 | assert sv.parse("1.2rc") == sv(1, 2, ex="rc") 111 | assert sv.parse("1.2b3") == sv(1, 2, ex="b", exnum=3) 112 | assert sv.parse("1.2.3") == sv(1, 2, 3) 113 | assert sv.parse("1.2.3a") == sv(1, 2, 3, "a") 114 | assert sv.parse("1.2.3rc") == sv(1, 2, 3, "rc") 115 | assert sv.parse("1.2.3a4") == sv(1, 2, 3, "a", 4) 116 | assert sv.parse("1.2.3rc2") == sv(1, 2, 3, "rc", 2) 117 | assert sv.parse("999.999.999c999") == sv(999, 999, 999, "c", 999) 118 | 119 | assert sv.parse("1.2") == sv.parse("1.2") 120 | assert sv("1.2") != sv("1.3") 121 | assert sv.parse("1.0") < sv.parse("1.1") 122 | assert sv.parse("1.0") < sv.parse("2.0") 123 | assert sv.parse("1.2.3a4") < sv.parse("1.2.3a5") 124 | assert sv.parse("1.2.3a5") > sv.parse("1.2.3a4") 125 | assert sv.parse("1.2.3c99") < sv.parse("1.2.4") 126 | assert sv.parse("1.2.3a4") != sv.parse("1.2.3a5") 127 | assert sv.parse("1.2.3a5") != sv.parse("1.2.3a4") 128 | assert sv.parse("1.2.3c99") != sv.parse("1.2.4") 129 | assert sv.parse("1.2.3a4") <= sv.parse("1.2.3a5") 130 | assert sv.parse("1.2.3a5") >= sv.parse("1.2.3a4") 131 | assert sv.parse("1.2.3c99") <= sv.parse("1.2.4") 132 | assert sv.parse("1.2") <= sv.parse("1.2") 133 | 134 | assert sv(1, 2, 3).to_int() == 17213488128 135 | assert sv.from_int(17213488128) == sv(1, 2, 3) 136 | -------------------------------------------------------------------------------- /tests/test_postings.py: -------------------------------------------------------------------------------- 1 | from whoosh import analysis, fields 2 | from whoosh.codec import default_codec 3 | from whoosh.formats import ( 4 | CharacterBoosts, 5 | Characters, 6 | Existence, 7 | Frequency, 8 | PositionBoosts, 9 | Positions, 10 | ) 11 | from whoosh.util.testing import TempStorage 12 | 13 | 14 | def _roundtrip(content, format_, astype, ana=None): 15 | with TempStorage("roundtrip") as st: 16 | codec = default_codec() 17 | seg = codec.new_segment(st, "") 18 | ana = ana or analysis.StandardAnalyzer() 19 | 
field = fields.FieldType(format=format_, analyzer=ana) 20 | 21 | fw = codec.field_writer(st, seg) 22 | fw.start_field("f1", field) 23 | for text, _, weight, valuestring in sorted(field.index(content)): 24 | fw.start_term(text) 25 | fw.add(0, weight, valuestring, None) 26 | fw.finish_term() 27 | fw.finish_field() 28 | fw.close() 29 | 30 | tr = codec.terms_reader(st, seg) 31 | ps = [] 32 | for fieldname, btext in tr.terms(): 33 | m = tr.matcher(fieldname, btext, format_) 34 | ps.append((field.from_bytes(btext), m.value_as(astype))) 35 | tr.close() 36 | return ps 37 | 38 | 39 | def test_existence_postings(): 40 | content = "alfa bravo charlie" 41 | assert _roundtrip(content, Existence(), "frequency") == [ 42 | ("alfa", 1), 43 | ("bravo", 1), 44 | ("charlie", 1), 45 | ] 46 | 47 | 48 | def test_frequency_postings(): 49 | content = "alfa bravo charlie bravo alfa alfa" 50 | assert _roundtrip(content, Frequency(), "frequency") == [ 51 | ("alfa", 3), 52 | ("bravo", 2), 53 | ("charlie", 1), 54 | ] 55 | 56 | 57 | def test_position_postings(): 58 | content = "alfa bravo charlie bravo alfa alfa" 59 | assert _roundtrip(content, Positions(), "positions") == [ 60 | ("alfa", [0, 4, 5]), 61 | ("bravo", [1, 3]), 62 | ("charlie", [2]), 63 | ] 64 | assert _roundtrip(content, Positions(), "frequency") == [ 65 | ("alfa", 3), 66 | ("bravo", 2), 67 | ("charlie", 1), 68 | ] 69 | 70 | 71 | def test_character_postings(): 72 | content = "alfa bravo charlie bravo alfa alfa" 73 | assert _roundtrip(content, Characters(), "characters") == [ 74 | ("alfa", [(0, 0, 4), (4, 25, 29), (5, 30, 34)]), 75 | ("bravo", [(1, 5, 10), (3, 19, 24)]), 76 | ("charlie", [(2, 11, 18)]), 77 | ] 78 | assert _roundtrip(content, Characters(), "positions") == [ 79 | ("alfa", [0, 4, 5]), 80 | ("bravo", [1, 3]), 81 | ("charlie", [2]), 82 | ] 83 | assert _roundtrip(content, Characters(), "frequency") == [ 84 | ("alfa", 3), 85 | ("bravo", 2), 86 | ("charlie", 1), 87 | ] 88 | 89 | 90 | def test_posboost_postings(): 91 | pbs = PositionBoosts() 92 | ana = analysis.RegexTokenizer(r"\S+") | analysis.DelimitedAttributeFilter() 93 | content = "alfa^2 bravo^0.1 charlie^2 bravo^0.5 alfa alfa" 94 | assert _roundtrip(content, pbs, "position_boosts", ana) == [ 95 | ("alfa", [(0, 2), (4, 1), (5, 1)]), 96 | ("bravo", [(1, 0.1), (3, 0.5)]), 97 | ("charlie", [(2, 2)]), 98 | ] 99 | assert _roundtrip(content, pbs, "positions", ana) == [ 100 | ("alfa", [0, 4, 5]), 101 | ("bravo", [1, 3]), 102 | ("charlie", [2]), 103 | ] 104 | assert _roundtrip(content, pbs, "frequency", ana) == [ 105 | ("alfa", 3), 106 | ("bravo", 2), 107 | ("charlie", 1), 108 | ] 109 | 110 | 111 | def test_charboost_postings(): 112 | cbs = CharacterBoosts() 113 | ana = analysis.RegexTokenizer(r"\S+") | analysis.DelimitedAttributeFilter() 114 | content = "alfa^2 bravo^0.1 charlie^2 bravo^0.5 alfa alfa" 115 | assert _roundtrip(content, cbs, "character_boosts", ana) == [ 116 | ("alfa", [(0, 0, 4, 2), (4, 37, 41, 1), (5, 42, 46, 1)]), 117 | ("bravo", [(1, 7, 12, 0.1), (3, 27, 32, 0.5)]), 118 | ("charlie", [(2, 17, 24, 2)]), 119 | ] 120 | assert _roundtrip(content, cbs, "position_boosts", ana) == [ 121 | ("alfa", [(0, 2), (4, 1), (5, 1)]), 122 | ("bravo", [(1, 0.1), (3, 0.5)]), 123 | ("charlie", [(2, 2)]), 124 | ] 125 | assert _roundtrip(content, cbs, "characters", ana) == [ 126 | ("alfa", [(0, 0, 4), (4, 37, 41), (5, 42, 46)]), 127 | ("bravo", [(1, 7, 12), (3, 27, 32)]), 128 | ("charlie", [(2, 17, 24)]), 129 | ] 130 | assert _roundtrip(content, cbs, "positions", ana) == [ 131 | ("alfa", [0, 4, 5]), 132 | 
("bravo", [1, 3]), 133 | ("charlie", [2]), 134 | ] 135 | assert _roundtrip(content, cbs, "frequency", ana) == [ 136 | ("alfa", 3), 137 | ("bravo", 2), 138 | ("charlie", 1), 139 | ] 140 | -------------------------------------------------------------------------------- /tests/test_stem.py: -------------------------------------------------------------------------------- 1 | from whoosh.lang.snowball.english import EnglishStemmer 2 | from whoosh.lang.snowball.finnish import FinnishStemmer 3 | from whoosh.lang.snowball.french import FrenchStemmer 4 | from whoosh.lang.snowball.spanish import SpanishStemmer 5 | 6 | 7 | def test_english(): 8 | s = EnglishStemmer() 9 | assert s.stem("hello") == "hello" 10 | assert s.stem("atlas") == "atlas" 11 | assert s.stem("stars") == "star" 12 | 13 | 14 | def test_french(): 15 | s = FrenchStemmer() 16 | assert s.stem("adresse") == "adress" 17 | assert s.stem("lettres") == "lettr" 18 | 19 | 20 | def test_finnish(): 21 | s = FinnishStemmer() 22 | assert s.stem("valitse") == "valits" 23 | assert s.stem("koko") == "koko" 24 | assert s.stem("erikoismerkit") == "erikoismerk" 25 | 26 | 27 | def test_spanish_spell_suffix(): 28 | word = "tgue" 29 | s = SpanishStemmer() 30 | w = s.stem(word) 31 | assert w == "tgu" 32 | -------------------------------------------------------------------------------- /tests/test_weightings.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import sys 3 | from itertools import permutations 4 | from random import choice, randint 5 | 6 | from whoosh import fields, query, scoring 7 | from whoosh.filedb.filestore import RamStorage 8 | 9 | 10 | def u(s): 11 | return s.decode("ascii") if isinstance(s, bytes) else s 12 | 13 | 14 | def _weighting_classes(ignore): 15 | # Get all the subclasses of Weighting in whoosh.scoring 16 | return [ 17 | c 18 | for _, c in inspect.getmembers(scoring, inspect.isclass) 19 | if scoring.Weighting in c.__bases__ and c not in ignore 20 | ] 21 | 22 | 23 | def test_all(): 24 | domain = [u("alfa"), u("bravo"), u("charlie"), u("delta"), u("echo"), u("foxtrot")] 25 | schema = fields.Schema(text=fields.TEXT) 26 | storage = RamStorage() 27 | ix = storage.create_index(schema) 28 | w = ix.writer() 29 | for _ in range(100): 30 | w.add_document(text=u(" ").join(choice(domain) for _ in range(randint(10, 20)))) 31 | w.commit() 32 | 33 | # List ABCs that should not be tested 34 | abcs = () 35 | # provide initializer arguments for any weighting classes that require them 36 | init_args = { 37 | "MultiWeighting": ([scoring.BM25F()], {"text": scoring.Frequency()}), 38 | "ReverseWeighting": ([scoring.BM25F()], {}), 39 | } 40 | 41 | for wclass in _weighting_classes(abcs): 42 | try: 43 | if wclass.__name__ in init_args: 44 | args, kwargs = init_args[wclass.__name__] 45 | weighting = wclass(*args, **kwargs) 46 | else: 47 | weighting = wclass() 48 | except TypeError: 49 | e = sys.exc_info()[1] 50 | raise TypeError(f"Error instantiating {wclass!r}: {e}") 51 | 52 | with ix.searcher(weighting=weighting) as s: 53 | try: 54 | for word in domain: 55 | s.search(query.Term("text", word)) 56 | except ValueError: 57 | e = sys.exc_info()[1] 58 | e.msg = f"Error searching with {wclass!r}: {e}" 59 | raise 60 | 61 | 62 | def test_compatibility(): 63 | from whoosh.scoring import Weighting 64 | 65 | # This is the old way of doing a custom weighting model, check that 66 | # it's still supported... 
67 | class LegacyWeighting(Weighting): 68 | use_final = True 69 | 70 | def score(self, searcher, fieldname, text, docnum, weight): 71 | return weight + 0.5 72 | 73 | def final(self, searcher, docnum, score): 74 | return score * 1.5 75 | 76 | schema = fields.Schema(text=fields.TEXT) 77 | ix = RamStorage().create_index(schema) 78 | w = ix.writer() 79 | domain = "alfa bravo charlie delta".split() 80 | for ls in permutations(domain, 3): 81 | w.add_document(text=u(" ").join(ls)) 82 | w.commit() 83 | 84 | s = ix.searcher(weighting=LegacyWeighting()) 85 | r = s.search(query.Term("text", u("bravo"))) 86 | assert r.score(0) == 2.25 87 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py27, py38, py39, py310, py311, py312 3 | 4 | [testenv] 5 | deps = 6 | pytest 7 | pytest-pep8 8 | commands = py.test -s {posargs} tests 9 | --------------------------------------------------------------------------------
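For reference, a minimal, self-contained sketch of how the varint helpers in src/whoosh/util/varints.py above round-trip values; it uses only the functions shown in that file plus the standard library:

import io

from whoosh.util.varints import (
    decode_signed_varint,
    read_varint,
    signed_varint,
    varint,
    varint_to_int,
)

# Unsigned round trip: values below 128 fit in one byte; larger values use
# seven payload bits per byte, with the high bit as a continuation flag.
assert varint(5) == b"\x05"
assert varint(300) == b"\xac\x02"
assert varint_to_int(varint(300)) == 300

# Signed round trip: zig-zag encoding keeps small magnitudes small on disk.
for n in (-300, -1, 0, 1, 300):
    assert decode_signed_varint(varint_to_int(signed_varint(n))) == n

# read_varint() pulls bytes from any read(n)-style callable.
buf = io.BytesIO(varint(123456))
assert read_varint(buf.read) == 123456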