├── .codiumai.toml ├── .coveragerc ├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ └── sweep-template.yml ├── PULL_REQUEST_TEMPLATE.md ├── dependabot.yml ├── labeler.yml └── workflows │ ├── auto-merge.yml │ ├── deploy-github-pages.yml │ ├── first-interaction.yml │ ├── issue_labeler.yml │ ├── python-publish.yml │ ├── stale_bot.yaml │ ├── test.yml │ └── tox.yml ├── .gitignore ├── .hgignore ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── .sonarcloud.properties ├── LICENSE.txt ├── MANIFEST.in ├── README.md ├── benchmark ├── dcvgr10.txt.gz ├── dictionary.py ├── enron.py ├── marc21.py ├── reuters.py └── reuters21578.txt.gz ├── codecov.yml ├── docs ├── Makefile ├── make.bat ├── requirements.txt └── source │ ├── analysis.rst │ ├── api │ ├── analysis.rst │ ├── api.rst │ ├── codec │ │ └── base.rst │ ├── collectors.rst │ ├── columns.rst │ ├── fields.rst │ ├── filedb │ │ ├── filestore.rst │ │ ├── filetables.rst │ │ └── structfile.rst │ ├── formats.rst │ ├── highlight.rst │ ├── idsets.rst │ ├── index.rst │ ├── lang │ │ ├── morph_en.rst │ │ ├── porter.rst │ │ └── wordnet.rst │ ├── matching.rst │ ├── qparser.rst │ ├── query.rst │ ├── reading.rst │ ├── scoring.rst │ ├── searching.rst │ ├── sorting.rst │ ├── spelling.rst │ ├── support │ │ ├── charset.rst │ │ └── levenshtein.rst │ ├── util.rst │ └── writing.rst │ ├── batch.rst │ ├── conf.py │ ├── dates.rst │ ├── facets.rst │ ├── fieldcaches.rst │ ├── glossary.rst │ ├── highlight.rst │ ├── index.rst │ ├── indexing.rst │ ├── intro.rst │ ├── keywords.rst │ ├── nested.rst │ ├── ngrams.rst │ ├── parsing.rst │ ├── query.rst │ ├── querylang.rst │ ├── quickstart.rst │ ├── recipes.rst │ ├── releases │ ├── 0_3.rst │ ├── 1_0.rst │ ├── 2_0.rst │ └── index.rst │ ├── schema.rst │ ├── searching.rst │ ├── spelling.rst │ ├── stemming.rst │ ├── tech │ ├── backend.rst │ ├── filedb.rst │ └── index.rst │ └── threads.rst ├── files ├── whoosh.svg ├── whoosh_16.png ├── whoosh_35.png ├── whoosh_64.png └── whoosh_small.svg ├── pyproject.toml ├── requirements-dev.txt ├── requirements.txt ├── scripts ├── make_checkpoint.py ├── pylint.ini └── read_checkpoint.py ├── setup.cfg ├── setup.py ├── src └── whoosh │ ├── __init__.py │ ├── analysis │ ├── __init__.py │ ├── acore.py │ ├── analyzers.py │ ├── filters.py │ ├── intraword.py │ ├── morph.py │ ├── ngrams.py │ └── tokenizers.py │ ├── automata │ ├── __init__.py │ ├── fsa.py │ ├── fst.py │ ├── glob.py │ ├── lev.py │ └── reg.py │ ├── classify.py │ ├── codec │ ├── __init__.py │ ├── base.py │ ├── memory.py │ ├── plaintext.py │ ├── whoosh2.py │ └── whoosh3.py │ ├── collectors.py │ ├── columns.py │ ├── externalsort.py │ ├── fields.py │ ├── filedb │ ├── __init__.py │ ├── compound.py │ ├── fileindex.py │ ├── filepostings.py │ ├── filereading.py │ ├── filestore.py │ ├── filetables.py │ ├── filewriting.py │ ├── gae.py │ ├── misc.py │ ├── pools.py │ └── structfile.py │ ├── formats.py │ ├── highlight.py │ ├── idsets.py │ ├── index.py │ ├── lang │ ├── __init__.py │ ├── dmetaphone.py │ ├── isri.py │ ├── lovins.py │ ├── morph_en.py │ ├── paicehusk.py │ ├── phonetic.py │ ├── porter.py │ ├── porter2.py │ ├── snowball │ │ ├── LICENSE.txt │ │ ├── __init__.py │ │ ├── bases.py │ │ ├── danish.py │ │ ├── dutch.py │ │ ├── english.py │ │ ├── finnish.py │ │ ├── french.py │ │ ├── german.py │ │ ├── hungarian.py │ │ ├── italian.py │ │ ├── norwegian.py │ │ ├── portugese.py │ │ ├── romanian.py │ │ ├── russian.py │ │ ├── spanish.py │ │ └── swedish.py │ ├── stopwords.py │ └── wordnet.py │ ├── legacy.py │ ├── matching │ ├── __init__.py │ ├── binary.py │ ├── combo.py 
│ ├── mcore.py │ └── wrappers.py │ ├── multiproc.py │ ├── qparser │ ├── __init__.py │ ├── common.py │ ├── dateparse.py │ ├── default.py │ ├── plugins.py │ ├── syntax.py │ └── taggers.py │ ├── query │ ├── __init__.py │ ├── compound.py │ ├── nested.py │ ├── positional.py │ ├── qcolumns.py │ ├── qcore.py │ ├── ranges.py │ ├── spans.py │ ├── terms.py │ └── wrappers.py │ ├── reading.py │ ├── scoring.py │ ├── searching.py │ ├── sorting.py │ ├── spelling.py │ ├── support │ ├── __init__.py │ ├── base85.py │ ├── bench.py │ ├── bitstream.py │ ├── bitvector.py │ ├── charset.py │ ├── levenshtein.py │ ├── pyparsing.py │ ├── relativedelta.py │ └── unicode.py │ ├── system.py │ ├── util │ ├── __init__.py │ ├── cache.py │ ├── filelock.py │ ├── loading.py │ ├── numeric.py │ ├── numlists.py │ ├── testing.py │ ├── text.py │ ├── times.py │ ├── varints.py │ └── versions.py │ └── writing.py ├── stress ├── test_bigfacet.py ├── test_bigindex.py ├── test_bigsort.py ├── test_bigtable.py ├── test_hugeindex.py ├── test_threading.py └── test_update.py ├── sweep.yaml ├── tests ├── english-words.10.gz ├── test_analysis.py ├── test_automata.py ├── test_bits.py ├── test_classify.py ├── test_codecs.py ├── test_collector.py ├── test_columns.py ├── test_compound.py ├── test_dateparse.py ├── test_fields.py ├── test_flexible.py ├── test_highlighting.py ├── test_indexing.py ├── test_matching.py ├── test_misc.py ├── test_mpwriter.py ├── test_nested.py ├── test_parse_plugins.py ├── test_parsing.py ├── test_postings.py ├── test_quality.py ├── test_queries.py ├── test_reading.py ├── test_results.py ├── test_searching.py ├── test_sorting.py ├── test_spans.py ├── test_spelling.py ├── test_stem.py ├── test_tables.py ├── test_vectors.py ├── test_weightings.py └── test_writing.py └── tox.ini /.codiumai.toml: -------------------------------------------------------------------------------- 1 | #.codiumai.toml 2 | [tests] 3 | 4 | ## Testing framework to use - this can affect the content of the generated tests 5 | ## as well as the test run command. 6 | ## Possible values are: 7 | ## Python: Pytest, Unittest 8 | framework = "Pytest" 9 | 10 | ## A hint to the test generator about whether to use mocks or not. Possible values are true or false. 11 | # use_mocks = false 12 | 13 | ## How many tests should be generated by default. Fewer tests is faster. 14 | ## Does not apply at the moment to extend-suite tests. 15 | num_desired_tests = 20 16 | 17 | ## A multiline string, delimited with triple-quotes (""") serving as an extra instruction 18 | ## that the AI model will take into consideration. 19 | ## This will appear as "General instructions" in the 20 | ## configuration section in the tests panel. 21 | # plan_instructions = """ 22 | # Each line should have a comment explaining it. 23 | # Each comment should start with the comment number (1., 2. etc.) 24 | # """ 25 | 26 | ## A multiline string, delimited with triple-quotes (""") serving as an example test that represents 27 | ## what you would like the generated tests to look like in terms of style, setup, etc. 
28 | # example_test = """ 29 | # describe("something", () => { 30 | # it("says 'bar'", () => { 31 | # // given 32 | # 33 | # // when 34 | # const res = something.say(); 35 | # 36 | # // Then 37 | # expect(res).to.equal("bar"); 38 | # }); 39 | # }); 40 | # """ 41 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | omit = 4 | # Autogenerated missed code handles other VCSes. 5 | devito/_version.py 6 | examples/*__init__* 7 | concurrency = multiprocessing 8 | parallel = True 9 | 10 | [report] 11 | # Regexes for lines to exclude from consideration 12 | exclude_lines = 13 | # Don't complain about missing debug-only code: 14 | def __repr__ 15 | 16 | # Don't complain if tests don't hit defensive assertion code: 17 | raise NotImplementedError 18 | raise ValueError 19 | raise TypeError 20 | raise RuntimeError 21 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: [ZeroCool940711] 2 | patreon: zerocool94 3 | ko_fi: zerocool94 4 | open_collective: sygil_dev 5 | custom: ["https://paypal.me/zerocool94"] 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/sweep-template.yml: -------------------------------------------------------------------------------- 1 | name: Sweep Issue 2 | title: 'Sweep: ' 3 | description: For small bugs, features, refactors, and tests to be handled by Sweep, an AI-powered junior developer. 4 | labels: sweep 5 | body: 6 | - type: textarea 7 | id: description 8 | attributes: 9 | label: Details 10 | description: Tell Sweep where and what to edit and provide enough context for a new developer to the codebase 11 | placeholder: | 12 | Unit Tests: Write unit tests for <FILE>. Test each function in the file. Make sure to test edge cases. 13 | Bugs: The bug might be in <FILE>. Here are the logs: ... 14 | Features: the new endpoint should use the ... class from <FILE> because it contains ... logic. 15 | Refactors: We are migrating this function to ... version because ... 16 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | # Description 2 | 3 | Please include: 4 | * relevant motivation 5 | * a summary of the change 6 | * which issue is fixed 7 | * any additional dependencies that are required for this change 8 | 9 | Closes: # (issue) 10 | 11 | # Checklist: 12 | 13 | - [ ] I have performed a self-review of my own code 14 | - [ ] I have commented my code in hard-to-understand areas 15 | - [ ] I have made corresponding changes to the documentation 16 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located.
3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | # Enable version updates for pip 9 | - package-ecosystem: 'pip' 10 | directory: '/' 11 | # Check the PyPI registry for updates once a day 12 | schedule: 13 | interval: 'daily' 14 | 15 | - package-ecosystem: 'github-actions' 16 | directory: '/' 17 | schedule: 18 | interval: 'daily' 19 | -------------------------------------------------------------------------------- /.github/workflows/auto-merge.yml: -------------------------------------------------------------------------------- 1 | name: Dependabot auto-merge 2 | on: pull_request 3 | 4 | permissions: 5 | contents: write 6 | pull-requests: write 7 | 8 | jobs: 9 | dependabot: 10 | runs-on: ubuntu-latest 11 | if: github.actor == 'dependabot[bot]' 12 | steps: 13 | - name: Dependabot metadata 14 | id: metadata 15 | uses: dependabot/fetch-metadata@v2 16 | with: 17 | github-token: "${{ secrets.GITHUB_TOKEN }}" 18 | - name: Enable auto-merge for Dependabot PRs 19 | run: gh pr merge --auto --merge "$PR_URL" 20 | env: 21 | PR_URL: ${{github.event.pull_request.html_url}} 22 | GH_TOKEN: ${{secrets.GITHUB_TOKEN}} 23 | -------------------------------------------------------------------------------- /.github/workflows/deploy-github-pages.yml: -------------------------------------------------------------------------------- 1 | name: Deploy to GitHub Pages 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | # Review gh actions docs if you want to further define triggers, paths, etc. 8 | # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#on 9 | 10 | workflow_dispatch: # This line allows manual triggering 11 | 12 | jobs: 13 | deploy: 14 | name: Deploy to GitHub Pages 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v4.1.1 18 | - uses: actions/setup-python@v5.0.0 19 | with: 20 | python-version: '3.7' 21 | 22 | - name: Install dependencies 23 | run: pip install -r requirements.txt && pip install -r docs/requirements.txt 24 | - name: Build website 25 | run: cd docs && make html 26 | 27 | # Popular action to deploy to GitHub Pages: 28 | # Docs: https://github.com/peaceiris/actions-gh-pages#%EF%B8%8F-docusaurus 29 | - name: Deploy to GitHub Pages 30 | uses: peaceiris/actions-gh-pages@v4.0.0 31 | with: 32 | github_token: ${{ secrets.GITHUB_TOKEN }} 33 | # Build output to publish to the `gh-pages` branch: 34 | publish_dir: ./docs/build/html 35 | # The following lines assign commit authorship to the official 36 | # GH-Actions bot for deploys to `gh-pages` branch: 37 | # https://github.com/actions/checkout/issues/13#issuecomment-724415212 38 | # The GH actions bot is used by default if you didn't specify the two fields. 39 | # You can swap them out with your own user credentials.
40 | user_name: github-actions[bot] 41 | user_email: 41898282+github-actions[bot]@users.noreply.github.com 42 | -------------------------------------------------------------------------------- /.github/workflows/first-interaction.yml: -------------------------------------------------------------------------------- 1 | name: first-interaction 2 | 3 | on: 4 | issues: 5 | types: [opened] 6 | pull_request: 7 | branches: [main] 8 | types: [opened] 9 | 10 | jobs: 11 | check_for_first_interaction: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v4 15 | - uses: actions/first-interaction@main 16 | with: 17 | repo-token: ${{ secrets.GITHUB_TOKEN }} 18 | issue-message: | 19 | Hello! Thank you for filing an issue. 20 | 21 | If this is a bug report, please include relevant logs to help us debug the problem. 22 | pr-message: | 23 | Hello! Thank you for your contribution. 24 | 25 | If you are fixing a bug, please reference the issue number in the description. 26 | 27 | If you are implementing a feature request, please check with the maintainers that the feature will be accepted first. 28 | -------------------------------------------------------------------------------- /.github/workflows/issue_labeler.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | name: Issue labeler 4 | on: 5 | issues: 6 | types: [ opened ] 7 | 8 | permissions: 9 | contents: read 10 | 11 | jobs: 12 | label-component: 13 | runs-on: ubuntu-22.04 14 | 15 | permissions: 16 | issues: write 17 | 18 | strategy: 19 | matrix: 20 | template: [ bug_report.yml, feature_request.yml ] 21 | 22 | steps: 23 | - uses: actions/checkout@v4 24 | 25 | - name: Parse issue form 26 | uses: stefanbuck/github-issue-parser@c1a559d78bfb8dd05216dab9ffd2b91082ff5324 27 | id: issue-parser 28 | with: 29 | template-path: .github/ISSUE_TEMPLATE/${{ matrix.template }} 30 | 31 | - name: Set labels based on component field 32 | uses: redhat-plumbers-in-action/advanced-issue-labeler@d498805e5c7c0658e336948b3363480bcfd68da6 33 | with: 34 | issue-form: ${{ steps.issue-parser.outputs.jsonString }} 35 | template: ${{ matrix.template }} 36 | token: ${{ secrets.GITHUB_TOKEN }} 37 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 
8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | workflow_dispatch: # This line allows manual triggering 16 | 17 | #push: 18 | # branches: 19 | # - master 20 | 21 | permissions: 22 | contents: read 23 | 24 | jobs: 25 | deploy: 26 | 27 | runs-on: ubuntu-latest 28 | 29 | steps: 30 | - uses: actions/checkout@v4 31 | - name: Set up Python 32 | uses: actions/setup-python@v5 33 | with: 34 | python-version: '3.9' 35 | - name: Install dependencies 36 | run: | 37 | python -m pip install --upgrade pip 38 | pip install build 39 | - name: Build package 40 | run: python -m build 41 | - name: Publish package 42 | uses: pypa/gh-action-pypi-publish@v1.8.14 43 | with: 44 | user: __token__ 45 | password: ${{ secrets.PYPI_API_TOKEN }} 46 | -------------------------------------------------------------------------------- /.github/workflows/stale_bot.yaml: -------------------------------------------------------------------------------- 1 | name: Stale bot 2 | 3 | on: 4 | schedule: 5 | - cron: '0 0 * * *' 6 | 7 | permissions: 8 | pull-requests: write 9 | 10 | jobs: 11 | stale: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Mark and close stale PRs 15 | uses: actions/stale@v9 16 | with: 17 | stale-pr-message: "This PR is stale because it has been 60 days with no activity. This PR will be automatically closed within 7 days if there is no further activity." 18 | close-pr-message: "This PR was closed because it has been stalled for some time with no activity." 19 | days-before-stale: -1 # avoid marking issues 20 | days-before-pr-stale: 60 21 | days-before-close: -1 # avoid closing issues 22 | days-before-pr-close: 7 23 | exempt-all-pr-assignees: true # avoid stale for all PR with assignees 24 | exempt-all-pr-milestones: true # avoid stale for all PR with milestones 25 | operations-per-run: 200 26 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Test 3 | 4 | on: [pull_request, push, workflow_dispatch] 5 | 6 | jobs: 7 | codespell_and_ruff: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v4 11 | - run: pip install --user codespell[toml] ruff 12 | # TODO: Fix lint issues and remove `--exit-zero` from the line below. 13 | - run: ruff --exit-zero --output-format=github --target-version=py38 . 14 | test: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: [3.8, 3.9, "3.10", 3.11, 3.12] 19 | steps: 20 | - uses: actions/checkout@v4 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip setuptools wheel 28 | pip install pytest pytest-cov pytest-sugar coverage cached-property 29 | - name: Install Whoosh 30 | run: | 31 | pip install -e . 
32 | - name: Run tests 33 | run: | 34 | pytest --cov=./ --cov-report=xml --cov-report=html 35 | - name: Upload HTML coverage report 36 | uses: actions/upload-artifact@v4 37 | with: 38 | name: "HTML Coverage ${{ matrix.python-version }}" 39 | path: "htmlcov" 40 | retention-days: 7 41 | 42 | - name: Upload Coverage to Codecov 43 | uses: codecov/codecov-action@v4.4.1 44 | with: 45 | directory: ./ 46 | env_vars: OS,PYTHON 47 | fail_ci_if_error: true 48 | files: ./coverage.xml 49 | handle_no_reports_found: true 50 | token: ${{ secrets.CODECOV_TOKEN }} # required 51 | verbose: true # optional (default = false) 52 | -------------------------------------------------------------------------------- /.github/workflows/tox.yml: -------------------------------------------------------------------------------- 1 | name: tox 2 | on: [push, pull_request, workflow_dispatch] 3 | jobs: 4 | tox: 5 | strategy: 6 | fail-fast: false 7 | max-parallel: 5 8 | matrix: 9 | os: [ubuntu-latest] # [macos-latest, ubuntu-latest, windows-latest] 10 | python: ['3.8', '3.9', '3.10', '3.11', '3.12'] 11 | runs-on: ${{ matrix.os }} 12 | steps: 13 | - uses: actions/checkout@v4 14 | - uses: actions/setup-python@v5 15 | with: 16 | python-version: ${{ matrix.python }} 17 | - run: pip install --upgrade pip 18 | - run: pip install tox 19 | - run: tox -e py 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | __pycache__/ 3 | .tox/ 4 | env/ 5 | build/ 6 | dist/ 7 | eggs/ 8 | .eggs/ 9 | *.egg 10 | *.egg-info/ 11 | /test.py 12 | /.vscode/settings.json 13 | /.coverage 14 | /whoosh-reloaded.code-workspace 15 | /.vscode/launch.json 16 | *.coverage.DESKTOP-* 17 | /coverage.xml 18 | /lcov.info 19 | /.codiumai.local.toml 20 | -------------------------------------------------------------------------------- /.hgignore: -------------------------------------------------------------------------------- 1 | syntax: glob 2 | *.pyc 3 | *~ 4 | *.DS_Store 5 | 6 | .idea 7 | .settings 8 | .coverage 9 | .tox 10 | .cache 11 | nosetests.xml 12 | 13 | build 14 | dist 15 | docs/build 16 | src/Whoosh.egg-info 17 | 18 | bmark 19 | *testindex 20 | benchmark/enron_index* 21 | benchmark/reuters_index* 22 | benchmark/dictionary_index* 23 | benchmark/enron_cache.pickle 24 | benchmark/enron_mail_082109.tar.gz 25 | 26 | tmp/* 27 | tests/tmp/* 28 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.5.0 4 | hooks: 5 | - id: check-ast 6 | - id: check-builtin-literals 7 | - id: check-merge-conflict 8 | - id: check-toml 9 | - id: check-yaml 10 | - id: detect-private-key 11 | - id: end-of-file-fixer 12 | - id: mixed-line-ending 13 | - id: trailing-whitespace 14 | 15 | - repo: https://github.com/charliermarsh/ruff-pre-commit 16 | rev: v0.2.1 17 | hooks: 18 | - id: ruff 19 | args: [ --fix ] 20 | - id: ruff-format 21 | 22 | - repo: https://github.com/ikamensh/flynt/ 23 | rev: '1.0.1' 24 | hooks: 25 | - id: flynt 26 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html 
for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the OS, Python version and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.7" 13 | 14 | # Build documentation in the "docs/" directory with Sphinx 15 | sphinx: 16 | configuration: docs/source/conf.py 17 | 18 | # Optionally build your docs in additional formats such as PDF and ePub 19 | # formats: 20 | # - pdf 21 | # - epub 22 | 23 | # Optional but recommended, declare the Python requirements required 24 | # to build your documentation 25 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 26 | python: 27 | install: 28 | # - requirements: requirements.txt 29 | - requirements: docs/requirements.txt 30 | -------------------------------------------------------------------------------- /.sonarcloud.properties: -------------------------------------------------------------------------------- 1 | sonar.python.version=3.8, 3.9, 3.10, 3.11, 3.12 2 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2011 Matt Chaput. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are met: 5 | 6 | 1. Redistributions of source code must retain the above copyright notice, 7 | this list of conditions and the following disclaimer. 8 | 9 | 2. Redistributions in binary form must reproduce the above copyright 10 | notice, this list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | 24 | The views and conclusions contained in the software and documentation are 25 | those of the authors and should not be interpreted as representing official 26 | policies, either expressed or implied, of Matt Chaput. 
27 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | include benchmark/dcvgr10.txt.gz 3 | include benchmark/reuters21578.txt.gz 4 | include tests/english-words.10.gz 5 | recursive-include tests *.txt *.py 6 | recursive-include benchmark *.txt *.py 7 | recursive-include docs *.txt *.py *.rst 8 | recursive-include files *.txt *.py *.png *.jpg *.svg 9 | -------------------------------------------------------------------------------- /benchmark/dcvgr10.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sygil-Dev/whoosh-reloaded/ab775ca5bfb443c2b3598b6a2d6189e595f7c377/benchmark/dcvgr10.txt.gz -------------------------------------------------------------------------------- /benchmark/dictionary.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import os 3 | 4 | from whoosh import analysis, fields 5 | from whoosh.support.bench import Bench, Spec 6 | 7 | 8 | class VulgarTongue(Spec): 9 | name = "dictionary" 10 | filename = "dcvgr10.txt.gz" 11 | headline_field = "head" 12 | 13 | def documents(self): 14 | path = os.path.join(self.options.dir, self.filename) 15 | f = gzip.GzipFile(path) 16 | 17 | head = body = None 18 | for line in f: 19 | line = line.decode("latin1") 20 | if line[0].isalpha(): 21 | if head: 22 | yield {"head": head, "body": head + body} 23 | head, body = line.split(".", 1) 24 | else: 25 | body += line 26 | 27 | if head: 28 | yield {"head": head, "body": head + body} 29 | 30 | def whoosh_schema(self): 31 | ana = analysis.StemmingAnalyzer() 32 | 33 | schema = fields.Schema( 34 | head=fields.ID(stored=True), body=fields.TEXT(analyzer=ana, stored=True) 35 | ) 36 | return schema 37 | 38 | def zcatalog_setup(self, cat): 39 | from zcatalog import indexes # type: ignore @UnresolvedImport 40 | 41 | cat["head"] = indexes.FieldIndex(field_name="head") 42 | cat["body"] = indexes.TextIndex(field_name="body") 43 | 44 | 45 | if __name__ == "__main__": 46 | Bench().run(VulgarTongue) 47 | -------------------------------------------------------------------------------- /benchmark/reuters.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import os.path 3 | 4 | from whoosh import analysis, fields, index, qparser, query 5 | from whoosh.support.bench import Bench, Spec 6 | from whoosh.util import now 7 | 8 | 9 | class Reuters(Spec): 10 | name = "reuters" 11 | filename = "reuters21578.txt.gz" 12 | main_field = "text" 13 | headline_text = "headline" 14 | 15 | def whoosh_schema(self): 16 | # ana = analysis.StemmingAnalyzer() 17 | ana = analysis.StandardAnalyzer() 18 | schema = fields.Schema( 19 | id=fields.ID(stored=True), 20 | headline=fields.STORED, 21 | text=fields.TEXT(analyzer=ana, stored=True), 22 | ) 23 | return schema 24 | 25 | def zcatalog_setup(self, cat): 26 | from zcatalog import indexes # type: ignore @UnresolvedImport 27 | 28 | cat["id"] = indexes.FieldIndex(field_name="id") 29 | cat["headline"] = indexes.TextIndex(field_name="headline") 30 | cat["body"] = indexes.TextIndex(field_name="text") 31 | 32 | def documents(self): 33 | path = os.path.join(self.options.dir, self.filename) 34 | f = gzip.GzipFile(path) 35 | 36 | for line in f: 37 | id, text = line.decode("latin1").split("\t") 38 | yield {"id": id, "text": text, "headline": text[:70]} 39 | 40 | 41 | if __name__ 
== "__main__": 42 | Bench().run(Reuters) 43 | -------------------------------------------------------------------------------- /benchmark/reuters21578.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sygil-Dev/whoosh-reloaded/ab775ca5bfb443c2b3598b6a2d6189e595f7c377/benchmark/reuters21578.txt.gz -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | codecov: 2 | require_ci_to_pass: yes 3 | notify: 4 | wait_for_ci: yes 5 | 6 | coverage: 7 | precision: 2 8 | round: down 9 | range: 80...90 10 | 11 | status: 12 | # Learn more at http://docs.codecov.io/docs/codecov-yaml 13 | project: 14 | default: 15 | enabled: yes 16 | target: 1 17 | threshold: 0.1 18 | patch: 19 | default: 20 | enabled: off 21 | 22 | ignore: 23 | - "**/*.ipynb" 24 | - docs 25 | - docker 26 | - binder 27 | - .github 28 | - .git 29 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | sphinx_rtd_theme 3 | sphinx-jsonschema 4 | -------------------------------------------------------------------------------- /docs/source/api/analysis.rst: -------------------------------------------------------------------------------- 1 | =================== 2 | ``analysis`` module 3 | =================== 4 | 5 | .. automodule:: whoosh.analysis 6 | 7 | Analyzers 8 | ========= 9 | 10 | .. autofunction:: IDAnalyzer 11 | .. autofunction:: KeywordAnalyzer 12 | .. autofunction:: RegexAnalyzer 13 | .. autofunction:: SimpleAnalyzer 14 | .. autofunction:: StandardAnalyzer 15 | .. autofunction:: StemmingAnalyzer 16 | .. autofunction:: FancyAnalyzer 17 | .. autofunction:: NgramAnalyzer 18 | .. autofunction:: NgramWordAnalyzer 19 | .. autofunction:: LanguageAnalyzer 20 | 21 | 22 | Tokenizers 23 | ========== 24 | 25 | .. autoclass:: IDTokenizer 26 | .. autoclass:: RegexTokenizer 27 | .. autoclass:: CharsetTokenizer 28 | .. autofunction:: SpaceSeparatedTokenizer 29 | .. autofunction:: CommaSeparatedTokenizer 30 | .. autoclass:: NgramTokenizer 31 | .. autoclass:: PathTokenizer 32 | 33 | 34 | Filters 35 | ======= 36 | 37 | .. autoclass:: PassFilter 38 | .. autoclass:: LoggingFilter 39 | .. autoclass:: MultiFilter 40 | .. autoclass:: TeeFilter 41 | .. autoclass:: ReverseTextFilter 42 | .. autoclass:: LowercaseFilter 43 | .. autoclass:: StripFilter 44 | .. autoclass:: StopFilter 45 | .. autoclass:: StemFilter 46 | .. autoclass:: CharsetFilter 47 | .. autoclass:: NgramFilter 48 | .. autoclass:: IntraWordFilter 49 | .. autoclass:: CompoundWordFilter 50 | .. autoclass:: BiWordFilter 51 | .. autoclass:: ShingleFilter 52 | .. autoclass:: DelimitedAttributeFilter 53 | .. autoclass:: DoubleMetaphoneFilter 54 | .. autoclass:: SubstitutionFilter 55 | 56 | 57 | Token classes and functions 58 | =========================== 59 | 60 | .. autoclass:: Token 61 | .. autofunction:: unstopped 62 | -------------------------------------------------------------------------------- /docs/source/api/api.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Whoosh API 3 | ========== 4 | 5 | .. 
toctree:: 6 | :glob: 7 | :maxdepth: 1 8 | 9 | ** 10 | -------------------------------------------------------------------------------- /docs/source/api/codec/base.rst: -------------------------------------------------------------------------------- 1 | ===================== 2 | ``codec.base`` module 3 | ===================== 4 | 5 | .. automodule:: whoosh.codec.base 6 | 7 | 8 | Classes 9 | ======= 10 | 11 | .. autoclass:: Codec 12 | :members: 13 | 14 | .. autoclass:: PerDocumentWriter 15 | :members: 16 | 17 | .. autoclass:: FieldWriter 18 | :members: 19 | 20 | .. autoclass:: PostingsWriter 21 | :members: 22 | 23 | .. autoclass:: TermsReader 24 | :members: 25 | 26 | .. autoclass:: PerDocumentReader 27 | :members: 28 | 29 | .. autoclass:: Segment 30 | :members: 31 | -------------------------------------------------------------------------------- /docs/source/api/collectors.rst: -------------------------------------------------------------------------------- 1 | ===================== 2 | ``collectors`` module 3 | ===================== 4 | 5 | .. automodule:: whoosh.collectors 6 | 7 | 8 | Base classes 9 | ============ 10 | 11 | .. autoclass:: Collector 12 | :members: 13 | 14 | .. autoclass:: ScoredCollector 15 | :members: 16 | 17 | .. autoclass:: WrappingCollector 18 | :members: 19 | 20 | 21 | Basic collectors 22 | ================ 23 | 24 | .. autoclass:: TopCollector 25 | 26 | .. autoclass:: UnlimitedCollector 27 | 28 | .. autoclass:: SortingCollector 29 | 30 | 31 | Wrappers 32 | ======== 33 | 34 | .. autoclass:: FilterCollector 35 | 36 | .. autoclass:: FacetCollector 37 | 38 | .. autoclass:: CollapseCollector 39 | 40 | .. autoclass:: TimeLimitCollector 41 | 42 | .. autoclass:: TermsCollector 43 | -------------------------------------------------------------------------------- /docs/source/api/columns.rst: -------------------------------------------------------------------------------- 1 | ===================== 2 | ``columns`` module 3 | ===================== 4 | 5 | .. automodule:: whoosh.columns 6 | 7 | 8 | Base classes 9 | ============ 10 | 11 | .. autoclass:: Column 12 | :members: 13 | 14 | .. autoclass:: ColumnWriter 15 | :members: 16 | 17 | .. autoclass:: ColumnReader 18 | :members: 19 | 20 | 21 | Basic columns 22 | ============= 23 | 24 | .. autoclass:: VarBytesColumn 25 | 26 | .. autoclass:: FixedBytesColumn 27 | 28 | .. autoclass:: RefBytesColumn 29 | 30 | .. autoclass:: NumericColumn 31 | 32 | 33 | Technical columns 34 | ================= 35 | 36 | .. autoclass:: BitColumn 37 | 38 | .. autoclass:: CompressedBytesColumn 39 | 40 | .. autoclass:: StructColumn 41 | 42 | .. autoclass:: PickleColumn 43 | 44 | 45 | Experimental columns 46 | ==================== 47 | 48 | .. autoclass:: ClampedNumericColumn 49 | -------------------------------------------------------------------------------- /docs/source/api/fields.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | ``fields`` module 3 | ================= 4 | 5 | .. automodule:: whoosh.fields 6 | 7 | Schema class 8 | ============ 9 | 10 | .. autoclass:: Schema 11 | :members: 12 | 13 | .. autoclass:: SchemaClass 14 | 15 | FieldType base class 16 | ==================== 17 | 18 | .. autoclass:: FieldType 19 | :members: 20 | 21 | 22 | Pre-made field types 23 | ==================== 24 | 25 | .. autoclass:: ID 26 | .. autoclass:: IDLIST 27 | .. autoclass:: STORED 28 | .. autoclass:: KEYWORD 29 | .. autoclass:: TEXT 30 | .. autoclass:: NUMERIC 31 | .. autoclass:: DATETIME 32 | .. 
autoclass:: BOOLEAN 33 | .. autoclass:: NGRAM 34 | .. autoclass:: NGRAMWORDS 35 | 36 | 37 | Exceptions 38 | ========== 39 | 40 | .. autoexception:: FieldConfigurationError 41 | .. autoexception:: UnknownFieldError 42 | -------------------------------------------------------------------------------- /docs/source/api/filedb/filestore.rst: -------------------------------------------------------------------------------- 1 | =========================== 2 | ``filedb.filestore`` module 3 | =========================== 4 | 5 | .. automodule:: whoosh.filedb.filestore 6 | 7 | Base class 8 | ========== 9 | 10 | .. autoclass:: Storage 11 | :members: 12 | 13 | 14 | Implementation classes 15 | ====================== 16 | 17 | .. autoclass:: FileStorage 18 | .. autoclass:: RamStorage 19 | 20 | 21 | Helper functions 22 | ================ 23 | 24 | .. autofunction:: copy_storage 25 | .. autofunction:: copy_to_ram 26 | 27 | 28 | Exceptions 29 | ========== 30 | 31 | .. autoexception:: ReadOnlyError 32 | -------------------------------------------------------------------------------- /docs/source/api/filedb/filetables.rst: -------------------------------------------------------------------------------- 1 | ============================ 2 | ``filedb.filetables`` module 3 | ============================ 4 | 5 | .. automodule:: whoosh.filedb.filetables 6 | 7 | 8 | Hash file 9 | ========= 10 | 11 | .. autoclass:: HashWriter 12 | :members: 13 | 14 | .. autoclass:: HashReader 15 | :members: 16 | 17 | 18 | Ordered Hash file 19 | ================= 20 | 21 | .. autoclass:: OrderedHashWriter 22 | .. autoclass:: OrderedHashReader 23 | -------------------------------------------------------------------------------- /docs/source/api/filedb/structfile.rst: -------------------------------------------------------------------------------- 1 | ============================ 2 | ``filedb.structfile`` module 3 | ============================ 4 | 5 | .. automodule:: whoosh.filedb.structfile 6 | 7 | Classes 8 | ======= 9 | 10 | .. autoclass:: StructFile 11 | :members: 12 | 13 | .. autoclass:: BufferFile 14 | .. autoclass:: ChecksumFile 15 | -------------------------------------------------------------------------------- /docs/source/api/formats.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | ``formats`` module 3 | ================== 4 | 5 | .. automodule:: whoosh.formats 6 | 7 | Base class 8 | ========== 9 | 10 | .. autoclass:: Format 11 | :members: 12 | 13 | 14 | Formats 15 | ======= 16 | 17 | .. autoclass:: Existence 18 | .. autoclass:: Frequency 19 | .. autoclass:: Positions 20 | .. autoclass:: Characters 21 | .. autoclass:: PositionBoosts 22 | .. autoclass:: CharacterBoosts 23 | -------------------------------------------------------------------------------- /docs/source/api/highlight.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | ``highlight`` module 3 | ==================== 4 | 5 | .. automodule:: whoosh.highlight 6 | 7 | See :doc:`how to highlight terms in search results `. 8 | 9 | 10 | Manual highlighting 11 | =================== 12 | 13 | .. autoclass:: Highlighter 14 | :members: 15 | 16 | .. autofunction:: highlight 17 | 18 | 19 | Fragmenters 20 | =========== 21 | 22 | .. autoclass:: Fragmenter 23 | :members: 24 | 25 | .. autoclass:: WholeFragmenter 26 | .. autoclass:: SentenceFragmenter 27 | .. autoclass:: ContextFragmenter 28 | .. 
autoclass:: PinpointFragmenter 29 | 30 | 31 | Scorers 32 | ======= 33 | 34 | .. autoclass:: FragmentScorer 35 | .. autoclass:: BasicFragmentScorer 36 | 37 | 38 | Formatters 39 | ========== 40 | 41 | .. autoclass:: UppercaseFormatter 42 | .. autoclass:: HtmlFormatter 43 | .. autoclass:: GenshiFormatter 44 | 45 | 46 | Utility classes 47 | =============== 48 | 49 | .. autoclass:: Fragment 50 | :members: 51 | -------------------------------------------------------------------------------- /docs/source/api/idsets.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | ``idsets`` module 3 | ================= 4 | 5 | .. automodule:: whoosh.idsets 6 | 7 | 8 | Base classes 9 | ============ 10 | 11 | .. autoclass:: DocIdSet 12 | :members: 13 | 14 | .. autoclass:: BaseBitSet 15 | 16 | 17 | Implementation classes 18 | ====================== 19 | 20 | .. autoclass:: BitSet 21 | .. autoclass:: OnDiskBitSet 22 | .. autoclass:: SortedIntSet 23 | .. autoclass:: MultiIdSet 24 | -------------------------------------------------------------------------------- /docs/source/api/index.rst: -------------------------------------------------------------------------------- 1 | ================ 2 | ``index`` module 3 | ================ 4 | 5 | .. automodule:: whoosh.index 6 | 7 | 8 | Functions 9 | ========= 10 | 11 | .. autofunction:: create_in 12 | .. autofunction:: open_dir 13 | .. autofunction:: exists_in 14 | .. autofunction:: exists 15 | .. autofunction:: version_in 16 | .. autofunction:: version 17 | 18 | 19 | Base class 20 | ========== 21 | 22 | .. autoclass:: Index 23 | :members: 24 | 25 | 26 | Implementation 27 | ============== 28 | 29 | .. autoclass:: FileIndex 30 | 31 | 32 | Exceptions 33 | ========== 34 | 35 | .. autoexception:: LockError 36 | .. autoexception:: IndexError 37 | .. autoexception:: IndexVersionError 38 | .. autoexception:: OutOfDateError 39 | .. autoexception:: EmptyIndexError 40 | -------------------------------------------------------------------------------- /docs/source/api/lang/morph_en.rst: -------------------------------------------------------------------------------- 1 | ======================== 2 | ``lang.morph_en`` module 3 | ======================== 4 | 5 | .. automodule:: whoosh.lang.morph_en 6 | 7 | .. autofunction:: variations 8 | -------------------------------------------------------------------------------- /docs/source/api/lang/porter.rst: -------------------------------------------------------------------------------- 1 | ====================== 2 | ``lang.porter`` module 3 | ====================== 4 | 5 | .. automodule:: whoosh.lang.porter 6 | 7 | .. autofunction:: stem 8 | -------------------------------------------------------------------------------- /docs/source/api/lang/wordnet.rst: -------------------------------------------------------------------------------- 1 | ======================== 2 | ``lang.wordnet`` module 3 | ======================== 4 | 5 | .. automodule:: whoosh.lang.wordnet 6 | 7 | Thesaurus 8 | ========= 9 | 10 | .. autoclass:: Thesaurus 11 | :members: 12 | 13 | 14 | Low-level functions 15 | =================== 16 | 17 | .. autofunction:: parse_file 18 | .. autofunction:: synonyms 19 | ..
autofunction:: make_index 20 | -------------------------------------------------------------------------------- /docs/source/api/matching.rst: -------------------------------------------------------------------------------- 1 | =================== 2 | ``matching`` module 3 | =================== 4 | 5 | .. automodule:: whoosh.matching 6 | 7 | Matchers 8 | ======== 9 | 10 | .. autoclass:: Matcher 11 | :members: 12 | 13 | .. autoclass:: NullMatcher 14 | .. autoclass:: ListMatcher 15 | .. autoclass:: WrappingMatcher 16 | .. autoclass:: MultiMatcher 17 | .. autoclass:: FilterMatcher 18 | .. autoclass:: BiMatcher 19 | .. autoclass:: AdditiveBiMatcher 20 | .. autoclass:: UnionMatcher 21 | .. autoclass:: DisjunctionMaxMatcher 22 | .. autoclass:: IntersectionMatcher 23 | .. autoclass:: AndNotMatcher 24 | .. autoclass:: InverseMatcher 25 | .. autoclass:: RequireMatcher 26 | .. autoclass:: AndMaybeMatcher 27 | .. autoclass:: ConstantScoreMatcher 28 | 29 | 30 | Exceptions 31 | ========== 32 | 33 | .. autoexception:: ReadTooFar 34 | .. autoexception:: NoQualityAvailable 35 | -------------------------------------------------------------------------------- /docs/source/api/qparser.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | ``qparser`` module 3 | ================== 4 | 5 | .. automodule:: whoosh.qparser 6 | 7 | Parser object 8 | ============= 9 | 10 | .. autoclass:: QueryParser 11 | :members: 12 | 13 | Pre-made configurations 14 | ----------------------- 15 | 16 | The following functions return pre-configured QueryParser objects. 17 | 18 | .. autofunction:: MultifieldParser 19 | 20 | .. autofunction:: SimpleParser 21 | 22 | .. autofunction:: DisMaxParser 23 | 24 | 25 | Plug-ins 26 | ======== 27 | 28 | .. autoclass:: Plugin 29 | :members: 30 | 31 | .. autoclass:: SingleQuotePlugin 32 | .. autoclass:: PrefixPlugin 33 | .. autoclass:: WildcardPlugin 34 | .. autoclass:: RegexPlugin 35 | .. autoclass:: BoostPlugin 36 | .. autoclass:: GroupPlugin 37 | .. autoclass:: EveryPlugin 38 | .. autoclass:: FieldsPlugin 39 | .. autoclass:: PhrasePlugin 40 | .. autoclass:: RangePlugin 41 | .. autoclass:: OperatorsPlugin 42 | .. autoclass:: PlusMinusPlugin 43 | .. autoclass:: GtLtPlugin 44 | .. autoclass:: MultifieldPlugin 45 | .. autoclass:: FieldAliasPlugin 46 | .. autoclass:: CopyFieldPlugin 47 | 48 | 49 | Syntax node objects 50 | =================== 51 | 52 | Base nodes 53 | ---------- 54 | 55 | .. autoclass:: SyntaxNode 56 | :members: 57 | 58 | 59 | Nodes 60 | ----- 61 | 62 | .. autoclass:: FieldnameNode 63 | .. autoclass:: TextNode 64 | .. autoclass:: WordNode 65 | .. autoclass:: RangeNode 66 | .. autoclass:: MarkerNode 67 | 68 | 69 | Group nodes 70 | ----------- 71 | 72 | .. autoclass:: GroupNode 73 | .. autoclass:: BinaryGroup 74 | .. autoclass:: ErrorNode 75 | .. autoclass:: AndGroup 76 | .. autoclass:: OrGroup 77 | .. autoclass:: AndNotGroup 78 | .. autoclass:: AndMaybeGroup 79 | .. autoclass:: DisMaxGroup 80 | .. autoclass:: RequireGroup 81 | .. autoclass:: NotGroup 82 | 83 | 84 | Operators 85 | --------- 86 | 87 | .. autoclass:: Operator 88 | .. autoclass:: PrefixOperator 89 | .. autoclass:: PostfixOperator 90 | .. autoclass:: InfixOperator 91 | -------------------------------------------------------------------------------- /docs/source/api/query.rst: -------------------------------------------------------------------------------- 1 | ================ 2 | ``query`` module 3 | ================ 4 | 5 | .. 
automodule:: whoosh.query 6 | 7 | See also :mod:`whoosh.qparser` which contains code for parsing user queries 8 | into query objects. 9 | 10 | Base classes 11 | ============ 12 | 13 | The following abstract base classes are subclassed to create the "real" 14 | query operations. 15 | 16 | .. autoclass:: Query 17 | :members: 18 | 19 | .. autoclass:: CompoundQuery 20 | .. autoclass:: MultiTerm 21 | .. autoclass:: ExpandingTerm 22 | .. autoclass:: WrappingQuery 23 | 24 | 25 | Query classes 26 | ============= 27 | 28 | .. autoclass:: Term 29 | .. autoclass:: Variations 30 | .. autoclass:: FuzzyTerm 31 | .. autoclass:: Phrase 32 | .. autoclass:: And 33 | .. autoclass:: Or 34 | .. autoclass:: DisjunctionMax 35 | .. autoclass:: Not 36 | .. autoclass:: Prefix 37 | .. autoclass:: Wildcard 38 | .. autoclass:: Regex 39 | .. autoclass:: TermRange 40 | .. autoclass:: NumericRange 41 | .. autoclass:: DateRange 42 | .. autoclass:: Every 43 | .. autoclass:: NullQuery 44 | 45 | 46 | Binary queries 47 | ============== 48 | 49 | .. autoclass:: Require 50 | .. autoclass:: AndMaybe 51 | .. autoclass:: AndNot 52 | .. autoclass:: Otherwise 53 | 54 | 55 | Span queries 56 | ============ 57 | 58 | .. autoclass:: Span 59 | :members: 60 | 61 | .. autoclass:: SpanQuery 62 | .. autoclass:: SpanFirst 63 | .. autoclass:: SpanNear 64 | .. autoclass:: SpanNear2 65 | .. autoclass:: SpanNot 66 | .. autoclass:: SpanOr 67 | .. autoclass:: SpanContains 68 | .. autoclass:: SpanBefore 69 | .. autoclass:: SpanCondition 70 | 71 | 72 | Special queries 73 | =============== 74 | 75 | .. autoclass:: NestedParent 76 | .. autoclass:: NestedChildren 77 | .. autoclass:: ConstantScoreQuery 78 | 79 | 80 | Exceptions 81 | ========== 82 | 83 | .. autoexception:: QueryError 84 | -------------------------------------------------------------------------------- /docs/source/api/reading.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | ``reading`` module 3 | ================== 4 | 5 | .. automodule:: whoosh.reading 6 | 7 | Classes 8 | ======= 9 | 10 | .. autoclass:: IndexReader 11 | :members: 12 | 13 | .. autoclass:: MultiReader 14 | 15 | .. autoclass:: TermInfo 16 | :members: 17 | 18 | Exceptions 19 | ========== 20 | 21 | .. autoexception:: TermNotFound 22 | -------------------------------------------------------------------------------- /docs/source/api/scoring.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | ``scoring`` module 3 | ================== 4 | 5 | .. automodule:: whoosh.scoring 6 | 7 | 8 | Base classes 9 | ============ 10 | 11 | .. autoclass:: WeightingModel 12 | :members: 13 | 14 | .. autoclass:: BaseScorer 15 | :members: 16 | 17 | .. autoclass:: WeightScorer 18 | .. autoclass:: WeightLengthScorer 19 | 20 | 21 | Scoring algorithm classes 22 | ========================= 23 | 24 | .. autoclass:: BM25F 25 | 26 | .. autoclass:: TF_IDF 27 | 28 | .. autoclass:: Frequency 29 | 30 | 31 | Scoring utility classes 32 | ======================= 33 | 34 | .. autoclass:: FunctionWeighting 35 | 36 | .. autoclass:: MultiWeighting 37 | 38 | .. autoclass:: ReverseWeighting 39 | -------------------------------------------------------------------------------- /docs/source/api/searching.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | ``searching`` module 3 | ==================== 4 | 5 | ..
automodule:: whoosh.searching 6 | 7 | 8 | Searching classes 9 | ================= 10 | 11 | .. autoclass:: Searcher 12 | :members: 13 | 14 | 15 | Results classes 16 | =============== 17 | 18 | .. autoclass:: Results 19 | :members: 20 | 21 | .. autoclass:: Hit 22 | :members: 23 | 24 | .. autoclass:: ResultsPage 25 | :members: 26 | 27 | 28 | Exceptions 29 | ========== 30 | 31 | .. autoexception:: NoTermsException 32 | .. autoexception:: TimeLimit 33 | -------------------------------------------------------------------------------- /docs/source/api/sorting.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | ``sorting`` module 3 | ================== 4 | 5 | .. automodule:: whoosh.sorting 6 | 7 | 8 | Base types 9 | ========== 10 | 11 | .. autoclass:: FacetType 12 | :members: 13 | 14 | .. autoclass:: Categorizer 15 | :members: 16 | 17 | 18 | Facet types 19 | =========== 20 | 21 | .. autoclass:: FieldFacet 22 | .. autoclass:: QueryFacet 23 | .. autoclass:: RangeFacet 24 | .. autoclass:: DateRangeFacet 25 | .. autoclass:: ScoreFacet 26 | .. autoclass:: FunctionFacet 27 | .. autoclass:: MultiFacet 28 | .. autoclass:: StoredFieldFacet 29 | 30 | 31 | Facets object 32 | ============= 33 | 34 | .. autoclass:: Facets 35 | :members: 36 | 37 | 38 | FacetMap objects 39 | ================ 40 | 41 | .. autoclass:: FacetMap 42 | :members: 43 | .. autoclass:: OrderedList 44 | .. autoclass:: UnorderedList 45 | .. autoclass:: Count 46 | .. autoclass:: Best 47 | -------------------------------------------------------------------------------- /docs/source/api/spelling.rst: -------------------------------------------------------------------------------- 1 | =================== 2 | ``spelling`` module 3 | =================== 4 | 5 | See :doc:`correcting errors in user queries <../spelling>`. 6 | 7 | .. automodule:: whoosh.spelling 8 | 9 | 10 | Corrector objects 11 | ================= 12 | 13 | .. autoclass:: Corrector 14 | :members: 15 | 16 | .. autoclass:: ReaderCorrector 17 | 18 | .. autoclass:: MultiCorrector 19 | 20 | 21 | QueryCorrector objects 22 | ====================== 23 | 24 | .. autoclass:: QueryCorrector 25 | :members: 26 | 27 | .. autoclass:: SimpleQueryCorrector 28 | 29 | .. autoclass:: Correction 30 | -------------------------------------------------------------------------------- /docs/source/api/support/charset.rst: -------------------------------------------------------------------------------- 1 | ========================== 2 | ``support.charset`` module 3 | ========================== 4 | 5 | .. automodule:: whoosh.support.charset 6 | 7 | .. data:: default_charset 8 | 9 | An extensive case- and accent-folding charset table. 10 | Taken from http://speeple.com/unicode-maps.txt 11 | 12 | .. autofunction:: charset_table_to_dict 13 | -------------------------------------------------------------------------------- /docs/source/api/support/levenshtein.rst: -------------------------------------------------------------------------------- 1 | ============================== 2 | ``support.levenshtein`` module 3 | ============================== 4 | 5 | .. automodule:: whoosh.support.levenshtein 6 | 7 | .. autofunction:: relative 8 | 9 | .. autofunction:: distance 10 | -------------------------------------------------------------------------------- /docs/source/api/util.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | ``util`` module 3 | =============== 4 | 5 | ..
automodule:: whoosh.util 6 | :members: 7 | -------------------------------------------------------------------------------- /docs/source/api/writing.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | ``writing`` module 3 | ================== 4 | 5 | .. automodule:: whoosh.writing 6 | 7 | 8 | Writer 9 | ====== 10 | 11 | .. autoclass:: IndexWriter 12 | :members: 13 | 14 | 15 | Utility writers 16 | =============== 17 | 18 | .. autoclass:: BufferedWriter 19 | :members: 20 | 21 | .. autoclass:: AsyncWriter 22 | :members: 23 | 24 | 25 | Exceptions 26 | ========== 27 | 28 | .. autoexception:: IndexingError 29 | -------------------------------------------------------------------------------- /docs/source/batch.rst: -------------------------------------------------------------------------------- 1 | =================================== 2 | Tips for speeding up batch indexing 3 | =================================== 4 | 5 | 6 | Overview 7 | ======== 8 | 9 | Indexing documents tends to fall into two general patterns: adding documents 10 | one at a time as they are created (as in a web application), and adding a bunch 11 | of documents at once (batch indexing). 12 | 13 | The following settings and alternate workflows can make batch indexing faster. 14 | 15 | 16 | StemmingAnalyzer cache 17 | ====================== 18 | 19 | The stemming analyzer by default uses a least-recently-used (LRU) cache to limit 20 | the amount of memory it uses, to prevent the cache from growing very large if 21 | the analyzer is reused for a long period of time. However, the LRU cache can 22 | slow down indexing by almost 200% compared to a stemming analyzer with an 23 | "unbounded" cache. 24 | 25 | When you're indexing in large batches with a one-shot instance of the 26 | analyzer, consider using an unbounded cache:: 27 | 28 | w = myindex.writer() 29 | # Get the analyzer object from a text field 30 | stem_ana = w.schema["content"].format.analyzer 31 | # Set the cachesize to -1 to indicate unbounded caching 32 | stem_ana.cachesize = -1 33 | # Reset the analyzer to pick up the changed attribute 34 | stem_ana.clear() 35 | 36 | # Use the writer to index documents... 37 | 38 | 39 | The ``limitmb`` parameter 40 | ========================= 41 | 42 | The ``limitmb`` parameter to :meth:`whoosh.index.Index.writer` controls the 43 | *maximum* memory (in megabytes) the writer will use for the indexing pool. The 44 | higher the number, the faster indexing will be. 45 | 46 | The default value of ``128`` is actually somewhat low, considering many people 47 | have multiple gigabytes of RAM these days. Setting it higher can speed up 48 | indexing considerably:: 49 | 50 | from whoosh import index 51 | 52 | ix = index.open_dir("indexdir") 53 | writer = ix.writer(limitmb=256) 54 | 55 | .. note:: 56 | The actual memory used will be higher than this value because of interpreter 57 | overhead (up to twice as much!). It is very useful as a tuning parameter, 58 | but not for trying to exactly control the memory usage of Whoosh. 
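Putting the stemming-cache and ``limitmb`` tips together, a minimal
batch-indexing sketch might look like this (assuming a schema whose
``content`` field uses a stemming analyzer; ``iter_documents()`` is a
hypothetical stand-in for whatever yields your field dicts)::

    from whoosh import index

    ix = index.open_dir("indexdir")
    # A larger indexing pool (in megabytes) for this one-shot batch writer
    writer = ix.writer(limitmb=256)

    # Switch the stemming analyzer to an unbounded cache for the batch run
    stem_ana = writer.schema["content"].format.analyzer
    stem_ana.cachesize = -1
    stem_ana.clear()

    for doc in iter_documents():  # hypothetical source of document dicts
        writer.add_document(**doc)
    writer.commit()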
59 | 60 | 61 | The ``procs`` parameter 62 | ======================= 63 | 64 | The ``procs`` parameter to :meth:`whoosh.index.Index.writer` controls the 65 | number of processors the writer will use for indexing (via the 66 | ``multiprocessing`` module):: 67 | 68 | from whoosh import index 69 | 70 | ix = index.open_dir("indexdir") 71 | writer = ix.writer(procs=4) 72 | 73 | Note that when you use multiprocessing, the ``limitmb`` parameter controls the 74 | amount of memory used by *each process*, so the actual memory used will be 75 | ``limitmb * procs``:: 76 | 77 | # Each process will use a limit of 128, for a total of 512 78 | writer = ix.writer(procs=4, limitmb=128) 79 | 80 | 81 | The ``multisegment`` parameter 82 | ============================== 83 | 84 | The ``procs`` parameter causes the default writer to use multiple processors to 85 | do much of the indexing, but then still uses a single process to merge the pool 86 | of each sub-writer into a single segment. 87 | 88 | You can get much better indexing speed by also using the ``multisegment=True`` 89 | keyword argument, which instead of merging the results of each sub-writer, 90 | simply has them each just write out a new segment:: 91 | 92 | from whoosh import index 93 | 94 | ix = index.open_dir("indexdir") 95 | writer = ix.writer(procs=4, multisegment=True) 96 | 97 | The drawback is that instead 98 | of creating a single new segment, this option creates a number of new segments 99 | **at least** equal to the number of processes you use. 100 | 101 | For example, if you use ``procs=4``, the writer will create four new segments. 102 | (If you merge old segments or call ``add_reader`` on the parent writer, the 103 | parent writer will also write a segment, meaning you'll get five new segments.) 104 | 105 | So, while ``multisegment=True`` is much faster than a normal writer, you should 106 | only use it for large batch indexing jobs (or perhaps only for indexing from 107 | scratch). It should not be the only method you use for indexing, because 108 | otherwise the number of segments will tend to increase forever! 109 | -------------------------------------------------------------------------------- /docs/source/fieldcaches.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Field caches 3 | ============ 4 | 5 | The default (``filedb``) backend uses *field caches* in certain circumstances. 6 | The field cache basically pre-computes the order of documents in the index to 7 | speed up sorting and faceting. 8 | 9 | Generating field caches can take time the first time you sort/facet on a large 10 | index. The field cache is kept in memory (and by default written to disk when it 11 | is generated) so subsequent sorted/faceted searches should be faster. 12 | 13 | The default caching policy never expires field caches, so reused searchers and/or 14 | sorting a lot of different fields could use up quite a bit of memory with large 15 | indexes. 16 | 17 | 18 | Customizing cache behaviour 19 | =========================== 20 | 21 | (The following API examples refer to the default ``filedb`` backend.) 22 | 23 | *By default*, Whoosh saves field caches to disk. To prevent a reader or searcher 24 | from writing out field caches, do this before you start using it:: 25 | 26 | searcher.set_caching_policy(save=False) 27 | 28 | By default, if caches are written to disk they are saved in the index directory. 
29 | To tell a reader or searcher to save cache files to a different location, create 30 | a storage object and pass it to the ``storage`` keyword argument:: 31 | 32 | from whoosh.filedb.filestore import FileStorage 33 | 34 | mystorage = FileStorage("path/to/cachedir") 35 | reader.set_caching_policy(storage=mystorage) 36 | 37 | 38 | Creating a custom caching policy 39 | ================================ 40 | 41 | Expert users who want to implement a custom caching policy (for example, to add 42 | cache expiration) should subclass :class:`whoosh.filedb.fieldcache.FieldCachingPolicy`. 43 | Then you can pass an instance of your policy object to the ``set_caching_policy`` 44 | method:: 45 | 46 | searcher.set_caching_policy(MyPolicy()) 47 | -------------------------------------------------------------------------------- /docs/source/glossary.rst: -------------------------------------------------------------------------------- 1 | .. _glossary: 2 | 3 | ======== 4 | Glossary 5 | ======== 6 | 7 | .. glossary:: 8 | 9 | Analysis 10 | The process of breaking the text of a field into individual *terms* 11 | to be indexed. This consists of tokenizing the text into terms, and then optionally 12 | filtering the tokenized terms (for example, lowercasing and removing *stop words*). 13 | Whoosh includes several different analyzers. 14 | 15 | Corpus 16 | The set of documents you are indexing. 17 | 18 | Documents 19 | The individual pieces of content you want to make searchable. 20 | The word "documents" might imply files, but the data source could really be 21 | anything -- articles in a content management system, blog posts in a blogging 22 | system, chunks of a very large file, rows returned from an SQL query, individual 23 | email messages from a mailbox file, or whatever. When you get search results 24 | from Whoosh, the results are a list of documents, whatever "documents" means in 25 | your search engine. 26 | 27 | Fields 28 | Each document contains a set of fields. Typical fields might be "title", "content", 29 | "url", "keywords", "status", "date", etc. Fields can be indexed (so they're 30 | searchable) and/or stored with the document. Storing the field makes it available 31 | in search results. For example, you typically want to store the "title" field so 32 | your search results can display it. 33 | 34 | Forward index 35 | A table listing every document and the words that appear in the document. 36 | Whoosh lets you store *term vectors* that are a kind of forward index. 37 | 38 | Indexing 39 | The process of examining documents in the corpus and adding them to the 40 | *reverse index*. 41 | 42 | Postings 43 | The *reverse index* lists every word in the corpus, and for each word, a list 44 | of documents in which that word appears, along with some optional information 45 | (such as the number of times the word appears in that document). These items 46 | in the list, containing a document number and any extra information, are 47 | called *postings*. In Whoosh the information stored in postings is customizable 48 | for each *field*. 49 | 50 | Reverse index 51 | Basically a table listing every word in the corpus, and for each word, the 52 | list of documents in which it appears. It can be more complicated (the index can 53 | also list how many times the word appears in each document, the positions at which 54 | it appears, etc.) but that's how it basically works. 55 | 56 | Schema 57 | Whoosh requires that you specify the *fields* of the index before you begin 58 | indexing. 
The Schema associates field names with metadata about the field, such 59 | as the format of the *postings* and whether the contents of the field are stored 60 | in the index. 61 | 62 | Term vector 63 | A *forward index* for a certain field in a certain document. You can specify 64 | in the Schema that a given field should store term vectors. 65 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | ============================== 2 | Whoosh |release| documentation 3 | ============================== 4 | 5 | Whoosh was created by `Matt Chaput `_. 6 | You can view outstanding issues on the 7 | `Whoosh Bitbucket page `_ 8 | and get help on the `Whoosh mailing list `_. 9 | 10 | 11 | Contents 12 | ======== 13 | 14 | .. toctree:: 15 | :maxdepth: 2 16 | 17 | releases/index 18 | quickstart 19 | intro 20 | glossary 21 | schema 22 | indexing 23 | searching 24 | parsing 25 | querylang 26 | dates 27 | query 28 | analysis 29 | stemming 30 | ngrams 31 | facets 32 | highlight 33 | keywords 34 | spelling 35 | fieldcaches 36 | batch 37 | threads 38 | nested 39 | recipes 40 | api/api 41 | tech/index 42 | 43 | 44 | Indices and tables 45 | ================== 46 | 47 | * :ref:`genindex` 48 | * :ref:`modindex` 49 | * :ref:`search` 50 | -------------------------------------------------------------------------------- /docs/source/intro.rst: -------------------------------------------------------------------------------- 1 | ====================== 2 | Introduction to Whoosh 3 | ====================== 4 | 5 | About Whoosh 6 | ------------ 7 | 8 | Whoosh was created by `Matt Chaput `_. It started as a quick and dirty 9 | search server for the online documentation of the `Houdini `_ 10 | 3D animation software package. Side Effects Software generously allowed Matt to open source 11 | the code in case it might be useful to anyone else who needs a very flexible or pure-Python 12 | search engine (or both!). 13 | 14 | * Whoosh is fast, but uses only pure Python, so it will run anywhere Python runs, 15 | without requiring a compiler. 16 | 17 | * By default, Whoosh uses the `Okapi BM25F `_ ranking 18 | function, but like most things the ranking function can be easily customized. 19 | 20 | * Whoosh creates fairly small indexes compared to many other search libraries. 21 | 22 | * All indexed text in Whoosh must be *unicode*. 23 | 24 | * Whoosh lets you store arbitrary Python objects with indexed documents. 25 | 26 | 27 | What is Whoosh? 28 | --------------- 29 | 30 | Whoosh is a fast, pure Python search engine library. 31 | 32 | The primary design impetus of Whoosh is that it is pure Python. You should be able to 33 | use Whoosh anywhere you can use Python, no compiler or Java required. 34 | 35 | Like one of its ancestors, Lucene, Whoosh is not really a search engine, it's a programmer 36 | library for creating a search engine [1]_. 37 | 38 | Practically no important behavior of Whoosh is hard-coded. Indexing 39 | of text, the level of information stored for each term in each field, parsing of search queries, 40 | the types of queries allowed, scoring algorithms, etc. are all customizable, replaceable, and 41 | extensible. 42 | 43 | 44 | .. [1] It would of course be possible to build a turnkey search engine on top of Whoosh, 45 | like Nutch and Solr use Lucene. 46 | 47 | 48 | What can Whoosh do for you? 
49 | --------------------------- 50 | 51 | Whoosh lets you index free-form or structured text and then quickly find matching 52 | documents based on simple or complex search criteria. 53 | 54 | 55 | Getting help with Whoosh 56 | ------------------------ 57 | 58 | You can view outstanding issues on the 59 | `Whoosh GitHub page `_ 60 | and get help on the `Whoosh mailing list `_. 61 | -------------------------------------------------------------------------------- /docs/source/keywords.rst: -------------------------------------------------------------------------------- 1 | ======================================= 2 | Query expansion and keyword extraction 3 | ======================================= 4 | 5 | Overview 6 | ======== 7 | 8 | Whoosh provides methods for computing the "key terms" of a set of documents. For 9 | these methods, "key terms" basically means terms that are frequent in the given 10 | documents, but relatively infrequent in the indexed collection as a whole. 11 | 12 | Because this is a purely statistical operation, not a natural language 13 | processing or AI function, the quality of the results will vary based on the 14 | content, the size of the document collection, and the number of documents for 15 | which you extract keywords. 16 | 17 | These methods can be useful for providing the following features to users: 18 | 19 | * Search term expansion. You can extract key terms for the top N results from a 20 | query and suggest them to the user as additional/alternate query terms to try. 21 | 22 | * Tag suggestion. Extracting the key terms for a single document may yield 23 | useful suggestions for tagging the document. 24 | 25 | * "More like this". You can extract key terms for the top ten or so results from 26 | a query (removing the original query terms), and use those keywords as 27 | the basis for another query that may find more documents using terms the user 28 | didn't think of. 29 | 30 | Usage 31 | ===== 32 | 33 | * Get more documents like a certain search hit. *This requires that the field 34 | you want to match on is vectored or stored, or that you have access to the 35 | original text (such as from a database)*. 36 | 37 | Use :meth:`~whoosh.searching.Hit.more_like_this`:: 38 | 39 | results = mysearcher.search(myquery) 40 | first_hit = results[0] 41 | more_results = first_hit.more_like_this("content") 42 | 43 | * Extract keywords for the top N documents in a 44 | :class:`whoosh.searching.Results` object. *This requires that the field is 45 | either vectored or stored*. 46 | 47 | Use the :meth:`~whoosh.searching.Results.key_terms` method of the 48 | :class:`whoosh.searching.Results` object to extract keywords from the top N 49 | documents of the result set. 50 | 51 | For example, to extract *five* key terms from the ``content`` field of the top 52 | *ten* documents of a results object:: 53 | 54 | keywords = [keyword for keyword, score 55 | in results.key_terms("content", docs=10, numterms=5)] 56 | 57 | * Extract keywords for an arbitrary set of documents. *This requires that the 58 | field is either vectored or stored*. 59 | 60 | Use the :meth:`~whoosh.searching.Searcher.document_number` or 61 | :meth:`~whoosh.searching.Searcher.document_numbers` methods of the 62 | :class:`whoosh.searching.Searcher` object to get the document numbers for the 63 | document(s) you want to extract keywords from.
64 | 65 | Use the :meth:`~whoosh.searching.Searcher.key_terms` method of a 66 | :class:`whoosh.searching.Searcher` to extract the keywords, given the list of 67 | document numbers. 68 | 69 | For example, let's say you have an index of emails. To extract key terms from 70 | the ``body`` field of emails whose ``emailto`` field contains 71 | ``matt@whoosh.ca``:: 72 | 73 | with email_index.searcher() as s: 74 | docnums = s.document_numbers(emailto=u"matt@whoosh.ca") 75 | keywords = [keyword for keyword, score 76 | in s.key_terms(docnums, "body")] 77 | 78 | * Extract keywords from arbitrary text not in the index. 79 | 80 | Use the :meth:`~whoosh.searching.Searcher.key_terms_from_text` method of a 81 | :class:`whoosh.searching.Searcher` to extract the keywords, given the text:: 82 | 83 | with email_index.searcher() as s: 84 | keywords = [keyword for keyword, score 85 | in s.key_terms_from_text("body", mytext)] 86 | 87 | 88 | Expansion models 89 | ================ 90 | 91 | The ``ExpansionModel`` subclasses in the :mod:`whoosh.classify` module implement 92 | different weighting functions for keywords. These models are translated into 93 | Python from the original Java implementations in Terrier. 94 | -------------------------------------------------------------------------------- /docs/source/ngrams.rst: -------------------------------------------------------------------------------- 1 | ============================== 2 | Indexing and searching N-grams 3 | ============================== 4 | 5 | Overview 6 | ======== 7 | 8 | N-gram indexing is a powerful method for getting fast, "search as you type" 9 | functionality like iTunes. It is also useful for quick and effective indexing 10 | of languages such as Chinese and Japanese without word breaks. 11 | 12 | "N-grams" are groups of N characters: bigrams are groups of two 13 | characters, trigrams are groups of three characters, and so on. 14 | 15 | Whoosh includes two methods for analyzing N-gram fields: an N-gram tokenizer, 16 | and a filter that breaks tokens into N-grams. 17 | 18 | :class:`whoosh.analysis.NgramTokenizer` tokenizes the entire field into N-grams. 19 | This is more useful for Chinese/Japanese/Korean languages, where it's useful 20 | to index bigrams of characters rather than individual characters. Using this 21 | tokenizer with Roman-alphabet languages leads to spaces in the tokens. 22 | 23 | :: 24 | 25 | >>> ngt = NgramTokenizer(minsize=2, maxsize=4) 26 | >>> [token.text for token in ngt(u"hi there")] 27 | [u'hi', u'hi ', u'hi t', u'i ', u'i t', u'i th', u' t', u' th', u' the', u'th', 28 | u'the', u'ther', u'he', u'her', u'here', u'er', u'ere', u're'] 29 | 30 | :class:`whoosh.analysis.NgramFilter` breaks individual tokens into N-grams as 31 | part of an analysis pipeline. This is more useful for languages with word 32 | separation. 33 | 34 | :: 35 | 36 | >>> my_analyzer = StandardAnalyzer() | NgramFilter(minsize=2, maxsize=4) 37 | >>> [token.text for token in my_analyzer(u"rendering shaders")] 38 | [u'ren', u'rend', u'end', u'ende', u'nde', u'nder', u'der', u'deri', u'eri', 39 | u'erin', u'rin', u'ring', u'ing', u'sha', u'shad', u'had', u'hade', u'ade', 40 | u'ader', u'der', u'ders', u'ers'] 41 | 42 | Whoosh includes two pre-configured field types for N-grams: 43 | :class:`whoosh.fields.NGRAM` and :class:`whoosh.fields.NGRAMWORDS`.
The only 44 | difference is that ``NGRAM`` runs all text through the N-gram filter, including 45 | whitespace and punctuation, while ``NGRAMWORDS`` extracts words from the text 46 | using a tokenizer, then runs each word through the N-gram filter. 47 | 48 | TBD. 49 | -------------------------------------------------------------------------------- /docs/source/query.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | Query objects 3 | ============= 4 | 5 | The classes in the :mod:`whoosh.query` module implement *queries* you can run against the index. 6 | 7 | TBD. 8 | 9 | See :doc:`searching` for how to search the index using query objects. 10 | -------------------------------------------------------------------------------- /docs/source/releases/0_3.rst: -------------------------------------------------------------------------------- 1 | ======================== 2 | Whoosh 0.3 release notes 3 | ======================== 4 | 5 | * Major improvements to reading/writing of postings and query performance. 6 | 7 | * Changed default post limit (run size) from 4 MB to 32 MB. 8 | 9 | * Finished migrating backend-specific code into the ``whoosh.filedb`` package. 10 | 11 | * Moved formats from the ``whoosh.fields`` module into the new ``whoosh.formats`` module. 12 | 13 | * DocReader and TermReader classes combined into new IndexReader interface. 14 | You can get an IndexReader implementation by calling Index.reader(). 15 | Searcher is now a wrapper around an IndexReader. 16 | 17 | * Range query object changed, with new signature and new syntax in the default 18 | query parser. Now you can use ``[start TO end]`` in the query parser for an 19 | inclusive range, and ``{start TO end}`` for an exclusive range. You can also 20 | mix the delimiters, for example ``[start TO end}`` for a range with an 21 | inclusive start but exclusive end term. 22 | 23 | * Added an experimental DATETIME field type that lets you pass a 24 | ``datetime.datetime`` object as a field value to ``add_document``:: 25 | 26 | from whoosh.fields import Schema, ID, DATETIME 27 | from whoosh.filedb.filestore import RamStorage 28 | from datetime import datetime 29 | 30 | schema = Schema(id=ID, date=DATETIME) 31 | storage = RamStorage() 32 | ix = storage.create_index(schema) 33 | w = ix.writer() 34 | w.add_document(id=u"A", date=datetime.now()) 35 | w.close() 36 | 37 | Internally, the DATETIME field indexes the datetime object as text using 38 | the format (4 digit year + 2 digit month + 2 digit day + 'T' + 2 digit hour + 39 | 2 digit minute + 2 digit second + 6 digit microsecond), for example 40 | ``20090817T160203109000``. 41 | 42 | * The default query parser now lets you use quoted strings in prefix and range 43 | queries, e.g. ``["2009-05" TO "2009-12"]``, ``"alfa/bravo"*``, making it 44 | easier to work with terms containing special characters. 45 | 46 | * ``DocReader.vector_as(docnum, fieldid, astype)`` is now 47 | ``IndexReader.vector_as(astype, docnum, fieldid)`` (i.e. the astype argument 48 | has moved from the last to the first argument), e.g. 49 | ``v = ixreader.vector_as("frequency", 102, "content")``. 50 | 51 | * Added ``whoosh.support.charset`` for translating Sphinx charset table files. 52 | 53 | * Added ``whoosh.analysis.CharsetTokenizer`` and ``CharsetFilter`` to enable case and 54 | accent folding. 55 | 56 | * Added an experimental ``whoosh.ramdb`` in-memory backend. 57 | 58 | * Added an experimental ``whoosh.query.FuzzyTerm`` query type.
59 | 60 | * Added the ``whoosh.lang.wordnet`` module containing a ``Thesaurus`` object for using 61 | the WordNet synonym database. 62 | -------------------------------------------------------------------------------- /docs/source/releases/index.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | Release notes 3 | ============= 4 | 5 | .. toctree:: 6 | :maxdepth: 2 7 | 8 | 2_0 9 | 1_0 10 | 0_3 11 | -------------------------------------------------------------------------------- /docs/source/spelling.rst: -------------------------------------------------------------------------------- 1 | ===================================================== 2 | "Did you mean... ?" Correcting errors in user queries 3 | ===================================================== 4 | 5 | Overview 6 | ======== 7 | 8 | Whoosh can quickly suggest replacements for mistyped words by returning 9 | a list of words from the index (or a dictionary) that are close to the 10 | mistyped word:: 11 | 12 | with ix.searcher() as s: 13 | corrector = s.corrector("text") 14 | for mistyped_word in mistyped_words: 15 | print(corrector.suggest(mistyped_word, limit=3)) 16 | 17 | See the :meth:`whoosh.spelling.Corrector.suggest` method documentation 18 | for information on the arguments. 19 | 20 | Currently the suggestion engine is more like a "typo corrector" than a 21 | real "spell checker", since it doesn't do the kind of sophisticated 22 | phonetic matching or semantic/contextual analysis a good spell checker 23 | might. However, it is still very useful. 24 | 25 | There are two main strategies for correcting words: 26 | 27 | * Use the terms from an index field. 28 | 29 | * Use words from a word list. 30 | 31 | 32 | Pulling suggestions from an indexed field 33 | ========================================= 34 | 35 | In Whoosh 2.7 and later, spelling suggestions are available on all fields. 36 | However, if you have an analyzer that modifies the indexed words (such as 37 | stemming), you can add ``spelling=True`` to a field to have it store separate 38 | unmodified versions of the terms for spelling suggestions:: 39 | 40 | ana = analysis.StemmingAnalyzer() 41 | schema = fields.Schema(text=fields.TEXT(analyzer=ana, spelling=True)) 42 | 43 | You can then use the :meth:`whoosh.searching.Searcher.corrector` method 44 | to get a corrector for a field:: 45 | 46 | corrector = searcher.corrector("content") 47 | 48 | The advantage of using the contents of an index field is that when you 49 | are spell checking queries on that index, the suggestions are tailored 50 | to the contents of the index. The disadvantage is that if the indexed 51 | documents contain spelling errors, then the spelling suggestions will 52 | also be erroneous. 53 | 54 | 55 | Pulling suggestions from a word list 56 | ==================================== 57 | 58 | There are plenty of word lists available on the internet that you can use to 59 | populate the spelling dictionary. 60 | 61 | (In the following examples, ``word_list`` can be a list of unicode 62 | strings, or a file object with one word on each line.)
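For example, a minimal sketch of turning a word file into the sorted list the correctors below expect (``words.txt`` is an assumed path, one word per line)::

    with open("words.txt", encoding="utf-8") as f:
        word_list = sorted(set(line.strip() for line in f if line.strip()))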
63 | 64 | To create a :class:`whoosh.spelling.Corrector` object from a sorted word list:: 65 | 66 | from whoosh.spelling import ListCorrector 67 | 68 | # word_list must be a sorted list of unicode strings 69 | corrector = ListCorrector(word_list) 70 | 71 | 72 | Merging two or more correctors 73 | ============================== 74 | 75 | You can combine suggestions from two sources (for example, the contents 76 | of an index field and a word list) using a 77 | :class:`whoosh.spelling.MultiCorrector`:: 78 | 79 | c1 = searcher.corrector("content") 80 | c2 = spelling.ListCorrector(word_list) 81 | corrector = spelling.MultiCorrector([c1, c2]) 82 | 83 | 84 | Correcting user queries 85 | ======================= 86 | 87 | You can spell-check a user query using the 88 | :meth:`whoosh.searching.Searcher.correct_query` method:: 89 | 90 | from whoosh import qparser 91 | 92 | # Parse the user query string 93 | qp = qparser.QueryParser("content", myindex.schema) 94 | q = qp.parse(qstring) 95 | 96 | # Try correcting the query 97 | with myindex.searcher() as s: 98 | corrected = s.correct_query(q, qstring) 99 | if corrected.query != q: 100 | print("Did you mean:", corrected.string) 101 | 102 | The ``correct_query`` method returns an object with the following 103 | attributes: 104 | 105 | ``query`` 106 | A corrected :class:`whoosh.query.Query` tree. You can test 107 | whether this is equal (``==``) to the original parsed query to 108 | check if the corrector actually changed anything. 109 | 110 | ``string`` 111 | A corrected version of the user's query string. 112 | 113 | ``tokens`` 114 | A list of corrected token objects representing the corrected 115 | terms. You can use this to reformat the user query (see below). 116 | 117 | 118 | You can use a :class:`whoosh.highlight.Formatter` object to format the 119 | corrected query string. For example, use the 120 | :class:`~whoosh.highlight.HtmlFormatter` to format the corrected string 121 | as HTML:: 122 | 123 | from whoosh import highlight 124 | 125 | hf = highlight.HtmlFormatter() 126 | corrected = s.correct_query(q, qstring, formatter=hf) 127 | 128 | See the documentation for 129 | :meth:`whoosh.searching.Searcher.correct_query` for information on the 130 | defaults and arguments. 131 | -------------------------------------------------------------------------------- /docs/source/tech/filedb.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | filedb notes 3 | ============ 4 | 5 | TBD. 6 | 7 | Files created 8 | ============= 9 | 10 | .toc 11 | The "master" file containing information about the index and its segments. 12 | 13 | The index directory will contain a set of files for each segment. A segment is like a mini-index -- when you add documents to the index, Whoosh creates a new segment and then searches the old segment(s) and the new segment to avoid having to do a big merge every time you add a document. When you get enough small segments, Whoosh will merge them into larger segments or a single segment. 14 | 15 | .dci 16 | Contains per-document information (e.g. field lengths). This will grow linearly with the number of documents. 17 | 18 | .dcz 19 | Contains the stored fields for each document. 20 | 21 | .tiz 22 | Contains per-term information. The size of this file will vary based on the number of unique terms. 23 | 24 | .pst 25 | Contains per-term postings. The size of this file depends on the size of the collection and the formats used for each field (e.g.
storing term positions takes more space than storing frequency only). 26 | 27 | .fvz 28 | Contains term vectors (forward indexes) for each document. This file is only created if at least one field in the schema stores term vectors. The size will vary based on the number of documents, field length, the formats used for each vector (e.g. storing term positions takes more space than storing frequency only), etc. 29 | -------------------------------------------------------------------------------- /docs/source/tech/index.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | Technical notes 3 | =============== 4 | 5 | .. toctree:: 6 | :glob: 7 | :maxdepth: 2 8 | 9 | * 10 | -------------------------------------------------------------------------------- /docs/source/threads.rst: -------------------------------------------------------------------------------- 1 | ==================================== 2 | Concurrency, locking, and versioning 3 | ==================================== 4 | 5 | Concurrency 6 | =========== 7 | 8 | The ``FileIndex`` object is "stateless" and should be shareable between 9 | threads. 10 | 11 | A ``Reader`` object (which underlies the ``Searcher`` object) wraps open files, and 12 | individual methods often rely on consistent file cursor positions (e.g. they do two 13 | ``file.read()``\ s in a row, so if another thread moves the cursor between the two 14 | read calls, Bad Things would happen). You should use one Reader/Searcher per 15 | thread in your code. 16 | 17 | Readers/Searchers tend to cache information (such as field caches for sorting), 18 | so if you can share one across multiple search requests, it's a big performance 19 | win. 20 | 21 | 22 | Locking 23 | ======= 24 | 25 | Only one thread/process can write to an index at a time. When you open a writer, 26 | it locks the index. If you try to open a writer on the same index in another 27 | thread/process, it will raise ``whoosh.store.LockError``. 28 | 29 | In a multi-threaded or multi-process environment, your code needs to be aware 30 | that opening a writer may raise this exception if a writer is already open. 31 | Whoosh includes a couple of example implementations 32 | (:class:`whoosh.writing.AsyncWriter` and :class:`whoosh.writing.BufferedWriter`) 33 | of ways to work around the write lock. 34 | 35 | While the writer is open and during the commit, **the index is still available 36 | for reading**. Existing readers are unaffected and new readers can open the 37 | current index normally. 38 | 39 | 40 | Lock files 41 | ---------- 42 | 43 | Locking the index is accomplished by acquiring an exclusive file lock on the 44 | ``_WRITELOCK`` file in the index directory. The file is not deleted 45 | after the file lock is released, so the fact that the file exists **does not** 46 | mean the index is locked. 47 | 48 | 49 | Versioning 50 | ========== 51 | 52 | When you open a reader/searcher, the reader represents a view of the **current 53 | version** of the index. If someone writes changes to the index, any readers 54 | that are already open **will not** pick up the changes automatically. A reader 55 | always sees the index as it existed when the reader was opened. 56 | 57 | If you are reusing a Searcher across multiple search requests, you can check 58 | whether the Searcher is a view of the latest version of the index using 59 | :meth:`whoosh.searching.Searcher.up_to_date`.
If the searcher is not up to date, 60 | you can get an up-to-date copy of the searcher using 61 | :meth:`whoosh.searching.Searcher.refresh`:: 62 | 63 | # If 'searcher' is not up-to-date, replace it 64 | searcher = searcher.refresh() 65 | 66 | (If the searcher has the latest version of the index, ``refresh()`` simply 67 | returns it.) 68 | 69 | Calling ``Searcher.refresh()`` is more efficient than closing the searcher and 70 | opening a new one, since it will reuse any underlying readers and caches that 71 | haven't changed. 72 | -------------------------------------------------------------------------------- /files/whoosh_16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sygil-Dev/whoosh-reloaded/ab775ca5bfb443c2b3598b6a2d6189e595f7c377/files/whoosh_16.png -------------------------------------------------------------------------------- /files/whoosh_35.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sygil-Dev/whoosh-reloaded/ab775ca5bfb443c2b3598b6a2d6189e595f7c377/files/whoosh_35.png -------------------------------------------------------------------------------- /files/whoosh_64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sygil-Dev/whoosh-reloaded/ab775ca5bfb443c2b3598b6a2d6189e595f7c377/files/whoosh_64.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.ruff] 2 | target-version = "py38" 3 | 4 | [tool.ruff.lint] 5 | select = [ 6 | "AIR", # Airflow 7 | "ASYNC", # flake8-async 8 | "BLE", # flake8-blind-except 9 | "C4", # flake8-comprehensions 10 | "C90", # McCabe cyclomatic complexity 11 | "DJ", # flake8-django 12 | "DTZ", # flake8-datetimez 13 | "EXE", # flake8-executable 14 | "F", # Pyflakes 15 | "FA", # flake8-future-annotations 16 | "G", # flake8-logging-format 17 | "I", # isort 18 | "ICN", # flake8-import-conventions 19 | "INT", # flake8-gettext 20 | "LOG", # flake8-logging 21 | "NPY", # NumPy-specific rules 22 | "PERF", # Perflint 23 | "PLC", # Pylint conventions 24 | "PLE", # Pylint errors 25 | "PLR091", # Pylint Refactor just for max-args, max-branches, etc.
26 | "PYI", # flake8-pyi 27 | "Q", # flake8-quotes 28 | "SLOT", # flake8-slots 29 | "TCH", # flake8-type-checking 30 | "TID", # flake8-tidy-imports 31 | "TRIO", # flake8-trio 32 | "UP", # pyupgrade 33 | "W", # pycodestyle 34 | "YTT", # flake8-2020 35 | # "A", # flake8-builtins 36 | # "ANN", # flake8-annotations 37 | # "ARG", # flake8-unused-arguments 38 | # "B", # flake8-bugbear 39 | # "COM", # flake8-commas 40 | # "CPY", # flake8-copyright 41 | # "D", # pydocstyle 42 | # "E", # pycodestyle 43 | # "EM", # flake8-errmsg 44 | # "ERA", # eradicate 45 | # "FBT", # flake8-boolean-trap 46 | # "FIX", # flake8-fixme 47 | # "FLY", # flynt 48 | # "FURB", # refurb 49 | # "INP", # flake8-no-pep420 50 | # "ISC", # flake8-implicit-str-concat 51 | # "N", # pep8-naming 52 | # "PD", # pandas-vet 53 | # "PGH", # pygrep-hooks 54 | # "PIE", # flake8-pie 55 | # "PL", # Pylint 56 | # "PT", # flake8-pytest-style 57 | # "PTH", # flake8-use-pathlib 58 | # "RET", # flake8-return 59 | # "RSE", # flake8-raise 60 | # "RUF", # Ruff-specific rules 61 | # "S", # flake8-bandit 62 | # "SIM", # flake8-simplify 63 | # "SLF", # flake8-self 64 | # "T10", # flake8-debugger 65 | # "T20", # flake8-print 66 | # "TD", # flake8-todos 67 | # "TRY", # tryceratops 68 | ] 69 | ignore = [ 70 | "EXE001", 71 | "F401", 72 | "F811", 73 | "F841", 74 | "PERF203", 75 | "UP031", 76 | ] 77 | 78 | [tool.ruff.lint.mccabe] 79 | max-complexity = 45 # Default is 10 80 | 81 | [tool.ruff.lint.per-file-ignores] 82 | "src/whoosh/compat.py" = ["F821"] 83 | "src/whoosh/filedb/filestore.py" = ["UP024"] 84 | "src/whoosh/util/__init__.py" = ["F821"] 85 | 86 | [tool.ruff.lint.pylint] 87 | max-args = 22 # Default is 5 88 | max-branches = 79 # Default is 12 89 | max-returns = 16 # Default is 6 90 | max-statements = 256 # Default is 50 91 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pythomata 3 | versioneer 4 | -e . 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | . 
2 | -------------------------------------------------------------------------------- /scripts/make_checkpoint.py: -------------------------------------------------------------------------------- 1 | #!python 2 | 3 | # Make a "checkpoint" index, capturing the index format created by a certain 4 | # version of Whoosh 5 | 6 | 7 | import os.path 8 | import random 9 | import sys 10 | from datetime import datetime, timezone 11 | 12 | from whoosh import fields, index 13 | 14 | if len(sys.argv) < 2: 15 | print("USAGE: make_checkpoint.py ") 16 | sys.exit(1) 17 | indexdir = sys.argv[1] 18 | print("Creating checkpoint index in", indexdir) 19 | 20 | schema = fields.Schema( 21 | path=fields.ID(stored=True, unique=True), 22 | num=fields.NUMERIC(int, stored=True), 23 | frac=fields.NUMERIC(float, stored=True), 24 | dt=fields.DATETIME(stored=True), 25 | tag=fields.KEYWORD, 26 | title=fields.TEXT(stored=True), 27 | ngrams=fields.NGRAMWORDS, 28 | ) 29 | 30 | # Note the trailing spaces inside the string literals: without them the
implicit concatenation would fuse "india"+"juliet" and "romeo"+"sierra" 31 | words = ( 32 | "alfa bravo charlie delta echo foxtrot golf hotel india " 33 | "juliet kilo lima mike november oskar papa quebec romeo " 34 | "sierra tango" 35 | ).split() 36 | 37 | if not os.path.exists(indexdir): 38 | os.makedirs(indexdir) 39 | 40 | ix = index.create_in(indexdir, schema) 41 | counter = 0 42 | frac = 0.0 43 | for segnum in range(3): 44 | with ix.writer() as w: 45 | for num in range(100): 46 | frac += 0.15 47 | path = f"{segnum}/{num}" 48 | title = " ".join(random.choice(words) for _ in range(100)) 49 | dt = datetime( 50 | year=2000 + counter, 51 | month=(counter % 12) + 1, 52 | day=15, 53 | tzinfo=timezone.utc, 54 | ) 55 | 56 | w.add_document( 57 | path=path, 58 | num=counter, 59 | frac=frac, 60 | dt=dt, 61 | tag=words[counter % len(words)], 62 | title=title, 63 | ngrams=title, 64 | ) 65 | counter += 1 66 | 67 | with ix.writer() as w: 68 | for path in ("0/42", "1/6", "2/80"): 69 | print("Deleted", path, w.delete_by_term("path", path)) 70 | 71 | print(counter, ix.doc_count()) 72 | -------------------------------------------------------------------------------- /scripts/read_checkpoint.py: -------------------------------------------------------------------------------- 1 | #!python 2 | 3 | # Read a "checkpoint" index, to check backwards compatibility 4 | 5 | 6 | import sys 7 | 8 | from whoosh import index, query 9 | 10 | if len(sys.argv) < 2: 11 | print("USAGE: read_checkpoint.py ") 12 | sys.exit(1) 13 | indexdir = sys.argv[1] 14 | print("Reading checkpoint index in", indexdir) 15 | 16 | # Must match the word list in make_checkpoint.py (note the trailing spaces) 17 | words = ( 18 | "alfa bravo charlie delta echo foxtrot golf hotel india " 19 | "juliet kilo lima mike november oskar papa quebec romeo " 20 | "sierra tango" 21 | ).split() 22 | 23 | deleted = ("0/42", "1/6", "2/80") 24 | 25 | ix = index.open_dir(indexdir) 26 | with ix.searcher() as s: 27 | dtfield = ix.schema["dt"] 28 | for sf in s.all_stored_fields(): 29 | if sf["path"] in deleted: 30 | continue 31 | 32 | num = sf["num"] 33 | r = s.search(query.Term("num", num), limit=None) 34 | assert len(r) == 1 35 | assert r[0]["num"] == num 36 | 37 | frac = sf["frac"] 38 | r = s.search(query.Term("frac", frac), limit=None) 39 | assert len(r) == 1 40 | assert r[0]["frac"] == frac 41 | 42 | dt = sf["dt"] 43 | q = query.Term("dt", dt) 44 | r = s.search(q, limit=None) 45 | if len(r) > 1: 46 | for hit in r: 47 | print(hit.fields()) 48 | assert len(r) == 1, len(r) 49 | assert r[0]["dt"] == dt 50 | 51 | print("Done") 52 | -------------------------------------------------------------------------------- /setup.cfg:
-------------------------------------------------------------------------------- 1 | [wheel] 2 | universal = True 3 | 4 | [build_sphinx] 5 | build-dir = docs/build 6 | source-dir = docs/source 7 | 8 | [upload_sphinx] 9 | upload-dir = docs/build/html 10 | 11 | [sdist] 12 | formats = zip,gztar 13 | 14 | [aliases] 15 | push = sdist bdist_wheel twine upload 16 | pushdocs = build_sphinx upload_sphinx 17 | 18 | [tool:pytest] 19 | ; --tb= traceback print mode (long/short/line/native/no) 20 | addopts = -rs --tb=short 21 | 22 | norecursedirs = .hg .tox _build tmp* env* benchmark stress 23 | minversion = 3.0 24 | python_files = test_*.py 25 | 26 | [tool.coverage.run] 27 | source = ["src/whoosh"] 28 | 29 | [tool.pytest.ini_options] 30 | addopts = "--cov --cov-report=lcov:lcov.info --cov-report=term" 31 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!python 2 | 3 | import os.path 4 | import sys 5 | 6 | from setuptools import find_packages, setup 7 | from setuptools.command.test import test as TestCommand 8 | 9 | try: 10 | import pytest 11 | except ImportError: 12 | pytest = None 13 | 14 | sys.path.insert(0, os.path.abspath("src")) 15 | from whoosh import versionstring 16 | 17 | 18 | class PyTest(TestCommand): 19 | def finalize_options(self): 20 | TestCommand.finalize_options(self) 21 | self.test_args = [] 22 | self.test_suite = True 23 | 24 | def run_tests(self): 25 | # import here, cause outside the eggs aren't loaded 26 | import pytest 27 | 28 | pytest.main(self.test_args) 29 | 30 | 31 | if __name__ == "__main__": 32 | setup( 33 | name="Whoosh-Reloaded", 34 | version=versionstring(), 35 | package_dir={"": "src"}, 36 | packages=find_packages("src"), 37 | author="Matt Chaput", 38 | author_email="matt@whoosh.ca", 39 | maintainer="Sygil-Dev", 40 | description="Fast, pure-Python full text indexing, search, and spell checking library.", 41 | long_description=open("README.md").read(), 42 | long_description_content_type="text/markdown", 43 | license="Two-clause BSD license", 44 | keywords="index search text spell", 45 | url="https://github.com/Sygil-Dev/whoosh-reloaded", 46 | zip_safe=True, 47 | install_requires=[ 48 | "cached-property==2.0.1", 49 | "loguru==0.7.3", 50 | ], 51 | tests_require=[ 52 | "pytest==8.4.0", 53 | "nose==1.3.7", 54 | "pre-commit==4.2.0", 55 | ], 56 | cmdclass={"test": PyTest}, 57 | classifiers=[ 58 | "Programming Language :: Python :: 3", 59 | "Development Status :: 5 - Production/Stable", 60 | "Intended Audience :: Developers", 61 | "License :: OSI Approved :: BSD License", 62 | "Natural Language :: English", 63 | "Operating System :: OS Independent", 64 | "Programming Language :: Python :: 3.8", 65 | "Programming Language :: Python :: 3.9", 66 | "Programming Language :: Python :: 3.10", 67 | "Programming Language :: Python :: 3.11", 68 | "Programming Language :: Python :: 3.12", 69 | "Topic :: Software Development :: Libraries :: Python Modules", 70 | "Topic :: Text Processing :: Indexing", 71 | ], 72 | ) 73 | -------------------------------------------------------------------------------- /src/whoosh/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2008 Matt Chaput. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. 
Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # 13 | # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | # 24 | # The views and conclusions contained in the software and documentation are 25 | # those of the authors and should not be interpreted as representing official 26 | # policies, either expressed or implied, of Matt Chaput. 27 | 28 | __version__ = (3, 0, 0) 29 | 30 | 31 | def versionstring(build=True, extra=True): 32 | """Returns the version number of Whoosh as a string. 33 | 34 | :param build: Whether to include the build number in the string. 35 | :param extra: Whether to include alpha/beta/rc etc. tags. Only 36 | checked if build is True. 37 | :rtype: str 38 | """ 39 | 40 | if build: 41 | first = 3 42 | else: 43 | first = 2 44 | 45 | s = ".".join(str(n) for n in __version__[:first]) 46 | if build and extra: 47 | s += "".join(str(n) for n in __version__[3:]) 48 | 49 | return s 50 | -------------------------------------------------------------------------------- /src/whoosh/analysis/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2007 Matt Chaput. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # 13 | # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
23 | # 24 | # The views and conclusions contained in the software and documentation are 25 | # those of the authors and should not be interpreted as representing official 26 | # policies, either expressed or implied, of Matt Chaput. 27 | 28 | """Classes and functions for turning a piece of text into an indexable stream 29 | of "tokens" (usually equivalent to words). There are three general classes 30 | involved in analysis: 31 | 32 | * Tokenizers are always at the start of the text processing pipeline. They take 33 | a string and yield Token objects (actually, the same token object over and 34 | over, for performance reasons) corresponding to the tokens (words) in the 35 | text. 36 | 37 | Every tokenizer is a callable that takes a string and returns an iterator of 38 | tokens. 39 | 40 | * Filters take the tokens from the tokenizer and perform various 41 | transformations on them. For example, the LowercaseFilter converts all tokens 42 | to lowercase, which is usually necessary when indexing regular English text. 43 | 44 | Every filter is a callable that takes a token generator and returns a token 45 | generator. 46 | 47 | * Analyzers are convenience functions/classes that "package up" a tokenizer and 48 | zero or more filters into a single unit. For example, the StandardAnalyzer 49 | combines a RegexTokenizer, LowercaseFilter, and StopFilter. 50 | 51 | Every analyzer is a callable that takes a string and returns a token 52 | iterator. (So Tokenizers can be used as Analyzers if you don't need any 53 | filtering). 54 | 55 | You can compose tokenizers and filters together using the ``|`` character:: 56 | 57 | my_analyzer = RegexTokenizer() | LowercaseFilter() | StopFilter() 58 | 59 | The first item must be a tokenizer and the rest must be filters (you can't put 60 | a filter first or a tokenizer after the first item). 
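For example, a minimal sketch of composing and calling an analyzer (the exact
tokens produced depend on the tokenizer pattern and the filters used)::

    my_analyzer = RegexTokenizer() | LowercaseFilter()
    print([t.text for t in my_analyzer("Hello There")])
    # -> ["hello", "there"]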
61 | """ 62 | 63 | from whoosh.analysis.acore import ( 64 | Composable, 65 | CompositionError, 66 | Token, 67 | entoken, 68 | unstopped, 69 | ) 70 | from whoosh.analysis.analyzers import ( 71 | Analyzer, 72 | FancyAnalyzer, 73 | IDAnalyzer, 74 | KeywordAnalyzer, 75 | LanguageAnalyzer, 76 | RegexAnalyzer, 77 | SimpleAnalyzer, 78 | StandardAnalyzer, 79 | StemmingAnalyzer, 80 | ) 81 | from whoosh.analysis.filters import ( 82 | STOP_WORDS, 83 | CharsetFilter, 84 | Composable, 85 | DelimitedAttributeFilter, 86 | Filter, 87 | LoggingFilter, 88 | LowercaseFilter, 89 | MultiFilter, 90 | PassFilter, 91 | ReverseTextFilter, 92 | StopFilter, 93 | StripFilter, 94 | SubstitutionFilter, 95 | TeeFilter, 96 | url_pattern, 97 | ) 98 | from whoosh.analysis.intraword import ( 99 | BiWordFilter, 100 | CompoundWordFilter, 101 | IntraWordFilter, 102 | ShingleFilter, 103 | ) 104 | from whoosh.analysis.morph import DoubleMetaphoneFilter, PyStemmerFilter, StemFilter 105 | from whoosh.analysis.ngrams import ( 106 | NgramAnalyzer, 107 | NgramFilter, 108 | NgramTokenizer, 109 | NgramWordAnalyzer, 110 | ) 111 | from whoosh.analysis.tokenizers import ( 112 | CharsetTokenizer, 113 | CommaSeparatedTokenizer, 114 | IDTokenizer, 115 | PathTokenizer, 116 | RegexTokenizer, 117 | SpaceSeparatedTokenizer, 118 | Tokenizer, 119 | ) 120 | -------------------------------------------------------------------------------- /src/whoosh/automata/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sygil-Dev/whoosh-reloaded/ab775ca5bfb443c2b3598b6a2d6189e595f7c377/src/whoosh/automata/__init__.py -------------------------------------------------------------------------------- /src/whoosh/automata/glob.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012 Matt Chaput. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # 13 | # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | # 24 | # The views and conclusions contained in the software and documentation are 25 | # those of the authors and should not be interpreted as representing official 26 | # policies, either expressed or implied, of Matt Chaput. 
27 | 28 | from whoosh.automata.fsa import ANY, EPSILON, NFA 29 | 30 | # Constants for glob 31 | _LIT = 0 32 | _STAR = 1 33 | _PLUS = 2 34 | _QUEST = 3 35 | _RANGE = 4 36 | 37 | 38 | def parse_glob( 39 | pattern, _glob_multi="*", _glob_single="?", _glob_range1="[", _glob_range2="]" 40 | ): 41 | pos = 0 42 | last = None 43 | while pos < len(pattern): 44 | char = pattern[pos] 45 | pos += 1 46 | if char == _glob_multi: # * 47 | # (Ignore more than one star in a row) 48 | if last is not _STAR: 49 | yield _STAR, None 50 | last = _STAR 51 | elif char == _glob_single: # ? 52 | # (Ignore ? after a star) 53 | if last is not _STAR: 54 | yield _QUEST, None 55 | last = _QUEST 56 | elif char == _glob_range1: # [ 57 | chars = set() 58 | negate = False 59 | # Take the char range specification until the ] 60 | while pos < len(pattern): 61 | char = pattern[pos] 62 | pos += 1 63 | if char == _glob_range2: 64 | break 65 | chars.add(char) 66 | if chars: 67 | yield _RANGE, (chars, negate) 68 | last = _RANGE 69 | else: 70 | yield _LIT, char 71 | last = _LIT 72 | 73 | 74 | def glob_automaton(pattern): 75 | nfa = NFA(0) 76 | i = -1 77 | for i, (op, arg) in enumerate(parse_glob(pattern)): 78 | if op is _LIT: 79 | nfa.add_transition(i, arg, i + 1) 80 | elif op is _STAR: 81 | nfa.add_transition(i, ANY, i + 1) 82 | nfa.add_transition(i, EPSILON, i + 1) 83 | nfa.add_transition(i + 1, EPSILON, i) 84 | elif op is _QUEST: 85 | nfa.add_transition(i, ANY, i + 1) 86 | elif op is _RANGE: 87 | for char in arg[0]: 88 | nfa.add_transition(i, char, i + 1) 89 | nfa.add_final_state(i + 1) 90 | return nfa 91 | -------------------------------------------------------------------------------- /src/whoosh/automata/lev.py: -------------------------------------------------------------------------------- 1 | from whoosh.automata.fsa import ANY, EPSILON, NFA 2 | 3 | 4 | def levenshtein_automaton(term, k, prefix=0): 5 | nfa = NFA((0, 0)) 6 | if prefix: 7 | for i in range(prefix): 8 | c = term[i] 9 | nfa.add_transition((i, 0), c, (i + 1, 0)) 10 | 11 | for i in range(prefix, len(term)): 12 | c = term[i] 13 | for e in range(k + 1): 14 | # Correct character 15 | nfa.add_transition((i, e), c, (i + 1, e)) 16 | if e < k: 17 | # Deletion 18 | nfa.add_transition((i, e), ANY, (i, e + 1)) 19 | # Insertion 20 | nfa.add_transition((i, e), EPSILON, (i + 1, e + 1)) 21 | # Substitution 22 | nfa.add_transition((i, e), ANY, (i + 1, e + 1)) 23 | for e in range(k + 1): 24 | if e < k: 25 | nfa.add_transition((len(term), e), ANY, (len(term), e + 1)) 26 | nfa.add_final_state((len(term), e)) 27 | return nfa 28 | -------------------------------------------------------------------------------- /src/whoosh/automata/reg.py: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Matt Chaput. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 
12 | # 13 | # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | # 24 | # The views and conclusions contained in the software and documentation are 25 | # those of the authors and should not be interpreted as representing official 26 | # policies, either expressed or implied, of Matt Chaput. 27 | 28 | from whoosh.automata.fsa import ANY, EPSILON, NFA 29 | 30 | # Operator precedence 31 | CHOICE = ("|",) 32 | ops = () 33 | 34 | 35 | def parse(pattern):  # incomplete stub: the regex parser is not implemented yet 36 | stack = [] 37 | ops = [] 38 | 39 | 40 | class RegexBuilder: 41 | def __init__(self): 42 | self.statenum = 1 43 | 44 | def new_state(self): 45 | self.statenum += 1 46 | return self.statenum 47 | 48 | def epsilon(self): 49 | s = self.new_state() 50 | e = self.new_state() 51 | nfa = NFA(s) 52 | nfa.add_transition(s, EPSILON, e) 53 | nfa.add_final_state(e) 54 | return nfa 55 | 56 | def char(self, label): 57 | s = self.new_state() 58 | e = self.new_state() 59 | nfa = NFA(s) 60 | nfa.add_transition(s, label, e) 61 | nfa.add_final_state(e) 62 | return nfa 63 | 64 | def charset(self, chars): 65 | s = self.new_state() 66 | e = self.new_state() 67 | nfa = NFA(s) 68 | for char in chars: 69 | nfa.add_transition(s, char, e) 70 | nfa.add_final_state(e) 71 | return nfa  # return the automaton itself, consistent with the other builders 72 | 73 | def dot(self): 74 | s = self.new_state() 75 | e = self.new_state() 76 | nfa = NFA(s) 77 | nfa.add_transition(s, ANY, e) 78 | nfa.add_final_state(e) 79 | return nfa 80 | 81 | def choice(self, n1, n2): 82 | s = self.new_state() 83 | s1 = self.new_state() 84 | s2 = self.new_state() 85 | e1 = self.new_state() 86 | e2 = self.new_state() 87 | e = self.new_state() 88 | nfa = NFA(s) 89 | nfa.add_transition(s, EPSILON, s1) 90 | nfa.add_transition(s, EPSILON, s2) 91 | nfa.insert(s1, n1, e1) 92 | nfa.insert(s2, n2, e2) 93 | nfa.add_transition(e1, EPSILON, e) 94 | nfa.add_transition(e2, EPSILON, e) 95 | nfa.add_final_state(e) 96 | return nfa 97 | 98 | def concat(self, n1, n2): 99 | s = self.new_state() 100 | m = self.new_state() 101 | e = self.new_state() 102 | nfa = NFA(s) 103 | nfa.insert(s, n1, m) 104 | nfa.insert(m, n2, e) 105 | nfa.add_final_state(e) 106 | return nfa 107 | 108 | def star(self, n): 109 | s = self.new_state() 110 | m1 = self.new_state() 111 | m2 = self.new_state() 112 | e = self.new_state() 113 | nfa = NFA(s) 114 | nfa.add_transition(s, EPSILON, m1) 115 | nfa.add_transition(s, EPSILON, e) 116 | nfa.insert(m1, n, m2) 117 | nfa.add_transition(m2, EPSILON, m1) 118 | nfa.add_transition(m2, EPSILON, e) 119 | nfa.add_final_state(e) 120 | return nfa 121 | 122 | def plus(self, n): 123 | return self.concat(n, self.star(n)) 124 | 125 | def question(self, n): 126 | return self.choice(n, self.epsilon()) 127 | -------------------------------------------------------------------------------- /src/whoosh/codec/__init__.py:
-------------------------------------------------------------------------------- 1 | # Copyright 2012 Matt Chaput. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # 13 | # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | # 24 | # The views and conclusions contained in the software and documentation are 25 | # those of the authors and should not be interpreted as representing official 26 | # policies, either expressed or implied, of Matt Chaput. 27 | 28 | 29 | def default_codec(*args, **kwargs): 30 | from whoosh.codec.whoosh3 import W3Codec 31 | 32 | return W3Codec(*args, **kwargs) 33 | -------------------------------------------------------------------------------- /src/whoosh/filedb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sygil-Dev/whoosh-reloaded/ab775ca5bfb443c2b3598b6a2d6189e595f7c377/src/whoosh/filedb/__init__.py -------------------------------------------------------------------------------- /src/whoosh/filedb/misc.py: -------------------------------------------------------------------------------- 1 | # =============================================================================== 2 | # Copyright 2010 Matt Chaput 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # =============================================================================== 16 | 17 | from marshal import dumps as mdumps 18 | from marshal import loads as mloads 19 | from pickle import dumps, loads 20 | from struct import Struct 21 | 22 | from whoosh.system import ( 23 | _SHORT_SIZE, 24 | pack_uint, 25 | pack_ushort, 26 | unpack_uint, 27 | unpack_ushort, 28 | ) 29 | from whoosh.util import utf8decode, utf8encode 30 | 31 | 32 | def encode_termkey(term): 33 | fieldnum, text = term 34 | return pack_ushort(fieldnum) + utf8encode(text)[0] 35 | 36 | 37 | def decode_termkey(key): 38 | return (unpack_ushort(key[:_SHORT_SIZE])[0], utf8decode(key[_SHORT_SIZE:])[0]) 39 | 40 | 41 | _terminfo_struct = Struct("!III") # frequency, offset, postcount 42 | _pack_terminfo = _terminfo_struct.pack 43 | encode_terminfo = lambda cf_offset_df: _pack_terminfo(*cf_offset_df) 44 | decode_terminfo = _terminfo_struct.unpack 45 | 46 | encode_docnum = pack_uint 47 | decode_docnum = lambda x: unpack_uint(x)[0] 48 | 49 | enpickle = lambda data: dumps(data, -1) 50 | depickle = loads 51 | 52 | enmarshal = mdumps 53 | demarshal = mloads 54 | -------------------------------------------------------------------------------- /src/whoosh/lang/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012 Matt Chaput. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # 13 | # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | # 24 | # The views and conclusions contained in the software and documentation are 25 | # those of the authors and should not be interpreted as representing official 26 | # policies, either expressed or implied, of Matt Chaput. 
# Exceptions


class NoStemmer(Exception):
    pass


class NoStopWords(Exception):
    pass


# Data and functions for language names

languages = (
    "ar",
    "da",
    "nl",
    "en",
    "fi",
    "fr",
    "de",
    "hu",
    "it",
    "no",
    "pt",
    "ro",
    "ru",
    "es",
    "sv",
    "tr",
)

aliases = {
    # By ISO 639-2 three-letter codes (the two-letter codes above are ISO 639-1)
    "ara": "ar",
    "dan": "da",
    "nld": "nl",
    "eng": "en",
    "fin": "fi",
    "fra": "fr",
    "deu": "de",
    "hun": "hu",
    "ita": "it",
    "nor": "no",
    "por": "pt",
    "ron": "ro",
    "rus": "ru",
    "spa": "es",
    "swe": "sv",
    "tur": "tr",
    # By name in English
    "arabic": "ar",
    "danish": "da",
    "dutch": "nl",
    "english": "en",
    "finnish": "fi",
    "french": "fr",
    "german": "de",
    "hungarian": "hu",
    "italian": "it",
    "norwegian": "no",
    "portuguese": "pt",
    "romanian": "ro",
    "russian": "ru",
    "spanish": "es",
    "swedish": "sv",
    "turkish": "tr",
    # By name in own language
    "العربية": "ar",
    "dansk": "da",
    "nederlands": "nl",
    "suomi": "fi",
    "français": "fr",
    "deutsch": "de",
    "magyar": "hu",
    "italiano": "it",
    "norsk": "no",
    "português": "pt",
    "русский язык": "ru",
    "español": "es",
    "svenska": "sv",
    "türkçe": "tr",
}


def two_letter_code(name):
    if name in languages:
        return name
    if name in aliases:
        return aliases[name]
    return None


# Getter functions


def has_stemmer(lang):
    try:
        return bool(stemmer_for_language(lang))
    except NoStemmer:
        return False


def has_stopwords(lang):
    try:
        return bool(stopwords_for_language(lang))
    except NoStopWords:
        return False


def stemmer_for_language(lang):
    if lang == "en_porter":
        # Original porter stemming algorithm is several times faster than the
        # more correct porter2 algorithm in snowball package
        from .porter import stem as porter_stem

        return porter_stem

    tlc = two_letter_code(lang)

    if tlc == "ar":
        from .isri import ISRIStemmer

        return ISRIStemmer().stem

    from .snowball import classes as snowball_classes

    if tlc in snowball_classes:
        return snowball_classes[tlc]().stem

    raise NoStemmer(f"No stemmer available for {lang!r}")


def stopwords_for_language(lang):
    from .stopwords import stoplists

    tlc = two_letter_code(lang)
    if tlc in stoplists:
        return stoplists[tlc]

    raise NoStopWords(f"No stop-word list available for {lang!r}")
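# A quick illustrative sketch (not from the source) of the helpers above:
#
#     >>> two_letter_code("german")
#     'de'
#     >>> two_letter_code("deu")
#     'de'
#     >>> stem = stemmer_for_language("en_porter")
#     >>> stem("relational")
#     'relat'
-------------------------------------------------------------------------------- /src/whoosh/lang/phonetic.py: --------------------------------------------------------------------------------
"""
This module contains quasi-phonetic encoders for words in different languages.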
"""

import re

# This soundex implementation is adapted from the recipe here:
# http://code.activestate.com/recipes/52213/

english_codes = "01230120022455012623010202"


def soundex_en(word):
    # english_codes (above) maps each letter a-z to its soundex digit
    r = ""
    if word:
        # Remember the first character
        fc = None
        prevcode = None
        for char in word.lower():
            c = ord(char)
            if c >= 97 and c <= 122:  # a-z
                if not fc:
                    fc = char
                code = english_codes[c - 97]
                # Don't append the code if it's the same as the previous
                if code != prevcode:
                    r += code
                prevcode = code

        # Replace the first digit with the first alphabetic character; the
        # guard avoids a TypeError for words with no a-z characters at all
        if fc:
            r = fc + r[1:]

    return r


# Quasi-phonetic coder for Spanish, translated to Python from Sebastian
# Ferreyra's version here:
# http://www.javalobby.org/java/forums/t16936.html

_esp_codes = (
    ("\\Aw?[uh]?([aeiou])", ""),
    ("c[eiéí]|z|ll|sh|ch|sch|cc|y[aeiouáéíóú]|ps|bs|x|j|g[eiéí]", "s"),
    ("[aeiouhwáéíóúü]+", ""),
    ("y", ""),
    ("ñ|gn", "n"),
    ("[dpc]t", "t"),
    ("c[aouáóú]|ck|q", "k"),
    ("v", "b"),
    ("d$", "t"),  # Change a trailing d to a t
)
_esp_codes = tuple((re.compile(pat), repl) for pat, repl in _esp_codes)


def soundex_esp(word):
    word = word.lower()
    r = ""

    prevcode = None
    i = 0
    while i < len(word):
        code = None
        for expr, ecode in _esp_codes:
            match = expr.match(word, i)
            if match:
                i = match.end()
                code = ecode
                break

        if code is None:
            code = word[i]
            i += 1

        if code != prevcode:
            r += code
        prevcode = code

    return r


# This version of soundex for Arabic is translated to Python from Tammam
# Koujan's C# version here:
# http://www.codeproject.com/KB/recipes/ArabicSoundex.aspx

# Create a dictionary mapping arabic characters to digits
_arabic_codes = {}
for chars, code in {
    "\u0627\u0623\u0625\u0622\u062d\u062e\u0647\u0639\u063a\u0634\u0648\u064a": "0",
    "\u0641\u0628": "1",
    "\u062c\u0632\u0633\u0635\u0638\u0642\u0643": "2",
    "\u062a\u062b\u062f\u0630\u0636\u0637": "3",
    "\u0644": "4",
    "\u0645\u0646": "5",
    "\u0631": "6",
}.items():
    for char in chars:
        _arabic_codes[char] = code


def soundex_ar(word):
    if word[0] in "\u0627\u0623\u0625\u0622":
        word = word[1:]

    r = "0"
    prevcode = "0"
    if len(word) > 1:
        # Discard the first character
        for char in word[1:]:
            if char in _arabic_codes:
                code = _arabic_codes.get(char, "0")
                # Don't append the code if it's the same as the previous
                if code != prevcode:
                    # If the code is a 0 (vowel), don't process it
                    if code != "0":
                        r += code
                prevcode = code
    return r
-------------------------------------------------------------------------------- /src/whoosh/lang/porter.py: --------------------------------------------------------------------------------
"""
Reimplementation of the
`Porter stemming algorithm <https://tartarus.org/martin/PorterStemmer/>`_
in Python.

In my quick tests, this implementation is about 3.5 times faster than the
seriously weird Python linked from the official page.
8 | """ 9 | 10 | import re 11 | 12 | # Suffix replacement lists 13 | 14 | _step2list = { 15 | "ational": "ate", 16 | "tional": "tion", 17 | "enci": "ence", 18 | "anci": "ance", 19 | "izer": "ize", 20 | "bli": "ble", 21 | "alli": "al", 22 | "entli": "ent", 23 | "eli": "e", 24 | "ousli": "ous", 25 | "ization": "ize", 26 | "ation": "ate", 27 | "ator": "ate", 28 | "alism": "al", 29 | "iveness": "ive", 30 | "fulness": "ful", 31 | "ousness": "ous", 32 | "aliti": "al", 33 | "iviti": "ive", 34 | "biliti": "ble", 35 | "logi": "log", 36 | } 37 | 38 | _step3list = { 39 | "icate": "ic", 40 | "ative": "", 41 | "alize": "al", 42 | "iciti": "ic", 43 | "ical": "ic", 44 | "ful": "", 45 | "ness": "", 46 | } 47 | 48 | 49 | _cons = "[^aeiou]" 50 | _vowel = "[aeiouy]" 51 | _cons_seq = "[^aeiouy]+" 52 | _vowel_seq = "[aeiou]+" 53 | 54 | # m > 0 55 | _mgr0 = re.compile("^(" + _cons_seq + ")?" + _vowel_seq + _cons_seq) 56 | # m == 0 57 | _meq1 = re.compile( 58 | "^(" + _cons_seq + ")?" + _vowel_seq + _cons_seq + "(" + _vowel_seq + ")?$" 59 | ) 60 | # m > 1 61 | _mgr1 = re.compile( 62 | "^(" + _cons_seq + ")?" + _vowel_seq + _cons_seq + _vowel_seq + _cons_seq 63 | ) 64 | # vowel in stem 65 | _s_v = re.compile("^(" + _cons_seq + ")?" + _vowel) 66 | # ??? 67 | _c_v = re.compile("^" + _cons_seq + _vowel + "[^aeiouwxy]$") 68 | 69 | # Patterns used in the rules 70 | 71 | _ed_ing = re.compile("^(.*)(ed|ing)$") 72 | _at_bl_iz = re.compile("(at|bl|iz)$") 73 | _step1b = re.compile("([^aeiouylsz])\\1$") 74 | _step2 = re.compile( 75 | "^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$" 76 | ) 77 | _step3 = re.compile("^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$") 78 | _step4_1 = re.compile( 79 | "^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$" 80 | ) 81 | _step4_2 = re.compile("^(.+?)(s|t)(ion)$") 82 | _step5 = re.compile("^(.+?)e$") 83 | 84 | 85 | # Stemming function 86 | 87 | 88 | def stem(w): 89 | """Uses the Porter stemming algorithm to remove suffixes from English 90 | words. 
91 | 92 | >>> stem("fundamentally") 93 | "fundament" 94 | """ 95 | 96 | if len(w) < 3: 97 | return w 98 | 99 | first_is_y = w[0] == "y" 100 | if first_is_y: 101 | w = "Y" + w[1:] 102 | 103 | # Step 1a 104 | if w.endswith("s"): 105 | if w.endswith("sses"): 106 | w = w[:-2] 107 | elif w.endswith("ies"): 108 | w = w[:-2] 109 | elif w[-2] != "s": 110 | w = w[:-1] 111 | 112 | # Step 1b 113 | 114 | if w.endswith("eed"): 115 | s = w[:-3] 116 | if _mgr0.match(s): 117 | w = w[:-1] 118 | else: 119 | m = _ed_ing.match(w) 120 | if m: 121 | stem = m.group(1) 122 | if _s_v.match(stem): 123 | w = stem 124 | if _at_bl_iz.match(w): 125 | w += "e" 126 | elif _step1b.match(w): 127 | w = w[:-1] 128 | elif _c_v.match(w): 129 | w += "e" 130 | 131 | # Step 1c 132 | 133 | if w.endswith("y"): 134 | stem = w[:-1] 135 | if _s_v.match(stem): 136 | w = stem + "i" 137 | 138 | # Step 2 139 | 140 | m = _step2.match(w) 141 | if m: 142 | stem = m.group(1) 143 | suffix = m.group(2) 144 | if _mgr0.match(stem): 145 | w = stem + _step2list[suffix] 146 | 147 | # Step 3 148 | 149 | m = _step3.match(w) 150 | if m: 151 | stem = m.group(1) 152 | suffix = m.group(2) 153 | if _mgr0.match(stem): 154 | w = stem + _step3list[suffix] 155 | 156 | # Step 4 157 | 158 | m = _step4_1.match(w) 159 | if m: 160 | stem = m.group(1) 161 | if _mgr1.match(stem): 162 | w = stem 163 | else: 164 | m = _step4_2.match(w) 165 | if m: 166 | stem = m.group(1) + m.group(2) 167 | if _mgr1.match(stem): 168 | w = stem 169 | 170 | # Step 5 171 | 172 | m = _step5.match(w) 173 | if m: 174 | stem = m.group(1) 175 | if _mgr1.match(stem) or (_meq1.match(stem) and not _c_v.match(stem)): 176 | w = stem 177 | 178 | if w.endswith("ll") and _mgr1.match(w): 179 | w = w[:-1] 180 | 181 | if first_is_y: 182 | w = "y" + w[1:] 183 | 184 | return w 185 | -------------------------------------------------------------------------------- /src/whoosh/lang/snowball/LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (C) 2001-2012 NLTK Project 2 | 3 | Licensed under the Apache License, Version 2.0 (the 'License'); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an 'AS IS' BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /src/whoosh/lang/snowball/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2001-2012 NLTK Project 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the 'License'); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an 'AS IS' BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
# Natural Language Toolkit: Snowball Stemmer
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Peter Michael Stahl
#         Peter Ljunglof (revisions)
# Algorithms: Dr Martin Porter
# URL: http://www.nltk.org/
# For license information, see LICENSE.txt

# HJ 2012/07/19 adapted from https://github.com/kmike/nltk.git (branch 2and3)
# 2.0.1rc4-256-g45768f8

"""
This module provides a port of the Snowball stemmers developed by Martin
Porter.

At the moment, this port is able to stem words from fourteen languages: Danish,
Dutch, English, Finnish, French, German, Hungarian, Italian, Norwegian,
Portuguese, Romanian, Russian, Spanish and Swedish.

The algorithms have been developed by Martin Porter. These stemmers are called
Snowball, because he invented a programming language with this name for
creating new stemming algorithms. There is more information available at
http://snowball.tartarus.org/
"""

from .danish import DanishStemmer
from .dutch import DutchStemmer
from .english import EnglishStemmer
from .finnish import FinnishStemmer
from .french import FrenchStemmer
from .german import GermanStemmer
from .hungarian import HungarianStemmer
from .italian import ItalianStemmer
from .norwegian import NorwegianStemmer
from .portugese import PortugueseStemmer
from .romanian import RomanianStemmer
from .russian import RussianStemmer
from .spanish import SpanishStemmer
from .swedish import SwedishStemmer

# Map two-letter codes to stemming classes

classes = {
    "da": DanishStemmer,
    "nl": DutchStemmer,
    "en": EnglishStemmer,
    "fi": FinnishStemmer,
    "fr": FrenchStemmer,
    "de": GermanStemmer,
    "hu": HungarianStemmer,
    "it": ItalianStemmer,
    "no": NorwegianStemmer,
    "pt": PortugueseStemmer,
    "ro": RomanianStemmer,
    "ru": RussianStemmer,
    "es": SpanishStemmer,
    "sv": SwedishStemmer,
}
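# An illustrative sketch (not from the source) of driving the classes map
# above; the sample word is an arbitrary choice:
#
#     from whoosh.lang.snowball import classes
#     stemmer = classes["de"]()      # a GermanStemmer instance
#     stemmer.stem("kategorien")     # stems a German word
-------------------------------------------------------------------------------- /src/whoosh/lang/snowball/norwegian.py: --------------------------------------------------------------------------------
from .bases import _ScandinavianStemmer


class NorwegianStemmer(_ScandinavianStemmer):

    """
    The Norwegian Snowball stemmer.

    :cvar __vowels: The Norwegian vowels.
    :type __vowels: unicode
    :cvar __s_ending: Letters that may directly appear before a word final 's'.
    :type __s_ending: unicode
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.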
18 | :type __step3_suffixes: tuple 19 | :note: A detailed description of the Norwegian 20 | stemming algorithm can be found under 21 | http://snowball.tartarus.org/algorithms/norwegian/stemmer.html 22 | 23 | """ 24 | 25 | __vowels = "aeiouy\xE6\xE5\xF8" 26 | __s_ending = "bcdfghjlmnoprtvyz" 27 | __step1_suffixes = ( 28 | "hetenes", 29 | "hetene", 30 | "hetens", 31 | "heter", 32 | "heten", 33 | "endes", 34 | "ande", 35 | "ende", 36 | "edes", 37 | "enes", 38 | "erte", 39 | "ede", 40 | "ane", 41 | "ene", 42 | "ens", 43 | "ers", 44 | "ets", 45 | "het", 46 | "ast", 47 | "ert", 48 | "en", 49 | "ar", 50 | "er", 51 | "as", 52 | "es", 53 | "et", 54 | "a", 55 | "e", 56 | "s", 57 | ) 58 | 59 | __step2_suffixes = ("dt", "vt") 60 | 61 | __step3_suffixes = ( 62 | "hetslov", 63 | "eleg", 64 | "elig", 65 | "elov", 66 | "slov", 67 | "leg", 68 | "eig", 69 | "lig", 70 | "els", 71 | "lov", 72 | "ig", 73 | ) 74 | 75 | def stem(self, word): 76 | """ 77 | Stem a Norwegian word and return the stemmed form. 78 | 79 | :param word: The word that is stemmed. 80 | :type word: str or unicode 81 | :return: The stemmed form. 82 | :rtype: unicode 83 | 84 | """ 85 | word = word.lower() 86 | 87 | r1 = self._r1_scandinavian(word, self.__vowels) 88 | 89 | # STEP 1 90 | for suffix in self.__step1_suffixes: 91 | if r1.endswith(suffix): 92 | if suffix in ("erte", "ert"): 93 | word = "".join((word[: -len(suffix)], "er")) 94 | r1 = "".join((r1[: -len(suffix)], "er")) 95 | 96 | elif suffix == "s": 97 | if word[-2] in self.__s_ending or ( 98 | word[-2] == "k" and word[-3] not in self.__vowels 99 | ): 100 | word = word[:-1] 101 | r1 = r1[:-1] 102 | else: 103 | word = word[: -len(suffix)] 104 | r1 = r1[: -len(suffix)] 105 | break 106 | 107 | # STEP 2 108 | for suffix in self.__step2_suffixes: 109 | if r1.endswith(suffix): 110 | word = word[:-1] 111 | r1 = r1[:-1] 112 | break 113 | 114 | # STEP 3 115 | for suffix in self.__step3_suffixes: 116 | if r1.endswith(suffix): 117 | word = word[: -len(suffix)] 118 | break 119 | 120 | return word 121 | -------------------------------------------------------------------------------- /src/whoosh/lang/snowball/swedish.py: -------------------------------------------------------------------------------- 1 | from .bases import _ScandinavianStemmer 2 | 3 | 4 | class SwedishStemmer(_ScandinavianStemmer): 5 | 6 | """ 7 | The Swedish Snowball stemmer. 8 | 9 | :cvar __vowels: The Swedish vowels. 10 | :type __vowels: unicode 11 | :cvar __s_ending: Letters that may directly appear before a word final 's'. 12 | :type __s_ending: unicode 13 | :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. 14 | :type __step1_suffixes: tuple 15 | :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. 16 | :type __step2_suffixes: tuple 17 | :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. 
18 | :type __step3_suffixes: tuple 19 | :note: A detailed description of the Swedish 20 | stemming algorithm can be found under 21 | http://snowball.tartarus.org/algorithms/swedish/stemmer.html 22 | """ 23 | 24 | __vowels = "aeiouy\xE4\xE5\xF6" 25 | __s_ending = "bcdfghjklmnoprtvy" 26 | __step1_suffixes = ( 27 | "heterna", 28 | "hetens", 29 | "heter", 30 | "heten", 31 | "anden", 32 | "arnas", 33 | "ernas", 34 | "ornas", 35 | "andes", 36 | "andet", 37 | "arens", 38 | "arna", 39 | "erna", 40 | "orna", 41 | "ande", 42 | "arne", 43 | "aste", 44 | "aren", 45 | "ades", 46 | "erns", 47 | "ade", 48 | "are", 49 | "ern", 50 | "ens", 51 | "het", 52 | "ast", 53 | "ad", 54 | "en", 55 | "ar", 56 | "er", 57 | "or", 58 | "as", 59 | "es", 60 | "at", 61 | "a", 62 | "e", 63 | "s", 64 | ) 65 | __step2_suffixes = ("dd", "gd", "nn", "dt", "gt", "kt", "tt") 66 | __step3_suffixes = ("fullt", "l\xF6st", "els", "lig", "ig") 67 | 68 | def stem(self, word): 69 | """ 70 | Stem a Swedish word and return the stemmed form. 71 | 72 | :param word: The word that is stemmed. 73 | :type word: str or unicode 74 | :return: The stemmed form. 75 | :rtype: unicode 76 | 77 | """ 78 | word = word.lower() 79 | 80 | r1 = self._r1_scandinavian(word, self.__vowels) 81 | 82 | # STEP 1 83 | for suffix in self.__step1_suffixes: 84 | if r1.endswith(suffix): 85 | if suffix == "s": 86 | if word[-2] in self.__s_ending: 87 | word = word[:-1] 88 | r1 = r1[:-1] 89 | else: 90 | word = word[: -len(suffix)] 91 | r1 = r1[: -len(suffix)] 92 | break 93 | 94 | # STEP 2 95 | for suffix in self.__step2_suffixes: 96 | if r1.endswith(suffix): 97 | word = word[:-1] 98 | r1 = r1[:-1] 99 | break 100 | 101 | # STEP 3 102 | for suffix in self.__step3_suffixes: 103 | if r1.endswith(suffix): 104 | if suffix in ("els", "lig", "ig"): 105 | word = word[: -len(suffix)] 106 | elif suffix in ("fullt", "l\xF6st"): 107 | word = word[:-1] 108 | break 109 | 110 | return word 111 | -------------------------------------------------------------------------------- /src/whoosh/legacy.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012 Matt Chaput. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # 13 | # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
23 | # 24 | # The views and conclusions contained in the software and documentation are 25 | # those of the authors and should not be interpreted as representing official 26 | # policies, either expressed or implied, of Matt Chaput. 27 | 28 | """ 29 | This module contains code for maintaining backwards compatibility with old 30 | index formats. 31 | """ 32 | 33 | from whoosh.util.loading import RenamingUnpickler 34 | 35 | 36 | def load_110_toc(stream, gen, schema, version): 37 | # Between version -110 and version -111, I reorganized the modules and 38 | # changed the implementation of the NUMERIC field, so we have to change the 39 | # classes the unpickler tries to load if we need to read an old schema 40 | 41 | # Read the length of the pickled schema 42 | picklen = stream.read_varint() 43 | if schema: 44 | # If the user passed us a schema, use it and skip the one on disk 45 | stream.seek(picklen, 1) 46 | else: 47 | # Remap the old classes and functions to their moved versions as we 48 | # unpickle the schema 49 | scuts = { 50 | "wf": "whoosh.fields", 51 | "wsn": "whoosh.support.numeric", 52 | "wcw2": "whoosh.codec.whoosh2", 53 | } 54 | objmap = { 55 | "%(wf)s.NUMERIC": "%(wcw2)s.OLD_NUMERIC", 56 | "%(wf)s.DATETIME": "%(wcw2)s.OLD_DATETIME", 57 | "%(wsn)s.int_to_text": "%(wcw2)s.int_to_text", 58 | "%(wsn)s.text_to_int": "%(wcw2)s.text_to_int", 59 | "%(wsn)s.long_to_text": "%(wcw2)s.long_to_text", 60 | "%(wsn)s.text_to_long": "%(wcw2)s.text_to_long", 61 | "%(wsn)s.float_to_text": "%(wcw2)s.float_to_text", 62 | "%(wsn)s.text_to_float": "%(wcw2)s.text_to_float", 63 | } 64 | ru = RenamingUnpickler(stream, objmap, shortcuts=scuts) 65 | schema = ru.load() 66 | # Read the generation number 67 | index_gen = stream.read_int() 68 | assert gen == index_gen 69 | # Unused number 70 | _ = stream.read_int() 71 | # Unpickle the list of segment objects 72 | segments = stream.read_pickle() 73 | return schema, segments 74 | 75 | 76 | # Map TOC version numbers to functions to load that version 77 | toc_loaders = {-110: load_110_toc} 78 | 79 | 80 | # Map segment class names to functions to load the segment 81 | segment_loaders = {} 82 | -------------------------------------------------------------------------------- /src/whoosh/matching/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012 Matt Chaput. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # 13 | # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO 16 | # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | # 24 | # The views and conclusions contained in the software and documentation are 25 | # those of the authors and should not be interpreted as representing official 26 | # policies, either expressed or implied, of Matt Chaput. 27 | 28 | from whoosh.matching.binary import ( 29 | AdditiveBiMatcher, 30 | AndMaybeMatcher, 31 | AndNotMatcher, 32 | BiMatcher, 33 | DisjunctionMaxMatcher, 34 | IntersectionMatcher, 35 | UnionMatcher, 36 | ) 37 | from whoosh.matching.combo import ( 38 | ArrayUnionMatcher, 39 | CombinationMatcher, 40 | PreloadedUnionMatcher, 41 | ) 42 | from whoosh.matching.mcore import ( 43 | ConstantScoreMatcher, 44 | LeafMatcher, 45 | ListMatcher, 46 | Matcher, 47 | NoQualityAvailable, 48 | NullMatcher, 49 | NullMatcherClass, 50 | ReadTooFar, 51 | ) 52 | from whoosh.matching.wrappers import ( 53 | ConstantScoreWrapperMatcher, 54 | CoordMatcher, 55 | ExcludeMatcher, 56 | FilterMatcher, 57 | InverseMatcher, 58 | MultiMatcher, 59 | RequireMatcher, 60 | SingleTermMatcher, 61 | WrappingMatcher, 62 | ) 63 | -------------------------------------------------------------------------------- /src/whoosh/qparser/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2010 Matt Chaput. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # 13 | # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | # 24 | # The views and conclusions contained in the software and documentation are 25 | # those of the authors and should not be interpreted as representing official 26 | # policies, either expressed or implied, of Matt Chaput. 
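# A sketch (illustrative, not from the source) of combining the matchers
# re-exported by whoosh.matching above; it assumes ListMatcher accepts a
# sorted list of document ids and that UnionMatcher merges two sub-matchers:
#
#     from whoosh.matching import ListMatcher, UnionMatcher
#     m = UnionMatcher(ListMatcher([1, 3, 5]), ListMatcher([2, 3, 8]))
#     while m.is_active():
#         print(m.id())   # 1, 2, 3, 5, 8 -- each matching doc id once
#         m.next()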
27 | 28 | from whoosh.qparser.default import ( 29 | DisMaxParser, 30 | MultifieldParser, 31 | QueryParser, 32 | SimpleParser, 33 | ) 34 | from whoosh.qparser.plugins import ( 35 | BoostPlugin, 36 | CopyFieldPlugin, 37 | EveryPlugin, 38 | FieldAliasPlugin, 39 | FieldsPlugin, 40 | FunctionPlugin, 41 | FuzzyTermPlugin, 42 | GroupPlugin, 43 | GtLtPlugin, 44 | MultifieldPlugin, 45 | OperatorsPlugin, 46 | PhrasePlugin, 47 | Plugin, 48 | PlusMinusPlugin, 49 | PrefixPlugin, 50 | PseudoFieldPlugin, 51 | RangePlugin, 52 | RegexPlugin, 53 | RegexTagger, 54 | SequencePlugin, 55 | SingleQuotePlugin, 56 | TaggingPlugin, 57 | WhitespacePlugin, 58 | WildcardPlugin, 59 | ) 60 | from whoosh.qparser.syntax import ( 61 | AndGroup, 62 | AndMaybeGroup, 63 | AndNotGroup, 64 | BinaryGroup, 65 | DisMaxGroup, 66 | ErrorNode, 67 | FieldnameNode, 68 | GroupNode, 69 | InfixOperator, 70 | MarkerNode, 71 | NotGroup, 72 | Operator, 73 | OrderedGroup, 74 | OrGroup, 75 | PostfixOperator, 76 | PrefixOperator, 77 | RequireGroup, 78 | SyntaxNode, 79 | TextNode, 80 | Whitespace, 81 | WordNode, 82 | Wrapper, 83 | ) 84 | -------------------------------------------------------------------------------- /src/whoosh/qparser/common.py: -------------------------------------------------------------------------------- 1 | # Copyright 2010 Matt Chaput. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # 13 | # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | # 24 | # The views and conclusions contained in the software and documentation are 25 | # those of the authors and should not be interpreted as representing official 26 | # policies, either expressed or implied, of Matt Chaput. 27 | 28 | """ 29 | This module contains common utility objects/functions for the other query 30 | parser modules. 
"""

import sys


class QueryParserError(Exception):
    def __init__(self, cause, msg=None):
        super().__init__(str(cause))
        self.cause = cause
        self.msg = msg  # keep the optional message instead of discarding it


def get_single_text(field, text, **kwargs):
    """Returns the first token from an analyzer's output."""

    for t in field.process_text(text, mode="query", **kwargs):
        return t


def attach(q, stxnode):
    if q:
        try:
            q.startchar = stxnode.startchar
            q.endchar = stxnode.endchar
        except AttributeError:
            raise AttributeError(f"Can't set attribute on {q.__class__.__name__}")
    return q


def print_debug(level, msg, out=sys.stderr):
    if level:
        out.write(f"{' ' * (level - 1)}{msg}\n")
-------------------------------------------------------------------------------- /src/whoosh/qparser/taggers.py: --------------------------------------------------------------------------------
# Copyright 2011 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from whoosh.util.text import rcompile

# Tagger objects


class Tagger:
    """Base class for taggers, objects which match syntax in the query string
    and translate it into a :class:`whoosh.qparser.syntax.SyntaxNode` object.
    """

    def match(self, parser, text, pos):
        """This method should see if this tagger matches the query string at
        the given position. If it matches, it should return a
        :class:`whoosh.qparser.syntax.SyntaxNode` object; if it doesn't match,
        it should return None.

        :param parser: the :class:`whoosh.qparser.default.QueryParser` object.
        :param text: the text being parsed.
        :param pos: the position in the text at which the tagger should try to
            match.
        """

        raise NotImplementedError


class RegexTagger(Tagger):
    """Tagger class that uses regular expressions to match the query string.
    Subclasses should override ``create()`` instead of ``match()``.
54 | """ 55 | 56 | def __init__(self, expr): 57 | self.expr = rcompile(expr) 58 | 59 | def match(self, parser, text, pos): 60 | match = self.expr.match(text, pos) 61 | if match: 62 | node = self.create(parser, match) 63 | if node is not None: 64 | node = node.set_range(match.start(), match.end()) 65 | return node 66 | else: 67 | return None 68 | 69 | def create(self, parser, match): 70 | """When the regular expression matches, this method is called to 71 | translate the regex match object into a syntax node. 72 | 73 | :param parser: the :class:`whoosh.qparser.default.QueryParser` object. 74 | :param match: the regex match object. 75 | """ 76 | 77 | raise NotImplementedError 78 | 79 | 80 | class FnTagger(RegexTagger): 81 | """Tagger that takes a regular expression and a class or function, and for 82 | matches calls the class/function with the regex match's named groups as 83 | keyword arguments. 84 | """ 85 | 86 | def __init__(self, expr, fn, memo=""): 87 | RegexTagger.__init__(self, expr) 88 | self.fn = fn 89 | self.memo = memo 90 | 91 | def __repr__(self): 92 | return f"<{self.__class__.__name__} {self.expr!r} ({self.memo})>" 93 | 94 | def create(self, parser, match): 95 | return self.fn(**match.groupdict()) 96 | -------------------------------------------------------------------------------- /src/whoosh/query/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012 Matt Chaput. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # 13 | # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | # 24 | # The views and conclusions contained in the software and documentation are 25 | # those of the authors and should not be interpreted as representing official 26 | # policies, either expressed or implied, of Matt Chaput. 
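# A sketch (illustrative, not from the source) of how the qparser package
# above produces the query objects re-exported below; "title" is an assumed
# field name, and the schema is omitted (QueryParser accepts schema=None):
#
#     from whoosh.qparser import QueryParser, FuzzyTermPlugin
#     parser = QueryParser("title", schema=None)
#     parser.add_plugin(FuzzyTermPlugin())
#     q = parser.parse("render table~2")  # an And of Term/FuzzyTerm queries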
27 | 28 | 29 | from whoosh.query.compound import ( 30 | And, 31 | AndMaybe, 32 | AndNot, 33 | BinaryQuery, 34 | BooleanQuery, 35 | CompoundQuery, 36 | DefaultOr, 37 | DisjunctionMax, 38 | Or, 39 | Otherwise, 40 | PreloadedOr, 41 | Require, 42 | SplitOr, 43 | ) 44 | from whoosh.query.nested import NestedChildren, NestedParent 45 | from whoosh.query.positional import Ordered, Phrase, Sequence 46 | from whoosh.query.qcolumns import ColumnMatcher, ColumnQuery 47 | from whoosh.query.qcore import ( 48 | Every, 49 | Highest, 50 | Lowest, 51 | NullQuery, 52 | Query, 53 | QueryError, 54 | _NullQuery, 55 | error_query, 56 | token_lists, 57 | ) 58 | from whoosh.query.ranges import DateRange, NumericRange, RangeMixin, TermRange 59 | from whoosh.query.spans import ( 60 | Span, 61 | SpanBefore, 62 | SpanBiMatcher, 63 | SpanBiQuery, 64 | SpanCondition, 65 | SpanContains, 66 | SpanFirst, 67 | SpanNear, 68 | SpanNear2, 69 | SpanNot, 70 | SpanOr, 71 | SpanQuery, 72 | SpanWrappingMatcher, 73 | WrappingSpan, 74 | ) 75 | from whoosh.query.terms import ( 76 | ExpandingTerm, 77 | FuzzyTerm, 78 | MultiTerm, 79 | PatternQuery, 80 | Prefix, 81 | Regex, 82 | Term, 83 | Variations, 84 | Wildcard, 85 | ) 86 | from whoosh.query.wrappers import ConstantScoreQuery, Not, WeightingQuery, WrappingQuery 87 | -------------------------------------------------------------------------------- /src/whoosh/query/qcolumns.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012 Matt Chaput. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # 13 | # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | # 24 | # The views and conclusions contained in the software and documentation are 25 | # those of the authors and should not be interpreted as representing official 26 | # policies, either expressed or implied, of Matt Chaput. 27 | 28 | from whoosh.matching import ConstantScoreMatcher, NullMatcher, ReadTooFar 29 | from whoosh.query.qcore import Query 30 | 31 | 32 | class ColumnQuery(Query): 33 | """A query that matches per-document values stored in a column rather than 34 | terms in the inverted index. 35 | 36 | This may be useful in special circumstances, but note that this is MUCH 37 | SLOWER than searching an indexed field. 
"""

    def __init__(self, fieldname, condition):
        """
        :param fieldname: the name of the field to look in. If the field does
            not have a column, this query will not match anything.
        :param condition: if this is a callable, it is called on each value
            in the column, and documents for which ``condition(docvalue)``
            returns True are returned as matching documents. If it is not a
            callable, the document values are compared to it (using ``==``).
        """

        self.fieldname = fieldname
        self.condition = condition

    def is_leaf(self):
        return True

    def matcher(self, searcher, context=None):
        fieldname = self.fieldname
        condition = self.condition
        if callable(condition):
            comp = condition
        else:

            def comp(v):
                # Made this a function instead of a lambda so I could put
                # debug prints here if necessary ;)
                return v == condition

        reader = searcher.reader()
        if not reader.has_column(fieldname):
            return NullMatcher()

        creader = reader.column_reader(fieldname)
        return ColumnMatcher(creader, comp)


class ColumnMatcher(ConstantScoreMatcher):
    def __init__(self, creader, condition):
        # Initialize the base class so self._score exists for
        # skip_to_quality() below (the original skipped this call)
        super().__init__()
        self.creader = creader
        self.condition = condition
        self._i = 0
        self._find_next()

    def _find_next(self):
        condition = self.condition
        creader = self.creader

        while self._i < len(creader) and not condition(creader[self._i]):
            self._i += 1

    def is_active(self):
        return self._i < len(self.creader)

    def next(self):
        if not self.is_active():
            raise ReadTooFar
        self._i += 1
        self._find_next()

    def reset(self):
        self._i = 0
        self._find_next()

    def id(self):
        return self._i

    def all_ids(self):
        condition = self.condition
        for docnum, v in enumerate(self.creader):
            if condition(v):
                yield docnum

    def supports(self, astype):
        return False

    def skip_to_quality(self, minquality):
        if self._score <= minquality:
            self._i = len(self.creader)
            return True
-------------------------------------------------------------------------------- /src/whoosh/support/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sygil-Dev/whoosh-reloaded/ab775ca5bfb443c2b3598b6a2d6189e595f7c377/src/whoosh/support/__init__.py -------------------------------------------------------------------------------- /src/whoosh/support/base85.py: --------------------------------------------------------------------------------
"""
This module contains generic base85 encoding and decoding functions. The
whoosh.util.numeric module contains faster variants for encoding and
decoding integers.

Modified from:
http://paste.lisp.org/display/72815
"""

import struct

# Instead of using the character set from the ascii85 algorithm, I put the
# characters in order so that the encoded text sorts properly (my life would be
# a lot easier if they had just done that from the start)
b85chars = (
    "!$%&*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "^_abcdefghijklmnopqrstuvwxyz{|}~"
)
b85dec = {}
for i in range(len(b85chars)):
    b85dec[b85chars[i]] = i


# Integer encoding and decoding functions


def to_base85(x, islong=False):
    "Encodes the given integer using base 85."

    size = 10 if islong else 5
    rems = ""
    for i in range(size):
        rems = b85chars[x % 85] + rems
        x //= 85
    return rems


def from_base85(text):
    "Decodes the given base 85 text into an integer."

    acc = 0
    for c in text:
        acc = acc * 85 + b85dec[c]
    return acc


# Bytes encoding and decoding functions


def b85encode(text, pad=False):
    # "text" is expected to be a bytes object; the encoded result is a str
    l = len(text)
    r = l % 4
    if r:
        text += b"\x00" * (4 - r)  # pad to a multiple of 4 bytes
    longs = len(text) >> 2
    out = []
    words = struct.unpack(">" + "L" * longs, text[0 : longs * 4])
    for word in words:
        rems = [0, 0, 0, 0, 0]
        for i in range(4, -1, -1):
            rems[i] = b85chars[word % 85]
            word //= 85  # floor division (the original Python 2 "/=" breaks here)
        out.extend(rems)

    out = "".join(out)
    if pad:
        return out

    # Trim padding
    olen = l % 4
    if olen:
        olen += 1
    olen += l // 4 * 5  # floor division so the slice index stays an integer
    return out[0:olen]


def b85decode(text):
    l = len(text)
    out = []
    for i in range(0, len(text), 5):
        chunk = text[i : i + 5]
        acc = 0
        for j in range(len(chunk)):
            try:
                acc = acc * 85 + b85dec[chunk[j]]
            except KeyError:
                raise TypeError("Bad base85 character at byte %d" % (i + j))
        if acc > 4294967295:
            raise OverflowError("Base85 overflow in hunk starting at byte %d" % i)
        out.append(acc)

    # Pad final chunk if necessary
    cl = l % 5
    if cl:
        acc *= 85 ** (5 - cl)
        if cl > 1:
            acc += 0xFFFFFF >> (cl - 2) * 8
        out[-1] = acc

    out = struct.pack(">" + "L" * ((l + 4) // 5), *out)  # floor division, as above
    if cl:
        out = out[: -(5 - cl)]

    return out
-------------------------------------------------------------------------------- /src/whoosh/support/bitstream.py: --------------------------------------------------------------------------------
"""

From a post by Patrick Maupin on the Python mailing list:
http://mail.python.org/pipermail/python-list/2003-November/237481.html
"""

from array import array

from whoosh.system import _LONG_SIZE

_bitsperlong = _LONG_SIZE * 8


class BitStreamReader:
    def __init__(self, source):
        self._totalbits = len(source) * _bitsperlong
        self._position = 0

        # Pad to longword boundary, then make an array; "source" is expected
        # to be a bytes object
        source += b"\x00" * (-len(source) % _LONG_SIZE)
        bits = array("L")
        bits.frombytes(source)  # array.fromstring() no longer exists in Python 3
        self._bitstream = bits

    def seek(self, offset):
        self._position = offset

    def tell(self):
        return self._position

    def read(self, numbits):
        position = self._position

        if position < 0 or position + numbits > self._totalbits:
            # the original "raise (IndexError, ...)" raised a tuple, which is
            # a TypeError on Python 3
            raise IndexError("Invalid bitarray._position/numbits")

        longaddress, bitoffset = divmod(position,
_bitsperlong) 39 | 40 | # We may read bits in the final word after ones we care 41 | # about, so create a mask to remove them later. 42 | 43 | finalmask = (1 << numbits) - 1 44 | 45 | # We may read bits in the first word before the ones we 46 | # care about, so bump the total bits to read by this 47 | # amount, so we read enough higher-order bits. 48 | 49 | numbits += bitoffset 50 | 51 | # Read and concatenate every long containing a bit we need 52 | 53 | outval, outshift = 0, 0 54 | while numbits > 0: 55 | outval += self._bitstream[longaddress] << outshift 56 | longaddress += 1 57 | outshift += _bitsperlong 58 | numbits -= _bitsperlong 59 | 60 | # numbits is now basically a negative number which tells us 61 | # how many bits to back up from our current position. 62 | 63 | self._position = longaddress * _bitsperlong + numbits 64 | 65 | # Shift right to strip off the low-order bits we 66 | # don't want, then 'and' with the mask to strip 67 | # off the high-order bits we don't want. 68 | 69 | return (outval >> bitoffset) & finalmask 70 | -------------------------------------------------------------------------------- /src/whoosh/support/levenshtein.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains functions implementing edit distance algorithms. 3 | """ 4 | 5 | 6 | def levenshtein(seq1, seq2, limit=None): 7 | """Returns the Levenshtein edit distance between two strings.""" 8 | 9 | oneago = None 10 | thisrow = list(range(1, len(seq2) + 1)) + [0] 11 | for x in range(len(seq1)): 12 | # Python lists wrap around for negative indices, so put the 13 | # leftmost column at the *end* of the list. This matches with 14 | # the zero-indexed strings and saves extra calculation. 15 | oneago, thisrow = thisrow, [0] * len(seq2) + [x + 1] 16 | for y in range(len(seq2)): 17 | delcost = oneago[y] + 1 18 | addcost = thisrow[y - 1] + 1 19 | subcost = oneago[y - 1] + (seq1[x] != seq2[y]) 20 | thisrow[y] = min(delcost, addcost, subcost) 21 | 22 | if limit and x > limit and min(thisrow) > limit: 23 | return limit + 1 24 | 25 | return thisrow[len(seq2) - 1] 26 | 27 | 28 | def damerau_levenshtein(seq1, seq2, limit=None): 29 | """Returns the Damerau-Levenshtein edit distance between two strings.""" 30 | 31 | oneago = None 32 | thisrow = list(range(1, len(seq2) + 1)) + [0] 33 | for x in range(len(seq1)): 34 | # Python lists wrap around for negative indices, so put the 35 | # leftmost column at the *end* of the list. This matches with 36 | # the zero-indexed strings and saves extra calculation. 37 | twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1] 38 | for y in range(len(seq2)): 39 | delcost = oneago[y] + 1 40 | addcost = thisrow[y - 1] + 1 41 | subcost = oneago[y - 1] + (seq1[x] != seq2[y]) 42 | thisrow[y] = min(delcost, addcost, subcost) 43 | # This block deals with transpositions 44 | if ( 45 | x > 0 46 | and y > 0 47 | and seq1[x] == seq2[y - 1] 48 | and seq1[x - 1] == seq2[y] 49 | and seq1[x] != seq2[y] 50 | ): 51 | thisrow[y] = min(thisrow[y], twoago[y - 2] + 1) 52 | 53 | if limit and x > limit and min(thisrow) > limit: 54 | return limit + 1 55 | 56 | return thisrow[len(seq2) - 1] 57 | 58 | 59 | def relative(a, b): 60 | """Returns the relative distance between two strings, in the range 61 | [0-1] where 1 means total equality. 
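    For instance (an illustrative doctest, not from the source; it assumes
    ``distance`` is the Damerau-Levenshtein function aliased below):

    >>> relative("whoosh", "whoosh")
    1.0
    >>> round(relative("whoosh", "whosh"), 3)
    0.694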
"""

    d = distance(a, b)
    longer = float(max((len(a), len(b))))
    shorter = float(min((len(a), len(b))))
    r = ((longer - d) / longer) * (shorter / longer)
    return r


distance = damerau_levenshtein
-------------------------------------------------------------------------------- /src/whoosh/system.py: --------------------------------------------------------------------------------
# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

import sys
from struct import Struct, calcsize

IS_LITTLE = sys.byteorder == "little"

_INT_SIZE = calcsize("!i")
_SHORT_SIZE = calcsize("!H")
_LONG_SIZE = calcsize("!Q")
_FLOAT_SIZE = calcsize("!f")
_DOUBLE_SIZE = calcsize("!d")

_byte_struct = Struct("!B")
_sbyte_struct = Struct("!b")
_ushort_struct = Struct("!H")
_int_struct = Struct("!i")
_uint_struct = Struct("!I")
_long_struct = Struct("!q")
_ulong_struct = Struct("!Q")
_float_struct = Struct("!f")
_double_struct = Struct("!d")
_ushort_le_struct = Struct("<H")

[... the remainder of system.py -- the pack_*/unpack_* aliases built from the
structs above (e.g. pack_ushort = _ushort_struct.pack, unpack_uint =
_uint_struct.unpack), which other modules import from whoosh.system -- was
garbled away by angle-bracket stripping during extraction ...]
-------------------------------------------------------------------------------- /src/whoosh/util/__init__.py: --------------------------------------------------------------------------------
[... license header and imports (including "from bisect import insort" and
"from functools import wraps", both used below) lost in the same stripping ...]


def make_binary_tree(fn, args, **kwargs):
    """Takes a function/class that takes two positional arguments and a list
    of arguments, and returns a binary tree of results/instances.

    >>> make_binary_tree(UnionMatcher, [matcher1, matcher2, matcher3])
    UnionMatcher(matcher1, UnionMatcher(matcher2, matcher3))

    Any keyword arguments given to this function are passed to the class
    initializer.
    """

    count = len(args)
    if not count:
        raise ValueError("Called make_binary_tree with empty list")
    elif count == 1:
        return args[0]

    half = count // 2
    return fn(
        make_binary_tree(fn, args[:half], **kwargs),
        make_binary_tree(fn, args[half:], **kwargs),
        **kwargs,
    )


def make_weighted_tree(fn, ls, **kwargs):
    """Takes a function/class that takes two positional arguments and a list of
    (weight, argument) tuples and returns a huffman-like weighted tree of
    results/instances.
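    For instance (an illustrative doctest, not from the source), combining the
    two lightest entries first, huffman-style:

    >>> make_weighted_tree(lambda a, b: (a, b), [(1, "a"), (2, "b"), (4, "c")])
    (('a', 'b'), 'c')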
84 | """ 85 | 86 | if not ls: 87 | raise ValueError("Called make_weighted_tree with empty list") 88 | 89 | ls.sort() 90 | while len(ls) > 1: 91 | a = ls.pop(0) 92 | b = ls.pop(0) 93 | insort(ls, (a[0] + b[0], fn(a[1], b[1]))) 94 | return ls[0][1] 95 | 96 | 97 | # Fibonacci function 98 | 99 | _fib_cache = {} 100 | 101 | 102 | def fib(n): 103 | """Returns the nth value in the Fibonacci sequence.""" 104 | 105 | if n <= 2: 106 | return n 107 | if n in _fib_cache: 108 | return _fib_cache[n] 109 | result = fib(n - 1) + fib(n - 2) 110 | _fib_cache[n] = result 111 | return result 112 | 113 | 114 | # Decorators 115 | 116 | 117 | def synchronized(func): 118 | """Decorator for storage-access methods, which synchronizes on a threading 119 | lock. The parent object must have 'is_closed' and '_sync_lock' attributes. 120 | """ 121 | 122 | @wraps(func) 123 | def synchronized_wrapper(self, *args, **kwargs): 124 | with self._sync_lock: 125 | return func(self, *args, **kwargs) 126 | 127 | return synchronized_wrapper 128 | 129 | 130 | def unclosed(method): 131 | """ 132 | Decorator to check if the object is closed. 133 | """ 134 | 135 | @wraps(method) 136 | def unclosed_wrapper(self, *args, **kwargs): 137 | if self.closed: 138 | raise ValueError("Operation on a closed object") 139 | return method(self, *args, **kwargs) 140 | 141 | return unclosed_wrapper 142 | -------------------------------------------------------------------------------- /src/whoosh/util/cache.py: -------------------------------------------------------------------------------- 1 | # Copyright 2007 Matt Chaput. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # 13 | # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | # 24 | # The views and conclusions contained in the software and documentation are 25 | # those of the authors and should not be interpreted as representing official 26 | # policies, either expressed or implied, of Matt Chaput. 
27 | 28 | 29 | import functools 30 | from collections import Counter 31 | from heapq import nsmallest 32 | from operator import itemgetter 33 | 34 | 35 | def unbound_cache(func): 36 | """Caching decorator with an unbounded cache size.""" 37 | 38 | cache = {} 39 | 40 | @functools.wraps(func) 41 | def caching_wrapper(*args): 42 | try: 43 | return cache[args] 44 | except KeyError: 45 | result = func(*args) 46 | cache[args] = result 47 | return result 48 | 49 | return caching_wrapper 50 | 51 | 52 | def lfu_cache(maxsize=100): 53 | """A simple cache that, when the cache is full, deletes the least frequently 54 | used 10% of the cached values. 55 | 56 | This function duplicates (more-or-less) the protocol of the 57 | ``functools.lru_cache`` decorator in the Python 3.2 standard library. 58 | 59 | Arguments to the cached function must be hashable. 60 | 61 | View the cache statistics tuple ``(hits, misses, maxsize, currsize)`` 62 | with f.cache_info(). Clear the cache and statistics with f.cache_clear(). 63 | Access the underlying function with f.__wrapped__. 64 | """ 65 | 66 | def decorating_function(user_function): 67 | stats = [0, 0] # Hits, misses 68 | data = {} 69 | usecount = Counter() 70 | 71 | @functools.wraps(user_function) 72 | def wrapper(*args): 73 | try: 74 | result = data[args] 75 | stats[0] += 1 # Hit 76 | except KeyError: 77 | stats[1] += 1 # Miss 78 | if len(data) == maxsize: 79 | for k, _ in nsmallest( 80 | maxsize // 10 or 1, usecount.items(), key=itemgetter(1) 81 | ): 82 | del data[k] 83 | del usecount[k] 84 | data[args] = user_function(*args) 85 | result = data[args] 86 | finally: 87 | usecount[args] += 1 88 | return result 89 | 90 | def cache_info(): 91 | return stats[0], stats[1], maxsize, len(data) 92 | 93 | def cache_clear(): 94 | data.clear() 95 | usecount.clear() 96 | 97 | wrapper.cache_info = cache_info 98 | wrapper.cache_clear = cache_clear 99 | return wrapper 100 | 101 | return decorating_function 102 | -------------------------------------------------------------------------------- /src/whoosh/util/loading.py: -------------------------------------------------------------------------------- 1 | # Copyright 2012 Matt Chaput. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # 13 | # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
23 | # 24 | # The views and conclusions contained in the software and documentation are 25 | # those of the authors and should not be interpreted as representing official 26 | # policies, either expressed or implied, of Matt Chaput. 27 | 28 | import pickle 29 | 30 | 31 | class RenamingUnpickler(pickle.Unpickler): 32 | """Subclasses ``pickle.Unpickler`` to allow remapping of class names before 33 | loading them. 34 | """ 35 | 36 | def __init__(self, f, objmap, shortcuts=None): 37 | pickle.Unpickler.__init__(self, f) 38 | 39 | if shortcuts: 40 | objmap = {k % shortcuts: v % shortcuts for k, v in objmap.items()} 41 | self._objmap = objmap 42 | 43 | def find_class(self, modulename, objname): 44 | fqname = f"{modulename}.{objname}" 45 | if fqname in self._objmap: 46 | fqname = self._objmap[fqname] 47 | try: 48 | obj = find_object(fqname) 49 | except ImportError: 50 | raise ImportError(f"Couldn't find {fqname!r}") 51 | return obj 52 | 53 | 54 | def find_object(name, blacklist=None, whitelist=None): 55 | """Imports and returns an object given a fully qualified name. 56 | 57 | >>> find_object("whoosh.analysis.StopFilter") 58 | 59 | """ 60 | 61 | if blacklist: 62 | for pre in blacklist: 63 | if name.startswith(pre): 64 | raise TypeError( 65 | f"{name!r}: can't instantiate names starting with {pre!r}" 66 | ) 67 | if whitelist: 68 | passes = False 69 | for pre in whitelist: 70 | if name.startswith(pre): 71 | passes = True 72 | break 73 | if not passes: 74 | raise TypeError(f"Can't instantiate {name!r}") 75 | 76 | lastdot = name.rfind(".") 77 | 78 | assert lastdot > -1, f"Name {name!r} must be fully qualified" 79 | modname = name[:lastdot] 80 | clsname = name[lastdot + 1 :] 81 | 82 | mod = __import__(modname, fromlist=[clsname]) 83 | cls = getattr(mod, clsname) 84 | return cls 85 | -------------------------------------------------------------------------------- /src/whoosh/util/text.py: -------------------------------------------------------------------------------- 1 | # Copyright 2007 Matt Chaput. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # 13 | # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
23 | # 24 | # The views and conclusions contained in the software and documentation are 25 | # those of the authors and should not be interpreted as representing official 26 | # policies, either expressed or implied, of Matt Chaput. 27 | 28 | import codecs 29 | import re 30 | 31 | # Note: these functions return a tuple of (text, length), so when you call 32 | # them, you have to add [0] on the end, e.g. str = utf8encode(unicode)[0] 33 | 34 | utf8encode = codecs.getencoder("utf-8") 35 | utf8decode = codecs.getdecoder("utf-8") 36 | 37 | 38 | # Prefix encoding functions 39 | 40 | 41 | def byte(num): 42 | return bytes((num,)) 43 | 44 | 45 | def first_diff(a, b): 46 | """ 47 | Returns the position of the first differing character in the sequences a 48 | and b. For example, first_diff('render', 'rending') == 4. This function 49 | limits the return value to 255 so the difference can be encoded in a single 50 | byte. 51 | """ 52 | 53 | i = 0 54 | while i < 255 and i < len(a) and i < len(b) and a[i] == b[i]: 55 | i += 1 56 | return i 57 | 58 | 59 | def prefix_encode(a, b): 60 | """ 61 | Compresses bytestring b as a byte representing the prefix it shares with a, 62 | followed by the suffix bytes. 63 | """ 64 | 65 | i = first_diff(a, b) 66 | return byte(i) + b[i:] 67 | 68 | 69 | def prefix_encode_all(ls): 70 | """Compresses the given list of (unicode) strings by storing each string 71 | as a byte giving the length of the prefix it shares with its predecessor 72 | (zero for the first string), followed by the suffix encoded 73 | as UTF-8. 74 | """ 75 | 76 | last = "" 77 | for w in ls: 78 | i = first_diff(last, w) 79 | yield byte(i) + w[i:].encode("utf-8")  # byte(), not chr(): the output must be bytes 80 | last = w 81 | 82 | 83 | def prefix_decode_all(ls): 84 | """Decompresses a sequence of bytestrings compressed by prefix_encode_all().""" 85 | 86 | last = "" 87 | for w in ls: 88 | i = w[0]  # indexing a bytestring yields an int on Python 3 89 | decoded = last[:i] + w[1:].decode("utf-8") 90 | yield decoded 91 | last = decoded 92 | 93 | 94 | # Natural key sorting function 95 | 96 | _nkre = re.compile(r"\D+|\d+", re.UNICODE) 97 | 98 | 99 | def _nkconv(i): 100 | try: 101 | return int(i) 102 | except ValueError: 103 | return i.lower() 104 | 105 | 106 | def natural_key(s): 107 | """Converts string ``s`` into a tuple that will sort "naturally" (i.e., 108 | ``name5`` will come before ``name10`` and ``1`` will come before ``A``). 109 | This function is designed to be used as the ``key`` argument to sorting 110 | functions. 111 | 112 | :param s: the str/unicode string to convert. 113 | :rtype: tuple 114 | """ 115 | 116 | # Use _nkre to split the input string into a sequence of 117 | # digit runs and non-digit runs. Then use _nkconv() to convert 118 | # the digit runs into ints and the non-digit runs to lowercase. 119 | return tuple(_nkconv(m) for m in _nkre.findall(s)) 120 | 121 | 122 | # Regular expression functions 123 | 124 | 125 | def rcompile(pattern, flags=0, verbose=False): 126 | """A wrapper for re.compile that checks whether "pattern" is a regex object 127 | or a string to be compiled, and automatically adds the re.UNICODE flag. 128 | """ 129 | 130 | if not isinstance(pattern, str): 131 | # If it's not a string, assume it's already a compiled pattern 132 | return pattern 133 | if verbose: 134 | flags |= re.VERBOSE 135 | return re.compile(pattern, re.UNICODE | flags) 136 | -------------------------------------------------------------------------------- /src/whoosh/util/varints.py: -------------------------------------------------------------------------------- 1 | # Copyright 2007 Matt Chaput.
All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, 7 | # this list of conditions and the following disclaimer. 8 | # 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # 13 | # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15 | # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16 | # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | # 24 | # The views and conclusions contained in the software and documentation are 25 | # those of the authors and should not be interpreted as representing official 26 | # policies, either expressed or implied, of Matt Chaput. 27 | 28 | from array import array 29 | 30 | # Varint cache 31 | 32 | # Build a cache of the varint byte sequences for the first N integers, so we 33 | # don't have to constantly recalculate them on the fly. This makes a small but 34 | # noticeable difference. 35 | 36 | 37 | def _varint(i): 38 | a = array("B") 39 | while (i & ~0x7F) != 0: 40 | a.append((i & 0x7F) | 0x80) 41 | i = i >> 7 42 | a.append(i) 43 | return a.tobytes() 44 | 45 | 46 | _varint_cache_size = 512 47 | _varint_cache = tuple([_varint(i) for i in range(_varint_cache_size)]) 48 | 49 | 50 | def varint(i): 51 | """Encodes the given integer into a bytestring of the minimum number of bytes.""" 52 | if i < len(_varint_cache): 53 | return _varint_cache[i] 54 | return _varint(i) 55 | 56 | 57 | def varint_to_int(vi): 58 | b = vi[0]  # indexing a bytestring yields an int on Python 3, so no ord() 59 | p = 1 60 | i = b & 0x7F 61 | shift = 7 62 | while b & 0x80 != 0: 63 | b = vi[p] 64 | p += 1 65 | i |= (b & 0x7F) << shift 66 | shift += 7 67 | return i 68 | 69 | 70 | def signed_varint(i): 71 | """Zig-zag encodes a signed integer into a varint.""" 72 | 73 | if i >= 0: 74 | return varint(i << 1) 75 | return varint((i << 1) ^ (~0)) 76 | 77 | 78 | def decode_signed_varint(i): 79 | """Zig-zag decodes an integer value.""" 80 | 81 | if not i & 1: 82 | return i >> 1 83 | return (i >> 1) ^ (~0) 84 | 85 | 86 | def read_varint(readfn): 87 | """ 88 | Reads a variable-length encoded integer. 89 | 90 | :param readfn: a callable that reads a given number of bytes, 91 | like file.read().
92 | """ 93 | 94 | b = ord(readfn(1)) 95 | i = b & 0x7F 96 | 97 | shift = 7 98 | while b & 0x80 != 0: 99 | b = ord(readfn(1)) 100 | i |= (b & 0x7F) << shift 101 | shift += 7 102 | return i 103 | -------------------------------------------------------------------------------- /stress/test_bigfacet.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import random 3 | import string 4 | 5 | from whoosh import fields, formats, index, query, sorting 6 | from whoosh.util import now 7 | 8 | tagcount = 100 9 | doccount = 500000 10 | dirname = "testindex" 11 | 12 | schema = fields.Schema(tags=fields.KEYWORD(stored=True, vector=formats.Existence())) 13 | 14 | if not os.path.exists(dirname): 15 | os.mkdir(dirname) 16 | 17 | reindex = False 18 | if reindex or not index.exists_in(dirname): 19 | tags = [] 20 | for _ in range(tagcount): 21 | tag = "".join(random.choice(string.ascii_lowercase) for _ in range(5)) 22 | tags.append(tag) 23 | 24 | ix = index.create_in(dirname, schema) 25 | t = now() 26 | with ix.writer() as w: 27 | for i in range(doccount): 28 | doc = " ".join(random.sample(tags, random.randint(10, 20))) 29 | w.add_document(tags=doc) 30 | if not i % 10000: 31 | print(i) 32 | print(now() - t) 33 | 34 | 35 | ix = index.open_dir(dirname) 36 | with ix.searcher() as s: 37 | tags = list(s.lexicon("tags")) 38 | facet = sorting.FieldFacet("tags", allow_overlap=True) 39 | qtag = random.choice(tags) 40 | print("tag=", qtag) 41 | q = query.Term("tags", qtag) 42 | r = s.search(q, groupedby={"tags": facet}) 43 | print(r.runtime) 44 | 45 | facet = sorting.StoredFieldFacet("tags", allow_overlap=True) 46 | r = s.search(q, groupedby={"tags": facet}) 47 | print(r.runtime) 48 | -------------------------------------------------------------------------------- /stress/test_bigindex.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from whoosh import fields 4 | from whoosh.util import now 5 | from whoosh.util.testing import TempIndex 6 | 7 | 8 | def test_20000_single(): 9 | sc = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT) 10 | with TempIndex(sc, "20000single") as ix: 11 | domain = [ 12 | "alfa", 13 | "bravo", 14 | "charlie", 15 | "delta", 16 | "echo", 17 | "foxtrot", 18 | "golf", 19 | "hotel", 20 | "india", 21 | "juliet", 22 | "kilo", 23 | "lima", 24 | ] 25 | 26 | t = now() 27 | for i in range(20000): 28 | w = ix.writer() 29 | w.add_document(id=str(i), text=" ".join(random.sample(domain, 5))) 30 | w.commit() 31 | print("Write single:", now() - t) 32 | 33 | t = now() 34 | ix.optimize() 35 | print("Optimize single:", now() - t) 36 | 37 | 38 | def test_20000_buffered(): 39 | from whoosh.writing import BufferedWriter 40 | 41 | sc = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT) 42 | with TempIndex(sc, "20000buffered") as ix: 43 | domain = [ 44 | "alfa", 45 | "bravo", 46 | "charlie", 47 | "delta", 48 | "echo", 49 | "foxtrot", 50 | "golf", 51 | "hotel", 52 | "india", 53 | "juliet", 54 | "kilo", 55 | "lima", 56 | ] 57 | 58 | t = now() 59 | w = BufferedWriter(ix, limit=100, period=None) 60 | for i in range(20000): 61 | w.add_document(id=str(i), text=" ".join(random.sample(domain, 5))) 62 | w.close() 63 | print("Write buffered:", now() - t) 64 | 65 | t = now() 66 | ix.optimize() 67 | print("Optimize buffered:", now() - t) 68 | 69 | 70 | def test_20000_batch(): 71 | sc = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT) 72 | with TempIndex(sc, "20000batch") as ix: 73 | domain 
= [ 74 | "alfa", 75 | "bravo", 76 | "charlie", 77 | "delta", 78 | "echo", 79 | "foxtrot", 80 | "golf", 81 | "hotel", 82 | "india", 83 | "juliet", 84 | "kilo", 85 | "lima", 86 | ] 87 | 88 | t = now() 89 | w = ix.writer() 90 | for i in range(20000): 91 | w.add_document(id=str(i), text=" ".join(random.sample(domain, 5))) 92 | if not i % 100: 93 | w.commit() 94 | w = ix.writer() 95 | w.commit() 96 | print("Write batch:", now() - t) 97 | 98 | t = now() 99 | ix.optimize() 100 | print("Optimize batch:", now() - t) 101 | -------------------------------------------------------------------------------- /stress/test_bigsort.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import random 3 | import shutil 4 | from datetime import datetime, timezone 5 | 6 | from whoosh import fields, index, query 7 | from whoosh.util import now 8 | 9 | 10 | def test_bigsort(): 11 | times = 30000 12 | dirname = "testindex" 13 | 14 | df = fields.DATETIME(stored=True) 15 | schema = fields.Schema(id=fields.ID(stored=True), date=df) 16 | 17 | if os.path.exists(dirname): 18 | shutil.rmtree(dirname) 19 | os.mkdir(dirname) 20 | ix = index.create_in(dirname, schema) 21 | 22 | print("Writing...") 23 | t = now() 24 | w = ix.writer(limitmb=512) 25 | for i in range(times): 26 | dt = datetime.fromtimestamp( 27 | random.randint(15839593, 1294102139), tz=timezone.utc 28 | ) 29 | w.add_document(id=str(i), date=dt) 30 | w.commit() 31 | print("Writing took ", now() - t) 32 | 33 | ix = index.open_dir(dirname) 34 | s = ix.searcher() 35 | q = query.Wildcard("id", "1?2*") 36 | 37 | t = now() 38 | x = list(df.sortable_terms(s.reader(), "date")) 39 | print(now() - t, len(x)) 40 | 41 | t = now() 42 | for y in x: 43 | p = list(s.postings("date", y).all_ids()) 44 | print(now() - t) 45 | 46 | t = now() 47 | r = s.search(q, limit=25, sortedby="date", reverse=True) 48 | print("Search 1 took", now() - t) 49 | print("len=", r.scored_length()) 50 | 51 | t = now() 52 | r = s.search(q, limit=25, sortedby="date") 53 | print("Search 2 took", now() - t) 54 | 55 | t = now() 56 | r = s.search(q, limit=25, sortedby="date") 57 | print("Search 2 took", now() - t) 58 | 59 | from heapq import nlargest 60 | 61 | t = now() 62 | sf = s.stored_fields 63 | gen = ((sf(n)["date"], n) for n in q.docs(s)) 64 | r = nlargest(25, gen) 65 | print(now() - t) 66 | -------------------------------------------------------------------------------- /stress/test_bigtable.py: -------------------------------------------------------------------------------- 1 | from random import randint, shuffle 2 | 3 | from nose.tools import assert_equal # type: ignore @UnresolvedImport 4 | from whoosh.filedb.filetables import HashReader, HashWriter 5 | from whoosh.util.testing import TempStorage 6 | 7 | 8 | def test_bigtable(): 9 | with TempStorage("bigtable") as st: 10 | 11 | def randstring(min, max): 12 | return "".join(chr(randint(1, 255)) for _ in range(randint(min, max))) 13 | 14 | count = 100000 15 | samp = {randstring(1, 50): randstring(1, 50) for _ in range(count)} 16 | 17 | fhw = HashWriter(st.create_file("big.hsh")) 18 | fhw.add_all(samp.items()) 19 | fhw.close() 20 | 21 | fhr = HashReader(st.open_file("big.hsh")) 22 | keys = list(samp.keys()) 23 | shuffle(keys) 24 | for key in keys: 25 | assert_equal(samp[key], fhr[key]) 26 | 27 | set1 = set(samp.items()) 28 | set2 = set(fhr.items()) 29 | assert_equal(set1, set2) 30 | 31 | fhr.close() 32 | -------------------------------------------------------------------------------- 
/stress/test_hugeindex.py: -------------------------------------------------------------------------------- 1 | import struct 2 | 3 | from nose.tools import assert_equal  # type: ignore @UnresolvedImport 4 | from whoosh import formats 5 | from whoosh.filedb.filepostings import FilePostingReader, FilePostingWriter 6 | from whoosh.util.testing import TempStorage 7 | 8 | 9 | def test_huge_postfile(): 10 | with TempStorage("hugeindex") as st: 11 | pf = st.create_file("test.pst") 12 | 13 | gb5 = 5 * 1024 * 1024 * 1024 14 | pf.seek(gb5) 15 | pf.write(b"\x00\x00\x00\x00")  # the file is binary, so write bytes 16 | assert_equal(pf.tell(), gb5 + 4) 17 | 18 | fpw = FilePostingWriter(pf) 19 | f = formats.Frequency(None) 20 | offset = fpw.start(f) 21 | for i in range(10): 22 | fpw.write(i, float(i), struct.pack("!I", i), 10) 23 | posttotal = fpw.finish() 24 | assert_equal(posttotal, 10) 25 | fpw.close() 26 | 27 | pf = st.open_file("test.pst") 28 | pfr = FilePostingReader(pf, offset, f) 29 | i = 0 30 | while pfr.is_active(): 31 | assert_equal(pfr.id(), i) 32 | assert_equal(pfr.weight(), float(i)) 33 | assert_equal(pfr.value(), struct.pack("!I", i)) 34 | pfr.next() 35 | i += 1 36 | pf.close() 37 | -------------------------------------------------------------------------------- /stress/test_threading.py: -------------------------------------------------------------------------------- 1 | import random 2 | import threading 3 | import time 4 | 5 | from whoosh import fields, query 6 | from whoosh.util.testing import TempStorage 7 | 8 | 9 | def test_readwrite(): 10 | schema = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT) 11 | with TempStorage("threading") as st: 12 | domain = ( 13 | "alfa", 14 | "bravo", 15 | "charlie", 16 | "delta", 17 | "echo", 18 | "foxtrot", 19 | "golf", 20 | "hotel", 21 | "india", 22 | "juliet", 23 | "kilo", 24 | "lima", 25 | "mike", 26 | "november", 27 | "oscar", 28 | "papa", 29 | "quebec", 30 | "romeo", 31 | "sierra", 32 | "tango", 33 | "uniform", 34 | "victor", 35 | "whiskey", 36 | "xray", 37 | "yankee", 38 | "zulu", 39 | ) 40 | 41 | class WriterThread(threading.Thread): 42 | def run(self): 43 | ix = st.create_index(schema)  # create_index() takes just the schema 44 | num = 0 45 | 46 | for i in range(50): 47 | print(i) 48 | w = ix.writer() 49 | for _ in range(random.randint(1, 100)): 50 | content = " ".join(random.sample(domain, random.randint(5, 20))) 51 | w.add_document(id=str(num), content=content) 52 | num += 1 53 | w.commit() 54 | 55 | time.sleep(0.1) 56 | 57 | class SearcherThread(threading.Thread): 58 | def run(self): 59 | print(self.name + " starting") 60 | for _ in range(10): 61 | ix = st.open_index() 62 | s = ix.searcher() 63 | q = query.Term("content", random.choice(domain)) 64 | s.search(q, limit=10) 65 | s.close() 66 | ix.close() 67 | time.sleep(0.1) 68 | print(self.name + " done") 69 | 70 | wt = WriterThread() 71 | wt.start() 72 | time.sleep(0.5) 73 | for _ in range(20): 74 | SearcherThread().start() 75 | time.sleep(0.5) 76 | wt.join() 77 | -------------------------------------------------------------------------------- /stress/test_update.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from nose.tools import assert_equal 4 | from whoosh import fields, query 5 | from whoosh.util.testing import TempIndex 6 | 7 | 8 | def test_many_updates(): 9 | schema = fields.Schema(key=fields.ID(unique=True, stored=True)) 10 | with TempIndex(schema, "manyupdates") as ix: 11 | for _ in range(10000): 12 | num = random.randint(0, 5000) 13 | w = ix.writer()
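# update_document() uses the schema's unique "key" field to delete any
# existing document with the same key before adding the replacement.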
14 | w.update_document(key=str(num)) 15 | w.commit() 16 | 17 | with ix.searcher() as s: 18 | result = [d["key"] for d in s.search(query.Every())] 19 | assert_equal(len(result), len(set(result))) 20 | -------------------------------------------------------------------------------- /sweep.yaml: -------------------------------------------------------------------------------- 1 | # Sweep AI turns bugs & feature requests into code changes (https://sweep.dev) 2 | # For details on our config file, check out our docs at https://docs.sweep.dev/usage/config 3 | 4 | # This setting contains a list of rules that Sweep will check for. If any of these rules are broken in a new commit, Sweep will create a pull request to fix the broken rule. 5 | rules: 6 | - "All new business logic should have corresponding unit tests." 7 | - "Refactor large functions to be more modular." 8 | - "Add docstrings to all functions and file headers." 9 | 10 | # This is the branch that Sweep will develop from and make pull requests to. Most people use 'main' or 'master' but some users also use 'dev' or 'staging'. 11 | branch: 'main' 12 | 13 | # By default Sweep will read the logs and outputs from your existing GitHub Actions. To disable this, set this to false. 14 | gha_enabled: True 15 | 16 | # This is the description of your project. It will be used by Sweep when creating PRs. You can tell Sweep what's unique about your project, what frameworks you use, or anything else you want. 17 | # 18 | # Example: 19 | # 20 | # description: sweepai/sweep is a python project. The main api endpoints are in sweepai/api.py. Write code that adheres to PEP8. 21 | description: '' 22 | 23 | # This sets whether to create pull requests as drafts. If this is set to True, then all pull requests will be created as drafts and GitHub Actions will not be triggered. 24 | draft: False 25 | 26 | # This is a list of directories that Sweep will not be able to edit.
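# Example:
#
# blocked_dirs: ["benchmark/", "files/"]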
27 | blocked_dirs: [] 28 | -------------------------------------------------------------------------------- /tests/english-words.10.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sygil-Dev/whoosh-reloaded/ab775ca5bfb443c2b3598b6a2d6189e595f7c377/tests/english-words.10.gz -------------------------------------------------------------------------------- /tests/test_compound.py: -------------------------------------------------------------------------------- 1 | from whoosh.filedb.compound import CompoundStorage 2 | from whoosh.filedb.filestore import RamStorage 3 | from whoosh.util.testing import TempStorage 4 | 5 | 6 | def _test_simple_compound(st): 7 | alist = [1, 2, 3, 5, -5, -4, -3, -2] 8 | blist = [1, 12, 67, 8, 2, 1023] 9 | clist = [100, -100, 200, -200] 10 | 11 | with st.create_file("a") as af: 12 | for x in alist: 13 | af.write_int(x) 14 | with st.create_file("b") as bf: 15 | for x in blist: 16 | bf.write_varint(x) 17 | with st.create_file("c") as cf: 18 | for x in clist: 19 | cf.write_int(x) 20 | 21 | f = st.create_file("f") 22 | CompoundStorage.assemble(f, st, ["a", "b", "c"]) 23 | 24 | f = CompoundStorage(st.open_file("f")) 25 | with f.open_file("a") as af: 26 | for x in alist: 27 | assert x == af.read_int() 28 | assert af.read() == b"" 29 | 30 | with f.open_file("b") as bf: 31 | for x in blist: 32 | assert x == bf.read_varint() 33 | assert bf.read() == b"" 34 | 35 | with f.open_file("c") as cf: 36 | for x in clist: 37 | assert x == cf.read_int() 38 | assert cf.read() == b"" 39 | 40 | 41 | def test_simple_compound_mmap(): 42 | with TempStorage("compound") as st: 43 | assert st.supports_mmap 44 | _test_simple_compound(st) 45 | 46 | 47 | def test_simple_compound_nomap(): 48 | st = RamStorage() 49 | _test_simple_compound(st) 50 | 51 | 52 | # def test_unclosed_mmap(): 53 | # with TempStorage("unclosed") as st: 54 | # assert st.supports_mmap 55 | # with st.create_file("a") as af: 56 | # af.write("alfa") 57 | # with st.create_file("b") as bf: 58 | # bf.write("bravo") 59 | # f = st.create_file("f") 60 | # CompoundStorage.assemble(f, st, ["a", "b"]) 61 | # 62 | # f = CompoundStorage(st, "f") 63 | -------------------------------------------------------------------------------- /tests/test_flexible.py: -------------------------------------------------------------------------------- 1 | from whoosh import fields 2 | from whoosh.util.testing import TempIndex 3 | 4 | 5 | def test_addfield(): 6 | schema = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT) 7 | with TempIndex(schema, "addfield") as ix: 8 | w = ix.writer() 9 | w.add_document(id="a", content="alfa") 10 | w.add_document(id="b", content="bravo") 11 | w.add_document(id="c", content="charlie") 12 | w.commit() 13 | 14 | ix.add_field("added", fields.KEYWORD(stored=True)) 15 | 16 | w = ix.writer() 17 | w.add_document(id="d", content="delta", added="fourth") 18 | w.add_document(id="e", content="echo", added="fifth") 19 | w.commit(merge=False) 20 | 21 | with ix.searcher() as s: 22 | assert ("id", "d") in s.reader() 23 | assert s.document(id="d") == {"id": "d", "added": "fourth"} 24 | assert s.document(id="b") == {"id": "b"} 25 | 26 | 27 | def test_addfield_spelling(): 28 | schema = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT) 29 | with TempIndex(schema, "addfield") as ix: 30 | w = ix.writer() 31 | w.add_document(id="a", content="alfa") 32 | w.add_document(id="b", content="bravo") 33 | w.add_document(id="c", content="charlie") 34 | w.commit() 
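# add_field() commits the schema change to the index; documents written
# before this point simply have no value for the new field.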
35 | 36 | ix.add_field("added", fields.KEYWORD(stored=True)) 37 | 38 | w = ix.writer() 39 | w.add_document(id="d", content="delta", added="fourth") 40 | w.add_document(id="e", content="echo", added="fifth") 41 | w.commit(merge=False) 42 | 43 | with ix.searcher() as s: 44 | assert s.document(id="d") == {"id": "d", "added": "fourth"} 45 | assert s.document(id="b") == {"id": "b"} 46 | 47 | 48 | def test_removefield(): 49 | schema = fields.Schema( 50 | id=fields.ID(stored=True), content=fields.TEXT, city=fields.KEYWORD(stored=True) 51 | ) 52 | with TempIndex(schema, "removefield") as ix: 53 | w = ix.writer() 54 | w.add_document(id="b", content="bravo", city="baghdad") 55 | w.add_document(id="c", content="charlie", city="cairo") 56 | w.add_document(id="d", content="delta", city="dakar") 57 | w.commit() 58 | 59 | with ix.searcher() as s: 60 | assert s.document(id="c") == {"id": "c", "city": "cairo"} 61 | 62 | w = ix.writer() 63 | w.remove_field("content") 64 | w.remove_field("city") 65 | w.commit() 66 | 67 | ixschema = ix._current_schema() 68 | assert ixschema.names() == ["id"] 69 | assert ixschema.stored_names() == ["id"] 70 | 71 | with ix.searcher() as s: 72 | assert ("content", b"charlie") not in s.reader() 73 | assert s.document(id="c") == {"id": "c"} 74 | 75 | 76 | def test_optimize_away(): 77 | schema = fields.Schema( 78 | id=fields.ID(stored=True), content=fields.TEXT, city=fields.KEYWORD(stored=True) 79 | ) 80 | with TempIndex(schema, "optimizeaway") as ix: 81 | w = ix.writer() 82 | w.add_document(id="b", content="bravo", city="baghdad") 83 | w.add_document(id="c", content="charlie", city="cairo") 84 | w.add_document(id="d", content="delta", city="dakar") 85 | w.commit() 86 | 87 | with ix.searcher() as s: 88 | assert s.document(id="c") == {"id": "c", "city": "cairo"} 89 | 90 | w = ix.writer() 91 | w.remove_field("content") 92 | w.remove_field("city") 93 | w.commit(optimize=True) 94 | 95 | with ix.searcher() as s: 96 | assert ("content", "charlie") not in s.reader() 97 | assert s.document(id="c") == {"id": "c"} 98 | 99 | 100 | if __name__ == "__main__": 101 | test_addfield() 102 | -------------------------------------------------------------------------------- /tests/test_misc.py: -------------------------------------------------------------------------------- 1 | import os 2 | import threading 3 | import time 4 | 5 | from whoosh.util.filelock import try_for 6 | from whoosh.util.numeric import byte_to_length, length_to_byte 7 | from whoosh.util.testing import TempStorage 8 | 9 | 10 | def test_now(): 11 | from whoosh.util import now 12 | 13 | t1 = now() 14 | t2 = now() 15 | assert t1 <= t2 16 | 17 | 18 | def test_storage_creation(): 19 | import tempfile 20 | import uuid 21 | 22 | from whoosh import fields 23 | from whoosh.filedb.filestore import FileStorage 24 | 25 | schema = fields.Schema(text=fields.TEXT) 26 | uid = uuid.uuid4() 27 | dirpath = os.path.join(tempfile.gettempdir(), str(uid)) 28 | assert not os.path.exists(dirpath) 29 | 30 | st = FileStorage(dirpath) 31 | st.create() 32 | assert os.path.exists(dirpath) 33 | 34 | ix = st.create_index(schema) 35 | with ix.writer() as w: 36 | w.add_document(text="alfa bravo") 37 | w.add_document(text="bracho charlie") 38 | 39 | st.destroy() 40 | assert not os.path.exists(dirpath) 41 | 42 | 43 | def test_ramstorage(): 44 | from whoosh.filedb.filestore import RamStorage 45 | 46 | st = RamStorage() 47 | lock = st.lock("test") 48 | lock.acquire() 49 | lock.release() 50 | 51 | 52 | def test_filelock_simple(): 53 | with TempStorage("simplefilelock") 
as st: 54 | lock1 = st.lock("testlock") 55 | lock2 = st.lock("testlock") 56 | assert lock1 is not lock2 57 | 58 | assert lock1.acquire() 59 | assert st.file_exists("testlock") 60 | assert not lock2.acquire() 61 | lock1.release() 62 | assert lock2.acquire() 63 | assert not lock1.acquire() 64 | lock2.release() 65 | 66 | 67 | def test_threaded_filelock(): 68 | with TempStorage("threadedfilelock") as st: 69 | lock1 = st.lock("testlock") 70 | result = [] 71 | 72 | # The thread function tries to acquire the lock and then quits 73 | def fn(): 74 | lock2 = st.lock("testlock") 75 | gotit = try_for(lock2.acquire, 1.0, 0.1) 76 | if gotit: 77 | result.append(True) 78 | lock2.release() 79 | 80 | t = threading.Thread(target=fn) 81 | 82 | # Acquire the lock in this thread 83 | lock1.acquire() 84 | # Start the other thread trying to acquire the lock 85 | t.start() 86 | # Wait for a bit 87 | time.sleep(0.15) 88 | # Release the lock 89 | lock1.release() 90 | # Wait for the other thread to finish 91 | t.join() 92 | # If the other thread got the lock, it should have appended True to the 93 | # "results" list. 94 | assert result == [True] 95 | 96 | 97 | def test_length_byte(): 98 | source = list(range(11)) 99 | xform = [length_to_byte(n) for n in source] 100 | result = [byte_to_length(n) for n in xform] 101 | assert source == result 102 | 103 | 104 | def test_version_object(): 105 | from whoosh.util.versions import SimpleVersion as sv 106 | 107 | assert sv.parse("1") == sv(1) 108 | assert sv.parse("1.2") == sv(1, 2) 109 | assert sv.parse("1.2b") == sv(1, 2, ex="b") 110 | assert sv.parse("1.2rc") == sv(1, 2, ex="rc") 111 | assert sv.parse("1.2b3") == sv(1, 2, ex="b", exnum=3) 112 | assert sv.parse("1.2.3") == sv(1, 2, 3) 113 | assert sv.parse("1.2.3a") == sv(1, 2, 3, "a") 114 | assert sv.parse("1.2.3rc") == sv(1, 2, 3, "rc") 115 | assert sv.parse("1.2.3a4") == sv(1, 2, 3, "a", 4) 116 | assert sv.parse("1.2.3rc2") == sv(1, 2, 3, "rc", 2) 117 | assert sv.parse("999.999.999c999") == sv(999, 999, 999, "c", 999) 118 | 119 | assert sv.parse("1.2") == sv.parse("1.2") 120 | assert sv("1.2") != sv("1.3") 121 | assert sv.parse("1.0") < sv.parse("1.1") 122 | assert sv.parse("1.0") < sv.parse("2.0") 123 | assert sv.parse("1.2.3a4") < sv.parse("1.2.3a5") 124 | assert sv.parse("1.2.3a5") > sv.parse("1.2.3a4") 125 | assert sv.parse("1.2.3c99") < sv.parse("1.2.4") 126 | assert sv.parse("1.2.3a4") != sv.parse("1.2.3a5") 127 | assert sv.parse("1.2.3a5") != sv.parse("1.2.3a4") 128 | assert sv.parse("1.2.3c99") != sv.parse("1.2.4") 129 | assert sv.parse("1.2.3a4") <= sv.parse("1.2.3a5") 130 | assert sv.parse("1.2.3a5") >= sv.parse("1.2.3a4") 131 | assert sv.parse("1.2.3c99") <= sv.parse("1.2.4") 132 | assert sv.parse("1.2") <= sv.parse("1.2") 133 | 134 | assert sv(1, 2, 3).to_int() == 17213488128 135 | assert sv.from_int(17213488128) == sv(1, 2, 3) 136 | -------------------------------------------------------------------------------- /tests/test_postings.py: -------------------------------------------------------------------------------- 1 | from whoosh import analysis, fields 2 | from whoosh.codec import default_codec 3 | from whoosh.formats import ( 4 | CharacterBoosts, 5 | Characters, 6 | Existence, 7 | Frequency, 8 | PositionBoosts, 9 | Positions, 10 | ) 11 | from whoosh.util.testing import TempStorage 12 | 13 | 14 | def _roundtrip(content, format_, astype, ana=None): 15 | with TempStorage("roundtrip") as st: 16 | codec = default_codec() 17 | seg = codec.new_segment(st, "") 18 | ana = ana or analysis.StandardAnalyzer() 19 | 
field = fields.FieldType(format=format_, analyzer=ana) 20 | 21 | fw = codec.field_writer(st, seg) 22 | fw.start_field("f1", field) 23 | for text, _, weight, valuestring in sorted(field.index(content)): 24 | fw.start_term(text) 25 | fw.add(0, weight, valuestring, None) 26 | fw.finish_term() 27 | fw.finish_field() 28 | fw.close() 29 | 30 | tr = codec.terms_reader(st, seg) 31 | ps = [] 32 | for fieldname, btext in tr.terms(): 33 | m = tr.matcher(fieldname, btext, format_) 34 | ps.append((field.from_bytes(btext), m.value_as(astype))) 35 | tr.close() 36 | return ps 37 | 38 | 39 | def test_existence_postings(): 40 | content = "alfa bravo charlie" 41 | assert _roundtrip(content, Existence(), "frequency") == [ 42 | ("alfa", 1), 43 | ("bravo", 1), 44 | ("charlie", 1), 45 | ] 46 | 47 | 48 | def test_frequency_postings(): 49 | content = "alfa bravo charlie bravo alfa alfa" 50 | assert _roundtrip(content, Frequency(), "frequency") == [ 51 | ("alfa", 3), 52 | ("bravo", 2), 53 | ("charlie", 1), 54 | ] 55 | 56 | 57 | def test_position_postings(): 58 | content = "alfa bravo charlie bravo alfa alfa" 59 | assert _roundtrip(content, Positions(), "positions") == [ 60 | ("alfa", [0, 4, 5]), 61 | ("bravo", [1, 3]), 62 | ("charlie", [2]), 63 | ] 64 | assert _roundtrip(content, Positions(), "frequency") == [ 65 | ("alfa", 3), 66 | ("bravo", 2), 67 | ("charlie", 1), 68 | ] 69 | 70 | 71 | def test_character_postings(): 72 | content = "alfa bravo charlie bravo alfa alfa" 73 | assert _roundtrip(content, Characters(), "characters") == [ 74 | ("alfa", [(0, 0, 4), (4, 25, 29), (5, 30, 34)]), 75 | ("bravo", [(1, 5, 10), (3, 19, 24)]), 76 | ("charlie", [(2, 11, 18)]), 77 | ] 78 | assert _roundtrip(content, Characters(), "positions") == [ 79 | ("alfa", [0, 4, 5]), 80 | ("bravo", [1, 3]), 81 | ("charlie", [2]), 82 | ] 83 | assert _roundtrip(content, Characters(), "frequency") == [ 84 | ("alfa", 3), 85 | ("bravo", 2), 86 | ("charlie", 1), 87 | ] 88 | 89 | 90 | def test_posboost_postings(): 91 | pbs = PositionBoosts() 92 | ana = analysis.RegexTokenizer(r"\S+") | analysis.DelimitedAttributeFilter() 93 | content = "alfa^2 bravo^0.1 charlie^2 bravo^0.5 alfa alfa" 94 | assert _roundtrip(content, pbs, "position_boosts", ana) == [ 95 | ("alfa", [(0, 2), (4, 1), (5, 1)]), 96 | ("bravo", [(1, 0.1), (3, 0.5)]), 97 | ("charlie", [(2, 2)]), 98 | ] 99 | assert _roundtrip(content, pbs, "positions", ana) == [ 100 | ("alfa", [0, 4, 5]), 101 | ("bravo", [1, 3]), 102 | ("charlie", [2]), 103 | ] 104 | assert _roundtrip(content, pbs, "frequency", ana) == [ 105 | ("alfa", 3), 106 | ("bravo", 2), 107 | ("charlie", 1), 108 | ] 109 | 110 | 111 | def test_charboost_postings(): 112 | cbs = CharacterBoosts() 113 | ana = analysis.RegexTokenizer(r"\S+") | analysis.DelimitedAttributeFilter() 114 | content = "alfa^2 bravo^0.1 charlie^2 bravo^0.5 alfa alfa" 115 | assert _roundtrip(content, cbs, "character_boosts", ana) == [ 116 | ("alfa", [(0, 0, 4, 2), (4, 37, 41, 1), (5, 42, 46, 1)]), 117 | ("bravo", [(1, 7, 12, 0.1), (3, 27, 32, 0.5)]), 118 | ("charlie", [(2, 17, 24, 2)]), 119 | ] 120 | assert _roundtrip(content, cbs, "position_boosts", ana) == [ 121 | ("alfa", [(0, 2), (4, 1), (5, 1)]), 122 | ("bravo", [(1, 0.1), (3, 0.5)]), 123 | ("charlie", [(2, 2)]), 124 | ] 125 | assert _roundtrip(content, cbs, "characters", ana) == [ 126 | ("alfa", [(0, 0, 4), (4, 37, 41), (5, 42, 46)]), 127 | ("bravo", [(1, 7, 12), (3, 27, 32)]), 128 | ("charlie", [(2, 17, 24)]), 129 | ] 130 | assert _roundtrip(content, cbs, "positions", ana) == [ 131 | ("alfa", [0, 4, 5]), 132 | 
("bravo", [1, 3]), 133 | ("charlie", [2]), 134 | ] 135 | assert _roundtrip(content, cbs, "frequency", ana) == [ 136 | ("alfa", 3), 137 | ("bravo", 2), 138 | ("charlie", 1), 139 | ] 140 | -------------------------------------------------------------------------------- /tests/test_stem.py: -------------------------------------------------------------------------------- 1 | from whoosh.lang.snowball.english import EnglishStemmer 2 | from whoosh.lang.snowball.finnish import FinnishStemmer 3 | from whoosh.lang.snowball.french import FrenchStemmer 4 | from whoosh.lang.snowball.spanish import SpanishStemmer 5 | 6 | 7 | def test_english(): 8 | s = EnglishStemmer() 9 | assert s.stem("hello") == "hello" 10 | assert s.stem("atlas") == "atlas" 11 | assert s.stem("stars") == "star" 12 | 13 | 14 | def test_french(): 15 | s = FrenchStemmer() 16 | assert s.stem("adresse") == "adress" 17 | assert s.stem("lettres") == "lettr" 18 | 19 | 20 | def test_finnish(): 21 | s = FinnishStemmer() 22 | assert s.stem("valitse") == "valits" 23 | assert s.stem("koko") == "koko" 24 | assert s.stem("erikoismerkit") == "erikoismerk" 25 | 26 | 27 | def test_spanish_spell_suffix(): 28 | word = "tgue" 29 | s = SpanishStemmer() 30 | w = s.stem(word) 31 | assert w == "tgu" 32 | -------------------------------------------------------------------------------- /tests/test_weightings.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import sys 3 | from itertools import permutations 4 | from random import choice, randint 5 | 6 | from whoosh import fields, query, scoring 7 | from whoosh.filedb.filestore import RamStorage 8 | 9 | 10 | def u(s): 11 | return s.decode("ascii") if isinstance(s, bytes) else s 12 | 13 | 14 | def _weighting_classes(ignore): 15 | # Get all the subclasses of Weighting in whoosh.scoring 16 | return [ 17 | c 18 | for _, c in inspect.getmembers(scoring, inspect.isclass) 19 | if scoring.Weighting in c.__bases__ and c not in ignore 20 | ] 21 | 22 | 23 | def test_all(): 24 | domain = [u("alfa"), u("bravo"), u("charlie"), u("delta"), u("echo"), u("foxtrot")] 25 | schema = fields.Schema(text=fields.TEXT) 26 | storage = RamStorage() 27 | ix = storage.create_index(schema) 28 | w = ix.writer() 29 | for _ in range(100): 30 | w.add_document(text=u(" ").join(choice(domain) for _ in range(randint(10, 20)))) 31 | w.commit() 32 | 33 | # List ABCs that should not be tested 34 | abcs = () 35 | # provide initializer arguments for any weighting classes that require them 36 | init_args = { 37 | "MultiWeighting": ([scoring.BM25F()], {"text": scoring.Frequency()}), 38 | "ReverseWeighting": ([scoring.BM25F()], {}), 39 | } 40 | 41 | for wclass in _weighting_classes(abcs): 42 | try: 43 | if wclass.__name__ in init_args: 44 | args, kwargs = init_args[wclass.__name__] 45 | weighting = wclass(*args, **kwargs) 46 | else: 47 | weighting = wclass() 48 | except TypeError: 49 | e = sys.exc_info()[1] 50 | raise TypeError(f"Error instantiating {wclass!r}: {e}") 51 | 52 | with ix.searcher(weighting=weighting) as s: 53 | try: 54 | for word in domain: 55 | s.search(query.Term("text", word)) 56 | except ValueError: 57 | e = sys.exc_info()[1] 58 | e.msg = f"Error searching with {wclass!r}: {e}" 59 | raise 60 | 61 | 62 | def test_compatibility(): 63 | from whoosh.scoring import Weighting 64 | 65 | # This is the old way of doing a custom weighting model, check that 66 | # it's still supported... 
67 | class LegacyWeighting(Weighting): 68 | use_final = True 69 | 70 | def score(self, searcher, fieldname, text, docnum, weight): 71 | return weight + 0.5 72 | 73 | def final(self, searcher, docnum, score): 74 | return score * 1.5 75 | 76 | schema = fields.Schema(text=fields.TEXT) 77 | ix = RamStorage().create_index(schema) 78 | w = ix.writer() 79 | domain = "alfa bravo charlie delta".split() 80 | for ls in permutations(domain, 3): 81 | w.add_document(text=u(" ").join(ls)) 82 | w.commit() 83 | 84 | s = ix.searcher(weighting=LegacyWeighting()) 85 | r = s.search(query.Term("text", u("bravo"))) 86 | assert r.score(0) == 2.25 87 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py27, py38, py39, py310, py311, py312 3 | 4 | [testenv] 5 | deps = 6 | pytest 7 | pytest-pep8 8 | commands = py.test -s {posargs} tests 9 | --------------------------------------------------------------------------------
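For reference, a minimal, self-contained sketch of how the varint helpers in src/whoosh/util/varints.py above round-trip values; it uses only the functions shown in that file plus the standard library:

import io

from whoosh.util.varints import (
    decode_signed_varint,
    read_varint,
    signed_varint,
    varint,
    varint_to_int,
)

# Unsigned round trip: values below 128 fit in one byte; larger values use
# seven payload bits per byte, with the high bit as a continuation flag.
assert varint(5) == b"\x05"
assert varint(300) == b"\xac\x02"
assert varint_to_int(varint(300)) == 300

# Signed round trip: zig-zag encoding keeps small magnitudes small on disk.
for n in (-300, -1, 0, 1, 300):
    assert decode_signed_varint(varint_to_int(signed_varint(n))) == n

# read_varint() pulls bytes from any read(n)-style callable.
buf = io.BytesIO(varint(123456))
assert read_varint(buf.read) == 123456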