├── .github ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── publish-docs.yml │ ├── publish.yml │ └── test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── AGENTS.md ├── CNAME ├── LICENSE ├── README.md ├── codecov.yml ├── docs ├── about.md ├── assets │ ├── Presentation_PyData_Amsterdam_2025.pdf │ ├── generate_text_png.py │ ├── sieve.png │ └── sieves_sieve_style.png ├── bridge.md ├── doc.md ├── engines │ ├── base_engine.md │ ├── dspy.md │ ├── gliner.md │ ├── huggingface.md │ ├── langchain.md │ └── outlines.md ├── guides │ ├── custom_tasks.md │ ├── distillation.md │ ├── getting_started.md │ ├── optimization.md │ ├── preprocessing.md │ └── serialization.md ├── index.md ├── pipeline.md └── tasks │ ├── predictive │ ├── classification.md │ ├── information_extraction.md │ ├── ner.md │ ├── pii_masking.md │ ├── question_answering.md │ ├── sentiment_analysis.md │ ├── summarization.md │ └── translation.md │ ├── preprocessing │ ├── chunking │ │ ├── chonkie.md │ │ ├── chunking.md │ │ └── naive.md │ └── ingestion │ │ ├── docling.md │ │ ├── ingestion.md │ │ ├── marker.md │ │ └── unstructured.md │ └── task.md ├── examples └── pydata_amsterdam_demo.py ├── mkdocs.yml ├── pyproject.toml ├── setup.cfg ├── setup.py ├── sieves ├── __init__.py ├── data │ ├── __init__.py │ └── doc.py ├── engines │ ├── __init__.py │ ├── core.py │ ├── dspy_.py │ ├── engine_import.py │ ├── engine_type.py │ ├── glix_.py │ ├── huggingface_.py │ ├── langchain_.py │ ├── missing.py │ ├── outlines_.py │ ├── types.py │ └── utils.py ├── pipeline │ ├── __init__.py │ └── core.py ├── serialization.py ├── tasks │ ├── __init__.py │ ├── core.py │ ├── optimization │ │ ├── __init__.py │ │ └── core.py │ ├── postprocessing │ │ ├── __init__.py │ │ └── distillation │ │ │ ├── __init__.py │ │ │ ├── distillation_import.py │ │ │ └── types.py │ ├── predictive │ │ ├── __init__.py │ │ ├── bridges.py │ │ ├── classification │ │ │ ├── __init__.py │ │ │ ├── bridges.py │ │ │ └── core.py │ │ ├── core.py │ │ ├── information_extraction │ │ │ ├── __init__.py │ │ │ ├── bridges.py │ │ │ └── core.py │ │ ├── ner │ │ │ ├── __init__.py │ │ │ ├── bridges.py │ │ │ └── core.py │ │ ├── pii_masking │ │ │ ├── __init__.py │ │ │ ├── bridges.py │ │ │ └── core.py │ │ ├── question_answering │ │ │ ├── __init__.py │ │ │ ├── bridges.py │ │ │ └── core.py │ │ ├── sentiment_analysis │ │ │ ├── __init__.py │ │ │ ├── bridges.py │ │ │ └── core.py │ │ ├── summarization │ │ │ ├── __init__.py │ │ │ ├── bridges.py │ │ │ └── core.py │ │ └── translation │ │ │ ├── __init__.py │ │ │ ├── bridges.py │ │ │ └── core.py │ ├── preprocessing │ │ ├── __init__.py │ │ ├── chunking │ │ │ ├── __init__.py │ │ │ ├── chonkie_.py │ │ │ ├── core.py │ │ │ └── naive.py │ │ └── ingestion │ │ │ ├── __init__.py │ │ │ ├── core.py │ │ │ ├── docling_.py │ │ │ ├── marker_.py │ │ │ └── unstructured_.py │ ├── types.py │ └── utils.py └── tests │ ├── assets │ ├── 1204.0162v2.pdf │ └── dummy.txt │ ├── conftest.py │ ├── tasks │ ├── predictive │ │ ├── test_classification.py │ │ ├── test_information_extraction.py │ │ ├── test_ner.py │ │ ├── test_pii_masking.py │ │ ├── test_question_answering.py │ │ ├── test_sentiment_analysis.py │ │ ├── test_summarization.py │ │ └── test_translation.py │ ├── preprocessing │ │ ├── chunking │ │ │ ├── test_chonkie.py │ │ │ ├── test_chunking.py │ │ │ └── test_naivechunker.py │ │ └── ingestion │ │ │ ├── test_docling.py │ │ │ ├── test_ingestion.py │ │ │ ├── test_marker.py │ │ │ └── test_unstructured.py │ ├── test_distillation.py │ ├── test_misc.py │ └── test_optimization.py │ ├── test_doc.py │ ├── 
test_pipeline.py │ ├── test_serialization.py │ └── test_strict_mode.py ├── ty.toml └── uv.lock /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | 4 | ## Related Issues 5 | 6 | \- 7 | 8 | ## Changes Made 9 | 10 | 11 | ## Checklist 12 | - [ ] Tests have been extended to cover changes in functionality 13 | - [ ] Existing and new tests succeed 14 | - [ ] Documentation updated (if applicable) 15 | - [ ] Related issues linked 16 | 17 | ## Screenshots/Examples (if applicable) 18 | 19 | -------------------------------------------------------------------------------- /.github/workflows/publish-docs.yml: -------------------------------------------------------------------------------- 1 | # Simple workflow for deploying static content to GitHub Pages 2 | name: Deploy documentation 3 | 4 | on: 5 | # Runs on pushes targeting the default branch 6 | push: 7 | branches: ["main"] 8 | 9 | # Allows you to run this workflow manually from the Actions tab 10 | workflow_dispatch: 11 | 12 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 13 | permissions: 14 | contents: read 15 | pages: write 16 | id-token: write 17 | 18 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 19 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 20 | concurrency: 21 | group: "pages" 22 | cancel-in-progress: false 23 | 24 | jobs: 25 | # Single deploy job since we're just deploying 26 | deploy: 27 | environment: 28 | name: github-pages 29 | url: ${{ steps.deployment.outputs.page_url }} 30 | runs-on: ubuntu-latest 31 | steps: 32 | - name: Checkout 33 | uses: actions/checkout@v4 34 | 35 | - name: Setup Pages 36 | uses: actions/configure-pages@v5 37 | 38 | - name: Install dependencies 39 | run: | 40 | pip install "mkdocstrings[python]>=0.27,<1" 41 | pip install "mkdocs-material>=9.6,<10" 42 | 43 | - name: Build docs 44 | run: | 45 | mkdocs build 46 | 47 | - name: Upload artifact 48 | uses: actions/upload-pages-artifact@v3 49 | with: 50 | # Upload entire repository 51 | path: site 52 | 53 | - name: Deploy to GitHub Pages 54 | id: deployment 55 | uses: actions/deploy-pages@v4 56 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package to PyPI 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | workflow_dispatch: 8 | 9 | jobs: 10 | pypi-publish: 11 | name: Publish release to PyPI 12 | runs-on: ubuntu-latest 13 | permissions: 14 | id-token: write 15 | contents: read 16 | steps: 17 | - uses: actions/checkout@v4 18 | 19 | - name: Set up Python 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: "3.x" 23 | 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | python -m pip install build 28 | 29 | - name: Build package 30 | run: python -m build 31 | 32 | - name: Publish package distributions to PyPI 33 | uses: pypa/gh-action-pypi-publish@release/v1 34 | with: 35 | password: ${{ secrets.PYPI_TOKEN }} 36 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | pull_request: 5 | workflow_dispatch: 6 | push: 7 | branches: 8 | - main 9 | 
paths: 10 | - sieves/** 11 | 12 | jobs: 13 | test: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: ["3.12"] 18 | 19 | steps: 20 | 21 | - name: Maximize build space 22 | shell: bash 23 | run: | 24 | sudo rm -rf /usr/share/dotnet \ 25 | /usr/local/lib/android \ 26 | /opt/ghc \ 27 | /opt/hostedtoolcache/CodeQL 28 | 29 | - name: Checkout repo 30 | uses: actions/checkout@v4 31 | with: 32 | fetch-depth: 1 33 | 34 | - name: Set up Python ${{ matrix.python-version }} 35 | uses: actions/setup-python@v4 36 | with: 37 | python-version: ${{ matrix.python-version }} 38 | 39 | - name: Install uv 40 | run: | 41 | curl -LsSf https://astral.sh/uv/install.sh | sh 42 | echo "$HOME/.cargo/bin" >> $GITHUB_PATH 43 | 44 | - name: Install dependencies 45 | run: | 46 | sudo apt-get update 47 | sudo apt-get install tesseract-ocr 48 | uv venv .venv --python 3.12 49 | uv sync --all-extras 50 | 51 | - name: Create and enable 4 GB swap 52 | run: | 53 | sudo fallocate -l 4G /swapfile 54 | sudo chmod 600 /swapfile 55 | sudo mkswap /swapfile 56 | sudo swapon /swapfile 57 | free -h 58 | 59 | - name: Build docs 60 | run: | 61 | source .venv/bin/activate 62 | mkdocs build 63 | 64 | 65 | - name: Run tests 66 | env: 67 | ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} 68 | OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} 69 | run: | 70 | source .venv/bin/activate 71 | pytest -x --cov --cov-report=xml -m "not slow" 72 | 73 | - name: Debug 74 | run: | 75 | ls -lh 76 | 77 | - name: Upload coverage reports to Codecov 78 | uses: codecov/codecov-action@v5 79 | with: 80 | token: ${{ secrets.CODECOV_TOKEN }} 81 | files: coverage.xml 82 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.pyc 3 | *.pyo 4 | build/ 5 | dist/ 6 | *.egg-info/ 7 | .idea 8 | .venv/ 9 | .env 10 | .windsurfrules 11 | site/ 12 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_install_hook_types: [pre-push] 2 | exclude: ^reports/ # skip auto-generated code coverage files 3 | 4 | repos: 5 | - repo: https://github.com/pre-commit/pre-commit-hooks 6 | rev: v4.4.0 7 | hooks: 8 | - id: check-ast # Is it valid Python? 
9 | - id: debug-statements # no debugging statements used 10 | - id: trailing-whitespace 11 | - id: end-of-file-fixer 12 | - id: check-added-large-files 13 | - id: check-case-conflict 14 | - id: check-json 15 | - id: pretty-format-json 16 | args: [ "--autofix", "--no-sort-keys", "--no-ensure-ascii" ] 17 | exclude: ^(model/data/testing)/ 18 | - id: check-merge-conflict 19 | - id: detect-private-key 20 | - id: check-executables-have-shebangs 21 | 22 | - repo: https://github.com/asottile/pyupgrade 23 | rev: v2.29.1 24 | hooks: 25 | - id: pyupgrade 26 | exclude: ^(TAG) 27 | # Upgrade syntax to Python ≥ 3.11, dropping compatibility code for older versions 28 | args: [--py311-plus] 29 | 30 | - repo: https://github.com/astral-sh/ruff-pre-commit 31 | # Ruff version 32 | rev: v0.11.13 33 | hooks: 34 | # Run the linter 35 | - id: ruff-check 36 | # Fix lint issues, including docstrings, import sorting, and commented-out code 37 | args: ["--extend-select", "D,I,ERA001", "--line-length", "120", "--fix"] 38 | # Run the formatter 39 | - id: ruff-format 40 | 41 | # Type checking 42 | - repo: local 43 | hooks: 44 | - id: ty-check 45 | name: Type checking 46 | entry: bash -c 'uvx ty check --config-file ty.toml' 47 | language: system 48 | -------------------------------------------------------------------------------- /CNAME: -------------------------------------------------------------------------------- 1 | sieves.ai 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Mantis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | codecov: 2 | require_ci_to_pass: true 3 | 4 | coverage: 5 | # How coverage metrics are presented 6 | precision: 2 7 | round: down 8 | range: "70...100" 9 | 10 | # Define coverage thresholds. 11 | status: 12 | project: 13 | default: 14 | # Fails CI if coverage < 90% 15 | target: 90 16 | patch: 17 | default: 18 | target: 10 19 | 20 | # Optionally ignore specific files/dirs. 
21 | ignore: 22 | - "sieves/tests" 23 | - "docs" 24 | - "**/test_*.py" 25 | - "sieves/engines/engine_import.py" 26 | - "sieves/tasks/postprocessing/distillation/distillation_import.py" 27 | 28 | comment: 29 | layout: "diff, flags, files" 30 | behavior: default 31 | require_changes: false # learn more in the Requiring Changes section below 32 | require_base: false # [true :: must have a base report to post] 33 | require_head: true # [true :: must have a head report to post] 34 | hide_project_coverage: true # [true :: only show coverage on the git diff] 35 | 36 | parsers: 37 | # Example parser config 38 | gcov: 39 | branch_detection: 40 | conditional: yes 41 | loop: yes 42 | method: no 43 | macro: no 44 | -------------------------------------------------------------------------------- /docs/about.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | For any feedback, feature requests, contributions etc. use our [GitHub issue tracker](https://github.com/MantisAI/sieves/issues). 4 | 5 | `sieves` is maintained by [Mantis](https://mantisnlp.com), an AI consultancy. We help our clients to solve business problems related to 6 | natural human language and speech. If that's something you're interested in - [drop us a line](https://mantisnlp.com/contact/#cta)! -------------------------------------------------------------------------------- /docs/assets/Presentation_PyData_Amsterdam_2025.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MantisAI/sieves/3ce96055c1343849909650265de2b5e7d98745b8/docs/assets/Presentation_PyData_Amsterdam_2025.pdf -------------------------------------------------------------------------------- /docs/assets/generate_text_png.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | from PIL import Image, ImageDraw, ImageFont 3 | 4 | 5 | def create_sieve_pattern(width, height, dot_radius=5, spacing=20): 6 | """ 7 | Create an RGBA image (black background, alpha=255) 8 | with transparent polka-dot holes (alpha=0). 9 | """ 10 | # Start with a fully opaque black image 11 | pattern = Image.new("RGBA", (width, height), (0, 0, 0, 255)) 12 | draw = ImageDraw.Draw(pattern) 13 | 14 | # "Punch out" holes by drawing circles with (0,0,0,0) = transparent 15 | for y in range(0, height, spacing): 16 | for x in range(0, width, spacing): 17 | left = x - dot_radius 18 | top = y - dot_radius 19 | right = x + dot_radius 20 | bottom = y + dot_radius 21 | draw.ellipse([left, top, right, bottom], fill=(0, 0, 0, 0)) 22 | 23 | return pattern 24 | 25 | 26 | def create_text_mask(text, font_path, font_size, image_size): 27 | """ 28 | Create a grayscale (L-mode) mask with white text on black background. 29 | White = 255 => opaque region, black = 0 => transparent region. 
30 | """ 31 | mask_img = Image.new("L", image_size, color=0) # black by default 32 | draw = ImageDraw.Draw(mask_img) 33 | 34 | font = ImageFont.truetype(font_path, font_size) 35 | 36 | # Use textbbox in newer Pillow (10.0+), since textsize is deprecated 37 | bbox = draw.textbbox((0, 0), text, font=font) 38 | text_width = bbox[2] - bbox[0] 39 | text_height = bbox[3] - bbox[1] 40 | 41 | x_pos = (image_size[0] - text_width) // 2 42 | y_pos = (image_size[1] - text_height) // 2 43 | 44 | # White text on black background 45 | draw.text((x_pos, y_pos), text, fill=255, font=font) 46 | 47 | return mask_img 48 | 49 | 50 | def create_sieve_text_image(text, font_path, output_path="sieve_text.png"): 51 | width, height = 800, 400 52 | 53 | # 1) Create the “sieve” pattern (black with transparent holes) 54 | pattern_img = create_sieve_pattern(width, height, dot_radius=3, spacing=18) 55 | 56 | # 2) Create a text mask (white text on black background, "L" mode) 57 | text_mask = create_text_mask(text=text, font_path=font_path, font_size=100, image_size=(width, height)) 58 | 59 | # 3) Create a transparent canvas 60 | canvas = Image.new("RGBA", (width, height), (0, 0, 0, 0)) 61 | 62 | # 4) Paste the pattern onto the canvas wherever text_mask is non-zero 63 | # (i.e., where the text is white) 64 | canvas.paste(pattern_img, (0, 0), text_mask) 65 | 66 | # 5) Save 67 | canvas.save(output_path, "PNG") 68 | print(f"Saved sieve-style text with transparent holes to: {output_path}") 69 | 70 | 71 | # --------------------------------- 72 | # Example usage: 73 | # --------------------------------- 74 | if __name__ == "__main__": 75 | create_sieve_text_image( 76 | text="sieves", font_path="/home/raphael/.local/share/fonts/Hack-Bold.ttf", output_path="sieves_sieve_style.png" 77 | ) 78 | -------------------------------------------------------------------------------- /docs/assets/sieve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MantisAI/sieves/3ce96055c1343849909650265de2b5e7d98745b8/docs/assets/sieve.png -------------------------------------------------------------------------------- /docs/assets/sieves_sieve_style.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MantisAI/sieves/3ce96055c1343849909650265de2b5e7d98745b8/docs/assets/sieves_sieve_style.png -------------------------------------------------------------------------------- /docs/bridge.md: -------------------------------------------------------------------------------- 1 | # Bridge 2 | 3 | ::: sieves.tasks.predictive.bridges.Bridge 4 | ::: sieves.tasks.predictive.bridges.GliXBridge -------------------------------------------------------------------------------- /docs/doc.md: -------------------------------------------------------------------------------- 1 | # Doc 2 | 3 | ::: sieves.data.doc -------------------------------------------------------------------------------- /docs/engines/base_engine.md: -------------------------------------------------------------------------------- 1 | # Internal Engine 2 | 3 | ::: sieves.engines.core -------------------------------------------------------------------------------- /docs/engines/dspy.md: -------------------------------------------------------------------------------- 1 | # DSPy 2 | 3 | ::: sieves.engines.dspy_.DSPy -------------------------------------------------------------------------------- /docs/engines/gliner.md: 
-------------------------------------------------------------------------------- 1 | # GliNER 2 | 3 | ::: sieves.engines.glix_.GliX -------------------------------------------------------------------------------- /docs/engines/huggingface.md: -------------------------------------------------------------------------------- 1 | # Hugging Face 2 | 3 | ::: sieves.engines.huggingface_.HuggingFace -------------------------------------------------------------------------------- /docs/engines/langchain.md: -------------------------------------------------------------------------------- 1 | # LangChain 2 | 3 | ::: sieves.engines.langchain_.LangChain -------------------------------------------------------------------------------- /docs/engines/outlines.md: -------------------------------------------------------------------------------- 1 | # Outlines 2 | 3 | ::: sieves.engines.outlines_.Outlines -------------------------------------------------------------------------------- /docs/guides/getting_started.md: -------------------------------------------------------------------------------- 1 | # Getting Started 2 | 3 | This guide will help you get started with using `sieves` for zero-shot and few-shot NLP tasks with structured generation. 4 | 5 | ## Basic Concepts 6 | 7 | `sieves` is built around four main concepts: 8 | 9 | 1. **Documents (`Doc`)**: The basic unit of text that you want to process. A document can be created from text or a URI. 10 | 2. **Models + GenerationSettings**: You pass a model from your chosen backend (Outlines, DSPy, LangChain, etc.) and optional `GenerationSettings` (e.g., strict mode) 11 | 3. **Tasks**: NLP operations you want to perform on your documents (classification, information extraction, etc.) 12 | 4. **Pipeline**: A sequence of tasks that process your documents 13 | 14 | ## Quick Start Example 15 | 16 | Here's a simple example that performs text classification: 17 | 18 | ```python 19 | import outlines 20 | from sieves import Pipeline, tasks, Doc 21 | 22 | # Create a document 23 | doc = Doc(text="Special relativity applies to all physical phenomena in the absence of gravity.") 24 | 25 | # Choose a model (using a small but capable model) 26 | model = outlines.models.transformers("HuggingFaceTB/SmolLM-135M-Instruct") 27 | 28 | # Create and run the pipeline (verbose init) 29 | pipeline = Pipeline([ 30 | tasks.predictive.Classification( 31 | labels=["science", "politics"], 32 | model=model, 33 | ) 34 | ]) 35 | 36 | # Print the classification result 37 | for doc in pipeline([doc]): 38 | print(doc.results) 39 | 40 | # Alternatively: succinct chaining with + 41 | # (useful when you have multiple tasks) 42 | # classifier = tasks.predictive.Classification(labels=["science", "politics"], model=model) 43 | # pipeline = classifier # single-task pipeline 44 | # Note: set additional Pipeline params (e.g., use_cache=False) only via verbose init. 45 | ``` 46 | 47 | ## Working with Documents 48 | 49 | Documents can be created in several ways: 50 | 51 | ```python 52 | from sieves import Doc 53 | 54 | # From text 55 | doc = Doc(text="Your text here") 56 | 57 | # From a file (requires docling) 58 | doc = Doc(uri="path/to/your/file.pdf") 59 | 60 | # With metadata 61 | doc = Doc( 62 | text="Your text here", 63 | meta={"source": "example", "date": "2025-01-31"} 64 | ) 65 | ``` 66 | 67 | Note: File-based ingestion (Docling/Unstructured/Marker) is optional and not installed by default. 
To enable it, install the ingestion extra or the specific libraries you need: 68 | 69 | ```bash 70 | pip install "sieves[ingestion]" 71 | ``` 72 | 73 | ## Advanced Example: PDF Processing Pipeline 74 | 75 | Here's a more involved example that: 76 | 77 | 1. Starts from a document (prepend an ingestion task to parse an actual PDF) 78 | 2. Chunks it into smaller pieces 79 | 3. Performs information extraction on each chunk 80 | 81 | ```python 82 | import outlines 83 | import chonkie 84 | import tokenizers 85 | import pydantic 86 | from sieves import Pipeline, tasks, Doc 87 | 88 | # Create a tokenizer for chunking 89 | tokenizer = tokenizers.Tokenizer.from_pretrained("bert-base-uncased") 90 | 91 | # Initialize components 92 | chunker = tasks.preprocessing.Chonkie( 93 | chunker=chonkie.TokenChunker(tokenizer, chunk_size=512, chunk_overlap=50) 94 | ) 95 | 96 | # Choose a model for information extraction 97 | model = outlines.models.transformers("HuggingFaceTB/SmolLM-135M-Instruct") 98 | 99 | 100 | # Define the structure of information you want to extract 101 | class PersonInfo(pydantic.BaseModel): 102 | name: str 103 | age: int | None = None 104 | occupation: str | None = None 105 | 106 | 107 | # Create an information extraction task 108 | extractor = tasks.predictive.InformationExtraction( 109 | entity_type=PersonInfo, 110 | model=model, 111 | ) 112 | 113 | # Create the pipeline (verbose init) 114 | pipeline = Pipeline([chunker, extractor]) 115 | 116 | # Alternatively: succinct chaining (+) 117 | # pipeline = chunker + extractor 118 | # Note: to change Pipeline parameters (e.g., use_cache), use the verbose form 119 | # Pipeline([chunker, extractor], use_cache=False) 120 | 121 | # Process a document (plain text here for brevity - pass uri="..." plus an ingestion task for an actual PDF) 122 | doc = Doc(text="Marie Curie died at the age of 66 years.") 123 | results = list(pipeline([doc])) 124 | 125 | # Access the extracted information 126 | for result in results: 127 | print(result.results["InformationExtraction"]) 128 | ``` 129 | 130 | ## Supported Engines 131 | 132 | `sieves` supports multiple libraries for structured generation: 133 | 134 | - [`outlines`](https://github.com/outlines-dev/outlines) 135 | - [`dspy`](https://github.com/stanfordnlp/dspy) - also supports Ollama and vLLM integration via `api_base` 136 | - [`langchain`](https://github.com/langchain-ai/langchain) 137 | - [`gliner`](https://github.com/urchade/GLiNER) 138 | - [`transformers`](https://github.com/huggingface/transformers) 139 | 140 | You pass models from these libraries directly to `PredictiveTask`. Optionally, you can include `GenerationSettings` to 141 | override defaults. Batching is controlled per task via the `batch_size` argument (see below). 142 | 143 | ### GenerationSettings (optional) 144 | `GenerationSettings` controls engine behavior and is optional. 
Defaults: 145 | - strict_mode: False (on parse issues, return None instead of raising) 146 | - init_kwargs/inference_kwargs: None (use engine defaults) 147 | - config_kwargs: None (used by some backends like DSPy) 148 | 149 | Batching is configured on each task via `batch_size`: 150 | - `batch_size = -1` processes all inputs at once (default) 151 | - `batch_size = N` processes N docs per batch 152 | 153 | Example: 154 | 155 | ```python 156 | from sieves import GenerationSettings 157 | classifier = tasks.predictive.Classification( 158 | labels=["science", "politics"], 159 | model=model, 160 | generation_settings=GenerationSettings(strict_mode=True), 161 | batch_size=8, 162 | ) 163 | ``` 164 | -------------------------------------------------------------------------------- /docs/guides/serialization.md: -------------------------------------------------------------------------------- 1 | # Saving and Loading 2 | 3 | `sieves` provides functionality to save your pipeline configurations to disk and load them later. This is useful for: 4 | 5 | - Sharing pipeline configurations with others 6 | - Versioning your pipelines 7 | - Deploying pipelines to production 8 | 9 | ## Basic Pipeline Serialization 10 | 11 | Here's a simple example of saving and loading a classification pipeline: 12 | 13 | ```python 14 | import outlines 15 | from sieves import Pipeline, tasks, Doc 16 | from pathlib import Path 17 | 18 | # Create a basic classification pipeline 19 | model_name = "HuggingFaceTB/SmolLM-135M-Instruct" 20 | model = outlines.models.transformers(model_name) 21 | classifier = tasks.predictive.Classification(labels=["science", "politics"], model=model) 22 | pipeline = Pipeline([classifier]) 23 | 24 | # Save the pipeline configuration 25 | config_path = Path("classification_pipeline.yml") 26 | pipeline.dump(config_path) 27 | 28 | # Load the pipeline configuration 29 | loaded_pipeline = Pipeline.load(config_path, [{"model": outlines.models.transformers(model_name)}]) 30 | 31 | # Use the loaded pipeline 32 | doc = Doc(text="Special relativity applies to all physical phenomena in the absence of gravity.") 33 | results = list(loaded_pipeline([doc])) 34 | print(results[0].results["Classification"]) 35 | ``` 36 | 37 | ## Dealing with complex third-party objects 38 | 39 | `sieves` doesn't serialize complex third-party objects. 
When loading pipelines, you need to provide initialization parameters for each task: 40 | 41 | ```python 42 | import chonkie 43 | import tokenizers 44 | import outlines 45 | import pydantic 46 | from sieves import Pipeline, tasks 47 | 48 | # Create a tokenizer for chunking 49 | tokenizer = tokenizers.Tokenizer.from_pretrained("bert-base-uncased") 50 | chunker = tasks.preprocessing.Chonkie( 51 | chunker=chonkie.TokenChunker(tokenizer, chunk_size=512, chunk_overlap=50) 52 | ) 53 | 54 | model = outlines.models.transformers("HuggingFaceTB/SmolLM-135M-Instruct") 55 | 56 | 57 | class PersonInfo(pydantic.BaseModel): 58 | name: str 59 | age: int | None = None 60 | occupation: str | None = None 61 | 62 | 63 | extractor = tasks.predictive.InformationExtraction(entity_type=PersonInfo, model=model) 64 | 65 | # Create and save the pipeline 66 | pipeline = Pipeline([chunker, extractor]) 67 | pipeline.dump("extraction_pipeline.yml") 68 | 69 | # Load the pipeline with initialization parameters for each task 70 | loaded_pipeline = Pipeline.load( 71 | "extraction_pipeline.yml", 72 | [ 73 | {"tokenizer": tokenizers.Tokenizer.from_pretrained("bert-base-uncased")}, 74 | {"model": outlines.models.transformers("HuggingFaceTB/SmolLM-135M-Instruct")}, 75 | ] 76 | ) 77 | ``` 78 | 79 | ## Understanding Pipeline Configuration Files 80 | 81 | Pipeline configurations are saved as YAML files. Here's an example of what a configuration file looks like: 82 | 83 | ```yaml 84 | cls_name: sieves.pipeline.core.Pipeline 85 | version: 0.11.1 86 | tasks: 87 | is_placeholder: false 88 | value: 89 | - cls_name: sieves.tasks.preprocessing.chunkers.Chunker 90 | tokenizer: 91 | is_placeholder: true 92 | value: tokenizers.Tokenizer 93 | chunk_size: 94 | is_placeholder: false 95 | value: 512 96 | chunk_overlap: 97 | is_placeholder: false 98 | value: 50 99 | task_id: 100 | is_placeholder: false 101 | value: Chunker 102 | - cls_name: sieves.tasks.predictive.information_extraction.core.InformationExtraction 103 | engine: 104 | is_placeholder: false 105 | value: 106 | cls_name: sieves.engines.outlines_.Outlines 107 | model: 108 | is_placeholder: true 109 | value: outlines.models.transformers 110 | ``` 111 | 112 | The configuration file contains: 113 | 114 | - The full class path of the pipeline and its tasks 115 | - Version information 116 | - Task-specific parameters and their values 117 | - Placeholders for components that need to be provided during loading 118 | 119 | !!! info "Parameter management" 120 | 121 | When loading pipelines, provide all required initialization parameters (e.g. models) and ensure you're loading a pipeline with a compatible `sieves` version. `GenerationSettings` is optional unless you want to override defaults. 122 | 123 | !!! warning "Limitations" 124 | 125 | - Model weights are not saved in the configuration files 126 | - Complex third-party objects (everything beyond primitives or collections thereof) may not be serializable 127 | - API keys and credentials must be managed separately 128 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # `sieves` 2 | 3 | `sieves` is a Python library designed for zero-shot and few-shot NLP tasks that focuses on structured generation, 4 | allowing developers to build production-ready NLP prototypes without requiring training data. 
It provides a unified 5 | interface that wraps popular NLP tools (like `outlines`, `dspy`, `langchain`, and others) while ensuring structured 6 | outputs and observability. 7 | 8 | It bundles common NLP utilities, document parsing, and text chunking capabilities together with ready-to-use tasks like 9 | classification and information extraction, all organized in an observable pipeline architecture. It's particularly 10 | valuable for rapid prototyping scenarios where structured output is needed but training data is scarce. 11 | 12 | 13 | ## Quick Installation 14 | 15 | You can install `sieves` with different options depending on your needs: 16 | 17 | Core package with minimal dependencies: 18 | ```bash 19 | pip install sieves 20 | ``` 21 | Note: Ingestion libraries (document parsing such as `docling`, `unstructured`, `marker`) are not installed by default. Install them manually or use the ingestion extra: 22 | 23 | ```bash 24 | pip install "sieves[ingestion]" 25 | ``` 26 | 27 | The minimal setup lets you add only what you need to keep the footprint small. 28 | 29 | All optional dependencies for every feature, including engines, distillation, and ingestion: 30 | ```bash 31 | pip install "sieves[engines,distill,ingestion]" 32 | ``` 33 | 34 | ### Specific Features 35 | 36 | Document ingestion/parsing libraries (PDF/DOCX parsing, etc.): 37 | ```bash 38 | pip install "sieves[ingestion]" 39 | ``` 40 | 41 | All supported engines: 42 | ```bash 43 | pip install "sieves[engines]" 44 | ``` 45 | 46 | ### Development Setup 47 | 48 | 1. Set up [`uv`](https://github.com/astral-sh/uv). 49 | 2. Install all dependencies for development, testing, and documentation generation with: `uv pip install --system .[engines,distill,ingestion,test]`. 50 | 51 | ## Core Concepts 52 | 53 | `sieves` is built around five key components: 54 | 55 | 1. **`Pipeline`**: The main orchestrator that runs your NLP tasks sequentially (define with `Pipeline([...])` or chain with `+`) 56 | 2. **`Task`**: Pre-built or custom NLP operations (classification, extraction, etc.) 57 | 3. **`Engine`**: Backend implementations that power the tasks (outlines, dspy, langchain, etc.) 58 | 4. **`Bridge`**: Connectors between Tasks and Engines 59 | 5. **`Doc`**: The fundamental data structure for document processing (a minimal end-to-end sketch follows the guide list below) 60 | 61 | ## Essential Links 62 | 63 | - [GitHub Repository](https://github.com/mantisai/sieves) 64 | - [PyPI Package](https://pypi.org/project/sieves/) 65 | - [Issue Tracker](https://github.com/mantisai/sieves/issues) 66 | 67 | ## Guides 68 | 69 | We've prepared several guides to help you get up to speed quickly: 70 | 71 | - [Getting Started](guides/getting_started.md) - Start here! Learn the basic concepts and create your first pipeline. 72 | - [Document Preprocessing](guides/preprocessing.md) - Master document parsing, chunking, and text standardization. 73 | - [Creating Custom Tasks](guides/custom_tasks.md) - Learn to create your own tasks when the built-in ones aren't enough. 74 | - [Saving and Loading Pipelines](guides/serialization.md) - Version and share your pipeline configurations. 75 | - [Task Optimization](guides/optimization.md) - Improve task performance by optimizing prompts and few-shot examples. 76 | - [Task Distillation](guides/distillation.md) - Fine-tune smaller, specialized models using zero-shot task results. 
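Below is the minimal sketch referenced under Core Concepts - it wires a `Doc`, a `Task`, and a `Pipeline` together end to end. The model and labels are illustrative; the snippet condenses the example from the Getting Started guide:

```python
import outlines

from sieves import Doc, Pipeline, tasks

# Any supported backend works; a small Hugging Face model via Outlines is assumed here.
model = outlines.models.transformers("HuggingFaceTB/SmolLM-135M-Instruct")

# A task wraps the model with a concrete NLP operation; the pipeline runs tasks over docs.
pipeline = Pipeline([tasks.predictive.Classification(labels=["science", "politics"], model=model)])

docs = [Doc(text="Special relativity applies to all physical phenomena in the absence of gravity.")]
for doc in pipeline(docs):
    print(doc.results["Classification"])
```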
77 | 78 | ## Getting Help 79 | 80 | - Check our [GitHub Issues](https://github.com/mantisai/sieves/issues) for common problems 81 | - Review the documentation in the `/docs/guides/` directory 82 | - Join our community discussions (link to be added) 83 | 84 | ## Next Steps 85 | 86 | - Dive into our guides, starting with the [Getting Started Guide](guides/getting_started.md) 87 | - Check out example pipelines in our repository 88 | - Learn about custom task creation 89 | - Understand different engine configurations 90 | 91 | Consult the API reference for each component you're working with if you have specific questions. The reference pages contain detailed 92 | information about parameters, configurations, and best practices. 93 | -------------------------------------------------------------------------------- /docs/pipeline.md: -------------------------------------------------------------------------------- 1 | # Pipeline 2 | 3 | Pipelines orchestrate sequential execution of tasks and support two ways to define the sequence: 4 | 5 | - Verbose initialization using `Pipeline([...])` (allows setting parameters like `use_cache`) 6 | - Succinct chaining with `+` for readability 7 | 8 | Examples: 9 | 10 | ```python 11 | from sieves import Pipeline, tasks 12 | 13 | # Verbose initialization (allows non-default configuration). 14 | t_ingest = tasks.preprocessing.Ingestion(export_format="markdown") 15 | t_chunk = tasks.preprocessing.Chunking(chunker) 16 | t_cls = tasks.predictive.Classification(labels=["science", "politics"], model=model) 17 | pipe = Pipeline([t_ingest, t_chunk, t_cls], use_cache=True) 18 | 19 | # Succinct chaining (equivalent task order). 20 | pipe2 = t_ingest + t_chunk + t_cls 21 | 22 | # You can also chain pipelines and tasks. 23 | pipe_left = Pipeline([t_ingest]) 24 | pipe_right = Pipeline([t_chunk, t_cls]) 25 | pipe3 = pipe_left + pipe_right # results in [t_ingest, t_chunk, t_cls] 26 | 27 | # In-place append (mutates the left pipeline). 28 | pipe_left += t_chunk 29 | pipe_left += pipe_right # appends all tasks from right 30 | 31 | # Note: 32 | # - Additional Pipeline parameters (e.g., use_cache=False) are only settable via the verbose form 33 | # - Chaining never mutates existing tasks or pipelines; it creates a new Pipeline 34 | # - Using "+=" mutates the existing pipeline by appending tasks 35 | ``` 36 | 37 | Note: Ingestion libraries (e.g., Docling, Unstructured, Marker) are optional and not installed by default. 
Install them manually or via the extra: 38 | 39 | ```bash 40 | pip install "sieves[ingestion]" 41 | ``` 42 | 43 | ::: sieves.pipeline.core 44 | -------------------------------------------------------------------------------- /docs/tasks/predictive/classification.md: -------------------------------------------------------------------------------- 1 | # Classification 2 | 3 | ::: sieves.tasks.predictive.classification.core 4 | ::: sieves.tasks.predictive.classification.bridges -------------------------------------------------------------------------------- /docs/tasks/predictive/information_extraction.md: -------------------------------------------------------------------------------- 1 | # Information Extraction 2 | 3 | ::: sieves.tasks.predictive.information_extraction.core 4 | ::: sieves.tasks.predictive.information_extraction.bridges -------------------------------------------------------------------------------- /docs/tasks/predictive/ner.md: -------------------------------------------------------------------------------- 1 | # Named Entity Recognition 2 | 3 | ::: sieves.tasks.predictive.ner.core 4 | ::: sieves.tasks.predictive.ner.bridges -------------------------------------------------------------------------------- /docs/tasks/predictive/pii_masking.md: -------------------------------------------------------------------------------- 1 | # PII Masking 2 | 3 | ::: sieves.tasks.predictive.pii_masking.core 4 | ::: sieves.tasks.predictive.pii_masking.bridges -------------------------------------------------------------------------------- /docs/tasks/predictive/question_answering.md: -------------------------------------------------------------------------------- 1 | # Question Answering 2 | 3 | ::: sieves.tasks.predictive.question_answering.core 4 | ::: sieves.tasks.predictive.question_answering.bridges -------------------------------------------------------------------------------- /docs/tasks/predictive/sentiment_analysis.md: -------------------------------------------------------------------------------- 1 | # Sentiment Analysis 2 | 3 | ::: sieves.tasks.predictive.sentiment_analysis.core 4 | ::: sieves.tasks.predictive.sentiment_analysis.bridges -------------------------------------------------------------------------------- /docs/tasks/predictive/summarization.md: -------------------------------------------------------------------------------- 1 | # Summarization 2 | 3 | ::: sieves.tasks.predictive.summarization.core 4 | ::: sieves.tasks.predictive.summarization.bridges -------------------------------------------------------------------------------- /docs/tasks/predictive/translation.md: -------------------------------------------------------------------------------- 1 | # Translation 2 | 3 | ::: sieves.tasks.predictive.translation 4 | ::: sieves.tasks.predictive.translation.bridges -------------------------------------------------------------------------------- /docs/tasks/preprocessing/chunking/chonkie.md: -------------------------------------------------------------------------------- 1 | # Chonkie 2 | 3 | ::: sieves.tasks.preprocessing.chunking.chonkie_ -------------------------------------------------------------------------------- /docs/tasks/preprocessing/chunking/chunking.md: -------------------------------------------------------------------------------- 1 | # Chunker 2 | 3 | ::: sieves.tasks.preprocessing.chunking.core 4 | -------------------------------------------------------------------------------- /docs/tasks/preprocessing/chunking/naive.md: 
-------------------------------------------------------------------------------- 1 | # NaiveChunker 2 | 3 | ::: sieves.tasks.preprocessing.chunking.naive -------------------------------------------------------------------------------- /docs/tasks/preprocessing/ingestion/docling.md: -------------------------------------------------------------------------------- 1 | # Docling 2 | 3 | Note: This task depends on optional ingestion libraries, which are not installed by default. Install them via the ingestion extra, or install the library directly. 4 | 5 | Examples: 6 | 7 | ```bash 8 | pip install "sieves[ingestion]" # installs ingestion deps via extra 9 | # or install the library directly 10 | pip install docling 11 | ``` 12 | 13 | ::: sieves.tasks.preprocessing.ingestion.docling_ 14 | -------------------------------------------------------------------------------- /docs/tasks/preprocessing/ingestion/ingestion.md: -------------------------------------------------------------------------------- 1 | # Ingestion 2 | 3 | ::: sieves.tasks.preprocessing.ingestion.core 4 | -------------------------------------------------------------------------------- /docs/tasks/preprocessing/ingestion/marker.md: -------------------------------------------------------------------------------- 1 | # Marker 2 | 3 | Note: This task depends on optional ingestion libraries that are not installed by default. You can install them via the ingestion extra, or install the library directly. 4 | 5 | Examples: 6 | 7 | ```bash 8 | pip install "sieves[ingestion]" # installs ingestion deps via extra 9 | # or install the library directly (e.g., the Marker PDF package) 10 | pip install "marker-pdf[full]" # the package pinned by the ingestion extra 11 | ``` 12 | 13 | ::: sieves.tasks.preprocessing.ingestion.marker_ 14 | -------------------------------------------------------------------------------- /docs/tasks/preprocessing/ingestion/unstructured.md: -------------------------------------------------------------------------------- 1 | # unstructured 2 | 3 | Note: This task depends on optional ingestion libraries that are not installed by default. Install them with the ingestion extra, or install the specific library directly if you prefer. 4 | 5 | Examples: 6 | 7 | ```bash 8 | pip install "sieves[ingestion]" # installs ingestion deps via extra 9 | # or install the library directly 10 | pip install unstructured # choose extras as needed 11 | ``` 12 | 13 | ::: sieves.tasks.preprocessing.ingestion.unstructured_ 14 | -------------------------------------------------------------------------------- /docs/tasks/task.md: -------------------------------------------------------------------------------- 1 | # Task 2 | 3 | ::: sieves.tasks.core.Task -------------------------------------------------------------------------------- /examples/pydata_amsterdam_demo.py: -------------------------------------------------------------------------------- 1 | """Demo for PyData Amsterdam 2025. 
2 | 3 | Required additional dependencies: 4 | - openai 5 | - outlines 6 | """ 7 | 8 | import os 9 | from collections import defaultdict 10 | from pprint import pprint 11 | from typing import Literal 12 | 13 | import openai 14 | import outlines 15 | import pydantic 16 | 17 | from sieves import Doc, tasks 18 | 19 | 20 | class Country(pydantic.BaseModel, frozen=True): 21 | """Describes a country and its stance on the chat control proposal.""" 22 | 23 | name: str 24 | in_eu: bool 25 | stance_on_chat_control_proposal: Literal["pro", "undecided", "contra", "unknown"] 26 | 27 | 28 | if __name__ == '__main__': 29 | docs = [ 30 | Doc( 31 | uri="https://www.techradar.com/computing/cyber-security/chat-control-the-list-of-countries-opposing-the-" 32 | "law-grows-but-support-remains-strong" 33 | ) 34 | ] 35 | 36 | model = outlines.from_openai( 37 | openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"]), 38 | model_name="gpt-5-mini" 39 | ) 40 | 41 | pipe = tasks.Ingestion() + tasks.InformationExtraction(entity_type=Country, model=model) 42 | 43 | for doc in pipe(docs): 44 | countries = defaultdict(list) 45 | for country in doc.results["InformationExtraction"]: 46 | assert isinstance(country, Country) 47 | if country.in_eu: 48 | countries[country.stance_on_chat_control_proposal].append(country.name) 49 | 50 | pprint(countries) 51 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: sieves 2 | site_url: https://sieves.ai/docs 3 | docs_dir: docs 4 | repo_url: https://github.com/mantisai/sieves 5 | repo_name: "sieves on GitHub" 6 | 7 | theme: 8 | name: material 9 | features: 10 | - navigation.top 11 | - navigation.tabs 12 | - navigation.footer 13 | - header.autohide 14 | palette: 15 | - scheme: slate 16 | toggle: 17 | icon: material/weather-sunny 18 | name: Switch to light mode 19 | - scheme: default 20 | toggle: 21 | icon: material/weather-night 22 | name: Switch to dark mode 23 | logo: 'assets/sieve.png' 24 | 25 | plugins: 26 | - search 27 | - mkdocstrings: 28 | default_handler: python 29 | handlers: 30 | python: 31 | options: 32 | docstring_style: sphinx 33 | show_source: true 34 | inherited_members: true 35 | extra: 36 | signatures: true 37 | 38 | markdown_extensions: 39 | - admonition 40 | - codehilite 41 | - pymdownx.superfences 42 | 43 | nav: 44 | - Home: 45 | - index.md 46 | - about.md 47 | - API: 48 | - pipeline.md 49 | - doc.md 50 | - bridge.md 51 | - Tasks: 52 | - tasks/task.md 53 | - Preprocessing: 54 | - Ingestion: 55 | - tasks/preprocessing/ingestion/ingestion.md 56 | - tasks/preprocessing/ingestion/docling.md 57 | - tasks/preprocessing/ingestion/unstructured.md 58 | - tasks/preprocessing/ingestion/marker.md 59 | - Chunking: 60 | - tasks/preprocessing/chunking/chunking.md 61 | - tasks/preprocessing/chunking/chonkie.md 62 | - tasks/preprocessing/chunking/naive.md 63 | 64 | - Predictive: 65 | - tasks/predictive/classification.md 66 | - tasks/predictive/information_extraction.md 67 | - tasks/predictive/ner.md 68 | - tasks/predictive/pii_masking.md 69 | - tasks/predictive/question_answering.md 70 | - tasks/predictive/sentiment_analysis.md 71 | - tasks/predictive/summarization.md 72 | - tasks/predictive/translation.md 73 | 74 | - Engines: 75 | - engines/base_engine.md 76 | - All Engines: 77 | - engines/dspy.md 78 | - engines/gliner.md 79 | - engines/huggingface.md 80 | - engines/langchain.md 81 | - engines/outlines.md 82 | - Guides: 83 | - 
guides/getting_started.md 84 | - guides/preprocessing.md 85 | - guides/serialization.md 86 | - guides/custom_tasks.md 87 | - guides/optimization.md 88 | - guides/distillation.md 89 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "setuptools-scm"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "sieves" 7 | readme = "README.md" 8 | description = "Plug-and-play, zero-shot document processing pipelines." 9 | license = { text = "MIT" } 10 | dynamic = ["version", "authors"] 11 | requires-python = ">=3.12" 12 | dependencies = [ 13 | "chonkie>=1,<2", 14 | "datasets>=3,<4", 15 | "jinja2>=3,<4", 16 | "loguru>=0.7,<1", 17 | "outlines>=1,<2", 18 | "dspy-ai>=2,<3", 19 | "dspy>=2,<3", 20 | "pydantic>=2,<3", 21 | ] 22 | classifiers = [ 23 | "Development Status :: 3 - Alpha", 24 | "Intended Audience :: Developers", 25 | "Topic :: Software Development :: Libraries", 26 | "License :: OSI Approved :: MIT License", 27 | "Programming Language :: Python :: 3", 28 | "Programming Language :: Python :: 3.11", 29 | "Programming Language :: Python :: 3.12" 30 | ] 31 | 32 | [project.optional-dependencies] 33 | ingestion = [ 34 | "docling>=2,<3", 35 | "marker-pdf[full]>=1.6.1,<2", 36 | "nltk>=3.9.1", 37 | "unstructured-inference>=0.8,<1", 38 | "unstructured[all-docs]>=0.16,<1", 39 | ] 40 | engines = [ 41 | "accelerate>1.2,<2", 42 | "gliner<1", 43 | "langchain-core>=0.3,<0.4", 44 | "langchain>=0.3,<0.4", 45 | "nest-asyncio>=1,<2", 46 | "sentencepiece<1", 47 | "transformers>=4,<5", 48 | ] 49 | distill = [ 50 | "setfit>=1.1,<2", 51 | "model2vec[train]>0.4,<0.5", 52 | ] 53 | test = [ 54 | "pre-commit>=4,<5", 55 | "pytest>=7,<8", 56 | "mypy>=1", 57 | "mypy-extensions>=1", 58 | "pytest-cov>=6", 59 | "anthropic>=0.45,<1", 60 | "langchain-community>=0.3.31,<0.4", 61 | "langchain-openai>=0.3.35", 62 | # "tesseract>=0.1,<1", 63 | # For generating documentation. 64 | "mkdocstrings[python]>=0.27,<1", 65 | "mkdocs-material>=9.6,<10", 66 | "pre-commit>=4,<5" 67 | ] 68 | 69 | [tool.ruff] 70 | line-length = 120 71 | target-version = "py312" 72 | 73 | # Exclude a variety of commonly ignored directories. 74 | exclude = [ 75 | ".bzr", 76 | ".direnv", 77 | ".eggs", 78 | ".git", 79 | ".git-rewrite", 80 | ".hg", 81 | ".mypy_cache", 82 | ".nox", 83 | ".pants.d", 84 | ".pytype", 85 | ".ruff_cache", 86 | ".svn", 87 | ".tox", 88 | ".venv", 89 | "__pypackages__", 90 | "_build", 91 | "buck-out", 92 | "build", 93 | "dist", 94 | "node_modules", 95 | "venv", 96 | ".venv", 97 | "sieves/tests/**", 98 | # Ignore examples. 99 | "examples/**" 100 | ] 101 | 102 | [tool.ruff.lint] 103 | # Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) rules 104 | select = ["E", "F", "I", "UP"] 105 | extend-select = ["D"] 106 | ignore = ["D203", "D212"] 107 | 108 | # Allow autofix for all enabled rules (when `--fix` is provided). 
109 | fixable = ["ALL"] 110 | unfixable = ["F401"] 111 | 112 | [tool.ruff.lint.pydocstyle] 113 | convention = "pep257" 114 | 115 | [tool.ruff.lint.mccabe] 116 | max-complexity = 10 117 | 118 | [tool.ruff.lint.isort] 119 | known-first-party = ["sieves"] 120 | 121 | [tool.mypy] 122 | python_version = "3.12" 123 | strict = true 124 | warn_return_any = true 125 | warn_unused_configs = true 126 | disallow_untyped_defs = true 127 | disallow_incomplete_defs = true 128 | check_untyped_defs = true 129 | disallow_untyped_decorators = true 130 | no_implicit_optional = true 131 | warn_redundant_casts = true 132 | warn_unused_ignores = true 133 | warn_no_return = true 134 | warn_unreachable = true 135 | allow_untyped_globals = false 136 | allow_redefinition = false 137 | implicit_reexport = false 138 | strict_equality = true 139 | ignore_missing_imports = true 140 | 141 | # Per-module ignores for third-party libraries 142 | [[tool.mypy.overrides]] 143 | module = ["outlines.*", "docling.*", "chonkie.*", "tqdm.*", "dspy.*"] 144 | ignore_missing_imports = true 145 | follow_imports = "skip" 146 | 147 | [tool.pytest.ini_options] 148 | markers = [ 149 | 'slow: marks tests as slow (deselect with -m "not slow")' 150 | ] 151 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = sieves 3 | version = 0.17.0 4 | author = Matthew Upson, Nick Sorros, Raphael Mitsch, Matthew Maufe, Angelo Di Gianvito 5 | author_email = hi@mantisnlp.com 6 | long_description = file: README.md 7 | long_description_content_type = text/markdown 8 | url = https://github.com/MantisAI/sieves 9 | 10 | [options] 11 | packages = find: 12 | python_requires = >=3.12 13 | install_requires = 14 | 15 | [options.packages.find] 16 | where = sieves 17 | 18 | [coverage:run] 19 | source = sieves/ 20 | omit = *__init__* 21 | 22 | [coverage:report] 23 | show_missing = True 24 | precision = 2 25 | sort = Miss 26 | 27 | [mypy-examples.*] 28 | follow_imports = skip 29 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Setup script for the Sieves package.""" 3 | 4 | if __name__ == "__main__": 5 | from setuptools import find_packages, setup 6 | 7 | setup(name="sieves", packages=find_packages()) 8 | -------------------------------------------------------------------------------- /sieves/__init__.py: -------------------------------------------------------------------------------- 1 | """Sieves.""" 2 | 3 | import sieves.tasks as tasks 4 | from sieves.data import Doc 5 | 6 | from .engines import GenerationSettings 7 | from .pipeline import Pipeline 8 | 9 | __all__ = ["Doc", "GenerationSettings", "tasks", "Pipeline"] 10 | -------------------------------------------------------------------------------- /sieves/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .doc import Doc 2 | 3 | __all__ = ["Doc"] 4 | -------------------------------------------------------------------------------- /sieves/data/doc.py: -------------------------------------------------------------------------------- 1 | """Doc implementation, types and utilities.""" 2 | 3 | from __future__ import annotations 4 | 5 | import dataclasses 6 | from pathlib import Path 7 | from typing import Any, Literal 8 | 9 | from datasets import Dataset 10 | 
from PIL import Image, ImageChops 11 | 12 | Field = Literal["meta", "results", "uri", "text", "chunks", "id", "images"] 13 | 14 | 15 | @dataclasses.dataclass 16 | class Doc: 17 | """A document holding data to be processed.""" 18 | 19 | meta: dict[str, Any] = dataclasses.field(default_factory=dict) 20 | results: dict[str, Any] = dataclasses.field(default_factory=dict) 21 | uri: Path | str | None = None 22 | text: str | None = None 23 | chunks: list[str] | None = None 24 | id: str | None = None 25 | images: list[Image.Image] | None = None 26 | 27 | def __post_init__(self) -> None: 28 | """Initialize chunks.""" 29 | if self.chunks is None and self.text is not None: 30 | self.chunks = [self.text] 31 | 32 | @staticmethod 33 | def _are_images_equal(im1: Image.Image | None, im2: Image.Image | None) -> bool: 34 | """Check if two images are equal using PIL Image Channel operations. 35 | 36 | :param im1: First PIL image to compare. 37 | :param im2: Second PIL image to compare. 38 | :return bool: True if images are equal, False otherwise. 39 | """ 40 | if im1 is None and im2 is None: 41 | return True 42 | if im1 is None or im2 is None: 43 | return False 44 | if im1.size != im2.size or im1.mode != im2.mode: 45 | return False 46 | return ImageChops.difference(im1, im2).getbbox() is None 47 | 48 | def __eq__(self, other: object) -> bool: 49 | """Compare two `Doc` instances. 50 | 51 | :return: True if `self` is equal to `other`. 52 | :raises NotImplementedError: if `other` isn't of type `Doc`. 53 | """ 54 | if not isinstance(other, Doc): 55 | raise NotImplementedError 56 | 57 | # Check if images are equal 58 | images_equal_check = False 59 | if self.images is None and other.images is None: 60 | images_equal_check = True 61 | elif self.images is None or other.images is None: 62 | images_equal_check = False 63 | elif self.images is not None and other.images is not None: 64 | if len(self.images) == len(other.images): 65 | images_equal_check = all( 66 | self._are_images_equal(im1, im2) for im1, im2 in zip(self.images, other.images) 67 | ) 68 | else: 69 | images_equal_check = False 70 | return ( 71 | self.id == other.id 72 | and self.uri == other.uri 73 | and self.text == other.text 74 | and self.chunks == other.chunks 75 | and self.results == other.results 76 | and images_equal_check 77 | ) 78 | 79 | @classmethod 80 | def from_hf_dataset(cls, dataset: Dataset, column_map: dict[Field, Any] | None = None) -> list[Doc]: 81 | """Generate list of docs from Hugging Face `datasets.Dataset`. 82 | 83 | :param dataset: Dataset to generate `Doc` instances from. Unless `column_map` specifies otherwise, dataset 84 | must contain at least one column named "text". 85 | :param column_map: Which `Doc` attribute to map to which attribute in `dataset`. If None, the mapping "text" -> 86 | "text" is assumed. 87 | :return: List of `Doc` instances, each representing one row in the dataset. 88 | :raises KeyError: If expected columns are not present in the dataset columns. 
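Example (an illustrative sketch - the "content" column name is arbitrary): >>> from datasets import Dataset >>> dataset = Dataset.from_dict({"content": ["Hello world"]}) >>> docs = Doc.from_hf_dataset(dataset, column_map={"text": "content"}) 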
89 | """ 90 | if column_map is None: 91 | column_map = {"text": "text"} 92 | 93 | missing_cols = set(column_map.values()) - set(dataset.column_names) 94 | if len(missing_cols): 95 | raise KeyError(f"Specified columns '{missing_cols}' not found in dataset columns: {dataset.column_names}.") 96 | 97 | docs: list[Doc] = [] 98 | for row in dataset: 99 | docs.append(cls(**{doc_col: row.get(data_col) for doc_col, data_col in column_map.items()})) # type: ignore[misc] 100 | 101 | return docs 102 | -------------------------------------------------------------------------------- /sieves/engines/__init__.py: -------------------------------------------------------------------------------- 1 | """Engines.""" 2 | 3 | from __future__ import annotations 4 | 5 | from .core import Engine, EngineInferenceMode, EngineModel, EnginePromptSignature, EngineResult 6 | from .engine_import import ( 7 | DSPy, 8 | GliX, 9 | HuggingFace, 10 | LangChain, 11 | Outlines, 12 | dspy_, 13 | glix_, 14 | huggingface_, 15 | langchain_, 16 | outlines_, 17 | ) 18 | from .engine_type import EngineType 19 | from .types import GenerationSettings 20 | 21 | __all__ = [ 22 | "dspy_", 23 | "DSPy", 24 | "EngineInferenceMode", 25 | "EngineModel", 26 | "EnginePromptSignature", 27 | "EngineType", 28 | "EngineResult", 29 | "Engine", 30 | "GenerationSettings", 31 | "glix_", 32 | "GliX", 33 | "langchain_", 34 | "LangChain", 35 | "huggingface_", 36 | "HuggingFace", 37 | "outlines_", 38 | "Outlines", 39 | ] 40 | -------------------------------------------------------------------------------- /sieves/engines/core.py: -------------------------------------------------------------------------------- 1 | """Engine core interfaces and base classes used by backends.""" 2 | 3 | from __future__ import annotations 4 | 5 | import abc 6 | import asyncio 7 | import enum 8 | from collections.abc import Awaitable, Callable, Coroutine, Iterable, Sequence 9 | from typing import Any, Generic, Protocol, TypeVar, override 10 | 11 | import jinja2 12 | import pydantic 13 | 14 | from sieves.engines.types import GenerationSettings 15 | 16 | EnginePromptSignature = TypeVar("EnginePromptSignature") 17 | EngineModel = TypeVar("EngineModel") 18 | EngineResult = TypeVar("EngineResult", covariant=True) 19 | EngineInferenceMode = TypeVar("EngineInferenceMode", bound=enum.Enum) 20 | 21 | 22 | class Executable(Protocol[EngineResult]): 23 | """Callable protocol representing a compiled prompt executable.""" 24 | 25 | def __call__(self, values: Sequence[dict[str, Any]]) -> Iterable[EngineResult | None]: 26 | """Execute prompt executable for given values. 27 | 28 | :param values: Values to inject into prompts. 29 | :return: Results for prompts. 30 | """ 31 | ... 32 | 33 | 34 | class Engine(Generic[EnginePromptSignature, EngineResult, EngineModel, EngineInferenceMode]): 35 | """Base class for engines wrapping model invocation and batching.""" 36 | 37 | def __init__(self, model: EngineModel, generation_settings: GenerationSettings): 38 | """Initialize engine with model and generation settings. 39 | 40 | :param model: Instantiated model instance. 41 | :param generation_settings: Generation settings. 
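Example (illustrative - any concrete engine subclass is constructed this way; the model choice is arbitrary): >>> import dspy >>> from sieves.engines import DSPy, GenerationSettings >>> engine = DSPy(model=dspy.LM("openai/gpt-4o-mini"), generation_settings=GenerationSettings()) 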
42 |         """
43 |         self._model = model
44 |         self._generation_settings = generation_settings
45 |         self._inference_kwargs = generation_settings.inference_kwargs or {}
46 |         self._init_kwargs = generation_settings.init_kwargs or {}
47 |         self._strict_mode = generation_settings.strict_mode
48 | 
49 |     @property
50 |     def generation_settings(self) -> GenerationSettings:
51 |         """Return generation settings.
52 | 
53 |         :return: Generation settings.
54 |         """
55 |         return self._generation_settings
56 | 
57 |     @property
58 |     def model(self) -> EngineModel:
59 |         """Return model instance.
60 | 
61 |         :return: Model instance.
62 |         """
63 |         return self._model
64 | 
65 |     @property
66 |     @abc.abstractmethod
67 |     def supports_few_shotting(self) -> bool:
68 |         """Return whether engine supports few-shotting.
69 | 
70 |         :return: Whether engine supports few-shotting.
71 |         """
72 | 
73 |     @property
74 |     @abc.abstractmethod
75 |     def inference_modes(self) -> type[EngineInferenceMode]:
76 |         """Return supported inference modes.
77 | 
78 |         :return: Supported inference modes.
79 |         """
80 | 
81 |     @abc.abstractmethod
82 |     def build_executable(
83 |         self,
84 |         inference_mode: EngineInferenceMode,
85 |         prompt_template: str | None,
86 |         prompt_signature: type[EnginePromptSignature] | EnginePromptSignature,
87 |         fewshot_examples: Sequence[pydantic.BaseModel] = (),
88 |     ) -> Executable[EngineResult | None]:
89 |         """Return a prompt executable for the given signature and mode.
90 | 
91 |         This wraps the engine-native generation callable (e.g. DSPy Predict,
92 |         Outlines Generator) with Sieves' uniform interface.
93 |         :param inference_mode: Inference mode to use (e.g. classification, JSON, ... - this is engine-specific).
94 |         :param prompt_template: Prompt template.
95 |         :param prompt_signature: Expected prompt signature type.
96 |         :param fewshot_examples: Few-shot examples.
97 |         :return: Prompt executable.
98 |         """
99 | 
100 |     @staticmethod
101 |     def convert_fewshot_examples(fewshot_examples: Sequence[pydantic.BaseModel]) -> list[dict[str, Any]]:
102 |         """Convert few-shot examples to dicts.
103 | 
104 |         :param fewshot_examples: Fewshot examples to convert.
105 |         :return: Fewshot examples as dicts.
106 |         """
107 |         return [fs_example.model_dump(serialize_as_any=True) for fs_example in fewshot_examples]
108 | 
109 |     @staticmethod
110 |     async def _execute_async_calls(calls: list[Coroutine[Any, Any, Any]] | list[Awaitable[Any]]) -> Any:
111 |         """Execute a batch of async functions.
112 | 
113 |         :param calls: Async calls to execute.
114 |         :return: Parsed response objects.
115 |         """
116 |         return await asyncio.gather(*calls)
117 | 
118 | 
119 | class PydanticEngine(abc.ABC, Engine[EnginePromptSignature, EngineResult, EngineModel, EngineInferenceMode]):
120 |     """Abstract super class for engines using Pydantic signatures and results.
121 | 
122 |     Note that this class also assumes the engine accepts a prompt. This holds true for most engines - the exceptions
123 |     are engines with an idiosyncratic way of processing prompts, like DSPy, and decoder-only models, which don't
124 |     work with object-based signatures anyway.
125 |     If and once we add support for a Pydantic-based engine that doesn't accept prompt templates, we'll adjust by
126 |     modifying `_infer()` to accept an additional parameter specifying how to handle prompt/instruction injection (and
127 |     we might have to make `supports_few_shotting()` engine-specific again).
128 |     """
129 | 
130 |     @classmethod
131 |     def _create_template(cls, template: str | None) -> jinja2.Template:
132 |         """Create Jinja2 template from template string.
133 | 
134 |         :param template: Template string.
135 |         :return: Jinja2 template.
136 |         """
137 |         assert template, f"prompt_template has to be provided to {cls.__name__}."
138 |         return jinja2.Template(template)
139 | 
140 |     @override
141 |     @property
142 |     def supports_few_shotting(self) -> bool:
143 |         return True
144 | 
145 |     def _infer(
146 |         self,
147 |         generator: Callable[[list[str]], Iterable[EngineResult]],
148 |         template: jinja2.Template,
149 |         values: Sequence[dict[str, Any]],
150 |         fewshot_examples: Sequence[pydantic.BaseModel],
151 |     ) -> Iterable[EngineResult | None]:
152 |         """Run inference in batches with exception handling.
153 | 
154 |         :param generator: Callable generating responses.
155 |         :param template: Prompt template.
156 |         :param values: Doc values to inject.
157 |         :param fewshot_examples: Fewshot examples.
158 |         :return: Results parsed from responses.
159 |         """
160 |         fewshot_examples_dict = Engine.convert_fewshot_examples(fewshot_examples)
161 |         examples = {"examples": fewshot_examples_dict} if len(fewshot_examples_dict) else {}
162 | 
163 |         try:
164 |             yield from generator([template.render(**doc_values, **examples) for doc_values in values])
165 | 
166 |         except Exception as err:
167 |             if self._strict_mode:
168 |                 raise type(err)(
169 |                     "Encountered problem when executing prompt. Ensure your few-shot examples and document "
170 |                     "chunks contain sensible information."
171 |                 ) from err
172 |             else:
173 |                 yield from (None for _ in range(len(values)))
174 | 
--------------------------------------------------------------------------------
/sieves/engines/dspy_.py:
--------------------------------------------------------------------------------
1 | """DSPy engine integration for Sieves."""
2 | 
3 | import asyncio
4 | import enum
5 | from collections.abc import Iterable, Sequence
6 | from typing import Any, override
7 | 
8 | import dspy
9 | import nest_asyncio
10 | import pydantic
11 | 
12 | from sieves.engines.core import Engine, Executable
13 | from sieves.engines.types import GenerationSettings
14 | 
15 | PromptSignature = dspy.Signature | dspy.Module
16 | Model = dspy.LM | dspy.BaseLM
17 | Result = dspy.Prediction
18 | 
19 | 
20 | nest_asyncio.apply()
21 | 
22 | 
23 | class InferenceMode(enum.Enum):
24 |     """Available inference modes.
25 | 
26 |     See https://dspy.ai/#__tabbed_2_6 for more information and examples.
27 |     """
28 | 
29 |     # Default inference mode.
30 |     predict = dspy.Predict
31 |     # CoT-style inference.
32 |     chain_of_thought = dspy.ChainOfThought
33 |     # Agentic, i.e. with tool use.
34 |     react = dspy.ReAct
35 |     # For multi-stage pipelines within a task. This is handled differently than the other supported modules: dspy.Module
36 |     # serves as both the signature as well as the inference generator.
37 |     module = dspy.Module
38 | 
39 | 
40 | class DSPy(Engine[PromptSignature, Result, Model, InferenceMode]):
41 |     """Engine for DSPy."""
42 | 
43 |     def __init__(self, model: Model, generation_settings: GenerationSettings):
44 |         """Initialize engine.
45 | 
46 |         :param model: Model to run. Note: DSPy only works with API-served models. To run a model locally (DSPy v2.5
47 |             onwards), serve it with Ollama - see https://dspy.ai/learn/programming/language_models/?h=models#__tabbed_1_5.
48 |             In a nutshell:
49 |             > curl -fsSL https://ollama.ai/install.sh | sh
50 |             > ollama run MODEL_ID
51 |             > `model = dspy.LM(MODEL_ID, api_base='http://localhost:11434', api_key='')`
52 |         :param generation_settings: Settings including DSPy configuration in `config_kwargs`.
53 |         """
54 |         super().__init__(model, generation_settings)
55 |         cfg = generation_settings.config_kwargs or {}
56 |         dspy.configure(lm=model, **cfg)
57 | 
58 |     @override
59 |     @property
60 |     def inference_modes(self) -> type[InferenceMode]:
61 |         return InferenceMode
62 | 
63 |     @override
64 |     @property
65 |     def supports_few_shotting(self) -> bool:
66 |         return True
67 | 
68 |     @override
69 |     def build_executable(
70 |         self,
71 |         inference_mode: InferenceMode,
72 |         prompt_template: str | None,  # noqa: UP007
73 |         prompt_signature: type[PromptSignature] | PromptSignature,
74 |         fewshot_examples: Sequence[pydantic.BaseModel] = tuple(),
75 |     ) -> Executable[Result | None]:
76 |         # Note: prompt_template is ignored here, as DSPy doesn't use it directly (only prompt_signature_description).
77 | 
78 |         # Handled differently than the other supported modules: dspy.Module serves as both the signature as well as
79 |         # the inference generator.
80 |         if inference_mode == InferenceMode.module:
81 |             # In module mode the prompt signature is a dspy.Module *instance*, not a type.
82 |             assert isinstance(prompt_signature, dspy.Module), ValueError(
83 |                 "In inference mode 'module' the provided prompt signature has to be a dspy.Module instance."
84 |             )
85 |             generator = inference_mode.value(**self._init_kwargs)
86 |         else:
87 |             assert isinstance(prompt_signature, type) and issubclass(prompt_signature, dspy.Signature)
88 |             generator = inference_mode.value(signature=prompt_signature, **self._init_kwargs)
89 | 
90 |         def execute(values: Sequence[dict[str, Any]]) -> Iterable[Result | None]:
91 |             """Execute structured generation with DSPy.
92 | 
93 |             :param values: Values to inject into prompts.
94 |             :return: Results for prompts.
95 |             """
96 |             # Compile predictor with few-shot examples.
97 |             fewshot_examples_dicts = DSPy.convert_fewshot_examples(fewshot_examples)
98 |             generator_fewshot: dspy.Module | None = None
99 |             if len(fewshot_examples_dicts):
100 |                 examples = [dspy.Example(**fs_example) for fs_example in fewshot_examples_dicts]
101 |                 generator_fewshot = dspy.LabeledFewShot(k=len(examples)).compile(student=generator, trainset=examples)
102 | 
103 |             try:
104 |                 gen = generator_fewshot or generator
105 |                 calls = [gen.acall(**doc_values, **self._inference_kwargs) for doc_values in values]
106 |                 yield from asyncio.run(self._execute_async_calls(calls))
107 | 
108 |             except Exception as err:
109 |                 if self._strict_mode:
110 |                     raise type(err)(
111 |                         "Encountered problem when executing prompt. Ensure your few-shot examples and document "
112 |                         "chunks contain sensible information."
113 |                     ) from err
114 |                 else:
115 |                     yield from [None] * len(values)
116 | 
117 |         return execute
118 | 
--------------------------------------------------------------------------------
/sieves/engines/engine_import.py:
--------------------------------------------------------------------------------
1 | """Import 3rd-party libraries required for engines.
2 | 
3 | If a library can't be found, a placeholder engine is imported instead.
4 | 
5 | This allows us to import everything downstream without having to worry about optional dependencies. If a user specifies
6 | an engine/model from a non-installed library, we terminate with an error.
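
A sketch of the fallback behavior (this assumes `outlines` is not installed):

    from sieves.engines import Outlines

    # The import above always succeeds. With `outlines` missing, `Outlines` is the
    # `MissingEngine` placeholder, whose methods raise `NotImplementedError`.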
7 | """ 8 | 9 | # mypy: disable-error-code="no-redef" 10 | 11 | import warnings 12 | 13 | from .missing import MissingEngine 14 | 15 | _missing_dependencies: list[str] = [] 16 | 17 | 18 | try: 19 | from . import dspy_ 20 | from .dspy_ import DSPy 21 | except ModuleNotFoundError: 22 | from . import missing as dspy_ 23 | 24 | DSPy = MissingEngine # type: ignore[misc,assignment] 25 | _missing_dependencies.append("dspy") 26 | 27 | 28 | try: 29 | from . import glix_ 30 | from .glix_ import GliX 31 | except ModuleNotFoundError: 32 | from . import missing as glix_ 33 | 34 | GliX = MissingEngine # type: ignore[misc,assignment] 35 | _missing_dependencies.append("gliner") 36 | 37 | 38 | try: 39 | from . import huggingface_ 40 | from .huggingface_ import HuggingFace 41 | except ModuleNotFoundError: 42 | from . import missing as huggingface_ 43 | 44 | HuggingFace = MissingEngine # type: ignore[misc,assignment] 45 | _missing_dependencies.append("transformers") 46 | 47 | 48 | try: 49 | from . import langchain_ 50 | from .langchain_ import LangChain 51 | except ModuleNotFoundError: 52 | from . import missing as langchain_ 53 | 54 | LangChain = MissingEngine # type: ignore[misc,assignment] 55 | _missing_dependencies.append("langchain") 56 | 57 | 58 | try: 59 | from . import outlines_ 60 | from .outlines_ import Outlines 61 | except ModuleNotFoundError: 62 | from . import missing as outlines_ 63 | 64 | Outlines = MissingEngine # type: ignore[misc,assignment] 65 | _missing_dependencies.append("outlines") 66 | 67 | 68 | if len(_missing_dependencies): 69 | warnings.warn( 70 | "Warning: structured generation dependencies [{deps}] could not be imported. Generating with them requires them" 71 | " to be installed.".format(deps=", ".join(_missing_dependencies)) 72 | ) 73 | 74 | 75 | __all__ = [ 76 | "dspy_", 77 | "DSPy", 78 | "glix_", 79 | "GliX", 80 | "huggingface_", 81 | "HuggingFace", 82 | "langchain_", 83 | "LangChain", 84 | "outlines_", 85 | "Outlines", 86 | ] 87 | -------------------------------------------------------------------------------- /sieves/engines/engine_type.py: -------------------------------------------------------------------------------- 1 | """Engine type enum and utilities.""" 2 | 3 | from __future__ import annotations 4 | 5 | import enum 6 | 7 | from .core import Engine, EngineInferenceMode, EngineModel, EnginePromptSignature, EngineResult 8 | from .engine_import import dspy_, glix_, huggingface_, langchain_, outlines_ 9 | 10 | 11 | class EngineType(enum.Enum): 12 | """Available engine types.""" 13 | 14 | dspy = dspy_.DSPy 15 | glix = glix_.GliX 16 | huggingface = huggingface_.HuggingFace 17 | langchain = langchain_.LangChain 18 | outlines = outlines_.Outlines 19 | 20 | @classmethod 21 | def all(cls) -> tuple[EngineType, ...]: 22 | """Return all available engine types. 23 | 24 | :return tuple[EngineType, ...]: All available engine types. 25 | """ 26 | return tuple(EngineType) 27 | 28 | @classmethod 29 | def get_engine_type( 30 | cls, engine: Engine[EnginePromptSignature, EngineResult, EngineModel, EngineInferenceMode] 31 | ) -> EngineType: 32 | """Return engine type for specified engine. 33 | 34 | :param engine: Engine to get type for. 35 | :return EngineType: Engine type for self._engine. 36 | :raises ValueError: if engine class not found in EngineType. 
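        Example (sketch - `outlines_engine` stands in for an instantiated Outlines engine):

            engine_type = EngineType.get_engine_type(outlines_engine)
            assert engine_type is EngineType.outlines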
37 | """ 38 | for et in EngineType: 39 | if isinstance(engine, et.value): 40 | return et 41 | raise ValueError(f"Engine class {engine.__class__.__name__} not found in EngineType.") 42 | -------------------------------------------------------------------------------- /sieves/engines/glix_.py: -------------------------------------------------------------------------------- 1 | """GliX engine wrapper built on top of GLiNER multi‑task pipelines.""" 2 | 3 | import enum 4 | import warnings 5 | from collections.abc import Iterable, Sequence 6 | from typing import Any, override 7 | 8 | import gliner.multitask.base 9 | import jinja2 10 | import pydantic 11 | 12 | from sieves.engines.core import Engine, Executable 13 | from sieves.engines.types import GenerationSettings 14 | 15 | PromptSignature = list[str] 16 | Model = gliner.model.GLiNER 17 | Result = list[dict[str, str | float]] | str 18 | 19 | 20 | class InferenceMode(enum.Enum): 21 | """Available inference modes.""" 22 | 23 | ner = gliner.config.GLiNERConfig 24 | classification = gliner.multitask.GLiNERClassifier 25 | question_answering = gliner.multitask.GLiNERQuestionAnswerer 26 | information_extraction = gliner.multitask.GLiNEROpenExtractor 27 | summarization = gliner.multitask.GLiNERSummarizer 28 | relation_extraction = gliner.multitask.GLiNERRelationExtractor 29 | 30 | 31 | class GliX(Engine[PromptSignature, Result, Model, InferenceMode]): 32 | """Engine adapter for GLiNER's multitask utilities (NER, CLS, QA, etc.).""" 33 | 34 | def __init__(self, model: Model, generation_settings: GenerationSettings): 35 | """Initialize GliX engine wrapper with model and settings.""" 36 | super().__init__(model, generation_settings) 37 | self._model_wrappers: dict[InferenceMode, gliner.multitask.base.GLiNERBasePipeline] = {} 38 | 39 | @override 40 | @property 41 | def inference_modes(self) -> type[InferenceMode]: 42 | return InferenceMode 43 | 44 | @override 45 | @property 46 | def supports_few_shotting(self) -> bool: 47 | return False 48 | 49 | @override 50 | def build_executable( 51 | self, 52 | inference_mode: InferenceMode, 53 | prompt_template: str | None, 54 | prompt_signature: type[PromptSignature] | PromptSignature, 55 | fewshot_examples: Sequence[pydantic.BaseModel] = (), 56 | ) -> Executable[Result]: 57 | assert isinstance(prompt_signature, list) 58 | cls_name = self.__class__.__name__ 59 | if len(list(fewshot_examples)): 60 | warnings.warn(f"Few-shot examples are not supported by engine {cls_name}.") 61 | 62 | # Lazily initialize multi-task wrapper for underlying GliNER model. 63 | if inference_mode not in self._model_wrappers: 64 | self._model_wrappers[inference_mode] = inference_mode.value(model=self._model) 65 | 66 | model = self._model_wrappers[inference_mode] 67 | 68 | # Overwrite prompt default template, if template specified. Note that this is a static prompt and GliX doesn't 69 | # do few-shotting, so we don't inject anything into the template. 70 | if prompt_template: 71 | self._model.prompt = jinja2.Template(prompt_template).render() 72 | 73 | def execute(values: Sequence[dict[str, Any]]) -> Iterable[Result]: 74 | """Execute prompts with engine for given values. 75 | 76 | :param values: Values to inject into prompts. 77 | :return Iterable[Result]: Results for prompts. 
78 | """ 79 | try: 80 | params: dict[InferenceMode, dict[str, Any]] = { 81 | InferenceMode.classification: {"classes": prompt_signature, "multi_label": True}, 82 | InferenceMode.question_answering: {"questions": prompt_signature}, 83 | InferenceMode.summarization: {}, 84 | InferenceMode.ner: {"entity_types": prompt_signature}, 85 | } 86 | selected_params = params[inference_mode] # Select parameters based on inference mode 87 | except KeyError: 88 | raise ValueError(f"Inference mode {inference_mode} not supported by {cls_name} engine.") 89 | 90 | texts = [val["text"] for val in values] 91 | if inference_mode == InferenceMode.ner: 92 | yield from self._model.batch_predict_entities(texts=texts, labels=selected_params["entity_types"]) 93 | else: 94 | assert isinstance(selected_params, dict) 95 | yield from model(texts, **(selected_params | self._inference_kwargs)) 96 | 97 | return execute 98 | -------------------------------------------------------------------------------- /sieves/engines/huggingface_.py: -------------------------------------------------------------------------------- 1 | """Hugging Face transformers engine wrapper (zero-shot classification).""" 2 | 3 | import enum 4 | from collections.abc import Iterable, Sequence 5 | from typing import Any, override 6 | 7 | import jinja2 8 | import pydantic 9 | import transformers 10 | 11 | from sieves.engines.core import Engine, Executable 12 | 13 | PromptSignature = list[str] 14 | Model = transformers.Pipeline 15 | Result = dict[str, list[str] | list[float]] 16 | 17 | 18 | class InferenceMode(enum.Enum): 19 | """Available inference modes.""" 20 | 21 | zeroshot_cls = 0 22 | 23 | 24 | class HuggingFace(Engine[PromptSignature, Result, Model, InferenceMode]): 25 | """Engine adapter around ``transformers.Pipeline`` for zero‑shot tasks.""" 26 | 27 | @override 28 | @property 29 | def inference_modes(self) -> type[InferenceMode]: 30 | return InferenceMode 31 | 32 | @override 33 | @property 34 | def supports_few_shotting(self) -> bool: 35 | return True 36 | 37 | @override 38 | def build_executable( 39 | self, 40 | inference_mode: InferenceMode, 41 | prompt_template: str | None, 42 | prompt_signature: type[PromptSignature] | PromptSignature, 43 | fewshot_examples: Sequence[pydantic.BaseModel] = (), 44 | ) -> Executable[Result | None]: 45 | cls_name = self.__class__.__name__ 46 | assert prompt_template, ValueError(f"prompt_template has to be provided to {cls_name} engine by task.") 47 | assert isinstance(prompt_signature, list) 48 | 49 | # Render template with few-shot examples. Note that we don't use extracted document values here, as HF zero-shot 50 | # pipelines only support one hypothesis template per call - and we want to batch, so our hypothesis template 51 | # will be document-invariant. 52 | fewshot_examples_dict = HuggingFace.convert_fewshot_examples(fewshot_examples) 53 | # Render hypothesis template with everything but text. 54 | template = jinja2.Template(prompt_template).render(**({"examples": fewshot_examples_dict})) 55 | 56 | def execute(values: Sequence[dict[str, Any]]) -> Iterable[Result]: 57 | """Execute prompts with engine for given values. 58 | 59 | :param values: Values to inject into prompts. 60 | :return Iterable[Result]: Results for prompts. 
61 |             """
62 |             match inference_mode:
63 |                 case InferenceMode.zeroshot_cls:
64 |                     yield from self._model(
65 |                         sequences=[doc_values["text"] for doc_values in values],
66 |                         candidate_labels=prompt_signature,
67 |                         hypothesis_template=template,
68 |                         multi_label=True,
69 |                         **self._inference_kwargs,
70 |                     )
71 | 
72 |                 case _:
73 |                     raise ValueError(f"Inference mode {inference_mode} not supported by {cls_name} engine.")
74 | 
75 |         return execute
76 | 
--------------------------------------------------------------------------------
/sieves/engines/langchain_.py:
--------------------------------------------------------------------------------
1 | """LangChain engine wrapper for structured outputs using Pydantic."""
2 | 
3 | import asyncio
4 | import enum
5 | from collections.abc import Iterable, Sequence
6 | from typing import Any, override
7 | 
8 | import langchain_core.language_models
9 | import nest_asyncio
10 | import pydantic
11 | 
12 | from sieves.engines.core import Executable, PydanticEngine
13 | 
14 | nest_asyncio.apply()
15 | 
16 | Model = langchain_core.language_models.BaseChatModel
17 | PromptSignature = pydantic.BaseModel
18 | Result = pydantic.BaseModel
19 | 
20 | 
21 | class InferenceMode(enum.Enum):
22 |     """Available inference modes."""
23 | 
24 |     structured = "structured"
25 | 
26 | 
27 | class LangChain(PydanticEngine[PromptSignature, Result, Model, InferenceMode]):
28 |     """Engine for LangChain."""
29 | 
30 |     @override
31 |     @property
32 |     def inference_modes(self) -> type[InferenceMode]:
33 |         return InferenceMode
34 | 
35 | 
36 |     @override
37 |     def build_executable(
38 |         self,
39 |         inference_mode: InferenceMode,
40 |         prompt_template: str | None,  # noqa: UP007
41 |         prompt_signature: type[PromptSignature] | PromptSignature,
42 |         fewshot_examples: Sequence[pydantic.BaseModel] = tuple(),
43 |     ) -> Executable[Result | None]:
44 |         assert isinstance(prompt_signature, type)
45 |         cls_name = self.__class__.__name__
46 |         template = self._create_template(prompt_template)
47 |         model = self._model.with_structured_output(prompt_signature)
48 | 
49 |         def execute(values: Sequence[dict[str, Any]]) -> Iterable[Result | None]:
50 |             """Execute prompts with engine for given values.
51 | 
52 |             :param values: Values to inject into prompts.
53 |             :return Iterable[Result | None]: Results for prompts. Results are None if corresponding prompt failed.
54 |             """
55 |             match inference_mode:
56 |                 case InferenceMode.structured:
57 | 
58 |                     def generate(prompts: list[str]) -> Iterable[Result]:
59 |                         try:
60 |                             yield from asyncio.run(model.abatch(prompts, **self._inference_kwargs))
61 | 
62 |                         except Exception as err:
63 |                             raise type(err)(
64 |                                 f"Encountered problem in parsing {cls_name} output. Double-check your prompts and "
65 |                                 f"examples."
66 | ) from err 67 | 68 | generator = generate 69 | case _: 70 | raise ValueError(f"Inference mode {inference_mode} not supported by {cls_name} engine.") 71 | 72 | yield from self._infer(generator, template, values, fewshot_examples) 73 | 74 | return execute 75 | -------------------------------------------------------------------------------- /sieves/engines/missing.py: -------------------------------------------------------------------------------- 1 | """Fallback engine types when optional dependencies are unavailable.""" 2 | 3 | import enum 4 | from collections.abc import Callable, Iterable, Sequence 5 | from typing import Any, override 6 | 7 | import pydantic 8 | 9 | from sieves.engines.core import Engine 10 | 11 | PromptSignature = Any 12 | Model = Any 13 | Result = Any 14 | 15 | 16 | class InferenceMode(enum.Enum): 17 | """Placeholder mode for unsupported engines.""" 18 | 19 | any = Any 20 | 21 | 22 | class MissingEngine(Engine[PromptSignature, Result, Model, InferenceMode]): 23 | """Placeholder for engine that couldn't be imported due to missing dependencies.""" 24 | 25 | @override 26 | @property 27 | def supports_few_shotting(self) -> bool: 28 | raise NotImplementedError 29 | 30 | @override 31 | @property 32 | def inference_modes(self) -> type[InferenceMode]: 33 | raise NotImplementedError 34 | 35 | @override 36 | def build_executable( 37 | self, 38 | inference_mode: InferenceMode, 39 | prompt_template: str | None, 40 | prompt_signature: type[PromptSignature] | PromptSignature, 41 | fewshot_examples: Sequence[pydantic.BaseModel] = (), 42 | ) -> Callable[[Iterable[dict[str, Any]]], Iterable[Result | None]]: 43 | raise NotImplementedError 44 | -------------------------------------------------------------------------------- /sieves/engines/outlines_.py: -------------------------------------------------------------------------------- 1 | """Outlines engine wrapper supporting text, choices, regex and JSON schemas.""" 2 | 3 | import enum 4 | from collections.abc import Iterable, Sequence 5 | from typing import Any, Literal, override 6 | 7 | import outlines 8 | import pydantic 9 | from outlines.models import AsyncBlackBoxModel, BlackBoxModel, SteerableModel 10 | 11 | from sieves.engines.core import Executable, PydanticEngine 12 | 13 | PromptSignature = ( 14 | pydantic.BaseModel | list[str] | str | outlines.types.Choice | outlines.types.Regex | outlines.types.JsonSchema 15 | ) 16 | Model = AsyncBlackBoxModel | BlackBoxModel | SteerableModel 17 | Result = pydantic.BaseModel | str 18 | 19 | 20 | class InferenceMode(enum.Enum): 21 | """Available inference modes. 22 | 23 | Note: generator functions are wrapped in tuples, as otherwise the Enum instance seems to be replaced by the function 24 | itself - not sure why that happens. Should take another look at this. 25 | """ 26 | 27 | # For normal text output, i.e. no structured generation. 28 | text = "text" 29 | # For limited set of choices, e.g. classification. 30 | choice = "choice" 31 | # Regex-conforming output. 32 | regex = "regex" 33 | # Output conforming to Pydantic models. 
34 |     json = "json"
35 | 
36 | 
37 | class Outlines(PydanticEngine[PromptSignature, Result, Model, InferenceMode]):
38 |     """Engine for Outlines with multiple structured inference modes."""
39 | 
40 |     @override
41 |     @property
42 |     def inference_modes(self) -> type[InferenceMode]:
43 |         return InferenceMode
44 | 
45 |     @override
46 |     def build_executable(
47 |         self,
48 |         inference_mode: InferenceMode,
49 |         prompt_template: str | None,  # noqa: UP007
50 |         prompt_signature: type[PromptSignature] | PromptSignature,
51 |         fewshot_examples: Sequence[pydantic.BaseModel] = (),
52 |     ) -> Executable[Result | None]:
53 |         template = self._create_template(prompt_template)
54 | 
55 |         # Create Generator instance responsible for generating non-parsed text.
56 |         if isinstance(prompt_signature, list):
57 |             prompt_signature = Literal[*prompt_signature]
58 | 
59 |         if inference_mode == InferenceMode.regex:
60 |             prompt_signature = outlines.types.Regex(prompt_signature)
61 | 
62 |         generator = outlines.Generator(self._model, output_type=prompt_signature, **self._init_kwargs)
63 | 
64 |         def execute(values: Sequence[dict[str, Any]]) -> Iterable[Result | None]:
65 |             """Execute prompts with engine for given values.
66 | 
67 |             :param values: Values to inject into prompts.
68 |             :return Iterable[Result | None]: Results for prompts. Results are None if corresponding prompt failed.
69 |             """
70 | 
71 |             def generate(prompts: list[str]) -> Iterable[Result]:
72 |                 try:
73 |                     results = generator.batch(prompts, **self._inference_kwargs)
74 |                 # Batch mode is not implemented for all Outlines wrappers. Fall back to single-prompt mode in that case.
75 |                 except NotImplementedError:
76 |                     # The generator call itself is synchronous, so the fallback simply loops over the prompts.
77 |                     results = [generator(prompt, **self._inference_kwargs) for prompt in prompts]
78 | 
79 |                 if inference_mode == InferenceMode.json:
80 |                     assert len(results) == len(prompts)
81 |                     assert isinstance(prompt_signature, type) and issubclass(prompt_signature, pydantic.BaseModel)
82 |                     yield from [prompt_signature.model_validate_json(result) for result in results]
83 |                 else:
84 |                     yield from results
85 | 
86 |             yield from self._infer(
87 |                 generate,
88 |                 template,
89 |                 values,
90 |                 fewshot_examples,
91 |             )
92 | 
93 |         return execute
94 | 
--------------------------------------------------------------------------------
/sieves/engines/types.py:
--------------------------------------------------------------------------------
1 | """Common types."""
2 | 
3 | from typing import Any
4 | 
5 | import pydantic
6 | 
7 | 
8 | class GenerationSettings(pydantic.BaseModel):
9 |     """Settings for structured generation.
10 | 
11 |     :param init_kwargs: kwargs passed on to initialization of structured generator. Not all engines use this - ignored
12 |         otherwise.
13 |     :param inference_kwargs: kwargs passed on to inference with structured generator.
14 |     :param config_kwargs: Used only if supplied model is a DSPy model object, ignored otherwise. Optional kwargs
15 |         supplied to dspy.configure().
16 |     :param strict_mode: If True, exception is raised if prompt response can't be parsed correctly.
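
    A minimal sketch (the kwargs shown are illustrative - which of them take effect depends on the engine):

        settings = GenerationSettings(
            inference_kwargs={"max_tokens": 256},
            strict_mode=True,
        )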
17 | """ 18 | 19 | init_kwargs: dict[str, Any] | None = None 20 | inference_kwargs: dict[str, Any] | None = None 21 | config_kwargs: dict[str, Any] | None = None 22 | strict_mode: bool = False 23 | -------------------------------------------------------------------------------- /sieves/engines/utils.py: -------------------------------------------------------------------------------- 1 | """Utils for engines.""" 2 | 3 | import outlines 4 | import transformers 5 | 6 | from sieves.engines.core import Engine, EngineInferenceMode, EngineModel, EnginePromptSignature, EngineResult 7 | from sieves.engines.engine_import import ( 8 | dspy_, 9 | glix_, 10 | huggingface_, 11 | langchain_, 12 | outlines_, 13 | ) 14 | from sieves.engines.types import GenerationSettings 15 | 16 | Model = dspy_.Model | glix_.Model | huggingface_.Model | langchain_.Model | outlines_.Model 17 | 18 | 19 | def init_default_model() -> outlines.models.Transformers: # noqa: D401 20 | """Initialize default model (HuggingFaceTB/SmolLM-360M-Instruct with Outlines). 21 | 22 | :return: Initialized default model. 23 | """ 24 | model_name = "HuggingFaceTB/SmolLM-360M-Instruct" 25 | 26 | return outlines.models.from_transformers( 27 | transformers.AutoModelForCausalLM.from_pretrained(model_name), 28 | transformers.AutoTokenizer.from_pretrained(model_name), 29 | ) 30 | 31 | 32 | def init_engine( 33 | model: Model, generation_settings: GenerationSettings 34 | ) -> Engine[EnginePromptSignature, EngineResult, EngineModel, EngineInferenceMode]: # noqa: D401 35 | """Initialize internal engine object. 36 | 37 | :param model: Model to use. 38 | :param generation_settings: Settings for structured generation. 39 | :return Engine: Engine. 40 | :raises ValueError: If model type isn't supported. 41 | """ 42 | model_type = type(model) 43 | module_engine_map = { 44 | dspy_: dspy_.DSPy, 45 | glix_: glix_.GliX, 46 | huggingface_: huggingface_.HuggingFace, 47 | langchain_: langchain_.LangChain, 48 | outlines_: outlines_.Outlines, 49 | } 50 | 51 | for module, engine_type in module_engine_map.items(): 52 | try: 53 | module_model_types = module.Model.__args__ 54 | except AttributeError: 55 | module_model_types = (module.Model,) 56 | 57 | if any(issubclass(model_type, module_model_type) for module_model_type in module_model_types): 58 | internal_engine = engine_type( 59 | model=model, 60 | generation_settings=generation_settings, 61 | ) 62 | assert isinstance(internal_engine, Engine) 63 | 64 | return internal_engine 65 | 66 | raise ValueError( 67 | f"Model type {model.__class__} is not supported. Please check the documentation and ensure you're " 68 | f"providing a supported model type." 69 | ) 70 | -------------------------------------------------------------------------------- /sieves/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import Pipeline 2 | 3 | __all__ = ["Pipeline"] 4 | -------------------------------------------------------------------------------- /sieves/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | """Tasks.""" 2 | 3 | from . 
import predictive, preprocessing 4 | from .core import Task 5 | from .postprocessing import DistillationFramework 6 | from .predictive import ( 7 | NER, 8 | Classification, 9 | InformationExtraction, 10 | PIIMasking, 11 | QuestionAnswering, 12 | SentimentAnalysis, 13 | Summarization, 14 | Translation, 15 | ) 16 | from .predictive.core import PredictiveTask 17 | from .preprocessing import Chunking, Ingestion 18 | 19 | __all__ = [ 20 | "Chunking", 21 | "Classification", 22 | "DistillationFramework", 23 | "NER", 24 | "InformationExtraction", 25 | "Ingestion", 26 | "SentimentAnalysis", 27 | "Summarization", 28 | "Translation", 29 | "QuestionAnswering", 30 | "PIIMasking", 31 | "Task", 32 | "predictive", 33 | "PredictiveTask", 34 | "preprocessing", 35 | ] 36 | -------------------------------------------------------------------------------- /sieves/tasks/core.py: -------------------------------------------------------------------------------- 1 | """Core task implementation.""" 2 | 3 | from __future__ import annotations 4 | 5 | import abc 6 | from collections.abc import Iterable 7 | from typing import TYPE_CHECKING, Any 8 | 9 | from sieves.data import Doc 10 | from sieves.serialization import Attribute, Config 11 | 12 | if TYPE_CHECKING: 13 | # Imported only for type checking to avoid import cycles at runtime. 14 | from sieves.pipeline import Pipeline 15 | 16 | 17 | class Task(abc.ABC): 18 | """Abstract base class for tasks that can be executed on documents.""" 19 | 20 | def __init__(self, task_id: str | None, include_meta: bool, batch_size: int): 21 | """ 22 | Initiate new Task. 23 | 24 | :param task_id: Task ID. 25 | :param include_meta: Whether to include meta information generated by the task. 26 | :param batch_size: Batch size for processing documents. Use -1 to process all documents at once. 27 | """ 28 | self._task_id = task_id if task_id else self.__class__.__name__ 29 | self._include_meta = include_meta 30 | self._batch_size = batch_size 31 | 32 | @property 33 | def id(self) -> str: 34 | """Return task ID. 35 | 36 | Used by pipeline for results and dependency management. 37 | 38 | :return: Task ID. 39 | """ 40 | return self._task_id 41 | 42 | @abc.abstractmethod 43 | def __call__(self, docs: Iterable[Doc]) -> Iterable[Doc]: 44 | """Execute task. 45 | 46 | :param docs: Docs to process. 47 | :return: Processed docs. 48 | """ 49 | 50 | def __add__(self, other: Task | Pipeline) -> Pipeline: 51 | """Chain this task with another task or pipeline using the ``+`` operator. 52 | 53 | This returns a new ``Pipeline`` that executes this task first, followed by the 54 | task(s) in ``other``. The original task(s)/pipeline are not mutated. 55 | 56 | Cache semantics: 57 | - If ``other`` is a ``Pipeline``, the resulting pipeline adopts ``other``'s 58 | ``use_cache`` setting (because the left-hand side is a single task). 59 | - If ``other`` is a ``Task``, the resulting pipeline defaults to ``use_cache=True``. 60 | 61 | :param other: A ``Task`` or ``Pipeline`` to execute after this task. 62 | :return: A new ``Pipeline`` representing the chained execution. 63 | :raises TypeError: If ``other`` is not a ``Task`` or ``Pipeline``. 64 | """ 65 | # Lazy import to avoid circular dependency at module import time. 
66 |         from sieves.pipeline import Pipeline
67 | 
68 |         if isinstance(other, Pipeline):
69 |             return Pipeline(tasks=[self, *other.tasks], use_cache=other.use_cache)
70 | 
71 |         if isinstance(other, Task):
72 |             return Pipeline(tasks=[self, other])
73 | 
74 |         raise TypeError(f"Cannot chain Task with {type(other).__name__}")
75 | 
76 |     @property
77 |     def _state(self) -> dict[str, Any]:
78 |         """Return attributes to serialize.
79 | 
80 |         :return: Dict of attributes to serialize.
81 |         """
82 |         return {
83 |             "task_id": self._task_id,
84 |             "include_meta": self._include_meta,
85 |             "batch_size": self._batch_size,
86 |         }
87 | 
88 |     def serialize(self) -> Config:
89 |         """Serialize task.
90 | 
91 |         :return: Config instance.
92 |         """
93 |         return Config.create(self.__class__, {k: Attribute(value=v) for k, v in self._state.items()})
94 | 
95 |     @classmethod
96 |     def deserialize(cls, config: Config, **kwargs: dict[str, Any]) -> Task:
97 |         """Generate Task instance from config.
98 | 
99 |         :param config: Config to generate instance from.
100 |         :param kwargs: Values to inject into loaded config.
101 |         :return: Deserialized Task instance.
102 |         """
103 |         # Deserialize and inject engine.
104 |         return cls(**config.to_init_dict(cls, **kwargs))
105 | 
--------------------------------------------------------------------------------
/sieves/tasks/optimization/__init__.py:
--------------------------------------------------------------------------------
1 | """Prompt/few-shot example optimization for tasks."""
2 | 
3 | from sieves.tasks.optimization.core import EvalMetric, Optimizer
4 | 
5 | __all__ = ["EvalMetric", "Optimizer"]
6 | 
--------------------------------------------------------------------------------
/sieves/tasks/optimization/core.py:
--------------------------------------------------------------------------------
1 | """Optimizer implementation."""
2 | 
3 | import random
4 | from collections.abc import Callable
5 | from typing import Any, Self
6 | 
7 | import dspy
8 | 
9 | from sieves.serialization import Attribute, Config
10 | 
11 | EvalMetric = Callable[[dspy.Example, dspy.Prediction], float]
12 | 
13 | 
14 | class Optimizer:
15 |     """Optimizes task prompts and few-shot examples with DSPy.
16 | 
17 |     Uses MIPROv2 to optimize instructions and few-shot examples.
18 |     """
19 | 
20 |     def __init__(
21 |         self,
22 |         model: dspy.LM | dspy.BaseLM,
23 |         val_frac: float,
24 |         seed: int | None = None,
25 |         shuffle: bool = True,
26 |         dspy_init_kwargs: dict[str, Any] | None = None,
27 |         dspy_compile_kwargs: dict[str, Any] | None = None,
28 |     ):
29 |         """Initialize optimizer.
30 | 
31 |         :param model: Fully initialized DSPy model to use for optimization. Doesn't have to be the same as the model
32 |             used to run the task, but the more similar, the better. With a lot of data you might want to pick a
33 |             faster/cheaper model.
34 |         :param val_frac: Fraction of examples to use for validation. Everything else is used for optimization.
35 |         :param seed: Random seed for data splitting.
36 |         :param shuffle: Whether to shuffle the data.
37 |         :param dspy_init_kwargs: Optional keyword arguments to pass to DSPy optimizer at init time.
38 |         :param dspy_compile_kwargs: Optional keyword arguments to pass to DSPy optimizer at compile time.
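
        A minimal construction sketch (the model ID and fractions are illustrative):

            import dspy

            optimizer = Optimizer(
                model=dspy.LM("openai/gpt-4o-mini"),
                val_frac=0.2,
                seed=42,
            )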
39 |         """
40 |         self._model = model
41 |         self._val_frac = val_frac
42 |         self._seed = seed
43 |         self._shuffle = shuffle
44 |         self._init_kwargs = dspy_init_kwargs or {}
45 |         self._compile_kwargs = {"requires_permission_to_run": False} | (dspy_compile_kwargs or {})
46 | 
47 |     def __call__(
48 |         self,
49 |         signature: type[dspy.Signature] | type[dspy.Module],
50 |         data: list[dspy.Example],
51 |         evaluate: EvalMetric,
52 |         verbose: bool = False,
53 |     ) -> tuple[str, list[dspy.Example]]:
54 |         """Optimize prompt and few-shot examples w.r.t. given signature and dataset.
55 | 
56 |         :param signature: Signature of the task to optimize.
57 |         :param data: Dataset to use for optimization.
58 |         :param evaluate: Evaluation metric to use for optimization.
59 |         :param verbose: Whether to log DSPy output.
60 |         :return: Best combination of (1) prompt and (2) fewshot-examples.
61 |         """
62 |         predictor = dspy.Predict(signature)
63 |         teleprompter = dspy.MIPROv2(metric=evaluate, **(self._init_kwargs or {}), verbose=verbose)
64 |         trainset, devset = self._split_data(data, self._val_frac, self._seed, self._shuffle)
65 | 
66 |         optimized_predictor: dspy.Predict = teleprompter.compile(
67 |             predictor, trainset=trainset, valset=devset, **(self._compile_kwargs or {})
68 |         )
69 | 
70 |         return optimized_predictor.signature.instructions, optimized_predictor.demos
71 | 
72 |     @property
73 |     def model(self) -> dspy.LM:
74 |         """Return model used for optimization.
75 | 
76 |         :return dspy.LM: Model used for optimization.
77 |         """
78 |         return self._model
79 | 
80 |     @property
81 |     def _state(self) -> dict[str, Any]:
82 |         """Return attributes to serialize.
83 | 
84 |         :return: Dict of attributes to serialize.
85 |         """
86 |         return {
87 |             "model": self._model,
88 |             "val_frac": self._val_frac,
89 |             "seed": self._seed,
90 |             "shuffle": self._shuffle,
91 |             "init_kwargs": self._init_kwargs,
92 |             "compile_kwargs": self._compile_kwargs,
93 |         }
94 | 
95 |     def serialize(self) -> Config:
96 |         """Serialize optimizer.
97 | 
98 |         :return: Config instance.
99 |         """
100 |         return Config.create(self.__class__, {k: Attribute(value=v) for k, v in self._state.items()})
101 | 
102 |     @classmethod
103 |     def deserialize(cls, config: Config, **kwargs: dict[str, Any]) -> Self:
104 |         """Generate Optimizer instance from config.
105 | 
106 |         :param config: Config to generate instance from.
107 |         :param kwargs: Values to inject into loaded config.
108 |         :return: Deserialized Optimizer instance.
109 |         """
110 |         return cls(**config.to_init_dict(cls, **kwargs))
111 | 
112 |     @staticmethod
113 |     def _split_data(
114 |         data: list[dspy.Example], val_frac: float, seed: int | None, shuffle: bool
115 |     ) -> tuple[list[dspy.Example], list[dspy.Example]]:
116 |         """Split data into train and validation sets.
117 | 
118 |         :param data: Dataset to split.
119 |         :param val_frac: Fraction of data to use for validation.
120 |         :param seed: Random seed for shuffling.
121 |         :param shuffle: Whether to shuffle the data before splitting.
122 |         :return: Tuple of (trainset, valset).
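        For example, 10 examples with `val_frac=0.2` yield 2 validation and 8 training examples (after optional shuffling).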
123 |         """
124 |         dataset = data.copy()
125 |         if shuffle:
126 |             rng = random.Random(seed)
127 |             rng.shuffle(dataset)
128 | 
129 |         val_size = int(len(dataset) * val_frac)
130 |         trainset = dataset[val_size:]
131 |         valset = dataset[:val_size]
132 | 
133 |         return trainset, valset
134 | 
--------------------------------------------------------------------------------
/sieves/tasks/postprocessing/__init__.py:
--------------------------------------------------------------------------------
1 | """Postprocessing tasks."""
2 | 
3 | from .distillation import DistillationFramework
4 | 
5 | __all__ = ["DistillationFramework"]
6 | 
--------------------------------------------------------------------------------
/sieves/tasks/postprocessing/distillation/__init__.py:
--------------------------------------------------------------------------------
1 | """Distillation."""
2 | 
3 | from .types import DistillationFramework, DistillationFrameworkLiteral
4 | 
5 | __all__ = ["DistillationFramework", "DistillationFrameworkLiteral"]
6 | 
--------------------------------------------------------------------------------
/sieves/tasks/postprocessing/distillation/distillation_import.py:
--------------------------------------------------------------------------------
1 | """Import 3rd-party libraries required for distillation.
2 | 
3 | If a library can't be found, a None placeholder is assigned instead.
4 | 
5 | This allows us to import everything downstream without having to worry about optional dependencies. If a user specifies
6 | a non-installed distillation framework, we terminate with an error.
7 | """
8 | 
9 | # mypy: disable-error-code="no-redef"
10 | 
11 | import warnings
12 | 
13 | _missing_dependencies: list[str] = []
14 | 
15 | 
16 | try:
17 |     import sentence_transformers
18 | except ModuleNotFoundError:
19 |     sentence_transformers = None
20 | 
21 |     _missing_dependencies.append("sentence_transformers")
22 | 
23 | try:
24 |     import setfit
25 | except ModuleNotFoundError:
26 |     setfit = None
27 | 
28 |     _missing_dependencies.append("setfit")
29 | 
30 | try:
31 |     import model2vec
32 |     import model2vec.train
33 | except ModuleNotFoundError:
34 |     model2vec = None
35 | 
36 |     _missing_dependencies.append("model2vec")
37 | 
38 | if len(_missing_dependencies):
39 |     warnings.warn(
40 |         "Warning: distillation dependencies [{deps}] could not be imported. Distilling with these frameworks "
41 |         "requires them to be installed.".format(deps=", ".join(_missing_dependencies))
42 |     )
43 | 
44 | __all__ = ["model2vec", "sentence_transformers", "setfit"]
45 | 
--------------------------------------------------------------------------------
/sieves/tasks/postprocessing/distillation/types.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | 
3 | import enum
4 | from typing import Literal
5 | 
6 | 
7 | class DistillationFramework(enum.Enum):
8 |     model2vec = "model2vec"
9 |     sentence_transformers = "sentence_transformers"
10 |     setfit = "setfit"
11 | 
12 |     @classmethod
13 |     def all(cls) -> tuple[DistillationFramework, ...]:
14 |         """Return all available distillation frameworks.
15 |         :return tuple[DistillationFramework, ...]: All available distillation frameworks.
16 |         """
17 |         return tuple(DistillationFramework)
18 | 
19 | 
20 | DistillationFrameworkLiteral = Literal[*DistillationFramework.all()]  # type: ignore[valid-type]
21 | 
--------------------------------------------------------------------------------
/sieves/tasks/predictive/__init__.py:
--------------------------------------------------------------------------------
1 | """Predictive tasks."""
2 | 
3 | from .classification import Classification
4 | from .core import PredictiveTask
5 | from .information_extraction import InformationExtraction
6 | from .ner import NER
7 | from .pii_masking import PIIMasking
8 | from .question_answering import QuestionAnswering
9 | from .sentiment_analysis import SentimentAnalysis
10 | from .summarization import Summarization
11 | from .translation import Translation
12 | 
13 | __all__ = [
14 |     "Classification",
15 |     "InformationExtraction",
16 |     "SentimentAnalysis",
17 |     "Summarization",
18 |     "Translation",
19 |     "NER",
20 |     "PIIMasking",
21 |     "PredictiveTask",
22 |     "QuestionAnswering",
23 | ]
24 | 
--------------------------------------------------------------------------------
/sieves/tasks/predictive/classification/__init__.py:
--------------------------------------------------------------------------------
1 | """Classification task."""
2 | 
3 | from .core import Classification, FewshotExampleMultiLabel, FewshotExampleSingleLabel
4 | 
5 | __all__ = ["Classification", "FewshotExampleMultiLabel", "FewshotExampleSingleLabel"]
6 | 
--------------------------------------------------------------------------------
/sieves/tasks/predictive/information_extraction/__init__.py:
--------------------------------------------------------------------------------
1 | """Information extraction task."""
2 | 
3 | from .core import FewshotExample, InformationExtraction
4 | 
5 | __all__ = ["InformationExtraction", "FewshotExample"]
6 | 
--------------------------------------------------------------------------------
/sieves/tasks/predictive/ner/__init__.py:
--------------------------------------------------------------------------------
1 | """NER task."""
2 | 
3 | from .core import NER, Entity, FewshotExample, _TaskPromptSignature, _TaskResult
4 | 
5 | __all__ = ["Entity", "NER", "FewshotExample", "_TaskResult", "_TaskPromptSignature"]
6 | 
--------------------------------------------------------------------------------
/sieves/tasks/predictive/pii_masking/__init__.py:
--------------------------------------------------------------------------------
1 | """PII masking."""
2 | 
3 | from .core import FewshotExample, PIIEntity, PIIMasking
4 | 
5 | __all__ = ["FewshotExample", "PIIEntity", "PIIMasking"]
6 | 
--------------------------------------------------------------------------------
/sieves/tasks/predictive/question_answering/__init__.py:
--------------------------------------------------------------------------------
1 | """Question answering task."""
2 | 
3 | from .core import FewshotExample, QuestionAnswering
4 | 
5 | __all__ = ["QuestionAnswering", "FewshotExample"]
6 | 
--------------------------------------------------------------------------------
/sieves/tasks/predictive/question_answering/core.py:
--------------------------------------------------------------------------------
1 | """Question Answering predictive task."""
2 | 
3 | from __future__ import annotations
4 | 
5 | from collections.abc import Iterable, Sequence
6 | from pathlib import Path
7 | from typing import Any, override
8 | 
9 | import datasets
10 | import pydantic
11 | 
12 | from sieves.data import Doc
13 | from sieves.engines import EngineType, dspy_, glix_, langchain_, outlines_ 14 | from sieves.engines.types import GenerationSettings 15 | from sieves.serialization import Config 16 | from sieves.tasks.postprocessing.distillation.types import DistillationFramework 17 | from sieves.tasks.predictive.bridges import GliXBridge 18 | from sieves.tasks.predictive.core import FewshotExample as BaseFewshotExample 19 | from sieves.tasks.predictive.core import PredictiveTask 20 | from sieves.tasks.predictive.question_answering.bridges import ( 21 | DSPyQA, 22 | LangChainQA, 23 | OutlinesQA, 24 | ) 25 | 26 | _TaskModel = dspy_.Model | glix_.Model | langchain_.Model | outlines_.Model 27 | _TaskPromptSignature = glix_.PromptSignature | pydantic.BaseModel | dspy_.PromptSignature 28 | _TaskResult = pydantic.BaseModel | dspy_.Result 29 | _TaskBridge = DSPyQA | GliXBridge | LangChainQA | OutlinesQA 30 | 31 | 32 | class FewshotExample(BaseFewshotExample): 33 | """Few-shot example with questions and answers for a context.""" 34 | 35 | reasoning: str 36 | questions: tuple[str, ...] | list[str] 37 | answers: tuple[str, ...] | list[str] 38 | 39 | @override 40 | @property 41 | def input_fields(self) -> Sequence[str]: 42 | return "text", "questions" 43 | 44 | @override 45 | @property 46 | def target_fields(self) -> Sequence[str]: 47 | return ("answers",) 48 | 49 | 50 | class QuestionAnswering(PredictiveTask[_TaskPromptSignature, _TaskResult, _TaskBridge]): 51 | """Answer questions about a text using structured engines.""" 52 | 53 | def __init__( 54 | self, 55 | questions: list[str], 56 | model: _TaskModel, 57 | task_id: str | None = None, 58 | include_meta: bool = True, 59 | batch_size: int = -1, 60 | prompt_instructions: str | None = None, 61 | fewshot_examples: Sequence[FewshotExample] = (), 62 | generation_settings: GenerationSettings = GenerationSettings(), 63 | ) -> None: 64 | """ 65 | Initialize QuestionAnswering task. 66 | 67 | :param questions: Questions to answer. 68 | :param model: Model to use. 69 | :param task_id: Task ID. 70 | :param include_meta: Whether to include meta information generated by the task. 71 | :param batch_size: Batch size to use for inference. Use -1 to process all documents at once. 72 | :param prompt_instructions: Custom prompt instructions. If None, default instructions are used. 73 | :param fewshot_examples: Few-shot examples. 74 | :param generation_settings: Settings for structured generation. 
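
        A minimal usage sketch (`model` stands in for any supported engine model; the text and question are
        illustrative):

            task = QuestionAnswering(questions=["Who wrote the essay?"], model=model)
            docs = list(task([Doc(text="The essay was written by Ada Lovelace.")]))
            answers = docs[0].results[task.id]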
75 | """ 76 | self._questions = questions 77 | super().__init__( 78 | model=model, 79 | task_id=task_id, 80 | include_meta=include_meta, 81 | batch_size=batch_size, 82 | overwrite=False, 83 | prompt_instructions=prompt_instructions, 84 | fewshot_examples=fewshot_examples, 85 | generation_settings=generation_settings, 86 | ) 87 | self._fewshot_examples: Sequence[FewshotExample] 88 | 89 | @override 90 | def _init_bridge(self, engine_type: EngineType) -> _TaskBridge: 91 | if engine_type == EngineType.glix: 92 | return GliXBridge( 93 | task_id=self._task_id, 94 | prompt_instructions=self._custom_prompt_instructions, 95 | prompt_signature=self._questions, 96 | inference_mode=glix_.InferenceMode.question_answering, 97 | ) 98 | 99 | bridge_types: dict[EngineType, type[_TaskBridge]] = { 100 | EngineType.dspy: DSPyQA, 101 | EngineType.outlines: OutlinesQA, 102 | EngineType.langchain: LangChainQA, 103 | } 104 | 105 | try: 106 | bridge_type = bridge_types[engine_type] 107 | assert not issubclass(bridge_type, GliXBridge) 108 | 109 | return bridge_type( 110 | task_id=self._task_id, 111 | prompt_instructions=self._custom_prompt_instructions, 112 | questions=self._questions, 113 | ) 114 | except KeyError as err: 115 | raise KeyError(f"Engine type {engine_type} is not supported by {self.__class__.__name__}.") from err 116 | 117 | @override 118 | @property 119 | def supports(self) -> set[EngineType]: 120 | return { 121 | EngineType.dspy, 122 | EngineType.glix, 123 | EngineType.langchain, 124 | EngineType.outlines, 125 | } 126 | 127 | @override 128 | @property 129 | def _state(self) -> dict[str, Any]: 130 | return { 131 | **super()._state, 132 | "questions": self._questions, 133 | } 134 | 135 | @override 136 | def to_hf_dataset(self, docs: Iterable[Doc], threshold: float = 0.5) -> datasets.Dataset: 137 | # Define metadata. 138 | features = datasets.Features( 139 | {"text": datasets.Value("string"), "answers": datasets.Sequence(datasets.Value("string"))} 140 | ) 141 | info = datasets.DatasetInfo( 142 | description=f"Question-answering dataset with questions {self._questions}. Generated with sieves " 143 | f"v{Config.get_version()}.", 144 | features=features, 145 | ) 146 | 147 | # Fetch data used for generating dataset. 148 | try: 149 | data = [(doc.text, doc.results[self._task_id]) for doc in docs] 150 | except KeyError as err: 151 | raise KeyError(f"Not all documents have results for this task with ID {self._task_id}") from err 152 | 153 | def generate_data() -> Iterable[dict[str, Any]]: 154 | """Yield results as dicts. 155 | 156 | :return: Results as dicts. 157 | """ 158 | for text, answers in data: 159 | yield {"text": text, "answers": answers} 160 | 161 | # Create dataset. 
162 |         return datasets.Dataset.from_generator(generate_data, features=features, info=info)
163 | 
164 |     @override
165 |     def distill(
166 |         self,
167 |         base_model_id: str,
168 |         framework: DistillationFramework,
169 |         data: datasets.Dataset | Sequence[Doc],
170 |         output_path: Path | str,
171 |         val_frac: float,
172 |         init_kwargs: dict[str, Any] | None = None,
173 |         train_kwargs: dict[str, Any] | None = None,
174 |         seed: int | None = None,
175 |     ) -> None:
176 |         raise NotImplementedError
177 | 
--------------------------------------------------------------------------------
/sieves/tasks/predictive/sentiment_analysis/__init__.py:
--------------------------------------------------------------------------------
1 | """Aspect-based sentiment analysis."""
2 | 
3 | from .core import FewshotExample, SentimentAnalysis
4 | 
5 | __all__ = ["SentimentAnalysis", "FewshotExample"]
6 | 
--------------------------------------------------------------------------------
/sieves/tasks/predictive/summarization/__init__.py:
--------------------------------------------------------------------------------
1 | """Summarization task."""
2 | 
3 | from .core import FewshotExample, Summarization
4 | 
5 | __all__ = ["Summarization", "FewshotExample"]
6 | 
--------------------------------------------------------------------------------
/sieves/tasks/predictive/summarization/core.py:
--------------------------------------------------------------------------------
1 | """Text summarization predictive task."""
2 | 
3 | from __future__ import annotations
4 | 
5 | from collections.abc import Iterable, Sequence
6 | from pathlib import Path
7 | from typing import Any, override
8 | 
9 | import datasets
10 | import pydantic
11 | 
12 | from sieves.data import Doc
13 | from sieves.engines import EngineType, dspy_, glix_, langchain_, outlines_
14 | from sieves.engines.types import GenerationSettings
15 | from sieves.serialization import Config
16 | from sieves.tasks.postprocessing.distillation.types import DistillationFramework
17 | from sieves.tasks.predictive.bridges import GliXBridge
18 | from sieves.tasks.predictive.core import FewshotExample as BaseFewshotExample
19 | from sieves.tasks.predictive.core import PredictiveTask
20 | from sieves.tasks.predictive.summarization.bridges import (
21 |     DSPySummarization,
22 |     LangChainSummarization,
23 |     OutlinesSummarization,
24 | )
25 | 
26 | _TaskModel = dspy_.Model | glix_.Model | langchain_.Model | outlines_.Model
27 | _TaskPromptSignature = pydantic.BaseModel | dspy_.PromptSignature | glix_.PromptSignature
28 | _TaskResult = outlines_.Result | dspy_.Result
29 | _TaskBridge = DSPySummarization | GliXBridge | LangChainSummarization | OutlinesSummarization
30 | 
31 | 
32 | class FewshotExample(BaseFewshotExample):
33 |     """Few-shot example with a target summary."""
34 | 
35 |     n_words: int
36 |     summary: str
37 | 
38 |     @override
39 |     @property
40 |     def target_fields(self) -> Sequence[str]:
41 |         return ("summary",)
42 | 
43 | 
44 | class Summarization(PredictiveTask[_TaskPromptSignature, _TaskResult, _TaskBridge]):
45 |     """Summarize documents to a target length using structured engines."""
46 | 
47 |     def __init__(
48 |         self,
49 |         n_words: int,
50 |         model: _TaskModel,
51 |         task_id: str | None = None,
52 |         include_meta: bool = True,
53 |         batch_size: int = -1,
54 |         overwrite: bool = False,
55 |         prompt_instructions: str | None = None,
56 |         fewshot_examples: Sequence[FewshotExample] = (),
57 |         generation_settings: GenerationSettings = GenerationSettings(),
58 |     ) -> None:
59 |         """Initialize new
Summarization task. 60 | 61 | :param n_words: Maximal number of words (consider this a guideline, not a strict limit). 62 | :param model: Model to use. 63 | :param task_id: Task ID. 64 | :param include_meta: Whether to include meta information generated by the task. 65 | :param batch_size: Batch size to use for inference. Use -1 to process all documents at once. 66 | :param overwrite: Some tasks, e.g. anonymization or translation, output a modified version of the input text. 67 | If True, these tasks overwrite the original document text. If False, the result will just be stored in the 68 | documents' `.results` field. 69 | :param prompt_instructions: Custom prompt instructions. If None, default instructions are used. 70 | :param fewshot_examples: Few-shot examples. 71 | :param generation_settings: Settings for structured generation. 72 | """ 73 | self._n_words = n_words 74 | 75 | super().__init__( 76 | model=model, 77 | task_id=task_id, 78 | include_meta=include_meta, 79 | batch_size=batch_size, 80 | overwrite=overwrite, 81 | prompt_instructions=prompt_instructions, 82 | fewshot_examples=fewshot_examples, 83 | generation_settings=generation_settings, 84 | ) 85 | 86 | @override 87 | def _init_bridge(self, engine_type: EngineType) -> _TaskBridge: 88 | if engine_type == EngineType.glix: 89 | return GliXBridge( 90 | task_id=self._task_id, 91 | prompt_instructions=self._custom_prompt_instructions, 92 | prompt_signature=[], 93 | inference_mode=glix_.InferenceMode.summarization, 94 | ) 95 | 96 | bridge_types: dict[EngineType, type[_TaskBridge]] = { 97 | EngineType.dspy: DSPySummarization, 98 | EngineType.langchain: LangChainSummarization, 99 | EngineType.outlines: OutlinesSummarization, 100 | } 101 | 102 | try: 103 | bridge_type = bridge_types[engine_type] 104 | assert not issubclass(bridge_type, GliXBridge) 105 | 106 | return bridge_type( 107 | task_id=self._task_id, 108 | prompt_instructions=self._custom_prompt_instructions, 109 | overwrite=self._overwrite, 110 | n_words=self._n_words, 111 | ) 112 | except KeyError as err: 113 | raise KeyError(f"Engine type {engine_type} is not supported by {self.__class__.__name__}.") from err 114 | 115 | @property 116 | @override 117 | def supports(self) -> set[EngineType]: 118 | return { 119 | EngineType.dspy, 120 | EngineType.glix, 121 | EngineType.langchain, 122 | EngineType.outlines, 123 | } 124 | 125 | @property 126 | @override 127 | def _state(self) -> dict[str, Any]: 128 | return { 129 | **super()._state, 130 | "n_words": self._n_words, 131 | } 132 | 133 | @override 134 | def to_hf_dataset(self, docs: Iterable[Doc], threshold: float = 0.5) -> datasets.Dataset: 135 | # Define metadata. 136 | features = datasets.Features({"text": datasets.Value("string"), "summary": datasets.Value("string")}) 137 | info = datasets.DatasetInfo( 138 | description=f"Summarization dataset. Generated with sieves v{Config.get_version()}.", 139 | features=features, 140 | ) 141 | 142 | # Fetch data used for generating dataset. 143 | try: 144 | data = [(doc.text, doc.results[self._task_id]) for doc in docs] 145 | except KeyError as err: 146 | raise KeyError(f"Not all documents have results for this task with ID {self._task_id}") from err 147 | 148 | def generate_data() -> Iterable[dict[str, Any]]: 149 | """Yield results as dicts. 150 | 151 | :return: Results as dicts. 152 | """ 153 | for text, summary in data: 154 | yield {"text": text, "summary": summary} 155 | 156 | # Create dataset. 
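# Each yielded record pairs a document's text with the summary stored under this task's ID.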
157 | return datasets.Dataset.from_generator(generate_data, features=features, info=info) 158 | 159 | @override 160 | def distill( 161 | self, 162 | base_model_id: str, 163 | framework: DistillationFramework, 164 | data: datasets.Dataset | Sequence[Doc], 165 | output_path: Path | str, 166 | val_frac: float, 167 | init_kwargs: dict[str, Any] | None = None, 168 | train_kwargs: dict[str, Any] | None = None, 169 | seed: int | None = None, 170 | ) -> None: 171 | raise NotImplementedError 172 | -------------------------------------------------------------------------------- /sieves/tasks/predictive/translation/__init__.py: -------------------------------------------------------------------------------- 1 | """Translation task.""" 2 | 3 | from .core import FewshotExample, Translation, _TaskPromptSignature, _TaskResult 4 | 5 | __all__ = ["Translation", "FewshotExample", "_TaskResult", "_TaskPromptSignature"] 6 | -------------------------------------------------------------------------------- /sieves/tasks/predictive/translation/core.py: -------------------------------------------------------------------------------- 1 | """Translation predictive task.""" 2 | 3 | from __future__ import annotations 4 | 5 | from collections.abc import Iterable, Sequence 6 | from pathlib import Path 7 | from typing import Any, override 8 | 9 | import datasets 10 | import pydantic 11 | 12 | from sieves.data import Doc 13 | from sieves.engines import EngineType, dspy_, langchain_, outlines_ 14 | from sieves.engines.types import GenerationSettings 15 | from sieves.serialization import Config 16 | from sieves.tasks.postprocessing.distillation.types import DistillationFramework 17 | from sieves.tasks.predictive.core import FewshotExample as BaseFewshotExample 18 | from sieves.tasks.predictive.core import PredictiveTask 19 | from sieves.tasks.predictive.translation.bridges import ( 20 | DSPyTranslation, 21 | LangChainTranslation, 22 | OutlinesTranslation, 23 | ) 24 | 25 | _TaskModel = dspy_.Model | langchain_.Model | outlines_.Model 26 | _TaskPromptSignature = pydantic.BaseModel | dspy_.PromptSignature 27 | _TaskResult = outlines_.Result | dspy_.Result 28 | _TaskBridge = DSPyTranslation | LangChainTranslation | OutlinesTranslation 29 | 30 | 31 | class FewshotExample(BaseFewshotExample): 32 | """Few-shot example with a target translation.""" 33 | 34 | to: str 35 | translation: str 36 | 37 | @override 38 | @property 39 | def input_fields(self) -> Sequence[str]: 40 | return "text", "to" 41 | 42 | @override 43 | @property 44 | def target_fields(self) -> Sequence[str]: 45 | return ("translation",) 46 | 47 | 48 | class Translation(PredictiveTask[_TaskPromptSignature, _TaskResult, _TaskBridge]): 49 | """Translate documents into a target language using structured engines.""" 50 | 51 | def __init__( 52 | self, 53 | to: str, 54 | model: _TaskModel, 55 | task_id: str | None = None, 56 | include_meta: bool = True, 57 | batch_size: int = -1, 58 | overwrite: bool = False, 59 | prompt_instructions: str | None = None, 60 | fewshot_examples: Sequence[FewshotExample] = (), 61 | generation_settings: GenerationSettings = GenerationSettings(), 62 | ) -> None: 63 | """ 64 | Initialize Translation task. 65 | 66 | :param to: Language to translate to. 67 | :param model: Model to use. 68 | :param task_id: Task ID. 69 | :param include_meta: Whether to include meta information generated by the task. 70 | :param batch_size: Batch size to use for inference. Use -1 to process all documents at once. 71 | :param overwrite: Some tasks, e.g.
anonymization or translation, output a modified version of the input text. 72 | If True, these tasks overwrite the original document text. If False, the result will just be stored in the 73 | documents' `.results` field. 74 | :param prompt_instructions: Custom prompt instructions. If None, default instructions are used. 75 | :param fewshot_examples: Few-shot examples. 76 | :param generation_settings: Settings for structured generation. 77 | """ 78 | self._to = to 79 | 80 | super().__init__( 81 | model=model, 82 | task_id=task_id, 83 | include_meta=include_meta, 84 | batch_size=batch_size, 85 | overwrite=overwrite, 86 | prompt_instructions=prompt_instructions, 87 | fewshot_examples=fewshot_examples, 88 | generation_settings=generation_settings, 89 | ) 90 | 91 | @override 92 | def _init_bridge(self, engine_type: EngineType) -> _TaskBridge: 93 | bridge_types: dict[EngineType, type[_TaskBridge]] = { 94 | EngineType.dspy: DSPyTranslation, 95 | EngineType.langchain: LangChainTranslation, 96 | EngineType.outlines: OutlinesTranslation, 97 | } 98 | 99 | try: 100 | bridge = bridge_types[engine_type]( 101 | task_id=self._task_id, 102 | prompt_instructions=self._custom_prompt_instructions, 103 | overwrite=self._overwrite, 104 | language=self._to, 105 | ) 106 | except KeyError as err: 107 | raise KeyError(f"Engine type {engine_type} is not supported by {self.__class__.__name__}.") from err 108 | 109 | return bridge 110 | 111 | @override 112 | @property 113 | def supports(self) -> set[EngineType]: 114 | return {EngineType.dspy, EngineType.langchain, EngineType.outlines} 115 | 116 | @override 117 | @property 118 | def _state(self) -> dict[str, Any]: 119 | return { 120 | **super()._state, 121 | "to": self._to, 122 | } 123 | 124 | @override 125 | def to_hf_dataset(self, docs: Iterable[Doc], threshold: float = 0.5) -> datasets.Dataset: 126 | # Define metadata. 127 | features = datasets.Features({"text": datasets.Value("string"), "translation": datasets.Value("string")}) 128 | info = datasets.DatasetInfo( 129 | description=f"Translation dataset with target language {self._to}. " 130 | f"Generated with sieves v{Config.get_version()}.", 131 | features=features, 132 | ) 133 | 134 | # Fetch data used for generating dataset. 135 | try: 136 | data = [(doc.text, doc.results[self._task_id]) for doc in docs] 137 | except KeyError as err: 138 | raise KeyError(f"Not all documents have results for this task with ID {self._task_id}") from err 139 | 140 | def generate_data() -> Iterable[dict[str, Any]]: 141 | """Yield results as dicts. 142 | 143 | :return: Results as dicts. 144 | """ 145 | for text, translation in data: 146 | yield {"text": text, "translation": translation} 147 | 148 | # Create dataset.
149 | return datasets.Dataset.from_generator(generate_data, features=features, info=info) 150 | 151 | @override 152 | def distill( 153 | self, 154 | base_model_id: str, 155 | framework: DistillationFramework, 156 | data: datasets.Dataset | Sequence[Doc], 157 | output_path: Path | str, 158 | val_frac: float, 159 | init_kwargs: dict[str, Any] | None = None, 160 | train_kwargs: dict[str, Any] | None = None, 161 | seed: int | None = None, 162 | ) -> None: 163 | raise NotImplementedError 164 | -------------------------------------------------------------------------------- /sieves/tasks/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | """Preprocessing tasks.""" 2 | 3 | from .chunking import Chunking 4 | from .ingestion import Ingestion 5 | 6 | __all__ = ["Chunking", "Ingestion"] 7 | -------------------------------------------------------------------------------- /sieves/tasks/preprocessing/chunking/__init__.py: -------------------------------------------------------------------------------- 1 | from .chonkie_ import Chonkie 2 | from .core import Chunking 3 | from .naive import NaiveChunker 4 | 5 | __all__ = ["Chunking", "Chonkie", "NaiveChunker"] 6 | -------------------------------------------------------------------------------- /sieves/tasks/preprocessing/chunking/chonkie_.py: -------------------------------------------------------------------------------- 1 | """Allows chunking of documents into segments.""" 2 | 3 | import itertools 4 | import sys 5 | from collections.abc import Iterable 6 | from typing import Any 7 | 8 | import chonkie 9 | 10 | from sieves.data.doc import Doc 11 | from sieves.tasks.core import Task 12 | 13 | 14 | class Chonkie(Task): 15 | """Chunker wrapping the chonkie library.""" 16 | 17 | def __init__( 18 | self, 19 | chunker: chonkie.BaseChunker, 20 | task_id: str | None = None, 21 | include_meta: bool = False, 22 | batch_size: int = -1, 23 | ): 24 | """Initialize chunker. 25 | :param chunker: Chunker instance from the chonkie library that performs the actual splitting. 26 | :param task_id: Task ID. 27 | :param include_meta: Whether to include meta information generated by the task. 28 | :param batch_size: Batch size to use for processing. Use -1 to process all documents at once. 29 | """ 30 | super().__init__(task_id=task_id, include_meta=include_meta, batch_size=batch_size) 31 | self._chunker = chunker 32 | 33 | def __call__(self, docs: Iterable[Doc]) -> Iterable[Doc]: 34 | """Split documents into chunks. 35 | 36 | :param docs: Documents to split. 37 | :return: Split documents.
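Chunking is performed batch-wise via the wrapped chunker's `chunk_batch`; the resulting chunk texts are written to each document's `.chunks`.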
38 | """ 39 | batch_size = self._batch_size if self._batch_size > 0 else sys.maxsize 40 | while docs_batch := [doc for doc in itertools.islice(docs, batch_size)]: 41 | if len(docs_batch) == 0: 42 | break 43 | 44 | chunks = self._chunker.chunk_batch([doc.text for doc in docs_batch], show_progress_bar=False) 45 | assert len(chunks) == len(docs_batch) 46 | 47 | for doc, doc_chunks in zip(docs_batch, chunks): 48 | if self._include_meta: 49 | doc.meta |= {self.id: doc_chunks} 50 | doc.chunks = [chunk.text for chunk in doc_chunks] 51 | 52 | yield doc 53 | 54 | @property 55 | def _state(self) -> dict[str, Any]: 56 | return { 57 | **super()._state, 58 | "chunker": self._chunker, 59 | } 60 | -------------------------------------------------------------------------------- /sieves/tasks/preprocessing/chunking/core.py: -------------------------------------------------------------------------------- 1 | """Chunking task.""" 2 | 3 | from __future__ import annotations 4 | 5 | import itertools 6 | from collections.abc import Iterable 7 | from typing import Any 8 | 9 | import chonkie 10 | 11 | from sieves.data.doc import Doc 12 | from sieves.serialization import Config 13 | from sieves.tasks.core import Task 14 | from sieves.tasks.preprocessing import chunking 15 | from sieves.tasks.preprocessing.chunking import chonkie_, naive 16 | 17 | _ChunkerArgType = chonkie.BaseChunker | int 18 | _ChunkerType = chonkie_.Chonkie | naive.NaiveChunker 19 | 20 | 21 | class Chunking(Task): 22 | """Task for chunking documents using different strategies. 23 | 24 | This task acts as a wrapper around specific chunker implementations, 25 | allowing for flexible configuration based on the provided chunker object or interval. 26 | """ 27 | 28 | def __init__( 29 | self, 30 | chunker: _ChunkerArgType, 31 | task_id: str | None = None, 32 | include_meta: bool = False, 33 | batch_size: int = -1, 34 | ): 35 | """Initialize the Chunking task. 36 | 37 | :param chunker: The chunker instance (chonkie.BaseChunker) or the interval (int) for NaiveChunker. 38 | :param task_id: Task ID. 39 | :param include_meta: Whether to include meta information generated by the task. 40 | :param batch_size: Batch size to use for processing. Use -1 to process all documents at once. 41 | """ 42 | super().__init__(task_id=task_id, include_meta=include_meta, batch_size=batch_size) 43 | self._chunker_arg = chunker 44 | self._task = self._init_chunker_task() 45 | 46 | def _init_chunker_task(self) -> _ChunkerType: 47 | """Initialize the specific chunker task based on the type of _chunker_arg. 48 | 49 | :return: Initialized chunker task instance. 50 | :raises TypeError: If the type of _chunker_arg is not supported. 51 | """ 52 | chunker_task: _ChunkerType 53 | 54 | match self._chunker_arg: 55 | case chunker if isinstance(chunker, chonkie.BaseChunker): 56 | chunker_task = chunking.chonkie_.Chonkie( 57 | chunker=chunker, 58 | task_id=self.id, 59 | include_meta=self._include_meta, 60 | batch_size=self._batch_size, 61 | ) 62 | case interval if isinstance(interval, int): 63 | chunker_task = chunking.naive.NaiveChunker( 64 | interval=interval, 65 | task_id=self.id, 66 | include_meta=self._include_meta, 67 | batch_size=self._batch_size, 68 | ) 69 | case _: 70 | raise TypeError( 71 | f"Unsupported type for 'chunker' argument: {type(self._chunker_arg)}. " 72 | f"Expected chonkie.BaseChunker or int." 73 | ) 74 | 75 | return chunker_task 76 | 77 | def __call__(self, docs: Iterable[Doc]) -> Iterable[Doc]: 78 | """Process documents by chunking their text.
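Validates that all documents have text, then delegates to the chunker task selected at initialization.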
79 | 80 | :param docs: Documents to process. 81 | :return: Processed documents with chunks added. 82 | """ 83 | docs_iters = itertools.tee(docs, 2) 84 | assert all(doc.text for doc in docs_iters[0]), ValueError("Documents have to have a value for .text.") 85 | yield from self._task(docs_iters[1]) 86 | 87 | @property 88 | def _state(self) -> dict[str, Any]: 89 | """Return attributes to serialize. 90 | 91 | :return: Dict of attributes to serialize. 92 | """ 93 | return { 94 | **super()._state, 95 | "chunker": self._chunker_arg, 96 | } 97 | 98 | @classmethod 99 | def deserialize(cls, config: Config, **kwargs: dict[str, Any]) -> Chunking: 100 | """Generate Chunking instance from config. 101 | 102 | :param config: Config to generate instance from. 103 | :param kwargs: Values to inject into loaded config. 104 | :return: Deserialized Chunking instance. 105 | """ 106 | return cls(**config.to_init_dict(cls, **kwargs)) 107 | -------------------------------------------------------------------------------- /sieves/tasks/preprocessing/chunking/naive.py: -------------------------------------------------------------------------------- 1 | """Allows chunking of documents into segments.""" 2 | 3 | import itertools 4 | import re 5 | import sys 6 | from collections.abc import Iterable 7 | from typing import Any 8 | 9 | from sieves.data.doc import Doc 10 | from sieves.tasks.core import Task 11 | 12 | 13 | class NaiveChunker(Task): 14 | """Chunks by sentence counts. Only for test purposes.""" 15 | 16 | def __init__( 17 | self, 18 | interval: int, 19 | task_id: str | None = None, 20 | include_meta: bool = False, 21 | batch_size: int = -1, 22 | ): 23 | """Initialize chunker. 24 | 25 | :param interval: Number of sentences per chunk. 26 | :param task_id: Task ID. 27 | :param include_meta: Whether to include meta information generated by the task. 28 | :param batch_size: Batch size to use for processing. Use -1 to process all documents at once. 29 | """ 30 | super().__init__(task_id=task_id, include_meta=include_meta, batch_size=batch_size) 31 | self._interval = interval 32 | 33 | def __call__(self, docs: Iterable[Doc]) -> Iterable[Doc]: 34 | """Split documents into chunks. 35 | 36 | :param docs: Documents to split. 37 | :return: Split documents.
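Sentences are detected naively by splitting on '.', '!' and '?'; every `interval` sentences form one chunk.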
38 | """ 39 | batch_size = self._batch_size if self._batch_size > 0 else sys.maxsize 40 | while docs_batch := [doc for doc in itertools.islice(docs, batch_size)]: 41 | if len(docs_batch) == 0: 42 | break 43 | 44 | for doc in docs_batch: 45 | assert doc.text 46 | sentences = [sent for sent in re.split("[?!.]", doc.text) if len(sent.strip())] 47 | doc.chunks = [ 48 | ".".join(sentences[i : i + self._interval]) for i in range(0, len(sentences), self._interval) 49 | ] 50 | 51 | yield doc 52 | 53 | @property 54 | def _state(self) -> dict[str, Any]: 55 | return { 56 | **super()._state, 57 | "interval": self._interval, 58 | } 59 | -------------------------------------------------------------------------------- /sieves/tasks/preprocessing/ingestion/__init__.py: -------------------------------------------------------------------------------- 1 | """Ingestion task implementation.""" 2 | 3 | from .core import Ingestion 4 | from .docling_ import Docling 5 | from .marker_ import Marker 6 | from .unstructured_ import Unstructured 7 | 8 | __all__ = ["Docling", "Marker", "Ingestion", "Unstructured"] 9 | -------------------------------------------------------------------------------- /sieves/tasks/preprocessing/ingestion/core.py: -------------------------------------------------------------------------------- 1 | """Ingestion task implementation.""" 2 | 3 | from __future__ import annotations 4 | 5 | from collections.abc import Iterable 6 | from typing import Any 7 | 8 | import docling 9 | import docling.document_converter 10 | import marker 11 | from marker.converters.pdf import PdfConverter 12 | from marker.converters.table import TableConverter 13 | 14 | from sieves.data.doc import Doc 15 | from sieves.serialization import Config 16 | from sieves.tasks.core import Task 17 | from sieves.tasks.preprocessing.ingestion import docling_, marker_ 18 | 19 | _ConverterType = docling.document_converter.DocumentConverter | PdfConverter | TableConverter 20 | 21 | 22 | class Ingestion(Task): 23 | """Base class for Ingestion tasks that extract text from documents. 24 | 25 | This unified interface allows different Ingestion converters to be used interchangeably. 26 | """ 27 | 28 | def __init__( 29 | self, 30 | converter: _ConverterType = docling.document_converter.DocumentConverter(), 31 | export_format: str = "markdown", 32 | task_id: str | None = None, 33 | include_meta: bool = False, 34 | batch_size: int = -1, 35 | **kwargs: Any, 36 | ): 37 | """Initialize the Ingestion task. 38 | 39 | :param converter: The Ingestion converter to use. 40 | :param task_id: Task ID. 41 | :param include_meta: Whether to include meta information generated by the task. 42 | :param batch_size: Batch size to use for processing. Use -1 to process all documents at once. 43 | :param kwargs: Additional arguments for specific Ingestion implementations. 44 | """ 45 | super().__init__(task_id=task_id, include_meta=include_meta, batch_size=batch_size) 46 | self._export_format = export_format 47 | self._converter = converter 48 | self._kwargs = kwargs 49 | self._task = self._init_ingestion_task() 50 | 51 | def _init_ingestion_task(self) -> Task: 52 | """Initialize the bridge for the specific Ingestion implementation. 53 | 54 | :return: Ingestion bridge implementation. 
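:raises ValueError: If the converter type is not supported.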
55 | """ 56 | converter_type = type(self._converter) 57 | ingestion_task: Task 58 | match converter_type: 59 | case converter if issubclass( 60 | converter, (marker.converters.pdf.PdfConverter | marker.converters.table.TableConverter) 61 | ): 62 | ingestion_task = marker_.Marker( 63 | converter=self._converter, 64 | export_format=self._export_format, 65 | task_id=self.id, 66 | include_meta=self._include_meta, 67 | batch_size=self._batch_size, 68 | **self._kwargs, 69 | ) 70 | case docling.document_converter.DocumentConverter: 71 | ingestion_task = docling_.Docling( 72 | converter=self._converter, 73 | export_format=self._export_format, 74 | task_id=self.id, 75 | include_meta=self._include_meta, 76 | batch_size=self._batch_size, 77 | ) 78 | case _: 79 | raise ValueError( 80 | f"converter type {self._converter} is not supported. Please check the documentation " 81 | f"and ensure you're providing a supported converter type." 82 | ) 83 | assert isinstance(ingestion_task, Task) 84 | return ingestion_task 85 | 86 | def __call__(self, docs: Iterable[Doc]) -> Iterable[Doc]: 87 | """Process documents with Ingestion to extract text. 88 | 89 | :param docs: Documents to process. 90 | :return: Processed documents with extracted text. 91 | """ 92 | docs = list(docs) 93 | assert all(doc.uri for doc in docs), ValueError("Documents have to have a value for .uri.") 94 | result = self._task(docs) 95 | 96 | yield from result 97 | 98 | @property 99 | def _state(self) -> dict[str, Any]: 100 | """Returns attributes to serialize. 101 | 102 | :return: Dict of attributes to serialize. 103 | """ 104 | return { 105 | **super()._state, 106 | "converter": self._converter, 107 | "export_format": self._export_format, 108 | **self._kwargs, 109 | } 110 | 111 | @classmethod 112 | def deserialize(cls, config: Config, **kwargs: dict[str, Any]) -> Ingestion: 113 | """ 114 | Generate Ingestion instance from config. 115 | 116 | :param config: Config to generate instance from. 117 | :param kwargs: Values to inject into loaded config. 118 | :return: Deserialized Ingestion instance. 119 | """ 120 | return cls(**config.to_init_dict(cls, **kwargs)) 121 | -------------------------------------------------------------------------------- /sieves/tasks/preprocessing/ingestion/docling_.py: -------------------------------------------------------------------------------- 1 | """Wrapper for `Docling` for the conversion of complex files into markdown.""" 2 | 3 | import warnings 4 | from collections.abc import Iterable 5 | from typing import Any 6 | 7 | import docling.datamodel.document 8 | import docling.document_converter 9 | from loguru import logger 10 | 11 | from sieves.data.doc import Doc 12 | from sieves.tasks.core import Task 13 | 14 | 15 | class Docling(Task): 16 | """Parser wrapping the docling library to convert files into documents.""" 17 | 18 | def __init__( 19 | self, 20 | converter: docling.document_converter.DocumentConverter | None = None, 21 | export_format: str = "markdown", 22 | task_id: str | None = None, 23 | include_meta: bool = False, 24 | batch_size: int = -1, 25 | ): 26 | """Initialize the docling parser. 27 | 28 | :param converter: Docling parser instance. 29 | :param task_id: Task ID. 30 | :param include_meta: Whether to include meta information generated by the task. 31 | :param batch_size: Batch size to use for processing. Use -1 to process all documents at once. 
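:param export_format: Format to export the parsed document in ("markdown", "html", or "json").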
32 | """ 33 | super().__init__(task_id=task_id, include_meta=include_meta, batch_size=batch_size) 34 | self._converter = converter if converter else docling.document_converter.DocumentConverter() 35 | self._export_format = export_format 36 | 37 | def __call__(self, docs: Iterable[Doc]) -> Iterable[Doc]: 38 | """Parse resources using docling. 39 | 40 | :param docs: Resources to process. 41 | :return: Parsed documents 42 | """ 43 | docs = list(docs) 44 | 45 | # Validate docs. 46 | have_text = False 47 | for doc in docs: 48 | assert doc.uri, ValueError("Documents have to have a value for .uri.") 49 | if doc.text: 50 | have_text = True 51 | if have_text: 52 | warnings.warn(f"Task {self._task_id} is about to overwrite existing .text values.") 53 | 54 | parsed_resources: list[docling.datamodel.document.ConversionResult] = list( 55 | self._converter.convert_all([resource.uri for resource in docs]) 56 | ) 57 | assert len(parsed_resources) == len(docs) 58 | 59 | for doc, parsed_resource in zip(docs, parsed_resources): 60 | try: 61 | if self._include_meta: 62 | doc.meta |= {self.id: parsed_resource} 63 | if self._export_format == "markdown": 64 | doc.text = parsed_resource.document.export_to_markdown() 65 | elif self._export_format == "html": 66 | doc.text = parsed_resource.document.export_to_html() 67 | elif self._export_format == "json": 68 | doc.text = parsed_resource.document.export_to_dict() 69 | except Exception as e: 70 | logger.error(f"Failed to parse file {doc.uri}: {str(e)}") 71 | continue 72 | 73 | return docs 74 | 75 | @property 76 | def _state(self) -> dict[str, Any]: 77 | return { 78 | **super()._state, 79 | "converter": self._converter, 80 | "export_format": self._export_format, 81 | } 82 | -------------------------------------------------------------------------------- /sieves/tasks/preprocessing/ingestion/marker_.py: -------------------------------------------------------------------------------- 1 | """Marker task for converting PDF documents to text.""" 2 | 3 | from collections.abc import Iterable 4 | from pathlib import Path 5 | from typing import Any 6 | 7 | from marker.converters.pdf import PdfConverter 8 | from marker.converters.table import TableConverter 9 | from marker.models import create_model_dict 10 | from marker.output import text_from_rendered 11 | 12 | from sieves.data import Doc 13 | from sieves.tasks.core import Task 14 | 15 | 16 | class Marker(Task): 17 | """Marker task for converting PDF documents to text.""" 18 | 19 | def __init__( 20 | self, 21 | converter: PdfConverter | TableConverter | None = None, 22 | export_format: str = "markdown", 23 | task_id: str | None = None, 24 | include_meta: bool = False, 25 | batch_size: int = -1, 26 | extract_images: bool = False, 27 | ): 28 | """Initialize the Marker task. 29 | 30 | :param converter: Custom PdfConverter or TableConverter instance. If None, a default one will be created. 31 | :param export_format: Format to export the document in ("markdown", "html", or "json"). 32 | :param task_id: Task ID. 33 | :param include_meta: Whether to include meta information generated by the task. 34 | :param batch_size: Batch size to use for processing. Use -1 to process all documents at once. 35 | :param extract_images: Whether to extract images from the PDF. 
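:raises ValueError: If the converter type or the export format is invalid.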
36 | """ 37 | super().__init__(task_id=task_id, include_meta=include_meta, batch_size=batch_size) 38 | 39 | self._export_format = export_format 40 | self._converter = self._setup_converter(converter, self._export_format) 41 | self._extract_images = extract_images 42 | 43 | def _setup_converter( 44 | self, converter: PdfConverter | TableConverter | None, export_format: str 45 | ) -> PdfConverter | TableConverter: 46 | """Set up the converter with the specified renderer. 47 | 48 | :param converter: Custom converter instance or None. 49 | :param export_format: Format to export the document in. 50 | :return: Configured converter instance. 51 | """ 52 | renderer: str = self._get_renderer(export_format) 53 | if converter is None: 54 | return PdfConverter(artifact_dict=create_model_dict(), renderer=renderer) 55 | 56 | # If a converter is provided, use its type but update the renderer 57 | if isinstance(converter, TableConverter): 58 | return TableConverter(artifact_dict=create_model_dict(), renderer=renderer) 59 | elif isinstance(converter, PdfConverter): 60 | return PdfConverter(artifact_dict=create_model_dict(), renderer=renderer) 61 | else: 62 | raise ValueError(f"Invalid converter type: {type(converter)}") 63 | 64 | def _get_renderer(self, export_format: str) -> str: 65 | """Get the renderer string based on the export format. 66 | 67 | :param export_format: Format to export the document in. 68 | :return: The renderer string. 69 | :raises ValueError: If the export format is invalid. 70 | """ 71 | if export_format == "markdown": 72 | return "marker.renderers.markdown.MarkdownRenderer" 73 | elif export_format == "html": 74 | return "marker.renderers.html.HTMLRenderer" 75 | elif export_format == "json": 76 | return "marker.renderers.json.JSONRenderer" 77 | else: 78 | raise ValueError(f"Invalid export format: {export_format}") 79 | 80 | def __call__(self, docs: Iterable[Doc]) -> Iterable[Doc]: 81 | """Process documents using Marker. 82 | 83 | :param docs: Documents to process. 84 | :return: Processed documents. 85 | """ 86 | docs = list(docs) 87 | 88 | for doc in docs: 89 | # Convert URI to string if it's a Path 90 | uri = str(doc.uri) if isinstance(doc.uri, Path) else doc.uri 91 | # Process the document 92 | rendered = self._converter(uri) 93 | 94 | # Extract text and optionally images 95 | text, _, images = text_from_rendered(rendered) 96 | if self._extract_images: 97 | doc.images = images 98 | 99 | # Update document text 100 | doc.text = text 101 | 102 | for doc in docs: 103 | yield doc 104 | 105 | @property 106 | def _state(self) -> dict[str, Any]: 107 | """Get state for serialization. 108 | 109 | :return: State dictionary. 
110 | """ 111 | return { 112 | **super()._state, 113 | "converter": self._converter, 114 | "export_format": self._export_format, 115 | "extract_images": self._extract_images, 116 | } 117 | -------------------------------------------------------------------------------- /sieves/tasks/preprocessing/ingestion/unstructured_.py: -------------------------------------------------------------------------------- 1 | """File preprocessing for converting raw files into documents.""" 2 | 3 | import warnings 4 | from collections.abc import Callable, Iterable 5 | from typing import Any 6 | 7 | import nltk 8 | import unstructured 9 | import unstructured.documents.elements 10 | import unstructured.partition.auto 11 | 12 | from sieves.data.doc import Doc 13 | from sieves.tasks.core import Task 14 | 15 | PartitionType = Callable[..., list[unstructured.documents.elements.Text]] 16 | CleanerType = Callable[[str], str] 17 | 18 | 19 | class Unstructured(Task): 20 | """Parser wrapping the unstructured library to convert files into documents.""" 21 | 22 | def __init__( 23 | self, 24 | partition: PartitionType = unstructured.partition.auto.partition, 25 | cleaners: tuple[CleanerType, ...] = (), 26 | task_id: str | None = None, 27 | include_meta: bool = False, 28 | batch_size: int = -1, 29 | **kwargs: dict[str, Any], 30 | ): 31 | """Initialize the unstructured parser. 32 | 33 | :param partition: Function to use for partitioning. 34 | :param cleaners: Cleaning functions to apply. 35 | :param task_id: Task ID. 36 | :param include_meta: Whether to include meta information generated by the task. 37 | :param batch_size: Batch size to use for processing. Use -1 to process all documents at once. 38 | :param kwargs: Kwargs to be supplied to partitioning call. 39 | """ 40 | super().__init__(task_id=task_id, include_meta=include_meta, batch_size=batch_size) 41 | self._partition = partition 42 | self._partition_args = kwargs or {} 43 | self._cleaners = cleaners 44 | 45 | Unstructured._require() 46 | 47 | @staticmethod 48 | def _require() -> None: 49 | """Download all necessary resources that have to be installed from within Python.""" 50 | # Some nltk resources seem necessary for basic functionality. 51 | for nltk_resource in ("punkt_tab", "averaged_perceptron_tagger_eng"): 52 | # Don't install if already available. 53 | try: 54 | nltk.data.find(nltk_resource) 55 | except LookupError: 56 | nltk.download(nltk_resource) 57 | 58 | def __call__(self, docs: Iterable[Doc]) -> Iterable[Doc]: 59 | """Parse resources using unstructured. 60 | 61 | :param docs: Resources to process. 62 | :return: Parsed documents. 63 | """ 64 | docs = list(docs) 65 | 66 | # Validate docs. 67 | have_text = False 68 | for doc in docs: 69 | assert doc.uri, ValueError("Documents have to have a value for .uri.") 70 | if doc.text: 71 | have_text = True 72 | if have_text: 73 | warnings.warn(f"Task {self._task_id} is about to overwrite existing .text values.") 74 | 75 | # Determine whether the partitioning call also performs chunking. 76 | does_chunking = "chunking_strategy" in self._partition_args 77 | 78 | for doc in docs: 79 | try: 80 | # Parse and process document. 81 | parsed_resources: list[unstructured.documents.elements.Text] = self._partition( 82 | doc.uri, **self._partition_args 83 | ) 84 | 85 | # Apply specified cleaners. 86 | for cleaner in self._cleaners: 87 | for pr in parsed_resources: 88 | pr.apply(cleaner) 89 | 90 | # Integrate into Doc instances. 91 | if self._include_meta: 92 | doc.meta |= {self.id: parsed_resources} 93 | 94 | # Use chunks.
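# With a chunking strategy set, unstructured returns one element per chunk rather than per raw partition.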
95 | if does_chunking: 96 | doc.chunks = [pr.text for pr in parsed_resources] 97 | 98 | # Merge texts from all elements into single string for the entire document. 99 | doc.text = "\n".join(resource.text for resource in parsed_resources) 100 | 101 | except FileNotFoundError as err: 102 | raise FileNotFoundError( 103 | f"File at {doc.uri} not found. Ensure that this is a local file path - unstructured doesn't support" 104 | f" loading files via network URIs." 105 | ) from err 106 | 107 | return docs 108 | 109 | @property 110 | def _state(self) -> dict[str, Any]: 111 | return { 112 | **super()._state, 113 | "partition": self._partition, 114 | "cleaners": self._cleaners, 115 | **self._partition_args, 116 | } 117 | -------------------------------------------------------------------------------- /sieves/tasks/types.py: -------------------------------------------------------------------------------- 1 | """Common types.""" 2 | 3 | from sieves.engines.engine_import import ( 4 | dspy_, 5 | glix_, 6 | huggingface_, 7 | langchain_, 8 | outlines_, 9 | ) 10 | 11 | Model = dspy_.Model | glix_.Model | huggingface_.Model | langchain_.Model | outlines_.Model 12 | -------------------------------------------------------------------------------- /sieves/tests/assets/1204.0162v2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MantisAI/sieves/3ce96055c1343849909650265de2b5e7d98745b8/sieves/tests/assets/1204.0162v2.pdf -------------------------------------------------------------------------------- /sieves/tests/assets/dummy.txt: -------------------------------------------------------------------------------- 1 | This is a dummy text file. 2 | This is a dummy text file. 3 | This is a dummy text file. 4 | This is a dummy text file. 5 | This is a dummy text file. 6 | This is a dummy text file. 7 | This is a dummy text file. 8 | This is a dummy text file. 9 | This is a dummy text file. 10 | This is a dummy text file. 11 | This is a dummy text file. 12 | This is a dummy text file. 13 | This is a dummy text file. 14 | This is a dummy text file. 15 | This is a dummy text file. 16 | This is a dummy text file. 17 | This is a dummy text file. 18 | This is a dummy text file. 19 | This is a dummy text file. 20 | This is a dummy text file. 21 | This is a dummy text file. 22 | This is a dummy text file. 23 | This is a dummy text file. 24 | This is a dummy text file. 25 | This is a dummy text file. 26 | This is a dummy text file. 27 | This is a dummy text file. 28 | This is a dummy text file. 29 | This is a dummy text file. 30 | This is a dummy text file. 31 | This is a dummy text file. 32 | This is a dummy text file. 33 | This is a dummy text file. 34 | This is a dummy text file. 35 | This is a dummy text file. 36 | This is a dummy text file. 37 | This is a dummy text file. 38 | This is a dummy text file. 39 | This is a dummy text file. 40 | This is a dummy text file. 41 | This is a dummy text file. 42 | This is a dummy text file. 43 | This is a dummy text file. 44 | This is a dummy text file. 45 | This is a dummy text file. 46 | This is a dummy text file. 47 | This is a dummy text file. 48 | This is a dummy text file. 49 | This is a dummy text file. 50 | This is a dummy text file. 51 | This is a dummy text file. 52 | This is a dummy text file. 53 | This is a dummy text file. 54 | This is a dummy text file. 55 | This is a dummy text file. 56 | This is a dummy text file. 57 | This is a dummy text file. 58 | This is a dummy text file. 
59 | This is a dummy text file. 60 | This is a dummy text file. 61 | This is a dummy text file. 62 | This is a dummy text file. 63 | This is a dummy text file. 64 | This is a dummy text file. 65 | This is a dummy text file. 66 | This is a dummy text file. 67 | This is a dummy text file. 68 | This is a dummy text file. 69 | This is a dummy text file. 70 | This is a dummy text file. 71 | This is a dummy text file. 72 | This is a dummy text file. 73 | This is a dummy text file. 74 | This is a dummy text file. 75 | This is a dummy text file. 76 | This is a dummy text file. 77 | This is a dummy text file. 78 | This is a dummy text file. 79 | This is a dummy text file. 80 | This is a dummy text file. -------------------------------------------------------------------------------- /sieves/tests/tasks/predictive/test_information_extraction.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | import pydantic 3 | import pytest 4 | 5 | from sieves import Doc, Pipeline, tasks 6 | from sieves.engines import EngineType 7 | from sieves.serialization import Config 8 | from sieves.tasks import PredictiveTask 9 | from sieves.tasks.predictive import information_extraction 10 | 11 | 12 | class Person(pydantic.BaseModel, frozen=True): 13 | name: str 14 | age: pydantic.PositiveInt 15 | 16 | 17 | @pytest.mark.parametrize( 18 | "batch_runtime", 19 | ( 20 | EngineType.dspy, 21 | EngineType.langchain, 22 | EngineType.outlines, 23 | ), 24 | indirect=["batch_runtime"], 25 | ) 26 | @pytest.mark.parametrize("fewshot", [True, False]) 27 | def test_run(information_extraction_docs, batch_runtime, fewshot) -> None: 28 | fewshot_examples = [ 29 | information_extraction.FewshotExample( 30 | text="Ada Lovelace lived to 47 years old. Zeno of Citium died with 72 years.", 31 | reasoning="There is mention of two people in this text, including lifespans. I will extract those.", 32 | entities=[Person(name="Ada Lovelace", age=47), Person(name="Zeno of Citium", age=72)], 33 | ), 34 | information_extraction.FewshotExample( 35 | text="Alan Watts passed away at the age of 58 years. Alan Watts was 58 years old at the time of his death.", 36 | reasoning="There is mention of one person in this text, including lifespan.
I will extract this person.", 37 | entities=[Person(name="Alan Watts", age=58)], 38 | ), 39 | ] 40 | 41 | fewshot_args = {"fewshot_examples": fewshot_examples} if fewshot else {} 42 | pipe = Pipeline( 43 | [ 44 | tasks.predictive.InformationExtraction( 45 | entity_type=Person, 46 | model=batch_runtime.model, 47 | generation_settings=batch_runtime.generation_settings, 48 | batch_size=batch_runtime.batch_size, 49 | **fewshot_args), 50 | ] 51 | ) 52 | docs = list(pipe(information_extraction_docs)) 53 | 54 | assert len(docs) == 2 55 | for doc in docs: 56 | assert doc.text 57 | assert "InformationExtraction" in doc.results 58 | 59 | with pytest.raises(NotImplementedError): 60 | pipe["InformationExtraction"].distill(None, None, None, None, None, None, None, None) 61 | 62 | 63 | @pytest.mark.parametrize("batch_runtime", [EngineType.dspy], indirect=["batch_runtime"]) 64 | def test_to_hf_dataset(information_extraction_docs, batch_runtime) -> None: 65 | task = tasks.predictive.InformationExtraction( 66 | entity_type=Person, model=batch_runtime.model, generation_settings=batch_runtime.generation_settings, batch_size=batch_runtime.batch_size 67 | ) 68 | pipe = Pipeline(task) 69 | docs = pipe(information_extraction_docs) 70 | 71 | assert isinstance(task, PredictiveTask) 72 | dataset = task.to_hf_dataset(docs) 73 | assert all([key in dataset.features for key in ("text", "entities")]) 74 | assert len(dataset) == 2 75 | records = list(dataset) 76 | assert records[0]["text"] == "Mahatma Ghandi lived to 79 years old. Bugs Bunny is at least 85 years old." 77 | assert records[1]["text"] == "Marie Curie passed away at the age of 67 years. Marie Curie was 67 years old." 78 | for record in records: 79 | assert isinstance(record["entities"], dict) 80 | assert isinstance(record["entities"]["age"], list) 81 | assert isinstance(record["entities"]["name"], list) 82 | 83 | with pytest.raises(KeyError): 84 | task.to_hf_dataset([Doc(text="This is a dummy text.")]) 85 | 86 | 87 | @pytest.mark.parametrize("batch_runtime", [EngineType.outlines], indirect=["batch_runtime"]) 88 | def test_serialization(information_extraction_docs, batch_runtime) -> None: 89 | pipe = Pipeline( 90 | tasks.predictive.InformationExtraction( 91 | entity_type=Person, model=batch_runtime.model, generation_settings=batch_runtime.generation_settings, batch_size=batch_runtime.batch_size, 92 | ) 93 | ) 94 | 95 | config = pipe.serialize() 96 | assert config.model_dump() == {'cls_name': 'sieves.pipeline.core.Pipeline', 97 | 'tasks': {'is_placeholder': False, 98 | 'value': [{'cls_name': 'sieves.tasks.predictive.information_extraction.core.InformationExtraction', 99 | 'entity_type': {'is_placeholder': True, 100 | 'value': 'pydantic._internal._model_construction.ModelMetaclass'}, 101 | 'fewshot_examples': {'is_placeholder': False, 102 | 'value': ()}, 103 | 'batch_size': {'is_placeholder': False, "value": -1}, 104 | 'generation_settings': {'is_placeholder': False, 105 | 'value': { 106 | 'config_kwargs': None, 107 | 'inference_kwargs': None, 108 | 'init_kwargs': None, 109 | 'strict_mode': False}}, 110 | 'include_meta': {'is_placeholder': False, 'value': True}, 111 | 'model': {'is_placeholder': True, 112 | 'value': 'outlines.models.transformers.Transformers'}, 113 | 'prompt_instructions': {'is_placeholder': False, 114 | 'value': None}, 115 | 'task_id': {'is_placeholder': False, 116 | 'value': 'InformationExtraction'}, 117 | 'version': Config.get_version()}]}, 118 | 'use_cache': {'is_placeholder': False, 'value': True}, 119 | 'version': Config.get_version()} 
120 | 121 | Pipeline.deserialize(config=config, tasks_kwargs=[{"model": batch_runtime.model, "entity_type": Person}]) 122 | -------------------------------------------------------------------------------- /sieves/tests/tasks/predictive/test_ner.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | import pytest 3 | 4 | from sieves import Doc, Pipeline 5 | from sieves.engines import EngineType 6 | from sieves.serialization import Config 7 | from sieves.tasks import PredictiveTask 8 | from sieves.tasks.predictive import ner 9 | from sieves.tasks.predictive.ner.core import Entity 10 | 11 | 12 | @pytest.mark.parametrize( 13 | "batch_runtime", 14 | ( 15 | EngineType.dspy, 16 | EngineType.langchain, 17 | EngineType.outlines, 18 | EngineType.glix, 19 | ), 20 | indirect=["batch_runtime"], 21 | ) 22 | @pytest.mark.parametrize("fewshot", [True, False]) 23 | def test_run(ner_docs, batch_runtime, fewshot) -> None: 24 | fewshot_examples = [ 25 | ner.FewshotExample( 26 | text="John studied data science in Barcelona and lives with Jaume", 27 | entities=[ 28 | Entity(text="John", context="John studied data", entity_type="PERSON"), 29 | Entity(text="Barcelona", context="science in Barcelona", entity_type="LOCATION"), 30 | Entity(text="Jaume", context="lives with Jaume", entity_type="PERSON"), 31 | ], 32 | ), 33 | ner.FewshotExample( 34 | text="Maria studied computer engineering in Madrid and works with Carlos", 35 | entities=[ 36 | Entity(text="Maria", context="Maria studied computer", entity_type="PERSON"), 37 | Entity(text="Madrid", context="engineering in Madrid and works", entity_type="LOCATION"), 38 | Entity(text="Carlos", context="works with Carlos", entity_type="PERSON"), 39 | ], 40 | ), 41 | ] 42 | 43 | fewshot_args = {"fewshot_examples": fewshot_examples} if fewshot else {} 44 | pipe = Pipeline( 45 | ner.NER( 46 | entities=["PERSON", "LOCATION", "COMPANY"], 47 | model=batch_runtime.model, 48 | generation_settings=batch_runtime.generation_settings, 49 | batch_size=batch_runtime.batch_size, 50 | **fewshot_args 51 | ) 52 | ) 53 | docs = list(pipe(ner_docs)) 54 | 55 | assert len(docs) == 2 56 | for doc in docs: 57 | assert "NER" in doc.results 58 | 59 | with pytest.raises(NotImplementedError): 60 | pipe["NER"].distill(None, None, None, None, None, None, None, None) 61 | 62 | 63 | @pytest.mark.parametrize("batch_runtime", [EngineType.dspy], indirect=["batch_runtime"]) 64 | def test_serialization(ner_docs, batch_runtime) -> None: 65 | pipe = Pipeline( 66 | ner.NER( 67 | entities=["PERSON", "LOCATION", "COMPANY"], 68 | model=batch_runtime.model, 69 | generation_settings=batch_runtime.generation_settings, 70 | batch_size=batch_runtime.batch_size, 71 | ) 72 | ) 73 | 74 | config = pipe.serialize() 75 | assert config.model_dump() == {'cls_name': 'sieves.pipeline.core.Pipeline', 76 | 'tasks': {'is_placeholder': False, 77 | 'value': [{'cls_name': 'sieves.tasks.predictive.ner.core.NER', 78 | 'entities': {'is_placeholder': False, 79 | 'value': ['PERSON', 'LOCATION', 'COMPANY']}, 80 | 'fewshot_examples': {'is_placeholder': False, 81 | 'value': ()}, 82 | 'batch_size': {'is_placeholder': False, "value": -1}, 83 | 'generation_settings': {'is_placeholder': False, 84 | 'value': { 85 | 'config_kwargs': None, 86 | 'inference_kwargs': None, 87 | 'init_kwargs': None, 88 | 'strict_mode': False}}, 89 | 'include_meta': {'is_placeholder': False, 'value': True}, 90 | 'model': {'is_placeholder': True, 91 | 'value': 'dspy.clients.lm.LM'}, 92 | 'prompt_instructions': 
{'is_placeholder': False, 93 | 'value': None}, 94 | 'task_id': {'is_placeholder': False, 'value': 'NER'}, 95 | 'version': Config.get_version()}]}, 96 | 'use_cache': {'is_placeholder': False, 'value': True}, 97 | 'version': Config.get_version()} 98 | Pipeline.deserialize( 99 | config=config, 100 | tasks_kwargs=[{"model": batch_runtime.model}], 101 | ) 102 | 103 | 104 | @pytest.mark.parametrize("batch_runtime", [EngineType.glix], indirect=["batch_runtime"]) 105 | def test_to_hf_dataset(ner_docs, batch_runtime) -> None: 106 | task = ner.NER( 107 | entities=["PERSON", "LOCATION", "COMPANY"], 108 | model=batch_runtime.model, 109 | generation_settings=batch_runtime.generation_settings, 110 | batch_size=batch_runtime.batch_size, 111 | ) 112 | pipe = Pipeline(task) 113 | 114 | assert isinstance(task, PredictiveTask) 115 | dataset = task.to_hf_dataset(pipe(ner_docs)) 116 | assert all([key in dataset.features for key in ("text", "entities")]) 117 | assert len(dataset) == 2 118 | dataset_records = list(dataset) 119 | for rec in dataset_records: 120 | assert isinstance(rec["entities"], dict) 121 | assert ( 122 | len(rec["entities"]["entity_type"]) 123 | == len(rec["entities"]["start"]) 124 | == len(rec["entities"]["end"]) 125 | == len(rec["entities"]["text"]) 126 | ) 127 | assert isinstance(rec["text"], str) 128 | 129 | with pytest.raises(KeyError): 130 | task.to_hf_dataset([Doc(text="This is a dummy text.")]) 131 | -------------------------------------------------------------------------------- /sieves/tests/tasks/predictive/test_pii_masking.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | import pytest 3 | 4 | from sieves import Doc, Pipeline, tasks 5 | from sieves.engines import EngineType 6 | from sieves.serialization import Config 7 | from sieves.tasks import PredictiveTask 8 | from sieves.tasks.predictive import pii_masking 9 | 10 | 11 | @pytest.mark.parametrize( 12 | "batch_runtime", 13 | ( 14 | EngineType.dspy, 15 | EngineType.langchain, 16 | EngineType.outlines, 17 | ), 18 | indirect=["batch_runtime"], 19 | ) 20 | @pytest.mark.parametrize("fewshot", [True, False]) 21 | def test_run(pii_masking_docs, batch_runtime, fewshot) -> None: 22 | fewshot_examples = [ 23 | pii_masking.FewshotExample( 24 | text="Jane Smith works at NASA.", 25 | reasoning="Jane Smith is a person's name and should be masked.", 26 | masked_text="[MASKED] works at NASA.", 27 | pii_entities=[pii_masking.PIIEntity(entity_type="PERSON", text="Jane Smith")], 28 | ), 29 | pii_masking.FewshotExample( 30 | text="He lives at Diagon Alley 37.", 31 | reasoning="Diagon Alley 37 is a residential address and should be masked.", 32 | masked_text="He lives at [MASKED].", 33 | pii_entities=[pii_masking.PIIEntity(entity_type="ADDRESS", text="Diagon Alley 37")], 34 | ), 35 | ] 36 | 37 | fewshot_args = {"fewshot_examples": fewshot_examples} if fewshot else {} 38 | pipe = Pipeline([ 39 | tasks.predictive.PIIMasking( 40 | model=batch_runtime.model, 41 | generation_settings=batch_runtime.generation_settings, 42 | batch_size=batch_runtime.batch_size, 43 | **fewshot_args, 44 | ) 45 | ]) 46 | docs = list(pipe(pii_masking_docs)) 47 | 48 | assert len(docs) == 2 49 | for doc in docs: 50 | assert doc.text 51 | assert "PIIMasking" in doc.results 52 | 53 | with pytest.raises(NotImplementedError): 54 | pipe["PIIMasking"].distill(None, None, None, None, None, None, None, None) 55 | 56 | 57 | @pytest.mark.parametrize("batch_runtime", [EngineType.dspy], indirect=["batch_runtime"]) 58 | def 
test_to_hf_dataset(pii_masking_docs, batch_runtime) -> None: 59 | task = tasks.predictive.PIIMasking( 60 | model=batch_runtime.model, 61 | generation_settings=batch_runtime.generation_settings, 62 | batch_size=batch_runtime.batch_size, 63 | ) 64 | pipe = Pipeline(task) 65 | docs = pipe(pii_masking_docs) 66 | 67 | assert isinstance(task, PredictiveTask) 68 | dataset = task.to_hf_dataset(docs) 69 | assert all([key in dataset.features for key in ("text", "masked_text")]) 70 | assert len(dataset) == 2 71 | records = list(dataset) 72 | assert records[0]["text"] == "Her SSN is 222-333-444. Her credit card number is 1234 5678." 73 | assert records[1]["text"] == "You can reach Michael at michael.michaels@gmail.com." 74 | 75 | with pytest.raises(KeyError): 76 | task.to_hf_dataset([Doc(text="This is a dummy text.")]) 77 | 78 | 79 | @pytest.mark.parametrize("batch_runtime", [EngineType.dspy], indirect=["batch_runtime"]) 80 | def test_serialization(pii_masking_docs, batch_runtime) -> None: 81 | pipe = Pipeline([ 82 | tasks.predictive.PIIMasking( 83 | model=batch_runtime.model, 84 | generation_settings=batch_runtime.generation_settings, 85 | batch_size=batch_runtime.batch_size, 86 | ) 87 | ]) 88 | 89 | config = pipe.serialize() 90 | assert config.model_dump() == {'cls_name': 'sieves.pipeline.core.Pipeline', 91 | 'tasks': {'is_placeholder': False, 92 | 'value': [{'cls_name': 'sieves.tasks.predictive.pii_masking.core.PIIMasking', 93 | 'fewshot_examples': {'is_placeholder': False, 94 | 'value': ()}, 95 | 'batch_size': {'is_placeholder': False, "value": -1}, 96 | 'generation_settings': {'is_placeholder': False, 97 | 'value': { 98 | 'config_kwargs': None, 99 | 'inference_kwargs': None, 100 | 'init_kwargs': None, 101 | 'strict_mode': False}}, 102 | 'include_meta': {'is_placeholder': False, 'value': True}, 103 | 'mask_placeholder': {'is_placeholder': False, 104 | 'value': '[MASKED]'}, 105 | 'model': {'is_placeholder': True, 106 | 'value': 'dspy.clients.lm.LM'}, 107 | 'pii_types': {'is_placeholder': False, 'value': None}, 108 | 'prompt_instructions': {'is_placeholder': False, 109 | 'value': None}, 110 | 'task_id': {'is_placeholder': False, 111 | 'value': 'PIIMasking'}, 112 | 'version': Config.get_version()}]}, 113 | 'use_cache': {'is_placeholder': False, 'value': True}, 114 | 'version': Config.get_version()} 115 | 116 | Pipeline.deserialize(config=config, tasks_kwargs=[{"model": batch_runtime.model}]) 117 | -------------------------------------------------------------------------------- /sieves/tests/tasks/predictive/test_question_answering.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | import pytest 3 | 4 | from sieves import Doc, Pipeline 5 | from sieves.engines import EngineType 6 | from sieves.serialization import Config 7 | from sieves.tasks import PredictiveTask 8 | from sieves.tasks.predictive import question_answering 9 | 10 | 11 | @pytest.mark.parametrize( 12 | "batch_runtime", 13 | ( 14 | EngineType.dspy, 15 | EngineType.glix, 16 | EngineType.langchain, 17 | EngineType.outlines, 18 | ), 19 | indirect=["batch_runtime"], 20 | ) 21 | @pytest.mark.parametrize("fewshot", [True, False]) 22 | def test_run(qa_docs, batch_runtime, fewshot): 23 | fewshot_examples = [ 24 | question_answering.FewshotExample( 25 | text=""" 26 | Physics is the scientific study of matter, its fundamental constituents, its motion and behavior through 27 | space and time, and the related entities of energy and force. 
Physics is one of the most fundamental 28 | scientific disciplines. A scientist who specializes in the field of physics is called a physicist. 29 | """, 30 | reasoning="The text states ad verbatim what a scientist specializing in physics is called.", 31 | questions=("What's a scientist called who specializes in the field of physics?",), 32 | answers=("A physicist.",), 33 | ), 34 | question_answering.FewshotExample( 35 | text=""" 36 | A biologist is a scientist who conducts research in biology. Biologists are interested in studying life on 37 | Earth, whether it is an individual cell, a multicellular organism, or a community of interacting 38 | populations. They usually specialize in a particular branch (e.g., molecular biology, zoology, and 39 | evolutionary biology) of biology and have a specific research focus (e.g., studying malaria or cancer). 40 | """, 41 | reasoning="The text states ad verbatim that biologists are interested in studying life on earth.", 42 | questions=("What are biologists interested in?",), 43 | answers=("Studying life on earth.",), 44 | ), 45 | ] 46 | 47 | fewshot_args = {"fewshot_examples": fewshot_examples} if fewshot else {} 48 | pipe = Pipeline( 49 | [ 50 | question_answering.QuestionAnswering( 51 | task_id="qa", 52 | questions=[ 53 | "What branch of science is this text describing?", 54 | "What the goal of the science as described in the text?", 55 | ], 56 | model=batch_runtime.model, 57 | generation_settings=batch_runtime.generation_settings, 58 | batch_size=batch_runtime.batch_size, 59 | **fewshot_args, 60 | ), 61 | ] 62 | ) 63 | docs = list(pipe(qa_docs)) 64 | 65 | assert len(docs) == 2 66 | for doc in docs: 67 | assert doc.text 68 | assert "qa" in doc.results 69 | 70 | with pytest.raises(NotImplementedError): 71 | pipe["qa"].distill(None, None, None, None, None, None, None, None) 72 | 73 | 74 | @pytest.mark.parametrize("batch_runtime", [EngineType.dspy], indirect=["batch_runtime"]) 75 | def test_to_hf_dataset(qa_docs, batch_runtime) -> None: 76 | task = question_answering.QuestionAnswering( 77 | task_id="qa", 78 | questions=[ 79 | "What branch of science is this text describing?", 80 | "What the goal of the science as described in the text?", 81 | ], 82 | model=batch_runtime.model, 83 | generation_settings=batch_runtime.generation_settings, 84 | batch_size=batch_runtime.batch_size, 85 | ) 86 | pipe = Pipeline(task) 87 | 88 | assert isinstance(task, PredictiveTask) 89 | dataset = task.to_hf_dataset(pipe(qa_docs)) 90 | assert all([key in dataset.features for key in ("text", "answers")]) 91 | assert len(dataset) == 2 92 | dataset_records = list(dataset) 93 | for rec in dataset_records: 94 | assert isinstance(rec["text"], str) 95 | assert isinstance(rec["answers"], list) 96 | 97 | with pytest.raises(KeyError): 98 | task.to_hf_dataset([Doc(text="This is a dummy text.")]) 99 | 100 | 101 | @pytest.mark.parametrize("batch_runtime", [EngineType.dspy], indirect=["batch_runtime"]) 102 | def test_serialization(qa_docs, batch_runtime) -> None: 103 | pipe = Pipeline( 104 | [ 105 | question_answering.QuestionAnswering( 106 | task_id="qa", 107 | questions=[ 108 | "What branch of science is this text describing?", 109 | "What the goal of the science as described in the text?", 110 | ], 111 | model=batch_runtime.model, 112 | generation_settings=batch_runtime.generation_settings, 113 | batch_size=batch_runtime.batch_size, 114 | ) 115 | ] 116 | ) 117 | 118 | config = pipe.serialize() 119 | assert config.model_dump() == {'cls_name': 'sieves.pipeline.core.Pipeline', 120 | 'tasks':
{'is_placeholder': False, 121 | 'value': [{'cls_name': 'sieves.tasks.predictive.question_answering.core.QuestionAnswering', 122 | 'fewshot_examples': {'is_placeholder': False, 123 | 'value': ()}, 124 | 'batch_size': {'is_placeholder': False, "value": -1}, 125 | 'generation_settings': {'is_placeholder': False, 126 | 'value': { 127 | 'config_kwargs': None, 128 | 'inference_kwargs': None, 129 | 'init_kwargs': None, 130 | 'strict_mode': False}}, 131 | 'include_meta': {'is_placeholder': False, 'value': True}, 132 | 'model': {'is_placeholder': True, 133 | 'value': 'dspy.clients.lm.LM'}, 134 | 'prompt_instructions': {'is_placeholder': False, 135 | 'value': None}, 136 | 'questions': {'is_placeholder': False, 137 | 'value': ['What branch of science is this ' 138 | 'text describing?', 139 | 'What is the goal of the science as ' 140 | 'described in the text?']}, 141 | 'task_id': {'is_placeholder': False, 'value': 'qa'}, 142 | 'version': Config.get_version()}]}, 143 | 'use_cache': {'is_placeholder': False, 'value': True}, 144 | 'version': Config.get_version()} 145 | 146 | Pipeline.deserialize(config=config, tasks_kwargs=[{"model": batch_runtime.model}]) 147 | -------------------------------------------------------------------------------- /sieves/tests/tasks/predictive/test_sentiment_analysis.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | import pytest 3 | 4 | from sieves import Doc, Pipeline 5 | from sieves.engines import EngineType 6 | from sieves.serialization import Config 7 | from sieves.tasks import PredictiveTask 8 | from sieves.tasks.predictive import sentiment_analysis 9 | 10 | 11 | @pytest.mark.parametrize( 12 | "batch_runtime", 13 | ( 14 | EngineType.dspy, 15 | EngineType.langchain, 16 | EngineType.outlines, 17 | ), 18 | indirect=["batch_runtime"], 19 | ) 20 | @pytest.mark.parametrize("fewshot", [True, False]) 21 | def test_run(sentiment_analysis_docs, batch_runtime, fewshot): 22 | fewshot_examples = [ 23 | sentiment_analysis.FewshotExample( 24 | text="The food was perfect, the service only ok.", 25 | reasoning="The text is very positive about the quality of the food, and neutral about the service quality." 26 | " The overall sentiment is hence positive.", 27 | sentiment_per_aspect={"food": 1.0, "service": 0.5, "overall": 0.8}, 28 | ), 29 | sentiment_analysis.FewshotExample( 30 | text="The service was amazing - they take excellent care of their customers. The food was despicable " 31 | "though, I strongly recommend not going.", 32 | reasoning="While the service is judged as amazing, hence very positive, the assessment of the food is very " 33 | "negative.
The overall sentiment is hence rather negative.", 34 | sentiment_per_aspect={"food": 0.1, "service": 1.0, "overall": 0.3}, 35 | ), 36 | ] 37 | 38 | fewshot_args = {"fewshot_examples": fewshot_examples} if fewshot else {} 39 | pipe = Pipeline( 40 | [ 41 | sentiment_analysis.SentimentAnalysis( 42 | task_id="sentiment_analysis", 43 | aspects=("food", "service"), 44 | model=batch_runtime.model, 45 | generation_settings=batch_runtime.generation_settings, 46 | batch_size=batch_runtime.batch_size, 47 | **fewshot_args, 48 | ), 49 | ] 50 | ) 51 | docs = list(pipe(sentiment_analysis_docs)) 52 | 53 | assert len(docs) == 2 54 | for doc in docs: 55 | assert doc.text 56 | assert "sentiment_analysis" in doc.results 57 | assert doc.results["sentiment_analysis"] 58 | 59 | with pytest.raises(NotImplementedError): 60 | pipe["sentiment_analysis"].distill(None, None, None, None, None, None, None, None) 61 | 62 | 63 | @pytest.mark.parametrize("batch_runtime", [EngineType.dspy], indirect=["batch_runtime"]) 64 | def test_to_hf_dataset(dummy_docs, batch_runtime) -> None: 65 | task = sentiment_analysis.SentimentAnalysis( 66 | task_id="sentiment_analysis", 67 | aspects=("food", "service"), 68 | model=batch_runtime.model, 69 | generation_settings=batch_runtime.generation_settings, 70 | batch_size=batch_runtime.batch_size, 71 | ) 72 | pipe = Pipeline(task) 73 | 74 | assert isinstance(task, PredictiveTask) 75 | dataset = task.to_hf_dataset(pipe(dummy_docs)) 76 | assert all([key in dataset.features for key in ("text", "aspect")]) 77 | assert len(dataset) == 2 78 | dataset_records = list(dataset) 79 | for rec in dataset_records: 80 | assert isinstance(rec["aspect"], list) 81 | for v in rec["aspect"]: 82 | assert isinstance(v, float) 83 | assert isinstance(rec["text"], str) 84 | 85 | with pytest.raises(KeyError): 86 | task.to_hf_dataset([Doc(text="This is a dummy text.")]) 87 | 88 | 89 | @pytest.mark.parametrize("batch_runtime", [EngineType.dspy], indirect=["batch_runtime"]) 90 | def test_serialization(dummy_docs, batch_runtime) -> None: 91 | pipe = Pipeline( 92 | [ 93 | sentiment_analysis.SentimentAnalysis( 94 | task_id="sentiment_analysis", 95 | aspects=("food", "service"), 96 | model=batch_runtime.model, 97 | generation_settings=batch_runtime.generation_settings, 98 | batch_size=batch_runtime.batch_size, 99 | ) 100 | ] 101 | ) 102 | 103 | config = pipe.serialize() 104 | assert config.model_dump() == {'cls_name': 'sieves.pipeline.core.Pipeline', 105 | 'tasks': {'is_placeholder': False, 106 | 'value': [{'aspects': {'is_placeholder': False, 107 | 'value': ('food', 'overall', 'service')}, 108 | 'cls_name': 'sieves.tasks.predictive.sentiment_analysis.core.SentimentAnalysis', 109 | 'fewshot_examples': {'is_placeholder': False, 110 | 'value': ()}, 111 | 'batch_size': {'is_placeholder': False, "value": -1}, 112 | 'generation_settings': {'is_placeholder': False, 113 | 'value': { 114 | 'config_kwargs': None, 115 | 'inference_kwargs': None, 116 | 'init_kwargs': None, 117 | 'strict_mode': False}}, 118 | 'include_meta': {'is_placeholder': False, 'value': True}, 119 | 'model': {'is_placeholder': True, 120 | 'value': 'dspy.clients.lm.LM'}, 121 | 'prompt_instructions': {'is_placeholder': False, 122 | 'value': None}, 123 | 'task_id': {'is_placeholder': False, 124 | 'value': 'sentiment_analysis'}, 125 | 'version': Config.get_version()}]}, 126 | 'use_cache': {'is_placeholder': False, 'value': True}, 127 | 'version': Config.get_version()} 128 | 129 | Pipeline.deserialize(config=config, tasks_kwargs=[{"model": batch_runtime.model}]) 130 |
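# Note on the expected dump in test_serialization above: only ("food", "service")
# are passed as aspects, yet the serialized 'aspects' value is
# ('food', 'overall', 'service') -- SentimentAnalysis appears to add an implicit
# "overall" aspect on construction. A minimal round-trip sketch outside pytest
# (the names `model` and `settings` are hypothetical stand-ins for a configured
# dspy model and its GenerationSettings):
#
#   task = sentiment_analysis.SentimentAnalysis(aspects=("food", "service"), model=model, generation_settings=settings, batch_size=-1)
#   config = Pipeline(task).serialize()
#   restored = Pipeline.deserialize(config=config, tasks_kwargs=[{"model": model}])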
-------------------------------------------------------------------------------- /sieves/tests/tasks/predictive/test_summarization.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | import pytest 3 | 4 | from sieves import Doc, Pipeline 5 | from sieves.engines import EngineType 6 | from sieves.serialization import Config 7 | from sieves.tasks import PredictiveTask 8 | from sieves.tasks.predictive import summarization 9 | 10 | 11 | @pytest.mark.parametrize( 12 | "batch_runtime", 13 | ( 14 | EngineType.dspy, 15 | EngineType.glix, 16 | EngineType.langchain, 17 | EngineType.outlines, 18 | ), 19 | indirect=["batch_runtime"], 20 | ) 21 | @pytest.mark.parametrize("fewshot", [True, False]) 22 | def test_run(summarization_docs, batch_runtime, fewshot) -> None: 23 | fewshot_examples = [ 24 | summarization.FewshotExample( 25 | text="They counted: one, two, three, four, five, six, seven, eight, nine, ten, eleven, twelve, thirteen, " 26 | "fourteen.", 27 | n_words=6, 28 | summary="They counted from one to fourteen.", 29 | ), 30 | summarization.FewshotExample( 31 | text="Next in order were the Boeotians, led by Peneleos, Leitus, Arcesilaus, Prothoenor, and Clonius. " 32 | "These had with them fifty ships, and on board of each were a hundred and twenty young men of the " 33 | "Boeotians. Then came the men of Orchomenus, who lived in the realm of the Minyans, led by Ascalaphus" 34 | " and Ialmenus, sons of Mars. In their command were thirty ships. Next were the Phocians, led by" 35 | " Schedius and Epistrophus, sons of Iphitus the son of Naubolus. These had forty ships…", 36 | n_words=10, 37 | summary="Boeotians, Orchomenians, and Phocians sailed to Troy with many ships.", 38 | ), 39 | ] 40 | 41 | fewshot_args = {"fewshot_examples": fewshot_examples} if fewshot else {} 42 | pipe = Pipeline([ 43 | summarization.Summarization( 44 | n_words=10, 45 | model=batch_runtime.model, 46 | generation_settings=batch_runtime.generation_settings, 47 | batch_size=batch_runtime.batch_size, 48 | **fewshot_args, 49 | ) 50 | ]) 51 | docs = list(pipe(summarization_docs)) 52 | 53 | assert len(docs) == 2 54 | for doc in docs: 55 | assert doc.text 56 | assert "Summarization" in doc.results 57 | 58 | with pytest.raises(NotImplementedError): 59 | pipe["Summarization"].distill(None, None, None, None, None, None, None, None) 60 | 61 | 62 | @pytest.mark.parametrize("batch_runtime", [EngineType.dspy], indirect=["batch_runtime"]) 63 | def test_to_hf_dataset(summarization_docs, batch_runtime) -> None: 64 | task = summarization.Summarization( 65 | n_words=10, 66 | model=batch_runtime.model, 67 | generation_settings=batch_runtime.generation_settings, 68 | batch_size=batch_runtime.batch_size, 69 | ) 70 | pipe = Pipeline(task) 71 | docs = pipe(summarization_docs) 72 | 73 | assert isinstance(task, PredictiveTask) 74 | dataset = task.to_hf_dataset(docs) 75 | assert all([key in dataset.features for key in ("text", "summary")]) 76 | assert len(dataset) == 2 77 | records = list(dataset) 78 | assert records[0]["text"].strip().startswith("The decay spreads over the State") 79 | assert records[1]["text"].strip().startswith("After all, the practical reason") 80 | for record in records: 81 | assert isinstance(record["summary"], str) 82 | 83 | with pytest.raises(KeyError): 84 | task.to_hf_dataset([Doc(text="This is a dummy text.")]) 85 | 86 | 87 | @pytest.mark.parametrize("batch_runtime", [EngineType.dspy], indirect=["batch_runtime"]) 88 | def test_serialization(summarization_docs, 
batch_runtime) -> None: 89 | pipe = Pipeline([ 90 | summarization.Summarization( 91 | n_words=10, 92 | model=batch_runtime.model, 93 | generation_settings=batch_runtime.generation_settings, 94 | batch_size=batch_runtime.batch_size, 95 | ) 96 | ]) 97 | 98 | config = pipe.serialize() 99 | assert config.model_dump() == {'cls_name': 'sieves.pipeline.core.Pipeline', 100 | 'tasks': {'is_placeholder': False, 101 | 'value': [{'cls_name': 'sieves.tasks.predictive.summarization.core.Summarization', 102 | 'fewshot_examples': {'is_placeholder': False, 103 | 'value': ()}, 104 | 'batch_size': {'is_placeholder': False, "value": -1}, 105 | 'generation_settings': {'is_placeholder': False, 106 | 'value': { 107 | 'config_kwargs': None, 108 | 'inference_kwargs': None, 109 | 'init_kwargs': None, 110 | 'strict_mode': False}}, 111 | 'include_meta': {'is_placeholder': False, 'value': True}, 112 | 'model': {'is_placeholder': True, 113 | 'value': 'dspy.clients.lm.LM'}, 114 | 'n_words': {'is_placeholder': False, 'value': 10}, 115 | 'prompt_instructions': {'is_placeholder': False, 116 | 'value': None}, 117 | 'task_id': {'is_placeholder': False, 118 | 'value': 'Summarization'}, 119 | 'version': Config.get_version()}]}, 120 | 'use_cache': {'is_placeholder': False, 'value': True}, 121 | 'version': Config.get_version()} 122 | 123 | Pipeline.deserialize(config=config, tasks_kwargs=[{"model": batch_runtime.model}]) 124 | -------------------------------------------------------------------------------- /sieves/tests/tasks/predictive/test_translation.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | import pytest 3 | 4 | from sieves import Doc, Pipeline 5 | from sieves.engines import EngineType 6 | from sieves.serialization import Config 7 | from sieves.tasks import PredictiveTask 8 | from sieves.tasks.predictive import translation 9 | 10 | 11 | @pytest.mark.parametrize( 12 | "batch_runtime", 13 | ( 14 | EngineType.dspy, 15 | EngineType.langchain, 16 | EngineType.outlines, 17 | ), 18 | indirect=["batch_runtime"], 19 | ) 20 | @pytest.mark.parametrize("fewshot", [True, False]) 21 | def test_run(translation_docs, batch_runtime, fewshot) -> None: 22 | fewshot_examples = [ 23 | translation.FewshotExample( 24 | text="The sun is shining today.", 25 | to="Spanish", 26 | translation="El sol brilla hoy.", 27 | ), 28 | translation.FewshotExample( 29 | text="There's a lot of fog today.", 30 | to="Spanish", 31 | translation="Hay mucha niebla hoy.", 32 | ), 33 | ] 34 | 35 | fewshot_args = {"fewshot_examples": fewshot_examples} if fewshot else {} 36 | pipe = Pipeline([ 37 | translation.Translation( 38 | to="Spanish", 39 | model=batch_runtime.model, 40 | generation_settings=batch_runtime.generation_settings, 41 | batch_size=batch_runtime.batch_size, 42 | **fewshot_args, 43 | ) 44 | ]) 45 | docs = list(pipe(translation_docs)) 46 | 47 | assert len(docs) == 2 48 | for doc in docs: 49 | assert doc.text 50 | assert "Translation" in doc.results 51 | 52 | with pytest.raises(NotImplementedError): 53 | pipe["Translation"].distill(None, None, None, None, None, None, None, None) 54 | 55 | 56 | @pytest.mark.parametrize("batch_runtime", [EngineType.dspy], indirect=["batch_runtime"]) 57 | def test_to_hf_dataset(translation_docs, batch_runtime) -> None: 58 | task = translation.Translation( 59 | to="Spanish", 60 | model=batch_runtime.model, 61 | generation_settings=batch_runtime.generation_settings, 62 | batch_size=batch_runtime.batch_size, 63 | ) 64 | pipe = Pipeline(task) 65 | docs =
pipe(translation_docs) 66 | 67 | assert isinstance(task, PredictiveTask) 68 | dataset = task.to_hf_dataset(docs) 69 | assert all([key in dataset.features for key in ("text", "translation")]) 70 | assert len(dataset) == 2 71 | records = list(dataset) 72 | assert records[0]["text"] == "It is rainy today." 73 | assert records[1]["text"] == "It is cloudy today." 74 | for record in records: 75 | assert isinstance(record["translation"], str) 76 | 77 | with pytest.raises(KeyError): 78 | task.to_hf_dataset([Doc(text="This is a dummy text.")]) 79 | 80 | 81 | @pytest.mark.parametrize("batch_runtime", [EngineType.dspy], indirect=["batch_runtime"]) 82 | def test_serialization(translation_docs, batch_runtime) -> None: 83 | pipe = Pipeline([ 84 | translation.Translation( 85 | to="Spanish", 86 | model=batch_runtime.model, 87 | generation_settings=batch_runtime.generation_settings, 88 | batch_size=batch_runtime.batch_size, 89 | ) 90 | ]) 91 | 92 | config = pipe.serialize() 93 | assert config.model_dump() == {'cls_name': 'sieves.pipeline.core.Pipeline', 94 | 'tasks': {'is_placeholder': False, 95 | 'value': [{'cls_name': 'sieves.tasks.predictive.translation.core.Translation', 96 | 'fewshot_examples': {'is_placeholder': False, 97 | 'value': ()}, 98 | 'batch_size': {'is_placeholder': False, "value": -1}, 99 | 'generation_settings': {'is_placeholder': False, 100 | 'value': { 101 | 'config_kwargs': None, 102 | 'inference_kwargs': None, 103 | 'init_kwargs': None, 104 | 'strict_mode': False}}, 105 | 'include_meta': {'is_placeholder': False, 'value': True}, 106 | 'model': {'is_placeholder': True, 107 | 'value': 'dspy.clients.lm.LM'}, 108 | 'prompt_instructions': {'is_placeholder': False, 109 | 'value': None}, 110 | 'task_id': {'is_placeholder': False, 111 | 'value': 'Translation'}, 112 | 'to': {'is_placeholder': False, 'value': 'Spanish'}, 113 | 'version': Config.get_version()}]}, 114 | 'use_cache': {'is_placeholder': False, 'value': True}, 115 | 'version': Config.get_version()} 116 | 117 | Pipeline.deserialize(config=config, tasks_kwargs=[{"model": batch_runtime.model}]) 118 | -------------------------------------------------------------------------------- /sieves/tests/tasks/preprocessing/chunking/test_chonkie.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | import chonkie 3 | 4 | from sieves import Doc, Pipeline 5 | from sieves.serialization import Config 6 | from sieves.tasks.preprocessing.chunking import Chonkie 7 | 8 | 9 | def test_chonkie(tokenizer) -> None: 10 | resources = [Doc(text="This is a text " * 100)] 11 | pipe = Pipeline(tasks=[Chonkie(chonkie.TokenChunker(tokenizer))]) 12 | docs = list(pipe(resources)) 13 | 14 | assert len(docs) == 1 15 | assert docs[0].text 16 | assert docs[0].chunks 17 | 18 | 19 | def test_serialization(tokenizer) -> None: 20 | resources = [Doc(text="This is a text " * 100)] 21 | pipe = Pipeline(tasks=[Chonkie(chonkie.TokenChunker(tokenizer))]) 22 | docs = list(pipe(resources)) 23 | 24 | config = pipe.serialize() 25 | assert config.model_dump() == { 26 | "cls_name": "sieves.pipeline.core.Pipeline", 27 | "use_cache": {"is_placeholder": False, "value": True}, 28 | "tasks": { 29 | "is_placeholder": False, 30 | "value": [ 31 | { 32 | "chunker": {"is_placeholder": True, "value": "chonkie.chunker.token.TokenChunker"}, 33 | 'batch_size': {'is_placeholder': False, "value": -1}, 34 | "cls_name": "sieves.tasks.preprocessing.chunking.chonkie_.Chonkie", 35 | "include_meta": {"is_placeholder": False, "value": False}, 36 | 
"task_id": {"is_placeholder": False, "value": "Chonkie"}, 37 | "version": Config.get_version(), 38 | } 39 | ], 40 | }, 41 | "version": Config.get_version(), 42 | } 43 | 44 | deserialized_pipeline = Pipeline.deserialize( 45 | config=config, tasks_kwargs=[{"chunker": chonkie.TokenChunker(tokenizer)}] 46 | ) 47 | assert docs[0] == list(deserialized_pipeline(resources))[0] 48 | -------------------------------------------------------------------------------- /sieves/tests/tasks/preprocessing/chunking/test_chunking.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | import chonkie 3 | import pytest 4 | 5 | from sieves import Doc, Pipeline 6 | from sieves.serialization import Config 7 | from sieves.tasks.preprocessing import Chunking 8 | 9 | 10 | @pytest.mark.parametrize("chunker", ["chonkie", "naive"]) 11 | def test_chonkie(chunker, tokenizer) -> None: 12 | resources = [Doc(text="This is a text. " * 100)] 13 | pipe = Pipeline(tasks=[Chunking(chonkie.TokenChunker(tokenizer) if chunker == "chonkie" else 5)]) 14 | docs = list(pipe(resources)) 15 | 16 | assert len(docs) == 1 17 | assert docs[0].text 18 | assert docs[0].chunks 19 | 20 | 21 | def test_serialization(tokenizer) -> None: 22 | resources = [Doc(text="This is a text " * 100)] 23 | pipe = Pipeline(tasks=[Chunking(chonkie.TokenChunker(tokenizer))]) 24 | docs = list(pipe(resources)) 25 | 26 | config = pipe.serialize() 27 | assert config.model_dump() == { 28 | "cls_name": "sieves.pipeline.core.Pipeline", 29 | "use_cache": {"is_placeholder": False, "value": True}, 30 | "tasks": { 31 | "is_placeholder": False, 32 | "value": [ 33 | { 34 | "chunker": {"is_placeholder": True, "value": "chonkie.chunker.token.TokenChunker"}, 35 | 'batch_size': {'is_placeholder': False, "value": -1}, 36 | "cls_name": "sieves.tasks.preprocessing.chunking.core.Chunking", 37 | "include_meta": {"is_placeholder": False, "value": False}, 38 | "task_id": {"is_placeholder": False, "value": "Chunking"}, 39 | "version": Config.get_version(), 40 | } 41 | ], 42 | }, 43 | "version": Config.get_version(), 44 | } 45 | 46 | deserialized_pipeline = Pipeline.deserialize( 47 | config=config, tasks_kwargs=[{"chunker": chonkie.TokenChunker(tokenizer)}] 48 | ) 49 | assert docs[0] == list(deserialized_pipeline(resources))[0] 50 | -------------------------------------------------------------------------------- /sieves/tests/tasks/preprocessing/chunking/test_naivechunker.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | import pytest 3 | 4 | from sieves import Pipeline 5 | from sieves.engines import EngineType 6 | from sieves.serialization import Config 7 | from sieves.tasks.preprocessing.chunking.naive import NaiveChunker 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "batch_runtime", 12 | [EngineType.huggingface], 13 | indirect=["batch_runtime"], 14 | ) 15 | def test_run(dummy_docs, batch_runtime) -> None: 16 | """Tests whether chunking mechanism in PredictiveTask works as expected.""" 17 | chunk_interval = 5 18 | pipe = Pipeline([NaiveChunker(interval=chunk_interval)]) 19 | docs = list(pipe(dummy_docs)) 20 | 21 | assert len(docs) == 2 22 | for doc in docs: 23 | assert doc.text 24 | assert len(doc.chunks) == 2 25 | 26 | 27 | def test_serialization(dummy_docs) -> None: 28 | chunk_interval = 5 29 | pipe = Pipeline(tasks=[NaiveChunker(interval=chunk_interval)]) 30 | docs = list(pipe(dummy_docs)) 31 | 32 | config = pipe.serialize() 33 | assert config.model_dump() == { 34 | 
"cls_name": "sieves.pipeline.core.Pipeline", 35 | "use_cache": {"is_placeholder": False, "value": True}, 36 | "tasks": { 37 | "is_placeholder": False, 38 | "value": [ 39 | { 40 | "cls_name": "sieves.tasks.preprocessing.chunking.naive.NaiveChunker", 41 | 'batch_size': {'is_placeholder': False, "value": -1}, 42 | "include_meta": {"is_placeholder": False, "value": False}, 43 | "interval": {"is_placeholder": False, "value": 5}, 44 | "task_id": {"is_placeholder": False, "value": "NaiveChunker"}, 45 | "version": Config.get_version(), 46 | } 47 | ], 48 | }, 49 | "version": Config.get_version(), 50 | } 51 | 52 | deserialized_pipeline = Pipeline.deserialize(config=config, tasks_kwargs=[{}]) 53 | assert docs[0] == list(deserialized_pipeline(dummy_docs))[0] 54 | -------------------------------------------------------------------------------- /sieves/tests/tasks/preprocessing/ingestion/test_docling.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from sieves import Doc, Pipeline 4 | from sieves.serialization import Config 5 | from sieves.tasks.preprocessing.ingestion.docling_ import Docling 6 | 7 | 8 | def test_run() -> None: 9 | resources = [Doc(uri=Path(__file__).parent.parent.parent.parent / "assets" / "1204.0162v2.pdf")] 10 | pipe = Pipeline(tasks=[Docling()]) 11 | docs = list(pipe(resources)) 12 | 13 | assert len(docs) == 1 14 | assert docs[0].text 15 | 16 | 17 | def test_serialization() -> None: 18 | resources = [Doc(uri=Path(__file__).parent.parent.parent.parent / "assets" / "1204.0162v2.pdf")] 19 | pipe = Pipeline(tasks=[Docling()]) 20 | docs = list(pipe(resources)) 21 | 22 | config = pipe.serialize() 23 | version = Config.get_version() 24 | assert config.model_dump() == { 25 | "cls_name": "sieves.pipeline.core.Pipeline", 26 | "use_cache": {"is_placeholder": False, "value": True}, 27 | "tasks": { 28 | "is_placeholder": False, 29 | "value": [ 30 | { 31 | "cls_name": "sieves.tasks.preprocessing.ingestion.docling_.Docling", 32 | 'batch_size': {'is_placeholder': False, "value": -1}, 33 | "converter": {"is_placeholder": True, "value": "docling.document_converter.DocumentConverter"}, 34 | "export_format": {"is_placeholder": False, "value": "markdown"}, 35 | "include_meta": {"is_placeholder": False, "value": False}, 36 | "task_id": {"is_placeholder": False, "value": "Docling"}, 37 | "version": version, 38 | } 39 | ], 40 | }, 41 | "version": version, 42 | } 43 | 44 | deserialized_pipeline = Pipeline.deserialize( 45 | config=config, tasks_kwargs=[{"converter": None, "export_format": "markdown"}] 46 | ) 47 | assert docs[0] == list(deserialized_pipeline(resources))[0] 48 | -------------------------------------------------------------------------------- /sieves/tests/tasks/preprocessing/ingestion/test_ingestion.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | from pathlib import Path 3 | 4 | from docling.document_converter import DocumentConverter 5 | 6 | from sieves import Doc, Pipeline, tasks 7 | from sieves.serialization import Config 8 | 9 | 10 | def test_run() -> None: 11 | resources = [Doc(uri=Path(__file__).parent.parent.parent.parent / "assets" / "1204.0162v2.pdf")] 12 | pipe = Pipeline(tasks=[tasks.preprocessing.Ingestion()]) 13 | docs = list(pipe(resources)) 14 | 15 | assert len(docs) == 1 16 | assert docs[0].text 17 | 18 | 19 | def test_serialization() -> None: 20 | resources = [Doc(uri=Path(__file__).parent.parent.parent.parent / "assets" / 
"1204.0162v2.pdf")] 21 | pipe = Pipeline(tasks=[tasks.preprocessing.Ingestion()]) 22 | config = pipe.serialize() 23 | version = Config.get_version() 24 | assert config.model_dump() == { 25 | "cls_name": "sieves.pipeline.core.Pipeline", 26 | "use_cache": {"is_placeholder": False, "value": True}, 27 | "tasks": { 28 | "is_placeholder": False, 29 | "value": [ 30 | { 31 | "cls_name": "sieves.tasks.preprocessing.ingestion.core.Ingestion", 32 | 'batch_size': {'is_placeholder': False, "value": -1}, 33 | "converter": {"is_placeholder": True, "value": "docling.document_converter.DocumentConverter"}, 34 | "export_format": {"is_placeholder": False, "value": "markdown"}, 35 | "include_meta": {"is_placeholder": False, "value": False}, 36 | "task_id": {"is_placeholder": False, "value": "Ingestion"}, 37 | "version": version, 38 | } 39 | ], 40 | }, 41 | "version": version, 42 | } 43 | 44 | # For deserialization, we need to provide the converter 45 | converter = DocumentConverter() 46 | deserialized_pipeline = Pipeline.deserialize( 47 | config=config, tasks_kwargs=[{"converter": converter, "export_format": "markdown"}] 48 | ) 49 | deserialized_docs = list(deserialized_pipeline(resources)) 50 | 51 | assert len(deserialized_docs) == 1 52 | -------------------------------------------------------------------------------- /sieves/tests/tasks/preprocessing/ingestion/test_marker.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | from pathlib import Path 3 | 4 | import pytest 5 | from marker.converters.pdf import PdfConverter 6 | from marker.models import create_model_dict 7 | 8 | from sieves import Doc, Pipeline, tasks 9 | from sieves.serialization import Config 10 | 11 | 12 | @pytest.mark.skip(reason="Currently running into OOM issues with instantiating Marker converts.") 13 | def test_marker(): 14 | """Workaround to keep memory usage low: run single function with one instantiated Marker instance.""" 15 | marker_converter = PdfConverter(artifact_dict=create_model_dict()) 16 | 17 | def test_run() -> None: 18 | resources = [Doc(uri=Path(__file__).parent.parent.parent / "assets" / "1204.0162v2.pdf")] 19 | pipe = Pipeline(tasks=[tasks.preprocessing.Marker(converter=marker_converter)]) 20 | docs = list(pipe(resources)) 21 | 22 | assert len(docs) == 1 23 | assert docs[0].text 24 | 25 | def test_with_extract_images() -> None: 26 | resources = [Doc(uri=Path(__file__).parent.parent.parent / "assets" / "1204.0162v2.pdf")] 27 | pipe = Pipeline( 28 | tasks=[tasks.preprocessing.Marker(converter=marker_converter, extract_images=True, include_meta=True)] 29 | ) 30 | docs = list(pipe(resources)) 31 | 32 | assert len(docs) == 1 33 | assert docs[0].text 34 | assert docs[0].images 35 | 36 | def test_serialization() -> None: 37 | resources = [Doc(uri=Path(__file__).parent.parent.parent / "assets" / "1204.0162v2.pdf")] 38 | pipe = Pipeline(tasks=[tasks.preprocessing.Marker(converter=marker_converter, include_meta=True)]) 39 | docs = list(pipe(resources)) 40 | 41 | config = pipe.serialize() 42 | version = Config.get_version() 43 | assert config.model_dump() == { 44 | "cls_name": "sieves.pipeline.core.Pipeline", 45 | "tasks": { 46 | "is_placeholder": False, 47 | "value": [ 48 | { 49 | "cls_name": "sieves.tasks.preprocessing.marker_.Marker", 50 | "converter": {"is_placeholder": True, "value": "marker.converters.pdf.PdfConverter"}, 51 | "export_format": {"is_placeholder": False, "value": "markdown"}, 52 | "extract_images": {"is_placeholder": False, "value": False}, 53 | 
"include_meta": {"is_placeholder": False, "value": True}, 54 | "task_id": {"is_placeholder": False, "value": "Marker"}, 55 | "version": version, 56 | } 57 | ], 58 | }, 59 | "version": version, 60 | } 61 | 62 | # For deserialization, we need to provide the converter 63 | converter = marker_converter 64 | deserialized_pipeline = Pipeline.deserialize( 65 | config=config, tasks_kwargs=[{"converter": converter, "export_format": "markdown"}] 66 | ) 67 | deserialized_docs = list(deserialized_pipeline(resources)) 68 | 69 | assert len(deserialized_docs) == 1 70 | assert deserialized_docs[0].text == docs[0].text 71 | 72 | test_run() 73 | test_with_extract_images() 74 | test_serialization() 75 | -------------------------------------------------------------------------------- /sieves/tests/tasks/preprocessing/ingestion/test_unstructured.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | from pathlib import Path 3 | 4 | import pytest 5 | import unstructured.cleaners.core 6 | import unstructured.partition.auto 7 | 8 | from sieves import Doc, Pipeline 9 | from sieves.serialization import Config 10 | from sieves.tasks.preprocessing.ingestion.unstructured_ import Unstructured 11 | 12 | 13 | @pytest.mark.parametrize("to_chunk", [True, False]) 14 | def test_run(to_chunk) -> None: 15 | resources = [Doc(uri=Path(__file__).parent.parent.parent.parent / "assets" / "1204.0162v2.pdf")] 16 | partition_kwargs = {"chunking_strategy": "basic"} if to_chunk else {} 17 | pipe = Pipeline( 18 | tasks=[ 19 | Unstructured( 20 | **partition_kwargs, 21 | cleaners=( 22 | lambda t: unstructured.cleaners.core.clean( 23 | t, extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True 24 | ), 25 | ), 26 | include_meta=True, 27 | ), 28 | ] 29 | ) 30 | docs = list(pipe(resources)) 31 | 32 | assert len(docs) == 1 33 | assert docs[0].text 34 | if to_chunk: 35 | assert len(docs[0].chunks) 36 | else: 37 | assert docs[0].chunks is None 38 | 39 | 40 | def test_serialization() -> None: 41 | resources = [Doc(uri=Path(__file__).parent.parent.parent.parent / "assets" / "dummy.txt")] 42 | 43 | def cleaner(text: str) -> str: 44 | return unstructured.cleaners.core.clean( 45 | text, extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True 46 | ) 47 | 48 | pipe = Pipeline(tasks=[Unstructured(cleaners=(cleaner,), include_meta=True)]) 49 | docs = list(pipe(resources)) 50 | 51 | config = pipe.serialize() 52 | assert config.model_dump() == { 53 | "cls_name": "sieves.pipeline.core.Pipeline", 54 | "use_cache": {"is_placeholder": False, "value": True}, 55 | "tasks": { 56 | "is_placeholder": False, 57 | "value": [ 58 | { 59 | "cleaners": {"is_placeholder": True, "value": "builtins.tuple"}, 60 | 'batch_size': {'is_placeholder': False, "value": -1}, 61 | "cls_name": "sieves.tasks.preprocessing.ingestion.unstructured_.Unstructured", 62 | "include_meta": {"is_placeholder": False, "value": True}, 63 | "partition": {"is_placeholder": True, "value": "builtins.function"}, 64 | "task_id": {"is_placeholder": False, "value": "Unstructured"}, 65 | "version": Config.get_version(), 66 | } 67 | ], 68 | }, 69 | "version": Config.get_version(), 70 | } 71 | 72 | deserialized_pipeline = Pipeline.deserialize( 73 | config=config, tasks_kwargs=({"partition": unstructured.partition.auto.partition, "cleaners": (cleaner,)},) 74 | ) 75 | assert docs[0] == list(deserialized_pipeline(resources))[0] 76 | -------------------------------------------------------------------------------- 
/sieves/tests/test_doc.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | import pytest 3 | import regex 4 | from datasets import Dataset 5 | from PIL import Image 6 | 7 | from sieves import Doc 8 | 9 | 10 | @pytest.fixture 11 | def test_images() -> dict[str, Image.Image]: 12 | return { 13 | "rgb_red_100": Image.new("RGB", (100, 100), color="red"), 14 | "rgb_red_100_2": Image.new("RGB", (100, 100), color="red"), 15 | "rgb_blue_100": Image.new("RGB", (100, 100), color="blue"), 16 | "rgb_red_200": Image.new("RGB", (200, 200), color="red"), 17 | "l_gray_100": Image.new("L", (100, 100), color=128), 18 | } 19 | 20 | 21 | def test_identical_images(test_images: dict[str, Image.Image]) -> None: 22 | doc1 = Doc(images=[test_images["rgb_red_100"]]) 23 | doc2 = Doc(images=[test_images["rgb_red_100_2"]]) 24 | assert doc1 == doc2 25 | 26 | 27 | def test_different_images(test_images: dict[str, Image.Image]) -> None: 28 | doc1 = Doc(images=[test_images["rgb_red_100"]]) 29 | doc2 = Doc(images=[test_images["rgb_blue_100"]]) 30 | assert doc1 != doc2 31 | 32 | 33 | def test_none_images() -> None: 34 | doc1 = Doc(images=None) 35 | doc2 = Doc(images=None) 36 | assert doc1 == doc2 37 | 38 | 39 | def test_one_none_image(test_images: dict[str, Image.Image]) -> None: 40 | doc1 = Doc(images=[test_images["rgb_red_100"]]) 41 | doc2 = Doc(images=None) 42 | assert doc1 != doc2 43 | 44 | 45 | def test_different_image_counts(test_images: dict[str, Image.Image]) -> None: 46 | doc1 = Doc(images=[test_images["rgb_red_100"], test_images["rgb_red_100_2"]]) 47 | doc2 = Doc(images=[test_images["rgb_red_100"]]) 48 | assert doc1 != doc2 49 | 50 | 51 | def test_different_image_sizes(test_images: dict[str, Image.Image]) -> None: 52 | doc1 = Doc(images=[test_images["rgb_red_100"]]) 53 | doc2 = Doc(images=[test_images["rgb_red_200"]]) 54 | assert doc1 != doc2 55 | 56 | 57 | def test_different_image_modes(test_images: dict[str, Image.Image]) -> None: 58 | doc1 = Doc(images=[test_images["rgb_red_100"]]) 59 | doc2 = Doc(images=[test_images["l_gray_100"]]) 60 | assert doc1 != doc2 61 | 62 | 63 | def test_doc_comparison_type_error() -> None: 64 | doc = Doc(images=None) 65 | with pytest.raises(NotImplementedError): 66 | doc == 42 67 | 68 | 69 | def test_docs_from_hf_dataset() -> None: 70 | """Tests generation of Docs instance from HF dataset.""" 71 | hf_dataset = Dataset.from_dict( 72 | {"text": ["This is the first document.", "This is the second document."], "label": [0, 1]} 73 | ) 74 | docs = Doc.from_hf_dataset(hf_dataset) 75 | 76 | assert len(docs) == 2 77 | assert docs[0].text == "This is the first document." 78 | assert docs[0].chunks == ["This is the first document."] # Check post_init 79 | assert docs[0].id is None 80 | assert docs[0].uri is None 81 | assert docs[0].images is None 82 | assert docs[0].meta == {} 83 | assert docs[0].results == {} 84 | 85 | assert docs[1].text == "This is the second document." 86 | assert docs[1].chunks == ["This is the second document."] # Check post_init 87 | 88 | # Test with a different text column name. 
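# `column_map` maps Doc attributes to dataset column names: the dataset's
# "content" column is read into Doc.text and its "id" column into Doc.id.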
89 | data_alt_col = {"content": ["Doc A", "Doc B"], "id": ["a", "b"]} 90 | hf_dataset_alt_col = Dataset.from_dict(data_alt_col) 91 | docs_alt = Doc.from_hf_dataset(hf_dataset_alt_col, column_map={"text": "content", "id": "id"}) 92 | assert len(docs_alt) == 2 93 | assert docs_alt[0].text == "Doc A" 94 | assert docs_alt[1].text == "Doc B" 95 | assert docs_alt[0].id == "a" 96 | assert docs_alt[1].id == "b" 97 | 98 | # Test KeyError for missing column. 99 | with pytest.raises( 100 | KeyError, 101 | match=regex.escape("Specified columns '{'wrong_column'}' not found in dataset columns: ['text', 'label']."), 102 | ): 103 | Doc.from_hf_dataset(hf_dataset, column_map={"text": "wrong_column"}) 104 | -------------------------------------------------------------------------------- /sieves/tests/test_serialization.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | import os 3 | import pickle 4 | import tempfile 5 | from pathlib import Path 6 | 7 | import chonkie 8 | import dspy 9 | import pytest 10 | 11 | from sieves import Pipeline 12 | from sieves.engines import EngineType 13 | from sieves.serialization import Config 14 | from sieves.tasks import preprocessing 15 | from sieves.tasks.predictive import classification 16 | from sieves.tests.conftest import make_model 17 | 18 | 19 | @pytest.mark.parametrize( 20 | "batch_runtime", 21 | [EngineType.dspy], 22 | indirect=["batch_runtime"], 23 | ) 24 | def test_serialization_pipeline(dummy_docs, batch_runtime, tokenizer): 25 | """Tests serialization and deserialization of pipeline to files and config objects.""" 26 | pipe = Pipeline( 27 | [ 28 | preprocessing.Chunking(chonkie.TokenChunker(tokenizer)), 29 | classification.Classification( 30 | task_id="classifier", 31 | labels=["science", "politics"], 32 | label_descriptions={"science": "Everything about science.", "politics": "Everything about politics."}, 33 | model=batch_runtime.model, 34 | generation_settings=batch_runtime.generation_settings, 35 | batch_size=batch_runtime.batch_size, 36 | ), 37 | ] 38 | ) 39 | 40 | # Get config, assert values are correct.
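# The dump wraps every attribute in an {'is_placeholder': ..., 'value': ...}
# envelope. Objects that cannot be serialized directly -- the chunker and the
# model -- are stored as placeholders (their dotted class paths) and have to be
# re-supplied via tasks_kwargs when the pipeline is restored below.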
41 | config = pipe.serialize() 42 | config_model_dump = config.model_dump() 43 | version = Config.get_version() 44 | assert config_model_dump == { 45 | "cls_name": "sieves.pipeline.core.Pipeline", 46 | "use_cache": {"is_placeholder": False, "value": True}, 47 | "tasks": { 48 | "is_placeholder": False, 49 | "value": [ 50 | { 51 | "chunker": {"is_placeholder": True, "value": "chonkie.chunker.token.TokenChunker"}, 52 | 'batch_size': {'is_placeholder': False, 'value': -1}, 53 | "cls_name": "sieves.tasks.preprocessing.chunking.core.Chunking", 54 | "include_meta": {"is_placeholder": False, "value": False}, 55 | "task_id": {"is_placeholder": False, "value": "Chunking"}, 56 | "version": version, 57 | }, 58 | { 59 | "cls_name": "sieves.tasks.predictive.classification.core.Classification", 60 | 'generation_settings': { 61 | 'is_placeholder': False, 62 | 'value': { 63 | 'config_kwargs': None, 64 | 'inference_kwargs': None, 65 | 'init_kwargs': None, 66 | 'strict_mode': False 67 | } 68 | }, 69 | "fewshot_examples": {"is_placeholder": False, "value": ()}, 70 | "include_meta": {"is_placeholder": False, "value": True}, 71 | 'batch_size': {'is_placeholder': False, 'value': -1}, 72 | "labels": {"is_placeholder": False, "value": ["science", "politics"]}, 73 | "label_descriptions": { 74 | "is_placeholder": False, 75 | "value": {"science": "Everything about science.", "politics": "Everything about politics."}, 76 | }, 77 | 'model': {'is_placeholder': True, 'value': 'dspy.clients.lm.LM'}, 78 | "prompt_instructions": {"is_placeholder": False, "value": None}, 79 | "task_id": {"is_placeholder": False, "value": "classifier"}, 80 | "version": version, 81 | }, 82 | ], 83 | }, 84 | "version": version, 85 | } 86 | 87 | # Save config to temporary file 88 | with tempfile.NamedTemporaryFile(suffix=".yml") as tmp_file: 89 | tmp_path = Path(tmp_file.name) 90 | config.dump(tmp_path) 91 | 92 | # Load config from file and verify it matches 93 | loaded_config = Config.load(tmp_path) 94 | # For some reason empty tuple is stored as list, which is fine for our purposes. 95 | assert config_model_dump["tasks"]["value"][1]["fewshot_examples"]["value"] == () 96 | config_model_dump["tasks"]["value"][1]["fewshot_examples"]["value"] = [] 97 | assert loaded_config.model_dump() == config_model_dump 98 | 99 | # Restore pipeline from config. 100 | loaded_pipe = Pipeline.load( 101 | tmp_path, 102 | ( 103 | {"chunker": chonkie.TokenChunker(tokenizer)}, 104 | {"model": make_model(EngineType.dspy)}, 105 | ), 106 | ) 107 | 108 | # Run restored pipeline. 109 | docs = list(loaded_pipe(dummy_docs)) 110 | assert len(docs) == 2 111 | assert len(docs[0].results["classifier"]) 112 | 113 | # Compare loaded pipe config with original one. 114 | assert loaded_pipe.serialize().model_dump() == config_model_dump 115 | 116 | 117 | def test_serialization_docs(dummy_docs): 118 | """Tests serialization of docs by saving to and loading from pickle objects.""" 119 | # Create a temporary file for pickle serialization. 120 | with tempfile.NamedTemporaryFile(suffix=".pkl") as tmp_file: 121 | tmp_path = Path(tmp_file.name) 122 | 123 | # Pickle the dummy_docs to file. 124 | with open(tmp_path, "wb") as f: 125 | pickle.dump(dummy_docs, f) 126 | 127 | # Load the docs back from file.
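# Doc equality appears to be value-based (test_doc.py compares texts, chunks and
# image properties), so the round-tripped docs should compare equal to the originals.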
128 | with open(tmp_path, "rb") as f: 129 | loaded_docs = pickle.load(f) 130 | 131 | # Assert the loaded docs are identical to the originals 132 | assert len(loaded_docs) == len(dummy_docs) 133 | assert all([orig_doc == loaded_doc for orig_doc, loaded_doc in zip(dummy_docs, loaded_docs)]) 134 | 135 | # Test that comparing Doc with int raises NotImplementedError 136 | with pytest.raises(NotImplementedError): 137 | loaded_docs[0] == 42 138 | -------------------------------------------------------------------------------- /sieves/tests/test_strict_mode.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | import pydantic 3 | import pytest 4 | 5 | from sieves import Doc, Pipeline 6 | from sieves.engines import EngineType 7 | from sieves.tasks.predictive import information_extraction 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "batch_runtime", (EngineType.dspy, EngineType.langchain, EngineType.outlines), indirect=["batch_runtime"] 12 | ) 13 | @pytest.mark.parametrize("strict_mode", [True, False]) 14 | def test_strict_mode(batch_runtime, strict_mode): 15 | batch_runtime.generation_settings.strict_mode = strict_mode 16 | 17 | class Person(pydantic.BaseModel, frozen=True): 18 | name: str 19 | age: pydantic.PositiveInt 20 | 21 | pipe = Pipeline([ 22 | information_extraction.InformationExtraction( 23 | entity_type=Person, 24 | model=batch_runtime.model, 25 | generation_settings=batch_runtime.generation_settings, 26 | batch_size=batch_runtime.batch_size, 27 | ) 28 | ]) 29 | 30 | docs: list[Doc] = [] 31 | hit_exception = False 32 | if strict_mode: 33 | try: 34 | docs = list(pipe([Doc(text=".")])) 35 | except Exception: 36 | hit_exception = True 37 | else: 38 | docs = list(pipe([Doc(text=".")])) 39 | 40 | if strict_mode and hit_exception: 41 | assert len(docs) == 0 42 | else: 43 | assert len(docs) == 1 44 | 45 | for doc in docs: 46 | assert "InformationExtraction" in doc.results 47 | -------------------------------------------------------------------------------- /ty.toml: -------------------------------------------------------------------------------- 1 | [rules] 2 | # Ignoring a bunch of rules until we get around to cleaning up typing. 3 | unresolved-attribute = "ignore" 4 | unresolved-import = "ignore" 5 | invalid-assignment = "ignore" 6 | invalid-argument-type = "ignore" 7 | missing-argument = "ignore" 8 | not-iterable = "ignore" 9 | 10 | [src] 11 | exclude = [ 12 | ".venv/**", 13 | "build/**", 14 | "sieves/tests/**", 15 | "examples/**" 16 | ] 17 | --------------------------------------------------------------------------------