├── .github ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── publish-docs.yml │ ├── publish.yml │ └── test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── AGENTS.md ├── CNAME ├── LICENSE ├── README.md ├── codecov.yml ├── docs ├── about.md ├── assets │ ├── Presentation_PyData_Amsterdam_2025.pdf │ ├── generate_text_png.py │ ├── sieve.png │ └── sieves_sieve_style.png ├── bridge.md ├── doc.md ├── engines │ ├── base_engine.md │ ├── dspy.md │ ├── gliner.md │ ├── huggingface.md │ ├── langchain.md │ └── outlines.md ├── guides │ ├── custom_tasks.md │ ├── distillation.md │ ├── getting_started.md │ ├── optimization.md │ ├── preprocessing.md │ └── serialization.md ├── index.md ├── pipeline.md └── tasks │ ├── predictive │ ├── classification.md │ ├── information_extraction.md │ ├── ner.md │ ├── pii_masking.md │ ├── question_answering.md │ ├── sentiment_analysis.md │ ├── summarization.md │ └── translation.md │ ├── preprocessing │ ├── chunking │ │ ├── chonkie.md │ │ ├── chunking.md │ │ └── naive.md │ └── ingestion │ │ ├── docling.md │ │ ├── ingestion.md │ │ ├── marker.md │ │ └── unstructured.md │ └── task.md ├── examples └── pydata_amsterdam_demo.py ├── mkdocs.yml ├── pyproject.toml ├── setup.cfg ├── setup.py ├── sieves ├── __init__.py ├── data │ ├── __init__.py │ └── doc.py ├── engines │ ├── __init__.py │ ├── core.py │ ├── dspy_.py │ ├── engine_import.py │ ├── engine_type.py │ ├── glix_.py │ ├── huggingface_.py │ ├── langchain_.py │ ├── missing.py │ ├── outlines_.py │ ├── types.py │ └── utils.py ├── pipeline │ ├── __init__.py │ └── core.py ├── serialization.py ├── tasks │ ├── __init__.py │ ├── core.py │ ├── optimization │ │ ├── __init__.py │ │ └── core.py │ ├── postprocessing │ │ ├── __init__.py │ │ └── distillation │ │ │ ├── __init__.py │ │ │ ├── distillation_import.py │ │ │ └── types.py │ ├── predictive │ │ ├── __init__.py │ │ ├── bridges.py │ │ ├── classification │ │ │ ├── __init__.py │ │ │ ├── bridges.py │ │ │ └── core.py │ │ ├── core.py │ │ ├── information_extraction │ │ │ ├── __init__.py │ │ │ ├── bridges.py │ │ │ └── core.py │ │ ├── ner │ │ │ ├── __init__.py │ │ │ ├── bridges.py │ │ │ └── core.py │ │ ├── pii_masking │ │ │ ├── __init__.py │ │ │ ├── bridges.py │ │ │ └── core.py │ │ ├── question_answering │ │ │ ├── __init__.py │ │ │ ├── bridges.py │ │ │ └── core.py │ │ ├── sentiment_analysis │ │ │ ├── __init__.py │ │ │ ├── bridges.py │ │ │ └── core.py │ │ ├── summarization │ │ │ ├── __init__.py │ │ │ ├── bridges.py │ │ │ └── core.py │ │ └── translation │ │ │ ├── __init__.py │ │ │ ├── bridges.py │ │ │ └── core.py │ ├── preprocessing │ │ ├── __init__.py │ │ ├── chunking │ │ │ ├── __init__.py │ │ │ ├── chonkie_.py │ │ │ ├── core.py │ │ │ └── naive.py │ │ └── ingestion │ │ │ ├── __init__.py │ │ │ ├── core.py │ │ │ ├── docling_.py │ │ │ ├── marker_.py │ │ │ └── unstructured_.py │ ├── types.py │ └── utils.py └── tests │ ├── assets │ ├── 1204.0162v2.pdf │ └── dummy.txt │ ├── conftest.py │ ├── tasks │ ├── predictive │ │ ├── test_classification.py │ │ ├── test_information_extraction.py │ │ ├── test_ner.py │ │ ├── test_pii_masking.py │ │ ├── test_question_answering.py │ │ ├── test_sentiment_analysis.py │ │ ├── test_summarization.py │ │ └── test_translation.py │ ├── preprocessing │ │ ├── chunking │ │ │ ├── test_chonkie.py │ │ │ ├── test_chunking.py │ │ │ └── test_naivechunker.py │ │ └── ingestion │ │ │ ├── test_docling.py │ │ │ ├── test_ingestion.py │ │ │ ├── test_marker.py │ │ │ └── test_unstructured.py │ ├── test_distillation.py │ ├── test_misc.py │ └── test_optimization.py │ ├── test_doc.py │ ├── 
test_pipeline.py │ ├── test_serialization.py │ └── test_strict_mode.py ├── ty.toml └── uv.lock /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | 4 | ## Related Issues 5 | 6 | \- 7 | 8 | ## Changes Made 9 | 10 | 11 | ## Checklist 12 | - [ ] Tests have been extended to cover changes in functionality 13 | - [ ] Existing and new tests succeed 14 | - [ ] Documentation updated (if applicable) 15 | - [ ] Related issues linked 16 | 17 | ## Screenshots/Examples (if applicable) 18 | 19 | -------------------------------------------------------------------------------- /.github/workflows/publish-docs.yml: -------------------------------------------------------------------------------- 1 | # Simple workflow for deploying static content to GitHub Pages 2 | name: Deploy documentation 3 | 4 | on: 5 | # Runs on pushes targeting the default branch 6 | push: 7 | branches: ["main"] 8 | 9 | # Allows you to run this workflow manually from the Actions tab 10 | workflow_dispatch: 11 | 12 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 13 | permissions: 14 | contents: read 15 | pages: write 16 | id-token: write 17 | 18 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 19 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 20 | concurrency: 21 | group: "pages" 22 | cancel-in-progress: false 23 | 24 | jobs: 25 | # Single deploy job since we're just deploying 26 | deploy: 27 | environment: 28 | name: github-pages 29 | url: ${{ steps.deployment.outputs.page_url }} 30 | runs-on: ubuntu-latest 31 | steps: 32 | - name: Checkout 33 | uses: actions/checkout@v4 34 | 35 | - name: Setup Pages 36 | uses: actions/configure-pages@v5 37 | 38 | - name: Install dependencies 39 | run: | 40 | pip install "mkdocstrings[python]>=0.27,<1" 41 | pip install "mkdocs-material>=9.6,<10" 42 | 43 | - name: Build docs 44 | run: | 45 | mkdocs build 46 | 47 | - name: Upload artifact 48 | uses: actions/upload-pages-artifact@v3 49 | with: 50 | # Upload entire repository 51 | path: site 52 | 53 | - name: Deploy to GitHub Pages 54 | id: deployment 55 | uses: actions/deploy-pages@v4 56 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package to PyPI 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | workflow_dispatch: 8 | 9 | jobs: 10 | pypi-publish: 11 | name: Publish release to PyPI 12 | runs-on: ubuntu-latest 13 | permissions: 14 | id-token: write 15 | contents: read 16 | steps: 17 | - uses: actions/checkout@v4 18 | 19 | - name: Set up Python 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: "3.x" 23 | 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | python -m pip install build 28 | 29 | - name: Build package 30 | run: python -m build 31 | 32 | - name: Publish package distributions to PyPI 33 | uses: pypa/gh-action-pypi-publish@release/v1 34 | with: 35 | password: ${{ secrets.PYPI_TOKEN }} 36 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | pull_request: 5 | workflow_dispatch: 6 | push: 7 | branches: 8 | - main 9 | 
paths: 10 | - sieves/** 11 | 12 | jobs: 13 | test: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: ["3.12"] 18 | 19 | steps: 20 | 21 | - name: Maximize build space 22 | shell: bash 23 | run: | 24 | sudo rm -rf /usr/share/dotnet \ 25 | /usr/local/lib/android \ 26 | /opt/ghc \ 27 | /opt/hostedtoolcache/CodeQL 28 | 29 | - name: Checkout repo 30 | uses: actions/checkout@v4 31 | with: 32 | fetch-depth: 1 33 | 34 | - name: Set up Python ${{ matrix.python-version }} 35 | uses: actions/setup-python@v4 36 | with: 37 | python-version: ${{ matrix.python-version }} 38 | 39 | - name: Install uv 40 | run: | 41 | curl -LsSf https://astral.sh/uv/install.sh | sh 42 | echo "$HOME/.cargo/bin" >> $GITHUB_PATH 43 | 44 | - name: Install dependencies 45 | run: | 46 | sudo apt-get update 47 | sudo apt-get install tesseract-ocr 48 | uv venv .venv --python 3.12 49 | uv sync --all-extras 50 | 51 | - name: Create and enable 4 GB swap 52 | run: | 53 | sudo fallocate -l 4G /swapfile 54 | sudo chmod 600 /swapfile 55 | sudo mkswap /swapfile 56 | sudo swapon /swapfile 57 | free -h 58 | 59 | - name: Build docs 60 | run: | 61 | source .venv/bin/activate 62 | mkdocs build 63 | 64 | 65 | - name: Run tests 66 | env: 67 | ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} 68 | OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} 69 | run: | 70 | source .venv/bin/activate 71 | pytest -x --cov --cov-report=xml -m "not slow" 72 | 73 | - name: Debug 74 | run: | 75 | ls -lh 76 | 77 | - name: Upload coverage reports to Codecov 78 | uses: codecov/codecov-action@v5 79 | with: 80 | token: ${{ secrets.CODECOV_TOKEN }} 81 | files: coverage.xml 82 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.pyc 3 | *.pyo 4 | build/ 5 | dist/ 6 | *.egg-info/ 7 | .idea 8 | .venv/ 9 | .env 10 | .windsurfrules 11 | site/ 12 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_install_hook_types: [pre-push] 2 | exclude: ^reports/ # skip auto-generated code coverage files 3 | 4 | repos: 5 | - repo: https://github.com/pre-commit/pre-commit-hooks 6 | rev: v4.4.0 7 | hooks: 8 | - id: check-ast # Is it valid Python? 
9 | - id: debug-statements # no debugging statements used 10 | - id: trailing-whitespace 11 | - id: end-of-file-fixer 12 | - id: check-added-large-files 13 | - id: check-case-conflict 14 | - id: check-json 15 | - id: pretty-format-json 16 | args: [ "--autofix", "--no-sort-keys", "--no-ensure-ascii" ] 17 | exclude: ^(model/data/testing)/ 18 | - id: check-merge-conflict 19 | - id: detect-private-key 20 | - id: check-executables-have-shebangs 21 | 22 | - repo: https://github.com/asottile/pyupgrade 23 | rev: v2.29.1 24 | hooks: 25 | - id: pyupgrade 26 | exclude: ^(TAG) 27 | # Upgrade syntax to Python ≥ 3.11, dropping compatibility code for older versions 28 | args: [--py311-plus] 29 | 30 | - repo: https://github.com/astral-sh/ruff-pre-commit 31 | # Ruff version 32 | rev: v0.11.13 33 | hooks: 34 | # Run the linter 35 | - id: ruff-check 36 | # Fix lint issues, including docstrings, import sorting, and commented-out code 37 | args: ["--extend-select", "D,I,ERA001", "--line-length", "120", "--fix"] 38 | # Run the formatter 39 | - id: ruff-format 40 | 41 | # Type checking 42 | - repo: local 43 | hooks: 44 | - id: ty-check 45 | name: Type checking 46 | entry: bash -c 'uvx ty check --config-file ty.toml' 47 | language: system 48 | -------------------------------------------------------------------------------- /CNAME: -------------------------------------------------------------------------------- 1 | sieves.ai 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Mantis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | codecov: 2 | require_ci_to_pass: true 3 | 4 | coverage: 5 | # How coverage metrics are presented 6 | precision: 2 7 | round: down 8 | range: "70...100" 9 | 10 | # Define coverage thresholds. 11 | status: 12 | project: 13 | default: 14 | # Fails CI if coverage < 90% 15 | target: 90 16 | patch: 17 | default: 18 | target: 10 19 | 20 | # Optionally ignore specific files/dirs. 
21 | ignore: 22 | - "sieves/tests" 23 | - "docs" 24 | - "**/test_*.py" 25 | - "sieves/engines/engine_import.py" 26 | - "sieves/tasks/postprocessing/distillation/distillation_import.py" 27 | 28 | comment: 29 | layout: "diff, flags, files" 30 | behavior: default 31 | require_changes: false # learn more in the Requiring Changes section below 32 | require_base: false # [true :: must have a base report to post] 33 | require_head: true # [true :: must have a head report to post] 34 | hide_project_coverage: true # [true :: only show coverage on the git diff] 35 | 36 | parsers: 37 | # Example parser config 38 | gcov: 39 | branch_detection: 40 | conditional: yes 41 | loop: yes 42 | method: no 43 | macro: no 44 | -------------------------------------------------------------------------------- /docs/about.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | For any feedback, feature requests, contributions etc. use our [GitHub issue tracker](https://github.com/MantisAI/sieves/issues). 4 | 5 | `sieves` is maintained by [Mantis](https://mantisnlp.com), an AI consultancy. We help our clients to solve business problems related to 6 | natural human language and speech. If that's something you're interested in - [drop us a line](https://mantisnlp.com/contact/#cta)! -------------------------------------------------------------------------------- /docs/assets/Presentation_PyData_Amsterdam_2025.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MantisAI/sieves/3ce96055c1343849909650265de2b5e7d98745b8/docs/assets/Presentation_PyData_Amsterdam_2025.pdf -------------------------------------------------------------------------------- /docs/assets/generate_text_png.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | from PIL import Image, ImageDraw, ImageFont 3 | 4 | 5 | def create_sieve_pattern(width, height, dot_radius=5, spacing=20): 6 | """ 7 | Create an RGBA image (black background, alpha=255) 8 | with transparent polka-dot holes (alpha=0). 9 | """ 10 | # Start with a fully opaque black image 11 | pattern = Image.new("RGBA", (width, height), (0, 0, 0, 255)) 12 | draw = ImageDraw.Draw(pattern) 13 | 14 | # "Punch out" holes by drawing circles with (0,0,0,0) = transparent 15 | for y in range(0, height, spacing): 16 | for x in range(0, width, spacing): 17 | left = x - dot_radius 18 | top = y - dot_radius 19 | right = x + dot_radius 20 | bottom = y + dot_radius 21 | draw.ellipse([left, top, right, bottom], fill=(0, 0, 0, 0)) 22 | 23 | return pattern 24 | 25 | 26 | def create_text_mask(text, font_path, font_size, image_size): 27 | """ 28 | Create a grayscale (L-mode) mask with white text on black background. 29 | White = 255 => opaque region, black = 0 => transparent region. 
30 | """ 31 | mask_img = Image.new("L", image_size, color=0) # black by default 32 | draw = ImageDraw.Draw(mask_img) 33 | 34 | font = ImageFont.truetype(font_path, font_size) 35 | 36 | # Use textbbox in newer Pillow (10.0+), since textsize is deprecated 37 | bbox = draw.textbbox((0, 0), text, font=font) 38 | text_width = bbox[2] - bbox[0] 39 | text_height = bbox[3] - bbox[1] 40 | 41 | x_pos = (image_size[0] - text_width) // 2 42 | y_pos = (image_size[1] - text_height) // 2 43 | 44 | # White text on black background 45 | draw.text((x_pos, y_pos), text, fill=255, font=font) 46 | 47 | return mask_img 48 | 49 | 50 | def create_sieve_text_image(text, font_path, output_path="sieve_text.png"): 51 | width, height = 800, 400 52 | 53 | # 1) Create the “sieve” pattern (black with transparent holes) 54 | pattern_img = create_sieve_pattern(width, height, dot_radius=3, spacing=18) 55 | 56 | # 2) Create a text mask (white text on black background, "L" mode) 57 | text_mask = create_text_mask(text=text, font_path=font_path, font_size=100, image_size=(width, height)) 58 | 59 | # 3) Create a transparent canvas 60 | canvas = Image.new("RGBA", (width, height), (0, 0, 0, 0)) 61 | 62 | # 4) Paste the pattern onto the canvas wherever text_mask is non-zero 63 | # (i.e., where the text is white) 64 | canvas.paste(pattern_img, (0, 0), text_mask) 65 | 66 | # 5) Save 67 | canvas.save(output_path, "PNG") 68 | print(f"Saved sieve-style text with transparent holes to: {output_path}") 69 | 70 | 71 | # --------------------------------- 72 | # Example usage: 73 | # --------------------------------- 74 | if __name__ == "__main__": 75 | create_sieve_text_image( 76 | text="sieves", font_path="/home/raphael/.local/share/fonts/Hack-Bold.ttf", output_path="sieves_sieve_style.png" 77 | ) 78 | -------------------------------------------------------------------------------- /docs/assets/sieve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MantisAI/sieves/3ce96055c1343849909650265de2b5e7d98745b8/docs/assets/sieve.png -------------------------------------------------------------------------------- /docs/assets/sieves_sieve_style.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MantisAI/sieves/3ce96055c1343849909650265de2b5e7d98745b8/docs/assets/sieves_sieve_style.png -------------------------------------------------------------------------------- /docs/bridge.md: -------------------------------------------------------------------------------- 1 | # Bridge 2 | 3 | ::: sieves.tasks.predictive.bridges.Bridge 4 | ::: sieves.tasks.predictive.bridges.GliXBridge -------------------------------------------------------------------------------- /docs/doc.md: -------------------------------------------------------------------------------- 1 | # Doc 2 | 3 | ::: sieves.data.doc -------------------------------------------------------------------------------- /docs/engines/base_engine.md: -------------------------------------------------------------------------------- 1 | # Internal Engine 2 | 3 | ::: sieves.engines.core -------------------------------------------------------------------------------- /docs/engines/dspy.md: -------------------------------------------------------------------------------- 1 | # DSPy 2 | 3 | ::: sieves.engines.dspy_.DSPy -------------------------------------------------------------------------------- /docs/engines/gliner.md: 
-------------------------------------------------------------------------------- 1 | # GliNER 2 | 3 | ::: sieves.engines.glix_.GliX -------------------------------------------------------------------------------- /docs/engines/huggingface.md: -------------------------------------------------------------------------------- 1 | # Hugging Face 2 | 3 | ::: sieves.engines.huggingface_.HuggingFace -------------------------------------------------------------------------------- /docs/engines/langchain.md: -------------------------------------------------------------------------------- 1 | # LangChain 2 | 3 | ::: sieves.engines.langchain_.LangChain -------------------------------------------------------------------------------- /docs/engines/outlines.md: -------------------------------------------------------------------------------- 1 | # Outlines 2 | 3 | ::: sieves.engines.outlines_.Outlines -------------------------------------------------------------------------------- /docs/guides/getting_started.md: -------------------------------------------------------------------------------- 1 | # Getting Started 2 | 3 | This guide will help you get started with using `sieves` for zero-shot and few-shot NLP tasks with structured generation. 4 | 5 | ## Basic Concepts 6 | 7 | `sieves` is built around four main concepts: 8 | 9 | 1. **Documents (`Doc`)**: The basic unit of text that you want to process. A document can be created from text or a URI. 10 | 2. **Models + GenerationSettings**: You pass a model from your chosen backend (Outlines, DSPy, LangChain, etc.) and optional `GenerationSettings` (e.g., strict mode) 11 | 3. **Tasks**: NLP operations you want to perform on your documents (classification, information extraction, etc.) 12 | 4. **Pipeline**: A sequence of tasks that process your documents 13 | 14 | ## Quick Start Example 15 | 16 | Here's a simple example that performs text classification: 17 | 18 | ```python 19 | import outlines 20 | from sieves import Pipeline, tasks, Doc 21 | 22 | # Create a document 23 | doc = Doc(text="Special relativity applies to all physical phenomena in the absence of gravity.") 24 | 25 | # Choose a model (using a small but capable model) 26 | model = outlines.models.transformers("HuggingFaceTB/SmolLM-135M-Instruct") 27 | 28 | # Create and run the pipeline (verbose init) 29 | pipeline = Pipeline([ 30 | tasks.predictive.Classification( 31 | labels=["science", "politics"], 32 | model=model, 33 | ) 34 | ]) 35 | 36 | # Print the classification result 37 | for doc in pipeline([doc]): 38 | print(doc.results) 39 | 40 | # Alternatively: succinct chaining with + 41 | # (useful when you have multiple tasks) 42 | # classifier = tasks.predictive.Classification(labels=["science", "politics"], model=model) 43 | # pipeline = classifier # single-task pipeline 44 | # Note: set additional Pipeline params (e.g., use_cache=False) only via verbose init. 45 | ``` 46 | 47 | ## Working with Documents 48 | 49 | Documents can be created in several ways: 50 | 51 | ```python 52 | from sieves import Doc 53 | 54 | # From text 55 | doc = Doc(text="Your text here") 56 | 57 | # From a file (requires docling) 58 | doc = Doc(uri="path/to/your/file.pdf") 59 | 60 | # With metadata 61 | doc = Doc( 62 | text="Your text here", 63 | meta={"source": "example", "date": "2025-01-31"} 64 | ) 65 | ``` 66 | 67 | Note: File-based ingestion (Docling/Unstructured/Marker) is optional and not installed by default. 
To enable it, install the ingestion extra or the specific libraries you need: 68 | 69 | ```bash 70 | pip install "sieves[ingestion]" 71 | ``` 72 | 73 | ## Advanced Example: PDF Processing Pipeline 74 | 75 | Here's a more involved example that: 76 | 77 | 1. Starts from a document (prepend an ingestion task to parse an actual PDF) 78 | 2. Chunks it into smaller pieces 79 | 3. Performs information extraction on each chunk 80 | 81 | ```python 82 | import outlines 83 | import chonkie 84 | import tokenizers 85 | import pydantic 86 | from sieves import Pipeline, tasks, Doc 87 | 88 | # Create a tokenizer for chunking 89 | tokenizer = tokenizers.Tokenizer.from_pretrained("bert-base-uncased") 90 | 91 | # Initialize components 92 | chunker = tasks.preprocessing.Chonkie( 93 | chunker=chonkie.TokenChunker(tokenizer, chunk_size=512, chunk_overlap=50) 94 | ) 95 | 96 | # Choose a model for information extraction 97 | model = outlines.models.transformers("HuggingFaceTB/SmolLM-135M-Instruct") 98 | 99 | 100 | # Define the structure of information you want to extract 101 | class PersonInfo(pydantic.BaseModel): 102 | name: str 103 | age: int | None = None 104 | occupation: str | None = None 105 | 106 | 107 | # Create an information extraction task 108 | extractor = tasks.predictive.InformationExtraction( 109 | entity_type=PersonInfo, 110 | model=model, 111 | ) 112 | 113 | # Create the pipeline (verbose init) 114 | pipeline = Pipeline([chunker, extractor]) 115 | 116 | # Alternatively: succinct chaining (+) 117 | # pipeline = chunker + extractor 118 | # Note: to change Pipeline parameters (e.g., use_cache), use the verbose form 119 | # Pipeline([chunker, extractor], use_cache=False) 120 | 121 | # Process a document (plain text here for brevity - pass uri="..." plus an ingestion task for an actual PDF) 122 | doc = Doc(text="Marie Curie died at the age of 66 years.") 123 | results = list(pipeline([doc])) 124 | 125 | # Access the extracted information 126 | for result in results: 127 | print(result.results["InformationExtraction"]) 128 | ``` 129 | 130 | ## Supported Engines 131 | 132 | `sieves` supports multiple libraries for structured generation: 133 | 134 | - [`outlines`](https://github.com/outlines-dev/outlines) 135 | - [`dspy`](https://github.com/stanfordnlp/dspy) - also supports Ollama and vLLM integration via `api_base` 136 | - [`langchain`](https://github.com/langchain-ai/langchain) 137 | - [`gliner`](https://github.com/urchade/GLiNER) 138 | - [`transformers`](https://github.com/huggingface/transformers) 139 | 140 | You pass models from these libraries directly to `PredictiveTask`. Optionally, you can include `GenerationSettings` to 141 | override defaults. Batching is controlled per task via the `batch_size` argument (see below). 142 | 143 | ### GenerationSettings (optional) 144 | `GenerationSettings` controls engine behavior and is optional. 
Defaults: 145 | - strict_mode: False (on parse issues, return None instead of raising) 146 | - init_kwargs/inference_kwargs: None (use engine defaults) 147 | - config_kwargs: None (used by some backends like DSPy) 148 | 149 | Batching is configured on each task via `batch_size`: 150 | - `batch_size = -1` processes all inputs at once (default) 151 | - `batch_size = N` processes N docs per batch 152 | 153 | Example: 154 | 155 | ```python 156 | from sieves import GenerationSettings 157 | classifier = tasks.predictive.Classification( 158 | labels=["science", "politics"], 159 | model=model, 160 | generation_settings=GenerationSettings(strict_mode=True), 161 | batch_size=8, 162 | ) 163 | ``` 164 | -------------------------------------------------------------------------------- /docs/guides/serialization.md: -------------------------------------------------------------------------------- 1 | # Saving and Loading 2 | 3 | `sieves` provides functionality to save your pipeline configurations to disk and load them later. This is useful for: 4 | 5 | - Sharing pipeline configurations with others 6 | - Versioning your pipelines 7 | - Deploying pipelines to production 8 | 9 | ## Basic Pipeline Serialization 10 | 11 | Here's a simple example of saving and loading a classification pipeline: 12 | 13 | ```python 14 | import outlines 15 | from sieves import Pipeline, tasks, Doc 16 | from pathlib import Path 17 | 18 | # Create a basic classification pipeline 19 | model_name = "HuggingFaceTB/SmolLM-135M-Instruct" 20 | model = outlines.models.transformers(model_name) 21 | classifier = tasks.predictive.Classification(labels=["science", "politics"], model=model) 22 | pipeline = Pipeline([classifier]) 23 | 24 | # Save the pipeline configuration 25 | config_path = Path("classification_pipeline.yml") 26 | pipeline.dump(config_path) 27 | 28 | # Load the pipeline configuration 29 | loaded_pipeline = Pipeline.load(config_path, [{"model": outlines.models.transformers(model_name)}]) 30 | 31 | # Use the loaded pipeline 32 | doc = Doc(text="Special relativity applies to all physical phenomena in the absence of gravity.") 33 | results = list(loaded_pipeline([doc])) 34 | print(results[0].results["Classification"]) 35 | ``` 36 | 37 | ## Dealing with complex third-party objects 38 | 39 | `sieves` doesn't serialize complex third-party objects. 
When loading pipelines, you need to provide initialization parameters for each task: 40 | 41 | ```python 42 | import chonkie 43 | import tokenizers 44 | import outlines 45 | import pydantic 46 | from sieves import Pipeline, tasks 47 | 48 | # Create a tokenizer for chunking 49 | tokenizer = tokenizers.Tokenizer.from_pretrained("bert-base-uncased") 50 | chunker = tasks.preprocessing.Chonkie( 51 | chunker=chonkie.TokenChunker(tokenizer, chunk_size=512, chunk_overlap=50) 52 | ) 53 | 54 | model = outlines.models.transformers("HuggingFaceTB/SmolLM-135M-Instruct") 55 | 56 | 57 | class PersonInfo(pydantic.BaseModel): 58 | name: str 59 | age: int | None = None 60 | occupation: str | None = None 61 | 62 | 63 | extractor = tasks.predictive.InformationExtraction(entity_type=PersonInfo, model=model) 64 | 65 | # Create and save the pipeline 66 | pipeline = Pipeline([chunker, extractor]) 67 | pipeline.dump("extraction_pipeline.yml") 68 | 69 | # Load the pipeline with initialization parameters for each task 70 | loaded_pipeline = Pipeline.load( 71 | "extraction_pipeline.yml", 72 | [ 73 | {"tokenizer": tokenizers.Tokenizer.from_pretrained("bert-base-uncased")}, 74 | {"model": outlines.models.transformers("HuggingFaceTB/SmolLM-135M-Instruct")}, 75 | ] 76 | ) 77 | ``` 78 | 79 | ## Understanding Pipeline Configuration Files 80 | 81 | Pipeline configurations are saved as YAML files. Here's an example of what a configuration file looks like: 82 | 83 | ```yaml 84 | cls_name: sieves.pipeline.core.Pipeline 85 | version: 0.11.1 86 | tasks: 87 | is_placeholder: false 88 | value: 89 | - cls_name: sieves.tasks.preprocessing.chunkers.Chunker 90 | tokenizer: 91 | is_placeholder: true 92 | value: tokenizers.Tokenizer 93 | chunk_size: 94 | is_placeholder: false 95 | value: 512 96 | chunk_overlap: 97 | is_placeholder: false 98 | value: 50 99 | task_id: 100 | is_placeholder: false 101 | value: Chunker 102 | - cls_name: sieves.tasks.predictive.information_extraction.core.InformationExtraction 103 | engine: 104 | is_placeholder: false 105 | value: 106 | cls_name: sieves.engines.outlines_.Outlines 107 | model: 108 | is_placeholder: true 109 | value: outlines.models.transformers 110 | ``` 111 | 112 | The configuration file contains: 113 | 114 | - The full class path of the pipeline and its tasks 115 | - Version information 116 | - Task-specific parameters and their values 117 | - Placeholders for components that need to be provided during loading 118 | 119 | !!! info "Parameter management" 120 | 121 | When loading pipelines, provide all required initialization parameters (e.g. models) and ensure you're loading a pipeline with a compatible `sieves` version. `GenerationSettings` is optional unless you want to override defaults. 122 | 123 | !!! warning "Limitations" 124 | 125 | - Model weights are not saved in the configuration files 126 | - Complex third-party objects (everything beyond primitives or collections thereof) may not be serializable 127 | - API keys and credentials must be managed separately 128 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # `sieves` 2 | 3 | `sieves` is a Python library designed for zero-shot and few-shot NLP tasks that focuses on structured generation, 4 | allowing developers to build production-ready NLP prototypes without requiring training data. 
It provides a unified 5 | interface that wraps popular NLP tools (like `outlines`, `dspy`, `langchain`, and others) while ensuring structured 6 | outputs and observability. 7 | 8 | It bundles common NLP utilities, document parsing, and text chunking capabilities together with ready-to-use tasks like 9 | classification and information extraction, all organized in an observable pipeline architecture. It's particularly 10 | valuable for rapid prototyping scenarios where structured output is needed but training data is scarce. 11 | 12 | 13 | ## Quick Installation 14 | 15 | You can install `sieves` with different options depending on your needs: 16 | 17 | Core package with minimal dependencies: 18 | ```bash 19 | pip install sieves 20 | ``` 21 | Note: Ingestion libraries (document parsing such as `docling`, `unstructured`, `marker`) are not installed by default. Install them manually or use the ingestion extra: 22 | 23 | ```bash 24 | pip install "sieves[ingestion]" 25 | ``` 26 | 27 | The minimal setup lets you add only what you need to keep the footprint small. 28 | 29 | All optional dependencies for every feature, including engines, distillation, and ingestion: 30 | ```bash 31 | pip install "sieves[engines,distill,ingestion]" 32 | ``` 33 | 34 | ### Specific Features 35 | 36 | Document ingestion/parsing libraries (PDF/DOCX parsing, etc.): 37 | ```bash 38 | pip install "sieves[ingestion]" 39 | ``` 40 | 41 | All supported engines: 42 | ```bash 43 | pip install "sieves[engines]" 44 | ``` 45 | 46 | ### Development Setup 47 | 48 | 1. Set up [`uv`](https://github.com/astral-sh/uv). 49 | 2. Install all dependencies for development, testing, and documentation generation with: `uv pip install --system .[engines,distill,ingestion,test]`. 50 | 51 | ## Core Concepts 52 | 53 | `sieves` is built around five key components: 54 | 55 | 1. **`Pipeline`**: The main orchestrator that runs your NLP tasks sequentially (define with `Pipeline([...])` or chain with `+`) 56 | 2. **`Task`**: Pre-built or custom NLP operations (classification, extraction, etc.) 57 | 3. **`Engine`**: Backend implementations that power the tasks (outlines, dspy, langchain, etc.) 58 | 4. **`Bridge`**: Connectors between Tasks and Engines 59 | 5. **`Doc`**: The fundamental data structure for document processing (a minimal end-to-end sketch follows the guide list below) 60 | 61 | ## Essential Links 62 | 63 | - [GitHub Repository](https://github.com/mantisai/sieves) 64 | - [PyPI Package](https://pypi.org/project/sieves/) 65 | - [Issue Tracker](https://github.com/mantisai/sieves/issues) 66 | 67 | ## Guides 68 | 69 | We've prepared several guides to help you get up to speed quickly: 70 | 71 | - [Getting Started](guides/getting_started.md) - Start here! Learn the basic concepts and create your first pipeline. 72 | - [Document Preprocessing](guides/preprocessing.md) - Master document parsing, chunking, and text standardization. 73 | - [Creating Custom Tasks](guides/custom_tasks.md) - Learn to create your own tasks when the built-in ones aren't enough. 74 | - [Saving and Loading Pipelines](guides/serialization.md) - Version and share your pipeline configurations. 75 | - [Task Optimization](guides/optimization.md) - Improve task performance by optimizing prompts and few-shot examples. 76 | - [Task Distillation](guides/distillation.md) - Fine-tune smaller, specialized models using zero-shot task results. 
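Below is the minimal sketch referenced under Core Concepts - it wires a `Doc`, a `Task`, and a `Pipeline` together end to end. The model and labels are illustrative; the snippet condenses the example from the Getting Started guide:

```python
import outlines

from sieves import Doc, Pipeline, tasks

# Any supported backend works; a small Hugging Face model via Outlines is assumed here.
model = outlines.models.transformers("HuggingFaceTB/SmolLM-135M-Instruct")

# A task wraps the model with a concrete NLP operation; the pipeline runs tasks over docs.
pipeline = Pipeline([tasks.predictive.Classification(labels=["science", "politics"], model=model)])

docs = [Doc(text="Special relativity applies to all physical phenomena in the absence of gravity.")]
for doc in pipeline(docs):
    print(doc.results["Classification"])
```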
77 | 78 | ## Getting Help 79 | 80 | - Check our [GitHub Issues](https://github.com/mantisai/sieves/issues) for common problems 81 | - Review the documentation in the `/docs/guides/` directory 82 | - Join our community discussions (link to be added) 83 | 84 | ## Next Steps 85 | 86 | - Dive into our guides, starting with the [Getting Started Guide](guides/getting_started.md) 87 | - Check out example pipelines in our repository 88 | - Learn about custom task creation 89 | - Understand different engine configurations 90 | 91 | Consult the API reference for each component you're working with if you have specific questions. The reference pages contain detailed 92 | information about parameters, configurations, and best practices. 93 | -------------------------------------------------------------------------------- /docs/pipeline.md: -------------------------------------------------------------------------------- 1 | # Pipeline 2 | 3 | Pipelines orchestrate sequential execution of tasks and support two ways to define the sequence: 4 | 5 | - Verbose initialization using `Pipeline([...])` (allows setting parameters like `use_cache`) 6 | - Succinct chaining with `+` for readability 7 | 8 | Examples: 9 | 10 | ```python 11 | from sieves import Pipeline, tasks 12 | 13 | # Verbose initialization (allows non-default configuration). 14 | t_ingest = tasks.preprocessing.Ingestion(export_format="markdown") 15 | t_chunk = tasks.preprocessing.Chunking(chunker) 16 | t_cls = tasks.predictive.Classification(labels=["science", "politics"], model=model) 17 | pipe = Pipeline([t_ingest, t_chunk, t_cls], use_cache=True) 18 | 19 | # Succinct chaining (equivalent task order). 20 | pipe2 = t_ingest + t_chunk + t_cls 21 | 22 | # You can also chain pipelines and tasks. 23 | pipe_left = Pipeline([t_ingest]) 24 | pipe_right = Pipeline([t_chunk, t_cls]) 25 | pipe3 = pipe_left + pipe_right # results in [t_ingest, t_chunk, t_cls] 26 | 27 | # In-place append (mutates the left pipeline). 28 | pipe_left += t_chunk 29 | pipe_left += pipe_right # appends all tasks from right 30 | 31 | # Note: 32 | # - Additional Pipeline parameters (e.g., use_cache=False) are only settable via the verbose form 33 | # - Chaining never mutates existing tasks or pipelines; it creates a new Pipeline 34 | # - Using "+=" mutates the existing pipeline by appending tasks 35 | ``` 36 | 37 | Note: Ingestion libraries (e.g., Docling, Unstructured, Marker) are optional and not installed by default. 
Install them manually or via the extra: 38 | 39 | ```bash 40 | pip install "sieves[ingestion]" 41 | ``` 42 | 43 | ::: sieves.pipeline.core 44 | -------------------------------------------------------------------------------- /docs/tasks/predictive/classification.md: -------------------------------------------------------------------------------- 1 | # Classification 2 | 3 | ::: sieves.tasks.predictive.classification.core 4 | ::: sieves.tasks.predictive.classification.bridges -------------------------------------------------------------------------------- /docs/tasks/predictive/information_extraction.md: -------------------------------------------------------------------------------- 1 | # Information Extraction 2 | 3 | ::: sieves.tasks.predictive.information_extraction.core 4 | ::: sieves.tasks.predictive.information_extraction.bridges -------------------------------------------------------------------------------- /docs/tasks/predictive/ner.md: -------------------------------------------------------------------------------- 1 | # Named Entity Recognition 2 | 3 | ::: sieves.tasks.predictive.ner.core 4 | ::: sieves.tasks.predictive.ner.bridges -------------------------------------------------------------------------------- /docs/tasks/predictive/pii_masking.md: -------------------------------------------------------------------------------- 1 | # PII Masking 2 | 3 | ::: sieves.tasks.predictive.pii_masking.core 4 | ::: sieves.tasks.predictive.pii_masking.bridges -------------------------------------------------------------------------------- /docs/tasks/predictive/question_answering.md: -------------------------------------------------------------------------------- 1 | # Question Answering 2 | 3 | ::: sieves.tasks.predictive.question_answering.core 4 | ::: sieves.tasks.predictive.question_answering.bridges -------------------------------------------------------------------------------- /docs/tasks/predictive/sentiment_analysis.md: -------------------------------------------------------------------------------- 1 | # Sentiment Analysis 2 | 3 | ::: sieves.tasks.predictive.sentiment_analysis.core 4 | ::: sieves.tasks.predictive.sentiment_analysis.bridges -------------------------------------------------------------------------------- /docs/tasks/predictive/summarization.md: -------------------------------------------------------------------------------- 1 | # Summarization 2 | 3 | ::: sieves.tasks.predictive.summarization.core 4 | ::: sieves.tasks.predictive.summarization.bridges -------------------------------------------------------------------------------- /docs/tasks/predictive/translation.md: -------------------------------------------------------------------------------- 1 | # Translation 2 | 3 | ::: sieves.tasks.predictive.translation 4 | ::: sieves.tasks.predictive.translation.bridges -------------------------------------------------------------------------------- /docs/tasks/preprocessing/chunking/chonkie.md: -------------------------------------------------------------------------------- 1 | # Chonkie 2 | 3 | ::: sieves.tasks.preprocessing.chunking.chonkie_ -------------------------------------------------------------------------------- /docs/tasks/preprocessing/chunking/chunking.md: -------------------------------------------------------------------------------- 1 | # Chunker 2 | 3 | ::: sieves.tasks.preprocessing.chunking.core 4 | -------------------------------------------------------------------------------- /docs/tasks/preprocessing/chunking/naive.md: 
-------------------------------------------------------------------------------- 1 | # NaiveChunker 2 | 3 | ::: sieves.tasks.preprocessing.chunking.naive -------------------------------------------------------------------------------- /docs/tasks/preprocessing/ingestion/docling.md: -------------------------------------------------------------------------------- 1 | # Docling 2 | 3 | Note: This task depends on optional ingestion libraries, which are not installed by default. Install them via the ingestion extra, or install the library directly. 4 | 5 | Examples: 6 | 7 | ```bash 8 | pip install "sieves[ingestion]" # installs ingestion deps via extra 9 | # or install the library directly 10 | pip install docling 11 | ``` 12 | 13 | ::: sieves.tasks.preprocessing.ingestion.docling_ 14 | -------------------------------------------------------------------------------- /docs/tasks/preprocessing/ingestion/ingestion.md: -------------------------------------------------------------------------------- 1 | # Ingestion 2 | 3 | ::: sieves.tasks.preprocessing.ingestion.core 4 | -------------------------------------------------------------------------------- /docs/tasks/preprocessing/ingestion/marker.md: -------------------------------------------------------------------------------- 1 | # Marker 2 | 3 | Note: This task depends on optional ingestion libraries that are not installed by default. You can install them via the ingestion extra, or install the library directly. 4 | 5 | Examples: 6 | 7 | ```bash 8 | pip install "sieves[ingestion]" # installs ingestion deps via extra 9 | # or install the library directly (e.g., the Marker PDF package) 10 | pip install "marker-pdf[full]" # the package pinned by the ingestion extra 11 | ``` 12 | 13 | ::: sieves.tasks.preprocessing.ingestion.marker_ 14 | -------------------------------------------------------------------------------- /docs/tasks/preprocessing/ingestion/unstructured.md: -------------------------------------------------------------------------------- 1 | # unstructured 2 | 3 | Note: This task depends on optional ingestion libraries that are not installed by default. Install them with the ingestion extra, or install the specific library directly if you prefer. 4 | 5 | Examples: 6 | 7 | ```bash 8 | pip install "sieves[ingestion]" # installs ingestion deps via extra 9 | # or install the library directly 10 | pip install unstructured # choose extras as needed 11 | ``` 12 | 13 | ::: sieves.tasks.preprocessing.ingestion.unstructured_ 14 | -------------------------------------------------------------------------------- /docs/tasks/task.md: -------------------------------------------------------------------------------- 1 | # Task 2 | 3 | ::: sieves.tasks.core.Task -------------------------------------------------------------------------------- /examples/pydata_amsterdam_demo.py: -------------------------------------------------------------------------------- 1 | """Demo for PyData Amsterdam 2025. 
2 | 3 | Required additional dependencies: 4 | - openai 5 | - outlines 6 | """ 7 | 8 | import os 9 | from collections import defaultdict 10 | from pprint import pprint 11 | from typing import Literal 12 | 13 | import openai 14 | import outlines 15 | import pydantic 16 | 17 | from sieves import Doc, tasks 18 | 19 | 20 | class Country(pydantic.BaseModel, frozen=True): 21 | """Describes a country and its stance on the chat control proposal.""" 22 | 23 | name: str 24 | in_eu: bool 25 | stance_on_chat_control_proposal: Literal["pro", "undecided", "contra", "unknown"] 26 | 27 | 28 | if __name__ == '__main__': 29 | docs = [ 30 | Doc( 31 | uri="https://www.techradar.com/computing/cyber-security/chat-control-the-list-of-countries-opposing-the-" 32 | "law-grows-but-support-remains-strong" 33 | ) 34 | ] 35 | 36 | model = outlines.from_openai( 37 | openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"]), 38 | model_name="gpt-5-mini" 39 | ) 40 | 41 | pipe = tasks.Ingestion() + tasks.InformationExtraction(entity_type=Country, model=model) 42 | 43 | for doc in pipe(docs): 44 | countries = defaultdict(list) 45 | for country in doc.results["InformationExtraction"]: 46 | assert isinstance(country, Country) 47 | if country.in_eu: 48 | countries[country.stance_on_chat_control_proposal].append(country.name) 49 | 50 | pprint(countries) 51 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: sieves 2 | site_url: https://sieves.ai/docs 3 | docs_dir: docs 4 | repo_url: https://github.com/mantisai/sieves 5 | repo_name: "sieves on GitHub" 6 | 7 | theme: 8 | name: material 9 | features: 10 | - navigation.top 11 | - navigation.tabs 12 | - navigation.footer 13 | - header.autohide 14 | palette: 15 | - scheme: slate 16 | toggle: 17 | icon: material/weather-sunny 18 | name: Switch to light mode 19 | - scheme: default 20 | toggle: 21 | icon: material/weather-night 22 | name: Switch to dark mode 23 | logo: 'assets/sieve.png' 24 | 25 | plugins: 26 | - search 27 | - mkdocstrings: 28 | default_handler: python 29 | handlers: 30 | python: 31 | options: 32 | docstring_style: sphinx 33 | show_source: true 34 | inherited_members: true 35 | extra: 36 | signatures: true 37 | 38 | markdown_extensions: 39 | - admonition 40 | - codehilite 41 | - pymdownx.superfences 42 | 43 | nav: 44 | - Home: 45 | - index.md 46 | - about.md 47 | - API: 48 | - pipeline.md 49 | - doc.md 50 | - bridge.md 51 | - Tasks: 52 | - tasks/task.md 53 | - Preprocessing: 54 | - Ingestion: 55 | - tasks/preprocessing/ingestion/ingestion.md 56 | - tasks/preprocessing/ingestion/docling.md 57 | - tasks/preprocessing/ingestion/unstructured.md 58 | - tasks/preprocessing/ingestion/marker.md 59 | - Chunking: 60 | - tasks/preprocessing/chunking/chunking.md 61 | - tasks/preprocessing/chunking/chonkie.md 62 | - tasks/preprocessing/chunking/naive.md 63 | 64 | - Predictive: 65 | - tasks/predictive/classification.md 66 | - tasks/predictive/information_extraction.md 67 | - tasks/predictive/ner.md 68 | - tasks/predictive/pii_masking.md 69 | - tasks/predictive/question_answering.md 70 | - tasks/predictive/sentiment_analysis.md 71 | - tasks/predictive/summarization.md 72 | - tasks/predictive/translation.md 73 | 74 | - Engines: 75 | - engines/base_engine.md 76 | - All Engines: 77 | - engines/dspy.md 78 | - engines/gliner.md 79 | - engines/huggingface.md 80 | - engines/langchain.md 81 | - engines/outlines.md 82 | - Guides: 83 | - 
guides/getting_started.md 84 | - guides/preprocessing.md 85 | - guides/serialization.md 86 | - guides/custom_tasks.md 87 | - guides/optimization.md 88 | - guides/distillation.md 89 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "setuptools-scm"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "sieves" 7 | readme = "README.md" 8 | description = "Plug-and-play, zero-shot document processing pipelines." 9 | license = { text = "MIT" } 10 | dynamic = ["version", "authors"] 11 | requires-python = ">=3.12" 12 | dependencies = [ 13 | "chonkie>=1,<2", 14 | "datasets>=3,<4", 15 | "jinja2>=3,<4", 16 | "loguru>=0.7,<1", 17 | "outlines>=1,<2", 18 | "dspy-ai>=2,<3", 19 | "dspy>=2,<3", 20 | "pydantic>=2,<3", 21 | ] 22 | classifiers = [ 23 | "Development Status :: 3 - Alpha", 24 | "Intended Audience :: Developers", 25 | "Topic :: Software Development :: Libraries", 26 | "License :: OSI Approved :: MIT License", 27 | "Programming Language :: Python :: 3", 28 | "Programming Language :: Python :: 3.11", 29 | "Programming Language :: Python :: 3.12" 30 | ] 31 | 32 | [project.optional-dependencies] 33 | ingestion = [ 34 | "docling>=2,<3", 35 | "marker-pdf[full]>=1.6.1,<2", 36 | "nltk>=3.9.1", 37 | "unstructured-inference>=0.8,<1", 38 | "unstructured[all-docs]>=0.16,<1", 39 | ] 40 | engines = [ 41 | "accelerate>1.2,<2", 42 | "gliner<1", 43 | "langchain-core>=0.3,<0.4", 44 | "langchain>=0.3,<0.4", 45 | "nest-asyncio>=1,<2", 46 | "sentencepiece<1", 47 | "transformers>=4,<5", 48 | ] 49 | distill = [ 50 | "setfit>=1.1,<2", 51 | "model2vec[train]>0.4,<0.5", 52 | ] 53 | test = [ 54 | "pre-commit>=4,<5", 55 | "pytest>=7,<8", 56 | "mypy>=1", 57 | "mypy-extensions>=1", 58 | "pytest-cov>=6", 59 | "anthropic>=0.45,<1", 60 | "langchain-community>=0.3.31,<0.4", 61 | "langchain-openai>=0.3.35", 62 | # "tesseract>=0.1,<1", 63 | # For generating documentation. 64 | "mkdocstrings[python]>=0.27,<1", 65 | "mkdocs-material>=9.6,<10", 66 | "pre-commit>=4,<5" 67 | ] 68 | 69 | [tool.ruff] 70 | line-length = 120 71 | target-version = "py312" 72 | 73 | # Exclude a variety of commonly ignored directories. 74 | exclude = [ 75 | ".bzr", 76 | ".direnv", 77 | ".eggs", 78 | ".git", 79 | ".git-rewrite", 80 | ".hg", 81 | ".mypy_cache", 82 | ".nox", 83 | ".pants.d", 84 | ".pytype", 85 | ".ruff_cache", 86 | ".svn", 87 | ".tox", 88 | ".venv", 89 | "__pypackages__", 90 | "_build", 91 | "buck-out", 92 | "build", 93 | "dist", 94 | "node_modules", 95 | "venv", 96 | ".venv", 97 | "sieves/tests/**", 98 | # Ignore examples. 99 | "examples/**" 100 | ] 101 | 102 | [tool.ruff.lint] 103 | # Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) rules 104 | select = ["E", "F", "I", "UP"] 105 | extend-select = ["D"] 106 | ignore = ["D203", "D212"] 107 | 108 | # Allow autofix for all enabled rules (when `--fix` is provided). 
109 | fixable = ["ALL"] 110 | unfixable = ["F401"] 111 | 112 | [tool.ruff.lint.pydocstyle] 113 | convention = "pep257" 114 | 115 | [tool.ruff.lint.mccabe] 116 | max-complexity = 10 117 | 118 | [tool.ruff.lint.isort] 119 | known-first-party = ["sieves"] 120 | 121 | [tool.mypy] 122 | python_version = "3.12" 123 | strict = true 124 | warn_return_any = true 125 | warn_unused_configs = true 126 | disallow_untyped_defs = true 127 | disallow_incomplete_defs = true 128 | check_untyped_defs = true 129 | disallow_untyped_decorators = true 130 | no_implicit_optional = true 131 | warn_redundant_casts = true 132 | warn_unused_ignores = true 133 | warn_no_return = true 134 | warn_unreachable = true 135 | allow_untyped_globals = false 136 | allow_redefinition = false 137 | implicit_reexport = false 138 | strict_equality = true 139 | ignore_missing_imports = true 140 | 141 | # Per-module ignores for third-party libraries 142 | [[tool.mypy.overrides]] 143 | module = ["outlines.*", "docling.*", "chonkie.*", "tqdm.*", "dspy.*"] 144 | ignore_missing_imports = true 145 | follow_imports = "skip" 146 | 147 | [tool.pytest.ini_options] 148 | markers = [ 149 | 'slow: marks tests as slow (deselect with -m "not slow")' 150 | ] 151 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = sieves 3 | version = 0.17.0 4 | author = Matthew Upson, Nick Sorros, Raphael Mitsch, Matthew Maufe, Angelo Di Gianvito 5 | author_email = hi@mantisnlp.com 6 | long_description = file: README.md 7 | long_description_content_type = text/markdown 8 | url = https://github.com/MantisAI/sieves 9 | 10 | [options] 11 | packages = find: 12 | python_requires = >=3.12 13 | install_requires = 14 | 15 | [options.packages.find] 16 | where = sieves 17 | 18 | [coverage:run] 19 | source = sieves/ 20 | omit = *__init__* 21 | 22 | [coverage:report] 23 | show_missing = True 24 | precision = 2 25 | sort = Miss 26 | 27 | [mypy-examples.*] 28 | follow_imports = skip 29 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Setup script for the Sieves package.""" 3 | 4 | if __name__ == "__main__": 5 | from setuptools import find_packages, setup 6 | 7 | setup(name="sieves", packages=find_packages()) 8 | -------------------------------------------------------------------------------- /sieves/__init__.py: -------------------------------------------------------------------------------- 1 | """Sieves.""" 2 | 3 | import sieves.tasks as tasks 4 | from sieves.data import Doc 5 | 6 | from .engines import GenerationSettings 7 | from .pipeline import Pipeline 8 | 9 | __all__ = ["Doc", "GenerationSettings", "tasks", "Pipeline"] 10 | -------------------------------------------------------------------------------- /sieves/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .doc import Doc 2 | 3 | __all__ = ["Doc"] 4 | -------------------------------------------------------------------------------- /sieves/data/doc.py: -------------------------------------------------------------------------------- 1 | """Doc implementation, types and utilities.""" 2 | 3 | from __future__ import annotations 4 | 5 | import dataclasses 6 | from pathlib import Path 7 | from typing import Any, Literal 8 | 9 | from datasets import Dataset 10 | 
from PIL import Image, ImageChops 11 | 12 | Field = Literal["meta", "results", "uri", "text", "chunks", "id", "images"] 13 | 14 | 15 | @dataclasses.dataclass 16 | class Doc: 17 | """A document holding data to be processed.""" 18 | 19 | meta: dict[str, Any] = dataclasses.field(default_factory=dict) 20 | results: dict[str, Any] = dataclasses.field(default_factory=dict) 21 | uri: Path | str | None = None 22 | text: str | None = None 23 | chunks: list[str] | None = None 24 | id: str | None = None 25 | images: list[Image.Image] | None = None 26 | 27 | def __post_init__(self) -> None: 28 | """Initialize chunks.""" 29 | if self.chunks is None and self.text is not None: 30 | self.chunks = [self.text] 31 | 32 | @staticmethod 33 | def _are_images_equal(im1: Image.Image | None, im2: Image.Image | None) -> bool: 34 | """Check if two images are equal using PIL Image Channel operations. 35 | 36 | :param im1: First PIL image to compare. 37 | :param im2: Second PIL image to compare. 38 | :return bool: True if images are equal, False otherwise. 39 | """ 40 | if im1 is None and im2 is None: 41 | return True 42 | if im1 is None or im2 is None: 43 | return False 44 | if im1.size != im2.size or im1.mode != im2.mode: 45 | return False 46 | return ImageChops.difference(im1, im2).getbbox() is None 47 | 48 | def __eq__(self, other: object) -> bool: 49 | """Compare two `Doc` instances. 50 | 51 | :return: True if `self` is equal to `other`. 52 | :raises NotImplementedError: if `other` isn't of type `Doc`. 53 | """ 54 | if not isinstance(other, Doc): 55 | raise NotImplementedError 56 | 57 | # Check if images are equal 58 | images_equal_check = False 59 | if self.images is None and other.images is None: 60 | images_equal_check = True 61 | elif self.images is None or other.images is None: 62 | images_equal_check = False 63 | elif self.images is not None and other.images is not None: 64 | if len(self.images) == len(other.images): 65 | images_equal_check = all( 66 | self._are_images_equal(im1, im2) for im1, im2 in zip(self.images, other.images) 67 | ) 68 | else: 69 | images_equal_check = False 70 | return ( 71 | self.id == other.id 72 | and self.uri == other.uri 73 | and self.text == other.text 74 | and self.chunks == other.chunks 75 | and self.results == other.results 76 | and images_equal_check 77 | ) 78 | 79 | @classmethod 80 | def from_hf_dataset(cls, dataset: Dataset, column_map: dict[Field, Any] | None = None) -> list[Doc]: 81 | """Generate list of docs from Hugging Face `datasets.Dataset`. 82 | 83 | :param dataset: Dataset to generate `Doc` instances from. Unless `column_map` specifies otherwise, dataset 84 | must contain at least one column named "text". 85 | :param column_map: Which `Doc` attribute to map to which attribute in `dataset`. If None, the mapping "text" -> 86 | "text" is assumed. 87 | :return: List of `Doc` instances, each representing one row in the dataset. 88 | :raises KeyError: If expected columns are not present in the dataset columns. 
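Example (an illustrative sketch - the "content" column name is arbitrary): >>> from datasets import Dataset >>> dataset = Dataset.from_dict({"content": ["Hello world"]}) >>> docs = Doc.from_hf_dataset(dataset, column_map={"text": "content"}) 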
89 | """ 90 | if column_map is None: 91 | column_map = {"text": "text"} 92 | 93 | missing_cols = set(column_map.values()) - set(dataset.column_names) 94 | if len(missing_cols): 95 | raise KeyError(f"Specified columns '{missing_cols}' not found in dataset columns: {dataset.column_names}.") 96 | 97 | docs: list[Doc] = [] 98 | for row in dataset: 99 | docs.append(cls(**{doc_col: row.get(data_col) for doc_col, data_col in column_map.items()})) # type: ignore[misc] 100 | 101 | return docs 102 | -------------------------------------------------------------------------------- /sieves/engines/__init__.py: -------------------------------------------------------------------------------- 1 | """Engines.""" 2 | 3 | from __future__ import annotations 4 | 5 | from .core import Engine, EngineInferenceMode, EngineModel, EnginePromptSignature, EngineResult 6 | from .engine_import import ( 7 | DSPy, 8 | GliX, 9 | HuggingFace, 10 | LangChain, 11 | Outlines, 12 | dspy_, 13 | glix_, 14 | huggingface_, 15 | langchain_, 16 | outlines_, 17 | ) 18 | from .engine_type import EngineType 19 | from .types import GenerationSettings 20 | 21 | __all__ = [ 22 | "dspy_", 23 | "DSPy", 24 | "EngineInferenceMode", 25 | "EngineModel", 26 | "EnginePromptSignature", 27 | "EngineType", 28 | "EngineResult", 29 | "Engine", 30 | "GenerationSettings", 31 | "glix_", 32 | "GliX", 33 | "langchain_", 34 | "LangChain", 35 | "huggingface_", 36 | "HuggingFace", 37 | "outlines_", 38 | "Outlines", 39 | ] 40 | -------------------------------------------------------------------------------- /sieves/engines/core.py: -------------------------------------------------------------------------------- 1 | """Engine core interfaces and base classes used by backends.""" 2 | 3 | from __future__ import annotations 4 | 5 | import abc 6 | import asyncio 7 | import enum 8 | from collections.abc import Awaitable, Callable, Coroutine, Iterable, Sequence 9 | from typing import Any, Generic, Protocol, TypeVar, override 10 | 11 | import jinja2 12 | import pydantic 13 | 14 | from sieves.engines.types import GenerationSettings 15 | 16 | EnginePromptSignature = TypeVar("EnginePromptSignature") 17 | EngineModel = TypeVar("EngineModel") 18 | EngineResult = TypeVar("EngineResult", covariant=True) 19 | EngineInferenceMode = TypeVar("EngineInferenceMode", bound=enum.Enum) 20 | 21 | 22 | class Executable(Protocol[EngineResult]): 23 | """Callable protocol representing a compiled prompt executable.""" 24 | 25 | def __call__(self, values: Sequence[dict[str, Any]]) -> Iterable[EngineResult | None]: 26 | """Execute prompt executable for given values. 27 | 28 | :param values: Values to inject into prompts. 29 | :return: Results for prompts. 30 | """ 31 | ... 32 | 33 | 34 | class Engine(Generic[EnginePromptSignature, EngineResult, EngineModel, EngineInferenceMode]): 35 | """Base class for engines wrapping model invocation and batching.""" 36 | 37 | def __init__(self, model: EngineModel, generation_settings: GenerationSettings): 38 | """Initialize engine with model and generation settings. 39 | 40 | :param model: Instantiated model instance. 41 | :param generation_settings: Generation settings. 
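Example (illustrative - any concrete engine subclass is constructed this way; the model choice is arbitrary): >>> import dspy >>> from sieves.engines import DSPy, GenerationSettings >>> engine = DSPy(model=dspy.LM("openai/gpt-4o-mini"), generation_settings=GenerationSettings()) 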
42 |         """
43 |         self._model = model
44 |         self._generation_settings = generation_settings
45 |         self._inference_kwargs = generation_settings.inference_kwargs or {}
46 |         self._init_kwargs = generation_settings.init_kwargs or {}
47 |         self._strict_mode = generation_settings.strict_mode
48 | 
49 |     @property
50 |     def generation_settings(self) -> GenerationSettings:
51 |         """Return generation settings.
52 | 
53 |         :return: Generation settings.
54 |         """
55 |         return self._generation_settings
56 | 
57 |     @property
58 |     def model(self) -> EngineModel:
59 |         """Return model instance.
60 | 
61 |         :return: Model instance.
62 |         """
63 |         return self._model
64 | 
65 |     @property
66 |     @abc.abstractmethod
67 |     def supports_few_shotting(self) -> bool:
68 |         """Return whether engine supports few-shotting.
69 | 
70 |         :return: Whether engine supports few-shotting.
71 |         """
72 | 
73 |     @property
74 |     @abc.abstractmethod
75 |     def inference_modes(self) -> type[EngineInferenceMode]:
76 |         """Return supported inference modes.
77 | 
78 |         :return: Supported inference modes.
79 |         """
80 | 
81 |     @abc.abstractmethod
82 |     def build_executable(
83 |         self,
84 |         inference_mode: EngineInferenceMode,
85 |         prompt_template: str | None,
86 |         prompt_signature: type[EnginePromptSignature] | EnginePromptSignature,
87 |         fewshot_examples: Sequence[pydantic.BaseModel] = (),
88 |     ) -> Executable[EngineResult | None]:
89 |         """Return a prompt executable for the given signature and mode.
90 | 
91 |         This wraps the engine-native generation callable (e.g. DSPy Predict,
92 |         Outlines Generator) with Sieves' uniform interface.
93 |         :param inference_mode: Inference mode to use (e.g. classification, JSON, ... - this is engine-specific).
94 |         :param prompt_template: Prompt template.
95 |         :param prompt_signature: Expected prompt signature type.
96 |         :param fewshot_examples: Few-shot examples.
97 |         :return: Prompt executable.
98 |         """
99 | 
100 |     @staticmethod
101 |     def convert_fewshot_examples(fewshot_examples: Sequence[pydantic.BaseModel]) -> list[dict[str, Any]]:
102 |         """Convert few-shot examples to dicts.
103 | 
104 |         :param fewshot_examples: Fewshot examples to convert.
105 |         :return: Fewshot examples as dicts.
106 |         """
107 |         return [fs_example.model_dump(serialize_as_any=True) for fs_example in fewshot_examples]
108 | 
109 |     @staticmethod
110 |     async def _execute_async_calls(calls: list[Coroutine[Any, Any, Any]] | list[Awaitable[Any]]) -> Any:
111 |         """Execute a batch of async functions.
112 | 
113 |         :param calls: Async calls to execute.
114 |         :return: Parsed response objects.
115 |         """
116 |         return await asyncio.gather(*calls)
117 | 
118 | 
119 | class PydanticEngine(abc.ABC, Engine[EnginePromptSignature, EngineResult, EngineModel, EngineInferenceMode]):
120 |     """Abstract super class for engines using Pydantic signatures and results.
121 | 
122 |     Note that this class also assumes the engine accepts a prompt. This holds true for most engines - the exceptions
123 |     are engines with an idiosyncratic way of processing prompts, like DSPy, and decoder-only models, which don't
124 |     work with object-based signatures anyway.
125 |     If and once we add support for a Pydantic-based engine that doesn't accept prompt templates, we'll adjust by
126 |     modifying `_infer()` to accept an additional parameter specifying how to handle prompt/instruction injection (and
127 |     we might have to make `supports_few_shotting()` engine-specific again).
128 |     """
129 | 
130 |     @classmethod
131 |     def _create_template(cls, template: str | None) -> jinja2.Template:
132 |         """Create Jinja2 template from template string.
133 | 
134 |         :param template: Template string.
135 |         :return: Jinja2 template.
136 |         """
137 |         assert template, f"prompt_template has to be provided to {cls.__name__}."
138 |         return jinja2.Template(template)
139 | 
140 |     @override
141 |     @property
142 |     def supports_few_shotting(self) -> bool:
143 |         return True
144 | 
145 |     def _infer(
146 |         self,
147 |         generator: Callable[[list[str]], Iterable[EngineResult]],
148 |         template: jinja2.Template,
149 |         values: Sequence[dict[str, Any]],
150 |         fewshot_examples: Sequence[pydantic.BaseModel],
151 |     ) -> Iterable[EngineResult | None]:
152 |         """Run inference in batches with exception handling.
153 | 
154 |         :param generator: Callable generating responses.
155 |         :param template: Prompt template.
156 |         :param values: Doc values to inject.
157 |         :param fewshot_examples: Fewshot examples.
158 |         :return: Results parsed from responses.
159 |         """
160 |         fewshot_examples_dict = Engine.convert_fewshot_examples(fewshot_examples)
161 |         examples = {"examples": fewshot_examples_dict} if len(fewshot_examples_dict) else {}
162 | 
163 |         try:
164 |             yield from generator([template.render(**doc_values, **examples) for doc_values in values])
165 | 
166 |         except Exception as err:
167 |             if self._strict_mode:
168 |                 raise type(err)(
169 |                     "Encountered problem when executing prompt. Ensure your few-shot examples and document "
170 |                     "chunks contain sensible information."
171 |                 ) from err
172 |             else:
173 |                 yield from (None for _ in range(len(values)))
174 | 
--------------------------------------------------------------------------------
/sieves/engines/dspy_.py:
--------------------------------------------------------------------------------
1 | """DSPy engine integration for Sieves."""
2 | 
3 | import asyncio
4 | import enum
5 | from collections.abc import Iterable, Sequence
6 | from typing import Any, override
7 | 
8 | import dspy
9 | import nest_asyncio
10 | import pydantic
11 | 
12 | from sieves.engines.core import Engine, Executable
13 | from sieves.engines.types import GenerationSettings
14 | 
15 | PromptSignature = dspy.Signature | dspy.Module
16 | Model = dspy.LM | dspy.BaseLM
17 | Result = dspy.Prediction
18 | 
19 | 
20 | nest_asyncio.apply()
21 | 
22 | 
23 | class InferenceMode(enum.Enum):
24 |     """Available inference modes.
25 | 
26 |     See https://dspy.ai/#__tabbed_2_6 for more information and examples.
27 |     """
28 | 
29 |     # Default inference mode.
30 |     predict = dspy.Predict
31 |     # CoT-style inference.
32 |     chain_of_thought = dspy.ChainOfThought
33 |     # Agentic, i.e. with tool use.
34 |     react = dspy.ReAct
35 |     # For multi-stage pipelines within a task. This is handled differently than the other supported modules: dspy.Module
36 |     # serves as both the signature as well as the inference generator.
37 |     module = dspy.Module
38 | 
39 | 
40 | class DSPy(Engine[PromptSignature, Result, Model, InferenceMode]):
41 |     """Engine for DSPy."""
42 | 
43 |     def __init__(self, model: Model, generation_settings: GenerationSettings):
44 |         """Initialize engine.
45 | 
46 |         :param model: Model to run. Note: DSPy only works with API-served models. To run a model locally (DSPy v2.5
47 |             onwards), serve it with Ollama - see https://dspy.ai/learn/programming/language_models/?h=models#__tabbed_1_5.
48 |             In a nutshell:
49 |             > curl -fsSL https://ollama.ai/install.sh | sh
50 |             > ollama run MODEL_ID
51 |             > `model = dspy.LM(MODEL_ID, api_base='http://localhost:11434', api_key='')`
52 |         :param generation_settings: Settings including DSPy configuration in `config_kwargs`.
53 |         """
54 |         super().__init__(model, generation_settings)
55 |         cfg = generation_settings.config_kwargs or {}
56 |         dspy.configure(lm=model, **cfg)
57 | 
58 |     @override
59 |     @property
60 |     def inference_modes(self) -> type[InferenceMode]:
61 |         return InferenceMode
62 | 
63 |     @override
64 |     @property
65 |     def supports_few_shotting(self) -> bool:
66 |         return True
67 | 
68 |     @override
69 |     def build_executable(
70 |         self,
71 |         inference_mode: InferenceMode,
72 |         prompt_template: str | None,  # noqa: UP007
73 |         prompt_signature: type[PromptSignature] | PromptSignature,
74 |         fewshot_examples: Sequence[pydantic.BaseModel] = tuple(),
75 |     ) -> Executable[Result | None]:
76 |         # Note: prompt_template is ignored here, as DSPy doesn't use it directly (only prompt_signature_description).
77 | 
78 |         # Handled differently than the other supported modules: dspy.Module serves as both the signature as well as
79 |         # the inference generator.
80 |         if inference_mode == InferenceMode.module:
81 |             # In module mode the prompt signature is a dspy.Module *instance*, not a type.
82 |             assert isinstance(prompt_signature, dspy.Module), ValueError(
83 |                 "In inference mode 'module' the provided prompt signature has to be a dspy.Module instance."
84 |             )
85 |             generator = inference_mode.value(**self._init_kwargs)
86 |         else:
87 |             assert isinstance(prompt_signature, type) and issubclass(prompt_signature, dspy.Signature)
88 |             generator = inference_mode.value(signature=prompt_signature, **self._init_kwargs)
89 | 
90 |         def execute(values: Sequence[dict[str, Any]]) -> Iterable[Result | None]:
91 |             """Execute structured generation with DSPy.
92 | 
93 |             :param values: Values to inject into prompts.
94 |             :return: Results for prompts.
95 |             """
96 |             # Compile predictor with few-shot examples.
97 |             fewshot_examples_dicts = DSPy.convert_fewshot_examples(fewshot_examples)
98 |             generator_fewshot: dspy.Module | None = None
99 |             if len(fewshot_examples_dicts):
100 |                 examples = [dspy.Example(**fs_example) for fs_example in fewshot_examples_dicts]
101 |                 generator_fewshot = dspy.LabeledFewShot(k=len(examples)).compile(student=generator, trainset=examples)
102 | 
103 |             try:
104 |                 gen = generator_fewshot or generator
105 |                 calls = [gen.acall(**doc_values, **self._inference_kwargs) for doc_values in values]
106 |                 yield from asyncio.run(self._execute_async_calls(calls))
107 | 
108 |             except Exception as err:
109 |                 if self._strict_mode:
110 |                     raise type(err)(
111 |                         "Encountered problem when executing prompt. Ensure your few-shot examples and document "
112 |                         "chunks contain sensible information."
113 |                     ) from err
114 |                 else:
115 |                     yield from [None] * len(values)
116 | 
117 |         return execute
118 | 
--------------------------------------------------------------------------------
/sieves/engines/engine_import.py:
--------------------------------------------------------------------------------
1 | """Import 3rd-party libraries required for engines.
2 | 
3 | If a library can't be found, a placeholder engine is imported instead.
4 | 
5 | This allows us to import everything downstream without having to worry about optional dependencies. If a user specifies
6 | an engine/model from a non-installed library, we terminate with an error.
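
A sketch of the fallback behavior (this assumes `outlines` is not installed):

    from sieves.engines import Outlines

    # The import above always succeeds. With `outlines` missing, `Outlines` is the
    # `MissingEngine` placeholder, whose methods raise `NotImplementedError`.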
7 | """ 8 | 9 | # mypy: disable-error-code="no-redef" 10 | 11 | import warnings 12 | 13 | from .missing import MissingEngine 14 | 15 | _missing_dependencies: list[str] = [] 16 | 17 | 18 | try: 19 | from . import dspy_ 20 | from .dspy_ import DSPy 21 | except ModuleNotFoundError: 22 | from . import missing as dspy_ 23 | 24 | DSPy = MissingEngine # type: ignore[misc,assignment] 25 | _missing_dependencies.append("dspy") 26 | 27 | 28 | try: 29 | from . import glix_ 30 | from .glix_ import GliX 31 | except ModuleNotFoundError: 32 | from . import missing as glix_ 33 | 34 | GliX = MissingEngine # type: ignore[misc,assignment] 35 | _missing_dependencies.append("gliner") 36 | 37 | 38 | try: 39 | from . import huggingface_ 40 | from .huggingface_ import HuggingFace 41 | except ModuleNotFoundError: 42 | from . import missing as huggingface_ 43 | 44 | HuggingFace = MissingEngine # type: ignore[misc,assignment] 45 | _missing_dependencies.append("transformers") 46 | 47 | 48 | try: 49 | from . import langchain_ 50 | from .langchain_ import LangChain 51 | except ModuleNotFoundError: 52 | from . import missing as langchain_ 53 | 54 | LangChain = MissingEngine # type: ignore[misc,assignment] 55 | _missing_dependencies.append("langchain") 56 | 57 | 58 | try: 59 | from . import outlines_ 60 | from .outlines_ import Outlines 61 | except ModuleNotFoundError: 62 | from . import missing as outlines_ 63 | 64 | Outlines = MissingEngine # type: ignore[misc,assignment] 65 | _missing_dependencies.append("outlines") 66 | 67 | 68 | if len(_missing_dependencies): 69 | warnings.warn( 70 | "Warning: structured generation dependencies [{deps}] could not be imported. Generating with them requires them" 71 | " to be installed.".format(deps=", ".join(_missing_dependencies)) 72 | ) 73 | 74 | 75 | __all__ = [ 76 | "dspy_", 77 | "DSPy", 78 | "glix_", 79 | "GliX", 80 | "huggingface_", 81 | "HuggingFace", 82 | "langchain_", 83 | "LangChain", 84 | "outlines_", 85 | "Outlines", 86 | ] 87 | -------------------------------------------------------------------------------- /sieves/engines/engine_type.py: -------------------------------------------------------------------------------- 1 | """Engine type enum and utilities.""" 2 | 3 | from __future__ import annotations 4 | 5 | import enum 6 | 7 | from .core import Engine, EngineInferenceMode, EngineModel, EnginePromptSignature, EngineResult 8 | from .engine_import import dspy_, glix_, huggingface_, langchain_, outlines_ 9 | 10 | 11 | class EngineType(enum.Enum): 12 | """Available engine types.""" 13 | 14 | dspy = dspy_.DSPy 15 | glix = glix_.GliX 16 | huggingface = huggingface_.HuggingFace 17 | langchain = langchain_.LangChain 18 | outlines = outlines_.Outlines 19 | 20 | @classmethod 21 | def all(cls) -> tuple[EngineType, ...]: 22 | """Return all available engine types. 23 | 24 | :return tuple[EngineType, ...]: All available engine types. 25 | """ 26 | return tuple(EngineType) 27 | 28 | @classmethod 29 | def get_engine_type( 30 | cls, engine: Engine[EnginePromptSignature, EngineResult, EngineModel, EngineInferenceMode] 31 | ) -> EngineType: 32 | """Return engine type for specified engine. 33 | 34 | :param engine: Engine to get type for. 35 | :return EngineType: Engine type for self._engine. 36 | :raises ValueError: if engine class not found in EngineType. 
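        Example (sketch - `outlines_engine` stands in for an instantiated Outlines engine):

            engine_type = EngineType.get_engine_type(outlines_engine)
            assert engine_type is EngineType.outlines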
37 | """ 38 | for et in EngineType: 39 | if isinstance(engine, et.value): 40 | return et 41 | raise ValueError(f"Engine class {engine.__class__.__name__} not found in EngineType.") 42 | -------------------------------------------------------------------------------- /sieves/engines/glix_.py: -------------------------------------------------------------------------------- 1 | """GliX engine wrapper built on top of GLiNER multi‑task pipelines.""" 2 | 3 | import enum 4 | import warnings 5 | from collections.abc import Iterable, Sequence 6 | from typing import Any, override 7 | 8 | import gliner.multitask.base 9 | import jinja2 10 | import pydantic 11 | 12 | from sieves.engines.core import Engine, Executable 13 | from sieves.engines.types import GenerationSettings 14 | 15 | PromptSignature = list[str] 16 | Model = gliner.model.GLiNER 17 | Result = list[dict[str, str | float]] | str 18 | 19 | 20 | class InferenceMode(enum.Enum): 21 | """Available inference modes.""" 22 | 23 | ner = gliner.config.GLiNERConfig 24 | classification = gliner.multitask.GLiNERClassifier 25 | question_answering = gliner.multitask.GLiNERQuestionAnswerer 26 | information_extraction = gliner.multitask.GLiNEROpenExtractor 27 | summarization = gliner.multitask.GLiNERSummarizer 28 | relation_extraction = gliner.multitask.GLiNERRelationExtractor 29 | 30 | 31 | class GliX(Engine[PromptSignature, Result, Model, InferenceMode]): 32 | """Engine adapter for GLiNER's multitask utilities (NER, CLS, QA, etc.).""" 33 | 34 | def __init__(self, model: Model, generation_settings: GenerationSettings): 35 | """Initialize GliX engine wrapper with model and settings.""" 36 | super().__init__(model, generation_settings) 37 | self._model_wrappers: dict[InferenceMode, gliner.multitask.base.GLiNERBasePipeline] = {} 38 | 39 | @override 40 | @property 41 | def inference_modes(self) -> type[InferenceMode]: 42 | return InferenceMode 43 | 44 | @override 45 | @property 46 | def supports_few_shotting(self) -> bool: 47 | return False 48 | 49 | @override 50 | def build_executable( 51 | self, 52 | inference_mode: InferenceMode, 53 | prompt_template: str | None, 54 | prompt_signature: type[PromptSignature] | PromptSignature, 55 | fewshot_examples: Sequence[pydantic.BaseModel] = (), 56 | ) -> Executable[Result]: 57 | assert isinstance(prompt_signature, list) 58 | cls_name = self.__class__.__name__ 59 | if len(list(fewshot_examples)): 60 | warnings.warn(f"Few-shot examples are not supported by engine {cls_name}.") 61 | 62 | # Lazily initialize multi-task wrapper for underlying GliNER model. 63 | if inference_mode not in self._model_wrappers: 64 | self._model_wrappers[inference_mode] = inference_mode.value(model=self._model) 65 | 66 | model = self._model_wrappers[inference_mode] 67 | 68 | # Overwrite prompt default template, if template specified. Note that this is a static prompt and GliX doesn't 69 | # do few-shotting, so we don't inject anything into the template. 70 | if prompt_template: 71 | self._model.prompt = jinja2.Template(prompt_template).render() 72 | 73 | def execute(values: Sequence[dict[str, Any]]) -> Iterable[Result]: 74 | """Execute prompts with engine for given values. 75 | 76 | :param values: Values to inject into prompts. 77 | :return Iterable[Result]: Results for prompts. 
78 | """ 79 | try: 80 | params: dict[InferenceMode, dict[str, Any]] = { 81 | InferenceMode.classification: {"classes": prompt_signature, "multi_label": True}, 82 | InferenceMode.question_answering: {"questions": prompt_signature}, 83 | InferenceMode.summarization: {}, 84 | InferenceMode.ner: {"entity_types": prompt_signature}, 85 | } 86 | selected_params = params[inference_mode] # Select parameters based on inference mode 87 | except KeyError: 88 | raise ValueError(f"Inference mode {inference_mode} not supported by {cls_name} engine.") 89 | 90 | texts = [val["text"] for val in values] 91 | if inference_mode == InferenceMode.ner: 92 | yield from self._model.batch_predict_entities(texts=texts, labels=selected_params["entity_types"]) 93 | else: 94 | assert isinstance(selected_params, dict) 95 | yield from model(texts, **(selected_params | self._inference_kwargs)) 96 | 97 | return execute 98 | -------------------------------------------------------------------------------- /sieves/engines/huggingface_.py: -------------------------------------------------------------------------------- 1 | """Hugging Face transformers engine wrapper (zero-shot classification).""" 2 | 3 | import enum 4 | from collections.abc import Iterable, Sequence 5 | from typing import Any, override 6 | 7 | import jinja2 8 | import pydantic 9 | import transformers 10 | 11 | from sieves.engines.core import Engine, Executable 12 | 13 | PromptSignature = list[str] 14 | Model = transformers.Pipeline 15 | Result = dict[str, list[str] | list[float]] 16 | 17 | 18 | class InferenceMode(enum.Enum): 19 | """Available inference modes.""" 20 | 21 | zeroshot_cls = 0 22 | 23 | 24 | class HuggingFace(Engine[PromptSignature, Result, Model, InferenceMode]): 25 | """Engine adapter around ``transformers.Pipeline`` for zero‑shot tasks.""" 26 | 27 | @override 28 | @property 29 | def inference_modes(self) -> type[InferenceMode]: 30 | return InferenceMode 31 | 32 | @override 33 | @property 34 | def supports_few_shotting(self) -> bool: 35 | return True 36 | 37 | @override 38 | def build_executable( 39 | self, 40 | inference_mode: InferenceMode, 41 | prompt_template: str | None, 42 | prompt_signature: type[PromptSignature] | PromptSignature, 43 | fewshot_examples: Sequence[pydantic.BaseModel] = (), 44 | ) -> Executable[Result | None]: 45 | cls_name = self.__class__.__name__ 46 | assert prompt_template, ValueError(f"prompt_template has to be provided to {cls_name} engine by task.") 47 | assert isinstance(prompt_signature, list) 48 | 49 | # Render template with few-shot examples. Note that we don't use extracted document values here, as HF zero-shot 50 | # pipelines only support one hypothesis template per call - and we want to batch, so our hypothesis template 51 | # will be document-invariant. 52 | fewshot_examples_dict = HuggingFace.convert_fewshot_examples(fewshot_examples) 53 | # Render hypothesis template with everything but text. 54 | template = jinja2.Template(prompt_template).render(**({"examples": fewshot_examples_dict})) 55 | 56 | def execute(values: Sequence[dict[str, Any]]) -> Iterable[Result]: 57 | """Execute prompts with engine for given values. 58 | 59 | :param values: Values to inject into prompts. 60 | :return Iterable[Result]: Results for prompts. 
61 |             """
62 |             match inference_mode:
63 |                 case InferenceMode.zeroshot_cls:
64 |                     yield from self._model(
65 |                         sequences=[doc_values["text"] for doc_values in values],
66 |                         candidate_labels=prompt_signature,
67 |                         hypothesis_template=template,
68 |                         multi_label=True,
69 |                         **self._inference_kwargs,
70 |                     )
71 | 
72 |                 case _:
73 |                     raise ValueError(f"Inference mode {inference_mode} not supported by {cls_name} engine.")
74 | 
75 |         return execute
76 | 
--------------------------------------------------------------------------------
/sieves/engines/langchain_.py:
--------------------------------------------------------------------------------
1 | """LangChain engine wrapper for structured outputs using Pydantic."""
2 | 
3 | import asyncio
4 | import enum
5 | from collections.abc import Iterable, Sequence
6 | from typing import Any, override
7 | 
8 | import langchain_core.language_models
9 | import nest_asyncio
10 | import pydantic
11 | 
12 | from sieves.engines.core import Executable, PydanticEngine
13 | 
14 | nest_asyncio.apply()
15 | 
16 | Model = langchain_core.language_models.BaseChatModel
17 | PromptSignature = pydantic.BaseModel
18 | Result = pydantic.BaseModel
19 | 
20 | 
21 | class InferenceMode(enum.Enum):
22 |     """Available inference modes."""
23 | 
24 |     structured = "structured"
25 | 
26 | 
27 | class LangChain(PydanticEngine[PromptSignature, Result, Model, InferenceMode]):
28 |     """Engine for LangChain."""
29 | 
30 |     @override
31 |     @property
32 |     def inference_modes(self) -> type[InferenceMode]:
33 |         return InferenceMode
34 | 
35 | 
36 |     @override
37 |     def build_executable(
38 |         self,
39 |         inference_mode: InferenceMode,
40 |         prompt_template: str | None,  # noqa: UP007
41 |         prompt_signature: type[PromptSignature] | PromptSignature,
42 |         fewshot_examples: Sequence[pydantic.BaseModel] = tuple(),
43 |     ) -> Executable[Result | None]:
44 |         assert isinstance(prompt_signature, type)
45 |         cls_name = self.__class__.__name__
46 |         template = self._create_template(prompt_template)
47 |         model = self._model.with_structured_output(prompt_signature)
48 | 
49 |         def execute(values: Sequence[dict[str, Any]]) -> Iterable[Result | None]:
50 |             """Execute prompts with engine for given values.
51 | 
52 |             :param values: Values to inject into prompts.
53 |             :return Iterable[Result | None]: Results for prompts. Results are None if corresponding prompt failed.
54 |             """
55 |             match inference_mode:
56 |                 case InferenceMode.structured:
57 | 
58 |                     def generate(prompts: list[str]) -> Iterable[Result]:
59 |                         try:
60 |                             yield from asyncio.run(model.abatch(prompts, **self._inference_kwargs))
61 | 
62 |                         except Exception as err:
63 |                             raise type(err)(
64 |                                 f"Encountered problem in parsing {cls_name} output. Double-check your prompts and "
65 |                                 f"examples."
66 | ) from err 67 | 68 | generator = generate 69 | case _: 70 | raise ValueError(f"Inference mode {inference_mode} not supported by {cls_name} engine.") 71 | 72 | yield from self._infer(generator, template, values, fewshot_examples) 73 | 74 | return execute 75 | -------------------------------------------------------------------------------- /sieves/engines/missing.py: -------------------------------------------------------------------------------- 1 | """Fallback engine types when optional dependencies are unavailable.""" 2 | 3 | import enum 4 | from collections.abc import Callable, Iterable, Sequence 5 | from typing import Any, override 6 | 7 | import pydantic 8 | 9 | from sieves.engines.core import Engine 10 | 11 | PromptSignature = Any 12 | Model = Any 13 | Result = Any 14 | 15 | 16 | class InferenceMode(enum.Enum): 17 | """Placeholder mode for unsupported engines.""" 18 | 19 | any = Any 20 | 21 | 22 | class MissingEngine(Engine[PromptSignature, Result, Model, InferenceMode]): 23 | """Placeholder for engine that couldn't be imported due to missing dependencies.""" 24 | 25 | @override 26 | @property 27 | def supports_few_shotting(self) -> bool: 28 | raise NotImplementedError 29 | 30 | @override 31 | @property 32 | def inference_modes(self) -> type[InferenceMode]: 33 | raise NotImplementedError 34 | 35 | @override 36 | def build_executable( 37 | self, 38 | inference_mode: InferenceMode, 39 | prompt_template: str | None, 40 | prompt_signature: type[PromptSignature] | PromptSignature, 41 | fewshot_examples: Sequence[pydantic.BaseModel] = (), 42 | ) -> Callable[[Iterable[dict[str, Any]]], Iterable[Result | None]]: 43 | raise NotImplementedError 44 | -------------------------------------------------------------------------------- /sieves/engines/outlines_.py: -------------------------------------------------------------------------------- 1 | """Outlines engine wrapper supporting text, choices, regex and JSON schemas.""" 2 | 3 | import enum 4 | from collections.abc import Iterable, Sequence 5 | from typing import Any, Literal, override 6 | 7 | import outlines 8 | import pydantic 9 | from outlines.models import AsyncBlackBoxModel, BlackBoxModel, SteerableModel 10 | 11 | from sieves.engines.core import Executable, PydanticEngine 12 | 13 | PromptSignature = ( 14 | pydantic.BaseModel | list[str] | str | outlines.types.Choice | outlines.types.Regex | outlines.types.JsonSchema 15 | ) 16 | Model = AsyncBlackBoxModel | BlackBoxModel | SteerableModel 17 | Result = pydantic.BaseModel | str 18 | 19 | 20 | class InferenceMode(enum.Enum): 21 | """Available inference modes. 22 | 23 | Note: generator functions are wrapped in tuples, as otherwise the Enum instance seems to be replaced by the function 24 | itself - not sure why that happens. Should take another look at this. 25 | """ 26 | 27 | # For normal text output, i.e. no structured generation. 28 | text = "text" 29 | # For limited set of choices, e.g. classification. 30 | choice = "choice" 31 | # Regex-conforming output. 32 | regex = "regex" 33 | # Output conforming to Pydantic models. 
34 |     json = "json"
35 | 
36 | 
37 | class Outlines(PydanticEngine[PromptSignature, Result, Model, InferenceMode]):
38 |     """Engine for Outlines with multiple structured inference modes."""
39 | 
40 |     @override
41 |     @property
42 |     def inference_modes(self) -> type[InferenceMode]:
43 |         return InferenceMode
44 | 
45 |     @override
46 |     def build_executable(
47 |         self,
48 |         inference_mode: InferenceMode,
49 |         prompt_template: str | None,  # noqa: UP007
50 |         prompt_signature: type[PromptSignature] | PromptSignature,
51 |         fewshot_examples: Sequence[pydantic.BaseModel] = (),
52 |     ) -> Executable[Result | None]:
53 |         template = self._create_template(prompt_template)
54 | 
55 |         # Create Generator instance responsible for generating non-parsed text.
56 |         if isinstance(prompt_signature, list):
57 |             prompt_signature = Literal[*prompt_signature]
58 | 
59 |         if inference_mode == InferenceMode.regex:
60 |             prompt_signature = outlines.types.Regex(prompt_signature)
61 | 
62 |         generator = outlines.Generator(self._model, output_type=prompt_signature, **self._init_kwargs)
63 | 
64 |         def execute(values: Sequence[dict[str, Any]]) -> Iterable[Result | None]:
65 |             """Execute prompts with engine for given values.
66 | 
67 |             :param values: Values to inject into prompts.
68 |             :return Iterable[Result | None]: Results for prompts. Results are None if corresponding prompt failed.
69 |             """
70 | 
71 |             def generate(prompts: list[str]) -> Iterable[Result]:
72 |                 try:
73 |                     results = generator.batch(prompts, **self._inference_kwargs)
74 |                 # Batch mode is not implemented for all Outlines wrappers. Fall back to single-prompt mode in that case.
75 |                 except NotImplementedError:
76 |                     # The generator call itself is synchronous, so the fallback simply loops over the prompts.
77 |                     results = [generator(prompt, **self._inference_kwargs) for prompt in prompts]
78 | 
79 |                 if inference_mode == InferenceMode.json:
80 |                     assert len(results) == len(prompts)
81 |                     assert isinstance(prompt_signature, type) and issubclass(prompt_signature, pydantic.BaseModel)
82 |                     yield from [prompt_signature.model_validate_json(result) for result in results]
83 |                 else:
84 |                     yield from results
85 | 
86 |             yield from self._infer(
87 |                 generate,
88 |                 template,
89 |                 values,
90 |                 fewshot_examples,
91 |             )
92 | 
93 |         return execute
94 | 
--------------------------------------------------------------------------------
/sieves/engines/types.py:
--------------------------------------------------------------------------------
1 | """Common types."""
2 | 
3 | from typing import Any
4 | 
5 | import pydantic
6 | 
7 | 
8 | class GenerationSettings(pydantic.BaseModel):
9 |     """Settings for structured generation.
10 | 
11 |     :param init_kwargs: kwargs passed on to initialization of structured generator. Not all engines use this - ignored
12 |         otherwise.
13 |     :param inference_kwargs: kwargs passed on to inference with structured generator.
14 |     :param config_kwargs: Used only if supplied model is a DSPy model object, ignored otherwise. Optional kwargs
15 |         supplied to dspy.configure().
16 |     :param strict_mode: If True, exception is raised if prompt response can't be parsed correctly.
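
    A minimal sketch (the kwargs shown are illustrative - which of them take effect depends on the engine):

        settings = GenerationSettings(
            inference_kwargs={"max_tokens": 256},
            strict_mode=True,
        )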
17 | """ 18 | 19 | init_kwargs: dict[str, Any] | None = None 20 | inference_kwargs: dict[str, Any] | None = None 21 | config_kwargs: dict[str, Any] | None = None 22 | strict_mode: bool = False 23 | -------------------------------------------------------------------------------- /sieves/engines/utils.py: -------------------------------------------------------------------------------- 1 | """Utils for engines.""" 2 | 3 | import outlines 4 | import transformers 5 | 6 | from sieves.engines.core import Engine, EngineInferenceMode, EngineModel, EnginePromptSignature, EngineResult 7 | from sieves.engines.engine_import import ( 8 | dspy_, 9 | glix_, 10 | huggingface_, 11 | langchain_, 12 | outlines_, 13 | ) 14 | from sieves.engines.types import GenerationSettings 15 | 16 | Model = dspy_.Model | glix_.Model | huggingface_.Model | langchain_.Model | outlines_.Model 17 | 18 | 19 | def init_default_model() -> outlines.models.Transformers: # noqa: D401 20 | """Initialize default model (HuggingFaceTB/SmolLM-360M-Instruct with Outlines). 21 | 22 | :return: Initialized default model. 23 | """ 24 | model_name = "HuggingFaceTB/SmolLM-360M-Instruct" 25 | 26 | return outlines.models.from_transformers( 27 | transformers.AutoModelForCausalLM.from_pretrained(model_name), 28 | transformers.AutoTokenizer.from_pretrained(model_name), 29 | ) 30 | 31 | 32 | def init_engine( 33 | model: Model, generation_settings: GenerationSettings 34 | ) -> Engine[EnginePromptSignature, EngineResult, EngineModel, EngineInferenceMode]: # noqa: D401 35 | """Initialize internal engine object. 36 | 37 | :param model: Model to use. 38 | :param generation_settings: Settings for structured generation. 39 | :return Engine: Engine. 40 | :raises ValueError: If model type isn't supported. 41 | """ 42 | model_type = type(model) 43 | module_engine_map = { 44 | dspy_: dspy_.DSPy, 45 | glix_: glix_.GliX, 46 | huggingface_: huggingface_.HuggingFace, 47 | langchain_: langchain_.LangChain, 48 | outlines_: outlines_.Outlines, 49 | } 50 | 51 | for module, engine_type in module_engine_map.items(): 52 | try: 53 | module_model_types = module.Model.__args__ 54 | except AttributeError: 55 | module_model_types = (module.Model,) 56 | 57 | if any(issubclass(model_type, module_model_type) for module_model_type in module_model_types): 58 | internal_engine = engine_type( 59 | model=model, 60 | generation_settings=generation_settings, 61 | ) 62 | assert isinstance(internal_engine, Engine) 63 | 64 | return internal_engine 65 | 66 | raise ValueError( 67 | f"Model type {model.__class__} is not supported. Please check the documentation and ensure you're " 68 | f"providing a supported model type." 69 | ) 70 | -------------------------------------------------------------------------------- /sieves/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import Pipeline 2 | 3 | __all__ = ["Pipeline"] 4 | -------------------------------------------------------------------------------- /sieves/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | """Tasks.""" 2 | 3 | from . 
import predictive, preprocessing 4 | from .core import Task 5 | from .postprocessing import DistillationFramework 6 | from .predictive import ( 7 | NER, 8 | Classification, 9 | InformationExtraction, 10 | PIIMasking, 11 | QuestionAnswering, 12 | SentimentAnalysis, 13 | Summarization, 14 | Translation, 15 | ) 16 | from .predictive.core import PredictiveTask 17 | from .preprocessing import Chunking, Ingestion 18 | 19 | __all__ = [ 20 | "Chunking", 21 | "Classification", 22 | "DistillationFramework", 23 | "NER", 24 | "InformationExtraction", 25 | "Ingestion", 26 | "SentimentAnalysis", 27 | "Summarization", 28 | "Translation", 29 | "QuestionAnswering", 30 | "PIIMasking", 31 | "Task", 32 | "predictive", 33 | "PredictiveTask", 34 | "preprocessing", 35 | ] 36 | -------------------------------------------------------------------------------- /sieves/tasks/core.py: -------------------------------------------------------------------------------- 1 | """Core task implementation.""" 2 | 3 | from __future__ import annotations 4 | 5 | import abc 6 | from collections.abc import Iterable 7 | from typing import TYPE_CHECKING, Any 8 | 9 | from sieves.data import Doc 10 | from sieves.serialization import Attribute, Config 11 | 12 | if TYPE_CHECKING: 13 | # Imported only for type checking to avoid import cycles at runtime. 14 | from sieves.pipeline import Pipeline 15 | 16 | 17 | class Task(abc.ABC): 18 | """Abstract base class for tasks that can be executed on documents.""" 19 | 20 | def __init__(self, task_id: str | None, include_meta: bool, batch_size: int): 21 | """ 22 | Initiate new Task. 23 | 24 | :param task_id: Task ID. 25 | :param include_meta: Whether to include meta information generated by the task. 26 | :param batch_size: Batch size for processing documents. Use -1 to process all documents at once. 27 | """ 28 | self._task_id = task_id if task_id else self.__class__.__name__ 29 | self._include_meta = include_meta 30 | self._batch_size = batch_size 31 | 32 | @property 33 | def id(self) -> str: 34 | """Return task ID. 35 | 36 | Used by pipeline for results and dependency management. 37 | 38 | :return: Task ID. 39 | """ 40 | return self._task_id 41 | 42 | @abc.abstractmethod 43 | def __call__(self, docs: Iterable[Doc]) -> Iterable[Doc]: 44 | """Execute task. 45 | 46 | :param docs: Docs to process. 47 | :return: Processed docs. 48 | """ 49 | 50 | def __add__(self, other: Task | Pipeline) -> Pipeline: 51 | """Chain this task with another task or pipeline using the ``+`` operator. 52 | 53 | This returns a new ``Pipeline`` that executes this task first, followed by the 54 | task(s) in ``other``. The original task(s)/pipeline are not mutated. 55 | 56 | Cache semantics: 57 | - If ``other`` is a ``Pipeline``, the resulting pipeline adopts ``other``'s 58 | ``use_cache`` setting (because the left-hand side is a single task). 59 | - If ``other`` is a ``Task``, the resulting pipeline defaults to ``use_cache=True``. 60 | 61 | :param other: A ``Task`` or ``Pipeline`` to execute after this task. 62 | :return: A new ``Pipeline`` representing the chained execution. 63 | :raises TypeError: If ``other`` is not a ``Task`` or ``Pipeline``. 64 | """ 65 | # Lazy import to avoid circular dependency at module import time. 
66 |         from sieves.pipeline import Pipeline
67 | 
68 |         if isinstance(other, Pipeline):
69 |             return Pipeline(tasks=[self, *other.tasks], use_cache=other.use_cache)
70 | 
71 |         if isinstance(other, Task):
72 |             return Pipeline(tasks=[self, other])
73 | 
74 |         raise TypeError(f"Cannot chain Task with {type(other).__name__}")
75 | 
76 |     @property
77 |     def _state(self) -> dict[str, Any]:
78 |         """Return attributes to serialize.
79 | 
80 |         :return: Dict of attributes to serialize.
81 |         """
82 |         return {
83 |             "task_id": self._task_id,
84 |             "include_meta": self._include_meta,
85 |             "batch_size": self._batch_size,
86 |         }
87 | 
88 |     def serialize(self) -> Config:
89 |         """Serialize task.
90 | 
91 |         :return: Config instance.
92 |         """
93 |         return Config.create(self.__class__, {k: Attribute(value=v) for k, v in self._state.items()})
94 | 
95 |     @classmethod
96 |     def deserialize(cls, config: Config, **kwargs: dict[str, Any]) -> Task:
97 |         """Generate Task instance from config.
98 | 
99 |         :param config: Config to generate instance from.
100 |         :param kwargs: Values to inject into loaded config.
101 |         :return: Deserialized Task instance.
102 |         """
103 |         # Deserialize and inject engine.
104 |         return cls(**config.to_init_dict(cls, **kwargs))
105 | 
--------------------------------------------------------------------------------
/sieves/tasks/optimization/__init__.py:
--------------------------------------------------------------------------------
1 | """Prompt/few-shot example optimization for tasks."""
2 | 
3 | from sieves.tasks.optimization.core import EvalMetric, Optimizer
4 | 
5 | __all__ = ["EvalMetric", "Optimizer"]
6 | 
--------------------------------------------------------------------------------
/sieves/tasks/optimization/core.py:
--------------------------------------------------------------------------------
1 | """Optimizer implementation."""
2 | 
3 | import random
4 | from collections.abc import Callable
5 | from typing import Any, Self
6 | 
7 | import dspy
8 | 
9 | from sieves.serialization import Attribute, Config
10 | 
11 | EvalMetric = Callable[[dspy.Example, dspy.Prediction], float]
12 | 
13 | 
14 | class Optimizer:
15 |     """Optimizes task prompts and few-shot examples with DSPy.
16 | 
17 |     Uses MIPROv2 to optimize instructions and few-shot examples.
18 |     """
19 | 
20 |     def __init__(
21 |         self,
22 |         model: dspy.LM | dspy.BaseLM,
23 |         val_frac: float,
24 |         seed: int | None = None,
25 |         shuffle: bool = True,
26 |         dspy_init_kwargs: dict[str, Any] | None = None,
27 |         dspy_compile_kwargs: dict[str, Any] | None = None,
28 |     ):
29 |         """Initialize optimizer.
30 | 
31 |         :param model: Fully initialized DSPy model to use for optimization. Doesn't have to be the same as the model
32 |             used to run the task, but the more similar, the better. With a lot of data you might want to pick a
33 |             faster/cheaper model.
34 |         :param val_frac: Fraction of examples to use for validation. Everything else is used for optimization.
35 |         :param seed: Random seed for data splitting.
36 |         :param shuffle: Whether to shuffle the data.
37 |         :param dspy_init_kwargs: Optional keyword arguments to pass to DSPy optimizer at init time.
38 |         :param dspy_compile_kwargs: Optional keyword arguments to pass to DSPy optimizer at compile time.
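
        A minimal construction sketch (the model ID and fractions are illustrative):

            import dspy

            optimizer = Optimizer(
                model=dspy.LM("openai/gpt-4o-mini"),
                val_frac=0.2,
                seed=42,
            )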
39 |         """
40 |         self._model = model
41 |         self._val_frac = val_frac
42 |         self._seed = seed
43 |         self._shuffle = shuffle
44 |         self._init_kwargs = dspy_init_kwargs or {}
45 |         self._compile_kwargs = {"requires_permission_to_run": False} | (dspy_compile_kwargs or {})
46 | 
47 |     def __call__(
48 |         self,
49 |         signature: type[dspy.Signature] | type[dspy.Module],
50 |         data: list[dspy.Example],
51 |         evaluate: EvalMetric,
52 |         verbose: bool = False,
53 |     ) -> tuple[str, list[dspy.Example]]:
54 |         """Optimize prompt and few-shot examples w.r.t. given signature and dataset.
55 | 
56 |         :param signature: Signature of the task to optimize.
57 |         :param data: Dataset to use for optimization.
58 |         :param evaluate: Evaluation metric to use for optimization.
59 |         :param verbose: Whether to log DSPy output.
60 |         :return: Best combination of (1) prompt and (2) fewshot-examples.
61 |         """
62 |         predictor = dspy.Predict(signature)
63 |         teleprompter = dspy.MIPROv2(metric=evaluate, **(self._init_kwargs or {}), verbose=verbose)
64 |         trainset, devset = self._split_data(data, self._val_frac, self._seed, self._shuffle)
65 | 
66 |         optimized_predictor: dspy.Predict = teleprompter.compile(
67 |             predictor, trainset=trainset, valset=devset, **(self._compile_kwargs or {})
68 |         )
69 | 
70 |         return optimized_predictor.signature.instructions, optimized_predictor.demos
71 | 
72 |     @property
73 |     def model(self) -> dspy.LM:
74 |         """Return model used for optimization.
75 | 
76 |         :return dspy.LM: Model used for optimization.
77 |         """
78 |         return self._model
79 | 
80 |     @property
81 |     def _state(self) -> dict[str, Any]:
82 |         """Return attributes to serialize.
83 | 
84 |         :return: Dict of attributes to serialize.
85 |         """
86 |         return {
87 |             "model": self._model,
88 |             "val_frac": self._val_frac,
89 |             "seed": self._seed,
90 |             "shuffle": self._shuffle,
91 |             "init_kwargs": self._init_kwargs,
92 |             "compile_kwargs": self._compile_kwargs,
93 |         }
94 | 
95 |     def serialize(self) -> Config:
96 |         """Serialize optimizer.
97 | 
98 |         :return: Config instance.
99 |         """
100 |         return Config.create(self.__class__, {k: Attribute(value=v) for k, v in self._state.items()})
101 | 
102 |     @classmethod
103 |     def deserialize(cls, config: Config, **kwargs: dict[str, Any]) -> Self:
104 |         """Generate Optimizer instance from config.
105 | 
106 |         :param config: Config to generate instance from.
107 |         :param kwargs: Values to inject into loaded config.
108 |         :return: Deserialized Optimizer instance.
109 |         """
110 |         return cls(**config.to_init_dict(cls, **kwargs))
111 | 
112 |     @staticmethod
113 |     def _split_data(
114 |         data: list[dspy.Example], val_frac: float, seed: int | None, shuffle: bool
115 |     ) -> tuple[list[dspy.Example], list[dspy.Example]]:
116 |         """Split data into train and validation sets.
117 | 
118 |         :param data: Dataset to split.
119 |         :param val_frac: Fraction of data to use for validation.
120 |         :param seed: Random seed for shuffling.
121 |         :param shuffle: Whether to shuffle the data before splitting.
122 |         :return: Tuple of (trainset, valset).
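        For example, 10 examples with `val_frac=0.2` yield 2 validation and 8 training examples (after optional shuffling).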
123 |         """
124 |         dataset = data.copy()
125 |         if shuffle:
126 |             rng = random.Random(seed)
127 |             rng.shuffle(dataset)
128 | 
129 |         val_size = int(len(dataset) * val_frac)
130 |         trainset = dataset[val_size:]
131 |         valset = dataset[:val_size]
132 | 
133 |         return trainset, valset
134 | 
--------------------------------------------------------------------------------
/sieves/tasks/postprocessing/__init__.py:
--------------------------------------------------------------------------------
1 | """Postprocessing tasks."""
2 | 
3 | from .distillation import DistillationFramework
4 | 
5 | __all__ = ["DistillationFramework"]
6 | 
--------------------------------------------------------------------------------
/sieves/tasks/postprocessing/distillation/__init__.py:
--------------------------------------------------------------------------------
1 | """Distillation."""
2 | 
3 | from .types import DistillationFramework, DistillationFrameworkLiteral
4 | 
5 | __all__ = ["DistillationFramework", "DistillationFrameworkLiteral"]
6 | 
--------------------------------------------------------------------------------
/sieves/tasks/postprocessing/distillation/distillation_import.py:
--------------------------------------------------------------------------------
1 | """Import 3rd-party libraries required for distillation.
2 | 
3 | If a library can't be found, a None placeholder is assigned instead.
4 | 
5 | This allows us to import everything downstream without having to worry about optional dependencies. If a user specifies
6 | a non-installed distillation framework, we terminate with an error.
7 | """
8 | 
9 | # mypy: disable-error-code="no-redef"
10 | 
11 | import warnings
12 | 
13 | _missing_dependencies: list[str] = []
14 | 
15 | 
16 | try:
17 |     import sentence_transformers
18 | except ModuleNotFoundError:
19 |     sentence_transformers = None
20 | 
21 |     _missing_dependencies.append("sentence_transformers")
22 | 
23 | try:
24 |     import setfit
25 | except ModuleNotFoundError:
26 |     setfit = None
27 | 
28 |     _missing_dependencies.append("setfit")
29 | 
30 | try:
31 |     import model2vec
32 |     import model2vec.train
33 | except ModuleNotFoundError:
34 |     model2vec = None
35 | 
36 |     _missing_dependencies.append("model2vec")
37 | 
38 | if len(_missing_dependencies):
39 |     warnings.warn(
40 |         "Warning: distillation dependencies [{deps}] could not be imported. Distilling with these frameworks "
41 |         "requires them to be installed.".format(deps=", ".join(_missing_dependencies))
42 |     )
43 | 
44 | __all__ = ["model2vec", "sentence_transformers", "setfit"]
45 | 
--------------------------------------------------------------------------------
/sieves/tasks/postprocessing/distillation/types.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | 
3 | import enum
4 | from typing import Literal
5 | 
6 | 
7 | class DistillationFramework(enum.Enum):
8 |     model2vec = "model2vec"
9 |     sentence_transformers = "sentence_transformers"
10 |     setfit = "setfit"
11 | 
12 |     @classmethod
13 |     def all(cls) -> tuple[DistillationFramework, ...]:
14 |         """Return all available distillation frameworks.
15 |         :return tuple[DistillationFramework, ...]: All available distillation frameworks.
16 |         """
17 |         return tuple(DistillationFramework)
18 | 
19 | 
20 | DistillationFrameworkLiteral = Literal[*DistillationFramework.all()]  # type: ignore[valid-type]
21 | 
--------------------------------------------------------------------------------
/sieves/tasks/predictive/__init__.py:
--------------------------------------------------------------------------------
1 | """Predictive tasks."""
2 | 
3 | from .classification import Classification
4 | from .core import PredictiveTask
5 | from .information_extraction import InformationExtraction
6 | from .ner import NER
7 | from .pii_masking import PIIMasking
8 | from .question_answering import QuestionAnswering
9 | from .sentiment_analysis import SentimentAnalysis
10 | from .summarization import Summarization
11 | from .translation import Translation
12 | 
13 | __all__ = [
14 |     "Classification",
15 |     "InformationExtraction",
16 |     "SentimentAnalysis",
17 |     "Summarization",
18 |     "Translation",
19 |     "NER",
20 |     "PIIMasking",
21 |     "PredictiveTask",
22 |     "QuestionAnswering",
23 | ]
24 | 
--------------------------------------------------------------------------------
/sieves/tasks/predictive/classification/__init__.py:
--------------------------------------------------------------------------------
1 | """Classification task."""
2 | 
3 | from .core import Classification, FewshotExampleMultiLabel, FewshotExampleSingleLabel
4 | 
5 | __all__ = ["Classification", "FewshotExampleMultiLabel", "FewshotExampleSingleLabel"]
6 | 
--------------------------------------------------------------------------------
/sieves/tasks/predictive/information_extraction/__init__.py:
--------------------------------------------------------------------------------
1 | """Information extraction task."""
2 | 
3 | from .core import FewshotExample, InformationExtraction
4 | 
5 | __all__ = ["InformationExtraction", "FewshotExample"]
6 | 
--------------------------------------------------------------------------------
/sieves/tasks/predictive/ner/__init__.py:
--------------------------------------------------------------------------------
1 | """NER task."""
2 | 
3 | from .core import NER, Entity, FewshotExample, _TaskPromptSignature, _TaskResult
4 | 
5 | __all__ = ["Entity", "NER", "FewshotExample", "_TaskResult", "_TaskPromptSignature"]
6 | 
--------------------------------------------------------------------------------
/sieves/tasks/predictive/pii_masking/__init__.py:
--------------------------------------------------------------------------------
1 | """PII masking."""
2 | 
3 | from .core import FewshotExample, PIIEntity, PIIMasking
4 | 
5 | __all__ = ["FewshotExample", "PIIEntity", "PIIMasking"]
6 | 
--------------------------------------------------------------------------------
/sieves/tasks/predictive/question_answering/__init__.py:
--------------------------------------------------------------------------------
1 | """Question answering task."""
2 | 
3 | from .core import FewshotExample, QuestionAnswering
4 | 
5 | __all__ = ["QuestionAnswering", "FewshotExample"]
6 | 
--------------------------------------------------------------------------------
/sieves/tasks/predictive/question_answering/core.py:
--------------------------------------------------------------------------------
1 | """Question Answering predictive task."""
2 | 
3 | from __future__ import annotations
4 | 
5 | from collections.abc import Iterable, Sequence
6 | from pathlib import Path
7 | from typing import Any, override
8 | 
9 | import datasets
10 | import pydantic
11 | 
12 | from sieves.data import Doc
13 | from sieves.engines import EngineType, dspy_, glix_, langchain_, outlines_ 14 | from sieves.engines.types import GenerationSettings 15 | from sieves.serialization import Config 16 | from sieves.tasks.postprocessing.distillation.types import DistillationFramework 17 | from sieves.tasks.predictive.bridges import GliXBridge 18 | from sieves.tasks.predictive.core import FewshotExample as BaseFewshotExample 19 | from sieves.tasks.predictive.core import PredictiveTask 20 | from sieves.tasks.predictive.question_answering.bridges import ( 21 | DSPyQA, 22 | LangChainQA, 23 | OutlinesQA, 24 | ) 25 | 26 | _TaskModel = dspy_.Model | glix_.Model | langchain_.Model | outlines_.Model 27 | _TaskPromptSignature = glix_.PromptSignature | pydantic.BaseModel | dspy_.PromptSignature 28 | _TaskResult = pydantic.BaseModel | dspy_.Result 29 | _TaskBridge = DSPyQA | GliXBridge | LangChainQA | OutlinesQA 30 | 31 | 32 | class FewshotExample(BaseFewshotExample): 33 | """Few-shot example with questions and answers for a context.""" 34 | 35 | reasoning: str 36 | questions: tuple[str, ...] | list[str] 37 | answers: tuple[str, ...] | list[str] 38 | 39 | @override 40 | @property 41 | def input_fields(self) -> Sequence[str]: 42 | return "text", "questions" 43 | 44 | @override 45 | @property 46 | def target_fields(self) -> Sequence[str]: 47 | return ("answers",) 48 | 49 | 50 | class QuestionAnswering(PredictiveTask[_TaskPromptSignature, _TaskResult, _TaskBridge]): 51 | """Answer questions about a text using structured engines.""" 52 | 53 | def __init__( 54 | self, 55 | questions: list[str], 56 | model: _TaskModel, 57 | task_id: str | None = None, 58 | include_meta: bool = True, 59 | batch_size: int = -1, 60 | prompt_instructions: str | None = None, 61 | fewshot_examples: Sequence[FewshotExample] = (), 62 | generation_settings: GenerationSettings = GenerationSettings(), 63 | ) -> None: 64 | """ 65 | Initialize QuestionAnswering task. 66 | 67 | :param questions: Questions to answer. 68 | :param model: Model to use. 69 | :param task_id: Task ID. 70 | :param include_meta: Whether to include meta information generated by the task. 71 | :param batch_size: Batch size to use for inference. Use -1 to process all documents at once. 72 | :param prompt_instructions: Custom prompt instructions. If None, default instructions are used. 73 | :param fewshot_examples: Few-shot examples. 74 | :param generation_settings: Settings for structured generation. 
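
        A minimal usage sketch (`model` stands in for any supported engine model; the text and question are
        illustrative):

            task = QuestionAnswering(questions=["Who wrote the essay?"], model=model)
            docs = list(task([Doc(text="The essay was written by Ada Lovelace.")]))
            answers = docs[0].results[task.id]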
75 | """ 76 | self._questions = questions 77 | super().__init__( 78 | model=model, 79 | task_id=task_id, 80 | include_meta=include_meta, 81 | batch_size=batch_size, 82 | overwrite=False, 83 | prompt_instructions=prompt_instructions, 84 | fewshot_examples=fewshot_examples, 85 | generation_settings=generation_settings, 86 | ) 87 | self._fewshot_examples: Sequence[FewshotExample] 88 | 89 | @override 90 | def _init_bridge(self, engine_type: EngineType) -> _TaskBridge: 91 | if engine_type == EngineType.glix: 92 | return GliXBridge( 93 | task_id=self._task_id, 94 | prompt_instructions=self._custom_prompt_instructions, 95 | prompt_signature=self._questions, 96 | inference_mode=glix_.InferenceMode.question_answering, 97 | ) 98 | 99 | bridge_types: dict[EngineType, type[_TaskBridge]] = { 100 | EngineType.dspy: DSPyQA, 101 | EngineType.outlines: OutlinesQA, 102 | EngineType.langchain: LangChainQA, 103 | } 104 | 105 | try: 106 | bridge_type = bridge_types[engine_type] 107 | assert not issubclass(bridge_type, GliXBridge) 108 | 109 | return bridge_type( 110 | task_id=self._task_id, 111 | prompt_instructions=self._custom_prompt_instructions, 112 | questions=self._questions, 113 | ) 114 | except KeyError as err: 115 | raise KeyError(f"Engine type {engine_type} is not supported by {self.__class__.__name__}.") from err 116 | 117 | @override 118 | @property 119 | def supports(self) -> set[EngineType]: 120 | return { 121 | EngineType.dspy, 122 | EngineType.glix, 123 | EngineType.langchain, 124 | EngineType.outlines, 125 | } 126 | 127 | @override 128 | @property 129 | def _state(self) -> dict[str, Any]: 130 | return { 131 | **super()._state, 132 | "questions": self._questions, 133 | } 134 | 135 | @override 136 | def to_hf_dataset(self, docs: Iterable[Doc], threshold: float = 0.5) -> datasets.Dataset: 137 | # Define metadata. 138 | features = datasets.Features( 139 | {"text": datasets.Value("string"), "answers": datasets.Sequence(datasets.Value("string"))} 140 | ) 141 | info = datasets.DatasetInfo( 142 | description=f"Question-answering dataset with questions {self._questions}. Generated with sieves " 143 | f"v{Config.get_version()}.", 144 | features=features, 145 | ) 146 | 147 | # Fetch data used for generating dataset. 148 | try: 149 | data = [(doc.text, doc.results[self._task_id]) for doc in docs] 150 | except KeyError as err: 151 | raise KeyError(f"Not all documents have results for this task with ID {self._task_id}") from err 152 | 153 | def generate_data() -> Iterable[dict[str, Any]]: 154 | """Yield results as dicts. 155 | 156 | :return: Results as dicts. 157 | """ 158 | for text, answers in data: 159 | yield {"text": text, "answers": answers} 160 | 161 | # Create dataset. 
162 |         return datasets.Dataset.from_generator(generate_data, features=features, info=info)
163 | 
164 |     @override
165 |     def distill(
166 |         self,
167 |         base_model_id: str,
168 |         framework: DistillationFramework,
169 |         data: datasets.Dataset | Sequence[Doc],
170 |         output_path: Path | str,
171 |         val_frac: float,
172 |         init_kwargs: dict[str, Any] | None = None,
173 |         train_kwargs: dict[str, Any] | None = None,
174 |         seed: int | None = None,
175 |     ) -> None:
176 |         raise NotImplementedError
177 | 
--------------------------------------------------------------------------------
/sieves/tasks/predictive/sentiment_analysis/__init__.py:
--------------------------------------------------------------------------------
1 | """Aspect-based sentiment analysis."""
2 | 
3 | from .core import FewshotExample, SentimentAnalysis
4 | 
5 | __all__ = ["SentimentAnalysis", "FewshotExample"]
6 | 
--------------------------------------------------------------------------------
/sieves/tasks/predictive/summarization/__init__.py:
--------------------------------------------------------------------------------
1 | """Summarization task."""
2 | 
3 | from .core import FewshotExample, Summarization
4 | 
5 | __all__ = ["Summarization", "FewshotExample"]
6 | 
--------------------------------------------------------------------------------
/sieves/tasks/predictive/summarization/core.py:
--------------------------------------------------------------------------------
1 | """Text summarization predictive task."""
2 | 
3 | from __future__ import annotations
4 | 
5 | from collections.abc import Iterable, Sequence
6 | from pathlib import Path
7 | from typing import Any, override
8 | 
9 | import datasets
10 | import pydantic
11 | 
12 | from sieves.data import Doc
13 | from sieves.engines import EngineType, dspy_, glix_, langchain_, outlines_
14 | from sieves.engines.types import GenerationSettings
15 | from sieves.serialization import Config
16 | from sieves.tasks.postprocessing.distillation.types import DistillationFramework
17 | from sieves.tasks.predictive.bridges import GliXBridge
18 | from sieves.tasks.predictive.core import FewshotExample as BaseFewshotExample
19 | from sieves.tasks.predictive.core import PredictiveTask
20 | from sieves.tasks.predictive.summarization.bridges import (
21 |     DSPySummarization,
22 |     LangChainSummarization,
23 |     OutlinesSummarization,
24 | )
25 | 
26 | _TaskModel = dspy_.Model | glix_.Model | langchain_.Model | outlines_.Model
27 | _TaskPromptSignature = pydantic.BaseModel | dspy_.PromptSignature | glix_.PromptSignature
28 | _TaskResult = outlines_.Result | dspy_.Result
29 | _TaskBridge = DSPySummarization | GliXBridge | LangChainSummarization | OutlinesSummarization
30 | 
31 | 
32 | class FewshotExample(BaseFewshotExample):
33 |     """Few-shot example with a target summary."""
34 | 
35 |     n_words: int
36 |     summary: str
37 | 
38 |     @override
39 |     @property
40 |     def target_fields(self) -> Sequence[str]:
41 |         return ("summary",)
42 | 
43 | 
44 | class Summarization(PredictiveTask[_TaskPromptSignature, _TaskResult, _TaskBridge]):
45 |     """Summarize documents to a target length using structured engines."""
46 | 
47 |     def __init__(
48 |         self,
49 |         n_words: int,
50 |         model: _TaskModel,
51 |         task_id: str | None = None,
52 |         include_meta: bool = True,
53 |         batch_size: int = -1,
54 |         overwrite: bool = False,
55 |         prompt_instructions: str | None = None,
56 |         fewshot_examples: Sequence[FewshotExample] = (),
57 |         generation_settings: GenerationSettings = GenerationSettings(),
58 |     ) -> None:
59 |         """Initialize new
Summarization task. 60 | 61 | :param n_words: Maximal number of words (consider this a guideline, not a strict limit). 62 | :param model: Model to use. 63 | :param task_id: Task ID. 64 | :param include_meta: Whether to include meta information generated by the task. 65 | :param batch_size: Batch size to use for inference. Use -1 to process all documents at once. 66 | :param overwrite: Some tasks, e.g. anonymization or translation, output a modified version of the input text. 67 | If True, these tasks overwrite the original document text. If False, the result will just be stored in the 68 | documents' `.results` field. 69 | :param prompt_instructions: Custom prompt instructions. If None, default instructions are used. 70 | :param fewshot_examples: Few-shot examples. 71 | :param generation_settings: Settings for structured generation. 72 | """ 73 | self._n_words = n_words 74 | 75 | super().__init__( 76 | model=model, 77 | task_id=task_id, 78 | include_meta=include_meta, 79 | batch_size=batch_size, 80 | overwrite=overwrite, 81 | prompt_instructions=prompt_instructions, 82 | fewshot_examples=fewshot_examples, 83 | generation_settings=generation_settings, 84 | ) 85 | 86 | @override 87 | def _init_bridge(self, engine_type: EngineType) -> _TaskBridge: 88 | if engine_type == EngineType.glix: 89 | return GliXBridge( 90 | task_id=self._task_id, 91 | prompt_instructions=self._custom_prompt_instructions, 92 | prompt_signature=[], 93 | inference_mode=glix_.InferenceMode.summarization, 94 | ) 95 | 96 | bridge_types: dict[EngineType, type[_TaskBridge]] = { 97 | EngineType.dspy: DSPySummarization, 98 | EngineType.langchain: LangChainSummarization, 99 | EngineType.outlines: OutlinesSummarization, 100 | } 101 | 102 | try: 103 | bridge_type = bridge_types[engine_type] 104 | assert not issubclass(bridge_type, GliXBridge) 105 | 106 | return bridge_type( 107 | task_id=self._task_id, 108 | prompt_instructions=self._custom_prompt_instructions, 109 | overwrite=self._overwrite, 110 | n_words=self._n_words, 111 | ) 112 | except KeyError as err: 113 | raise KeyError(f"Engine type {engine_type} is not supported by {self.__class__.__name__}.") from err 114 | 115 | @property 116 | @override 117 | def supports(self) -> set[EngineType]: 118 | return { 119 | EngineType.dspy, 120 | EngineType.glix, 121 | EngineType.langchain, 122 | EngineType.outlines, 123 | } 124 | 125 | @property 126 | @override 127 | def _state(self) -> dict[str, Any]: 128 | return { 129 | **super()._state, 130 | "n_words": self._n_words, 131 | } 132 | 133 | @override 134 | def to_hf_dataset(self, docs: Iterable[Doc], threshold: float = 0.5) -> datasets.Dataset: 135 | # Define metadata. 136 | features = datasets.Features({"text": datasets.Value("string"), "summary": datasets.Value("string")}) 137 | info = datasets.DatasetInfo( 138 | description=f"Summarization dataset. Generated with sieves v{Config.get_version()}.", 139 | features=features, 140 | ) 141 | 142 | # Fetch data used for generating dataset. 143 | try: 144 | data = [(doc.text, doc.results[self._task_id]) for doc in docs] 145 | except KeyError as err: 146 | raise KeyError(f"Not all documents have results for this task with ID {self._task_id}") from err 147 | 148 | def generate_data() -> Iterable[dict[str, Any]]: 149 | """Yield results as dicts. 150 | 151 | :return: Results as dicts. 152 | """ 153 | for text, summary in data: 154 | yield {"text": text, "summary": summary} 155 | 156 | # Create dataset. 
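# Each yielded record pairs a document's text with the summary stored under this task's ID.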
157 | return datasets.Dataset.from_generator(generate_data, features=features, info=info) 158 | 159 | @override 160 | def distill( 161 | self, 162 | base_model_id: str, 163 | framework: DistillationFramework, 164 | data: datasets.Dataset | Sequence[Doc], 165 | output_path: Path | str, 166 | val_frac: float, 167 | init_kwargs: dict[str, Any] | None = None, 168 | train_kwargs: dict[str, Any] | None = None, 169 | seed: int | None = None, 170 | ) -> None: 171 | raise NotImplementedError 172 | -------------------------------------------------------------------------------- /sieves/tasks/predictive/translation/__init__.py: -------------------------------------------------------------------------------- 1 | """Translation task.""" 2 | 3 | from .core import FewshotExample, Translation, _TaskPromptSignature, _TaskResult 4 | 5 | __all__ = ["Translation", "FewshotExample", "_TaskResult", "_TaskPromptSignature"] 6 | -------------------------------------------------------------------------------- /sieves/tasks/predictive/translation/core.py: -------------------------------------------------------------------------------- 1 | """Translation predictive task.""" 2 | 3 | from __future__ import annotations 4 | 5 | from collections.abc import Iterable, Sequence 6 | from pathlib import Path 7 | from typing import Any, override 8 | 9 | import datasets 10 | import pydantic 11 | 12 | from sieves.data import Doc 13 | from sieves.engines import EngineType, dspy_, langchain_, outlines_ 14 | from sieves.engines.types import GenerationSettings 15 | from sieves.serialization import Config 16 | from sieves.tasks.postprocessing.distillation.types import DistillationFramework 17 | from sieves.tasks.predictive.core import FewshotExample as BaseFewshotExample 18 | from sieves.tasks.predictive.core import PredictiveTask 19 | from sieves.tasks.predictive.translation.bridges import ( 20 | DSPyTranslation, 21 | LangChainTranslation, 22 | OutlinesTranslation, 23 | ) 24 | 25 | _TaskModel = dspy_.Model | langchain_.Model | outlines_.Model 26 | _TaskPromptSignature = pydantic.BaseModel | dspy_.PromptSignature 27 | _TaskResult = outlines_.Result | dspy_.Result 28 | _TaskBridge = DSPyTranslation | LangChainTranslation | OutlinesTranslation 29 | 30 | 31 | class FewshotExample(BaseFewshotExample): 32 | """Few-shot example with a target translation.""" 33 | 34 | to: str 35 | translation: str 36 | 37 | @override 38 | @property 39 | def input_fields(self) -> Sequence[str]: 40 | return "text", "to" 41 | 42 | @override 43 | @property 44 | def target_fields(self) -> Sequence[str]: 45 | return ("translation",) 46 | 47 | 48 | class Translation(PredictiveTask[_TaskPromptSignature, _TaskResult, _TaskBridge]): 49 | """Translate documents into a target language using structured engines.""" 50 | 51 | def __init__( 52 | self, 53 | to: str, 54 | model: _TaskModel, 55 | task_id: str | None = None, 56 | include_meta: bool = True, 57 | batch_size: int = -1, 58 | overwrite: bool = False, 59 | prompt_instructions: str | None = None, 60 | fewshot_examples: Sequence[FewshotExample] = (), 61 | generation_settings: GenerationSettings = GenerationSettings(), 62 | ) -> None: 63 | """ 64 | Initialize Translation task. 65 | 66 | :param to: Language to translate to. 67 | :param model: Model to use. 68 | :param task_id: Task ID. 69 | :param include_meta: Whether to include meta information generated by the task. 70 | :param batch_size: Batch size to use for inference. Use -1 to process all documents at once. 71 | :param overwrite: Some tasks, e.g.
anonymization or translation, output a modified version of the input text. 72 | If True, these tasks overwrite the original document text. If False, the result will just be stored in the 73 | documents' `.results` field. 74 | :param prompt_instructions: Custom prompt instructions. If None, default instructions are used. 75 | :param fewshot_examples: Few-shot examples. 76 | :param generation_settings: Settings for structured generation. 77 | """ 78 | self._to = to 79 | 80 | super().__init__( 81 | model=model, 82 | task_id=task_id, 83 | include_meta=include_meta, 84 | batch_size=batch_size, 85 | overwrite=overwrite, 86 | prompt_instructions=prompt_instructions, 87 | fewshot_examples=fewshot_examples, 88 | generation_settings=generation_settings, 89 | ) 90 | 91 | @override 92 | def _init_bridge(self, engine_type: EngineType) -> _TaskBridge: 93 | bridge_types: dict[EngineType, type[_TaskBridge]] = { 94 | EngineType.dspy: DSPyTranslation, 95 | EngineType.langchain: LangChainTranslation, 96 | EngineType.outlines: OutlinesTranslation, 97 | } 98 | 99 | try: 100 | bridge = bridge_types[engine_type]( 101 | task_id=self._task_id, 102 | prompt_instructions=self._custom_prompt_instructions, 103 | overwrite=self._overwrite, 104 | language=self._to, 105 | ) 106 | except KeyError as err: 107 | raise KeyError(f"Engine type {engine_type} is not supported by {self.__class__.__name__}.") from err 108 | 109 | return bridge 110 | 111 | @override 112 | @property 113 | def supports(self) -> set[EngineType]: 114 | return {EngineType.dspy, EngineType.langchain, EngineType.outlines} 115 | 116 | @override 117 | @property 118 | def _state(self) -> dict[str, Any]: 119 | return { 120 | **super()._state, 121 | "to": self._to, 122 | } 123 | 124 | @override 125 | def to_hf_dataset(self, docs: Iterable[Doc], threshold: float = 0.5) -> datasets.Dataset: 126 | # Define metadata. 127 | features = datasets.Features({"text": datasets.Value("string"), "translation": datasets.Value("string")}) 128 | info = datasets.DatasetInfo( 129 | description=f"Translation dataset with target language {self._to}. " 130 | f"Generated with sieves v{Config.get_version()}.", 131 | features=features, 132 | ) 133 | 134 | # Fetch data used for generating dataset. 135 | try: 136 | data = [(doc.text, doc.results[self._task_id]) for doc in docs] 137 | except KeyError as err: 138 | raise KeyError(f"Not all documents have results for this task with ID {self._task_id}") from err 139 | 140 | def generate_data() -> Iterable[dict[str, Any]]: 141 | """Yield results as dicts. 142 | 143 | :return: Results as dicts. 144 | """ 145 | for text, translation in data: 146 | yield {"text": text, "translation": translation} 147 | 148 | # Create dataset.
149 | return datasets.Dataset.from_generator(generate_data, features=features, info=info) 150 | 151 | @override 152 | def distill( 153 | self, 154 | base_model_id: str, 155 | framework: DistillationFramework, 156 | data: datasets.Dataset | Sequence[Doc], 157 | output_path: Path | str, 158 | val_frac: float, 159 | init_kwargs: dict[str, Any] | None = None, 160 | train_kwargs: dict[str, Any] | None = None, 161 | seed: int | None = None, 162 | ) -> None: 163 | raise NotImplementedError 164 | -------------------------------------------------------------------------------- /sieves/tasks/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | """Preprocessing tasks.""" 2 | 3 | from .chunking import Chunking 4 | from .ingestion import Ingestion 5 | 6 | __all__ = ["Chunking", "Ingestion"] 7 | -------------------------------------------------------------------------------- /sieves/tasks/preprocessing/chunking/__init__.py: -------------------------------------------------------------------------------- 1 | from .chonkie_ import Chonkie 2 | from .core import Chunking 3 | from .naive import NaiveChunker 4 | 5 | __all__ = ["Chunking", "Chonkie", "NaiveChunker"] 6 | -------------------------------------------------------------------------------- /sieves/tasks/preprocessing/chunking/chonkie_.py: -------------------------------------------------------------------------------- 1 | """Allows chunking of documents into segments.""" 2 | 3 | import itertools 4 | import sys 5 | from collections.abc import Iterable 6 | from typing import Any 7 | 8 | import chonkie 9 | 10 | from sieves.data.doc import Doc 11 | from sieves.tasks.core import Task 12 | 13 | 14 | class Chonkie(Task): 15 | """Chunker wrapping the chonkie library.""" 16 | 17 | def __init__( 18 | self, 19 | chunker: chonkie.BaseChunker, 20 | task_id: str | None = None, 21 | include_meta: bool = False, 22 | batch_size: int = -1, 23 | ): 24 | """Initialize chunker. 25 | :param chunker: Chunker instance from the chonkie library that performs the actual splitting. 26 | :param task_id: Task ID. 27 | :param include_meta: Whether to include meta information generated by the task. 28 | :param batch_size: Batch size to use for processing. Use -1 to process all documents at once. 29 | """ 30 | super().__init__(task_id=task_id, include_meta=include_meta, batch_size=batch_size) 31 | self._chunker = chunker 32 | 33 | def __call__(self, docs: Iterable[Doc]) -> Iterable[Doc]: 34 | """Split documents into chunks. 35 | 36 | :param docs: Documents to split. 37 | :return: Split documents.
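Chunking is performed batch-wise via the wrapped chunker's `chunk_batch`; the resulting chunk texts are written to each document's `.chunks`.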
38 | """ 39 | batch_size = self._batch_size if self._batch_size > 0 else sys.maxsize 40 | while docs_batch := [doc for doc in itertools.islice(docs, batch_size)]: 41 | if len(docs_batch) == 0: 42 | break 43 | 44 | chunks = self._chunker.chunk_batch([doc.text for doc in docs_batch], show_progress_bar=False) 45 | assert len(chunks) == len(docs_batch) 46 | 47 | for doc, doc_chunks in zip(docs_batch, chunks): 48 | if self._include_meta: 49 | doc.meta |= {self.id: doc_chunks} 50 | doc.chunks = [chunk.text for chunk in doc_chunks] 51 | 52 | yield doc 53 | 54 | @property 55 | def _state(self) -> dict[str, Any]: 56 | return { 57 | **super()._state, 58 | "chunker": self._chunker, 59 | } 60 | -------------------------------------------------------------------------------- /sieves/tasks/preprocessing/chunking/core.py: -------------------------------------------------------------------------------- 1 | """Chunking task.""" 2 | 3 | from __future__ import annotations 4 | 5 | import itertools 6 | from collections.abc import Iterable 7 | from typing import Any 8 | 9 | import chonkie 10 | 11 | from sieves.data.doc import Doc 12 | from sieves.serialization import Config 13 | from sieves.tasks.core import Task 14 | from sieves.tasks.preprocessing import chunking 15 | from sieves.tasks.preprocessing.chunking import chonkie_, naive 16 | 17 | _ChunkerArgType = chonkie.BaseChunker | int 18 | _ChunkerType = chonkie_.Chonkie | naive.NaiveChunker 19 | 20 | 21 | class Chunking(Task): 22 | """Task for chunking documents using different strategies. 23 | 24 | This task acts as a wrapper around specific chunker implementations, 25 | allowing for flexible configuration based on the provided chunker object or interval. 26 | """ 27 | 28 | def __init__( 29 | self, 30 | chunker: _ChunkerArgType, 31 | task_id: str | None = None, 32 | include_meta: bool = False, 33 | batch_size: int = -1, 34 | ): 35 | """Initialize the Chunking task. 36 | 37 | :param chunker: The chunker instance (chonkie.BaseChunker) or the interval (int) for NaiveChunker. 38 | :param task_id: Task ID. 39 | :param include_meta: Whether to include meta information generated by the task. 40 | :param batch_size: Batch size to use for processing. Use -1 to process all documents at once. 41 | """ 42 | super().__init__(task_id=task_id, include_meta=include_meta, batch_size=batch_size) 43 | self._chunker_arg = chunker 44 | self._task = self._init_chunker_task() 45 | 46 | def _init_chunker_task(self) -> _ChunkerType: 47 | """Initialize the specific chunker task based on the type of _chunker_arg. 48 | 49 | :return: Initialized chunker task instance. 50 | :raises TypeError: If the type of _chunker_arg is not supported. 51 | """ 52 | chunker_task: _ChunkerType 53 | 54 | match self._chunker_arg: 55 | case chunker if isinstance(chunker, chonkie.BaseChunker): 56 | chunker_task = chunking.chonkie_.Chonkie( 57 | chunker=chunker, 58 | task_id=self.id, 59 | include_meta=self._include_meta, 60 | batch_size=self._batch_size, 61 | ) 62 | case interval if isinstance(interval, int): 63 | chunker_task = chunking.naive.NaiveChunker( 64 | interval=interval, 65 | task_id=self.id, 66 | include_meta=self._include_meta, 67 | batch_size=self._batch_size, 68 | ) 69 | case _: 70 | raise TypeError( 71 | f"Unsupported type for 'chunker' argument: {type(self._chunker_arg)}. " 72 | f"Expected chonkie.BaseChunker or int." 73 | ) 74 | 75 | return chunker_task 76 | 77 | def __call__(self, docs: Iterable[Doc]) -> Iterable[Doc]: 78 | """Process documents by chunking their text.
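Validates that all documents have text, then delegates to the chunker task selected at initialization.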
79 | 80 | :param docs: Documents to process. 81 | :return: Processed documents with chunks added. 82 | """ 83 | docs_iters = itertools.tee(docs, 2) 84 | assert all(doc.text for doc in docs_iters[0]), ValueError("Documents have to have a value for .text.") 85 | yield from self._task(docs_iters[1]) 86 | 87 | @property 88 | def _state(self) -> dict[str, Any]: 89 | """Return attributes to serialize. 90 | 91 | :return: Dict of attributes to serialize. 92 | """ 93 | return { 94 | **super()._state, 95 | "chunker": self._chunker_arg, 96 | } 97 | 98 | @classmethod 99 | def deserialize(cls, config: Config, **kwargs: dict[str, Any]) -> Chunking: 100 | """Generate Chunking instance from config. 101 | 102 | :param config: Config to generate instance from. 103 | :param kwargs: Values to inject into loaded config. 104 | :return: Deserialized Chunking instance. 105 | """ 106 | return cls(**config.to_init_dict(cls, **kwargs)) 107 | -------------------------------------------------------------------------------- /sieves/tasks/preprocessing/chunking/naive.py: -------------------------------------------------------------------------------- 1 | """Allows chunking of documents into segments.""" 2 | 3 | import itertools 4 | import re 5 | import sys 6 | from collections.abc import Iterable 7 | from typing import Any 8 | 9 | from sieves.data.doc import Doc 10 | from sieves.tasks.core import Task 11 | 12 | 13 | class NaiveChunker(Task): 14 | """Chunks by sentence counts. Only for test purposes.""" 15 | 16 | def __init__( 17 | self, 18 | interval: int, 19 | task_id: str | None = None, 20 | include_meta: bool = False, 21 | batch_size: int = -1, 22 | ): 23 | """Initialize chunker. 24 | 25 | :param interval: Number of sentences per chunk. 26 | :param task_id: Task ID. 27 | :param include_meta: Whether to include meta information generated by the task. 28 | :param batch_size: Batch size to use for processing. Use -1 to process all documents at once. 29 | """ 30 | super().__init__(task_id=task_id, include_meta=include_meta, batch_size=batch_size) 31 | self._interval = interval 32 | 33 | def __call__(self, docs: Iterable[Doc]) -> Iterable[Doc]: 34 | """Split documents into chunks. 35 | 36 | :param docs: Documents to split. 37 | :return: Split documents.
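Sentences are detected naively by splitting on '.', '!' and '?'; every `interval` sentences form one chunk.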
38 | """ 39 | batch_size = self._batch_size if self._batch_size > 0 else sys.maxsize 40 | while docs_batch := [doc for doc in itertools.islice(docs, batch_size)]: 41 | if len(docs_batch) == 0: 42 | break 43 | 44 | for doc in docs_batch: 45 | assert doc.text 46 | sentences = [sent for sent in re.split("[?!.]", doc.text) if len(sent.strip())] 47 | doc.chunks = [ 48 | ".".join(sentences[i : i + self._interval]) for i in range(0, len(sentences), self._interval) 49 | ] 50 | 51 | yield doc 52 | 53 | @property 54 | def _state(self) -> dict[str, Any]: 55 | return { 56 | **super()._state, 57 | "interval": self._interval, 58 | } 59 | -------------------------------------------------------------------------------- /sieves/tasks/preprocessing/ingestion/__init__.py: -------------------------------------------------------------------------------- 1 | """Ingestion task implementation.""" 2 | 3 | from .core import Ingestion 4 | from .docling_ import Docling 5 | from .marker_ import Marker 6 | from .unstructured_ import Unstructured 7 | 8 | __all__ = ["Docling", "Marker", "Ingestion", "Unstructured"] 9 | -------------------------------------------------------------------------------- /sieves/tasks/preprocessing/ingestion/core.py: -------------------------------------------------------------------------------- 1 | """Ingestion task implementation.""" 2 | 3 | from __future__ import annotations 4 | 5 | from collections.abc import Iterable 6 | from typing import Any 7 | 8 | import docling 9 | import docling.document_converter 10 | import marker 11 | from marker.converters.pdf import PdfConverter 12 | from marker.converters.table import TableConverter 13 | 14 | from sieves.data.doc import Doc 15 | from sieves.serialization import Config 16 | from sieves.tasks.core import Task 17 | from sieves.tasks.preprocessing.ingestion import docling_, marker_ 18 | 19 | _ConverterType = docling.document_converter.DocumentConverter | PdfConverter | TableConverter 20 | 21 | 22 | class Ingestion(Task): 23 | """Base class for Ingestion tasks that extract text from documents. 24 | 25 | This unified interface allows different Ingestion converters to be used interchangeably. 26 | """ 27 | 28 | def __init__( 29 | self, 30 | converter: _ConverterType = docling.document_converter.DocumentConverter(), 31 | export_format: str = "markdown", 32 | task_id: str | None = None, 33 | include_meta: bool = False, 34 | batch_size: int = -1, 35 | **kwargs: Any, 36 | ): 37 | """Initialize the Ingestion task. 38 | 39 | :param converter: The Ingestion converter to use. 40 | :param task_id: Task ID. 41 | :param include_meta: Whether to include meta information generated by the task. 42 | :param batch_size: Batch size to use for processing. Use -1 to process all documents at once. 43 | :param kwargs: Additional arguments for specific Ingestion implementations. 44 | """ 45 | super().__init__(task_id=task_id, include_meta=include_meta, batch_size=batch_size) 46 | self._export_format = export_format 47 | self._converter = converter 48 | self._kwargs = kwargs 49 | self._task = self._init_ingestion_task() 50 | 51 | def _init_ingestion_task(self) -> Task: 52 | """Initialize the bridge for the specific Ingestion implementation. 53 | 54 | :return: Ingestion bridge implementation. 
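:raises ValueError: If the converter type is not supported.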
55 | """ 56 | converter_type = type(self._converter) 57 | ingestion_task: Task 58 | match converter_type: 59 | case converter if issubclass( 60 | converter, (marker.converters.pdf.PdfConverter | marker.converters.table.TableConverter) 61 | ): 62 | ingestion_task = marker_.Marker( 63 | converter=self._converter, 64 | export_format=self._export_format, 65 | task_id=self.id, 66 | include_meta=self._include_meta, 67 | batch_size=self._batch_size, 68 | **self._kwargs, 69 | ) 70 | case docling.document_converter.DocumentConverter: 71 | ingestion_task = docling_.Docling( 72 | converter=self._converter, 73 | export_format=self._export_format, 74 | task_id=self.id, 75 | include_meta=self._include_meta, 76 | batch_size=self._batch_size, 77 | ) 78 | case _: 79 | raise ValueError( 80 | f"converter type {self._converter} is not supported. Please check the documentation " 81 | f"and ensure you're providing a supported converter type." 82 | ) 83 | assert isinstance(ingestion_task, Task) 84 | return ingestion_task 85 | 86 | def __call__(self, docs: Iterable[Doc]) -> Iterable[Doc]: 87 | """Process documents with Ingestion to extract text. 88 | 89 | :param docs: Documents to process. 90 | :return: Processed documents with extracted text. 91 | """ 92 | docs = list(docs) 93 | assert all(doc.uri for doc in docs), ValueError("Documents have to have a value for .uri.") 94 | result = self._task(docs) 95 | 96 | yield from result 97 | 98 | @property 99 | def _state(self) -> dict[str, Any]: 100 | """Returns attributes to serialize. 101 | 102 | :return: Dict of attributes to serialize. 103 | """ 104 | return { 105 | **super()._state, 106 | "converter": self._converter, 107 | "export_format": self._export_format, 108 | **self._kwargs, 109 | } 110 | 111 | @classmethod 112 | def deserialize(cls, config: Config, **kwargs: dict[str, Any]) -> Ingestion: 113 | """ 114 | Generate Ingestion instance from config. 115 | 116 | :param config: Config to generate instance from. 117 | :param kwargs: Values to inject into loaded config. 118 | :return: Deserialized Ingestion instance. 119 | """ 120 | return cls(**config.to_init_dict(cls, **kwargs)) 121 | -------------------------------------------------------------------------------- /sieves/tasks/preprocessing/ingestion/docling_.py: -------------------------------------------------------------------------------- 1 | """Wrapper for `Docling` for the conversion of complex files into markdown.""" 2 | 3 | import warnings 4 | from collections.abc import Iterable 5 | from typing import Any 6 | 7 | import docling.datamodel.document 8 | import docling.document_converter 9 | from loguru import logger 10 | 11 | from sieves.data.doc import Doc 12 | from sieves.tasks.core import Task 13 | 14 | 15 | class Docling(Task): 16 | """Parser wrapping the docling library to convert files into documents.""" 17 | 18 | def __init__( 19 | self, 20 | converter: docling.document_converter.DocumentConverter | None = None, 21 | export_format: str = "markdown", 22 | task_id: str | None = None, 23 | include_meta: bool = False, 24 | batch_size: int = -1, 25 | ): 26 | """Initialize the docling parser. 27 | 28 | :param converter: Docling parser instance. 29 | :param task_id: Task ID. 30 | :param include_meta: Whether to include meta information generated by the task. 31 | :param batch_size: Batch size to use for processing. Use -1 to process all documents at once. 
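:param export_format: Format to export the parsed document in ("markdown", "html", or "json").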
32 | """ 33 | super().__init__(task_id=task_id, include_meta=include_meta, batch_size=batch_size) 34 | self._converter = converter if converter else docling.document_converter.DocumentConverter() 35 | self._export_format = export_format 36 | 37 | def __call__(self, docs: Iterable[Doc]) -> Iterable[Doc]: 38 | """Parse resources using docling. 39 | 40 | :param docs: Resources to process. 41 | :return: Parsed documents 42 | """ 43 | docs = list(docs) 44 | 45 | # Validate docs. 46 | have_text = False 47 | for doc in docs: 48 | assert doc.uri, ValueError("Documents have to have a value for .uri.") 49 | if doc.text: 50 | have_text = True 51 | if have_text: 52 | warnings.warn(f"Task {self._task_id} is about to overwrite existing .text values.") 53 | 54 | parsed_resources: list[docling.datamodel.document.ConversionResult] = list( 55 | self._converter.convert_all([resource.uri for resource in docs]) 56 | ) 57 | assert len(parsed_resources) == len(docs) 58 | 59 | for doc, parsed_resource in zip(docs, parsed_resources): 60 | try: 61 | if self._include_meta: 62 | doc.meta |= {self.id: parsed_resource} 63 | if self._export_format == "markdown": 64 | doc.text = parsed_resource.document.export_to_markdown() 65 | elif self._export_format == "html": 66 | doc.text = parsed_resource.document.export_to_html() 67 | elif self._export_format == "json": 68 | doc.text = parsed_resource.document.export_to_dict() 69 | except Exception as e: 70 | logger.error(f"Failed to parse file {doc.uri}: {str(e)}") 71 | continue 72 | 73 | return docs 74 | 75 | @property 76 | def _state(self) -> dict[str, Any]: 77 | return { 78 | **super()._state, 79 | "converter": self._converter, 80 | "export_format": self._export_format, 81 | } 82 | -------------------------------------------------------------------------------- /sieves/tasks/preprocessing/ingestion/marker_.py: -------------------------------------------------------------------------------- 1 | """Marker task for converting PDF documents to text.""" 2 | 3 | from collections.abc import Iterable 4 | from pathlib import Path 5 | from typing import Any 6 | 7 | from marker.converters.pdf import PdfConverter 8 | from marker.converters.table import TableConverter 9 | from marker.models import create_model_dict 10 | from marker.output import text_from_rendered 11 | 12 | from sieves.data import Doc 13 | from sieves.tasks.core import Task 14 | 15 | 16 | class Marker(Task): 17 | """Marker task for converting PDF documents to text.""" 18 | 19 | def __init__( 20 | self, 21 | converter: PdfConverter | TableConverter | None = None, 22 | export_format: str = "markdown", 23 | task_id: str | None = None, 24 | include_meta: bool = False, 25 | batch_size: int = -1, 26 | extract_images: bool = False, 27 | ): 28 | """Initialize the Marker task. 29 | 30 | :param converter: Custom PdfConverter or TableConverter instance. If None, a default one will be created. 31 | :param export_format: Format to export the document in ("markdown", "html", or "json"). 32 | :param task_id: Task ID. 33 | :param include_meta: Whether to include meta information generated by the task. 34 | :param batch_size: Batch size to use for processing. Use -1 to process all documents at once. 35 | :param extract_images: Whether to extract images from the PDF. 
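:raises ValueError: If the converter type or the export format is invalid.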
36 | """ 37 | super().__init__(task_id=task_id, include_meta=include_meta, batch_size=batch_size) 38 | 39 | self._export_format = export_format 40 | self._converter = self._setup_converter(converter, self._export_format) 41 | self._extract_images = extract_images 42 | 43 | def _setup_converter( 44 | self, converter: PdfConverter | TableConverter | None, export_format: str 45 | ) -> PdfConverter | TableConverter: 46 | """Set up the converter with the specified renderer. 47 | 48 | :param converter: Custom converter instance or None. 49 | :param export_format: Format to export the document in. 50 | :return: Configured converter instance. 51 | """ 52 | renderer: str = self._get_renderer(export_format) 53 | if converter is None: 54 | return PdfConverter(artifact_dict=create_model_dict(), renderer=renderer) 55 | 56 | # If a converter is provided, use its type but update the renderer 57 | if isinstance(converter, TableConverter): 58 | return TableConverter(artifact_dict=create_model_dict(), renderer=renderer) 59 | elif isinstance(converter, PdfConverter): 60 | return PdfConverter(artifact_dict=create_model_dict(), renderer=renderer) 61 | else: 62 | raise ValueError(f"Invalid converter type: {type(converter)}") 63 | 64 | def _get_renderer(self, export_format: str) -> str: 65 | """Get the renderer string based on the export format. 66 | 67 | :param export_format: Format to export the document in. 68 | :return: The renderer string. 69 | :raises ValueError: If the export format is invalid. 70 | """ 71 | if export_format == "markdown": 72 | return "marker.renderers.markdown.MarkdownRenderer" 73 | elif export_format == "html": 74 | return "marker.renderers.html.HTMLRenderer" 75 | elif export_format == "json": 76 | return "marker.renderers.json.JSONRenderer" 77 | else: 78 | raise ValueError(f"Invalid export format: {export_format}") 79 | 80 | def __call__(self, docs: Iterable[Doc]) -> Iterable[Doc]: 81 | """Process documents using Marker. 82 | 83 | :param docs: Documents to process. 84 | :return: Processed documents. 85 | """ 86 | docs = list(docs) 87 | 88 | for doc in docs: 89 | # Convert URI to string if it's a Path 90 | uri = str(doc.uri) if isinstance(doc.uri, Path) else doc.uri 91 | # Process the document 92 | rendered = self._converter(uri) 93 | 94 | # Extract text and optionally images 95 | text, _, images = text_from_rendered(rendered) 96 | if self._extract_images: 97 | doc.images = images 98 | 99 | # Update document text 100 | doc.text = text 101 | 102 | for doc in docs: 103 | yield doc 104 | 105 | @property 106 | def _state(self) -> dict[str, Any]: 107 | """Get state for serialization. 108 | 109 | :return: State dictionary. 
110 | """ 111 | return { 112 | **super()._state, 113 | "converter": self._converter, 114 | "export_format": self._export_format, 115 | "extract_images": self._extract_images, 116 | } 117 | -------------------------------------------------------------------------------- /sieves/tasks/preprocessing/ingestion/unstructured_.py: -------------------------------------------------------------------------------- 1 | """File preprocessing for converting raw files into documents.""" 2 | 3 | import warnings 4 | from collections.abc import Callable, Iterable 5 | from typing import Any 6 | 7 | import nltk 8 | import unstructured 9 | import unstructured.documents.elements 10 | import unstructured.partition.auto 11 | 12 | from sieves.data.doc import Doc 13 | from sieves.tasks.core import Task 14 | 15 | PartitionType = Callable[..., list[unstructured.documents.elements.Text]] 16 | CleanerType = Callable[[str], str] 17 | 18 | 19 | class Unstructured(Task): 20 | """Parser wrapping the unstructured library to convert files into documents.""" 21 | 22 | def __init__( 23 | self, 24 | partition: PartitionType = unstructured.partition.auto.partition, 25 | cleaners: tuple[CleanerType, ...] = (), 26 | task_id: str | None = None, 27 | include_meta: bool = False, 28 | batch_size: int = -1, 29 | **kwargs: dict[str, Any], 30 | ): 31 | """Initialize the unstructured parser. 32 | 33 | :param partition: Function to use for partitioning. 34 | :param cleaners: Cleaning functions to apply. 35 | :param task_id: Task ID. 36 | :param include_meta: Whether to include meta information generated by the task. 37 | :param batch_size: Batch size to use for processing. Use -1 to process all documents at once. 38 | :param kwargs: Kwargs to be supplied to partitioning call. 39 | """ 40 | super().__init__(task_id=task_id, include_meta=include_meta, batch_size=batch_size) 41 | self._partition = partition 42 | self._partition_args = kwargs or {} 43 | self._cleaners = cleaners 44 | 45 | Unstructured._require() 46 | 47 | @staticmethod 48 | def _require() -> None: 49 | """Download all necessary resources that have to be installed from within Python.""" 50 | # Some nltk resources seem necessary for basic functionality. 51 | for nltk_resource in ("punkt_tab", "averaged_perceptron_tagger_eng"): 52 | # Don't install if already available. 53 | try: 54 | nltk.data.find(nltk_resource) 55 | except LookupError: 56 | nltk.download(nltk_resource) 57 | 58 | def __call__(self, docs: Iterable[Doc]) -> Iterable[Doc]: 59 | """Parse resources using unstructured. 60 | 61 | :param docs: Resources to process. 62 | :return: Parsed documents. 63 | """ 64 | docs = list(docs) 65 | 66 | # Validate docs. 67 | have_text = False 68 | for doc in docs: 69 | assert doc.uri, ValueError("Documents have to have a value for .uri.") 70 | if doc.text: 71 | have_text = True 72 | if have_text: 73 | warnings.warn(f"Task {self._task_id} is about to overwrite existing .text values.") 74 | 75 | # Determine whether the partitioning call also performs chunking. 76 | does_chunking = "chunking_strategy" in self._partition_args 77 | 78 | for doc in docs: 79 | try: 80 | # Parse and process document. 81 | parsed_resources: list[unstructured.documents.elements.Text] = self._partition( 82 | doc.uri, **self._partition_args 83 | ) 84 | 85 | # Apply specified cleaners. 86 | for cleaner in self._cleaners: 87 | for pr in parsed_resources: 88 | pr.apply(cleaner) 89 | 90 | # Integrate into Doc instances. 91 | if self._include_meta: 92 | doc.meta |= {self.id: parsed_resources} 93 | 94 | # Use chunks.
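# With a chunking strategy set, unstructured returns one element per chunk rather than per raw partition.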
95 | if does_chunking: 96 | doc.chunks = [pr.text for pr in parsed_resources] 97 | 98 | # Merge texts from all elements into single string for the entire document. 99 | doc.text = "\n".join(resource.text for resource in parsed_resources) 100 | 101 | except FileNotFoundError as err: 102 | raise FileNotFoundError( 103 | f"File at {doc.uri} not found. Ensure that this is a local file path - unstructured doesn't support" 104 | f" loading files via network URIs." 105 | ) from err 106 | 107 | return docs 108 | 109 | @property 110 | def _state(self) -> dict[str, Any]: 111 | return { 112 | **super()._state, 113 | "partition": self._partition, 114 | "cleaners": self._cleaners, 115 | **self._partition_args, 116 | } 117 | -------------------------------------------------------------------------------- /sieves/tasks/types.py: -------------------------------------------------------------------------------- 1 | """Common types.""" 2 | 3 | from sieves.engines.engine_import import ( 4 | dspy_, 5 | glix_, 6 | huggingface_, 7 | langchain_, 8 | outlines_, 9 | ) 10 | 11 | Model = dspy_.Model | glix_.Model | huggingface_.Model | langchain_.Model | outlines_.Model 12 | -------------------------------------------------------------------------------- /sieves/tests/assets/1204.0162v2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MantisAI/sieves/3ce96055c1343849909650265de2b5e7d98745b8/sieves/tests/assets/1204.0162v2.pdf -------------------------------------------------------------------------------- /sieves/tests/assets/dummy.txt: -------------------------------------------------------------------------------- 1 | This is a dummy text file. 2 | This is a dummy text file. 3 | This is a dummy text file. 4 | This is a dummy text file. 5 | This is a dummy text file. 6 | This is a dummy text file. 7 | This is a dummy text file. 8 | This is a dummy text file. 9 | This is a dummy text file. 10 | This is a dummy text file. 11 | This is a dummy text file. 12 | This is a dummy text file. 13 | This is a dummy text file. 14 | This is a dummy text file. 15 | This is a dummy text file. 16 | This is a dummy text file. 17 | This is a dummy text file. 18 | This is a dummy text file. 19 | This is a dummy text file. 20 | This is a dummy text file. 21 | This is a dummy text file. 22 | This is a dummy text file. 23 | This is a dummy text file. 24 | This is a dummy text file. 25 | This is a dummy text file. 26 | This is a dummy text file. 27 | This is a dummy text file. 28 | This is a dummy text file. 29 | This is a dummy text file. 30 | This is a dummy text file. 31 | This is a dummy text file. 32 | This is a dummy text file. 33 | This is a dummy text file. 34 | This is a dummy text file. 35 | This is a dummy text file. 36 | This is a dummy text file. 37 | This is a dummy text file. 38 | This is a dummy text file. 39 | This is a dummy text file. 40 | This is a dummy text file. 41 | This is a dummy text file. 42 | This is a dummy text file. 43 | This is a dummy text file. 44 | This is a dummy text file. 45 | This is a dummy text file. 46 | This is a dummy text file. 47 | This is a dummy text file. 48 | This is a dummy text file. 49 | This is a dummy text file. 50 | This is a dummy text file. 51 | This is a dummy text file. 52 | This is a dummy text file. 53 | This is a dummy text file. 54 | This is a dummy text file. 55 | This is a dummy text file. 56 | This is a dummy text file. 57 | This is a dummy text file. 58 | This is a dummy text file. 
59 | This is a dummy text file. 60 | This is a dummy text file. 61 | This is a dummy text file. 62 | This is a dummy text file. 63 | This is a dummy text file. 64 | This is a dummy text file. 65 | This is a dummy text file. 66 | This is a dummy text file. 67 | This is a dummy text file. 68 | This is a dummy text file. 69 | This is a dummy text file. 70 | This is a dummy text file. 71 | This is a dummy text file. 72 | This is a dummy text file. 73 | This is a dummy text file. 74 | This is a dummy text file. 75 | This is a dummy text file. 76 | This is a dummy text file. 77 | This is a dummy text file. 78 | This is a dummy text file. 79 | This is a dummy text file. 80 | This is a dummy text file. -------------------------------------------------------------------------------- /sieves/tests/tasks/predictive/test_information_extraction.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | import pydantic 3 | import pytest 4 | 5 | from sieves import Doc, Pipeline, tasks 6 | from sieves.engines import EngineType 7 | from sieves.serialization import Config 8 | from sieves.tasks import PredictiveTask 9 | from sieves.tasks.predictive import information_extraction 10 | 11 | 12 | class Person(pydantic.BaseModel, frozen=True): 13 | name: str 14 | age: pydantic.PositiveInt 15 | 16 | 17 | @pytest.mark.parametrize( 18 | "batch_runtime", 19 | ( 20 | EngineType.dspy, 21 | EngineType.langchain, 22 | EngineType.outlines, 23 | ), 24 | indirect=["batch_runtime"], 25 | ) 26 | @pytest.mark.parametrize("fewshot", [True, False]) 27 | def test_run(information_extraction_docs, batch_runtime, fewshot) -> None: 28 | fewshot_examples = [ 29 | information_extraction.FewshotExample( 30 | text="Ada Lovelace lived to 47 years old. Zeno of Citium died with 72 years.", 31 | reasoning="There is mention of two people in this text, including lifespans. I will extract those.", 32 | entities=[Person(name="Ada Lovelace", age=47), Person(name="Zeno of Citium", age=72)], 33 | ), 34 | information_extraction.FewshotExample( 35 | text="Alan Watts passed away at the age of 58 years. Alan Watts was 58 years old at the time of his death.", 36 | reasoning="There is mention of one person in this text, including lifespan.
I will extract this person.", 37 | entities=[Person(name="Alan Watts", age=58)], 38 | ), 39 | ] 40 | 41 | fewshot_args = {"fewshot_examples": fewshot_examples} if fewshot else {} 42 | pipe = Pipeline( 43 | [ 44 | tasks.predictive.InformationExtraction( 45 | entity_type=Person, 46 | model=batch_runtime.model, 47 | generation_settings=batch_runtime.generation_settings, 48 | batch_size=batch_runtime.batch_size, 49 | **fewshot_args), 50 | ] 51 | ) 52 | docs = list(pipe(information_extraction_docs)) 53 | 54 | assert len(docs) == 2 55 | for doc in docs: 56 | assert doc.text 57 | assert "InformationExtraction" in doc.results 58 | 59 | with pytest.raises(NotImplementedError): 60 | pipe["InformationExtraction"].distill(None, None, None, None, None, None, None, None) 61 | 62 | 63 | @pytest.mark.parametrize("batch_runtime", [EngineType.dspy], indirect=["batch_runtime"]) 64 | def test_to_hf_dataset(information_extraction_docs, batch_runtime) -> None: 65 | task = tasks.predictive.InformationExtraction( 66 | entity_type=Person, model=batch_runtime.model, generation_settings=batch_runtime.generation_settings, batch_size=batch_runtime.batch_size 67 | ) 68 | pipe = Pipeline(task) 69 | docs = pipe(information_extraction_docs) 70 | 71 | assert isinstance(task, PredictiveTask) 72 | dataset = task.to_hf_dataset(docs) 73 | assert all([key in dataset.features for key in ("text", "entities")]) 74 | assert len(dataset) == 2 75 | records = list(dataset) 76 | assert records[0]["text"] == "Mahatma Ghandi lived to 79 years old. Bugs Bunny is at least 85 years old." 77 | assert records[1]["text"] == "Marie Curie passed away at the age of 67 years. Marie Curie was 67 years old." 78 | for record in records: 79 | assert isinstance(record["entities"], dict) 80 | assert isinstance(record["entities"]["age"], list) 81 | assert isinstance(record["entities"]["name"], list) 82 | 83 | with pytest.raises(KeyError): 84 | task.to_hf_dataset([Doc(text="This is a dummy text.")]) 85 | 86 | 87 | @pytest.mark.parametrize("batch_runtime", [EngineType.outlines], indirect=["batch_runtime"]) 88 | def test_serialization(information_extraction_docs, batch_runtime) -> None: 89 | pipe = Pipeline( 90 | tasks.predictive.InformationExtraction( 91 | entity_type=Person, model=batch_runtime.model, generation_settings=batch_runtime.generation_settings, batch_size=batch_runtime.batch_size, 92 | ) 93 | ) 94 | 95 | config = pipe.serialize() 96 | assert config.model_dump() == {'cls_name': 'sieves.pipeline.core.Pipeline', 97 | 'tasks': {'is_placeholder': False, 98 | 'value': [{'cls_name': 'sieves.tasks.predictive.information_extraction.core.InformationExtraction', 99 | 'entity_type': {'is_placeholder': True, 100 | 'value': 'pydantic._internal._model_construction.ModelMetaclass'}, 101 | 'fewshot_examples': {'is_placeholder': False, 102 | 'value': ()}, 103 | 'batch_size': {'is_placeholder': False, "value": -1}, 104 | 'generation_settings': {'is_placeholder': False, 105 | 'value': { 106 | 'config_kwargs': None, 107 | 'inference_kwargs': None, 108 | 'init_kwargs': None, 109 | 'strict_mode': False}}, 110 | 'include_meta': {'is_placeholder': False, 'value': True}, 111 | 'model': {'is_placeholder': True, 112 | 'value': 'outlines.models.transformers.Transformers'}, 113 | 'prompt_instructions': {'is_placeholder': False, 114 | 'value': None}, 115 | 'task_id': {'is_placeholder': False, 116 | 'value': 'InformationExtraction'}, 117 | 'version': Config.get_version()}]}, 118 | 'use_cache': {'is_placeholder': False, 'value': True}, 119 | 'version': Config.get_version()} 
120 | 121 | Pipeline.deserialize(config=config, tasks_kwargs=[{"model": batch_runtime.model, "entity_type": Person}]) 122 | -------------------------------------------------------------------------------- /sieves/tests/tasks/predictive/test_ner.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | import pytest 3 | 4 | from sieves import Doc, Pipeline 5 | from sieves.engines import EngineType 6 | from sieves.serialization import Config 7 | from sieves.tasks import PredictiveTask 8 | from sieves.tasks.predictive import ner 9 | from sieves.tasks.predictive.ner.core import Entity 10 | 11 | 12 | @pytest.mark.parametrize( 13 | "batch_runtime", 14 | ( 15 | EngineType.dspy, 16 | EngineType.langchain, 17 | EngineType.outlines, 18 | EngineType.glix, 19 | ), 20 | indirect=["batch_runtime"], 21 | ) 22 | @pytest.mark.parametrize("fewshot", [True, False]) 23 | def test_run(ner_docs, batch_runtime, fewshot) -> None: 24 | fewshot_examples = [ 25 | ner.FewshotExample( 26 | text="John studied data science in Barcelona and lives with Jaume", 27 | entities=[ 28 | Entity(text="John", context="John studied data", entity_type="PERSON"), 29 | Entity(text="Barcelona", context="science in Barcelona", entity_type="LOCATION"), 30 | Entity(text="Jaume", context="lives with Jaume", entity_type="PERSON"), 31 | ], 32 | ), 33 | ner.FewshotExample( 34 | text="Maria studied computer engineering in Madrid and works with Carlos", 35 | entities=[ 36 | Entity(text="Maria", context="Maria studied computer", entity_type="PERSON"), 37 | Entity(text="Madrid", context="engineering in Madrid and works", entity_type="LOCATION"), 38 | Entity(text="Carlos", context="works with Carlos", entity_type="PERSON"), 39 | ], 40 | ), 41 | ] 42 | 43 | fewshot_args = {"fewshot_examples": fewshot_examples} if fewshot else {} 44 | pipe = Pipeline( 45 | ner.NER( 46 | entities=["PERSON", "LOCATION", "COMPANY"], 47 | model=batch_runtime.model, 48 | generation_settings=batch_runtime.generation_settings, 49 | batch_size=batch_runtime.batch_size, 50 | **fewshot_args 51 | ) 52 | ) 53 | docs = list(pipe(ner_docs)) 54 | 55 | assert len(docs) == 2 56 | for doc in docs: 57 | assert "NER" in doc.results 58 | 59 | with pytest.raises(NotImplementedError): 60 | pipe["NER"].distill(None, None, None, None, None, None, None, None) 61 | 62 | 63 | @pytest.mark.parametrize("batch_runtime", [EngineType.dspy], indirect=["batch_runtime"]) 64 | def test_serialization(ner_docs, batch_runtime) -> None: 65 | pipe = Pipeline( 66 | ner.NER( 67 | entities=["PERSON", "LOCATION", "COMPANY"], 68 | model=batch_runtime.model, 69 | generation_settings=batch_runtime.generation_settings, 70 | batch_size=batch_runtime.batch_size, 71 | ) 72 | ) 73 | 74 | config = pipe.serialize() 75 | assert config.model_dump() == {'cls_name': 'sieves.pipeline.core.Pipeline', 76 | 'tasks': {'is_placeholder': False, 77 | 'value': [{'cls_name': 'sieves.tasks.predictive.ner.core.NER', 78 | 'entities': {'is_placeholder': False, 79 | 'value': ['PERSON', 'LOCATION', 'COMPANY']}, 80 | 'fewshot_examples': {'is_placeholder': False, 81 | 'value': ()}, 82 | 'batch_size': {'is_placeholder': False, "value": -1}, 83 | 'generation_settings': {'is_placeholder': False, 84 | 'value': { 85 | 'config_kwargs': None, 86 | 'inference_kwargs': None, 87 | 'init_kwargs': None, 88 | 'strict_mode': False}}, 89 | 'include_meta': {'is_placeholder': False, 'value': True}, 90 | 'model': {'is_placeholder': True, 91 | 'value': 'dspy.clients.lm.LM'}, 92 | 'prompt_instructions': 
{'is_placeholder': False, 93 | 'value': None}, 94 | 'task_id': {'is_placeholder': False, 'value': 'NER'}, 95 | 'version': Config.get_version()}]}, 96 | 'use_cache': {'is_placeholder': False, 'value': True}, 97 | 'version': Config.get_version()} 98 | Pipeline.deserialize( 99 | config=config, 100 | tasks_kwargs=[{"model": batch_runtime.model}], 101 | ) 102 | 103 | 104 | @pytest.mark.parametrize("batch_runtime", [EngineType.glix], indirect=["batch_runtime"]) 105 | def test_to_hf_dataset(ner_docs, batch_runtime) -> None: 106 | task = ner.NER( 107 | entities=["PERSON", "LOCATION", "COMPANY"], 108 | model=batch_runtime.model, 109 | generation_settings=batch_runtime.generation_settings, 110 | batch_size=batch_runtime.batch_size, 111 | ) 112 | pipe = Pipeline(task) 113 | 114 | assert isinstance(task, PredictiveTask) 115 | dataset = task.to_hf_dataset(pipe(ner_docs)) 116 | assert all([key in dataset.features for key in ("text", "entities")]) 117 | assert len(dataset) == 2 118 | dataset_records = list(dataset) 119 | for rec in dataset_records: 120 | assert isinstance(rec["entities"], dict) 121 | assert ( 122 | len(rec["entities"]["entity_type"]) 123 | == len(rec["entities"]["start"]) 124 | == len(rec["entities"]["end"]) 125 | == len(rec["entities"]["text"]) 126 | ) 127 | assert isinstance(rec["text"], str) 128 | 129 | with pytest.raises(KeyError): 130 | task.to_hf_dataset([Doc(text="This is a dummy text.")]) 131 | -------------------------------------------------------------------------------- /sieves/tests/tasks/predictive/test_pii_masking.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | import pytest 3 | 4 | from sieves import Doc, Pipeline, tasks 5 | from sieves.engines import EngineType 6 | from sieves.serialization import Config 7 | from sieves.tasks import PredictiveTask 8 | from sieves.tasks.predictive import pii_masking 9 | 10 | 11 | @pytest.mark.parametrize( 12 | "batch_runtime", 13 | ( 14 | EngineType.dspy, 15 | EngineType.langchain, 16 | EngineType.outlines, 17 | ), 18 | indirect=["batch_runtime"], 19 | ) 20 | @pytest.mark.parametrize("fewshot", [True, False]) 21 | def test_run(pii_masking_docs, batch_runtime, fewshot) -> None: 22 | fewshot_examples = [ 23 | pii_masking.FewshotExample( 24 | text="Jane Smith works at NASA.", 25 | reasoning="Jane Smith is a person's name and should be masked.", 26 | masked_text="[MASKED] works at NASA.", 27 | pii_entities=[pii_masking.PIIEntity(entity_type="PERSON", text="Jane Smith")], 28 | ), 29 | pii_masking.FewshotExample( 30 | text="He lives at Diagon Alley 37.", 31 | reasoning="Diagon Alley 37 is a residential address and should be masked.", 32 | masked_text="He lives at [MASKED].", 33 | pii_entities=[pii_masking.PIIEntity(entity_type="ADDRESS", text="Diagon Alley 37")], 34 | ), 35 | ] 36 | 37 | fewshot_args = {"fewshot_examples": fewshot_examples} if fewshot else {} 38 | pipe = Pipeline([ 39 | tasks.predictive.PIIMasking( 40 | model=batch_runtime.model, 41 | generation_settings=batch_runtime.generation_settings, 42 | batch_size=batch_runtime.batch_size, 43 | **fewshot_args, 44 | ) 45 | ]) 46 | docs = list(pipe(pii_masking_docs)) 47 | 48 | assert len(docs) == 2 49 | for doc in docs: 50 | assert doc.text 51 | assert "PIIMasking" in doc.results 52 | 53 | with pytest.raises(NotImplementedError): 54 | pipe["PIIMasking"].distill(None, None, None, None, None, None, None, None) 55 | 56 | 57 | @pytest.mark.parametrize("batch_runtime", [EngineType.dspy], indirect=["batch_runtime"]) 58 | def 
test_to_hf_dataset(pii_masking_docs, batch_runtime) -> None: 59 | task = tasks.predictive.PIIMasking( 60 | model=batch_runtime.model, 61 | generation_settings=batch_runtime.generation_settings, 62 | batch_size=batch_runtime.batch_size, 63 | ) 64 | pipe = Pipeline(task) 65 | docs = pipe(pii_masking_docs) 66 | 67 | assert isinstance(task, PredictiveTask) 68 | dataset = task.to_hf_dataset(docs) 69 | assert all([key in dataset.features for key in ("text", "masked_text")]) 70 | assert len(dataset) == 2 71 | records = list(dataset) 72 | assert records[0]["text"] == "Her SSN is 222-333-444. Her credit card number is 1234 5678." 73 | assert records[1]["text"] == "You can reach Michael at michael.michaels@gmail.com." 74 | 75 | with pytest.raises(KeyError): 76 | task.to_hf_dataset([Doc(text="This is a dummy text.")]) 77 | 78 | 79 | @pytest.mark.parametrize("batch_runtime", [EngineType.dspy], indirect=["batch_runtime"]) 80 | def test_serialization(pii_masking_docs, batch_runtime) -> None: 81 | pipe = Pipeline([ 82 | tasks.predictive.PIIMasking( 83 | model=batch_runtime.model, 84 | generation_settings=batch_runtime.generation_settings, 85 | batch_size=batch_runtime.batch_size, 86 | ) 87 | ]) 88 | 89 | config = pipe.serialize() 90 | assert config.model_dump() == {'cls_name': 'sieves.pipeline.core.Pipeline', 91 | 'tasks': {'is_placeholder': False, 92 | 'value': [{'cls_name': 'sieves.tasks.predictive.pii_masking.core.PIIMasking', 93 | 'fewshot_examples': {'is_placeholder': False, 94 | 'value': ()}, 95 | 'batch_size': {'is_placeholder': False, "value": -1}, 96 | 'generation_settings': {'is_placeholder': False, 97 | 'value': { 98 | 'config_kwargs': None, 99 | 'inference_kwargs': None, 100 | 'init_kwargs': None, 101 | 'strict_mode': False}}, 102 | 'include_meta': {'is_placeholder': False, 'value': True}, 103 | 'mask_placeholder': {'is_placeholder': False, 104 | 'value': '[MASKED]'}, 105 | 'model': {'is_placeholder': True, 106 | 'value': 'dspy.clients.lm.LM'}, 107 | 'pii_types': {'is_placeholder': False, 'value': None}, 108 | 'prompt_instructions': {'is_placeholder': False, 109 | 'value': None}, 110 | 'task_id': {'is_placeholder': False, 111 | 'value': 'PIIMasking'}, 112 | 'version': Config.get_version()}]}, 113 | 'use_cache': {'is_placeholder': False, 'value': True}, 114 | 'version': Config.get_version()} 115 | 116 | Pipeline.deserialize(config=config, tasks_kwargs=[{"model": batch_runtime.model}]) 117 | -------------------------------------------------------------------------------- /sieves/tests/tasks/predictive/test_question_answering.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | import pytest 3 | 4 | from sieves import Doc, Pipeline 5 | from sieves.engines import EngineType 6 | from sieves.serialization import Config 7 | from sieves.tasks import PredictiveTask 8 | from sieves.tasks.predictive import question_answering 9 | 10 | 11 | @pytest.mark.parametrize( 12 | "batch_runtime", 13 | ( 14 | EngineType.dspy, 15 | EngineType.glix, 16 | EngineType.langchain, 17 | EngineType.outlines, 18 | ), 19 | indirect=["batch_runtime"], 20 | ) 21 | @pytest.mark.parametrize("fewshot", [True, False]) 22 | def test_run(qa_docs, batch_runtime, fewshot): 23 | fewshot_examples = [ 24 | question_answering.FewshotExample( 25 | text=""" 26 | Physics is the scientific study of matter, its fundamental constituents, its motion and behavior through 27 | space and time, and the related entities of energy and force. 
Physics is one of the most fundamental 28 | scientific disciplines. A scientist who specializes in the field of physics is called a physicist. 29 | """, 30 | reasoning="The text states ad verbatim what a scientist specializing in physics is called.", 31 | questions=("What's a scientist called who specializes in the field of physics?",), 32 | answers=("A physicist.",), 33 | ), 34 | question_answering.FewshotExample( 35 | text=""" 36 | A biologist is a scientist who conducts research in biology. Biologists are interested in studying life on 37 | Earth, whether it is an individual cell, a multicellular organism, or a community of interacting 38 | populations. They usually specialize in a particular branch (e.g., molecular biology, zoology, and 39 | evolutionary biology) of biology and have a specific research focus (e.g., studying malaria or cancer). 40 | """, 41 | reasoning="The text states ad verbatim that biologists are interested in studying life on earth.", 42 | questions=("What are biologists interested in?",), 43 | answers=("Studying life on earth.",), 44 | ), 45 | ] 46 | 47 | fewshot_args = {"fewshot_examples": fewshot_examples} if fewshot else {} 48 | pipe = Pipeline( 49 | [ 50 | question_answering.QuestionAnswering( 51 | task_id="qa", 52 | questions=[ 53 | "What branch of science is this text describing?", 54 | "What the goal of the science as described in the text?", 55 | ], 56 | model=batch_runtime.model, 57 | generation_settings=batch_runtime.generation_settings, 58 | batch_size=batch_runtime.batch_size, 59 | **fewshot_args, 60 | ), 61 | ] 62 | ) 63 | docs = list(pipe(qa_docs)) 64 | 65 | assert len(docs) == 2 66 | for doc in docs: 67 | assert doc.text 68 | assert "qa" in doc.results 69 | 70 | with pytest.raises(NotImplementedError): 71 | pipe["qa"].distill(None, None, None, None, None, None, None, None) 72 | 73 | 74 | @pytest.mark.parametrize("batch_runtime", [EngineType.dspy], indirect=["batch_runtime"]) 75 | def test_to_hf_dataset(qa_docs, batch_runtime) -> None: 76 | task = question_answering.QuestionAnswering( 77 | task_id="qa", 78 | questions=[ 79 | "What branch of science is this text describing?", 80 | "What the goal of the science as described in the text?", 81 | ], 82 | model=batch_runtime.model, 83 | generation_settings=batch_runtime.generation_settings, 84 | batch_size=batch_runtime.batch_size, 85 | ) 86 | pipe = Pipeline(task) 87 | 88 | assert isinstance(task, PredictiveTask) 89 | dataset = task.to_hf_dataset(pipe(qa_docs)) 90 | assert all([key in dataset.features for key in ("text", "answers")]) 91 | assert len(dataset) == 2 92 | dataset_records = list(dataset) 93 | for rec in dataset_records: 94 | assert isinstance(rec["text"], str) 95 | assert isinstance(rec["answers"], list) 96 | 97 | with pytest.raises(KeyError): 98 | task.to_hf_dataset([Doc(text="This is a dummy text.")]) 99 | 100 | 101 | @pytest.mark.parametrize("batch_runtime", [EngineType.dspy], indirect=["batch_runtime"]) 102 | def test_serialization(qa_docs, batch_runtime) -> None: 103 | pipe = Pipeline( 104 | [ 105 | question_answering.QuestionAnswering( 106 | task_id="qa", 107 | questions=[ 108 | "What branch of science is this text describing?", 109 | "What the goal of the science as described in the text?", 110 | ], 111 | model=batch_runtime.model, 112 | generation_settings=batch_runtime.generation_settings, 113 | batch_size=batch_runtime.batch_size, 114 | ) 115 | ] 116 | ) 117 | 118 | config = pipe.serialize() 119 | assert config.model_dump() == {'cls_name': 'sieves.pipeline.core.Pipeline', 120 | 'tasks':
{'is_placeholder': False, 121 | 'value': [{'cls_name': 'sieves.tasks.predictive.question_answering.core.QuestionAnswering', 122 | 'fewshot_examples': {'is_placeholder': False, 123 | 'value': ()}, 124 | 'batch_size': {'is_placeholder': False, "value": -1}, 125 | 'generation_settings': {'is_placeholder': False, 126 | 'value': { 127 | 'config_kwargs': None, 128 | 'inference_kwargs': None, 129 | 'init_kwargs': None, 130 | 'strict_mode': False}}, 131 | 'include_meta': {'is_placeholder': False, 'value': True}, 132 | 'model': {'is_placeholder': True, 133 | 'value': 'dspy.clients.lm.LM'}, 134 | 'prompt_instructions': {'is_placeholder': False, 135 | 'value': None}, 136 | 'questions': {'is_placeholder': False, 137 | 'value': ['What branch of science is this ' 138 | 'text describing?', 139 | 'What is the goal of the science as ' 140 | 'described in the text?']}, 141 | 'task_id': {'is_placeholder': False, 'value': 'qa'}, 142 | 'version': Config.get_version()}]}, 143 | 'use_cache': {'is_placeholder': False, 'value': True}, 144 | 'version': Config.get_version()} 145 | 146 | Pipeline.deserialize(config=config, tasks_kwargs=[{"model": batch_runtime.model}]) 147 | -------------------------------------------------------------------------------- /sieves/tests/tasks/predictive/test_sentiment_analysis.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | import pytest 3 | 4 | from sieves import Doc, Pipeline 5 | from sieves.engines import EngineType 6 | from sieves.serialization import Config 7 | from sieves.tasks import PredictiveTask 8 | from sieves.tasks.predictive import sentiment_analysis 9 | 10 | 11 | @pytest.mark.parametrize( 12 | "batch_runtime", 13 | ( 14 | EngineType.dspy, 15 | EngineType.langchain, 16 | EngineType.outlines, 17 | ), 18 | indirect=["batch_runtime"], 19 | ) 20 | @pytest.mark.parametrize("fewshot", [True, False]) 21 | def test_run(sentiment_analysis_docs, batch_runtime, fewshot): 22 | fewshot_examples = [ 23 | sentiment_analysis.FewshotExample( 24 | text="The food was perfect, the service only ok.", 25 | reasoning="The text is very positive about the quality of the food, and neutral about the service quality." 26 | " The overall sentiment is hence positive.", 27 | sentiment_per_aspect={"food": 1.0, "service": 0.5, "overall": 0.8}, 28 | ), 29 | sentiment_analysis.FewshotExample( 30 | text="The service was amazing - they take excellent care of their customers. The food was despicable " 31 | "though, I strongly recommend not going.", 32 | reasoning="While the service is judged as amazing, hence very positive, the assessment of the food is very " 33 | "negative.
The overall sentiment is hence rather negative.", 34 | sentiment_per_aspect={"food": 0.1, "service": 1.0, "overall": 0.3}, 35 | ), 36 | ] 37 | 38 | fewshot_args = {"fewshot_examples": fewshot_examples} if fewshot else {} 39 | pipe = Pipeline( 40 | [ 41 | sentiment_analysis.SentimentAnalysis( 42 | task_id="sentiment_analysis", 43 | aspects=("food", "service"), 44 | model=batch_runtime.model, 45 | generation_settings=batch_runtime.generation_settings, 46 | batch_size=batch_runtime.batch_size, 47 | **fewshot_args, 48 | ), 49 | ] 50 | ) 51 | docs = list(pipe(sentiment_analysis_docs)) 52 | 53 | assert len(docs) == 2 54 | for doc in docs: 55 | assert doc.text 56 | assert "sentiment_analysis" in doc.results 57 | assert doc.results["sentiment_analysis"] 58 | 59 | with pytest.raises(NotImplementedError): 60 | pipe["sentiment_analysis"].distill(None, None, None, None, None, None, None, None) 61 | 62 | 63 | @pytest.mark.parametrize("batch_runtime", [EngineType.dspy], indirect=["batch_runtime"]) 64 | def test_to_hf_dataset(dummy_docs, batch_runtime) -> None: 65 | task = sentiment_analysis.SentimentAnalysis( 66 | task_id="sentiment_analysis", 67 | aspects=("food", "service"), 68 | model=batch_runtime.model, 69 | generation_settings=batch_runtime.generation_settings, 70 | batch_size=batch_runtime.batch_size, 71 | ) 72 | pipe = Pipeline(task) 73 | 74 | assert isinstance(task, PredictiveTask) 75 | dataset = task.to_hf_dataset(pipe(dummy_docs)) 76 | assert all([key in dataset.features for key in ("text", "aspect")]) 77 | assert len(dataset) == 2 78 | dataset_records = list(dataset) 79 | for rec in dataset_records: 80 | assert isinstance(rec["aspect"], list) 81 | for v in rec["aspect"]: 82 | assert isinstance(v, float) 83 | assert isinstance(rec["text"], str) 84 | 85 | with pytest.raises(KeyError): 86 | task.to_hf_dataset([Doc(text="This is a dummy text.")]) 87 | 88 | 89 | @pytest.mark.parametrize("batch_runtime", [EngineType.dspy], indirect=["batch_runtime"]) 90 | def test_serialization(dummy_docs, batch_runtime) -> None: 91 | pipe = Pipeline( 92 | [ 93 | sentiment_analysis.SentimentAnalysis( 94 | task_id="sentiment_analysis", 95 | aspects=("food", "service"), 96 | model=batch_runtime.model, 97 | generation_settings=batch_runtime.generation_settings, 98 | batch_size=batch_runtime.batch_size, 99 | ) 100 | ] 101 | ) 102 | 103 | config = pipe.serialize() 104 | assert config.model_dump() == {'cls_name': 'sieves.pipeline.core.Pipeline', 105 | 'tasks': {'is_placeholder': False, 106 | 'value': [{'aspects': {'is_placeholder': False, 107 | 'value': ('food', 'overall', 'service')}, 108 | 'cls_name': 'sieves.tasks.predictive.sentiment_analysis.core.SentimentAnalysis', 109 | 'fewshot_examples': {'is_placeholder': False, 110 | 'value': ()}, 111 | 'batch_size': {'is_placeholder': False, "value": -1}, 112 | 'generation_settings': {'is_placeholder': False, 113 | 'value': { 114 | 'config_kwargs': None, 115 | 'inference_kwargs': None, 116 | 'init_kwargs': None, 117 | 'strict_mode': False}}, 118 | 'include_meta': {'is_placeholder': False, 'value': True}, 119 | 'model': {'is_placeholder': True, 120 | 'value': 'dspy.clients.lm.LM'}, 121 | 'prompt_instructions': {'is_placeholder': False, 122 | 'value': None}, 123 | 'task_id': {'is_placeholder': False, 124 | 'value': 'sentiment_analysis'}, 125 | 'version': Config.get_version()}]}, 126 | 'use_cache': {'is_placeholder': False, 'value': True}, 127 | 'version': Config.get_version()} 128 | 129 | Pipeline.deserialize(config=config, tasks_kwargs=[{"model": batch_runtime.model}]) 130 |
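# Note on the expected dump in test_serialization above: only ("food", "service")
# are passed as aspects, yet the serialized 'aspects' value is
# ('food', 'overall', 'service') -- SentimentAnalysis appears to add an implicit
# "overall" aspect on construction. A minimal round-trip sketch outside pytest
# (the names `model` and `settings` are hypothetical stand-ins for a configured
# dspy model and its GenerationSettings):
#
#   task = sentiment_analysis.SentimentAnalysis(aspects=("food", "service"), model=model, generation_settings=settings, batch_size=-1)
#   config = Pipeline(task).serialize()
#   restored = Pipeline.deserialize(config=config, tasks_kwargs=[{"model": model}])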
-------------------------------------------------------------------------------- /sieves/tests/tasks/predictive/test_summarization.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | import pytest 3 | 4 | from sieves import Doc, Pipeline 5 | from sieves.engines import EngineType 6 | from sieves.serialization import Config 7 | from sieves.tasks import PredictiveTask 8 | from sieves.tasks.predictive import summarization 9 | 10 | 11 | @pytest.mark.parametrize( 12 | "batch_runtime", 13 | ( 14 | EngineType.dspy, 15 | EngineType.glix, 16 | EngineType.langchain, 17 | EngineType.outlines, 18 | ), 19 | indirect=["batch_runtime"], 20 | ) 21 | @pytest.mark.parametrize("fewshot", [True, False]) 22 | def test_run(summarization_docs, batch_runtime, fewshot) -> None: 23 | fewshot_examples = [ 24 | summarization.FewshotExample( 25 | text="They counted: one, two, three, four, five, six, seven, eight, nine, ten, eleven, twelve, thirteen, " 26 | "fourteen.", 27 | n_words=6, 28 | summary="They counted from one to fourteen.", 29 | ), 30 | summarization.FewshotExample( 31 | text="Next in order were the Boeotians, led by Peneleos, Leitus, Arcesilaus, Prothoenor, and Clonius. " 32 | "These had with them fifty ships, and on board of each were a hundred and twenty young men of the " 33 | "Boeotians. Then came the men of Orchomenus, who lived in the realm of the Minyans, led by Ascalaphus" 34 | " and Ialmenus, sons of Mars. In their command were thirty ships. Next were the Phocians, led by" 35 | " Schedius and Epistrophus, sons of Iphitus the son of Naubolus. These had forty ships…", 36 | n_words=10, 37 | summary="Boeotians, Orchomenians, and Phocians sailed to Troy with many ships.", 38 | ), 39 | ] 40 | 41 | fewshot_args = {"fewshot_examples": fewshot_examples} if fewshot else {} 42 | pipe = Pipeline([ 43 | summarization.Summarization( 44 | n_words=10, 45 | model=batch_runtime.model, 46 | generation_settings=batch_runtime.generation_settings, 47 | batch_size=batch_runtime.batch_size, 48 | **fewshot_args, 49 | ) 50 | ]) 51 | docs = list(pipe(summarization_docs)) 52 | 53 | assert len(docs) == 2 54 | for doc in docs: 55 | assert doc.text 56 | assert "Summarization" in doc.results 57 | 58 | with pytest.raises(NotImplementedError): 59 | pipe["Summarization"].distill(None, None, None, None, None, None, None, None) 60 | 61 | 62 | @pytest.mark.parametrize("batch_runtime", [EngineType.dspy], indirect=["batch_runtime"]) 63 | def test_to_hf_dataset(summarization_docs, batch_runtime) -> None: 64 | task = summarization.Summarization( 65 | n_words=10, 66 | model=batch_runtime.model, 67 | generation_settings=batch_runtime.generation_settings, 68 | batch_size=batch_runtime.batch_size, 69 | ) 70 | pipe = Pipeline(task) 71 | docs = pipe(summarization_docs) 72 | 73 | assert isinstance(task, PredictiveTask) 74 | dataset = task.to_hf_dataset(docs) 75 | assert all([key in dataset.features for key in ("text", "summary")]) 76 | assert len(dataset) == 2 77 | records = list(dataset) 78 | assert records[0]["text"].strip().startswith("The decay spreads over the State") 79 | assert records[1]["text"].strip().startswith("After all, the practical reason") 80 | for record in records: 81 | assert isinstance(record["summary"], str) 82 | 83 | with pytest.raises(KeyError): 84 | task.to_hf_dataset([Doc(text="This is a dummy text.")]) 85 | 86 | 87 | @pytest.mark.parametrize("batch_runtime", [EngineType.dspy], indirect=["batch_runtime"]) 88 | def test_serialization(summarization_docs, 
batch_runtime) -> None: 89 | pipe = Pipeline([ 90 | summarization.Summarization( 91 | n_words=10, 92 | model=batch_runtime.model, 93 | generation_settings=batch_runtime.generation_settings, 94 | batch_size=batch_runtime.batch_size, 95 | ) 96 | ]) 97 | 98 | config = pipe.serialize() 99 | assert config.model_dump() == {'cls_name': 'sieves.pipeline.core.Pipeline', 100 | 'tasks': {'is_placeholder': False, 101 | 'value': [{'cls_name': 'sieves.tasks.predictive.summarization.core.Summarization', 102 | 'fewshot_examples': {'is_placeholder': False, 103 | 'value': ()}, 104 | 'batch_size': {'is_placeholder': False, "value": -1}, 105 | 'generation_settings': {'is_placeholder': False, 106 | 'value': { 107 | 'config_kwargs': None, 108 | 'inference_kwargs': None, 109 | 'init_kwargs': None, 110 | 'strict_mode': False}}, 111 | 'include_meta': {'is_placeholder': False, 'value': True}, 112 | 'model': {'is_placeholder': True, 113 | 'value': 'dspy.clients.lm.LM'}, 114 | 'n_words': {'is_placeholder': False, 'value': 10}, 115 | 'prompt_instructions': {'is_placeholder': False, 116 | 'value': None}, 117 | 'task_id': {'is_placeholder': False, 118 | 'value': 'Summarization'}, 119 | 'version': Config.get_version()}]}, 120 | 'use_cache': {'is_placeholder': False, 'value': True}, 121 | 'version': Config.get_version()} 122 | 123 | Pipeline.deserialize(config=config, tasks_kwargs=[{"model": batch_runtime.model}]) 124 | -------------------------------------------------------------------------------- /sieves/tests/tasks/predictive/test_translation.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | import pytest 3 | 4 | from sieves import Doc, Pipeline 5 | from sieves.engines import EngineType 6 | from sieves.serialization import Config 7 | from sieves.tasks import PredictiveTask 8 | from sieves.tasks.predictive import translation 9 | 10 | 11 | @pytest.mark.parametrize( 12 | "batch_runtime", 13 | ( 14 | EngineType.dspy, 15 | EngineType.langchain, 16 | EngineType.outlines, 17 | ), 18 | indirect=["batch_runtime"], 19 | ) 20 | @pytest.mark.parametrize("fewshot", [True, False]) 21 | def test_run(translation_docs, batch_runtime, fewshot) -> None: 22 | fewshot_examples = [ 23 | translation.FewshotExample( 24 | text="The sun is shining today.", 25 | to="Spanish", 26 | translation="El sol brilla hoy.", 27 | ), 28 | translation.FewshotExample( 29 | text="There's a lot of fog today.", 30 | to="Spanish", 31 | translation="Hay mucha niebla hoy.", 32 | ), 33 | ] 34 | 35 | fewshot_args = {"fewshot_examples": fewshot_examples} if fewshot else {} 36 | pipe = Pipeline([ 37 | translation.Translation( 38 | to="Spanish", 39 | model=batch_runtime.model, 40 | generation_settings=batch_runtime.generation_settings, 41 | batch_size=batch_runtime.batch_size, 42 | **fewshot_args, 43 | ) 44 | ]) 45 | docs = list(pipe(translation_docs)) 46 | 47 | assert len(docs) == 2 48 | for doc in docs: 49 | assert doc.text 50 | assert "Translation" in doc.results 51 | 52 | with pytest.raises(NotImplementedError): 53 | pipe["Translation"].distill(None, None, None, None, None, None, None, None) 54 | 55 | 56 | @pytest.mark.parametrize("batch_runtime", [EngineType.dspy], indirect=["batch_runtime"]) 57 | def test_to_hf_dataset(translation_docs, batch_runtime) -> None: 58 | task = translation.Translation( 59 | to="Spanish", 60 | model=batch_runtime.model, 61 | generation_settings=batch_runtime.generation_settings, 62 | batch_size=batch_runtime.batch_size, 63 | ) 64 | pipe = Pipeline(task) 65 | docs =
pipe(translation_docs) 66 | 67 | assert isinstance(task, PredictiveTask) 68 | dataset = task.to_hf_dataset(docs) 69 | assert all([key in dataset.features for key in ("text", "translation")]) 70 | assert len(dataset) == 2 71 | records = list(dataset) 72 | assert records[0]["text"] == "It is rainy today." 73 | assert records[1]["text"] == "It is cloudy today." 74 | for record in records: 75 | assert isinstance(record["translation"], str) 76 | 77 | with pytest.raises(KeyError): 78 | task.to_hf_dataset([Doc(text="This is a dummy text.")]) 79 | 80 | 81 | @pytest.mark.parametrize("batch_runtime", [EngineType.dspy], indirect=["batch_runtime"]) 82 | def test_serialization(translation_docs, batch_runtime) -> None: 83 | pipe = Pipeline([ 84 | translation.Translation( 85 | to="Spanish", 86 | model=batch_runtime.model, 87 | generation_settings=batch_runtime.generation_settings, 88 | batch_size=batch_runtime.batch_size, 89 | ) 90 | ]) 91 | 92 | config = pipe.serialize() 93 | assert config.model_dump() == {'cls_name': 'sieves.pipeline.core.Pipeline', 94 | 'tasks': {'is_placeholder': False, 95 | 'value': [{'cls_name': 'sieves.tasks.predictive.translation.core.Translation', 96 | 'fewshot_examples': {'is_placeholder': False, 97 | 'value': ()}, 98 | 'batch_size': {'is_placeholder': False, "value": -1}, 99 | 'generation_settings': {'is_placeholder': False, 100 | 'value': { 101 | 'config_kwargs': None, 102 | 'inference_kwargs': None, 103 | 'init_kwargs': None, 104 | 'strict_mode': False}}, 105 | 'include_meta': {'is_placeholder': False, 'value': True}, 106 | 'model': {'is_placeholder': True, 107 | 'value': 'dspy.clients.lm.LM'}, 108 | 'prompt_instructions': {'is_placeholder': False, 109 | 'value': None}, 110 | 'task_id': {'is_placeholder': False, 111 | 'value': 'Translation'}, 112 | 'to': {'is_placeholder': False, 'value': 'Spanish'}, 113 | 'version': Config.get_version()}]}, 114 | 'use_cache': {'is_placeholder': False, 'value': True}, 115 | 'version': Config.get_version()} 116 | 117 | Pipeline.deserialize(config=config, tasks_kwargs=[{"model": batch_runtime.model}]) 118 | -------------------------------------------------------------------------------- /sieves/tests/tasks/preprocessing/chunking/test_chonkie.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | import chonkie 3 | 4 | from sieves import Doc, Pipeline 5 | from sieves.serialization import Config 6 | from sieves.tasks.preprocessing.chunking import Chonkie 7 | 8 | 9 | def test_chonkie(tokenizer) -> None: 10 | resources = [Doc(text="This is a text " * 100)] 11 | pipe = Pipeline(tasks=[Chonkie(chonkie.TokenChunker(tokenizer))]) 12 | docs = list(pipe(resources)) 13 | 14 | assert len(docs) == 1 15 | assert docs[0].text 16 | assert docs[0].chunks 17 | 18 | 19 | def test_serialization(tokenizer) -> None: 20 | resources = [Doc(text="This is a text " * 100)] 21 | pipe = Pipeline(tasks=[Chonkie(chonkie.TokenChunker(tokenizer))]) 22 | docs = list(pipe(resources)) 23 | 24 | config = pipe.serialize() 25 | assert config.model_dump() == { 26 | "cls_name": "sieves.pipeline.core.Pipeline", 27 | "use_cache": {"is_placeholder": False, "value": True}, 28 | "tasks": { 29 | "is_placeholder": False, 30 | "value": [ 31 | { 32 | "chunker": {"is_placeholder": True, "value": "chonkie.chunker.token.TokenChunker"}, 33 | 'batch_size': {'is_placeholder': False, "value": -1}, 34 | "cls_name": "sieves.tasks.preprocessing.chunking.chonkie_.Chonkie", 35 | "include_meta": {"is_placeholder": False, "value": False}, 36 | 
"task_id": {"is_placeholder": False, "value": "Chonkie"}, 37 | "version": Config.get_version(), 38 | } 39 | ], 40 | }, 41 | "version": Config.get_version(), 42 | } 43 | 44 | deserialized_pipeline = Pipeline.deserialize( 45 | config=config, tasks_kwargs=[{"chunker": chonkie.TokenChunker(tokenizer)}] 46 | ) 47 | assert docs[0] == list(deserialized_pipeline(resources))[0] 48 | -------------------------------------------------------------------------------- /sieves/tests/tasks/preprocessing/chunking/test_chunking.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | import chonkie 3 | import pytest 4 | 5 | from sieves import Doc, Pipeline 6 | from sieves.serialization import Config 7 | from sieves.tasks.preprocessing import Chunking 8 | 9 | 10 | @pytest.mark.parametrize("chunker", ["chonkie", "naive"]) 11 | def test_chonkie(chunker, tokenizer) -> None: 12 | resources = [Doc(text="This is a text. " * 100)] 13 | pipe = Pipeline(tasks=[Chunking(chonkie.TokenChunker(tokenizer) if chunker == "chonkie" else 5)]) 14 | docs = list(pipe(resources)) 15 | 16 | assert len(docs) == 1 17 | assert docs[0].text 18 | assert docs[0].chunks 19 | 20 | 21 | def test_serialization(tokenizer) -> None: 22 | resources = [Doc(text="This is a text " * 100)] 23 | pipe = Pipeline(tasks=[Chunking(chonkie.TokenChunker(tokenizer))]) 24 | docs = list(pipe(resources)) 25 | 26 | config = pipe.serialize() 27 | assert config.model_dump() == { 28 | "cls_name": "sieves.pipeline.core.Pipeline", 29 | "use_cache": {"is_placeholder": False, "value": True}, 30 | "tasks": { 31 | "is_placeholder": False, 32 | "value": [ 33 | { 34 | "chunker": {"is_placeholder": True, "value": "chonkie.chunker.token.TokenChunker"}, 35 | 'batch_size': {'is_placeholder': False, "value": -1}, 36 | "cls_name": "sieves.tasks.preprocessing.chunking.core.Chunking", 37 | "include_meta": {"is_placeholder": False, "value": False}, 38 | "task_id": {"is_placeholder": False, "value": "Chunking"}, 39 | "version": Config.get_version(), 40 | } 41 | ], 42 | }, 43 | "version": Config.get_version(), 44 | } 45 | 46 | deserialized_pipeline = Pipeline.deserialize( 47 | config=config, tasks_kwargs=[{"chunker": chonkie.TokenChunker(tokenizer)}] 48 | ) 49 | assert docs[0] == list(deserialized_pipeline(resources))[0] 50 | -------------------------------------------------------------------------------- /sieves/tests/tasks/preprocessing/chunking/test_naivechunker.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | import pytest 3 | 4 | from sieves import Pipeline 5 | from sieves.engines import EngineType 6 | from sieves.serialization import Config 7 | from sieves.tasks.preprocessing.chunking.naive import NaiveChunker 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "batch_runtime", 12 | [EngineType.huggingface], 13 | indirect=["batch_runtime"], 14 | ) 15 | def test_run(dummy_docs, batch_runtime) -> None: 16 | """Tests whether chunking mechanism in PredictiveTask works as expected.""" 17 | chunk_interval = 5 18 | pipe = Pipeline([NaiveChunker(interval=chunk_interval)]) 19 | docs = list(pipe(dummy_docs)) 20 | 21 | assert len(docs) == 2 22 | for doc in docs: 23 | assert doc.text 24 | assert len(doc.chunks) == 2 25 | 26 | 27 | def test_serialization(dummy_docs) -> None: 28 | chunk_interval = 5 29 | pipe = Pipeline(tasks=[NaiveChunker(interval=chunk_interval)]) 30 | docs = list(pipe(dummy_docs)) 31 | 32 | config = pipe.serialize() 33 | assert config.model_dump() == { 34 | 
"cls_name": "sieves.pipeline.core.Pipeline", 35 | "use_cache": {"is_placeholder": False, "value": True}, 36 | "tasks": { 37 | "is_placeholder": False, 38 | "value": [ 39 | { 40 | "cls_name": "sieves.tasks.preprocessing.chunking.naive.NaiveChunker", 41 | 'batch_size': {'is_placeholder': False, "value": -1}, 42 | "include_meta": {"is_placeholder": False, "value": False}, 43 | "interval": {"is_placeholder": False, "value": 5}, 44 | "task_id": {"is_placeholder": False, "value": "NaiveChunker"}, 45 | "version": Config.get_version(), 46 | } 47 | ], 48 | }, 49 | "version": Config.get_version(), 50 | } 51 | 52 | deserialized_pipeline = Pipeline.deserialize(config=config, tasks_kwargs=[{}]) 53 | assert docs[0] == list(deserialized_pipeline(dummy_docs))[0] 54 | -------------------------------------------------------------------------------- /sieves/tests/tasks/preprocessing/ingestion/test_docling.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from sieves import Doc, Pipeline 4 | from sieves.serialization import Config 5 | from sieves.tasks.preprocessing.ingestion.docling_ import Docling 6 | 7 | 8 | def test_run() -> None: 9 | resources = [Doc(uri=Path(__file__).parent.parent.parent.parent / "assets" / "1204.0162v2.pdf")] 10 | pipe = Pipeline(tasks=[Docling()]) 11 | docs = list(pipe(resources)) 12 | 13 | assert len(docs) == 1 14 | assert docs[0].text 15 | 16 | 17 | def test_serialization() -> None: 18 | resources = [Doc(uri=Path(__file__).parent.parent.parent.parent / "assets" / "1204.0162v2.pdf")] 19 | pipe = Pipeline(tasks=[Docling()]) 20 | docs = list(pipe(resources)) 21 | 22 | config = pipe.serialize() 23 | version = Config.get_version() 24 | assert config.model_dump() == { 25 | "cls_name": "sieves.pipeline.core.Pipeline", 26 | "use_cache": {"is_placeholder": False, "value": True}, 27 | "tasks": { 28 | "is_placeholder": False, 29 | "value": [ 30 | { 31 | "cls_name": "sieves.tasks.preprocessing.ingestion.docling_.Docling", 32 | 'batch_size': {'is_placeholder': False, "value": -1}, 33 | "converter": {"is_placeholder": True, "value": "docling.document_converter.DocumentConverter"}, 34 | "export_format": {"is_placeholder": False, "value": "markdown"}, 35 | "include_meta": {"is_placeholder": False, "value": False}, 36 | "task_id": {"is_placeholder": False, "value": "Docling"}, 37 | "version": version, 38 | } 39 | ], 40 | }, 41 | "version": version, 42 | } 43 | 44 | deserialized_pipeline = Pipeline.deserialize( 45 | config=config, tasks_kwargs=[{"converter": None, "export_format": "markdown"}] 46 | ) 47 | assert docs[0] == list(deserialized_pipeline(resources))[0] 48 | -------------------------------------------------------------------------------- /sieves/tests/tasks/preprocessing/ingestion/test_ingestion.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | from pathlib import Path 3 | 4 | from docling.document_converter import DocumentConverter 5 | 6 | from sieves import Doc, Pipeline, tasks 7 | from sieves.serialization import Config 8 | 9 | 10 | def test_run() -> None: 11 | resources = [Doc(uri=Path(__file__).parent.parent.parent.parent / "assets" / "1204.0162v2.pdf")] 12 | pipe = Pipeline(tasks=[tasks.preprocessing.Ingestion()]) 13 | docs = list(pipe(resources)) 14 | 15 | assert len(docs) == 1 16 | assert docs[0].text 17 | 18 | 19 | def test_serialization() -> None: 20 | resources = [Doc(uri=Path(__file__).parent.parent.parent.parent / "assets" / 
"1204.0162v2.pdf")] 21 | pipe = Pipeline(tasks=[tasks.preprocessing.Ingestion()]) 22 | config = pipe.serialize() 23 | version = Config.get_version() 24 | assert config.model_dump() == { 25 | "cls_name": "sieves.pipeline.core.Pipeline", 26 | "use_cache": {"is_placeholder": False, "value": True}, 27 | "tasks": { 28 | "is_placeholder": False, 29 | "value": [ 30 | { 31 | "cls_name": "sieves.tasks.preprocessing.ingestion.core.Ingestion", 32 | 'batch_size': {'is_placeholder': False, "value": -1}, 33 | "converter": {"is_placeholder": True, "value": "docling.document_converter.DocumentConverter"}, 34 | "export_format": {"is_placeholder": False, "value": "markdown"}, 35 | "include_meta": {"is_placeholder": False, "value": False}, 36 | "task_id": {"is_placeholder": False, "value": "Ingestion"}, 37 | "version": version, 38 | } 39 | ], 40 | }, 41 | "version": version, 42 | } 43 | 44 | # For deserialization, we need to provide the converter 45 | converter = DocumentConverter() 46 | deserialized_pipeline = Pipeline.deserialize( 47 | config=config, tasks_kwargs=[{"converter": converter, "export_format": "markdown"}] 48 | ) 49 | deserialized_docs = list(deserialized_pipeline(resources)) 50 | 51 | assert len(deserialized_docs) == 1 52 | -------------------------------------------------------------------------------- /sieves/tests/tasks/preprocessing/ingestion/test_marker.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | from pathlib import Path 3 | 4 | import pytest 5 | from marker.converters.pdf import PdfConverter 6 | from marker.models import create_model_dict 7 | 8 | from sieves import Doc, Pipeline, tasks 9 | from sieves.serialization import Config 10 | 11 | 12 | @pytest.mark.skip(reason="Currently running into OOM issues with instantiating Marker converts.") 13 | def test_marker(): 14 | """Workaround to keep memory usage low: run single function with one instantiated Marker instance.""" 15 | marker_converter = PdfConverter(artifact_dict=create_model_dict()) 16 | 17 | def test_run() -> None: 18 | resources = [Doc(uri=Path(__file__).parent.parent.parent / "assets" / "1204.0162v2.pdf")] 19 | pipe = Pipeline(tasks=[tasks.preprocessing.Marker(converter=marker_converter)]) 20 | docs = list(pipe(resources)) 21 | 22 | assert len(docs) == 1 23 | assert docs[0].text 24 | 25 | def test_with_extract_images() -> None: 26 | resources = [Doc(uri=Path(__file__).parent.parent.parent / "assets" / "1204.0162v2.pdf")] 27 | pipe = Pipeline( 28 | tasks=[tasks.preprocessing.Marker(converter=marker_converter, extract_images=True, include_meta=True)] 29 | ) 30 | docs = list(pipe(resources)) 31 | 32 | assert len(docs) == 1 33 | assert docs[0].text 34 | assert docs[0].images 35 | 36 | def test_serialization() -> None: 37 | resources = [Doc(uri=Path(__file__).parent.parent.parent / "assets" / "1204.0162v2.pdf")] 38 | pipe = Pipeline(tasks=[tasks.preprocessing.Marker(converter=marker_converter, include_meta=True)]) 39 | docs = list(pipe(resources)) 40 | 41 | config = pipe.serialize() 42 | version = Config.get_version() 43 | assert config.model_dump() == { 44 | "cls_name": "sieves.pipeline.core.Pipeline", 45 | "tasks": { 46 | "is_placeholder": False, 47 | "value": [ 48 | { 49 | "cls_name": "sieves.tasks.preprocessing.marker_.Marker", 50 | "converter": {"is_placeholder": True, "value": "marker.converters.pdf.PdfConverter"}, 51 | "export_format": {"is_placeholder": False, "value": "markdown"}, 52 | "extract_images": {"is_placeholder": False, "value": False}, 53 | 
"include_meta": {"is_placeholder": False, "value": True}, 54 | "task_id": {"is_placeholder": False, "value": "Marker"}, 55 | "version": version, 56 | } 57 | ], 58 | }, 59 | "version": version, 60 | } 61 | 62 | # For deserialization, we need to provide the converter 63 | converter = marker_converter 64 | deserialized_pipeline = Pipeline.deserialize( 65 | config=config, tasks_kwargs=[{"converter": converter, "export_format": "markdown"}] 66 | ) 67 | deserialized_docs = list(deserialized_pipeline(resources)) 68 | 69 | assert len(deserialized_docs) == 1 70 | assert deserialized_docs[0].text == docs[0].text 71 | 72 | test_run() 73 | test_with_extract_images() 74 | test_serialization() 75 | -------------------------------------------------------------------------------- /sieves/tests/tasks/preprocessing/ingestion/test_unstructured.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | from pathlib import Path 3 | 4 | import pytest 5 | import unstructured.cleaners.core 6 | import unstructured.partition.auto 7 | 8 | from sieves import Doc, Pipeline 9 | from sieves.serialization import Config 10 | from sieves.tasks.preprocessing.ingestion.unstructured_ import Unstructured 11 | 12 | 13 | @pytest.mark.parametrize("to_chunk", [True, False]) 14 | def test_run(to_chunk) -> None: 15 | resources = [Doc(uri=Path(__file__).parent.parent.parent.parent / "assets" / "1204.0162v2.pdf")] 16 | partition_kwargs = {"chunking_strategy": "basic"} if to_chunk else {} 17 | pipe = Pipeline( 18 | tasks=[ 19 | Unstructured( 20 | **partition_kwargs, 21 | cleaners=( 22 | lambda t: unstructured.cleaners.core.clean( 23 | t, extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True 24 | ), 25 | ), 26 | include_meta=True, 27 | ), 28 | ] 29 | ) 30 | docs = list(pipe(resources)) 31 | 32 | assert len(docs) == 1 33 | assert docs[0].text 34 | if to_chunk: 35 | assert len(docs[0].chunks) 36 | else: 37 | assert docs[0].chunks is None 38 | 39 | 40 | def test_serialization() -> None: 41 | resources = [Doc(uri=Path(__file__).parent.parent.parent.parent / "assets" / "dummy.txt")] 42 | 43 | def cleaner(text: str) -> str: 44 | return unstructured.cleaners.core.clean( 45 | text, extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True 46 | ) 47 | 48 | pipe = Pipeline(tasks=[Unstructured(cleaners=(cleaner,), include_meta=True)]) 49 | docs = list(pipe(resources)) 50 | 51 | config = pipe.serialize() 52 | assert config.model_dump() == { 53 | "cls_name": "sieves.pipeline.core.Pipeline", 54 | "use_cache": {"is_placeholder": False, "value": True}, 55 | "tasks": { 56 | "is_placeholder": False, 57 | "value": [ 58 | { 59 | "cleaners": {"is_placeholder": True, "value": "builtins.tuple"}, 60 | 'batch_size': {'is_placeholder': False, "value": -1}, 61 | "cls_name": "sieves.tasks.preprocessing.ingestion.unstructured_.Unstructured", 62 | "include_meta": {"is_placeholder": False, "value": True}, 63 | "partition": {"is_placeholder": True, "value": "builtins.function"}, 64 | "task_id": {"is_placeholder": False, "value": "Unstructured"}, 65 | "version": Config.get_version(), 66 | } 67 | ], 68 | }, 69 | "version": Config.get_version(), 70 | } 71 | 72 | deserialized_pipeline = Pipeline.deserialize( 73 | config=config, tasks_kwargs=({"partition": unstructured.partition.auto.partition, "cleaners": (cleaner,)},) 74 | ) 75 | assert docs[0] == list(deserialized_pipeline(resources))[0] 76 | -------------------------------------------------------------------------------- 
/sieves/tests/test_doc.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | import pytest 3 | import regex 4 | from datasets import Dataset 5 | from PIL import Image 6 | 7 | from sieves import Doc 8 | 9 | 10 | @pytest.fixture 11 | def test_images() -> dict[str, Image.Image]: 12 | return { 13 | "rgb_red_100": Image.new("RGB", (100, 100), color="red"), 14 | "rgb_red_100_2": Image.new("RGB", (100, 100), color="red"), 15 | "rgb_blue_100": Image.new("RGB", (100, 100), color="blue"), 16 | "rgb_red_200": Image.new("RGB", (200, 200), color="red"), 17 | "l_gray_100": Image.new("L", (100, 100), color=128), 18 | } 19 | 20 | 21 | def test_identical_images(test_images: dict[str, Image.Image]) -> None: 22 | doc1 = Doc(images=[test_images["rgb_red_100"]]) 23 | doc2 = Doc(images=[test_images["rgb_red_100_2"]]) 24 | assert doc1 == doc2 25 | 26 | 27 | def test_different_images(test_images: dict[str, Image.Image]) -> None: 28 | doc1 = Doc(images=[test_images["rgb_red_100"]]) 29 | doc2 = Doc(images=[test_images["rgb_blue_100"]]) 30 | assert doc1 != doc2 31 | 32 | 33 | def test_none_images() -> None: 34 | doc1 = Doc(images=None) 35 | doc2 = Doc(images=None) 36 | assert doc1 == doc2 37 | 38 | 39 | def test_one_none_image(test_images: dict[str, Image.Image]) -> None: 40 | doc1 = Doc(images=[test_images["rgb_red_100"]]) 41 | doc2 = Doc(images=None) 42 | assert doc1 != doc2 43 | 44 | 45 | def test_different_image_counts(test_images: dict[str, Image.Image]) -> None: 46 | doc1 = Doc(images=[test_images["rgb_red_100"], test_images["rgb_red_100_2"]]) 47 | doc2 = Doc(images=[test_images["rgb_red_100"]]) 48 | assert doc1 != doc2 49 | 50 | 51 | def test_different_image_sizes(test_images: dict[str, Image.Image]) -> None: 52 | doc1 = Doc(images=[test_images["rgb_red_100"]]) 53 | doc2 = Doc(images=[test_images["rgb_red_200"]]) 54 | assert doc1 != doc2 55 | 56 | 57 | def test_different_image_modes(test_images: dict[str, Image.Image]) -> None: 58 | doc1 = Doc(images=[test_images["rgb_red_100"]]) 59 | doc2 = Doc(images=[test_images["l_gray_100"]]) 60 | assert doc1 != doc2 61 | 62 | 63 | def test_doc_comparison_type_error() -> None: 64 | doc = Doc(images=None) 65 | with pytest.raises(NotImplementedError): 66 | doc == 42 67 | 68 | 69 | def test_docs_from_hf_dataset() -> None: 70 | """Tests generation of Docs instance from HF dataset.""" 71 | hf_dataset = Dataset.from_dict( 72 | {"text": ["This is the first document.", "This is the second document."], "label": [0, 1]} 73 | ) 74 | docs = Doc.from_hf_dataset(hf_dataset) 75 | 76 | assert len(docs) == 2 77 | assert docs[0].text == "This is the first document." 78 | assert docs[0].chunks == ["This is the first document."] # Check post_init 79 | assert docs[0].id is None 80 | assert docs[0].uri is None 81 | assert docs[0].images is None 82 | assert docs[0].meta == {} 83 | assert docs[0].results == {} 84 | 85 | assert docs[1].text == "This is the second document." 86 | assert docs[1].chunks == ["This is the second document."] # Check post_init 87 | 88 | # Test with a different text column name. 
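# `column_map` maps Doc attributes to dataset column names: the dataset's
# "content" column is read into Doc.text and its "id" column into Doc.id.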
89 | data_alt_col = {"content": ["Doc A", "Doc B"], "id": ["a", "b"]} 90 | hf_dataset_alt_col = Dataset.from_dict(data_alt_col) 91 | docs_alt = Doc.from_hf_dataset(hf_dataset_alt_col, column_map={"text": "content", "id": "id"}) 92 | assert len(docs_alt) == 2 93 | assert docs_alt[0].text == "Doc A" 94 | assert docs_alt[1].text == "Doc B" 95 | assert docs_alt[0].id == "a" 96 | assert docs_alt[1].id == "b" 97 | 98 | # Test KeyError for missing column. 99 | with pytest.raises( 100 | KeyError, 101 | match=regex.escape("Specified columns '{'wrong_column'}' not found in dataset columns: ['text', 'label']."), 102 | ): 103 | Doc.from_hf_dataset(hf_dataset, column_map={"text": "wrong_column"}) 104 | -------------------------------------------------------------------------------- /sieves/tests/test_serialization.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | import os 3 | import pickle 4 | import tempfile 5 | from pathlib import Path 6 | 7 | import chonkie 8 | import dspy 9 | import pytest 10 | 11 | from sieves import Pipeline 12 | from sieves.engines import EngineType 13 | from sieves.serialization import Config 14 | from sieves.tasks import preprocessing 15 | from sieves.tasks.predictive import classification 16 | from sieves.tests.conftest import make_model 17 | 18 | 19 | @pytest.mark.parametrize( 20 | "batch_runtime", 21 | [EngineType.dspy], 22 | indirect=["batch_runtime"], 23 | ) 24 | def test_serialization_pipeline(dummy_docs, batch_runtime, tokenizer): 25 | """Tests serialization and deserialization of pipeline to files and config objects.""" 26 | pipe = Pipeline( 27 | [ 28 | preprocessing.Chunking(chonkie.TokenChunker(tokenizer)), 29 | classification.Classification( 30 | task_id="classifier", 31 | labels=["science", "politics"], 32 | label_descriptions={"science": "Everything about science.", "politics": "Everything about politics."}, 33 | model=batch_runtime.model, 34 | generation_settings=batch_runtime.generation_settings, 35 | batch_size=batch_runtime.batch_size, 36 | ), 37 | ] 38 | ) 39 | 40 | # Get config, assert values are correct.
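# The dump wraps every attribute in an {'is_placeholder': ..., 'value': ...}
# envelope. Objects that cannot be serialized directly -- the chunker and the
# model -- are stored as placeholders (their dotted class paths) and have to be
# re-supplied via tasks_kwargs when the pipeline is restored below.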
41 | config = pipe.serialize() 42 | config_model_dump = config.model_dump() 43 | version = Config.get_version() 44 | assert config_model_dump == { 45 | "cls_name": "sieves.pipeline.core.Pipeline", 46 | "use_cache": {"is_placeholder": False, "value": True}, 47 | "tasks": { 48 | "is_placeholder": False, 49 | "value": [ 50 | { 51 | "chunker": {"is_placeholder": True, "value": "chonkie.chunker.token.TokenChunker"}, 52 | 'batch_size': {'is_placeholder': False, 'value': -1}, 53 | "cls_name": "sieves.tasks.preprocessing.chunking.core.Chunking", 54 | "include_meta": {"is_placeholder": False, "value": False}, 55 | "task_id": {"is_placeholder": False, "value": "Chunking"}, 56 | "version": version, 57 | }, 58 | { 59 | "cls_name": "sieves.tasks.predictive.classification.core.Classification", 60 | 'generation_settings': { 61 | 'is_placeholder': False, 62 | 'value': { 63 | 'config_kwargs': None, 64 | 'inference_kwargs': None, 65 | 'init_kwargs': None, 66 | 'strict_mode': False 67 | } 68 | }, 69 | "fewshot_examples": {"is_placeholder": False, "value": ()}, 70 | "include_meta": {"is_placeholder": False, "value": True}, 71 | 'batch_size': {'is_placeholder': False, 'value': -1}, 72 | "labels": {"is_placeholder": False, "value": ["science", "politics"]}, 73 | "label_descriptions": { 74 | "is_placeholder": False, 75 | "value": {"science": "Everything about science.", "politics": "Everything about politics."}, 76 | }, 77 | 'model': {'is_placeholder': True, 'value': 'dspy.clients.lm.LM'}, 78 | "prompt_instructions": {"is_placeholder": False, "value": None}, 79 | "task_id": {"is_placeholder": False, "value": "classifier"}, 80 | "version": version, 81 | }, 82 | ], 83 | }, 84 | "version": version, 85 | } 86 | 87 | # Save config to temporary file 88 | with tempfile.NamedTemporaryFile(suffix=".yml") as tmp_file: 89 | tmp_path = Path(tmp_file.name) 90 | config.dump(tmp_path) 91 | 92 | # Load config from file and verify it matches 93 | loaded_config = Config.load(tmp_path) 94 | # For some reason empty tuple is stored as list, which is fine for our purposes. 95 | assert config_model_dump["tasks"]["value"][1]["fewshot_examples"]["value"] == () 96 | config_model_dump["tasks"]["value"][1]["fewshot_examples"]["value"] = [] 97 | assert loaded_config.model_dump() == config_model_dump 98 | 99 | # Restore pipeline from config. 100 | loaded_pipe = Pipeline.load( 101 | tmp_path, 102 | ( 103 | {"chunker": chonkie.TokenChunker(tokenizer)}, 104 | {"model": make_model(EngineType.dspy)}, 105 | ), 106 | ) 107 | 108 | # Run restored pipeline. 109 | docs = list(loaded_pipe(dummy_docs)) 110 | assert len(docs) == 2 111 | assert len(docs[0].results["classifier"]) 112 | 113 | # Compare loaded pipe config with original one. 114 | assert loaded_pipe.serialize().model_dump() == config_model_dump 115 | 116 | 117 | def test_serialization_docs(dummy_docs): 118 | """Tests serialization of docs by saving to and loading from pickle objects.""" 119 | # Create a temporary file for pickle serialization. 120 | with tempfile.NamedTemporaryFile(suffix=".pkl") as tmp_file: 121 | tmp_path = Path(tmp_file.name) 122 | 123 | # Pickle the dummy_docs to file. 124 | with open(tmp_path, "wb") as f: 125 | pickle.dump(dummy_docs, f) 126 | 127 | # Load the docs back from file.
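# Doc equality appears to be value-based (test_doc.py compares texts, chunks and
# image properties), so the round-tripped docs should compare equal to the originals.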
128 | with open(tmp_path, "rb") as f: 129 | loaded_docs = pickle.load(f) 130 | 131 | # Assert the loaded docs are identical to the originals 132 | assert len(loaded_docs) == len(dummy_docs) 133 | assert all([orig_doc == loaded_doc for orig_doc, loaded_doc in zip(dummy_docs, loaded_docs)]) 134 | 135 | # Test that comparing Doc with int raises NotImplementedError 136 | with pytest.raises(NotImplementedError): 137 | loaded_docs[0] == 42 138 | -------------------------------------------------------------------------------- /sieves/tests/test_strict_mode.py: -------------------------------------------------------------------------------- 1 | # mypy: ignore-errors 2 | import pydantic 3 | import pytest 4 | 5 | from sieves import Doc, Pipeline 6 | from sieves.engines import EngineType 7 | from sieves.tasks.predictive import information_extraction 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "batch_runtime", (EngineType.dspy, EngineType.langchain, EngineType.outlines), indirect=["batch_runtime"] 12 | ) 13 | @pytest.mark.parametrize("strict_mode", [True, False]) 14 | def test_strict_mode(batch_runtime, strict_mode): 15 | batch_runtime.generation_settings.strict_mode = strict_mode 16 | 17 | class Person(pydantic.BaseModel, frozen=True): 18 | name: str 19 | age: pydantic.PositiveInt 20 | 21 | pipe = Pipeline([ 22 | information_extraction.InformationExtraction( 23 | entity_type=Person, 24 | model=batch_runtime.model, 25 | generation_settings=batch_runtime.generation_settings, 26 | batch_size=batch_runtime.batch_size, 27 | ) 28 | ]) 29 | 30 | docs: list[Doc] = [] 31 | hit_exception = False 32 | if strict_mode: 33 | try: 34 | docs = list(pipe([Doc(text=".")])) 35 | except Exception: 36 | hit_exception = True 37 | else: 38 | docs = list(pipe([Doc(text=".")])) 39 | 40 | if strict_mode and hit_exception: 41 | assert len(docs) == 0 42 | else: 43 | assert len(docs) == 1 44 | 45 | for doc in docs: 46 | assert "InformationExtraction" in doc.results 47 | -------------------------------------------------------------------------------- /ty.toml: -------------------------------------------------------------------------------- 1 | [rules] 2 | # Ignoring a bunch of rules until we get around to cleaning up typing. 3 | unresolved-attribute = "ignore" 4 | unresolved-import = "ignore" 5 | invalid-assignment = "ignore" 6 | invalid-argument-type = "ignore" 7 | missing-argument = "ignore" 8 | not-iterable = "ignore" 9 | 10 | [src] 11 | exclude = [ 12 | ".venv/**", 13 | "build/**", 14 | "sieves/tests/**", 15 | "examples/**" 16 | ] 17 | --------------------------------------------------------------------------------