├── .copier-answers.yml
├── .envrc
├── .gitattributes
├── .github
│   ├── CODEOWNERS
│   ├── PULL_REQUEST_TEMPLATE.md
│   ├── dependabot.yml
│   ├── release-drafter.yml
│   └── workflows
│       ├── build.yml
│       ├── chore.yml
│       ├── ci.yml
│       └── scorecard.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .prettierignore
├── .prettierrc
├── .readthedocs.yml
├── Cargo.lock
├── Cargo.toml
├── LICENSE
├── README.md
├── SECURITY.md
├── dataframely
│   ├── __init__.py
│   ├── _base_collection.py
│   ├── _base_schema.py
│   ├── _compat.py
│   ├── _deprecation.py
│   ├── _extre.pyi
│   ├── _filter.py
│   ├── _polars.py
│   ├── _rule.py
│   ├── _typing.py
│   ├── _validation.py
│   ├── collection.py
│   ├── columns
│   │   ├── __init__.py
│   │   ├── _base.py
│   │   ├── _mixins.py
│   │   ├── _utils.py
│   │   ├── any.py
│   │   ├── array.py
│   │   ├── bool.py
│   │   ├── datetime.py
│   │   ├── decimal.py
│   │   ├── enum.py
│   │   ├── float.py
│   │   ├── integer.py
│   │   ├── list.py
│   │   ├── object.py
│   │   ├── string.py
│   │   └── struct.py
│   ├── config.py
│   ├── exc.py
│   ├── failure.py
│   ├── functional.py
│   ├── mypy.py
│   ├── py.typed
│   ├── random.py
│   ├── schema.py
│   └── testing
│       ├── __init__.py
│       ├── const.py
│       ├── factory.py
│       ├── mask.py
│       ├── rules.py
│       └── typing.py
├── docker-compose.yml
├── docs
│   ├── Makefile
│   ├── _api
│   │   ├── dataframely.collection.rst
│   │   ├── dataframely.columns.any.rst
│   │   ├── dataframely.columns.bool.rst
│   │   ├── dataframely.columns.datetime.rst
│   │   ├── dataframely.columns.decimal.rst
│   │   ├── dataframely.columns.enum.rst
│   │   ├── dataframely.columns.float.rst
│   │   ├── dataframely.columns.integer.rst
│   │   ├── dataframely.columns.list.rst
│   │   ├── dataframely.columns.rst
│   │   ├── dataframely.columns.string.rst
│   │   ├── dataframely.columns.struct.rst
│   │   ├── dataframely.config.rst
│   │   ├── dataframely.exc.rst
│   │   ├── dataframely.failure.rst
│   │   ├── dataframely.functional.rst
│   │   ├── dataframely.mypy.rst
│   │   ├── dataframely.random.rst
│   │   ├── dataframely.rst
│   │   ├── dataframely.schema.rst
│   │   ├── dataframely.testing.const.rst
│   │   ├── dataframely.testing.factory.rst
│   │   ├── dataframely.testing.mask.rst
│   │   ├── dataframely.testing.rst
│   │   ├── dataframely.testing.rules.rst
│   │   ├── dataframely.testing.typing.rst
│   │   └── modules.rst
│   ├── _static
│   │   ├── custom.css
│   │   └── favicon.ico
│   ├── conf.py
│   ├── index.rst
│   ├── make.bat
│   └── sites
│       ├── development.rst
│       ├── examples
│       │   └── real-world.ipynb
│       ├── faq.rst
│       ├── installation.rst
│       ├── quickstart.rst
│       └── versioning.rst
├── pixi.lock
├── pixi.toml
├── pyproject.toml
├── src
│   ├── errdefs.rs
│   ├── lib.rs
│   └── regex_repr.rs
└── tests
    ├── collection
    │   ├── test_base.py
    │   ├── test_cast.py
    │   ├── test_create_empty.py
    │   ├── test_filter_one_to_n.py
    │   ├── test_filter_validate.py
    │   ├── test_ignore_in_filter.py
    │   ├── test_implementation.py
    │   ├── test_optional_members.py
    │   ├── test_sample.py
    │   └── test_validate_input.py
    ├── column_types
    │   ├── __init__.py
    │   ├── test_any.py
    │   ├── test_array.py
    │   ├── test_datetime.py
    │   ├── test_decimal.py
    │   ├── test_enum.py
    │   ├── test_float.py
    │   ├── test_integer.py
    │   ├── test_list.py
    │   ├── test_object.py
    │   ├── test_string.py
    │   └── test_struct.py
    ├── columns
    │   ├── __init__.py
    │   ├── test_alias.py
    │   ├── test_check.py
    │   ├── test_default_dtypes.py
    │   ├── test_metadata.py
    │   ├── test_polars_schema.py
    │   ├── test_pyarrow.py
    │   ├── test_rules.py
    │   ├── test_sample.py
    │   ├── test_sql_schema.py
    │   ├── test_str.py
    │   └── test_utils.py
    ├── core_validation
    │   ├── __init__.py
    │   ├── test_column_validation.py
    │   ├── test_dtype_validation.py
    │   └── test_rule_evaluation.py
    ├── functional
    │   ├── test_concat.py
    │   └── test_relationships.py
    ├── schema
    │   ├── test_base.py
    │   ├── test_cast.py
    │   ├── test_create_empty.py
    │   ├── test_create_empty_if_none.py
    │   ├── test_filter.py
    │   ├── test_inheritance.py
    │   ├── test_rule_implementation.py
    │   ├── test_sample.py
    │   └── test_validate.py
    ├── test_compat.py
    ├── test_config.py
    ├── test_deprecation.py
    ├── test_exc.py
    ├── test_extre.py
    ├── test_failure_info.py
    ├── test_random.py
    └── test_typing.py
/.copier-answers.yml:
--------------------------------------------------------------------------------
1 | # This file is managed by Copier; DO NOT EDIT OR REMOVE.
2 | _commit: v0.4.1
3 | _src_path: https://github.com/quantco/copier-template-python-open-source
4 | add_autobump_workflow: false
5 | author_email: oliver.borchert@quantco.com
6 | author_name: Oliver Borchert
7 | github_url: https://github.com/quantco/dataframely
8 | github_user: borchero
9 | minimal_python_version: py311
10 | project_short_description: A declarative, polars-native data frame validation library
11 | project_slug: dataframely
12 | use_devcontainer: false
13 |
--------------------------------------------------------------------------------
/.envrc:
--------------------------------------------------------------------------------
1 | watch_file pixi.toml pixi.lock
2 | eval "$(pixi shell-hook)"
3 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | pixi.lock merge=binary linguist-language=YAML linguist-generated=true
2 |
--------------------------------------------------------------------------------
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | * @borchero @AndreasAlbertQC @delsner
2 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | # Motivation
2 |
3 |
4 |
5 | # Changes
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | - package-ecosystem: github-actions
4 | directory: /
5 | schedule:
6 | interval: monthly
7 | groups:
8 | gh-actions:
9 | patterns:
10 | - "*"
11 | commit-message:
12 | prefix: ci
13 |
--------------------------------------------------------------------------------
/.github/release-drafter.yml:
--------------------------------------------------------------------------------
1 | # ------------------------------------- PULL REQUEST LABELS ------------------------------------- #
2 | autolabeler:
3 | # Conventional Commit Types (https://github.com/commitizen/conventional-commit-types)
4 | - label: build
5 | title:
6 | - '/^build(\(.*\))?(\!)?\:/'
7 | - label: chore
8 | title:
9 | - '/^chore(\(.*\))?(\!)?\:/'
10 | - label: ci
11 | title:
12 | - '/^ci(\(.*\))?(\!)?\:/'
13 | - label: documentation
14 | title:
15 | - '/^docs(\(.*\))?(\!)?\:/'
16 | - label: enhancement
17 | title:
18 | - '/^feat(\(.*\))?(\!)?\:/'
19 | - label: fix
20 | title:
21 | - '/^fix(\(.*\))?(\!)?\:/'
22 | - label: performance
23 | title:
24 | - '/^perf(\(.*\))?(\!)?\:/'
25 | - label: refactor
26 | title:
27 | - '/^refactor(\(.*\))?(\!)?\:/'
28 | - label: revert
29 | title:
30 | - '/^revert(\(.*\))?(\!)?\:/'
31 | - label: style
32 | title:
33 | - '/^style(\(.*\))?(\!)?\:/'
34 | - label: test
35 | title:
36 | - '/^test(\(.*\))?(\!)?\:/'
37 | # Custom Types
38 | - label: breaking
39 | title:
40 | - '/^[a-z]+(\(.*\))?\!\:/'
41 | # ------------------------------------- AUTOMATIC VERSIONING ------------------------------------ #
42 | version-resolver:
43 | major:
44 | labels:
45 | - breaking
46 | minor:
47 | labels:
48 | - enhancement
49 | default: patch
50 | # ------------------------------------ RELEASE CONFIGURATION ------------------------------------ #
51 | name-template: "v$RESOLVED_VERSION"
52 | tag-template: "v$RESOLVED_VERSION"
53 | category-template: "### $TITLE"
54 | change-template: "- $TITLE by @$AUTHOR in [#$NUMBER]($URL)"
55 | replacers:
56 | # remove conventional commit tag & scope from change list
57 | - search: '/- [a-z]+(\(.*\))?(\!)?\: /g'
58 | replace: "- "
59 | template: |
60 | ## What's Changed
61 |
62 | $CHANGES
63 |
64 | **Full Changelog:** [`$PREVIOUS_TAG...v$RESOLVED_VERSION`](https://github.com/$OWNER/$REPOSITORY/compare/$PREVIOUS_TAG...v$RESOLVED_VERSION)
65 | categories:
66 | - title: ⚠️ Breaking Changes
67 | labels:
68 | - breaking
69 | - title: ✨ New Features
70 | labels:
71 | - enhancement
72 | - title: 🐞 Bug Fixes
73 | labels:
74 | - fix
75 | - title: 🏎️ Performance Improvements
76 | labels:
77 | - performance
78 | - title: 📚 Documentation
79 | labels:
80 | - documentation
81 | - title: 🏗️ Testing
82 | labels:
83 | - test
84 | - title: ⚙️ Automation
85 | labels:
86 | - ci
87 | - title: 🛠 Builds
88 | labels:
89 | - build
90 | - title: 💎 Code Style
91 | labels:
92 | - style
93 | - title: 📦 Refactorings
94 | labels:
95 | - refactor
96 | - title: ♻️ Chores
97 | labels:
98 | - chore
99 | - title: 🗑 Reverts
100 | labels:
101 | - revert
102 |
--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
1 | name: Build
2 | on:
3 | pull_request:
4 | push:
5 | branches: [main]
6 | release:
7 | types: [published]
8 |
9 | jobs:
10 | build-sdist:
11 | name: Build Sdist
12 | runs-on: ubuntu-latest
13 | permissions:
14 | contents: read
15 | steps:
16 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
17 | with:
18 | fetch-depth: 0
19 | - name: Set up pixi
20 | uses: prefix-dev/setup-pixi@19eac09b398e3d0c747adc7921926a6d802df4da # v0.8.8
21 | with:
22 | environments: build
23 | - name: Set version
24 | run: pixi run -e build set-version
25 | - name: Build project
26 | run: pixi run -e build build-sdist
27 | - name: Upload package
28 | uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
29 | with:
30 | name: sdist
31 | path: dist/*
32 |
33 | build-wheel:
34 | name: Build Wheel (${{ matrix.target-platform }})
35 | runs-on: ${{ matrix.os }}
36 | strategy:
37 | fail-fast: false
38 | matrix:
39 | include:
40 | - target-platform: linux-64
41 | os: ubuntu-latest
42 | - target-platform: linux-aarch64
43 | os: ubuntu-24.04-arm
44 | - target-platform: osx-64
45 | os: macos-13
46 | - target-platform: osx-arm64
47 | os: macos-latest
48 | - target-platform: win-64
49 | os: windows-latest
50 | steps:
51 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
52 | with:
53 | fetch-depth: 0
54 | - name: Set up pixi
55 | uses: prefix-dev/setup-pixi@19eac09b398e3d0c747adc7921926a6d802df4da # v0.8.8
56 | with:
57 | environments: build
58 | - name: Set version
59 | run: pixi run -e build set-version
60 | - name: Build wheel
61 | uses: PyO3/maturin-action@aef21716ff3dcae8a1c301d23ec3e4446972a6e3 # v1.49.1
62 | with:
63 | command: build
64 | args: --out dist -i python3.11
65 | manylinux: auto
66 | - name: Check package
67 | run: pixi run -e build check-wheel
68 | - name: Upload package
69 | uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
70 | with:
71 | name: wheel-${{ matrix.target-platform }}
72 | path: dist/*
73 |
74 | release:
75 | name: Publish package
76 | if: github.event_name == 'release'
77 | needs: build-wheel
78 | runs-on: ubuntu-latest
79 | permissions:
80 | id-token: write
81 | environment: pypi
82 | steps:
83 | - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
84 | with:
85 | path: dist
86 | merge-multiple: true
87 | - name: Publish package on PyPi
88 | uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4
89 |
--------------------------------------------------------------------------------
/.github/workflows/chore.yml:
--------------------------------------------------------------------------------
1 | name: Chore
2 | on:
3 | pull_request_target:
4 | branches: [main]
5 | types: [opened, reopened, edited, synchronize]
6 | push:
7 | branches: [main]
8 |
9 | concurrency:
10 | group: ${{ github.workflow }}-${{ github.ref }}
11 | cancel-in-progress: true
12 |
13 | jobs:
14 | check-pr-title:
15 | name: Check PR Title
16 | if: github.event_name == 'pull_request_target'
17 | runs-on: ubuntu-latest
18 | permissions:
19 | contents: read
20 | pull-requests: write
21 | steps:
22 | - name: Check valid conventional commit message
23 | id: lint
24 | uses: amannn/action-semantic-pull-request@0723387faaf9b38adef4775cd42cfd5155ed6017 # v5.5.3
25 | with:
26 | subjectPattern: ^[A-Z].+[^. ]$ # subject must start with uppercase letter and may not end with a dot/space
27 | env:
28 | GITHUB_TOKEN: ${{ github.token }}
29 | - name: Post comment about invalid PR title
30 | if: failure()
31 | uses: marocchino/sticky-pull-request-comment@52423e01640425a022ef5fd42c6fb5f633a02728 # v2.9.1
32 | with:
33 | header: conventional-commit-pr-title
34 | message: |
35 | Thank you for opening this pull request! 👋🏼
36 |
37 | This repository requires pull request titles to follow the [Conventional Commits specification](https://www.conventionalcommits.org/en/v1.0.0/) and it looks like your proposed title needs to be adjusted.
38 |
39 | <details><summary>Details</summary>
40 |
41 | ```
42 | ${{ steps.lint.outputs.error_message }}
43 | ```
44 |
45 | </details>
46 | - name: Delete comment about invalid PR title
47 | if: success()
48 | uses: marocchino/sticky-pull-request-comment@52423e01640425a022ef5fd42c6fb5f633a02728 # v2.9.1
49 | with:
50 | header: conventional-commit-pr-title
51 | delete: true
52 |
53 | release-drafter:
54 | name: ${{ github.event_name == 'pull_request_target' && 'Assign Labels' || 'Draft Release' }}
55 | runs-on: ubuntu-latest
56 | permissions:
57 | contents: write
58 | pull-requests: write
59 | steps:
60 | - name: ${{ github.event_name == 'pull_request_target' && 'Assign labels' || 'Update release draft' }}
61 | uses: release-drafter/release-drafter@b1476f6e6eb133afa41ed8589daba6dc69b4d3f5 # v6.1.0
62 | with:
63 | disable-releaser: ${{ github.event_name == 'pull_request_target' }}
64 | disable-autolabeler: ${{ github.event_name == 'push' }}
65 | env:
66 | GITHUB_TOKEN: ${{ github.token }}
67 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 | on:
3 | pull_request:
4 | push:
5 | branches: [main]
6 |
7 | # Automatically stop old builds on the same branch/PR
8 | concurrency:
9 | group: ${{ github.workflow }}-${{ github.ref }}
10 | cancel-in-progress: true
11 |
12 | permissions:
13 | contents: read
14 |
15 | jobs:
16 | pre-commit-checks:
17 | name: Pre-commit Checks
18 | timeout-minutes: 30
19 | runs-on: ubuntu-latest
20 | steps:
21 | - name: Checkout branch
22 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
23 | with:
24 | # needed for 'pre-commit-mirrors-insert-license'
25 | fetch-depth: 0
26 | - name: Set up pixi
27 | uses: prefix-dev/setup-pixi@19eac09b398e3d0c747adc7921926a6d802df4da # v0.8.8
28 | with:
29 | environments: default lint
30 | - name: Install repository
31 | run: pixi run -e default postinstall
32 | - name: pre-commit
33 | run: pixi run pre-commit-run --color=always --show-diff-on-failure
34 |
35 | unit-tests:
36 | name: Unit Tests (${{ matrix.os == 'ubuntu-latest' && 'Linux' || 'Windows' }}) - ${{ matrix.environment }}
37 | timeout-minutes: 30
38 | runs-on: ${{ matrix.os }}
39 | strategy:
40 | fail-fast: true
41 | matrix:
42 | os: [ubuntu-latest, windows-latest]
43 | environment: [py311, py312, py313]
44 | steps:
45 | - name: Checkout branch
46 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
47 | - name: Set up pixi
48 | uses: prefix-dev/setup-pixi@19eac09b398e3d0c747adc7921926a6d802df4da # v0.8.8
49 | with:
50 | environments: ${{ matrix.environment }}
51 | - name: Install repository
52 | run: pixi run -e ${{ matrix.environment }} postinstall
53 | - name: Run pytest
54 | run: pixi run -e ${{ matrix.environment }} test-coverage --color=yes
55 | - name: Upload codecov
56 | uses: codecov/codecov-action@18283e04ce6e62d37312384ff67231eb8fd56d24 # v5.4.3
57 | with:
58 | files: ./coverage.xml
59 | token: ${{ secrets.CODECOV_TOKEN }}
60 |
--------------------------------------------------------------------------------
/.github/workflows/scorecard.yml:
--------------------------------------------------------------------------------
1 | # This workflow uses actions that are not certified by GitHub. They are provided
2 | # by a third-party and are governed by separate terms of service, privacy
3 | # policy, and support documentation.
4 |
5 | name: Scorecard supply-chain security
6 | on:
7 | # For Branch-Protection check. Only the default branch is supported. See
8 | # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection
9 | branch_protection_rule:
10 | # To guarantee Maintained check is occasionally updated. See
11 | # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained
12 | schedule:
13 | - cron: "34 5 * * 0"
14 | workflow_dispatch:
15 | push:
16 | branches: ["main"]
17 |
18 | # Declare default permissions as read only.
19 | permissions: read-all
20 |
21 | jobs:
22 | analysis:
23 | name: Scorecard analysis
24 | runs-on: ubuntu-latest
25 | # `publish_results: true` only works when run from the default branch. conditional can be removed if disabled.
26 | if: (github.event.repository.default_branch == github.ref_name || github.event_name == 'pull_request') && github.repository == 'quantco/dataframely'
27 | permissions:
28 | # Needed to upload the results to code-scanning dashboard.
29 | security-events: write
30 | # Needed to publish results and get a badge (see publish_results below).
31 | id-token: write
32 | # Uncomment the permissions below if installing in a private repository.
33 | # contents: read
34 | # actions: read
35 |
36 | steps:
37 | - name: "Checkout code"
38 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
39 | with:
40 | persist-credentials: false
41 |
42 | - name: "Run analysis"
43 | uses: ossf/scorecard-action@05b42c624433fc40578a4040d5cf5e36ddca8cde # v2.4.2
44 | with:
45 | results_file: results.sarif
46 | results_format: sarif
47 | # (Optional) "write" PAT token. Uncomment the `repo_token` line below if:
48 | # - you want to enable the Branch-Protection check on a *public* repository, or
49 | # - you are installing Scorecard on a *private* repository
50 | # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action?tab=readme-ov-file#authentication-with-fine-grained-pat-optional.
51 | # repo_token: ${{ secrets.SCORECARD_TOKEN }}
52 |
53 | # Public repositories:
54 | # - Publish results to OpenSSF REST API for easy access by consumers
55 | # - Allows the repository to include the Scorecard badge.
56 | # - See https://github.com/ossf/scorecard-action#publishing-results.
57 | # For private repositories:
58 | # - `publish_results` will always be set to `false`, regardless
59 | # of the value entered here.
60 | publish_results: true
61 |
62 | # (Optional) Uncomment file_mode if you have a .gitattributes with files marked export-ignore
63 | # file_mode: git
64 |
65 | # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
66 | # format to the repository Actions tab.
67 | - name: "Upload artifact"
68 | uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
69 | with:
70 | name: SARIF file
71 | path: results.sarif
72 | retention-days: 5
73 |
74 | # Upload the results to GitHub's code scanning dashboard (optional).
75 | # Commenting out will disable upload of results to your repo's Code Scanning dashboard
76 | - name: "Upload to code-scanning"
77 | uses: github/codeql-action/upload-sarif@ff0a06e83cb2de871e5a09832bc6a81e7276941f # v3.28.18
78 | with:
79 | sarif_file: results.sarif
80 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | exclude: ^(\.copier-answers\.yml)|.pixi$
2 | repos:
3 | - repo: local
4 | hooks:
5 | # ensure pixi environments are up to date
6 | # workaround for https://github.com/prefix-dev/pixi/issues/1482
7 | - id: pixi-install
8 | name: pixi-install
9 | entry: pixi install -e default -e lint
10 | language: system
11 | always_run: true
12 | require_serial: true
13 | pass_filenames: false
14 | # insert-license
15 | - id: insert-license
16 | name: insert-license
17 | entry: >-
18 | pixi run -e lint
19 | insert-license
20 | --license-base64 Q29weXJpZ2h0IChjKSBRdWFudENvIDIwMjUtMjAyNQpTUERYLUxpY2Vuc2UtSWRlbnRpZmllcjogQlNELTMtQ2xhdXNl
21 | --dynamic-years
22 | --comment-style "#"
23 | language: system
24 | types: [python]
25 | # docformatter
26 | - id: docformatter
27 | name: docformatter
28 | entry: pixi run -e lint docformatter -i
29 | language: system
30 | types: [python]
31 | # ruff
32 | - id: ruff
33 | name: ruff
34 | entry: pixi run -e lint ruff check --fix --exit-non-zero-on-fix --force-exclude
35 | language: system
36 | types_or: [python, pyi]
37 | require_serial: true
38 | - id: ruff-format
39 | name: ruff-format
40 | entry: pixi run -e lint ruff format --force-exclude
41 | language: system
42 | types_or: [python, pyi]
43 | require_serial: true
44 | # mypy
45 | - id: mypy
46 | name: mypy
47 | entry: pixi run -e default mypy
48 | language: system
49 | types: [python]
50 | require_serial: true
51 | # prettier
52 | - id: prettier
53 | name: prettier
54 | entry: pixi run -e lint prettier --write --list-different --ignore-unknown
55 | language: system
56 | types: [text]
57 | files: \.(md|yml|yaml)$
58 | # taplo
59 | - id: taplo
60 | name: taplo
61 | entry: pixi run -e lint taplo format
62 | language: system
63 | types: [toml]
64 | # pre-commit-hooks
65 | - id: trailing-whitespace-fixer
66 | name: trailing-whitespace-fixer
67 | entry: pixi run -e lint trailing-whitespace-fixer
68 | language: system
69 | types: [text]
70 | - id: end-of-file-fixer
71 | name: end-of-file-fixer
72 | entry: pixi run -e lint end-of-file-fixer
73 | language: system
74 | types: [text]
75 | - id: check-merge-conflict
76 | name: check-merge-conflict
77 | entry: pixi run -e lint check-merge-conflict --assume-in-merge
78 | language: system
79 | types: [text]
80 | # typos
81 | - id: typos
82 | name: typos
83 | entry: pixi run -e lint typos --force-exclude
84 | language: system
85 | types: [text]
86 | require_serial: true
87 |
--------------------------------------------------------------------------------
/.prettierignore:
--------------------------------------------------------------------------------
1 | build
2 | conda.recipe
3 | .copier-answers.yml
4 |
5 | *.html
6 | *.properties
7 |
--------------------------------------------------------------------------------
/.prettierrc:
--------------------------------------------------------------------------------
1 | {
2 | "singleQuote": false,
3 | "bracketSpacing": true,
4 | "printWidth": 200,
5 | "endOfLine": "auto",
6 | "tabWidth": 2
7 | }
8 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | build:
3 | os: ubuntu-20.04
4 | tools:
5 | python: mambaforge-latest
6 | commands:
7 | - mamba install -c conda-forge -c nodefaults pixi
8 | - pixi run -e docs postinstall
9 | - pixi run -e docs docs
10 | - pixi run -e docs readthedocs
11 | sphinx:
12 | configuration: docs/conf.py
13 | formats:
14 | - pdf
15 |
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | edition = "2021"
3 | name = "dataframely"
4 | version = "0.1.0"
5 |
6 | [lib]
7 | crate-type = ["cdylib"]
8 | name = "dataframely"
9 |
10 | [dependencies]
11 | pyo3 = { version = "0.24", features = ["abi3-py311", "extension-module"] }
12 | rand = { version = "0.9", features = ["std_rng"] }
13 | regex-syntax = "0.8"
14 | thiserror = "2.0"
15 |
16 | [profile.release]
17 | codegen-units = 1
18 | lto = true
19 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2025, QuantCo
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | 1. Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | 3. Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | # dataframely — A declarative, 🐻‍❄️-native data frame validation library
8 |
9 |
10 | [](https://github.com/quantco/dataframely/actions/workflows/ci.yml)
11 | [](https://prefix.dev/channels/conda-forge/packages/dataframely)
12 | [](https://pypi.org/project/dataframely)
13 | [](https://pypi.org/project/dataframely)
14 | [](https://codecov.io/gh/Quantco/dataframely)
15 |
16 |
17 |
18 | ## 🗂 Table of Contents
19 |
20 | - [Introduction](#-introduction)
21 | - [Installation](#-installation)
22 | - [Usage](#-usage)
23 |
24 | ## 📖 Introduction
25 |
26 | Dataframely is a Python package to validate the schema and content of [`polars`](https://pola.rs/) data frames. Its
27 | purpose is to make data pipelines more robust by ensuring that data meets expectations and more readable by adding
28 | schema information to data frame type hints.
29 |
30 | ## 💿 Installation
31 |
32 | You can install `dataframely` using your favorite package manager, e.g., `pixi` or `pip`:
33 |
34 | ```bash
35 | pixi add dataframely
36 | pip install dataframely
37 | ```
38 |
39 | ## 🎯 Usage
40 |
41 | ### Defining a data frame schema
42 |
43 | ```python
44 | import dataframely as dy
45 | import polars as pl
46 |
47 | class HouseSchema(dy.Schema):
48 | zip_code = dy.String(nullable=False, min_length=3)
49 | num_bedrooms = dy.UInt8(nullable=False)
50 | num_bathrooms = dy.UInt8(nullable=False)
51 | price = dy.Float64(nullable=False)
52 |
53 | @dy.rule()
54 | def reasonable_bathroom_to_bedroom_ratio() -> pl.Expr:
55 | ratio = pl.col("num_bathrooms") / pl.col("num_bedrooms")
56 | return (ratio >= 1 / 3) & (ratio <= 3)
57 |
58 | @dy.rule(group_by=["zip_code"])
59 | def minimum_zip_code_count() -> pl.Expr:
60 | return pl.len() >= 2
61 | ```
62 |
63 | ### Validating data against schema
64 |
65 | ```python
66 |
67 | import polars as pl
68 |
69 | df = pl.DataFrame({
70 | "zip_code": ["01234", "01234", "1", "213", "123", "213"],
71 | "num_bedrooms": [2, 2, 1, None, None, 2],
72 | "num_bathrooms": [1, 2, 1, 1, 0, 8],
73 | "price": [100_000, 110_000, 50_000, 80_000, 60_000, 160_000]
74 | })
75 |
76 | # Validate the data and cast columns to expected types
77 | validated_df: dy.DataFrame[HouseSchema] = HouseSchema.validate(df, cast=True)
78 | ```
79 |
80 | See more advanced usage examples in the [documentation](https://dataframely.readthedocs.io/en/latest/).
81 |
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 | # Reporting Security Issues
2 |
3 | We take security bugs in our projects seriously. We appreciate your efforts to responsibly disclose your findings, and will make every effort to acknowledge your contributions.
4 |
5 | To report a security issue, please use the GitHub Security Advisory ["Report a Vulnerability"](https://github.com/quantco/dataframely/security/advisories/new) tab.
6 |
7 | We will send a response indicating the next steps in handling your report. After the initial reply to your report, the security team will keep you informed of the progress towards a fix and full announcement, and may ask for additional information or guidance.
8 |
--------------------------------------------------------------------------------
/dataframely/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import importlib.metadata
5 | import warnings
6 |
7 | try:
8 | __version__ = importlib.metadata.version(__name__)
9 | except importlib.metadata.PackageNotFoundError as e: # pragma: no cover
10 | warnings.warn(f"Could not determine version of {__name__}\n{e!s}", stacklevel=2)
11 | __version__ = "unknown"
12 |
13 | from . import random
14 | from ._base_collection import CollectionMember
15 | from ._filter import filter
16 | from ._rule import rule
17 | from ._typing import DataFrame, LazyFrame
18 | from .collection import Collection
19 | from .columns import (
20 | Any,
21 | Array,
22 | Bool,
23 | Column,
24 | Date,
25 | Datetime,
26 | Decimal,
27 | Duration,
28 | Enum,
29 | Float,
30 | Float32,
31 | Float64,
32 | Int8,
33 | Int16,
34 | Int32,
35 | Int64,
36 | Integer,
37 | List,
38 | Object,
39 | String,
40 | Struct,
41 | Time,
42 | UInt8,
43 | UInt16,
44 | UInt32,
45 | UInt64,
46 | )
47 | from .config import Config
48 | from .failure import FailureInfo
49 | from .functional import (
50 | concat_collection_members,
51 | filter_relationship_one_to_at_least_one,
52 | filter_relationship_one_to_one,
53 | )
54 | from .schema import Schema
55 |
56 | __all__ = [
57 | "random",
58 | "filter",
59 | "rule",
60 | "DataFrame",
61 | "LazyFrame",
62 | "Collection",
63 | "CollectionMember",
64 | "Config",
65 | "FailureInfo",
66 | "concat_collection_members",
67 | "filter_relationship_one_to_at_least_one",
68 | "filter_relationship_one_to_one",
69 | "Schema",
70 | "Any",
71 | "Bool",
72 | "Column",
73 | "Date",
74 | "Datetime",
75 | "Decimal",
76 | "Duration",
77 | "Time",
78 | "Enum",
79 | "Float",
80 | "Float32",
81 | "Float64",
82 | "Int8",
83 | "Int16",
84 | "Int32",
85 | "Int64",
86 | "Integer",
87 | "UInt8",
88 | "UInt16",
89 | "UInt32",
90 | "UInt64",
91 | "String",
92 | "Struct",
93 | "List",
94 | "Array",
95 | "Object",
96 | ]
97 |
--------------------------------------------------------------------------------
/dataframely/_compat.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 |
5 | from typing import Any
6 |
7 |
8 | class _DummyModule: # pragma: no cover
9 | def __init__(self, module: str) -> None:
10 | self.module = module
11 |
12 | def __getattr__(self, name: str) -> Any:
13 | raise ValueError(f"Module '{self.module}' is not installed.")
14 |
15 |
16 | # ------------------------------------ SQLALCHEMY ------------------------------------ #
17 |
18 | try:
19 | import sqlalchemy as sa
20 | import sqlalchemy.dialects.mssql as sa_mssql
21 | from sqlalchemy.sql.type_api import TypeEngine as sa_TypeEngine
22 | except ImportError: # pragma: no cover
23 | sa = _DummyModule("sqlalchemy") # type: ignore
24 | sa_mssql = _DummyModule("sqlalchemy") # type: ignore
25 |
26 | class sa_TypeEngine: # type: ignore # noqa: N801
27 | pass
28 |
29 |
30 | # -------------------------------------- PYARROW ------------------------------------- #
31 |
32 | try:
33 | import pyarrow as pa
34 | except ImportError: # pragma: no cover
35 | pa = _DummyModule("pyarrow")
36 |
37 | # ------------------------------------------------------------------------------------ #
38 |
39 | __all__ = ["sa", "sa_mssql", "sa_TypeEngine", "pa"]
40 |
--------------------------------------------------------------------------------
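
The `_DummyModule` shim above makes optional dependencies fail lazily: importing `dataframely._compat` always succeeds, and an error is raised only once an attribute of the missing module is actually touched. A minimal sketch of that behavior, assuming `sqlalchemy` is not installed:

```python
from dataframely._compat import sa

# The import above succeeded even without sqlalchemy; only attribute
# access on the dummy module raises.
try:
    sa.String()
except ValueError as e:
    print(e)  # Module 'sqlalchemy' is not installed.
```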
/dataframely/_deprecation.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import os
5 | import warnings
6 | from collections.abc import Callable
7 | from functools import wraps
8 |
9 | TRUTHY_VALUES = ["1", "true"]
10 |
11 |
12 | def skip_if(env: str) -> Callable:
13 | """Decorator to skip warnings based on environment variable.
14 |
15 | If the environment variable is equivalent to any of TRUTHY_VALUES, the wrapped
16 | function is skipped.
17 | """
18 |
19 | def decorator(fun: Callable) -> Callable:
20 | @wraps(fun)
21 | def wrapper() -> None:
22 | if os.getenv(env, "").lower() in TRUTHY_VALUES:
23 | return
24 | fun()
25 |
26 | return wrapper
27 |
28 | return decorator
29 |
30 |
31 | @skip_if(env="DATAFRAMELY_NO_FUTURE_WARNINGS")
32 | def warn_nullable_default_change() -> None:
33 | warnings.warn(
34 | "The 'nullable' argument was not explicitly set. In a future release, "
35 | "'nullable=False' will be the default if 'nullable' is not specified. "
36 | "Explicitly set 'nullable=True' if you want your column to be nullable.",
37 | FutureWarning,
38 | stacklevel=4,
39 | )
40 |
--------------------------------------------------------------------------------
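
A short sketch of the `skip_if` mechanics: the environment variable is read at call time inside the wrapper, so the same decorated function warns or stays silent depending on the current environment.

```python
import os
import warnings

from dataframely._deprecation import warn_nullable_default_change

# Default: the FutureWarning is emitted.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    warn_nullable_default_change()
assert any(issubclass(w.category, FutureWarning) for w in caught)

# With a truthy value for the environment variable, the call is a no-op.
os.environ["DATAFRAMELY_NO_FUTURE_WARNINGS"] = "1"
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    warn_nullable_default_change()
assert not caught
```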
/dataframely/_extre.pyi:
--------------------------------------------------------------------------------
1 | from typing import Literal, overload
2 |
3 | def matching_string_length(regex: str) -> tuple[int, int | None]:
4 | """
5 | Compute the minimum and maximum length (if available) of strings matching a regular expression.
6 |
7 | Args:
8 | regex: The regular expression to analyze. The regular expression must not
9 | contain any lookaround operators.
10 |
11 | Returns:
12 | A tuple of the minimum and maximum length of the matching strings. While the minimum
13 | length is guaranteed to be available, the maximum length may be ``None`` if ``regex``
14 | matches strings of potentially infinite length (e.g. due to the use of ``+`` or ``*``).
15 |
16 | Raises:
17 | ValueError: If the regex cannot be parsed or analyzed.
18 | """
19 |
20 | @overload
21 | def sample(
22 | regex: str, n: int, max_repetitions: int = 16, seed: int | None = None
23 | ) -> list[str]:
24 | """
25 | Sample a random (set of) string(s) matching the provided regular expression.
26 |
27 | Args:
28 | regex: The regular expression generated strings must match. The regular
29 | expression must not contain any lookaround operators.
30 | n: The number of random strings to generate or ``None`` if a single one should
31 | be generated.
32 | max_repetitions: The maximum number of repetitions for ``+`` and ``*``
33 | quantifiers.
34 | seed: The seed to use for the random sampling procedure.
35 |
36 | Returns:
37 | A single randomly generated string if ``n is None`` or a list of randomly
38 | generated strings if ``n`` is an integer.
39 |
40 | Raises:
41 | ValueError: If the regex cannot be parsed.
42 |
43 | Attention:
44 | Using wildcards (i.e. ``.``) really means _any_ valid Unicode character.
45 | Consider using more precise regular expressions if this is undesired.
46 | """
47 |
48 | @overload
49 | def sample(
50 | regex: str,
51 | n: Literal[None] = None,
52 | max_repetitions: int = 16,
53 | seed: int | None = None,
54 | ) -> str: ...
55 |
--------------------------------------------------------------------------------
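
A usage sketch for the two stubbed functions; `_extre` is the compiled Rust extension (see `src/`), and the concrete sampled strings below are illustrative.

```python
from dataframely import _extre

# Minimum and maximum length of strings matching a regex.
assert _extre.matching_string_length(r"[a-z]{3,5}") == (3, 5)
assert _extre.matching_string_length(r"[a-z]+")[1] is None  # unbounded

# Sample matching strings, reproducibly via a seed.
samples = _extre.sample(r"[0-9]{5}", n=3, seed=42)
print(samples)  # e.g. ['40977', '03172', '68517']

# Without `n`, a single string is returned instead of a list.
single = _extre.sample(r"[0-9]{5}")
```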
/dataframely/_filter.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | from collections.abc import Callable
5 | from typing import Generic, TypeVar
6 |
7 | import polars as pl
8 |
9 | C = TypeVar("C")
10 |
11 |
12 | class Filter(Generic[C]):
13 | """Internal class representing logic for filtering members of a collection."""
14 |
15 | def __init__(self, logic: Callable[[C], pl.LazyFrame]) -> None:
16 | self.logic = logic
17 |
18 |
19 | def filter() -> Callable[[Callable[[C], pl.LazyFrame]], Filter[C]]:
20 | """Mark a function as filters for rows in the members of a collection.
21 |
22 | The name of the function will be used as the name of the filter. The name must not
23 | clash with the name of any column in the member schemas or rules defined on the
24 | member schemas.
25 |
26 | A filter receives a collection as input and must return a data frame like the
27 | following:
28 |
29 | - The columns must be a superset of the common primary keys across all members.
30 | - The rows must provide the primary keys which ought to be *kept* across the
31 | members. The filter results in the removal of rows which are lost as the result
32 | of inner-joining members onto the return value of this function.
33 |
34 | Attention:
35 | Make sure to provide unique combinations of the primary keys or the filters
36 | might introduce duplicate rows.
37 | """
38 |
39 | def decorator(validation_fn: Callable[[C], pl.LazyFrame]) -> Filter[C]:
40 | return Filter(logic=validation_fn)
41 |
42 | return decorator
43 |
--------------------------------------------------------------------------------
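
A hedged sketch of declaring a filter on a collection (the member and schema names are illustrative; the `Collection` machinery itself lives in `collection.py`):

```python
import dataframely as dy
import polars as pl


class PersonSchema(dy.Schema):
    person_id = dy.Int64(primary_key=True)


class VisitSchema(dy.Schema):
    person_id = dy.Int64(primary_key=True)
    date = dy.Date(primary_key=True)


class HouseholdData(dy.Collection):
    people: dy.LazyFrame[PersonSchema]
    visits: dy.LazyFrame[VisitSchema]

    @dy.filter()
    def person_has_at_least_one_visit(self) -> pl.LazyFrame:
        # Return the primary keys to *keep*: members are inner-joined onto
        # this frame, so people without visits are dropped everywhere.
        return self.people.join(self.visits, on="person_id", how="semi")
```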
/dataframely/_polars.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import datetime as dt
5 | from collections.abc import Iterable
6 | from typing import TypeVar
7 |
8 | import polars as pl
9 | from polars.datatypes import DataTypeClass
10 |
11 | PolarsDataType = pl.DataType | DataTypeClass
12 | FrameType = TypeVar("FrameType", pl.DataFrame, pl.LazyFrame)
13 |
14 | EPOCH_DATETIME = dt.datetime(1970, 1, 1)
15 | SECONDS_PER_DAY = 86400
16 |
17 | # --------------------------------------- JOINS -------------------------------------- #
18 |
19 |
20 | def join_all_inner(dfs: Iterable[FrameType], on: str | list[str]) -> FrameType:
21 | it = iter(dfs)
22 | result = next(it)
23 | while (df := next(it, None)) is not None:
24 | result = result.join(df, on=on)
25 | return result
26 |
27 |
28 | def join_all_outer(dfs: Iterable[FrameType], on: str | list[str]) -> FrameType:
29 | it = iter(dfs)
30 | result = next(it)
31 | while (df := next(it, None)) is not None:
32 | result = result.join(df, on=on, how="full", coalesce=True)
33 | return result
34 |
35 |
36 | # ------------------------------------- DATETIMES ------------------------------------ #
37 |
38 |
39 | def date_matches_resolution(t: dt.date, resolution: str) -> bool:
40 | return pl.Series([t], dtype=pl.Date).dt.truncate(resolution).item() == t
41 |
42 |
43 | def datetime_matches_resolution(t: dt.datetime, resolution: str) -> bool:
44 | return pl.Series([t], dtype=pl.Datetime).dt.truncate(resolution).item() == t
45 |
46 |
47 | def time_matches_resolution(t: dt.time, resolution: str) -> bool:
48 | return (
49 | pl.Series([t], dtype=pl.Time)
50 | .to_frame("t")
51 | .select(
52 | pl.lit(EPOCH_DATETIME.date())
53 | .dt.combine(pl.col("t"))
54 | .dt.truncate(resolution)
55 | .dt.time()
56 | )
57 | .item()
58 | == t
59 | )
60 |
61 |
62 | def timedelta_matches_resolution(d: dt.timedelta, resolution: str) -> bool:
63 | return datetime_matches_resolution(EPOCH_DATETIME + d, resolution)
64 |
--------------------------------------------------------------------------------
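
The join helpers fold an arbitrary iterable of frames into a single frame; note they assume at least one frame, since the initial `next(it)` raises `StopIteration` on an empty iterable. A small usage sketch:

```python
import polars as pl

from dataframely._polars import join_all_inner, join_all_outer

a = pl.DataFrame({"id": [1, 2, 3], "x": ["a", "b", "c"]})
b = pl.DataFrame({"id": [2, 3, 4], "y": [0.1, 0.2, 0.3]})

# Inner join keeps only ids present in every frame: {2, 3}.
print(join_all_inner([a, b], on="id"))

# Full outer join with key coalescing keeps all ids: {1, 2, 3, 4}.
print(join_all_outer([a, b], on="id"))
```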
/dataframely/_typing.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | from __future__ import annotations
5 |
6 | from collections.abc import Callable
7 | from typing import TYPE_CHECKING, Any, Concatenate, Generic, ParamSpec, TypeVar
8 |
9 | import polars as pl
10 |
11 | from ._base_schema import BaseSchema
12 |
13 | S = TypeVar("S", bound=BaseSchema, covariant=True)
14 |
15 | P = ParamSpec("P")
16 | R = TypeVar("R")
17 |
18 |
19 | def inherit_signature( # pragma: no cover
20 | target_fn: Callable[P, Any],
21 | ) -> Callable[[Callable[..., R]], Callable[P, R]]:
22 | # NOTE: This code is executed during parsing but has no effect at runtime.
23 | if TYPE_CHECKING:
24 | return lambda _: target_fn
25 |
26 | return lambda _: None
27 |
28 |
29 | class DataFrame(pl.DataFrame, Generic[S]):
30 | """Generic wrapper around a :class:`polars.DataFrame` to attach schema information.
31 |
32 | This class is merely used for the type system and never actually instantiated. This
33 | means that no instance of it ever exists at runtime and ``isinstance`` checks against
34 | it will always fail. Accordingly, users should not try to create instances of this class.
35 | """
36 |
37 | # NOTE: Code in this class will never be executed.
38 |
39 | @inherit_signature(pl.DataFrame.clear)
40 | def clear(self, *args: Any, **kwargs: Any) -> DataFrame[S]:
41 | raise NotImplementedError # pragma: no cover
42 |
43 | @inherit_signature(pl.DataFrame.clone)
44 | def clone(self, *args: Any, **kwargs: Any) -> DataFrame[S]:
45 | raise NotImplementedError # pragma: no cover
46 |
47 | @inherit_signature(pl.DataFrame.lazy)
48 | def lazy(self, *args: Any, **kwargs: Any) -> LazyFrame[S]:
49 | raise NotImplementedError # pragma: no cover
50 |
51 | def pipe(
52 | self,
53 | function: Callable[Concatenate[DataFrame[S], P], R],
54 | *args: P.args,
55 | **kwargs: P.kwargs,
56 | ) -> R:
57 | raise NotImplementedError # pragma: no cover
58 |
59 | @inherit_signature(pl.DataFrame.rechunk)
60 | def rechunk(self, *args: Any, **kwargs: Any) -> DataFrame[S]:
61 | raise NotImplementedError # pragma: no cover
62 |
63 | @inherit_signature(pl.DataFrame.set_sorted)
64 | def set_sorted(self, *args: Any, **kwargs: Any) -> DataFrame[S]:
65 | raise NotImplementedError # pragma: no cover
66 |
67 | @inherit_signature(pl.DataFrame.shrink_to_fit)
68 | def shrink_to_fit(self, *args: Any, **kwargs: Any) -> DataFrame[S]:
69 | raise NotImplementedError # pragma: no cover
70 |
71 |
72 | class LazyFrame(pl.LazyFrame, Generic[S]):
73 | """Generic wrapper around a :class:`polars.LazyFrame` to attach schema information.
74 |
75 | This class is merely used for the type system and never actually instantiated. This
76 | means that no instance of it ever exists at runtime and ``isinstance`` checks against
77 | it will always fail. Accordingly, users should not try to create instances of this class.
78 | """
79 |
80 | # NOTE: Code in this class will never be executed.
81 |
82 | @inherit_signature(pl.LazyFrame.cache)
83 | def cache(self, *args: Any, **kwargs: Any) -> LazyFrame[S]:
84 | raise NotImplementedError # pragma: no cover
85 |
86 | @inherit_signature(pl.LazyFrame.clear)
87 | def clear(self, *args: Any, **kwargs: Any) -> LazyFrame[S]:
88 | raise NotImplementedError # pragma: no cover
89 |
90 | @inherit_signature(pl.LazyFrame.clone)
91 | def clone(self, *args: Any, **kwargs: Any) -> LazyFrame[S]:
92 | raise NotImplementedError # pragma: no cover
93 |
94 | # NOTE: inheriting the signature does not work since `mypy` doesn't correctly
95 | # propagate overloads
96 | def collect(self, *args: Any, **kwargs: Any) -> DataFrame[S]: # type: ignore
97 | raise NotImplementedError # pragma: no cover
98 |
99 | @inherit_signature(pl.LazyFrame.lazy)
100 | def lazy(self, *args: Any, **kwargs: Any) -> LazyFrame[S]:
101 | raise NotImplementedError # pragma: no cover
102 |
103 | def pipe(
104 | self,
105 | function: Callable[Concatenate[LazyFrame[S], P], R],
106 | *args: P.args,
107 | **kwargs: P.kwargs,
108 | ) -> R:
109 | raise NotImplementedError # pragma: no cover
110 |
111 | @inherit_signature(pl.LazyFrame.set_sorted)
112 | def set_sorted(self, *args: Any, **kwargs: Any) -> LazyFrame[S]:
113 | raise NotImplementedError # pragma: no cover
114 |
--------------------------------------------------------------------------------
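
A sketch of how these wrappers are used in practice: at runtime everything is a plain polars frame, while static type checkers see the schema parameter flow through `lazy`/`collect`.

```python
import dataframely as dy
import polars as pl


class MySchema(dy.Schema):
    id = dy.Int64(primary_key=True)


def load() -> dy.LazyFrame[MySchema]:
    # The runtime value is an ordinary pl.LazyFrame; the parameterized
    # annotation only carries schema information for type checkers.
    return MySchema.validate(pl.DataFrame({"id": [1, 2]}), cast=True).lazy()


lf = load()
df: dy.DataFrame[MySchema] = lf.collect()  # collect() keeps the schema type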
/dataframely/_validation.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | from collections.abc import Iterable
5 | from typing import Literal
6 |
7 | import polars as pl
8 |
9 | from dataframely.exc import DtypeValidationError, ValidationError
10 |
11 | from ._polars import PolarsDataType
12 | from .columns import Column
13 |
14 | DtypeCasting = Literal["none", "lenient", "strict"]
15 |
16 |
17 | def validate_columns(
18 | lf: pl.LazyFrame,
19 | actual: Iterable[str],
20 | expected: Iterable[str],
21 | ) -> pl.LazyFrame:
22 | """Validate the existence of expected columns in a data frame.
23 |
24 | Args:
25 | lf: The data frame whose list of columns to validate.
26 | actual: The list of columns that _are_ observed. Passed as a separate argument
27 | for performance, since it minimizes the number of schema collections.
28 | expected: The list of columns that _should_ be observed.
29 |
30 | Raises:
31 | ValidationError: If any expected column is not part of the actual columns.
32 |
33 | Returns:
34 | The input data frame, either as-is or with extra columns stripped.
35 | """
36 | actual_set = set(actual)
37 | expected_set = set(expected)
38 |
39 | missing_columns = expected_set - actual_set
40 | if len(missing_columns) > 0:
41 | raise ValidationError(
42 | f"{len(missing_columns)} columns in the schema are missing in the "
43 | f"data frame: {sorted(missing_columns)}."
44 | )
45 |
46 | return lf.select(expected)
47 |
48 |
49 | def validate_dtypes(
50 | lf: pl.LazyFrame,
51 | actual: pl.Schema,
52 | expected: dict[str, Column],
53 | casting: DtypeCasting,
54 | ) -> pl.LazyFrame:
55 | """Validate the dtypes of all expected columns in a data frame.
56 |
57 | Args:
58 | lf: The data frame whose column dtypes to validate.
59 | actual: The actual schema of the data frame. Passed as a separate argument
60 | for performance, since it minimizes the number of schema collections.
61 | expected: The column definitions carrying the expected dtypes.
62 | casting: The strategy for casting dtypes.
63 |
64 | Raises:
65 | DtypeValidationError: If the expected column dtypes do not match the input's
66 | and ``casting`` is set to ``none``.
67 |
68 | Returns:
69 | The input data frame with all column dtypes ensured to have the expected dtype.
70 | """
71 | dtype_errors: dict[str, tuple[PolarsDataType, PolarsDataType]] = {}
72 | for name, col in expected.items():
73 | if not col.validate_dtype(actual[name]):
74 | dtype_errors[name] = (actual[name], col.dtype)
75 |
76 | if len(dtype_errors) > 0:
77 | if casting == "none":
78 | raise DtypeValidationError(dtype_errors)
79 | else:
80 | return lf.with_columns(
81 | pl.col(name).cast(expected[name].dtype, strict=(casting == "strict"))
82 | for name in dtype_errors.keys()
83 | )
84 |
85 | return lf
86 |
--------------------------------------------------------------------------------
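
A minimal sketch of the two helpers in sequence, using column types from `dataframely.columns` (the explicit `nullable=True` merely silences the future warning from `_deprecation.py`):

```python
import polars as pl

from dataframely._validation import validate_columns, validate_dtypes
from dataframely.columns import Int64, String

expected = {"id": Int64(nullable=True), "name": String(nullable=True)}
lf = pl.LazyFrame({"id": ["1", "2"], "name": ["a", "b"], "extra": [0, 0]})

# 1. Presence check: missing columns raise, extra columns are stripped.
lf = validate_columns(lf, actual=lf.collect_schema().names(), expected=expected.keys())

# 2. Dtype check: "id" (String) mismatches Int64 and is cast leniently.
lf = validate_dtypes(lf, actual=lf.collect_schema(), expected=expected, casting="lenient")
print(lf.collect())  # columns: id (Int64), name (String)
```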
/dataframely/columns/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | from ._base import Column
5 | from .any import Any
6 | from .array import Array
7 | from .bool import Bool
8 | from .datetime import Date, Datetime, Duration, Time
9 | from .decimal import Decimal
10 | from .enum import Enum
11 | from .float import Float, Float32, Float64
12 | from .integer import Int8, Int16, Int32, Int64, Integer, UInt8, UInt16, UInt32, UInt64
13 | from .list import List
14 | from .object import Object
15 | from .string import String
16 | from .struct import Struct
17 |
18 | __all__ = [
19 | "Column",
20 | "Any",
21 | "Array",
22 | "Bool",
23 | "Date",
24 | "Datetime",
25 | "Decimal",
26 | "Duration",
27 | "Enum",
28 | "Time",
29 | "Float",
30 | "Float32",
31 | "Float64",
32 | "Int8",
33 | "Int16",
34 | "Int32",
35 | "Int64",
36 | "Integer",
37 | "Object",
38 | "UInt8",
39 | "UInt16",
40 | "UInt32",
41 | "UInt64",
42 | "String",
43 | "List",
44 | "Struct",
45 | ]
46 |
--------------------------------------------------------------------------------
/dataframely/columns/_mixins.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | from collections.abc import Sequence
5 | from typing import TYPE_CHECKING, Any, Generic, Protocol, Self, TypeVar
6 |
7 | import polars as pl
8 |
9 | if TYPE_CHECKING: # pragma: no cover
10 | from ._base import Column
11 |
12 | Base = Column
13 | else:
14 | Base = object
15 |
16 | # ----------------------------------- ORDINAL MIXIN ---------------------------------- #
17 |
18 |
19 | class Comparable(Protocol):
20 | def __gt__(self, other: Self, /) -> bool: ...
21 | def __ge__(self, other: Self, /) -> bool: ...
22 |
23 |
24 | T = TypeVar("T", bound=Comparable)
25 |
26 |
27 | class OrdinalMixin(Generic[T], Base):
28 | """Mixin to use for ordinal types."""
29 |
30 | def __init__(
31 | self,
32 | *,
33 | min: T | None = None,
34 | min_exclusive: T | None = None,
35 | max: T | None = None,
36 | max_exclusive: T | None = None,
37 | **kwargs: Any,
38 | ):
39 | if min is not None and min_exclusive is not None:
40 | raise ValueError("At most one of `min` and `min_exclusive` must be set.")
41 | if max is not None and max_exclusive is not None:
42 | raise ValueError("At most one of `max` and `max_exclusive` must be set.")
43 |
44 | if min is not None and max is not None and min > max:
45 | raise ValueError("`min` must not be greater than `max`.")
46 | if min_exclusive is not None and max is not None and min_exclusive >= max:
47 | raise ValueError("`min_exclusive` must not be greater than or equal to `max`.")
48 | if min is not None and max_exclusive is not None and min >= max_exclusive:
49 | raise ValueError("`min` must not be greater than or equal to `max_exclusive`.")
50 | if (
51 | min_exclusive is not None
52 | and max_exclusive is not None
53 | and min_exclusive >= max_exclusive
54 | ):
55 | raise ValueError(
56 | "`min_exclusive` must not be greater or equal to `max_exclusive`."
57 | )
58 |
59 | super().__init__(**kwargs)
60 | self.min = min
61 | self.min_exclusive = min_exclusive
62 | self.max = max
63 | self.max_exclusive = max_exclusive
64 |
65 | def validation_rules(self, expr: pl.Expr) -> dict[str, pl.Expr]:
66 | result = super().validation_rules(expr)
67 | if self.min is not None:
68 | result["min"] = expr >= self.min # type: ignore
69 | if self.min_exclusive is not None:
70 | result["min_exclusive"] = expr > self.min_exclusive # type: ignore
71 | if self.max is not None:
72 | result["max"] = expr <= self.max # type: ignore
73 | if self.max_exclusive is not None:
74 | result["max_exclusive"] = expr < self.max_exclusive # type: ignore
75 | return result
76 |
77 |
78 | # ------------------------------------ IS IN MIXIN ----------------------------------- #
79 |
80 | U = TypeVar("U")
81 |
82 |
83 | class IsInMixin(Generic[U], Base):
84 | """Mixin to use for types implementing "is in"."""
85 |
86 | def __init__(self, *, is_in: Sequence[U] | None = None, **kwargs: Any) -> None:
87 | super().__init__(**kwargs)
88 | self.is_in = is_in
89 |
90 | def validation_rules(self, expr: pl.Expr) -> dict[str, pl.Expr]:
91 | result = super().validation_rules(expr)
92 | if self.is_in is not None:
93 | result["is_in"] = expr.is_in(self.is_in)
94 | return result
95 |
--------------------------------------------------------------------------------
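
A sketch of what `OrdinalMixin` contributes on a concrete column type (assuming the integer columns mix it in; any further rule names come from the `Column` base class):

```python
import dataframely as dy
import polars as pl

# Bounds turn into individually named boolean validation rules.
col = dy.Int64(nullable=False, min=0, max_exclusive=100)
rules = col.validation_rules(pl.col("age"))
print(sorted(rules))  # includes 'min' and 'max_exclusive'

# Conflicting bounds are rejected at construction time:
# dy.Int64(nullable=False, min=1, min_exclusive=0)  -> ValueError
```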
/dataframely/columns/_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | from collections.abc import Callable
5 | from typing import Any, Concatenate, Literal, ParamSpec, TypeVar, overload
6 |
7 |
8 | class classproperty(property): # noqa: N801
9 | """Replacement for the deprecated @classmethod @property decorator combination.
10 |
11 | Usage:
12 | ```
13 | @classproperty
14 | def num_bytes(self) -> int:
15 | ...
16 | ```
17 | """
18 |
19 | def __get__(self, instance: Any, owner: type | None = None, /) -> Any:
20 | return self.fget(owner) if self.fget is not None else None
21 |
22 |
23 | T = TypeVar("T")
24 | R = TypeVar("R")
25 | P = ParamSpec("P")
26 |
27 |
28 | def map_optional(
29 | fn: Callable[Concatenate[T, P], R],
30 | value: T | None,
31 | *args: P.args,
32 | **kwargs: P.kwargs,
33 | ) -> R | None:
34 | if value is None:
35 | return None
36 | return fn(value, *args, **kwargs)
37 |
38 |
39 | @overload
40 | def first_non_null(
41 | *values: T | None, allow_null_response: Literal[True]
42 | ) -> T | None: ...
43 |
44 |
45 | @overload
46 | def first_non_null(*values: T | None, default: T) -> T: ...
47 |
48 |
49 | def first_non_null(
50 | *values: T | None,
51 | default: T | None = None,
52 | allow_null_response: Literal[True] | None = None,
53 | ) -> T | None:
54 | """Returns the first element in a sequence that is not None."""
55 | for value in values:
56 | if value is not None:
57 | return value
58 | if allow_null_response:
59 | return None
60 | return default
61 |
--------------------------------------------------------------------------------
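
Brief usage sketches for the helpers above:

```python
from dataframely.columns._utils import first_non_null, map_optional

# map_optional: apply a function only to non-None values.
assert map_optional(str.upper, "abc") == "ABC"
assert map_optional(str.upper, None) is None

# first_non_null: first non-None value, with an explicit fallback.
assert first_non_null(None, 3, 5, default=0) == 3
assert first_non_null(None, None, default=0) == 0
assert first_non_null(None, allow_null_response=True) is None
```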
/dataframely/columns/any.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | from __future__ import annotations
5 |
6 | from collections.abc import Callable
7 |
8 | import polars as pl
9 |
10 | from dataframely._compat import pa, sa, sa_mssql, sa_TypeEngine
11 | from dataframely._polars import PolarsDataType
12 | from dataframely.random import Generator
13 |
14 | from ._base import Column
15 |
16 |
17 | class Any(Column):
18 | """A column with arbitrary type.
19 |
20 | As a column with arbitrary type is commonly mapped to the ``Null`` type (this is the
21 | default in :mod:`polars` and :mod:`pyarrow` for empty columns), dataframely also
22 | requires this column to be nullable. Hence, it cannot be used as a primary key.
23 | """
24 |
25 | def __init__(
26 | self,
27 | *,
28 | check: (
29 | Callable[[pl.Expr], pl.Expr]
30 | | list[Callable[[pl.Expr], pl.Expr]]
31 | | dict[str, Callable[[pl.Expr], pl.Expr]]
32 | | None
33 | ) = None,
34 | alias: str | None = None,
35 | metadata: dict[str, Any] | None = None,
36 | ):
37 | """
38 | Args:
39 | check: A custom rule or multiple rules to run for this column. This can be:
40 | - A single callable that returns a non-aggregated boolean expression.
41 | The name of the rule is derived from the callable name, or defaults to
42 | "check" for lambdas.
43 | - A list of callables, where each callable returns a non-aggregated
44 | boolean expression. The name of the rule is derived from the callable
45 | name, or defaults to "check" for lambdas. Where multiple rules result
46 | in the same name, the suffix __i is appended to the name.
47 | - A dictionary mapping rule names to callables, where each callable
48 | returns a non-aggregated boolean expression.
49 | All rule names provided here are given the prefix "check_".
50 | alias: An overwrite for this column's name which allows for using a column
51 | name that is not a valid Python identifier. Especially note that setting
52 | this option does _not_ allow to refer to the column with two different
53 | names, the specified alias is the only valid name.
54 | metadata: A dictionary of metadata to attach to the column.
55 | """
56 | super().__init__(
57 | nullable=True,
58 | primary_key=False,
59 | check=check,
60 | alias=alias,
61 | metadata=metadata,
62 | )
63 |
64 | @property
65 | def dtype(self) -> pl.DataType:
66 | return pl.Null() # default polars dtype
67 |
68 | def validate_dtype(self, dtype: PolarsDataType) -> bool:
69 | return True
70 |
71 | def sqlalchemy_dtype(self, dialect: sa.Dialect) -> sa_TypeEngine:
72 | match dialect.name:
73 | case "mssql":
74 | return sa_mssql.SQL_VARIANT()
75 | case _: # pragma: no cover
76 | raise NotImplementedError("SQL column cannot have 'Any' type.")
77 |
78 | def pyarrow_field(self, name: str) -> pa.Field:
79 | return pa.field(name, self.pyarrow_dtype, nullable=self.nullable)
80 |
81 | @property
82 | def pyarrow_dtype(self) -> pa.DataType:
83 | return pa.null()
84 |
85 | def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
86 | return pl.repeat(None, n, dtype=pl.Null, eager=True)
87 |
--------------------------------------------------------------------------------
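
A sketch of `Any` in a schema: any dtype passes validation, but the column is forced to be nullable and cannot be a primary key (schema name is illustrative).

```python
import dataframely as dy
import polars as pl


class EventSchema(dy.Schema):
    event_id = dy.Int64(primary_key=True)
    payload = dy.Any()  # accepts any dtype, always nullable


df = pl.DataFrame({"event_id": [1, 2], "payload": [None, None]})
validated = EventSchema.validate(df, cast=True)  # payload dtype Null is fine
```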
/dataframely/columns/array.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | from __future__ import annotations
5 |
6 | import math
7 | from collections.abc import Callable, Sequence
8 | from typing import Any, Literal
9 |
10 | import polars as pl
11 |
12 | from dataframely._compat import pa, sa, sa_TypeEngine
13 | from dataframely.random import Generator
14 |
15 | from ._base import Column
16 | from .struct import Struct
17 |
18 |
19 | class Array(Column):
20 | """A fixed-shape array column."""
21 |
22 | def __init__(
23 | self,
24 | inner: Column,
25 | shape: int | tuple[int, ...],
26 | *,
27 | nullable: bool = True,
28 | # polars doesn't yet support grouping by arrays,
29 | # see https://github.com/pola-rs/polars/issues/22574
30 | primary_key: Literal[False] = False,
31 | check: (
32 | Callable[[pl.Expr], pl.Expr]
33 | | list[Callable[[pl.Expr], pl.Expr]]
34 | | dict[str, Callable[[pl.Expr], pl.Expr]]
35 | | None
36 | ) = None,
37 | alias: str | None = None,
38 | metadata: dict[str, Any] | None = None,
39 | ):
40 | """
41 | Args:
42 | inner: The inner column type. No validation rules on the inner type are supported yet.
43 | shape: The shape of the array.
44 | nullable: Whether this column may contain null values.
45 | primary_key: Whether this column is part of the primary key of the schema.
46 | Not yet supported for the Array type.
47 | check: A custom rule or multiple rules to run for this column. This can be:
48 | - A single callable that returns a non-aggregated boolean expression.
49 | The name of the rule is derived from the callable name, or defaults to
50 | "check" for lambdas.
51 | - A list of callables, where each callable returns a non-aggregated
52 | boolean expression. The name of the rule is derived from the callable
53 | name, or defaults to "check" for lambdas. Where multiple rules result
54 | in the same name, the suffix __i is appended to the name.
55 | - A dictionary mapping rule names to callables, where each callable
56 | returns a non-aggregated boolean expression.
57 | All rule names provided here are given the prefix "check_".
 58 |             alias: An override for this column's name which allows using a column
 59 |                 name that is not a valid Python identifier. Note that setting this
 60 |                 option does _not_ allow referring to the column by two different
 61 |                 names; the specified alias is the only valid name.
62 | metadata: A dictionary of metadata to attach to the column.
63 | """
64 | if inner.primary_key or (
65 | isinstance(inner, Struct)
66 | and any(col.primary_key for col in inner.inner.values())
67 | ):
68 | raise ValueError(
69 | "`primary_key=True` is not yet supported for inner types of the Array type."
70 | )
71 |
 72 |         # We disallow validation rules on the inner type since Polars arrays currently don't support .eval().
 73 |         # Converting to a list and calling .list.eval() would be possible; however, since the shape can have
 74 |         # multiple axes, the recursive conversion could have a significant performance impact. Hence, we simply
 75 |         # disallow inner validation rules. An alternative would be to allow rules for sampling only, without enforcing them.
76 | if inner.validation_rules(pl.lit(None)):
77 | raise ValueError(
78 | "Validation rules on the inner type of Array are not yet supported."
79 | )
80 |
81 | super().__init__(
82 | nullable=nullable,
83 | primary_key=False,
84 | check=check,
85 | alias=alias,
86 | metadata=metadata,
87 | )
88 | self.inner = inner
89 | self.shape = shape if isinstance(shape, tuple) else (shape,)
90 |
91 | @property
92 | def dtype(self) -> pl.DataType:
93 | return pl.Array(self.inner.dtype, self.shape)
94 |
95 | def sqlalchemy_dtype(self, dialect: sa.Dialect) -> sa_TypeEngine:
96 | # NOTE: We might want to add support for PostgreSQL's ARRAY type or use JSON in the future.
97 | raise NotImplementedError("SQL column cannot have 'Array' type.")
98 |
99 | def _pyarrow_dtype_of_shape(self, shape: Sequence[int]) -> pa.DataType:
100 | if shape:
101 | size, *rest = shape
102 | return pa.list_(self._pyarrow_dtype_of_shape(rest), size)
103 | else:
104 | return self.inner.pyarrow_dtype
105 |
106 | @property
107 | def pyarrow_dtype(self) -> pa.DataType:
108 | return self._pyarrow_dtype_of_shape(self.shape)
109 |
110 | def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
111 | # Sample the inner elements in a flat series
112 | n_elements = n * math.prod(self.shape)
113 | all_elements = self.inner.sample(generator, n_elements)
114 |
115 | # Finally, apply a null mask
116 | return generator._apply_null_mask(
117 | all_elements.reshape((n, *self.shape)),
118 | null_probability=self._null_probability,
119 | )
120 |
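Usage note: a short sketch of the shape handling, assuming `Array` and the inner column types are re-exported by `dataframely.columns` (imported as `dc`, following the convention in `dataframely/testing/const.py`):

    import polars as pl
    import dataframely.columns as dc

    # An integer shape is normalized to a 1-tuple; tuples describe multiple axes.
    col = dc.Array(dc.Int64(), shape=(2, 3))
    assert col.dtype == pl.Array(pl.Int64, (2, 3))

    # The pyarrow dtype nests one fixed-size list per axis:
    # fixed_size_list<fixed_size_list<int64, 3>, 2>
    print(col.pyarrow_dtype)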
--------------------------------------------------------------------------------
/dataframely/columns/bool.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | from __future__ import annotations
5 |
6 | import polars as pl
7 |
8 | from dataframely._compat import pa, sa, sa_TypeEngine
9 | from dataframely.random import Generator
10 |
11 | from ._base import Column
12 |
13 | # ------------------------------------------------------------------------------------ #
14 |
15 |
16 | class Bool(Column):
17 | """A column of booleans."""
18 |
19 | @property
20 | def dtype(self) -> pl.DataType:
21 | return pl.Boolean()
22 |
23 | def sqlalchemy_dtype(self, dialect: sa.Dialect) -> sa_TypeEngine:
24 | return sa.Boolean()
25 |
26 | @property
27 | def pyarrow_dtype(self) -> pa.DataType:
28 | return pa.bool_()
29 |
30 | def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
31 | return generator.sample_bool(n, null_probability=self._null_probability)
32 |
--------------------------------------------------------------------------------
/dataframely/columns/enum.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | from __future__ import annotations
5 |
6 | from collections.abc import Callable, Sequence
7 | from typing import Any
8 |
9 | import polars as pl
10 |
11 | from dataframely._compat import pa, sa, sa_TypeEngine
12 | from dataframely._polars import PolarsDataType
13 | from dataframely.random import Generator
14 |
15 | from ._base import Column
16 |
17 |
18 | class Enum(Column):
19 | """A column of enum (string) values."""
20 |
21 | def __init__(
22 | self,
23 | categories: Sequence[str],
24 | *,
25 | nullable: bool | None = None,
26 | primary_key: bool = False,
27 | check: (
28 | Callable[[pl.Expr], pl.Expr]
29 | | list[Callable[[pl.Expr], pl.Expr]]
30 | | dict[str, Callable[[pl.Expr], pl.Expr]]
31 | | None
32 | ) = None,
33 | alias: str | None = None,
34 | metadata: dict[str, Any] | None = None,
35 | ):
36 | """
37 | Args:
38 | categories: The list of valid categories for the enum.
39 | nullable: Whether this column may contain null values.
40 | Explicitly set `nullable=True` if you want your column to be nullable.
41 | In a future release, `nullable=False` will be the default if `nullable`
42 | is not specified.
43 | primary_key: Whether this column is part of the primary key of the schema.
44 | If ``True``, ``nullable`` is automatically set to ``False``.
45 | check: A custom rule or multiple rules to run for this column. This can be:
46 | - A single callable that returns a non-aggregated boolean expression.
47 | The name of the rule is derived from the callable name, or defaults to
48 | "check" for lambdas.
49 | - A list of callables, where each callable returns a non-aggregated
50 | boolean expression. The name of the rule is derived from the callable
51 | name, or defaults to "check" for lambdas. Where multiple rules result
52 | in the same name, the suffix __i is appended to the name.
53 | - A dictionary mapping rule names to callables, where each callable
54 | returns a non-aggregated boolean expression.
55 | All rule names provided here are given the prefix "check_".
 56 |             alias: An override for this column's name which allows using a column
 57 |                 name that is not a valid Python identifier. Note that setting this
 58 |                 option does _not_ allow referring to the column by two different
 59 |                 names; the specified alias is the only valid name.
60 | metadata: A dictionary of metadata to attach to the column.
61 | """
62 | super().__init__(
63 | nullable=nullable,
64 | primary_key=primary_key,
65 | check=check,
66 | alias=alias,
67 | metadata=metadata,
68 | )
69 | self.categories = list(categories)
70 |
71 | @property
72 | def dtype(self) -> pl.DataType:
73 | return pl.Enum(self.categories)
74 |
75 | def validate_dtype(self, dtype: PolarsDataType) -> bool:
76 | if not isinstance(dtype, pl.Enum):
77 | return False
78 | return self.categories == dtype.categories.to_list()
79 |
80 | def sqlalchemy_dtype(self, dialect: sa.Dialect) -> sa_TypeEngine:
81 | category_lengths = [len(c) for c in self.categories]
82 | if all(length == category_lengths[0] for length in category_lengths):
83 | return sa.CHAR(category_lengths[0])
84 | return sa.String(max(category_lengths))
85 |
86 | @property
87 | def pyarrow_dtype(self) -> pa.DataType:
88 | return pa.dictionary(pa.uint32(), pa.large_string())
89 |
90 | def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
91 | return generator.sample_choice(
92 | n, choices=self.categories, null_probability=self._null_probability
93 | ).cast(self.dtype)
94 |
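Usage note: a sketch of the SQL type mapping above (the `dc` import follows `dataframely/testing/const.py`; the variable names are illustrative):

    import dataframely.columns as dc

    fixed = dc.Enum(["open", "done"], nullable=True)  # all categories have length 4
    varied = dc.Enum(["a", "abc"], nullable=True)

    # Equal-length categories map to a fixed-width CHAR, otherwise to a String
    # sized by the longest category:
    #   fixed.sqlalchemy_dtype(dialect)  -> sa.CHAR(4)
    #   varied.sqlalchemy_dtype(dialect) -> sa.String(3)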
--------------------------------------------------------------------------------
/dataframely/columns/object.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | from __future__ import annotations
5 |
6 | from collections.abc import Callable
7 | from typing import Any
8 |
9 | import polars as pl
10 |
11 | from dataframely._compat import pa, sa, sa_TypeEngine
12 | from dataframely.random import Generator
13 |
14 | from ._base import Column
15 |
16 |
17 | class Object(Column):
18 | """A Python Object column."""
19 |
20 | def __init__(
21 | self,
22 | *,
23 | nullable: bool = True,
24 | primary_key: bool = False,
25 | check: (
26 | Callable[[pl.Expr], pl.Expr]
27 | | list[Callable[[pl.Expr], pl.Expr]]
28 | | dict[str, Callable[[pl.Expr], pl.Expr]]
29 | | None
30 | ) = None,
31 | alias: str | None = None,
32 | metadata: dict[str, Any] | None = None,
33 | ):
34 | """
35 | Args:
36 | nullable: Whether this column may contain null values.
37 | primary_key: Whether this column is part of the primary key of the schema.
38 | check: A custom rule or multiple rules to run for this column. This can be:
39 | - A single callable that returns a non-aggregated boolean expression.
40 | The name of the rule is derived from the callable name, or defaults to
41 | "check" for lambdas.
42 | - A list of callables, where each callable returns a non-aggregated
43 | boolean expression. The name of the rule is derived from the callable
44 | name, or defaults to "check" for lambdas. Where multiple rules result
45 | in the same name, the suffix __i is appended to the name.
46 | - A dictionary mapping rule names to callables, where each callable
47 | returns a non-aggregated boolean expression.
48 | All rule names provided here are given the prefix "check_".
 49 |             alias: An override for this column's name which allows using a column
 50 |                 name that is not a valid Python identifier. Note that setting this
 51 |                 option does _not_ allow referring to the column by two different
 52 |                 names; the specified alias is the only valid name.
53 | metadata: A dictionary of metadata to attach to the column.
54 | """
55 | super().__init__(
56 | nullable=nullable,
57 | primary_key=primary_key,
58 | check=check,
59 | alias=alias,
60 | metadata=metadata,
61 | )
62 |
63 | @property
64 | def dtype(self) -> pl.DataType:
65 | return pl.Object()
66 |
67 | def sqlalchemy_dtype(self, dialect: sa.Dialect) -> sa_TypeEngine:
68 | raise NotImplementedError("SQL column cannot have 'Object' type.")
69 |
70 | @property
71 | def pyarrow_dtype(self) -> pa.DataType:
72 | raise NotImplementedError("PyArrow column cannot have 'Object' type.")
73 |
74 | def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
75 | raise NotImplementedError(
76 | "Random data sampling not implemented for 'Object' type."
77 | )
78 |
--------------------------------------------------------------------------------
/dataframely/columns/struct.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | from __future__ import annotations
5 |
6 | from collections.abc import Callable
7 | from typing import Any
8 |
9 | import polars as pl
10 |
11 | from dataframely._compat import pa, sa, sa_TypeEngine
12 | from dataframely._polars import PolarsDataType
13 | from dataframely.random import Generator
14 |
15 | from ._base import Column
16 |
17 |
18 | class Struct(Column):
19 | """A struct column."""
20 |
21 | def __init__(
22 | self,
23 | inner: dict[str, Column],
24 | *,
25 | nullable: bool | None = None,
26 | primary_key: bool = False,
27 | check: (
28 | Callable[[pl.Expr], pl.Expr]
29 | | list[Callable[[pl.Expr], pl.Expr]]
30 | | dict[str, Callable[[pl.Expr], pl.Expr]]
31 | | None
32 | ) = None,
33 | alias: str | None = None,
34 | metadata: dict[str, Any] | None = None,
35 | ):
36 | """
37 | Args:
 38 |             inner: The dictionary of struct fields. Struct fields may have
 39 |                 ``primary_key=True`` set, but this setting only takes effect if the
 40 |                 struct is nested inside a list. In this case, the list items must be
 41 |                 unique with respect to the struct fields with ``primary_key=True`` set.
42 | nullable: Whether this column may contain null values.
43 | Explicitly set `nullable=True` if you want your column to be nullable.
44 | In a future release, `nullable=False` will be the default if `nullable`
45 | is not specified.
46 | primary_key: Whether this column is part of the primary key of the schema.
47 | check: A custom rule or multiple rules to run for this column. This can be:
48 | - A single callable that returns a non-aggregated boolean expression.
49 | The name of the rule is derived from the callable name, or defaults to
50 | "check" for lambdas.
51 | - A list of callables, where each callable returns a non-aggregated
52 | boolean expression. The name of the rule is derived from the callable
53 | name, or defaults to "check" for lambdas. Where multiple rules result
54 | in the same name, the suffix __i is appended to the name.
55 | - A dictionary mapping rule names to callables, where each callable
56 | returns a non-aggregated boolean expression.
57 | All rule names provided here are given the prefix "check_".
 58 |             alias: An override for this column's name which allows using a column
 59 |                 name that is not a valid Python identifier. Note that setting this
 60 |                 option does _not_ allow referring to the column by two different
 61 |                 names; the specified alias is the only valid name.
62 | metadata: A dictionary of metadata to attach to the column.
63 | """
64 | super().__init__(
65 | nullable=nullable,
66 | primary_key=primary_key,
67 | check=check,
68 | alias=alias,
69 | metadata=metadata,
70 | )
71 | self.inner = inner
72 |
73 | @property
74 | def dtype(self) -> pl.DataType:
75 | return pl.Struct({name: col.dtype for name, col in self.inner.items()})
76 |
77 | def validate_dtype(self, dtype: PolarsDataType) -> bool:
78 | if not isinstance(dtype, pl.Struct):
79 | return False
80 | if len(dtype.fields) != len(self.inner):
81 | return False
82 |
83 | fields = {field.name: field.dtype for field in dtype.fields}
84 | for name, col in self.inner.items():
85 | field_dtype = fields.get(name)
86 | if field_dtype is None:
87 | return False
88 | if not col.validate_dtype(field_dtype):
89 | return False
90 | return True
91 |
92 | def validation_rules(self, expr: pl.Expr) -> dict[str, pl.Expr]:
93 | inner_rules = {
94 | f"inner_{name}_{rule_name}": (
95 | pl.when(expr.is_null()).then(pl.lit(True)).otherwise(inner_expr)
96 | )
97 | for name, col in self.inner.items()
98 | for rule_name, inner_expr in col.validation_rules(
99 | expr.struct.field(name)
100 | ).items()
101 | }
102 | return {
103 | **super().validation_rules(expr),
104 | **inner_rules,
105 | }
106 |
107 | def sqlalchemy_dtype(self, dialect: sa.Dialect) -> sa_TypeEngine:
108 | # NOTE: We might want to add support for PostgreSQL's JSON in the future.
109 | raise NotImplementedError("SQL column cannot have 'Struct' type.")
110 |
111 | @property
112 | def pyarrow_dtype(self) -> pa.DataType:
113 | return pa.struct({name: col.pyarrow_dtype for name, col in self.inner.items()})
114 |
115 | def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
116 | series = (
117 | pl.DataFrame(
118 | {name: col.sample(generator, n) for name, col in self.inner.items()}
119 | )
120 | .select(pl.struct(pl.all()))
121 | .to_series()
122 | )
123 | # Apply a null mask.
124 | return generator._apply_null_mask(
125 | series, null_probability=self._null_probability
126 | )
127 |
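Usage note: a brief sketch of dtype construction and inner-rule propagation (same `dc` import convention; the exact rule names depend on the rules the inner columns define):

    import polars as pl
    import dataframely.columns as dc

    col = dc.Struct({"a": dc.Int64()}, nullable=True)
    assert col.dtype == pl.Struct({"a": pl.Int64})

    # Inner rules are re-emitted as "inner_<field>_<rule>" and evaluate to
    # True for null structs (see `validation_rules` above).
    rules = col.validation_rules(pl.col("s"))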
--------------------------------------------------------------------------------
/dataframely/config.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import contextlib
5 | from types import TracebackType
6 | from typing import TypedDict, Unpack
7 |
8 |
9 | class Options(TypedDict):
10 | #: The maximum number of iterations to use for "fuzzy" sampling.
11 | max_sampling_iterations: int
12 |
13 |
14 | def default_options() -> Options:
15 | return {
16 | "max_sampling_iterations": 10_000,
17 | }
18 |
19 |
20 | class Config(contextlib.ContextDecorator):
21 | """An object to track global configuration for operations in dataframely."""
22 |
23 | #: The currently valid config options.
24 | options: Options = default_options()
25 | #: Singleton stack to track where to go back after exiting a context.
26 | _stack: list[Options] = []
27 |
28 | def __init__(self, **options: Unpack[Options]) -> None:
29 | self._local_options: Options = {**default_options(), **options}
30 |
31 | @staticmethod
32 | def set_max_sampling_iterations(iterations: int) -> None:
33 | """Set the maximum number of sampling iterations to use on
34 | :meth:`Schema.sample`."""
35 | Config.options["max_sampling_iterations"] = iterations
36 |
37 | @staticmethod
38 | def restore_defaults() -> None:
39 | """Restore the defaults of the configuration."""
40 | Config.options = default_options()
41 |
42 | # ------------------------------------ CONTEXT ----------------------------------- #
43 |
44 | def __enter__(self) -> None:
45 | Config._stack.append(Config.options)
46 | Config.options = self._local_options
47 |
48 | def __exit__(
49 | self,
50 | exc_type: type[BaseException] | None,
51 | exc_val: BaseException | None,
52 | exc_tb: TracebackType | None,
53 | ) -> None:
54 | Config.options = Config._stack.pop()
55 |
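Usage note: ``Config`` is a context manager and, via :class:`contextlib.ContextDecorator`, also usable as a decorator. A minimal sketch:

    from dataframely.config import Config

    with Config(max_sampling_iterations=100):
        # Code in this block sees the local options.
        assert Config.options["max_sampling_iterations"] == 100

    # On exit, the previous options are restored from the singleton stack
    # (here: the default of 10_000, assuming no other overrides were active).
    assert Config.options["max_sampling_iterations"] == 10_000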
--------------------------------------------------------------------------------
/dataframely/exc.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | from collections import defaultdict
5 |
6 | import polars as pl
7 |
8 | from ._polars import PolarsDataType
9 |
10 |
11 | class ValidationError(Exception):
12 | """Error raised when :mod:`dataframely` validation encounters an issue."""
13 |
14 | def __init__(self, message: str) -> None:
15 | super().__init__()
16 | self.message = message
17 |
18 | def __str__(self) -> str:
19 | return self.message
20 |
21 |
22 | class DtypeValidationError(ValidationError):
23 | """Validation error raised when column dtypes are wrong."""
24 |
25 | def __init__(
26 | self, errors: dict[str, tuple[PolarsDataType, PolarsDataType]]
27 | ) -> None:
28 | super().__init__(f"{len(errors)} columns have an invalid dtype")
29 | self.errors = errors
30 |
31 | def __str__(self) -> str:
32 | details = [
33 | f" - '{col}': got dtype '{actual}' but expected '{expected}'"
34 | for col, (actual, expected) in self.errors.items()
35 | ]
36 | return "\n".join([f"{self.message}:"] + details)
37 |
38 |
39 | class RuleValidationError(ValidationError):
40 | """Complex validation error raised when rule validation fails."""
41 |
42 | def __init__(self, errors: dict[str, int]) -> None:
43 | super().__init__(f"{len(errors)} rules failed validation")
44 |
45 | # Split into schema errors and column errors
46 | schema_errors: dict[str, int] = {}
47 | column_errors: dict[str, dict[str, int]] = defaultdict(dict)
48 | for name, count in sorted(errors.items()):
49 | if "|" in name:
50 | column, rule = name.split("|", maxsplit=1)
51 | column_errors[column][rule] = count
52 | else:
53 | schema_errors[name] = count
54 |
55 | self.schema_errors = schema_errors
56 | self.column_errors = column_errors
57 |
58 | def __str__(self) -> str:
59 | schema_details = [
60 | f" - '{name}' failed validation for {count:,} rows"
61 | for name, count in self.schema_errors.items()
62 | ]
63 | column_details = [
64 | msg
65 | for column, errors in self.column_errors.items()
66 | for msg in (
67 | [f" * Column '{column}' failed validation for {len(errors)} rules:"]
68 | + [
69 | f" - '{name}' failed for {count:,} rows"
70 | for name, count in errors.items()
71 | ]
72 | )
73 | ]
74 | return "\n".join([f"{self.message}:"] + schema_details + column_details)
75 |
76 |
77 | class MemberValidationError(ValidationError):
78 | """Validation error raised when multiple members of a collection fail validation."""
79 |
80 | def __init__(self, errors: dict[str, ValidationError]) -> None:
81 | super().__init__(f"{len(errors)} members failed validation")
82 | self.errors = errors
83 |
84 | def __str__(self) -> str:
85 | details = [
86 | f" > Member '{name}' failed validation:\n"
87 | + "\n".join(" " + line for line in str(error).split("\n"))
88 | for name, error in self.errors.items()
89 | ]
90 | return "\n".join([f"{self.message}:"] + details)
91 |
92 |
93 | class ImplementationError(Exception):
94 | """Error raised when a schema is implemented incorrectly."""
95 |
96 |
97 | class AnnotationImplementationError(ImplementationError):
98 | """Error raised when the annotations of a collection are invalid."""
99 |
100 | def __init__(self, attr: str, kls: type) -> None:
101 | message = (
102 | "Annotations of a 'dy.Collection' may only be an (optional) "
103 | f"'dy.LazyFrame', but \"{attr}\" has type '{kls}'."
104 | )
105 | super().__init__(message)
106 |
107 |
108 | class RuleImplementationError(ImplementationError):
109 | """Error raised when a rule is implemented incorrectly."""
110 |
111 | def __init__(
112 | self, name: str, return_dtype: pl.DataType, is_group_rule: bool
113 | ) -> None:
114 | if is_group_rule:
115 | details = (
116 | " When implementing a group rule (i.e. when using the `group_by` "
117 | "parameter), make sure to use an aggregation function such as `.any()`, "
118 | "`.all()`, and others to reduce an expression evaluated on multiple "
119 | "rows in the same group to a single boolean value for the group."
120 | )
121 | else:
122 | details = ""
123 |
124 | message = (
125 | f"Validation rule '{name}' has not been implemented correctly. It "
126 | f"returns dtype '{return_dtype}' but it must return a boolean value."
127 | + details
128 | )
129 | super().__init__(message)
130 |
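For illustration: rule names containing ``|`` are attributed to columns, all others count as schema-level rules. A hypothetical error with one of each renders as follows:

    from dataframely.exc import RuleValidationError

    err = RuleValidationError({"primary_key": 2, "age|check_min": 3})
    print(err)
    # 2 rules failed validation:
    #  - 'primary_key' failed validation for 2 rows
    #  * Column 'age' failed validation for 1 rules:
    #    - 'check_min' failed for 3 rows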
--------------------------------------------------------------------------------
/dataframely/functional.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | from collections.abc import Sequence
5 | from typing import TypeVar
6 |
7 | import polars as pl
8 |
9 | from ._base_collection import BaseCollection
10 | from ._typing import LazyFrame
11 | from .schema import Schema
12 |
13 | S = TypeVar("S", bound=Schema)
14 | T = TypeVar("T", bound=Schema)
15 |
16 | # NOTE: Binding to `BaseCollection` is required here as the TypeVar default for the
17 | # sampling type otherwise causes issues for Python 3.13.
18 | C = TypeVar("C", bound=BaseCollection)
19 |
20 | # ------------------------------------------------------------------------------------ #
21 | # FILTER #
22 | # ------------------------------------------------------------------------------------ #
23 |
24 | # --------------------------------- RELATIONSHIP 1:1 --------------------------------- #
25 |
26 |
27 | def filter_relationship_one_to_one(
28 | lhs: LazyFrame[S] | pl.LazyFrame,
29 | rhs: LazyFrame[T] | pl.LazyFrame,
30 | /,
31 | on: str | list[str],
32 | ) -> pl.LazyFrame:
33 | """Express a 1:1 mapping between data frames for a collection filter.
34 |
35 | Args:
36 | lhs: The first data frame in the 1:1 mapping.
37 | rhs: The second data frame in the 1:1 mapping.
 38 |         on: The columns to join the data frames on, typically the shared primary
 39 |             key columns of the two data frames.
40 | """
41 | return lhs.join(rhs, on=on)
42 |
43 |
44 | # ------------------------------- RELATIONSHIP 1:{1,N} ------------------------------- #
45 |
46 |
47 | def filter_relationship_one_to_at_least_one(
48 | lhs: LazyFrame[S] | pl.LazyFrame,
49 | rhs: LazyFrame[T] | pl.LazyFrame,
50 | /,
51 | on: str | list[str],
52 | ) -> pl.LazyFrame:
53 | """Express a 1:{1,N} mapping between data frames for a collection filter.
54 |
55 | Args:
56 | lhs: The data frame with exactly one occurrence for a set of key columns.
57 | rhs: The data frame with at least one occurrence for a set of key columns.
 58 |         on: The columns to join the data frames on, typically the shared primary
 59 |             key columns of the two data frames.
60 | """
61 | return lhs.join(rhs.unique(on), on=on)
62 |
63 |
64 | # ------------------------------------------------------------------------------------ #
65 | # CONCAT #
66 | # ------------------------------------------------------------------------------------ #
67 |
68 |
69 | def concat_collection_members(collections: Sequence[C], /) -> dict[str, pl.LazyFrame]:
70 | """Concatenate the members of collections with the same type.
71 |
72 | Args:
73 | collections: The collections whose members to concatenate. Optional members
74 | are concatenated only from the collections that provide them.
75 |
76 | Returns:
77 | A mapping from member names to a lazy concatenation of data frames. All keys
78 | are guaranteed to be valid members of the collection.
79 | """
80 | if len(collections) == 0:
 81 |         raise ValueError("Cannot concatenate an empty sequence of collections.")
82 | members = [c.to_dict() for c in collections]
83 | key_union = set(members[0]).union(*members[1:])
84 | return {
85 | key: pl.concat(
86 | [member_dict[key] for member_dict in members if key in member_dict]
87 | )
88 | for key in key_union
89 | }
90 |
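Usage note: a small sketch of the join-based filters on hypothetical lazy frames:

    import polars as pl
    from dataframely.functional import (
        filter_relationship_one_to_at_least_one,
        filter_relationship_one_to_one,
    )

    users = pl.LazyFrame({"id": [1, 2, 3]})
    profiles = pl.LazyFrame({"id": [1, 2], "bio": ["x", "y"]})
    logins = pl.LazyFrame({"id": [1, 1, 2]})

    # The inner join keeps only users 1 and 2, which have exactly one profile each.
    filter_relationship_one_to_one(users, profiles, on="id").collect()

    # The right side is de-duplicated first, so repeated logins still yield
    # one row per matching user.
    filter_relationship_one_to_at_least_one(users, logins, on="id").collect()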
--------------------------------------------------------------------------------
/dataframely/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantco/dataframely/1a8f64e031586416677954f12ed41276b4325945/dataframely/py.typed
--------------------------------------------------------------------------------
/dataframely/testing/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | from .const import (
5 | ALL_COLUMN_TYPES,
6 | COLUMN_TYPES,
7 | FLOAT_COLUMN_TYPES,
8 | INTEGER_COLUMN_TYPES,
9 | NO_VALIDATION_COLUMN_TYPES,
10 | SUPERTYPE_COLUMN_TYPES,
11 | )
12 | from .factory import create_collection, create_collection_raw, create_schema
13 | from .mask import validation_mask
14 | from .rules import evaluate_rules, rules_from_exprs
15 |
16 | __all__ = [
17 | "ALL_COLUMN_TYPES",
18 | "COLUMN_TYPES",
19 | "FLOAT_COLUMN_TYPES",
20 | "INTEGER_COLUMN_TYPES",
21 | "SUPERTYPE_COLUMN_TYPES",
22 | "NO_VALIDATION_COLUMN_TYPES",
23 | "create_collection",
24 | "create_collection_raw",
25 | "create_schema",
26 | "validation_mask",
27 | "evaluate_rules",
28 | "rules_from_exprs",
29 | ]
30 |
--------------------------------------------------------------------------------
/dataframely/testing/const.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import dataframely.columns as dc
5 |
6 | COLUMN_TYPES: list[type[dc.Column]] = [
7 | dc.Bool,
8 | dc.Date,
9 | dc.Datetime,
10 | dc.Time,
11 | dc.Decimal,
12 | dc.Duration,
13 | dc.Float32,
14 | dc.Float64,
15 | dc.Int8,
16 | dc.Int16,
17 | dc.Int32,
18 | dc.Int64,
19 | dc.UInt8,
20 | dc.UInt16,
21 | dc.UInt32,
22 | dc.UInt64,
23 | dc.String,
24 | ]
25 | INTEGER_COLUMN_TYPES: list[type[dc.Column]] = [
26 | dc.Integer,
27 | dc.Int8,
28 | dc.Int16,
29 | dc.Int32,
30 | dc.Int64,
31 | dc.UInt8,
32 | dc.UInt16,
33 | dc.UInt32,
34 | dc.UInt64,
35 | ]
36 | FLOAT_COLUMN_TYPES: list[type[dc.Column]] = [
37 | dc.Float,
38 | dc.Float32,
39 | dc.Float64,
40 | ]
41 |
42 | SUPERTYPE_COLUMN_TYPES: list[type[dc.Column]] = [
43 | dc.Float,
44 | dc.Integer,
45 | ]
46 |
47 | ALL_COLUMN_TYPES: list[type[dc.Column]] = (
48 | [dc.Any] + COLUMN_TYPES + SUPERTYPE_COLUMN_TYPES
49 | )
50 |
51 | # The following is a list of column types that, when created with default parameter values, add no validation rules.
52 | NO_VALIDATION_COLUMN_TYPES: list[type[dc.Column]] = [
53 | t for t in ALL_COLUMN_TYPES if t not in FLOAT_COLUMN_TYPES
54 | ]
55 |
--------------------------------------------------------------------------------
/dataframely/testing/factory.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | from typing import Any
5 |
6 | from dataframely._filter import Filter
7 | from dataframely._rule import Rule
8 | from dataframely._typing import LazyFrame
9 | from dataframely.collection import Collection
10 | from dataframely.columns import Column
11 | from dataframely.schema import Schema
12 |
13 |
14 | def create_schema(
15 | name: str,
16 | columns: dict[str, Column],
17 | rules: dict[str, Rule] | None = None,
18 | ) -> type[Schema]:
19 | """Dynamically create a new schema with the provided name.
20 |
21 | Args:
22 | name: The name of the schema.
23 | columns: The columns to set on the schema. When properly defining the schema,
24 | this would be the annotations that define the column types.
25 | rules: The custom non-column-specific validation rules. When properly defining
26 | the schema, this would be the functions annotated with ``@dy.rule``.
27 |
28 | Returns:
29 | The dynamically created schema.
30 | """
31 | return type(name, (Schema,), {**columns, **(rules or {})})
32 |
33 |
34 | def create_collection(
35 | name: str,
36 | schemas: dict[str, type[Schema]],
37 | filters: dict[str, Filter] | None = None,
38 | *,
39 | annotation_base_class: type = LazyFrame,
40 | ) -> type[Collection]:
41 | return create_collection_raw(
42 | name,
43 | annotations={
44 | name: annotation_base_class[schema] # type: ignore
45 | for name, schema in schemas.items()
46 | },
47 | filters=filters,
48 | )
49 |
50 |
51 | def create_collection_raw(
52 | name: str,
53 | annotations: dict[str, Any],
54 | filters: dict[str, Filter] | None = None,
55 | ) -> type[Collection]:
56 | return type(
57 | name,
58 | (Collection,),
59 | {
60 | "__annotations__": annotations,
61 | **(filters or {}),
62 | },
63 | )
64 |
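Usage note: a sketch of building a throwaway schema, mirroring how the test suite uses the factory (the column parameters are illustrative):

    import dataframely.columns as dc
    from dataframely.testing import create_schema

    MySchema = create_schema(
        "MySchema",
        columns={"id": dc.Int64(primary_key=True), "name": dc.String(nullable=True)},
    )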
--------------------------------------------------------------------------------
/dataframely/testing/mask.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import polars as pl
5 |
6 | from dataframely.failure import FailureInfo
7 |
8 |
9 | def validation_mask(df: pl.DataFrame | pl.LazyFrame, failure: FailureInfo) -> pl.Series:
10 | """Build a validation mask for the left data frame based on the failure info.
11 |
12 | Args:
13 | df: The data frame for whose rows to generate the validation mask.
14 | failure: The failure object whose information should be used to determine
15 | which rows of the input data frame are invalid.
16 |
17 | Returns:
 18 |         A series with the same length as the input data frame where a value of
 19 |         ``True`` indicates a valid row and ``False`` an invalid one.
20 |
21 | Raises:
 22 |         ValueError: If a column with a struct or nested list dtype is contained
 23 |             in the input. As of polars v1.1.0, neither of these works reliably.
24 | """
25 | if any(
26 | isinstance(dtype, pl.List) and isinstance(dtype.inner, pl.List)
27 | for dtype in df.collect_schema().dtypes()
28 | ): # pragma: no cover
29 | raise ValueError("`validation_mask` currently does not allow for nested lists.")
30 | if any(
31 | isinstance(dtype, pl.Struct) for dtype in df.collect_schema().dtypes()
32 | ): # pragma: no cover
33 | raise ValueError("`validation_mask` currently does not allow for structs.")
34 |
35 | return (
36 | df.lazy()
37 | .collect()
38 | .join(
39 | failure.invalid().unique().with_columns(__marker__=pl.lit(True)),
40 | on=list(df.collect_schema()),
41 | how="left",
42 | nulls_equal=True,
43 | )
44 | .select(pl.col("__marker__").is_null())
45 | .to_series()
46 | )
47 |
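Usage note: a sketch of the intended use, assuming the ``Schema.filter`` API exercised by ``tests/schema/test_filter.py``, which yields the valid rows together with a :class:`FailureInfo`:

    import polars as pl
    import dataframely as dy
    from dataframely.testing import validation_mask

    class S(dy.Schema):
        a = dy.Int64(nullable=False)

    df = pl.DataFrame({"a": [1, None]})
    good, failure = S.filter(df)         # assumed signature, see tests/schema/test_filter.py
    print(validation_mask(df, failure))  # [true, false]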
--------------------------------------------------------------------------------
/dataframely/testing/rules.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import polars as pl
5 |
6 | from dataframely._rule import Rule, with_evaluation_rules
7 |
8 |
9 | def rules_from_exprs(exprs: dict[str, pl.Expr]) -> dict[str, Rule]:
10 | """Turn a set of expressions into simple rules.
11 |
12 | Args:
13 | exprs: The expressions, mapping from names to :class:`polars.Expr`.
14 |
15 | Returns:
16 | The rules corresponding to the expressions.
17 | """
18 | return {name: Rule(expr) for name, expr in exprs.items()}
19 |
20 |
21 | def evaluate_rules(lf: pl.LazyFrame, rules: dict[str, Rule]) -> pl.LazyFrame:
22 | """Evaluate the provided rules and return the rules' evaluation.
23 |
24 | Args:
25 | lf: The data frame on which to evaluate the rules.
26 | rules: The rules to evaluate where the key of the dictionary provides the name
27 | of the rule.
28 |
29 | Returns:
 30 |         The same return value as :meth:`with_evaluation_rules`, except that the
 31 |         columns of the input data frame are dropped.
32 | """
33 | return with_evaluation_rules(lf, rules).drop(lf.collect_schema())
34 |
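For illustration, a minimal sketch:

    import polars as pl
    from dataframely.testing import evaluate_rules, rules_from_exprs

    lf = pl.LazyFrame({"a": [1, -2, 3]})
    rules = rules_from_exprs({"positive": pl.col("a") > 0})

    # Yields the boolean rule column "positive"; the input column "a" is dropped.
    print(evaluate_rules(lf, rules).collect())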
--------------------------------------------------------------------------------
/dataframely/testing/typing.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | from dataframely import (
5 | Any,
6 | Date,
7 | Datetime,
8 | Decimal,
9 | Enum,
10 | Float32,
11 | Int64,
12 | List,
13 | Schema,
14 | Struct,
15 | )
16 |
17 |
18 | class MyImportedBaseSchema(Schema):
19 | a = Int64()
20 |
21 |
22 | class MyImportedSchema(MyImportedBaseSchema):
23 | b = Float32()
24 | c = Enum(["a", "b", "c"])
25 | d = Struct({"a": Int64(), "b": Struct({"c": Enum(["a", "b"])})})
26 | e = List(Struct({"a": Int64()}))
27 | f = Datetime()
28 | g = Date()
29 | h = Any()
30 | some_decimal = Decimal(12, 8)
31 |
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: "3"
2 | services:
3 | mssql:
4 | image: mcr.microsoft.com/azure-sql-edge:latest
5 | environment:
  6 |       ACCEPT_EULA: "Y"
7 | MSSQL_USER: sa
8 | SA_PASSWORD: P@ssword1
9 | ports:
10 | - "1455:1433"
11 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/_api/dataframely.collection.rst:
--------------------------------------------------------------------------------
1 | dataframely.collection module
2 | =============================
3 |
4 | .. automodule:: dataframely.collection
5 | :members:
6 | :show-inheritance:
7 | :undoc-members:
8 |
--------------------------------------------------------------------------------
/docs/_api/dataframely.columns.any.rst:
--------------------------------------------------------------------------------
1 | dataframely.columns.any module
2 | ==============================
3 |
4 | .. automodule:: dataframely.columns.any
5 | :members:
6 | :show-inheritance:
7 | :undoc-members:
8 |
--------------------------------------------------------------------------------
/docs/_api/dataframely.columns.bool.rst:
--------------------------------------------------------------------------------
1 | dataframely.columns.bool module
2 | ===============================
3 |
4 | .. automodule:: dataframely.columns.bool
5 | :members:
6 | :show-inheritance:
7 | :undoc-members:
8 |
--------------------------------------------------------------------------------
/docs/_api/dataframely.columns.datetime.rst:
--------------------------------------------------------------------------------
1 | dataframely.columns.datetime module
2 | ===================================
3 |
4 | .. automodule:: dataframely.columns.datetime
5 | :members:
6 | :show-inheritance:
7 | :undoc-members:
8 |
--------------------------------------------------------------------------------
/docs/_api/dataframely.columns.decimal.rst:
--------------------------------------------------------------------------------
1 | dataframely.columns.decimal module
2 | ==================================
3 |
4 | .. automodule:: dataframely.columns.decimal
5 | :members:
6 | :show-inheritance:
7 | :undoc-members:
8 |
--------------------------------------------------------------------------------
/docs/_api/dataframely.columns.enum.rst:
--------------------------------------------------------------------------------
1 | dataframely.columns.enum module
2 | ===============================
3 |
4 | .. automodule:: dataframely.columns.enum
5 | :members:
6 | :show-inheritance:
7 | :undoc-members:
8 |
--------------------------------------------------------------------------------
/docs/_api/dataframely.columns.float.rst:
--------------------------------------------------------------------------------
1 | dataframely.columns.float module
2 | ================================
3 |
4 | .. automodule:: dataframely.columns.float
5 | :members:
6 | :show-inheritance:
7 | :undoc-members:
8 |
--------------------------------------------------------------------------------
/docs/_api/dataframely.columns.integer.rst:
--------------------------------------------------------------------------------
1 | dataframely.columns.integer module
2 | ==================================
3 |
4 | .. automodule:: dataframely.columns.integer
5 | :members:
6 | :show-inheritance:
7 | :undoc-members:
8 |
--------------------------------------------------------------------------------
/docs/_api/dataframely.columns.list.rst:
--------------------------------------------------------------------------------
1 | dataframely.columns.list module
2 | ===============================
3 |
4 | .. automodule:: dataframely.columns.list
5 | :members:
6 | :show-inheritance:
7 | :undoc-members:
8 |
--------------------------------------------------------------------------------
/docs/_api/dataframely.columns.rst:
--------------------------------------------------------------------------------
1 | dataframely.columns package
2 | ===========================
3 |
4 | .. automodule:: dataframely.columns
5 | :members:
6 | :show-inheritance:
7 | :undoc-members:
8 |
9 | Submodules
10 | ----------
11 |
12 | dataframely.columns.any module
13 | ------------------------------
14 |
15 | .. automodule:: dataframely.columns.any
16 | :members:
17 | :show-inheritance:
18 | :undoc-members:
19 |
20 | dataframely.columns.array module
21 | --------------------------------
22 |
23 | .. automodule:: dataframely.columns.array
24 | :members:
25 | :show-inheritance:
26 | :undoc-members:
27 |
28 | dataframely.columns.bool module
29 | -------------------------------
30 |
31 | .. automodule:: dataframely.columns.bool
32 | :members:
33 | :show-inheritance:
34 | :undoc-members:
35 |
36 | dataframely.columns.datetime module
37 | -----------------------------------
38 |
39 | .. automodule:: dataframely.columns.datetime
40 | :members:
41 | :show-inheritance:
42 | :undoc-members:
43 |
44 | dataframely.columns.decimal module
45 | ----------------------------------
46 |
47 | .. automodule:: dataframely.columns.decimal
48 | :members:
49 | :show-inheritance:
50 | :undoc-members:
51 |
52 | dataframely.columns.enum module
53 | -------------------------------
54 |
55 | .. automodule:: dataframely.columns.enum
56 | :members:
57 | :show-inheritance:
58 | :undoc-members:
59 |
60 | dataframely.columns.float module
61 | --------------------------------
62 |
63 | .. automodule:: dataframely.columns.float
64 | :members:
65 | :show-inheritance:
66 | :undoc-members:
67 |
68 | dataframely.columns.integer module
69 | ----------------------------------
70 |
71 | .. automodule:: dataframely.columns.integer
72 | :members:
73 | :show-inheritance:
74 | :undoc-members:
75 |
76 | dataframely.columns.list module
77 | -------------------------------
78 |
79 | .. automodule:: dataframely.columns.list
80 | :members:
81 | :show-inheritance:
82 | :undoc-members:
83 |
84 | dataframely.columns.object module
85 | ---------------------------------
86 |
87 | .. automodule:: dataframely.columns.object
88 | :members:
89 | :show-inheritance:
90 | :undoc-members:
91 |
92 | dataframely.columns.string module
93 | ---------------------------------
94 |
95 | .. automodule:: dataframely.columns.string
96 | :members:
97 | :show-inheritance:
98 | :undoc-members:
99 |
100 | dataframely.columns.struct module
101 | ---------------------------------
102 |
103 | .. automodule:: dataframely.columns.struct
104 | :members:
105 | :show-inheritance:
106 | :undoc-members:
107 |
--------------------------------------------------------------------------------
/docs/_api/dataframely.columns.string.rst:
--------------------------------------------------------------------------------
1 | dataframely.columns.string module
2 | =================================
3 |
4 | .. automodule:: dataframely.columns.string
5 | :members:
6 | :show-inheritance:
7 | :undoc-members:
8 |
--------------------------------------------------------------------------------
/docs/_api/dataframely.columns.struct.rst:
--------------------------------------------------------------------------------
1 | dataframely.columns.struct module
2 | =================================
3 |
4 | .. automodule:: dataframely.columns.struct
5 | :members:
6 | :show-inheritance:
7 | :undoc-members:
8 |
--------------------------------------------------------------------------------
/docs/_api/dataframely.config.rst:
--------------------------------------------------------------------------------
1 | dataframely.config module
2 | =========================
3 |
4 | .. automodule:: dataframely.config
5 | :members:
6 | :show-inheritance:
7 | :undoc-members:
8 |
--------------------------------------------------------------------------------
/docs/_api/dataframely.exc.rst:
--------------------------------------------------------------------------------
1 | dataframely.exc module
2 | ======================
3 |
4 | .. automodule:: dataframely.exc
5 | :members:
6 | :show-inheritance:
7 | :undoc-members:
8 |
--------------------------------------------------------------------------------
/docs/_api/dataframely.failure.rst:
--------------------------------------------------------------------------------
1 | dataframely.failure module
2 | ==========================
3 |
4 | .. automodule:: dataframely.failure
5 | :members:
6 | :show-inheritance:
7 | :undoc-members:
8 |
--------------------------------------------------------------------------------
/docs/_api/dataframely.functional.rst:
--------------------------------------------------------------------------------
1 | dataframely.functional module
2 | =============================
3 |
4 | .. automodule:: dataframely.functional
5 | :members:
6 | :show-inheritance:
7 | :undoc-members:
8 |
--------------------------------------------------------------------------------
/docs/_api/dataframely.mypy.rst:
--------------------------------------------------------------------------------
1 | dataframely.mypy module
2 | =======================
3 |
4 | .. automodule:: dataframely.mypy
5 | :members:
6 | :show-inheritance:
7 | :undoc-members:
8 |
--------------------------------------------------------------------------------
/docs/_api/dataframely.random.rst:
--------------------------------------------------------------------------------
1 | dataframely.random module
2 | =========================
3 |
4 | .. automodule:: dataframely.random
5 | :members:
6 | :show-inheritance:
7 | :undoc-members:
8 |
--------------------------------------------------------------------------------
/docs/_api/dataframely.rst:
--------------------------------------------------------------------------------
1 | dataframely package
2 | ===================
3 |
4 | .. automodule:: dataframely
5 | :members:
6 | :show-inheritance:
7 | :undoc-members:
8 |
9 | Subpackages
10 | -----------
11 |
12 | .. toctree::
13 | :maxdepth: 4
14 |
15 | dataframely.columns
16 | dataframely.testing
17 |
18 | Submodules
19 | ----------
20 |
21 | dataframely.collection module
22 | -----------------------------
23 |
24 | .. automodule:: dataframely.collection
25 | :members:
26 | :show-inheritance:
27 | :undoc-members:
28 |
29 | dataframely.config module
30 | -------------------------
31 |
32 | .. automodule:: dataframely.config
33 | :members:
34 | :show-inheritance:
35 | :undoc-members:
36 |
37 | dataframely.exc module
38 | ----------------------
39 |
40 | .. automodule:: dataframely.exc
41 | :members:
42 | :show-inheritance:
43 | :undoc-members:
44 |
45 | dataframely.failure module
46 | --------------------------
47 |
48 | .. automodule:: dataframely.failure
49 | :members:
50 | :show-inheritance:
51 | :undoc-members:
52 |
53 | dataframely.functional module
54 | -----------------------------
55 |
56 | .. automodule:: dataframely.functional
57 | :members:
58 | :show-inheritance:
59 | :undoc-members:
60 |
61 | dataframely.mypy module
62 | -----------------------
63 |
64 | .. automodule:: dataframely.mypy
65 | :members:
66 | :show-inheritance:
67 | :undoc-members:
68 |
69 | dataframely.random module
70 | -------------------------
71 |
72 | .. automodule:: dataframely.random
73 | :members:
74 | :show-inheritance:
75 | :undoc-members:
76 |
77 | dataframely.schema module
78 | -------------------------
79 |
80 | .. automodule:: dataframely.schema
81 | :members:
82 | :show-inheritance:
83 | :undoc-members:
84 |
--------------------------------------------------------------------------------
/docs/_api/dataframely.schema.rst:
--------------------------------------------------------------------------------
1 | dataframely.schema module
2 | =========================
3 |
4 | .. automodule:: dataframely.schema
5 | :members:
6 | :show-inheritance:
7 | :undoc-members:
8 |
--------------------------------------------------------------------------------
/docs/_api/dataframely.testing.const.rst:
--------------------------------------------------------------------------------
1 | dataframely.testing.const module
2 | ================================
3 |
4 | .. automodule:: dataframely.testing.const
5 | :members:
6 | :show-inheritance:
7 | :undoc-members:
8 |
--------------------------------------------------------------------------------
/docs/_api/dataframely.testing.factory.rst:
--------------------------------------------------------------------------------
1 | dataframely.testing.factory module
2 | ==================================
3 |
4 | .. automodule:: dataframely.testing.factory
5 | :members:
6 | :show-inheritance:
7 | :undoc-members:
8 |
--------------------------------------------------------------------------------
/docs/_api/dataframely.testing.mask.rst:
--------------------------------------------------------------------------------
1 | dataframely.testing.mask module
2 | ===============================
3 |
4 | .. automodule:: dataframely.testing.mask
5 | :members:
6 | :show-inheritance:
7 | :undoc-members:
8 |
--------------------------------------------------------------------------------
/docs/_api/dataframely.testing.rst:
--------------------------------------------------------------------------------
1 | dataframely.testing package
2 | ===========================
3 |
4 | .. automodule:: dataframely.testing
5 | :members:
6 | :show-inheritance:
7 | :undoc-members:
8 |
9 | Submodules
10 | ----------
11 |
12 | dataframely.testing.const module
13 | --------------------------------
14 |
15 | .. automodule:: dataframely.testing.const
16 | :members:
17 | :show-inheritance:
18 | :undoc-members:
19 |
20 | dataframely.testing.factory module
21 | ----------------------------------
22 |
23 | .. automodule:: dataframely.testing.factory
24 | :members:
25 | :show-inheritance:
26 | :undoc-members:
27 |
28 | dataframely.testing.mask module
29 | -------------------------------
30 |
31 | .. automodule:: dataframely.testing.mask
32 | :members:
33 | :show-inheritance:
34 | :undoc-members:
35 |
36 | dataframely.testing.rules module
37 | --------------------------------
38 |
39 | .. automodule:: dataframely.testing.rules
40 | :members:
41 | :show-inheritance:
42 | :undoc-members:
43 |
44 | dataframely.testing.typing module
45 | ---------------------------------
46 |
47 | .. automodule:: dataframely.testing.typing
48 | :members:
49 | :show-inheritance:
50 | :undoc-members:
51 |
--------------------------------------------------------------------------------
/docs/_api/dataframely.testing.rules.rst:
--------------------------------------------------------------------------------
1 | dataframely.testing.rules module
2 | ================================
3 |
4 | .. automodule:: dataframely.testing.rules
5 | :members:
6 | :show-inheritance:
7 | :undoc-members:
8 |
--------------------------------------------------------------------------------
/docs/_api/dataframely.testing.typing.rst:
--------------------------------------------------------------------------------
1 | dataframely.testing.typing module
2 | =================================
3 |
4 | .. automodule:: dataframely.testing.typing
5 | :members:
6 | :show-inheritance:
7 | :undoc-members:
8 |
--------------------------------------------------------------------------------
/docs/_api/modules.rst:
--------------------------------------------------------------------------------
1 | dataframely
2 | ===========
3 |
4 | .. toctree::
5 | :maxdepth: 4
6 |
7 | dataframely
8 |
--------------------------------------------------------------------------------
/docs/_static/custom.css:
--------------------------------------------------------------------------------
1 | #furo-main-content .caption-text {
2 | display: none;
3 | }
4 |
--------------------------------------------------------------------------------
/docs/_static/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Quantco/dataframely/1a8f64e031586416677954f12ed41276b4325945/docs/_static/favicon.ico
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | # Configuration file for the Sphinx documentation builder.
5 | #
6 | # This file only contains a selection of the most common options. For a full
7 | # list see the documentation:
8 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
9 |
10 | # -- Path setup --------------------------------------------------------------
11 |
12 | # If extensions (or modules to document with autodoc) are in another directory,
13 | # add these directories to sys.path here. If the directory is relative to the
14 | # documentation root, use os.path.abspath to make it absolute, like shown here.
15 | #
16 | # import os
17 | # import sys
18 | # sys.path.insert(0, os.path.abspath('.'))
19 |
20 |
21 | # -- Project information -----------------------------------------------------
22 |
23 | import datetime
24 | import importlib
25 | import inspect
26 | import os
27 | import subprocess
28 | import sys
29 | from subprocess import CalledProcessError
30 | from typing import cast
31 |
32 | _mod = importlib.import_module("dataframely")
33 |
34 |
35 | project = "dataframely"
36 | copyright = f"{datetime.date.today().year}, QuantCo, Inc"
37 | author = "QuantCo, Inc."
38 |
39 | extensions = [
40 | "nbsphinx",
41 | "numpydoc",
42 | "sphinx_copybutton",
43 | "sphinx.ext.autodoc",
44 | "sphinx.ext.linkcode",
45 | "sphinxcontrib.apidoc",
46 | ]
47 |
48 | numpydoc_class_members_toctree = False
49 |
50 | apidoc_module_dir = "../dataframely"
51 | apidoc_output_dir = "_api"
52 | apidoc_module_first = True
53 | apidoc_extra_args = ["--implicit-namespaces"]
54 |
55 | autodoc_default_options = {
56 | "inherited-members": True,
57 | }
58 |
59 | templates_path = ["_templates"]
60 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
61 | html_theme = "furo"
62 | html_title = "Dataframely"
63 | html_static_path = ["_static"]
64 | html_css_files = ["custom.css"]
65 | html_favicon = "_static/favicon.ico"
66 |
67 |
68 | # Copied and adapted from
69 | # https://github.com/pandas-dev/pandas/blob/4a14d064187367cacab3ff4652a12a0e45d0711b/doc/source/conf.py#L613-L659
70 | # Required configuration function to use sphinx.ext.linkcode
71 | def linkcode_resolve(domain: str, info: dict[str, str]) -> str | None:
72 | """Determine the URL corresponding to a given Python object."""
73 | if domain != "py":
74 | return None
75 |
76 | module_name = info["module"]
77 | full_name = info["fullname"]
78 |
79 | _submodule = sys.modules.get(module_name)
80 | if _submodule is None:
81 | return None
82 |
83 | _object = _submodule
84 | for _part in full_name.split("."):
85 | try:
86 | _object = getattr(_object, _part)
87 | except AttributeError:
88 | return None
89 |
90 | try:
91 | fn = inspect.getsourcefile(inspect.unwrap(_object)) # type: ignore
92 | except TypeError:
93 | fn = None
94 | if not fn:
95 | return None
96 |
97 | try:
98 | source, line_number = inspect.getsourcelines(_object)
99 | except OSError:
100 | line_number = None
101 |
102 | if line_number:
103 | linespec = f"#L{line_number}-L{line_number + len(source) - 1}"
104 | else:
105 | linespec = ""
106 |
107 | fn = os.path.relpath(fn, start=os.path.dirname(cast(str, _mod.__file__)))
108 |
109 | try:
110 | # See https://stackoverflow.com/a/21901260
111 | commit = (
112 | subprocess.check_output(["git", "rev-parse", "HEAD"])
113 | .decode("ascii")
114 | .strip()
115 | )
116 | except CalledProcessError:
117 | # If subprocess returns non-zero exit status
118 | commit = "main"
119 |
120 | return (
121 | "https://github.com/quantco/dataframely"
122 | f"/blob/{commit}/{_mod.__name__.replace('.', '/')}/{fn}{linespec}"
123 | )
124 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | Dataframely
2 | ============
3 |
  4 | Dataframely is a Python package to validate the schema and content of `polars <https://pola.rs/>`_ data frames.
5 | Its purpose is to make data pipelines more robust by ensuring that data meet expectations and more readable by adding schema information to data frame type hints.
6 |
7 | Features
8 | --------
9 |
10 | - Declaratively define schemas as classes with arbitrary inheritance structure (see the sketch below)
11 | - Specify column-specific validation rules (e.g. nullability, minimum string length, ...)
12 | - Specify cross-column and group validation rules with built-in support for checking the primary key property of a column set
13 | - Specify validation constraints across collections of interdependent data frames
14 | - Validate data frames softly by filtering out rows that violate rules instead of failing hard
15 | - Introspect validation failure information for run-time failures
16 | - Enhanced type hints for validated data frames, allowing users to clearly express expectations about inputs and outputs (i.e., contracts) in data pipelines
17 | - Integrate schemas with external tools (e.g., ``sqlalchemy`` or ``pyarrow``)
18 | - Generate test data that comply with a schema or collection of schemas and their validation rules
19 |
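A minimal sketch of the first few features, using only names that are part of the public API exercised in this repository's test suite (``dy.Schema``, ``dy.Integer``, ``dy.String``, ``Schema.filter``, ``Schema.is_valid``); the exact failure counts are indicative:

.. code-block:: python

   import polars as pl

   import dataframely as dy


   class UserSchema(dy.Schema):
       user_id = dy.Integer(primary_key=True)
       name = dy.String(nullable=False, min_length=1)


   df = pl.DataFrame({"user_id": [1, 1, 2], "name": ["ann", "bob", None]})
   good, failures = UserSchema.filter(df)  # soft: rows violating rules are dropped
   assert not UserSchema.is_valid(df)  # strict: the frame as a whole is invalid
   print(failures.counts())  # e.g. {"primary_key": 2, "name|nullability": 1}
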
20 | Contents
21 | ========
22 |
23 | .. toctree::
24 | :caption: Contents
25 | :maxdepth: 1
26 |
27 | Installation <sites/installation>
28 | Quickstart <sites/quickstart>
29 | Real-world Example <sites/examples/real-world>
30 | FAQ <sites/faq>
31 | Development Guide <sites/development>
32 | Versioning <sites/versioning>
33 |
34 | API Documentation
35 | =================
36 |
37 | .. toctree::
38 | :caption: API Documentation
39 | :maxdepth: 1
40 |
41 | Collection <_api/dataframely.collection>
42 | Column Types <_api/dataframely.columns>
43 | Config <_api/dataframely.config>
44 | Random Data Generation <_api/dataframely.random>
45 | Failure Information <_api/dataframely.failure>
46 | Schema <_api/dataframely.schema>
47 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/sites/development.rst:
--------------------------------------------------------------------------------
1 | Development
2 | ===========
3 |
4 |
5 | Thanks for deciding to work on ``dataframely``!
6 | You can create a development environment with the following steps:
7 |
8 | Environment Installation
9 | ------------------------
10 |
11 | .. code-block:: bash
12 |
13 | git clone https://github.com/Quantco/dataframely
14 | cd dataframely
15 | pixi install
16 |
17 | Next, make sure to install the package locally and set up the pre-commit hooks:
18 |
19 | .. code-block:: bash
20 |
21 | pixi run postinstall
22 | pixi run pre-commit-install
23 |
24 |
25 | Running the tests
26 | -----------------
27 |
28 | .. code-block:: bash
29 |
30 | pixi run test
31 |
32 |
33 | To run a subset of the tests, you can pass a specific directory or module under ``tests/`` to the command; assuming your ``pixi`` version forwards extra task arguments to ``pytest``, for example:
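
.. code-block:: bash

   # Run only the collection tests; the trailing path is forwarded to pytest.
   pixi run test tests/collection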
34 |
35 |
36 | Building the Documentation
37 | --------------------------
38 |
39 | When updating the documentation, you can compile a local build of the
40 | documentation and then open it in your web browser using the commands below:
41 |
42 | .. code-block:: bash
43 |
44 | # Run build
45 | pixi run -e docs postinstall
46 | pixi run docs
47 |
48 | # Open documentation
49 | open docs/_build/html/index.html
50 |
--------------------------------------------------------------------------------
/docs/sites/faq.rst:
--------------------------------------------------------------------------------
1 | FAQ
2 | ===
3 |
4 | Whenever you come across something that surprised you or required some non-trivial
5 | thinking, please add it here.
6 |
--------------------------------------------------------------------------------
/docs/sites/installation.rst:
--------------------------------------------------------------------------------
1 | Installation
2 | ============
3 |
4 | To install ``dataframely``, use your favorite package manager, e.g., ``pixi`` or ``pip``:
5 |
6 | .. code:: bash
7 |
8 | pixi add dataframely
9 | pip install dataframely
10 |
--------------------------------------------------------------------------------
/docs/sites/versioning.rst:
--------------------------------------------------------------------------------
1 | Versioning policy and breaking changes
2 | ======================================
3 |
4 | Dataframely uses `semantic versioning <https://semver.org/>`_.
5 | This versioning scheme is designed to make it easy for users to anticipate what kinds of changes to expect from a given version update of their dependencies.
6 | We generally recommend that users take measures to control dependency versions. We like to use ``pixi`` as a package manager, which comes with built-in
7 | support for lockfiles. Many other package managers offer similar functionality. When updating lockfiles, we recommend using automated testing
8 | to ensure that user code still works with newer versions of dependencies such as ``dataframely``.
9 |
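For instance, a downstream project using ``pixi`` could pin ``dataframely`` to a compatible release range in its manifest (a hypothetical snippet; the dependency syntax matches the ``pixi.toml`` in this repository):

.. code-block:: toml

   [dependencies]
   # Pick up fixes and minor features automatically, but hold back the next major release.
   dataframely = ">=1.0,<2"
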
10 | Most importantly, semantic versioning implies that breaking changes to user-facing functionality are only introduced in **major releases**.
11 | We therefore recommend that users be particularly vigilant when updating their environments to a newer major release of ``dataframely``.
12 | As always, automated testing is useful here, but we also recommend checking the release notes `published on GitHub <https://github.com/Quantco/dataframely/releases>`_.
13 |
14 | In order to give users a heads-up before breaking changes are released, we introduce `FutureWarnings <https://docs.python.org/3/library/exceptions.html#FutureWarning>`_.
15 | Warnings are the most effective tool at our disposal for reaching users directly.
16 | We therefore generally recommend that users do not silence such warnings explicitly, but instead migrate their code proactively whenever possible.
17 | However, we also understand that the need for migration may catch users at an inconvenient time, and a temporary band-aid solution might be required.
18 | Users can disable ``FutureWarnings`` either through `Python builtins <https://docs.python.org/3/library/warnings.html>`_,
19 | builtins from tools like `pytest <https://docs.pytest.org/en/stable/how-to/capture-warnings.html>`_,
20 | or by setting the ``DATAFRAMELY_NO_FUTURE_WARNINGS`` environment variable to ``true`` or ``1``.
21 |
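As a sketch, the three escape hatches look as follows; the ``pytest`` filter mirrors the ``filterwarnings`` entry in this repository's own ``pyproject.toml``:

.. code-block:: python

   import warnings

   # Option 1: Python builtins -- silence FutureWarnings for the current process.
   warnings.filterwarnings("ignore", category=FutureWarning)

   # Option 2: pytest -- declare a filter in pyproject.toml:
   #   [tool.pytest.ini_options]
   #   filterwarnings = ["ignore::FutureWarning"]

   # Option 3: dataframely-specific, via the environment:
   #   export DATAFRAMELY_NO_FUTURE_WARNINGS=true
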
--------------------------------------------------------------------------------
/pixi.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | channels = ["conda-forge"]
3 | description = "A declarative, polars-native data frame validation library."
4 | name = "dataframely"
5 | platforms = ["linux-64", "linux-aarch64", "osx-64", "osx-arm64", "win-64"]
6 |
7 | [tasks]
8 | postinstall = "pip install --no-build-isolation --no-deps --disable-pip-version-check -e ."
9 |
10 | [dependencies]
11 | python = ">=3.11"
12 | rust = "=1.85"
13 |
14 | numpy = "*"
15 | polars = ">=1.12"
16 |
17 | [host-dependencies]
18 | maturin = ">=1.7,<2"
19 | pip = "*"
20 |
21 | [feature.dev.dependencies]
22 | jupyter = "*"
23 | pandas = "*"
24 | scikit-learn = "*"
25 |
26 | [feature.docs.dependencies]
27 | furo = "*"
28 | ipython = "*"
29 | make = "*"
30 | # Needed for generating docs for dataframely.mypy
31 | mypy = "*"
32 | nbsphinx = "*"
33 | numpydoc = "*"
34 | sphinx = "*"
35 | sphinx-copybutton = "*"
36 | sphinx_rtd_theme = "*"
37 | sphinxcontrib-apidoc = "*"
38 |
39 | [feature.docs.tasks]
40 | docs = "cd docs && make html"
41 | readthedocs = "rm -rf $READTHEDOCS_OUTPUT/html && cp -r docs/_build/html $READTHEDOCS_OUTPUT/html"
42 |
43 | [feature.test.dependencies]
44 | mypy = ">=1.13"
45 | pyarrow = "*"
46 | pyodbc = "*"
47 | pytest = ">=6"
48 | pytest-cov = "*"
49 | pytest-md = "*"
50 | sqlalchemy = ">=2"
51 |
52 | [feature.test.tasks]
53 | test = "pytest"
54 | test-coverage = "pytest --cov=dataframely --cov-report=xml --cov-report=term-missing"
55 |
56 | [feature.build.dependencies]
57 | python-build = "*"
58 | setuptools-scm = "*"
59 | twine = "*"
60 | wheel = "*"
61 | [feature.build.target.unix.dependencies]
62 | sed = "*"
63 |
64 | [feature.build.tasks]
65 | build-sdist = "python -m build --sdist --no-isolation ."
66 | build-wheel = "python -m build --wheel --no-isolation ."
67 | check-wheel = "twine check dist/*"
68 | set-version = "sed -i \"s/0.0.0/$(python -m setuptools_scm)/\" pyproject.toml"
69 |
70 | [feature.lint.dependencies]
71 | docformatter = "*"
72 | insert-license-header = "*"
73 | pre-commit = "*"
74 | pre-commit-hooks = "*"
75 | prettier = "*"
76 | ruff = "*"
77 | taplo = "*"
78 | typos = "*"
79 | [feature.lint.tasks]
80 | pre-commit-install = "pre-commit install"
81 | pre-commit-run = "pre-commit run -a"
82 |
83 | [feature.py311.dependencies]
84 | python = "3.11.*"
85 |
86 | [feature.py312.dependencies]
87 | python = "3.12.*"
88 |
89 | [feature.py313.dependencies]
90 | python = "3.13.*"
91 |
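# Each environment combines the feature sets defined above; `no-default-feature` keeps the lint environment minimal.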
92 | [environments]
93 | build = ["build"]
94 | default = ["dev", "lint", "test"]
95 | docs = ["docs"]
96 | lint = { features = ["lint"], no-default-feature = true }
97 | py311 = ["py311", "test"]
98 | py312 = ["py312", "test"]
99 | py313 = ["py313", "test"]
100 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | build-backend = "maturin"
3 | requires = ["maturin>=1.7,<2.0"]
4 |
5 | [project]
6 | authors = [
7 | { name = "Andreas Albert", email = "andreas.albert@quantco.com" },
8 | { name = "Daniel Elsner", email = "daniel.elsner@quantco.com" },
9 | { name = "Oliver Borchert", email = "oliver.borchert@quantco.com" },
10 | ]
11 | classifiers = [
12 | "Programming Language :: Python :: 3",
13 | "Programming Language :: Python :: 3.11",
14 | "Programming Language :: Python :: 3.12",
15 | "Programming Language :: Python :: 3.13",
16 | ]
17 | dependencies = ["numpy", "polars>=1.12"]
18 | description = "A declarative, polars-native data frame validation library"
19 | name = "dataframely"
20 | readme = "README.md"
21 | requires-python = ">=3.11"
22 | version = "0.0.0"
23 |
24 | [project.urls]
25 | Documentation = "https://dataframely.readthedocs.io/"
26 | Repository = "https://github.com/quantco/dataframely"
27 |
28 | [tool.maturin]
29 | module-name = "dataframely._extre"
30 | profile = "release"
31 |
32 | [tool.setuptools.packages.find]
33 | include = ["dataframely"]
34 | namespaces = false
35 |
36 | [project.scripts]
37 |
38 | [tool.docformatter]
39 | black = true # only sets the style options to the default values of black
40 |
41 | [tool.ruff]
42 | line-length = 88
43 |
44 | [tool.ruff.lint]
45 | ignore = [
46 | "E501", # https://docs.astral.sh/ruff/faq/#is-the-ruff-linter-compatible-with-black
47 | "N803", # https://docs.astral.sh/ruff/rules/invalid-argument-name
48 | "N806", # https://docs.astral.sh/ruff/rules/non-lowercase-variable-in-function
49 | ]
50 | select = [
51 | # pyflakes
52 | "F",
53 | # pycodestyle
54 | "E",
55 | "W",
56 | # isort
57 | "I",
58 | # pep8-naming
59 | "N",
60 | # pyupgrade
61 | "UP",
62 | ]
63 |
64 | [tool.ruff.format]
65 | indent-style = "space"
66 | quote-style = "double"
67 |
68 | [tool.mypy]
69 | check_untyped_defs = true
70 | disallow_untyped_defs = true
71 | exclude = ["docs/"]
72 | explicit_package_bases = true
73 | no_implicit_optional = true
74 | plugins = ["dataframely.mypy"]
75 | python_version = '3.11'
76 | warn_unused_ignores = true
77 |
78 | [[tool.mypy.overrides]]
79 | ignore_missing_imports = true
80 | module = ["pyarrow.*"]
81 |
82 | [tool.pytest.ini_options]
83 | addopts = "--import-mode=importlib"
84 | filterwarnings = [
85 | # Almost all tests are oblivious to the value of `nullable`. Let's ignore the warning as long as it exists.
86 | "ignore:The 'nullable' argument was not explicitly set:FutureWarning",
87 | ]
88 | testpaths = ["tests"]
89 |
90 | [tool.coverage.run]
91 | omit = ["dataframely/mypy.py", "dataframely/testing/generate.py"]
92 |
--------------------------------------------------------------------------------
/src/errdefs.rs:
--------------------------------------------------------------------------------
1 | use pyo3::exceptions;
2 | use pyo3::PyErr;
3 |
4 | pub type Result<T> = std::result::Result<T, Error>;
5 |
6 | #[derive(thiserror::Error, Debug)]
7 | pub enum Error {
8 | #[error("failed to parse regex: {0}")]
9 | Parsing(Box<regex_syntax::Error>),
10 | #[error("failed to interpret bytes as UTF-8: {0}")]
11 | Utf8(#[from] std::str::Utf8Error),
12 | }
13 |
14 | impl From<regex_syntax::Error> for Error {
15 | fn from(value: regex_syntax::Error) -> Self {
16 | Self::Parsing(Box::new(value))
17 | }
18 | }
19 |
20 | impl From<Error> for PyErr {
21 | fn from(value: Error) -> Self {
22 | exceptions::PyValueError::new_err(value.to_string())
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
1 | mod errdefs;
2 | mod regex_repr;
3 |
4 | use pyo3::prelude::*;
5 | use rand::prelude::*;
6 | use rand::rngs::StdRng;
7 | use regex_repr::Regex;
8 |
9 | #[derive(IntoPyObject)]
10 | enum SampleResult {
11 | #[pyo3(transparent)]
12 | One(String),
13 | #[pyo3(transparent)]
14 | Many(Vec<String>),
15 | }
16 |
17 | /// Obtain the minimum and maximum length (if available) of strings matching a regular expression.
18 | #[pyfunction]
19 | fn matching_string_length(regex: &str) -> PyResult<(usize, Option<usize>)> {
20 | let compiled = Regex::new(regex)?;
21 | let result = compiled.matching_string_length()?;
22 | Ok(result)
23 | }
24 |
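/// Sample a random string matching `regex`, or a list of `n` such strings if `n` is given; a `seed` makes the draw reproducible.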
25 | #[pyfunction]
26 | #[pyo3(signature = (regex, n = None, max_repetitions = 16, seed = None))]
27 | fn sample(
28 | regex: &str,
29 | n: Option<usize>,
30 | max_repetitions: u32,
31 | seed: Option<u64>,
32 | ) -> PyResult<SampleResult> {
33 | let compiled = Regex::new(regex)?;
34 | let mut rng = match seed {
35 | None => StdRng::from_os_rng(),
36 | Some(seed) => StdRng::seed_from_u64(seed),
37 | };
38 | let result = match n {
39 | None => {
40 | let result = compiled.sample(&mut rng, max_repetitions)?;
41 | SampleResult::One(result)
42 | }
43 | Some(n) => {
44 | let results = (0..n)
45 | .map(|_| compiled.sample(&mut rng, max_repetitions))
46 | .collect::<Result<Vec<_>, _>>()?;
47 | SampleResult::Many(results)
48 | }
49 | };
50 | Ok(result)
51 | }
52 |
53 | #[pymodule]
54 | #[pyo3(name = "_extre")]
55 | fn extre(m: &Bound<'_, PyModule>) -> PyResult<()> {
56 | m.add_function(wrap_pyfunction!(matching_string_length, m)?)?;
57 | m.add_function(wrap_pyfunction!(sample, m)?)?;
58 | Ok(())
59 | }
60 |
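For illustration, these bindings surface in Python roughly as follows (a sketch: the module path follows ``module-name = "dataframely._extre"`` in ``pyproject.toml``, the signatures follow the ``#[pyo3]`` attributes above, and the commented values are indicative only):

from dataframely import _extre

# Length bounds for strings matching the pattern; the maximum is None when unbounded.
print(_extre.matching_string_length(r"[a-z]{2,4}"))  # e.g. (2, 4)

# A single sample, or a list of `n` samples; `seed` makes the draw reproducible.
one = _extre.sample(r"[0-9][a-z]$", seed=42)
many = _extre.sample(r"[0-9][a-z]$", n=3, seed=42)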
--------------------------------------------------------------------------------
/tests/collection/test_base.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | from collections.abc import Callable
5 | from pathlib import Path
6 |
7 | import polars as pl
8 | import pytest
9 | from polars.testing import assert_frame_equal
10 |
11 | import dataframely as dy
12 |
13 |
14 | class MyFirstSchema(dy.Schema):
15 | a = dy.UInt8(primary_key=True)
16 |
17 |
18 | class MySecondSchema(dy.Schema):
19 | a = dy.UInt16(primary_key=True)
20 | b = dy.Integer()
21 |
22 |
23 | class MyCollection(dy.Collection):
24 | first: dy.LazyFrame[MyFirstSchema]
25 | second: dy.LazyFrame[MySecondSchema] | None
26 |
27 |
28 | def test_common_primary_keys() -> None:
29 | assert MyCollection.common_primary_keys() == ["a"]
30 |
31 |
32 | def test_members() -> None:
33 | members = MyCollection.members()
34 | assert not members["first"].is_optional
35 | assert members["second"].is_optional
36 |
37 |
38 | def test_member_schemas() -> None:
39 | schemas = MyCollection.member_schemas()
40 | assert schemas == {"first": MyFirstSchema, "second": MySecondSchema}
41 |
42 |
43 | def test_required_members() -> None:
44 | required_members = MyCollection.required_members()
45 | assert required_members == {"first"}
46 |
47 |
48 | def test_optional_members() -> None:
49 | optional_members = MyCollection.optional_members()
50 | assert optional_members == {"second"}
51 |
52 |
53 | def test_cast() -> None:
54 | collection = MyCollection.cast(
55 | {
56 | "first": pl.LazyFrame({"a": [1, 2, 3]}),
57 | "second": pl.LazyFrame({"a": [1, 2, 3], "b": [4, 5, 6]}),
58 | },
59 | )
60 | assert collection.first.collect_schema() == MyFirstSchema.polars_schema()
61 | assert collection.second is not None
62 | assert collection.second.collect_schema() == MySecondSchema.polars_schema()
63 |
64 |
65 | @pytest.mark.parametrize(
66 | "expected",
67 | [
68 | {
69 | "first": pl.LazyFrame({"a": [1, 2, 3]}, schema={"a": pl.UInt8}),
70 | "second": pl.LazyFrame(
71 | {"a": [1, 2, 3], "b": [4, 5, 6]}, schema={"a": pl.UInt16, "b": pl.Int64}
72 | ),
73 | },
74 | {"first": pl.LazyFrame({"a": [1, 2, 3]}, schema={"a": pl.UInt8})},
75 | ],
76 | )
77 | def test_to_dict(expected: dict[str, pl.LazyFrame]) -> None:
78 | collection = MyCollection.validate(expected)
79 |
80 | # Check that export looks as expected
81 | observed = collection.to_dict()
82 | assert set(expected.keys()) == set(observed.keys())
83 | for key in expected.keys():
84 | assert_frame_equal(expected[key], observed[key])
85 |
86 | # Make sure that "roundtrip" validation works
87 | assert MyCollection.is_valid(observed)
88 |
89 |
90 | def test_collect_all() -> None:
91 | collection = MyCollection.cast(
92 | {
93 | "first": pl.LazyFrame({"a": [1, 2, 3]}).filter(pl.col("a") < 3),
94 | "second": pl.LazyFrame({"a": [1, 2, 3], "b": [4, 5, 6]}).filter(
95 | pl.col("b") <= 5
96 | ),
97 | }
98 | )
99 | out = collection.collect_all()
100 |
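# collect_all() materializes every member, so each lazy plan reduces to a scan of an in-memory DataFrame.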
101 | assert isinstance(out, MyCollection)
102 | assert out.first.explain() == 'DF ["a"]; PROJECT */1 COLUMNS'
103 | assert len(out.first.collect()) == 2
104 | assert out.second is not None
105 | assert out.second.explain() == 'DF ["a", "b"]; PROJECT */2 COLUMNS'
106 | assert len(out.second.collect()) == 2
107 |
108 |
109 | def test_collect_all_optional() -> None:
110 | collection = MyCollection.cast({"first": pl.LazyFrame({"a": [1, 2, 3]})})
111 | out = collection.collect_all()
112 |
113 | assert isinstance(out, MyCollection)
114 | assert len(out.first.collect()) == 3
115 | assert out.second is None
116 |
117 |
118 | @pytest.mark.parametrize(
119 | "read_fn", [MyCollection.scan_parquet, MyCollection.read_parquet]
120 | )
121 | def test_read_write_parquet(
122 | tmp_path: Path, read_fn: Callable[[Path], MyCollection]
123 | ) -> None:
124 | collection = MyCollection.cast(
125 | {
126 | "first": pl.LazyFrame({"a": [1, 2, 3]}),
127 | "second": pl.LazyFrame({"a": [1, 2], "b": [10, 15]}),
128 | }
129 | )
130 | collection.write_parquet(tmp_path)
131 |
132 | read = read_fn(tmp_path)
133 | assert_frame_equal(collection.first, read.first)
134 | assert collection.second is not None
135 | assert read.second is not None
136 | assert_frame_equal(collection.second, read.second)
137 |
138 |
139 | @pytest.mark.parametrize(
140 | "read_fn", [MyCollection.scan_parquet, MyCollection.read_parquet]
141 | )
142 | def test_read_write_parquet_optional(
143 | tmp_path: Path, read_fn: Callable[[Path], MyCollection]
144 | ) -> None:
145 | collection = MyCollection.cast({"first": pl.LazyFrame({"a": [1, 2, 3]})})
146 | collection.write_parquet(tmp_path)
147 |
148 | read = read_fn(tmp_path)
149 | assert_frame_equal(collection.first, read.first)
150 | assert collection.second is None
151 | assert read.second is None
152 |
--------------------------------------------------------------------------------
/tests/collection/test_cast.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import polars as pl
5 | import polars.exceptions as plexc
6 | import pytest
7 |
8 | import dataframely as dy
9 |
10 |
11 | class FirstSchema(dy.Schema):
12 | a = dy.Float64()
13 |
14 |
15 | class SecondSchema(dy.Schema):
16 | a = dy.String()
17 |
18 |
19 | class Collection(dy.Collection):
20 | first: dy.LazyFrame[FirstSchema]
21 | second: dy.LazyFrame[SecondSchema] | None
22 |
23 |
24 | @pytest.mark.parametrize("df_type", [pl.DataFrame, pl.LazyFrame])
25 | def test_cast_valid(df_type: type[pl.DataFrame] | type[pl.LazyFrame]) -> None:
26 | first = df_type({"a": [3]})
27 | second = df_type({"a": [1]})
28 | out = Collection.cast({"first": first, "second": second}) # type: ignore
29 | assert out.first.collect_schema() == FirstSchema.polars_schema()
30 | assert out.second is not None
31 | assert out.second.collect_schema() == SecondSchema.polars_schema()
32 |
33 |
34 | @pytest.mark.parametrize("df_type", [pl.DataFrame, pl.LazyFrame])
35 | def test_cast_valid_optional(df_type: type[pl.DataFrame] | type[pl.LazyFrame]) -> None:
36 | first = df_type({"a": [3]})
37 | out = Collection.cast({"first": first}) # type: ignore
38 | assert out.first.collect_schema() == FirstSchema.polars_schema()
39 | assert out.second is None
40 |
41 |
42 | @pytest.mark.parametrize("df_type", [pl.DataFrame, pl.LazyFrame])
43 | def test_cast_invalid_members(df_type: type[pl.DataFrame] | type[pl.LazyFrame]) -> None:
44 | first = df_type({"a": [3]})
45 | with pytest.raises(ValueError):
46 | Collection.cast({"third": first}) # type: ignore
47 |
48 |
49 | def test_cast_invalid_member_schema_eager() -> None:
50 | first = pl.DataFrame({"b": [3]})
51 | with pytest.raises(plexc.ColumnNotFoundError):
52 | Collection.cast({"first": first})
53 |
54 |
55 | def test_cast_invalid_member_schema_lazy() -> None:
56 | first = pl.LazyFrame({"b": [3]})
57 | collection = Collection.cast({"first": first})
58 | with pytest.raises(plexc.ColumnNotFoundError):
59 | collection.collect_all()
60 |
--------------------------------------------------------------------------------
/tests/collection/test_create_empty.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 |
5 | import dataframely as dy
6 |
7 |
8 | class MyFirstSchema(dy.Schema):
9 | a = dy.Integer(primary_key=True)
10 | b = dy.Integer()
11 |
12 |
13 | class MySecondSchema(dy.Schema):
14 | a = dy.Integer(primary_key=True)
15 | b = dy.Integer(min=1)
16 |
17 |
18 | class MyCollection(dy.Collection):
19 | first: dy.LazyFrame[MyFirstSchema]
20 | second: dy.LazyFrame[MySecondSchema] | None
21 |
22 |
23 | def test_create_empty() -> None:
24 | collection = MyCollection.create_empty()
25 | assert collection.first.collect().height == 0
26 | assert collection.first.collect_schema() == MyFirstSchema.polars_schema()
27 | assert collection.second is not None
28 | assert collection.second.collect().height == 0
29 | assert collection.second.collect_schema() == MySecondSchema.polars_schema()
30 |
--------------------------------------------------------------------------------
/tests/collection/test_filter_one_to_n.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | from typing import Any
5 |
6 | import polars as pl
7 |
8 | import dataframely as dy
9 |
10 |
11 | class CarSchema(dy.Schema):
12 | vin = dy.String(primary_key=True)
13 | manufacturer = dy.String(nullable=False)
14 |
15 |
16 | class CarPartSchema(dy.Schema):
17 | vin = dy.String(primary_key=True)
18 | part = dy.String(primary_key=True)
19 | price = dy.Float64(primary_key=True)
20 |
21 |
22 | class CarFleet(dy.Collection):
23 | cars: dy.LazyFrame[CarSchema]
24 | car_parts: dy.LazyFrame[CarPartSchema]
25 |
26 | @dy.filter()
27 | def not_car_with_vin_123(self) -> pl.LazyFrame:
28 | return self.cars.filter(pl.col("vin") != pl.lit("123"))
29 |
30 |
31 | def test_valid_failure_infos() -> None:
32 | cars = {"vin": ["123", "456"], "manufacturer": ["BMW", "Mercedes"]}
33 | car_parts: dict[str, list[Any]] = {
34 | "vin": ["123", "123", "456"],
35 | "part": ["Motor", "Wheel", "Motor"],
36 | "price": [1000, 100, 1000],
37 | }
38 | car_fleet, failures = CarFleet.filter(
39 | {"cars": pl.DataFrame(cars), "car_parts": pl.DataFrame(car_parts)},
40 | cast=True,
41 | )
42 |
43 | assert len(car_fleet.cars.collect()) + len(failures["cars"].invalid()) == len(
44 | cars["vin"]
45 | )
46 | assert len(car_fleet.car_parts.collect()) + len(
47 | failures["car_parts"].invalid()
48 | ) == len(car_parts["vin"])
49 |
--------------------------------------------------------------------------------
/tests/collection/test_ignore_in_filter.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | from typing import Annotated
5 |
6 | import polars as pl
7 |
8 | import dataframely as dy
9 |
10 |
11 | class MyTestSchema(dy.Schema):
12 | a_id = dy.UInt8(primary_key=True)
13 |
14 |
15 | class MyTestSchema2(dy.Schema):
16 | a_id = dy.UInt8(primary_key=True)
17 | b_id = dy.UInt8()
18 |
19 |
20 | class IgnoredSchema(dy.Schema):
21 | b_id = dy.UInt8(primary_key=True)
22 |
23 |
24 | class MyTestCollection(dy.Collection):
25 | a: dy.LazyFrame[MyTestSchema]
26 | b: dy.LazyFrame[MyTestSchema2]
27 |
28 | ignored: Annotated[
29 | dy.LazyFrame[IgnoredSchema],
30 | dy.CollectionMember(ignored_in_filters=True),
31 | ]
32 |
33 | @dy.filter()
34 | def filter_a_id(self) -> pl.LazyFrame:
35 | return self.a.join(self.b, on="a_id")
36 |
37 | @dy.filter()
38 | def custom_filter_on_ignored(self) -> pl.LazyFrame:
39 | # Even a filter driven by the ignored member must return rows keyed by the shared column "a_id".
40 | used_a_ids = (
41 | self.ignored.unique()
42 | .join(self.b, on="b_id")
43 | .join(self.a, on="a_id")
44 | .select("a_id")
45 | )
46 | return used_a_ids
47 |
48 |
49 | def test_collection_ignore_in_filter_meta() -> None:
50 | assert MyTestCollection.non_ignored_members() == {"a", "b"}
51 | assert MyTestCollection.ignored_members() == {"ignored"}
52 |
53 |
54 | def test_collection_ignore_in_filter() -> None:
55 | success, failure = MyTestCollection.filter(
56 | {
57 | "a": pl.LazyFrame({"a_id": [1, 2, 3]}),
58 | "b": pl.LazyFrame({"a_id": [1, 2, 3], "b_id": [4, 5, 6]}),
59 | "ignored": pl.LazyFrame({"b_id": [4, 5, 6]}),
60 | },
61 | cast=True,
62 | )
63 | assert failure["a"].invalid().height == 0
64 | assert failure["b"].invalid().height == 0
65 | assert failure["ignored"].invalid().height == 0
66 |
67 |
68 | def test_collection_ignore_in_filter_failure() -> None:
69 | success, failure = MyTestCollection.filter(
70 | {
71 | "a": pl.LazyFrame({"a_id": [1, 2, 3]}),
72 | "b": pl.LazyFrame({"a_id": [1, 2, 3], "b_id": [4, 5, 6]}),
73 | "ignored": pl.LazyFrame(
74 | {"b_id": [9999, 5, 6]}
75 | ), # b_id=4 is missing, so a_id=1 is not referenced by any row in "ignored"
76 | },
77 | cast=True,
78 | )
79 | assert failure["a"].invalid().height == 1
80 | assert failure["b"].invalid().height == 1
81 | assert failure["ignored"].invalid().height == 1
82 |
83 | assert failure["a"].counts() == {"custom_filter_on_ignored": 1}
84 |
--------------------------------------------------------------------------------
/tests/collection/test_optional_members.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import polars as pl
5 |
6 | import dataframely as dy
7 |
8 |
9 | class TestSchema(dy.Schema):
10 | a = dy.Integer()
11 |
12 |
13 | class MyCollection(dy.Collection):
14 | first: dy.LazyFrame[TestSchema]
15 | second: dy.LazyFrame[TestSchema] | None
16 |
17 |
18 | def test_collection_optional_member() -> None:
19 | MyCollection.validate({"first": pl.LazyFrame({"a": [1, 2, 3]})})
20 |
21 |
22 | def test_filter_failure_info_keys_only_required() -> None:
23 | out, failure = MyCollection.filter({"first": pl.LazyFrame({"a": [1, 2, 3]})})
24 | assert out.second is None
25 | assert set(failure.keys()) == {"first"}
26 |
27 |
28 | def test_filter_failure_info_keys_required_and_optional() -> None:
29 | out, failure = MyCollection.filter(
30 | {
31 | "first": pl.LazyFrame({"a": [1, 2, 3]}),
32 | "second": pl.LazyFrame({"a": [1, 2, 3]}),
33 | },
34 | )
35 | assert out.second is not None
36 | assert set(failure.keys()) == {"first", "second"}
37 |
--------------------------------------------------------------------------------
/tests/collection/test_validate_input.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import polars as pl
5 | import pytest
6 |
7 | import dataframely as dy
8 |
9 |
10 | class TestSchema(dy.Schema):
11 | a = dy.Integer()
12 |
13 |
14 | class MyCollection(dy.Collection):
15 | first: dy.LazyFrame[TestSchema]
16 | second: dy.LazyFrame[TestSchema] | None
17 |
18 |
19 | def test_collection_missing_required_member() -> None:
20 | with pytest.raises(ValueError):
21 | MyCollection.validate({"second": pl.LazyFrame({"a": [1, 2, 3]})})
22 |
23 |
24 | def test_collection_superfluous_member() -> None:
25 | with pytest.warns(Warning):
26 | MyCollection.validate(
27 | {
28 | "first": pl.LazyFrame({"a": [1, 2, 3]}),
29 | "third": pl.LazyFrame({"a": [1, 2, 3]}),
30 | },
31 | )
32 |
--------------------------------------------------------------------------------
/tests/column_types/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
--------------------------------------------------------------------------------
/tests/column_types/test_any.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | from typing import Any
5 |
6 | import polars as pl
7 | import pytest
8 |
9 | import dataframely as dy
10 |
11 |
12 | class AnySchema(dy.Schema):
13 | a = dy.Any()
14 |
15 |
16 | @pytest.mark.parametrize(
17 | "data",
18 | [{"a": [None]}, {"a": [True, None]}, {"a": ["foo"]}, {"a": [3.5]}],
19 | )
20 | def test_any_dtype_passes(data: dict[str, Any]) -> None:
21 | df = pl.DataFrame(data)
22 | assert AnySchema.is_valid(df)
23 |
--------------------------------------------------------------------------------
/tests/column_types/test_array.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import polars as pl
5 | import pytest
6 |
7 | import dataframely as dy
8 | from dataframely.columns._base import Column
9 | from dataframely.testing import create_schema
10 |
11 |
12 | @pytest.mark.parametrize(
13 | "inner",
14 | [
15 | (dy.Int64()),
16 | (dy.Integer()),
17 | ],
18 | )
19 | def test_integer_array(inner: Column) -> None:
20 | schema = create_schema("test", {"a": dy.Array(inner, 1)})
21 | assert schema.is_valid(
22 | pl.DataFrame(
23 | {"a": [[1], [2], [3]]},
24 | schema={
25 | "a": pl.Array(pl.Int64, 1),
26 | },
27 | )
28 | )
29 |
30 |
31 | def test_invalid_inner_type() -> None:
32 | schema = create_schema("test", {"a": dy.Array(dy.Int64(), 1)})
33 | assert not schema.is_valid(pl.DataFrame({"a": [["1"], ["2"], ["3"]]}))
34 |
35 |
36 | def test_invalid_shape() -> None:
37 | schema = create_schema("test", {"a": dy.Array(dy.Int64(), 2)})
38 | assert not schema.is_valid(
39 | pl.DataFrame(
40 | {"a": [[1], [2], [3]]},
41 | schema={
42 | "a": pl.Array(pl.Int64, 1),
43 | },
44 | )
45 | )
46 |
47 |
48 | @pytest.mark.parametrize(
49 | ("column", "dtype", "is_valid"),
50 | [
51 | (
52 | dy.Array(dy.Int64(), 1),
53 | pl.Array(pl.Int64(), 1),
54 | True,
55 | ),
56 | (
57 | dy.Array(dy.String(), 1),
58 | pl.Array(pl.Int64(), 1),
59 | False,
60 | ),
61 | (
62 | dy.Array(dy.String(), 1),
63 | pl.Array(pl.Int64(), 2),
64 | False,
65 | ),
66 | (
67 | dy.Array(dy.Int64(), (1,)),
68 | pl.Array(pl.Int64(), (1,)),
69 | True,
70 | ),
71 | (
72 | dy.Array(dy.Int64(), (1,)),
73 | pl.Array(pl.Int64(), (2,)),
74 | False,
75 | ),
76 | (
77 | dy.Array(dy.String(), 1),
78 | dy.Array(dy.String(), 1),
79 | False,
80 | ),
81 | (
82 | dy.Array(dy.String(), 1),
83 | dy.String(),
84 | False,
85 | ),
86 | (
87 | dy.Array(dy.String(), 1),
88 | pl.String(),
89 | False,
90 | ),
91 | (
92 | dy.Array(dy.Array(dy.String(), 1), 1),
93 | pl.Array(pl.String(), (1, 1)),
94 | True,
95 | ),
96 | (
97 | dy.Array(dy.String(), (1, 1)),
98 | pl.Array(pl.Array(pl.String(), 1), 1),
99 | True,
100 | ),
101 | ],
102 | )
103 | def test_validate_dtype(column: Column, dtype: pl.DataType, is_valid: bool) -> None:
104 | assert column.validate_dtype(dtype) == is_valid
105 |
106 |
107 | def test_nested_arrays() -> None:
108 | schema = create_schema("test", {"a": dy.Array(dy.Array(dy.Int64(), 1), 1)})
109 | assert schema.is_valid(
110 | pl.DataFrame(
111 | {"a": [[[1]], [[2]], [[3]]]},
112 | schema={
113 | "a": pl.Array(pl.Int64, (1, 1)),
114 | },
115 | )
116 | )
117 |
118 |
119 | def test_nested_array() -> None:
120 | schema = create_schema("test", {"a": dy.Array(dy.Array(dy.Int64(), 1), 1)})
121 | assert schema.is_valid(
122 | pl.DataFrame(
123 | {"a": [[[1]], [[2]], [[3]]]},
124 | schema={
125 | "a": pl.Array(pl.Int64, (1, 1)),
126 | },
127 | )
128 | )
129 |
130 |
131 | def test_array_with_inner_pk() -> None:
132 | with pytest.raises(ValueError):
133 | column = dy.Array(dy.String(primary_key=True), 2)
134 | create_schema(
135 | "test",
136 | {"a": column},
137 | )
138 |
139 |
140 | def test_array_with_rules() -> None:
141 | with pytest.raises(ValueError):
142 | create_schema(
143 | "test", {"a": dy.Array(dy.String(min_length=2, nullable=False), 1)}
144 | )
145 |
146 |
147 | def test_outer_nullability() -> None:
148 | schema = create_schema(
149 | "test",
150 | {
151 | "nullable": dy.Array(
152 | inner=dy.Integer(),
153 | shape=1,
154 | nullable=True,
155 | )
156 | },
157 | )
158 | df = pl.DataFrame({"nullable": [None, None]})
159 | schema.validate(df, cast=True)
160 |
--------------------------------------------------------------------------------
/tests/column_types/test_decimal.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import decimal
5 | from typing import Any
6 |
7 | import polars as pl
8 | import pytest
9 | from polars.datatypes import DataTypeClass
10 | from polars.datatypes.group import FLOAT_DTYPES, INTEGER_DTYPES
11 | from polars.testing import assert_frame_equal
12 |
13 | import dataframely as dy
14 | from dataframely.testing import evaluate_rules, rules_from_exprs
15 |
16 |
17 | class DecimalSchema(dy.Schema):
18 | a = dy.Decimal()
19 |
20 |
21 | @pytest.mark.parametrize(
22 | "kwargs",
23 | [
24 | {"min": decimal.Decimal(2), "max": decimal.Decimal(1)},
25 | {"min_exclusive": decimal.Decimal(2), "max": decimal.Decimal(2)},
26 | {"min": decimal.Decimal(2), "max_exclusive": decimal.Decimal(2)},
27 | {"min_exclusive": decimal.Decimal(2), "max_exclusive": decimal.Decimal(2)},
28 | {"min": decimal.Decimal(2), "min_exclusive": decimal.Decimal(2)},
29 | {"max": decimal.Decimal(2), "max_exclusive": decimal.Decimal(2)},
30 | ],
31 | )
32 | def test_args_consistency_min_max(kwargs: dict[str, Any]) -> None:
33 | with pytest.raises(ValueError):
34 | dy.Decimal(**kwargs)
35 |
36 |
37 | @pytest.mark.parametrize(
38 | "kwargs",
39 | [
40 | dict(scale=1, min=decimal.Decimal("3.14")),
41 | dict(scale=1, min_exclusive=decimal.Decimal("3.14")),
42 | dict(scale=1, max=decimal.Decimal("3.14")),
43 | dict(scale=1, max_exclusive=decimal.Decimal("3.14")),
44 | dict(min=decimal.Decimal(float("inf"))),
45 | dict(max=decimal.Decimal(float("inf"))),
46 | dict(precision=2, min=decimal.Decimal("100")),
47 | dict(precision=2, max=decimal.Decimal("100")),
48 | ],
49 | )
50 | def test_invalid_args(kwargs: dict[str, Any]) -> None:
51 | with pytest.raises(ValueError):
52 | dy.Decimal(**kwargs)
53 |
54 |
55 | @pytest.mark.parametrize(
56 | "dtype", [pl.Decimal, pl.Decimal(12), pl.Decimal(None, 8), pl.Decimal(6, 2)]
57 | )
58 | def test_any_decimal_dtype_passes(dtype: DataTypeClass) -> None:
59 | df = pl.DataFrame(schema={"a": dtype})
60 | assert DecimalSchema.is_valid(df)
61 |
62 |
63 | @pytest.mark.parametrize(
64 | "dtype", [pl.Boolean, pl.String] + list(INTEGER_DTYPES) + list(FLOAT_DTYPES)
65 | )
66 | def test_non_decimal_dtype_fails(dtype: DataTypeClass) -> None:
67 | df = pl.DataFrame(schema={"a": dtype})
68 | assert not DecimalSchema.is_valid(df)
69 |
70 |
71 | @pytest.mark.parametrize(
72 | ("inclusive", "valid"),
73 | [
74 | (True, {"min": [False, False, True, True, True]}),
75 | (False, {"min_exclusive": [False, False, False, True, True]}),
76 | ],
77 | )
78 | def test_validate_min(inclusive: bool, valid: dict[str, list[bool]]) -> None:
79 | kwargs = {("min" if inclusive else "min_exclusive"): decimal.Decimal(3)}
80 | column = dy.Decimal(**kwargs) # type: ignore
81 | lf = pl.LazyFrame({"a": [1, 2, 3, 4, 5]})
82 | actual = evaluate_rules(lf, rules_from_exprs(column.validation_rules(pl.col("a"))))
83 | expected = pl.LazyFrame(valid)
84 | assert_frame_equal(actual, expected)
85 |
86 |
87 | @pytest.mark.parametrize(
88 | ("inclusive", "valid"),
89 | [
90 | (True, {"max": [True, True, True, False, False]}),
91 | (False, {"max_exclusive": [True, True, False, False, False]}),
92 | ],
93 | )
94 | def test_validate_max(inclusive: bool, valid: dict[str, list[bool]]) -> None:
95 | kwargs = {("max" if inclusive else "max_exclusive"): decimal.Decimal(3)}
96 | column = dy.Decimal(**kwargs) # type: ignore
97 | lf = pl.LazyFrame({"a": [1, 2, 3, 4, 5]})
98 | actual = evaluate_rules(lf, rules_from_exprs(column.validation_rules(pl.col("a"))))
99 | expected = pl.LazyFrame(valid)
100 | assert_frame_equal(actual, expected)
101 |
102 |
103 | @pytest.mark.parametrize(
104 | ("min_inclusive", "max_inclusive", "valid"),
105 | [
106 | (
107 | True,
108 | True,
109 | {
110 | "min": [False, True, True, True, True],
111 | "max": [True, True, True, True, False],
112 | },
113 | ),
114 | (
115 | True,
116 | False,
117 | {
118 | "min": [False, True, True, True, True],
119 | "max_exclusive": [True, True, True, False, False],
120 | },
121 | ),
122 | (
123 | False,
124 | True,
125 | {
126 | "min_exclusive": [False, False, True, True, True],
127 | "max": [True, True, True, True, False],
128 | },
129 | ),
130 | (
131 | False,
132 | False,
133 | {
134 | "min_exclusive": [False, False, True, True, True],
135 | "max_exclusive": [True, True, True, False, False],
136 | },
137 | ),
138 | ],
139 | )
140 | def test_validate_range(
141 | min_inclusive: bool,
142 | max_inclusive: bool,
143 | valid: dict[str, list[bool]],
144 | ) -> None:
145 | kwargs = {
146 | ("min" if min_inclusive else "min_exclusive"): decimal.Decimal(2),
147 | ("max" if max_inclusive else "max_exclusive"): decimal.Decimal(4),
148 | }
149 | column = dy.Decimal(**kwargs) # type: ignore
150 | lf = pl.LazyFrame({"a": [1, 2, 3, 4, 5]})
151 | actual = evaluate_rules(lf, rules_from_exprs(column.validation_rules(pl.col("a"))))
152 | expected = pl.LazyFrame(valid)
153 | assert_frame_equal(actual, expected)
154 |
--------------------------------------------------------------------------------
/tests/column_types/test_enum.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | from typing import Any
5 |
6 | import polars as pl
7 | import pytest
8 |
9 | import dataframely as dy
10 | from dataframely.testing.factory import create_schema
11 |
12 |
13 | @pytest.mark.parametrize(
14 | ("dy_enum", "pl_dtype", "valid"),
15 | [
16 | (dy.Enum(["x", "y"]), pl.Enum(["x", "y"]), True),
17 | (dy.Enum(["y", "x"]), pl.Enum(["x", "y"]), False),
18 | (dy.Enum(["x"]), pl.Enum(["x", "y"]), False),
19 | (dy.Enum(["x", "y", "z"]), pl.Enum(["x", "y"]), False),
20 | (dy.Enum(["x", "y"]), pl.String(), False),
21 | ],
22 | )
23 | @pytest.mark.parametrize("df_type", [pl.DataFrame, pl.LazyFrame])
24 | def test_valid(
25 | df_type: type[pl.DataFrame] | type[pl.LazyFrame],
26 | dy_enum: dy.Enum,
27 | pl_dtype: pl.Enum,
28 | valid: bool,
29 | ) -> None:
30 | schema = create_schema("test", {"a": dy_enum})
31 | df = df_type({"a": ["x", "y", "x", "x"]}).cast(pl_dtype)
32 | assert schema.is_valid(df) == valid
33 |
34 |
35 | @pytest.mark.parametrize("enum", [dy.Enum(["x", "y"]), dy.Enum(["y", "x"])])
36 | @pytest.mark.parametrize("df_type", [pl.DataFrame, pl.LazyFrame])
37 | @pytest.mark.parametrize(
38 | ("data", "valid"),
39 | [
40 | ({"a": ["x", "y", "x", "x"]}, True),
41 | ({"a": ["x", "y", "x", "x"]}, True),
42 | ({"a": ["x", "y", "z"]}, False),
43 | ({"a": ["x", "y", "z"]}, False),
44 | ],
45 | )
46 | def test_valid_cast(
47 | enum: dy.Enum,
48 | data: Any,
49 | valid: bool,
50 | df_type: type[pl.DataFrame] | type[pl.LazyFrame],
51 | ) -> None:
52 | schema = create_schema("test", {"a": enum})
53 | df = df_type(data)
54 | assert schema.is_valid(df, cast=True) == valid
55 |
56 |
57 | @pytest.mark.parametrize("type1", [list, tuple])
58 | @pytest.mark.parametrize("type2", [list, tuple])
59 | def test_different_sequences(type1: type, type2: type) -> None:
60 | allowed = ["a", "b"]
61 | S = create_schema("test", {"x": dy.Enum(type1(allowed))})
62 | df = pl.DataFrame({"x": pl.Series(["a", "b"], dtype=pl.Enum(type2(allowed)))})
63 | S.validate(df)
64 |
--------------------------------------------------------------------------------
/tests/column_types/test_object.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 |
5 | import polars as pl
6 | import pytest
7 |
8 | import dataframely as dy
9 | from dataframely.columns._base import Column
10 | from dataframely.random import Generator
11 | from dataframely.testing import create_schema
12 |
13 |
14 | class CustomObject:
15 | def __init__(self, a: int, b: str) -> None:
16 | self.a = a
17 | self.b = b
18 |
19 |
20 | def test_simple_object() -> None:
21 | schema = create_schema("test", {"o": dy.Object()})
22 | assert schema.is_valid(
23 | pl.DataFrame({"o": [CustomObject(a=1, b="foo"), CustomObject(a=2, b="bar")]})
24 | )
25 |
26 |
27 | @pytest.mark.parametrize(
28 | ("column", "dtype", "is_valid"),
29 | [
30 | (
31 | dy.Object(),
32 | pl.Object(),
33 | True,
34 | ),
35 | (
36 | dy.Object(),
37 | object(),
38 | False,
39 | ),
40 | ],
41 | )
42 | def test_validate_dtype(column: Column, dtype: pl.DataType, is_valid: bool) -> None:
43 | assert column.validate_dtype(dtype) == is_valid
44 |
45 |
46 | def test_pyarrow_dtype_raises() -> None:
47 | column = dy.Object()
48 | with pytest.raises(
49 | NotImplementedError, match="PyArrow column cannot have 'Object' type."
50 | ):
51 | column.pyarrow_dtype
52 |
53 |
54 | def test_sampling_raises() -> None:
55 | column = dy.Object()
56 | with pytest.raises(
57 | NotImplementedError,
58 | match="Random data sampling not implemented for 'Object' type.",
59 | ):
60 | column.sample(generator=Generator(), n=10)
61 |
--------------------------------------------------------------------------------
/tests/column_types/test_string.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import polars as pl
5 | from polars.testing import assert_frame_equal
6 |
7 | import dataframely as dy
8 | from dataframely.testing import evaluate_rules, rules_from_exprs
9 |
10 |
11 | def test_validate_min_length() -> None:
12 | column = dy.String(min_length=2)
13 | lf = pl.LazyFrame({"a": ["foo", "x"]})
14 | actual = evaluate_rules(lf, rules_from_exprs(column.validation_rules(pl.col("a"))))
15 | expected = pl.LazyFrame({"min_length": [True, False]})
16 | assert_frame_equal(actual, expected)
17 |
18 |
19 | def test_validate_max_length() -> None:
20 | column = dy.String(max_length=2)
21 | lf = pl.LazyFrame({"a": ["foo", "x"]})
22 | actual = evaluate_rules(lf, rules_from_exprs(column.validation_rules(pl.col("a"))))
23 | expected = pl.LazyFrame({"max_length": [False, True]})
24 | assert_frame_equal(actual, expected)
25 |
26 |
27 | def test_validate_regex() -> None:
28 | column = dy.String(regex="[0-9][a-z]$")
29 | lf = pl.LazyFrame({"a": ["33x", "3x", "44"]})
30 | actual = evaluate_rules(lf, rules_from_exprs(column.validation_rules(pl.col("a"))))
31 | expected = pl.LazyFrame({"regex": [True, True, False]})
32 | assert_frame_equal(actual, expected)
33 |
34 |
35 | def test_validate_all_rules() -> None:
36 | column = dy.String(nullable=False, min_length=2, max_length=4)
37 | lf = pl.LazyFrame({"a": ["foo", "x", "foobar", None]})
38 | actual = evaluate_rules(lf, rules_from_exprs(column.validation_rules(pl.col("a"))))
39 | expected = pl.LazyFrame(
40 | {
41 | "min_length": [True, False, True, True],
42 | "max_length": [True, True, False, True],
43 | "nullability": [True, True, True, False],
44 | }
45 | )
46 | assert_frame_equal(actual, expected, check_column_order=False)
47 |
--------------------------------------------------------------------------------
/tests/column_types/test_struct.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 |
5 | import polars as pl
6 | import pytest
7 |
8 | import dataframely as dy
9 | from dataframely.columns._base import Column
10 | from dataframely.testing import create_schema
11 |
12 |
13 | def test_simple_struct() -> None:
14 | schema = create_schema(
15 | "test", {"s": dy.Struct({"a": dy.Integer(), "b": dy.String()})}
16 | )
17 | assert schema.is_valid(
18 | pl.DataFrame({"s": [{"a": 1, "b": "foo"}, {"a": 2, "b": "foo"}]})
19 | )
20 |
21 |
22 | @pytest.mark.parametrize(
23 | ("column", "dtype", "is_valid"),
24 | [
25 | (
26 | dy.Struct({"a": dy.Int64(), "b": dy.String()}),
27 | pl.Struct({"a": pl.Int64(), "b": pl.String()}),
28 | True,
29 | ),
30 | (
31 | dy.Struct({"b": dy.String(), "a": dy.Int64()}),
32 | pl.Struct({"a": pl.Int64(), "b": pl.String()}),
33 | True,
34 | ),
35 | (
36 | dy.Struct({"a": dy.Int64(), "b": dy.String(), "c": dy.String()}),
37 | pl.Struct({"a": pl.Int64(), "b": pl.String()}),
38 | False,
39 | ),
40 | (
41 | dy.Struct({"a": dy.String(), "b": dy.Int64()}),
42 | pl.Struct({"a": pl.Int64(), "b": pl.String()}),
43 | False,
44 | ),
45 | (
46 | dy.Struct({"a": dy.String(), "b": dy.Int64()}),
47 | pl.Struct({"a": pl.Int64(), "b": pl.String()}),
48 | False,
49 | ),
50 | (
51 | dy.Struct({"a": dy.String(), "b": dy.Int64()}),
52 | pl.Struct({"a": pl.Int64(), "b": pl.String()}),
53 | False,
54 | ),
55 | (
56 | dy.Struct({"a": dy.Int64(), "b": dy.String()}),
57 | pl.Struct({"a": pl.Int64(), "c": pl.String()}),
58 | False,
59 | ),
60 | (
61 | dy.Struct({"a": dy.String(), "b": dy.Int64()}),
62 | dy.Struct({"a": dy.String(), "b": dy.Int64()}),
63 | False,
64 | ),
65 | (
66 | dy.Struct({"a": dy.String(), "b": dy.Int64()}),
67 | dy.String(),
68 | False,
69 | ),
70 | (
71 | dy.Struct({"a": dy.String(), "b": dy.Int64()}),
72 | pl.String(),
73 | False,
74 | ),
75 | ],
76 | )
77 | def test_validate_dtype(column: Column, dtype: pl.DataType, is_valid: bool) -> None:
78 | assert column.validate_dtype(dtype) == is_valid
79 |
80 |
81 | def test_invalid_inner_type() -> None:
82 | schema = create_schema("test", {"a": dy.Struct({"a": dy.Int64()})})
83 | assert not schema.is_valid(pl.DataFrame({"a": [{"a": "1"}, {"a": "2"}]}))
84 |
85 |
86 | def test_nested_structs() -> None:
87 | schema = create_schema(
88 | "test",
89 | {
90 | "s1": dy.Struct(
91 | {
92 | "s2": dy.Struct({"a": dy.Integer(), "b": dy.String()}),
93 | "c": dy.String(),
94 | }
95 | )
96 | },
97 | )
98 | assert schema.is_valid(
99 | pl.DataFrame({"s1": [{"s2": {"a": 1, "b": "foo"}, "c": "bar"}]})
100 | )
101 |
102 |
103 | def test_struct_with_pk() -> None:
104 | schema = create_schema(
105 | "test",
106 | {"s": dy.Struct({"a": dy.String(), "b": dy.Integer()}, primary_key=True)},
107 | )
108 | df = pl.DataFrame(
109 | {"s": [{"a": "foo", "b": 1}, {"a": "bar", "b": 1}, {"a": "bar", "b": 1}]}
110 | )
111 | _, failures = schema.filter(df)
112 | assert failures.invalid().to_dict(as_series=False) == {
113 | "s": [{"a": "bar", "b": 1}, {"a": "bar", "b": 1}]
114 | }
115 | assert failures.counts() == {"primary_key": 2}
116 |
117 |
118 | def test_struct_with_rules() -> None:
119 | schema = create_schema(
120 | "test", {"s": dy.Struct({"a": dy.String(min_length=2, nullable=False)})}
121 | )
122 | df = pl.DataFrame({"s": [{"a": "ab"}, {"a": "a"}, {"a": None}]})
123 | _, failures = schema.filter(df)
124 | assert failures.invalid().to_dict(as_series=False) == {
125 | "s": [{"a": "a"}, {"a": None}]
126 | }
127 | assert failures.counts() == {"s|inner_a_nullability": 1, "s|inner_a_min_length": 1}
128 |
129 |
130 | def test_nested_struct_with_rules() -> None:
131 | schema = create_schema(
132 | "test",
133 | {
134 | "s1": dy.Struct(
135 | {"s2": dy.Struct({"a": dy.String(min_length=2, nullable=False)})}
136 | )
137 | },
138 | )
139 | df = pl.DataFrame(
140 | {"s1": [{"s2": {"a": "ab"}}, {"s2": {"a": "a"}}, {"s2": {"a": None}}]}
141 | )
142 | _, failures = schema.filter(df)
143 | assert failures.invalid().to_dict(as_series=False) == {
144 | "s1": [{"s2": {"a": "a"}}, {"s2": {"a": None}}]
145 | }
146 | assert failures.counts() == {
147 | "s1|inner_s2_inner_a_nullability": 1,
148 | "s1|inner_s2_inner_a_min_length": 1,
149 | }
150 |
151 |
152 | def test_outer_inner_nullability() -> None:
153 | schema = create_schema(
154 | "test",
155 | {
156 | "nullable": dy.Struct(
157 | inner={
158 | "not_nullable1": dy.Integer(nullable=False),
159 | "not_nullable2": dy.Integer(nullable=False),
160 | },
161 | nullable=True,
162 | )
163 | },
164 | )
165 | df = pl.DataFrame({"nullable": [None, None]})
166 |
167 | schema.validate(df, cast=True)
168 |
--------------------------------------------------------------------------------
/tests/columns/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
--------------------------------------------------------------------------------
/tests/columns/test_alias.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import polars as pl
5 |
6 | import dataframely as dy
7 |
8 |
9 | class AliasSchema(dy.Schema):
10 | a = dy.Int64(alias="hello world: col with space!")
11 |
12 |
13 | def test_column_names() -> None:
14 | assert AliasSchema.column_names() == ["hello world: col with space!"]
15 |
16 |
17 | def test_validation() -> None:
18 | df = pl.DataFrame({"hello world: col with space!": [1, 2]})
19 | assert AliasSchema.is_valid(df)
20 |
21 |
22 | def test_create_empty() -> None:
23 | df = AliasSchema.create_empty()
24 | assert AliasSchema.is_valid(df)
25 |
--------------------------------------------------------------------------------
/tests/columns/test_check.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import polars as pl
5 |
6 | import dataframely as dy
7 | from dataframely.testing import validation_mask
8 |
9 |
10 | class CheckSchema(dy.Schema):
11 | a = dy.Int64(check=lambda col: (col < 5) | (col > 10))
12 | b = dy.String(min_length=3, check=lambda col: col.str.contains("x"))
13 |
14 |
15 | def test_check() -> None:
16 | df = pl.DataFrame({"a": [7, 3, 15], "b": ["abc", "xyz", "x"]})
17 | _, failures = CheckSchema.filter(df)
18 | assert validation_mask(df, failures).to_list() == [False, True, False]
19 | assert failures.counts() == {"a|check": 1, "b|min_length": 1, "b|check": 1}
20 |
21 |
22 | def test_check_names() -> None:
23 | def str_starts_with_a(col: pl.Expr) -> pl.Expr:
24 | return col.str.starts_with("a")
25 |
26 | def str_end_with_z(col: pl.Expr) -> pl.Expr:
27 | return col.str.ends_with("z")
28 |
29 | class MultiCheckSchema(dy.Schema):
30 | name_from_dict = dy.Int64(
31 | check={
32 | "min_max_check": lambda col: (col < 5) | (col > 10),
33 | "summation_check": lambda col: col.sum() < 3,
34 | }
35 | )
36 | name_from_callable = dy.String(check=str_starts_with_a)
37 | name_from_list_of_callables = dy.String(
38 | check=[
39 | str_starts_with_a,
40 | str_end_with_z,
41 | str_end_with_z,
42 | lambda x: x.str.contains("x"),
43 | lambda x: x.str.contains("y"),
44 | ]
45 | )
46 | name_from_lambda = dy.Int64(check=lambda x: x < 2)
47 |
48 | df = pl.DataFrame(
49 | {
50 | "name_from_dict": [2, 4, 6],
51 | "name_from_callable": ["abc", "acd", "dca"],
52 | "name_from_list_of_callables": ["xyz", "xac", "aqq"],
53 | "name_from_lambda": [1, 2, 3],
54 | }
55 | )
56 | _, failures = MultiCheckSchema.filter(df)
57 |
58 | assert failures.counts() == {
59 | "name_from_dict|check__min_max_check": 1,
60 | "name_from_dict|check__summation_check": 3,
61 | "name_from_callable|check__str_starts_with_a": 1,
62 | "name_from_list_of_callables|check__str_starts_with_a": 2,
63 | "name_from_list_of_callables|check__str_end_with_z__0": 2,
64 | "name_from_list_of_callables|check__str_end_with_z__1": 2,
65 | "name_from_list_of_callables|check__0": 1,
66 | "name_from_list_of_callables|check__1": 2,
67 | "name_from_lambda|check": 2,
68 | }
69 |
--------------------------------------------------------------------------------
/tests/columns/test_default_dtypes.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import polars as pl
5 | import pytest
6 |
7 | import dataframely as dy
8 | from dataframely.columns import Column
9 | from dataframely.testing import create_schema
10 |
11 |
12 | @pytest.mark.parametrize(
13 | ("column", "dtype"),
14 | [
15 | (dy.Any(), pl.Null()),
16 | (dy.Bool(), pl.Boolean()),
17 | (dy.Date(), pl.Date()),
18 | (dy.Datetime(), pl.Datetime()),
19 | (dy.Time(), pl.Time()),
20 | (dy.Duration(), pl.Duration()),
21 | (dy.Decimal(), pl.Decimal()),
22 | (dy.Decimal(12), pl.Decimal(12)),
23 | (dy.Decimal(None, 8), pl.Decimal(None, 8)),
24 | (dy.Decimal(6, 2), pl.Decimal(6, 2)),
25 | (dy.Float(), pl.Float64()),
26 | (dy.Float32(), pl.Float32()),
27 | (dy.Float64(), pl.Float64()),
28 | (dy.Integer(), pl.Int64()),
29 | (dy.Int8(), pl.Int8()),
30 | (dy.Int16(), pl.Int16()),
31 | (dy.Int32(), pl.Int32()),
32 | (dy.Int64(), pl.Int64()),
33 | (dy.UInt8(), pl.UInt8()),
34 | (dy.UInt16(), pl.UInt16()),
35 | (dy.UInt32(), pl.UInt32()),
36 | (dy.UInt64(), pl.UInt64()),
37 | (dy.String(), pl.String()),
38 | (dy.List(dy.String()), pl.List(pl.String())),
39 | (dy.Array(dy.String(), 1), pl.Array(pl.String(), 1)),
40 | (dy.Struct({"a": dy.String()}), pl.Struct({"a": pl.String()})),
41 | (dy.Enum(["a", "b"]), pl.Enum(["a", "b"])),
42 | ],
43 | )
44 | def test_default_dtype(column: Column, dtype: pl.DataType) -> None:
45 | schema = create_schema("test", {"a": column})
46 | df = schema.create_empty()
47 | assert df.schema["a"] == dtype
48 | schema.validate(df)
49 | assert schema.is_valid(df)
50 |
--------------------------------------------------------------------------------
/tests/columns/test_metadata.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2024-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import dataframely as dy
5 |
6 |
7 | class SchemaWithMetadata(dy.Schema):
8 | a = dy.Int64(metadata={"masked": True, "comment": "foo", "order": 1})
9 | b = dy.String()
10 |
11 |
12 | def test_metadata() -> None:
13 | assert SchemaWithMetadata.a.metadata == {
14 | "masked": True,
15 | "comment": "foo",
16 | "order": 1,
17 | }
18 | assert SchemaWithMetadata.b.metadata is None
19 |
--------------------------------------------------------------------------------
/tests/columns/test_polars_schema.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import polars as pl
5 |
6 | import dataframely as dy
7 | from dataframely.testing.factory import create_schema
8 |
9 |
10 | def test_polars_schema() -> None:
11 | schema = create_schema("test", {"a": dy.Int32(nullable=False), "b": dy.Float32()})
12 | pl_schema = schema.polars_schema()
13 | assert pl_schema == {"a": pl.Int32, "b": pl.Float32}
14 |
--------------------------------------------------------------------------------
/tests/columns/test_pyarrow.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import pytest
5 | from polars._typing import TimeUnit
6 |
7 | import dataframely as dy
8 | from dataframely.columns import Column
9 | from dataframely.testing import (
10 | ALL_COLUMN_TYPES,
11 | COLUMN_TYPES,
12 | NO_VALIDATION_COLUMN_TYPES,
13 | SUPERTYPE_COLUMN_TYPES,
14 | create_schema,
15 | )
16 |
17 |
18 | @pytest.mark.parametrize("column_type", ALL_COLUMN_TYPES)
19 | def test_equal_to_polars_schema(column_type: type[Column]) -> None:
20 | schema = create_schema("test", {"a": column_type()})
21 | actual = schema.pyarrow_schema()
22 | expected = schema.create_empty().to_arrow().schema
23 | assert actual == expected
24 |
25 |
26 | def test_equal_polars_schema_enum() -> None:
27 | schema = create_schema("test", {"a": dy.Enum(["a", "b"])})
28 | actual = schema.pyarrow_schema()
29 | expected = schema.create_empty().to_arrow().schema
30 | assert actual == expected
31 |
32 |
33 | @pytest.mark.parametrize(
34 | "inner",
35 | [c() for c in ALL_COLUMN_TYPES]
36 | + [dy.List(t()) for t in ALL_COLUMN_TYPES]
37 | + [dy.Array(t(), 1) for t in NO_VALIDATION_COLUMN_TYPES]
38 | + [dy.Struct({"a": t()}) for t in ALL_COLUMN_TYPES],
39 | )
40 | def test_equal_polars_schema_list(inner: Column) -> None:
41 | schema = create_schema("test", {"a": dy.List(inner)})
42 | actual = schema.pyarrow_schema()
43 | expected = schema.create_empty().to_arrow().schema
44 | assert actual == expected
45 |
46 |
47 | @pytest.mark.parametrize(
48 | "inner",
49 | [c() for c in NO_VALIDATION_COLUMN_TYPES]
50 | + [dy.List(t()) for t in NO_VALIDATION_COLUMN_TYPES]
51 | + [dy.Array(t(), 1) for t in NO_VALIDATION_COLUMN_TYPES]
52 | + [dy.Struct({"a": t()}) for t in NO_VALIDATION_COLUMN_TYPES],
53 | )
54 | @pytest.mark.parametrize(
55 | "shape",
56 | [
57 | 1,
58 | 0,
59 | (0, 0),
60 | ],
61 | )
62 | def test_equal_polars_schema_array(inner: Column, shape: int | tuple[int, ...]) -> None:
63 | schema = create_schema("test", {"a": dy.Array(inner, shape)})
64 | actual = schema.pyarrow_schema()
65 | expected = schema.create_empty().to_arrow().schema
66 | assert actual == expected
67 |
68 |
69 | @pytest.mark.parametrize(
70 | "inner",
71 | [c() for c in ALL_COLUMN_TYPES]
72 | + [dy.Struct({"a": t()}) for t in ALL_COLUMN_TYPES]
73 | + [dy.Array(t(), 1) for t in NO_VALIDATION_COLUMN_TYPES]
74 | + [dy.List(t()) for t in ALL_COLUMN_TYPES],
75 | )
76 | def test_equal_polars_schema_struct(inner: Column) -> None:
77 | schema = create_schema("test", {"a": dy.Struct({"a": inner})})
78 | actual = schema.pyarrow_schema()
79 | expected = schema.create_empty().to_arrow().schema
80 | assert actual == expected
81 |
82 |
83 | @pytest.mark.parametrize("column_type", COLUMN_TYPES + SUPERTYPE_COLUMN_TYPES)
84 | @pytest.mark.parametrize("nullable", [True, False])
85 | def test_nullability_information(column_type: type[Column], nullable: bool) -> None:
86 | schema = create_schema("test", {"a": column_type(nullable=nullable)})
87 | assert ("not null" in str(schema.pyarrow_schema())) != nullable
88 |
89 |
90 | @pytest.mark.parametrize("nullable", [True, False])
91 | def test_nullability_information_enum(nullable: bool) -> None:
92 | schema = create_schema("test", {"a": dy.Enum(["a", "b"], nullable=nullable)})
93 | assert ("not null" in str(schema.pyarrow_schema())) != nullable
94 |
95 |
96 | @pytest.mark.parametrize(
97 | "inner",
98 | [c() for c in ALL_COLUMN_TYPES]
99 | + [dy.List(t()) for t in ALL_COLUMN_TYPES]
100 | + [dy.Array(t(), 1) for t in NO_VALIDATION_COLUMN_TYPES]
101 | + [dy.Struct({"a": t()}) for t in ALL_COLUMN_TYPES],
102 | )
103 | @pytest.mark.parametrize("nullable", [True, False])
104 | def test_nullability_information_list(inner: Column, nullable: bool) -> None:
105 | schema = create_schema("test", {"a": dy.List(inner, nullable=nullable)})
106 | assert ("not null" in str(schema.pyarrow_schema())) != nullable
107 |
108 |
109 | @pytest.mark.parametrize(
110 | "inner",
111 | [c() for c in ALL_COLUMN_TYPES]
112 | + [dy.Struct({"a": t()}) for t in ALL_COLUMN_TYPES]
113 | + [dy.Array(t(), 1) for t in NO_VALIDATION_COLUMN_TYPES]
114 | + [dy.List(t()) for t in ALL_COLUMN_TYPES],
115 | )
116 | @pytest.mark.parametrize("nullable", [True, False])
117 | def test_nullability_information_struct(inner: Column, nullable: bool) -> None:
118 | schema = create_schema("test", {"a": dy.Struct({"a": inner}, nullable=nullable)})
119 | assert ("not null" in str(schema.pyarrow_schema())) != nullable
120 |
121 |
122 | def test_multiple_columns() -> None:
123 | schema = create_schema("test", {"a": dy.Int32(nullable=False), "b": dy.Integer()})
124 | assert str(schema.pyarrow_schema()).split("\n") == ["a: int32 not null", "b: int64"]
125 |
126 |
127 | @pytest.mark.parametrize("time_unit", ["ns", "us", "ms"])
128 | def test_datetime_time_unit(time_unit: TimeUnit) -> None:
129 | schema = create_schema("test", {"a": dy.Datetime(time_unit=time_unit)})
130 | assert str(schema.pyarrow_schema()) == f"a: timestamp[{time_unit}]"
131 |
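The invariant these tests pin down, as a short sketch: `pyarrow_schema` must agree with the arrow schema polars itself derives when converting an empty frame, including nullability.

    import dataframely as dy
    from dataframely.testing import create_schema

    schema = create_schema("example", {"a": dy.Int32(nullable=False)})
    # Same schema whether computed directly or via polars' arrow conversion;
    # the string form renders non-nullable columns as "a: int32 not null".
    assert schema.pyarrow_schema() == schema.create_empty().to_arrow().schema
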
--------------------------------------------------------------------------------
/tests/columns/test_rules.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import polars as pl
5 | import pytest
6 | from polars.testing import assert_frame_equal
7 |
8 | from dataframely.columns import Column
9 | from dataframely.columns.float import _BaseFloat
10 | from dataframely.testing import (
11 | COLUMN_TYPES,
12 | SUPERTYPE_COLUMN_TYPES,
13 | evaluate_rules,
14 | rules_from_exprs,
15 | )
16 |
17 |
18 | @pytest.mark.parametrize("column_type", COLUMN_TYPES + SUPERTYPE_COLUMN_TYPES)
19 | @pytest.mark.parametrize("nullable", [True, False])
20 | def test_rule_count_nullability(column_type: type[Column], nullable: bool) -> None:
21 | column = column_type(nullable=nullable)
22 | assert len(column.validation_rules(pl.col("a"))) == int(not nullable) + (
23 | 1 if isinstance(column, _BaseFloat) else 0
24 | )
25 |
26 |
27 | @pytest.mark.parametrize("column_type", COLUMN_TYPES + SUPERTYPE_COLUMN_TYPES)
28 | def test_nullability_rule_for_primary_key(column_type: type[Column]) -> None:
29 | column = column_type(primary_key=True)
30 | assert len(column.validation_rules(pl.col("a"))) == (
31 |         2  # floats additionally have nan/inf rules
32 |         if isinstance(column, _BaseFloat)
33 |         else 1
34 | )
35 |
36 |
37 | @pytest.mark.parametrize("column_type", COLUMN_TYPES + SUPERTYPE_COLUMN_TYPES)
38 | def test_nullability_rule(column_type: type[Column]) -> None:
39 | column = column_type(nullable=False)
40 | lf = pl.LazyFrame({"a": [None]}, schema={"a": column.dtype})
41 | actual = evaluate_rules(lf, rules_from_exprs(column.validation_rules(pl.col("a"))))
42 | expected = pl.LazyFrame({"nullability": [False]})
43 | assert_frame_equal(actual.select(expected.collect_schema().names()), expected)
44 |
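A sketch of the rule machinery these tests rely on (frame contents illustrative): a non-nullable column contributes a `nullability` rule that evaluates to `False` exactly on null values.

    import polars as pl
    import dataframely as dy
    from dataframely.testing import evaluate_rules, rules_from_exprs

    column = dy.Int64(nullable=False)
    lf = pl.LazyFrame({"a": [1, None]})
    out = evaluate_rules(lf, rules_from_exprs(column.validation_rules(pl.col("a"))))
    # One boolean column per rule, row-aligned with the input frame.
    assert out.select("nullability").collect().to_series().to_list() == [True, False]
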
--------------------------------------------------------------------------------
/tests/columns/test_str.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import pytest
5 |
6 | import dataframely as dy
7 | from dataframely.columns import Column
8 | from dataframely.testing import ALL_COLUMN_TYPES
9 |
10 |
11 | @pytest.mark.parametrize("column_type", ALL_COLUMN_TYPES)
12 | def test_string_representation(column_type: type[Column]) -> None:
13 | column = column_type()
14 | assert str(column) == column_type.__name__.lower()
15 |
16 |
17 | def test_string_representation_enum() -> None:
18 | column = dy.Enum(["a", "b"])
19 | assert str(column) == dy.Enum.__name__.lower()
20 |
21 |
22 | def test_string_representation_list() -> None:
23 | column = dy.List(dy.String())
24 | assert str(column) == dy.List.__name__.lower()
25 |
26 |
27 | def test_string_representation_array() -> None:
28 | column = dy.Array(dy.String(), 1)
29 | assert str(column) == dy.Array.__name__.lower()
30 |
31 |
32 | def test_string_representation_struct() -> None:
33 | column = dy.Struct({"a": dy.String()})
34 | assert str(column) == dy.Struct.__name__.lower()
35 |
--------------------------------------------------------------------------------
/tests/columns/test_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 |
5 | from dataframely.columns._utils import first_non_null
6 |
7 |
8 | def test_first_non_null_basic() -> None:
9 | assert first_non_null(1, 2, default=3) == 1
10 | assert first_non_null(None, 2, default=3) == 2
11 | assert first_non_null(None, None, default=3) == 3
12 |
13 |
14 | def test_first_non_null_allow_null_response() -> None:
15 | assert first_non_null(None, None, None, allow_null_response=True) is None
16 |
17 |
18 | def test_first_non_null_with_terminal() -> None:
19 | assert first_non_null(None, None, None, default=42) == 42
20 | assert first_non_null(None, 3, None, default=42) == 3
21 |
22 |
23 | def test_first_non_null_mixed_types() -> None:
24 | assert first_non_null(None, "a", default=3) == "a"
25 | assert first_non_null(None, 0, default="b") == 0 # 0 is a valid non-null value
26 | assert (
27 | first_non_null(None, False, default=1) is False
28 | ) # False is a valid non-null value
29 |
30 |
31 | def test_first_non_null_with_kwargs() -> None:
32 | assert first_non_null(None, None, allow_null_response=True) is None
33 | assert first_non_null(None, None, default="fallback") == "fallback"
34 |
--------------------------------------------------------------------------------
/tests/core_validation/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
--------------------------------------------------------------------------------
/tests/core_validation/test_column_validation.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import polars as pl
5 | import pytest
6 |
7 | from dataframely._validation import validate_columns
8 | from dataframely.exc import ValidationError
9 |
10 |
11 | def test_success() -> None:
12 | df = pl.DataFrame(schema={k: pl.Int64() for k in ["a", "b"]})
13 | lf = validate_columns(df.lazy(), actual=df.schema.keys(), expected=["a"])
14 | assert set(lf.collect_schema().names()) == {"a"}
15 |
16 |
17 | @pytest.mark.parametrize(
18 | ("actual", "expected", "error"),
19 | [
20 | (["a"], ["a", "b"], r"1 columns in the schema are missing.*'b'"),
21 | (["c"], ["a", "b"], r"2 columns in the schema are missing.*'a'.*'b'"),
22 | ],
23 | )
24 | def test_failure(actual: list[str], expected: list[str], error: str) -> None:
25 | df = pl.DataFrame(schema={k: pl.Int64() for k in actual})
26 | with pytest.raises(ValidationError, match=error):
27 | validate_columns(df.lazy(), actual=df.schema.keys(), expected=expected)
28 |
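In short (sketch with illustrative data): `validate_columns` projects the frame down to the expected columns and raises a `ValidationError` naming any that are missing.

    import polars as pl
    from dataframely._validation import validate_columns

    df = pl.DataFrame({"a": [1], "b": [2]})
    # Superfluous column "b" is dropped; a missing expected column would raise.
    lf = validate_columns(df.lazy(), actual=df.schema.keys(), expected=["a"])
    assert set(lf.collect_schema().names()) == {"a"}
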
--------------------------------------------------------------------------------
/tests/core_validation/test_dtype_validation.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import re
5 |
6 | import polars as pl
7 | import polars.exceptions as plexc
8 | import pytest
9 | from polars.testing import assert_frame_equal
10 |
11 | import dataframely as dy
12 | from dataframely._validation import DtypeCasting, validate_dtypes
13 | from dataframely.columns import Column
14 | from dataframely.exc import DtypeValidationError
15 |
16 |
17 | @pytest.mark.parametrize(
18 | ("actual", "expected", "casting"),
19 | [
20 | ({"a": pl.Int64()}, {"a": dy.Int64()}, "none"),
21 | ({"a": pl.Int32()}, {"a": dy.Int64()}, "lenient"),
22 | ({"a": pl.Int32()}, {"a": dy.Int64()}, "strict"),
23 | (
24 | {"a": pl.Int32(), "b": pl.String()},
25 | {"a": dy.Int64(), "b": dy.UInt8()},
26 | "strict",
27 | ),
28 | ],
29 | )
30 | def test_success(
31 | actual: dict[str, pl.DataType],
32 | expected: dict[str, Column],
33 | casting: DtypeCasting,
34 | ) -> None:
35 | df = pl.DataFrame(schema=actual)
36 | lf = validate_dtypes(
37 | df.lazy(), actual=df.schema, expected=expected, casting=casting
38 | )
39 | schema = lf.collect_schema()
40 | for key, col in expected.items():
41 | assert col.validate_dtype(schema[key])
42 |
43 |
44 | @pytest.mark.parametrize(
45 | ("actual", "expected", "error", "fail_columns"),
46 | [
47 | (
48 | {"a": pl.Int32()},
49 | {"a": dy.Int64()},
50 | r"1 columns have an invalid dtype.*\n.*got dtype 'Int32'",
51 | {"a"},
52 | ),
53 | (
54 | {"a": pl.Int32(), "b": pl.String()},
55 | {"a": dy.Int64(), "b": dy.UInt8()},
56 | r"2 columns have an invalid dtype",
57 | {"a", "b"},
58 | ),
59 | ],
60 | )
61 | def test_failure(
62 | actual: dict[str, pl.DataType],
63 | expected: dict[str, Column],
64 | error: str,
65 | fail_columns: set[str],
66 | ) -> None:
67 | df = pl.DataFrame(schema=actual)
68 | try:
69 | validate_dtypes(df.lazy(), actual=df.schema, expected=expected, casting="none")
70 | assert False # above should raise
71 | except DtypeValidationError as exc:
72 | assert set(exc.errors.keys()) == fail_columns
73 | assert re.match(error, str(exc))
74 |
75 |
76 | def test_lenient_casting() -> None:
77 | lf = pl.LazyFrame(
78 | {"a": [1, 2, 3], "b": ["foo", "12", "1313"]},
79 | schema={"a": pl.Int64(), "b": pl.String()},
80 | )
81 | actual = validate_dtypes(
82 | lf,
83 | actual=lf.collect_schema(),
84 | expected={"a": dy.UInt8(), "b": dy.UInt8()},
85 | casting="lenient",
86 | )
87 | expected = pl.LazyFrame(
88 | {"a": [1, 2, 3], "b": [None, 12, None]},
89 | schema={"a": pl.UInt8(), "b": pl.UInt8()},
90 | )
91 | assert_frame_equal(actual, expected)
92 |
93 |
94 | def test_strict_casting() -> None:
95 | lf = pl.LazyFrame(
96 | {"a": [1, 2, 3], "b": ["foo", "12", "1313"]},
97 | schema={"a": pl.Int64(), "b": pl.String()},
98 | )
99 | lf_valid = validate_dtypes(
100 | lf,
101 | actual=lf.collect_schema(),
102 | expected={"a": dy.UInt8(), "b": dy.UInt8()},
103 | casting="strict",
104 | )
105 | with pytest.raises(plexc.InvalidOperationError):
106 | lf_valid.collect()
107 |
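The difference between the two casting modes, sketched with illustrative data: `lenient` turns uncastable values into nulls, while `strict` defers an `InvalidOperationError` to collection.

    import polars as pl
    import dataframely as dy
    from dataframely._validation import validate_dtypes

    lf = pl.LazyFrame({"a": ["12", "foo"]})
    out = validate_dtypes(
        lf, actual=lf.collect_schema(), expected={"a": dy.UInt8()}, casting="lenient"
    )
    # "foo" cannot be cast to UInt8 and becomes null instead of raising.
    assert out.collect().to_dict(as_series=False) == {"a": [12, None]}
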
--------------------------------------------------------------------------------
/tests/core_validation/test_rule_evaluation.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import polars as pl
5 | from polars.testing import assert_frame_equal
6 |
7 | from dataframely._rule import GroupRule, Rule
8 | from dataframely.testing import evaluate_rules
9 |
10 |
11 | def test_single_column_single_rule() -> None:
12 | lf = pl.LazyFrame({"a": [1, 2]})
13 | rules = {
14 | "a|min": Rule(pl.col("a") >= 2),
15 | }
16 | actual = evaluate_rules(lf, rules)
17 |
18 | expected = pl.LazyFrame({"a|min": [False, True]})
19 | assert_frame_equal(actual, expected)
20 |
21 |
22 | def test_single_column_multi_rule() -> None:
23 | lf = pl.LazyFrame({"a": [1, 2, 3]})
24 | rules = {
25 | "a|min": Rule(pl.col("a") >= 2),
26 | "a|max": Rule(pl.col("a") <= 2),
27 | }
28 | actual = evaluate_rules(lf, rules)
29 |
30 | expected = pl.LazyFrame(
31 | {"a|min": [False, True, True], "a|max": [True, True, False]}
32 | )
33 | assert_frame_equal(actual, expected)
34 |
35 |
36 | def test_multi_column_multi_rule() -> None:
37 | lf = pl.LazyFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
38 | rules = {
39 | "a|min": Rule(pl.col("a") >= 2),
40 | "a|max": Rule(pl.col("a") <= 2),
41 | "b|even": Rule(pl.col("b") % 2 == 0),
42 | }
43 | actual = evaluate_rules(lf, rules)
44 |
45 | expected = pl.LazyFrame(
46 | {
47 | "a|min": [False, True, True],
48 | "a|max": [True, True, False],
49 | "b|even": [True, False, True],
50 | }
51 | )
52 | assert_frame_equal(actual, expected)
53 |
54 |
55 | def test_cross_column_rule() -> None:
56 | lf = pl.LazyFrame({"a": [1, 1, 2, 2], "b": [1, 1, 1, 2]})
57 | rules = {"primary_key": Rule(~pl.struct("a", "b").is_duplicated())}
58 | actual = evaluate_rules(lf, rules)
59 |
60 | expected = pl.LazyFrame({"primary_key": [False, False, True, True]})
61 | assert_frame_equal(actual, expected)
62 |
63 |
64 | def test_group_rule() -> None:
65 | lf = pl.LazyFrame({"a": [1, 1, 2, 2, 3], "b": [1, 1, 1, 2, 1]})
66 | rules: dict[str, Rule] = {
67 | "unique_b": GroupRule(pl.col("b").n_unique() == 1, group_columns=["a"])
68 | }
69 | actual = evaluate_rules(lf, rules)
70 |
71 | expected = pl.LazyFrame({"unique_b": [True, True, False, False, True]})
72 | assert_frame_equal(actual, expected)
73 |
74 |
75 | def test_simple_rule_and_group_rule() -> None:
76 | lf = pl.LazyFrame({"a": [1, 1, 2, 2, 3], "b": [1, 1, 1, 2, 1]})
77 | rules: dict[str, Rule] = {
78 | "b|max": Rule(pl.col("b") <= 1),
79 | "unique_b": GroupRule(pl.col("b").n_unique() == 1, group_columns=["a"]),
80 | }
81 | actual = evaluate_rules(lf, rules)
82 |
83 | expected = pl.LazyFrame(
84 | {
85 | "b|max": [True, True, True, False, True],
86 | "unique_b": [True, True, False, False, True],
87 | }
88 | )
89 | assert_frame_equal(actual, expected, check_column_order=False)
90 |
91 |
92 | def test_multiple_group_rules() -> None:
93 | lf = pl.LazyFrame({"a": [1, 1, 2, 2, 3], "b": [1, 1, 1, 2, 1]})
94 | rules: dict[str, Rule] = {
95 | "unique_b": GroupRule(pl.col("b").n_unique() == 1, group_columns=["a"]),
96 | "sum_b": GroupRule(pl.col("b").sum() >= 2, group_columns=["a"]),
97 | "group_count": GroupRule(pl.len() >= 2, group_columns=["a", "b"]),
98 | }
99 | actual = evaluate_rules(lf, rules)
100 |
101 | expected = pl.LazyFrame(
102 | {
103 | "unique_b": [True, True, False, False, True],
104 | "sum_b": [True, True, True, True, False],
105 | "group_count": [True, True, False, False, False],
106 | }
107 | )
108 | assert_frame_equal(actual, expected)
109 |
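The semantics pinned down above, as a compact sketch: a `GroupRule` is evaluated once per group and broadcast back to every row of that group.

    import polars as pl
    from dataframely._rule import GroupRule, Rule
    from dataframely.testing import evaluate_rules

    lf = pl.LazyFrame({"a": [1, 1, 2], "b": [1, 2, 3]})
    rules: dict[str, Rule] = {
        "unique_b": GroupRule(pl.col("b").n_unique() == 1, group_columns=["a"])
    }
    # Group a=1 has two distinct "b" values, so both of its rows fail.
    out = evaluate_rules(lf, rules).collect()
    assert out.get_column("unique_b").to_list() == [False, False, True]
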
--------------------------------------------------------------------------------
/tests/functional/test_concat.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import polars as pl
5 | import pytest
6 |
7 | import dataframely as dy
8 |
9 |
10 | class MySchema(dy.Schema):
11 | a = dy.Int64()
12 |
13 |
14 | class SimpleCollection(dy.Collection):
15 | first: dy.LazyFrame[MySchema]
16 | second: dy.LazyFrame[MySchema] | None
17 | third: dy.LazyFrame[MySchema] | None
18 |
19 |
20 | def test_concat() -> None:
21 | col1 = SimpleCollection.cast({"first": pl.LazyFrame({"a": [1, 2, 3]})})
22 | col2 = SimpleCollection.cast(
23 | {
24 | "first": pl.LazyFrame({"a": [4, 5, 6]}),
25 | "second": pl.LazyFrame({"a": [4, 5, 6]}),
26 | }
27 | )
28 | col3 = SimpleCollection.cast(
29 | {
30 | "first": pl.LazyFrame({"a": [7, 8, 9]}),
31 | "second": pl.LazyFrame({"a": [7, 8, 9]}),
32 | "third": pl.LazyFrame({"a": [7, 8, 9]}),
33 | }
34 | )
35 | concat = dy.concat_collection_members([col1, col2, col3])
36 | assert concat["first"].collect().get_column("a").to_list() == list(range(1, 10))
37 | assert concat["second"].collect().get_column("a").to_list() == list(range(4, 10))
38 | assert concat["third"].collect().get_column("a").to_list() == list(range(7, 10))
39 |
40 |
41 | def test_concat_empty() -> None:
42 | with pytest.raises(ValueError):
43 | dy.concat_collection_members([])
44 |
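Note on `test_concat`: optional members absent from a collection simply contribute no rows, which is why the concatenated `second` member starts at 4 (it is only present in `col2` and `col3`) and `third` starts at 7.
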
--------------------------------------------------------------------------------
/tests/functional/test_relationships.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import polars as pl
5 | import pytest
6 |
7 | import dataframely as dy
8 |
9 | # -------------------------------------- SCHEMA -------------------------------------- #
10 |
11 |
12 | class DepartmentSchema(dy.Schema):
13 | department_id = dy.Int64(primary_key=True)
14 |
15 |
16 | class ManagerSchema(dy.Schema):
17 | department_id = dy.Int64(primary_key=True)
18 | name = dy.String(nullable=False)
19 |
20 |
21 | class EmployeeSchema(dy.Schema):
22 | department_id = dy.Int64(primary_key=True)
23 | employee_number = dy.Int64(primary_key=True)
24 | name = dy.String(nullable=False)
25 |
26 |
27 | # ------------------------------------- FIXTURES ------------------------------------- #
28 |
29 |
30 | @pytest.fixture()
31 | def departments() -> dy.LazyFrame[DepartmentSchema]:
32 | return DepartmentSchema.cast(pl.LazyFrame({"department_id": [1, 2]}))
33 |
34 |
35 | @pytest.fixture()
36 | def managers() -> dy.LazyFrame[ManagerSchema]:
37 | return ManagerSchema.cast(
38 | pl.LazyFrame({"department_id": [1], "name": ["Donald Duck"]})
39 | )
40 |
41 |
42 | @pytest.fixture()
43 | def employees() -> dy.LazyFrame[EmployeeSchema]:
44 | return EmployeeSchema.cast(
45 | pl.LazyFrame(
46 | {
47 | "department_id": [2, 2, 2],
48 | "employee_number": [101, 102, 103],
49 | "name": ["Huey", "Dewey", "Louie"],
50 | }
51 | )
52 | )
53 |
54 |
55 | # ------------------------------------------------------------------------------------ #
56 | # TESTS #
57 | # ------------------------------------------------------------------------------------ #
58 |
59 |
60 | def test_one_to_one(
61 | departments: dy.LazyFrame[DepartmentSchema],
62 | managers: dy.LazyFrame[ManagerSchema],
63 | ) -> None:
64 | actual = dy.filter_relationship_one_to_one(
65 | departments, managers, on="department_id"
66 | )
67 | assert actual.select("department_id").collect().to_series().to_list() == [1]
68 |
69 |
70 | def test_one_to_at_least_one(
71 | departments: dy.LazyFrame[DepartmentSchema],
72 | employees: dy.LazyFrame[EmployeeSchema],
73 | ) -> None:
74 | actual = dy.filter_relationship_one_to_at_least_one(
75 | departments, employees, on="department_id"
76 | )
77 | assert actual.select("department_id").collect().to_series().to_list() == [2]
78 |
--------------------------------------------------------------------------------
/tests/schema/test_base.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 |
5 | import polars as pl
6 | import pytest
7 |
8 | import dataframely as dy
9 | from dataframely._rule import Rule
10 | from dataframely.exc import ImplementationError
11 | from dataframely.testing import create_schema
12 |
13 |
14 | class MySchema(dy.Schema):
15 | a = dy.Integer(primary_key=True)
16 | b = dy.String(primary_key=True)
17 | c = dy.Float64()
18 | d = dy.Any(alias="e")
19 |
20 |
21 | class MySchemaWithRule(MySchema):
22 | @dy.rule()
23 | def a_greater_than_c() -> pl.Expr:
24 | return pl.col("a") > pl.col("c")
25 |
26 |
27 | def test_column_names() -> None:
28 | assert MySchema.column_names() == ["a", "b", "c", "e"]
29 |
30 |
31 | def test_columns() -> None:
32 | columns = MySchema.columns()
33 | assert isinstance(columns["a"], dy.Integer)
34 | assert isinstance(columns["b"], dy.String)
35 | assert isinstance(columns["c"], dy.Float64)
36 | assert isinstance(columns["e"], dy.Any)
37 |
38 |
39 | def test_nullability() -> None:
40 | columns = MySchema.columns()
41 | assert not columns["a"].nullable
42 | assert not columns["b"].nullable
43 | assert columns["c"].nullable
44 | assert columns["e"].nullable
45 |
46 |
47 | def test_primary_keys() -> None:
48 | assert MySchema.primary_keys() == ["a", "b"]
49 |
50 |
51 | def test_no_rule_named_primary_key() -> None:
52 | with pytest.raises(ImplementationError):
53 | create_schema(
54 | "test",
55 | {"a": dy.String()},
56 | {"primary_key": Rule(pl.col("a").str.len_bytes() > 1)},
57 | )
58 |
59 |
60 | def test_col() -> None:
61 | assert MySchema.a.col.__dict__ == pl.col("a").__dict__
62 | assert MySchema.b.col.__dict__ == pl.col("b").__dict__
63 | assert MySchema.c.col.__dict__ == pl.col("c").__dict__
64 | assert MySchema.d.col.__dict__ == pl.col("e").__dict__
65 |
66 |
67 | def test_col_raise_if_none() -> None:
68 | class InvalidSchema(dy.Schema):
69 | a = dy.Integer()
70 |
71 | # Manually override alias to be ``None``.
72 | InvalidSchema.a.alias = None
73 | with pytest.raises(ValueError):
74 | InvalidSchema.a.col
75 |
76 |
77 | def test_col_in_polars_expression() -> None:
78 | df = (
79 | pl.DataFrame({"a": [1, 2], "b": ["a", "b"], "c": [1.0, 2.0], "e": [None, None]})
80 | .filter((MySchema.b.col == "a") & (MySchema.a.col > 0))
81 | .select(MySchema.a.col)
82 | )
83 | assert df.row(0) == (1,)
84 |
85 |
86 | def test_dunder_name() -> None:
87 | assert MySchema.__name__ == "MySchema"
88 |
89 |
90 | def test_dunder_name_with_rule() -> None:
91 | assert MySchemaWithRule.__name__ == "MySchemaWithRule"
92 |
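A sketch of the `.col` accessor exercised above (data illustrative): it resolves to `pl.col(<alias or attribute name>)`, so polars expressions can reference schema columns without repeating string literals.

    import polars as pl
    import dataframely as dy

    class Example(dy.Schema):
        a = dy.Integer()
        d = dy.Any(alias="e")

    df = pl.DataFrame({"a": [1, 2], "e": [None, None]})
    # Example.a.col is pl.col("a"); Example.d.col resolves to the alias "e".
    assert df.filter(Example.a.col > 1).get_column("a").to_list() == [2]
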
--------------------------------------------------------------------------------
/tests/schema/test_cast.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | from typing import Any
5 |
6 | import polars as pl
7 | import polars.exceptions as plexc
8 | import pytest
9 |
10 | import dataframely as dy
11 |
12 |
13 | class MySchema(dy.Schema):
14 | a = dy.Float64()
15 | b = dy.String()
16 |
17 |
18 | @pytest.mark.parametrize("df_type", [pl.DataFrame, pl.LazyFrame])
19 | @pytest.mark.parametrize(
20 | "data",
21 | [
22 | {"a": [3], "b": [1]},
23 | {"a": [1], "b": [2], "c": [3]},
24 | ],
25 | )
26 | def test_cast_valid(
27 | df_type: type[pl.DataFrame] | type[pl.LazyFrame], data: dict[str, Any]
28 | ) -> None:
29 | df = df_type(data)
30 | out = MySchema.cast(df)
31 | assert isinstance(out, df_type)
32 | assert out.lazy().collect_schema() == MySchema.polars_schema()
33 |
34 |
35 | def test_cast_invalid_schema_eager() -> None:
36 | df = pl.DataFrame({"a": [1]})
37 | with pytest.raises(plexc.ColumnNotFoundError):
38 | MySchema.cast(df)
39 |
40 |
41 | def test_cast_invalid_schema_lazy() -> None:
42 | lf = pl.LazyFrame({"a": [1]})
43 | lf = MySchema.cast(lf)
44 | with pytest.raises(plexc.ColumnNotFoundError):
45 | lf.collect()
46 |
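The eager/lazy distinction tested above, as a sketch: casting an eager frame surfaces a missing column immediately, while casting a lazy frame defers the `ColumnNotFoundError` until the plan is collected.

    import polars as pl
    import polars.exceptions as plexc
    import dataframely as dy

    class CastSchema(dy.Schema):
        a = dy.Float64()
        b = dy.String()

    lf = CastSchema.cast(pl.LazyFrame({"a": [1]}))  # missing "b": no error yet
    try:
        lf.collect()
    except plexc.ColumnNotFoundError:
        pass  # the error only materialises at collection time
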
--------------------------------------------------------------------------------
/tests/schema/test_create_empty.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import polars as pl
5 | import pytest
6 |
7 | import dataframely as dy
8 |
9 |
10 | class MySchema(dy.Schema):
11 | a = dy.Int64()
12 | b = dy.String()
13 |
14 |
15 | @pytest.mark.parametrize("with_arg", [True, False])
16 | def test_create_empty_eager(with_arg: bool) -> None:
17 | if with_arg:
18 | df = MySchema.create_empty(lazy=False)
19 | else:
20 | df = MySchema.create_empty()
21 |
22 | assert isinstance(df, pl.DataFrame)
23 | assert df.columns == ["a", "b"]
24 | assert df.dtypes == [pl.Int64, pl.String]
25 | assert len(df) == 0
26 |
27 |
28 | def test_create_empty_lazy() -> None:
29 | df = MySchema.create_empty(lazy=True)
30 | assert isinstance(df, pl.LazyFrame)
31 | assert df.collect_schema().names() == ["a", "b"]
32 | assert df.collect_schema().dtypes() == [pl.Int64, pl.String]
33 | assert len(df.collect()) == 0
34 |
--------------------------------------------------------------------------------
/tests/schema/test_create_empty_if_none.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2024-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import polars as pl
5 | import pytest
6 | from polars.testing import assert_frame_equal
7 |
8 | import dataframely as dy
9 |
10 |
11 | class MySchema(dy.Schema):
12 | a = dy.Int64()
13 | b = dy.String()
14 |
15 |
16 | @pytest.mark.parametrize("lazy_in", [True, False])
17 | @pytest.mark.parametrize("lazy_out", [True, False])
18 | def test_create_empty_if_none_non_none(lazy_in: bool, lazy_out: bool) -> None:
19 | # Arrange
20 | df_raw = MySchema.validate(pl.DataFrame({"a": [1], "b": ["foo"]}))
21 | df = df_raw.lazy() if lazy_in else df_raw
22 |
23 | # Act
24 | result = MySchema.create_empty_if_none(df, lazy=lazy_out)
25 |
26 | # Assert
27 | if lazy_out:
28 | assert isinstance(result, pl.LazyFrame)
29 | else:
30 | assert isinstance(result, pl.DataFrame)
31 | assert_frame_equal(result.lazy().collect(), df.lazy().collect())
32 |
33 |
34 | @pytest.mark.parametrize("lazy", [True, False])
35 | def test_create_empty_if_none_none(lazy: bool) -> None:
36 | # Act
37 | result = MySchema.create_empty_if_none(None, lazy=lazy)
38 |
39 | # Assert
40 | if lazy:
41 | assert isinstance(result, pl.LazyFrame)
42 | else:
43 | assert isinstance(result, pl.DataFrame)
44 | assert_frame_equal(result.lazy().collect(), MySchema.create_empty())
45 |
--------------------------------------------------------------------------------
/tests/schema/test_inheritance.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import dataframely as dy
5 |
6 |
7 | class ParentSchema(dy.Schema):
8 | a = dy.Integer()
9 |
10 |
11 | class ChildSchema(ParentSchema):
12 | b = dy.Integer()
13 |
14 |
15 | class GrandchildSchema(ChildSchema):
16 | c = dy.Integer()
17 |
18 |
19 | def test_columns() -> None:
20 | assert ParentSchema.column_names() == ["a"]
21 | assert ChildSchema.column_names() == ["a", "b"]
22 | assert GrandchildSchema.column_names() == ["a", "b", "c"]
23 |
--------------------------------------------------------------------------------
/tests/schema/test_rule_implementation.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import polars as pl
5 | import pytest
6 |
7 | import dataframely as dy
8 | from dataframely._rule import GroupRule, Rule
9 | from dataframely.exc import ImplementationError, RuleImplementationError
10 | from dataframely.testing import create_schema
11 |
12 |
13 | def test_group_rule_group_by_error() -> None:
14 | with pytest.raises(
15 | ImplementationError,
16 | match=(
17 | r"Group validation rule 'b_greater_zero' has been implemented "
18 | r"incorrectly\. It references 1 columns which are not in the schema"
19 | ),
20 | ):
21 | create_schema(
22 | "test",
23 | columns={"a": dy.Integer(), "b": dy.Integer()},
24 | rules={
25 | "b_greater_zero": GroupRule(
26 | (pl.col("b") > 0).all(), group_columns=["c"]
27 | )
28 | },
29 | )
30 |
31 |
32 | def test_rule_implementation_error() -> None:
33 | with pytest.raises(
34 | RuleImplementationError, match=r"rule 'integer_rule'.*returns dtype 'Int64'"
35 | ):
36 | create_schema(
37 | "test",
38 | columns={"a": dy.Integer()},
39 | rules={"integer_rule": Rule(pl.col("a") + 1)},
40 | )
41 |
42 |
43 | def test_group_rule_implementation_error() -> None:
44 | with pytest.raises(
45 | RuleImplementationError,
46 | match=(
47 | r"rule 'b_greater_zero'.*returns dtype 'List\(Boolean\)'.*"
48 | r"make sure to use an aggregation function"
49 | ),
50 | ):
51 | create_schema(
52 | "test",
53 | columns={"a": dy.Integer(), "b": dy.Integer()},
54 | rules={"b_greater_zero": GroupRule(pl.col("b") > 0, group_columns=["a"])},
55 | )
56 |
57 |
58 | def test_rule_column_overlap_error() -> None:
59 | with pytest.raises(
60 | ImplementationError,
61 | match=r"Rules and columns must not be named equally but found 1 overlaps",
62 | ):
63 | create_schema(
64 | "test",
65 | columns={"test": dy.Integer(alias="a")},
66 | rules={"a": Rule(pl.col("a") > 0)},
67 | )
68 |
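For contrast with the failure cases above, a sketch of well-formed rules: a plain rule must evaluate to one boolean per row, and a group rule must aggregate so it yields one boolean per group.

    import polars as pl
    import dataframely as dy

    class Balanced(dy.Schema):
        a = dy.Integer()
        b = dy.Integer()

        @dy.rule()
        def b_positive() -> pl.Expr:
            return pl.col("b") > 0  # boolean per row

        @dy.rule(group_by=["a"])
        def all_b_positive_per_a() -> pl.Expr:
            return (pl.col("b") > 0).all()  # aggregated: one boolean per group

    Balanced.validate(pl.DataFrame({"a": [1, 1], "b": [2, 3]}))
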
--------------------------------------------------------------------------------
/tests/schema/test_sample.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import numpy as np
5 | import polars as pl
6 | import pytest
7 | from polars.testing import assert_frame_equal
8 |
9 | import dataframely as dy
10 | from dataframely.random import Generator
11 |
12 |
13 | class MySimpleSchema(dy.Schema):
14 | a = dy.Int64()
15 | b = dy.String()
16 |
17 |
18 | class PrimaryKeySchema(dy.Schema):
19 | a = dy.Int64(primary_key=True)
20 | b = dy.String()
21 |
22 |
23 | class CheckSchema(dy.Schema):
24 | a = dy.UInt64()
25 | b = dy.UInt64()
26 |
27 | @dy.rule()
28 | def a_ge_b() -> pl.Expr:
29 | return pl.col("a") >= pl.col("b")
30 |
31 |
32 | class ComplexSchema(dy.Schema):
33 | a = dy.UInt8(primary_key=True)
34 | b = dy.UInt8(primary_key=True)
35 |
36 | @dy.rule()
37 | def a_greater_b() -> pl.Expr:
38 | return pl.col("a") > pl.col("b")
39 |
40 | @dy.rule(group_by=["a"])
41 | def minimum_two_per_a() -> pl.Expr:
42 | return pl.len() >= 2
43 |
44 |
45 | class LimitedComplexSchema(dy.Schema):
46 | a = dy.UInt8(primary_key=True)
47 | b = dy.UInt8(primary_key=True)
48 |
49 | @dy.rule()
50 | def a_greater_b() -> pl.Expr:
51 | return pl.col("a") > pl.col("b")
52 |
53 | @dy.rule(group_by=["a"])
54 | def minimum_two_per_a() -> pl.Expr:
55 |         # We cannot generate more than 768 rows with this rule (256 "a" values * 3)
56 | return pl.len() <= 3
57 |
58 |
59 | # --------------------------------------- TESTS -------------------------------------- #
60 |
61 |
62 | @pytest.mark.parametrize("n", [0, 1000])
63 | def test_sample_deterministic(n: int) -> None:
64 | with dy.Config(max_sampling_iterations=1):
65 | df = MySimpleSchema.sample(n)
66 | MySimpleSchema.validate(df)
67 |
68 |
69 | @pytest.mark.parametrize("schema", [PrimaryKeySchema, CheckSchema, ComplexSchema])
70 | @pytest.mark.parametrize("n", [0, 1000])
71 | def test_sample_fuzzy(schema: type[dy.Schema], n: int) -> None:
72 | df = schema.sample(n, generator=Generator(seed=42))
73 | assert len(df) == n
74 | schema.validate(df)
75 |
76 |
77 | def test_sample_fuzzy_failure() -> None:
78 | with pytest.raises(ValueError):
79 | with dy.Config(max_sampling_iterations=5):
80 | ComplexSchema.sample(1000, generator=Generator(seed=42))
81 |
82 |
83 | @pytest.mark.parametrize("n", [1, 1000])
84 | def test_sample_overrides(n: int) -> None:
85 | df = CheckSchema.sample(overrides={"b": range(n)})
86 | CheckSchema.validate(df)
87 | assert len(df) == n
88 | assert df.get_column("b").to_list() == list(range(n))
89 |
90 |
91 | def test_sample_overrides_with_removing_groups() -> None:
92 | generator = Generator()
93 | n = 333 # we cannot use something too large here or we'll never return
94 | overrides = np.random.randint(100, size=n)
95 | df = LimitedComplexSchema.sample(overrides={"b": overrides}, generator=generator)
96 | LimitedComplexSchema.validate(df)
97 | assert len(df) == n
98 | assert df.get_column("b").to_list() == list(overrides)
99 |
100 |
101 | @pytest.mark.parametrize("n", [1, 1000])
102 | def test_sample_overrides_allow_no_fuzzy(n: int) -> None:
103 | with dy.Config(max_sampling_iterations=1):
104 | df = CheckSchema.sample(n, overrides={"b": [0] * n})
105 | CheckSchema.validate(df)
106 | assert len(df) == n
107 | assert df.get_column("b").to_list() == [0] * n
108 |
109 |
110 | @pytest.mark.parametrize("n", [1, 1000])
111 | def test_sample_overrides_full(n: int) -> None:
112 | df = CheckSchema.sample(n)
113 | df_override = CheckSchema.sample(n, overrides=df.to_dict())
114 | assert_frame_equal(df, df_override)
115 |
116 |
117 | def test_sample_overrides_row_layout() -> None:
118 | df = MySimpleSchema.sample(overrides=[{"a": 1}, {"a": 2}, {"a": 3}])
119 | assert len(df) == 3
120 | assert df.get_column("a").to_list() == [1, 2, 3]
121 |
122 |
123 | def test_sample_overrides_invalid_column() -> None:
124 | with pytest.raises(ValueError, match=r"not in the schema"):
125 | MySimpleSchema.sample(overrides={"foo": []})
126 |
127 |
128 | def test_sample_overrides_invalid_length() -> None:
129 | with pytest.raises(ValueError, match=r"`num_rows` is different"):
130 | MySimpleSchema.sample(3, overrides={"a": [1, 2]})
131 |
132 |
133 | def test_sample_no_overrides_no_num_rows() -> None:
134 | # This case infers `num_rows == 1`
135 | df = MySimpleSchema.sample()
136 | MySimpleSchema.validate(df)
137 | assert len(df) == 1
138 |
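A sketch tying the sampling behaviours above together (schema illustrative): sampling retries until every rule holds, bounded by `max_sampling_iterations`, and `overrides` pin chosen columns while the remaining ones are resampled to satisfy the rules.

    import polars as pl
    import dataframely as dy
    from dataframely.random import Generator

    class PairSchema(dy.Schema):
        a = dy.UInt64()
        b = dy.UInt64()

        @dy.rule()
        def a_ge_b() -> pl.Expr:
            return pl.col("a") >= pl.col("b")

    # "b" is fixed; "a" is sampled such that a >= b holds on every row.
    df = PairSchema.sample(
        5, overrides={"b": [0, 1, 2, 3, 4]}, generator=Generator(seed=42)
    )
    assert df.get_column("b").to_list() == [0, 1, 2, 3, 4]
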
--------------------------------------------------------------------------------
/tests/schema/test_validate.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import polars as pl
5 | import pytest
6 | from polars.testing import assert_frame_equal
7 |
8 | import dataframely as dy
9 | from dataframely.exc import DtypeValidationError, RuleValidationError, ValidationError
10 |
11 |
12 | class MySchema(dy.Schema):
13 | a = dy.Int64(primary_key=True)
14 | b = dy.String(nullable=False, max_length=5)
15 | c = dy.String()
16 |
17 |
18 | class MyComplexSchema(dy.Schema):
19 | a = dy.Int64()
20 | b = dy.Int64()
21 |
22 | @dy.rule()
23 | def b_greater_a() -> pl.Expr:
24 | return pl.col("b") > pl.col("a")
25 |
26 | @dy.rule(group_by=["a"])
27 | def b_unique_within_a() -> pl.Expr:
28 | return pl.col("b").n_unique() == 1
29 |
30 |
31 | # -------------------------------------- COLUMNS ------------------------------------- #
32 |
33 |
34 | @pytest.mark.parametrize("df_type", [pl.DataFrame, pl.LazyFrame])
35 | def test_missing_columns(df_type: type[pl.DataFrame] | type[pl.LazyFrame]) -> None:
36 | df = df_type({"a": [1], "b": [""]})
37 | with pytest.raises(ValidationError):
38 | MySchema.validate(df)
39 | assert not MySchema.is_valid(df)
40 |
41 |
42 | # -------------------------------------- DTYPES -------------------------------------- #
43 |
44 |
45 | @pytest.mark.parametrize("df_type", [pl.DataFrame, pl.LazyFrame])
46 | def test_invalid_dtype(df_type: type[pl.DataFrame] | type[pl.LazyFrame]) -> None:
47 | df = df_type({"a": [1], "b": [1], "c": [1]})
48 | try:
49 | MySchema.validate(df)
50 | assert False # above should raise
51 | except DtypeValidationError as exc:
52 | assert len(exc.errors) == 2
53 | assert not MySchema.is_valid(df)
54 |
55 |
56 | @pytest.mark.parametrize("df_type", [pl.DataFrame, pl.LazyFrame])
57 | def test_invalid_dtype_cast(df_type: type[pl.DataFrame] | type[pl.LazyFrame]) -> None:
58 | df = df_type({"a": [1], "b": [1], "c": [1]})
59 | actual = MySchema.validate(df, cast=True)
60 | expected = pl.DataFrame({"a": [1], "b": ["1"], "c": ["1"]})
61 | assert_frame_equal(actual, expected)
62 | assert MySchema.is_valid(df, cast=True)
63 |
64 |
65 | # --------------------------------------- RULES -------------------------------------- #
66 |
67 |
68 | @pytest.mark.parametrize("df_type", [pl.DataFrame, pl.LazyFrame])
69 | def test_invalid_column_contents(
70 | df_type: type[pl.DataFrame] | type[pl.LazyFrame],
71 | ) -> None:
72 | df = df_type({"a": [1, 2, 3], "b": ["x", "longtext", None], "c": ["1", None, "3"]})
73 | try:
74 | MySchema.validate(df)
75 | assert False # above should raise
76 | except RuleValidationError as exc:
77 | assert len(exc.schema_errors) == 0
78 | assert exc.column_errors == {"b": {"nullability": 1, "max_length": 1}}
79 | assert not MySchema.is_valid(df)
80 |
81 |
82 | @pytest.mark.parametrize("df_type", [pl.DataFrame, pl.LazyFrame])
83 | def test_invalid_primary_key(df_type: type[pl.DataFrame] | type[pl.LazyFrame]) -> None:
84 | df = df_type({"a": [1, 1], "b": ["x", "y"], "c": ["1", "2"]})
85 | try:
86 | MySchema.validate(df)
87 | assert False # above should raise
88 | except RuleValidationError as exc:
89 | assert exc.schema_errors == {"primary_key": 2}
90 | assert len(exc.column_errors) == 0
91 | assert not MySchema.is_valid(df)
92 |
93 |
94 | @pytest.mark.parametrize("df_type", [pl.DataFrame, pl.LazyFrame])
95 | def test_violated_custom_rule(df_type: type[pl.DataFrame] | type[pl.LazyFrame]) -> None:
96 | df = df_type({"a": [1, 1, 2, 3, 3], "b": [2, 2, 2, 4, 5]})
97 | try:
98 | MyComplexSchema.validate(df)
99 | assert False # above should raise
100 | except RuleValidationError as exc:
101 | assert exc.schema_errors == {"b_greater_a": 1, "b_unique_within_a": 2}
102 | assert len(exc.column_errors) == 0
103 | assert not MyComplexSchema.is_valid(df)
104 |
105 |
106 | @pytest.mark.parametrize("df_type", [pl.DataFrame, pl.LazyFrame])
107 | def test_success_multi_row_strip_cast(
108 | df_type: type[pl.DataFrame] | type[pl.LazyFrame],
109 | ) -> None:
110 | df = df_type(
111 | {"a": [1, 2, 3], "b": ["x", "y", "z"], "c": [1, None, None], "d": [1, 2, 3]}
112 | )
113 | actual = MySchema.validate(df, cast=True)
114 | expected = pl.DataFrame(
115 | {"a": [1, 2, 3], "b": ["x", "y", "z"], "c": ["1", None, None]}
116 | )
117 | assert_frame_equal(actual, expected)
118 | assert MySchema.is_valid(df, cast=True)
119 |
120 |
121 | @pytest.mark.parametrize("df_type", [pl.DataFrame, pl.LazyFrame])
122 | def test_group_rule_on_nulls(df_type: type[pl.DataFrame] | type[pl.LazyFrame]) -> None:
123 | # The schema is violated because we have multiple "b" values for the same "a" value
124 | df = df_type({"a": [None, None], "b": [1, 2]})
125 | with pytest.raises(RuleValidationError):
126 | MyComplexSchema.validate(df, cast=True)
127 | assert not MyComplexSchema.is_valid(df, cast=True)
128 |
--------------------------------------------------------------------------------
/tests/test_compat.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import pytest
5 |
6 | from dataframely._compat import _DummyModule
7 |
8 |
9 | def test_dummy_module() -> None:
10 | module = "sqlalchemy"
11 | dm = _DummyModule(module=module)
12 | assert dm.module == module
13 | with pytest.raises(ValueError):
14 | getattr(dm, "foo")
15 |
--------------------------------------------------------------------------------
/tests/test_config.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import dataframely as dy
5 |
6 |
7 | def test_config_global() -> None:
8 | dy.Config.set_max_sampling_iterations(50)
9 | assert dy.Config.options["max_sampling_iterations"] == 50
10 | dy.Config.restore_defaults()
11 |
12 |
13 | def test_config_local() -> None:
14 | try:
15 | with dy.Config(max_sampling_iterations=35):
16 | assert dy.Config.options["max_sampling_iterations"] == 35
17 | assert dy.Config.options["max_sampling_iterations"] == 10_000
18 | finally:
19 | dy.Config.restore_defaults()
20 |
21 |
22 | def test_config_local_nested() -> None:
23 | try:
24 | with dy.Config(max_sampling_iterations=35):
25 | assert dy.Config.options["max_sampling_iterations"] == 35
26 | with dy.Config(max_sampling_iterations=20):
27 | assert dy.Config.options["max_sampling_iterations"] == 20
28 | assert dy.Config.options["max_sampling_iterations"] == 35
29 | assert dy.Config.options["max_sampling_iterations"] == 10_000
30 | finally:
31 | dy.Config.restore_defaults()
32 |
33 |
34 | def test_config_global_local() -> None:
35 | try:
36 | dy.Config.set_max_sampling_iterations(50)
37 | assert dy.Config.options["max_sampling_iterations"] == 50
38 | with dy.Config(max_sampling_iterations=35):
39 | assert dy.Config.options["max_sampling_iterations"] == 35
40 | assert dy.Config.options["max_sampling_iterations"] == 50
41 | finally:
42 | dy.Config.restore_defaults()
43 |
--------------------------------------------------------------------------------
/tests/test_deprecation.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import warnings
5 |
6 | import pytest
7 |
8 | import dataframely as dy
9 |
10 |
11 | def test_column_constructor_warns_about_nullable(
12 | monkeypatch: pytest.MonkeyPatch,
13 | ) -> None:
14 | monkeypatch.setenv("DATAFRAMELY_NO_FUTURE_WARNINGS", "")
15 | with pytest.warns(
16 | FutureWarning, match="The 'nullable' argument was not explicitly set"
17 | ):
18 | dy.Integer()
19 |
20 |
21 | @pytest.mark.parametrize("env_var", ["1", "True", "true"])
22 | def test_future_warning_skip(monkeypatch: pytest.MonkeyPatch, env_var: str) -> None:
23 | monkeypatch.setenv("DATAFRAMELY_NO_FUTURE_WARNINGS", env_var)
24 |
25 | # Elevates FutureWarning to an exception
26 | with warnings.catch_warnings():
27 | warnings.simplefilter("error", FutureWarning)
28 | dy.Integer()
29 |
--------------------------------------------------------------------------------
/tests/test_exc.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import polars as pl
5 |
6 | from dataframely.exc import DtypeValidationError, RuleValidationError, ValidationError
7 |
8 |
9 | def test_validation_error_str() -> None:
10 | message = "validation failed"
11 | exc = ValidationError(message)
12 | assert str(exc) == message
13 |
14 |
15 | def test_dtype_validation_error_str() -> None:
16 | exc = DtypeValidationError(
17 | errors={"a": (pl.Int64, pl.String), "b": (pl.Boolean, pl.String)}
18 | )
19 | assert str(exc).split("\n") == [
20 | "2 columns have an invalid dtype:",
21 | " - 'a': got dtype 'Int64' but expected 'String'",
22 | " - 'b': got dtype 'Boolean' but expected 'String'",
23 | ]
24 |
25 |
26 | def test_rule_validation_error_str() -> None:
27 | exc = RuleValidationError(
28 | {
29 | "b|max_length": 1500,
30 | "a|nullability": 2,
31 | "primary_key": 2000,
32 | "a|min_length": 5,
33 | },
34 | )
35 | assert str(exc).split("\n") == [
36 | "4 rules failed validation:",
37 | " - 'primary_key' failed validation for 2,000 rows",
38 | " * Column 'a' failed validation for 2 rules:",
39 | " - 'min_length' failed for 5 rows",
40 | " - 'nullability' failed for 2 rows",
41 | " * Column 'b' failed validation for 1 rules:",
42 | " - 'max_length' failed for 1,500 rows",
43 | ]
44 |
--------------------------------------------------------------------------------
/tests/test_extre.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | import math
5 | import re
6 |
7 | import numpy as np
8 | import pytest
9 |
10 | import dataframely._extre as extre
11 |
12 | # ------------------------------------ MATCHING STRING LENGTH ----------------------------------- #
13 |
14 |
15 | @pytest.mark.parametrize(
16 | ("regex", "expected_lower", "expected_upper"),
17 | [
18 | (r"abc", 3, 3),
19 | (r".*", 0, None),
20 | (r"[a-z]{3,5}", 3, 5),
21 | (r"[0-9]{2}[0-9a-zA-Z]{2,4}", 4, 6),
22 | (r"^[0-9]{2}[0-9a-zA-Z]{2,4}$", 4, 6),
23 | (r"^[0-9]{2}[0-9a-zA-Z]{2,4}.+$", 5, None),
24 | ],
25 | )
26 | def test_matching_string_length(
27 | regex: str, expected_lower: int, expected_upper: int | None
28 | ) -> None:
29 | actual_lower, actual_upper = extre.matching_string_length(regex)
30 | assert actual_lower == expected_lower
31 | assert actual_upper == expected_upper
32 |
33 |
34 | @pytest.mark.parametrize("regex", [r"(?=[A-Za-z\d])"])
35 | def test_failing_matching_string_length(regex: str) -> None:
36 | with pytest.raises(ValueError):
37 | extre.matching_string_length(regex)
38 |
39 |
40 | # ------------------------------------------- SAMPLING ------------------------------------------ #
41 |
42 | TEST_REGEXES = [
43 | "",
44 | "a",
45 | "ab",
46 | "a|b",
47 | "[A-Z]+",
48 | "[A-Za-z0-9]?",
49 |     "([a-z]+:)?[0-9]*", r"[^@]+@[^@]+\.[^@]+",
50 | r"[a-z0-9\._%+!$&*=^|~#%'`?{}/\-]+@([a-z0-9\-]+\.){1,}([a-z]{2,16})",
51 | ]
52 |
53 |
54 | @pytest.mark.parametrize("regex", TEST_REGEXES)
55 | def test_sample_one(regex: str) -> None:
56 | sample = extre.sample(regex, max_repetitions=10)
57 | assert re.fullmatch(regex, sample) is not None
58 |
59 |
60 | @pytest.mark.parametrize("regex", TEST_REGEXES)
61 | def test_sample_many(regex: str) -> None:
62 | samples = extre.sample(regex, n=100, max_repetitions=10)
63 | assert all(re.fullmatch(regex, s) is not None for s in samples)
64 |
65 |
66 | def test_sample_equal_alternation_probabilities() -> None:
67 | n = 100_000
68 | samples = extre.sample("a|b|c", n=n)
69 |     assert np.allclose(np.unique_counts(samples).counts / n, np.ones(3) / 3, atol=0.01)
70 |
71 |
72 | def test_sample_max_repetitions() -> None:
73 | samples = extre.sample(".*", n=100_000, max_repetitions=10)
74 | assert max(len(s) for s in samples) == 10
75 | assert math.isclose(np.mean([len(s) for s in samples]), 5, abs_tol=0.05)
76 |
77 |
78 | def test_sample_equal_class_probabilities() -> None:
79 | n = 1_000_000
80 | samples = extre.sample("[a-z0-9]", n=n)
81 |     assert np.allclose(np.unique_counts(samples).counts / n, np.ones(36) / 36, atol=0.001)
82 |
83 |
84 | def test_sample_one_seed() -> None:
85 | choices = [extre.sample("a|b", seed=42) for _ in range(10_000)]
86 | assert len(set(choices)) == 1
87 |
88 |
89 | def test_sample_many_seed() -> None:
90 | choices = extre.sample("a|b", n=10_000, seed=42)
91 | assert len(set(choices)) == 2
92 |
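The two `_extre` entry points used above, in one sketch: `matching_string_length` bounds the length of strings a regex can match (upper bound `None` when unbounded), and `sample` draws strings that match the pattern.

    import re

    import dataframely._extre as extre

    lower, upper = extre.matching_string_length(r"[a-z]{3,5}")
    assert (lower, upper) == (3, 5)
    # Seeded sampling is deterministic; every sample fully matches the regex.
    samples = extre.sample(r"[A-Z]+", n=3, max_repetitions=10, seed=42)
    assert all(re.fullmatch(r"[A-Z]+", s) is not None for s in samples)
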
--------------------------------------------------------------------------------
/tests/test_failure_info.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) QuantCo 2025-2025
2 | # SPDX-License-Identifier: BSD-3-Clause
3 |
4 | from pathlib import Path
5 |
6 | import polars as pl
7 | from polars.testing import assert_frame_equal
8 |
9 | import dataframely as dy
10 |
11 |
12 | class MySchema(dy.Schema):
13 | a = dy.Integer(primary_key=True, min=5, max=10)
14 | b = dy.Integer(nullable=False, is_in=[1, 2, 3, 5, 7, 11])
15 |
16 |
17 | def test_read_write_parquet(tmp_path: Path) -> None:
18 | df = pl.DataFrame(
19 | {
20 | "a": [4, 5, 6, 6, 7, 8],
21 | "b": [1, 2, 3, 4, 5, 6],
22 | }
23 | )
24 | _, failure = MySchema.filter(df)
25 | assert failure._df.height == 4
26 | failure.write_parquet(tmp_path / "failure.parquet")
27 |
28 | read: dy.FailureInfo[MySchema] = dy.FailureInfo.scan_parquet(
29 | tmp_path / "failure.parquet"
30 | )
31 | assert_frame_equal(failure._lf, read._lf)
32 | assert failure._rule_columns == read._rule_columns
33 | assert failure.schema == read.schema == MySchema
34 |
--------------------------------------------------------------------------------