├── .asf.yaml
├── .codespellrc
├── .github
├── ISSUE_TEMPLATE
│ ├── iceberg_bug_report.yml
│ ├── iceberg_improvement.yml
│ └── iceberg_question.yml
├── dependabot.yml
├── pull_request_template.md
└── workflows
│ ├── check-md-link.yml
│ ├── license_check.yml
│ ├── nightly-pypi-build.yml
│ ├── pypi-build-artifacts.yml
│ ├── python-ci-docs.yml
│ ├── python-ci.yml
│ ├── python-release-docs.yml
│ ├── python-release.yml
│ ├── stale.yml
│ └── svn-build-artifacts.yml
├── .gitignore
├── .markdownlint.yaml
├── .pre-commit-config.yaml
├── LICENSE
├── MANIFEST.in
├── Makefile
├── NOTICE
├── README.md
├── build-module.py
├── dev
├── .rat-excludes
├── Dockerfile
├── check-license
├── docker-compose-azurite.yml
├── docker-compose-gcs-server.yml
├── docker-compose-integration.yml
├── docker-compose.yml
├── entrypoint.sh
├── hive
│ ├── Dockerfile
│ └── core-site.xml
├── provision.py
├── run-azurite.sh
├── run-gcs-server.sh
├── run-minio.sh
└── spark-defaults.conf
├── mkdocs
├── README.md
├── docs
│ ├── SUMMARY.md
│ ├── api.md
│ ├── assets
│ │ └── images
│ │ │ ├── gen-release-notes.jpg
│ │ │ └── iceberg-logo-icon.png
│ ├── cli.md
│ ├── community.md
│ ├── configuration.md
│ ├── contributing.md
│ ├── how-to-release.md
│ ├── index.md
│ ├── nightly-build.md
│ └── verify-release.md
├── gen_doc_stubs.py
└── mkdocs.yml
├── poetry.lock
├── pyiceberg
├── __init__.py
├── avro
│ ├── __init__.py
│ ├── codecs
│ │ ├── __init__.py
│ │ ├── bzip2.py
│ │ ├── codec.py
│ │ ├── deflate.py
│ │ ├── snappy_codec.py
│ │ └── zstandard_codec.py
│ ├── decoder.py
│ ├── decoder_basic.c
│ ├── decoder_fast.pyi
│ ├── decoder_fast.pyx
│ ├── encoder.py
│ ├── file.py
│ ├── reader.py
│ ├── resolver.py
│ └── writer.py
├── catalog
│ ├── __init__.py
│ ├── dynamodb.py
│ ├── glue.py
│ ├── hive.py
│ ├── memory.py
│ ├── noop.py
│ ├── rest
│ │ ├── __init__.py
│ │ ├── auth.py
│ │ └── response.py
│ └── sql.py
├── cli
│ ├── __init__.py
│ ├── console.py
│ └── output.py
├── conversions.py
├── exceptions.py
├── expressions
│ ├── __init__.py
│ ├── literals.py
│ ├── parser.py
│ └── visitors.py
├── io
│ ├── __init__.py
│ ├── fsspec.py
│ └── pyarrow.py
├── manifest.py
├── partitioning.py
├── py.typed
├── schema.py
├── serializers.py
├── table
│ ├── __init__.py
│ ├── inspect.py
│ ├── locations.py
│ ├── metadata.py
│ ├── name_mapping.py
│ ├── puffin.py
│ ├── refs.py
│ ├── snapshots.py
│ ├── sorting.py
│ ├── statistics.py
│ ├── update
│ │ ├── __init__.py
│ │ ├── schema.py
│ │ ├── snapshot.py
│ │ ├── spec.py
│ │ ├── statistics.py
│ │ └── validate.py
│ └── upsert_util.py
├── transforms.py
├── typedef.py
├── types.py
└── utils
│ ├── __init__.py
│ ├── bin_packing.py
│ ├── concurrent.py
│ ├── config.py
│ ├── datetime.py
│ ├── decimal.py
│ ├── deprecated.py
│ ├── lazydict.py
│ ├── parsing.py
│ ├── properties.py
│ ├── schema_conversion.py
│ ├── singleton.py
│ └── truncate.py
├── pyproject.toml
├── ruff.toml
├── tests
├── avro
│ ├── test_decoder.py
│ ├── test_encoder.py
│ ├── test_file.py
│ ├── test_reader.py
│ ├── test_resolver.py
│ └── test_writer.py
├── benchmark
│ └── test_benchmark.py
├── catalog
│ ├── integration_test_dynamodb.py
│ ├── integration_test_glue.py
│ ├── test_base.py
│ ├── test_dynamodb.py
│ ├── test_glue.py
│ ├── test_hive.py
│ ├── test_rest.py
│ ├── test_rest_auth.py
│ └── test_sql.py
├── cli
│ ├── test_console.py
│ └── test_output.py
├── conftest.py
├── expressions
│ ├── test_evaluator.py
│ ├── test_expressions.py
│ ├── test_literals.py
│ ├── test_parser.py
│ ├── test_projection.py
│ ├── test_residual_evaluator.py
│ └── test_visitors.py
├── integration
│ ├── test_add_files.py
│ ├── test_delete_count.py
│ ├── test_deletes.py
│ ├── test_inspect_table.py
│ ├── test_partition_evolution.py
│ ├── test_partitioning_key.py
│ ├── test_reads.py
│ ├── test_register_table.py
│ ├── test_rest_catalog.py
│ ├── test_rest_manifest.py
│ ├── test_rest_schema.py
│ ├── test_snapshot_operations.py
│ ├── test_statistics_operations.py
│ └── test_writes
│ │ ├── test_optimistic_concurrency.py
│ │ ├── test_partitioned_writes.py
│ │ ├── test_writes.py
│ │ └── utils.py
├── io
│ ├── test_fsspec.py
│ ├── test_io.py
│ ├── test_pyarrow.py
│ ├── test_pyarrow_stats.py
│ └── test_pyarrow_visitor.py
├── table
│ ├── bitmaps
│ │ ├── 64map32bitvals.bin
│ │ ├── 64mapempty.bin
│ │ ├── 64maphighvals.bin
│ │ └── 64mapspreadvals.bin
│ ├── test_init.py
│ ├── test_locations.py
│ ├── test_metadata.py
│ ├── test_name_mapping.py
│ ├── test_partitioning.py
│ ├── test_puffin.py
│ ├── test_refs.py
│ ├── test_snapshots.py
│ ├── test_sorting.py
│ ├── test_upsert.py
│ └── test_validate.py
├── test_conversions.py
├── test_schema.py
├── test_serializers.py
├── test_transforms.py
├── test_typedef.py
├── test_types.py
├── test_version.py
└── utils
│ ├── test_bin_packing.py
│ ├── test_concurrent.py
│ ├── test_config.py
│ ├── test_datetime.py
│ ├── test_decimal.py
│ ├── test_deprecated.py
│ ├── test_lazydict.py
│ ├── test_manifest.py
│ ├── test_properties.py
│ ├── test_schema_conversion.py
│ ├── test_singleton.py
│ └── test_truncate.py
└── vendor
├── README.md
├── fb303
├── FacebookService.py
├── __init__.py
├── constants.py
└── ttypes.py
└── hive_metastore
├── ThriftHiveMetastore.py
├── __init__.py
├── constants.py
└── ttypes.py
/.asf.yaml:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one
3 | # or more contributor license agreements. See the NOTICE file
4 | # distributed with this work for additional information
5 | # regarding copyright ownership. The ASF licenses this file
6 | # to you under the Apache License, Version 2.0 (the
7 | # "License"); you may not use this file except in compliance
8 | # with the License. You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing,
13 | # software distributed under the License is distributed on an
14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | # KIND, either express or implied. See the License for the
16 | # specific language governing permissions and limitations
17 | # under the License.
18 | #
19 |
20 | # The format of this file is documented at
21 | # https://cwiki.apache.org/confluence/display/INFRA/Git+-+.asf.yaml+features
22 |
23 | github:
24 | description: "Apache PyIceberg"
25 | homepage: https://py.iceberg.apache.org/
26 | labels:
27 | - iceberg
28 | - apache
29 | - hacktoberfest
30 | - pyiceberg
31 | enabled_merge_buttons:
32 | merge: false
33 | squash: true
34 | rebase: true
35 | protected_branches:
36 | main:
37 | required_status_checks:
38 | # strict means "Require branches to be up to date before merging".
39 | strict: true
40 |
41 | required_pull_request_reviews:
42 | required_approving_review_count: 1
43 |
44 | required_linear_history: true
45 | del_branch_on_merge: true
46 | features:
47 | wiki: true
48 | issues: true
49 | projects: true
50 | collaborators: # Note: the number of collaborators is limited to 10
51 | - ajantha-bhat
52 | - ndrluis
53 | ghp_branch: gh-pages
54 | ghp_path: /
55 |
56 | notifications:
57 | commits: commits@iceberg.apache.org
58 | issues: issues@iceberg.apache.org
59 | pullrequests: issues@iceberg.apache.org
60 | jira_options: link label
61 |
--------------------------------------------------------------------------------
/.codespellrc:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | [codespell]
18 | ignore-words-list = BoundIn,fo,MoR,NotIn,notIn,oT
19 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/iceberg_bug_report.yml:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one
3 | # or more contributor license agreements. See the NOTICE file
4 | # distributed with this work for additional information
5 | # regarding copyright ownership. The ASF licenses this file
6 | # to you under the Apache License, Version 2.0 (the
7 | # "License"); you may not use this file except in compliance
8 | # with the License. You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing,
13 | # software distributed under the License is distributed on an
14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | # KIND, either express or implied. See the License for the
16 | # specific language governing permissions and limitations
17 | # under the License.
18 | #
19 |
20 | ---
21 | name: Iceberg Bug report 🐞
22 | description: Problems, bugs and issues with Apache Iceberg
23 | labels: ["kind:bug"]
24 | body:
25 | - type: dropdown
26 | attributes:
27 | label: Apache Iceberg version
28 | description: What Apache Iceberg version are you using?
29 | multiple: false
30 | options:
31 | - "0.9.1 (latest release)"
32 | - "0.9.0"
33 | - "0.8.1"
34 | - "0.8.0"
35 | - "0.7.1"
36 | - "0.7.0"
37 | - "0.6.1"
38 | - "0.6.0"
39 | - "0.5.0"
40 | - "0.4.0"
41 | - "0.3.0"
42 | - "0.2.0"
43 | - "0.1.0"
44 | - "main (development)"
45 | validations:
46 | required: false
47 | - type: textarea
48 | attributes:
49 | label: Please describe the bug 🐞
50 | description: >
51 | Please describe the problem, what to expect, and how to reproduce.
52 | Feel free to include stacktraces and the Iceberg catalog configuration.
53 | You can include files by dragging and dropping them here.
54 | validations:
55 | required: true
56 | - type: checkboxes
57 | attributes:
58 | label: Willingness to contribute
59 | description: The Apache Iceberg community encourages bug-fix contributions. Would you or another member of your organization be willing to contribute a fix for this bug to the PyIceberg codebase?
60 | options:
61 | - label: I can contribute a fix for this bug independently
62 | - label: I would be willing to contribute a fix for this bug with guidance from the Iceberg community
63 | - label: I cannot contribute a fix for this bug at this time
64 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/iceberg_improvement.yml:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one
3 | # or more contributor license agreements. See the NOTICE file
4 | # distributed with this work for additional information
5 | # regarding copyright ownership. The ASF licenses this file
6 | # to you under the Apache License, Version 2.0 (the
7 | # "License"); you may not use this file except in compliance
8 | # with the License. You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing,
13 | # software distributed under the License is distributed on an
14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | # KIND, either express or implied. See the License for the
16 | # specific language governing permissions and limitations
17 | # under the License.
18 | #
19 |
20 | ---
21 | name: Iceberg Improvement / Feature Request
22 | description: New features with Apache Iceberg
23 | labels: ["kind:feature request"]
24 | body:
25 | - type: textarea
26 | attributes:
27 | label: Feature Request / Improvement
28 | description: Please describe the feature and elaborate on the use case and motivation behind it
29 | validations:
30 | required: true
31 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/iceberg_question.yml:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one
3 | # or more contributor license agreements. See the NOTICE file
4 | # distributed with this work for additional information
5 | # regarding copyright ownership. The ASF licenses this file
6 | # to you under the Apache License, Version 2.0 (the
7 | # "License"); you may not use this file except in compliance
8 | # with the License. You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing,
13 | # software distributed under the License is distributed on an
14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | # KIND, either express or implied. See the License for the
16 | # specific language governing permissions and limitations
17 | # under the License.
18 | #
19 |
20 | ---
21 | name: Iceberg Question
22 | description: Questions around Apache Iceberg
23 | labels: ["kind:question"]
24 | body:
25 | - type: textarea
26 | attributes:
27 | label: Question
28 | description: What is your question?
29 | validations:
30 | required: true
31 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one
3 | # or more contributor license agreements. See the NOTICE file
4 | # distributed with this work for additional information
5 | # regarding copyright ownership. The ASF licenses this file
6 | # to you under the Apache License, Version 2.0 (the
7 | # "License"); you may not use this file except in compliance
8 | # with the License. You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing,
13 | # software distributed under the License is distributed on an
14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | # KIND, either express or implied. See the License for the
16 | # specific language governing permissions and limitations
17 | # under the License.
18 | #
19 |
20 | version: 2
21 | updates:
22 | - package-ecosystem: "pip"
23 | directory: "/"
24 | schedule:
25 | interval: "weekly"
26 | open-pull-requests-limit: 50
27 | - package-ecosystem: "github-actions"
28 | directory: "/"
29 | schedule:
30 | interval: "weekly"
31 |
--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 |
4 |
5 |
6 |
7 |
8 | # Rationale for this change
9 |
10 | # Are these changes tested?
11 |
12 | # Are there any user-facing changes?
13 |
14 |
15 |
--------------------------------------------------------------------------------
/.github/workflows/check-md-link.yml:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one
3 | # or more contributor license agreements. See the NOTICE file
4 | # distributed with this work for additional information
5 | # regarding copyright ownership. The ASF licenses this file
6 | # to you under the Apache License, Version 2.0 (the
7 | # "License"); you may not use this file except in compliance
8 | # with the License. You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing,
13 | # software distributed under the License is distributed on an
14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | # KIND, either express or implied. See the License for the
16 | # specific language governing permissions and limitations
17 | # under the License.
18 | #
19 |
20 | name: Check Markdown links
21 |
22 | on:
23 | push:
24 | paths:
25 | - '.github/workflows/check-md-link.yml'
26 | - 'mkdocs/**'
27 | branches:
28 | - 'main'
29 | pull_request:
30 | paths:
31 | - '.github/workflows/check-md-link.yml'
32 | - 'mkdocs/**'
33 |
34 | jobs:
35 | markdown-link-check:
36 | runs-on: ubuntu-latest
37 | steps:
38 | - uses: actions/checkout@master
39 | - uses: gaurav-nelson/github-action-markdown-link-check@v1
40 |
--------------------------------------------------------------------------------
/.github/workflows/license_check.yml:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one
3 | # or more contributor license agreements. See the NOTICE file
4 | # distributed with this work for additional information
5 | # regarding copyright ownership. The ASF licenses this file
6 | # to you under the Apache License, Version 2.0 (the
7 | # "License"); you may not use this file except in compliance
8 | # with the License. You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing,
13 | # software distributed under the License is distributed on an
14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | # KIND, either express or implied. See the License for the
16 | # specific language governing permissions and limitations
17 | # under the License.
18 | #
19 |
20 | name: "Run License Check"
21 | on: pull_request
22 |
23 | jobs:
24 | rat:
25 | runs-on: ubuntu-22.04
26 | steps:
27 | - uses: actions/checkout@v4
28 | - run: dev/check-license
29 |
--------------------------------------------------------------------------------
/.github/workflows/nightly-pypi-build.yml:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one
3 | # or more contributor license agreements. See the NOTICE file
4 | # distributed with this work for additional information
5 | # regarding copyright ownership. The ASF licenses this file
6 | # to you under the Apache License, Version 2.0 (the
7 | # "License"); you may not use this file except in compliance
8 | # with the License. You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing,
13 | # software distributed under the License is distributed on an
14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | # KIND, either express or implied. See the License for the
16 | # specific language governing permissions and limitations
17 | # under the License.
18 | #
19 |
20 | name: "Nightly PyPI Build"
21 |
22 | on:
23 | schedule:
24 | - cron: "0 0 * * *" # Runs at midnight UTC every day
25 | workflow_dispatch: # Allows manual triggering
26 |
27 | jobs:
28 | set-version:
29 | if: github.repository == 'apache/iceberg-python' # Only run for apache repo
30 | runs-on: ubuntu-latest
31 | outputs:
32 | VERSION: ${{ steps.set-version.outputs.VERSION }}
33 | steps:
34 | - uses: actions/checkout@v4
35 | with:
36 | fetch-depth: 1
37 |
38 | - uses: actions/setup-python@v5
39 | with:
40 | python-version: 3.12
41 |
42 | - name: Install Poetry
43 | run: make install-poetry
44 |
45 | - name: Set version
46 | id: set-version
47 | run: |
48 | CURRENT_VERSION=$(poetry version --short)
49 | TIMESTAMP=$(date +%Y%m%d%H%M%S)
50 | echo "VERSION=${CURRENT_VERSION}.dev${TIMESTAMP}" >> "$GITHUB_OUTPUT"
51 |
52 | - name: Debug version
53 | run: echo "Publishing version ${{ steps.set-version.outputs.VERSION }}"
54 |
55 | nightly-build:
56 | needs: set-version
57 | uses: ./.github/workflows/pypi-build-artifacts.yml
58 | with:
59 | version: ${{ needs.set-version.outputs.VERSION }}
60 | testpypi-publish:
61 | name: Publish to TestPyPI
62 | needs:
63 | - nightly-build
64 | runs-on: ubuntu-latest
65 | environment:
66 | name: testpypi
67 | url: https://test.pypi.org/p/pyiceberg
68 |
69 | permissions:
70 | id-token: write # IMPORTANT: mandatory for trusted publishing
71 |
72 | steps:
73 | - name: Download all the artifacts
74 | uses: actions/download-artifact@v4
75 | with:
76 | merge-multiple: true
77 | path: dist/
78 | - name: List downloaded artifacts
79 | run: ls -R dist/
80 | - name: Publish to TestPyPI
81 | uses: pypa/gh-action-pypi-publish@release/v1
82 | with:
83 | repository-url: https://test.pypi.org/legacy/
84 | skip-existing: true
85 | verbose: true
86 |
--------------------------------------------------------------------------------
/.github/workflows/pypi-build-artifacts.yml:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one
3 | # or more contributor license agreements. See the NOTICE file
4 | # distributed with this work for additional information
5 | # regarding copyright ownership. The ASF licenses this file
6 | # to you under the Apache License, Version 2.0 (the
7 | # "License"); you may not use this file except in compliance
8 | # with the License. You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing,
13 | # software distributed under the License is distributed on an
14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | # KIND, either express or implied. See the License for the
16 | # specific language governing permissions and limitations
17 | # under the License.
18 | #
19 |
20 | name: "Build PyPI Artifacts"
21 |
22 | on:
23 | workflow_call:
24 | inputs:
25 | VERSION:
26 | required: true
27 | type: string
28 |
29 | jobs:
30 | pypi-build-artifacts:
31 | name: Build artifacts for PyPi on ${{ matrix.os }}
32 | runs-on: ${{ matrix.os }}
33 | strategy:
34 | matrix:
35 | os: [ ubuntu-22.04, windows-2022, macos-13, macos-14 ]
36 |
37 | steps:
38 | - uses: actions/checkout@v4
39 | with:
40 | fetch-depth: 1
41 |
42 | - uses: actions/setup-python@v5
43 | with:
44 | python-version: |
45 | 3.9
46 | 3.10
47 | 3.11
48 | 3.12
49 |
50 | - name: Install poetry
51 | run: make install-poetry
52 |
53 | - name: Set version with RC
54 | env:
55 | VERSION: ${{ inputs.VERSION }}
56 | run: python -m poetry version "${{ env.VERSION }}"
57 |
58 | # Publish the source distribution with the version that's in
59 | # the repository, otherwise the tests will fail
60 | - name: Compile source distribution
61 | run: python3 -m poetry build --format=sdist
62 | if: startsWith(matrix.os, 'ubuntu')
63 |
64 | - name: Build wheels
65 | uses: pypa/cibuildwheel@v2.23.3
66 | with:
67 | output-dir: wheelhouse
68 | config-file: "pyproject.toml"
69 | env:
70 | # Ignore 32 bit architectures
71 | CIBW_ARCHS: "auto64"
72 | CIBW_PROJECT_REQUIRES_PYTHON: ">=3.9,<3.13"
73 | CIBW_TEST_REQUIRES: "pytest==7.4.2 moto==5.0.1"
74 | CIBW_TEST_COMMAND: "pytest {project}/tests/avro/test_decoder.py"
75 | # Ignore tests for pypy since not all dependencies are compiled for it
76 | # and would require a local rust build chain
77 | CIBW_TEST_SKIP: "pp*"
78 |
79 | - name: Add source distribution
80 | if: startsWith(matrix.os, 'ubuntu')
81 | run: ls -lah dist/* && cp dist/* wheelhouse/
82 |
83 | - uses: actions/upload-artifact@v4
84 | with:
85 | name: "pypi-release-candidate-${{ matrix.os }}"
86 | path: ./wheelhouse/*
87 |
88 | pypi-merge-artifacts:
89 | runs-on: ubuntu-latest
90 | needs:
91 | - pypi-build-artifacts
92 | steps:
93 | - name: Merge Artifacts
94 | uses: actions/upload-artifact/merge@v4
95 | with:
96 | name: "pypi-release-candidate-${{ inputs.VERSION }}"
97 | pattern: pypi-release-candidate*
98 | delete-merged: true
99 |
--------------------------------------------------------------------------------
/.github/workflows/python-ci-docs.yml:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one
3 | # or more contributor license agreements. See the NOTICE file
4 | # distributed with this work for additional information
5 | # regarding copyright ownership. The ASF licenses this file
6 | # to you under the Apache License, Version 2.0 (the
7 | # "License"); you may not use this file except in compliance
8 | # with the License. You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing,
13 | # software distributed under the License is distributed on an
14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | # KIND, either express or implied. See the License for the
16 | # specific language governing permissions and limitations
17 | # under the License.
18 | #
19 |
20 | name: "Python CI Docs"
21 |
22 | on:
23 | push:
24 | branches:
25 | - 'main'
26 | pull_request:
27 |
28 |
29 | concurrency:
30 | group: ${{ github.workflow }}-${{ github.ref }}
31 | cancel-in-progress: ${{ github.event_name == 'pull_request' }}
32 |
33 | jobs:
34 | docs:
35 | runs-on: ubuntu-22.04
36 |
37 | steps:
38 | - uses: actions/checkout@v4
39 | - name: Install poetry
40 | run: make install-poetry
41 | - uses: actions/setup-python@v5
42 | with:
43 | python-version: 3.12
44 | - name: Install
45 | run: make docs-install
46 | - name: Build docs
47 | run: make docs-build
48 |
--------------------------------------------------------------------------------
/.github/workflows/python-ci.yml:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one
3 | # or more contributor license agreements. See the NOTICE file
4 | # distributed with this work for additional information
5 | # regarding copyright ownership. The ASF licenses this file
6 | # to you under the Apache License, Version 2.0 (the
7 | # "License"); you may not use this file except in compliance
8 | # with the License. You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing,
13 | # software distributed under the License is distributed on an
14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | # KIND, either express or implied. See the License for the
16 | # specific language governing permissions and limitations
17 | # under the License.
18 | #
19 |
20 | name: "Python CI"
21 |
22 | on:
23 | push:
24 | branches:
25 | - 'main'
26 | pull_request:
27 | paths:
28 | - '**' # Include all files and directories in the repository by default.
29 | - '!.github/workflows/**' # Exclude all workflow files
30 | - '.github/workflows/python-ci.yml' # except the current file.
31 | - '!.github/ISSUE_TEMPLATE/**' # Exclude files and directories that don't impact tests or code like templates, metadata, and documentation.
32 | - '!.gitignore'
33 | - '!.asf.yaml'
34 | - '!mkdocs/**'
35 | - '!.gitattributes'
36 | - '!README.md'
37 | - '!CONTRIBUTING.md'
38 | - '!LICENSE'
39 | - '!NOTICE'
40 |
41 | concurrency:
42 | group: ${{ github.workflow }}-${{ github.ref }}
43 | cancel-in-progress: ${{ github.event_name == 'pull_request' }}
44 |
45 | jobs:
46 | lint-and-test:
47 | runs-on: ubuntu-22.04
48 | strategy:
49 | matrix:
50 | python: ['3.9', '3.10', '3.11', '3.12']
51 |
52 | steps:
53 | - uses: actions/checkout@v4
54 | - name: Install poetry
55 | run: make install-poetry
56 | - uses: actions/setup-python@v5
57 | with:
58 | python-version: ${{ matrix.python }}
59 | cache: poetry
60 | cache-dependency-path: ./poetry.lock
61 | - name: Install system dependencies
62 | run: sudo apt-get update && sudo apt-get install -y libkrb5-dev # for kerberos
63 | - name: Install
64 | run: make install-dependencies
65 | - name: Linters
66 | run: make lint
67 | - name: Tests
68 | run: make test-coverage-unit
69 |
70 | integration-test:
71 | runs-on: ubuntu-22.04
72 | strategy:
73 | matrix:
74 | python: ['3.9', '3.10', '3.11', '3.12']
75 |
76 | steps:
77 | - uses: actions/checkout@v4
78 | - name: Install system dependencies
79 | run: sudo apt-get update && sudo apt-get install -y libkrb5-dev # for kerberos
80 | - name: Install
81 | run: make install
82 | - name: Run integration tests
83 | run: make test-coverage-integration
84 | - name: Show debug logs
85 | if: ${{ failure() }}
86 | run: docker compose -f dev/docker-compose.yml logs
87 |
--------------------------------------------------------------------------------
/.github/workflows/python-release-docs.yml:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one
3 | # or more contributor license agreements. See the NOTICE file
4 | # distributed with this work for additional information
5 | # regarding copyright ownership. The ASF licenses this file
6 | # to you under the Apache License, Version 2.0 (the
7 | # "License"); you may not use this file except in compliance
8 | # with the License. You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing,
13 | # software distributed under the License is distributed on an
14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | # KIND, either express or implied. See the License for the
16 | # specific language governing permissions and limitations
17 | # under the License.
18 | #
19 |
20 | name: "Release Docs"
21 | on:
22 | workflow_dispatch:
23 |
24 | concurrency:
25 | group: ${{ github.workflow }}-${{ github.ref }}
26 | cancel-in-progress: ${{ github.event_name == 'pull_request' }}
27 |
28 | jobs:
29 | docs:
30 | runs-on: ubuntu-22.04
31 |
32 | steps:
33 | - uses: actions/checkout@v4
34 | - name: Install poetry
35 | run: make install-poetry
36 | - uses: actions/setup-python@v5
37 | with:
38 | python-version: 3.12
39 | - name: Install docs
40 | run: make docs-install
41 | - name: Build docs
42 | run: make docs-build
43 | - name: Copy
44 | working-directory: ./mkdocs
45 | run: mv ./site /tmp/site
46 | - name: Push changes to gh-pages branch
47 | run: |
48 | git checkout --orphan gh-pages-tmp
49 | git rm --quiet -rf .
50 | cp -r /tmp/site/* .
51 | git config --global user.name 'GitHub Actions'
52 | git config --global user.email 'actions@github.com'
53 | echo "py.iceberg.apache.org" > CNAME
54 | git add --all
55 | git commit -m 'Publish Python docs'
56 | git push -f origin gh-pages-tmp:gh-pages || true
57 |
--------------------------------------------------------------------------------
/.github/workflows/stale.yml:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one
3 | # or more contributor license agreements. See the NOTICE file
4 | # distributed with this work for additional information
5 | # regarding copyright ownership. The ASF licenses this file
6 | # to you under the Apache License, Version 2.0 (the
7 | # "License"); you may not use this file except in compliance
8 | # with the License. You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing,
13 | # software distributed under the License is distributed on an
14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | # KIND, either express or implied. See the License for the
16 | # specific language governing permissions and limitations
17 | # under the License.
18 | #
19 |
20 | name: "Close Stale Issues"
21 | on:
22 | schedule:
23 | - cron: '0 0 * * *'
24 |
25 | permissions:
26 | # All other permissions are set to none
27 | issues: write
28 |
29 | jobs:
30 | stale:
31 | if: github.repository_owner == 'apache'
32 | runs-on: ubuntu-22.04
33 | steps:
34 | - uses: actions/stale@v9.1.0
35 | with:
36 | stale-issue-label: 'stale'
37 | exempt-issue-labels: 'not-stale'
38 | days-before-issue-stale: 180
39 | days-before-issue-close: 14
40 | # Only close stale issues, leave PRs alone
41 | days-before-pr-stale: -1
42 | stale-issue-message: >
43 | This issue has been automatically marked as stale because it has been open for 180 days
44 | with no activity. It will be closed in the next 14 days if no further activity occurs. To
45 | permanently prevent this issue from being considered stale, add the label 'not-stale',
46 | but commenting on the issue is preferred when possible.
47 | close-issue-message: >
48 | This issue has been closed because it has not received any activity in the last 14 days
49 | since being marked as 'stale'.
50 |
--------------------------------------------------------------------------------
/.github/workflows/svn-build-artifacts.yml:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one
3 | # or more contributor license agreements. See the NOTICE file
4 | # distributed with this work for additional information
5 | # regarding copyright ownership. The ASF licenses this file
6 | # to you under the Apache License, Version 2.0 (the
7 | # "License"); you may not use this file except in compliance
8 | # with the License. You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing,
13 | # software distributed under the License is distributed on an
14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | # KIND, either express or implied. See the License for the
16 | # specific language governing permissions and limitations
17 | # under the License.
18 | #
19 |
20 | name: "Build SVN Artifacts"
21 |
22 | on:
23 | workflow_call:
24 | inputs:
25 | VERSION:
26 | required: true
27 | type: string
28 |
29 | jobs:
30 | svn-build-artifacts:
31 | name: Build artifacts for SVN on ${{ matrix.os }}
32 | runs-on: ${{ matrix.os }}
33 | strategy:
34 | matrix:
35 | os: [ ubuntu-22.04, windows-2022, macos-13, macos-14 ]
36 |
37 | steps:
38 | - uses: actions/checkout@v4
39 | with:
40 | fetch-depth: 1
41 |
42 | - uses: actions/setup-python@v5
43 | with:
44 | python-version: |
45 | 3.9
46 | 3.10
47 | 3.11
48 | 3.12
49 |
50 | - name: Install poetry
51 | run: make install-poetry
52 |
53 | # Publish the source distribution with the version that's in
54 | # the repository, otherwise the tests will fail
55 | - name: Compile source distribution
56 | run: python3 -m poetry build --format=sdist
57 | if: startsWith(matrix.os, 'ubuntu')
58 |
59 | - name: Build wheels
60 | uses: pypa/cibuildwheel@v2.23.3
61 | with:
62 | output-dir: wheelhouse
63 | config-file: "pyproject.toml"
64 | env:
65 | # Ignore 32 bit architectures
66 | CIBW_ARCHS: "auto64"
67 | CIBW_PROJECT_REQUIRES_PYTHON: ">=3.9,<3.13"
68 | CIBW_TEST_REQUIRES: "pytest==7.4.2 moto==5.0.1"
69 | CIBW_TEST_COMMAND: "pytest {project}/tests/avro/test_decoder.py"
70 | # Ignore tests for pypy since not all dependencies are compiled for it
71 | # and would require a local rust build chain
72 | CIBW_TEST_SKIP: "pp*"
73 |
74 | - name: Add source distribution
75 | if: startsWith(matrix.os, 'ubuntu')
76 | run: ls -lah dist/* && cp dist/* wheelhouse/
77 |
78 | - uses: actions/upload-artifact@v4
79 | with:
80 | name: "svn-release-candidate-${{ matrix.os }}"
81 | path: ./wheelhouse/*
82 |
83 | svn-merge-artifacts:
84 | runs-on: ubuntu-latest
85 | needs:
86 | - svn-build-artifacts
87 | steps:
88 | - name: Merge Artifacts
89 | uses: actions/upload-artifact/merge@v4
90 | with:
91 | name: "svn-release-candidate-${{ inputs.VERSION }}"
92 | pattern: svn-release-candidate*
93 | delete-merged: true
94 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.swp
2 | .DS_Store
3 | .cache
4 | tmp/
5 | site
6 |
7 | # intellij files
8 | .idea
9 | .idea_modules/
10 | *.ipr
11 | *.iws
12 | *.iml
13 | out
14 |
15 | # rat library install location
16 | lib/
17 |
18 | __pycache__/
19 | *.py[cod]
20 | .eggs/
21 | .tox/
22 | env/
23 | venv/
24 | *.egg-info/
25 | test-reports
26 | build/
27 | dist/
28 | sdist/
29 | .coverage
30 | coverage.xml
31 | .pytest_cache/
32 |
33 | # vscode/eclipse files
34 | .classpath
35 | .project
36 | .settings
37 | bin/
38 | .vscode/
39 |
40 | # Hive/metastore files
41 | metastore_db/
42 |
43 | # Spark/metastore files
44 | spark-warehouse/
45 | derby.log
46 |
47 | # Python stuff
48 | .mypy_cache/
49 | htmlcov
50 |
51 | pyiceberg/avro/decoder_fast.c
52 | pyiceberg/avro/*.html
53 | pyiceberg/avro/*.so
54 |
--------------------------------------------------------------------------------
/.markdownlint.yaml:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | # Default state for all rules
19 | default: true
20 |
21 | # MD013/line-length - Line length
22 | MD013: false
23 |
24 | # MD007/ul-indent - Unordered list indentation
25 | MD007:
26 | indent: 4
27 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | ---
18 | exclude: ^vendor/
19 |
20 | repos:
21 | - repo: https://github.com/pre-commit/pre-commit-hooks
22 | rev: v5.0.0
23 | hooks:
24 | - id: trailing-whitespace
25 | - id: end-of-file-fixer
26 | - id: debug-statements
27 | - id: check-yaml
28 | - id: check-ast
29 | - repo: https://github.com/astral-sh/ruff-pre-commit
30 | rev: v0.8.6
31 | hooks:
32 | - id: ruff
33 | args: [ --fix, --exit-non-zero-on-fix ]
34 | - id: ruff-format
35 | - repo: https://github.com/pre-commit/mirrors-mypy
36 | rev: v1.14.1
37 | hooks:
38 | - id: mypy
39 | args:
40 | [--install-types, --non-interactive, --config=pyproject.toml]
41 | - repo: https://github.com/igorshubovych/markdownlint-cli
42 | rev: v0.43.0
43 | hooks:
44 | - id: markdownlint
45 | args: ["--fix"]
46 | - repo: https://github.com/pycqa/pydocstyle
47 | rev: 6.3.0
48 | hooks:
49 | - id: pydocstyle
50 | args:
51 | [
52 | "--ignore=D100,D102,D101,D103,D104,D107,D203,D212,D213,D404,D405,D406,D407,D411,D413,D415,D417",
53 | ]
54 | additional_dependencies:
55 | - tomli==2.0.1
56 | - repo: https://github.com/ikamensh/flynt
57 | rev: 1.0.1
58 | hooks:
59 | - id: flynt
60 | args:
61 | # --line-length is set to a high value to deal with very long lines
62 | - --line-length
63 | - '99999'
64 | - repo: https://github.com/codespell-project/codespell
65 | rev: v2.3.0
66 | hooks:
67 | - id: codespell
68 | ci:
69 | autofix_commit_msg: |
70 | [pre-commit.ci] auto fixes from pre-commit.com hooks
71 |
72 | for more information, see https://pre-commit.ci
73 | autofix_prs: true
74 | autoupdate_branch: ''
75 | autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate'
76 | autoupdate_schedule: weekly
77 | skip: []
78 | submodules: false
79 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | graft src
19 |
--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
1 |
2 | Apache Iceberg
3 | Copyright 2017-2025 The Apache Software Foundation
4 |
5 | This product includes software developed at
6 | The Apache Software Foundation (http://www.apache.org/).
7 |
8 | --------------------------------------------------------------------------------
9 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
17 |
18 | # Iceberg Python
19 |
20 | PyIceberg is a Python library for programmatic access to Iceberg table metadata as well as to table data in Iceberg format. It is a Python implementation of the [Iceberg table spec](https://iceberg.apache.org/spec/).
21 |
22 | The documentation is available at [https://py.iceberg.apache.org/](https://py.iceberg.apache.org/).
23 |
24 | # Get in Touch
25 |
26 | - [Iceberg community](https://iceberg.apache.org/community/)
27 |
--------------------------------------------------------------------------------
/build-module.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | import os
19 | import shutil
20 | from pathlib import Path
21 |
22 | allowed_to_fail = os.environ.get("CIBUILDWHEEL", "0") != "1"
23 |
24 |
25 | def build_cython_extensions() -> None:
26 | import Cython.Compiler.Options
27 | from Cython.Build import build_ext, cythonize
28 | from setuptools import Extension
29 | from setuptools.dist import Distribution
30 |
31 | Cython.Compiler.Options.annotate = True
32 |
33 | if os.name == "nt": # Windows
34 | extra_compile_args = [
35 | "/O2",
36 | ]
37 | else: # UNIX-based systems
38 | extra_compile_args = [
39 | "-O3",
40 | ]
41 |
42 | package_path = "pyiceberg"
43 |
44 | extension = Extension(
45 | # Your .pyx file will be available to cpython at this location.
46 | name="pyiceberg.avro.decoder_fast",
47 | sources=[
48 | os.path.join(package_path, "avro", "decoder_fast.pyx"),
49 | ],
50 | extra_compile_args=extra_compile_args,
51 | language="c",
52 | )
53 |
54 | ext_modules = cythonize([extension], include_path=[package_path], language_level=3, annotate=True)
55 | dist = Distribution({"ext_modules": ext_modules})
56 | cmd = build_ext(dist)
57 | cmd.ensure_finalized()
58 |
59 | cmd.run()
60 |
61 | for output in cmd.get_outputs():
62 | output = Path(output)
63 | relative_extension = output.relative_to(cmd.build_lib)
64 | shutil.copyfile(output, relative_extension)
65 |
66 |
67 | try:
68 | build_cython_extensions()
69 | except Exception:
70 | if not allowed_to_fail:
71 | raise
72 |
--------------------------------------------------------------------------------
/dev/.rat-excludes:
--------------------------------------------------------------------------------
1 | .github/*
2 | .rat-excludes
3 | build
4 | .git
5 | .gitignore
6 | poetry.lock
7 |
--------------------------------------------------------------------------------
/dev/Dockerfile:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | FROM python:3.9-bullseye
17 |
18 | RUN apt-get -qq update && \
19 | apt-get -qq install -y --no-install-recommends \
20 | sudo \
21 | curl \
22 | vim \
23 | unzip \
24 | openjdk-11-jdk \
25 | build-essential \
26 | software-properties-common \
27 | ssh && \
28 | apt-get -qq clean && \
29 | rm -rf /var/lib/apt/lists/*
30 |
31 | # Optional env variables
32 | ENV SPARK_HOME=${SPARK_HOME:-"/opt/spark"}
33 | ENV HADOOP_HOME=${HADOOP_HOME:-"/opt/hadoop"}
34 | ENV PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH
35 |
36 | RUN mkdir -p ${HADOOP_HOME} && mkdir -p ${SPARK_HOME} && mkdir -p /home/iceberg/spark-events
37 | WORKDIR ${SPARK_HOME}
38 |
39 | # Remember to also update `tests/conftest`'s spark setting
40 | ENV SPARK_VERSION=3.5.4
41 | ENV ICEBERG_SPARK_RUNTIME_VERSION=3.5_2.12
42 | ENV ICEBERG_VERSION=1.9.0
43 | ENV PYICEBERG_VERSION=0.9.0
44 |
45 | RUN curl --retry 5 -s -C - https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz -o spark-${SPARK_VERSION}-bin-hadoop3.tgz \
46 | && tar xzf spark-${SPARK_VERSION}-bin-hadoop3.tgz --directory /opt/spark --strip-components 1 \
47 | && rm -rf spark-${SPARK_VERSION}-bin-hadoop3.tgz
48 |
49 | # Download iceberg spark runtime
50 | RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \
51 | -Lo /opt/spark/jars/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar
52 |
53 |
54 | # Download AWS bundle
55 | RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \
56 | -Lo /opt/spark/jars/iceberg-aws-bundle-${ICEBERG_VERSION}.jar
57 |
58 | COPY spark-defaults.conf /opt/spark/conf
59 | ENV PATH="/opt/spark/sbin:/opt/spark/bin:${PATH}"
60 |
61 | RUN chmod u+x /opt/spark/sbin/* && \
62 | chmod u+x /opt/spark/bin/*
63 |
64 | RUN pip3 install -q ipython
65 |
66 | RUN pip3 install "pyiceberg[s3fs,hive]==${PYICEBERG_VERSION}"
67 |
68 | COPY entrypoint.sh .
69 | COPY provision.py .
70 |
71 | ENTRYPOINT ["./entrypoint.sh"]
72 | CMD ["notebook"]
73 |
--------------------------------------------------------------------------------
/dev/check-license:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | #
4 | # Licensed to the Apache Software Foundation (ASF) under one or more
5 | # contributor license agreements. See the NOTICE file distributed with
6 | # this work for additional information regarding copyright ownership.
7 | # The ASF licenses this file to You under the Apache License, Version 2.0
8 | # (the "License"); you may not use this file except in compliance with
9 | # the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 |
20 |
21 | acquire_rat_jar () {
22 |
23 | URL="https://repo.maven.apache.org/maven2/org/apache/rat/apache-rat/${RAT_VERSION}/apache-rat-${RAT_VERSION}.jar"
24 |
25 | JAR="$rat_jar"
26 |
27 | # Download rat launch jar if it hasn't been downloaded yet
28 | if [ ! -f "$JAR" ]; then
29 | # Download
30 | printf "Attempting to fetch rat\n"
31 | JAR_DL="${JAR}.part"
32 | if [ $(command -v curl) ]; then
33 | curl -L --silent "${URL}" > "$JAR_DL" && mv "$JAR_DL" "$JAR"
34 | elif [ $(command -v wget) ]; then
35 | wget --quiet ${URL} -O "$JAR_DL" && mv "$JAR_DL" "$JAR"
36 | else
37 | printf "You do not have curl or wget installed, please install rat manually.\n"
38 | exit 1
39 | fi
40 | fi
41 |
42 | unzip -tq "$JAR" &> /dev/null
43 | if [ $? -ne 0 ]; then
44 | # We failed to download
45 | rm "$JAR"
46 | printf "Our attempt to download rat locally to ${JAR} failed. Please install rat manually.\n"
47 | exit 1
48 | fi
49 | }
50 |
51 | # Go to the Spark project root directory
52 | FWDIR="$(cd "`dirname "$0"`"/..; pwd)"
53 | cd "$FWDIR"
54 |
55 | if test -x "$JAVA_HOME/bin/java"; then
56 | declare java_cmd="$JAVA_HOME/bin/java"
57 | else
58 | declare java_cmd=java
59 | fi
60 |
61 | export RAT_VERSION=0.16.1
62 | export rat_jar="$FWDIR"/lib/apache-rat-${RAT_VERSION}.jar
63 | mkdir -p "$FWDIR"/lib
64 |
65 | [[ -f "$rat_jar" ]] || acquire_rat_jar || {
66 | echo "Download failed. Obtain the rat jar manually and place it at $rat_jar"
67 | exit 1
68 | }
69 |
70 | mkdir -p build
71 | $java_cmd -jar "$rat_jar" --scan-hidden-directories -E "$FWDIR"/dev/.rat-excludes -d "$FWDIR" > build/rat-results.txt
72 |
73 | if [ $? -ne 0 ]; then
74 | echo "RAT exited abnormally"
75 | exit 1
76 | fi
77 |
78 | ERRORS="$(cat build/rat-results.txt | grep -e "??")"
79 |
80 | if test ! -z "$ERRORS"; then
81 | echo "Could not find Apache license headers in the following files:"
82 | echo "$ERRORS"
83 | exit 1
84 | else
85 | echo -e "RAT checks passed."
86 | fi
87 |
--------------------------------------------------------------------------------
/dev/docker-compose-azurite.yml:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | services:
19 | azurite:
20 | image: mcr.microsoft.com/azure-storage/azurite
21 | container_name: azurite
22 | hostname: azurite
23 | ports:
24 | - 10000:10000
25 | command: ["azurite-blob", "--loose", "--blobHost", "0.0.0.0"]
26 |
--------------------------------------------------------------------------------
/dev/docker-compose-gcs-server.yml:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | services:
19 | gcs-server:
20 | image: fsouza/fake-gcs-server
21 | container_name: gcs-server
22 | ports:
23 | - 4443:4443
24 | entrypoint: >
25 | /bin/sh -c "
26 | mkdir -p /data/warehouse;
27 | /bin/fake-gcs-server -data /data -scheme http;
28 | exit 0;
29 | "
30 |
--------------------------------------------------------------------------------
/dev/docker-compose-integration.yml:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | services:
19 | spark-iceberg:
20 | image: python-integration
21 | container_name: pyiceberg-spark
22 | build: .
23 | networks:
24 | iceberg_net:
25 | depends_on:
26 | - rest
27 | - hive
28 | - minio
29 | volumes:
30 | - ./warehouse:/home/iceberg/warehouse
31 | environment:
32 | - AWS_ACCESS_KEY_ID=admin
33 | - AWS_SECRET_ACCESS_KEY=password
34 | - AWS_REGION=us-east-1
35 | ports:
36 | - 8888:8888
37 | - 8080:8080
38 | links:
39 | - rest:rest
40 | - hive:hive
41 | - minio:minio
42 | rest:
43 | image: apache/iceberg-rest-fixture
44 | container_name: pyiceberg-rest
45 | networks:
46 | iceberg_net:
47 | ports:
48 | - 8181:8181
49 | environment:
50 | - AWS_ACCESS_KEY_ID=admin
51 | - AWS_SECRET_ACCESS_KEY=password
52 | - AWS_REGION=us-east-1
53 | - CATALOG_WAREHOUSE=s3://warehouse/
54 | - CATALOG_IO__IMPL=org.apache.iceberg.aws.s3.S3FileIO
55 | - CATALOG_S3_ENDPOINT=http://minio:9000
56 | minio:
57 | image: minio/minio
58 | container_name: pyiceberg-minio
59 | environment:
60 | - MINIO_ROOT_USER=admin
61 | - MINIO_ROOT_PASSWORD=password
62 | - MINIO_DOMAIN=minio
63 | networks:
64 | iceberg_net:
65 | aliases:
66 | - warehouse.minio
67 | ports:
68 | - 9001:9001
69 | - 9000:9000
70 | command: ["server", "/data", "--console-address", ":9001"]
71 | mc:
72 | depends_on:
73 | - minio
74 | image: minio/mc
75 | container_name: pyiceberg-mc
76 | networks:
77 | iceberg_net:
78 | environment:
79 | - AWS_ACCESS_KEY_ID=admin
80 | - AWS_SECRET_ACCESS_KEY=password
81 | - AWS_REGION=us-east-1
82 | entrypoint: >
83 | /bin/sh -c "
84 | until (/usr/bin/mc alias set minio http://minio:9000 admin password) do echo '...waiting...' && sleep 1; done;
85 | /usr/bin/mc mb minio/warehouse;
86 | /usr/bin/mc policy set public minio/warehouse;
87 | tail -f /dev/null
88 | "
89 | hive:
90 | build: hive/
91 | container_name: hive
92 | hostname: hive
93 | networks:
94 | iceberg_net:
95 | ports:
96 | - 9083:9083
97 | environment:
98 | SERVICE_NAME: "metastore"
99 | SERVICE_OPTS: "-Dmetastore.warehouse.dir=s3a://warehouse/hive/"
100 |
101 | networks:
102 | iceberg_net:
103 |
--------------------------------------------------------------------------------
/dev/docker-compose.yml:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | services:
19 | minio:
20 | image: minio/minio
21 | container_name: pyiceberg-minio
22 | environment:
23 | - MINIO_ROOT_USER=admin
24 | - MINIO_ROOT_PASSWORD=password
25 | - MINIO_DOMAIN=minio
26 | ports:
27 | - 9001:9001
28 | - 9000:9000
29 | command: ["server", "/data", "--console-address", ":9001"]
30 | mc:
31 | depends_on:
32 | - minio
33 | image: minio/mc
34 | container_name: pyiceberg-mc
35 | environment:
36 | - AWS_ACCESS_KEY_ID=admin
37 | - AWS_SECRET_ACCESS_KEY=password
38 | - AWS_REGION=us-east-1
39 | entrypoint: >
40 | /bin/sh -c "
41 | until (/usr/bin/mc alias set minio http://minio:9000 admin password) do echo '...waiting...' && sleep 1; done;
42 | /usr/bin/mc rm -r --force minio/warehouse;
43 | /usr/bin/mc mb minio/warehouse;
44 | /usr/bin/mc policy set public minio/warehouse;
45 | exit 0;
46 | "
47 |
--------------------------------------------------------------------------------
/dev/entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Licensed to the Apache Software Foundation (ASF) under one
4 | # or more contributor license agreements. See the NOTICE file
5 | # distributed with this work for additional information
6 | # regarding copyright ownership. The ASF licenses this file
7 | # to you under the Apache License, Version 2.0 (the
8 | # "License"); you may not use this file except in compliance
9 | # with the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing,
14 | # software distributed under the License is distributed on an
15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 | # KIND, either express or implied. See the License for the
17 | # specific language governing permissions and limitations
18 | # under the License.
19 | #
20 |
21 | start-master.sh -p 7077
22 | start-worker.sh spark://spark-iceberg:7077
23 | start-history-server.sh
24 |
25 | tail -f /dev/null
26 |
--------------------------------------------------------------------------------
/dev/hive/Dockerfile:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | FROM openjdk:8-jre-slim AS build
17 |
18 | RUN apt-get update -qq && apt-get -qq -y install curl
19 |
20 | ENV HADOOP_VERSION=3.3.6
21 | ENV AWS_SDK_BUNDLE=1.12.753
22 |
23 | RUN curl https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar -Lo /tmp/hadoop-aws-${HADOOP_VERSION}.jar
24 | RUN curl https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_BUNDLE}/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar -Lo /tmp/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar
25 |
26 | FROM apache/hive:4.0.0
27 |
28 | ENV HADOOP_VERSION=3.3.6
29 | ENV AWS_SDK_BUNDLE=1.12.753
30 |
31 | COPY --from=build /tmp/hadoop-aws-${HADOOP_VERSION}.jar /opt/hive/lib/hadoop-aws-${HADOOP_VERSION}.jar
32 | COPY --from=build /tmp/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar /opt/hive/lib/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar
33 | COPY core-site.xml /opt/hadoop/etc/hadoop/core-site.xml
34 |
--------------------------------------------------------------------------------
/dev/hive/core-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
19 |
20 |
21 |
22 | fs.defaultFS
23 | s3a://warehouse/hive
24 |
25 |
26 | fs.s3a.impl
27 | org.apache.hadoop.fs.s3a.S3AFileSystem
28 |
29 |
30 | fs.s3a.fast.upload
31 | true
32 |
33 |
34 | fs.s3a.endpoint
35 | http://minio:9000
36 |
37 |
38 | fs.s3a.access.key
39 | admin
40 |
41 |
42 | fs.s3a.secret.key
43 | password
44 |
45 |
46 | fs.s3a.connection.ssl.enabled
47 | false
48 |
49 |
50 | fs.s3a.path.style.access
51 | true
52 |
53 |
54 |
--------------------------------------------------------------------------------
/dev/run-azurite.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Licensed to the Apache Software Foundation (ASF) under one
4 | # or more contributor license agreements. See the NOTICE file
5 | # distributed with this work for additional information
6 | # regarding copyright ownership. The ASF licenses this file
7 | # to you under the Apache License, Version 2.0 (the
8 | # "License"); you may not use this file except in compliance
9 | # with the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing,
14 | # software distributed under the License is distributed on an
15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 | # KIND, either express or implied. See the License for the
17 | # specific language governing permissions and limitations
18 | # under the License.
19 | #
20 |
set -ex

# Reuse a running Azurite container if one exists; otherwise (re)start it.
# The $(...) substitutions are quoted so the [ ] tests stay valid even if
# `docker ps` returns more than one container id (one per line); unquoted,
# that expands to multiple words and the test fails with "too many arguments".
if [ -n "$(docker ps -q --filter "name=azurite" --filter "status=running")" ]; then
  echo "Azurite backend running"
else
  docker compose -f dev/docker-compose-azurite.yml kill
  docker compose -f dev/docker-compose-azurite.yml up -d
  # Poll until the container reports a running status.
  while [ -z "$(docker ps -q --filter "name=azurite" --filter "status=running")" ]
  do
    echo "Waiting for Azurite"
    sleep 1
  done
fi
34 |
--------------------------------------------------------------------------------
/dev/run-gcs-server.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Licensed to the Apache Software Foundation (ASF) under one
4 | # or more contributor license agreements. See the NOTICE file
5 | # distributed with this work for additional information
6 | # regarding copyright ownership. The ASF licenses this file
7 | # to you under the Apache License, Version 2.0 (the
8 | # "License"); you may not use this file except in compliance
9 | # with the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing,
14 | # software distributed under the License is distributed on an
15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 | # KIND, either express or implied. See the License for the
17 | # specific language governing permissions and limitations
18 | # under the License.
19 | #
20 |
set -ex

# Reuse a running fake GCS server container if one exists; otherwise (re)start it.
# The $(...) substitutions are quoted so the [ ] tests stay valid even if
# `docker ps` returns more than one container id (one per line); unquoted,
# that expands to multiple words and the test fails with "too many arguments".
if [ -n "$(docker ps -q --filter "name=gcs-server" --filter "status=running")" ]; then
  echo "Fake GCS Server running"
else
  docker compose -f dev/docker-compose-gcs-server.yml kill
  docker compose -f dev/docker-compose-gcs-server.yml up -d
  # Poll until the container reports a running status.
  while [ -z "$(docker ps -q --filter "name=gcs-server" --filter "status=running")" ]
  do
    echo "Waiting for Fake GCS Server"
    sleep 1
  done
fi
34 |
--------------------------------------------------------------------------------
/dev/run-minio.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Licensed to the Apache Software Foundation (ASF) under one
4 | # or more contributor license agreements. See the NOTICE file
5 | # distributed with this work for additional information
6 | # regarding copyright ownership. The ASF licenses this file
7 | # to you under the Apache License, Version 2.0 (the
8 | # "License"); you may not use this file except in compliance
9 | # with the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing,
14 | # software distributed under the License is distributed on an
15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 | # KIND, either express or implied. See the License for the
17 | # specific language governing permissions and limitations
18 | # under the License.
19 | #
20 |
set -ex

# Reuse a running MinIO container if one exists; otherwise (re)start it.
# The $(...) substitutions are quoted so the [ ] tests stay valid even if
# `docker ps` returns more than one container id (one per line); unquoted,
# that expands to multiple words and the test fails with "too many arguments".
if [ -n "$(docker ps -q --filter "name=pyiceberg-minio" --filter "status=running")" ]; then
  echo "Minio backend running"
else
  docker compose -f dev/docker-compose.yml kill
  docker compose -f dev/docker-compose.yml up -d
  # Poll until the container reports a running status.
  while [ -z "$(docker ps -q --filter "name=pyiceberg-minio" --filter "status=running")" ]
  do
    echo "Waiting for Minio"
    sleep 1
  done
fi
34 |
--------------------------------------------------------------------------------
/dev/spark-defaults.conf:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | spark.sql.extensions org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
19 | spark.sql.catalog.rest org.apache.iceberg.spark.SparkCatalog
20 | spark.sql.catalog.rest.type rest
21 | spark.sql.catalog.rest.uri http://rest:8181
22 | spark.sql.catalog.rest.io-impl org.apache.iceberg.aws.s3.S3FileIO
23 | spark.sql.catalog.rest.warehouse s3://warehouse/rest/
24 | spark.sql.catalog.rest.s3.endpoint http://minio:9000
25 | spark.sql.catalog.hive org.apache.iceberg.spark.SparkCatalog
26 | spark.sql.catalog.hive.type hive
27 | spark.sql.catalog.hive.uri http://hive:9083
28 | spark.sql.catalog.hive.io-impl org.apache.iceberg.aws.s3.S3FileIO
29 | spark.sql.catalog.hive.warehouse s3://warehouse/hive/
30 | spark.sql.catalog.hive.s3.endpoint http://minio:9000
31 | spark.sql.defaultCatalog rest
32 | spark.eventLog.enabled true
33 | spark.eventLog.dir /home/iceberg/spark-events
34 | spark.history.fs.logDirectory /home/iceberg/spark-events
35 | spark.sql.catalogImplementation in-memory
36 |
--------------------------------------------------------------------------------
/mkdocs/README.md:
--------------------------------------------------------------------------------
1 |
17 |
18 | # Docs
19 |
20 | The pyiceberg docs are stored in `docs/`.
21 |
22 | ## Running docs locally
23 |
24 | ```sh
25 | make docs-install
26 | make docs-serve
27 | ```
28 |
--------------------------------------------------------------------------------
/mkdocs/docs/SUMMARY.md:
--------------------------------------------------------------------------------
1 |
17 |
18 |
19 |
20 |
21 | # Summary
22 |
23 | - [Getting started](index.md)
24 | - [Configuration](configuration.md)
25 | - [CLI](cli.md)
26 | - [API](api.md)
27 | - [Contributing](contributing.md)
28 | - [Community](community.md)
29 | - Releases
30 | - [Verify a release](verify-release.md)
31 | - [How to release](how-to-release.md)
32 | - [Release Notes](https://github.com/apache/iceberg-python/releases)
33 | - [Nightly Build](nightly-build.md)
34 | - [Code Reference](reference/)
35 |
36 |
37 |
38 |
39 |
--------------------------------------------------------------------------------
/mkdocs/docs/assets/images/gen-release-notes.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/apache/iceberg-python/a67c5592f3243d255519581fedfcc5d93274b9c8/mkdocs/docs/assets/images/gen-release-notes.jpg
--------------------------------------------------------------------------------
/mkdocs/docs/assets/images/iceberg-logo-icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/apache/iceberg-python/a67c5592f3243d255519581fedfcc5d93274b9c8/mkdocs/docs/assets/images/iceberg-logo-icon.png
--------------------------------------------------------------------------------
/mkdocs/docs/community.md:
--------------------------------------------------------------------------------
1 | ---
2 | hide:
3 | - navigation
4 | ---
5 |
6 |
24 |
25 | # Join the community
26 |
27 | Apache Iceberg tracks issues in GitHub and prefers to receive contributions as pull requests.
28 |
29 | Community discussions happen primarily on the [dev mailing list](https://lists.apache.org/list.html?dev@iceberg.apache.org), on [Apache Iceberg Slack workspace](https://join.slack.com/t/apache-iceberg/shared_invite/zt-287g3akar-K9Oe_En5j1UL7Y_Ikpai3A) in the #python channel, and on specific [GitHub issues](https://github.com/apache/iceberg-python/issues).
30 |
31 | ## Iceberg Community Events
32 |
33 | The PyIceberg community sync is on the last Tuesday of every month. To join, make sure to subscribe to the [iceberg-python-sync Google group](https://groups.google.com/g/iceberg-python-sync).
34 |
35 | ## Community Guidelines
36 |
37 | ### Apache Iceberg Community Guidelines
38 |
39 | The Apache Iceberg community is built on the principles described in the [Apache Way](https://www.apache.org/theapacheway/index.html)
40 | and all who engage with the community are expected to be respectful, open, come with the best interests of the community in mind,
41 | and abide by the Apache Foundation [Code of Conduct](https://www.apache.org/foundation/policies/conduct.html).
42 |
43 | ### Participants with Corporate Interests
44 |
45 | A wide range of corporate entities have interests that overlap in both features and frameworks related to Iceberg and while we
46 | encourage engagement and contributions, the community is not a venue for marketing, solicitation, or recruitment.
47 |
48 | Any vendor who wants to participate in the Apache Iceberg community Slack workspace should create a dedicated vendor channel
49 | for their organization prefixed by `vendor-`.
50 |
51 | This space can be used to discuss features and integration with Iceberg related to the vendor offering. This space should not
52 | be used to promote competing vendor products/services or disparage other vendor offerings. Discussion should be focused on
53 | questions asked by the community and not to expand/introduce/redirect users to alternate offerings.
54 |
55 | ### Marketing / Solicitation / Recruiting
56 |
57 | The Apache Iceberg community is a space for everyone to operate free of influence. The development lists, Slack workspace,
58 | and GitHub should not be used to market products or services. Solicitation or overt promotion should not be performed in common
59 | channels or through direct messages.
60 |
61 | Recruitment of community members should not be conducted through direct messages or community channels, but opportunities
62 | related to contributing to or using Iceberg can be posted to the `#jobs` channel.
63 |
64 | For questions regarding any of the guidelines above, please contact a PMC member
65 |
--------------------------------------------------------------------------------
/mkdocs/docs/nightly-build.md:
--------------------------------------------------------------------------------
1 |
19 |
20 | # Nightly Build
21 |
22 | A nightly build of PyIceberg is available on testpypi, [https://test.pypi.org/project/pyiceberg/](https://test.pypi.org/project/pyiceberg/).
23 |
24 | To install the nightly build,
25 |
26 | ```shell
27 | pip install -i https://test.pypi.org/simple/ --pre pyiceberg
28 | ```
29 |
30 |
31 |
32 | !!! warning "For Testing Purposes Only"
33 | Nightly builds are for testing purposes only and have not been validated. Please use at your own risk, as they may contain untested changes, potential bugs, or incomplete features. Additionally, ensure compliance with any applicable licenses, as these builds may include changes that have not been reviewed for legal or licensing implications.
34 |
35 |
36 |
--------------------------------------------------------------------------------
/mkdocs/docs/verify-release.md:
--------------------------------------------------------------------------------
1 |
19 |
20 | # Verifying a release
21 |
22 | Each Apache PyIceberg release is validated by the community by holding a vote. A community release manager will prepare a release candidate and call a vote on the Iceberg dev list. To validate the release candidate, community members will test it out in their downstream projects and environments.
23 |
24 | In addition to testing in downstream projects, community members also check the release’s signatures, checksums, and license documentation.
25 |
26 | ## Validating a release candidate
27 |
28 | Release announcements include links to the following:
29 |
30 | - A source tarball
31 | - A signature (.asc)
32 | - A checksum (.sha512)
33 | - KEYS file
34 | - GitHub change comparison
35 |
36 | After downloading the source tarball, signature, checksum, and KEYS file, here are instructions on how to verify signatures, checksums, and documentation.
37 |
38 | ## Verifying signatures
39 |
40 | First, import the keys.
41 |
42 | ```sh
43 | curl https://downloads.apache.org/iceberg/KEYS -o KEYS
44 | gpg --import KEYS
45 | ```
46 |
47 | Set an environment variable to the version to verify and path to use
48 |
49 | ```sh
50 | export PYICEBERG_VERSION= # e.g. 0.6.1rc3
51 | export PYICEBERG_VERIFICATION_DIR=/tmp/pyiceberg/${PYICEBERG_VERSION}
52 | ```
53 |
54 | Next, verify the `.asc` file.
55 |
56 | ```sh
57 | svn checkout https://dist.apache.org/repos/dist/dev/iceberg/pyiceberg-${PYICEBERG_VERSION}/ ${PYICEBERG_VERIFICATION_DIR}
58 |
59 | cd ${PYICEBERG_VERIFICATION_DIR}
60 |
61 | for name in $(ls pyiceberg-*.whl pyiceberg-*.tar.gz)
62 | do
63 | gpg --verify ${name}.asc ${name}
64 | done
65 | ```
66 |
67 | ## Verifying checksums
68 |
69 | ```sh
70 | cd ${PYICEBERG_VERIFICATION_DIR}
71 | for name in $(ls pyiceberg-*.whl.sha512 pyiceberg-*.tar.gz.sha512)
72 | do
73 | shasum -a 512 --check ${name}
74 | done
75 | ```
76 |
77 | ## Verifying License Documentation
78 |
79 | ```sh
80 | export PYICEBERG_RELEASE_VERSION=${PYICEBERG_VERSION/rc?/} # remove rcX qualifier
81 | tar xzf pyiceberg-${PYICEBERG_RELEASE_VERSION}.tar.gz
82 | cd pyiceberg-${PYICEBERG_RELEASE_VERSION}
83 | ```
84 |
85 | Run RAT checks to validate license header:
86 |
87 | ```shell
88 | ./dev/check-license
89 | ```
90 |
91 | ## Testing
92 |
93 | This section explains how to run the tests of the source distribution.
94 |
95 |
96 |
97 | !!! note "Python Version"
98 | Make sure you're using [a supported Python version](https://github.com/apache/iceberg-python/blob/main/pyproject.toml#L29-L32)
99 |
100 |
101 |
102 | First step is to install the package:
103 |
104 | ```sh
105 | make install
106 | ```
107 |
108 | To run the full test coverage, with both unit tests and integration tests:
109 |
110 | ```sh
111 | make test-coverage
112 | ```
113 |
114 | This will spin up Docker containers to facilitate running test coverage.
115 |
116 | # Cast the vote
117 |
118 | Votes are cast by replying to the release candidate announcement email on the dev mailing list with either `+1`, `0`, or `-1`. For example:
119 |
120 | > [ ] +1 Release this as PyIceberg 0.3.0
121 | >
122 | > [ ] +0
123 | >
124 | > [ ] -1 Do not release this because…
125 |
126 | In addition to your vote, it’s customary to specify if your vote is binding or non-binding. Only members of the Project Management Committee have formally binding votes. If you’re unsure, you can specify that your vote is non-binding. To read more about voting in the Apache framework, checkout the [Voting](https://www.apache.org/foundation/voting.html) information page on the Apache foundation’s website.
127 |
--------------------------------------------------------------------------------
/mkdocs/gen_doc_stubs.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
from pathlib import Path

import griffe  # type: ignore
import mkdocs_gen_files  # type: ignore

# Literate-nav accumulator for the generated API reference pages.
nav = mkdocs_gen_files.Nav()

repo_root = Path(__file__).parent.parent
package_root = repo_root.joinpath("pyiceberg")

# Load the package once up front so modules without docstrings can be skipped.
api_data = griffe.load(package_root)

for source_file in sorted(package_root.glob("**/*.py")):
    module_path = source_file.relative_to(repo_root).with_suffix("")
    doc_path = source_file.relative_to(repo_root).with_suffix(".md")
    full_doc_path = Path("reference", doc_path)

    parts = tuple(module_path.parts)

    if parts[-1] == "__init__":
        # A package __init__ becomes the section's index page.
        parts = parts[:-1]
        doc_path = doc_path.with_name("index.md")
        full_doc_path = full_doc_path.with_name("index.md")
    elif parts[-1].startswith("_"):
        # Private modules get no reference page.
        continue

    # Skip modules that griffe knows about but that carry no docstrings.
    if module_path.parts[1:] in api_data.members and not api_data[module_path.parts[1:]].has_docstrings:
        continue

    nav[parts] = doc_path.as_posix()

    # Emit a stub page containing a single mkdocstrings directive.
    with mkdocs_gen_files.open(full_doc_path, "w") as fd:
        identifier = ".".join(parts)
        fd.write(f"::: {identifier}")

    mkdocs_gen_files.set_edit_path(full_doc_path, Path("../") / source_file)

# Write the navigation summary consumed by the literate-nav plugin.
with mkdocs_gen_files.open("reference/SUMMARY.md", "w") as nav_file:
    nav_file.writelines(nav.build_literate_nav())
56 |
--------------------------------------------------------------------------------
/mkdocs/mkdocs.yml:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | ---
18 | site_name: PyIceberg
19 | site_url: https://py.iceberg.apache.org/
20 | repo_url: "https://github.com/apache/iceberg-python"
21 | repo_name: "apache/iceberg-python"
22 |
23 | plugins:
24 | - gen-files:
25 | scripts:
26 | - gen_doc_stubs.py
27 | - literate-nav:
28 | nav_file: SUMMARY.md
29 | - search
30 | - section-index
31 | - mkdocstrings:
32 | handlers:
33 | python:
34 | paths: [..]
35 |
36 | theme:
37 | name: material
38 | logo: assets/images/iceberg-logo-icon.png
39 | favicon: assets/images/iceberg-logo-icon.png
40 | font:
41 | text: Lato
42 | features:
43 | - navigation.top
44 | - navigation.tracking
45 | - navigation.tabs
46 | - navigation.tabs.sticky
47 | palette:
48 | - scheme: default
49 | toggle:
50 | icon: material/brightness-7
51 | name: Switch to dark mode
52 | - scheme: slate
53 | toggle:
54 | icon: material/brightness-4
55 | name: Switch to light mode
56 |
57 | markdown_extensions:
58 | - admonition
59 | - pymdownx.highlight:
60 | anchor_linenums: true
61 | - pymdownx.superfences
62 | - toc:
63 | permalink: true
64 |
--------------------------------------------------------------------------------
/pyiceberg/__init__.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
# Version of the pyiceberg distribution; exposed as pyiceberg.__version__.
__version__ = "0.10.0"
19 |
--------------------------------------------------------------------------------
/pyiceberg/avro/__init__.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | import struct
18 |
19 | STRUCT_BOOL = struct.Struct("?")
20 | STRUCT_FLOAT = struct.Struct(" tuple[bytes, int]:
27 | compressed_data = bz2.compress(data)
28 | return compressed_data, len(compressed_data)
29 |
30 | @staticmethod
31 | def decompress(data: bytes) -> bytes:
32 | return bz2.decompress(data)
33 |
34 | except ImportError:
35 |
    class BZip2Codec(Codec):  # type: ignore
        """Fallback stub used when the bz2 module failed to import; every call raises.

        bz2 is part of the standard library, but CPython builds without libbz2
        omit it — presumably why this fallback exists; confirm against the build.
        """

        @staticmethod
        def compress(data: bytes) -> tuple[bytes, int]:
            raise ImportError("Python bzip2 support not installed, please install the extension")

        @staticmethod
        def decompress(data: bytes) -> bytes:
            raise ImportError("Python bzip2 support not installed, please install the extension")
44 |
--------------------------------------------------------------------------------
/pyiceberg/avro/codecs/codec.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | from __future__ import annotations
18 |
19 | from abc import ABC, abstractmethod
20 |
21 |
class Codec(ABC):
    """Abstract base class for all Avro codec classes.

    Concrete codecs implement the two static methods below; both operate on
    raw ``bytes`` payloads.
    """

    @staticmethod
    @abstractmethod
    def compress(data: bytes) -> tuple[bytes, int]:
        """Compress ``data``, returning the compressed payload and its length in bytes."""
        ...

    @staticmethod
    @abstractmethod
    def decompress(data: bytes) -> bytes:
        """Decompress ``data`` back into the original uncompressed bytes."""
        ...
32 |
--------------------------------------------------------------------------------
/pyiceberg/avro/codecs/deflate.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | from __future__ import annotations
18 |
19 | import zlib
20 |
21 | from pyiceberg.avro.codecs.codec import Codec
22 |
23 |
class DeflateCodec(Codec):
    """Avro deflate codec: raw deflate data with the zlib wrapper stripped."""

    @staticmethod
    def compress(data: bytes) -> tuple[bytes, int]:
        """Compress ``data`` and return the raw-deflate payload with its length."""
        # zlib.compress wraps the deflate stream: the first two bytes and the
        # last byte are zlib framing, so they are sliced away here.
        deflated = zlib.compress(data)[2:-1]
        return deflated, len(deflated)

    @staticmethod
    def decompress(data: bytes) -> bytes:
        """Decompress raw deflate ``data`` (no zlib header) into the original bytes."""
        # A negative window size means "raw" deflate input without zlib
        # headers; 15 is the log of the window size. See zlib.h.
        return zlib.decompress(data, -15)
37 |
--------------------------------------------------------------------------------
/pyiceberg/avro/codecs/snappy_codec.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | from __future__ import annotations
18 |
19 | import binascii
20 | import struct
21 |
22 | from pyiceberg.avro.codecs.codec import Codec
23 |
24 | STRUCT_CRC32 = struct.Struct(">I") # big-endian unsigned int
25 |
try:
    import snappy

    class SnappyCodec(Codec):
        """Snappy codec for Avro: compressed payload followed by a 4-byte big-endian CRC32 of the uncompressed data."""

        @staticmethod
        def _check_crc32(bytes_: bytes, checksum: bytes) -> None:
            """Incrementally compute CRC-32 from bytes and compare to a checksum.

            Args:
                bytes_ (bytes): The bytes to check against `checksum`
                checksum (bytes): Byte representation of a checksum

            Raises:
                ValueError: If the computed CRC-32 does not match the checksum
            """
            if binascii.crc32(bytes_) & 0xFFFFFFFF != STRUCT_CRC32.unpack(checksum)[0]:
                raise ValueError("Checksum failure")

        @staticmethod
        def compress(data: bytes) -> tuple[bytes, int]:
            """Compress `data` and append the CRC32 of the uncompressed input."""
            compressed_data = snappy.compress(data)
            # A 4-byte, big-endian CRC32 checksum of the *uncompressed* data
            compressed_data += STRUCT_CRC32.pack(binascii.crc32(data) & 0xFFFFFFFF)
            return compressed_data, len(compressed_data)

        @staticmethod
        def decompress(data: bytes) -> bytes:
            """Decompress `data` and verify its trailing CRC32 checksum.

            Raises:
                ValueError: If the CRC32 of the decompressed bytes does not
                    match the stored checksum.
            """
            # Capture the 4-byte trailer BEFORE stripping it from the payload.
            # The previous code truncated `data` first and then took the last
            # four bytes of the truncated payload, so it compared the CRC
            # against 4 bytes of compressed data instead of the stored checksum.
            checksum = data[-4:]
            uncompressed = snappy.decompress(data[:-4])
            SnappyCodec._check_crc32(uncompressed, checksum)
            return uncompressed

except ImportError:

    class SnappyCodec(Codec):  # type: ignore
        """Fallback stub used when python-snappy is not installed; every call raises."""

        @staticmethod
        def compress(data: bytes) -> tuple[bytes, int]:
            raise ImportError("Snappy support not installed, please install using `pip install pyiceberg[snappy]`")

        @staticmethod
        def decompress(data: bytes) -> bytes:
            raise ImportError("Snappy support not installed, please install using `pip install pyiceberg[snappy]`")
70 |
--------------------------------------------------------------------------------
/pyiceberg/avro/codecs/zstandard_codec.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | from __future__ import annotations
18 |
19 | from io import BytesIO
20 |
21 | from pyiceberg.avro.codecs.codec import Codec
22 |
try:
    from zstandard import ZstdCompressor, ZstdDecompressor

    class ZStandardCodec(Codec):
        """Zstandard codec backed by the `zstandard` package."""

        @staticmethod
        def compress(data: bytes) -> tuple[bytes, int]:
            """Compress `data`, returning the compressed payload and its length."""
            compressed_data = ZstdCompressor().compress(data)
            return compressed_data, len(compressed_data)

        @staticmethod
        def decompress(data: bytes) -> bytes:
            """Decompress a zstd frame via streaming, so frames without an embedded content size also work."""
            uncompressed = bytearray()
            dctx = ZstdDecompressor()
            with dctx.stream_reader(BytesIO(data)) as reader:
                while True:
                    # Read in 16 KiB chunks to bound memory use for large frames.
                    chunk = reader.read(16384)
                    if not chunk:
                        break
                    uncompressed.extend(chunk)
            # Convert the bytearray accumulator so the declared `bytes` return
            # type actually holds (previously a bytearray leaked to callers).
            return bytes(uncompressed)

except ImportError:

    class ZStandardCodec(Codec):  # type: ignore
        """Fallback stub used when the zstandard package is not installed; every call raises."""

        @staticmethod
        def compress(data: bytes) -> tuple[bytes, int]:
            raise ImportError("Zstandard support not installed, please install using `pip install pyiceberg[zstandard]`")

        @staticmethod
        def decompress(data: bytes) -> bytes:
            raise ImportError("Zstandard support not installed, please install using `pip install pyiceberg[zstandard]`")
54 |
--------------------------------------------------------------------------------
/pyiceberg/avro/decoder_basic.c:
--------------------------------------------------------------------------------
1 | /*
2 | Licensed to the Apache Software Foundation (ASF) under one
3 | or more contributor license agreements. See the NOTICE file
4 | distributed with this work for additional information
5 | regarding copyright ownership. The ASF licenses this file
6 | to you under the Apache License, Version 2.0 (the
7 | "License"); you may not use this file except in compliance
8 | with the License. You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing,
13 | software distributed under the License is distributed on an
14 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | KIND, either express or implied. See the License for the
16 | specific language governing permissions and limitations
17 | under the License.
18 | */
19 |
#include <stdint.h>
21 |
/*
  Decode `count` zig-zag/varint encoded integers from *buffer into `result`.

  Each decoded value is 64 bits wide. *buffer is advanced past all of the
  bytes that were consumed.
*/
static inline void decode_zigzag_ints(const unsigned char **buffer, const uint64_t count, uint64_t *result) {
    const unsigned char *pos = *buffer;
    uint64_t idx;

    for (idx = 0; idx < count; idx++) {
        uint64_t raw = (uint64_t)(*pos & 0x7F);
        /* total shift is bounded well below 64 */
        unsigned char shift = 7;

        /* high bit set means another 7-bit group follows */
        while (*pos & 0x80) {
            pos++;
            raw |= (uint64_t)(*pos & 0x7F) << shift;
            shift += 7;
        }
        pos++;

        /* undo zig-zag: interleaved (magnitude, sign) back to two's complement */
        result[idx] = (raw >> 1) ^ (~(raw & 1) + 1);
    }
    *buffer = pos;
}
52 |
53 |
54 |
/*
  Advance *buffer past one zig-zag/varint encoded integer without decoding it.
*/
static inline void skip_zigzag_int(const unsigned char **buffer) {
    /* continuation bit (0x80) set means another byte follows */
    while (**buffer & 0x80) {
        (*buffer)++;
    }
    (*buffer)++;
}
66 |
--------------------------------------------------------------------------------
/pyiceberg/avro/decoder_fast.pyi:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | from pyiceberg.avro.decoder import BinaryDecoder
19 |
class CythonBinaryDecoder(BinaryDecoder):
    """Typing stub for the Cython-accelerated Avro binary decoder.

    The implementation lives in ``decoder_fast.pyx``; per stub-file (PEP 484)
    convention the bodies here are ``...`` placeholders, not ``pass``.
    """

    def __init__(self, input_contents: bytes) -> None: ...
    def tell(self) -> int: ...
    def read(self, n: int) -> bytes: ...
    def read_boolean(self) -> bool: ...
    def read_int(self) -> int: ...
    def read_ints(self, count: int) -> tuple[int, ...]: ...
    def read_int_bytes_dict(self, count: int, dest: dict[int, bytes]) -> None: ...
    def read_bytes(self) -> bytes: ...
    def read_float(self) -> float: ...
    def read_double(self) -> float: ...
    def read_utf8(self) -> str: ...
    def skip(self, n: int) -> None: ...
    def skip_int(self) -> None: ...
    def skip_boolean(self) -> None: ...
    def skip_float(self) -> None: ...
    def skip_double(self) -> None: ...
    def skip_bytes(self) -> None: ...
    def skip_utf8(self) -> None: ...
74 |
--------------------------------------------------------------------------------
/pyiceberg/avro/encoder.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | from typing import Any
18 | from uuid import UUID
19 |
20 | from pyiceberg.avro import STRUCT_DOUBLE, STRUCT_FLOAT
21 | from pyiceberg.io import OutputStream
22 | from pyiceberg.typedef import UTF8
23 |
24 |
class BinaryEncoder:
    """Encodes Python physical types into bytes, following the Avro binary encoding."""

    # The stream that all ``write_*`` methods append to.
    _output_stream: OutputStream

    def __init__(self, output_stream: OutputStream) -> None:
        self._output_stream = output_stream

    def write(self, b: bytes) -> None:
        """Write raw bytes to the underlying output stream.

        Args:
            b: The bytes to write.
        """
        self._output_stream.write(b)

    def write_boolean(self, boolean: bool) -> None:
        """Write a boolean as a single byte whose value is either 0 (false) or 1 (true).

        Args:
            boolean: The boolean to write.
        """
        self.write(bytearray([bool(boolean)]))

    def write_int(self, integer: int) -> None:
        """Integer and long values are written using variable-length zig-zag coding.

        Args:
            integer: The integer to write.
        """
        # Zig-zag encode so small negative values also get short encodings.
        datum = (integer << 1) ^ (integer >> 63)
        # Emit 7 bits per byte; the high bit marks a continuation byte.
        while (datum & ~0x7F) != 0:
            self.write(bytearray([(datum & 0x7F) | 0x80]))
            datum >>= 7
        self.write(bytearray([datum]))

    def write_float(self, f: float) -> None:
        """Write a float as 4 bytes."""
        self.write(STRUCT_FLOAT.pack(f))

    def write_double(self, f: float) -> None:
        """Write a double as 8 bytes."""
        self.write(STRUCT_DOUBLE.pack(f))

    def write_bytes(self, b: bytes) -> None:
        """Bytes are encoded as a long followed by that many bytes of data."""
        self.write_int(len(b))
        self.write(b)

    def write_utf8(self, s: str) -> None:
        """Encode a string as a long followed by that many bytes of UTF-8 encoded character data."""
        self.write_bytes(s.encode(UTF8))

    def write_uuid(self, uuid: UUID) -> None:
        """Write UUID as a fixed[16].

        The uuid logical type represents a random generated universally unique identifier (UUID).
        An uuid logical type annotates an Avro string. The string has to conform with RFC-4122.

        Raises:
            ValueError: If the UUID does not serialize to exactly 16 bytes.
        """
        if len(uuid.bytes) != 16:
            # Bug fix: report the actual byte count; the previous message rendered
            # the literal text "len(b'...')" instead of the length.
            raise ValueError(f"Expected UUID to have 16 bytes, got: {len(uuid.bytes)}")
        return self.write(uuid.bytes)

    def write_unknown(self, _: Any) -> None:
        """Nulls are written as 0 bytes in avro, so we do nothing."""
81 |
--------------------------------------------------------------------------------
/pyiceberg/catalog/memory.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | from pyiceberg.catalog.sql import SqlCatalog
19 |
20 |
class InMemoryCatalog(SqlCatalog):
    """
    An in-memory catalog implementation that uses SqlCatalog with SQLite in-memory database.

    This is useful for test, demo, and playground but not in production as it does not support concurrent access.
    """

    def __init__(self, name: str, warehouse: str = "file:///tmp/iceberg/warehouse", **kwargs: str) -> None:
        self._warehouse_location = warehouse
        # Default to an in-memory SQLite database unless the caller supplied a URI.
        kwargs.setdefault("uri", "sqlite:///:memory:")
        super().__init__(name=name, warehouse=warehouse, **kwargs)
--------------------------------------------------------------------------------
/pyiceberg/catalog/rest/response.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | from json import JSONDecodeError
18 | from typing import Dict, Literal, Optional, Type
19 |
20 | from pydantic import Field, ValidationError
21 | from requests import HTTPError
22 |
23 | from pyiceberg.exceptions import (
24 | AuthorizationExpiredError,
25 | BadRequestError,
26 | ForbiddenError,
27 | OAuthError,
28 | RESTError,
29 | ServerError,
30 | ServiceUnavailableError,
31 | UnauthorizedError,
32 | )
33 | from pyiceberg.typedef import IcebergBaseModel
34 |
35 |
class TokenResponse(IcebergBaseModel):
    """OAuth token payload returned by the catalog's token endpoint."""

    access_token: str
    token_type: str
    expires_in: Optional[int] = None
    issued_token_type: Optional[str] = None
    refresh_token: Optional[str] = None
    scope: Optional[str] = None
43 |
44 |
class ErrorResponseMessage(IcebergBaseModel):
    """The error detail carried inside a REST catalog error response."""

    message: str
    type: str
    code: int
49 |
50 |
class ErrorResponse(IcebergBaseModel):
    """Envelope wrapping the error detail returned by the REST catalog."""

    error: ErrorResponseMessage
53 |
54 |
class OAuthErrorResponse(IcebergBaseModel):
    """Error payload returned when an OAuth token request fails (RFC 6749 error codes)."""

    error: Literal[
        "invalid_request",
        "invalid_client",
        "invalid_grant",
        "unauthorized_client",
        "unsupported_grant_type",
        "invalid_scope",
    ]
    error_description: Optional[str] = None
    error_uri: Optional[str] = None
61 |
62 |
def _handle_non_200_response(exc: HTTPError, error_handler: Dict[int, Type[Exception]]) -> None:
    """Translate a non-2xx REST response into the matching pyiceberg exception.

    Args:
        exc: The HTTPError raised for the failed request.
        error_handler: Per-status-code overrides; these take precedence over the defaults.

    Raises:
        ValueError: If the HTTPError carries no response at all.
        Exception: The mapped exception type, with a message parsed from the response payload.
    """
    if exc.response is None:
        raise ValueError("Did not receive a response")

    # Default status-code mapping; exact matches win over the 5xx range check.
    default_exceptions: Dict[int, Type[Exception]] = {
        400: BadRequestError,
        401: UnauthorizedError,
        403: ForbiddenError,
        419: AuthorizationExpiredError,
        422: RESTError,
        501: NotImplementedError,
        503: ServiceUnavailableError,
    }

    code = exc.response.status_code
    exception: Type[Exception]
    if code in error_handler:
        exception = error_handler[code]
    elif code in default_exceptions:
        exception = default_exceptions[code]
    elif 500 <= code < 600:
        exception = ServerError
    else:
        exception = RESTError

    try:
        if exception == OAuthError:
            # The OAuthErrorResponse has a different format
            error = OAuthErrorResponse.model_validate_json(exc.response.text)
            response = str(error.error)
            if description := error.error_description:
                response += f": {description}"
            if uri := error.error_uri:
                response += f" ({uri})"
        else:
            error = ErrorResponse.model_validate_json(exc.response.text).error
            response = f"{error.type}: {error.message}"
    except JSONDecodeError:
        # In the case we don't have a proper response
        response = f"RESTError {exc.response.status_code}: Could not decode json payload: {exc.response.text}"
    except ValidationError as e:
        # In the case we don't have a proper response
        errs = ", ".join(err["msg"] for err in e.errors())
        response = f"RESTError {exc.response.status_code}: Received unexpected JSON Payload: {exc.response.text}, errors: {errs}"

    raise exception(response) from exc
112 |
--------------------------------------------------------------------------------
/pyiceberg/cli/__init__.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
--------------------------------------------------------------------------------
/pyiceberg/exceptions.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 |
class TableAlreadyExistsError(Exception):
    """Raised when creating a table with a name that already exists."""


class NamespaceNotEmptyError(Exception):
    """Raised when a name-space being dropped is not empty."""


class NamespaceAlreadyExistsError(Exception):
    """Raised when a name-space being created already exists in the catalog."""


class ValidationError(Exception):
    """Raised when there is an issue with the schema."""


class NoSuchTableError(Exception):
    """Raised when the table can't be found in the REST catalog."""


class NoSuchIcebergTableError(NoSuchTableError):
    """Raised when the table found in the REST catalog is not an iceberg table."""


class NoSuchViewError(Exception):
    """Raised when the view can't be found in the REST catalog."""


class NoSuchIdentifierError(Exception):
    """Raised when the identifier can't be found in the REST catalog."""


class NoSuchNamespaceError(Exception):
    """Raised when a referenced name-space is not found."""


class RESTError(Exception):
    """Raised when there is an unknown response from the REST Catalog."""


class BadRequestError(RESTError):
    """Raised when an invalid request is being made."""


class UnauthorizedError(RESTError):
    """Raised when you don't have the proper authorization."""


class ServiceUnavailableError(RESTError):
    """Raised when the service doesn't respond."""


class ServerError(RESTError):
    """Raised when there is an unhandled exception on the server side."""


class ForbiddenError(RESTError):
    """Raised when you don't have the credentials to perform the action on the REST catalog."""


class AuthorizationExpiredError(RESTError):
    """Raised when the credentials are expired when performing an action on the REST catalog."""


class OAuthError(RESTError):
    """Raised when there is an error with the OAuth call."""


class NoSuchPropertyException(Exception):
    """Raised when a property is missing."""


class NotInstalledError(Exception):
    """Raised when an optional dependency is not installed."""


class SignError(Exception):
    """Raised when unable to sign a S3 request."""


class ResolveError(Exception):
    """Raised when an error occurs during resolution."""


class DynamoDbError(Exception):
    """Base error for failures reported by the DynamoDB catalog."""


class ConditionalCheckFailedException(DynamoDbError):
    """Raised when a DynamoDB conditional check fails."""


class GenericDynamoDbError(DynamoDbError):
    """Raised for DynamoDB errors without a more specific mapping."""


class CommitFailedException(Exception):
    """Commit failed, refresh and try again."""


class CommitStateUnknownException(RESTError):
    """Commit failed due to unknown reason."""


class WaitingForLockException(Exception):
    """Need to wait for a lock, try again."""


class ValidationException(Exception):
    """Raised when validation fails."""
129 |
--------------------------------------------------------------------------------
/pyiceberg/py.typed:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | # Marker file for PEP 561
19 |
--------------------------------------------------------------------------------
/pyiceberg/table/puffin.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | import math
18 | from typing import TYPE_CHECKING, Dict, List, Literal, Optional
19 |
20 | from pydantic import Field
21 | from pyroaring import BitMap, FrozenBitMap
22 |
23 | from pyiceberg.typedef import IcebergBaseModel
24 |
25 | if TYPE_CHECKING:
26 | import pyarrow as pa
27 |
28 | # Short for: Puffin Fratercula arctica, version 1
29 | MAGIC_BYTES = b"PFA1"
30 | EMPTY_BITMAP = FrozenBitMap()
31 | MAX_JAVA_SIGNED = int(math.pow(2, 31)) - 1
32 | PROPERTY_REFERENCED_DATA_FILE = "referenced-data-file"
33 |
34 |
def _deserialize_bitmap(pl: bytes) -> List[BitMap]:
    """Deserialize a portable 64-bit roaring bitmap payload into per-key 32-bit bitmaps.

    The payload starts with an 8-byte little-endian bitmap count, followed by
    (4-byte little-endian key, serialized 32-bit roaring bitmap) pairs. Gaps
    between keys are filled with empty bitmaps so that the list index of each
    bitmap equals its key.
    """
    number_of_bitmaps = int.from_bytes(pl[0:8], byteorder="little")
    pl = pl[8:]

    bitmaps = []
    last_key = -1
    for _ in range(number_of_bitmaps):
        key = int.from_bytes(pl[0:4], byteorder="little")
        # NOTE(review): int.from_bytes of raw bytes is never negative; this is a
        # purely defensive check.
        if key < 0:
            raise ValueError(f"Invalid unsigned key: {key}")
        if key <= last_key:
            raise ValueError("Keys must be sorted in ascending order")
        if key > MAX_JAVA_SIGNED:
            raise ValueError(f"Key {key} is too large, max {MAX_JAVA_SIGNED} to maintain compatibility with Java impl")
        pl = pl[4:]

        # Pad skipped keys with empty bitmaps so indexes line up with keys.
        while last_key < key - 1:
            bitmaps.append(EMPTY_BITMAP)
            last_key += 1

        bm = BitMap().deserialize(pl)
        # TODO: Optimize this
        # NOTE(review): re-serializing only to learn the number of consumed
        # bytes is expensive; presumably pyroaring does not expose that count
        # directly — confirm before optimizing.
        pl = pl[len(bm.serialize()) :]
        bitmaps.append(bm)

        last_key = key

    return bitmaps
63 |
64 |
class PuffinBlobMetadata(IcebergBaseModel):
    """Metadata describing a single blob stored in a Puffin file."""

    type: Literal["deletion-vector-v1"]
    fields: List[int]
    snapshot_id: int = Field(alias="snapshot-id")
    sequence_number: int = Field(alias="sequence-number")
    offset: int
    length: int
    compression_codec: Optional[str] = Field(alias="compression-codec", default=None)
    properties: Dict[str, str] = Field(default_factory=dict)
74 |
75 |
class Footer(IcebergBaseModel):
    """Parsed Puffin footer payload: the blob index plus file-level properties."""

    blobs: List[PuffinBlobMetadata]
    properties: Dict[str, str] = Field(default_factory=dict)
79 |
80 |
def _bitmaps_to_chunked_array(bitmaps: List[BitMap]) -> "pa.ChunkedArray":
    """Combine per-key bitmaps into one chunked array of 64-bit positions (key in the high 32 bits)."""
    import pyarrow as pa

    chunks = (
        [(key_pos << 32) + pos for pos in bitmap]
        for key_pos, bitmap in enumerate(bitmaps)
    )
    return pa.chunked_array(chunks)
85 |
86 |
class PuffinFile:
    """Parses a Puffin file (magic ``PFA1``) and exposes its deletion vectors.

    The file must start and end with the magic bytes; the trailer consists of
    the footer payload, a 4-byte payload size, a 4-byte flags word, and the
    trailing magic.
    """

    # Parsed footer: blob index plus file-level properties.
    footer: Footer
    # Deserialized deletion vectors, keyed by the referenced data file path.
    _deletion_vectors: Dict[str, List[BitMap]]

    def __init__(self, puffin: bytes) -> None:
        """Parse the complete Puffin file contents.

        Raises:
            ValueError: If the magic bytes are wrong or the footer is compressed.
        """
        # Both the start and the end of the file must carry the magic bytes.
        for magic_bytes in [puffin[:4], puffin[-4:]]:
            if magic_bytes != MAGIC_BYTES:
                raise ValueError(f"Incorrect magic bytes, expected {MAGIC_BYTES!r}, got {magic_bytes!r}")

        # One flag is set, the rest should be zero
        # byte 0 (first)
        # - bit 0 (lowest bit): whether FooterPayload is compressed
        # - all other bits are reserved for future use and should be set to 0 on write
        flags = puffin[-8:-4]
        if flags[0] != 0:
            raise ValueError("The Puffin-file has a compressed footer, which is not yet supported")

        # 4 byte integer is always signed, in a two's complement representation, stored little-endian.
        footer_payload_size_int = int.from_bytes(puffin[-12:-8], byteorder="little")

        self.footer = Footer.model_validate_json(puffin[-(footer_payload_size_int + 12) : -12])
        # NOTE(review): blob offsets are applied after dropping the first 8
        # bytes of the file here — confirm this matches the offset origin
        # defined by the Puffin spec.
        puffin = puffin[8:]

        self._deletion_vectors = {
            blob.properties[PROPERTY_REFERENCED_DATA_FILE]: _deserialize_bitmap(puffin[blob.offset : blob.offset + blob.length])
            for blob in self.footer.blobs
        }

    def to_vector(self) -> Dict[str, "pa.ChunkedArray"]:
        """Return, per referenced data file path, the deleted positions as a chunked array."""
        return {path: _bitmaps_to_chunked_array(bitmaps) for path, bitmaps in self._deletion_vectors.items()}
117 |
--------------------------------------------------------------------------------
/pyiceberg/table/refs.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | from enum import Enum
18 | from typing import Annotated, Optional
19 |
20 | from pydantic import Field, model_validator
21 |
22 | from pyiceberg.exceptions import ValidationError
23 | from pyiceberg.typedef import IcebergBaseModel
24 |
25 | MAIN_BRANCH = "main"
26 |
27 |
class SnapshotRefType(str, Enum):
    """The kind of a snapshot reference: a movable branch or an immutable tag."""

    BRANCH = "branch"
    TAG = "tag"

    def __repr__(self) -> str:
        """Return the developer-facing representation, e.g. ``SnapshotRefType.BRANCH``."""
        return "SnapshotRefType." + self.name

    def __str__(self) -> str:
        """Return the raw string value, e.g. ``branch``."""
        return self.value
39 |
40 |
class SnapshotRef(IcebergBaseModel):
    """A named reference to a snapshot: either a branch or a tag.

    The validators below reject the branch-only retention settings
    (``min_snapshots_to_keep`` and ``max_snapshot_age_ms``) when the
    reference is a tag.
    """

    snapshot_id: int = Field(alias="snapshot-id")
    snapshot_ref_type: SnapshotRefType = Field(alias="type")
    # Branch-only; must be positive when provided.
    min_snapshots_to_keep: Annotated[Optional[int], Field(alias="min-snapshots-to-keep", default=None, gt=0)]
    # Branch-only; must be positive when provided.
    max_snapshot_age_ms: Annotated[Optional[int], Field(alias="max-snapshot-age-ms", default=None, gt=0)]
    # Applies to both branches and tags; must be positive when provided.
    max_ref_age_ms: Annotated[Optional[int], Field(alias="max-ref-age-ms", default=None, gt=0)]

    @model_validator(mode="after")
    def check_min_snapshots_to_keep(self) -> "SnapshotRef":
        # Note: raises pyiceberg's ValidationError, not pydantic's.
        if self.min_snapshots_to_keep is not None and self.snapshot_ref_type == SnapshotRefType.TAG:
            raise ValidationError("Tags do not support setting minSnapshotsToKeep")
        return self

    @model_validator(mode="after")
    def check_max_snapshot_age_ms(self) -> "SnapshotRef":
        # Note: raises pyiceberg's ValidationError, not pydantic's.
        if self.max_snapshot_age_ms is not None and self.snapshot_ref_type == SnapshotRefType.TAG:
            raise ValidationError("Tags do not support setting maxSnapshotAgeMs")
        return self
59 |
--------------------------------------------------------------------------------
/pyiceberg/table/statistics.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | from typing import Dict, List, Literal, Optional
18 |
19 | from pydantic import Field
20 |
21 | from pyiceberg.typedef import IcebergBaseModel
22 |
23 |
class BlobMetadata(IcebergBaseModel):
    """Metadata for a single statistics blob referenced from table metadata."""

    type: Literal["apache-datasketches-theta-v1", "deletion-vector-v1"]
    snapshot_id: int = Field(alias="snapshot-id")
    sequence_number: int = Field(alias="sequence-number")
    fields: List[int]
    properties: Optional[Dict[str, str]] = None
30 |
31 |
class StatisticsFile(IcebergBaseModel):
    """A statistics file entry in table metadata, tied to a specific snapshot."""

    snapshot_id: int = Field(alias="snapshot-id")
    statistics_path: str = Field(alias="statistics-path")
    file_size_in_bytes: int = Field(alias="file-size-in-bytes")
    file_footer_size_in_bytes: int = Field(alias="file-footer-size-in-bytes")
    key_metadata: Optional[str] = Field(alias="key-metadata", default=None)
    blob_metadata: List[BlobMetadata] = Field(alias="blob-metadata")
39 |
40 |
def filter_statistics_by_snapshot_id(
    statistics: List[StatisticsFile],
    reject_snapshot_id: int,
) -> List[StatisticsFile]:
    """Return the statistics files, excluding any tied to the rejected snapshot id."""
    kept = []
    for stats_file in statistics:
        if stats_file.snapshot_id != reject_snapshot_id:
            kept.append(stats_file)
    return kept
46 |
--------------------------------------------------------------------------------
/pyiceberg/table/update/statistics.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | from typing import TYPE_CHECKING, Tuple
18 |
19 | from pyiceberg.table.statistics import StatisticsFile
20 | from pyiceberg.table.update import (
21 | RemoveStatisticsUpdate,
22 | SetStatisticsUpdate,
23 | TableUpdate,
24 | UpdatesAndRequirements,
25 | UpdateTableMetadata,
26 | )
27 |
28 | if TYPE_CHECKING:
29 | from pyiceberg.table import Transaction
30 |
31 |
class UpdateStatistics(UpdateTableMetadata["UpdateStatistics"]):
    """
    Run statistics management operations using APIs.

    APIs include set_statistics and remove statistics operations.

    Use table.update_statistics().<operation>().commit() to run a specific operation.
    Use table.update_statistics().<operation1>().<operation2>().commit() to run multiple operations.

    Pending changes are applied on commit.

    We can also use context managers to make more changes. For example:

    with table.update_statistics() as update:
        update.set_statistics(statistics_file=statistics_file)
        update.remove_statistics(snapshot_id=2)
    """

    # Updates queued so far; emitted together on commit.
    _updates: Tuple[TableUpdate, ...] = ()

    def __init__(self, transaction: "Transaction") -> None:
        super().__init__(transaction)

    def set_statistics(self, statistics_file: StatisticsFile) -> "UpdateStatistics":
        """Queue an update that sets the given statistics file on the table."""
        self._updates += (
            SetStatisticsUpdate(
                statistics=statistics_file,
            ),
        )

        return self

    def remove_statistics(self, snapshot_id: int) -> "UpdateStatistics":
        """Queue an update that removes the statistics for the given snapshot.

        Bug fix: append to the pending updates instead of replacing the whole
        tuple, which silently dropped any previously queued operations.
        """
        self._updates += (
            RemoveStatisticsUpdate(
                snapshot_id=snapshot_id,
            ),
        )

        return self

    def _commit(self) -> UpdatesAndRequirements:
        """Return the queued updates and an empty requirements tuple."""
        return self._updates, ()
75 |
--------------------------------------------------------------------------------
/pyiceberg/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
--------------------------------------------------------------------------------
/pyiceberg/utils/bin_packing.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | from __future__ import annotations
18 |
19 | from typing import (
20 | Callable,
21 | Generic,
22 | Iterable,
23 | List,
24 | Optional,
25 | TypeVar,
26 | )
27 |
T = TypeVar("T")


class Bin(Generic[T]):
    """A container that accumulates items until a target weight is reached."""

    def __init__(self, target_weight: int) -> None:
        self.bin_weight = 0
        self.target_weight = target_weight
        self.items: List[T] = []

    def weight(self) -> int:
        """Return the combined weight of every item currently in the bin."""
        return self.bin_weight

    def can_add(self, weight: int) -> bool:
        """Check whether an item of the given weight still fits under the target."""
        return self.target_weight >= self.bin_weight + weight

    def add(self, item: T, weight: int) -> None:
        """Store the item and account for its weight."""
        self.items.append(item)
        self.bin_weight += weight
46 |
47 |
class PackingIterator(Generic[T]):
    """Greedily pack a stream of items into weight-bounded groups.

    Keeps up to ``lookback`` bins open at once; each yielded list is the
    contents of one completed bin.
    """

    bins: List[Bin[T]]

    def __init__(
        self,
        items: Iterable[T],
        target_weight: int,
        lookback: int,
        weight_func: Callable[[T], int],
        largest_bin_first: bool = False,
    ) -> None:
        self.items = iter(items)
        self.target_weight = target_weight
        self.lookback = lookback
        self.weight_func = weight_func
        self.largest_bin_first = largest_bin_first
        self.bins = []

    def __iter__(self) -> PackingIterator[T]:
        """Return an iterator for the PackingIterator class."""
        return self

    def __next__(self) -> List[T]:
        """Return the next item when iterating over the PackingIterator class."""
        # Keep consuming the underlying iterator; resumes where the previous
        # __next__ call left off because self.items is a shared iterator.
        for item in self.items:
            item_weight = self.weight_func(item)
            target = self.find_bin(item_weight)
            if target is None:
                # Nothing open can fit this item: start a fresh bin for it.
                target = Bin(self.target_weight)
                self.bins.append(target)
            target.add(item, item_weight)

            # Once we exceed the lookback window, emit a completed bin.
            if len(self.bins) > self.lookback:
                return self.remove_bin().items

        # Source exhausted: drain the remaining bins one per call.
        if not self.bins:
            raise StopIteration()
        return self.remove_bin().items

    def find_bin(self, weight: int) -> Optional[Bin[T]]:
        """Return the first open bin that can still fit the given weight, if any."""
        return next((candidate for candidate in self.bins if candidate.can_add(weight)), None)

    def remove_bin(self) -> Bin[T]:
        """Pop a bin: the heaviest when largest_bin_first is set, else the oldest."""
        if not self.largest_bin_first:
            return self.bins.pop(0)
        heaviest = max(self.bins, key=lambda candidate: candidate.weight())
        self.bins.remove(heaviest)
        return heaviest
107 |
108 |
class ListPacker(Generic[T]):
    """Convenience wrapper that packs whole lists via PackingIterator."""

    _target_weight: int
    _lookback: int
    _largest_bin_first: bool

    def __init__(self, target_weight: int, lookback: int, largest_bin_first: bool) -> None:
        self._target_weight = target_weight
        self._lookback = lookback
        self._largest_bin_first = largest_bin_first

    def pack(self, items: List[T], weight_func: Callable[[T], int]) -> List[List[T]]:
        """Pack items front-to-back into weight-bounded groups."""
        iterator = PackingIterator(
            items=items,
            target_weight=self._target_weight,
            lookback=self._lookback,
            weight_func=weight_func,
            largest_bin_first=self._largest_bin_first,
        )
        return [group for group in iterator]

    def pack_end(self, items: List[T], weight_func: Callable[[T], int]) -> List[List[T]]:
        """Pack items back-to-front, preserving the original order inside each group."""
        reversed_groups = self.pack(items=list(reversed(items)), weight_func=weight_func)
        return [list(reversed(group)) for group in reversed(reversed_groups)]
133 |
--------------------------------------------------------------------------------
/pyiceberg/utils/concurrent.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | """Concurrency concepts that support efficient multi-threading."""
18 |
19 | from concurrent.futures import Executor, ThreadPoolExecutor
20 | from typing import Optional
21 |
22 | from pyiceberg.utils.config import Config
23 |
24 |
class ExecutorFactory:
    """Provides a single, lazily created, process-wide ThreadPoolExecutor."""

    _instance: Optional[Executor] = None

    @staticmethod
    def get_or_create() -> Executor:
        """Return the same executor in each call."""
        # Lazily construct the shared pool on first use only.
        if ExecutorFactory._instance is None:
            ExecutorFactory._instance = ThreadPoolExecutor(max_workers=ExecutorFactory.max_workers())

        return ExecutorFactory._instance

    @staticmethod
    def max_workers() -> Optional[int]:
        """Return the max number of workers configured (None lets the pool pick a default)."""
        return Config().get_int("max-workers")
41 |
--------------------------------------------------------------------------------
/pyiceberg/utils/deprecated.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | import functools
18 | import warnings
19 | from typing import Any, Callable, Optional
20 |
21 |
def deprecated(deprecated_in: str, removed_in: str, help_message: Optional[str] = None) -> Callable:  # type: ignore
    """Mark functions as deprecated.

    Adding this will result in a warning being emitted when the function is used.

    Args:
        deprecated_in: Version in which the function was deprecated.
        removed_in: Version in which the function will be removed.
        help_message: Optional guidance appended to the warning.
    """
    # Normalize to a suffix string so that a missing help message does not
    # embed the literal text "None" in the emitted warning.
    suffix = f" {help_message}." if help_message is not None else ""

    def decorator(func: Callable):  # type: ignore
        @functools.wraps(func)
        def new_func(*args: Any, **kwargs: Any) -> Any:
            message = f"Call to {func.__name__}, deprecated in {deprecated_in}, will be removed in {removed_in}.{suffix}"

            _deprecation_warning(message)

            return func(*args, **kwargs)

        return new_func

    return decorator
42 |
43 |
def deprecation_notice(deprecated_in: str, removed_in: str, help_message: Optional[str]) -> str:
    """Return a deprecation notice.

    When no help message is supplied, the notice omits the trailing help text
    instead of embedding the literal string "None".
    """
    message = f"Deprecated in {deprecated_in}, will be removed in {removed_in}."
    return f"{message} {help_message}" if help_message else message
47 |
48 |
def deprecation_message(deprecated_in: str, removed_in: str, help_message: Optional[str]) -> None:
    """Mark properties or behaviors as deprecated.

    Adding this will result in a warning being emitted.
    """
    notice = deprecation_notice(deprecated_in, removed_in, help_message)
    _deprecation_warning(notice)
55 |
56 |
57 | def _deprecation_warning(message: str) -> None:
58 | with warnings.catch_warnings(): # temporarily override warning handling
59 | warnings.warn(
60 | message,
61 | category=DeprecationWarning,
62 | stacklevel=2,
63 | )
64 |
--------------------------------------------------------------------------------
/pyiceberg/utils/lazydict.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | from typing import (
19 | Dict,
20 | Iterator,
21 | Mapping,
22 | Optional,
23 | Sequence,
24 | TypeVar,
25 | Union,
26 | cast,
27 | )
28 |
K = TypeVar("K")
V = TypeVar("V")


class LazyDict(Mapping[K, V]):
    """Lazily build a dictionary from an array of items."""

    __slots__ = ("_contents", "_dict")

    # Since Python's type system is not powerful enough to express the type of the
    # contents of the dictionary, we specify the type as a sequence of either K or V
    # values.
    #
    # Rather than spending the runtime cost of checking the type of each item, we presume
    # that the developer has correctly used the class and that the contents are valid.
    def __init__(self, contents: Sequence[Sequence[Union[K, V]]]):
        self._contents = contents
        self._dict: Optional[Dict[K, V]] = None

    def _build_dict(self) -> Dict[K, V]:
        """Materialize the dictionary from the alternating key/value sequences."""
        self._dict = {}
        for item in self._contents:
            # Even positions are keys, odd positions are the matching values.
            self._dict.update(dict(zip(cast(Sequence[K], item[::2]), cast(Sequence[V], item[1::2]))))

        return self._dict

    def _source(self) -> Dict[K, V]:
        """Return the materialized dict, building it at most once.

        Uses an `is None` check rather than truthiness so that an empty
        dictionary is not needlessly rebuilt on every access.
        """
        return self._dict if self._dict is not None else self._build_dict()

    def __getitem__(self, key: K, /) -> V:
        """Return the value for the given key."""
        return self._source()[key]

    def __iter__(self) -> Iterator[K]:
        """Return an iterator over the keys of the dictionary."""
        return iter(self._source())

    def __len__(self) -> int:
        """Return the number of items in the dictionary."""
        return len(self._source())

    def __dict__(self) -> Dict[K, V]:  # type: ignore
        """Convert the lazy dict in a dict."""
        return self._source()
73 |
--------------------------------------------------------------------------------
/pyiceberg/utils/parsing.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | import re
18 | from re import Pattern
19 |
20 | from pyiceberg.exceptions import ValidationError
21 |
22 |
class ParseNumberFromBrackets:
    """Extracts the size from a string in the form of prefix[22]."""

    regex: Pattern  # type: ignore
    prefix: str

    def __init__(self, prefix: str):
        self.prefix = prefix
        # Escape the prefix so any regex metacharacters in it are matched literally.
        self.regex = re.compile(rf"{re.escape(prefix)}\[(\d+)\]")

    def match(self, str_repr: str) -> int:
        """Return the bracketed number in ``str_repr``.

        Raises:
            ValidationError: If the string does not contain ``prefix[<digits>]``.
        """
        matches = self.regex.search(str_repr)
        if matches:
            return int(matches.group(1))
        raise ValidationError(f"Could not match {str_repr}, expected format {self.prefix}[22]")
38 |
--------------------------------------------------------------------------------
/pyiceberg/utils/properties.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | from typing import (
19 | Any,
20 | Dict,
21 | Optional,
22 | )
23 |
24 | from pyiceberg.typedef import Properties
25 | from pyiceberg.types import strtobool
26 |
27 | HEADER_PREFIX = "header."
28 |
29 |
def property_as_int(
    properties: Dict[str, str],
    property_name: str,
    default: Optional[int] = None,
) -> Optional[int]:
    """Look up a property and parse it as an int, falling back to ``default``.

    Raises:
        ValueError: If the property value cannot be parsed as an integer.
    """
    raw = properties.get(property_name)
    if not raw:
        return default
    try:
        return int(raw)
    except ValueError as e:
        raise ValueError(f"Could not parse table property {property_name} to an integer: {raw}") from e
42 |
43 |
def property_as_float(
    properties: Dict[str, str],
    property_name: str,
    default: Optional[float] = None,
) -> Optional[float]:
    """Look up a property and parse it as a float, falling back to ``default``.

    Raises:
        ValueError: If the property value cannot be parsed as a float.
    """
    raw = properties.get(property_name)
    if not raw:
        return default
    try:
        return float(raw)
    except ValueError as e:
        raise ValueError(f"Could not parse table property {property_name} to a float: {raw}") from e
56 |
57 |
def property_as_bool(
    properties: Dict[str, str],
    property_name: str,
    default: bool,
) -> bool:
    """Look up a property and parse it as a boolean, falling back to ``default``.

    Raises:
        ValueError: If the property value cannot be parsed as a boolean.
    """
    raw = properties.get(property_name)
    if raw:
        try:
            return strtobool(raw)
        except ValueError as e:
            raise ValueError(f"Could not parse table property {property_name} to a boolean: {raw}") from e
    return default
69 |
70 |
def get_first_property_value(
    properties: Properties,
    *property_names: str,
) -> Optional[Any]:
    """Return the value of the first named property that has a truthy value, else None."""
    for name in property_names:
        value = properties.get(name)
        if value:
            return value
    return None
79 |
80 |
def get_header_properties(
    properties: Properties,
) -> Properties:
    """Return the properties whose keys start with ``header.``, with that prefix stripped."""
    result = {}
    for key, value in properties.items():
        if key.startswith(HEADER_PREFIX):
            result[key[len(HEADER_PREFIX) :]] = value
    return result
86 |
--------------------------------------------------------------------------------
/pyiceberg/utils/singleton.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | """
18 | This is a singleton metaclass that can be used to cache and reuse existing objects.
19 |
20 | In the Iceberg codebase we have a lot of objects that are stateless (for example Types such as StringType,
21 | BooleanType etc). FixedTypes have arguments (eg. Fixed[22]) that we also make part of the key when caching
22 | the newly created object.
23 |
24 | The Singleton uses a metaclass which essentially defines a new type. When the Type gets created, it will first
25 | evaluate the `__call__` method with all the arguments. If we already initialized a class earlier, we'll just
26 | return it.
27 |
28 | More information on metaclasses: https://docs.python.org/3/reference/datamodel.html#metaclasses
29 | """
30 |
31 | from typing import Any, ClassVar, Dict
32 |
33 |
34 | def _convert_to_hashable_type(element: Any) -> Any:
35 | if isinstance(element, dict):
36 | return tuple((_convert_to_hashable_type(k), _convert_to_hashable_type(v)) for k, v in element.items())
37 | elif isinstance(element, list):
38 | return tuple(map(_convert_to_hashable_type, element))
39 | return element
40 |
41 |
class Singleton:
    """Base class that caches and reuses one instance per (class, args, kwargs) key."""

    _instances: ClassVar[Dict] = {}  # type: ignore

    def __new__(cls, *args, **kwargs):  # type: ignore
        # kwargs must be folded into a hashable form before they can be a cache key.
        cache_key = (cls, tuple(args), _convert_to_hashable_type(kwargs))
        try:
            return cls._instances[cache_key]
        except KeyError:
            instance = super().__new__(cls)
            cls._instances[cache_key] = instance
            return instance

    def __deepcopy__(self, memo: Dict[int, Any]) -> Any:
        """
        Prevent deep copy operations for singletons.

        The IcebergRootModel inherits from Pydantic RootModel,
        which has its own implementation of deepcopy. When deepcopy
        runs, it calls the RootModel __deepcopy__ method and ignores
        that it's a Singleton. To handle this, the order of inheritance
        is adjusted and a __deepcopy__ method is implemented for
        singletons that simply returns itself.
        """
        return self
63 |
--------------------------------------------------------------------------------
/pyiceberg/utils/truncate.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | from typing import Optional
18 |
19 |
def truncate_upper_bound_text_string(value: str, trunc_length: Optional[int]) -> Optional[str]:
    """Truncate ``value`` to ``trunc_length`` characters and bump it to a valid upper bound.

    Scans backwards for the first character that can be incremented; returns None
    when every truncated character is already at the maximum code point.
    """
    truncated = value[:trunc_length]
    if truncated == value:
        # Nothing was cut off, so the value itself is already an upper bound.
        return truncated

    chars = list(truncated)
    for index in range(len(chars) - 1, -1, -1):
        candidate = ord(chars[index]) + 1
        # chr() only accepts code points up to U+10FFFF.
        if candidate <= 0x10FFFF:
            chars[index] = chr(candidate)
            return "".join(chars)
    return None  # didn't find a valid upper bound
36 |
37 |
def truncate_upper_bound_binary_string(value: bytes, trunc_length: Optional[int]) -> Optional[bytes]:
    """Truncate ``value`` to ``trunc_length`` bytes and bump it to a valid upper bound.

    Scans backwards for the first byte below 255 and increments it; returns None
    when every truncated byte is already 0xFF.
    """
    truncated = value[:trunc_length]
    if truncated == value:
        # Nothing was cut off, so the value itself is already an upper bound.
        return truncated

    byte_values = bytearray(truncated)
    for index in range(len(byte_values) - 1, -1, -1):
        if byte_values[index] < 255:
            byte_values[index] += 1
            return bytes(byte_values)
    return None
49 |
--------------------------------------------------------------------------------
/ruff.toml:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | src = ['pyiceberg','tests']
19 | extend-exclude = ["dev/provision.py"]
20 |
21 | # Exclude a variety of commonly ignored directories.
22 | exclude = [
23 | ".bzr",
24 | ".direnv",
25 | ".eggs",
26 | ".git",
27 | ".git-rewrite",
28 | ".hg",
29 | ".mypy_cache",
30 | ".nox",
31 | ".pants.d",
32 | ".pytype",
33 | ".ruff_cache",
34 | ".svn",
35 | ".tox",
36 | ".venv",
37 | "__pypackages__",
38 | "_build",
39 | "buck-out",
40 | "build",
41 | "dist",
42 | "node_modules",
43 | "venv",
44 | ]
45 |
# Maximum line length used by the formatter and lint rules.
# Same as Black.
48 | line-length = 130
49 |
50 | [lint]
51 | select = [
52 | "E", # pycodestyle
53 | "W", # pycodestyle
54 | "F", # Pyflakes
55 | "B", # flake8-bugbear
56 | "PIE", # flake8-pie
57 | "C4", # flake8-comprehensions
58 | "I", # isort
59 | "UP", # pyupgrade
60 | ]
61 | ignore = ["E501","E203","B024","B028","UP037", "UP035", "UP006"]
62 |
# Allow autofix for all enabled rules (when `--fix` is provided).
64 | fixable = ["ALL"]
65 | unfixable = []
66 |
67 | per-file-ignores = {}
68 |
69 | # Allow unused variables when underscore-prefixed.
70 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
71 |
72 | [lint.pyupgrade]
73 | # Preserve types, even if a file imports `from __future__ import annotations`.
74 | keep-runtime-typing = true
75 |
76 | [lint.isort]
77 | detect-same-package = true
78 | lines-between-types = 0
79 | known-first-party = ["pyiceberg", "tests"]
80 | section-order = ["future", "standard-library", "third-party", "first-party", "local-folder"]
81 |
82 | [format]
83 | quote-style = "double"
84 |
--------------------------------------------------------------------------------
/tests/avro/test_encoder.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | from __future__ import annotations
18 |
19 | import io
20 | import struct
21 | import uuid
22 |
23 | from pyiceberg.avro.encoder import BinaryEncoder
24 |
25 |
def test_write() -> None:
    """Raw writes must pass the bytes through unmodified."""
    output = io.BytesIO()
    encoder = BinaryEncoder(output)

    payload = b"\x12\x34\x56"
    encoder.write(payload)

    assert output.getbuffer() == payload
35 |
36 |
def test_write_boolean() -> None:
    """Booleans are encoded as one byte each."""
    output = io.BytesIO()
    encoder = BinaryEncoder(output)

    for flag in (True, False):
        encoder.write_boolean(flag)

    assert output.getbuffer() == struct.pack("??", True, False)
45 |
46 |
def test_write_int() -> None:
    """Integers of increasing magnitude take 1 through 8 bytes of variable-length encoding."""
    output = io.BytesIO()
    encoder = BinaryEncoder(output)

    # (value, expected encoded bytes) pairs, one per encoded width.
    cases = [
        (2, b"\x04"),
        (7466, b"\xd4\x74"),
        (523490, b"\xc4\xf3\x3f"),
        (86561570, b"\xc4\xcc\xc6\x52"),
        (2510416930, b"\xc4\xb0\x8f\xda\x12"),
        (734929016866, b"\xc4\xe0\xf6\xd2\xe3\x2a"),
        (135081528772642, b"\xc4\xa0\xce\xe8\xe3\xb6\x3d"),
        (35124861473277986, b"\xc4\xa0\xb2\xae\x83\xf8\xe4\x7c"),
    ]

    for value, _ in cases:
        encoder.write_int(value)

    buffer = output.getbuffer()

    offset = 0
    for _, expected in cases:
        end = offset + len(expected)
        assert buffer[offset:end] == expected
        offset = end
79 |
80 |
def test_write_float() -> None:
    """Floats are written as little-endian IEEE-754 single precision."""
    output = io.BytesIO()
    encoder = BinaryEncoder(output)

    _input = 3.14159265359

    encoder.write_float(_input)

    assert output.getbuffer() == struct.pack("<f", _input)


def test_write_double() -> None:
    """Doubles are written as little-endian IEEE-754 double precision."""
    output = io.BytesIO()
    encoder = BinaryEncoder(output)

    _input = 3.14159265359

    encoder.write_double(_input)

    assert output.getbuffer() == struct.pack("<d", _input)


def test_write_bytes() -> None:
    """Byte strings are written as a length prefix (zig-zag 3 -> 0x06) followed by the raw bytes."""
    output = io.BytesIO()
    encoder = BinaryEncoder(output)

    _input = b"\x12\x34\x56"

    encoder.write_bytes(_input)

    assert output.getbuffer() == b"".join([b"\x06", _input])
112 |
113 |
def test_write_utf8() -> None:
    """Strings are written as a zig-zag length prefix followed by the UTF-8 bytes."""
    output = io.BytesIO()
    encoder = BinaryEncoder(output)

    text = "That, my liege, is how we know the Earth to be banana-shaped."
    encoded_text = text.encode()
    encoder.write_utf8(text)

    assert output.getbuffer() == b"".join([b"\x7a", encoded_text])
123 |
124 |
def test_write_uuid() -> None:
    """UUIDs are written as exactly 16 bytes with no length prefix."""
    output = io.BytesIO()
    encoder = BinaryEncoder(output)

    value = uuid.UUID("12345678-1234-5678-1234-567812345678")
    encoder.write_uuid(value)

    data = output.getbuffer()
    assert len(data) == 16
    assert data.tobytes() == b"\x124Vx\x124Vx\x124Vx\x124Vx"
135 |
--------------------------------------------------------------------------------
/tests/benchmark/test_benchmark.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | import statistics
18 | import timeit
19 | import urllib
20 |
21 | import pyarrow as pa
22 | import pyarrow.parquet as pq
23 | import pytest
24 |
25 | from pyiceberg.transforms import DayTransform
26 |
27 |
@pytest.fixture(scope="session")
def taxi_dataset(tmp_path_factory: pytest.TempPathFactory) -> pa.Table:
    """Reads the Taxi dataset to disk"""
    # `import urllib` alone does not load the `urllib.request` submodule; import it
    # explicitly so `urlretrieve` is always available regardless of what other
    # modules happen to have been imported first.
    import urllib.request

    taxi_dataset = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet"
    taxi_dataset_dest = tmp_path_factory.mktemp("taxi_dataset") / "yellow_tripdata_2022-01.parquet"
    urllib.request.urlretrieve(taxi_dataset, taxi_dataset_dest)

    return pq.read_table(taxi_dataset_dest)
36 |
37 |
@pytest.mark.benchmark
def test_partitioned_write(tmp_path_factory: pytest.TempPathFactory, taxi_dataset: pa.Table) -> None:
    """Tests writing to a partitioned table, close to a production-like situation."""
    from pyiceberg.catalog.sql import SqlCatalog

    warehouse_path = str(tmp_path_factory.mktemp("warehouse"))
    catalog = SqlCatalog(
        "default",
        uri=f"sqlite:///{warehouse_path}/pyiceberg_catalog.db",
        warehouse=f"file://{warehouse_path}",
    )
    catalog.create_namespace("default")

    tbl = catalog.create_table("default.taxi_partitioned", schema=taxi_dataset.schema)
    with tbl.update_spec() as spec:
        spec.add_field("tpep_pickup_datetime", DayTransform())

    # Profiling can sometimes be handy as well
    # with cProfile.Profile() as pr:
    #     tbl.append(taxi_dataset)
    #
    #     pr.print_stats(sort=True)

    runs = []
    for run in range(5):
        start_time = timeit.default_timer()
        tbl.append(taxi_dataset)
        elapsed = timeit.default_timer() - start_time

        print(f"Run {run} took: {elapsed}")
        runs.append(elapsed)

    print(f"Average runtime of {round(statistics.mean(runs), 2)} seconds")
73 |
--------------------------------------------------------------------------------
/tests/catalog/test_rest_auth.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | import base64
19 |
20 | import pytest
21 | import requests
22 | from requests_mock import Mocker
23 |
24 | from pyiceberg.catalog.rest.auth import AuthManagerAdapter, BasicAuthManager, NoopAuthManager
25 |
26 | TEST_URI = "https://iceberg-test-catalog/"
27 |
28 |
@pytest.fixture
def rest_mock(requests_mock: Mocker) -> Mocker:
    """Mock GET on the test catalog URI with an empty JSON body so request headers can be inspected."""
    requests_mock.get(TEST_URI, json={}, status_code=200)
    return requests_mock
37 |
38 |
def test_noop_auth_header(rest_mock: Mocker) -> None:
    """A NoopAuthManager must not attach any Authorization header to outgoing requests."""
    session = requests.Session()
    session.auth = AuthManagerAdapter(NoopAuthManager())

    session.get(TEST_URI)

    requests_made = rest_mock.request_history
    assert len(requests_made) == 1
    assert "Authorization" not in requests_made[0].headers
49 |
50 |
def test_basic_auth_header(rest_mock: Mocker) -> None:
    """A BasicAuthManager must attach an RFC 7617 Basic Authorization header."""
    username = "testuser"
    password = "testpassword"
    encoded_credentials = base64.b64encode(f"{username}:{password}".encode()).decode()
    expected_header = f"Basic {encoded_credentials}"

    session = requests.Session()
    session.auth = AuthManagerAdapter(BasicAuthManager(username=username, password=password))

    session.get(TEST_URI)

    requests_made = rest_mock.request_history
    assert len(requests_made) == 1
    assert requests_made[0].headers["Authorization"] == expected_header
66 |
--------------------------------------------------------------------------------
/tests/cli/test_output.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
--------------------------------------------------------------------------------
/tests/integration/test_register_table.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | import pytest
18 |
19 | from pyiceberg.catalog import Catalog
20 | from pyiceberg.exceptions import NoSuchTableError, TableAlreadyExistsError
21 | from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec
22 | from pyiceberg.schema import Schema
23 | from pyiceberg.table import Table
24 | from pyiceberg.types import (
25 | BooleanType,
26 | DateType,
27 | IntegerType,
28 | NestedField,
29 | StringType,
30 | )
31 |
# Shared schema for the register-table tests.
# NOTE(review): field IDs are intentionally non-contiguous (1, 2, 4, 10) — presumably to
# mimic a schema that has evolved; confirm before normalizing them.
TABLE_SCHEMA = Schema(
    NestedField(field_id=1, name="foo", field_type=BooleanType(), required=False),
    NestedField(field_id=2, name="bar", field_type=StringType(), required=False),
    NestedField(field_id=4, name="baz", field_type=IntegerType(), required=False),
    NestedField(field_id=10, name="qux", field_type=DateType(), required=False),
)
38 |
39 |
def _create_table(
    session_catalog: Catalog,
    identifier: str,
    format_version: int,
    location: str,
    partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC,
    schema: Schema = TABLE_SCHEMA,
) -> Table:
    """Drop the table if it already exists, then create it fresh with the given layout."""
    try:
        session_catalog.drop_table(identifier=identifier)
    except NoSuchTableError:
        pass  # first run: nothing to drop

    table_properties = {"format-version": str(format_version)}
    return session_catalog.create_table(
        identifier=identifier,
        schema=schema,
        location=location,
        properties=table_properties,
        partition_spec=partition_spec,
    )
60 |
61 |
@pytest.mark.integration
@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")])
def test_register_table(
    catalog: Catalog,
) -> None:
    """Dropping a table and re-registering it from its metadata file should restore it."""
    identifier = "default.register_table"
    location = "s3a://warehouse/default/register_table"
    tbl = _create_table(catalog, identifier, 2, location)
    assert catalog.table_exists(identifier=identifier)
    catalog.drop_table(identifier=identifier)
    assert not catalog.table_exists(identifier=identifier)
    # Use the same identifier variable as the assertions above so they cannot drift apart
    # (the original hard-coded a tuple here, duplicating the table name).
    catalog.register_table(identifier, metadata_location=tbl.metadata_location)
    assert catalog.table_exists(identifier=identifier)
75 |
76 |
@pytest.mark.integration
@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")])
def test_register_table_existing(
    catalog: Catalog,
) -> None:
    """Registering a table over an identifier that already exists must raise TableAlreadyExistsError."""
    identifier = "default.register_table_existing"
    location = "s3a://warehouse/default/register_table_existing"
    tbl = _create_table(catalog, identifier, 2, location)
    assert catalog.table_exists(identifier=identifier)
    # Assert that registering the table again raises TableAlreadyExistsError.
    # Use the same identifier variable as above instead of a duplicated hard-coded tuple.
    with pytest.raises(TableAlreadyExistsError):
        catalog.register_table(identifier, metadata_location=tbl.metadata_location)
89 |
--------------------------------------------------------------------------------
/tests/integration/test_rest_catalog.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | # pylint:disable=redefined-outer-name
18 |
19 | import pytest
20 |
21 | from pyiceberg.catalog.rest import RestCatalog
22 |
23 | TEST_NAMESPACE_IDENTIFIER = "TEST NS"
24 |
25 |
@pytest.mark.integration
@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog")])
def test_namespace_exists(catalog: RestCatalog) -> None:
    """namespace_exists reports True once the namespace has been created."""
    # Ensure the namespace is present before checking.
    already_present = catalog.namespace_exists(TEST_NAMESPACE_IDENTIFIER)
    if not already_present:
        catalog.create_namespace(TEST_NAMESPACE_IDENTIFIER)

    assert catalog.namespace_exists(TEST_NAMESPACE_IDENTIFIER)
33 |
34 |
@pytest.mark.integration
@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog")])
def test_namespace_not_exists(catalog: RestCatalog) -> None:
    """namespace_exists reports False after the namespace has been dropped."""
    # Ensure the namespace is absent before checking.
    namespace_present = catalog.namespace_exists(TEST_NAMESPACE_IDENTIFIER)
    if namespace_present:
        catalog.drop_namespace(TEST_NAMESPACE_IDENTIFIER)

    assert not catalog.namespace_exists(TEST_NAMESPACE_IDENTIFIER)
42 |
43 |
@pytest.mark.integration
@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog")])
def test_create_namespace_if_not_exists(catalog: RestCatalog) -> None:
    """create_namespace_if_not_exists creates the namespace when it is absent."""
    # Start from a clean slate: make sure the namespace is gone first.
    if catalog.namespace_exists(TEST_NAMESPACE_IDENTIFIER):
        catalog.drop_namespace(TEST_NAMESPACE_IDENTIFIER)

    catalog.create_namespace_if_not_exists(TEST_NAMESPACE_IDENTIFIER)

    assert catalog.namespace_exists(TEST_NAMESPACE_IDENTIFIER)
53 |
54 |
@pytest.mark.integration
@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog")])
def test_create_namespace_if_already_existing(catalog: RestCatalog) -> None:
    """create_namespace_if_not_exists is a no-op (and does not raise) when the namespace exists."""
    # Make sure the namespace is already present.
    if not catalog.namespace_exists(TEST_NAMESPACE_IDENTIFIER):
        catalog.create_namespace(TEST_NAMESPACE_IDENTIFIER)

    catalog.create_namespace_if_not_exists(TEST_NAMESPACE_IDENTIFIER)

    assert catalog.namespace_exists(TEST_NAMESPACE_IDENTIFIER)
64 |
--------------------------------------------------------------------------------
/tests/integration/test_snapshot_operations.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | import pytest
18 |
19 | from pyiceberg.catalog import Catalog
20 | from pyiceberg.table.refs import SnapshotRef
21 |
22 |
@pytest.mark.integration
@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")])
def test_create_tag(catalog: Catalog) -> None:
    """Tagging a historical snapshot surfaces the tag in the table's refs."""
    tbl = catalog.load_table("default.test_table_snapshot_operations")
    history = tbl.history()
    assert len(history) > 3
    target_snapshot_id = history[-3].snapshot_id
    tbl.manage_snapshots().create_tag(snapshot_id=target_snapshot_id, tag_name="tag123").commit()
    assert tbl.metadata.refs["tag123"] == SnapshotRef(snapshot_id=target_snapshot_id, snapshot_ref_type="tag")
32 |
33 |
@pytest.mark.integration
@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")])
def test_create_branch(catalog: Catalog) -> None:
    """Branching from a historical snapshot surfaces the branch in the table's refs."""
    tbl = catalog.load_table("default.test_table_snapshot_operations")
    history = tbl.history()
    assert len(history) > 2
    source_snapshot_id = history[-2].snapshot_id
    tbl.manage_snapshots().create_branch(snapshot_id=source_snapshot_id, branch_name="branch123").commit()
    assert tbl.metadata.refs["branch123"] == SnapshotRef(snapshot_id=source_snapshot_id, snapshot_ref_type="branch")
43 |
44 |
@pytest.mark.integration
@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")])
def test_remove_tag(catalog: Catalog) -> None:
    """A tag can be created and subsequently removed from the table's refs."""
    tbl = catalog.load_table("default.test_table_snapshot_operations")
    history = tbl.history()
    assert len(history) > 3
    # First, create the tag that will be removed.
    tag_name = "tag_to_remove"
    target_snapshot_id = history[-3].snapshot_id
    tbl.manage_snapshots().create_tag(snapshot_id=target_snapshot_id, tag_name=tag_name).commit()
    assert tbl.metadata.refs[tag_name] == SnapshotRef(snapshot_id=target_snapshot_id, snapshot_ref_type="tag")
    # Now remove it and verify it is gone.
    tbl.manage_snapshots().remove_tag(tag_name=tag_name).commit()
    assert tbl.metadata.refs.get(tag_name) is None
59 |
60 |
@pytest.mark.integration
@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")])
def test_remove_branch(catalog: Catalog) -> None:
    """A branch can be created and subsequently removed from the table's refs."""
    tbl = catalog.load_table("default.test_table_snapshot_operations")
    history = tbl.history()
    assert len(history) > 2
    # First, create the branch that will be removed.
    branch_name = "branch_to_remove"
    source_snapshot_id = history[-2].snapshot_id
    tbl.manage_snapshots().create_branch(snapshot_id=source_snapshot_id, branch_name=branch_name).commit()
    assert tbl.metadata.refs[branch_name] == SnapshotRef(snapshot_id=source_snapshot_id, snapshot_ref_type="branch")
    # Now remove it and verify it is gone.
    tbl.manage_snapshots().remove_branch(branch_name=branch_name).commit()
    assert tbl.metadata.refs.get(branch_name) is None
75 |
--------------------------------------------------------------------------------
/tests/integration/test_statistics_operations.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | from typing import TYPE_CHECKING
18 |
19 | import pytest
20 |
21 | from pyiceberg.exceptions import NoSuchTableError
22 | from pyiceberg.table.statistics import BlobMetadata, StatisticsFile
23 |
24 | if TYPE_CHECKING:
25 | import pyarrow as pa
26 |
27 | from pyiceberg.catalog import Catalog
28 | from pyiceberg.schema import Schema
29 | from pyiceberg.table import Table
30 |
31 |
def _create_table_with_schema(catalog: "Catalog", schema: "Schema") -> "Table":
    """Recreate the statistics test table from scratch with the given schema."""
    table_identifier = "default.test_table_statistics_operations"

    try:
        catalog.drop_table(table_identifier)
    except NoSuchTableError:
        pass  # table did not exist yet
    return catalog.create_table(identifier=table_identifier, schema=schema)
40 |
41 |
@pytest.mark.integration
@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")])
def test_manage_statistics(catalog: "Catalog", arrow_table_with_null: "pa.Table") -> None:
    """Set and remove statistics files, both directly and inside an explicit transaction."""
    tbl = _create_table_with_schema(catalog, arrow_table_with_null.schema)

    # Two appends -> two snapshots to attach statistics to.
    tbl.append(arrow_table_with_null)
    tbl.append(arrow_table_with_null)

    snapshot_id_1 = tbl.history()[0].snapshot_id
    snapshot_id_2 = tbl.history()[1].snapshot_id

    def build_statistics_file(snapshot_id: int, type_name: str) -> StatisticsFile:
        # Minimal statistics file carrying a single blob for the given snapshot.
        blob = BlobMetadata(
            type=type_name,
            snapshot_id=snapshot_id,
            sequence_number=2,
            fields=[1],
            properties={"prop-key": "prop-value"},
        )
        return StatisticsFile(
            snapshot_id=snapshot_id,
            statistics_path="s3://bucket/warehouse/stats.puffin",
            file_size_in_bytes=124,
            file_footer_size_in_bytes=27,
            blob_metadata=[blob],
        )

    stats_snap_1 = build_statistics_file(snapshot_id_1, "apache-datasketches-theta-v1")
    stats_snap_2 = build_statistics_file(snapshot_id_2, "deletion-vector-v1")

    with tbl.update_statistics() as update:
        update.set_statistics(stats_snap_1)
        update.set_statistics(stats_snap_2)

    assert len(tbl.metadata.statistics) == 2

    with tbl.update_statistics() as update:
        update.remove_statistics(snapshot_id_1)

    assert len(tbl.metadata.statistics) == 1

    # The same update API is available on an explicit transaction.
    with tbl.transaction() as txn:
        with txn.update_statistics() as update:
            update.set_statistics(stats_snap_1)
            update.set_statistics(stats_snap_2)

    assert len(tbl.metadata.statistics) == 2
92 |
--------------------------------------------------------------------------------
/tests/integration/test_writes/test_optimistic_concurrency.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | import pyarrow as pa
19 | import pytest
20 | from pyspark.sql import SparkSession
21 |
22 | from pyiceberg.catalog import Catalog
23 | from pyiceberg.exceptions import CommitFailedException
24 | from utils import _create_table
25 |
26 |
@pytest.mark.integration
@pytest.mark.parametrize("format_version", [1, 2])
def test_conflict_delete_delete(
    spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int
) -> None:
    """A delete committed from stale table metadata must fail after a concurrent delete."""
    identifier = "default.test_conflict"
    fresh_tbl = _create_table(session_catalog, identifier, {"format-version": format_version}, [arrow_table_with_null])
    stale_tbl = session_catalog.load_table(identifier)

    fresh_tbl.delete("string == 'z'")

    # stale_tbl still points at the pre-delete snapshot, so its commit is rejected.
    with pytest.raises(CommitFailedException, match="(branch main has changed: expected id ).*"):
        stale_tbl.delete("string == 'z'")
42 |
43 |
@pytest.mark.integration
@pytest.mark.parametrize("format_version", [1, 2])
def test_conflict_delete_append(
    spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int
) -> None:
    """An append committed from stale table metadata must fail after a concurrent delete."""
    identifier = "default.test_conflict"
    fresh_tbl = _create_table(session_catalog, identifier, {"format-version": format_version}, [arrow_table_with_null])
    stale_tbl = session_catalog.load_table(identifier)

    # The first writer's delete is allowed.
    fresh_tbl.delete("string == 'z'")

    # stale_tbl still points at the pre-delete snapshot, so its commit is rejected.
    with pytest.raises(CommitFailedException, match="(branch main has changed: expected id ).*"):
        stale_tbl.append(arrow_table_with_null)
60 |
61 |
@pytest.mark.integration
@pytest.mark.parametrize("format_version", [1, 2])
def test_conflict_append_delete(
    spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int
) -> None:
    """A delete committed from stale table metadata must fail after a concurrent append."""
    identifier = "default.test_conflict"
    fresh_tbl = _create_table(session_catalog, identifier, {"format-version": format_version}, [arrow_table_with_null])
    stale_tbl = session_catalog.load_table(identifier)

    fresh_tbl.append(arrow_table_with_null)

    # stale_tbl still points at the pre-append snapshot, so its commit is rejected.
    with pytest.raises(CommitFailedException, match="(branch main has changed: expected id ).*"):
        stale_tbl.delete("string == 'z'")
77 |
78 |
@pytest.mark.integration
@pytest.mark.parametrize("format_version", [1, 2])
def test_conflict_append_append(
    spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int
) -> None:
    """An append committed from stale table metadata must fail after a concurrent append."""
    identifier = "default.test_conflict"
    fresh_tbl = _create_table(session_catalog, identifier, {"format-version": format_version}, [arrow_table_with_null])
    stale_tbl = session_catalog.load_table(identifier)

    fresh_tbl.append(arrow_table_with_null)

    # stale_tbl still points at the pre-append snapshot, so its commit is rejected.
    with pytest.raises(CommitFailedException, match="(branch main has changed: expected id ).*"):
        stale_tbl.append(arrow_table_with_null)
94 |
--------------------------------------------------------------------------------
/tests/integration/test_writes/utils.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | # pylint:disable=redefined-outer-name
18 | from typing import List, Optional, Union
19 |
20 | import pyarrow as pa
21 |
22 | from pyiceberg.catalog import Catalog
23 | from pyiceberg.exceptions import NoSuchTableError
24 | from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec
25 | from pyiceberg.schema import Schema
26 | from pyiceberg.table import Table
27 | from pyiceberg.typedef import EMPTY_DICT, Properties
28 | from pyiceberg.types import (
29 | BinaryType,
30 | BooleanType,
31 | DateType,
32 | DoubleType,
33 | FixedType,
34 | FloatType,
35 | IntegerType,
36 | LongType,
37 | NestedField,
38 | StringType,
39 | TimestampType,
40 | TimestamptzType,
41 | )
42 |
# Default schema for the write-path integration tests: one optional field per primitive
# type that the Spark readers in these tests can consume (time and uuid are disabled below).
TABLE_SCHEMA = Schema(
    NestedField(field_id=1, name="bool", field_type=BooleanType(), required=False),
    NestedField(field_id=2, name="string", field_type=StringType(), required=False),
    NestedField(field_id=3, name="string_long", field_type=StringType(), required=False),
    NestedField(field_id=4, name="int", field_type=IntegerType(), required=False),
    NestedField(field_id=5, name="long", field_type=LongType(), required=False),
    NestedField(field_id=6, name="float", field_type=FloatType(), required=False),
    NestedField(field_id=7, name="double", field_type=DoubleType(), required=False),
    # NestedField(field_id=8, name="time", field_type=TimeType(), required=False), # Spark does not support time fields
    NestedField(field_id=8, name="timestamp", field_type=TimestampType(), required=False),
    NestedField(field_id=9, name="timestamptz", field_type=TimestamptzType(), required=False),
    NestedField(field_id=10, name="date", field_type=DateType(), required=False),
    # NestedField(field_id=11, name="time", field_type=TimeType(), required=False),
    # NestedField(field_id=12, name="uuid", field_type=UuidType(), required=False),
    NestedField(field_id=11, name="binary", field_type=BinaryType(), required=False),
    NestedField(field_id=12, name="fixed", field_type=FixedType(16), required=False),
)
60 |
61 |
def _create_table(
    session_catalog: Catalog,
    identifier: str,
    properties: Properties = EMPTY_DICT,
    data: Optional[List[pa.Table]] = None,
    partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC,
    schema: Union[Schema, "pa.Schema"] = TABLE_SCHEMA,
) -> Table:
    """Recreate the table from scratch and optionally append the given Arrow tables to it."""
    try:
        session_catalog.drop_table(identifier=identifier)
    except NoSuchTableError:
        pass  # first run: nothing to drop

    tbl = session_catalog.create_table(
        identifier=identifier,
        schema=schema,
        properties=properties,
        partition_spec=partition_spec,
    )

    for batch in data or []:
        tbl.append(batch)

    return tbl
82 |
--------------------------------------------------------------------------------
/tests/table/bitmaps/64map32bitvals.bin:
--------------------------------------------------------------------------------
1 | :0
--------------------------------------------------------------------------------
/tests/table/bitmaps/64mapempty.bin:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/table/bitmaps/64maphighvals.bin:
--------------------------------------------------------------------------------
1 | ����:0 ��
2 | ��������������������������:0 ��
3 | ��������������������������:0 ��
4 | ��������������������������:0 ��
5 | ��������������������������:0 ��
6 | ��������������������������:0 ��
7 | ��������������������������:0 ��
8 | ��������������������������:0 ��
9 | ��������������������������:0 ��
10 | ��������������������������:0 ��
11 | ��������������������������:0 ��
12 | ����������������������
--------------------------------------------------------------------------------
/tests/table/bitmaps/64mapspreadvals.bin:
--------------------------------------------------------------------------------
1 |
2 | :0 :0 :0 :0 :0 :0 :0 :0 :0 :0
--------------------------------------------------------------------------------
/tests/table/test_puffin.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | from os import path
18 | from typing import List
19 |
20 | import pytest
21 | from pyroaring import BitMap
22 |
23 | from pyiceberg.table.puffin import _deserialize_bitmap
24 |
25 |
def _open_file(file: str) -> bytes:
    """Read a bitmap fixture from the bitmaps/ directory next to this test file."""
    bitmaps_dir = path.join(path.dirname(path.realpath(__file__)), "bitmaps")
    with open(path.join(bitmaps_dir, file), "rb") as f:
        return f.read()
30 |
31 |
def test_map_empty() -> None:
    """An empty serialized payload deserializes to no bitmaps at all."""
    serialized = _open_file("64mapempty.bin")

    actual = _deserialize_bitmap(serialized)

    assert actual == []
39 |
40 |
def test_map_bitvals() -> None:
    """A single bitmap holding the values 0..9 round-trips through deserialization."""
    serialized = _open_file("64map32bitvals.bin")

    actual = _deserialize_bitmap(serialized)

    assert actual == [BitMap(range(10))]
48 |
49 |
def test_map_spread_vals() -> None:
    """Ten bitmaps (one per 32-bit "high" key), each holding the values 0..9, deserialize correctly."""
    puffin = _open_file("64mapspreadvals.bin")

    # The fixture spreads identical low-value sets across ten high keys; build the
    # expected list with a comprehension instead of ten copy-pasted literals.
    expected = [BitMap([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) for _ in range(10)]
    actual = _deserialize_bitmap(puffin)

    assert expected == actual
68 |
69 |
def test_map_high_vals() -> None:
    """Keys above the signed 32-bit Java int range are rejected during deserialization."""
    serialized = _open_file("64maphighvals.bin")

    expected_error = "Key 4022190063 is too large, max 2147483647 to maintain compatibility with Java impl"
    with pytest.raises(ValueError, match=expected_error):
        _deserialize_bitmap(serialized)
75 |
--------------------------------------------------------------------------------
/tests/table/test_refs.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | # pylint:disable=eval-used
18 | import pytest
19 | from pydantic import ValidationError
20 |
21 | from pyiceberg import exceptions
22 | from pyiceberg.table.refs import SnapshotRef, SnapshotRefType
23 |
24 |
def test_snapshot_with_properties_repr() -> None:
    """repr() of a SnapshotRef is valid Python that reconstructs an equal object."""
    snapshot_ref = SnapshotRef(
        snapshot_id=3051729675574597004,
        snapshot_ref_type=SnapshotRefType.TAG,
        min_snapshots_to_keep=None,
        max_snapshot_age_ms=None,
        max_ref_age_ms=10000000,
    )

    expected_repr = (
        "SnapshotRef(snapshot_id=3051729675574597004, snapshot_ref_type=SnapshotRefType.TAG, "
        "min_snapshots_to_keep=None, max_snapshot_age_ms=None, max_ref_age_ms=10000000)"
    )
    assert repr(snapshot_ref) == expected_repr
    # The repr must round-trip: evaluating it yields an equal SnapshotRef.
    assert eval(repr(snapshot_ref)) == snapshot_ref
39 |
40 |
def test_snapshot_with_invalid_field() -> None:
    """Invalid SnapshotRef field combinations are rejected at construction time."""

    def build_tag_ref(min_snapshots_to_keep, max_snapshot_age_ms, max_ref_age_ms):
        # All cases below share the same snapshot id and TAG ref type.
        return SnapshotRef(
            snapshot_id=3051729675574597004,
            snapshot_ref_type=SnapshotRefType.TAG,
            min_snapshots_to_keep=min_snapshots_to_keep,
            max_snapshot_age_ms=max_snapshot_age_ms,
            max_ref_age_ms=max_ref_age_ms,
        )

    # min_snapshots_to_keep, if present, must be greater than 0
    with pytest.raises(ValidationError):
        build_tag_ref(-1, None, 10000000)

    # max_snapshot_age_ms, if present, must be greater than 0
    with pytest.raises(ValidationError):
        build_tag_ref(1, -1, 10000000)

    # max_ref_age_ms, if present, must be greater than 0
    with pytest.raises(ValidationError):
        build_tag_ref(None, None, -1)

    # Tags cannot carry snapshot-retention settings at all.
    with pytest.raises(exceptions.ValidationError, match="Tags do not support setting minSnapshotsToKeep"):
        build_tag_ref(1, None, 10000000)

    with pytest.raises(exceptions.ValidationError, match="Tags do not support setting maxSnapshotAgeMs"):
        build_tag_ref(None, 1, 100000)
89 |
--------------------------------------------------------------------------------
/tests/test_serializers.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | import json
19 | import os
20 | import uuid
21 | from typing import Any, Dict
22 |
23 | import pytest
24 | from pytest_mock import MockFixture
25 |
26 | from pyiceberg.serializers import ToOutputFile
27 | from pyiceberg.table import StaticTable
28 | from pyiceberg.table.metadata import TableMetadataV1
29 |
30 |
def test_legacy_current_snapshot_id(
    mocker: MockFixture, tmp_path_factory: pytest.TempPathFactory, example_table_metadata_no_snapshot_v1: Dict[str, Any]
) -> None:
    """A snapshot-less table round-trips through the legacy current-snapshot-id=-1 on-disk form."""
    from pyiceberg.io.pyarrow import PyArrowFileIO

    location = str(tmp_path_factory.mktemp("metadata") / f"{uuid.uuid4()}.metadata.json")
    table_metadata = TableMetadataV1(**example_table_metadata_no_snapshot_v1)
    ToOutputFile.table_metadata(table_metadata, PyArrowFileIO().new_output(location=location), overwrite=True)
    static_table = StaticTable.from_metadata(location)
    assert static_table.metadata.current_snapshot_id is None

    # Opt in to the legacy behavior of writing -1 instead of omitting current-snapshot-id.
    mocker.patch.dict(os.environ, values={"PYICEBERG_LEGACY_CURRENT_SNAPSHOT_ID": "True"})

    ToOutputFile.table_metadata(table_metadata, PyArrowFileIO().new_output(location=location), overwrite=True)
    with PyArrowFileIO().new_input(location=location).open() as input_stream:
        raw_json = input_stream.read()
        assert json.loads(raw_json)["current-snapshot-id"] == -1
    legacy_table = StaticTable.from_metadata(location)
    # Reading the -1 sentinel back must normalize to None and compare equal to the original.
    assert legacy_table.metadata.current_snapshot_id is None
    assert legacy_table.metadata == static_table.metadata
51 |
--------------------------------------------------------------------------------
/tests/test_typedef.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | import pytest
18 |
19 | from pyiceberg.typedef import FrozenDict, KeyDefaultDict, Record
20 |
21 |
def test_setitem_frozendict() -> None:
    """Item assignment on a FrozenDict must raise AttributeError."""
    frozen = FrozenDict(foo=1, bar=2)
    with pytest.raises(AttributeError):
        frozen["foo"] = 3
26 |
27 |
def test_update_frozendict() -> None:
    """Bulk update on a FrozenDict must raise AttributeError."""
    frozen = FrozenDict(foo=1, bar=2)
    with pytest.raises(AttributeError):
        frozen.update({"yes": 2})
32 |
33 |
def test_keydefaultdict() -> None:
    """A missing key is materialized by calling the factory with that key."""
    always_one = KeyDefaultDict(lambda _: 1)
    assert always_one[22] == 1
40 |
41 |
def test_record_named_args() -> None:
    """Positional Record values are retrievable by index; repr uses bracket notation."""
    record = Record(1, "a", True)

    assert (record[0], record[1]) == (1, "a")
    assert record[2] is True

    assert repr(record) == "Record[1, a, True]"
50 |
--------------------------------------------------------------------------------
/tests/test_version.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | from pyiceberg import __version__
19 |
20 |
def test_version_format() -> None:
    """The codebase __version__ must match the installed package metadata."""
    from importlib import metadata

    version_from_metadata = metadata.version("pyiceberg")

    mismatch_hint = (
        f"The installed version ({version_from_metadata}) does not match the current codebase version ({__version__})."
        "This failure could be due to a recent version bump in the Pyiceberg library. "
        "Please ensure you have the latest version installed by rerunning `make install` command."
    )
    assert __version__ == version_from_metadata, mismatch_hint
31 |
--------------------------------------------------------------------------------
/tests/utils/test_concurrent.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | import os
19 | from concurrent.futures import ThreadPoolExecutor
20 | from typing import Dict, Optional
21 | from unittest import mock
22 |
23 | import pytest
24 |
25 | from pyiceberg.utils.concurrent import ExecutorFactory
26 |
# Environment fixtures for ExecutorFactory.max_workers(): unset, numeric, and non-numeric.
EMPTY_ENV: Dict[str, Optional[str]] = {}
VALID_ENV = {"PYICEBERG_MAX_WORKERS": "5"}
INVALID_ENV = {"PYICEBERG_MAX_WORKERS": "invalid"}
30 |
31 |
def test_create_reused() -> None:
    """ExecutorFactory hands out a single shared ThreadPoolExecutor instance."""
    executor = ExecutorFactory.get_or_create()
    assert isinstance(executor, ThreadPoolExecutor)
    assert ExecutorFactory.get_or_create() is executor
37 |
38 |
@mock.patch.dict(os.environ, EMPTY_ENV)
def test_max_workers_none() -> None:
    """Without PYICEBERG_MAX_WORKERS in the environment, max_workers() reports None."""
    workers = ExecutorFactory.max_workers()
    assert workers is None
42 |
43 |
@mock.patch.dict(os.environ, VALID_ENV)
def test_max_workers() -> None:
    """PYICEBERG_MAX_WORKERS=5 is parsed into the integer 5."""
    workers = ExecutorFactory.max_workers()
    assert workers == 5
47 |
48 |
@mock.patch.dict(os.environ, INVALID_ENV)
def test_max_workers_invalid() -> None:
    # A non-numeric PYICEBERG_MAX_WORKERS value must raise ValueError when parsed.
    with pytest.raises(ValueError):
        ExecutorFactory.max_workers()
53 |
--------------------------------------------------------------------------------
/tests/utils/test_decimal.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | from decimal import Decimal
18 |
19 | import pytest
20 |
21 | from pyiceberg.utils.decimal import decimal_required_bytes, decimal_to_bytes
22 |
23 |
def test_decimal_required_bytes() -> None:
    """decimal_required_bytes maps a decimal precision to its minimal byte width."""
    expected_widths = {1: 1, 2: 1, 3: 2, 4: 2, 5: 3, 7: 4, 8: 4, 10: 5, 32: 14, 38: 16}
    for precision, width in expected_widths.items():
        assert decimal_required_bytes(precision=precision) == width

    # Out-of-range precisions are rejected with the valid interval in the message.
    for bad_precision in (40, -1):
        with pytest.raises(ValueError) as exc_info:
            decimal_required_bytes(precision=bad_precision)
        assert "(0, 40]" in str(exc_info.value)
43 |
44 |
def test_decimal_to_bytes() -> None:
    # Check the boundary between 2 and 3 bytes.
    # 2 bytes has a minimum of -32,768 and a maximum value of 32,767 (inclusive).
    assert decimal_to_bytes(Decimal("32767.")) == b"\x7f\xff"
    # 32,768 no longer fits in two signed bytes, so a third byte is required.
    assert decimal_to_bytes(Decimal("32768.")) == b"\x00\x80\x00"
50 |
--------------------------------------------------------------------------------
/tests/utils/test_deprecated.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | from unittest.mock import Mock, patch
18 |
19 | from pyiceberg.utils.deprecated import deprecated
20 |
21 |
@patch("warnings.warn")
def test_deprecated(warn: Mock) -> None:
    """The @deprecated decorator warns with versions and help text on call."""

    @deprecated(
        deprecated_in="0.1.0",
        removed_in="0.2.0",
        help_message="Please use load_something_else() instead",
    )
    def deprecated_method() -> None:
        pass

    deprecated_method()

    expected_message = (
        "Call to deprecated_method, deprecated in 0.1.0, will be removed in 0.2.0. "
        "Please use load_something_else() instead."
    )
    assert warn.called
    assert warn.call_args[0] == (expected_message,)
38 |
39 |
@patch("warnings.warn")
def test_deprecation_message(warn: Mock) -> None:
    """deprecation_message warns directly, without wrapping a callable."""
    from pyiceberg.utils.deprecated import deprecation_message

    deprecation_message(
        deprecated_in="0.1.0",
        removed_in="0.2.0",
        help_message="Please use something_else instead",
    )

    expected_args = ("Deprecated in 0.1.0, will be removed in 0.2.0. Please use something_else instead",)
    assert warn.called
    assert warn.call_args[0] == expected_args
52 |
--------------------------------------------------------------------------------
/tests/utils/test_lazydict.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | from pyiceberg.utils.lazydict import LazyDict
19 |
20 |
def test_lazy_dict_ints() -> None:
    """Integer key/value pairs are read from flat [k, v, ...] chunks."""
    contents = LazyDict[int, int]([[1, 2], [3, 4]])
    assert (contents[1], contents[3]) == (2, 4)
25 |
26 |
def test_lazy_dict_strings() -> None:
    """A chunk may hold several key/value pairs; chunk lengths may differ."""
    contents = LazyDict[int, str]([[1, "red", 5, "banana"], [3, "blue"]])
    for key, expected in ((1, "red"), (3, "blue"), (5, "banana")):
        assert contents[key] == expected
32 |
--------------------------------------------------------------------------------
/tests/utils/test_properties.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | import pytest
19 |
20 | from pyiceberg.utils.properties import (
21 | get_first_property_value,
22 | property_as_bool,
23 | property_as_float,
24 | property_as_int,
25 | )
26 |
27 |
def test_property_as_int() -> None:
    """property_as_int parses present values, honors the default, and returns None otherwise."""
    properties = {"int": "42"}

    assert property_as_int(properties, "int") == 42
    assert property_as_int(properties, "missing", default=1) == 1
    assert property_as_int(properties, "missing") is None
36 |
37 |
def test_property_as_int_with_invalid_value() -> None:
    """A non-numeric property value surfaces a descriptive ValueError."""
    with pytest.raises(ValueError) as exc:
        property_as_int({"some_int_prop": "invalid"}, "some_int_prop")

    assert "Could not parse table property some_int_prop to an integer: invalid" in str(exc.value)
47 |
48 |
def test_property_as_float() -> None:
    """property_as_float parses present values, honors the default, and returns None otherwise."""
    properties = {"float": "42.0"}

    assert property_as_float(properties, "float", default=1.0) == 42.0
    assert property_as_float(properties, "missing", default=1.0) == 1.0
    assert property_as_float(properties, "missing") is None
57 |
58 |
def test_property_as_float_with_invalid_value() -> None:
    """A non-numeric property value surfaces a descriptive ValueError."""
    with pytest.raises(ValueError) as exc:
        property_as_float({"some_float_prop": "invalid"}, "some_float_prop")

    assert "Could not parse table property some_float_prop to a float: invalid" in str(exc.value)
68 |
69 |
def test_property_as_bool() -> None:
    """property_as_bool parses present values and falls back to the default when missing."""
    properties = {
        "bool": "True",
    }

    assert property_as_bool(properties, "bool", default=False) is True
    assert property_as_bool(properties, "missing", default=False) is False
    # Fixed copy-paste bug: this line previously called property_as_float, so the
    # bool helper's default handling for a truthy default was never exercised.
    assert property_as_bool(properties, "missing", default=True) is True
78 |
79 |
def test_property_as_bool_with_invalid_value() -> None:
    """A non-boolean property value surfaces a descriptive ValueError."""
    with pytest.raises(ValueError) as exc:
        property_as_bool({"some_bool_prop": "invalid"}, "some_bool_prop", True)

    assert "Could not parse table property some_bool_prop to a boolean: invalid" in str(exc.value)
89 |
90 |
def test_get_first_property_value() -> None:
    """The value of the first present property name wins; missing names are skipped."""
    properties = {"prop_1": "value_1", "prop_2": "value_2"}

    assert get_first_property_value(properties, "prop_2", "prop_1") == "value_2"
    assert get_first_property_value(properties, "missing", "prop_1") == "value_1"
99 |
--------------------------------------------------------------------------------
/tests/utils/test_singleton.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | from pyiceberg.avro.reader import BooleanReader, FixedReader
18 | from pyiceberg.transforms import VoidTransform
19 |
20 |
def test_singleton() -> None:
    """We want to reuse the readers to avoid creating a gazillion of them.

    Uses `is` rather than comparing id() of two temporaries: with id(), the first
    object may be garbage-collected before the second is created and its address
    reused, making the comparison unreliable. `is` keeps both operands alive.
    """
    assert BooleanReader() is BooleanReader()
    # Singletons are keyed by constructor arguments: same length -> same instance.
    assert FixedReader(22) is FixedReader(22)
    assert FixedReader(19) is not FixedReader(25)
26 |
27 |
def test_singleton_transform() -> None:
    """We want to reuse VoidTransform since it doesn't carry any state.

    `is` replaces the original id()-of-temporaries comparison, which could give a
    false positive if the first object's address were reused after collection.
    """
    assert VoidTransform() is VoidTransform()
31 |
--------------------------------------------------------------------------------
/tests/utils/test_truncate.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | from pyiceberg.utils.truncate import truncate_upper_bound_binary_string, truncate_upper_bound_text_string
18 |
19 |
def test_upper_bound_string_truncation() -> None:
    # Truncating "aaaa" to 2 chars increments the last kept char: "aa" -> "ab".
    assert truncate_upper_bound_text_string("aaaa", 2) == "ab"
    # Both kept code points are already the maximum (U+10FFFF), so no valid
    # upper bound can be produced and None is returned.
    assert truncate_upper_bound_text_string("".join([chr(0x10FFFF), chr(0x10FFFF), chr(0x0)]), 2) is None
23 |
24 |
def test_upper_bound_binary_truncation() -> None:
    # Truncating to 2 bytes increments the last kept byte: 01 02 -> 01 03.
    assert truncate_upper_bound_binary_string(b"\x01\x02\x03", 2) == b"\x01\x03"
    # Both kept bytes are 0xFF; incrementing would overflow, so None is returned.
    assert truncate_upper_bound_binary_string(b"\xff\xff\x00", 2) is None
28 |
--------------------------------------------------------------------------------
/vendor/README.md:
--------------------------------------------------------------------------------
1 |
17 | # Vendor packages
18 |
19 | Some packages we want to maintain in the repository itself, because there is no good 3rd party alternative.
20 |
21 | ## FB303 Thrift client
22 |
23 | fb303 is a base Thrift service and a common set of functionality for querying stats, options, and other information from a service.
24 |
25 | ```bash
26 | rm -f /tmp/fb303.thrift
27 | rm -rf fb303
28 | curl -s https://raw.githubusercontent.com/apache/thrift/master/contrib/fb303/if/fb303.thrift > /tmp/fb303.thrift
29 | rm -rf /tmp/gen-py/
30 | thrift -gen py -o /tmp/ /tmp/fb303.thrift
31 | mv /tmp/gen-py/fb303 fb303
32 | ```
33 |
34 | ## Hive Metastore Thrift definition
35 |
36 | The Thrift definition requires the fb303 service as a dependency.
37 |
38 | ```bash
39 | rm -rf /tmp/hive
40 | mkdir -p /tmp/hive/share/fb303/if/
41 | curl -s https://raw.githubusercontent.com/apache/thrift/master/contrib/fb303/if/fb303.thrift > /tmp/hive/share/fb303/if/fb303.thrift
42 | curl -s https://raw.githubusercontent.com/apache/hive/master/standalone-metastore/metastore-common/src/main/thrift/hive_metastore.thrift > /tmp/hive/hive_metastore.thrift
43 | thrift -gen py -o /tmp/hive /tmp/hive/hive_metastore.thrift
44 | mv /tmp/hive/gen-py/hive_metastore hive_metastore
45 | ```
46 |
--------------------------------------------------------------------------------
/vendor/fb303/__init__.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | __all__ = ["ttypes", "constants", "FacebookService"]
19 |
--------------------------------------------------------------------------------
/vendor/fb303/constants.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | #
18 | # Autogenerated by Thrift Compiler (0.16.0)
19 | #
20 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
21 | #
22 | # options string: py
23 | #
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/vendor/fb303/ttypes.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | #
18 | # Autogenerated by Thrift Compiler (0.16.0)
19 | #
20 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
21 | #
22 | # options string: py
23 | #
24 |
25 |
26 | from thrift.TRecursive import fix_spec
27 |
28 | all_structs = []
29 |
30 |
class fb_status:
    """Common status reporting mechanism across all services.

    Thrift-generated enum: integer constants with bidirectional
    value<->name lookup tables. Do not edit by hand.
    """

    DEAD = 0
    STARTING = 1
    ALIVE = 2
    STOPPING = 3
    STOPPED = 4
    WARNING = 5

    _VALUES_TO_NAMES = {
        0: "DEAD",
        1: "STARTING",
        2: "ALIVE",
        3: "STOPPING",
        4: "STOPPED",
        5: "WARNING",
    }

    _NAMES_TO_VALUES = {
        "DEAD": 0,
        "STARTING": 1,
        "ALIVE": 2,
        "STOPPING": 3,
        "STOPPED": 4,
        "WARNING": 5,
    }
61 |
62 |
# Standard Thrift codegen epilogue; all_structs is empty as this module defines no structs.
fix_spec(all_structs)
del all_structs
65 |
--------------------------------------------------------------------------------
/vendor/hive_metastore/__init__.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | __all__ = ["ttypes", "constants", "ThriftHiveMetastore"]
18 |
--------------------------------------------------------------------------------
/vendor/hive_metastore/constants.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | #
18 | # Autogenerated by Thrift Compiler (0.16.0)
19 | #
20 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
21 | #
22 | # options string: py
23 | #
24 |
25 |
26 |
27 |
# Thrift-generated Hive Metastore constants: table/partition property keys and
# access-type codes. Do not edit by hand.
DDL_TIME = "transient_lastDdlTime"
# NOTE(review): ACCESSTYPE_* values are distinct powers of two; whether they are
# combined as bit flags is not visible here — confirm against the Thrift IDL.
ACCESSTYPE_NONE = 1
ACCESSTYPE_READONLY = 2
ACCESSTYPE_WRITEONLY = 4
ACCESSTYPE_READWRITE = 8
HIVE_FILTER_FIELD_OWNER = "hive_filter_field_owner__"
HIVE_FILTER_FIELD_PARAMS = "hive_filter_field_params__"
HIVE_FILTER_FIELD_LAST_ACCESS = "hive_filter_field_last_access__"
IS_ARCHIVED = "is_archived"
ORIGINAL_LOCATION = "original_location"
IS_IMMUTABLE = "immutable"
META_TABLE_COLUMNS = "columns"
META_TABLE_COLUMN_TYPES = "columns.types"
BUCKET_FIELD_NAME = "bucket_field_name"
BUCKET_COUNT = "bucket_count"
FIELD_TO_DIMENSION = "field_to_dimension"
META_TABLE_NAME = "name"
META_TABLE_DB = "db"
META_TABLE_LOCATION = "location"
META_TABLE_SERDE = "serde"
META_TABLE_PARTITION_COLUMNS = "partition_columns"
META_TABLE_PARTITION_COLUMN_TYPES = "partition_columns.types"
FILE_INPUT_FORMAT = "file.inputformat"
FILE_OUTPUT_FORMAT = "file.outputformat"
META_TABLE_STORAGE = "storage_handler"
TABLE_IS_TRANSACTIONAL = "transactional"
TABLE_NO_AUTO_COMPACT = "no_auto_compaction"
TABLE_TRANSACTIONAL_PROPERTIES = "transactional_properties"
TABLE_BUCKETING_VERSION = "bucketing_version"
DRUID_CONFIG_PREFIX = "druid."
JDBC_CONFIG_PREFIX = "hive.sql."
TABLE_IS_CTAS = "created_with_ctas"
TABLE_IS_CTLT = "created_with_ctlt"
PARTITION_TRANSFORM_SPEC = "partition_transform_spec"
NO_CLEANUP = "no_cleanup"
CTAS_LEGACY_CONFIG = "create_table_as_external"
DEFAULT_TABLE_TYPE = "defaultTableType"
TXN_ID = "txnId"
WRITE_ID = "writeId"
67 |
--------------------------------------------------------------------------------