├── .asf.yaml ├── .codespellrc ├── .github ├── ISSUE_TEMPLATE │ ├── iceberg_bug_report.yml │ ├── iceberg_improvement.yml │ └── iceberg_question.yml ├── dependabot.yml ├── pull_request_template.md └── workflows │ ├── check-md-link.yml │ ├── license_check.yml │ ├── nightly-pypi-build.yml │ ├── pypi-build-artifacts.yml │ ├── python-ci-docs.yml │ ├── python-ci.yml │ ├── python-release-docs.yml │ ├── python-release.yml │ ├── stale.yml │ └── svn-build-artifacts.yml ├── .gitignore ├── .markdownlint.yaml ├── .pre-commit-config.yaml ├── LICENSE ├── MANIFEST.in ├── Makefile ├── NOTICE ├── README.md ├── build-module.py ├── dev ├── .rat-excludes ├── Dockerfile ├── check-license ├── docker-compose-azurite.yml ├── docker-compose-gcs-server.yml ├── docker-compose-integration.yml ├── docker-compose.yml ├── entrypoint.sh ├── hive │ ├── Dockerfile │ └── core-site.xml ├── provision.py ├── run-azurite.sh ├── run-gcs-server.sh ├── run-minio.sh └── spark-defaults.conf ├── mkdocs ├── README.md ├── docs │ ├── SUMMARY.md │ ├── api.md │ ├── assets │ │ └── images │ │ │ ├── gen-release-notes.jpg │ │ │ └── iceberg-logo-icon.png │ ├── cli.md │ ├── community.md │ ├── configuration.md │ ├── contributing.md │ ├── how-to-release.md │ ├── index.md │ ├── nightly-build.md │ └── verify-release.md ├── gen_doc_stubs.py └── mkdocs.yml ├── poetry.lock ├── pyiceberg ├── __init__.py ├── avro │ ├── __init__.py │ ├── codecs │ │ ├── __init__.py │ │ ├── bzip2.py │ │ ├── codec.py │ │ ├── deflate.py │ │ ├── snappy_codec.py │ │ └── zstandard_codec.py │ ├── decoder.py │ ├── decoder_basic.c │ ├── decoder_fast.pyi │ ├── decoder_fast.pyx │ ├── encoder.py │ ├── file.py │ ├── reader.py │ ├── resolver.py │ └── writer.py ├── catalog │ ├── __init__.py │ ├── dynamodb.py │ ├── glue.py │ ├── hive.py │ ├── memory.py │ ├── noop.py │ ├── rest │ │ ├── __init__.py │ │ ├── auth.py │ │ └── response.py │ └── sql.py ├── cli │ ├── __init__.py │ ├── console.py │ └── output.py ├── conversions.py ├── exceptions.py ├── expressions │ ├── 
__init__.py │ ├── literals.py │ ├── parser.py │ └── visitors.py ├── io │ ├── __init__.py │ ├── fsspec.py │ └── pyarrow.py ├── manifest.py ├── partitioning.py ├── py.typed ├── schema.py ├── serializers.py ├── table │ ├── __init__.py │ ├── inspect.py │ ├── locations.py │ ├── metadata.py │ ├── name_mapping.py │ ├── puffin.py │ ├── refs.py │ ├── snapshots.py │ ├── sorting.py │ ├── statistics.py │ ├── update │ │ ├── __init__.py │ │ ├── schema.py │ │ ├── snapshot.py │ │ ├── spec.py │ │ ├── statistics.py │ │ └── validate.py │ └── upsert_util.py ├── transforms.py ├── typedef.py ├── types.py └── utils │ ├── __init__.py │ ├── bin_packing.py │ ├── concurrent.py │ ├── config.py │ ├── datetime.py │ ├── decimal.py │ ├── deprecated.py │ ├── lazydict.py │ ├── parsing.py │ ├── properties.py │ ├── schema_conversion.py │ ├── singleton.py │ └── truncate.py ├── pyproject.toml ├── ruff.toml ├── tests ├── avro │ ├── test_decoder.py │ ├── test_encoder.py │ ├── test_file.py │ ├── test_reader.py │ ├── test_resolver.py │ └── test_writer.py ├── benchmark │ └── test_benchmark.py ├── catalog │ ├── integration_test_dynamodb.py │ ├── integration_test_glue.py │ ├── test_base.py │ ├── test_dynamodb.py │ ├── test_glue.py │ ├── test_hive.py │ ├── test_rest.py │ ├── test_rest_auth.py │ └── test_sql.py ├── cli │ ├── test_console.py │ └── test_output.py ├── conftest.py ├── expressions │ ├── test_evaluator.py │ ├── test_expressions.py │ ├── test_literals.py │ ├── test_parser.py │ ├── test_projection.py │ ├── test_residual_evaluator.py │ └── test_visitors.py ├── integration │ ├── test_add_files.py │ ├── test_delete_count.py │ ├── test_deletes.py │ ├── test_inspect_table.py │ ├── test_partition_evolution.py │ ├── test_partitioning_key.py │ ├── test_reads.py │ ├── test_register_table.py │ ├── test_rest_catalog.py │ ├── test_rest_manifest.py │ ├── test_rest_schema.py │ ├── test_snapshot_operations.py │ ├── test_statistics_operations.py │ └── test_writes │ │ ├── test_optimistic_concurrency.py │ │ ├── 
test_partitioned_writes.py │ │ ├── test_writes.py │ │ └── utils.py ├── io │ ├── test_fsspec.py │ ├── test_io.py │ ├── test_pyarrow.py │ ├── test_pyarrow_stats.py │ └── test_pyarrow_visitor.py ├── table │ ├── bitmaps │ │ ├── 64map32bitvals.bin │ │ ├── 64mapempty.bin │ │ ├── 64maphighvals.bin │ │ └── 64mapspreadvals.bin │ ├── test_init.py │ ├── test_locations.py │ ├── test_metadata.py │ ├── test_name_mapping.py │ ├── test_partitioning.py │ ├── test_puffin.py │ ├── test_refs.py │ ├── test_snapshots.py │ ├── test_sorting.py │ ├── test_upsert.py │ └── test_validate.py ├── test_conversions.py ├── test_schema.py ├── test_serializers.py ├── test_transforms.py ├── test_typedef.py ├── test_types.py ├── test_version.py └── utils │ ├── test_bin_packing.py │ ├── test_concurrent.py │ ├── test_config.py │ ├── test_datetime.py │ ├── test_decimal.py │ ├── test_deprecated.py │ ├── test_lazydict.py │ ├── test_manifest.py │ ├── test_properties.py │ ├── test_schema_conversion.py │ ├── test_singleton.py │ └── test_truncate.py └── vendor ├── README.md ├── fb303 ├── FacebookService.py ├── __init__.py ├── constants.py └── ttypes.py └── hive_metastore ├── ThriftHiveMetastore.py ├── __init__.py ├── constants.py └── ttypes.py /.asf.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. 
You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | # 19 | 20 | # The format of this file is documented at 21 | # https://cwiki.apache.org/confluence/display/INFRA/Git+-+.asf.yaml+features 22 | 23 | github: 24 | description: "Apache PyIceberg" 25 | homepage: https://py.iceberg.apache.org/ 26 | labels: 27 | - iceberg 28 | - apache 29 | - hacktoberfest 30 | - pyiceberg 31 | enabled_merge_buttons: 32 | merge: false 33 | squash: true 34 | rebase: true 35 | protected_branches: 36 | main: 37 | required_status_checks: 38 | # strict means "Require branches to be up to date before merging". 39 | strict: true 40 | 41 | required_pull_request_reviews: 42 | required_approving_review_count: 1 43 | 44 | required_linear_history: true 45 | del_branch_on_merge: true 46 | features: 47 | wiki: true 48 | issues: true 49 | projects: true 50 | collaborators: # Note: the number of collaborators is limited to 10 51 | - ajantha-bhat 52 | - ndrluis 53 | ghp_branch: gh-pages 54 | ghp_path: / 55 | 56 | notifications: 57 | commits: commits@iceberg.apache.org 58 | issues: issues@iceberg.apache.org 59 | pullrequests: issues@iceberg.apache.org 60 | jira_options: link label link label 61 | -------------------------------------------------------------------------------- /.codespellrc: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. 
The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | [codespell] 18 | ignore-words-list = BoundIn,fo,MoR,NotIn,notIn,oT 19 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/iceberg_bug_report.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 
18 | # 19 | 20 | --- 21 | name: Iceberg Bug report 🐞 22 | description: Problems, bugs and issues with Apache Iceberg 23 | labels: ["kind:bug"] 24 | body: 25 | - type: dropdown 26 | attributes: 27 | label: Apache Iceberg version 28 | description: What Apache Iceberg version are you using? 29 | multiple: false 30 | options: 31 | - "0.9.1 (latest release)" 32 | - "0.9.0" 33 | - "0.8.1" 34 | - "0.8.0" 35 | - "0.7.1" 36 | - "0.7.0" 37 | - "0.6.1" 38 | - "0.6.0" 39 | - "0.5.0" 40 | - "0.4.0" 41 | - "0.3.0" 42 | - "0.2.0" 43 | - "0.1.0" 44 | - "main (development)" 45 | validations: 46 | required: false 47 | - type: textarea 48 | attributes: 49 | label: Please describe the bug 🐞 50 | description: > 51 | Please describe the problem, what to expect, and how to reproduce. 52 | Feel free to include stacktraces and the Iceberg catalog configuration. 53 | You can include files by dragging and dropping them here. 54 | validations: 55 | required: true 56 | - type: checkboxes 57 | attributes: 58 | label: Willingness to contribute 59 | description: The Apache Iceberg community encourages bug-fix contributions. Would you or another member of your organization be willing to contribute a fix for this bug to the PyIceberg codebase? 60 | options: 61 | - label: I can contribute a fix for this bug independently 62 | - label: I would be willing to contribute a fix for this bug with guidance from the Iceberg community 63 | - label: I cannot contribute a fix for this bug at this time 64 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/iceberg_improvement.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. 
The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | # 19 | 20 | --- 21 | name: Iceberg Improvement / Feature Request 22 | description: New features with Apache Iceberg 23 | labels: ["kind:feature request"] 24 | body: 25 | - type: textarea 26 | attributes: 27 | label: Feature Request / Improvement 28 | description: Please describe the feature and elaborate on the use case and motivation behind it 29 | validations: 30 | required: true 31 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/iceberg_question.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. 
See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | # 19 | 20 | --- 21 | name: Iceberg Question 22 | description: Questions around Apache Iceberg 23 | labels: ["kind:question"] 24 | body: 25 | - type: textarea 26 | attributes: 27 | label: Question 28 | description: What is your question? 29 | validations: 30 | required: true 31 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 
18 | # 19 | 20 | version: 2 21 | updates: 22 | - package-ecosystem: "pip" 23 | directory: "/" 24 | schedule: 25 | interval: "weekly" 26 | open-pull-requests-limit: 50 27 | - package-ecosystem: "github-actions" 28 | directory: "/" 29 | schedule: 30 | interval: "weekly" 31 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 4 | 5 | 6 | 7 | 8 | # Rationale for this change 9 | 10 | # Are these changes tested? 11 | 12 | # Are there any user-facing changes? 13 | 14 | 15 | -------------------------------------------------------------------------------- /.github/workflows/check-md-link.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 
18 | # 19 | 20 | name: Check Markdown links 21 | 22 | on: 23 | push: 24 | paths: 25 | - '.github/workflows/check-md-link.yml' 26 | - 'mkdocs/**' 27 | branches: 28 | - 'main' 29 | pull_request: 30 | paths: 31 | - '.github/workflows/check-md-link.yml' 32 | - 'mkdocs/**' 33 | 34 | jobs: 35 | markdown-link-check: 36 | runs-on: ubuntu-latest 37 | steps: 38 | - uses: actions/checkout@master 39 | - uses: gaurav-nelson/github-action-markdown-link-check@v1 40 | -------------------------------------------------------------------------------- /.github/workflows/license_check.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 
18 | # 19 | 20 | name: "Run License Check" 21 | on: pull_request 22 | 23 | jobs: 24 | rat: 25 | runs-on: ubuntu-22.04 26 | steps: 27 | - uses: actions/checkout@v4 28 | - run: dev/check-license 29 | -------------------------------------------------------------------------------- /.github/workflows/nightly-pypi-build.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 
18 | # 19 | 20 | name: "Nightly PyPI Build" 21 | 22 | on: 23 | schedule: 24 | - cron: "0 0 * * *" # Runs at midnight UTC every day 25 | workflow_dispatch: # Allows manual triggering 26 | 27 | jobs: 28 | set-version: 29 | if: github.repository == 'apache/iceberg-python' # Only run for apache repo 30 | runs-on: ubuntu-latest 31 | outputs: 32 | VERSION: ${{ steps.set-version.outputs.VERSION }} 33 | steps: 34 | - uses: actions/checkout@v4 35 | with: 36 | fetch-depth: 1 37 | 38 | - uses: actions/setup-python@v5 39 | with: 40 | python-version: 3.12 41 | 42 | - name: Install Poetry 43 | run: make install-poetry 44 | 45 | - name: Set version 46 | id: set-version 47 | run: | 48 | CURRENT_VERSION=$(poetry version --short) 49 | TIMESTAMP=$(date +%Y%m%d%H%M%S) 50 | echo "VERSION=${CURRENT_VERSION}.dev${TIMESTAMP}" >> "$GITHUB_OUTPUT" 51 | 52 | - name: Debug version 53 | run: echo "Publishing version ${{ steps.set-version.outputs.VERSION }}" 54 | 55 | nightly-build: 56 | needs: set-version 57 | uses: ./.github/workflows/pypi-build-artifacts.yml 58 | with: 59 | version: ${{ needs.set-version.outputs.VERSION }} 60 | testpypi-publish: 61 | name: Publish to TestPypi 62 | needs: 63 | - nightly-build 64 | runs-on: ubuntu-latest 65 | environment: 66 | name: testpypi 67 | url: https://test.pypi.org/p/pyiceberg 68 | 69 | permissions: 70 | id-token: write # IMPORTANT: mandatory for trusted publishing 71 | 72 | steps: 73 | - name: Download all the artifacts 74 | uses: actions/download-artifact@v4 75 | with: 76 | merge-multiple: true 77 | path: dist/ 78 | - name: List downloaded artifacts 79 | run: ls -R dist/ 80 | - name: Publish to TestPyPI 81 | uses: pypa/gh-action-pypi-publish@release/v1 82 | with: 83 | repository-url: https://test.pypi.org/legacy/ 84 | skip-existing: true 85 | verbose: true 86 | -------------------------------------------------------------------------------- /.github/workflows/pypi-build-artifacts.yml: 
-------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | # 19 | 20 | name: "Build PyPI Artifacts" 21 | 22 | on: 23 | workflow_call: 24 | inputs: 25 | VERSION: 26 | required: true 27 | type: string 28 | 29 | jobs: 30 | pypi-build-artifacts: 31 | name: Build artifacts for PyPi on ${{ matrix.os }} 32 | runs-on: ${{ matrix.os }} 33 | strategy: 34 | matrix: 35 | os: [ ubuntu-22.04, windows-2022, macos-13, macos-14 ] 36 | 37 | steps: 38 | - uses: actions/checkout@v4 39 | with: 40 | fetch-depth: 1 41 | 42 | - uses: actions/setup-python@v5 43 | with: 44 | python-version: | 45 | 3.9 46 | 3.10 47 | 3.11 48 | 3.12 49 | 50 | - name: Install poetry 51 | run: make install-poetry 52 | 53 | - name: Set version with RC 54 | env: 55 | VERSION: ${{ inputs.VERSION }} 56 | run: python -m poetry version "${{ env.VERSION }}" 57 | 58 | # Publish the source distribution with the version that's in 59 | # the repository, otherwise the tests will fail 60 | - name: Compile source distribution 61 | run: python3 -m poetry build --format=sdist 62 | if: startsWith(matrix.os, 'ubuntu') 63 | 64 | - name: Build wheels 65 | uses: 
pypa/cibuildwheel@v2.23.3 66 | with: 67 | output-dir: wheelhouse 68 | config-file: "pyproject.toml" 69 | env: 70 | # Ignore 32 bit architectures 71 | CIBW_ARCHS: "auto64" 72 | CIBW_PROJECT_REQUIRES_PYTHON: ">=3.9,<3.13" 73 | CIBW_TEST_REQUIRES: "pytest==7.4.2 moto==5.0.1" 74 | CIBW_TEST_COMMAND: "pytest {project}/tests/avro/test_decoder.py" 75 | # Ignore tests for pypy since not all dependencies are compiled for it 76 | # and would require a local rust build chain 77 | CIBW_TEST_SKIP: "pp*" 78 | 79 | - name: Add source distribution 80 | if: startsWith(matrix.os, 'ubuntu') 81 | run: ls -lah dist/* && cp dist/* wheelhouse/ 82 | 83 | - uses: actions/upload-artifact@v4 84 | with: 85 | name: "pypi-release-candidate-${{ matrix.os }}" 86 | path: ./wheelhouse/* 87 | 88 | pypi-merge-artifacts: 89 | runs-on: ubuntu-latest 90 | needs: 91 | - pypi-build-artifacts 92 | steps: 93 | - name: Merge Artifacts 94 | uses: actions/upload-artifact/merge@v4 95 | with: 96 | name: "pypi-release-candidate-${{ inputs.VERSION }}" 97 | pattern: pypi-release-candidate* 98 | delete-merged: true 99 | -------------------------------------------------------------------------------- /.github/workflows/python-ci-docs.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. 
You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | # 19 | 20 | name: "Python CI Docs" 21 | 22 | on: 23 | push: 24 | branches: 25 | - 'main' 26 | pull_request: 27 | 28 | 29 | concurrency: 30 | group: ${{ github.workflow }}-${{ github.ref }} 31 | cancel-in-progress: ${{ github.event_name == 'pull_request' }} 32 | 33 | jobs: 34 | docs: 35 | runs-on: ubuntu-22.04 36 | 37 | steps: 38 | - uses: actions/checkout@v4 39 | - name: Install poetry 40 | run: make install-poetry 41 | - uses: actions/setup-python@v5 42 | with: 43 | python-version: 3.12 44 | - name: Install 45 | run: make docs-install 46 | - name: Build docs 47 | run: make docs-build 48 | -------------------------------------------------------------------------------- /.github/workflows/python-ci.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. 
See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | # 19 | 20 | name: "Python CI" 21 | 22 | on: 23 | push: 24 | branches: 25 | - 'main' 26 | pull_request: 27 | paths: 28 | - '**' # Include all files and directories in the repository by default. 29 | - '!.github/workflows/**' # Exclude all workflow files 30 | - '.github/workflows/python-ci.yml' # except the current file. 31 | - '!.github/ISSUE_TEMPLATE/**' # Exclude files and directories that don't impact tests or code like templates, metadata, and documentation. 32 | - '!.gitignore' 33 | - '!.asf.yml' 34 | - '!mkdocs/**' 35 | - '!.gitattributes' 36 | - '!README.md' 37 | - '!CONTRIBUTING.md' 38 | - '!LICENSE' 39 | - '!NOTICE' 40 | 41 | concurrency: 42 | group: ${{ github.workflow }}-${{ github.ref }} 43 | cancel-in-progress: ${{ github.event_name == 'pull_request' }} 44 | 45 | jobs: 46 | lint-and-test: 47 | runs-on: ubuntu-22.04 48 | strategy: 49 | matrix: 50 | python: ['3.9', '3.10', '3.11', '3.12'] 51 | 52 | steps: 53 | - uses: actions/checkout@v4 54 | - name: Install poetry 55 | run: make install-poetry 56 | - uses: actions/setup-python@v5 57 | with: 58 | python-version: ${{ matrix.python }} 59 | cache: poetry 60 | cache-dependency-path: ./poetry.lock 61 | - name: Install system dependencies 62 | run: sudo apt-get update && sudo apt-get install -y libkrb5-dev # for kerberos 63 | - name: Install 64 | run: make install-dependencies 65 | - name: Linters 66 | run: make lint 67 | - name: Tests 68 | run: make test-coverage-unit 69 | 70 | integration-test: 71 | runs-on: ubuntu-22.04 72 | strategy: 73 | matrix: 74 | python: ['3.9', '3.10', '3.11', '3.12'] 75 | 76 | steps: 77 | - uses: actions/checkout@v4 78 | - name: Install system dependencies 79 | run: sudo apt-get update && sudo apt-get install -y libkrb5-dev # for kerberos 80 | - name: Install 81 | run: make install 82 | - name: Run integration tests 83 | run: make test-coverage-integration 84 | - 
name: Show debug logs 85 | if: ${{ failure() }} 86 | run: docker compose -f dev/docker-compose.yml logs 87 | -------------------------------------------------------------------------------- /.github/workflows/python-release-docs.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | # 19 | 20 | name: "Release Docs" 21 | on: 22 | workflow_dispatch: 23 | 24 | concurrency: 25 | group: ${{ github.workflow }}-${{ github.ref }} 26 | cancel-in-progress: ${{ github.event_name == 'pull_request' }} 27 | 28 | jobs: 29 | docs: 30 | runs-on: ubuntu-22.04 31 | 32 | steps: 33 | - uses: actions/checkout@v4 34 | - name: Install poetry 35 | run: make install-poetry 36 | - uses: actions/setup-python@v5 37 | with: 38 | python-version: ${{ matrix.python }} 39 | - name: Install docs 40 | run: make docs-install 41 | - name: Build docs 42 | run: make docs-build 43 | - name: Copy 44 | working-directory: ./mkdocs 45 | run: mv ./site /tmp/site 46 | - name: Push changes to gh-pages branch 47 | run: | 48 | git checkout --orphan gh-pages-tmp 49 | git rm --quiet -rf . 50 | cp -r /tmp/site/* . 
51 | git config --global user.name 'GitHub Actions' 52 | git config --global user.email 'actions@github.com' 53 | echo "py.iceberg.apache.org" > CNAME 54 | git add --all 55 | git commit -m 'Publish Python docs' 56 | git push -f origin gh-pages-tmp:gh-pages || true 57 | -------------------------------------------------------------------------------- /.github/workflows/stale.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 
18 | # 19 | 20 | name: "Close Stale Issues" 21 | on: 22 | schedule: 23 | - cron: '0 0 * * *' 24 | 25 | permissions: 26 | # All other permissions are set to none 27 | issues: write 28 | 29 | jobs: 30 | stale: 31 | if: github.repository_owner == 'apache' 32 | runs-on: ubuntu-22.04 33 | steps: 34 | - uses: actions/stale@v9.1.0 35 | with: 36 | stale-issue-label: 'stale' 37 | exempt-issue-labels: 'not-stale' 38 | days-before-issue-stale: 180 39 | days-before-issue-close: 14 40 | # Only close stale issues, leave PRs alone 41 | days-before-pr-stale: -1 42 | stale-issue-message: > 43 | This issue has been automatically marked as stale because it has been open for 180 days 44 | with no activity. It will be closed in the next 14 days if no further activity occurs. To 45 | permanently prevent this issue from being considered stale, add the label 'not-stale', 46 | but commenting on the issue is preferred when possible. 47 | close-issue-message: > 48 | This issue has been closed because it has not received any activity in the last 14 days 49 | since being marked as 'stale' 50 | -------------------------------------------------------------------------------- /.github/workflows/svn-build-artifacts.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License.
You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | # 19 | 20 | name: "Build SVN Artifacts" 21 | 22 | on: 23 | workflow_call: 24 | inputs: 25 | VERSION: 26 | required: true 27 | type: string 28 | 29 | jobs: 30 | svn-build-artifacts: 31 | name: Build artifacts for SVN on ${{ matrix.os }} 32 | runs-on: ${{ matrix.os }} 33 | strategy: 34 | matrix: 35 | os: [ ubuntu-22.04, windows-2022, macos-13, macos-14 ] 36 | 37 | steps: 38 | - uses: actions/checkout@v4 39 | with: 40 | fetch-depth: 1 41 | 42 | - uses: actions/setup-python@v5 43 | with: 44 | python-version: | 45 | 3.9 46 | 3.10 47 | 3.11 48 | 3.12 49 | 50 | - name: Install poetry 51 | run: make install-poetry 52 | 53 | # Publish the source distribution with the version that's in 54 | # the repository, otherwise the tests will fail 55 | - name: Compile source distribution 56 | run: python3 -m poetry build --format=sdist 57 | if: startsWith(matrix.os, 'ubuntu') 58 | 59 | - name: Build wheels 60 | uses: pypa/cibuildwheel@v2.23.3 61 | with: 62 | output-dir: wheelhouse 63 | config-file: "pyproject.toml" 64 | env: 65 | # Ignore 32 bit architectures 66 | CIBW_ARCHS: "auto64" 67 | CIBW_PROJECT_REQUIRES_PYTHON: ">=3.9,<3.13" 68 | CIBW_TEST_REQUIRES: "pytest==7.4.2 moto==5.0.1" 69 | CIBW_TEST_COMMAND: "pytest {project}/tests/avro/test_decoder.py" 70 | # Ignore tests for pypy since not all dependencies are compiled for it 71 | # and would require a local rust build chain 72 | CIBW_TEST_SKIP: "pp*" 73 | 74 | - name: Add source distribution 75 | if: startsWith(matrix.os, 'ubuntu') 76 | run: ls -lah dist/* && cp dist/* wheelhouse/ 77 
| 78 | - uses: actions/upload-artifact@v4 79 | with: 80 | name: "svn-release-candidate-${{ matrix.os }}" 81 | path: ./wheelhouse/* 82 | 83 | svn-merge-artifacts: 84 | runs-on: ubuntu-latest 85 | needs: 86 | - svn-build-artifacts 87 | steps: 88 | - name: Merge Artifacts 89 | uses: actions/upload-artifact/merge@v4 90 | with: 91 | name: "svn-release-candidate-${{ inputs.VERSION }}" 92 | pattern: svn-release-candidate* 93 | delete-merged: true 94 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | .DS_Store 3 | .cache 4 | tmp/ 5 | site 6 | 7 | # intellij files 8 | .idea 9 | .idea_modules/ 10 | *.ipr 11 | *.iws 12 | *.iml 13 | out 14 | 15 | # rat library install location 16 | lib/ 17 | 18 | __pycache__/ 19 | *.py[cod] 20 | .eggs/ 21 | .tox/ 22 | env/ 23 | venv/ 24 | *.egg-info/ 25 | test-reports 26 | build/ 27 | dist/ 28 | sdist/ 29 | .coverage 30 | coverage.xml 31 | .pytest_cache/ 32 | 33 | # vscode/eclipse files 34 | .classpath 35 | .project 36 | .settings 37 | bin/ 38 | .vscode/ 39 | 40 | # Hive/metastore files 41 | metastore_db/ 42 | 43 | # Spark/metastore files 44 | spark-warehouse/ 45 | derby.log 46 | 47 | # Python stuff 48 | .mypy_cache/ 49 | htmlcov 50 | 51 | pyiceberg/avro/decoder_fast.c 52 | pyiceberg/avro/*.html 53 | pyiceberg/avro/*.so 54 | -------------------------------------------------------------------------------- /.markdownlint.yaml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | # Default state for all rules 19 | default: true 20 | 21 | # MD013/line-length - Line length 22 | MD013: false 23 | 24 | # MD007/ul-indent - Unordered list indentation 25 | MD007: 26 | indent: 4 27 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | --- 18 | exclude: ^vendor/ 19 | 20 | repos: 21 | - repo: https://github.com/pre-commit/pre-commit-hooks 22 | rev: v5.0.0 23 | hooks: 24 | - id: trailing-whitespace 25 | - id: end-of-file-fixer 26 | - id: debug-statements 27 | - id: check-yaml 28 | - id: check-ast 29 | - repo: https://github.com/astral-sh/ruff-pre-commit 30 | rev: v0.8.6 31 | hooks: 32 | - id: ruff 33 | args: [ --fix, --exit-non-zero-on-fix ] 34 | - id: ruff-format 35 | - repo: https://github.com/pre-commit/mirrors-mypy 36 | rev: v1.14.1 37 | hooks: 38 | - id: mypy 39 | args: 40 | [--install-types, --non-interactive, --config=pyproject.toml] 41 | - repo: https://github.com/igorshubovych/markdownlint-cli 42 | rev: v0.43.0 43 | hooks: 44 | - id: markdownlint 45 | args: ["--fix"] 46 | - repo: https://github.com/pycqa/pydocstyle 47 | rev: 6.3.0 48 | hooks: 49 | - id: pydocstyle 50 | args: 51 | [ 52 | "--ignore=D100,D102,D101,D103,D104,D107,D203,D212,D213,D404,D405,D406,D407,D411,D413,D415,D417", 53 | ] 54 | additional_dependencies: 55 | - tomli==2.0.1 56 | - repo: https://github.com/ikamensh/flynt 57 | rev: 1.0.1 58 | hooks: 59 | - id: flynt 60 | args: 61 | # --line-length is set to a high value to deal with very long lines 62 | - --line-length 63 | - '99999' 64 | - repo: https://github.com/codespell-project/codespell 65 | rev: v2.3.0 66 | hooks: 67 | - id: codespell 68 | ci: 69 | autofix_commit_msg: | 70 | [pre-commit.ci] auto fixes from pre-commit.com hooks 71 | 72 | for more information, see https://pre-commit.ci 73 | autofix_prs: true 74 | autoupdate_branch: '' 75 | autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate' 76 | autoupdate_schedule: weekly 77 | skip: [] 78 | submodules: false 79 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. 
See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | graft src 19 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | 2 | Apache Iceberg 3 | Copyright 2017-2025 The Apache Software Foundation 4 | 5 | This product includes software developed at 6 | The Apache Software Foundation (http://www.apache.org/). 7 | 8 | -------------------------------------------------------------------------------- 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 17 | 18 | # Iceberg Python 19 | 20 | PyIceberg is a Python library for programmatic access to Iceberg table metadata as well as to table data in Iceberg format. It is a Python implementation of the [Iceberg table spec](https://iceberg.apache.org/spec/). 21 | 22 | The documentation is available at [https://py.iceberg.apache.org/](https://py.iceberg.apache.org/). 
23 | 24 | # Get in Touch 25 | 26 | - [Iceberg community](https://iceberg.apache.org/community/) 27 | -------------------------------------------------------------------------------- /build-module.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | import os 19 | import shutil 20 | from pathlib import Path 21 | 22 | allowed_to_fail = os.environ.get("CIBUILDWHEEL", "0") != "1" 23 | 24 | 25 | def build_cython_extensions() -> None: 26 | import Cython.Compiler.Options 27 | from Cython.Build import build_ext, cythonize 28 | from setuptools import Extension 29 | from setuptools.dist import Distribution 30 | 31 | Cython.Compiler.Options.annotate = True 32 | 33 | if os.name == "nt": # Windows 34 | extra_compile_args = [ 35 | "/O2", 36 | ] 37 | else: # UNIX-based systems 38 | extra_compile_args = [ 39 | "-O3", 40 | ] 41 | 42 | package_path = "pyiceberg" 43 | 44 | extension = Extension( 45 | # Your .pyx file will be available to cpython at this location. 
46 | name="pyiceberg.avro.decoder_fast", 47 | sources=[ 48 | os.path.join(package_path, "avro", "decoder_fast.pyx"), 49 | ], 50 | extra_compile_args=extra_compile_args, 51 | language="c", 52 | ) 53 | 54 | ext_modules = cythonize([extension], include_path=list(package_path), language_level=3, annotate=True) 55 | dist = Distribution({"ext_modules": ext_modules}) 56 | cmd = build_ext(dist) 57 | cmd.ensure_finalized() 58 | 59 | cmd.run() 60 | 61 | for output in cmd.get_outputs(): 62 | output = Path(output) 63 | relative_extension = output.relative_to(cmd.build_lib) 64 | shutil.copyfile(output, relative_extension) 65 | 66 | 67 | try: 68 | build_cython_extensions() 69 | except Exception: 70 | if not allowed_to_fail: 71 | raise 72 | -------------------------------------------------------------------------------- /dev/.rat-excludes: -------------------------------------------------------------------------------- 1 | .github/* 2 | .rat-excludes 3 | build 4 | .git 5 | .gitignore 6 | poetry.lock 7 | -------------------------------------------------------------------------------- /dev/Dockerfile: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | FROM python:3.9-bullseye 17 | 18 | RUN apt-get -qq update && \ 19 | apt-get -qq install -y --no-install-recommends \ 20 | sudo \ 21 | curl \ 22 | vim \ 23 | unzip \ 24 | openjdk-11-jdk \ 25 | build-essential \ 26 | software-properties-common \ 27 | ssh && \ 28 | apt-get -qq clean && \ 29 | rm -rf /var/lib/apt/lists/* 30 | 31 | # Optional env variables 32 | ENV SPARK_HOME=${SPARK_HOME:-"/opt/spark"} 33 | ENV HADOOP_HOME=${HADOOP_HOME:-"/opt/hadoop"} 34 | ENV PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH 35 | 36 | RUN mkdir -p ${HADOOP_HOME} && mkdir -p ${SPARK_HOME} && mkdir -p /home/iceberg/spark-events 37 | WORKDIR ${SPARK_HOME} 38 | 39 | # Remember to also update `tests/conftest`'s spark setting 40 | ENV SPARK_VERSION=3.5.4 41 | ENV ICEBERG_SPARK_RUNTIME_VERSION=3.5_2.12 42 | ENV ICEBERG_VERSION=1.9.0 43 | ENV PYICEBERG_VERSION=0.9.0 44 | 45 | RUN curl --retry 5 -s -C - https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz -o spark-${SPARK_VERSION}-bin-hadoop3.tgz \ 46 | && tar xzf spark-${SPARK_VERSION}-bin-hadoop3.tgz --directory /opt/spark --strip-components 1 \ 47 | && rm -rf spark-${SPARK_VERSION}-bin-hadoop3.tgz 48 | 49 | # Download iceberg spark runtime 50 | RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \ 51 | -Lo /opt/spark/jars/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar 52 | 53 | 54 | # Download AWS bundle 55 | RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \ 56 | -Lo /opt/spark/jars/iceberg-aws-bundle-${ICEBERG_VERSION}.jar 57 | 58 | 
COPY spark-defaults.conf /opt/spark/conf 59 | ENV PATH="/opt/spark/sbin:/opt/spark/bin:${PATH}" 60 | 61 | RUN chmod u+x /opt/spark/sbin/* && \ 62 | chmod u+x /opt/spark/bin/* 63 | 64 | RUN pip3 install -q ipython 65 | 66 | RUN pip3 install "pyiceberg[s3fs,hive]==${PYICEBERG_VERSION}" 67 | 68 | COPY entrypoint.sh . 69 | COPY provision.py . 70 | 71 | ENTRYPOINT ["./entrypoint.sh"] 72 | CMD ["notebook"] 73 | -------------------------------------------------------------------------------- /dev/check-license: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | 21 | acquire_rat_jar () { 22 | 23 | URL="https://repo.maven.apache.org/maven2/org/apache/rat/apache-rat/${RAT_VERSION}/apache-rat-${RAT_VERSION}.jar" 24 | 25 | JAR="$rat_jar" 26 | 27 | # Download rat launch jar if it hasn't been downloaded yet 28 | if [ ! 
-f "$JAR" ]; then 29 | # Download 30 | printf "Attempting to fetch rat\n" 31 | JAR_DL="${JAR}.part" 32 | if [ $(command -v curl) ]; then 33 | curl -L --silent "${URL}" > "$JAR_DL" && mv "$JAR_DL" "$JAR" 34 | elif [ $(command -v wget) ]; then 35 | wget --quiet ${URL} -O "$JAR_DL" && mv "$JAR_DL" "$JAR" 36 | else 37 | printf "You do not have curl or wget installed, please install rat manually.\n" 38 | exit -1 39 | fi 40 | fi 41 | 42 | unzip -tq "$JAR" &> /dev/null 43 | if [ $? -ne 0 ]; then 44 | # We failed to download 45 | rm "$JAR" 46 | printf "Our attempt to download rat locally to ${JAR} failed. Please install rat manually.\n" 47 | exit -1 48 | fi 49 | } 50 | 51 | # Go to the Spark project root directory 52 | FWDIR="$(cd "`dirname "$0"`"/..; pwd)" 53 | cd "$FWDIR" 54 | 55 | if test -x "$JAVA_HOME/bin/java"; then 56 | declare java_cmd="$JAVA_HOME/bin/java" 57 | else 58 | declare java_cmd=java 59 | fi 60 | 61 | export RAT_VERSION=0.16.1 62 | export rat_jar="$FWDIR"/lib/apache-rat-${RAT_VERSION}.jar 63 | mkdir -p "$FWDIR"/lib 64 | 65 | [[ -f "$rat_jar" ]] || acquire_rat_jar || { 66 | echo "Download failed. Obtain the rat jar manually and place it at $rat_jar" 67 | exit 1 68 | } 69 | 70 | mkdir -p build 71 | $java_cmd -jar "$rat_jar" --scan-hidden-directories -E "$FWDIR"/dev/.rat-excludes -d "$FWDIR" > build/rat-results.txt 72 | 73 | if [ $? -ne 0 ]; then 74 | echo "RAT exited abnormally" 75 | exit 1 76 | fi 77 | 78 | ERRORS="$(cat build/rat-results.txt | grep -e "??")" 79 | 80 | if test ! -z "$ERRORS"; then 81 | echo "Could not find Apache license headers in the following files:" 82 | echo "$ERRORS" 83 | exit 1 84 | else 85 | echo -e "RAT checks passed." 
86 | fi 87 | -------------------------------------------------------------------------------- /dev/docker-compose-azurite.yml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | services: 19 | azurite: 20 | image: mcr.microsoft.com/azure-storage/azurite 21 | container_name: azurite 22 | hostname: azurite 23 | ports: 24 | - 10000:10000 25 | command: ["azurite-blob", "--loose", "--blobHost", "0.0.0.0"] 26 | -------------------------------------------------------------------------------- /dev/docker-compose-gcs-server.yml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | services: 19 | gcs-server: 20 | image: fsouza/fake-gcs-server 21 | container_name: gcs-server 22 | ports: 23 | - 4443:4443 24 | entrypoint: > 25 | /bin/sh -c " 26 | mkdir -p /data/warehouse; 27 | /bin/fake-gcs-server -data /data -scheme http; 28 | exit 0; 29 | " 30 | -------------------------------------------------------------------------------- /dev/docker-compose-integration.yml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | services: 19 | spark-iceberg: 20 | image: python-integration 21 | container_name: pyiceberg-spark 22 | build: . 
23 | networks: 24 | iceberg_net: 25 | depends_on: 26 | - rest 27 | - hive 28 | - minio 29 | volumes: 30 | - ./warehouse:/home/iceberg/warehouse 31 | environment: 32 | - AWS_ACCESS_KEY_ID=admin 33 | - AWS_SECRET_ACCESS_KEY=password 34 | - AWS_REGION=us-east-1 35 | ports: 36 | - 8888:8888 37 | - 8080:8080 38 | links: 39 | - rest:rest 40 | - hive:hive 41 | - minio:minio 42 | rest: 43 | image: apache/iceberg-rest-fixture 44 | container_name: pyiceberg-rest 45 | networks: 46 | iceberg_net: 47 | ports: 48 | - 8181:8181 49 | environment: 50 | - AWS_ACCESS_KEY_ID=admin 51 | - AWS_SECRET_ACCESS_KEY=password 52 | - AWS_REGION=us-east-1 53 | - CATALOG_WAREHOUSE=s3://warehouse/ 54 | - CATALOG_IO__IMPL=org.apache.iceberg.aws.s3.S3FileIO 55 | - CATALOG_S3_ENDPOINT=http://minio:9000 56 | minio: 57 | image: minio/minio 58 | container_name: pyiceberg-minio 59 | environment: 60 | - MINIO_ROOT_USER=admin 61 | - MINIO_ROOT_PASSWORD=password 62 | - MINIO_DOMAIN=minio 63 | networks: 64 | iceberg_net: 65 | aliases: 66 | - warehouse.minio 67 | ports: 68 | - 9001:9001 69 | - 9000:9000 70 | command: ["server", "/data", "--console-address", ":9001"] 71 | mc: 72 | depends_on: 73 | - minio 74 | image: minio/mc 75 | container_name: pyiceberg-mc 76 | networks: 77 | iceberg_net: 78 | environment: 79 | - AWS_ACCESS_KEY_ID=admin 80 | - AWS_SECRET_ACCESS_KEY=password 81 | - AWS_REGION=us-east-1 82 | entrypoint: > 83 | /bin/sh -c " 84 | until (/usr/bin/mc alias set minio http://minio:9000 admin password) do echo '...waiting...' 
&& sleep 1; done; 85 | /usr/bin/mc mb minio/warehouse; 86 | /usr/bin/mc policy set public minio/warehouse; 87 | tail -f /dev/null 88 | " 89 | hive: 90 | build: hive/ 91 | container_name: hive 92 | hostname: hive 93 | networks: 94 | iceberg_net: 95 | ports: 96 | - 9083:9083 97 | environment: 98 | SERVICE_NAME: "metastore" 99 | SERVICE_OPTS: "-Dmetastore.warehouse.dir=s3a://warehouse/hive/" 100 | 101 | networks: 102 | iceberg_net: 103 | -------------------------------------------------------------------------------- /dev/docker-compose.yml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | 18 | services: 19 | minio: 20 | image: minio/minio 21 | container_name: pyiceberg-minio 22 | environment: 23 | - MINIO_ROOT_USER=admin 24 | - MINIO_ROOT_PASSWORD=password 25 | - MINIO_DOMAIN=minio 26 | ports: 27 | - 9001:9001 28 | - 9000:9000 29 | command: ["server", "/data", "--console-address", ":9001"] 30 | mc: 31 | depends_on: 32 | - minio 33 | image: minio/mc 34 | container_name: pyiceberg-mc 35 | environment: 36 | - AWS_ACCESS_KEY_ID=admin 37 | - AWS_SECRET_ACCESS_KEY=password 38 | - AWS_REGION=us-east-1 39 | entrypoint: > 40 | /bin/sh -c " 41 | until (/usr/bin/mc alias set minio http://minio:9000 admin password) do echo '...waiting...' && sleep 1; done; 42 | /usr/bin/mc rm -r --force minio/warehouse; 43 | /usr/bin/mc mb minio/warehouse; 44 | /usr/bin/mc policy set public minio/warehouse; 45 | exit 0; 46 | " 47 | -------------------------------------------------------------------------------- /dev/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 
19 | # 20 | 21 | start-master.sh -p 7077 22 | start-worker.sh spark://spark-iceberg:7077 23 | start-history-server.sh 24 | 25 | tail -f /dev/null 26 | -------------------------------------------------------------------------------- /dev/hive/Dockerfile: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | FROM openjdk:8-jre-slim AS build 17 | 18 | RUN apt-get update -qq && apt-get -qq -y install curl 19 | 20 | ENV HADOOP_VERSION=3.3.6 21 | ENV AWS_SDK_BUNDLE=1.12.753 22 | 23 | RUN curl https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar -Lo /tmp/hadoop-aws-${HADOOP_VERSION}.jar 24 | RUN curl https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_BUNDLE}/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar -Lo /tmp/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar 25 | 26 | FROM apache/hive:4.0.0 27 | 28 | ENV HADOOP_VERSION=3.3.6 29 | ENV AWS_SDK_BUNDLE=1.12.753 30 | 31 | COPY --from=build /tmp/hadoop-aws-${HADOOP_VERSION}.jar /opt/hive/lib/hadoop-aws-${HADOOP_VERSION}.jar 32 | COPY --from=build /tmp/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar /opt/hive/lib/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar 33 | COPY core-site.xml /opt/hadoop/etc/hadoop/core-site.xml 34 | -------------------------------------------------------------------------------- /dev/hive/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 19 | 20 | 21 | 22 | fs.defaultFS 23 | s3a://warehouse/hive 24 | 25 | 26 | fs.s3a.impl 27 | org.apache.hadoop.fs.s3a.S3AFileSystem 28 | 29 | 30 | fs.s3a.fast.upload 31 | true 32 | 33 | 34 | fs.s3a.endpoint 35 | http://minio:9000 36 | 37 | 38 | fs.s3a.access.key 39 | admin 40 | 41 | 42 | fs.s3a.secret.key 43 | password 44 | 45 | 46 | fs.s3a.connection.ssl.enabled 47 | false 48 | 49 | 50 | fs.s3a.path.style.access 51 | true 52 | 53 | 54 | -------------------------------------------------------------------------------- /dev/run-azurite.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. 
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#

set -ex

# Start the Azurite container only if it is not already running.
# Quote the command substitutions: unquoted `$( … )` inside `[ … ]` is
# word-split, which breaks if `docker ps -q` ever prints more than one
# container id, and relies on the fragile implicit-`-n` form of `[`.
if [ -n "$(docker ps -q --filter "name=azurite" --filter "status=running")" ]; then
    echo "Azurite backend running"
else
    docker compose -f dev/docker-compose-azurite.yml kill
    docker compose -f dev/docker-compose-azurite.yml up -d
    while [ -z "$(docker ps -q --filter "name=azurite" --filter "status=running")" ]
    do
        echo "Waiting for Azurite"
        sleep 1
    done
fi

#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#

set -ex

# Start the fake GCS server only if it is not already running; same quoting
# fix as above so an empty or multi-id `docker ps -q` result is handled safely.
if [ -n "$(docker ps -q --filter "name=gcs-server" --filter "status=running")" ]; then
    echo "Fake GCS Server running"
else
    docker compose -f dev/docker-compose-gcs-server.yml kill
    docker compose -f dev/docker-compose-gcs-server.yml up -d
    while [ -z "$(docker ps -q --filter "name=gcs-server" --filter "status=running")" ]
    do
        echo "Waiting for Fake GCS Server"
        sleep 1
    done
fi

#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
19 | # 20 | 21 | set -ex 22 | 23 | if [ $(docker ps -q --filter "name=pyiceberg-minio" --filter "status=running" ) ]; then 24 | echo "Minio backend running" 25 | else 26 | docker compose -f dev/docker-compose.yml kill 27 | docker compose -f dev/docker-compose.yml up -d 28 | while [ -z $(docker ps -q --filter "name=pyiceberg-minio" --filter "status=running" ) ] 29 | do 30 | echo "Waiting for Minio" 31 | sleep 1 32 | done 33 | fi 34 | -------------------------------------------------------------------------------- /dev/spark-defaults.conf: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | spark.sql.extensions org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions 19 | spark.sql.catalog.rest org.apache.iceberg.spark.SparkCatalog 20 | spark.sql.catalog.rest.type rest 21 | spark.sql.catalog.rest.uri http://rest:8181 22 | spark.sql.catalog.rest.io-impl org.apache.iceberg.aws.s3.S3FileIO 23 | spark.sql.catalog.rest.warehouse s3://warehouse/rest/ 24 | spark.sql.catalog.rest.s3.endpoint http://minio:9000 25 | spark.sql.catalog.hive org.apache.iceberg.spark.SparkCatalog 26 | spark.sql.catalog.hive.type hive 27 | spark.sql.catalog.hive.uri http://hive:9083 28 | spark.sql.catalog.hive.io-impl org.apache.iceberg.aws.s3.S3FileIO 29 | spark.sql.catalog.hive.warehouse s3://warehouse/hive/ 30 | spark.sql.catalog.hive.s3.endpoint http://minio:9000 31 | spark.sql.defaultCatalog rest 32 | spark.eventLog.enabled true 33 | spark.eventLog.dir /home/iceberg/spark-events 34 | spark.history.fs.logDirectory /home/iceberg/spark-events 35 | spark.sql.catalogImplementation in-memory 36 | -------------------------------------------------------------------------------- /mkdocs/README.md: -------------------------------------------------------------------------------- 1 | 17 | 18 | # Docs 19 | 20 | The pyiceberg docs are stored in `docs/`. 
21 | 22 | ## Running docs locally 23 | 24 | ```sh 25 | make docs-install 26 | make docs-serve 27 | ``` 28 | -------------------------------------------------------------------------------- /mkdocs/docs/SUMMARY.md: -------------------------------------------------------------------------------- 1 | 17 | 18 | 19 | 20 | 21 | # Summary 22 | 23 | - [Getting started](index.md) 24 | - [Configuration](configuration.md) 25 | - [CLI](cli.md) 26 | - [API](api.md) 27 | - [Contributing](contributing.md) 28 | - [Community](community.md) 29 | - Releases 30 | - [Verify a release](verify-release.md) 31 | - [How to release](how-to-release.md) 32 | - [Release Notes](https://github.com/apache/iceberg-python/releases) 33 | - [Nightly Build](nightly-build.md) 34 | - [Code Reference](reference/) 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /mkdocs/docs/assets/images/gen-release-notes.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/iceberg-python/a67c5592f3243d255519581fedfcc5d93274b9c8/mkdocs/docs/assets/images/gen-release-notes.jpg -------------------------------------------------------------------------------- /mkdocs/docs/assets/images/iceberg-logo-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apache/iceberg-python/a67c5592f3243d255519581fedfcc5d93274b9c8/mkdocs/docs/assets/images/iceberg-logo-icon.png -------------------------------------------------------------------------------- /mkdocs/docs/community.md: -------------------------------------------------------------------------------- 1 | --- 2 | hide: 3 | - navigation 4 | --- 5 | 6 | 24 | 25 | # Join the community 26 | 27 | Apache Iceberg tracks issues in GitHub and prefers to receive contributions as pull requests. 
28 | 29 | Community discussions happen primarily on the [dev mailing list](https://lists.apache.org/list.html?dev@iceberg.apache.org), on [Apache Iceberg Slack workspace](https://join.slack.com/t/apache-iceberg/shared_invite/zt-287g3akar-K9Oe_En5j1UL7Y_Ikpai3A) in the #python channel, and on specific [GitHub issues](https://github.com/apache/iceberg-python/issues). 30 | 31 | ## Iceberg Community Events 32 | 33 | The PyIceberg community sync is on the last Tuesday of every month. To join, make sure to subscribe to the [iceberg-python-sync Google group](https://groups.google.com/g/iceberg-python-sync). 34 | 35 | ## Community Guidelines 36 | 37 | ### Apache Iceberg Community Guidelines 38 | 39 | The Apache Iceberg community is built on the principles described in the [Apache Way](https://www.apache.org/theapacheway/index.html) 40 | and all who engage with the community are expected to be respectful, open, come with the best interests of the community in mind, 41 | and abide by the Apache Foundation [Code of Conduct](https://www.apache.org/foundation/policies/conduct.html). 42 | 43 | ### Participants with Corporate Interests 44 | 45 | A wide range of corporate entities have interests that overlap in both features and frameworks related to Iceberg and while we 46 | encourage engagement and contributions, the community is not a venue for marketing, solicitation, or recruitment. 47 | 48 | Any vendor who wants to participate in the Apache Iceberg community Slack workspace should create a dedicated vendor channel 49 | for their organization prefixed by `vendor-`. 50 | 51 | This space can be used to discuss features and integration with Iceberg related to the vendor offering. This space should not 52 | be used to promote competing vendor products/services or disparage other vendor offerings. Discussion should be focused on 53 | questions asked by the community and not to expand/introduce/redirect users to alternate offerings. 
54 | 55 | ### Marketing / Solicitation / Recruiting 56 | 57 | The Apache Iceberg community is a space for everyone to operate free of influence. The development lists, Slack workspace, 58 | and GitHub should not be used to market products or services. Solicitation or overt promotion should not be performed in common 59 | channels or through direct messages. 60 | 61 | Recruitment of community members should not be conducted through direct messages or community channels, but opportunities 62 | related to contributing to or using Iceberg can be posted to the `#jobs` channel. 63 | 64 | For questions regarding any of the guidelines above, please contact a PMC member 65 | -------------------------------------------------------------------------------- /mkdocs/docs/nightly-build.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # Nightly Build 21 | 22 | A nightly build of PyIceberg is available on testpypi, [https://test.pypi.org/project/pyiceberg/](https://test.pypi.org/project/pyiceberg/). 23 | 24 | To install the nightly build, 25 | 26 | ```shell 27 | pip install -i https://test.pypi.org/simple/ --pre pyiceberg 28 | ``` 29 | 30 | 31 | 32 | !!! warning "For Testing Purposes Only" 33 | Nightly builds are for testing purposes only and have not been validated. Please use at your own risk, as they may contain untested changes, potential bugs, or incomplete features. Additionally, ensure compliance with any applicable licenses, as these builds may include changes that have not been reviewed for legal or licensing implications. 34 | 35 | 36 | -------------------------------------------------------------------------------- /mkdocs/docs/verify-release.md: -------------------------------------------------------------------------------- 1 | 19 | 20 | # Verifying a release 21 | 22 | Each Apache PyIceberg release is validated by the community by holding a vote. 
A community release manager will prepare a release candidate and call a vote on the Iceberg dev list. To validate the release candidate, community members will test it out in their downstream projects and environments. 23 | 24 | In addition to testing in downstream projects, community members also check the release’s signatures, checksums, and license documentation. 25 | 26 | ## Validating a release candidate 27 | 28 | Release announcements include links to the following: 29 | 30 | - A source tarball 31 | - A signature (.asc) 32 | - A checksum (.sha512) 33 | - KEYS file 34 | - GitHub change comparison 35 | 36 | After downloading the source tarball, signature, checksum, and KEYS file, here are instructions on how to verify signatures, checksums, and documentation. 37 | 38 | ## Verifying signatures 39 | 40 | First, import the keys. 41 | 42 | ```sh 43 | curl https://downloads.apache.org/iceberg/KEYS -o KEYS 44 | gpg --import KEYS 45 | ``` 46 | 47 | Set an environment variable to the version to verify and path to use 48 | 49 | ```sh 50 | export PYICEBERG_VERSION= # e.g. 0.6.1rc3 51 | export PYICEBERG_VERIFICATION_DIR=/tmp/pyiceberg/${PYICEBERG_VERSION} 52 | ``` 53 | 54 | Next, verify the `.asc` file. 
55 | 56 | ```sh 57 | svn checkout https://dist.apache.org/repos/dist/dev/iceberg/pyiceberg-${PYICEBERG_VERSION}/ ${PYICEBERG_VERIFICATION_DIR} 58 | 59 | cd ${PYICEBERG_VERIFICATION_DIR} 60 | 61 | for name in $(ls pyiceberg-*.whl pyiceberg-*.tar.gz) 62 | do 63 | gpg --verify ${name}.asc ${name} 64 | done 65 | ``` 66 | 67 | ## Verifying checksums 68 | 69 | ```sh 70 | cd ${PYICEBERG_VERIFICATION_DIR} 71 | for name in $(ls pyiceberg-*.whl.sha512 pyiceberg-*.tar.gz.sha512) 72 | do 73 | shasum -a 512 --check ${name} 74 | done 75 | ``` 76 | 77 | ## Verifying License Documentation 78 | 79 | ```sh 80 | export PYICEBERG_RELEASE_VERSION=${PYICEBERG_VERSION/rc?/} # remove rcX qualifier 81 | tar xzf pyiceberg-${PYICEBERG_RELEASE_VERSION}.tar.gz 82 | cd pyiceberg-${PYICEBERG_RELEASE_VERSION} 83 | ``` 84 | 85 | Run RAT checks to validate license header: 86 | 87 | ```shell 88 | ./dev/check-license 89 | ``` 90 | 91 | ## Testing 92 | 93 | This section explains how to run the tests of the source distribution. 94 | 95 | 96 | 97 | !!! note "Python Version" 98 | Make sure you're using [a supported Python version](https://github.com/apache/iceberg-python/blob/main/pyproject.toml#L29-L32) 99 | 100 | 101 | 102 | First step is to install the package: 103 | 104 | ```sh 105 | make install 106 | ``` 107 | 108 | To run the full test coverage, with both unit tests and integration tests: 109 | 110 | ```sh 111 | make test-coverage 112 | ``` 113 | 114 | This will spin up Docker containers to facilitate running test coverage. 115 | 116 | # Cast the vote 117 | 118 | Votes are cast by replying to the release candidate announcement email on the dev mailing list with either `+1`, `0`, or `-1`. For example : 119 | 120 | > [ ] +1 Release this as PyIceberg 0.3.0 121 | > 122 | > [ ] +0 123 | > 124 | > [ ] -1 Do not release this because… 125 | 126 | In addition to your vote, it’s customary to specify if your vote is binding or non-binding. 
Only members of the Project Management Committee have formally binding votes. If you’re unsure, you can specify that your vote is non-binding. To read more about voting in the Apache framework, checkout the [Voting](https://www.apache.org/foundation/voting.html) information page on the Apache foundation’s website. 127 | -------------------------------------------------------------------------------- /mkdocs/gen_doc_stubs.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
"""Generate per-module API-reference pages for the mkdocs site.

For every Python module under ``pyiceberg/`` this script emits a virtual
markdown file containing a mkdocstrings directive (``::: <module.path>``)
under ``reference/`` and collects the pages into a literate-nav
``reference/SUMMARY.md``. It is executed by the ``gen-files`` mkdocs plugin
at build time (see mkdocs.yml).
"""

from pathlib import Path

import griffe  # type: ignore
import mkdocs_gen_files  # type: ignore

# Navigation tree mapping module-path tuples to their generated doc pages.
nav = mkdocs_gen_files.Nav()

# Repository root (this file lives in mkdocs/) and the package to document.
root = Path(__file__).parent.parent
src_root = root.joinpath("pyiceberg")

# Load the package's API tree so undocumented modules can be skipped below.
data = griffe.load(src_root)
for path in sorted(src_root.glob("**/*.py")):
    module_path = path.relative_to(root).with_suffix("")  # e.g. pyiceberg/schema
    doc_path = path.relative_to(root).with_suffix(".md")  # e.g. pyiceberg/schema.md
    full_doc_path = Path("reference", doc_path)

    parts = tuple(module_path.parts)

    if parts[-1] == "__init__":
        # A package's __init__ becomes the package's index.md page.
        parts = parts[:-1]
        doc_path = doc_path.with_name("index.md")
        full_doc_path = full_doc_path.with_name("index.md")
    elif parts[-1].startswith("_"):
        # Skip private modules.
        continue

    # Skip modules griffe knows about that carry no docstrings at all.
    # NOTE(review): `module_path.parts[1:]` is a tuple, while `data.members`
    # appears to be keyed by plain names — confirm tuple membership is
    # supported here; if not, this skip never fires. TODO verify against
    # the griffe documentation.
    if module_path.parts[1:] in data.members and not data[module_path.parts[1:]].has_docstrings:
        continue

    nav[parts] = doc_path.as_posix()

    # Write the mkdocstrings directive that renders this module's API docs.
    with mkdocs_gen_files.open(full_doc_path, "w") as fd:
        ident = ".".join(parts)
        fd.write(f"::: {ident}")

    # Make "edit this page" links point at the real source file.
    mkdocs_gen_files.set_edit_path(full_doc_path, Path("../") / path)

# Emit the summary consumed by the literate-nav plugin.
with mkdocs_gen_files.open("reference/SUMMARY.md", "w") as nav_file:
    nav_file.writelines(nav.build_literate_nav())

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | --- 18 | site_name: PyIceberg 19 | site_url: https://py.iceberg.apache.org/ 20 | repo_url: "https://github.com/apache/iceberg-python" 21 | repo_name: "apache/iceberg-python" 22 | 23 | plugins: 24 | - gen-files: 25 | scripts: 26 | - gen_doc_stubs.py 27 | - literate-nav: 28 | nav_file: SUMMARY.md 29 | - search 30 | - section-index 31 | - mkdocstrings: 32 | handlers: 33 | python: 34 | paths: [..] 35 | 36 | theme: 37 | name: material 38 | logo: assets/images/iceberg-logo-icon.png 39 | favicon: assets/images/iceberg-logo-icon.png 40 | font: 41 | text: Lato 42 | features: 43 | - navigation.top 44 | - navigation.tracking 45 | - navigation.tabs 46 | - navigation.tabs.sticky 47 | palette: 48 | - scheme: default 49 | toggle: 50 | icon: material/brightness-7 51 | name: Switch to dark mode 52 | - scheme: slate 53 | toggle: 54 | icon: material/brightness-4 55 | name: Switch to light mode 56 | 57 | markdown_extensions: 58 | - admonition 59 | - pymdownx.highlight: 60 | anchor_linenums: true 61 | - pymdownx.superfences 62 | - toc: 63 | permalink: true 64 | -------------------------------------------------------------------------------- /pyiceberg/__init__.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. 
The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | __version__ = "0.10.0" 19 | -------------------------------------------------------------------------------- /pyiceberg/avro/__init__.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | import struct 18 | 19 | STRUCT_BOOL = struct.Struct("?") 20 | STRUCT_FLOAT = struct.Struct(" tuple[bytes, int]: 27 | compressed_data = bz2.compress(data) 28 | return compressed_data, len(compressed_data) 29 | 30 | @staticmethod 31 | def decompress(data: bytes) -> bytes: 32 | return bz2.decompress(data) 33 | 34 | except ImportError: 35 | 36 | class BZip2Codec(Codec): # type: ignore 37 | @staticmethod 38 | def compress(data: bytes) -> tuple[bytes, int]: 39 | raise ImportError("Python bzip2 support not installed, please install the extension") 40 | 41 | @staticmethod 42 | def decompress(data: bytes) -> bytes: 43 | raise ImportError("Python bzip2 support not installed, please install the extension") 44 | -------------------------------------------------------------------------------- /pyiceberg/avro/codecs/codec.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
from __future__ import annotations

from abc import ABC, abstractmethod


class Codec(ABC):
    """Abstract base class for all Avro codec classes.

    Concrete codecs (deflate, bzip2, snappy, zstandard) implement the
    compress/decompress pair used for Avro block compression.
    """

    @staticmethod
    @abstractmethod
    def compress(data: bytes) -> tuple[bytes, int]:
        """Compress ``data``; return the compressed bytes and their length."""
        ...

    @staticmethod
    @abstractmethod
    def decompress(data: bytes) -> bytes:
        """Decompress ``data`` and return the original bytes."""
        ...

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations

import zlib

from pyiceberg.avro.codecs.codec import Codec


class DeflateCodec(Codec):
    @staticmethod
    def compress(data: bytes) -> tuple[bytes, int]:
        # The first two characters and last character are zlib
        # wrappers around deflate data.
29 | compressed_data = zlib.compress(data)[2:-1] 30 | return compressed_data, len(compressed_data) 31 | 32 | @staticmethod 33 | def decompress(data: bytes) -> bytes: 34 | # -15 is the log of the window size; negative indicates 35 | # "raw" (no zlib headers) decompression. See zlib.h. 36 | return zlib.decompress(data, -15) 37 | -------------------------------------------------------------------------------- /pyiceberg/avro/codecs/snappy_codec.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | from __future__ import annotations 18 | 19 | import binascii 20 | import struct 21 | 22 | from pyiceberg.avro.codecs.codec import Codec 23 | 24 | STRUCT_CRC32 = struct.Struct(">I") # big-endian unsigned int 25 | 26 | try: 27 | import snappy 28 | 29 | class SnappyCodec(Codec): 30 | @staticmethod 31 | def _check_crc32(bytes_: bytes, checksum: bytes) -> None: 32 | """Incrementally compute CRC-32 from bytes and compare to a checksum. 
33 | 34 | Args: 35 | bytes_ (bytes): The bytes to check against `checksum` 36 | checksum (bytes): Byte representation of a checksum 37 | 38 | Raises: 39 | ValueError: If the computed CRC-32 does not match the checksum 40 | """ 41 | if binascii.crc32(bytes_) & 0xFFFFFFFF != STRUCT_CRC32.unpack(checksum)[0]: 42 | raise ValueError("Checksum failure") 43 | 44 | @staticmethod 45 | def compress(data: bytes) -> tuple[bytes, int]: 46 | compressed_data = snappy.compress(data) 47 | # A 4-byte, big-endian CRC32 checksum 48 | compressed_data += STRUCT_CRC32.pack(binascii.crc32(data) & 0xFFFFFFFF) 49 | return compressed_data, len(compressed_data) 50 | 51 | @staticmethod 52 | def decompress(data: bytes) -> bytes: 53 | # Compressed data includes a 4-byte CRC32 checksum 54 | data = data[0:-4] 55 | uncompressed = snappy.decompress(data) 56 | checksum = data[-4:] 57 | SnappyCodec._check_crc32(uncompressed, checksum) 58 | return uncompressed 59 | 60 | except ImportError: 61 | 62 | class SnappyCodec(Codec): # type: ignore 63 | @staticmethod 64 | def compress(data: bytes) -> tuple[bytes, int]: 65 | raise ImportError("Snappy support not installed, please install using `pip install pyiceberg[snappy]`") 66 | 67 | @staticmethod 68 | def decompress(data: bytes) -> bytes: 69 | raise ImportError("Snappy support not installed, please install using `pip install pyiceberg[snappy]`") 70 | -------------------------------------------------------------------------------- /pyiceberg/avro/codecs/zstandard_codec.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | from __future__ import annotations 18 | 19 | from io import BytesIO 20 | 21 | from pyiceberg.avro.codecs.codec import Codec 22 | 23 | try: 24 | from zstandard import ZstdCompressor, ZstdDecompressor 25 | 26 | class ZStandardCodec(Codec): 27 | @staticmethod 28 | def compress(data: bytes) -> tuple[bytes, int]: 29 | compressed_data = ZstdCompressor().compress(data) 30 | return compressed_data, len(compressed_data) 31 | 32 | @staticmethod 33 | def decompress(data: bytes) -> bytes: 34 | uncompressed = bytearray() 35 | dctx = ZstdDecompressor() 36 | with dctx.stream_reader(BytesIO(data)) as reader: 37 | while True: 38 | chunk = reader.read(16384) 39 | if not chunk: 40 | break 41 | uncompressed.extend(chunk) 42 | return uncompressed 43 | 44 | except ImportError: 45 | 46 | class ZStandardCodec(Codec): # type: ignore 47 | @staticmethod 48 | def compress(data: bytes) -> tuple[bytes, int]: 49 | raise ImportError("Zstandard support not installed, please install using `pip install pyiceberg[zstandard]`") 50 | 51 | @staticmethod 52 | def decompress(data: bytes) -> bytes: 53 | raise ImportError("Zstandard support not installed, please install using `pip install pyiceberg[zstandard]`") 54 | -------------------------------------------------------------------------------- /pyiceberg/avro/decoder_basic.c: -------------------------------------------------------------------------------- 1 | /* 2 | Licensed to the Apache Software Foundation (ASF) under one 3 | or more contributor license agreements. 
/*
  Licensed to the Apache Software Foundation (ASF) under one
  or more contributor license agreements. See the NOTICE file
  distributed with this work for additional information
  regarding copyright ownership. The ASF licenses this file
  to you under the Apache License, Version 2.0 (the
  "License"); you may not use this file except in compliance
  with the License. You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing,
  software distributed under the License is distributed on an
  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  KIND, either express or implied. See the License for the
  specific language governing permissions and limitations
  under the License.
*/

/* Fix: the header name was lost (rendered away as an HTML-like tag);
   <stdint.h> is required for uint64_t. */
#include <stdint.h>

/*
  Decode an array of zig-zag encoded integers from a buffer.

  The buffer is advanced to the end of the integers.
  `count` is the number of integers to decode.
  `result` is where the decoded integers are stored.

  The result is guaranteed to be 64 bits wide.
*/
static inline void decode_zigzag_ints(const unsigned char **buffer, const uint64_t count, uint64_t *result) {
    uint64_t current_index;
    const unsigned char *current_position = *buffer;
    uint64_t temp;
    // The largest shift will always be < 64
    unsigned char shift;

    for (current_index = 0; current_index < count; current_index++) {
        shift = 7;
        temp = *current_position & 0x7F;
        // Varint continuation: the high bit of each byte marks that another
        // 7-bit group follows.
        while(*current_position & 0x80) {
            current_position += 1;
            temp |= (uint64_t)(*current_position & 0x7F) << shift;
            shift += 7;
        }
        // Zig-zag decode: `~(temp & 1) + 1` is 0 when the low (sign) bit is
        // clear and all-ones when set, so the XOR restores negative values.
        result[current_index] = (temp >> 1) ^ (~(temp & 1) + 1);
        current_position += 1;
    }
    *buffer = current_position;
}


/*
  Skip a zig-zag encoded integer in a buffer.

  The buffer is advanced to the end of the integer.
*/
static inline void skip_zigzag_int(const unsigned char **buffer) {
    // Continuation bytes have the high bit set; the final byte does not.
    while(**buffer & 0x80) {
        *buffer += 1;
    }
    *buffer += 1;
}
from pyiceberg.avro.decoder import BinaryDecoder

class CythonBinaryDecoder(BinaryDecoder):
    """Type stub for the Cython-accelerated binary Avro decoder.

    The implementation lives in ``decoder_fast.pyx``; this stub only declares
    the public interface for static type checkers, so the method bodies are
    intentionally empty.
    """

    def __init__(self, input_contents: bytes) -> None:
        pass

    def tell(self) -> int:
        # Current read position within the input buffer.
        pass

    def read(self, n: int) -> bytes:
        # Read exactly `n` raw bytes.
        pass

    def read_boolean(self) -> bool:
        pass

    def read_int(self) -> int:
        # Variable-length zig-zag encoded int/long (see decoder_basic.c).
        pass

    def read_ints(self, count: int) -> tuple[int, ...]:
        # Batch variant of read_int.
        pass

    def read_int_bytes_dict(self, count: int, dest: dict[int, bytes]) -> None:
        # Read `count` (int key, bytes value) pairs into `dest`.
        pass

    def read_bytes(self) -> bytes:
        pass

    def read_float(self) -> float:
        pass

    def read_double(self) -> float:
        pass

    def read_utf8(self) -> str:
        pass

    def skip(self, n: int) -> None:
        # Advance the position by `n` bytes without decoding.
        pass

    def skip_int(self) -> None:
        pass

    def skip_boolean(self) -> None:
        pass

    def skip_float(self) -> None:
        pass

    def skip_double(self) -> None:
        pass

    def skip_bytes(self) -> None:
        pass

    def skip_utf8(self) -> None:
        pass
from typing import Any
from uuid import UUID

from pyiceberg.avro import STRUCT_DOUBLE, STRUCT_FLOAT
from pyiceberg.io import OutputStream
from pyiceberg.typedef import UTF8


class BinaryEncoder:
    """Encodes Python physical types into bytes."""

    _output_stream: OutputStream

    def __init__(self, output_stream: OutputStream) -> None:
        self._output_stream = output_stream

    def write(self, b: bytes) -> None:
        """Write raw bytes to the underlying output stream."""
        self._output_stream.write(b)

    def write_boolean(self, boolean: bool) -> None:
        """Write a boolean as a single byte whose value is either 0 (false) or 1 (true).

        Args:
            boolean: The boolean to write.
        """
        self.write(bytearray([bool(boolean)]))

    def write_int(self, integer: int) -> None:
        """Integer and long values are written using variable-length zig-zag coding.

        The zig-zag step maps the sign into the lowest bit (the `>> 63` assumes
        the value fits a signed 64-bit long); the result is then emitted as a
        varint, 7 bits per byte with the high bit as a continuation flag.
        """
        datum = (integer << 1) ^ (integer >> 63)
        while (datum & ~0x7F) != 0:
            self.write(bytearray([(datum & 0x7F) | 0x80]))
            datum >>= 7
        self.write(bytearray([datum]))

    def write_float(self, f: float) -> None:
        """Write a float as 4 bytes."""
        self.write(STRUCT_FLOAT.pack(f))

    def write_double(self, f: float) -> None:
        """Write a double as 8 bytes."""
        self.write(STRUCT_DOUBLE.pack(f))

    def write_bytes(self, b: bytes) -> None:
        """Bytes are encoded as a long followed by that many bytes of data."""
        self.write_int(len(b))
        self.write(b)

    def write_utf8(self, s: str) -> None:
        """Encode a string as a long followed by that many bytes of UTF-8 encoded character data."""
        self.write_bytes(s.encode(UTF8))

    def write_uuid(self, uuid: UUID) -> None:
        """Write UUID as a fixed[16].

        The uuid logical type represents a random generated universally unique identifier (UUID).
        An uuid logical type annotates an Avro string. The string has to conform with RFC-4122.
        """
        if len(uuid.bytes) != 16:
            # Fix: the original message interpolated `uuid.bytes!r` inside the
            # literal text `len(...)`, yielding e.g. "got: len(b'...')";
            # report the actual length instead.
            raise ValueError(f"Expected UUID to have 16 bytes, got: {len(uuid.bytes)}")
        return self.write(uuid.bytes)

    def write_unknown(self, _: Any) -> None:
        """Nulls are written as 0 bytes in avro, so we do nothing."""
from pyiceberg.catalog.sql import SqlCatalog


class InMemoryCatalog(SqlCatalog):
    """
    A non-persistent catalog built on SqlCatalog over an in-memory SQLite database.

    Intended for tests, demos, and experimentation only: it does not support
    concurrent access and everything is lost when the process exits, so it is
    not suitable for production use.
    """

    def __init__(self, name: str, warehouse: str = "file:///tmp/iceberg/warehouse", **kwargs: str) -> None:
        self._warehouse_location = warehouse
        # Default to SQLite in memory unless the caller supplied an explicit URI.
        kwargs.setdefault("uri", "sqlite:///:memory:")
        super().__init__(name=name, warehouse=warehouse, **kwargs)
from json import JSONDecodeError
from typing import Dict, Literal, Optional, Type

from pydantic import Field, ValidationError
from requests import HTTPError

from pyiceberg.exceptions import (
    AuthorizationExpiredError,
    BadRequestError,
    ForbiddenError,
    OAuthError,
    RESTError,
    ServerError,
    ServiceUnavailableError,
    UnauthorizedError,
)
from pyiceberg.typedef import IcebergBaseModel


class TokenResponse(IcebergBaseModel):
    """Response body of a successful token-endpoint call (OAuth2-style fields)."""

    access_token: str = Field()
    token_type: str = Field()
    expires_in: Optional[int] = Field(default=None)
    issued_token_type: Optional[str] = Field(default=None)
    refresh_token: Optional[str] = Field(default=None)
    scope: Optional[str] = Field(default=None)


class ErrorResponseMessage(IcebergBaseModel):
    """Inner error object of a REST catalog error payload."""

    message: str = Field()
    type: str = Field()
    code: int = Field()


class ErrorResponse(IcebergBaseModel):
    """Standard REST catalog error payload: a single ``error`` object."""

    error: ErrorResponseMessage = Field()


class OAuthErrorResponse(IcebergBaseModel):
    """OAuth error payload; ``error`` is restricted to the RFC 6749 error codes."""

    error: Literal[
        "invalid_request", "invalid_client", "invalid_grant", "unauthorized_client", "unsupported_grant_type", "invalid_scope"
    ]
    error_description: Optional[str] = None
    error_uri: Optional[str] = None


def _handle_non_200_response(exc: HTTPError, error_handler: Dict[int, Type[Exception]]) -> None:
    """Translate a non-2xx REST catalog response into a typed exception and raise it.

    Args:
        exc: The HTTPError raised by requests; its attached response provides the
            status code and payload.
        error_handler: Per-call overrides mapping a status code to an exception
            type; consulted before the generic status-code mapping below.

    Raises:
        Exception: Always raises; the concrete type is chosen from the status code.
        ValueError: If the HTTPError carries no response object at all.
    """
    exception: Type[Exception]

    if exc.response is None:
        raise ValueError("Did not receive a response")

    code = exc.response.status_code
    # Caller-supplied mapping wins over the generic mapping below.
    if code in error_handler:
        exception = error_handler[code]
    elif code == 400:
        exception = BadRequestError
    elif code == 401:
        exception = UnauthorizedError
    elif code == 403:
        exception = ForbiddenError
    elif code == 422:
        exception = RESTError
    elif code == 419:
        # Non-standard status code, mapped to expired authorization.
        exception = AuthorizationExpiredError
    elif code == 501:
        # NOTE(review): this is the builtin NotImplementedError, not a RESTError
        # subclass — callers catching RESTError will not see it.
        exception = NotImplementedError
    elif code == 503:
        exception = ServiceUnavailableError
    elif 500 <= code < 600:
        exception = ServerError
    else:
        exception = RESTError

    # Best-effort parsing of the payload into a readable message; parsing
    # failures fall through to the generic fallbacks below.
    try:
        if exception == OAuthError:
            # The OAuthErrorResponse has a different format
            error = OAuthErrorResponse.model_validate_json(exc.response.text)
            response = str(error.error)
            if description := error.error_description:
                response += f": {description}"
            if uri := error.error_uri:
                response += f" ({uri})"
        else:
            error = ErrorResponse.model_validate_json(exc.response.text).error
            response = f"{error.type}: {error.message}"
    except JSONDecodeError:
        # In the case we don't have a proper response
        response = f"RESTError {exc.response.status_code}: Could not decode json payload: {exc.response.text}"
    except ValidationError as e:
        # In the case we don't have a proper response
        errs = ", ".join(err["msg"] for err in e.errors())
        response = f"RESTError {exc.response.status_code}: Received unexpected JSON Payload: {exc.response.text}, errors: {errs}"

    raise exception(response) from exc
See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | -------------------------------------------------------------------------------- /pyiceberg/exceptions.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
class TableAlreadyExistsError(Exception):
    """Raised when creating a table with a name that already exists."""


class NamespaceNotEmptyError(Exception):
    """Raised when a namespace being dropped is not empty."""


class NamespaceAlreadyExistsError(Exception):
    """Raised when a namespace being created already exists in the catalog."""


class ValidationError(Exception):
    """Raised when there is an issue with the schema."""


class NoSuchTableError(Exception):
    """Raised when the table can't be found in the REST catalog."""


class NoSuchIcebergTableError(NoSuchTableError):
    """Raised when the table found in the REST catalog is not an iceberg table."""


class NoSuchViewError(Exception):
    """Raised when the view can't be found in the REST catalog."""


class NoSuchIdentifierError(Exception):
    """Raised when the identifier can't be found in the REST catalog."""


class NoSuchNamespaceError(Exception):
    """Raised when a referenced namespace is not found."""


class RESTError(Exception):
    """Raised when there is an unknown response from the REST Catalog."""


class BadRequestError(RESTError):
    """Raised when an invalid request is being made."""


class UnauthorizedError(RESTError):
    """Raised when you don't have the proper authorization."""


class ServiceUnavailableError(RESTError):
    """Raised when the service doesn't respond."""


class ServerError(RESTError):
    """Raised when there is an unhandled exception on the server side."""


class ForbiddenError(RESTError):
    """Raised when you don't have the credentials to perform the action on the REST catalog."""


class AuthorizationExpiredError(RESTError):
    """Raised when the credentials are expired when performing an action on the REST catalog."""


class OAuthError(RESTError):
    """Raised when there is an error with the OAuth call."""


class NoSuchPropertyException(Exception):
    """Raised when a property is missing."""


class NotInstalledError(Exception):
    """Raised when an optional dependency is not installed."""


class SignError(Exception):
    """Raised when unable to sign a S3 request."""


class ResolveError(Exception):
    # NOTE(review): raised during schema resolution — confirm against avro/resolver.py usage.
    pass


class DynamoDbError(Exception):
    # Base class for DynamoDB catalog errors.
    pass


class ConditionalCheckFailedException(DynamoDbError):
    pass


class GenericDynamoDbError(DynamoDbError):
    pass


class CommitFailedException(Exception):
    """Commit failed, refresh and try again."""


class CommitStateUnknownException(RESTError):
    """Commit failed due to unknown reason."""


class WaitingForLockException(Exception):
    """Need to wait for a lock, try again."""


# NOTE(review): distinct from ValidationError above — this one is not schema-related.
class ValidationException(Exception):
    """Raised when validation fails."""
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | # Marker file for PEP 561 19 | -------------------------------------------------------------------------------- /pyiceberg/table/puffin.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
import math
from typing import TYPE_CHECKING, Dict, List, Literal, Optional

from pydantic import Field
from pyroaring import BitMap, FrozenBitMap

from pyiceberg.typedef import IcebergBaseModel

if TYPE_CHECKING:
    import pyarrow as pa

# Short for: Puffin Fratercula arctica, version 1
MAGIC_BYTES = b"PFA1"
EMPTY_BITMAP = FrozenBitMap()
MAX_JAVA_SIGNED = int(math.pow(2, 31)) - 1
PROPERTY_REFERENCED_DATA_FILE = "referenced-data-file"


def _deserialize_bitmap(pl: bytes) -> List[BitMap]:
    """Deserialize a 64-bit roaring bitmap payload into a list of 32-bit bitmaps.

    Layout (little-endian): an 8-byte bitmap count, then for each bitmap a
    4-byte key followed by a serialized 32-bit roaring bitmap. Keys must be
    strictly increasing; gaps are padded with empty bitmaps so that an entry's
    index in the returned list equals its key.
    """
    number_of_bitmaps = int.from_bytes(pl[0:8], byteorder="little")
    pl = pl[8:]

    bitmaps = []
    last_key = -1
    for _ in range(number_of_bitmaps):
        key = int.from_bytes(pl[0:4], byteorder="little")
        # NOTE(review): int.from_bytes over unsigned bytes can never be negative;
        # this guard is purely defensive.
        if key < 0:
            raise ValueError(f"Invalid unsigned key: {key}")
        if key <= last_key:
            raise ValueError("Keys must be sorted in ascending order")
        if key > MAX_JAVA_SIGNED:
            raise ValueError(f"Key {key} is too large, max {MAX_JAVA_SIGNED} to maintain compatibility with Java impl")
        pl = pl[4:]

        # Pad skipped keys with empty bitmaps so list position encodes the key.
        while last_key < key - 1:
            bitmaps.append(EMPTY_BITMAP)
            last_key += 1

        bm = BitMap().deserialize(pl)
        # TODO: Optimize this
        # Re-serializing only to learn the consumed byte count costs an extra pass.
        pl = pl[len(bm.serialize()) :]
        bitmaps.append(bm)

        last_key = key

    return bitmaps


class PuffinBlobMetadata(IcebergBaseModel):
    """Footer entry describing one blob (offset/length span) inside the Puffin file."""

    type: Literal["deletion-vector-v1"] = Field()
    fields: List[int] = Field()
    snapshot_id: int = Field(alias="snapshot-id")
    sequence_number: int = Field(alias="sequence-number")
    offset: int = Field()
    length: int = Field()
    compression_codec: Optional[str] = Field(alias="compression-codec", default=None)
    properties: Dict[str, str] = Field(default_factory=dict)


class Footer(IcebergBaseModel):
    """Puffin footer payload: blob index plus file-level properties."""

    blobs: List[PuffinBlobMetadata] = Field()
    properties: Dict[str, str] = Field(default_factory=dict)


def _bitmaps_to_chunked_array(bitmaps: List[BitMap]) -> "pa.ChunkedArray":
    """Flatten keyed 32-bit bitmaps into 64-bit positions: (key << 32) | position."""
    import pyarrow as pa

    return pa.chunked_array([(key_pos << 32) + pos for pos in bitmap] for key_pos, bitmap in enumerate(bitmaps))


class PuffinFile:
    """Parsed in-memory representation of a Puffin statistics file."""

    footer: Footer
    # Deletion vectors keyed by the data-file path they reference.
    _deletion_vectors: Dict[str, List[BitMap]]

    def __init__(self, puffin: bytes) -> None:
        # The magic appears both at the start and the end of the file.
        for magic_bytes in [puffin[:4], puffin[-4:]]:
            if magic_bytes != MAGIC_BYTES:
                raise ValueError(f"Incorrect magic bytes, expected {MAGIC_BYTES!r}, got {magic_bytes!r}")

        # One flag is set, the rest should be zero
        # byte 0 (first)
        # - bit 0 (lowest bit): whether FooterPayload is compressed
        # - all other bits are reserved for future use and should be set to 0 on write
        flags = puffin[-8:-4]
        if flags[0] != 0:
            raise ValueError("The Puffin-file has a compressed footer, which is not yet supported")

        # 4 byte integer is always signed, in a two's complement representation, stored little-endian.
        footer_payload_size_int = int.from_bytes(puffin[-12:-8], byteorder="little")

        # Footer payload sits immediately before the 12 trailing bytes
        # (payload size + flags + magic).
        self.footer = Footer.model_validate_json(puffin[-(footer_payload_size_int + 12) : -12])
        # NOTE(review): blob offsets are applied after dropping the first 8 bytes —
        # confirm against the Puffin spec, which defines offsets from file start.
        puffin = puffin[8:]

        self._deletion_vectors = {
            blob.properties[PROPERTY_REFERENCED_DATA_FILE]: _deserialize_bitmap(puffin[blob.offset : blob.offset + blob.length])
            for blob in self.footer.blobs
        }

    def to_vector(self) -> Dict[str, "pa.ChunkedArray"]:
        """Return each referenced data file's deletion vector as a pyarrow ChunkedArray."""
        return {path: _bitmaps_to_chunked_array(bitmaps) for path, bitmaps in self._deletion_vectors.items()}
from enum import Enum
from typing import Annotated, Optional

from pydantic import Field, model_validator

from pyiceberg.exceptions import ValidationError
from pyiceberg.typedef import IcebergBaseModel

# Name used for a table's default branch.
MAIN_BRANCH = "main"


class SnapshotRefType(str, Enum):
    """Kind of a named snapshot reference: a movable branch or an immutable tag."""

    BRANCH = "branch"
    TAG = "tag"

    def __repr__(self) -> str:
        """Return the string representation of the SnapshotRefType class."""
        return f"SnapshotRefType.{self.name}"

    def __str__(self) -> str:
        """Return the string representation of the SnapshotRefType class."""
        return self.value


class SnapshotRef(IcebergBaseModel):
    """A named reference to a snapshot with optional retention settings.

    Fields are (de)serialized under their kebab-case spec aliases
    (e.g. ``min-snapshots-to-keep``); retention values must be positive.
    The snapshot-count/age retention settings only apply to branches.
    """

    snapshot_id: int = Field(alias="snapshot-id")
    snapshot_ref_type: SnapshotRefType = Field(alias="type")
    min_snapshots_to_keep: Annotated[Optional[int], Field(alias="min-snapshots-to-keep", default=None, gt=0)]
    max_snapshot_age_ms: Annotated[Optional[int], Field(alias="max-snapshot-age-ms", default=None, gt=0)]
    max_ref_age_ms: Annotated[Optional[int], Field(alias="max-ref-age-ms", default=None, gt=0)]

    @model_validator(mode="after")
    def check_min_snapshots_to_keep(self) -> "SnapshotRef":
        """Reject min_snapshots_to_keep on tags; it is only meaningful for branches."""
        if self.min_snapshots_to_keep is not None and self.snapshot_ref_type == SnapshotRefType.TAG:
            raise ValidationError("Tags do not support setting minSnapshotsToKeep")
        return self

    @model_validator(mode="after")
    def check_max_snapshot_age_ms(self) -> "SnapshotRef":
        """Reject max_snapshot_age_ms on tags; it is only meaningful for branches."""
        if self.max_snapshot_age_ms is not None and self.snapshot_ref_type == SnapshotRefType.TAG:
            raise ValidationError("Tags do not support setting maxSnapshotAgeMs")
        return self
from typing import Dict, List, Literal, Optional

from pydantic import Field

from pyiceberg.typedef import IcebergBaseModel


class BlobMetadata(IcebergBaseModel):
    """Metadata describing a single blob inside a statistics file."""

    type: Literal["apache-datasketches-theta-v1", "deletion-vector-v1"]
    snapshot_id: int = Field(alias="snapshot-id")
    sequence_number: int = Field(alias="sequence-number")
    fields: List[int]
    properties: Optional[Dict[str, str]] = None


class StatisticsFile(IcebergBaseModel):
    """Table-metadata entry pointing at a statistics file for one snapshot.

    Fields are (de)serialized under their kebab-case spec aliases.
    """

    snapshot_id: int = Field(alias="snapshot-id")
    statistics_path: str = Field(alias="statistics-path")
    file_size_in_bytes: int = Field(alias="file-size-in-bytes")
    file_footer_size_in_bytes: int = Field(alias="file-footer-size-in-bytes")
    key_metadata: Optional[str] = Field(alias="key-metadata", default=None)
    blob_metadata: List[BlobMetadata] = Field(alias="blob-metadata")


def filter_statistics_by_snapshot_id(
    statistics: List[StatisticsFile],
    reject_snapshot_id: int,
) -> List[StatisticsFile]:
    """Return the statistics entries whose snapshot id differs from ``reject_snapshot_id``."""
    return [stat for stat in statistics if stat.snapshot_id != reject_snapshot_id]
from typing import TYPE_CHECKING, Tuple

from pyiceberg.table.statistics import StatisticsFile
from pyiceberg.table.update import (
    RemoveStatisticsUpdate,
    SetStatisticsUpdate,
    TableUpdate,
    UpdatesAndRequirements,
    UpdateTableMetadata,
)

if TYPE_CHECKING:
    from pyiceberg.table import Transaction


class UpdateStatistics(UpdateTableMetadata["UpdateStatistics"]):
    """
    Run statistics management operations using APIs.

    APIs include set_statistics and remove statistics operations.

    Use table.update_statistics().set_statistics(...).commit() to run a specific operation.
    Chain several calls, e.g. table.update_statistics().set_statistics(...).remove_statistics(...).commit(),
    to run multiple operations.

    Pending changes are applied on commit.

    We can also use context managers to make more changes. For example:

    with table.update_statistics() as update:
        update.set_statistics(statistics_file=statistics_file)
        update.remove_statistics(snapshot_id=2)
    """

    # Updates queued so far; appended to by the operations below and emitted on commit.
    _updates: Tuple[TableUpdate, ...] = ()

    def __init__(self, transaction: "Transaction") -> None:
        super().__init__(transaction)

    def set_statistics(self, statistics_file: StatisticsFile) -> "UpdateStatistics":
        """Queue an update that attaches the given statistics file to its snapshot."""
        self._updates += (
            SetStatisticsUpdate(
                statistics=statistics_file,
            ),
        )

        return self

    def remove_statistics(self, snapshot_id: int) -> "UpdateStatistics":
        """Queue an update that removes the statistics for the given snapshot."""
        # Fix: append (+=) instead of assigning (=). The original assignment
        # silently discarded any updates queued earlier in the same builder,
        # breaking the operation chaining documented on the class.
        self._updates += (
            RemoveStatisticsUpdate(
                snapshot_id=snapshot_id,
            ),
        )

        return self

    def _commit(self) -> UpdatesAndRequirements:
        """Return the queued updates; statistics changes carry no requirements."""
        return self._updates, ()
from __future__ import annotations

from typing import (
    Callable,
    Generic,
    Iterable,
    List,
    Optional,
    TypeVar,
)

T = TypeVar("T")


class Bin(Generic[T]):
    """A single bin that accumulates items up to a target weight."""

    def __init__(self, target_weight: int) -> None:
        self.items: List[T] = []
        self.bin_weight = 0
        self.target_weight = target_weight

    def weight(self) -> int:
        """Total weight of the items currently in this bin."""
        return self.bin_weight

    def can_add(self, weight: int) -> bool:
        """Whether an item of the given weight still fits within the target."""
        return self.bin_weight + weight <= self.target_weight

    def add(self, item: T, weight: int) -> None:
        """Place an item into the bin and account for its weight."""
        self.bin_weight += weight
        self.items.append(item)


class PackingIterator(Generic[T]):
    """Lazily pack items from an iterable into bins of at most ``target_weight``.

    At most ``lookback`` bins stay open at a time; once that is exceeded, one
    bin is closed and yielded. With ``largest_bin_first`` the heaviest open bin
    is closed first, otherwise the oldest.
    """

    bins: List[Bin[T]]

    def __init__(
        self,
        items: Iterable[T],
        target_weight: int,
        lookback: int,
        weight_func: Callable[[T], int],
        largest_bin_first: bool = False,
    ) -> None:
        self.items = iter(items)
        self.target_weight = target_weight
        self.lookback = lookback
        self.weight_func = weight_func
        self.largest_bin_first = largest_bin_first
        self.bins = []

    def __iter__(self) -> PackingIterator[T]:
        """Return the iterator itself."""
        return self

    def __next__(self) -> List[T]:
        """Return the items of the next closed bin, consuming input as needed."""
        # Keep consuming the (shared, stateful) source iterator until the
        # number of open bins exceeds the lookback window.
        for item in self.items:
            item_weight = self.weight_func(item)
            destination = self.find_bin(item_weight)
            if destination is None:
                destination = Bin(self.target_weight)
                self.bins.append(destination)
            destination.add(item, item_weight)
            if len(self.bins) > self.lookback:
                return self.remove_bin().items
        # Source exhausted: drain the remaining open bins, one per call.
        if not self.bins:
            raise StopIteration()
        return self.remove_bin().items

    def find_bin(self, weight: int) -> Optional[Bin[T]]:
        """Return the first open bin that can still fit ``weight``, or None."""
        return next((candidate for candidate in self.bins if candidate.can_add(weight)), None)

    def remove_bin(self) -> Bin[T]:
        """Close and return a bin: the heaviest when ``largest_bin_first``, else the oldest."""
        if not self.largest_bin_first:
            return self.bins.pop(0)
        heaviest = max(self.bins, key=lambda candidate: candidate.weight())
        self.bins.remove(heaviest)
        return heaviest


class ListPacker(Generic[T]):
    """Eagerly pack a list of items into weighted groups."""

    _target_weight: int
    _lookback: int
    _largest_bin_first: bool

    def __init__(self, target_weight: int, lookback: int, largest_bin_first: bool) -> None:
        self._target_weight = target_weight
        self._lookback = lookback
        self._largest_bin_first = largest_bin_first

    def pack(self, items: List[T], weight_func: Callable[[T], int]) -> List[List[T]]:
        """Pack items front-to-back and return the resulting groups."""
        iterator = PackingIterator(
            items=items,
            target_weight=self._target_weight,
            lookback=self._lookback,
            weight_func=weight_func,
            largest_bin_first=self._largest_bin_first,
        )
        return list(iterator)

    def pack_end(self, items: List[T], weight_func: Callable[[T], int]) -> List[List[T]]:
        """Pack items back-to-front while preserving the original order inside each group."""
        reversed_groups = self.pack(items=list(reversed(items)), weight_func=weight_func)
        return [group[::-1] for group in reversed_groups][::-1]
-------------------------------------------------------------------------------- /pyiceberg/utils/concurrent.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | """Concurrency concepts that support efficient multi-threading.""" 18 | 19 | from concurrent.futures import Executor, ThreadPoolExecutor 20 | from typing import Optional 21 | 22 | from pyiceberg.utils.config import Config 23 | 24 | 25 | class ExecutorFactory: 26 | _instance: Optional[Executor] = None 27 | 28 | @staticmethod 29 | def get_or_create() -> Executor: 30 | """Return the same executor in each call.""" 31 | if ExecutorFactory._instance is None: 32 | max_workers = ExecutorFactory.max_workers() 33 | ExecutorFactory._instance = ThreadPoolExecutor(max_workers=max_workers) 34 | 35 | return ExecutorFactory._instance 36 | 37 | @staticmethod 38 | def max_workers() -> Optional[int]: 39 | """Return the max number of workers configured.""" 40 | return Config().get_int("max-workers") 41 | -------------------------------------------------------------------------------- /pyiceberg/utils/deprecated.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
import functools
import warnings
from typing import Any, Callable, Optional


def deprecated(deprecated_in: str, removed_in: str, help_message: Optional[str] = None) -> Callable:  # type: ignore
    """Mark functions as deprecated.

    Adding this will result in a warning being emitted when the function is used.

    Args:
        deprecated_in: Version in which the function was deprecated.
        removed_in: Version in which the function will be removed.
        help_message: Optional hint about what to use instead.
    """
    # Bug fix: normalize up front so an omitted help message does not leak the
    # literal string "None" into the warning text.
    suffix = "" if help_message is None else f" {help_message}."

    def decorator(func: Callable):  # type: ignore
        @functools.wraps(func)
        def new_func(*args: Any, **kwargs: Any) -> Any:
            message = f"Call to {func.__name__}, deprecated in {deprecated_in}, will be removed in {removed_in}.{suffix}"

            _deprecation_warning(message)

            return func(*args, **kwargs)

        return new_func

    return decorator


def deprecation_notice(deprecated_in: str, removed_in: str, help_message: Optional[str]) -> str:
    """Return a deprecation notice.

    A None ``help_message`` is omitted instead of being rendered as "None".
    """
    notice = f"Deprecated in {deprecated_in}, will be removed in {removed_in}."
    return f"{notice} {help_message}" if help_message else notice


def deprecation_message(deprecated_in: str, removed_in: str, help_message: Optional[str]) -> None:
    """Mark properties or behaviors as deprecated.

    Adding this will result in a warning being emitted.
    """
    _deprecation_warning(deprecation_notice(deprecated_in, removed_in, help_message))


def _deprecation_warning(message: str) -> None:
    with warnings.catch_warnings():  # temporarily override warning handling
        # Without this, DeprecationWarning is filtered out by default and the
        # comment above would be a no-op.
        warnings.simplefilter("always", DeprecationWarning)
        warnings.warn(
            message,
            category=DeprecationWarning,
            stacklevel=2,
        )
from typing import (
    Dict,
    Iterator,
    Mapping,
    Optional,
    Sequence,
    TypeVar,
    Union,
    cast,
)

K = TypeVar("K")
V = TypeVar("V")


class LazyDict(Mapping[K, V]):
    """Lazily build a dictionary from an array of items.

    Each entry in ``contents`` is a flat sequence of alternating keys and
    values (``[k1, v1, k2, v2, ...]``). The backing dict is materialized on
    first access and cached for subsequent accesses.
    """

    __slots__ = ("_contents", "_dict")

    # Since Python's type system is not powerful enough to express the type of the
    # contents of the dictionary, we specify the type as a sequence of either K or V
    # values.
    #
    # Rather than spending the runtime cost of checking the type of each item, we presume
    # that the developer has correctly used the class and that the contents are valid.
    def __init__(self, contents: Sequence[Sequence[Union[K, V]]]):
        self._contents = contents
        self._dict: Optional[Dict[K, V]] = None

    def _build_dict(self) -> Dict[K, V]:
        self._dict = {}
        for item in self._contents:
            # Even-indexed entries are keys, odd-indexed entries are values.
            self._dict.update(dict(zip(cast(Sequence[K], item[::2]), cast(Sequence[V], item[1::2]))))

        return self._dict

    def _source(self) -> Dict[K, V]:
        # Bug fix: check for None explicitly. The previous `self._dict or
        # self._build_dict()` treated a legitimately-empty (falsy) cached dict
        # as "not built yet" and rebuilt it on every single access.
        if self._dict is None:
            return self._build_dict()
        return self._dict

    def __getitem__(self, key: K, /) -> V:
        """Return the value for the given key."""
        return self._source()[key]

    def __iter__(self) -> Iterator[K]:
        """Return an iterator over the keys of the dictionary."""
        return iter(self._source())

    def __len__(self) -> int:
        """Return the number of items in the dictionary."""
        return len(self._source())

    def __dict__(self) -> Dict[K, V]:  # type: ignore
        """Convert the lazy dict into a plain dict."""
        return self._source()
class ParseNumberFromBrackets:
    """Extracts the size from a string in the form of prefix[22]."""

    regex: Pattern  # type: ignore
    prefix: str

    def __init__(self, prefix: str):
        self.prefix = prefix
        # Pre-compile once; the group captures the digits between the brackets.
        self.regex = re.compile(rf"{prefix}\[(\d+)\]")

    def match(self, str_repr: str) -> int:
        """Return the number embedded in ``str_repr``.

        Raises:
            ValidationError: If ``str_repr`` does not match ``prefix[<digits>]``.
        """
        found = self.regex.search(str_repr)
        if found is None:
            raise ValidationError(f"Could not match {str_repr}, expected format {self.prefix}[22]")
        return int(found.group(1))
# Prefix identifying properties that should be forwarded as request headers.
HEADER_PREFIX = "header."


def property_as_int(
    properties: Dict[str, str],
    property_name: str,
    default: Optional[int] = None,
) -> Optional[int]:
    """Look up a property and parse it as an int, falling back to ``default``.

    Missing or empty values yield the default; unparseable values raise.
    """
    value = properties.get(property_name)
    if not value:
        return default
    try:
        return int(value)
    except ValueError as e:
        raise ValueError(f"Could not parse table property {property_name} to an integer: {value}") from e


def property_as_float(
    properties: Dict[str, str],
    property_name: str,
    default: Optional[float] = None,
) -> Optional[float]:
    """Look up a property and parse it as a float, falling back to ``default``."""
    value = properties.get(property_name)
    if not value:
        return default
    try:
        return float(value)
    except ValueError as e:
        raise ValueError(f"Could not parse table property {property_name} to a float: {value}") from e


def property_as_bool(
    properties: Dict[str, str],
    property_name: str,
    default: bool,
) -> bool:
    """Look up a property and parse it as a bool, falling back to ``default``."""
    value = properties.get(property_name)
    if not value:
        return default
    try:
        return strtobool(value)
    except ValueError as e:
        raise ValueError(f"Could not parse table property {property_name} to a boolean: {value}") from e


def get_first_property_value(
    properties: "Properties",
    *property_names: str,
) -> Optional[Any]:
    """Return the first truthy value among the given property names, or None."""
    for name in property_names:
        if value := properties.get(name):
            return value
    return None


def get_header_properties(
    properties: "Properties",
) -> "Properties":
    """Return the ``header.``-prefixed properties, with the prefix stripped."""
    prefix_length = len(HEADER_PREFIX)
    return {key[prefix_length:]: value for key, value in properties.items() if key.startswith(HEADER_PREFIX)}
agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | """ 18 | This is a singleton metaclass that can be used to cache and reuse existing objects. 19 | 20 | In the Iceberg codebase we have a lot of objects that are stateless (for example Types such as StringType, 21 | BooleanType etc). FixedTypes have arguments (eg. Fixed[22]) that we also make part of the key when caching 22 | the newly created object. 23 | 24 | The Singleton uses a metaclass which essentially defines a new type. When the Type gets created, it will first 25 | evaluate the `__call__` method with all the arguments. If we already initialized a class earlier, we'll just 26 | return it. 
27 | 28 | More information on metaclasses: https://docs.python.org/3/reference/datamodel.html#metaclasses 29 | """ 30 | 31 | from typing import Any, ClassVar, Dict 32 | 33 | 34 | def _convert_to_hashable_type(element: Any) -> Any: 35 | if isinstance(element, dict): 36 | return tuple((_convert_to_hashable_type(k), _convert_to_hashable_type(v)) for k, v in element.items()) 37 | elif isinstance(element, list): 38 | return tuple(map(_convert_to_hashable_type, element)) 39 | return element 40 | 41 | 42 | class Singleton: 43 | _instances: ClassVar[Dict] = {} # type: ignore 44 | 45 | def __new__(cls, *args, **kwargs): # type: ignore 46 | key = (cls, tuple(args), _convert_to_hashable_type(kwargs)) 47 | if key not in cls._instances: 48 | cls._instances[key] = super().__new__(cls) 49 | return cls._instances[key] 50 | 51 | def __deepcopy__(self, memo: Dict[int, Any]) -> Any: 52 | """ 53 | Prevent deep copy operations for singletons. 54 | 55 | The IcebergRootModel inherits from Pydantic RootModel, 56 | which has its own implementation of deepcopy. When deepcopy 57 | runs, it calls the RootModel __deepcopy__ method and ignores 58 | that it's a Singleton. To handle this, the order of inheritance 59 | is adjusted and a __deepcopy__ method is implemented for 60 | singletons that simply returns itself. 61 | """ 62 | return self 63 | -------------------------------------------------------------------------------- /pyiceberg/utils/truncate.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. 
from typing import Optional


def truncate_upper_bound_text_string(value: str, trunc_length: Optional[int]) -> Optional[str]:
    """Truncate ``value`` to ``trunc_length`` characters and round up to a valid upper bound.

    If truncation dropped characters, the last incrementable character of the
    prefix is bumped by one code point so the result still compares greater
    than or equal to every string sharing that prefix. Returns None when no
    character can be incremented.
    """
    truncated = value[:trunc_length]
    if truncated == value:
        # Nothing was cut off; the value is its own upper bound.
        return truncated

    chars = list(truncated)
    for idx in range(len(chars) - 1, -1, -1):
        try:
            # chr() raises ValueError once past the maximum unicode code point.
            chars[idx] = chr(ord(chars[idx]) + 1)
        except ValueError:
            continue
        return "".join(chars)
    return None  # didn't find a valid upper bound


def truncate_upper_bound_binary_string(value: bytes, trunc_length: Optional[int]) -> Optional[bytes]:
    """Truncate ``value`` to ``trunc_length`` bytes and round up to a valid upper bound.

    Returns None when every byte of the truncated prefix is already 0xFF.
    """
    truncated = value[:trunc_length]
    if truncated == value:
        return truncated

    byte_values = bytearray(truncated)
    for idx in range(len(byte_values) - 1, -1, -1):
        if byte_values[idx] != 255:
            byte_values[idx] += 1
            return bytes(byte_values)
    return None
The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | src = ['pyiceberg','tests'] 19 | extend-exclude = ["dev/provision.py"] 20 | 21 | # Exclude a variety of commonly ignored directories. 22 | exclude = [ 23 | ".bzr", 24 | ".direnv", 25 | ".eggs", 26 | ".git", 27 | ".git-rewrite", 28 | ".hg", 29 | ".mypy_cache", 30 | ".nox", 31 | ".pants.d", 32 | ".pytype", 33 | ".ruff_cache", 34 | ".svn", 35 | ".tox", 36 | ".venv", 37 | "__pypackages__", 38 | "_build", 39 | "buck-out", 40 | "build", 41 | "dist", 42 | "node_modules", 43 | "venv", 44 | ] 45 | 46 | # Ignore _all_ violations. 47 | # Same as Black. 48 | line-length = 130 49 | 50 | [lint] 51 | select = [ 52 | "E", # pycodestyle 53 | "W", # pycodestyle 54 | "F", # Pyflakes 55 | "B", # flake8-bugbear 56 | "PIE", # flake8-pie 57 | "C4", # flake8-comprehensions 58 | "I", # isort 59 | "UP", # pyupgrade 60 | ] 61 | ignore = ["E501","E203","B024","B028","UP037", "UP035", "UP006"] 62 | 63 | # Allow autofix for all enabled rules (when `--fix`) is provided. 64 | fixable = ["ALL"] 65 | unfixable = [] 66 | 67 | per-file-ignores = {} 68 | 69 | # Allow unused variables when underscore-prefixed. 70 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 71 | 72 | [lint.pyupgrade] 73 | # Preserve types, even if a file imports `from __future__ import annotations`. 
74 | keep-runtime-typing = true 75 | 76 | [lint.isort] 77 | detect-same-package = true 78 | lines-between-types = 0 79 | known-first-party = ["pyiceberg", "tests"] 80 | section-order = ["future", "standard-library", "third-party", "first-party", "local-folder"] 81 | 82 | [format] 83 | quote-style = "double" 84 | -------------------------------------------------------------------------------- /tests/avro/test_encoder.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
17 | from __future__ import annotations 18 | 19 | import io 20 | import struct 21 | import uuid 22 | 23 | from pyiceberg.avro.encoder import BinaryEncoder 24 | 25 | 26 | def test_write() -> None: 27 | output = io.BytesIO() 28 | encoder = BinaryEncoder(output) 29 | 30 | _input = b"\x12\x34\x56" 31 | 32 | encoder.write(_input) 33 | 34 | assert output.getbuffer() == _input 35 | 36 | 37 | def test_write_boolean() -> None: 38 | output = io.BytesIO() 39 | encoder = BinaryEncoder(output) 40 | 41 | encoder.write_boolean(True) 42 | encoder.write_boolean(False) 43 | 44 | assert output.getbuffer() == struct.pack("??", True, False) 45 | 46 | 47 | def test_write_int() -> None: 48 | output = io.BytesIO() 49 | encoder = BinaryEncoder(output) 50 | 51 | _1byte_input = 2 52 | _2byte_input = 7466 53 | _3byte_input = 523490 54 | _4byte_input = 86561570 55 | _5byte_input = 2510416930 56 | _6byte_input = 734929016866 57 | _7byte_input = 135081528772642 58 | _8byte_input = 35124861473277986 59 | 60 | encoder.write_int(_1byte_input) 61 | encoder.write_int(_2byte_input) 62 | encoder.write_int(_3byte_input) 63 | encoder.write_int(_4byte_input) 64 | encoder.write_int(_5byte_input) 65 | encoder.write_int(_6byte_input) 66 | encoder.write_int(_7byte_input) 67 | encoder.write_int(_8byte_input) 68 | 69 | buffer = output.getbuffer() 70 | 71 | assert buffer[0:1] == b"\x04" 72 | assert buffer[1:3] == b"\xd4\x74" 73 | assert buffer[3:6] == b"\xc4\xf3\x3f" 74 | assert buffer[6:10] == b"\xc4\xcc\xc6\x52" 75 | assert buffer[10:15] == b"\xc4\xb0\x8f\xda\x12" 76 | assert buffer[15:21] == b"\xc4\xe0\xf6\xd2\xe3\x2a" 77 | assert buffer[21:28] == b"\xc4\xa0\xce\xe8\xe3\xb6\x3d" 78 | assert buffer[28:36] == b"\xc4\xa0\xb2\xae\x83\xf8\xe4\x7c" 79 | 80 | 81 | def test_write_float() -> None: 82 | output = io.BytesIO() 83 | encoder = BinaryEncoder(output) 84 | 85 | _input = 3.14159265359 86 | 87 | encoder.write_float(_input) 88 | 89 | assert output.getbuffer() == struct.pack(" None: 93 | output = io.BytesIO() 94 
| encoder = BinaryEncoder(output) 95 | 96 | _input = 3.14159265359 97 | 98 | encoder.write_double(_input) 99 | 100 | assert output.getbuffer() == struct.pack(" None: 104 | output = io.BytesIO() 105 | encoder = BinaryEncoder(output) 106 | 107 | _input = b"\x12\x34\x56" 108 | 109 | encoder.write_bytes(_input) 110 | 111 | assert output.getbuffer() == b"".join([b"\x06", _input]) 112 | 113 | 114 | def test_write_utf8() -> None: 115 | output = io.BytesIO() 116 | encoder = BinaryEncoder(output) 117 | 118 | _input = "That, my liege, is how we know the Earth to be banana-shaped." 119 | bin_input = _input.encode() 120 | encoder.write_utf8(_input) 121 | 122 | assert output.getbuffer() == b"".join([b"\x7a", bin_input]) 123 | 124 | 125 | def test_write_uuid() -> None: 126 | output = io.BytesIO() 127 | encoder = BinaryEncoder(output) 128 | 129 | _input = uuid.UUID("12345678-1234-5678-1234-567812345678") 130 | encoder.write_uuid(_input) 131 | 132 | buf = output.getbuffer() 133 | assert len(buf) == 16 134 | assert buf.tobytes() == b"\x124Vx\x124Vx\x124Vx\x124Vx" 135 | -------------------------------------------------------------------------------- /tests/benchmark/test_benchmark.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. 
import statistics
import timeit
# Bug fix: import the submodule explicitly. `import urllib` alone does not
# guarantee `urllib.request` is an available attribute.
import urllib.request

import pyarrow as pa
import pyarrow.parquet as pq
import pytest

from pyiceberg.transforms import DayTransform


@pytest.fixture(scope="session")
def taxi_dataset(tmp_path_factory: pytest.TempPathFactory) -> pa.Table:
    """Download the NYC taxi dataset once per session and load it as an Arrow table."""
    taxi_dataset = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet"
    taxi_dataset_dest = tmp_path_factory.mktemp("taxi_dataset") / "yellow_tripdata_2022-01.parquet"
    urllib.request.urlretrieve(taxi_dataset, taxi_dataset_dest)

    return pq.read_table(taxi_dataset_dest)


@pytest.mark.benchmark
def test_partitioned_write(tmp_path_factory: pytest.TempPathFactory, taxi_dataset: pa.Table) -> None:
    """Tests writing to a partitioned table with something close to a production-like situation."""
    from pyiceberg.catalog.sql import SqlCatalog

    warehouse_path = str(tmp_path_factory.mktemp("warehouse"))
    catalog = SqlCatalog(
        "default",
        uri=f"sqlite:///{warehouse_path}/pyiceberg_catalog.db",
        warehouse=f"file://{warehouse_path}",
    )

    catalog.create_namespace("default")

    tbl = catalog.create_table("default.taxi_partitioned", schema=taxi_dataset.schema)

    # Partition by day of pickup time, exercising the partitioned write path.
    with tbl.update_spec() as spec:
        spec.add_field("tpep_pickup_datetime", DayTransform())

    # Profiling can sometimes be handy as well
    # with cProfile.Profile() as pr:
    #     tbl.append(taxi_dataset)
    #
    #     pr.print_stats(sort=True)

    runs = []
    for run in range(5):
        start_time = timeit.default_timer()
        tbl.append(taxi_dataset)
        elapsed = timeit.default_timer() - start_time

        print(f"Run {run} took: {elapsed}")
        runs.append(elapsed)

    print(f"Average runtime of {round(statistics.mean(runs), 2)} seconds")
import base64

import pytest
import requests
from requests_mock import Mocker

from pyiceberg.catalog.rest.auth import AuthManagerAdapter, BasicAuthManager, NoopAuthManager

TEST_URI = "https://iceberg-test-catalog/"


@pytest.fixture
def rest_mock(requests_mock: Mocker) -> Mocker:
    """Register a catch-all 200 response for the test catalog URI."""
    requests_mock.get(TEST_URI, json={}, status_code=200)
    return requests_mock


def test_noop_auth_header(rest_mock: Mocker) -> None:
    """NoopAuthManager must not attach any Authorization header."""
    session = requests.Session()
    session.auth = AuthManagerAdapter(NoopAuthManager())

    session.get(TEST_URI)

    requests_made = rest_mock.request_history
    assert len(requests_made) == 1
    assert "Authorization" not in requests_made[0].headers


def test_basic_auth_header(rest_mock: Mocker) -> None:
    """BasicAuthManager must attach a correctly base64-encoded Basic header."""
    username = "testuser"
    password = "testpassword"
    credentials = base64.b64encode(f"{username}:{password}".encode()).decode()

    session = requests.Session()
    session.auth = AuthManagerAdapter(BasicAuthManager(username=username, password=password))

    session.get(TEST_URI)

    requests_made = rest_mock.request_history
    assert len(requests_made) == 1
    assert requests_made[0].headers["Authorization"] == f"Basic {credentials}"
The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | -------------------------------------------------------------------------------- /tests/integration/test_register_table.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
import pytest

from pyiceberg.catalog import Catalog
from pyiceberg.exceptions import NoSuchTableError, TableAlreadyExistsError
from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec
from pyiceberg.schema import Schema
from pyiceberg.table import Table
from pyiceberg.types import (
    BooleanType,
    DateType,
    IntegerType,
    NestedField,
    StringType,
)

# Schema shared by every register-table test below.
TABLE_SCHEMA = Schema(
    NestedField(field_id=1, name="foo", field_type=BooleanType(), required=False),
    NestedField(field_id=2, name="bar", field_type=StringType(), required=False),
    NestedField(field_id=4, name="baz", field_type=IntegerType(), required=False),
    NestedField(field_id=10, name="qux", field_type=DateType(), required=False),
)


def _create_table(
    session_catalog: Catalog,
    identifier: str,
    format_version: int,
    location: str,
    partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC,
    schema: Schema = TABLE_SCHEMA,
) -> Table:
    """Drop ``identifier`` if it exists, then create it fresh and return the new table."""
    try:
        session_catalog.drop_table(identifier=identifier)
    except NoSuchTableError:
        # Table was not there yet — nothing to clean up.
        pass

    return session_catalog.create_table(
        identifier=identifier,
        schema=schema,
        location=location,
        properties={"format-version": str(format_version)},
        partition_spec=partition_spec,
    )


@pytest.mark.integration
@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")])
def test_register_table(
    catalog: Catalog,
) -> None:
    """A dropped table can be re-registered from its previous metadata file."""
    identifier = "default.register_table"
    location = "s3a://warehouse/default/register_table"

    tbl = _create_table(catalog, identifier, 2, location)
    assert catalog.table_exists(identifier=identifier)

    catalog.drop_table(identifier=identifier)
    assert not catalog.table_exists(identifier=identifier)

    catalog.register_table(("default", "register_table"), metadata_location=tbl.metadata_location)
    assert catalog.table_exists(identifier=identifier)


@pytest.mark.integration
@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")])
def test_register_table_existing(
    catalog: Catalog,
) -> None:
    """Registering over an existing table must raise TableAlreadyExistsError."""
    identifier = "default.register_table_existing"
    location = "s3a://warehouse/default/register_table_existing"

    tbl = _create_table(catalog, identifier, 2, location)
    assert catalog.table_exists(identifier=identifier)

    # Assert that registering the table again raises TableAlreadyExistsError
    with pytest.raises(TableAlreadyExistsError):
        catalog.register_table(("default", "register_table_existing"), metadata_location=tbl.metadata_location)

# -- tests/integration/test_rest_catalog.py --
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint:disable=redefined-outer-name

import pytest

from pyiceberg.catalog.rest import RestCatalog

TEST_NAMESPACE_IDENTIFIER = "TEST NS"


def _ensure_namespace(catalog: RestCatalog, present: bool) -> None:
    """Drive the catalog into the requested precondition: namespace present or absent."""
    exists = catalog.namespace_exists(TEST_NAMESPACE_IDENTIFIER)
    if present and not exists:
        catalog.create_namespace(TEST_NAMESPACE_IDENTIFIER)
    elif not present and exists:
        catalog.drop_namespace(TEST_NAMESPACE_IDENTIFIER)


@pytest.mark.integration
@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog")])
def test_namespace_exists(catalog: RestCatalog) -> None:
    """namespace_exists reports True once the namespace has been created."""
    _ensure_namespace(catalog, present=True)
    assert catalog.namespace_exists(TEST_NAMESPACE_IDENTIFIER)


@pytest.mark.integration
@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog")])
def test_namespace_not_exists(catalog: RestCatalog) -> None:
    """namespace_exists reports False after the namespace has been dropped."""
    _ensure_namespace(catalog, present=False)
    assert not catalog.namespace_exists(TEST_NAMESPACE_IDENTIFIER)


@pytest.mark.integration
@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog")])
def test_create_namespace_if_not_exists(catalog: RestCatalog) -> None:
    """create_namespace_if_not_exists creates the namespace when it is absent."""
    _ensure_namespace(catalog, present=False)

    catalog.create_namespace_if_not_exists(TEST_NAMESPACE_IDENTIFIER)

    assert catalog.namespace_exists(TEST_NAMESPACE_IDENTIFIER)


@pytest.mark.integration
@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog")])
def test_create_namespace_if_already_existing(catalog: RestCatalog) -> None:
    """create_namespace_if_not_exists is a no-op (no error) when the namespace exists."""
    _ensure_namespace(catalog, present=True)

    catalog.create_namespace_if_not_exists(TEST_NAMESPACE_IDENTIFIER)

    assert catalog.namespace_exists(TEST_NAMESPACE_IDENTIFIER)

# -- tests/integration/test_snapshot_operations.py --
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
import pytest

from pyiceberg.catalog import Catalog
from pyiceberg.table.refs import SnapshotRef

# Pre-provisioned table with enough history for tag/branch tests.
IDENTIFIER = "default.test_table_snapshot_operations"


@pytest.mark.integration
@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")])
def test_create_tag(catalog: Catalog) -> None:
    """Tagging a historical snapshot records a matching tag ref in the metadata."""
    tbl = catalog.load_table(IDENTIFIER)
    assert len(tbl.history()) > 3

    tagged_snapshot = tbl.history()[-3].snapshot_id
    tbl.manage_snapshots().create_tag(snapshot_id=tagged_snapshot, tag_name="tag123").commit()

    assert tbl.metadata.refs["tag123"] == SnapshotRef(snapshot_id=tagged_snapshot, snapshot_ref_type="tag")


@pytest.mark.integration
@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")])
def test_create_branch(catalog: Catalog) -> None:
    """Branching from a historical snapshot records a matching branch ref."""
    tbl = catalog.load_table(IDENTIFIER)
    assert len(tbl.history()) > 2

    branched_snapshot = tbl.history()[-2].snapshot_id
    tbl.manage_snapshots().create_branch(snapshot_id=branched_snapshot, branch_name="branch123").commit()

    assert tbl.metadata.refs["branch123"] == SnapshotRef(snapshot_id=branched_snapshot, snapshot_ref_type="branch")


@pytest.mark.integration
@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")])
def test_remove_tag(catalog: Catalog) -> None:
    """A tag created on a snapshot can subsequently be removed."""
    tbl = catalog.load_table(IDENTIFIER)
    assert len(tbl.history()) > 3

    # first, create the tag to remove
    tag_name = "tag_to_remove"
    tagged_snapshot = tbl.history()[-3].snapshot_id
    tbl.manage_snapshots().create_tag(snapshot_id=tagged_snapshot, tag_name=tag_name).commit()
    assert tbl.metadata.refs[tag_name] == SnapshotRef(snapshot_id=tagged_snapshot, snapshot_ref_type="tag")

    # now, remove the tag
    tbl.manage_snapshots().remove_tag(tag_name=tag_name).commit()
    assert tbl.metadata.refs.get(tag_name, None) is None


@pytest.mark.integration
@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")])
def test_remove_branch(catalog: Catalog) -> None:
    """A branch created from a snapshot can subsequently be removed."""
    tbl = catalog.load_table(IDENTIFIER)
    assert len(tbl.history()) > 2

    # first, create the branch to remove
    branch_name = "branch_to_remove"
    branched_snapshot = tbl.history()[-2].snapshot_id
    tbl.manage_snapshots().create_branch(snapshot_id=branched_snapshot, branch_name=branch_name).commit()
    assert tbl.metadata.refs[branch_name] == SnapshotRef(snapshot_id=branched_snapshot, snapshot_ref_type="branch")

    # now, remove the branch
    tbl.manage_snapshots().remove_branch(branch_name=branch_name).commit()
    assert tbl.metadata.refs.get(branch_name, None) is None

# -- tests/integration/test_statistics_operations.py --
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
from typing import TYPE_CHECKING

import pytest

from pyiceberg.exceptions import NoSuchTableError
from pyiceberg.table.statistics import BlobMetadata, StatisticsFile

if TYPE_CHECKING:
    import pyarrow as pa

    from pyiceberg.catalog import Catalog
    from pyiceberg.schema import Schema
    from pyiceberg.table import Table


def _create_table_with_schema(catalog: "Catalog", schema: "Schema") -> "Table":
    """Drop (if present) and recreate the statistics test table with the given schema."""
    tbl_name = "default.test_table_statistics_operations"
    try:
        catalog.drop_table(tbl_name)
    except NoSuchTableError:
        # Table was not there yet — nothing to clean up.
        pass
    return catalog.create_table(identifier=tbl_name, schema=schema)


@pytest.mark.integration
@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")])
def test_manage_statistics(catalog: "Catalog", arrow_table_with_null: "pa.Table") -> None:
    """set_statistics/remove_statistics update metadata, both standalone and inside a transaction."""
    tbl = _create_table_with_schema(catalog, arrow_table_with_null.schema)

    # Two appends give two snapshots to attach statistics files to.
    tbl.append(arrow_table_with_null)
    tbl.append(arrow_table_with_null)

    snapshot_id_1 = tbl.history()[0].snapshot_id
    snapshot_id_2 = tbl.history()[1].snapshot_id

    def statistics_for(snapshot_id: int, type_name: str) -> StatisticsFile:
        """Build a StatisticsFile carrying a single blob of the given type."""
        blob = BlobMetadata(
            type=type_name,
            snapshot_id=snapshot_id,
            sequence_number=2,
            fields=[1],
            properties={"prop-key": "prop-value"},
        )
        return StatisticsFile(
            snapshot_id=snapshot_id,
            statistics_path="s3://bucket/warehouse/stats.puffin",
            file_size_in_bytes=124,
            file_footer_size_in_bytes=27,
            blob_metadata=[blob],
        )

    stats_snap_1 = statistics_for(snapshot_id_1, "apache-datasketches-theta-v1")
    stats_snap_2 = statistics_for(snapshot_id_2, "deletion-vector-v1")

    with tbl.update_statistics() as update:
        update.set_statistics(stats_snap_1)
        update.set_statistics(stats_snap_2)
    assert len(tbl.metadata.statistics) == 2

    with tbl.update_statistics() as update:
        update.remove_statistics(snapshot_id_1)
    assert len(tbl.metadata.statistics) == 1

    with tbl.transaction() as txn:
        with txn.update_statistics() as update:
            update.set_statistics(stats_snap_1)
            update.set_statistics(stats_snap_2)
    assert len(tbl.metadata.statistics) == 2

# -- tests/integration/test_writes/test_optimistic_concurrency.py --
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
import pyarrow as pa
import pytest
from pyspark.sql import SparkSession

from pyiceberg.catalog import Catalog
from pyiceberg.exceptions import CommitFailedException
from utils import _create_table

# Error pattern raised when a stale table handle commits against a moved branch.
_BRANCH_MOVED = "(branch main has changed: expected id ).*"


def _two_handles(session_catalog: Catalog, identifier: str, format_version: int, seed: pa.Table):
    """Create the table via one handle, seed it, and load a second (stale-prone) handle."""
    first = _create_table(session_catalog, identifier, {"format-version": format_version}, [seed])
    second = session_catalog.load_table(identifier)
    return first, second


@pytest.mark.integration
@pytest.mark.parametrize("format_version", [1, 2])
def test_conflict_delete_delete(
    spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int
) -> None:
    """This test should start passing once optimistic concurrency control has been implemented."""
    tbl1, tbl2 = _two_handles(session_catalog, "default.test_conflict", format_version, arrow_table_with_null)

    tbl1.delete("string == 'z'")

    with pytest.raises(CommitFailedException, match=_BRANCH_MOVED):
        # tbl2 isn't aware of the commit by tbl1
        tbl2.delete("string == 'z'")


@pytest.mark.integration
@pytest.mark.parametrize("format_version", [1, 2])
def test_conflict_delete_append(
    spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int
) -> None:
    """This test should start passing once optimistic concurrency control has been implemented."""
    tbl1, tbl2 = _two_handles(session_catalog, "default.test_conflict", format_version, arrow_table_with_null)

    # This is allowed
    tbl1.delete("string == 'z'")

    with pytest.raises(CommitFailedException, match=_BRANCH_MOVED):
        # tbl2 isn't aware of the commit by tbl1
        tbl2.append(arrow_table_with_null)


@pytest.mark.integration
@pytest.mark.parametrize("format_version", [1, 2])
def test_conflict_append_delete(
    spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int
) -> None:
    """This test should start passing once optimistic concurrency control has been implemented."""
    tbl1, tbl2 = _two_handles(session_catalog, "default.test_conflict", format_version, arrow_table_with_null)

    tbl1.append(arrow_table_with_null)

    with pytest.raises(CommitFailedException, match=_BRANCH_MOVED):
        # tbl2 isn't aware of the commit by tbl1
        tbl2.delete("string == 'z'")


@pytest.mark.integration
@pytest.mark.parametrize("format_version", [1, 2])
def test_conflict_append_append(
    spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int
) -> None:
    """This test should start passing once optimistic concurrency control has been implemented."""
    tbl1, tbl2 = _two_handles(session_catalog, "default.test_conflict", format_version, arrow_table_with_null)

    tbl1.append(arrow_table_with_null)

    with pytest.raises(CommitFailedException, match=_BRANCH_MOVED):
        # tbl2 isn't aware of the commit by tbl1
        tbl2.append(arrow_table_with_null)

# -- tests/integration/test_writes/utils.py --
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.
The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | # pylint:disable=redefined-outer-name 18 | from typing import List, Optional, Union 19 | 20 | import pyarrow as pa 21 | 22 | from pyiceberg.catalog import Catalog 23 | from pyiceberg.exceptions import NoSuchTableError 24 | from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec 25 | from pyiceberg.schema import Schema 26 | from pyiceberg.table import Table 27 | from pyiceberg.typedef import EMPTY_DICT, Properties 28 | from pyiceberg.types import ( 29 | BinaryType, 30 | BooleanType, 31 | DateType, 32 | DoubleType, 33 | FixedType, 34 | FloatType, 35 | IntegerType, 36 | LongType, 37 | NestedField, 38 | StringType, 39 | TimestampType, 40 | TimestamptzType, 41 | ) 42 | 43 | TABLE_SCHEMA = Schema( 44 | NestedField(field_id=1, name="bool", field_type=BooleanType(), required=False), 45 | NestedField(field_id=2, name="string", field_type=StringType(), required=False), 46 | NestedField(field_id=3, name="string_long", field_type=StringType(), required=False), 47 | NestedField(field_id=4, name="int", field_type=IntegerType(), required=False), 48 | NestedField(field_id=5, name="long", field_type=LongType(), required=False), 49 | NestedField(field_id=6, name="float", field_type=FloatType(), required=False), 50 | NestedField(field_id=7, name="double", field_type=DoubleType(), required=False), 51 | # NestedField(field_id=8, name="time", 
field_type=TimeType(), required=False), # Spark does not support time fields 52 | NestedField(field_id=8, name="timestamp", field_type=TimestampType(), required=False), 53 | NestedField(field_id=9, name="timestamptz", field_type=TimestamptzType(), required=False), 54 | NestedField(field_id=10, name="date", field_type=DateType(), required=False), 55 | # NestedField(field_id=11, name="time", field_type=TimeType(), required=False), 56 | # NestedField(field_id=12, name="uuid", field_type=UuidType(), required=False), 57 | NestedField(field_id=11, name="binary", field_type=BinaryType(), required=False), 58 | NestedField(field_id=12, name="fixed", field_type=FixedType(16), required=False), 59 | ) 60 | 61 | 62 | def _create_table( 63 | session_catalog: Catalog, 64 | identifier: str, 65 | properties: Properties = EMPTY_DICT, 66 | data: Optional[List[pa.Table]] = None, 67 | partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC, 68 | schema: Union[Schema, "pa.Schema"] = TABLE_SCHEMA, 69 | ) -> Table: 70 | try: 71 | session_catalog.drop_table(identifier=identifier) 72 | except NoSuchTableError: 73 | pass 74 | 75 | tbl = session_catalog.create_table(identifier=identifier, schema=schema, properties=properties, partition_spec=partition_spec) 76 | 77 | if data is not None: 78 | for d in data: 79 | tbl.append(d) 80 | 81 | return tbl 82 | -------------------------------------------------------------------------------- /tests/table/bitmaps/64map32bitvals.bin: -------------------------------------------------------------------------------- 1 | :0  -------------------------------------------------------------------------------- /tests/table/bitmaps/64mapempty.bin: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/table/bitmaps/64maphighvals.bin: -------------------------------------------------------------------------------- 1 | ����:0�� 2 | 
��������������������������:0�� 3 | ��������������������������:0�� 4 | ��������������������������:0�� 5 | ��������������������������:0�� 6 | ��������������������������:0�� 7 | ��������������������������:0�� 8 | ��������������������������:0�� 9 | ��������������������������:0�� 10 | ��������������������������:0�� 11 | ��������������������������:0�� 12 | ���������������������� -------------------------------------------------------------------------------- /tests/table/bitmaps/64mapspreadvals.bin: -------------------------------------------------------------------------------- 1 | 2 | :0  :0  :0  :0  :0  :0  :0  :0  :0  :0  -------------------------------------------------------------------------------- /tests/table/test_puffin.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 
from os import path
from typing import List

import pytest
from pyroaring import BitMap

from pyiceberg.table.puffin import _deserialize_bitmap


def _open_file(file: str) -> bytes:
    """Read a serialized-bitmap fixture from tests/table/bitmaps next to this module."""
    cur_dir = path.dirname(path.realpath(__file__))
    with open(f"{cur_dir}/bitmaps/{file}", "rb") as f:
        return f.read()


def test_map_empty() -> None:
    """An empty serialized 64-bit bitmap deserializes to no BitMaps at all."""
    expected: List[BitMap] = []
    assert _deserialize_bitmap(_open_file("64mapempty.bin")) == expected


def test_map_bitvals() -> None:
    """A single bucket of values 0..9 deserializes to one matching BitMap."""
    assert _deserialize_bitmap(_open_file("64map32bitvals.bin")) == [BitMap(range(10))]


def test_map_spread_vals() -> None:
    """Ten buckets of values 0..9 each deserialize to ten identical BitMaps."""
    expected = [BitMap(range(10)) for _ in range(10)]
    assert _deserialize_bitmap(_open_file("64mapspreadvals.bin")) == expected


def test_map_high_vals() -> None:
    """Keys above the signed 32-bit range are rejected for Java compatibility."""
    with pytest.raises(ValueError, match="Key 4022190063 is too large, max 2147483647 to maintain compatibility with Java impl"):
        _ = _deserialize_bitmap(_open_file("64maphighvals.bin"))

# -- tests/table/test_refs.py --
# Licensed to the Apache Software Foundation (ASF) under one
# or more
# contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint:disable=eval-used
import pytest
from pydantic import ValidationError

from pyiceberg import exceptions
from pyiceberg.table.refs import SnapshotRef, SnapshotRefType

_SNAPSHOT_ID = 3051729675574597004


def test_snapshot_with_properties_repr() -> None:
    """repr() of a SnapshotRef is eval()-able back into an equal object."""
    snapshot_ref = SnapshotRef(
        snapshot_id=_SNAPSHOT_ID,
        snapshot_ref_type=SnapshotRefType.TAG,
        min_snapshots_to_keep=None,
        max_snapshot_age_ms=None,
        max_ref_age_ms=10000000,
    )

    expected_repr = (
        "SnapshotRef(snapshot_id=3051729675574597004, snapshot_ref_type=SnapshotRefType.TAG, "
        "min_snapshots_to_keep=None, max_snapshot_age_ms=None, max_ref_age_ms=10000000)"
    )
    assert repr(snapshot_ref) == expected_repr
    assert snapshot_ref == eval(repr(snapshot_ref))


def test_snapshot_with_invalid_field() -> None:
    """Negative retention settings and tag-incompatible fields are rejected at validation time."""

    def build_tag_ref(min_keep, max_snapshot_age, max_ref_age) -> SnapshotRef:
        return SnapshotRef(
            snapshot_id=_SNAPSHOT_ID,
            snapshot_ref_type=SnapshotRefType.TAG,
            min_snapshots_to_keep=min_keep,
            max_snapshot_age_ms=max_snapshot_age,
            max_ref_age_ms=max_ref_age,
        )

    # min_snapshots_to_keep, if present, must be greater than 0
    with pytest.raises(ValidationError):
        build_tag_ref(-1, None, 10000000)

    # max_snapshot_age_ms, if present, must be greater than 0
    with pytest.raises(ValidationError):
        build_tag_ref(1, -1, 10000000)

    # max_ref_age_ms, if present, must be greater than 0
    with pytest.raises(ValidationError):
        build_tag_ref(None, None, -1)

    # Branch-only retention settings are forbidden on tag refs.
    with pytest.raises(exceptions.ValidationError, match="Tags do not support setting minSnapshotsToKeep"):
        build_tag_ref(1, None, 10000000)

    with pytest.raises(exceptions.ValidationError, match="Tags do not support setting maxSnapshotAgeMs"):
        build_tag_ref(None, 1, 100000)

# -- tests/test_serializers.py --
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import json
import os
import uuid
from typing import Any, Dict

import pytest
from pytest_mock import MockFixture

from pyiceberg.serializers import ToOutputFile
from pyiceberg.table import StaticTable
from pyiceberg.table.metadata import TableMetadataV1


def test_legacy_current_snapshot_id(
    mocker: MockFixture, tmp_path_factory: pytest.TempPathFactory, example_table_metadata_no_snapshot_v1: Dict[str, Any]
) -> None:
    """With PYICEBERG_LEGACY_CURRENT_SNAPSHOT_ID set, a missing snapshot id serializes as -1 but reads back as None."""
    from pyiceberg.io.pyarrow import PyArrowFileIO

    metadata_location = str(tmp_path_factory.mktemp("metadata") / f"{uuid.uuid4()}.metadata.json")
    metadata = TableMetadataV1(**example_table_metadata_no_snapshot_v1)

    # Default behaviour: no snapshot id round-trips as None.
    ToOutputFile.table_metadata(metadata, PyArrowFileIO().new_output(location=metadata_location), overwrite=True)
    static_table = StaticTable.from_metadata(metadata_location)
    assert static_table.metadata.current_snapshot_id is None

    # Opt in to the legacy representation and re-serialize.
    mocker.patch.dict(os.environ, values={"PYICEBERG_LEGACY_CURRENT_SNAPSHOT_ID": "True"})
    ToOutputFile.table_metadata(metadata, PyArrowFileIO().new_output(location=metadata_location), overwrite=True)

    with PyArrowFileIO().new_input(location=metadata_location).open() as input_stream:
        metadata_json_bytes = input_stream.read()
    # On disk the legacy sentinel -1 is written...
    assert json.loads(metadata_json_bytes)["current-snapshot-id"] == -1

    # ...but a reader still normalizes it back to None and to equal metadata.
    backwards_compatible_static_table = StaticTable.from_metadata(metadata_location)
    assert backwards_compatible_static_table.metadata.current_snapshot_id is None
    assert backwards_compatible_static_table.metadata == static_table.metadata

# -- tests/test_typedef.py --
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
import pytest

from pyiceberg.typedef import FrozenDict, KeyDefaultDict, Record


def test_setitem_frozendict() -> None:
    """Item assignment on a FrozenDict must raise AttributeError."""
    frozen = FrozenDict(foo=1, bar=2)
    with pytest.raises(AttributeError):
        frozen["foo"] = 3


def test_update_frozendict() -> None:
    """Bulk update on a FrozenDict must raise AttributeError."""
    frozen = FrozenDict(foo=1, bar=2)
    with pytest.raises(AttributeError):
        frozen.update({"yes": 2})


def test_keydefaultdict() -> None:
    """A missing key is materialized by calling the factory with that key."""

    def constant_one(_: int) -> int:
        return 1

    lookup = KeyDefaultDict(constant_one)
    assert lookup[22] == 1


def test_record_named_args() -> None:
    """Records expose positional access and a readable repr."""
    record = Record(1, "a", True)

    assert record[0] == 1
    assert record[1] == "a"
    assert record[2] is True
    assert repr(record) == "Record[1, a, True]"

# ---- tests/test_version.py (Apache License, Version 2.0 header) ----
from pyiceberg import __version__


def test_version_format() -> None:
    """Assert the codebase ``__version__`` matches the installed distribution.

    Guards against running the test suite against a stale installed build of
    pyiceberg after a version bump in the source tree.
    """
    from importlib import metadata

    installed_version = metadata.version("pyiceberg")

    # FIX: the original message concatenated "...({__version__})." directly
    # onto "This failure...", producing a run-on sentence with no space.
    assert __version__ == installed_version, (
        f"The installed version ({installed_version}) does not match the current codebase version ({__version__}). "
        "This failure could be due to a recent version bump in the Pyiceberg library. "
        "Please ensure you have the latest version installed by rerunning `make install` command."
    )

# ---- tests/utils/test_concurrent.py (Apache License, Version 2.0 header) ----
import os
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, Optional
from unittest import mock

import pytest

from pyiceberg.utils.concurrent import ExecutorFactory

# Environment fixtures for the PYICEBERG_MAX_WORKERS setting.
EMPTY_ENV: Dict[str, Optional[str]] = {}
VALID_ENV = {"PYICEBERG_MAX_WORKERS": "5"}
INVALID_ENV = {"PYICEBERG_MAX_WORKERS": "invalid"}


def test_create_reused() -> None:
    """The factory hands out one shared ThreadPoolExecutor instance."""
    first = ExecutorFactory.get_or_create()
    second = ExecutorFactory.get_or_create()

    assert isinstance(first, ThreadPoolExecutor)
    assert first is second


@mock.patch.dict(os.environ, EMPTY_ENV)
def test_max_workers_none() -> None:
    """With the variable unset, no worker limit is configured."""
    assert ExecutorFactory.max_workers() is None


@mock.patch.dict(os.environ, VALID_ENV)
def test_max_workers() -> None:
    """A numeric value is parsed into an integer worker limit."""
    assert ExecutorFactory.max_workers() == 5


@mock.patch.dict(os.environ, INVALID_ENV)
def test_max_workers_invalid() -> None:
    """A non-numeric value raises instead of being silently ignored."""
    with pytest.raises(ValueError):
        ExecutorFactory.max_workers()

# ---- tests/utils/test_decimal.py (Apache License, Version 2.0 header) ----
# (continuation of the Apache License, Version 2.0 header)
from decimal import Decimal

import pytest

from pyiceberg.utils.decimal import decimal_required_bytes, decimal_to_bytes


def test_decimal_required_bytes() -> None:
    """Map decimal precision to the minimal byte width that can hold it."""
    expected_widths = {1: 1, 2: 1, 3: 2, 4: 2, 5: 3, 7: 4, 8: 4, 10: 5, 32: 14, 38: 16}
    for precision, width in expected_widths.items():
        assert decimal_required_bytes(precision=precision) == width

    # Precision must lie in (0, 40]; both overflow and negatives are rejected.
    for bad_precision in (40, -1):
        with pytest.raises(ValueError) as exc_info:
            decimal_required_bytes(precision=bad_precision)
        assert "(0, 40]" in str(exc_info.value)


def test_decimal_to_bytes() -> None:
    """Check the boundary between 2 and 3 bytes.

    Two bytes cover -32,768 through 32,767 (inclusive); one past the maximum
    must spill into a third byte.
    """
    assert decimal_to_bytes(Decimal("32767.")) == b"\x7f\xff"
    assert decimal_to_bytes(Decimal("32768.")) == b"\x00\x80\x00"

# ---- tests/utils/test_deprecated.py (Apache License, Version 2.0 header) ----
from unittest.mock import Mock, patch

from pyiceberg.utils.deprecated import deprecated


@patch("warnings.warn")
def test_deprecated(warn: Mock) -> None:
    """Calling a @deprecated function emits the full deprecation warning."""

    @deprecated(
        deprecated_in="0.1.0",
        removed_in="0.2.0",
        help_message="Please use load_something_else() instead",
    )
    def deprecated_method() -> None:
        pass

    deprecated_method()

    assert warn.called
    assert warn.call_args[0] == (
        "Call to deprecated_method, deprecated in 0.1.0, will be removed in 0.2.0. Please use load_something_else() instead.",
    )


@patch("warnings.warn")
def test_deprecation_message(warn: Mock) -> None:
    """deprecation_message warns without wrapping a specific callable."""
    from pyiceberg.utils.deprecated import deprecation_message

    deprecation_message(
        deprecated_in="0.1.0",
        removed_in="0.2.0",
        help_message="Please use something_else instead",
    )

    assert warn.called
    assert warn.call_args[0] == ("Deprecated in 0.1.0, will be removed in 0.2.0. Please use something_else instead",)

# ---- tests/utils/test_lazydict.py (Apache License, Version 2.0 header) ----
from pyiceberg.utils.lazydict import LazyDict


def test_lazy_dict_ints() -> None:
    """Integer key/value pairs supplied as flat chunks resolve correctly."""
    lookup = LazyDict[int, int]([[1, 2], [3, 4]])

    assert lookup[1] == 2
    assert lookup[3] == 4


def test_lazy_dict_strings() -> None:
    """Chunks of mixed length are flattened into one key->value mapping."""
    lookup = LazyDict[int, str]([[1, "red", 5, "banana"], [3, "blue"]])

    assert lookup[1] == "red"
    assert lookup[3] == "blue"
    assert lookup[5] == "banana"

# ---- tests/utils/test_properties.py (Apache License, Version 2.0 header) ----
import pytest

from pyiceberg.utils.properties import (
    get_first_property_value,
    property_as_bool,
    property_as_float,
    property_as_int,
)


def test_property_as_int() -> None:
    """property_as_int parses values and honours default/missing semantics."""
    properties = {
        "int": "42",
    }

    assert property_as_int(properties, "int") == 42
    assert property_as_int(properties, "missing", default=1) == 1
    assert property_as_int(properties, "missing") is None


def test_property_as_int_with_invalid_value() -> None:
    """A non-numeric value raises a descriptive ValueError."""
    properties = {
        "some_int_prop": "invalid",
    }

    with pytest.raises(ValueError) as exc:
        property_as_int(properties, "some_int_prop")

    assert "Could not parse table property some_int_prop to an integer: invalid" in str(exc.value)


def test_property_as_float() -> None:
    """property_as_float parses values and honours default/missing semantics."""
    properties = {
        "float": "42.0",
    }

    assert property_as_float(properties, "float", default=1.0) == 42.0
    assert property_as_float(properties, "missing", default=1.0) == 1.0
    assert property_as_float(properties, "missing") is None


def test_property_as_float_with_invalid_value() -> None:
    """A non-numeric value raises a descriptive ValueError."""
    properties = {
        "some_float_prop": "invalid",
    }

    with pytest.raises(ValueError) as exc:
        property_as_float(properties, "some_float_prop")

    assert "Could not parse table property some_float_prop to a float: invalid" in str(exc.value)


def test_property_as_bool() -> None:
    """property_as_bool parses values and falls back to the given default."""
    properties = {
        "bool": "True",
    }

    assert property_as_bool(properties, "bool", default=False) is True
    assert property_as_bool(properties, "missing", default=False) is False
    # FIX: this assertion previously called property_as_float (a copy-paste
    # error from test_property_as_float) and never exercised the True-default
    # path of property_as_bool.
    assert property_as_bool(properties, "missing", default=True) is True


def test_property_as_bool_with_invalid_value() -> None:
    """A non-boolean value raises a descriptive ValueError."""
    properties = {
        "some_bool_prop": "invalid",
    }

    with pytest.raises(ValueError) as exc:
        property_as_bool(properties, "some_bool_prop", True)

    assert "Could not parse table property some_bool_prop to a boolean: invalid" in str(exc.value)


def test_get_first_property_value() -> None:
    """The first name that exists in the mapping wins, in argument order."""
    properties = {
        "prop_1": "value_1",
        "prop_2": "value_2",
    }

    assert get_first_property_value(properties, "prop_2", "prop_1") == "value_2"
    assert get_first_property_value(properties, "missing", "prop_1") == "value_1"

# ---- tests/utils/test_singleton.py (Apache License, Version 2.0 header) ----
from pyiceberg.avro.reader import BooleanReader, FixedReader
from pyiceberg.transforms import VoidTransform


def test_singleton() -> None:
    """We want to reuse the readers to avoid creating a gazillion of them."""
    assert BooleanReader() is BooleanReader()
    assert FixedReader(22) is FixedReader(22)
    # Different fixed lengths must yield distinct reader instances.
    assert FixedReader(19) is not FixedReader(25)


def test_singleton_transform() -> None:
    """We want to reuse VoidTransform since it doesn't carry any state."""
    assert VoidTransform() is VoidTransform()

# ---- tests/utils/test_truncate.py (Apache License, Version 2.0 header) ----
from pyiceberg.utils.truncate import truncate_upper_bound_binary_string, truncate_upper_bound_text_string


def test_upper_bound_string_truncation() -> None:
    """Truncating a text upper bound rounds the last kept character up."""
    assert truncate_upper_bound_text_string("aaaa", 2) == "ab"
    # No larger string exists when every kept code point is already U+10FFFF.
    assert truncate_upper_bound_text_string(chr(0x10FFFF) * 2 + chr(0x0), 2) is None


def test_upper_bound_binary_truncation() -> None:
    """Truncating a binary upper bound increments the last kept byte."""
    assert truncate_upper_bound_binary_string(b"\x01\x02\x03", 2) == b"\x01\x03"
    # No larger prefix exists when every kept byte is already 0xFF.
    assert truncate_upper_bound_binary_string(b"\xff\xff\x00", 2) is None

# ---- vendor/README.md ----
# # Vendor packages
#
# Some packages we want to maintain in the repository itself, because there is
# no good 3rd party alternative.
#
# ## FB303 Thrift client
#
# fb303 is a base Thrift service and a common set of functionality for querying
# stats, options, and other information from a service.
```bash
rm -f /tmp/fb303.thrift
rm -rf fb303
curl -s https://raw.githubusercontent.com/apache/thrift/master/contrib/fb303/if/fb303.thrift > /tmp/fb303.thrift
rm -rf /tmp/gen-py/
thrift -gen py -o /tmp/ /tmp/fb303.thrift
mv /tmp/gen-py/fb303 fb303
```

## Hive Metastore Thrift definition

The thrift definition requires the fb303 service as a dependency

```bash
rm -rf /tmp/hive
mkdir -p /tmp/hive/share/fb303/if/
curl -s https://raw.githubusercontent.com/apache/thrift/master/contrib/fb303/if/fb303.thrift > /tmp/hive/share/fb303/if/fb303.thrift
curl -s https://raw.githubusercontent.com/apache/hive/master/standalone-metastore/metastore-common/src/main/thrift/hive_metastore.thrift > /tmp/hive/hive_metastore.thrift
thrift -gen py -o /tmp/hive /tmp/hive/hive_metastore.thrift
mv /tmp/hive/gen-py/hive_metastore hive_metastore
```

# ---- vendor/fb303/__init__.py ----
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Public surface of the vendored fb303 Thrift client package.
__all__ = ["ttypes", "constants", "FacebookService"]

# ---- vendor/fb303/constants.py ----
# (Apache License, Version 2.0 header)
#
# Autogenerated by Thrift Compiler (0.16.0)
#
# DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
#
# options string: py
#
# NOTE: fb303 declares no Thrift constants, so this generated module is empty.

# ---- vendor/fb303/ttypes.py (Apache License, Version 2.0 header) ----
# (continuation of the Apache License, Version 2.0 header)
#
# Autogenerated by Thrift Compiler (0.16.0)
#
# DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
#
# options string: py
#


from thrift.TRecursive import fix_spec

# Registry of generated struct classes; presumably fix_spec resolves
# recursive type references once all structs exist — standard Thrift
# codegen boilerplate (empty here, since fb303 defines no structs).
all_structs = []


class fb_status:
    """
    Common status reporting mechanism across all services

    """

    # Thrift enum members: plain integer class attributes.
    DEAD = 0
    STARTING = 1
    ALIVE = 2
    STOPPING = 3
    STOPPED = 4
    WARNING = 5

    # Bidirectional lookup tables generated by the Thrift compiler.
    _VALUES_TO_NAMES = {
        0: "DEAD",
        1: "STARTING",
        2: "ALIVE",
        3: "STOPPING",
        4: "STOPPED",
        5: "WARNING",
    }

    _NAMES_TO_VALUES = {
        "DEAD": 0,
        "STARTING": 1,
        "ALIVE": 2,
        "STOPPING": 3,
        "STOPPED": 4,
        "WARNING": 5,
    }


fix_spec(all_structs)
del all_structs

# ---- vendor/hive_metastore/__init__.py (Apache License, Version 2.0 header) ----
# (continuation of the Apache License, Version 2.0 header)
# Public surface of the vendored Hive Metastore Thrift client package.
__all__ = ["ttypes", "constants", "ThriftHiveMetastore"]

# ---- vendor/hive_metastore/constants.py (Apache License, Version 2.0 header) ----
#
# Autogenerated by Thrift Compiler (0.16.0)
#
# DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
#
# options string: py
#


# Thrift constants generated from hive_metastore.thrift.

# Last-DDL-time marker stored in table parameters.
DDL_TIME = "transient_lastDdlTime"

# Access-type flags (power-of-two values, combinable as a bitmask).
ACCESSTYPE_NONE = 1
ACCESSTYPE_READONLY = 2
ACCESSTYPE_WRITEONLY = 4
ACCESSTYPE_READWRITE = 8

# Field names usable in metastore listing filters.
HIVE_FILTER_FIELD_OWNER = "hive_filter_field_owner__"
HIVE_FILTER_FIELD_PARAMS = "hive_filter_field_params__"
HIVE_FILTER_FIELD_LAST_ACCESS = "hive_filter_field_last_access__"

# Table/partition parameter keys.
IS_ARCHIVED = "is_archived"
ORIGINAL_LOCATION = "original_location"
IS_IMMUTABLE = "immutable"

# Metadata/schema property keys.
META_TABLE_COLUMNS = "columns"
META_TABLE_COLUMN_TYPES = "columns.types"
BUCKET_FIELD_NAME = "bucket_field_name"
BUCKET_COUNT = "bucket_count"
FIELD_TO_DIMENSION = "field_to_dimension"
META_TABLE_NAME = "name"
META_TABLE_DB = "db"
META_TABLE_LOCATION = "location"
META_TABLE_SERDE = "serde"
META_TABLE_PARTITION_COLUMNS = "partition_columns"
META_TABLE_PARTITION_COLUMN_TYPES = "partition_columns.types"
FILE_INPUT_FORMAT = "file.inputformat"
FILE_OUTPUT_FORMAT = "file.outputformat"
META_TABLE_STORAGE = "storage_handler"

# Transactional-table property keys.
TABLE_IS_TRANSACTIONAL = "transactional"
TABLE_NO_AUTO_COMPACT = "no_auto_compaction"
TABLE_TRANSACTIONAL_PROPERTIES = "transactional_properties"
TABLE_BUCKETING_VERSION = "bucketing_version"

# Storage-handler configuration prefixes.
DRUID_CONFIG_PREFIX = "druid."
JDBC_CONFIG_PREFIX = "hive.sql."

# CTAS/CTLT and miscellaneous keys.
TABLE_IS_CTAS = "created_with_ctas"
TABLE_IS_CTLT = "created_with_ctlt"
PARTITION_TRANSFORM_SPEC = "partition_transform_spec"
NO_CLEANUP = "no_cleanup"
CTAS_LEGACY_CONFIG = "create_table_as_external"
DEFAULT_TABLE_TYPE = "defaultTableType"
TXN_ID = "txnId"
WRITE_ID = "writeId"