├── .gitattributes ├── .github └── workflows │ ├── build-and-test.yml │ └── build-kernel-wheels.yml ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE.txt ├── PROTOCOL.md ├── README.md ├── RELEASE_NOTES.md ├── build.sbt ├── build ├── sbt ├── sbt-config │ └── repositories └── sbt-launch-lib.bash ├── client └── src │ ├── main │ ├── resources │ │ └── META-INF │ │ │ └── services │ │ │ └── org.apache.hadoop.fs.FileSystem │ └── scala │ │ ├── io │ │ └── delta │ │ │ └── sharing │ │ │ ├── client │ │ │ ├── DeltaSharingClient.scala │ │ │ ├── DeltaSharingFileSystem.scala │ │ │ ├── DeltaSharingProfileProvider.scala │ │ │ ├── InMemoryHttpInputStream.scala │ │ │ ├── RandomAccessHttpInputStream.scala │ │ │ ├── auth │ │ │ │ ├── AuthCredentialProvider.scala │ │ │ │ ├── AuthCredentialProviderFactory.scala │ │ │ │ ├── BearerTokenAuthProvider.scala │ │ │ │ ├── OAuthClient.scala │ │ │ │ └── OAuthClientCredentialsAuthProvider.scala │ │ │ ├── model.scala │ │ │ └── util │ │ │ │ ├── ConfUtils.scala │ │ │ │ ├── JsonUtils.scala │ │ │ │ └── RetryUtils.scala │ │ │ ├── filters │ │ │ ├── JsonPredicates.scala │ │ │ └── OpConverter.scala │ │ │ └── spark │ │ │ ├── DeltaSharingErrors.scala │ │ │ ├── DeltaSharingOptions.scala │ │ │ ├── DeltaSharingSource.scala │ │ │ ├── DeltaSharingSourceOffset.scala │ │ │ ├── RemoteDeltaCDFRelation.scala │ │ │ ├── RemoteDeltaFileIndex.scala │ │ │ ├── RemoteDeltaLog.scala │ │ │ ├── perf │ │ │ └── DeltaSharingLimitPushDown.scala │ │ │ └── util │ │ │ ├── QueryUtils.scala │ │ │ └── SchemaUtils.scala │ │ └── org │ │ └── apache │ │ └── spark │ │ ├── delta │ │ └── sharing │ │ │ └── PreSignedUrlCache.scala │ │ └── sql │ │ └── DeltaSharingScanUtils.scala │ └── test │ └── scala │ ├── io │ └── delta │ │ └── sharing │ │ ├── client │ │ ├── DeltaSharingFileProfileProviderSuite.scala │ │ ├── DeltaSharingFileSystemSuite.scala │ │ ├── DeltaSharingIntegrationTest.scala │ │ ├── DeltaSharingRestClientDeltaSuite.scala │ │ ├── DeltaSharingRestClientSuite.scala │ │ ├── RandomAccessHttpInputStreamSuite.scala │ │ ├── auth │ │ │ ├── BearerTokenAuthProviderSuite.scala │ │ │ ├── OAuthClientCredentialsAuthProviderSuite.scala │ │ │ └── OAuthClientSuite.scala │ │ └── util │ │ │ ├── ConfUtilsSuite.scala │ │ │ ├── ProxyServer.scala │ │ │ └── RetryUtilsSuite.scala │ │ ├── filters │ │ ├── JsonPredicateSuite.scala │ │ └── OpConverterSuite.scala │ │ └── spark │ │ ├── DeltaSharingOptionsSuite.scala │ │ └── util │ │ └── SchemaUtilsSuite.scala │ └── org │ └── apache │ └── spark │ └── delta │ └── sharing │ └── CachedTableManagerSuite.scala ├── delta-sharing-protocl-api-description.yml ├── dev ├── python_release.sh ├── release.sh └── spark_release.sh ├── examples ├── README.md ├── open-datasets.share └── python │ ├── quickstart_pandas.py │ └── quickstart_spark.py ├── images ├── delta-sharing.png └── the-community.png ├── project ├── build.properties └── plugins.sbt ├── python ├── NOTICE.txt ├── README.md ├── delta-kernel-rust-sharing-wrapper │ ├── .gitignore │ ├── Cargo.toml │ ├── README.md │ ├── pyproject.toml │ └── src │ │ └── lib.rs ├── delta_sharing │ ├── __init__.py │ ├── _internal_auth.py │ ├── _yarl_patch.py │ ├── converter.py │ ├── delta_sharing.py │ ├── fake_checkpoint.py │ ├── protocol.py │ ├── reader.py │ ├── rest_client.py │ ├── tests │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── test_auth.py │ │ ├── test_converter.py │ │ ├── test_delta_sharing.py │ │ ├── test_oauth_client.py │ │ ├── test_profile.json │ │ ├── test_profile_basic.json │ │ ├── test_profile_bearer.json │ │ ├── test_profile_oauth2.json │ │ ├── test_protocol.py 
│ │ ├── test_reader.py │ │ └── test_rest_client.py │ └── version.py ├── dev │ ├── .gitignore │ ├── lint-python │ ├── pytest │ ├── reformat │ └── tox.ini ├── requirements-dev.txt └── setup.py ├── scalastyle-config.xml ├── server └── src │ ├── main │ ├── protobuf │ │ └── protocol.proto │ └── scala │ │ └── io │ │ └── delta │ │ ├── sharing │ │ ├── kernel │ │ │ └── internal │ │ │ │ ├── DeltaSharedTableKernel.scala │ │ │ │ ├── JsonPredicatePruner.scala │ │ │ │ └── PredicateConverter.scala │ │ └── server │ │ │ ├── DeltaSharedTableLoader.scala │ │ │ ├── DeltaSharedTableProtocol.scala │ │ │ ├── DeltaSharingService.scala │ │ │ ├── SharedTableManager.scala │ │ │ ├── common │ │ │ ├── CloudFileSigner.scala │ │ │ ├── JsonPredicates.scala │ │ │ ├── JsonUtils.scala │ │ │ ├── SnapshotChecker.scala │ │ │ ├── TimestampUtils.scala │ │ │ └── actions │ │ │ │ ├── Codec.scala │ │ │ │ ├── DeletionVectorDescriptor.scala │ │ │ │ └── DeltaAction.scala │ │ │ ├── config │ │ │ └── ServerConfig.scala │ │ │ ├── exceptions.scala │ │ │ └── model.scala │ │ └── standalone │ │ └── internal │ │ ├── ColumnRange.scala │ │ ├── DeltaCDFErrors.scala │ │ ├── DeltaDataSource.scala │ │ ├── DeltaSharedTable.scala │ │ ├── DeltaSharingCDCReader.scala │ │ ├── DeltaSharingHistoryManager.scala │ │ ├── JsonPredicateEvaluatorV2.scala │ │ ├── JsonPredicateFilterUtils.scala │ │ ├── PartitionFilterUtils.scala │ │ └── model.scala │ ├── test │ ├── resources │ │ └── core-site.xml │ └── scala │ │ └── io │ │ └── delta │ │ ├── sharing │ │ └── server │ │ │ ├── CloudFileSignerSuite.scala │ │ │ ├── DeltaSharingServiceSuite.scala │ │ │ ├── SharedTableManagerSuite.scala │ │ │ ├── TestDeltaSharingServer.scala │ │ │ ├── TestResource.scala │ │ │ └── config │ │ │ └── ServerConfigSuite.scala │ │ └── standalone │ │ └── internal │ │ ├── ColumnRangeSuite.scala │ │ ├── JsonPredicateEvaluatorV2Suite.scala │ │ ├── JsonPredicateFilterUtilsSuite.scala │ │ ├── JsonPredicateSuite.scala │ │ ├── PartitionFilterUtilsSuite.scala │ │ └── TimestampUtilsSuite.scala │ └── universal │ └── conf │ └── delta-sharing-server.yaml.template ├── spark └── src │ ├── main │ ├── resources │ │ └── META-INF │ │ │ └── services │ │ │ ├── org.apache.hadoop.fs.FileSystem │ │ │ └── org.apache.spark.sql.sources.DataSourceRegister │ └── scala │ │ └── io │ │ └── delta │ │ └── sharing │ │ └── spark │ │ └── DeltaSharingDataSource.scala │ └── test │ ├── resources │ └── log4j.properties │ └── scala │ └── io │ └── delta │ └── sharing │ └── spark │ ├── DeltaSharingIntegrationTest.scala │ ├── DeltaSharingSourceCDFLimitSuite.scala │ ├── DeltaSharingSourceCDFSuite.scala │ ├── DeltaSharingSourceLimitSuite.scala │ ├── DeltaSharingSourceOffsetSuite.scala │ ├── DeltaSharingSourceParamsSuite.scala │ ├── DeltaSharingSourceSuite.scala │ ├── DeltaSharingSuite.scala │ ├── RemoteDeltaLogSuite.scala │ ├── TestDeltaSharingClient.scala │ ├── TestStorageProxyServer.scala │ └── TestUtils.scala └── version.sbt /.gitattributes: -------------------------------------------------------------------------------- 1 | *.bat text eol=crlf 2 | *.cmd text eol=crlf 3 | *.bin binary 4 | -------------------------------------------------------------------------------- /.github/workflows/build-and-test.yml: -------------------------------------------------------------------------------- 1 | name: Build and Test 2 | on: [push, pull_request, workflow_dispatch] 3 | jobs: 4 | build-and-test-server: 5 | runs-on: ubuntu-24.04 6 | env: 7 | SPARK_LOCAL_IP: localhost 8 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 9 | AWS_SECRET_ACCESS_KEY: 
${{ secrets.AWS_SECRET_ACCESS_KEY }} 10 | AZURE_TEST_ACCOUNT_KEY: ${{ secrets.AZURE_TEST_ACCOUNT_KEY }} 11 | GOOGLE_APPLICATION_CREDENTIALS: /tmp/google_service_account_key.json 12 | GOOGLE_SERVICE_ACCOUNT_KEY: ${{ secrets.GOOGLE_SERVICE_ACCOUNT_KEY }} 13 | steps: 14 | - name: Checkout repository 15 | uses: actions/checkout@v2 16 | - name: Cache Scala, SBT 17 | uses: actions/cache@v4 18 | with: 19 | path: | 20 | ~/.sbt 21 | ~/.ivy2 22 | ~/.cache/coursier 23 | key: build-and-test-server 24 | - name: Install Java 8 25 | uses: actions/setup-java@v3 26 | with: 27 | distribution: 'zulu' 28 | java-version: '8' 29 | - run: ./build/sbt server/test 30 | 31 | build-and-test-client: 32 | runs-on: ubuntu-24.04 33 | env: 34 | SPARK_LOCAL_IP: localhost 35 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 36 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 37 | AZURE_TEST_ACCOUNT_KEY: ${{ secrets.AZURE_TEST_ACCOUNT_KEY }} 38 | GOOGLE_APPLICATION_CREDENTIALS: /tmp/google_service_account_key.json 39 | GOOGLE_SERVICE_ACCOUNT_KEY: ${{ secrets.GOOGLE_SERVICE_ACCOUNT_KEY }} 40 | steps: 41 | - name: Checkout repository 42 | uses: actions/checkout@v2 43 | - name: Cache Scala, SBT 44 | uses: actions/cache@v4 45 | with: 46 | path: | 47 | ~/.sbt 48 | ~/.ivy2 49 | ~/.cache/coursier 50 | key: build-and-test-client 51 | - name: Install Java 17 52 | uses: actions/setup-java@v3 53 | with: 54 | distribution: 'zulu' 55 | java-version: '17' 56 | - run: ./build/sbt client/test spark/test 57 | 58 | python: 59 | runs-on: ubuntu-24.04 60 | strategy: 61 | fail-fast: false 62 | matrix: 63 | python-version: ["3.8", "3.9", "3.10", "3.11"] 64 | include: 65 | - pandas-version: 2.0.3 66 | pyarrow-version: 16.1.0 67 | env: 68 | PYTHON_VERSION: ${{ matrix.python-version }} 69 | PANDAS_VERSION: ${{ matrix.pandas-version }} 70 | PYARROW_VERSION: ${{ matrix.pyarrow-version }} 71 | SPARK_LOCAL_IP: localhost 72 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 73 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 74 | AZURE_TEST_ACCOUNT_KEY: ${{ secrets.AZURE_TEST_ACCOUNT_KEY }} 75 | GOOGLE_APPLICATION_CREDENTIALS: /tmp/google_service_account_key.json 76 | GOOGLE_SERVICE_ACCOUNT_KEY: ${{ secrets.GOOGLE_SERVICE_ACCOUNT_KEY }} 77 | # Github Actions' default miniconda 78 | CONDA_PREFIX: /usr/share/miniconda 79 | steps: 80 | - name: Checkout repository 81 | uses: actions/checkout@v2 82 | - name: Cache Scala, SBT 83 | uses: actions/cache@v4 84 | with: 85 | path: | 86 | ~/.sbt 87 | ~/.ivy2 88 | ~/.cache/coursier 89 | key: build-and-test-python 90 | - name: Install Java 8 91 | uses: actions/setup-java@v1 92 | with: 93 | java-version: '8' 94 | - name: Install dependencies 95 | run: | 96 | # See also https://github.com/conda/conda/issues/7980 97 | source "$CONDA_PREFIX/etc/profile.d/conda.sh" 98 | conda update -q conda 99 | conda create -c conda-forge -q -n test-environment python=$PYTHON_VERSION 100 | conda activate test-environment 101 | conda config --env --add pinned_packages python=$PYTHON_VERSION 102 | conda config --env --add pinned_packages pandas==$PANDAS_VERSION 103 | conda config --env --add pinned_packages pyarrow==$PYARROW_VERSION 104 | conda install -c conda-forge --yes pandas==$PANDAS_VERSION pyarrow==$PYARROW_VERSION 105 | sed -i -e "/pandas/d" -e "/pyarrow/d" python/requirements-dev.txt 106 | conda install -c conda-forge --yes --file python/requirements-dev.txt 107 | conda list 108 | cd python 109 | cd delta-kernel-rust-sharing-wrapper 110 | maturin develop 111 | - name: Build Server 112 | run: 
./build/sbt package 113 | - name: Run tests 114 | run: | 115 | # See also https://github.com/conda/conda/issues/7980 116 | source "$CONDA_PREFIX/etc/profile.d/conda.sh" 117 | conda activate test-environment 118 | ./python/dev/lint-python 119 | ./python/dev/pytest 120 | -------------------------------------------------------------------------------- /.github/workflows/build-kernel-wheels.yml: -------------------------------------------------------------------------------- 1 | name: Build delta-kernel-rust-sharing-wrapper wheels for 4 OS and 2 architectures 2 | 3 | on: 4 | push: 5 | paths: 6 | - python/delta-kernel-rust-sharing-wrapper/** 7 | - .github/workflows/** 8 | pull_request: 9 | paths: 10 | - python/delta-kernel-rust-sharing-wrapper/** 11 | - .github/workflows/** 12 | 13 | jobs: 14 | build: 15 | runs-on: ${{ matrix.os }} 16 | strategy: 17 | matrix: 18 | os: [ubuntu-latest, ubuntu-24.04, ubuntu-22.04, macos-latest, windows-latest] 19 | python-version: [3.8] 20 | arch: [x86_64, arm64] 21 | exclude: 22 | - os: ubuntu-latest 23 | arch: arm64 24 | - os: ubuntu-24.04 25 | arch: arm64 26 | - os: ubuntu-22.04 27 | arch: arm64 28 | - os: windows-latest 29 | arch: arm64 30 | 31 | steps: 32 | - uses: actions/checkout@v2 33 | 34 | - name: Set up Python 35 | uses: actions/setup-python@v2 36 | with: 37 | python-version: ${{ matrix.python-version }} 38 | 39 | - name: Install maturin 40 | run: pip install maturin 41 | 42 | - name: Build wheel (x86_64 macOS) 43 | if: matrix.os == 'macos-latest' && matrix.arch == 'x86_64' 44 | run: | 45 | rustup target add x86_64-apple-darwin 46 | cd python/delta-kernel-rust-sharing-wrapper 47 | maturin build --release --target x86_64-apple-darwin 48 | shell: bash 49 | 50 | - name: Build wheel (ARM macOS) 51 | if: matrix.os == 'macos-latest' && matrix.arch == 'arm64' 52 | run: | 53 | rustup target add aarch64-apple-darwin 54 | cd python/delta-kernel-rust-sharing-wrapper 55 | maturin build --release --target aarch64-apple-darwin 56 | shell: bash 57 | 58 | - name: Build wheel (x86_64 Windows) 59 | if: runner.os == 'Windows' 60 | run: | 61 | cd python/delta-kernel-rust-sharing-wrapper 62 | maturin build --release 63 | shell: powershell 64 | 65 | - name: Build wheel (x86_64 Linux) 66 | if: matrix.os == 'ubuntu-latest' 67 | run: | 68 | cd python/delta-kernel-rust-sharing-wrapper 69 | maturin build --release 70 | shell: bash 71 | 72 | - name: Build wheel (x86_64 Linux Ubuntu 24.04) 73 | if: matrix.os == 'ubuntu-24.04' 74 | run: | 75 | cd python/delta-kernel-rust-sharing-wrapper 76 | maturin build --release 77 | shell: bash 78 | 79 | - name: Build wheel (x86_64 Linux Ubuntu 22.04) 80 | if: matrix.os == 'ubuntu-22.04' 81 | run: | 82 | cd python/delta-kernel-rust-sharing-wrapper 83 | maturin build --release 84 | shell: bash 85 | 86 | - name: Upload wheels 87 | uses: actions/upload-artifact@v4 88 | with: 89 | name: wheel-${{ matrix.os }}-${{ matrix.arch }} 90 | path: python/delta-kernel-rust-sharing-wrapper/target/wheels/*.whl 91 | 92 | merge: 93 | runs-on: ubuntu-latest 94 | needs: build 95 | steps: 96 | - name: Merge Artifacts 97 | uses: actions/upload-artifact/merge@v4 98 | with: 99 | name: all-wheels 100 | pattern: wheel-* 101 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *#*# 2 | *.#* 3 | *.iml 4 | *.ipr 5 | *.iws 6 | *.pyc 7 | *.pyo 8 | *.swp 9 | *~ 10 | .DS_Store 11 | .bsp 12 | .cache 13 | .classpath 14 | .ensime 15 | .ensime_cache/ 16 | 
.ensime_lucene 17 | .generated-mima* 18 | .idea/ 19 | .idea_modules/ 20 | .project 21 | .pydevproject 22 | .scala_dependencies 23 | .settings 24 | /lib/ 25 | R-unit-tests.log 26 | R/unit-tests.out 27 | R/cran-check.out 28 | R/pkg/vignettes/sparkr-vignettes.html 29 | R/pkg/tests/fulltests/Rplots.pdf 30 | build/*.jar 31 | build/apache-maven* 32 | build/scala* 33 | build/zinc* 34 | cache 35 | conf/*.cmd 36 | conf/*.conf 37 | conf/*.properties 38 | conf/*.sh 39 | conf/*.xml 40 | conf/java-opts 41 | conf/slaves 42 | dependency-reduced-pom.xml 43 | derby.log 44 | dev/create-release/*final 45 | dev/create-release/*txt 46 | dev/pr-deps/ 47 | dist/ 48 | docs/_site 49 | docs/api 50 | sql/docs 51 | sql/site 52 | lib_managed/ 53 | lint-r-report.log 54 | log/ 55 | logs/ 56 | out/ 57 | project/boot/ 58 | project/build/target/ 59 | project/plugins/lib_managed/ 60 | project/plugins/project/build.properties 61 | project/plugins/src_managed/ 62 | project/plugins/target/ 63 | python/lib/pyspark.zip 64 | python/deps 65 | docs/python/_static/ 66 | docs/python/_templates/ 67 | docs/python/_build/ 68 | python/build/ 69 | python/test_coverage/coverage_data 70 | python/test_coverage/htmlcov 71 | python/pyspark/python 72 | reports/ 73 | scalastyle-on-compile.generated.xml 74 | scalastyle-output.xml 75 | scalastyle.txt 76 | spark-*-bin-*.tgz 77 | spark-tests.log 78 | src_managed/ 79 | streaming-tests.log 80 | target/ 81 | unit-tests.log 82 | work/ 83 | docs/.jekyll-metadata 84 | 85 | # For Hive 86 | TempStatsStore/ 87 | metastore/ 88 | metastore_db/ 89 | sql/hive-thriftserver/test_warehouses 90 | warehouse/ 91 | spark-warehouse/ 92 | 93 | # For R session data 94 | .RData 95 | .RHistory 96 | .Rhistory 97 | *.Rproj 98 | *.Rproj.* 99 | 100 | .Rproj.user 101 | 102 | **/src/main/resources/js 103 | 104 | # For SBT 105 | .jvmopts 106 | 107 | # For Python 108 | *.egg-info 109 | 110 | # For VSCode 111 | *.vscode 112 | 113 | # For Metals 114 | *.metals 115 | 116 | # For venv 117 | *.venv 118 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | We happily welcome contributions to Delta Sharing. We use [GitHub Issues](/../../issues/) to track community reported issues and [GitHub Pull Requests ](/../../pulls/) for accepting changes. 2 | 3 | # Governance 4 | Delta Sharing governance is conducted by the Technical Steering Committee (TSC), which is currently composed of the following members: 5 | - Michael Armbrust (michael.armbrust@gmail.com) 6 | - Reynold Xin (reynoldx@gmail.com) 7 | - Matei Zaharia (matei@cs.stanford.edu) 8 | 9 | The founding technical charter can be found [here](https://delta.io/wp-content/uploads/2019/12/delta-charter.pdf). 10 | 11 | # Communication 12 | Before starting work on a major feature, please reach out to us via GitHub, Slack, email, etc. We will make sure no one else is already working on it and ask you to open a GitHub issue. 13 | A "major feature" is defined as any change that is > 100 LOC altered (not including tests), or changes any user-facing behavior. 14 | We will use the GitHub issue to discuss the feature and come to agreement. 15 | This is to prevent your time being wasted, as well as ours. 16 | The GitHub review process for major features is also important so that organizations with commit access can come to agreement on design. 
17 | If it is appropriate to write a design document, the document must be hosted either in the GitHub tracking issue, or linked to from the issue and hosted in a world-readable location. 18 | Specifically, if the goal is to add a new extension, please read the extension policy. 19 | Small patches and bug fixes don't need prior communication. 20 | 21 | # Coding style 22 | We generally follow the Apache Spark Scala Style Guide. 23 | 24 | # Sign your work 25 | The sign-off is a simple line at the end of the explanation for the patch. Your signature certifies that you wrote the patch or otherwise have the right to pass it on as an open-source patch. The rules are pretty simple: if you can certify the below (from developercertificate.org): 26 | 27 | ``` 28 | Developer Certificate of Origin 29 | Version 1.1 30 | 31 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 32 | 1 Letterman Drive 33 | Suite D4700 34 | San Francisco, CA, 94129 35 | 36 | Everyone is permitted to copy and distribute verbatim copies of this 37 | license document, but changing it is not allowed. 38 | 39 | 40 | Developer's Certificate of Origin 1.1 41 | 42 | By making a contribution to this project, I certify that: 43 | 44 | (a) The contribution was created in whole or in part by me and I 45 | have the right to submit it under the open source license 46 | indicated in the file; or 47 | 48 | (b) The contribution is based upon previous work that, to the best 49 | of my knowledge, is covered under an appropriate open source 50 | license and I have the right under that license to submit that 51 | work with modifications, whether created in whole or in part 52 | by me, under the same open source license (unless I am 53 | permitted to submit under a different license), as indicated 54 | in the file; or 55 | 56 | (c) The contribution was provided directly to me by some other 57 | person who certified (a), (b) or (c) and I have not modified 58 | it. 59 | 60 | (d) I understand and agree that this project and the contribution 61 | are public and that a record of the contribution (including all 62 | personal information I submit with it, including my sign-off) is 63 | maintained indefinitely and may be redistributed consistent with 64 | this project or the open source license(s) involved. 65 | ``` 66 | 67 | Then you just add a line to every git commit message: 68 | 69 | ``` 70 | Signed-off-by: Joe Smith 71 | Use your real name (sorry, no pseudonyms or anonymous contributions.) 72 | ``` 73 | 74 | If you set your `user.name` and `user.email` git configs, you can sign your commit automatically with git commit -s. 75 | -------------------------------------------------------------------------------- /RELEASE_NOTES.md: -------------------------------------------------------------------------------- 1 | # Release Notes 2 | 3 | ## Delta Sharing 0.5.4 (Released on 2023-01-11) 4 | Improvements: 5 | - Spark connector changes to consume size from metadata. 6 | 7 | ## Delta Sharing 0.6.2 (Released on 2022-12-20) 8 | Bug fixes: 9 | - Fix comparison of the expiration time to current time for pre-signed urls. 10 | 11 | 12 | ## Delta Sharing 0.5.3 (Released on 2022-12-20) 13 | Bug fixes: 14 | - Extends DeltaSharingProfileProvider to customize tablePath and refresher. 15 | - Refresh pre-signed urls for cdf queries. 16 | - Fix partitionFilters issue for cdf queries. 17 | - Fix comparison of the expiration time to current time for pre-signed urls. 
18 | 19 | 20 | ## Delta Sharing 0.6.1 (Released on 2022-12-19) 21 | Improvements: 22 | - Spark connector changes to consume size from metadata. 23 | - Improve delta sharing error messages. 24 | 25 | Bug fixes: 26 | - Extends DeltaSharingProfileProvider to customize tablePath and refresher. 27 | - Refresh pre-signed urls for cdf and streaming queries. 28 | - Allow 0 for versionAsOf parameter, to be consistent with Delta. 29 | - Fix partitionFilters issue: apply it to all file indices. 30 | 31 | ## Delta Sharing 0.6.0 (Released on 2022-12-02) 32 | Improvements: 33 | - Support using a delta sharing table as a source in spark structured streaming, which allows recipients to stay up to date with the shared data. 34 | - Fix a few nits in the PROTOCOL documentation. 35 | - Support timestampAsOf parameter in delta sharing data source. 36 | 37 | ## Delta Sharing 0.5.2 (Released on 2022-10-10) 38 | Fixes: 39 | - Add a Custom Http Header Provider. 40 | 41 | ## Delta Sharing 0.5.1 (Released on 2022-09-08) 42 | Improvements: 43 | - Upgrade AWS SDK to 1.12.189. 44 | - More tests on the error message when loading table fails. 45 | - Add ability to configure armeria server request timeout. 46 | - documentation improvements. 47 | 48 | Bug fixes: 49 | - Fix column selection bug on Delta Sharing CDF spark dataframe. 50 | - Fix GCS path reading. 51 | 52 | ## Delta Sharing 0.5.0 (Released on 2022-08-30) 53 | Improvements: 54 | - Support for Change Data Feed which allows clients to fetch incremental changes for the shared tables. 55 | - Include response body in HTTPError exception in Python library. 56 | - Improve the error message for the /share/schema/table APIs. 57 | - Protocol and REST API documentation improvements. 58 | - Add query_table_version to the rest client. 59 | 60 | ## Delta Sharing 0.4.0 (Released on 2022-01-13) 61 | Improvements: 62 | - Support Google Cloud Storage on Delta Sharing Server. 63 | - Add a new API to get the metadata of a Share. 64 | - Protocol and REST API documentation enhancements. 65 | - Allow for customization of recipient profile in Apache Spark connector. 66 | 67 | Bug fixes: 68 | - Block managed table creation for Delta Sharing to prevent user confusions. 69 | 70 | ## Delta Sharing 0.3.0 (Released on 2021-12-01) 71 | Improvements: 72 | - Support Azure Blob Storage and Azure Data Lake Gen2 in Delta Sharing Server. 73 | - Apache Spark Connector now can send the limitHint parameter when a user query is using limit. 74 | - `load_as_pandas` in Python Connector now accepts a limit parameter to allow users fetching only a few rows to explore. 75 | - Apache Spark Connector will re-fetch pre-signed urls before they expire to support long running queries. 76 | - Add a new API to list all tables in a share to save network round trips. 77 | - Add a User-Agent header to request sent from Apache Spark Connector and Python. 78 | - Add an optional expirationTime field to Delta Sharing Profile File Format to provide the token expiration time. 79 | 80 | Bug fixes: 81 | - Fix a corner case that list_all_tables may not return correct results in the Python Connector. 82 | 83 | ## Delta Sharing 0.2.0 (Released on 2021-08-10) 84 | Improvements: 85 | - Added official Docker images for Delta Sharing Server. 86 | - Added an examples project to show how to try the open Delta Sharing Server. 87 | - Added the conf directory to the Delta Sharing Server classpath to allow users to add their Hadoop configuration files in the directory. 
88 | - Added retry with exponential backoff for REST requests in the Python connector. 89 | 90 | Bug fixes: 91 | - Added the minimum fsspec requirement in the Python connector. 92 | - Fixed an issue when files in a table have no stats in the Python connector. 93 | - Improve error handling in Delta Sharing Server to report 400 Bad Request properly. 94 | - Fixed the table schema when a table is empty in the Python connector. 95 | - Fixed KeyError when there are no shared tables in the Python connector. 96 | 97 | ## Delta Sharing 0.1.0 (Released on 2021-05-25) 98 | Components: 99 | - Delta Sharing protocol specification. 100 | - Python Connector: A Python library that implements the Delta Sharing Protocol to read shared tables as pandas DataFrame or Apache Spark DataFrames. 101 | - Apache Spark Connector: An Apache Spark connector that implements the Delta Sharing Protocol to read shared tables from a Delta Sharing Server. The tables can then be accessed in SQL, Python, Java, Scala, or R. 102 | - Delta Sharing Server: A reference implementation server for the Delta Sharing Protocol for development purposes. Users can deploy this server to share existing tables in Delta Lake and Apache Parquet format on modern cloud storage systems. -------------------------------------------------------------------------------- /build/sbt: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # When creating new tests for Spark SQL Hive, the HADOOP_CLASSPATH must contain the hive jars so 21 | # that we can run Hive to generate the golden answer. This is not required for normal development 22 | # or testing. 23 | if [ -n "$HIVE_HOME" ]; then 24 | for i in "$HIVE_HOME"/lib/* 25 | do HADOOP_CLASSPATH="$HADOOP_CLASSPATH:$i" 26 | done 27 | export HADOOP_CLASSPATH 28 | fi 29 | 30 | realpath () { 31 | ( 32 | TARGET_FILE="$1" 33 | 34 | cd "$(dirname "$TARGET_FILE")" 35 | TARGET_FILE="$(basename "$TARGET_FILE")" 36 | 37 | COUNT=0 38 | while [ -L "$TARGET_FILE" -a $COUNT -lt 100 ] 39 | do 40 | TARGET_FILE="$(readlink "$TARGET_FILE")" 41 | cd $(dirname "$TARGET_FILE") 42 | TARGET_FILE="$(basename $TARGET_FILE)" 43 | COUNT=$(($COUNT + 1)) 44 | done 45 | 46 | echo "$(pwd -P)/"$TARGET_FILE"" 47 | ) 48 | } 49 | 50 | . 
"$(dirname "$(realpath "$0")")"/sbt-launch-lib.bash 51 | 52 | 53 | declare -r noshare_opts="-Dsbt.global.base=project/.sbtboot -Dsbt.boot.directory=project/.boot -Dsbt.ivy.home=project/.ivy" 54 | declare -r sbt_opts_file=".sbtopts" 55 | declare -r etc_sbt_opts_file="/etc/sbt/sbtopts" 56 | declare -r default_sbt_opts="-Xss4m" 57 | 58 | usage() { 59 | cat < path to global settings/plugins directory (default: ~/.sbt) 68 | -sbt-boot path to shared boot directory (default: ~/.sbt/boot in 0.11 series) 69 | -ivy path to local Ivy repository (default: ~/.ivy2) 70 | -mem set memory options (default: $sbt_default_mem, which is $(get_mem_opts $sbt_default_mem)) 71 | -no-share use all local caches; no sharing 72 | -no-global uses global caches, but does not use global ~/.sbt directory. 73 | -jvm-debug Turn on JVM debugging, open at the given port. 74 | -batch Disable interactive mode 75 | 76 | # sbt version (default: from project/build.properties if present, else latest release) 77 | -sbt-version use the specified version of sbt 78 | -sbt-jar use the specified jar as the sbt launcher 79 | -sbt-rc use an RC version of sbt 80 | -sbt-snapshot use a snapshot version of sbt 81 | 82 | # java version (default: java from PATH, currently $(java -version 2>&1 | grep version)) 83 | -java-home alternate JAVA_HOME 84 | 85 | # jvm options and output control 86 | JAVA_OPTS environment variable, if unset uses "$java_opts" 87 | SBT_OPTS environment variable, if unset uses "$default_sbt_opts" 88 | .sbtopts if this file exists in the current directory, it is 89 | prepended to the runner args 90 | /etc/sbt/sbtopts if this file exists, it is prepended to the runner args 91 | -Dkey=val pass -Dkey=val directly to the java runtime 92 | -J-X pass option -X directly to the java runtime 93 | (-J is stripped) 94 | -S-X add -X to sbt's scalacOptions (-S is stripped) 95 | -PmavenProfiles Enable a maven profile for the build. 96 | 97 | In the case of duplicated or conflicting options, the order above 98 | shows precedence: JAVA_OPTS lowest, command line options highest. 99 | EOM 100 | } 101 | 102 | process_my_args () { 103 | while [[ $# -gt 0 ]]; do 104 | case "$1" in 105 | -no-colors) addJava "-Dsbt.log.noformat=true" && shift ;; 106 | -no-share) addJava "$noshare_opts" && shift ;; 107 | -no-global) addJava "-Dsbt.global.base=$(pwd)/project/.sbtboot" && shift ;; 108 | -sbt-boot) require_arg path "$1" "$2" && addJava "-Dsbt.boot.directory=$2" && shift 2 ;; 109 | -sbt-dir) require_arg path "$1" "$2" && addJava "-Dsbt.global.base=$2" && shift 2 ;; 110 | -debug-inc) addJava "-Dxsbt.inc.debug=true" && shift ;; 111 | -batch) exec /dev/null) 148 | if [[ ! $? ]]; then 149 | saved_stty="" 150 | fi 151 | } 152 | 153 | saveSttySettings 154 | trap onExit INT 155 | 156 | run "$@" 157 | 158 | exit_status=$? 
159 | onExit 160 | -------------------------------------------------------------------------------- /build/sbt-config/repositories: -------------------------------------------------------------------------------- 1 | [repositories] 2 | local 3 | local-preloaded-ivy: file:///${sbt.preloaded-${sbt.global.base-${user.home}/.sbt}/preloaded/}, [organization]/[module]/[revision]/[type]s/[artifact](-[classifier]).[ext] 4 | local-preloaded: file:///${sbt.preloaded-${sbt.global.base-${user.home}/.sbt}/preloaded/} 5 | gcs-maven-central-mirror: https://maven-central.storage-download.googleapis.com/repos/central/data/ 6 | maven-central 7 | typesafe-ivy-releases: https://repo.typesafe.com/typesafe/ivy-releases/, [organization]/[module]/[revision]/[type]s/[artifact](-[classifier]).[ext], bootOnly 8 | sbt-ivy-snapshots: https://repo.scala-sbt.org/scalasbt/ivy-snapshots/, [organization]/[module]/[revision]/[type]s/[artifact](-[classifier]).[ext], bootOnly 9 | sbt-plugin-releases: https://repo.scala-sbt.org/scalasbt/sbt-plugin-releases/, [organization]/[module]/(scala_[scalaVersion]/)(sbt_[sbtVersion]/)[revision]/[type]s/[artifact](-[classifier]).[ext] 10 | bintray-spark-packages: https://dl.bintray.com/spark-packages/maven/ 11 | typesafe-releases: http://repo.typesafe.com/typesafe/releases/ 12 | -------------------------------------------------------------------------------- /build/sbt-launch-lib.bash: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | 4 | # A library to simplify using the SBT launcher from other packages. 5 | # Note: This should be used by tools like giter8/conscript etc. 6 | 7 | # TODO - Should we merge the main SBT script with this library? 8 | 9 | if test -z "$HOME"; then 10 | declare -r script_dir="$(dirname "$script_path")" 11 | else 12 | declare -r script_dir="$HOME/.sbt" 13 | fi 14 | 15 | declare -a residual_args 16 | declare -a java_args 17 | declare -a scalac_args 18 | declare -a sbt_commands 19 | declare -a maven_profiles 20 | declare sbt_default_mem=4096 21 | 22 | if test -x "$JAVA_HOME/bin/java"; then 23 | echo -e "Using $JAVA_HOME as default JAVA_HOME." 24 | echo "Note, this will be overridden by -java-home if it is set." 25 | declare java_cmd="$JAVA_HOME/bin/java" 26 | else 27 | declare java_cmd=java 28 | fi 29 | 30 | echoerr () { 31 | echo 1>&2 "$@" 32 | } 33 | vlog () { 34 | [[ $verbose || $debug ]] && echoerr "$@" 35 | } 36 | dlog () { 37 | [[ $debug ]] && echoerr "$@" 38 | } 39 | 40 | acquire_sbt_jar () { 41 | SBT_VERSION=`awk -F "=" '/sbt\.version/ {print $2}' ./project/build.properties` 42 | # DEFAULT_ARTIFACT_REPOSITORY env variable can be used to only fetch 43 | # artifacts from internal repos only. 44 | # Ex: 45 | # DEFAULT_ARTIFACT_REPOSITORY=https://artifacts.internal.com/libs-release/ 46 | URL1=${DEFAULT_ARTIFACT_REPOSITORY:-https://repo1.maven.org/maven2/}org/scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch-${SBT_VERSION}.jar 47 | JAR=build/sbt-launch-${SBT_VERSION}.jar 48 | 49 | sbt_jar=$JAR 50 | 51 | if [[ ! -f "$sbt_jar" ]]; then 52 | # Download sbt launch jar if it hasn't been downloaded yet 53 | if [ ! 
-f "${JAR}" ]; then 54 | # Download 55 | printf "Attempting to fetch sbt\n" 56 | JAR_DL="${JAR}.part" 57 | if [ $(command -v curl) ]; then 58 | curl --fail --location --silent ${URL1} > "${JAR_DL}" &&\ 59 | mv "${JAR_DL}" "${JAR}" 60 | elif [ $(command -v wget) ]; then 61 | wget --quiet ${URL1} -O "${JAR_DL}" &&\ 62 | mv "${JAR_DL}" "${JAR}" 63 | else 64 | printf "You do not have curl or wget installed, please install sbt manually from https://www.scala-sbt.org/\n" 65 | exit -1 66 | fi 67 | fi 68 | if [ ! -f "${JAR}" ]; then 69 | # We failed to download 70 | printf "Our attempt to download sbt locally to ${JAR} failed. Please install sbt manually from https://www.scala-sbt.org/\n" 71 | exit -1 72 | fi 73 | printf "Launching sbt from ${JAR}\n" 74 | fi 75 | } 76 | 77 | execRunner () { 78 | # print the arguments one to a line, quoting any containing spaces 79 | [[ $verbose || $debug ]] && echo "# Executing command line:" && { 80 | for arg; do 81 | if printf "%s\n" "$arg" | grep -q ' '; then 82 | printf "\"%s\"\n" "$arg" 83 | else 84 | printf "%s\n" "$arg" 85 | fi 86 | done 87 | echo "" 88 | } 89 | 90 | "$@" 91 | } 92 | 93 | addJava () { 94 | dlog "[addJava] arg = '$1'" 95 | java_args=( "${java_args[@]}" "$1" ) 96 | } 97 | 98 | enableProfile () { 99 | dlog "[enableProfile] arg = '$1'" 100 | maven_profiles=( "${maven_profiles[@]}" "$1" ) 101 | export SBT_MAVEN_PROFILES="${maven_profiles[@]}" 102 | } 103 | 104 | addSbt () { 105 | dlog "[addSbt] arg = '$1'" 106 | sbt_commands=( "${sbt_commands[@]}" "$1" ) 107 | } 108 | addResidual () { 109 | dlog "[residual] arg = '$1'" 110 | residual_args=( "${residual_args[@]}" "$1" ) 111 | } 112 | addDebugger () { 113 | addJava "-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=$1" 114 | } 115 | 116 | # a ham-fisted attempt to move some memory settings in concert 117 | # so they need not be dicked around with individually. 118 | get_mem_opts () { 119 | local mem=${1:-$sbt_default_mem} 120 | local codecache=$(( $mem / 8 )) 121 | (( $codecache > 128 )) || codecache=128 122 | (( $codecache < 2048 )) || codecache=2048 123 | 124 | echo "-Xms${mem}m -Xmx${mem}m -XX:ReservedCodeCacheSize=${codecache}m" 125 | } 126 | 127 | require_arg () { 128 | local type="$1" 129 | local opt="$2" 130 | local arg="$3" 131 | if [[ -z "$arg" ]] || [[ "${arg:0:1}" == "-" ]]; then 132 | echo "$opt requires <$type> argument" 1>&2 133 | exit 1 134 | fi 135 | } 136 | 137 | is_function_defined() { 138 | declare -f "$1" > /dev/null 139 | } 140 | 141 | process_args () { 142 | while [[ $# -gt 0 ]]; do 143 | case "$1" in 144 | -h|-help) usage; exit 1 ;; 145 | -v|-verbose) verbose=1 && shift ;; 146 | -d|-debug) debug=1 && shift ;; 147 | 148 | -ivy) require_arg path "$1" "$2" && addJava "-Dsbt.ivy.home=$2" && shift 2 ;; 149 | -mem) require_arg integer "$1" "$2" && sbt_mem="$2" && shift 2 ;; 150 | -jvm-debug) require_arg port "$1" "$2" && addDebugger $2 && shift 2 ;; 151 | -batch) exec 33 | OAuthClientCredentialsAuthProvider(client, authConfig, oauthProfile) 34 | case BearerTokenDeltaSharingProfile(_, _, bearerToken, expirationTime) => 35 | BearerTokenAuthProvider(bearerToken, expirationTime) 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /client/src/main/scala/io/delta/sharing/client/auth/BearerTokenAuthProvider.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.client.auth 18 | 19 | import java.sql.Timestamp 20 | import java.time.LocalDateTime 21 | import java.time.format.DateTimeFormatter.ISO_DATE_TIME 22 | 23 | import org.apache.http.HttpHeaders 24 | import org.apache.http.client.methods.HttpRequestBase 25 | 26 | private[client] case class BearerTokenAuthProvider(bearerToken: String, expirationTime: String) 27 | extends AuthCredentialProvider { 28 | override def addAuthHeader(httpRequest: HttpRequestBase): Unit = { 29 | httpRequest.setHeader(HttpHeaders.AUTHORIZATION, s"Bearer $bearerToken") 30 | } 31 | 32 | override def isExpired(): Boolean = { 33 | if (expirationTime == null) return false 34 | try { 35 | val expirationTimeAsTimeStamp = Timestamp.valueOf( 36 | LocalDateTime.parse(expirationTime, ISO_DATE_TIME)) 37 | expirationTimeAsTimeStamp.before(Timestamp.valueOf(LocalDateTime.now())) 38 | } catch { 39 | case _: Throwable => false 40 | } 41 | } 42 | 43 | override def getExpirationTime(): Option[String] = { 44 | expirationTime match { 45 | case null => None 46 | case _ => Some(expirationTime) 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /client/src/main/scala/io/delta/sharing/client/auth/OAuthClient.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package io.delta.sharing.client.auth 17 | 18 | import java.util.Base64 19 | 20 | import com.fasterxml.jackson.databind.JsonNode 21 | import com.fasterxml.jackson.databind.node.BaseJsonNode 22 | import org.apache.http.HttpEntity 23 | import org.apache.http.client.methods.{CloseableHttpResponse, HttpPost} 24 | import org.apache.http.entity.StringEntity 25 | import org.apache.http.impl.client.CloseableHttpClient 26 | import org.apache.http.util.EntityUtils 27 | 28 | import io.delta.sharing.client.util.{JsonUtils, RetryUtils, UnexpectedHttpStatus} 29 | 30 | case class OAuthClientCredentials(accessToken: String, 31 | expiresIn: Long, 32 | creationTimestamp: Long) 33 | 34 | private[client] class OAuthClient(httpClient: 35 | CloseableHttpClient, 36 | authConfig: AuthConfig, 37 | tokenEndpoint: String, 38 | clientId: String, 39 | clientSecret: String, 40 | scope: Option[String] = None) { 41 | 42 | def clientCredentials(): OAuthClientCredentials = { 43 | 44 | // see client credentials grant spec detail here: 45 | // https://www.oauth.com/oauth2-servers/access-tokens/client-credentials/ 46 | // https://datatracker.ietf.org/doc/html/rfc6749 47 | val credentials = Base64.getEncoder.encodeToString(s"$clientId:$clientSecret".getBytes("UTF-8")) 48 | 49 | val post = new HttpPost(tokenEndpoint) 50 | post.setHeader("accept", "application/json") 51 | post.setHeader("authorization", s"Basic $credentials") 52 | post.setHeader("content-type", "application/x-www-form-urlencoded") 53 | 54 | val scopeParam = scope.map(s => s"&scope=$s").getOrElse("") 55 | val body = s"grant_type=client_credentials$scopeParam" 56 | post.setEntity(new StringEntity(body)) 57 | 58 | // retries on temporary errors (connection error or 500, 429) from token endpoint 59 | RetryUtils.runWithExponentialBackoff( 60 | authConfig.tokenExchangeMaxRetries, 61 | authConfig.tokenExchangeMaxRetryDurationInSeconds * 1000) { 62 | var response: CloseableHttpResponse = null 63 | try { 64 | response = httpClient.execute(post) 65 | val responseString = getResponseAsString(response.getEntity) 66 | if (response.getStatusLine.getStatusCode != 200) { 67 | throw new UnexpectedHttpStatus(s"Failed to get OAuth token from token endpoint: " + 68 | s"Token Endpoint responded: ${response.getStatusLine} with response: $responseString", 69 | response.getStatusLine.getStatusCode) 70 | } 71 | 72 | parseOAuthTokenResponse(responseString) 73 | } finally { 74 | if (response != null) response.close() 75 | } 76 | } 77 | } 78 | 79 | private def parseOAuthTokenResponse(response: String): OAuthClientCredentials = { 80 | // Parsing the response per oauth spec 81 | // https://datatracker.ietf.org/doc/html/rfc6749#section-5.1 82 | if (response == null || response.isEmpty) { 83 | throw new RuntimeException("Empty response from OAuth token endpoint") 84 | } 85 | val jsonNode = JsonUtils.readTree(response) 86 | if (!jsonNode.has("access_token") || !jsonNode.get("access_token").isTextual) { 87 | throw new RuntimeException("Missing 'access_token' field in OAuth token response") 88 | } 89 | if (!jsonNode.has("expires_in")) { 90 | throw new RuntimeException("Missing 'expires_in' field in OAuth token response") 91 | } 92 | 93 | // OAuth spec requires 'expires_in' to be an integer, e.g., 3600. 94 | // See https://datatracker.ietf.org/doc/html/rfc6749#section-5.1 95 | // But some token endpoints return `expires_in` as a string e.g., "3600". 96 | // This ensures that we support both integer and string values for 'expires_in' field. 
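    // For example (illustrative values), both of the following token responses
    // are accepted here and yield expiresIn = 3600; other response fields are ignored:
    //   {"access_token": "<token>", "expires_in": 3600}
    //   {"access_token": "<token>", "expires_in": "3600"}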
97 | // Example request resulting in 'expires_in' as a string: 98 | // curl -X POST \ 99 | // https://login.windows.net/$TENANT_ID/oauth2/token \ 100 | // -H "Content-Type: application/x-www-form-urlencoded" \ 101 | // -d "grant_type=client_credentials" \ 102 | // -d "client_id=$CLIENT_ID" \ 103 | // -d "client_secret=$CLIENT_SECRET" \ 104 | // -d "scope=https://graph.microsoft.com/.default" 105 | val expiresIn : Long = jsonNode.get("expires_in") match { 106 | case n if n.isNumber => n.asLong() 107 | case n if n.isTextual => 108 | try { 109 | n.asText().toLong 110 | } catch { 111 | case _: NumberFormatException => 112 | throw new RuntimeException("Invalid 'expires_in' field in OAuth token response") 113 | } 114 | case _ => 115 | throw new RuntimeException("Invalid 'expires_in' field in OAuth token response") 116 | } 117 | 118 | OAuthClientCredentials( 119 | jsonNode.get("access_token").asText(), 120 | expiresIn, 121 | System.currentTimeMillis() 122 | ) 123 | } 124 | 125 | private def getResponseAsString(httpEntity: HttpEntity) : String = { 126 | if (httpEntity != null) { 127 | EntityUtils.toString(httpEntity) 128 | } else { 129 | null 130 | } 131 | } 132 | } 133 | 134 | -------------------------------------------------------------------------------- /client/src/main/scala/io/delta/sharing/client/auth/OAuthClientCredentialsAuthProvider.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package io.delta.sharing.client.auth 18 | 19 | import java.util.concurrent.locks.ReentrantReadWriteLock 20 | 21 | import org.apache.http.HttpHeaders 22 | import org.apache.http.client.methods.HttpRequestBase 23 | import org.apache.http.impl.client.CloseableHttpClient 24 | 25 | import io.delta.sharing.client.OAuthClientCredentialsDeltaSharingProfile 26 | 27 | private[client] case class OAuthClientCredentialsAuthProvider( 28 | client: CloseableHttpClient, 29 | authConfig: AuthConfig, 30 | profile: OAuthClientCredentialsDeltaSharingProfile) extends AuthCredentialProvider { 31 | 32 | private val readWriteLock = new ReentrantReadWriteLock() 33 | private val readLock = readWriteLock.readLock 34 | private val writeLock = readWriteLock.writeLock 35 | 36 | private[auth] lazy val oauthClient = new OAuthClient(client, authConfig, 37 | profile.tokenEndpoint, profile.clientId, profile.clientSecret, profile.scope) 38 | 39 | // this can be updated on different thread 40 | // read has be through readLock and write has to be through writeLock 41 | private var currentToken: Option[OAuthClientCredentials] = None 42 | 43 | override def addAuthHeader(httpRequest: HttpRequestBase): Unit = { 44 | val token = maybeRefreshToken() 45 | 46 | readLock.lock() 47 | try { 48 | httpRequest.setHeader(HttpHeaders.AUTHORIZATION, s"Bearer ${token.accessToken}") 49 | } finally { 50 | readLock.unlock() 51 | } 52 | } 53 | 54 | // Method to set the current token for testing purposes 55 | private[auth] def setCurrentTokenForTesting(token: OAuthClientCredentials): Unit = { 56 | writeLock.lock() 57 | try { 58 | currentToken = Some(token) 59 | } finally { 60 | writeLock.unlock() 61 | } 62 | } 63 | 64 | private def maybeRefreshToken(): OAuthClientCredentials = { 65 | readLock.lock() 66 | try { 67 | if (currentToken.isDefined && !needsRefresh(currentToken.get)) { 68 | return currentToken.get 69 | } 70 | } finally { 71 | readLock.unlock() 72 | } 73 | 74 | writeLock.lock() 75 | try { 76 | if (currentToken.isEmpty || needsRefresh(currentToken.get)) { 77 | val newToken = oauthClient.clientCredentials() 78 | currentToken = Some(newToken) 79 | } 80 | 81 | currentToken.get 82 | } finally { 83 | writeLock.unlock() 84 | } 85 | } 86 | 87 | private[auth] def needsRefresh(token: OAuthClientCredentials): Boolean = { 88 | val now = System.currentTimeMillis() 89 | val expirationTime = token.creationTimestamp + token.expiresIn * 1000 90 | expirationTime - now < authConfig.tokenRenewalThresholdInSeconds * 1000 91 | } 92 | 93 | override def getExpirationTime(): Option[String] = None 94 | } 95 | -------------------------------------------------------------------------------- /client/src/main/scala/io/delta/sharing/client/util/JsonUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package io.delta.sharing.client.util 18 | 19 | import com.fasterxml.jackson.annotation.JsonInclude.Include 20 | import com.fasterxml.jackson.databind.{DeserializationFeature, JsonNode, ObjectMapper} 21 | import com.fasterxml.jackson.module.scala.{DefaultScalaModule, ScalaObjectMapper} 22 | 23 | private[sharing] object JsonUtils { 24 | /** Used to convert between classes and JSON. */ 25 | lazy val mapper = { 26 | val _mapper = new ObjectMapper with ScalaObjectMapper 27 | _mapper.setSerializationInclusion(Include.NON_ABSENT) 28 | _mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) 29 | _mapper.registerModule(DefaultScalaModule) 30 | _mapper 31 | } 32 | 33 | def toJson[T: Manifest](obj: T): String = { 34 | mapper.writeValueAsString(obj) 35 | } 36 | 37 | def fromJson[T: Manifest](json: String): T = { 38 | mapper.readValue[T](json) 39 | } 40 | 41 | def readTree(json: String) : JsonNode = { 42 | mapper.readTree(json) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /client/src/main/scala/io/delta/sharing/client/util/RetryUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package io.delta.sharing.client.util 18 | 19 | import java.io.{InterruptedIOException, IOException} 20 | 21 | import scala.util.control.NonFatal 22 | 23 | import org.apache.spark.internal.Logging 24 | 25 | import io.delta.sharing.spark.MissingEndStreamActionException 26 | 27 | private[sharing] object RetryUtils extends Logging { 28 | 29 | // Expose it for testing 30 | @volatile var sleeper: Long => Unit = (sleepMs: Long) => Thread.sleep(sleepMs) 31 | 32 | def runWithExponentialBackoff[T]( 33 | numRetries: Int, 34 | maxDurationMillis: Long = Long.MaxValue, 35 | retrySleepInterval: Long = 1000)(func: => T): T = { 36 | var times = 0 37 | var sleepMs = retrySleepInterval 38 | val startTime = System.currentTimeMillis() 39 | while (true) { 40 | times += 1 41 | val retryStartTime = System.currentTimeMillis() 42 | try { 43 | return func 44 | } catch { 45 | case e: Exception => 46 | val totalDuration = System.currentTimeMillis() - startTime 47 | val retryDuration = System.currentTimeMillis() - retryStartTime 48 | logError( 49 | "Error during retry attempt " + times + ", retryDuration=" + retryDuration + 50 | ", totalDuration=" + totalDuration + " : " + e.getMessage, 51 | e 52 | ) 53 | if (shouldRetry(e) && times <= numRetries && totalDuration <= maxDurationMillis) { 54 | logWarning(s"Sleeping $sleepMs ms to retry on error: ${e.getMessage}.") 55 | sleeper(sleepMs) 56 | sleepMs *= 2 57 | } else { 58 | logError(s"Not retrying delta sharing rpc on error: ${e.getMessage}.") 59 | throw e 60 | } 61 | } 62 | } 63 | throw new IllegalStateException("Should not happen") 64 | } 65 | 66 | def shouldRetry(t: Throwable): Boolean = { 67 | t match { 68 | case e: UnexpectedHttpStatus => 69 | if (e.statusCode == 429) { // Too Many Requests 70 | true 71 | } else if (e.statusCode >= 500 && e.statusCode < 600) { // Internal Error 72 | true 73 | } else { 74 | false 75 | } 76 | case _: MissingEndStreamActionException => true 77 | case _: java.net.SocketTimeoutException => true 78 | // do not retry on ConnectionClosedException because it can be caused by invalid json returned 79 | // from the delta sharing server. 80 | case _: org.apache.http.ConnectionClosedException => false 81 | case _: InterruptedException => false 82 | case _: InterruptedIOException => false 83 | case _: IOException => true 84 | case _ => false 85 | } 86 | } 87 | } 88 | 89 | private[sharing] class UnexpectedHttpStatus(message: String, val statusCode: Int) 90 | extends IllegalStateException(message) 91 | -------------------------------------------------------------------------------- /client/src/main/scala/io/delta/sharing/spark/DeltaSharingErrors.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package io.delta.sharing.spark 18 | 19 | import org.apache.spark.sql.types.StructType 20 | 21 | class MissingEndStreamActionException(message: String) extends IllegalStateException(message) 22 | 23 | class DeltaSharingServerException(message: String) extends RuntimeException(message) 24 | 25 | object DeltaSharingErrors { 26 | def nonExistentDeltaSharingTable(tableId: String): Throwable = { 27 | new IllegalStateException(s"Delta sharing table ${tableId} doesn't exist. " + 28 | s"Please delete your streaming query checkpoint and restart.") 29 | } 30 | 31 | def invalidSourceVersion(version: String): Throwable = { 32 | new IllegalStateException(s"sourceVersion($version) is invalid.") 33 | } 34 | 35 | def timestampInvalid(str: String): Throwable = { 36 | new IllegalArgumentException(s"The provided timestamp ($str) cannot be converted to a valid " + 37 | s"timestamp.") 38 | } 39 | 40 | def cannotFindSourceVersionException(json: String): Throwable = { 41 | new IllegalStateException(s"Cannot find 'sourceVersion' in $json") 42 | } 43 | 44 | def unsupportedTableReaderVersion(supportedVersion: Long, tableVersion: Long): Throwable = { 45 | new IllegalStateException(s"The table reader version ${tableVersion} is not equal to " + 46 | s"supported reader version $supportedVersion." 47 | ) 48 | } 49 | 50 | def illegalDeltaSharingOptionException( 51 | name: String, input: String, explain: String): Throwable = { 52 | new IllegalArgumentException(s"Invalid value '$input' for option '$name', $explain") 53 | } 54 | 55 | def versionAndTimestampBothSetException( 56 | versionOptKey: String, 57 | timestampOptKey: String): Throwable = { 58 | new IllegalArgumentException(s"Please either provide '$versionOptKey' or '$timestampOptKey'.") 59 | } 60 | 61 | def deltaSharingSourceIgnoreDeleteError(version: Long): Throwable = { 62 | new UnsupportedOperationException("Detected deleted data from streaming source at version " + 63 | s"$version. This is currently not supported. If you'd like to ignore deletes, set the " + 64 | s"option 'ignoreDeletes' to 'true'.") 65 | } 66 | 67 | def deltaSharingSourceIgnoreChangesError(version: Long): Throwable = { 68 | new UnsupportedOperationException("Detected a data update in the source table at version " + 69 | s"$version. This is currently not supported. If you'd like to ignore updates, set the " + 70 | s"option 'skipChangeCommits' to 'true'. If you would like the data update to be reflected, " + 71 | s"please restart the query from latest snapshot with a fresh checkpoint directory.") 72 | } 73 | 74 | def unknownReadLimit(limit: String): Throwable = { 75 | new UnsupportedOperationException(s"Unknown ReadLimit: $limit") 76 | } 77 | 78 | def specifySchemaAtReadTimeException: Throwable = { 79 | new UnsupportedOperationException("Delta sharing does not support specifying the schema at " + 80 | "read time.") 81 | } 82 | 83 | def pathNotSpecifiedException: Throwable = { 84 | new IllegalArgumentException("'path' is not specified. If you use SQL to create a Delta " + 85 | "Sharing table, LOCATION must be specified") 86 | } 87 | 88 | def timeTravelNotSupportedException: Throwable = { 89 | new UnsupportedOperationException("Cannot time travel streams.") 90 | } 91 | 92 | def schemaNotSetException: Throwable = { 93 | new IllegalStateException("Shared table schema is not set. 
Please contact your data provider.") 94 | } 95 | 96 | def schemaChangedException(readSchema: StructType, schemaToCheck: StructType): Throwable = { 97 | val msg = 98 | s"""Detected incompatible schema change: 99 | |schema used to read data: ${readSchema.treeString} 100 | | 101 | |schema seen in the table: ${schemaToCheck.treeString} 102 | | 103 | |Please try restarting the query. If this issue repeats across query restarts without 104 | |making progress, you have made an incompatible schema change and need to start your 105 | |query from scratch using a new checkpoint directory. 106 | """.stripMargin 107 | new IllegalStateException(msg) 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /client/src/main/scala/io/delta/sharing/spark/DeltaSharingSourceOffset.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.spark 18 | 19 | // scalastyle:off import.ordering.noEmptyLine 20 | import org.apache.spark.sql.connector.read.streaming.{Offset => OffsetV2} 21 | import org.apache.spark.sql.execution.streaming.Offset 22 | 23 | import io.delta.sharing.client.util.JsonUtils 24 | 25 | /** 26 | * Tracks how far we processed in when reading changes from the [[Delta Sharing Server]]. 27 | * 28 | * @param sourceVersion The version of serialization that this offset is encoded with. 29 | * @param tableId The id of the table we are reading from. Used to detect 30 | * misconfiguration when restarting a query. 31 | * @param tableVersion The version of the table that we are currently processing. 32 | * @param index The index in the sequence of AddFiles in this version. Used to 33 | * break large commits into multiple batches. This index is created by 34 | * sorting on url. 35 | * @param isStartingVersion Whether this offset denotes a query that is starting rather than 36 | * processing changes. When starting a new query, we first process 37 | * all data present in the table at the start and then move on to 38 | * processing new data that has arrived. 39 | */ 40 | case class DeltaSharingSourceOffset( 41 | sourceVersion: Long, 42 | tableId: String, 43 | tableVersion: Long, 44 | index: Long, 45 | isStartingVersion: Boolean 46 | ) extends Offset { 47 | 48 | override def json: String = JsonUtils.toJson(this) 49 | 50 | /** 51 | * Compare two DeltaSharingSourceOffsets which are on the same table. 52 | * @return 0 for equivalent offsets. negative if this offset is less than `otherOffset`. 
Positive 53 | * if this offset is greater than `otherOffset` 54 | */ 55 | def compare(otherOffset: DeltaSharingSourceOffset): Int = { 56 | assert(tableId == otherOffset.tableId, "Comparing offsets that do not refer to the" + 57 | " same table is disallowed.") 58 | implicitly[Ordering[(Long, Long)]].compare((tableVersion, index), 59 | (otherOffset.tableVersion, otherOffset.index)) 60 | } 61 | } 62 | 63 | object DeltaSharingSourceOffset { 64 | 65 | val VERSION_1 = 1 66 | 67 | def apply( 68 | sourceVersion: Long, 69 | tableId: String, 70 | tableVersion: Long, 71 | index: Long, 72 | isStartingVersion: Boolean 73 | ): DeltaSharingSourceOffset = { 74 | new DeltaSharingSourceOffset( 75 | sourceVersion, 76 | tableId, 77 | tableVersion, 78 | index, 79 | isStartingVersion 80 | ) 81 | } 82 | 83 | def apply(tableId: String, offset: OffsetV2): DeltaSharingSourceOffset = { 84 | offset match { 85 | case o: DeltaSharingSourceOffset => o 86 | case s => 87 | val o = JsonUtils.fromJson[DeltaSharingSourceOffset](s.json) 88 | validateSourceVersion(o) 89 | if (o.tableId != tableId) { 90 | throw DeltaSharingErrors.nonExistentDeltaSharingTable(o.tableId) 91 | } 92 | o 93 | } 94 | } 95 | 96 | private def validateSourceVersion(offset: DeltaSharingSourceOffset) = { 97 | // Only version 1 is supported for now. 98 | if (offset.sourceVersion != VERSION_1) { 99 | throw DeltaSharingErrors.unsupportedTableReaderVersion(VERSION_1, offset.sourceVersion) 100 | } 101 | } 102 | 103 | /** 104 | * Validate offsets to make sure we always move forward. Moving backward may make the query 105 | * re-process data and cause data duplication. 106 | */ 107 | def validateOffsets( 108 | previousOffset: DeltaSharingSourceOffset, 109 | currentOffset: DeltaSharingSourceOffset): Unit = { 110 | if (previousOffset.isStartingVersion == false && currentOffset.isStartingVersion == true) { 111 | throw new IllegalStateException( 112 | s"Found invalid offsets: 'isStartingVersion' fliped incorrectly. " + 113 | s"Previous: $previousOffset, Current: $currentOffset") 114 | } 115 | if (previousOffset.compare(currentOffset) > 0) { 116 | throw new IllegalStateException( 117 | s"Found invalid offsets. Previous: $previousOffset, Current: $currentOffset") 118 | } 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /client/src/main/scala/io/delta/sharing/spark/perf/DeltaSharingLimitPushDown.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
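A short sketch of how the offset ordering above behaves for two offsets on the same table. The table id and version numbers are invented; compare and validateOffsets are the members defined on DeltaSharingSourceOffset above.

    import io.delta.sharing.spark.DeltaSharingSourceOffset

    object OffsetOrderingSketch {
      def main(args: Array[String]): Unit = {
        val previous = DeltaSharingSourceOffset(
          sourceVersion = 1L, tableId = "table-uuid", tableVersion = 10L,
          index = 5L, isStartingVersion = false)
        val current = DeltaSharingSourceOffset(
          sourceVersion = 1L, tableId = "table-uuid", tableVersion = 11L,
          index = 0L, isStartingVersion = false)

        // Negative: `previous` orders before `current` (table version 10 < 11).
        println(previous.compare(current))

        // Passes because the stream only moves forward; swapping the arguments
        // would throw an IllegalStateException.
        DeltaSharingSourceOffset.validateOffsets(previous, current)
      }
    }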
15 | */ 16 | 17 | package io.delta.sharing.spark.perf 18 | 19 | import scala.reflect.runtime.universe.{termNames, typeOf, typeTag} 20 | 21 | import org.apache.spark.sql.SparkSession 22 | import org.apache.spark.sql.catalyst.catalog.CatalogTable 23 | import org.apache.spark.sql.catalyst.expressions.IntegerLiteral 24 | import org.apache.spark.sql.catalyst.plans.logical.{LocalLimit, LogicalPlan, Project} 25 | import org.apache.spark.sql.catalyst.rules.Rule 26 | import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} 27 | import org.apache.spark.sql.sources.BaseRelation 28 | 29 | import io.delta.sharing.client.util.ConfUtils 30 | import io.delta.sharing.spark.RemoteDeltaSnapshotFileIndex 31 | 32 | object DeltaSharingLimitPushDown extends Rule[LogicalPlan] { 33 | 34 | def setup(spark: SparkSession): Unit = synchronized { 35 | if (!spark.experimental.extraOptimizations.contains(DeltaSharingLimitPushDown) ) { 36 | spark.experimental.extraOptimizations ++= Seq(DeltaSharingLimitPushDown) 37 | } 38 | } 39 | 40 | def apply(p: LogicalPlan): LogicalPlan = { 41 | if (ConfUtils.limitPushdownEnabled(p.conf)) { 42 | p transform { 43 | // In Spark 4.0.0, there are two distinct limit pushdown plans: 44 | // - SQL queries (e.g., `SELECT * FROM table LIMIT 1`) follow: 45 | // LocalLimit -> Project -> LogicalRelation. 46 | // - Spark queries (e.g., `spark.read.load(path).limit(1)`) follow: 47 | // LocalLimit -> LogicalRelationWithTable. 48 | case localLimit @ LocalLimit( 49 | literalExpr @ IntegerLiteral(limit), 50 | pr @ Project(_, 51 | l @ LogicalRelation( 52 | r @ HadoopFsRelation(remoteIndex: RemoteDeltaSnapshotFileIndex, _, _, _, _, _), 53 | _, _, _, _)) 54 | ) => 55 | if (remoteIndex.limitHint.isEmpty) { 56 | val spark = SparkSession.active 57 | LocalLimit( 58 | literalExpr, 59 | pr.copy( 60 | child = LogicalRelationShim.copyWithNewRelation( 61 | l, 62 | r.copy(location = remoteIndex.copy(limitHint = Some(limit)))(spark) 63 | ) 64 | ) 65 | ) 66 | } else { 67 | localLimit 68 | } 69 | 70 | case localLimit @ LocalLimit( 71 | literalExpr @ IntegerLiteral(limit), 72 | l @ LogicalRelationWithTable( 73 | r @ HadoopFsRelation(remoteIndex: RemoteDeltaSnapshotFileIndex, _, _, _, _, _), 74 | _) 75 | ) => 76 | if (remoteIndex.limitHint.isEmpty) { 77 | val spark = SparkSession.active 78 | LocalLimit(literalExpr, 79 | LogicalRelationShim.copyWithNewRelation( 80 | l, 81 | r.copy( 82 | location = remoteIndex.copy(limitHint = Some(limit)))(spark)) 83 | ) 84 | } else { 85 | localLimit 86 | } 87 | } 88 | } else { 89 | p 90 | } 91 | } 92 | } 93 | 94 | /** 95 | * Extract the [[BaseRelation]] and [[CatalogTable]] from [[LogicalRelation]]. You can also 96 | * retrieve the instance of LogicalRelation like following: 97 | * 98 | * case l @ LogicalRelationWithTable(relation, catalogTable) => ... 99 | * 100 | * NOTE: This is copied from Spark 4.0 codebase - license: Apache-2.0. 101 | */ 102 | object LogicalRelationWithTable { 103 | def unapply(plan: LogicalRelation): Option[(BaseRelation, Option[CatalogTable])] = { 104 | Some(plan.relation, plan.catalogTable) 105 | } 106 | } 107 | 108 | /** 109 | * This class helps the codebase to address the differences among multiple Spark versions. 110 | */ 111 | object LogicalRelationShim { 112 | /** 113 | * This method provides the ability of copying LogicalRelation instance across Spark versions, 114 | * when the caller only wants to replace the relation in the LogicalRelation. 
115 | */ 116 | def copyWithNewRelation(src: LogicalRelation, newRelation: BaseRelation): LogicalRelation = { 117 | // We assume Spark would not change the order of the existing parameter, but it's even safe 118 | // as long as the first parameter is reserved to the `relation`. 119 | val paramsForPrimaryConstructor = src.productIterator.toArray 120 | paramsForPrimaryConstructor(0) = newRelation 121 | 122 | val constructor = typeOf[LogicalRelation] 123 | .decl(termNames.CONSTRUCTOR) 124 | // Getting all the constructors 125 | .alternatives 126 | .map(_.asMethod) 127 | // Picking the primary constructor 128 | .find(_.isPrimaryConstructor) 129 | // A class must always have a primary constructor, so this is safe 130 | .get 131 | val constructorMirror = typeTag[LogicalRelation].mirror 132 | .reflectClass(typeOf[LogicalRelation].typeSymbol.asClass) 133 | .reflectConstructor(constructor) 134 | 135 | constructorMirror.apply(paramsForPrimaryConstructor: _*).asInstanceOf[LogicalRelation] 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /client/src/main/scala/io/delta/sharing/spark/util/QueryUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
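A hedged sketch of wiring the limit push-down rule above into a Spark session. The profile path is a placeholder, and whether the limit is actually pushed down also depends on the ConfUtils flag checked in apply(), which this sketch leaves at its default.

    import org.apache.spark.sql.SparkSession
    import io.delta.sharing.spark.perf.DeltaSharingLimitPushDown

    object LimitPushDownSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder()
          .appName("limit-pushdown-sketch")
          .master("local[*]")
          .getOrCreate()

        // Register the rule as an experimental optimization (idempotent).
        DeltaSharingLimitPushDown.setup(spark)

        // Hypothetical shared table path: <profile file>#<share>.<schema>.<table>.
        val df = spark.read
          .format("deltaSharing")
          .load("/path/to/profile.share#share.schema.table")
          .limit(1)

        // With the rule installed, the LocalLimit(1) can be copied into the
        // RemoteDeltaSnapshotFileIndex as a limitHint before files are fetched.
        df.show()
      }
    }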
15 | */ 16 | 17 | package io.delta.sharing.spark.util 18 | 19 | import java.nio.charset.StandardCharsets.UTF_8 20 | 21 | import com.google.common.hash.Hashing 22 | 23 | object QueryUtils { 24 | 25 | // Get a query hash id based on the query parameters for snapshot queries 26 | def getQueryParamsHashId( 27 | predicates: Seq[String], 28 | limitHint: Option[Long], 29 | jsonPredicateHints: Option[String], 30 | version: Long): String = { 31 | val predicateStr = predicates.mkString(",") 32 | val limitStr = limitHint.map(_.toString).getOrElse("none") 33 | val jsonHintsStr = jsonPredicateHints.getOrElse("none") 34 | val fullQueryString = s"${predicateStr}_${jsonHintsStr}_${limitStr}_${version}" 35 | Hashing.sha256().hashString(fullQueryString, UTF_8).toString 36 | } 37 | 38 | // Get a query hash id based on the query parameters for CDF queries 39 | def getQueryParamsHashId(cdfOptions: Map[String, String]): String = { 40 | Hashing.sha256().hashString(cdfOptions.toString, UTF_8).toString 41 | } 42 | 43 | // Get a query hash id based on the query parameters for streaming queries 44 | def getQueryParamsHashId( 45 | startVersion: Long, 46 | endVersion: Long): String = { 47 | val fullQueryString = s"${startVersion}_${endVersion}" 48 | Hashing.sha256().hashString(fullQueryString, UTF_8).toString 49 | } 50 | 51 | // Add id as a suffix to table path, to uniquely identify a query 52 | def getTablePathWithIdSuffix(tablePath: String, id: String): String = { 53 | s"${tablePath}_${id}" 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /client/src/main/scala/io/delta/sharing/spark/util/SchemaUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.spark.util 18 | 19 | import org.apache.spark.internal.Logging 20 | import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap 21 | import org.apache.spark.sql.types._ 22 | 23 | 24 | object SchemaUtils extends Logging { 25 | 26 | /** 27 | * TODO: switch to SchemaUtils in delta-io/connectors once isReadCompatible is supported in it. 28 | * 29 | * As the Delta snapshots update, the schema may change as well. This method defines whether the 30 | * new schema(schemaToCheck) of a Delta table can be used with a previously analyzed LogicalPlan 31 | * using the readSchema. 
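A quick sketch of the QueryUtils helpers above, hashing a snapshot query's parameters and appending the digest to a table path; all parameter values are illustrative.

    import io.delta.sharing.spark.util.QueryUtils

    object QueryUtilsSketch {
      def main(args: Array[String]): Unit = {
        // Hash the snapshot-query parameters (predicates, limit, JSON hints, version).
        val hashId = QueryUtils.getQueryParamsHashId(
          predicates = Seq("date >= '2021-01-01'"),
          limitHint = Some(100L),
          jsonPredicateHints = None,
          version = 42L)

        // Suffix the table path with the hash so each distinct query gets its
        // own unique identifier.
        val path = QueryUtils.getTablePathWithIdSuffix("share.schema.table", hashId)
        println(path) // e.g. share.schema.table_<sha256 hex digest>
      }
    }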
32 | * Our rules are to return false if: 33 | * - Dropping or renaming any column that was present in the DataFrame schema 34 | * - Converting nullable=false to nullable=true for any column 35 | * - Any change of datatype 36 | */ 37 | def isReadCompatible(schemaToCheck: StructType, readSchema: StructType): Boolean = { 38 | 39 | def toFieldMap(fields: Seq[StructField]): Map[String, StructField] = { 40 | CaseInsensitiveMap(fields.map(field => field.name -> field).toMap) 41 | } 42 | 43 | def isDatatypeReadCompatible(toCheckType: DataType, readType: DataType): Boolean = { 44 | // Recursively check that all data types are read compatible. 45 | (toCheckType, readType) match { 46 | case (t: StructType, r: StructType) => 47 | isReadCompatible(t, r) 48 | case (t: ArrayType, r: ArrayType) => 49 | // if the read elements are non-nullable, so should be the new element 50 | (!t.containsNull || r.containsNull) && 51 | isDatatypeReadCompatible(t.elementType, r.elementType) 52 | case (t: MapType, r: MapType) => 53 | // if the read value is non-nullable, so should be the new value 54 | (!t.valueContainsNull || r.valueContainsNull) && 55 | isDatatypeReadCompatible(t.keyType, r.keyType) && 56 | isDatatypeReadCompatible(t.valueType, r.valueType) 57 | case (a, b) => a == b 58 | } 59 | } 60 | 61 | def isStructReadCompatible: Boolean = { 62 | val toCheckFieldMap = toFieldMap(schemaToCheck) 63 | // scalastyle:off caselocale 64 | val toCheckFieldNames = schemaToCheck.fieldNames.map(_.toLowerCase).toSet 65 | assert( 66 | toCheckFieldNames.size == schemaToCheck.length, 67 | "Delta tables don't allow field names that only differ by case" 68 | ) 69 | val readFieldMap = readSchema.fieldNames.map(_.toLowerCase).toSet 70 | assert( 71 | readFieldMap.size == readSchema.length, 72 | "Delta tables don't allow field names that only differ by case" 73 | ) 74 | // scalastyle:on caselocale 75 | 76 | if (!toCheckFieldNames.subsetOf(readFieldMap)) { 77 | // Dropped a column that was present in the DataFrame schema 78 | return false 79 | } 80 | readSchema.forall { readField => 81 | // new fields are fine, they just won't be returned 82 | toCheckFieldMap.get(readField.name).forall { toCheckField => 83 | // we know the name matches modulo case - now verify exact match 84 | (toCheckField.name == readField.name 85 | // if toCheckFieldMap value is non-nullable, so should be the new value 86 | && (!toCheckField.nullable || readField.nullable) 87 | // and the type of the field must be compatible, too 88 | && isDatatypeReadCompatible(toCheckField.dataType, readField.dataType)) 89 | } 90 | } 91 | } 92 | 93 | isStructReadCompatible 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /client/src/main/scala/org/apache/spark/sql/DeltaSharingScanUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
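A small sketch of the read-compatibility check above with two invented schemas: an unchanged schema passes, while changing a column's data type fails.

    import org.apache.spark.sql.types._
    import io.delta.sharing.spark.util.SchemaUtils

    object ReadCompatibilitySketch {
      def main(args: Array[String]): Unit = {
        // Schema the query was analyzed with.
        val readSchema = StructType(Seq(
          StructField("id", LongType, nullable = false),
          StructField("name", StringType, nullable = true)))

        // Same columns, same types: compatible.
        println(SchemaUtils.isReadCompatible(readSchema, readSchema)) // true

        // The table changed "id" from long to string: incompatible.
        val changed = StructType(Seq(
          StructField("id", StringType, nullable = false),
          StructField("name", StringType, nullable = true)))
        println(SchemaUtils.isReadCompatible(changed, readSchema)) // false
      }
    }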
15 | */ 16 | 17 | package org.apache.spark.sql 18 | 19 | import org.apache.spark.sql.catalyst.expressions.Expression 20 | import org.apache.spark.sql.classic.ClassicConversions._ 21 | import org.apache.spark.sql.classic.Dataset 22 | import org.apache.spark.sql.execution.datasources.LogicalRelation 23 | 24 | object DeltaSharingScanUtils { 25 | // A wrapper to expose Dataset.ofRows function. 26 | // This is needed because Dataset object is in private[sql] scope. 27 | def ofRows(spark: SparkSession, plan: LogicalRelation): DataFrame = { 28 | Dataset.ofRows(spark, plan) 29 | } 30 | 31 | // A wrapper to expose Column.apply(expr: Expression) function. 32 | // This is needed because the Column object is in private[sql] scope. 33 | def toColumn(expr: Expression): Column = { 34 | Column(expr) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /client/src/test/scala/io/delta/sharing/client/DeltaSharingIntegrationTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.client 18 | 19 | import java.io.File 20 | import java.nio.charset.StandardCharsets.UTF_8 21 | import java.nio.file.Files 22 | import java.util.concurrent.{CountDownLatch, TimeUnit} 23 | 24 | import scala.sys.process._ 25 | import scala.util.Try 26 | 27 | import org.apache.commons.io.FileUtils 28 | import org.apache.hadoop.conf.Configuration 29 | import org.apache.spark.SparkFunSuite 30 | import org.scalatest.BeforeAndAfterAll 31 | 32 | trait DeltaSharingIntegrationTest extends SparkFunSuite with BeforeAndAfterAll { 33 | 34 | def shouldRunIntegrationTest: Boolean = { 35 | sys.env.get("AWS_ACCESS_KEY_ID").exists(_.length > 0) && 36 | sys.env.get("AZURE_TEST_ACCOUNT_KEY").exists(_.length > 0) && 37 | sys.env.get("GOOGLE_APPLICATION_CREDENTIALS").exists(_.length > 0) 38 | } 39 | 40 | @volatile private var process: Process = _ 41 | @volatile private var pidFile: File = _ 42 | var testProfileFile: File = _ 43 | 44 | val TEST_PORT = 12345 45 | 46 | override def beforeAll(): Unit = { 47 | super.beforeAll() 48 | if (shouldRunIntegrationTest) { 49 | pidFile = Files.createTempFile("delta-sharing-server", ".pid").toFile 50 | testProfileFile = Files.createTempFile("delta-test", ".share").toFile 51 | FileUtils.writeStringToFile(testProfileFile, 52 | s"""{ 53 | | "shareCredentialsVersion": 1, 54 | | "endpoint": "https://localhost:$TEST_PORT/delta-sharing", 55 | | "bearerToken": "dapi5e3574ec767ca1548ae5bbed1a2dc04d" 56 | |}""".stripMargin, UTF_8) 57 | 58 | val startLatch = new CountDownLatch(1) 59 | new Thread("Run TestDeltaSharingServer") { 60 | setDaemon(true) 61 | 62 | override def run(): Unit = { 63 | val processLogger = ProcessLogger { stdout => 64 | // scalastyle:off println 65 | println(stdout) 66 | // scalastyle:on println 67 | if 
(stdout.contains(s"https://127.0.0.1:$TEST_PORT/")) { 68 | startLatch.countDown() 69 | } 70 | } 71 | process = 72 | Seq( 73 | "/bin/bash", 74 | "-c", 75 | s"cd .. && build/sbt 'server / Test / runMain " + 76 | s"io.delta.sharing.server.TestDeltaSharingServer ${pidFile.getCanonicalPath}'") 77 | .run(processLogger) 78 | process.exitValue() 79 | process = null 80 | startLatch.countDown() 81 | } 82 | }.start() 83 | try { 84 | assert(startLatch.await(120, TimeUnit.SECONDS), "the server didn't start in 120 seconds") 85 | if (process == null) { 86 | fail("the process exited with an error") 87 | } 88 | } catch { 89 | case e: Throwable => 90 | if (process != null) { 91 | process.destroy() 92 | process = null 93 | } 94 | throw e 95 | } 96 | } 97 | } 98 | 99 | override def afterAll(): Unit = { 100 | if (shouldRunIntegrationTest) { 101 | try { 102 | org.apache.hadoop.fs.FileSystem.closeAll() 103 | if (process != null) { 104 | process.destroy() 105 | process = null 106 | } 107 | if (pidFile != null) { 108 | val pid = FileUtils.readFileToString(pidFile) 109 | Try(pid.toLong).foreach { pid => 110 | // scalastyle:off println 111 | println(s"Killing $pid") 112 | // scalastyle:on println 113 | s"kill -9 $pid".! 114 | } 115 | pidFile.delete() 116 | } 117 | if (testProfileFile != null) { 118 | testProfileFile.delete() 119 | } 120 | } finally { 121 | super.afterAll() 122 | } 123 | } 124 | } 125 | 126 | def testProfileProvider: DeltaSharingProfileProvider = { 127 | new DeltaSharingFileProfileProvider(new Configuration, testProfileFile.getCanonicalPath) 128 | } 129 | 130 | def integrationTest(testName: String)(func: => Unit): Unit = { 131 | test(testName) { 132 | assume(shouldRunIntegrationTest) 133 | func 134 | } 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /client/src/test/scala/io/delta/sharing/client/RandomAccessHttpInputStreamSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package io.delta.sharing.client 18 | 19 | import org.apache.hadoop.fs.FileSystem 20 | import org.apache.http.{HttpStatus, ProtocolVersion} 21 | import org.apache.http.client.HttpClient 22 | import org.apache.http.message.BasicHttpResponse 23 | import org.apache.spark.SparkFunSuite 24 | import org.apache.spark.delta.sharing.PreSignedUrlFetcher 25 | import org.mockito.ArgumentMatchers.any 26 | import org.mockito.Mockito.when 27 | import org.scalatestplus.mockito.MockitoSugar 28 | 29 | import io.delta.sharing.client.util.UnexpectedHttpStatus 30 | 31 | class RandomAccessHttpInputStreamSuite extends SparkFunSuite with MockitoSugar { 32 | 33 | private def createResponse(status: Int): BasicHttpResponse = { 34 | new BasicHttpResponse(new ProtocolVersion("HTTP", 1, 1), status, "") 35 | } 36 | 37 | private def createMockClient(status: Int): HttpClient = { 38 | val client = mock[HttpClient] 39 | when(client.execute(any())).thenReturn(createResponse(status)) 40 | client 41 | } 42 | 43 | private def createMockFetcher(uri: String): PreSignedUrlFetcher = { 44 | val fetcher = mock[PreSignedUrlFetcher] 45 | when(fetcher.getUrl()).thenReturn(uri) 46 | fetcher 47 | } 48 | 49 | test("Failed HTTP requests should not show URI") { 50 | val uri = "test.uri" 51 | val stream = new RandomAccessHttpInputStream( 52 | createMockClient(HttpStatus.SC_OK), 53 | createMockFetcher(uri), 54 | 1000L, 55 | new FileSystem.Statistics("idbfs"), 56 | 10 57 | ) 58 | val error = intercept[UnexpectedHttpStatus] { 59 | stream.seek(100L) 60 | } 61 | assert(!error.getMessage().contains(uri)) 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /client/src/test/scala/io/delta/sharing/client/auth/BearerTokenAuthProviderSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package io.delta.sharing.client.auth 17 | 18 | import org.apache.http.HttpHeaders 19 | import org.apache.http.client.methods.HttpGet 20 | import org.scalatest.funsuite.AnyFunSuite 21 | import org.scalatestplus.mockito.MockitoSugar 22 | 23 | class BearerTokenAuthProviderSuite extends AnyFunSuite with MockitoSugar { 24 | 25 | test("BearerTokenAuthProvider should add Authorization header") { 26 | val bearerToken = "test-token" 27 | val provider = BearerTokenAuthProvider(bearerToken, null) 28 | val request = new HttpGet("http://example.com") 29 | 30 | provider.addAuthHeader(request) 31 | 32 | assert(request.getFirstHeader(HttpHeaders.AUTHORIZATION) 33 | .getValue == s"Bearer $bearerToken") 34 | } 35 | 36 | test("BearerTokenAuthProvider should correctly identify expired token") { 37 | val expiredToken = "expired-token" 38 | val expirationTime = "2020-01-01T00:00:00.0Z" 39 | val provider = BearerTokenAuthProvider(expiredToken, expirationTime) 40 | 41 | assert(provider.isExpired()) 42 | } 43 | 44 | test("BearerTokenAuthProvider should correctly identify non-expired token") { 45 | val validToken = "valid-token" 46 | val expirationTime = "2999-01-01T00:00:00.0Z" 47 | val provider = BearerTokenAuthProvider(validToken, expirationTime) 48 | 49 | assert(!provider.isExpired()) 50 | } 51 | 52 | test("BearerTokenAuthProvider should return correct expiration time") { 53 | val token = "test-token" 54 | val expirationTime = "2021-11-12T00:12:29.0Z" 55 | val provider = BearerTokenAuthProvider(token, expirationTime) 56 | 57 | assert(provider.getExpirationTime().contains(expirationTime)) 58 | } 59 | 60 | test("BearerTokenAuthProvider should return None for null expiration time") { 61 | val token = "test-token" 62 | val provider = BearerTokenAuthProvider(token, null) 63 | 64 | assert(provider.getExpirationTime().isEmpty) 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /client/src/test/scala/io/delta/sharing/client/util/ProxyServer.scala: -------------------------------------------------------------------------------- 1 | package io.delta.sharing.client.util 2 | 3 | import java.util.Collections 4 | 5 | import scala.util.Try 6 | 7 | import jakarta.servlet.http.{HttpServletRequest, HttpServletResponse} 8 | import org.sparkproject.jetty.client.HttpClient 9 | import org.sparkproject.jetty.http.HttpMethod 10 | import org.sparkproject.jetty.server.{Request, Server} 11 | import org.sparkproject.jetty.server.handler.AbstractHandler 12 | 13 | class ProxyServer(port: Int) { 14 | private val server = new Server(port) 15 | private val httpClient = new HttpClient() 16 | private val capturedRequests = Collections 17 | .synchronizedList(new java.util.ArrayList[HttpServletRequest]()) 18 | 19 | server.setHandler(new ProxyHandler) 20 | 21 | def initialize(): Unit = { 22 | new Thread(() => { 23 | Try(httpClient.start()) 24 | Try(server.start()) 25 | }).start() 26 | 27 | do { 28 | Thread.sleep(100) 29 | } while (!server.isStarted()) 30 | } 31 | 32 | def stop(): Unit = { 33 | Try(server.stop()) 34 | Try(httpClient.stop()) 35 | } 36 | 37 | def getPort(): Int = { 38 | server.getURI().getPort() 39 | } 40 | 41 | def getHost(): String = { 42 | server.getURI().getHost 43 | } 44 | 45 | def getCapturedRequests(): Seq[HttpServletRequest] = { 46 | capturedRequests.toArray(Array[HttpServletRequest]()).toSeq 47 | } 48 | 49 | private class ProxyHandler extends AbstractHandler { 50 | override def handle(target: String, 51 | baseRequest: Request, 52 | request: HttpServletRequest, 53 | 
response: HttpServletResponse): Unit = { 54 | 55 | capturedRequests.add(request) 56 | Option(request.getHeader("Host")) match { 57 | case Some(host) => 58 | Try { 59 | val uri = request.getScheme + "://" + host + request.getRequestURI 60 | val res = httpClient.newRequest(uri) 61 | .method(HttpMethod.GET) 62 | .send() 63 | 64 | response.setContentType(res.getMediaType) 65 | response.setStatus(res.getStatus) 66 | // scalastyle:off 67 | response.getWriter.println(res.getContentAsString) 68 | // scalastyle:on 69 | }.recover { 70 | case e: Exception => 71 | e.printStackTrace() 72 | // scalastyle:off 73 | response.sendError(HttpServletResponse.SC_INTERNAL_SERVER_ERROR, "Error in Proxy Server") 74 | // scalastyle:on 75 | } 76 | 77 | baseRequest.setHandled(true) 78 | 79 | case None => 80 | response.sendError(HttpServletResponse.SC_BAD_REQUEST, "No forwarding URL provided") 81 | } 82 | } 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /client/src/test/scala/io/delta/sharing/client/util/RetryUtilsSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
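A brief sketch of how a test might stand up the ProxyServer defined above. Passing port 0 so Jetty picks a free ephemeral port is an assumption of this sketch; the capture/inspection flow mirrors what the class exposes.

    import io.delta.sharing.client.util.ProxyServer

    object ProxyServerSketch {
      def main(args: Array[String]): Unit = {
        // Port 0 asks Jetty to bind to a free ephemeral port.
        val proxy = new ProxyServer(0)
        proxy.initialize()
        try {
          println(s"proxy listening on ${proxy.getHost()}:${proxy.getPort()}")
          // A test would now point its HTTP client at this host/port, issue a
          // request, and inspect proxy.getCapturedRequests() afterwards.
        } finally {
          proxy.stop()
        }
      }
    }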
15 | */ 16 | 17 | package io.delta.sharing.client.util 18 | 19 | import java.io.{InterruptedIOException, IOException} 20 | 21 | import scala.collection.mutable.ArrayBuffer 22 | 23 | import org.apache.spark.SparkFunSuite 24 | 25 | import io.delta.sharing.client.util.{RetryUtils, UnexpectedHttpStatus} 26 | import io.delta.sharing.client.util.RetryUtils._ 27 | import io.delta.sharing.spark.MissingEndStreamActionException 28 | 29 | class RetryUtilsSuite extends SparkFunSuite { 30 | test("shouldRetry") { 31 | assert(shouldRetry(new UnexpectedHttpStatus("error", 429))) 32 | assert(shouldRetry(new UnexpectedHttpStatus("error", 500))) 33 | assert(!shouldRetry(new UnexpectedHttpStatus("error", 404))) 34 | assert(!shouldRetry(new InterruptedException)) 35 | assert(!shouldRetry(new InterruptedIOException)) 36 | assert(shouldRetry(new IOException)) 37 | assert(shouldRetry(new java.net.SocketTimeoutException)) 38 | assert(!shouldRetry(new RuntimeException)) 39 | assert(shouldRetry(new MissingEndStreamActionException("missing"))) 40 | } 41 | 42 | test("runWithExponentialBackoff") { 43 | val sleeps = new ArrayBuffer[Long]() 44 | RetryUtils.sleeper = (sleepMs: Long) => sleeps += sleepMs 45 | // Retry case 46 | intercept[UnexpectedHttpStatus] { 47 | runWithExponentialBackoff(5) { 48 | throw new UnexpectedHttpStatus("error", 429) 49 | } 50 | } 51 | // Run 6 times should sleep 5 times 52 | assert(sleeps.length == 5) 53 | assert(sleeps == Seq(1000, 2000, 4000, 8000, 16000)) 54 | // No retry case 55 | sleeps.clear() 56 | intercept[RuntimeException] { 57 | runWithExponentialBackoff(10) { 58 | throw new RuntimeException 59 | } 60 | } 61 | assert(sleeps == Seq()) 62 | RetryUtils.sleeper = (sleepMs: Long) => Thread.sleep(sleepMs) 63 | } 64 | 65 | test("maxDuration test") { 66 | val sleeps = new ArrayBuffer[Long]() 67 | RetryUtils.sleeper = (sleepMs: Long) => sleeps += sleepMs 68 | 69 | // Retry case 70 | intercept[java.net.SocketTimeoutException] { 71 | runWithExponentialBackoff(10, 2200) { 72 | Thread.sleep(600) 73 | throw new java.net.SocketTimeoutException("MaxDurationTest") 74 | } 75 | } 76 | // Should hit max duration after 2 retries. 77 | assert(sleeps.length == 3) 78 | assert(sleeps == Seq(1000, 2000, 4000)) 79 | RetryUtils.sleeper = (sleepMs: Long) => Thread.sleep(sleepMs) 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /dev/python_release.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | 4 | # Switch to the project root directory 5 | pushd "$(dirname "$0")/.." 6 | 7 | # Clean existing artifacts 8 | pushd python 9 | python3 setup.py clean --all 10 | rm -rf delta_sharing.egg-info dist 11 | popd 12 | 13 | printf "Please type the python release version: " 14 | read -r VERSION 15 | echo "$VERSION" 16 | 17 | # Update the Python connector version 18 | sed -i'' "s/^__version__ = \".*\"$/__version__ = \"$VERSION\"/" ./python/delta_sharing/version.py 19 | git add ./python/delta_sharing/version.py 20 | # Use --allow-empty so that we can re-run this script even if the Python connector version has been updated 21 | git commit -m "Update Python connector version to $VERSION" --allow-empty 22 | 23 | # This creates a lightweight tag that points to the current commit. 
24 | git tag "py-v$VERSION" 25 | 26 | # Generate Python artifacts 27 | pushd python 28 | python3 setup.py sdist bdist_wheel 29 | popd 30 | 31 | echo "=== Generated all release artifacts ===" 32 | popd 33 | -------------------------------------------------------------------------------- /dev/release.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e -pipe 2 | 3 | export GPG_TTY=$(tty) 4 | 5 | # Switch to the project root directory 6 | cd $( dirname $0 ) 7 | cd .. 8 | 9 | # Clean up uncommitted files 10 | git clean -fdx 11 | 12 | # Clean existing artifacts 13 | build/sbt clean 14 | cd python 15 | python3 setup.py clean --all 16 | rm -rf delta_sharing.egg-info dist 17 | cd .. 18 | 19 | printf "Please type the release version: " 20 | read VERSION 21 | echo $VERSION 22 | 23 | # Update the Python connector version 24 | sed -i '' "s/__version__ = \".*\"/__version__ = \"$VERSION\"/g" python/delta_sharing/version.py 25 | git add python/delta_sharing/version.py 26 | # Use --allow-empty so that we can re-run this script even if the Python connector version has been updated 27 | git commit -m "Update Python connector version to $VERSION" --allow-empty 28 | 29 | build/sbt "release skip-tests" 30 | 31 | # Switch to the release commit 32 | git checkout v$VERSION 33 | 34 | # Generate Python artifacts 35 | cd python/ 36 | python3 setup.py sdist bdist_wheel 37 | cd .. 38 | 39 | # Generate the pre-built server package and sign files 40 | build/sbt server/universal:packageBin 41 | cd server/target/universal 42 | gpg --detach-sign --armor --sign delta-sharing-server-$VERSION.zip 43 | gpg --verify delta-sharing-server-$VERSION.zip.asc 44 | sha256sum delta-sharing-server-$VERSION.zip > delta-sharing-server-$VERSION.zip.sha256 45 | sha256sum -c delta-sharing-server-$VERSION.zip.sha256 46 | sha256sum delta-sharing-server-$VERSION.zip.asc > delta-sharing-server-$VERSION.zip.asc.sha256 47 | sha256sum -c delta-sharing-server-$VERSION.zip.asc.sha256 48 | cd - 49 | 50 | # Build the docker image 51 | build/sbt server/docker:publish 52 | 53 | git checkout main 54 | 55 | echo "=== Generated all release artifacts ===" 56 | -------------------------------------------------------------------------------- /dev/spark_release.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e -pipe 2 | 3 | export GPG_TTY=$(tty) 4 | 5 | # Switch to the project root directory 6 | cd $( dirname $0 ) 7 | cd .. 8 | 9 | # Clean up uncommitted files 10 | git clean -fdx 11 | 12 | # Clean existing artifacts 13 | build/sbt clean 14 | 15 | printf "Please type the release version: " 16 | read VERSION 17 | echo $VERSION 18 | 19 | build/sbt "release skip-tests" 20 | 21 | # Switch to the release commit 22 | git tag v$VERSION 23 | 24 | echo "=== Generated all release artifacts ===" 25 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | ## Delta Sharing examples 2 | In this folder there are examples taken from the delta.io/delta-sharing quickstart guide and docs. They are available in Python and can be run if the prerequisites are satisfied. 3 | The profile file from the open, example Delta Sharing Server is downloaded and located in this folder. 
4 | 5 | ### Prerequisites 6 | * For Python examples, Python3.6+, Delta-Sharing Python Connector, PySpark need to be installed, see [the project docs](https://github.com/delta-io/delta-sharing) for details. 7 | 8 | ### Instructions 9 | * To run the example of PySpark in Python run `spark-submit --packages io.delta:delta-sharing-spark_2.12:0.6.2 ./python/quickstart_spark.py` 10 | * To run the example of pandas DataFrame in Python run `python3 ./python/quickstart_pandas.py` -------------------------------------------------------------------------------- /examples/open-datasets.share: -------------------------------------------------------------------------------- 1 | { 2 | "shareCredentialsVersion": 1, 3 | "endpoint": "https://sharing.delta.io/delta-sharing/", 4 | "bearerToken": "faaie590d541265bcab1f2de9813274bf233" 5 | } -------------------------------------------------------------------------------- /examples/python/quickstart_pandas.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (2021) The Delta Lake Project Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import os 18 | import delta_sharing 19 | 20 | # Point to the profile file. It can be a file on the local file system or a file on a remote storage. 21 | profile_file = os.path.dirname(__file__) + "/../open-datasets.share" 22 | 23 | # Create a SharingClient. 24 | client = delta_sharing.SharingClient(profile_file) 25 | 26 | # List all shared tables. 27 | print("########### All Available Tables #############") 28 | print(client.list_all_tables()) 29 | 30 | # Create a url to access a shared table. 31 | # A table path is the profile file path following with `#` and the fully qualified name of a table (`..`). 32 | table_url = profile_file + "#delta_sharing.default.owid-covid-data" 33 | 34 | # Fetch 10 rows from a table and convert it to a Pandas DataFrame. This can be used to read sample data from a table that cannot fit in the memory. 35 | print("########### Loading 10 rows from delta_sharing.default.owid-covid-data as a Pandas DataFrame #############") 36 | data = delta_sharing.load_as_pandas(table_url, limit=10) 37 | 38 | # Print the sample. 39 | print("########### Show the fetched 10 rows #############") 40 | print(data) 41 | 42 | # Load a table as a Pandas DataFrame. This can be used to process tables that can fit in the memory. 43 | print("########### Loading delta_sharing.default.owid-covid-data as a Pandas DataFrame #############") 44 | data = delta_sharing.load_as_pandas(table_url) 45 | 46 | # Do whatever you want to your share data! 47 | print("########### Show Data #############") 48 | print(data[data["iso_code"] == "USA"].head(10)) 49 | -------------------------------------------------------------------------------- /examples/python/quickstart_spark.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (2021) The Delta Lake Project Authors. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import os 18 | import delta_sharing 19 | from pyspark.sql import SparkSession 20 | 21 | # Point to the profile file. It can be a file on the local file system or a file on a remote storage. 22 | profile_file = os.path.dirname(__file__) + "/../open-datasets.share" 23 | 24 | # Create a SharingClient. 25 | client = delta_sharing.SharingClient(profile_file) 26 | 27 | # List all shared tables. 28 | print("########### All Available Tables #############") 29 | print(client.list_all_tables()) 30 | 31 | # Create a url to access a shared table. 32 | # A table path is the profile file path following with `#` and the fully qualified name of a table (`..`). 33 | table_url = profile_file + "#delta_sharing.default.owid-covid-data" 34 | 35 | # Create Spark with delta sharing connector 36 | spark = SparkSession.builder \ 37 | .appName("delta-sharing-demo") \ 38 | .master("local[*]") \ 39 | .getOrCreate() 40 | 41 | # Read data using format "deltaSharing" 42 | print("########### Loading delta_sharing.default.owid-covid-data with Spark #############") 43 | df1 = spark.read.format("deltaSharing").load(table_url) \ 44 | .where("iso_code == 'USA'") \ 45 | .select("iso_code", "total_cases", "human_development_index") \ 46 | .show() 47 | 48 | # Or if the code is running with PySpark, you can use `load_as_spark` to load the table as a Spark DataFrame. 49 | print("########### Loading delta_sharing.default.owid-covid-data with Spark #############") 50 | data = delta_sharing.load_as_spark(table_url) 51 | data.where("iso_code == 'USA'") \ 52 | .select("iso_code", "total_cases", "human_development_index").show() 53 | -------------------------------------------------------------------------------- /images/delta-sharing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/delta-io/delta-sharing/afb25b15da7e3b25cefba6afb619e2ca593c73f0/images/delta-sharing.png -------------------------------------------------------------------------------- /images/the-community.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/delta-io/delta-sharing/afb25b15da7e3b25cefba6afb619e2ca593c73f0/images/the-community.png -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (2021) The Delta Lake Project Authors. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | # 13 | 14 | sbt.version=1.5.0 15 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | ThisBuild / libraryDependencySchemes += "org.scala-lang.modules" %% "scala-xml" % VersionScheme.Always 17 | 18 | resolvers += Resolver.url("artifactory", url("https://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns) 19 | 20 | resolvers += "Typesafe Repository" at "https://repo.typesafe.com/typesafe/releases/" 21 | 22 | resolvers += Resolver.url( 23 | "typesafe sbt-plugins", 24 | url("https://dl.bintray.com/typesafe/sbt-plugins"))(Resolver.ivyStylePatterns) 25 | 26 | addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.13") 27 | 28 | addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "2.3") 29 | 30 | addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0") 31 | 32 | addSbtPlugin("com.thesamet" % "sbt-protoc" % "1.0.6") 33 | 34 | addSbtPlugin("com.github.sbt" % "sbt-native-packager" % "1.9.16") 35 | 36 | addSbtPlugin("com.github.sbt" % "sbt-pgp" % "2.1.2") 37 | 38 | libraryDependencies += "com.thesamet.scalapb" %% "compilerplugin" % "0.11.12" 39 | -------------------------------------------------------------------------------- /python/NOTICE.txt: -------------------------------------------------------------------------------- 1 | ############################################################################################### 2 | # 3 | # Copyright © 2023, 2023, Oracle and/or its affiliates. 4 | # Issue ref #269: OAuth 2.0 Credential Format for Delta Sharing Client 5 | # Code update: 6 | # - protocol.py 7 | # - rest_client.py 8 | # - test_protocol.py 9 | # - test_profile_bearer.json 10 | # - test_profile_oauth2.json 11 | # - test_profile_basic.json 12 | # 13 | ############################################################################################### -------------------------------------------------------------------------------- /python/README.md: -------------------------------------------------------------------------------- 1 | # Delta Sharing 2 | 3 | [Delta Sharing](https://delta.io/sharing) is an open protocol for secure real-time exchange of large datasets, which enables secure data sharing across different computing platforms. 
It lets organizations share access to existing [Delta Lake](https://delta.io) and [Apache Parquet](https://parquet.apache.org) tables with other organizations, who can then directly read the table in Pandas, Apache Spark, or any other software that implements the open protocol. 4 | 5 | This is the Python client library for Delta Sharing, which lets you load shared tables as [pandas](https://pandas.pydata.org/) DataFrames or as [Apache Spark](http://spark.apache.org/) DataFrames if running in PySpark with the [Apache Spark Connector library](https://github.com/delta-io/delta-sharing#set-up-apache-spark). 6 | 7 | ## Installation and Usage 8 | 9 | 1. Install using `pip install delta-sharing`. 10 | a. On some environments, you may also need to [install Rust](https://www.rust-lang.org/tools/install). This is because the `delta-sharing` package depends on the `delta-kernel-rust-sharing-wrapper` package, which does not have a pre-built Python wheel for all environments. As a result, pip will have to build `delta-kernel-rust-sharing-wrapper` from source. 11 | 2. To use the Python Connector, see [the project docs](https://github.com/delta-io/delta-sharing) for details. 12 | 13 | ## Documentation 14 | 15 | This README only contains basic information about the Delta Sharing Python Connector. Please read [the project documentation](https://github.com/delta-io/delta-sharing) for full usage details. 16 | -------------------------------------------------------------------------------- /python/delta-kernel-rust-sharing-wrapper/.gitignore: -------------------------------------------------------------------------------- 1 | Cargo.lock 2 | -------------------------------------------------------------------------------- /python/delta-kernel-rust-sharing-wrapper/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "delta-kernel-rust-sharing-wrapper" 3 | edition = "2021" 4 | license = "Apache-2.0" 5 | version = "0.2.2" 6 | 7 | [lib] 8 | name = "delta_kernel_rust_sharing_wrapper" 9 | # "cdylib" is necessary to produce a shared library for Python to import from. 10 | crate-type = ["cdylib"] 11 | 12 | [dependencies] 13 | arrow = { version = "54.0.0", features = ["pyarrow"] } 14 | delta_kernel = { version = "0.6.1", features = ["cloud", "default-engine"]} 15 | openssl = { version = "0.10", features = ["vendored"] } 16 | url = "2" 17 | 18 | [dependencies.pyo3] 19 | version = "0.23.3" 20 | # "abi3-py38" tells pyo3 (and maturin) to build using the stable ABI with minimum Python version 3.8 21 | features = ["abi3-py38"] 22 | -------------------------------------------------------------------------------- /python/delta-kernel-rust-sharing-wrapper/README.md: -------------------------------------------------------------------------------- 1 | # Delta Kernel Rust Sharing Wrapper 2 | 3 | This adds a thin python bindings via maturin and pyo3 for delta-kernel-rust. 4 | 5 | ## usage 6 | To build the wheel locally you will need to create a venv and set up the deps: 7 | 8 | cd [delta-sharing-root]/python/delta-kernel-rust-sharing-wrapper # if you're not already there 9 | python3 -m venv .venv 10 | source .venv/bin/activate 11 | pip install maturin 12 | pip freeze 13 | maturin develop 14 | 15 | Now it should generate the wheel file, and you'll be able to use the delta_sharing_kernel_rust_wrapper library in python. 
16 | -------------------------------------------------------------------------------- /python/delta-kernel-rust-sharing-wrapper/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin>=1.0,<2.0"] 3 | build-backend = "maturin" 4 | 5 | [tool.maturin] 6 | # "extension-module" tells pyo3 we want to build an extension module (skips linking against libpython.so) 7 | features = ["pyo3/extension-module"] 8 | -------------------------------------------------------------------------------- /python/delta_sharing/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2021 The Delta Lake Project Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from delta_sharing.delta_sharing import SharingClient, load_as_pandas, load_as_spark 18 | from delta_sharing.delta_sharing import get_table_metadata, get_table_protocol, get_table_version 19 | from delta_sharing.delta_sharing import load_table_changes_as_pandas, load_table_changes_as_spark 20 | from delta_sharing.protocol import Share, Schema, Table 21 | from delta_sharing.version import __version__ 22 | 23 | 24 | __all__ = [ 25 | "SharingClient", 26 | "Share", 27 | "Schema", 28 | "Table", 29 | "get_table_metadata", 30 | "get_table_protocol", 31 | "get_table_version", 32 | "load_as_pandas", 33 | "load_as_spark", 34 | "load_table_changes_as_pandas", 35 | "load_table_changes_as_spark", 36 | "__version__", 37 | ] 38 | -------------------------------------------------------------------------------- /python/delta_sharing/_yarl_patch.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2021 The Delta Lake Project Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | try: 17 | from yarl import URL 18 | from yarl._quoting import _Quoter 19 | 20 | # Patch yarl.URL to not replace '%3D' with '=' which would break GCS pre-signed urls 21 | URL._PATH_REQUOTER = _Quoter(safe="@:", protected="/+=") # type: ignore 22 | except: 23 | pass 24 | -------------------------------------------------------------------------------- /python/delta_sharing/converter.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2021 The Delta Lake Project Authors. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | from decimal import Decimal 17 | from typing import Any, Callable, Dict 18 | 19 | import numpy as np 20 | import pandas as pd 21 | 22 | 23 | def _get_dummy_column(schema_type): 24 | """ 25 | Return a dummy column with the data type specified in schema_type. 26 | The dummy column is used to populate the dtype fields in empty tables. 27 | :param schema_type: str or json representing a data type 28 | :return: dummy pandas Series to be inserted into an empty table 29 | """ 30 | if schema_type == "boolean": 31 | return pd.Series([False]) 32 | elif schema_type == "byte": 33 | return pd.Series([0], dtype="int8") 34 | elif schema_type == "short": 35 | return pd.Series([0], dtype="int16") 36 | elif schema_type == "integer": 37 | return pd.Series([0], dtype="int32") 38 | elif schema_type == "long": 39 | return pd.Series([0], dtype="int64") 40 | elif schema_type == "float": 41 | return pd.Series([0], dtype="float32") 42 | elif schema_type == "double": 43 | return pd.Series([0], dtype="float64") 44 | elif isinstance(schema_type, str) and schema_type.startswith("decimal"): 45 | return pd.Series([0], dtype=np.dtype("O")) 46 | elif schema_type == "string": 47 | return pd.Series([0], dtype=np.dtype("O")) 48 | elif schema_type == "date": 49 | return pd.Series([pd.Timestamp(0).date()]) 50 | elif schema_type == "timestamp": 51 | return pd.Series([pd.Timestamp(0)], dtype=np.dtype("datetime64[ns]")) 52 | elif schema_type == "binary": 53 | return pd.Series([0], dtype=np.dtype("O")) 54 | elif isinstance(schema_type, dict) and schema_type["type"] in ("array", "struct", "map"): 55 | return pd.Series([0], dtype=np.dtype("O")) 56 | 57 | raise ValueError(f"Could not parse datatype: {schema_type}") 58 | 59 | 60 | def get_empty_table(schema_json: dict) -> pd.DataFrame: 61 | """ 62 | For empty tables, we use dummy columns from `_get_dummy_column` and then 63 | drop all rows to generate a table with the correct column names and 64 | data types. 65 | :param schema_json: json object representing the table schema 66 | :return: empty table with columns specified in schema_json 67 | """ 68 | assert schema_json["type"] == "struct" 69 | 70 | dummy_table = pd.DataFrame( 71 | {field["name"]: _get_dummy_column(field["type"]) for field in schema_json["fields"]} 72 | ) 73 | return dummy_table.iloc[0:0] 74 | 75 | 76 | def to_converters(schema_json: dict) -> Dict[str, Callable[[str], Any]]: 77 | assert schema_json["type"] == "struct" 78 | 79 | return {field["name"]: to_converter(field["type"]) for field in schema_json["fields"]} 80 | 81 | 82 | def to_converter(schema_type) -> Callable[[str], Any]: 83 | """ 84 | For types that support partitioning, a lambda to parse data into the 85 | corresponding type is returned. For data types that cannot be partitioned 86 | on, we return None. The caller is expected to check if the value is None before using. 
87 | :param schema_type: str or json representing a data type 88 | :return: converter function or None 89 | """ 90 | if schema_type == "boolean": 91 | return lambda x: None if (x is None or x == "") else (x is True or x == "true") 92 | elif schema_type == "byte": 93 | return lambda x: np.nan if (x is None or x == "") else np.int8(x) 94 | elif schema_type == "short": 95 | return lambda x: np.nan if (x is None or x == "") else np.int16(x) 96 | elif schema_type == "integer": 97 | return lambda x: np.nan if (x is None or x == "") else np.int32(x) 98 | elif schema_type == "long": 99 | return lambda x: np.nan if (x is None or x == "") else np.int64(x) 100 | elif schema_type == "float": 101 | return lambda x: np.nan if (x is None or x == "") else np.float32(x) 102 | elif schema_type == "double": 103 | return lambda x: np.nan if (x is None or x == "") else np.float64(x) 104 | elif isinstance(schema_type, str) and schema_type.startswith("decimal"): 105 | return lambda x: None if (x is None or x == "") else Decimal(x) 106 | elif schema_type == "string": 107 | return lambda x: None if (x is None or x == "") else str(x) 108 | elif schema_type == "date": 109 | return lambda x: None if (x is None or x == "") else pd.Timestamp(x).date() 110 | elif schema_type == "timestamp": 111 | return lambda x: pd.NaT if (x is None or x == "") else pd.Timestamp(x) 112 | elif schema_type == "binary": 113 | return None # partition on binary column not supported 114 | elif isinstance(schema_type, dict) and schema_type["type"] in ("array", "struct", "map"): 115 | return None # partition on complex column not supported 116 | 117 | raise ValueError(f"Could not parse datatype: {schema_type}") 118 | -------------------------------------------------------------------------------- /python/delta_sharing/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2021 The Delta Lake Project Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /python/delta_sharing/tests/conftest.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2021 The Delta Lake Project Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | import os 17 | from pathlib import Path 18 | import subprocess 19 | import threading 20 | from typing import Iterator, Optional 21 | 22 | import pytest 23 | from pytest import TempPathFactory 24 | 25 | from delta_sharing.delta_sharing import SharingClient 26 | from delta_sharing.protocol import DeltaSharingProfile 27 | from delta_sharing.rest_client import DataSharingRestClient 28 | 29 | 30 | ENABLE_INTEGRATION = len(os.environ.get("AWS_ACCESS_KEY_ID", "")) > 0 31 | SKIP_MESSAGE = "The integration tests are disabled." 32 | 33 | 34 | @pytest.fixture 35 | def profile_path() -> str: 36 | return os.path.join(os.path.dirname(__file__), "test_profile.json") 37 | 38 | 39 | @pytest.fixture 40 | def profile(profile_path) -> DeltaSharingProfile: 41 | return DeltaSharingProfile.read_from_file(profile_path) 42 | 43 | 44 | @pytest.fixture 45 | def rest_client(profile) -> DataSharingRestClient: 46 | return DataSharingRestClient(profile) 47 | 48 | 49 | @pytest.fixture 50 | def sharing_client(profile) -> SharingClient: 51 | return SharingClient(profile) 52 | 53 | 54 | @pytest.fixture(scope="session", autouse=ENABLE_INTEGRATION) 55 | def test_server(tmp_path_factory: TempPathFactory) -> Iterator[None]: 56 | pid_file: Optional[Path] = None 57 | proc: Optional[subprocess.Popen] = None 58 | try: 59 | if ENABLE_INTEGRATION: 60 | pid_file = tmp_path_factory.getbasetemp() / "delta-sharing-server.pid" 61 | proc = subprocess.Popen( 62 | [ 63 | "./build/sbt", 64 | ( 65 | "server/test:runMain io.delta.sharing.server.TestDeltaSharingServer " 66 | + str(pid_file) 67 | ), 68 | ], 69 | stdout=subprocess.PIPE, 70 | stderr=subprocess.PIPE, 71 | cwd="..", 72 | ) 73 | 74 | ready = threading.Event() 75 | 76 | def wait_for_server() -> None: 77 | for line in proc.stdout: 78 | print(line.decode("utf-8").strip()) 79 | if b"https://127.0.0.1:12345/" in line: 80 | ready.set() 81 | 82 | threading.Thread(target=wait_for_server, daemon=True).start() 83 | 84 | if not ready.wait(timeout=120): 85 | raise TimeoutError("the server didn't start in 120 seconds") 86 | yield 87 | finally: 88 | if ENABLE_INTEGRATION: 89 | if pid_file is not None and pid_file.exists(): 90 | pid = pid_file.read_text() 91 | subprocess.run(["kill", "-9", pid]) 92 | if proc is not None and proc.poll() is None: 93 | proc.kill() 94 | -------------------------------------------------------------------------------- /python/delta_sharing/tests/test_converter.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2021 The Delta Lake Project Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | from datetime import date 17 | from decimal import Decimal 18 | from json import loads 19 | from typing import Any 20 | 21 | import numpy as np 22 | import pandas as pd 23 | import pytest 24 | 25 | from delta_sharing.converter import to_converter, get_empty_table 26 | 27 | 28 | def test_to_converter_boolean(): 29 | converter = to_converter("boolean") 30 | assert converter("true") is True 31 | assert converter("false") is False 32 | assert converter("") is None 33 | 34 | 35 | @pytest.mark.parametrize( 36 | "type_str,expected", 37 | [ 38 | pytest.param("byte", np.int8(1), id="byte"), 39 | pytest.param("short", np.int16(1), id="short"), 40 | pytest.param("integer", np.int32(1), id="integer"), 41 | pytest.param("long", np.int64(1), id="long"), 42 | pytest.param("float", np.float32(1), id="float"), 43 | pytest.param("double", np.float64(1), id="double"), 44 | ], 45 | ) 46 | def test_to_converter_numeric(type_str: str, expected: Any): 47 | converter = to_converter(type_str) 48 | assert converter("1") == expected 49 | assert np.isnan(converter("")) 50 | 51 | 52 | def test_to_converter_decimal(): 53 | converter = to_converter("decimal(10,0)") 54 | assert converter("1") == Decimal(1) 55 | assert converter("") is None 56 | 57 | 58 | def test_to_converter_string(): 59 | converter = to_converter("string") 60 | assert converter("abc") == "abc" 61 | assert converter("") is None 62 | 63 | 64 | def test_to_converter_date(): 65 | converter = to_converter("date") 66 | assert converter("2021-01-01") == date(2021, 1, 1) 67 | assert converter("") is None 68 | 69 | 70 | def test_to_converter_timestamp(): 71 | converter = to_converter("timestamp") 72 | assert converter("2021-04-28 23:36:47.599") == pd.Timestamp("2021-04-28 23:36:47.599") 73 | assert converter("") is pd.NaT 74 | 75 | 76 | def test_get_empty_table(): 77 | schema_string = ( 78 | '{"fields": [' 79 | '{"metadata": {},"name": "a","nullable": true,"type": "long"},' 80 | '{"metadata": {},"name": "b","nullable": true,"type": "string"}' 81 | '],"type":"struct"}' 82 | ) 83 | schema_json = loads(schema_string) 84 | pdf = get_empty_table(schema_json) 85 | assert pdf.empty 86 | assert pdf.columns.values.size == 2 87 | assert pdf.columns.values[0] == "a" 88 | assert pdf.columns.values[1] == "b" 89 | -------------------------------------------------------------------------------- /python/delta_sharing/tests/test_oauth_client.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2021 The Delta Lake Project Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | import pytest 17 | import requests 18 | from requests.models import Response 19 | from unittest.mock import patch 20 | from datetime import datetime 21 | from delta_sharing._internal_auth import OAuthClient 22 | 23 | 24 | class MockServer: 25 | def __init__(self): 26 | self.url = "http://localhost:1080/token" 27 | self.responses = [] 28 | 29 | def add_response(self, status_code, json_data): 30 | response = Response() 31 | response.status_code = status_code 32 | response._content = json_data.encode("utf-8") 33 | self.responses.append(response) 34 | 35 | def get_response(self): 36 | return self.responses.pop(0) 37 | 38 | 39 | @pytest.fixture 40 | def mock_server(): 41 | server = MockServer() 42 | yield server 43 | 44 | 45 | @pytest.mark.parametrize( 46 | "response_data, expected_expires_in, expected_access_token", 47 | [ 48 | # OAuth spec requires 'expires_in' to be an integer, e.g., 3600. 49 | # See https://datatracker.ietf.org/doc/html/rfc6749#section-5.1 50 | # But some token endpoints return `expires_in` as a string e.g., "3600". 51 | # This test ensures the client can handle such cases. 52 | # The test case ensures that we support both integer and string values 53 | # for the 'expires_in' field. 54 | ( 55 | '{"access_token": "test-access-token", "expires_in": 3600, "token_type": "bearer"}', 56 | 3600, 57 | "test-access-token", 58 | ), 59 | ( 60 | '{"access_token": "test-access-token", "expires_in": "3600", "token_type": "bearer"}', 61 | 3600, 62 | "test-access-token", 63 | ), 64 | ], 65 | ) 66 | def test_oauth_client_should_parse_token_response_correctly( 67 | mock_server, response_data, expected_expires_in, expected_access_token 68 | ): 69 | mock_server.add_response(200, response_data) 70 | 71 | with patch("requests.post") as mock_post: 72 | mock_post.side_effect = lambda *args, **kwargs: mock_server.get_response() 73 | oauth_client = OAuthClient( 74 | token_endpoint=mock_server.url, client_id="client-id", client_secret="client-secret" 75 | ) 76 | 77 | start = datetime.now().timestamp() 78 | token = oauth_client.client_credentials() 79 | end = datetime.now().timestamp() 80 | 81 | assert token.access_token == expected_access_token 82 | assert token.expires_in == expected_expires_in 83 | assert int(start) <= token.creation_timestamp 84 | assert token.creation_timestamp <= int(end) 85 | 86 | 87 | def test_oauth_client_should_handle_401_unauthorized_response(mock_server): 88 | mock_server.add_response(401, "Unauthorized") 89 | 90 | with patch("requests.post") as mock_post: 91 | mock_post.side_effect = lambda *args, **kwargs: mock_server.get_response() 92 | oauth_client = OAuthClient( 93 | token_endpoint=mock_server.url, client_id="client-id", client_secret="client-secret" 94 | ) 95 | try: 96 | oauth_client.client_credentials() 97 | except requests.HTTPError as e: 98 | assert e.response.status_code == 401 99 | -------------------------------------------------------------------------------- /python/delta_sharing/tests/test_profile.json: -------------------------------------------------------------------------------- 1 | { 2 | "shareCredentialsVersion": 1, 3 | "endpoint": "https://localhost:12345/delta-sharing/", 4 | "bearerToken": "dapi5e3574ec767ca1548ae5bbed1a2dc04d", 5 | "expirationTime": "2021-11-12T00:12:29.0Z" 6 | } 7 | -------------------------------------------------------------------------------- /python/delta_sharing/tests/test_profile_basic.json: -------------------------------------------------------------------------------- 1 | { 2 | "shareCredentialsVersion": 2, 3 | 
"type": "basic", 4 | "endpoint": "https://localhost/delta-sharing/", 5 | "username": "username", 6 | "password": "password" 7 | } 8 | -------------------------------------------------------------------------------- /python/delta_sharing/tests/test_profile_bearer.json: -------------------------------------------------------------------------------- 1 | { 2 | "shareCredentialsVersion": 2, 3 | "type": "bearer_token", 4 | "endpoint": "https://localhost:12345/delta-sharing/", 5 | "bearerToken": "dapi5e3574ec767ca1548ae5bbed1a2dc04d", 6 | "expirationTime": "2021-11-12T00:12:29.0Z" 7 | } 8 | -------------------------------------------------------------------------------- /python/delta_sharing/tests/test_profile_oauth2.json: -------------------------------------------------------------------------------- 1 | { 2 | "shareCredentialsVersion": 2, 3 | "type": "oauth_client_credentials", 4 | "endpoint": "https://localhost/delta-sharing/", 5 | "tokenEndpoint": "tokenEndpoint", 6 | "clientId": "clientId", 7 | "clientSecret": "clientSecret" 8 | } 9 | -------------------------------------------------------------------------------- /python/delta_sharing/version.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2021 The Delta Lake Project Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | __version__ = "1.1.0" 18 | -------------------------------------------------------------------------------- /python/dev/.gitignore: -------------------------------------------------------------------------------- 1 | pycodestyle*.py 2 | -------------------------------------------------------------------------------- /python/dev/pytest: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Copyright (C) 2021 The Delta Lake Project Authors. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE:-python3}" 20 | 21 | set -o pipefail 22 | set -e 23 | 24 | if ! hash pytest 2> /dev/null; then 25 | echo "The pytest command was not found. Please install 'pytest' Python package." 26 | exit 1 27 | fi 28 | 29 | # The current directory of the script. 30 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 31 | 32 | FWDIR="$( cd "$DIR"/.. 
&& pwd )" 33 | cd "$FWDIR" 34 | 35 | if [ -n "$AWS_ACCESS_KEY_ID" ]; then 36 | logopts=(-o log_cli=true -s) 37 | fi 38 | 39 | # Runs both doctests and unit tests by default, otherwise hands arguments over to pytest. 40 | if [ "$#" = 0 ]; then 41 | # delta_sharing/_yarl_patch.py is a hack to support GCS pre-signed urls. Ask pytest to not 42 | # import it automatically so that we can verify we are importing it on demand. 43 | $PYTHON_EXECUTABLE -m pytest --ignore=delta_sharing/_yarl_patch.py --verbose --showlocals --color=yes --doctest-modules delta_sharing "${logopts[@]}" 44 | else 45 | $PYTHON_EXECUTABLE -m pytest "$@" 46 | fi 47 | -------------------------------------------------------------------------------- /python/dev/reformat: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Copyright (C) 2021 The Delta Lake Project Authors. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # The current directory of the script. 19 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 20 | FWDIR="$( cd "$DIR"/.. && pwd )" 21 | cd "$FWDIR" 22 | 23 | PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE:-python}" 24 | 25 | BLACK_BUILD="$PYTHON_EXECUTABLE -m black" 26 | BLACK_VERSION="21.12b0" 27 | $BLACK_BUILD --version >> /dev/null 2>&1 28 | if [ $? -ne 0 ]; then 29 | echo "The '$BLACK_BUILD' command was not found. Please install Black, for example, via 'pip install black==$BLACK_VERSION'." 30 | exit 1 31 | fi 32 | 33 | $BLACK_BUILD delta_sharing --line-length 100 34 | -------------------------------------------------------------------------------- /python/dev/tox.ini: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2021 The Delta Lake Project Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | [pycodestyle] 18 | ignore=E203,E226,E231,E241,E305,E402,E722,E731,E741,W503,W504 19 | max-line-length=100 20 | exclude=.git/*,docs/build/* 21 | -------------------------------------------------------------------------------- /python/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | # Dependencies. When you update don't forget to update setup.py.
2 | pandas>=2.0.3 3 | pyarrow>=16.1.0 4 | fsspec>=0.7.4 5 | requests 6 | types-requests 7 | aiohttp 8 | yarl>=1.6.0 9 | maturin 10 | 11 | # Linter 12 | mypy==0.981 13 | flake8 14 | 15 | # Code formatter. Only support Python 3.6+ 16 | black==24.3.0 17 | 18 | # Test 19 | pytest 20 | -------------------------------------------------------------------------------- /python/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # 4 | # Copyright (C) 2021 The Delta Lake Project Authors. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | from io import open 19 | from os import path 20 | from setuptools import setup 21 | import sys 22 | 23 | DESCRIPTION = "Python Connector for Delta Sharing" 24 | 25 | this_directory = path.abspath(path.dirname(__file__)) 26 | with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f: 27 | LONG_DESCRIPTION = f.read() 28 | 29 | try: 30 | exec(open('delta_sharing/version.py').read()) 31 | except IOError: 32 | print("Failed to load Delta Sharing version file for packaging.", 33 | file=sys.stderr) 34 | sys.exit(-1) 35 | VERSION = __version__ # noqa 36 | 37 | setup( 38 | name='delta-sharing', 39 | version=VERSION, 40 | packages=[ 41 | 'delta_sharing', 42 | ], 43 | python_requires='>=3.8', 44 | install_requires=[ 45 | 'delta-kernel-rust-sharing-wrapper>=0.2.0', 46 | 'pandas', 47 | 'pyarrow>=16.1.0', 48 | 'fsspec>=0.7.4', 49 | 'requests', 50 | 'aiohttp', 51 | 'dataclasses;python_version<"3.8"', 52 | 'yarl>=1.6.0', 53 | ], 54 | extras_require={ 55 | 's3': ['s3fs'], 56 | 'abfs': ['adlfs'], 57 | 'adl': ['adlfs'], 58 | 'gcs': ['gcsfs'], 59 | 'gs': ['gcsfs'], 60 | }, 61 | author="The Delta Lake Project Authors", 62 | author_email="delta-users@googlegroups.com", 63 | license="Apache-2.0", 64 | description=DESCRIPTION, 65 | long_description=LONG_DESCRIPTION, 66 | long_description_content_type='text/markdown', 67 | url="https://github.com/delta-io/delta-sharing/", 68 | project_urls={ 69 | 'Source': 'https://github.com/delta-io/delta-sharing', 70 | 'Documentation': 'https://github.com/delta-io/delta-sharing', 71 | 'Issues': 'https://github.com/delta-io/delta-sharing/issues' 72 | }, 73 | classifiers=[ 74 | "Development Status :: 5 - Production/Stable", 75 | "Intended Audience :: Developers", 76 | "License :: OSI Approved :: Apache Software License", 77 | "Operating System :: OS Independent", 78 | "Topic :: Software Development :: Libraries :: Python Modules", 79 | 'Programming Language :: Python :: 3.8', 80 | 'Programming Language :: Python :: 3.9', 81 | 'Programming Language :: Python :: 3.10', 82 | 'Programming Language :: Python :: 3.11', 83 | ], 84 | ) 85 | -------------------------------------------------------------------------------- /server/src/main/scala/io/delta/sharing/server/DeltaSharedTableLoader.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project 
Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.server 18 | 19 | import java.util.concurrent.TimeUnit 20 | 21 | import com.google.common.cache.CacheBuilder 22 | import io.delta.standalone.internal.DeltaSharedTable 23 | 24 | import io.delta.sharing.kernel.internal.DeltaSharedTableKernel 25 | import io.delta.sharing.server.config.{ServerConfig, TableConfig} 26 | 27 | 28 | /** 29 | * A class to load Delta tables from `TableConfig`. It also caches the loaded tables internally 30 | * to speed up the loading. 31 | */ 32 | class DeltaSharedTableLoader(serverConfig: ServerConfig) { 33 | private val deltaSharedTableCache = { 34 | CacheBuilder.newBuilder() 35 | .expireAfterAccess(60, TimeUnit.MINUTES) 36 | .maximumSize(serverConfig.deltaTableCacheSize) 37 | .build[String, DeltaSharedTable]() 38 | } 39 | 40 | def loadTable(tableConfig: TableConfig, useKernel: Boolean = false): DeltaSharedTableProtocol = { 41 | if (useKernel) { 42 | return new DeltaSharedTableKernel( 43 | tableConfig, 44 | serverConfig.preSignedUrlTimeoutSeconds, 45 | serverConfig.evaluatePredicateHints, 46 | serverConfig.evaluateJsonPredicateHints, 47 | serverConfig.evaluateJsonPredicateHintsV2, 48 | serverConfig.queryTablePageSizeLimit, 49 | serverConfig.queryTablePageTokenTtlMs, 50 | serverConfig.refreshTokenTtlMs 51 | ) 52 | } 53 | try { 54 | val deltaSharedTable = 55 | deltaSharedTableCache.get( 56 | tableConfig.location, 57 | () => { 58 | new DeltaSharedTable( 59 | tableConfig, 60 | serverConfig.preSignedUrlTimeoutSeconds, 61 | serverConfig.evaluatePredicateHints, 62 | serverConfig.evaluateJsonPredicateHints, 63 | serverConfig.evaluateJsonPredicateHintsV2, 64 | serverConfig.queryTablePageSizeLimit, 65 | serverConfig.queryTablePageTokenTtlMs, 66 | serverConfig.refreshTokenTtlMs 67 | ) 68 | } 69 | ) 70 | if (!serverConfig.stalenessAcceptable) { 71 | deltaSharedTable.update() 72 | } 73 | deltaSharedTable 74 | } catch { 75 | case CausedBy(e: DeltaSharingUnsupportedOperationException) => throw e 76 | case e: Throwable => throw e 77 | } 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /server/src/main/scala/io/delta/sharing/server/DeltaSharedTableProtocol.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package io.delta.sharing.server 18 | 19 | /** 20 | * QueryResult of the query and queryCDF functions, including a version, a responseFormat, and a list 21 | * of actions. 22 | */ 23 | case class QueryResult( 24 | version: Long, 25 | actions: Seq[Object], 26 | responseFormat: String) 27 | trait DeltaSharedTableProtocol { 28 | def getTableVersion(startingTimestamp: Option[String]): Long = -1 29 | 30 | // scalastyle:off argcount 31 | def query( 32 | includeFiles: Boolean, 33 | predicateHints: Seq[String], 34 | jsonPredicateHints: Option[String], 35 | limitHint: Option[Long], 36 | version: Option[Long], 37 | timestamp: Option[String], 38 | startingVersion: Option[Long], 39 | endingVersion: Option[Long], 40 | maxFiles: Option[Int], 41 | pageToken: Option[String], 42 | includeRefreshToken: Boolean, 43 | refreshToken: Option[String], 44 | responseFormatSet: Set[String], 45 | clientReaderFeaturesSet: Set[String], 46 | includeEndStreamAction: Boolean): QueryResult 47 | 48 | def queryCDF( 49 | cdfOptions: Map[String, String], 50 | includeHistoricalMetadata: Boolean = false, 51 | maxFiles: Option[Int], 52 | pageToken: Option[String], 53 | responseFormatSet: Set[String] = Set("parquet"), 54 | includeEndStreamAction: Boolean): QueryResult 55 | 56 | def validateTable(inputFullHistoryShared: Boolean): Unit = {} 57 | 58 | def getPartitionSpecLogicalToPhysicalMap(inputFullHistoryShared: Boolean): Map[String, String] = 59 | Map.empty 60 | } 61 | -------------------------------------------------------------------------------- /server/src/main/scala/io/delta/sharing/server/common/JsonUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.server.common 18 | 19 | import java.io.OutputStream 20 | 21 | import com.fasterxml.jackson.annotation.JsonInclude.Include 22 | import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper} 23 | import com.fasterxml.jackson.module.scala.DefaultScalaModule 24 | import com.fasterxml.jackson.module.scala.experimental.ScalaObjectMapper 25 | import org.json4s.JsonAST.JInt 26 | import org.json4s.jackson.JsonMethods.parse 27 | 28 | object JsonUtils { 29 | /** Used to convert between classes and JSON.
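 * A usage sketch (illustrative, not from the original source): JsonUtils.toJson(Map("numRecords" -> 1)) produces the string {"numRecords":1}, and JsonUtils.fromJson[Map[String, Long]]("""{"numRecords":1}""") parses it back into a Scala Map.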
*/ 30 | lazy val mapper = { 31 | val _mapper = new ObjectMapper with ScalaObjectMapper 32 | _mapper.setSerializationInclusion(Include.NON_ABSENT) 33 | _mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) 34 | _mapper.registerModule(DefaultScalaModule) 35 | _mapper 36 | } 37 | 38 | def toJson[T](obj: T): String = { 39 | mapper.writeValueAsString(obj) 40 | } 41 | 42 | def toJson[T](out: OutputStream, obj: T): Unit = { 43 | mapper.writeValue(out, obj) 44 | } 45 | 46 | def fromJson[T: Manifest](json: String): T = { 47 | mapper.readValue[T](json) 48 | } 49 | 50 | /** Parse the `numRecords` field from the stats json of the AddFile. 51 | * Returns None if the field is not found or if stats is empty. */ 52 | def extractNumRecords(stats: String): Option[Long] = { 53 | if (stats != null && !stats.isEmpty()) { 54 | val numRecordsField = parse(stats) \ "numRecords" 55 | numRecordsField match { 56 | case JInt(numRecords) => Some(numRecords.toLong) 57 | case _ => None 58 | } 59 | } else { 60 | None 61 | } 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /server/src/main/scala/io/delta/sharing/server/common/SnapshotChecker.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.server.common 18 | 19 | import io.delta.sharing.server.DeltaSharingUnsupportedOperationException 20 | import io.delta.sharing.server.common.actions.{ColumnMappingTableFeature, DeletionVectorsTableFeature, DeltaAction} 21 | 22 | 23 | object SnapshotChecker { 24 | 25 | /** 26 | * Assert that all properties present in the table are covered/supported either by clientReaderFeatures 27 | * or flagReaderFeatures. 28 | * 29 | * If a table property is covered by clientReaderFeatures or flagReaderFeatures, it is 30 | * considered supported; if not, check whether the property value is disabled, 31 | * and if it is not, the property is considered unsupported and an error is thrown. 32 | * 33 | * This should fail all parquet format requests on tables with property values not in 34 | * tablePropertiesWithDisabledValues, because clientReaderFeatures is empty for parquet format 35 | * sharing rpcs, which won't filter out any properties in tablePropertiesWithDisabledValues. 36 | * 37 | * @param configuration The "configuration" field of a delta Metadata, which contains the 38 | * properties of the table. 39 | * @param tableVersion The table version associated with the configuration. 40 | * @param clientReaderFeatures The set of reader features supported by the delta sharing client 41 | */ 42 | def assertTableProperties( 43 | configuration: Map[String, String], 44 | tableVersion: Option[Long], 45 | clientReaderFeatures: Set[String]): Unit = { 46 | // An otherwise unsupported table property can be supported if it is part of the client-supported 47 | // table features.
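    // For example (illustrative note, not in the original source): a table that enables column mapping (e.g. via the delta.columnMapping.mode property) is only treated as supported here when the client's reader features include the column mapping table feature; otherwise assertTableProperties throws DeltaSharingUnsupportedOperationException below.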
48 | def propertySupportedByClient(property: String): Boolean = { 49 | if (property == DeltaAction.columnMappingProperty.property) { 50 | ColumnMappingTableFeature.isInSet(clientReaderFeatures) 51 | } else if (property == DeltaAction.deletionVectorsProperty.property) { 52 | DeletionVectorsTableFeature.isInSet(clientReaderFeatures) 53 | } else { 54 | // We should not reject any other table properties as they can contain arbitrary keys. 55 | true 56 | } 57 | } 58 | val unsupportedPropertiesByClient = DeltaAction.tablePropertiesWithDisabledValues 59 | .flatMap { 60 | case pr @ DeltaAction.PropertyAllowedValues(property, allowedValues) 61 | if !propertySupportedByClient(property) => 62 | configuration.get(property).filterNot(allowedValues.contains(_)).map(_ => pr) 63 | case _ => None 64 | } 65 | if (unsupportedPropertiesByClient.nonEmpty) { 66 | throw new DeltaSharingUnsupportedOperationException("Unsupported Delta Table Properties") 67 | } 68 | } 69 | 70 | } 71 | -------------------------------------------------------------------------------- /server/src/main/scala/io/delta/sharing/server/common/TimestampUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.server.common 18 | 19 | // A utility class that supports timestamp operations. 20 | // 21 | // We only support UTC timestamps for data skipping in the ISO 8601 format. 22 | // 23 | // This is also the format in which delta stores the stats ranges in the delta log. 24 | object TimestampUtils { 25 | // The formatter we will use. 26 | private val formatter = java.time.format.DateTimeFormatter.ISO_OFFSET_DATE_TIME 27 | 28 | // We represent the timestamp as java.util.Timestamp in memory. 29 | // 30 | // If the timestamp is not in the correct format, this will throw an exception. 31 | // In the context of predicate evaluation, it will eventually turn off filtering. 32 | def parse(ts: String): java.sql.Timestamp = { 33 | new java.sql.Timestamp(java.time.OffsetDateTime.parse(ts, formatter).toInstant.toEpochMilli) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /server/src/main/scala/io/delta/sharing/server/common/actions/Codec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.server.common.actions 18 | 19 | import java.nio.ByteBuffer 20 | import java.util.UUID 21 | 22 | /** 23 | * Additional codecs not supported by Apache Commons Codecs. 24 | * Used to decode deletion vectors. 25 | * Note: Copied from runtime com.databricks.sql.transaction.tahoe.util.Codec.scala 26 | * */ 27 | object Codec { 28 | 29 | def uuidFromByteBuffer(buffer: ByteBuffer): UUID = { 30 | require(buffer.remaining() >= 16) 31 | val highBits = buffer.getLong 32 | val lowBits = buffer.getLong 33 | new UUID(highBits, lowBits) 34 | } 35 | 36 | /** 37 | * This implements Base85 using the 4 byte block aligned encoding and character set from Z85. 38 | * 39 | * @see https://rfc.zeromq.org/spec/32/ 40 | */ 41 | object Base85Codec { 42 | 43 | final val ENCODE_MAP: Array[Byte] = { 44 | val chars = ('0' to '9') ++ ('a' to 'z') ++ ('A' to 'Z') ++ ".-:+=^!/*?&<>()[]{}@%$#" 45 | chars.map(_.toByte).toArray 46 | } 47 | 48 | lazy val DECODE_MAP: Array[Byte] = { 49 | require(ENCODE_MAP.length - 1 <= Byte.MaxValue) 50 | // The bitmask is the same as largest possible value, so the length of the array must 51 | // be one greater. 52 | val map: Array[Byte] = Array.fill(ASCII_BITMASK + 1)(-1) 53 | for ((b, i) <- ENCODE_MAP.zipWithIndex) { 54 | map(b) = i.toByte 55 | } 56 | map 57 | } 58 | 59 | final val BASE: Long = 85L 60 | final val BASE_2ND_POWER: Long = 7225L // 85^2 61 | final val BASE_3RD_POWER: Long = 614125L // 85^3 62 | final val BASE_4TH_POWER: Long = 52200625L // 85^4 63 | final val ASCII_BITMASK: Int = 0x7F 64 | 65 | // UUIDs always encode into 20 characters. 66 | final val ENCODED_UUID_LENGTH: Int = 20 67 | 68 | /** 69 | * Decode a 16 byte UUID. */ 70 | def decodeUUID(encoded: String): UUID = { 71 | val buffer = decodeBlocks(encoded) 72 | uuidFromByteBuffer(buffer) 73 | } 74 | 75 | /** 76 | * Decode an arbitrary byte array. 77 | * 78 | * Output may contain padding bytes, if the input was not 4 byte aligned. 79 | */ 80 | private def decodeBlocks(encoded: String): ByteBuffer = { 81 | val input = encoded.toCharArray 82 | require(input.length % 5 == 0, "Input should be 5 character aligned.") 83 | val buffer = ByteBuffer.allocate(input.length / 5 * 4) 84 | 85 | // A mechanism to detect invalid characters in the input while decoding, that only has a 86 | // single conditional at the very end, instead of branching for every character. 
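    // Added explanatory note (not in the original source): each group of 5 Z85 characters below is looked up in DECODE_MAP and recombined as sum = d0*85^4 + d1*85^3 + d2*85^2 + d3*85 + d4, which is then written out as one 4-byte big-endian int via putInt; any invalid or non-ASCII character sets bits above ASCII_BITMASK in `canary`, which the final require detects.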
87 | var canary: Int = 0 88 | def decodeInputChar(i: Int): Long = { 89 | val c = input(i) 90 | canary |= c // non-ascii char has bits outside of ASCII_BITMASK 91 | val b = DECODE_MAP(c & ASCII_BITMASK) 92 | canary |= b // invalid char maps to -1, which has bits outside ASCII_BITMASK 93 | b.toLong 94 | } 95 | 96 | var inputIndex = 0 97 | while (buffer.hasRemaining) { 98 | var sum = 0L 99 | sum += decodeInputChar(inputIndex) * BASE_4TH_POWER 100 | sum += decodeInputChar(inputIndex + 1) * BASE_3RD_POWER 101 | sum += decodeInputChar(inputIndex + 2) * BASE_2ND_POWER 102 | sum += decodeInputChar(inputIndex + 3) * BASE 103 | sum += decodeInputChar(inputIndex + 4) 104 | buffer.putInt(sum.toInt) 105 | inputIndex += 5 106 | } 107 | require((canary & ~ASCII_BITMASK) == 0, s"Input is not valid Z85: $encoded") 108 | buffer.rewind() 109 | buffer 110 | } 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /server/src/main/scala/io/delta/sharing/server/common/actions/DeletionVectorDescriptor.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.server.common.actions 18 | 19 | import java.util.UUID 20 | 21 | import com.fasterxml.jackson.annotation.JsonIgnore 22 | import com.fasterxml.jackson.databind.annotation.JsonDeserialize 23 | import org.apache.hadoop.fs.Path 24 | 25 | import io.delta.sharing.server.DeltaSharingUnsupportedOperationException 26 | 27 | /** 28 | * Information about a deletion vector attached to a file action. 29 | * 30 | * Note: This is a stripped down version from runtime providing the minimum 31 | * support for deserializing deletion vector descriptors from the delta log. 32 | */ 33 | case class DeletionVectorDescriptor( 34 | /** 35 | * Indicates how the DV is stored. 36 | * Should be a single letter (see [[pathOrInlineDv]] below.) 37 | */ 38 | storageType: String, 39 | /** 40 | * Contains the actual data that allows accessing the DV. 41 | * 42 | * Three options are currently supported: 43 | * - `storageType="u"` format: `<random prefix - optional><base85 encoded uuid>` 44 | * The deletion vector is stored in a file with a path relative to 45 | * the data directory of this Delta Table, and the file name can be 46 | * reconstructed from the UUID. 47 | * The encoded UUID is always exactly 20 characters, so the random 48 | * prefix length can be determined from the number of characters exceeding 20. 49 | * - `storageType="i"` format: `<base85 encoded bytes>` 50 | * The deletion vector is stored inline in the log. 51 | * - `storageType="p"` format: `<absolute path>` 52 | * The DV is stored in a file with an absolute path given by this 53 | * url. 54 | */ 55 | pathOrInlineDv: String, 56 | /** 57 | * Start of the data for this DV in number of bytes from the beginning of the file it is stored 58 | * in. 59 | * 60 | * Always None when storageType = "i".
61 | */ 62 | @JsonDeserialize(contentAs = classOf[java.lang.Integer]) 63 | offset: Option[Int] = None, 64 | /** Size of the serialized DV in bytes (raw data size, i.e. before base85 encoding). */ 65 | sizeInBytes: Int, 66 | /** Number of rows the DV logically removes from the file. */ 67 | cardinality: Long, 68 | /** 69 | * Transient property that is used to validate DV correctness. 70 | * It is not stored in the log. 71 | */ 72 | @JsonDeserialize(contentAs = classOf[java.lang.Long]) 73 | maxRowIndex: Option[Long] = None) { 74 | 75 | import DeletionVectorDescriptor._ 76 | 77 | @JsonIgnore 78 | def isOnDisk: Boolean = !isInline 79 | 80 | @JsonIgnore 81 | def isInline: Boolean = storageType == INLINE_DV_MARKER 82 | 83 | @JsonIgnore 84 | def isRelative: Boolean = storageType == UUID_DV_MARKER 85 | 86 | @JsonIgnore 87 | def isAbsolute: Boolean = storageType == PATH_DV_MARKER 88 | 89 | def absolutePath(tableLocation: Path): Path = { 90 | require(isOnDisk, "Can't get a path for an inline deletion vector") 91 | storageType match { 92 | case UUID_DV_MARKER => 93 | // If the file was written with a random prefix, we have to extract that, 94 | // before decoding the UUID. 95 | val randomPrefixLength = pathOrInlineDv.length - Codec.Base85Codec.ENCODED_UUID_LENGTH 96 | val (randomPrefix, encodedUuid) = pathOrInlineDv.splitAt(randomPrefixLength) 97 | val uuid = Codec.Base85Codec.decodeUUID(encodedUuid) 98 | assembleDeletionVectorPath(tableLocation, uuid, randomPrefix) 99 | case PATH_DV_MARKER => 100 | throw new DeltaSharingUnsupportedOperationException( 101 | "Table contains absolute paths and cannot be shared through delta sharing") 102 | case _ => 103 | throw new DeltaSharingUnsupportedOperationException( 104 | s"DELTA_CANNOT_RECONSTRUCT_PATH_FROM_URI_$pathOrInlineDv") 105 | } 106 | } 107 | } 108 | 109 | object DeletionVectorDescriptor { 110 | /** String that is used in all file names generated by deletion vector store */ 111 | val DELETION_VECTOR_FILE_NAME_CORE = "deletion_vector" 112 | // Markers to separate different kinds of DV storage. 113 | final val PATH_DV_MARKER: String = "p" 114 | final val INLINE_DV_MARKER: String = "i" 115 | final val UUID_DV_MARKER: String = "u" 116 | 117 | def apply(dv: DeletionVectorDescriptor): DeletionVectorDescriptor = { 118 | if (dv == null) { 119 | return null 120 | } 121 | DeletionVectorDescriptor( 122 | storageType = dv.storageType, 123 | pathOrInlineDv = dv.pathOrInlineDv, 124 | offset = dv.offset, 125 | sizeInBytes = dv.sizeInBytes, 126 | cardinality = dv.cardinality 127 | ) 128 | } 129 | 130 | /** 131 | * Return the unique path under `parentPath` that is based on `id`. 132 | * 133 | * Optionally, prepend a `prefix` to the name. 134 | */ 135 | def assembleDeletionVectorPath(targetParentPath: Path, id: UUID, prefix: String = ""): Path = { 136 | val fileName = s"${DELETION_VECTOR_FILE_NAME_CORE}_${id}.bin" 137 | if (prefix.nonEmpty) { 138 | new Path(new Path(targetParentPath, prefix), fileName) 139 | } else { 140 | new Path(targetParentPath, fileName) 141 | } 142 | } 143 | } 144 | -------------------------------------------------------------------------------- /server/src/main/scala/io/delta/sharing/server/exceptions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.server 18 | 19 | /** 20 | * A special exception for invalid requests happening in Delta Sharing Server. We define a special 21 | * class rather than reusing `IllegalArgumentException` so that we can ensure that the message in 22 | * `IllegalArgumentException` thrown from other libraries won't be returned to users. 23 | * 24 | * @note `message` will be in the response. Please make sure it doesn't contain any sensitive info. 25 | */ 26 | class DeltaSharingIllegalArgumentException(message: String) 27 | extends IllegalArgumentException(message) 28 | 29 | /** 30 | * A special exception for resource not found in Delta Sharing Server. We define a special 31 | * class rather than reusing `NoSuchElementException` so that we can ensure that the message in 32 | * `NoSuchElementException` thrown from other libraries won't be returned to users. 33 | * 34 | * @note `message` will be in the response. Please make sure it doesn't contain any sensitive info. 35 | */ 36 | class DeltaSharingNoSuchElementException(message: String) 37 | extends NoSuchElementException(message) 38 | 39 | /** 40 | * A special exception for invalid requests happening in Delta Sharing Server. We define a special 41 | * class rather than reusing `UnsupportedOperationException` so that we can ensure that the message 42 | * in `UnsupportedOperationException` thrown from other libraries won't be returned to users. 43 | * 44 | * @note `message` will be in the response. Please make sure it doesn't contain any sensitive info. 45 | */ 46 | class DeltaSharingUnsupportedOperationException(message: String) 47 | extends UnsupportedOperationException(message) 48 | 49 | /** 50 | * A special exception that wraps an unhandled exception when processing a request. 51 | * `DeltaInternalException` should never be exposed to users as an unhandled exception may contain 52 | * sensitive information. 53 | */ 54 | class DeltaInternalException(e: Throwable) extends RuntimeException(e) 55 | 56 | object ErrorStrings { 57 | def multipleParametersSetErrorMsg(params: Seq[String]): String = { 58 | s"Please only provide one of: ${params.mkString(",")}" 59 | } 60 | } 61 | 62 | object CausedBy { 63 | def unapply(e: Throwable): Option[Throwable] = Option(e.getCause) 64 | } 65 | -------------------------------------------------------------------------------- /server/src/main/scala/io/delta/standalone/internal/DeltaCDFErrors.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.standalone.internal 18 | 19 | class DeltaCDFIllegalArgumentException(message: String) 20 | extends IllegalArgumentException(message) 21 | 22 | object DeltaCDFErrors { 23 | def multipleCDFBoundary(position: String): Throwable = { 24 | new DeltaCDFIllegalArgumentException(s"Multiple $position arguments provided for CDF read. " + 25 | s"Please provide one of either ${position}Timestamp or ${position}Version." 26 | ) 27 | } 28 | 29 | def noStartVersionForCDF: Throwable = { 30 | new DeltaCDFIllegalArgumentException("No startingVersion or startingTimestamp provided for " + 31 | "CDF read.") 32 | } 33 | 34 | def startVersionAfterLatestVersion(start: Long, latest: Long): Throwable = { 35 | new DeltaCDFIllegalArgumentException(s"Provided Start version($start) for reading change " + 36 | "data is invalid. Start version cannot be greater than the latest version of the " + 37 | s"table($latest)." 38 | ) 39 | } 40 | 41 | def endVersionAfterLatestVersion(end: Long, latest: Long): Throwable = { 42 | new DeltaCDFIllegalArgumentException(s"Provided end version($end) is invalid. End version " + 43 | s"cannot be greater than the latest version of the table($latest)." 44 | ) 45 | } 46 | 47 | def endBeforeStartVersionInCDF(start: Long, end: Long): Throwable = { 48 | new DeltaCDFIllegalArgumentException( 49 | s"CDF range from start $start to end $end was invalid. End cannot be before start." 50 | ) 51 | } 52 | 53 | def invalidTimestamp(field: String, message: String): Throwable = { 54 | new DeltaCDFIllegalArgumentException(s"Invalid $field: $message") 55 | } 56 | 57 | def changeDataNotRecordedException(version: Long, start: Long, end: Long): Throwable = { 58 | new DeltaCDFIllegalArgumentException(s"Error getting change data for range [$start, $end] " + 59 | s"as change data was not recorded for version [$version]" 60 | ) 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /server/src/main/scala/io/delta/standalone/internal/DeltaDataSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package io.delta.standalone.internal 17 | 18 | /** DeltaDataSource constants. 
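 * These are the option keys a client supplies for Change Data Feed reads; for example (illustrative), a CDF query may pass startingVersion=1 and endingVersion=5, or use startingTimestamp/endingTimestamp instead.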
*/ 19 | object DeltaDataSource { 20 | // Constants for cdf parameters 21 | final val CDF_START_VERSION_KEY = "startingVersion" 22 | 23 | final val CDF_START_TIMESTAMP_KEY = "startingTimestamp" 24 | 25 | final val CDF_END_VERSION_KEY = "endingVersion" 26 | 27 | final val CDF_END_TIMESTAMP_KEY = "endingTimestamp" 28 | } 29 | -------------------------------------------------------------------------------- /server/src/main/scala/io/delta/standalone/internal/DeltaSharingHistoryManager.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | // Putting these classes in this package to access Delta Standalone internal APIs 18 | package io.delta.standalone.internal 19 | 20 | import java.sql.Timestamp 21 | import java.time.OffsetDateTime 22 | import java.time.ZoneOffset 23 | import java.time.format.DateTimeFormatter 24 | 25 | import io.delta.standalone.internal.actions.CommitMarker 26 | import io.delta.standalone.internal.util.FileNames 27 | import io.delta.storage.LogStore 28 | import org.apache.hadoop.conf.Configuration 29 | import org.apache.hadoop.fs.Path 30 | import scala.collection.JavaConverters._ 31 | 32 | object DeltaSharingHistoryManager { 33 | /** 34 | * DeltaHistoryManager.getCommits is not a public method, so we need to make local copies here. 35 | * When calling getCommits, the initial few timestamp values may be wrong because they are not 36 | * properly monotonized. getCommitsSafe uses this to update the start value 37 | * far behind the first timestamp they care about to get correct values. 38 | * TODO(https://github.com/delta-io/delta-sharing/issues/144): Cleans this up once 39 | * DeltaHistoryManager.getCommits is public 40 | */ 41 | private val POTENTIALLY_UNMONOTONIZED_TIMESTAMPS = 100 42 | 43 | // Correct timestamp values are only available through getCommits(). Commit 44 | // info timestamps are wrong, and file modification times are wrong because they need to be 45 | // monotonized first. This just performs a list (we don't read the contents of the files in 46 | // getCommits()) so it's not a big deal. 47 | private[internal] def getTimestampsByVersion( 48 | logStore: LogStore, 49 | logPath: Path, 50 | start: Long, 51 | end: Long, 52 | conf: Configuration): Map[Long, Timestamp] = { 53 | val monotonizationStart = 54 | Seq(start - POTENTIALLY_UNMONOTONIZED_TIMESTAMPS, 0).max 55 | val commits = getCommits(logStore, logPath, monotonizationStart, end, conf) 56 | 57 | // Note that the timestamps come from filesystem modification timestamps, so they're 58 | // milliseconds since epoch and we don't need to deal with timezones. 
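    // Worked example of the monotonization performed in getCommits (added note, not in the original source): if commit 5 has modification time 1000 and commit 6 has 999, monotonizeCommitTimestamps bumps commit 6 to 1001 so that version order and timestamp order agree.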
59 | commits.map(f => (f.version -> new Timestamp(f.timestamp))).toMap 60 | } 61 | 62 | // Convert timestamp string to Timestamp 63 | private[internal] def getTimestamp(paramName: String, timeStampStr: String): Timestamp = { 64 | try { 65 | new Timestamp(OffsetDateTime.parse( 66 | timeStampStr, DateTimeFormatter.ISO_OFFSET_DATE_TIME).toInstant.toEpochMilli) 67 | } catch { 68 | case e: java.time.format.DateTimeParseException => 69 | throw DeltaCDFErrors.invalidTimestamp(paramName, e.getMessage) 70 | } 71 | } 72 | 73 | /** 74 | * Returns the commit version and timestamps of all commits in `[start, end)`. If `end` is not 75 | * specified, will return all commits that exist after `start`. Will guarantee that the commits 76 | * returned will have both monotonically increasing versions as well as timestamps. 77 | * Exposed for tests. 78 | */ 79 | private def getCommits( 80 | logStore: LogStore, 81 | logPath: Path, 82 | start: Long, 83 | end: Long, 84 | conf: Configuration): Array[Commit] = { 85 | val commits = logStore 86 | .listFrom(FileNames.deltaFile(logPath, start), conf) 87 | .asScala 88 | .filter(f => FileNames.isDeltaFile(f.getPath)) 89 | .map { fileStatus => 90 | Commit(FileNames.deltaVersion(fileStatus.getPath), fileStatus.getModificationTime) 91 | } 92 | .takeWhile(_.version < end) 93 | 94 | monotonizeCommitTimestamps(commits.toArray) 95 | } 96 | 97 | /** 98 | * Makes sure that the commit timestamps are monotonically increasing with respect to commit 99 | * versions. Requires the input commits to be sorted by the commit version. 100 | */ 101 | private def monotonizeCommitTimestamps[T <: CommitMarker]( 102 | commits: Array[T]): Array[T] = { 103 | var i = 0 104 | val length = commits.length 105 | while (i < length - 1) { 106 | val prevTimestamp = commits(i).getTimestamp 107 | assert(commits(i).getVersion < commits(i + 1).getVersion, "Unordered commits provided.") 108 | if (prevTimestamp >= commits(i + 1).getTimestamp) { 109 | commits(i + 1) = commits(i + 1).withTimestamp(prevTimestamp + 1).asInstanceOf[T] 110 | } 111 | i += 1 112 | } 113 | commits 114 | } 115 | 116 | /** A helper class to represent the timestamp and version of a commit. */ 117 | case class Commit(version: Long, timestamp: Long) extends CommitMarker { 118 | override def withTimestamp(timestamp: Long): Commit = this.copy(timestamp = timestamp) 119 | 120 | override def getTimestamp: Long = timestamp 121 | 122 | override def getVersion: Long = version 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /server/src/main/scala/io/delta/standalone/internal/PartitionFilterUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package io.delta.standalone.internal 18 | 19 | import scala.util.Try 20 | import scala.util.control.NonFatal 21 | 22 | import io.delta.standalone.internal.actions.AddFile 23 | import org.apache.spark.sql.Encoders 24 | import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} 25 | import org.apache.spark.sql.catalyst.analysis.{caseInsensitiveResolution, UnresolvedAttribute} 26 | import org.apache.spark.sql.catalyst.expressions._ 27 | import org.apache.spark.sql.execution.SparkSqlParser 28 | import org.apache.spark.sql.internal.SQLConf 29 | import org.apache.spark.sql.types.{DataType, StructField, StructType} 30 | import org.slf4j.LoggerFactory 31 | 32 | object PartitionFilterUtils { 33 | private val logger = LoggerFactory.getLogger(this.getClass) 34 | 35 | private lazy val sqlParser = new SparkSqlParser(new SQLConf) 36 | 37 | def evaluatePredicate( 38 | schemaString: String, 39 | partitionColumns: Seq[String], 40 | partitionFilters: Seq[String], 41 | addFiles: Seq[(AddFile, Int)]): Seq[(AddFile, Int)] = { 42 | try { 43 | val tableSchema = DataType.fromJson(schemaString).asInstanceOf[StructType] 44 | val partitionSchema = new StructType(partitionColumns.map(c => tableSchema(c)).toArray) 45 | val addSchema = Encoders.product[AddFile].schema 46 | val attrs = 47 | addSchema.map(f => AttributeReference(f.name, f.dataType, f.nullable, f.metadata)()) 48 | val exprs = 49 | rewritePartitionFilters( 50 | partitionSchema, 51 | attrs, 52 | partitionFilters.flatMap { f => 53 | Try(sqlParser.parseExpression(f)).toOption 54 | }.filter(f => isSupportedExpression(f, partitionSchema)) 55 | ) 56 | if (exprs.isEmpty) { 57 | addFiles 58 | } else { 59 | val predicate = InterpretedPredicate.create(exprs.reduce(And), attrs) 60 | predicate.initialize(0) 61 | addFiles.filter { 62 | case (addFile, _) => 63 | val converter = CatalystTypeConverters.createToCatalystConverter(addSchema) 64 | predicate.eval(converter(addFile).asInstanceOf[InternalRow]) 65 | } 66 | } 67 | } catch { 68 | case NonFatal(e) => 69 | logger.error(e.getMessage, e) 70 | // Fail to evaluate the filters. Return all files as a fallback. 
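// A hypothetical usage sketch of evaluatePredicate, mirroring PartitionFilterUtilsSuite further
// down in this repo; the file names and partition values are invented, and the definition is
// never invoked from this catch block.
def partitionFilterSketch(): Seq[(AddFile, Int)] = {
  val schemaJson = StructType.fromDDL("c1 INT, c2 INT").json
  val files = Seq(
    AddFile("part-0", Map("c2" -> "0"), 1, 1, true),
    AddFile("part-1", Map("c2" -> "1"), 1, 1, true)
  ).zipWithIndex
  // Keeps only the files whose partition value satisfies the pushed-down filter; if evaluation
  // fails, the enclosing catch falls back to returning every file, as the line right below does.
  evaluatePredicate(schemaJson, Seq("c2"), Seq("c2 = 0"), files)
}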
71 | addFiles 72 | } 73 | } 74 | 75 | private def isSupportedExpression(e: Expression, partitionSchema: StructType): Boolean = { 76 | def isPartitionColumOrConstant(e: Expression): Boolean = { 77 | e match { 78 | case _: Literal => true 79 | case u: UnresolvedAttribute if u.nameParts.size == 1 => 80 | val unquoted = u.name.stripPrefix("`").stripSuffix("`") 81 | partitionSchema.exists(part => caseInsensitiveResolution(unquoted, part.name)) 82 | case c: Cast => isPartitionColumOrConstant(c.child) 83 | case _ => false 84 | } 85 | } 86 | 87 | e match { 88 | case EqualTo(left, right) 89 | if isPartitionColumOrConstant(left) && isPartitionColumOrConstant(right) => 90 | true 91 | case GreaterThan(left, right) 92 | if isPartitionColumOrConstant(left) && isPartitionColumOrConstant(right) => 93 | true 94 | case LessThan(left, right) 95 | if isPartitionColumOrConstant(left) && isPartitionColumOrConstant(right) => 96 | true 97 | case GreaterThanOrEqual(left, right) 98 | if isPartitionColumOrConstant(left) && isPartitionColumOrConstant(right) => 99 | true 100 | case LessThanOrEqual(left, right) 101 | if isPartitionColumOrConstant(left) && isPartitionColumOrConstant(right) => 102 | true 103 | case EqualNullSafe(left, right) 104 | if isPartitionColumOrConstant(left) && isPartitionColumOrConstant(right) => 105 | true 106 | case IsNull(e) if isPartitionColumOrConstant(e) => 107 | true 108 | case IsNotNull(e) if isPartitionColumOrConstant(e) => 109 | true 110 | case Not(e) if isSupportedExpression(e, partitionSchema) => 111 | true 112 | case _ => false 113 | } 114 | } 115 | 116 | private def rewritePartitionFilters( 117 | partitionSchema: StructType, 118 | attrs: Seq[Attribute], 119 | partitionFilters: Seq[Expression]): Seq[Expression] = { 120 | val partitionValuesAttr = attrs.find(_.name == "partitionValues").head 121 | partitionFilters.map(_.transformUp { 122 | case a: Attribute => 123 | // If we have a special column name, e.g. `a.a`, then an UnresolvedAttribute returns 124 | // the column name as '`a.a`' instead of 'a.a', therefore we need to strip the backticks. 125 | val unquoted = a.name.stripPrefix("`").stripSuffix("`") 126 | val partitionCol = partitionSchema.find { field => field.name == unquoted } 127 | partitionCol match { 128 | case Some(StructField(name, dataType, _, _)) => 129 | Cast( 130 | ExtractValue( 131 | partitionValuesAttr, 132 | Literal(name), 133 | org.apache.spark.sql.catalyst.analysis.caseInsensitiveResolution), 134 | dataType) 135 | case None => 136 | // This should not be able to happen, but the case was present in the original code so 137 | // we kept it to be safe. 138 | UnresolvedAttribute(Seq("partitionValues", a.name)) 139 | } 140 | }) 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /server/src/main/scala/io/delta/standalone/internal/model.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.standalone.internal 18 | 19 | import com.fasterxml.jackson.databind.annotation.JsonDeserialize 20 | import io.delta.standalone.internal.actions.{ 21 | Format => DeltaFormat, 22 | Metadata => DeltaMetadata, 23 | Protocol => DeltaProtocol, 24 | SingleAction => DeltaSingleAction 25 | } 26 | 27 | /** 28 | * A copy of delta Metadata class, removed schema/dataSchema/partitionSchema, is serialized as 29 | * json response for a delta sharing rpc. 30 | */ 31 | case class DeltaMetadataCopy( 32 | id: String, 33 | name: String, 34 | description: String, 35 | format: DeltaFormat, 36 | schemaString: String, 37 | partitionColumns: Seq[String], 38 | configuration: Map[String, String], 39 | @JsonDeserialize(contentAs = classOf[java.lang.Long]) 40 | createdTime: Option[Long] 41 | ) 42 | 43 | object DeltaMetadataCopy { 44 | def apply(metadata: DeltaMetadata): DeltaMetadataCopy = { 45 | DeltaMetadataCopy( 46 | id = metadata.id, 47 | name = metadata.name, 48 | description = metadata.description, 49 | format = metadata.format, 50 | schemaString = metadata.schemaString, 51 | partitionColumns = metadata.partitionColumns, 52 | configuration = metadata.configuration, 53 | createdTime = metadata.createdTime 54 | ) 55 | } 56 | } 57 | /** 58 | * Actions defined to use in the response for delta format sharing. 59 | */ 60 | 61 | sealed trait DeltaResponseAction { 62 | /** Turn this object to the [[DeltaFormatSingleAction]] wrap object. */ 63 | def wrap: DeltaResponseSingleAction 64 | } 65 | 66 | /** 67 | * DeltaResponseProtocol which is part of the delta Protocol. 68 | */ 69 | case class DeltaResponseProtocol(deltaProtocol: DeltaProtocol) extends DeltaResponseAction { 70 | override def wrap: DeltaResponseSingleAction = DeltaResponseSingleAction(protocol = this) 71 | } 72 | 73 | /** 74 | * DeltaResponseFileAction used in delta sharing protocol. It wraps a delta action, 75 | * and adds 4 delta sharing related fields: id/version/timestamp/expirationTimestamp. 76 | * - id: used to uniquely identify a file, and in idToUrl mapping for executor to get 77 | * presigned url. 78 | * - version/timestamp: the version and timestamp of the commit, used to generate faked delta 79 | * log file on the client side. 80 | * - expirationTimestamp: indicate when the presigned url is going to expire and need a 81 | * refresh. 82 | * Suggest to redact the tags field before returning if there are sensitive info. 83 | */ 84 | case class DeltaResponseFileAction( 85 | id: String, 86 | version: java.lang.Long = null, 87 | timestamp: java.lang.Long = null, 88 | expirationTimestamp: Long, 89 | deltaSingleAction: DeltaSingleAction) extends DeltaResponseAction { 90 | override def wrap: DeltaResponseSingleAction = DeltaResponseSingleAction(file = this) 91 | } 92 | 93 | /** 94 | * DeltaResponseMetadata used in delta sharing protocol, it wraps the Metadata in delta. 95 | * Adding 1 delta sharing related field: version. 96 | */ 97 | case class DeltaResponseMetadata( 98 | version: java.lang.Long = null, 99 | deltaMetadata: DeltaMetadataCopy) extends DeltaResponseAction { 100 | override def wrap: DeltaResponseSingleAction = DeltaResponseSingleAction(metaData = this) 101 | } 102 | 103 | /** A serialization helper to create a common action envelope. 
*/ 104 | case class DeltaResponseSingleAction( 105 | file: DeltaResponseFileAction = null, 106 | metaData: DeltaResponseMetadata = null, 107 | protocol: DeltaResponseProtocol = null) { 108 | 109 | def unwrap: DeltaResponseAction = { 110 | if (file != null) { 111 | file 112 | } else if (metaData != null) { 113 | metaData 114 | } else if (protocol != null) { 115 | protocol 116 | } else { 117 | null 118 | } 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /server/src/test/resources/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | fs.azure.account.key.deltasharingtest.blob.core.windows.net 7 | ${azure.account.key} 8 | 9 | 10 | 11 | fs.azure.account.auth.type.deltasharingtest.dfs.core.windows.net 12 | SharedKey 13 | 14 | 15 | 16 | 17 | fs.azure.account.key.deltasharingtest.dfs.core.windows.net 18 | ${azure.account.key} 19 | 20 | 21 | -------------------------------------------------------------------------------- /server/src/test/scala/io/delta/sharing/server/CloudFileSignerSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.server 18 | 19 | import org.apache.hadoop.fs.Path 20 | import org.scalatest.FunSuite 21 | 22 | import io.delta.sharing.server.common.GCSFileSigner 23 | 24 | class CloudFileSignerSuite extends FunSuite { 25 | 26 | test("GCSFileSigner.getBucketAndObjectNames") { 27 | assert(GCSFileSigner.getBucketAndObjectNames(new Path("gs://delta-sharing-test/foo")) 28 | == ("delta-sharing-test", "foo")) 29 | assert(GCSFileSigner.getBucketAndObjectNames(new Path("gs://delta_sharing_test/foo")) 30 | == ("delta_sharing_test", "foo")) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /server/src/test/scala/io/delta/sharing/server/TestDeltaSharingServer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package io.delta.sharing.server 18 | 19 | import java.io.File 20 | import java.lang.management.ManagementFactory 21 | 22 | import org.apache.commons.io.FileUtils 23 | 24 | import io.delta.sharing.server.config.ServerConfig 25 | 26 | /** 27 | * This is a special test class for the client projects to test end-to-end experience. It will 28 | * generate configs for testing and start the server. 29 | */ 30 | object TestDeltaSharingServer { 31 | def main(args: Array[String]): Unit = { 32 | val pid = ManagementFactory.getRuntimeMXBean().getName().split("@")(0) 33 | val pidFile = new File(args(0)) 34 | // scalastyle:off println 35 | println(s"Writing pid $pid to $pidFile") 36 | // scalastyle:off on 37 | FileUtils.writeStringToFile(pidFile, pid) 38 | if (sys.env.get("AWS_ACCESS_KEY_ID").exists(_.length > 0)) { 39 | val serverConfigPath = TestResource.setupTestTables().getCanonicalPath 40 | val serverConfig = ServerConfig.load(serverConfigPath) 41 | println("serverConfigPath=" + serverConfigPath) 42 | println("serverConfig=" + serverConfig) 43 | val server = DeltaSharingService.start(serverConfig) 44 | // Run at most 420 seconds and exit. This is to ensure we can exit even if the parent process 45 | // hits any error. 46 | Thread.sleep(420000) 47 | server.stop() 48 | } else { 49 | throw new IllegalArgumentException("Cannot find AWS_ACCESS_KEY_ID in sys.env") 50 | } 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /server/src/test/scala/io/delta/standalone/internal/PartitionFilterUtilsSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package io.delta.standalone.internal 18 | 19 | import io.delta.standalone.internal.actions.AddFile 20 | import org.apache.spark.sql.types.{IntegerType, StructField, StructType} 21 | import org.scalatest.FunSuite 22 | 23 | class PartitionFilterUtilsSuite extends FunSuite { 24 | 25 | import PartitionFilterUtils._ 26 | 27 | test("evaluatePredicate") { 28 | val schema = StructType.fromDDL("c1 INT, c2 INT").json 29 | val add1 = AddFile("foo1", Map("c2" -> "0"), 1, 1, true) 30 | val add2 = AddFile("foo2", Map("c2" -> "1"), 1, 1, true) 31 | val addFiles = Seq(add1, add2).zipWithIndex 32 | assert(Seq(addFiles(0)) == evaluatePredicate(schema, "c2" :: Nil, "c2 = 0" :: Nil, addFiles)) 33 | assert(Seq(addFiles(1)) == evaluatePredicate(schema, "c2" :: Nil, "c2 = 1" :: Nil, addFiles)) 34 | assert(Seq(addFiles(1)) == evaluatePredicate(schema, "c2" :: Nil, "c2 > 0" :: Nil, addFiles)) 35 | assert(Seq(addFiles(0)) == evaluatePredicate(schema, "c2" :: Nil, "c2 < 1" :: Nil, addFiles)) 36 | assert(Seq(addFiles(1)) == evaluatePredicate(schema, "c2" :: Nil, "c2 >= 1" :: Nil, addFiles)) 37 | assert(Seq(addFiles(0)) == evaluatePredicate(schema, "c2" :: Nil, "c2 <= 0" :: Nil, addFiles)) 38 | assert(Seq(addFiles(1)) == evaluatePredicate(schema, "c2" :: Nil, "c2 <> 0" :: Nil, addFiles)) 39 | assert(Seq(addFiles(0)) == evaluatePredicate(schema, "c2" :: Nil, "c2 <> 1" :: Nil, addFiles)) 40 | assert(Nil == evaluatePredicate(schema, "c2" :: Nil, "c2 is null" :: Nil, addFiles)) 41 | assert(addFiles == evaluatePredicate(schema, "c2" :: Nil, "c2 is not null" :: Nil, addFiles)) 42 | assert(addFiles == evaluatePredicate(schema, "c2" :: Nil, "c2 is not null" :: Nil, addFiles)) 43 | 44 | // Unsupported expression 45 | assert(addFiles == evaluatePredicate(schema, "c2" :: Nil, "c2 = 0 + 1" :: Nil, addFiles)) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /server/src/test/scala/io/delta/standalone/internal/TimestampUtilsSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | // scalastyle:off println 18 | 19 | package io.delta.standalone.internal 20 | 21 | import org.scalatest.FunSuite 22 | 23 | import io.delta.sharing.server.common.TimestampUtils 24 | 25 | class TimestampUtilsSuite extends FunSuite { 26 | test("basic test") { 27 | // Only ISO 8601 is supported. 28 | TimestampUtils.parse("2023-06-10T00:00:00.000Z") 29 | TimestampUtils.parse("2023-06-10T01:02:13Z") 30 | 31 | // Other formats will trigger errors. 
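// A minimal, hypothetical sketch of the ISO-8601 conversion being tested, consistent with
// DeltaSharingHistoryManager.getTimestamp elsewhere in this repo; the helper name is invented
// and nothing here is invoked.
def toSqlTimestampSketch(iso: String): java.sql.Timestamp =
  new java.sql.Timestamp(
    java.time.OffsetDateTime
      .parse(iso, java.time.format.DateTimeFormatter.ISO_OFFSET_DATE_TIME)
      .toInstant
      .toEpochMilli)
// toSqlTimestampSketch("2023-06-10T00:00:00.000Z")  // 1686355200000 ms since epoch
// toSqlTimestampSketch("2023-06-10 00:00:00")       // DateTimeParseException, as asserted below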
32 | assert(intercept[java.time.format.DateTimeParseException] { 33 | TimestampUtils.parse("2023-06-10 00:00:00.234") 34 | }.getMessage.contains("could not be parsed")) 35 | 36 | assert(intercept[java.time.format.DateTimeParseException] { 37 | TimestampUtils.parse("2023-06-10") 38 | }.getMessage.contains("could not be parsed")) 39 | 40 | assert(intercept[java.time.format.DateTimeParseException] { 41 | TimestampUtils.parse("2023-06-10 00:00:00") 42 | }.getMessage.contains("could not be parsed")) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /server/src/universal/conf/delta-sharing-server.yaml.template: -------------------------------------------------------------------------------- 1 | # The format version of this config file 2 | version: 1 3 | # Config shares/schemas/tables to share 4 | shares: 5 | - name: "share1" 6 | schemas: 7 | - name: "schema1" 8 | tables: 9 | - name: "table1" 10 | # S3. See https://github.com/delta-io/delta-sharing#s3 for how to config the credentials 11 | location: "s3a:///" 12 | id: "00000000-0000-0000-0000-000000000000" 13 | - name: "table2" 14 | # Azure Blob Storage. See https://github.com/delta-io/delta-sharing#azure-blob-storage for how to config the credentials 15 | location: "wasbs://@" 16 | id: "00000000-0000-0000-0000-000000000001" 17 | - name: "share2" 18 | schemas: 19 | - name: "schema2" 20 | tables: 21 | - name: "table3" 22 | # Azure Data Lake Storage Gen2. See https://github.com/delta-io/delta-sharing#azure-data-lake-storage-gen2 for how to config the credentials 23 | location: "abfss://@" 24 | historyShared: true 25 | id: "00000000-0000-0000-0000-000000000002" 26 | - name: "share3" 27 | schemas: 28 | - name: "schema3" 29 | tables: 30 | - name: "table4" 31 | # Google Cloud Storage (GCS). See https://github.com/delta-io/delta-sharing#google-cloud-storage for how to config the credentials 32 | location: "gs:///" 33 | id: "00000000-0000-0000-0000-000000000003" 34 | - name: "share4" 35 | schemas: 36 | - name: "schema4" 37 | tables: 38 | - name: "table5" 39 | # Cloudflare R2. See https://github.com/delta-io/delta-sharing#cloudflare-r2 for how to config the credentials 40 | location: "s3a:///" 41 | id: "00000000-0000-0000-0000-000000000004" 42 | # Set the host name that the server will use 43 | host: "localhost" 44 | # Set the port that the server will listen on. Note: using ports below 1024 45 | # may require a privileged user in some operating systems. 46 | port: 8080 47 | # Set the url prefix for the REST APIs 48 | endpoint: "/delta-sharing" 49 | # Set the timeout of S3 presigned url in seconds 50 | preSignedUrlTimeoutSeconds: 3600 51 | # How many tables to cache in the server 52 | deltaTableCacheSize: 10 53 | # Whether we can accept working with a stale version of the table. This is useful when sharing 54 | # static tables that will never be changed. 55 | stalenessAcceptable: false 56 | # Whether to evaluate user provided `predicateHints` 57 | evaluatePredicateHints: false 58 | # Whether to evaluate user provided `jsonPredicateHints` 59 | evaluateJsonPredicateHints: true 60 | # Whether to evaluate user provided `jsonPredicateHints` for V2 predicates. 61 | evaluateJsonPredicateHintsV2: true 62 | # The maximum page size permitted by queryTable/queryTableChanges API. 63 | queryTablePageSizeLimit: 10000 64 | # The TTL of the page token generated in queryTable/queryTableChanges API (in milliseconds). 
65 | queryTablePageTokenTtlMs: 259200000 66 | # The TTL of the refresh token generated in queryTable API (in milliseconds). 67 | refreshTokenTtlMs: 3600000 68 | -------------------------------------------------------------------------------- /spark/src/main/resources/META-INF/services/org.apache.hadoop.fs.FileSystem: -------------------------------------------------------------------------------- 1 | io.delta.sharing.client.DeltaSharingFileSystem -------------------------------------------------------------------------------- /spark/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister: -------------------------------------------------------------------------------- 1 | io.delta.sharing.spark.DeltaSharingDataSource -------------------------------------------------------------------------------- /spark/src/main/scala/io/delta/sharing/spark/DeltaSharingDataSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.spark 18 | 19 | import java.util.Collections 20 | 21 | import scala.collection.JavaConverters._ 22 | import scala.collection.mutable 23 | 24 | import org.apache.spark.SparkEnv 25 | import org.apache.spark.delta.sharing.PreSignedUrlCache 26 | import org.apache.spark.sql.{SparkSession, SQLContext} 27 | import org.apache.spark.sql.execution.streaming.Source 28 | import org.apache.spark.sql.sources.{ 29 | BaseRelation, 30 | DataSourceRegister, 31 | RelationProvider, 32 | StreamSourceProvider 33 | } 34 | import org.apache.spark.sql.types.StructType 35 | 36 | 37 | /** A DataSource V1 for integrating Delta into Spark SQL batch APIs. */ 38 | private[sharing] class DeltaSharingDataSource 39 | extends RelationProvider 40 | with StreamSourceProvider 41 | with DataSourceRegister { 42 | 43 | override def createRelation( 44 | sqlContext: SQLContext, 45 | parameters: Map[String, String]): BaseRelation = { 46 | DeltaSharingDataSource.setupFileSystem(sqlContext) 47 | val options = new DeltaSharingOptions(parameters) 48 | val path = options.options.getOrElse("path", throw DeltaSharingErrors.pathNotSpecifiedException) 49 | 50 | val deltaLog = RemoteDeltaLog( 51 | path, forStreaming = false, responseFormat = options.responseFormat 52 | ) 53 | deltaLog.createRelation(options.versionAsOf, options.timestampAsOf, options.cdfOptions) 54 | } 55 | 56 | // Returns the schema of the latest table snapshot. 
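// A hypothetical sketch of how this batch read path is typically reached from user code; the
// profile path, table coordinate, and version value are placeholders, and the helper is never
// called. The format name matches shortName() at the bottom of this class.
private def exampleBatchRead(spark: SparkSession): org.apache.spark.sql.DataFrame = {
  spark.read
    .format("deltaSharing")                      // resolved via DataSourceRegister
    .option("versionAsOf", "3")                  // optional time travel, handled in createRelation
    .load("/path/to/profile.share#share1.schema1.table1")
}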
57 | override def sourceSchema( 58 | sqlContext: SQLContext, 59 | schema: Option[StructType], 60 | providerName: String, 61 | parameters: Map[String, String]): (String, StructType) = { 62 | if (schema.nonEmpty && schema.get.nonEmpty) { 63 | throw DeltaSharingErrors.specifySchemaAtReadTimeException 64 | } 65 | val options = new DeltaSharingOptions(parameters) 66 | if (options.isTimeTravel) { 67 | throw DeltaSharingErrors.timeTravelNotSupportedException 68 | } 69 | 70 | val path = options.options.getOrElse("path", throw DeltaSharingErrors.pathNotSpecifiedException) 71 | val deltaLog = RemoteDeltaLog( 72 | path, forStreaming = true, responseFormat = options.responseFormat 73 | ) 74 | val schemaToUse = deltaLog.snapshot().schema 75 | if (schemaToUse.isEmpty) { 76 | throw DeltaSharingErrors.schemaNotSetException 77 | } 78 | 79 | if (options.readChangeFeed) { 80 | (shortName(), DeltaTableUtils.addCdcSchema(schemaToUse)) 81 | } else { 82 | (shortName(), schemaToUse) 83 | } 84 | } 85 | 86 | override def createSource( 87 | sqlContext: SQLContext, 88 | metadataPath: String, 89 | schema: Option[StructType], 90 | providerName: String, 91 | parameters: Map[String, String]): Source = { 92 | DeltaSharingDataSource.setupFileSystem(sqlContext) 93 | if (schema.nonEmpty && schema.get.nonEmpty) { 94 | throw DeltaSharingErrors.specifySchemaAtReadTimeException 95 | } 96 | val options = new DeltaSharingOptions(parameters) 97 | val path = options.options.getOrElse("path", throw DeltaSharingErrors.pathNotSpecifiedException) 98 | val deltaLog = RemoteDeltaLog(path, forStreaming = true, options.responseFormat) 99 | 100 | DeltaSharingSource(SparkSession.active, deltaLog, options) 101 | } 102 | 103 | override def shortName(): String = "deltaSharing" 104 | } 105 | 106 | private[sharing] object DeltaSharingDataSource { 107 | def setupFileSystem(sqlContext: SQLContext): Unit = { 108 | // We have put our class name in the `org.apache.hadoop.fs.FileSystem` resource file. However, 109 | // this file will be loaded only if the class `FileSystem` is loaded. Hence, it won't work when 110 | // we add the library after starting Spark. Therefore we change the global `hadoopConfiguration` 111 | // to make sure we set up `DeltaSharingFileSystem` correctly. 112 | sqlContext.sparkContext.hadoopConfiguration 113 | .setIfUnset("fs.delta-sharing.impl", "io.delta.sharing.client.DeltaSharingFileSystem") 114 | PreSignedUrlCache.registerIfNeeded(SparkEnv.get) 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /spark/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | # This file contains code from the Apache Hadoop project (original license above). 18 | # It contains modifications, which are licensed as follows: 19 | # 20 | # Copyright (2021) The Delta Lake Project Authors. 21 | # Licensed under the Apache License, Version 2.0 (the "License"); 22 | # you may not use this file except in compliance with the License. 23 | # You may obtain a copy of the License at 24 | # http://www.apache.org/licenses/LICENSE-2.0 25 | # Unless required by applicable law or agreed to in writing, software 26 | # distributed under the License is distributed on an "AS IS" BASIS, 27 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 28 | # See the License for the specific language governing permissions and 29 | # limitations under the License. 30 | # 31 | 32 | # Set everything to be logged to the file target/unit-tests.log 33 | test.appender=file 34 | log4j.rootCategory=INFO, ${test.appender} 35 | log4j.appender.file=org.apache.log4j.FileAppender 36 | log4j.appender.file.append=true 37 | log4j.appender.file.file=target/unit-tests.log 38 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 39 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n 40 | 41 | # Tests that launch java subprocesses can set the "test.appender" system property to 42 | # "console" to avoid having the child process's logs overwrite the unit test's 43 | # log file. 44 | log4j.appender.console=org.apache.log4j.ConsoleAppender 45 | log4j.appender.console.target=System.err 46 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 47 | log4j.appender.console.layout.ConversionPattern=%t: %m%n 48 | 49 | # Ignore messages below warning level from Jetty, because it's a bit verbose 50 | log4j.logger.org.spark_project.jetty=WARN 51 | -------------------------------------------------------------------------------- /spark/src/test/scala/io/delta/sharing/spark/DeltaSharingIntegrationTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package io.delta.sharing.spark 18 | 19 | import java.io.File 20 | import java.nio.charset.StandardCharsets.UTF_8 21 | import java.nio.file.Files 22 | import java.util.concurrent.{CountDownLatch, TimeUnit} 23 | 24 | import scala.sys.process._ 25 | import scala.util.Try 26 | 27 | import org.apache.commons.io.FileUtils 28 | import org.apache.hadoop.conf.Configuration 29 | import org.apache.spark.SparkFunSuite 30 | import org.scalatest.BeforeAndAfterAll 31 | 32 | import io.delta.sharing.client.{DeltaSharingFileProfileProvider, DeltaSharingProfileProvider} 33 | 34 | trait DeltaSharingIntegrationTest extends SparkFunSuite with BeforeAndAfterAll { 35 | 36 | def shouldRunIntegrationTest: Boolean = { 37 | sys.env.get("AWS_ACCESS_KEY_ID").exists(_.length > 0) && 38 | sys.env.get("AZURE_TEST_ACCOUNT_KEY").exists(_.length > 0) && 39 | sys.env.get("GOOGLE_APPLICATION_CREDENTIALS").exists(_.length > 0) 40 | } 41 | 42 | @volatile private var process: Process = _ 43 | @volatile private var pidFile: File = _ 44 | var testProfileFile: File = _ 45 | 46 | val TEST_PORT = 12345 47 | 48 | override def beforeAll(): Unit = { 49 | super.beforeAll() 50 | if (shouldRunIntegrationTest) { 51 | pidFile = Files.createTempFile("delta-sharing-server", ".pid").toFile 52 | testProfileFile = Files.createTempFile("delta-test", ".share").toFile 53 | FileUtils.writeStringToFile(testProfileFile, 54 | s"""{ 55 | | "shareCredentialsVersion": 1, 56 | | "endpoint": "https://localhost:$TEST_PORT/delta-sharing", 57 | | "bearerToken": "dapi5e3574ec767ca1548ae5bbed1a2dc04d" 58 | |}""".stripMargin, UTF_8) 59 | 60 | val startLatch = new CountDownLatch(1) 61 | new Thread("Run TestDeltaSharingServer") { 62 | setDaemon(true) 63 | 64 | override def run(): Unit = { 65 | val processLogger = ProcessLogger { stdout => 66 | // scalastyle:off println 67 | println(stdout) 68 | // scalastyle:on println 69 | if (stdout.contains(s"https://127.0.0.1:$TEST_PORT/")) { 70 | startLatch.countDown() 71 | } 72 | } 73 | process = 74 | Seq( 75 | "/bin/bash", 76 | "-c", 77 | s"cd .. && build/sbt 'server / Test / runMain " + 78 | s"io.delta.sharing.server.TestDeltaSharingServer ${pidFile.getCanonicalPath}'") 79 | .run(processLogger) 80 | process.exitValue() 81 | process = null 82 | startLatch.countDown() 83 | } 84 | }.start() 85 | try { 86 | assert(startLatch.await(120, TimeUnit.SECONDS), "the server didn't start in 120 seconds") 87 | if (process == null) { 88 | fail("the process exited with an error") 89 | } 90 | } catch { 91 | case e: Throwable => 92 | if (process != null) { 93 | process.destroy() 94 | process = null 95 | } 96 | throw e 97 | } 98 | } 99 | } 100 | 101 | override def afterAll(): Unit = { 102 | if (shouldRunIntegrationTest) { 103 | try { 104 | org.apache.hadoop.fs.FileSystem.closeAll() 105 | if (process != null) { 106 | process.destroy() 107 | process = null 108 | } 109 | if (pidFile != null) { 110 | val pid = FileUtils.readFileToString(pidFile) 111 | Try(pid.toLong).foreach { pid => 112 | // scalastyle:off println 113 | println(s"Killing $pid") 114 | // scalastyle:on println 115 | s"kill -9 $pid".! 
116 | } 117 | pidFile.delete() 118 | } 119 | if (testProfileFile != null) { 120 | testProfileFile.delete() 121 | } 122 | } finally { 123 | super.afterAll() 124 | } 125 | } 126 | } 127 | 128 | def testProfileProvider: DeltaSharingProfileProvider = { 129 | new DeltaSharingFileProfileProvider(new Configuration, testProfileFile.getCanonicalPath) 130 | } 131 | 132 | def integrationTest(testName: String)(func: => Unit): Unit = { 133 | test(testName) { 134 | assume(shouldRunIntegrationTest) 135 | func 136 | } 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /spark/src/test/scala/io/delta/sharing/spark/DeltaSharingSourceOffsetSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.spark 18 | 19 | import java.util.UUID 20 | 21 | import com.fasterxml.jackson.databind.exc.InvalidFormatException 22 | import org.apache.spark.sql.QueryTest 23 | import org.apache.spark.sql.execution.streaming.SerializedOffset 24 | import org.apache.spark.sql.test.SharedSparkSession 25 | 26 | class DeltaSharingSourceOffsetSuite extends QueryTest 27 | with SharedSparkSession with DeltaSharingIntegrationTest { 28 | 29 | import testImplicits._ 30 | 31 | test("DeltaSharingSourceOffset sourceVersion - unknown value") { 32 | // Set unknown sourceVersion as the max allowed version plus 1. 
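// A hypothetical sketch, for contrast with the failure cases in this suite: a well-formed
// offset JSON carries sourceVersion 1 and a tableId that matches the table the stream reads.
// The helper name and values are invented and never used by the tests.
def wellFormedOffsetJson(tableId: String): String =
  s"""
     |{
     |  "tableId": "$tableId",
     |  "sourceVersion": 1,
     |  "tableVersion": 1,
     |  "index": 1,
     |  "isStartingVersion": true
     |}
  """.stripMargin
// DeltaSharingSourceOffset(tableId, SerializedOffset(wellFormedOffsetJson(tableId)))
// is expected to deserialize without the errors asserted below.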
33 | var unknownVersion = 2 34 | 35 | val json = 36 | s""" 37 | |{ 38 | | "sourceVersion": $unknownVersion, 39 | | "tableVersion": 1, 40 | | "index": 1, 41 | | "isStartingVersion": true 42 | |} 43 | """.stripMargin 44 | val e = intercept[IllegalStateException] { 45 | DeltaSharingSourceOffset(UUID.randomUUID().toString, SerializedOffset(json)) 46 | } 47 | assert(e.getMessage.contains("is not equal to supported reader version")) 48 | } 49 | 50 | test("DeltaSharingSourceOffset sourceVersion - invalid value") { 51 | val json = 52 | """ 53 | |{ 54 | | "sourceVersion": "foo", 55 | | "tableVersion": 1, 56 | | "index": 1, 57 | | "isStartingVersion": true 58 | |} 59 | """.stripMargin 60 | val e = intercept[InvalidFormatException] { 61 | DeltaSharingSourceOffset(UUID.randomUUID().toString, SerializedOffset(json)) 62 | } 63 | for (msg <- Seq("foo", "not a valid")) { 64 | assert(e.getMessage.contains(msg)) 65 | } 66 | } 67 | 68 | test("DeltaSharingSourceOffset sourceVersion - missing ") { 69 | val json = 70 | """ 71 | |{ 72 | | "tableVersion": 1, 73 | | "index": 1, 74 | | "isStartingVersion": true 75 | |} 76 | """.stripMargin 77 | val e = intercept[IllegalStateException] { 78 | DeltaSharingSourceOffset(UUID.randomUUID().toString, SerializedOffset(json)) 79 | } 80 | for (msg <- Seq("The table reader version", "is not equal to")) { 81 | assert(e.getMessage.contains(msg)) 82 | } 83 | } 84 | 85 | test("DeltaSharingSourceOffset - unmatched table id") { 86 | val json = 87 | s""" 88 | |{ 89 | | "tableId": "${UUID.randomUUID().toString}", 90 | | "sourceVersion": 1, 91 | | "tableVersion": 1, 92 | | "index": 1, 93 | | "isStartingVersion": true 94 | |} 95 | """.stripMargin 96 | val e = intercept[IllegalStateException] { 97 | DeltaSharingSourceOffset(UUID.randomUUID().toString, SerializedOffset(json)) 98 | } 99 | for (msg <- Seq("delete", "checkpoint", "restart")) { 100 | assert(e.getMessage.contains(msg)) 101 | } 102 | } 103 | 104 | test("DeltaSharingSourceOffset - validateOffsets") { 105 | def testValidateOffset( 106 | previousTableVersion: Long, 107 | previousIndex: Long, 108 | previousIsStarting: Boolean, 109 | currentTableVersion: Long, 110 | currentIndex: Long, 111 | currentIsStarting: Boolean, 112 | errorMessage: Option[String] 113 | ): Unit = { 114 | val previousOffset = DeltaSharingSourceOffset( 115 | sourceVersion = 1, 116 | tableId = "foo", 117 | tableVersion = previousTableVersion, 118 | index = previousIndex, 119 | isStartingVersion = previousIsStarting) 120 | val currentOffset = DeltaSharingSourceOffset( 121 | sourceVersion = 1, 122 | tableId = "foo", 123 | tableVersion = currentTableVersion, 124 | index = currentIndex, 125 | isStartingVersion = currentIsStarting) 126 | if (errorMessage.isDefined) { 127 | assert(intercept[IllegalStateException] { 128 | DeltaSharingSourceOffset.validateOffsets(previousOffset, currentOffset) 129 | }.getMessage.contains(errorMessage.get)) 130 | } else { 131 | DeltaSharingSourceOffset.validateOffsets(previousOffset, currentOffset) 132 | } 133 | } 134 | 135 | // No errors on forward moving offset 136 | testValidateOffset(4, 10, false, 4, 10, false, None) 137 | testValidateOffset(4, 10, false, 4, 11, false, None) 138 | testValidateOffset(4, 10, false, 5, 1, false, None) 139 | testValidateOffset(4, 10, true, 4, 10, true, None) 140 | testValidateOffset(4, 10, true, 4, 11, true, None) 141 | testValidateOffset(4, 10, true, 5, 1, true, None) 142 | 143 | // errors on backward moving offset 144 | testValidateOffset(4, 10, false, 4, 9, false, Some("Found invalid offsets. 
Previous:")) 145 | testValidateOffset(4, 10, false, 3, 11, false, Some("Found invalid offsets. Previous:")) 146 | testValidateOffset(4, 10, true, 4, 9, true, Some("Found invalid offsets. Previous:")) 147 | testValidateOffset(4, 10, true, 3, 11, true, Some("Found invalid offsets. Previous:")) 148 | 149 | // isStartingVersion flipping from true to false: ok 150 | testValidateOffset(4, 10, true, 4, 10, false, None) 151 | // isStartingVersion flipping from false to true: error 152 | testValidateOffset(4, 10, false, 4, 10, true, Some( 153 | "Found invalid offsets: 'isStartingVersion' fliped incorrectly.")) 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /spark/src/test/scala/io/delta/sharing/spark/TestStorageProxyServer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.spark 18 | 19 | import scala.util.Try 20 | 21 | import jakarta.servlet.http.{HttpServletRequest, HttpServletResponse} 22 | import org.sparkproject.jetty.client.HttpClient 23 | import org.sparkproject.jetty.http.HttpMethod 24 | import org.sparkproject.jetty.server.{Request, Server} 25 | import org.sparkproject.jetty.server.handler.AbstractHandler 26 | 27 | 28 | /** 29 | * A simple proxy server that forwards storage access while upgrading the connection to https. 30 | * This is used to test the behavior of the DeltaSharingFileSystem when 31 | * "spark.delta.sharing.network.never.use.https" is set to true. 32 | */ 33 | class TestStorageProxyServer { 34 | private val server = new Server(0) 35 | private val httpClient = new HttpClient() 36 | server.setHandler(new ProxyHandler) 37 | 38 | def initialize(): Unit = { 39 | new Thread(() => { 40 | Try(httpClient.start()) 41 | Try(server.start()) 42 | }).start() 43 | 44 | do { 45 | Thread.sleep(100) 46 | } while (!server.isStarted()) 47 | } 48 | 49 | def stop(): Unit = { 50 | Try(server.stop()) 51 | Try(httpClient.stop()) 52 | } 53 | 54 | def getPort(): Int = { 55 | server.getURI().getPort() 56 | } 57 | 58 | def getHost(): String = { 59 | server.getURI().getHost 60 | } 61 | 62 | private class ProxyHandler extends AbstractHandler { 63 | override def handle(target: String, 64 | baseRequest: Request, 65 | request: HttpServletRequest, 66 | response: HttpServletResponse): Unit = { 67 | 68 | Option(request.getHeader("Host")) match { 69 | case Some(host) => 70 | // upgrade bucket access call from http -> https 71 | val uri = "https://" + host + request.getRequestURI.replace("null", "") + 72 | "?" 
+ request.getQueryString 73 | 74 | val res = httpClient.newRequest(uri) 75 | .method(HttpMethod.GET) 76 | .header("Range", request.getHeader("Range")) 77 | .send() 78 | 79 | response.setStatus(res.getStatus) 80 | res.getHeaders.forEach { header => 81 | response.setHeader(header.getName, header.getValue) 82 | } 83 | val out = response.getOutputStream 84 | out.write(res.getContent, 0, res.getContent.length) 85 | out.flush() 86 | out.close() 87 | 88 | baseRequest.setHandled(true) 89 | 90 | case None => 91 | response.sendError(HttpServletResponse.SC_BAD_REQUEST, "No forwarding URL provided") 92 | } 93 | } 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /spark/src/test/scala/io/delta/sharing/spark/TestUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.spark 18 | 19 | import java.io.File 20 | import java.nio.file.Files 21 | import java.util.UUID 22 | 23 | import org.apache.commons.io.FileUtils 24 | import org.apache.spark.sql.catalyst.util.DateTimeUtils._ 25 | import org.apache.spark.sql.internal.SQLConf 26 | import org.apache.spark.unsafe.types.UTF8String 27 | 28 | object TestUtils { 29 | 30 | def withTempDir(f: File => Unit): Unit = { 31 | val dir = Files.createTempDirectory(UUID.randomUUID().toString).toFile 32 | try f(dir) finally { 33 | FileUtils.deleteDirectory(dir) 34 | } 35 | } 36 | 37 | def withTempDirs(f: (File, File) => Unit): Unit = { 38 | withTempDir { file1 => 39 | withTempDir { file2 => 40 | f(file1, file2) 41 | } 42 | } 43 | } 44 | 45 | def sqlDate(date: String): java.sql.Date = { 46 | toJavaDate(stringToDate(UTF8String.fromString(date)).get) 47 | } 48 | 49 | def sqlTimestamp(timestamp: String): java.sql.Timestamp = { 50 | toJavaTimestamp(stringToTimestamp( 51 | UTF8String.fromString(timestamp), 52 | getZoneId(SQLConf.get.sessionLocalTimeZone)).get) 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /version.sbt: -------------------------------------------------------------------------------- 1 | version in ThisBuild := "1.1.0" 2 | --------------------------------------------------------------------------------