├── version.sbt ├── .gitattributes ├── images └── delta-sharing.png ├── spark └── src │ ├── main │ ├── resources │ │ └── META-INF │ │ │ └── services │ │ │ ├── org.apache.hadoop.fs.FileSystem │ │ │ └── org.apache.spark.sql.sources.DataSourceRegister │ └── scala │ │ ├── org │ │ └── apache │ │ │ └── spark │ │ │ └── sql │ │ │ └── DeltaSharingScanUtils.scala │ │ └── io │ │ └── delta │ │ └── sharing │ │ └── spark │ │ ├── util │ │ ├── JsonUtils.scala │ │ └── RetryUtils.scala │ │ ├── perf │ │ └── DeltaSharingLimitPushDown.scala │ │ ├── InMemoryHttpInputStream.scala │ │ ├── DeltaSharingProfileProvider.scala │ │ ├── RemoteDeltaCDFRelation.scala │ │ ├── DeltaSharingDataSource.scala │ │ ├── model.scala │ │ ├── RemoteDeltaFileIndex.scala │ │ ├── DeltaSharingFileSystem.scala │ │ └── RandomAccessHttpInputStream.scala │ └── test │ ├── scala │ ├── io │ │ └── delta │ │ │ └── sharing │ │ │ └── spark │ │ │ ├── util │ │ │ └── RetryUtilsSuite.scala │ │ │ ├── DeltaSharingFileSystemSuite.scala │ │ │ ├── TestDeltaSharingClient.scala │ │ │ ├── DeltaSharingFileProfileProviderSuite.scala │ │ │ └── DeltaSharingIntegrationTest.scala │ └── org │ │ └── apache │ │ └── spark │ │ └── delta │ │ └── sharing │ │ └── CachedTableManagerSuite.scala │ └── resources │ └── log4j.properties ├── examples ├── open-datasets.share ├── README.md └── python │ ├── quickstart_pandas.py │ └── quickstart_spark.py ├── python ├── delta_sharing │ ├── tests │ │ ├── test_profile.json │ │ ├── __init__.py │ │ ├── test_converter.py │ │ └── conftest.py │ ├── version.py │ ├── _yarl_patch.py │ ├── __init__.py │ ├── converter.py │ ├── reader.py │ └── protocol.py ├── requirements-dev.txt ├── dev │ ├── tox.ini │ ├── reformat │ ├── pytest │ └── lint-python ├── README.md └── setup.py ├── project ├── build.properties └── plugins.sbt ├── server └── src │ ├── test │ ├── resources │ │ └── core-site.xml │ └── scala │ │ └── io │ │ └── delta │ │ ├── sharing │ │ └── server │ │ │ ├── CloudFileSignerSuite.scala │ │ │ ├── TestDeltaSharingServer.scala │ │ │ ├── config │ │ │ └── ServerConfigSuite.scala │ │ │ └── TestResource.scala │ │ └── standalone │ │ └── internal │ │ └── PartitionFilterUtilsSuite.scala │ ├── main │ ├── scala │ │ └── io │ │ │ └── delta │ │ │ ├── standalone │ │ │ └── internal │ │ │ │ ├── DeltaDataSource.scala │ │ │ │ ├── DeltaCDFErrors.scala │ │ │ │ ├── DeltaSharingHistoryManager.scala │ │ │ │ └── PartitionFilterUtils.scala │ │ │ └── sharing │ │ │ └── server │ │ │ ├── util │ │ │ └── JsonUtils.scala │ │ │ ├── exceptions.scala │ │ │ ├── model.scala │ │ │ ├── SharedTableManager.scala │ │ │ ├── config │ │ │ └── ServerConfig.scala │ │ │ └── CloudFileSigner.scala │ └── protobuf │ │ └── protocol.proto │ └── universal │ └── conf │ └── delta-sharing-server.yaml.template ├── dev └── release.sh ├── .gitignore ├── .github └── workflows │ └── build-and-test.yml └── CONTRIBUTING.md /version.sbt: -------------------------------------------------------------------------------- 1 | version in ThisBuild := "0.6.0-SNAPSHOT" 2 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.bat text eol=crlf 2 | *.cmd text eol=crlf 3 | *.bin binary 4 | -------------------------------------------------------------------------------- /images/delta-sharing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexott/delta-sharing/main/images/delta-sharing.png 
-------------------------------------------------------------------------------- /spark/src/main/resources/META-INF/services/org.apache.hadoop.fs.FileSystem: -------------------------------------------------------------------------------- 1 | io.delta.sharing.spark.DeltaSharingFileSystem -------------------------------------------------------------------------------- /spark/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister: -------------------------------------------------------------------------------- 1 | io.delta.sharing.spark.DeltaSharingDataSource -------------------------------------------------------------------------------- /examples/open-datasets.share: -------------------------------------------------------------------------------- 1 | { 2 | "shareCredentialsVersion": 1, 3 | "endpoint": "https://sharing.delta.io/delta-sharing/", 4 | "bearerToken": "faaie590d541265bcab1f2de9813274bf233" 5 | } -------------------------------------------------------------------------------- /python/delta_sharing/tests/test_profile.json: -------------------------------------------------------------------------------- 1 | { 2 | "shareCredentialsVersion": 1, 3 | "endpoint": "https://localhost:12345/delta-sharing/", 4 | "bearerToken": "dapi5e3574ec767ca1548ae5bbed1a2dc04d", 5 | "expirationTime": "2021-11-12T00:12:29.0Z" 6 | } 7 | -------------------------------------------------------------------------------- /python/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | # Dependencies. When you update don't forget to update setup.py. 2 | pandas 3 | pyarrow>=4.0.0 4 | fsspec>=0.7.4 5 | requests 6 | aiohttp 7 | yarl>=1.6.0 8 | 9 | # Linter 10 | mypy==0.812 11 | flake8 12 | 13 | # Code formatter. Only support Python 3.6+ 14 | black==21.12b0 15 | 16 | # Test 17 | pytest 18 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (2021) The Delta Lake Project Authors. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | # 13 | 14 | sbt.version=1.5.0 15 | -------------------------------------------------------------------------------- /python/delta_sharing/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2021 The Delta Lake Project Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /python/delta_sharing/version.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2021 The Delta Lake Project Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | __version__ = "0.5.0" 18 | -------------------------------------------------------------------------------- /server/src/test/resources/core-site.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | 3 | <configuration> 4 | 5 | <property> 6 | <name>fs.azure.account.key.deltasharingtest.blob.core.windows.net</name> 7 | <value>${azure.account.key}</value> 8 | </property> 9 | 10 | <property> 11 | <name>fs.azure.account.auth.type.deltasharingtest.dfs.core.windows.net</name> 12 | <value>SharedKey</value> 13 | </property> 14 | 15 | 16 | <property> 17 | <name>fs.azure.account.key.deltasharingtest.dfs.core.windows.net</name> 18 | <value>${azure.account.key}</value> 19 | </property> 20 | 21 | </configuration> -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | ## Delta Sharing examples 2 | This folder contains examples taken from the delta.io/delta-sharing quickstart guide and docs. They are written in Python and can be run once the prerequisites are satisfied. 3 | The profile file for the open example Delta Sharing Server has already been downloaded into this folder. 4 | 5 | ### Prerequisites 6 | * For the Python examples, Python 3.6+, the Delta Sharing Python Connector, and PySpark need to be installed; see [the project docs](https://github.com/delta-io/delta-sharing) for details. 7 | 8 | ### Instructions 9 | * To run the PySpark example, run `spark-submit --packages io.delta:delta-sharing-spark_2.12:0.1.0 ./python/quickstart_spark.py` 10 | * To run the pandas DataFrame example, run `python3 ./python/quickstart_pandas.py` -------------------------------------------------------------------------------- /python/dev/tox.ini: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2021 The Delta Lake Project Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | # 16 | 17 | [pycodestyle] 18 | ignore=E203,E226,E231,E241,E305,E402,E722,E731,E741,W503,W504 19 | max-line-length=100 20 | exclude=.git/*,docs/build/* 21 | -------------------------------------------------------------------------------- /python/delta_sharing/_yarl_patch.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2021 The Delta Lake Project Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | try: 17 | from yarl import URL 18 | from yarl._quoting import _Quoter 19 | 20 | # Patch yarl.URL to not replace '%3D' with '=' which would break GCS pre-signed urls 21 | URL._PATH_REQUOTER = _Quoter(safe="@:", protected="/+=") # type: ignore 22 | except: 23 | pass 24 | -------------------------------------------------------------------------------- /server/src/main/scala/io/delta/standalone/internal/DeltaDataSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package io.delta.standalone.internal 17 | 18 | /** DeltaDataSource constants. */ 19 | object DeltaDataSource { 20 | // Constants for cdf parameters 21 | final val CDF_START_VERSION_KEY = "startingVersion" 22 | 23 | final val CDF_START_TIMESTAMP_KEY = "startingTimestamp" 24 | 25 | final val CDF_END_VERSION_KEY = "endingVersion" 26 | 27 | final val CDF_END_TIMESTAMP_KEY = "endingTimestamp" 28 | } 29 | -------------------------------------------------------------------------------- /spark/src/main/scala/org/apache/spark/sql/DeltaSharingScanUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.apache.spark.sql 18 | 19 | import org.apache.spark.sql.execution.datasources.LogicalRelation 20 | 21 | object DeltaSharingScanUtils { 22 | // A wrapper to expose Dataset.ofRows function. 23 | // This is needed because Dataset object is in private[sql] scope. 24 | def ofRows(spark: SparkSession, plan: LogicalRelation): DataFrame = { 25 | Dataset.ofRows(spark, plan) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /python/README.md: -------------------------------------------------------------------------------- 1 | # Delta Sharing 2 | 3 | [Delta Sharing](https://delta.io/sharing) is an open protocol for secure real-time exchange of large datasets, which enables secure data sharing across different computing platforms. It lets organizations share access to existing [Delta Lake](https://delta.io) and [Apache Parquet](https://parquet.apache.org) tables with other organizations, who can then directly read the table in Pandas, Apache Spark, or any other software that implements the open protocol. 4 | 5 | This is the Python client library for Delta Sharing, which lets you load shared tables as [pandas](https://pandas.pydata.org/) DataFrames or as [Apache Spark](http://spark.apache.org/) DataFrames if running in PySpark with the [Apache Spark Connector library](https://github.com/delta-io/delta-sharing#set-up-apache-spark). 6 | 7 | ## Installation and Usage 8 | 9 | 1. Install using `pip install delta-sharing`. 10 | 2. To use the Python Connector, see [the project docs](https://github.com/delta-io/delta-sharing) for details. 11 | 12 | ## Documentation 13 | 14 | This README only contains basic information about the Delta Sharing Python Connector. Please read [the project documentation](https://github.com/delta-io/delta-sharing) for full usage details. 15 | -------------------------------------------------------------------------------- /python/delta_sharing/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2021 The Delta Lake Project Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | from delta_sharing.delta_sharing import SharingClient, load_as_pandas, load_as_spark 18 | from delta_sharing.delta_sharing import load_table_changes_as_pandas, load_table_changes_as_spark 19 | from delta_sharing.protocol import Share, Schema, Table 20 | from delta_sharing.version import __version__ 21 | 22 | 23 | __all__ = [ 24 | "SharingClient", 25 | "Share", 26 | "Schema", 27 | "Table", 28 | "load_as_pandas", 29 | "load_as_spark", 30 | "load_table_changes_as_pandas", 31 | "load_table_changes_as_spark", 32 | "__version__", 33 | ] 34 | -------------------------------------------------------------------------------- /server/src/test/scala/io/delta/sharing/server/CloudFileSignerSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.server 18 | 19 | import org.apache.hadoop.fs.Path 20 | import org.scalatest.FunSuite 21 | 22 | class CloudFileSignerSuite extends FunSuite { 23 | 24 | test("GCSFileSigner.getBucketAndObjectNames") { 25 | assert(GCSFileSigner.getBucketAndObjectNames(new Path("gs://delta-sharing-test/foo")) 26 | == ("delta-sharing-test", "foo")) 27 | assert(GCSFileSigner.getBucketAndObjectNames(new Path("gs://delta_sharing_test/foo")) 28 | == ("delta_sharing_test", "foo")) 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /python/dev/reformat: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Copyright (C) 2021 The Delta Lake Project Authors. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # The current directory of the script. 19 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 20 | FWDIR="$( cd "$DIR"/.. && pwd )" 21 | cd "$FWDIR" 22 | 23 | PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE:-python}" 24 | 25 | BLACK_BUILD="$PYTHON_EXECUTABLE -m black" 26 | BLACK_VERSION="21.12b0" 27 | $BLACK_BUILD 2> /dev/null 28 | if [ $? -ne 0 ]; then 29 | echo "The '$BLACK_BUILD' command was not found. Please install Black, for example, via 'pip install black==$BLACK_VERSION'." 
30 | exit 1 31 | fi 32 | 33 | $BLACK_BUILD delta_sharing --line-length 100 34 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | resolvers += Resolver.url("artifactory", url("https://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns) 18 | 19 | resolvers += "Typesafe Repository" at "https://repo.typesafe.com/typesafe/releases/" 20 | 21 | resolvers += Resolver.url( 22 | "typesafe sbt-plugins", 23 | url("https://dl.bintray.com/typesafe/sbt-plugins"))(Resolver.ivyStylePatterns) 24 | 25 | addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.13") 26 | 27 | addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "2.3") 28 | 29 | addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0") 30 | 31 | addSbtPlugin("com.thesamet" % "sbt-protoc" % "1.0.2") 32 | 33 | addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.7.6") 34 | 35 | addSbtPlugin("com.github.sbt" % "sbt-pgp" % "2.1.2") 36 | 37 | libraryDependencies += "com.thesamet.scalapb" %% "compilerplugin" % "0.11.1" 38 | -------------------------------------------------------------------------------- /spark/src/main/scala/io/delta/sharing/spark/util/JsonUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.spark.util 18 | 19 | import com.fasterxml.jackson.annotation.JsonInclude.Include 20 | import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper} 21 | import com.fasterxml.jackson.module.scala.{DefaultScalaModule, ScalaObjectMapper} 22 | 23 | private[sharing] object JsonUtils { 24 | /** Used to convert between classes and JSON. 
*/ 25 | lazy val mapper = { 26 | val _mapper = new ObjectMapper with ScalaObjectMapper 27 | _mapper.setSerializationInclusion(Include.NON_ABSENT) 28 | _mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) 29 | _mapper.registerModule(DefaultScalaModule) 30 | _mapper 31 | } 32 | 33 | def toJson[T: Manifest](obj: T): String = { 34 | mapper.writeValueAsString(obj) 35 | } 36 | 37 | def fromJson[T: Manifest](json: String): T = { 38 | mapper.readValue[T](json) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /python/dev/pytest: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Copyright (C) 2021 The Delta Lake Project Authors. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE:-python}" 20 | 21 | set -o pipefail 22 | set -e 23 | 24 | if ! hash pytest 2> /dev/null; then 25 | echo "The pytest command was not found. Please install the 'pytest' Python package." 26 | exit 1 27 | fi 28 | 29 | # The current directory of the script. 30 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 31 | 32 | FWDIR="$( cd "$DIR"/.. && pwd )" 33 | cd "$FWDIR" 34 | 35 | if [ -n "$AWS_ACCESS_KEY_ID" ]; then 36 | logopts=(-o log_cli=true -s) 37 | fi 38 | 39 | # Runs both doctests and unit tests by default; otherwise hands arguments over to pytest. 40 | if [ "$#" = 0 ]; then 41 | # delta_sharing/_yarl_patch.py is a hack to support GCS pre-signed urls. Ask pytest to not 42 | # import it automatically so that we can verify we are importing it on demand. 43 | $PYTHON_EXECUTABLE -m pytest --ignore=delta_sharing/_yarl_patch.py --verbose --showlocals --color=yes --doctest-modules delta_sharing "${logopts[@]}" 44 | else 45 | $PYTHON_EXECUTABLE -m pytest "$@" 46 | fi 47 | -------------------------------------------------------------------------------- /dev/release.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e -o pipefail 3 | export GPG_TTY=$(tty) 4 | 5 | # Switch to the project root directory 6 | cd $( dirname $0 ) 7 | cd .. 8 | 9 | # Clean up uncommitted files 10 | git clean -fdx 11 | 12 | # Clean existing artifacts 13 | build/sbt clean 14 | cd python 15 | python3 setup.py clean --all 16 | rm -rf delta_sharing.egg-info dist 17 | cd ..
18 | 19 | printf "Please type the release version: " 20 | read VERSION 21 | echo $VERSION 22 | 23 | # Update the Python connector version 24 | sed -i '' "s/__version__ = \".*\"/__version__ = \"$VERSION\"/g" python/delta_sharing/version.py 25 | git add python/delta_sharing/version.py 26 | # Use --allow-empty so that we can re-run this script even if the Python connector version has been updated 27 | git commit -m "Update Python connector version to $VERSION" --allow-empty 28 | 29 | build/sbt "release skip-tests" 30 | 31 | # Switch to the release commit 32 | git checkout v$VERSION 33 | 34 | # Generate Python artifacts 35 | cd python/ 36 | python3 setup.py sdist bdist_wheel 37 | cd .. 38 | 39 | # Generate the pre-built server package and sign files 40 | build/sbt server/universal:packageBin 41 | cd server/target/universal 42 | gpg --detach-sign --armor --sign delta-sharing-server-$VERSION.zip 43 | gpg --verify delta-sharing-server-$VERSION.zip.asc 44 | sha256sum delta-sharing-server-$VERSION.zip > delta-sharing-server-$VERSION.zip.sha256 45 | sha256sum -c delta-sharing-server-$VERSION.zip.sha256 46 | sha256sum delta-sharing-server-$VERSION.zip.asc > delta-sharing-server-$VERSION.zip.asc.sha256 47 | sha256sum -c delta-sharing-server-$VERSION.zip.asc.sha256 48 | cd - 49 | 50 | # Build the docker image 51 | build/sbt server/docker:publish 52 | 53 | git checkout main 54 | 55 | echo "=== Generated all release artifacts ===" 56 | -------------------------------------------------------------------------------- /server/src/main/scala/io/delta/sharing/server/util/JsonUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.server.util 18 | 19 | import java.io.OutputStream 20 | 21 | import com.fasterxml.jackson.annotation.JsonInclude.Include 22 | import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper} 23 | import com.fasterxml.jackson.module.scala.DefaultScalaModule 24 | import com.fasterxml.jackson.module.scala.experimental.ScalaObjectMapper 25 | 26 | object JsonUtils { 27 | /** Used to convert between classes and JSON. 
*/ 28 | lazy val mapper = { 29 | val _mapper = new ObjectMapper with ScalaObjectMapper 30 | _mapper.setSerializationInclusion(Include.NON_ABSENT) 31 | _mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) 32 | _mapper.registerModule(DefaultScalaModule) 33 | _mapper 34 | } 35 | 36 | def toJson[T](obj: T): String = { 37 | mapper.writeValueAsString(obj) 38 | } 39 | 40 | def toJson[T](out: OutputStream, obj: T): Unit = { 41 | mapper.writeValue(out, obj) 42 | } 43 | 44 | def fromJson[T: Manifest](json: String): T = { 45 | mapper.readValue[T](json) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /server/src/universal/conf/delta-sharing-server.yaml.template: -------------------------------------------------------------------------------- 1 | # The format version of this config file 2 | version: 1 3 | # Config shares/schemas/tables to share 4 | shares: 5 | - name: "share1" 6 | schemas: 7 | - name: "schema1" 8 | tables: 9 | - name: "table1" 10 | # S3. See https://github.com/delta-io/delta-sharing#s3 for how to config the credentials 11 | location: "s3a:///" 12 | - name: "table2" 13 | # Azure Blob Storage. See https://github.com/delta-io/delta-sharing#azure-blob-storage for how to config the credentials 14 | location: "wasbs://@" 15 | - name: "share2" 16 | schemas: 17 | - name: "schema2" 18 | tables: 19 | - name: "table3" 20 | # Azure Data Lake Storage Gen2. See https://github.com/delta-io/delta-sharing#azure-data-lake-storage-gen2 for how to config the credentials 21 | location: "abfss://@" 22 | cdfEnabled: true 23 | - name: "share3" 24 | schemas: 25 | - name: "schema3" 26 | tables: 27 | - name: "table4" 28 | # Google Cloud Storage (GCS). See https://github.com/delta-io/delta-sharing#google-cloud-storage for how to config the credentials 29 | location: "gs:///" 30 | # Set the host name that the server will use 31 | host: "localhost" 32 | # Set the port that the server will listen on. Note: using ports below 1024 33 | # may require a privileged user in some operating systems. 34 | port: 8080 35 | # Set the url prefix for the REST APIs 36 | endpoint: "/delta-sharing" 37 | # Set the timeout of S3 presigned url in seconds 38 | preSignedUrlTimeoutSeconds: 3600 39 | # How many tables to cache in the server 40 | deltaTableCacheSize: 10 41 | # Whether we can accept working with a stale version of the table. This is useful when sharing 42 | # static tables that will never be changed. 43 | stalenessAcceptable: false 44 | # Whether to evaluate user provided `predicateHints` 45 | evaluatePredicateHints: false 46 | -------------------------------------------------------------------------------- /server/src/test/scala/io/delta/sharing/server/TestDeltaSharingServer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package io.delta.sharing.server 18 | 19 | import java.io.File 20 | import java.lang.management.ManagementFactory 21 | 22 | import org.apache.commons.io.FileUtils 23 | 24 | import io.delta.sharing.server.config.ServerConfig 25 | 26 | /** 27 | * This is a special test class for the client projects to test end-to-end experience. It will 28 | * generate configs for testing and start the server. 29 | */ 30 | object TestDeltaSharingServer { 31 | def main(args: Array[String]): Unit = { 32 | val pid = ManagementFactory.getRuntimeMXBean().getName().split("@")(0) 33 | val pidFile = new File(args(0)) 34 | // scalastyle:off println 35 | println(s"Writing pid $pid to $pidFile") 36 | // scalastyle:off on 37 | FileUtils.writeStringToFile(pidFile, pid) 38 | if (sys.env.get("AWS_ACCESS_KEY_ID").exists(_.length > 0)) { 39 | val serverConfigPath = TestResource.setupTestTables().getCanonicalPath 40 | val serverConfig = ServerConfig.load(serverConfigPath) 41 | val server = DeltaSharingService.start(serverConfig) 42 | // Run at most 240 seconds and exit. This is to ensure we can exit even if the parent process 43 | // hits any error. 44 | Thread.sleep(240000) 45 | server.stop() 46 | } else { 47 | throw new IllegalArgumentException("Cannot find AWS_ACCESS_KEY_ID in sys.env") 48 | } 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /server/src/main/scala/io/delta/sharing/server/exceptions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.server 18 | 19 | /** 20 | * A special exception for invalid requests happening in Delta Sharing Server. We define a special 21 | * class rather than reusing `IllegalArgumentException` so that we can ensure that the message in 22 | * `IllegalArgumentException` thrown from other libraries won't be returned to users. 23 | * 24 | * @note `message` will be in the response. Please make sure it doesn't contain any sensitive info. 25 | */ 26 | class DeltaSharingIllegalArgumentException(message: String) 27 | extends IllegalArgumentException(message) 28 | 29 | /** 30 | * A special exception for resource not found in Delta Sharing Server. We define a special 31 | * class rather than reusing `NoSuchElementException` so that we can ensure that the message in 32 | * `NoSuchElementException` thrown from other libraries won't be returned to users. 33 | * 34 | * @note `message` will be in the response. Please make sure it doesn't contain any sensitive info. 35 | */ 36 | class DeltaSharingNoSuchElementException(message: String) 37 | extends NoSuchElementException(message) 38 | 39 | 40 | /** 41 | * A special exception that wraps an unhandled exception when processing a request. 42 | * `DeltaInternalException` should never be exposed to users as an unhandled exception may contain 43 | * sensitive information. 
44 | */ 45 | class DeltaInternalException(e: Throwable) extends RuntimeException(e) 46 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *#*# 2 | *.#* 3 | *.iml 4 | *.ipr 5 | *.iws 6 | *.pyc 7 | *.pyo 8 | *.swp 9 | *~ 10 | .DS_Store 11 | .bsp 12 | .cache 13 | .classpath 14 | .ensime 15 | .ensime_cache/ 16 | .ensime_lucene 17 | .generated-mima* 18 | .idea/ 19 | .idea_modules/ 20 | .project 21 | .pydevproject 22 | .scala_dependencies 23 | .settings 24 | /lib/ 25 | R-unit-tests.log 26 | R/unit-tests.out 27 | R/cran-check.out 28 | R/pkg/vignettes/sparkr-vignettes.html 29 | R/pkg/tests/fulltests/Rplots.pdf 30 | build/*.jar 31 | build/apache-maven* 32 | build/scala* 33 | build/zinc* 34 | cache 35 | conf/*.cmd 36 | conf/*.conf 37 | conf/*.properties 38 | conf/*.sh 39 | conf/*.xml 40 | conf/java-opts 41 | conf/slaves 42 | dependency-reduced-pom.xml 43 | derby.log 44 | dev/create-release/*final 45 | dev/create-release/*txt 46 | dev/pr-deps/ 47 | dist/ 48 | docs/_site 49 | docs/api 50 | sql/docs 51 | sql/site 52 | lib_managed/ 53 | lint-r-report.log 54 | log/ 55 | logs/ 56 | out/ 57 | project/boot/ 58 | project/build/target/ 59 | project/plugins/lib_managed/ 60 | project/plugins/project/build.properties 61 | project/plugins/src_managed/ 62 | project/plugins/target/ 63 | python/lib/pyspark.zip 64 | python/deps 65 | docs/python/_static/ 66 | docs/python/_templates/ 67 | docs/python/_build/ 68 | python/build/ 69 | python/test_coverage/coverage_data 70 | python/test_coverage/htmlcov 71 | python/pyspark/python 72 | reports/ 73 | scalastyle-on-compile.generated.xml 74 | scalastyle-output.xml 75 | scalastyle.txt 76 | spark-*-bin-*.tgz 77 | spark-tests.log 78 | src_managed/ 79 | streaming-tests.log 80 | target/ 81 | unit-tests.log 82 | work/ 83 | docs/.jekyll-metadata 84 | 85 | # For Hive 86 | TempStatsStore/ 87 | metastore/ 88 | metastore_db/ 89 | sql/hive-thriftserver/test_warehouses 90 | warehouse/ 91 | spark-warehouse/ 92 | 93 | # For R session data 94 | .RData 95 | .RHistory 96 | .Rhistory 97 | *.Rproj 98 | *.Rproj.* 99 | 100 | .Rproj.user 101 | 102 | **/src/main/resources/js 103 | 104 | # For SBT 105 | .jvmopts 106 | 107 | # For Python 108 | *.egg-info 109 | 110 | # For VSCode 111 | *.vscode 112 | 113 | # For Metals 114 | *.metals 115 | 116 | # For venv 117 | *.venv 118 | -------------------------------------------------------------------------------- /examples/python/quickstart_pandas.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (2021) The Delta Lake Project Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import os 18 | import delta_sharing 19 | 20 | # Point to the profile file. It can be a file on the local file system or a file on a remote storage. 
21 | profile_file = os.path.dirname(__file__) + "/../open-datasets.share" 22 | 23 | # Create a SharingClient. 24 | client = delta_sharing.SharingClient(profile_file) 25 | 26 | # List all shared tables. 27 | print("########### All Available Tables #############") 28 | print(client.list_all_tables()) 29 | 30 | # Create a URL to access a shared table. 31 | # A table path is the profile file path followed by `#` and the fully qualified name of a table (`<share-name>.<schema-name>.<table-name>`). 32 | table_url = profile_file + "#delta_sharing.default.owid-covid-data" 33 | 34 | # Fetch 10 rows from a table and convert it to a Pandas DataFrame. This can be used to read sample data from a table that cannot fit in memory. 35 | print("########### Loading 10 rows from delta_sharing.default.owid-covid-data as a Pandas DataFrame #############") 36 | data = delta_sharing.load_as_pandas(table_url, limit=10) 37 | 38 | # Print the sample. 39 | print("########### Show the fetched 10 rows #############") 40 | print(data) 41 | 42 | # Load a table as a Pandas DataFrame. This can be used to process tables that can fit in memory. 43 | print("########### Loading delta_sharing.default.owid-covid-data as a Pandas DataFrame #############") 44 | data = delta_sharing.load_as_pandas(table_url) 45 | 46 | # Do whatever you want with your shared data! 47 | print("########### Show Data #############") 48 | print(data[data["iso_code"] == "USA"].head(10)) 49 | -------------------------------------------------------------------------------- /spark/src/test/scala/io/delta/sharing/spark/util/RetryUtilsSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | 17 | package io.delta.sharing.spark.util 18 | 19 | import java.io.{InterruptedIOException, IOException} 20 | 21 | import scala.collection.mutable.ArrayBuffer 22 | 23 | import org.apache.spark.SparkFunSuite 24 | 25 | class RetryUtilsSuite extends SparkFunSuite { 26 | import RetryUtils._ 27 | 28 | test("shouldRetry") { 29 | assert(shouldRetry(new UnexpectedHttpStatus("error", 429))) 30 | assert(shouldRetry(new UnexpectedHttpStatus("error", 500))) 31 | assert(!shouldRetry(new UnexpectedHttpStatus("error", 404))) 32 | assert(!shouldRetry(new InterruptedException)) 33 | assert(!shouldRetry(new InterruptedIOException)) 34 | assert(shouldRetry(new IOException)) 35 | assert(!shouldRetry(new RuntimeException)) 36 | } 37 | 38 | test("runWithExponentialBackoff") { 39 | val sleeps = new ArrayBuffer[Long]() 40 | RetryUtils.sleeper = (sleepMs: Long) => sleeps += sleepMs 41 | // Retry case 42 | intercept[UnexpectedHttpStatus] { 43 | runWithExponentialBackoff(10) { 44 | throw new UnexpectedHttpStatus("error", 429) 45 | } 46 | } 47 | // Run 11 times should sleep 10 times 48 | assert(sleeps.length == 10) 49 | assert(sleeps == Seq(100, 200, 400, 800, 1600, 3200, 6400, 12800, 25600, 51200)) 50 | // No retry case 51 | sleeps.clear() 52 | intercept[RuntimeException] { 53 | runWithExponentialBackoff(10) { 54 | throw new RuntimeException 55 | } 56 | } 57 | assert(sleeps == Seq()) 58 | RetryUtils.sleeper = (sleepMs: Long) => Thread.sleep(sleepMs) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /examples/python/quickstart_spark.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (2021) The Delta Lake Project Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import os 18 | import delta_sharing 19 | from pyspark.sql import SparkSession 20 | 21 | # Point to the profile file. It can be a file on the local file system or a file on a remote storage. 22 | profile_file = os.path.dirname(__file__) + "/../open-datasets.share" 23 | 24 | # Create a SharingClient. 25 | client = delta_sharing.SharingClient(profile_file) 26 | 27 | # List all shared tables. 28 | print("########### All Available Tables #############") 29 | print(client.list_all_tables()) 30 | 31 | # Create a URL to access a shared table. 32 | # A table path is the profile file path followed by `#` and the fully qualified name of a table (`<share-name>.<schema-name>.<table-name>`).
33 | table_url = profile_file + "#delta_sharing.default.owid-covid-data" 34 | 35 | # Create Spark with delta sharing connector 36 | spark = SparkSession.builder \ 37 | .appName("delta-sharing-demo") \ 38 | .master("local[*]") \ 39 | .getOrCreate() 40 | 41 | # Read data using format "deltaSharing" 42 | print("########### Loading delta_sharing.default.owid-covid-data with Spark #############") 43 | df1 = spark.read.format("deltaSharing").load(table_url) \ 44 | .where("iso_code == 'USA'") \ 45 | .select("iso_code", "total_cases", "human_development_index") \ 46 | .show() 47 | 48 | # Or if the code is running with PySpark, you can use `load_as_spark` to load the table as a Spark DataFrame. 49 | print("########### Loading delta_sharing.default.owid-covid-data with Spark #############") 50 | data = delta_sharing.load_as_spark(table_url) 51 | data.where("iso_code == 'USA'") \ 52 | .select("iso_code", "total_cases", "human_development_index").show() 53 | -------------------------------------------------------------------------------- /spark/src/main/scala/io/delta/sharing/spark/util/RetryUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package io.delta.sharing.spark.util 18 | 19 | import java.io.{InterruptedIOException, IOException} 20 | 21 | import scala.util.control.NonFatal 22 | 23 | import org.apache.spark.internal.Logging 24 | 25 | private[sharing] object RetryUtils extends Logging { 26 | 27 | // Expose it for testing 28 | @volatile var sleeper: Long => Unit = (sleepMs: Long) => Thread.sleep(sleepMs) 29 | 30 | def runWithExponentialBackoff[T](numRetries: Int)(func: => T): T = { 31 | var times = 0 32 | var sleepMs = 100 33 | while (true) { 34 | times += 1 35 | try { 36 | return func 37 | } catch { 38 | case NonFatal(e) if shouldRetry(e) && times <= numRetries => 39 | logWarning(s"Sleeping $sleepMs ms to retry because of error: ${e.getMessage}", e) 40 | sleeper(sleepMs) 41 | sleepMs *= 2 42 | } 43 | } 44 | throw new IllegalStateException("Should not happen") 45 | } 46 | 47 | def shouldRetry(t: Throwable): Boolean = { 48 | t match { 49 | case e: UnexpectedHttpStatus => 50 | if (e.statusCode == 429) { // Too Many Requests 51 | true 52 | } else if (e.statusCode >= 500 && e.statusCode < 600) { // Internal Error 53 | true 54 | } else { 55 | false 56 | } 57 | case _: InterruptedException => false 58 | case _: InterruptedIOException => false 59 | case _: IOException => true 60 | case _ => false 61 | } 62 | } 63 | } 64 | 65 | private[sharing] class UnexpectedHttpStatus(message: String, val statusCode: Int) 66 | extends IllegalStateException(message) 67 | -------------------------------------------------------------------------------- /server/src/main/scala/io/delta/standalone/internal/DeltaCDFErrors.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.standalone.internal 18 | 19 | class DeltaCDFIllegalArgumentException(message: String) 20 | extends IllegalArgumentException(message) 21 | 22 | object DeltaCDFErrors { 23 | def multipleCDFBoundary(position: String): Throwable = { 24 | new DeltaCDFIllegalArgumentException(s"Multiple $position arguments provided for CDF read. " + 25 | s"Please provide one of either ${position}Timestamp or ${position}Version." 26 | ) 27 | } 28 | 29 | def noStartVersionForCDF: Throwable = { 30 | new DeltaCDFIllegalArgumentException("No startingVersion or startingTimestamp provided for " + 31 | "CDF read.") 32 | } 33 | 34 | def startVersionAfterLatestVersion(start: Long, latest: Long): Throwable = { 35 | new DeltaCDFIllegalArgumentException(s"Provided Start version($start) for reading change " + 36 | "data is invalid. Start version cannot be greater than the latest version of the " + 37 | s"table($latest)." 38 | ) 39 | } 40 | 41 | def endBeforeStartVersionInCDF(start: Long, end: Long): Throwable = { 42 | new DeltaCDFIllegalArgumentException( 43 | s"CDF range from start $start to end $end was invalid. End cannot be before start." 
44 | ) 45 | } 46 | 47 | def invalidTimestamp(field: String, message: String): Throwable = { 48 | new DeltaCDFIllegalArgumentException(s"Invalid $field: $message") 49 | } 50 | 51 | def changeDataNotRecordedException(version: Long, start: Long, end: Long): Throwable = { 52 | new DeltaCDFIllegalArgumentException(s"Error getting change data for range [$start, $end] " + 53 | s"as change data was not recorded for version [$version]" 54 | ) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /spark/src/main/scala/io/delta/sharing/spark/perf/DeltaSharingLimitPushDown.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.spark.perf 18 | 19 | import org.apache.spark.sql.SparkSession 20 | import org.apache.spark.sql.catalyst.expressions.IntegerLiteral 21 | import org.apache.spark.sql.catalyst.plans.logical.{LocalLimit, LogicalPlan} 22 | import org.apache.spark.sql.catalyst.rules.Rule 23 | import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} 24 | 25 | import io.delta.sharing.spark.RemoteDeltaSnapshotFileIndex 26 | 27 | object DeltaSharingLimitPushDown extends Rule[LogicalPlan] { 28 | 29 | def setup(spark: SparkSession): Unit = synchronized { 30 | if (!spark.experimental.extraOptimizations.contains(DeltaSharingLimitPushDown) ) { 31 | spark.experimental.extraOptimizations ++= Seq(DeltaSharingLimitPushDown) 32 | } 33 | } 34 | 35 | def apply(p: LogicalPlan): LogicalPlan = { 36 | if (p.conf.getConfString("spark.delta.sharing.limitPushdown.enabled", "true").toBoolean) { 37 | p transform { 38 | case localLimit @ LocalLimit( 39 | literalExpr @ IntegerLiteral(limit), 40 | l @ LogicalRelation( 41 | r @ HadoopFsRelation(remoteIndex: RemoteDeltaSnapshotFileIndex, _, _, _, _, _), 42 | _, _, _) 43 | ) => 44 | if (remoteIndex.limitHint.isEmpty) { 45 | val spark = SparkSession.active 46 | LocalLimit(literalExpr, 47 | l.copy( 48 | relation = r.copy( 49 | location = remoteIndex.copy(limitHint = Some(limit)))(spark) 50 | ) 51 | ) 52 | } else { 53 | localLimit 54 | } 55 | } 56 | } else { 57 | p 58 | } 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /spark/src/test/scala/io/delta/sharing/spark/DeltaSharingFileSystemSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.spark 18 | 19 | import org.apache.hadoop.conf.Configuration 20 | import org.apache.hadoop.fs.Path 21 | import org.apache.spark.SparkFunSuite 22 | 23 | import io.delta.sharing.spark.model.{AddCDCFile, AddFile, AddFileForCDF, FileAction, RemoveFile} 24 | 25 | class DeltaSharingFileSystemSuite extends SparkFunSuite { 26 | import DeltaSharingFileSystem._ 27 | 28 | test("encode and decode") { 29 | val tablePath = new Path("https://delta.io/foo") 30 | 31 | val actions: Seq[FileAction] = Seq( 32 | AddFile("unused", "id", Map.empty, 100), 33 | AddFileForCDF("unused_cdf", "id_cdf", Map.empty, 200, 1, 2), 34 | AddCDCFile("unused_cdc", "id_cdc", Map.empty, 300, 1, 2), 35 | RemoveFile("unused_rem", "id_rem", Map.empty, 400, 1, 2) 36 | ) 37 | 38 | actions.foreach ( action => { 39 | assert(decode(encode(tablePath, action)) == 40 | DeltaSharingPath("https://delta.io/foo", action.id, action.size)) 41 | }) 42 | } 43 | 44 | test("file system should be cached") { 45 | val tablePath = new Path("https://delta.io/foo") 46 | val actions: Seq[FileAction] = Seq( 47 | AddFile("unused", "id", Map.empty, 100), 48 | AddFileForCDF("unused_cdf", "id_cdf", Map.empty, 200, 1, 2), 49 | AddCDCFile("unused_cdc", "id_cdc", Map.empty, 300, 1, 2), 50 | RemoveFile("unused_rem", "id_rem", Map.empty, 400, 1, 2) 51 | ) 52 | 53 | actions.foreach( action => { 54 | val path = encode(tablePath, action) 55 | val conf = new Configuration 56 | val fs = path.getFileSystem(conf) 57 | assert(fs.isInstanceOf[DeltaSharingFileSystem]) 58 | assert(fs eq path.getFileSystem(conf)) 59 | }) 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /server/src/test/scala/io/delta/standalone/internal/PartitionFilterUtilsSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package io.delta.standalone.internal 18 | 19 | import io.delta.standalone.internal.actions.AddFile 20 | import org.apache.spark.sql.types.StructType 21 | import org.scalatest.FunSuite 22 | 23 | class PartitionFilterUtilsSuite extends FunSuite { 24 | 25 | import PartitionFilterUtils._ 26 | 27 | test("evaluatePredicate") { 28 | val schema = StructType.fromDDL("c1 INT, c2 INT").json 29 | val add1 = AddFile("foo1", Map("c2" -> "0"), 1, 1, true) 30 | val add2 = AddFile("foo2", Map("c2" -> "1"), 1, 1, true) 31 | val addFiles = add1 :: add2 :: Nil 32 | assert(add1 :: Nil == evaluatePredicate(schema, "c2" :: Nil, "c2 = 0" :: Nil, addFiles)) 33 | assert(add2 :: Nil == evaluatePredicate(schema, "c2" :: Nil, "c2 = 1" :: Nil, addFiles)) 34 | assert(add2 :: Nil == evaluatePredicate(schema, "c2" :: Nil, "c2 > 0" :: Nil, addFiles)) 35 | assert(add1 :: Nil == evaluatePredicate(schema, "c2" :: Nil, "c2 < 1" :: Nil, addFiles)) 36 | assert(add2 :: Nil == evaluatePredicate(schema, "c2" :: Nil, "c2 >= 1" :: Nil, addFiles)) 37 | assert(add1 :: Nil == evaluatePredicate(schema, "c2" :: Nil, "c2 <= 0" :: Nil, addFiles)) 38 | assert(add2 :: Nil == evaluatePredicate(schema, "c2" :: Nil, "c2 <> 0" :: Nil, addFiles)) 39 | assert(add1 :: Nil == evaluatePredicate(schema, "c2" :: Nil, "c2 <> 1" :: Nil, addFiles)) 40 | assert(Nil == evaluatePredicate(schema, "c2" :: Nil, "c2 is null" :: Nil, addFiles)) 41 | assert(addFiles == evaluatePredicate(schema, "c2" :: Nil, "c2 is not null" :: Nil, addFiles)) 42 | assert(addFiles == evaluatePredicate(schema, "c2" :: Nil, "c2 is not null" :: Nil, addFiles)) 43 | 44 | // Unsupported expression 45 | assert(addFiles == evaluatePredicate(schema, "c2" :: Nil, "c2 = 0 + 1" :: Nil, addFiles)) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /spark/src/main/scala/io/delta/sharing/spark/InMemoryHttpInputStream.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.spark 18 | 19 | import java.io.{ByteArrayInputStream, EOFException} 20 | import java.net.URI 21 | 22 | import org.apache.commons.io.IOUtils 23 | import org.apache.hadoop.fs.{PositionedReadable, Seekable} 24 | 25 | /** An input stream that holds the entire content in memory to provide random access. 
*/ 26 | private[sharing] class InMemoryHttpInputStream(uri: URI) 27 | extends ByteArrayInputStream(IOUtils.toByteArray(uri)) with Seekable with PositionedReadable { 28 | 29 | override def seek(pos: Long): Unit = synchronized { 30 | this.pos = pos.toInt 31 | } 32 | 33 | override def getPos: Long = synchronized { 34 | pos 35 | } 36 | 37 | override def seekToNewSource(targetPos: Long): Boolean = { 38 | // We don't support this feature 39 | false 40 | } 41 | 42 | override def read( 43 | position: Long, 44 | buffer: Array[Byte], 45 | offset: Int, 46 | length: Int): Int = synchronized { 47 | val oldPos = getPos() 48 | var nread = -1 49 | try { 50 | seek(position) 51 | nread = read(buffer, offset, length) 52 | } finally { 53 | seek(oldPos) 54 | } 55 | return nread 56 | } 57 | 58 | override def readFully( 59 | position: Long, 60 | buffer: Array[Byte], 61 | offset: Int, 62 | length: Int): Unit = synchronized { 63 | var nread = 0 64 | while (nread < length) { 65 | val nbytes = read(position + nread, buffer, offset + nread, length - nread) 66 | if (nbytes < 0) { 67 | throw new EOFException("End of file reached before reading fully."); 68 | } 69 | nread += nbytes 70 | } 71 | } 72 | 73 | override def readFully(position: Long, buffer: Array[Byte]): Unit = { 74 | readFully(position, buffer, 0, buffer.length) 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /spark/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # This file contains code from the Apache Hadoop project (original license above). 18 | # It contains modifications, which are licensed as follows: 19 | # 20 | # Copyright (2021) The Delta Lake Project Authors. 21 | # Licensed under the Apache License, Version 2.0 (the "License"); 22 | # you may not use this file except in compliance with the License. 23 | # You may obtain a copy of the License at 24 | # http://www.apache.org/licenses/LICENSE-2.0 25 | # Unless required by applicable law or agreed to in writing, software 26 | # distributed under the License is distributed on an "AS IS" BASIS, 27 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 28 | # See the License for the specific language governing permissions and 29 | # limitations under the License. 
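A rough usage sketch for the stream above (not part of the repository; it would have to live inside the io.delta.sharing.spark package since the class is private[sharing], and the URI is hypothetical). It shows that positioned reads never move the current position:

import java.net.URI

val in = new InMemoryHttpInputStream(new URI("file:///tmp/sample.bin")) // hypothetical resource
val buf = new Array[Byte](8)
in.readFully(4L, buf, 0, 8)   // read 8 bytes starting at offset 4
assert(in.getPos == 0L)       // the positioned read restored the previous position
in.seek(4L)                   // only an explicit seek moves the cursor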
30 | # 31 | 32 | # Set everything to be logged to the file target/unit-tests.log 33 | test.appender=file 34 | log4j.rootCategory=INFO, ${test.appender} 35 | log4j.appender.file=org.apache.log4j.FileAppender 36 | log4j.appender.file.append=true 37 | log4j.appender.file.file=target/unit-tests.log 38 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 39 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n 40 | 41 | # Tests that launch java subprocesses can set the "test.appender" system property to 42 | # "console" to avoid having the child process's logs overwrite the unit test's 43 | # log file. 44 | log4j.appender.console=org.apache.log4j.ConsoleAppender 45 | log4j.appender.console.target=System.err 46 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 47 | log4j.appender.console.layout.ConversionPattern=%t: %m%n 48 | 49 | # Ignore messages below warning level from Jetty, because it's a bit verbose 50 | log4j.logger.org.spark_project.jetty=WARN 51 | -------------------------------------------------------------------------------- /server/src/main/protobuf/protocol.proto: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | syntax = "proto2"; 17 | 18 | package io.delta.sharing.server.protocol; 19 | 20 | import "scalapb/scalapb.proto"; 21 | 22 | option java_package = "io.delta.sharing.server.protocol"; 23 | 24 | option java_generate_equals_and_hash = true; 25 | option (scalapb.options).flat_package = true; 26 | 27 | // Define the JSON objects used by REST APIs. The table metadata format is not defined in this file 28 | // because it requires Map type which is not supported by Protocol Buffers Version 2. 29 | 30 | message Share { 31 | optional string name = 1; 32 | } 33 | 34 | message Schema { 35 | optional string name = 1; 36 | optional string share = 2; 37 | } 38 | 39 | message Table { 40 | optional string name = 1; 41 | optional string schema = 2; 42 | optional string share = 3; 43 | } 44 | 45 | message QueryTableRequest { 46 | repeated string predicateHints = 1; 47 | optional int64 limitHint = 2; 48 | 49 | // The table version being queried. 50 | // If not specified, the query is assumed to be for the latest version. 
51 | optional int64 version = 3; 52 | } 53 | 54 | message ListSharesResponse { 55 | repeated Share items = 1; 56 | optional string next_page_token = 2; 57 | } 58 | 59 | message GetShareResponse { 60 | optional Share share = 1; 61 | } 62 | 63 | message ListSchemasResponse { 64 | repeated Schema items = 1; 65 | optional string next_page_token = 2; 66 | } 67 | 68 | message ListTablesResponse { 69 | repeated Table items = 1; 70 | optional string next_page_token = 2; 71 | } 72 | 73 | message ListAllTablesResponse { 74 | repeated Table items = 1; 75 | optional string next_page_token = 2; 76 | } 77 | 78 | // Define a special class to generate the page token for pagination. It includes the information we 79 | // need to know where we should start to query, and check whether the page token comes from the 80 | // right result. For example, we would like to throw an error when the user uses a page token 81 | // returning from ListShares and uses it in ListSchemas REST API. 82 | message PageToken { 83 | optional string id = 1; 84 | optional string share = 2; 85 | optional string schema = 3; 86 | } 87 | -------------------------------------------------------------------------------- /python/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # 4 | # Copyright (C) 2021 The Delta Lake Project Authors. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
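Purely as an illustration (the repository does not ship this sample), a JSON body for QueryTableRequest with all three fields populated might look like the following; the predicate strings are invented hints:

{
  "predicateHints": ["date >= '2021-04-28'", "date <= '2021-04-29'"],
  "limitHint": 1000,
  "version": 3
}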
17 | # 18 | from io import open 19 | from os import path 20 | from setuptools import setup 21 | import sys 22 | 23 | DESCRIPTION = "Python Connector for Delta Sharing" 24 | 25 | this_directory = path.abspath(path.dirname(__file__)) 26 | with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f: 27 | LONG_DESCRIPTION = f.read() 28 | 29 | try: 30 | exec(open('delta_sharing/version.py').read()) 31 | except IOError: 32 | print("Failed to load Delta Sharing version file for packaging.", 33 | file=sys.stderr) 34 | sys.exit(-1) 35 | VERSION = __version__ # noqa 36 | 37 | setup( 38 | name='delta-sharing', 39 | version=VERSION, 40 | packages=[ 41 | 'delta_sharing', 42 | ], 43 | python_requires='>=3.7', 44 | install_requires=[ 45 | 'pandas', 46 | 'pyarrow>=4.0.0', 47 | 'fsspec>=0.7.4', 48 | 'requests', 49 | 'aiohttp', 50 | 'dataclasses;python_version<"3.7"', 51 | 'yarl>=1.6.0', 52 | ], 53 | extras_require={ 54 | 's3': ['s3fs'], 55 | 'abfs': ['adlfs'], 56 | 'adl': ['adlfs'], 57 | 'gcs': ['gcsfs'], 58 | 'gs': ['gcsfs'], 59 | }, 60 | author="The Delta Lake Project Authors", 61 | author_email="delta-users@googlegroups.com", 62 | license="Apache-2.0", 63 | description=DESCRIPTION, 64 | long_description=LONG_DESCRIPTION, 65 | long_description_content_type='text/markdown', 66 | url="https://github.com/delta-io/delta-sharing/", 67 | project_urls={ 68 | 'Source': 'https://github.com/delta-io/delta-sharing', 69 | 'Documentation': 'https://github.com/delta-io/delta-sharing', 70 | 'Issues': 'https://github.com/delta-io/delta-sharing/issues' 71 | }, 72 | classifiers=[ 73 | "Development Status :: 5 - Production/Stable", 74 | "Intended Audience :: Developers", 75 | "License :: OSI Approved :: Apache Software License", 76 | "Operating System :: OS Independent", 77 | "Topic :: Software Development :: Libraries :: Python Modules", 78 | 'Programming Language :: Python :: 3.7', 79 | 'Programming Language :: Python :: 3.8', 80 | 'Programming Language :: Python :: 3.9', 81 | ], 82 | ) 83 | -------------------------------------------------------------------------------- /python/delta_sharing/tests/test_converter.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2021 The Delta Lake Project Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
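A side note on the extras_require mapping above: each extra only pulls in the matching fsspec implementation, so an install for S3-backed tables could look like this (illustrative command):

pip install delta-sharing[s3]   # adds s3fs; use [gcs]/[gs] for gcsfs and [abfs]/[adl] for adlfs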
15 | # 16 | from datetime import date 17 | from decimal import Decimal 18 | from json import loads 19 | from typing import Any 20 | 21 | import numpy as np 22 | import pandas as pd 23 | import pytest 24 | 25 | from delta_sharing.converter import to_converter, get_empty_table 26 | 27 | 28 | def test_to_converter_boolean(): 29 | converter = to_converter("boolean") 30 | assert converter("true") is True 31 | assert converter("false") is False 32 | assert converter("") is None 33 | 34 | 35 | @pytest.mark.parametrize( 36 | "type_str,expected", 37 | [ 38 | pytest.param("byte", np.int8(1), id="byte"), 39 | pytest.param("short", np.int16(1), id="short"), 40 | pytest.param("integer", np.int32(1), id="integer"), 41 | pytest.param("long", np.int64(1), id="long"), 42 | pytest.param("float", np.float32(1), id="float"), 43 | pytest.param("double", np.float64(1), id="double"), 44 | ], 45 | ) 46 | def test_to_converter_numeric(type_str: str, expected: Any): 47 | converter = to_converter(type_str) 48 | assert converter("1") == expected 49 | assert np.isnan(converter("")) 50 | 51 | 52 | def test_to_converter_decimal(): 53 | converter = to_converter("decimal(10,0)") 54 | assert converter("1") == Decimal(1) 55 | assert converter("") is None 56 | 57 | 58 | def test_to_converter_string(): 59 | converter = to_converter("string") 60 | assert converter("abc") == "abc" 61 | assert converter("") is None 62 | 63 | 64 | def test_to_converter_date(): 65 | converter = to_converter("date") 66 | assert converter("2021-01-01") == date(2021, 1, 1) 67 | assert converter("") is None 68 | 69 | 70 | def test_to_converter_timestamp(): 71 | converter = to_converter("timestamp") 72 | assert converter("2021-04-28 23:36:47.599") == pd.Timestamp("2021-04-28 23:36:47.599") 73 | assert converter("") is pd.NaT 74 | 75 | 76 | def test_get_empty_table(): 77 | schema_string = ( 78 | '{"fields": [' 79 | '{"metadata": {},"name": "a","nullable": true,"type": "long"},' 80 | '{"metadata": {},"name": "b","nullable": true,"type": "string"}' 81 | '],"type":"struct"}' 82 | ) 83 | schema_json = loads(schema_string) 84 | pdf = get_empty_table(schema_json) 85 | assert pdf.empty 86 | assert pdf.columns.values.size == 2 87 | assert pdf.columns.values[0] == "a" 88 | assert pdf.columns.values[1] == "b" 89 | -------------------------------------------------------------------------------- /spark/src/main/scala/io/delta/sharing/spark/DeltaSharingProfileProvider.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package io.delta.sharing.spark 18 | 19 | import java.nio.charset.StandardCharsets.UTF_8 20 | 21 | import org.apache.commons.io.IOUtils 22 | import org.apache.hadoop.conf.Configuration 23 | import org.apache.hadoop.fs.Path 24 | 25 | import io.delta.sharing.spark.util.JsonUtils 26 | 27 | case class DeltaSharingProfile( 28 | shareCredentialsVersion: Option[Int] = Some(DeltaSharingProfile.CURRENT), 29 | endpoint: String = null, 30 | bearerToken: String = null, 31 | expirationTime: String = null) 32 | 33 | object DeltaSharingProfile { 34 | val CURRENT = 1 35 | } 36 | 37 | /** 38 | * A provider that provides Delta Sharing profile for data recipient to access the shared data. 39 | * https://github.com/delta-io/delta-sharing/blob/main/PROTOCOL.md#profile-file-format. 40 | */ 41 | trait DeltaSharingProfileProvider { 42 | def getProfile: DeltaSharingProfile 43 | } 44 | 45 | /** 46 | * Load [[DeltaSharingProfile]] from a file. `conf` should be provided to load the file from remote 47 | * file systems. 48 | */ 49 | private[sharing] class DeltaSharingFileProfileProvider( 50 | conf: Configuration, 51 | file: String) extends DeltaSharingProfileProvider { 52 | 53 | val profile = { 54 | val input = new Path(file).getFileSystem(conf).open(new Path(file)) 55 | val profile = try { 56 | JsonUtils.fromJson[DeltaSharingProfile](IOUtils.toString(input, UTF_8)) 57 | } finally { 58 | input.close() 59 | } 60 | if (profile.shareCredentialsVersion.isEmpty) { 61 | throw new IllegalArgumentException( 62 | "Cannot find the 'shareCredentialsVersion' field in the profile file") 63 | } 64 | 65 | if (profile.shareCredentialsVersion.get > DeltaSharingProfile.CURRENT) { 66 | throw new IllegalArgumentException( 67 | s"'shareCredentialsVersion' in the profile is " + 68 | s"${profile.shareCredentialsVersion.get} which is too new. The current release " + 69 | s"supports version ${DeltaSharingProfile.CURRENT} and below. Please upgrade to a newer " + 70 | s"release.") 71 | } 72 | if (profile.endpoint == null) { 73 | throw new IllegalArgumentException("Cannot find the 'endpoint' field in the profile file") 74 | } 75 | if (profile.bearerToken == null) { 76 | throw new IllegalArgumentException("Cannot find the 'bearerToken' field in the profile file") 77 | } 78 | profile 79 | } 80 | 81 | override def getProfile: DeltaSharingProfile = profile 82 | } 83 | -------------------------------------------------------------------------------- /spark/src/main/scala/io/delta/sharing/spark/RemoteDeltaCDFRelation.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
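A minimal sketch (not in the source tree) of how this provider could be exercised from inside the io.delta.sharing.spark package; the profile path is hypothetical:

import org.apache.hadoop.conf.Configuration

val provider = new DeltaSharingFileProfileProvider(
  new Configuration, "/tmp/open-datasets.share")  // hypothetical local profile file
val profile = provider.getProfile
// endpoint and bearerToken are non-null at this point; a missing field or an
// unsupported shareCredentialsVersion raises IllegalArgumentException instead.
println(profile.endpoint)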
15 | */ 16 | 17 | package io.delta.sharing.spark 18 | 19 | import scala.collection.mutable.ListBuffer 20 | 21 | import org.apache.spark.rdd.RDD 22 | import org.apache.spark.sql.{DataFrame, DeltaSharingScanUtils, Row, SparkSession, SQLContext} 23 | import org.apache.spark.sql.execution.LogicalRDD 24 | import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} 25 | import org.apache.spark.sql.sources.{BaseRelation, Filter, PrunedFilteredScan} 26 | import org.apache.spark.sql.types.StructType 27 | 28 | import io.delta.sharing.spark.model.{CDFColumnInfo, Metadata, Table => DeltaSharingTable} 29 | 30 | case class RemoteDeltaCDFRelation( 31 | spark: SparkSession, 32 | snapshotToUse: RemoteSnapshot, 33 | client: DeltaSharingClient, 34 | table: DeltaSharingTable, 35 | cdfOptions: Map[String, String]) extends BaseRelation with PrunedFilteredScan { 36 | 37 | override def schema: StructType = DeltaTableUtils.addCdcSchema(snapshotToUse.schema) 38 | 39 | override def sqlContext: SQLContext = spark.sqlContext 40 | 41 | override def buildScan( 42 | requiredColumns: Array[String], 43 | filters: Array[Filter]): RDD[Row] = { 44 | val deltaTabelFiles = client.getCDFFiles(table, cdfOptions) 45 | val metadata = deltaTabelFiles.metadata 46 | val params = RemoteDeltaFileIndexParams(spark, snapshotToUse) 47 | val dfs = ListBuffer[DataFrame]() 48 | 49 | // We unconditionally add all types of files. 50 | // We will get empty data frames for empty ones, which will get combined later. 51 | dfs.append(scanIndex(new RemoteDeltaCDFAddFileIndex(params, deltaTabelFiles), metadata)) 52 | dfs.append(scanIndex(new RemoteDeltaCDCFileIndex(params, deltaTabelFiles), metadata)) 53 | dfs.append(scanIndex(new RemoteDeltaCDFRemoveFileIndex(params, deltaTabelFiles), metadata)) 54 | 55 | dfs.reduce((df1, df2) => df1.unionAll(df2)).rdd 56 | } 57 | 58 | /** 59 | * Build a dataframe from the specified file index. We can't use a DataFrame scan directly on the 60 | * file names because that scan wouldn't include partition columns. 61 | */ 62 | private def scanIndex(fileIndex: RemoteDeltaCDFFileIndexBase, metadata: Metadata): DataFrame = { 63 | val relation = HadoopFsRelation( 64 | fileIndex, 65 | fileIndex.partitionSchema, 66 | DeltaTableUtils.addCdcSchema(metadata.schemaString), 67 | bucketSpec = None, 68 | snapshotToUse.fileFormat, 69 | Map.empty)(spark) 70 | val plan = LogicalRelation(relation) 71 | DeltaSharingScanUtils.ofRows(spark, plan) 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /python/delta_sharing/tests/conftest.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2021 The Delta Lake Project Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
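A hedged usage sketch (not from the repository) of how this CDF relation is normally reached through the deltaSharing data source; the profile path and table coordinates are invented, and `spark` is assumed to be an active SparkSession:

val changes = spark.read.format("deltaSharing")
  .option("readChangeFeed", "true")     // DeltaSharingDataSource.CDF_ENABLED_KEY
  .option("startingVersion", "1")
  .option("endingVersion", "3")
  .load("/tmp/open-datasets.share#share1.default.table1")
changes.show()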
15 | # 16 | import os 17 | from pathlib import Path 18 | import subprocess 19 | import threading 20 | from typing import Iterator, Optional 21 | 22 | import pytest 23 | from pytest import TempPathFactory 24 | 25 | from delta_sharing.delta_sharing import SharingClient 26 | from delta_sharing.protocol import DeltaSharingProfile 27 | from delta_sharing.rest_client import DataSharingRestClient 28 | 29 | 30 | ENABLE_INTEGRATION = len(os.environ.get("AWS_ACCESS_KEY_ID", "")) > 0 31 | SKIP_MESSAGE = "The integration tests are disabled." 32 | 33 | 34 | @pytest.fixture 35 | def profile_path() -> str: 36 | return os.path.join(os.path.dirname(__file__), "test_profile.json") 37 | 38 | 39 | @pytest.fixture 40 | def profile(profile_path) -> DeltaSharingProfile: 41 | return DeltaSharingProfile.read_from_file(profile_path) 42 | 43 | 44 | @pytest.fixture 45 | def rest_client(profile) -> DataSharingRestClient: 46 | return DataSharingRestClient(profile) 47 | 48 | 49 | @pytest.fixture 50 | def sharing_client(profile) -> SharingClient: 51 | return SharingClient(profile) 52 | 53 | 54 | @pytest.fixture(scope="session", autouse=ENABLE_INTEGRATION) 55 | def test_server(tmp_path_factory: TempPathFactory) -> Iterator[None]: 56 | pid_file: Optional[Path] = None 57 | proc: Optional[subprocess.Popen] = None 58 | try: 59 | if ENABLE_INTEGRATION: 60 | pid_file = tmp_path_factory.getbasetemp() / "delta-sharing-server.pid" 61 | proc = subprocess.Popen( 62 | [ 63 | "./build/sbt", 64 | ( 65 | "server/test:runMain io.delta.sharing.server.TestDeltaSharingServer " 66 | + str(pid_file) 67 | ), 68 | ], 69 | stdout=subprocess.PIPE, 70 | stderr=subprocess.PIPE, 71 | cwd="..", 72 | ) 73 | 74 | ready = threading.Event() 75 | 76 | def wait_for_server() -> None: 77 | for line in proc.stdout: 78 | print(line.decode("utf-8").strip()) 79 | if b"https://127.0.0.1:12345/" in line: 80 | ready.set() 81 | 82 | threading.Thread(target=wait_for_server, daemon=True).start() 83 | 84 | if not ready.wait(timeout=120): 85 | raise TimeoutError("the server didn't start in 120 seconds") 86 | yield 87 | finally: 88 | if ENABLE_INTEGRATION: 89 | if pid_file is not None and pid_file.exists(): 90 | pid = pid_file.read_text() 91 | subprocess.run(["kill", "-9", pid]) 92 | if proc is not None and proc.poll() is None: 93 | proc.kill() 94 | -------------------------------------------------------------------------------- /.github/workflows/build-and-test.yml: -------------------------------------------------------------------------------- 1 | name: Build and Test 2 | on: [push, pull_request] 3 | jobs: 4 | build-and-test: 5 | runs-on: ubuntu-20.04 6 | env: 7 | SPARK_LOCAL_IP: localhost 8 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 9 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 10 | AZURE_TEST_ACCOUNT_KEY: ${{ secrets.AZURE_TEST_ACCOUNT_KEY }} 11 | GOOGLE_APPLICATION_CREDENTIALS: /tmp/google_service_account_key.json 12 | GOOGLE_SERVICE_ACCOUNT_KEY: ${{ secrets.GOOGLE_SERVICE_ACCOUNT_KEY }} 13 | steps: 14 | - name: Checkout repository 15 | uses: actions/checkout@v2 16 | - name: Cache Scala, SBT 17 | uses: actions/cache@v2 18 | with: 19 | path: | 20 | ~/.sbt 21 | ~/.ivy2 22 | ~/.cache/coursier 23 | key: build-and-test-scala 24 | - name: Install Java 8 25 | uses: actions/setup-java@v1 26 | with: 27 | java-version: '8' 28 | - run: ./build/sbt test 29 | 30 | python: 31 | runs-on: ubuntu-20.04 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | python-version: [3.7, 3.8, 3.9] 36 | include: 37 | - pandas-version: 1.2.4 38 | pyarrow-version: 
4.0.0 39 | env: 40 | PYTHON_VERSION: ${{ matrix.python-version }} 41 | PANDAS_VERSION: ${{ matrix.pandas-version }} 42 | PYARROW_VERSION: ${{ matrix.pyarrow-version }} 43 | SPARK_LOCAL_IP: localhost 44 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 45 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 46 | AZURE_TEST_ACCOUNT_KEY: ${{ secrets.AZURE_TEST_ACCOUNT_KEY }} 47 | GOOGLE_APPLICATION_CREDENTIALS: /tmp/google_service_account_key.json 48 | GOOGLE_SERVICE_ACCOUNT_KEY: ${{ secrets.GOOGLE_SERVICE_ACCOUNT_KEY }} 49 | # Github Actions' default miniconda 50 | CONDA_PREFIX: /usr/share/miniconda 51 | steps: 52 | - name: Checkout repository 53 | uses: actions/checkout@v2 54 | - name: Cache Scala, SBT 55 | uses: actions/cache@v2 56 | with: 57 | path: | 58 | ~/.sbt 59 | ~/.ivy2 60 | ~/.cache/coursier 61 | key: build-and-test-python 62 | - name: Install Java 8 63 | uses: actions/setup-java@v1 64 | with: 65 | java-version: '8' 66 | - name: Install dependencies 67 | run: | 68 | # See also https://github.com/conda/conda/issues/7980 69 | source "$CONDA_PREFIX/etc/profile.d/conda.sh" 70 | conda update -q conda 71 | conda create -c conda-forge -q -n test-environment python=$PYTHON_VERSION 72 | conda activate test-environment 73 | conda config --env --add pinned_packages python=$PYTHON_VERSION 74 | conda config --env --add pinned_packages pandas==$PANDAS_VERSION 75 | conda config --env --add pinned_packages pyarrow==$PYARROW_VERSION 76 | conda install -c conda-forge --yes pandas==$PANDAS_VERSION pyarrow==$PYARROW_VERSION 77 | sed -i -e "/pandas/d" -e "/pyarrow/d" python/requirements-dev.txt 78 | conda install -c conda-forge --yes --file python/requirements-dev.txt 79 | conda list 80 | - name: Build Server 81 | run: ./build/sbt package 82 | - name: Run tests 83 | run: | 84 | # See also https://github.com/conda/conda/issues/7980 85 | source "$CONDA_PREFIX/etc/profile.d/conda.sh" 86 | conda activate test-environment 87 | ./python/dev/lint-python 88 | ./python/dev/pytest 89 | -------------------------------------------------------------------------------- /spark/src/test/scala/org/apache/spark/delta/sharing/CachedTableManagerSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
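For reference, the same steps can be reproduced locally with the commands the workflow invokes (run from the repository root):

./build/sbt test            # Scala build and tests
./build/sbt package         # builds the server used by the Python tests
./python/dev/lint-python    # Python lint checks
./python/dev/pytest         # Python connector tests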
15 | */ 16 | 17 | package org.apache.spark.delta.sharing 18 | 19 | import java.lang.ref.WeakReference 20 | 21 | import org.apache.spark.SparkFunSuite 22 | import org.scalatest.concurrent.Eventually._ 23 | import org.scalatest.time.SpanSugar._ 24 | 25 | class CachedTableManagerSuite extends SparkFunSuite { 26 | 27 | test("cache") { 28 | val manager = new CachedTableManager( 29 | preSignedUrlExpirationMs = 10, 30 | refreshCheckIntervalMs = 10, 31 | refreshThresholdMs = 10, 32 | expireAfterAccessMs = 60000 33 | ) 34 | try { 35 | val ref = new AnyRef 36 | manager.register( 37 | "test-table-path", 38 | Map("id1" -> "url1", "id2" -> "url2"), 39 | new WeakReference(ref), 40 | () => { 41 | Map("id1" -> "url1", "id2" -> "url2") 42 | }) 43 | assert(manager.getPreSignedUrl("test-table-path", "id1")._1 == "url1") 44 | assert(manager.getPreSignedUrl("test-table-path", "id2")._1 == "url2") 45 | 46 | manager.register( 47 | "test-table-path2", 48 | Map("id1" -> "url1", "id2" -> "url2"), 49 | new WeakReference(ref), 50 | () => { 51 | Map("id1" -> "url3", "id2" -> "url4") 52 | }) 53 | // We should get the new urls eventually 54 | eventually(timeout(10.seconds)) { 55 | assert(manager.getPreSignedUrl("test-table-path2", "id1")._1 == "url3") 56 | assert(manager.getPreSignedUrl("test-table-path2", "id2")._1 == "url4") 57 | } 58 | 59 | manager.register( 60 | "test-table-path3", 61 | Map("id1" -> "url1", "id2" -> "url2"), 62 | new WeakReference(new AnyRef), 63 | () => { 64 | Map("id1" -> "url3", "id2" -> "url4") 65 | }) 66 | // We should remove the cached table eventually 67 | eventually(timeout(10.seconds)) { 68 | System.gc() 69 | intercept[IllegalStateException](manager.getPreSignedUrl("test-table-path3", "id1")) 70 | intercept[IllegalStateException](manager.getPreSignedUrl("test-table-path3", "id1")) 71 | } 72 | } finally { 73 | manager.stop() 74 | } 75 | } 76 | 77 | test("expireAfterAccessMs") { 78 | val manager = new CachedTableManager( 79 | preSignedUrlExpirationMs = 10, 80 | refreshCheckIntervalMs = 10, 81 | refreshThresholdMs = 10, 82 | expireAfterAccessMs = 10 83 | ) 84 | try { 85 | val ref = new AnyRef 86 | manager.register( 87 | "test-table-path", 88 | Map("id1" -> "url1", "id2" -> "url2"), 89 | new WeakReference(ref), 90 | () => { 91 | Map("id1" -> "url1", "id2" -> "url2") 92 | }) 93 | Thread.sleep(1000) 94 | // We should remove the cached table when it's not accessed 95 | intercept[IllegalStateException](manager.getPreSignedUrl("test-table-path", "id1")) 96 | } finally { 97 | manager.stop() 98 | } 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | We happily welcome contributions to Delta Sharing. We use [GitHub Issues](/../../issues/) to track community reported issues and [GitHub Pull Requests ](/../../pulls/) for accepting changes. 2 | 3 | # Governance 4 | Delta Sharing governance is conducted by the Technical Steering Committee (TSC), which is currently composed of the following members: 5 | - Michael Armbrust (michael.armbrust@gmail.com) 6 | - Reynold Xin (reynoldx@gmail.com) 7 | - Matei Zaharia (matei@cs.stanford.edu) 8 | 9 | The founding technical charter can be found [here](https://delta.io/wp-content/uploads/2019/12/delta-charter.pdf). 10 | 11 | # Communication 12 | Before starting work on a major feature, please reach out to us via GitHub, Slack, email, etc. 
We will make sure no one else is already working on it and ask you to open a GitHub issue. 13 | A "major feature" is defined as any change that is > 100 LOC altered (not including tests), or changes any user-facing behavior. 14 | We will use the GitHub issue to discuss the feature and come to agreement. 15 | This is to prevent your time being wasted, as well as ours. 16 | The GitHub review process for major features is also important so that organizations with commit access can come to agreement on design. 17 | If it is appropriate to write a design document, the document must be hosted either in the GitHub tracking issue, or linked to from the issue and hosted in a world-readable location. 18 | Specifically, if the goal is to add a new extension, please read the extension policy. 19 | Small patches and bug fixes don't need prior communication. 20 | 21 | # Coding style 22 | We generally follow the Apache Spark Scala Style Guide. 23 | 24 | # Sign your work 25 | The sign-off is a simple line at the end of the explanation for the patch. Your signature certifies that you wrote the patch or otherwise have the right to pass it on as an open-source patch. The rules are pretty simple: if you can certify the below (from developercertificate.org): 26 | 27 | ``` 28 | Developer Certificate of Origin 29 | Version 1.1 30 | 31 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 32 | 1 Letterman Drive 33 | Suite D4700 34 | San Francisco, CA, 94129 35 | 36 | Everyone is permitted to copy and distribute verbatim copies of this 37 | license document, but changing it is not allowed. 38 | 39 | 40 | Developer's Certificate of Origin 1.1 41 | 42 | By making a contribution to this project, I certify that: 43 | 44 | (a) The contribution was created in whole or in part by me and I 45 | have the right to submit it under the open source license 46 | indicated in the file; or 47 | 48 | (b) The contribution is based upon previous work that, to the best 49 | of my knowledge, is covered under an appropriate open source 50 | license and I have the right under that license to submit that 51 | work with modifications, whether created in whole or in part 52 | by me, under the same open source license (unless I am 53 | permitted to submit under a different license), as indicated 54 | in the file; or 55 | 56 | (c) The contribution was provided directly to me by some other 57 | person who certified (a), (b) or (c) and I have not modified 58 | it. 59 | 60 | (d) I understand and agree that this project and the contribution 61 | are public and that a record of the contribution (including all 62 | personal information I submit with it, including my sign-off) is 63 | maintained indefinitely and may be redistributed consistent with 64 | this project or the open source license(s) involved. 65 | ``` 66 | 67 | Then you just add a line to every git commit message: 68 | 69 | ``` 70 | Signed-off-by: Joe Smith 71 | Use your real name (sorry, no pseudonyms or anonymous contributions.) 72 | ``` 73 | 74 | If you set your `user.name` and `user.email` git configs, you can sign your commit automatically with git commit -s. 75 | -------------------------------------------------------------------------------- /server/src/main/scala/io/delta/sharing/server/model.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 
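For example (illustrative commands, reusing the sample name from the sign-off text above):

git config user.name "Joe Smith"
git config user.email "joe.smith@example.com"
git commit -s -m "Fix the profile parser"     # -s appends the Signed-off-by line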
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.server.model 18 | 19 | import com.fasterxml.jackson.annotation.JsonInclude 20 | import org.codehaus.jackson.annotate.JsonRawValue 21 | 22 | case class SingleAction( 23 | file: AddFile = null, 24 | add: AddFileForCDF = null, 25 | cdf: AddCDCFile = null, 26 | remove: RemoveFile = null, 27 | metaData: Metadata = null, 28 | protocol: Protocol = null) { 29 | 30 | def unwrap: Action = { 31 | if (file != null) { 32 | file 33 | } else if (add != null) { 34 | add 35 | } else if (cdf != null) { 36 | cdf 37 | } else if (remove != null) { 38 | remove 39 | } else if (metaData != null) { 40 | metaData 41 | } else if (protocol != null) { 42 | protocol 43 | } else { 44 | null 45 | } 46 | } 47 | } 48 | 49 | case class Format(provider: String = "parquet") 50 | 51 | case class Metadata( 52 | id: String = null, 53 | name: String = null, 54 | description: String = null, 55 | format: Format = Format(), 56 | schemaString: String = null, 57 | configuration: Map[String, String] = Map.empty, 58 | partitionColumns: Seq[String] = Nil) extends Action { 59 | 60 | override def wrap: SingleAction = SingleAction(metaData = this) 61 | } 62 | 63 | sealed trait Action { 64 | /** Turn this object to the [[SingleAction]] wrap object. 
*/ 65 | def wrap: SingleAction 66 | } 67 | 68 | case class Protocol(minReaderVersion: Int) extends Action { 69 | override def wrap: SingleAction = SingleAction(protocol = this) 70 | } 71 | 72 | sealed abstract class AddFileBase( 73 | url: String, 74 | id: String, 75 | @JsonInclude(JsonInclude.Include.ALWAYS) 76 | partitionValues: Map[String, String], 77 | size: Long, 78 | @JsonRawValue 79 | stats: String = null) 80 | extends Action {} 81 | 82 | case class AddFile( 83 | url: String, 84 | id: String, 85 | @JsonInclude(JsonInclude.Include.ALWAYS) 86 | partitionValues: Map[String, String], 87 | size: Long, 88 | @JsonRawValue 89 | stats: String = null) extends AddFileBase(url, id, partitionValues, size, stats) { 90 | 91 | override def wrap: SingleAction = SingleAction(file = this) 92 | } 93 | 94 | case class AddFileForCDF( 95 | url: String, 96 | id: String, 97 | @JsonInclude(JsonInclude.Include.ALWAYS) 98 | partitionValues: Map[String, String], 99 | size: Long, 100 | version: Long, 101 | timestamp: Long, 102 | @JsonRawValue 103 | stats: String = null) 104 | extends AddFileBase(url, id, partitionValues, size, stats) { 105 | 106 | override def wrap: SingleAction = SingleAction(add = this) 107 | } 108 | 109 | case class AddCDCFile( 110 | url: String, 111 | id: String, 112 | @JsonInclude(JsonInclude.Include.ALWAYS) 113 | partitionValues: Map[String, String], 114 | size: Long, 115 | timestamp: Long, 116 | version: Long) 117 | extends Action { 118 | 119 | override def wrap: SingleAction = SingleAction(cdf = this) 120 | } 121 | 122 | case class RemoveFile( 123 | url: String, 124 | id: String, 125 | @JsonInclude(JsonInclude.Include.ALWAYS) 126 | partitionValues: Map[String, String], 127 | size: Long, 128 | timestamp: Long, 129 | version: Long) 130 | extends Action { 131 | 132 | override def wrap: SingleAction = SingleAction(remove = this) 133 | } 134 | -------------------------------------------------------------------------------- /spark/src/test/scala/io/delta/sharing/spark/TestDeltaSharingClient.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
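A small sketch (not part of the file above) showing the wrap/unwrap round trip shared by these actions:

val add = AddFile(
  url = "https://example.com/f1.parquet",   // placeholder pre-signed URL
  id = "f1",
  partitionValues = Map("date" -> "2021-01-01"),
  size = 100L)
val single = add.wrap                        // SingleAction(file = add)
assert(single.unwrap == add)                 // unwrap recovers the original action
assert(Protocol(minReaderVersion = 1).wrap.unwrap == Protocol(1))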
15 | */ 16 | 17 | package io.delta.sharing.spark 18 | 19 | import io.delta.sharing.spark.model.{ 20 | AddCDCFile, 21 | AddFile, 22 | AddFileForCDF, 23 | DeltaTableFiles, 24 | DeltaTableMetadata, 25 | Metadata, 26 | Protocol, 27 | RemoveFile, 28 | SingleAction, 29 | Table 30 | } 31 | import io.delta.sharing.spark.util.JsonUtils 32 | 33 | class TestDeltaSharingClient( 34 | profileProvider: DeltaSharingProfileProvider = null, 35 | timeoutInSeconds: Int = 120, 36 | numRetries: Int = 10, 37 | sslTrustAll: Boolean = false) extends DeltaSharingClient { 38 | 39 | private val metadataString = 40 | """{"metaData":{"id":"93351cf1-c931-4326-88f0-d10e29e71b21","format": 41 | |{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\", 42 | |\"fields\":[{\"name\":\"col1\",\"type\":\"integer\",\"nullable\":true, 43 | |\"metadata\":{}},{\"name\":\"col2\",\"type\":\"string\",\"nullable\":true, 44 | |\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1603723967515}}""" 45 | .stripMargin.replaceAll("\n", "") 46 | private val metadata = JsonUtils.fromJson[SingleAction](metadataString).metaData 47 | 48 | override def listAllTables(): Seq[Table] = Nil 49 | 50 | override def getMetadata(table: Table): DeltaTableMetadata = { 51 | DeltaTableMetadata(0, Protocol(0), metadata) 52 | } 53 | 54 | override def getTableVersion(table: Table): Long = 0 55 | 56 | override def getFiles( 57 | table: Table, 58 | predicates: Seq[String], 59 | limit: Option[Long], 60 | versionAsOf: Option[Long]): DeltaTableFiles = { 61 | limit.foreach(lim => TestDeltaSharingClient.limits = TestDeltaSharingClient.limits :+ lim) 62 | 63 | val addFiles: Seq[AddFile] = Seq( 64 | AddFile("f1.parquet", "f1", Map.empty, 0), 65 | AddFile("f2.parquet", "f2", Map.empty, 0), 66 | AddFile("f3.parquet", "f3", Map.empty, 0), 67 | AddFile("f4.parquet", "f4", Map.empty, 0) 68 | ).take(limit.getOrElse(4L).toInt) 69 | 70 | DeltaTableFiles(0, Protocol(0), metadata, addFiles) 71 | } 72 | 73 | override def getCDFFiles(table: Table, cdfOptions: Map[String, String]): DeltaTableFiles = { 74 | val addFiles: Seq[AddFileForCDF] = Seq( 75 | AddFileForCDF("cdf_add1.parquet", "cdf_add1", Map.empty, 100, 1, 1000) 76 | ) 77 | val cdcFiles: Seq[AddCDCFile] = Seq( 78 | // Return one cdc file from version 2, and two files with version 3. 79 | // This should result in two partition directories. 80 | AddCDCFile("cdf_cdc1.parquet", "cdf_cdc1", Map.empty, 200, 2, 2000), 81 | AddCDCFile("cdf_cdc2.parquet", "cdf_cdc2", Map.empty, 300, 3, 3000), 82 | AddCDCFile("cdf_cdc2.parquet", "cdf_cdc3", Map.empty, 310, 3, 3000) 83 | ) 84 | val removeFiles: Seq[RemoveFile] = Seq( 85 | // Return files with same version but different timestamps. 86 | // This should result in two partition directories. 87 | RemoveFile("cdf_rem1.parquet", "cdf_rem1", Map.empty, 400, 4, 4000), 88 | RemoveFile("cdf_rem2.parquet", "cdf_rem2", Map.empty, 420, 4, 4200) 89 | ) 90 | DeltaTableFiles(0, Protocol(0), metadata, Nil, addFiles, cdcFiles, removeFiles) 91 | } 92 | 93 | def clear(): Unit = { 94 | TestDeltaSharingClient.limits = Nil 95 | } 96 | } 97 | 98 | object TestDeltaSharingClient { 99 | var limits = Seq.empty[Long] 100 | } 101 | -------------------------------------------------------------------------------- /server/src/main/scala/io/delta/standalone/internal/DeltaSharingHistoryManager.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | // Putting these classes in this package to access Delta Standalone internal APIs 18 | package io.delta.standalone.internal 19 | 20 | import io.delta.standalone.internal.actions.CommitMarker 21 | import io.delta.standalone.internal.util.FileNames 22 | import io.delta.standalone.storage.LogStore 23 | import org.apache.hadoop.conf.Configuration 24 | import org.apache.hadoop.fs.Path 25 | import scala.collection.JavaConverters._ 26 | 27 | object DeltaSharingHistoryManager { 28 | /** 29 | * DeltaHistoryManager.getCommits is not a public method, so we need to make local copies here. 30 | * When calling getCommits, the initial few timestamp values may be wrong because they are not 31 | * properly monotonized. getCommitsSafe uses this to update the start value 32 | * far behind the first timestamp they care about to get correct values. 33 | * TODO(https://github.com/delta-io/delta-sharing/issues/144): Cleans this up once 34 | * DeltaHistoryManager.getCommits is public 35 | */ 36 | private val POTENTIALLY_UNMONOTONIZED_TIMESTAMPS = 100 37 | 38 | private[internal] def getCommitsSafe( 39 | logStore: LogStore, 40 | logPath: Path, 41 | start: Long, 42 | end: Long, 43 | conf: Configuration): Array[Commit] = { 44 | val monotonizationStart = 45 | Seq(start - POTENTIALLY_UNMONOTONIZED_TIMESTAMPS, 0).max 46 | getCommits(logStore, logPath, monotonizationStart, end, conf) 47 | } 48 | 49 | /** 50 | * Returns the commit version and timestamps of all commits in `[start, end)`. If `end` is not 51 | * specified, will return all commits that exist after `start`. Will guarantee that the commits 52 | * returned will have both monotonically increasing versions as well as timestamps. 53 | * Exposed for tests. 54 | */ 55 | private def getCommits( 56 | logStore: LogStore, 57 | logPath: Path, 58 | start: Long, 59 | end: Long, 60 | conf: Configuration): Array[Commit] = { 61 | val commits = logStore 62 | .listFrom(FileNames.deltaFile(logPath, start), conf) 63 | .asScala 64 | .filter(f => FileNames.isDeltaFile(f.getPath)) 65 | .map { fileStatus => 66 | Commit(FileNames.deltaVersion(fileStatus.getPath), fileStatus.getModificationTime) 67 | } 68 | .takeWhile(_.version < end) 69 | 70 | monotonizeCommitTimestamps(commits.toArray) 71 | } 72 | 73 | /** 74 | * Makes sure that the commit timestamps are monotonically increasing with respect to commit 75 | * versions. Requires the input commits to be sorted by the commit version. 
76 | */ 77 | private def monotonizeCommitTimestamps[T <: CommitMarker]( 78 | commits: Array[T]): Array[T] = { 79 | var i = 0 80 | val length = commits.length 81 | while (i < length - 1) { 82 | val prevTimestamp = commits(i).getTimestamp 83 | assert(commits(i).getVersion < commits(i + 1).getVersion, "Unordered commits provided.") 84 | if (prevTimestamp >= commits(i + 1).getTimestamp) { 85 | commits(i + 1) = commits(i + 1).withTimestamp(prevTimestamp + 1).asInstanceOf[T] 86 | } 87 | i += 1 88 | } 89 | commits 90 | } 91 | 92 | /** A helper class to represent the timestamp and version of a commit. */ 93 | case class Commit(version: Long, timestamp: Long) extends CommitMarker { 94 | override def withTimestamp(timestamp: Long): Commit = this.copy(timestamp = timestamp) 95 | 96 | override def getTimestamp: Long = timestamp 97 | 98 | override def getVersion: Long = version 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /spark/src/test/scala/io/delta/sharing/spark/DeltaSharingFileProfileProviderSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
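To make the guarantee above concrete, an illustrative before/after (the helper is private, so this is a description rather than runnable code) for commits whose file modification times went backwards:

// input  : Commit(0, 1000), Commit(1, 900),  Commit(2, 1500)
// output : Commit(0, 1000), Commit(1, 1001), Commit(2, 1500)
// Commit 1 is bumped to its predecessor's timestamp + 1, so both versions and
// timestamps end up strictly increasing.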
15 | */ 16 | 17 | package io.delta.sharing.spark 18 | 19 | import java.nio.charset.StandardCharsets.UTF_8 20 | import java.nio.file.Files 21 | 22 | import org.apache.commons.io.FileUtils 23 | import org.apache.hadoop.conf.Configuration 24 | import org.apache.spark.SparkFunSuite 25 | 26 | class DeltaSharingFileProfileProviderSuite extends SparkFunSuite { 27 | 28 | private def testProfile(profile: String, expected: DeltaSharingProfile): Unit = { 29 | val temp = Files.createTempFile("test", ".share").toFile 30 | try { 31 | FileUtils.writeStringToFile(temp, profile, UTF_8) 32 | assert(new DeltaSharingFileProfileProvider(new Configuration, temp.getCanonicalPath) 33 | .getProfile == expected) 34 | } finally { 35 | temp.delete() 36 | } 37 | } 38 | 39 | test("parse") { 40 | testProfile( 41 | """{ 42 | | "shareCredentialsVersion": 1, 43 | | "endpoint": "foo", 44 | | "bearerToken": "bar", 45 | | "expirationTime": "2021-11-12T00:12:29.0Z" 46 | |} 47 | |""".stripMargin, 48 | DeltaSharingProfile( 49 | shareCredentialsVersion = Some(1), 50 | endpoint = "foo", 51 | bearerToken = "bar", 52 | expirationTime = "2021-11-12T00:12:29.0Z" 53 | ) 54 | ) 55 | } 56 | 57 | test("expirationTime is optional") { 58 | testProfile( 59 | """{ 60 | | "shareCredentialsVersion": 1, 61 | | "endpoint": "foo", 62 | | "bearerToken": "bar" 63 | |} 64 | |""".stripMargin, 65 | DeltaSharingProfile( 66 | shareCredentialsVersion = Some(1), 67 | endpoint = "foo", 68 | bearerToken = "bar" 69 | ) 70 | ) 71 | } 72 | 73 | test("version is missing") { 74 | val e = intercept[IllegalArgumentException] { 75 | testProfile( 76 | """{ 77 | | "endpoint": "foo", 78 | | "bearerToken": "bar" 79 | |} 80 | |""".stripMargin, 81 | null 82 | ) 83 | } 84 | assert(e.getMessage.contains( 85 | "Cannot find the 'shareCredentialsVersion' field in the profile file")) 86 | } 87 | 88 | test("shareCredentialsVersion is not supported") { 89 | val e = intercept[IllegalArgumentException] { 90 | testProfile( 91 | """{ 92 | | "shareCredentialsVersion": 100 93 | |} 94 | |""".stripMargin, 95 | null 96 | ) 97 | } 98 | assert(e.getMessage.contains( 99 | "'shareCredentialsVersion' in the profile is 100 which is too new.")) 100 | } 101 | 102 | test("endpoint is missing") { 103 | val e = intercept[IllegalArgumentException] { 104 | testProfile( 105 | """{ 106 | | "shareCredentialsVersion": 1, 107 | | "bearerToken": "bar" 108 | |} 109 | |""".stripMargin, 110 | null 111 | ) 112 | } 113 | assert(e.getMessage.contains("Cannot find the 'endpoint' field in the profile file")) 114 | } 115 | 116 | test("bearerToken is missing") { 117 | val e = intercept[IllegalArgumentException] { 118 | testProfile( 119 | """{ 120 | | "shareCredentialsVersion": 1, 121 | | "endpoint": "foo" 122 | |} 123 | |""".stripMargin, 124 | null 125 | ) 126 | } 127 | assert(e.getMessage.contains("Cannot find the 'bearerToken' field in the profile file")) 128 | } 129 | 130 | test("unknown field should be ignored") { 131 | testProfile( 132 | """{ 133 | | "shareCredentialsVersion": 1, 134 | | "endpoint": "foo", 135 | | "bearerToken": "bar", 136 | | "expirationTime": "2021-11-12T00:12:29.0Z", 137 | | "futureField": "xyz" 138 | |} 139 | |""".stripMargin, 140 | DeltaSharingProfile( 141 | shareCredentialsVersion = Some(1), 142 | endpoint = "foo", 143 | bearerToken = "bar", 144 | expirationTime = "2021-11-12T00:12:29.0Z" 145 | ) 146 | ) 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /spark/src/test/scala/io/delta/sharing/spark/DeltaSharingIntegrationTest.scala: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.spark 18 | 19 | import java.io.File 20 | import java.nio.charset.StandardCharsets.UTF_8 21 | import java.nio.file.Files 22 | import java.util.concurrent.{CountDownLatch, TimeUnit} 23 | 24 | import scala.sys.process._ 25 | import scala.util.Try 26 | 27 | import org.apache.commons.io.FileUtils 28 | import org.apache.hadoop.conf.Configuration 29 | import org.apache.spark.SparkFunSuite 30 | import org.scalatest.BeforeAndAfterAll 31 | 32 | trait DeltaSharingIntegrationTest extends SparkFunSuite with BeforeAndAfterAll { 33 | 34 | def shouldRunIntegrationTest: Boolean = { 35 | sys.env.get("AWS_ACCESS_KEY_ID").exists(_.length > 0) && 36 | sys.env.get("AZURE_TEST_ACCOUNT_KEY").exists(_.length > 0) && 37 | sys.env.get("GOOGLE_APPLICATION_CREDENTIALS").exists(_.length > 0) 38 | } 39 | 40 | @volatile private var process: Process = _ 41 | @volatile private var pidFile: File = _ 42 | var testProfileFile: File = _ 43 | 44 | val TEST_PORT = 12345 45 | 46 | override def beforeAll(): Unit = { 47 | super.beforeAll() 48 | if (shouldRunIntegrationTest) { 49 | pidFile = Files.createTempFile("delta-sharing-server", ".pid").toFile 50 | testProfileFile = Files.createTempFile("delta-test", ".share").toFile 51 | FileUtils.writeStringToFile(testProfileFile, 52 | s"""{ 53 | | "shareCredentialsVersion": 1, 54 | | "endpoint": "https://localhost:$TEST_PORT/delta-sharing", 55 | | "bearerToken": "dapi5e3574ec767ca1548ae5bbed1a2dc04d" 56 | |}""".stripMargin, UTF_8) 57 | 58 | val startLatch = new CountDownLatch(1) 59 | new Thread("Run TestDeltaSharingServer") { 60 | setDaemon(true) 61 | 62 | override def run(): Unit = { 63 | val processLogger = ProcessLogger { stdout => 64 | // scalastyle:off println 65 | println(stdout) 66 | // scalastyle:on println 67 | if (stdout.contains(s"https://127.0.0.1:$TEST_PORT/")) { 68 | startLatch.countDown() 69 | } 70 | } 71 | process = 72 | Seq( 73 | "/bin/bash", 74 | "-c", 75 | s"cd .. 
&& build/sbt 'server / Test / runMain " + 76 | s"io.delta.sharing.server.TestDeltaSharingServer ${pidFile.getCanonicalPath}'") 77 | .run(processLogger) 78 | process.exitValue() 79 | process = null 80 | startLatch.countDown() 81 | } 82 | }.start() 83 | try { 84 | assert(startLatch.await(120, TimeUnit.SECONDS), "the server didn't start in 120 seconds") 85 | if (process == null) { 86 | fail("the process exited with an error") 87 | } 88 | } catch { 89 | case e: Throwable => 90 | if (process != null) { 91 | process.destroy() 92 | process = null 93 | } 94 | throw e 95 | } 96 | } 97 | } 98 | 99 | override def afterAll(): Unit = { 100 | if (shouldRunIntegrationTest) { 101 | try { 102 | if (process != null) { 103 | process.destroy() 104 | process = null 105 | } 106 | if (pidFile != null) { 107 | val pid = FileUtils.readFileToString(pidFile) 108 | Try(pid.toLong).foreach { pid => 109 | // scalastyle:off println 110 | println(s"Killing $pid") 111 | // scalastyle:on println 112 | s"kill -9 $pid".! 113 | } 114 | pidFile.delete() 115 | } 116 | if (testProfileFile != null) { 117 | testProfileFile.delete() 118 | } 119 | } finally { 120 | super.afterAll() 121 | } 122 | } 123 | } 124 | 125 | def testProfileProvider: DeltaSharingProfileProvider = { 126 | new DeltaSharingFileProfileProvider(new Configuration, testProfileFile.getCanonicalPath) 127 | } 128 | 129 | def integrationTest(testName: String)(func: => Unit): Unit = { 130 | test(testName) { 131 | assume(shouldRunIntegrationTest) 132 | func 133 | } 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /spark/src/main/scala/io/delta/sharing/spark/DeltaSharingDataSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.spark 18 | 19 | import java.util.Collections 20 | 21 | import scala.collection.JavaConverters._ 22 | import scala.collection.mutable 23 | 24 | import org.apache.spark.SparkEnv 25 | import org.apache.spark.delta.sharing.PreSignedUrlCache 26 | import org.apache.spark.sql.{SparkSession, SQLContext} 27 | import org.apache.spark.sql.connector.catalog.{Table, TableCapability, TableProvider} 28 | import org.apache.spark.sql.connector.expressions.Transform 29 | import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister, RelationProvider} 30 | import org.apache.spark.sql.types.StructType 31 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 32 | 33 | /** A DataSource V1 for integrating Delta into Spark SQL batch APIs. 
*/ 34 | private[sharing] class DeltaSharingDataSource extends RelationProvider with DataSourceRegister { 35 | 36 | override def createRelation( 37 | sqlContext: SQLContext, 38 | parameters: Map[String, String]): BaseRelation = { 39 | DeltaSharingDataSource.setupFileSystem(sqlContext) 40 | val path = parameters.getOrElse("path", throw new IllegalArgumentException( 41 | "'path' is not specified. If you use SQL to create a Delta Sharing table, " + 42 | "LOCATION must be specified")) 43 | 44 | var cdfOptions: mutable.Map[String, String] = mutable.Map.empty 45 | val caseInsensitiveParams = new CaseInsensitiveStringMap(parameters.asJava) 46 | if (DeltaSharingDataSource.isCDFRead(caseInsensitiveParams)) { 47 | cdfOptions = mutable.Map[String, String](DeltaSharingDataSource.CDF_ENABLED_KEY -> "true") 48 | if (caseInsensitiveParams.containsKey(DeltaSharingDataSource.CDF_START_VERSION_KEY)) { 49 | cdfOptions(DeltaSharingDataSource.CDF_START_VERSION_KEY) = caseInsensitiveParams.get( 50 | DeltaSharingDataSource.CDF_START_VERSION_KEY) 51 | } 52 | if (caseInsensitiveParams.containsKey(DeltaSharingDataSource.CDF_START_TIMESTAMP_KEY)) { 53 | cdfOptions(DeltaSharingDataSource.CDF_START_TIMESTAMP_KEY) = caseInsensitiveParams.get( 54 | DeltaSharingDataSource.CDF_START_TIMESTAMP_KEY) 55 | } 56 | if (caseInsensitiveParams.containsKey(DeltaSharingDataSource.CDF_END_VERSION_KEY)) { 57 | cdfOptions(DeltaSharingDataSource.CDF_END_VERSION_KEY) = caseInsensitiveParams.get( 58 | DeltaSharingDataSource.CDF_END_VERSION_KEY) 59 | } 60 | if (caseInsensitiveParams.containsKey(DeltaSharingDataSource.CDF_END_TIMESTAMP_KEY)) { 61 | cdfOptions(DeltaSharingDataSource.CDF_END_TIMESTAMP_KEY) = caseInsensitiveParams.get( 62 | DeltaSharingDataSource.CDF_END_TIMESTAMP_KEY) 63 | } 64 | } 65 | 66 | var versionAsOf: Option[Long] = None 67 | if (parameters.get("versionAsOf").isDefined) { 68 | try { 69 | versionAsOf = Some(parameters.get("versionAsOf").get.toLong) 70 | } catch { 71 | case _: NumberFormatException => 72 | throw new IllegalArgumentException("versionAsOf is not a valid number.") 73 | } 74 | } 75 | val deltaLog = RemoteDeltaLog(path) 76 | deltaLog.createRelation(versionAsOf, cdfOptions = cdfOptions.toMap) 77 | } 78 | 79 | override def shortName: String = "deltaSharing" 80 | } 81 | 82 | 83 | private[sharing] object DeltaSharingDataSource { 84 | 85 | def setupFileSystem(sqlContext: SQLContext): Unit = { 86 | // We have put our class name in the `org.apache.hadoop.fs.FileSystem` resource file. However, 87 | // this file will be loaded only if the class `FileSystem` is loaded. Hence, it won't work when 88 | // we add the library after starting Spark. Therefore we change the global `hadoopConfiguration` 89 | // to make sure we set up `DeltaSharingFileSystem` correctly. 90 | sqlContext.sparkContext.hadoopConfiguration 91 | .setIfUnset("fs.delta-sharing.impl", "io.delta.sharing.spark.DeltaSharingFileSystem") 92 | PreSignedUrlCache.registerIfNeeded(SparkEnv.get) 93 | } 94 | 95 | // Based on the read options passed it indicates whether the read was a cdf read or not. 
96 | def isCDFRead(options: CaseInsensitiveStringMap): Boolean = { 97 | options.containsKey(DeltaSharingDataSource.CDF_ENABLED_KEY) && 98 | options.get(DeltaSharingDataSource.CDF_ENABLED_KEY) == "true" 99 | } 100 | 101 | // Constants for cdf parameters 102 | final val CDF_ENABLED_KEY = "readChangeFeed" 103 | 104 | final val CDF_START_VERSION_KEY = "startingVersion" 105 | 106 | final val CDF_START_TIMESTAMP_KEY = "startingTimestamp" 107 | 108 | final val CDF_END_VERSION_KEY = "endingVersion" 109 | 110 | final val CDF_END_TIMESTAMP_KEY = "endingTimestamp" 111 | } 112 | -------------------------------------------------------------------------------- /python/delta_sharing/converter.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2021 The Delta Lake Project Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | from decimal import Decimal 17 | from typing import Any, Callable, Dict 18 | 19 | import numpy as np 20 | import pandas as pd 21 | 22 | 23 | def _get_dummy_column(schema_type): 24 | """ 25 | Return a dummy column with the data type specified in schema_type. 26 | The dummy column is used to populate the dtype fields in empty tables. 27 | :param schema_type: str or json representing a data type 28 | :return: dummy pandas Series to be inserted into an empty table 29 | """ 30 | if schema_type == "boolean": 31 | return pd.Series([False]) 32 | elif schema_type == "byte": 33 | return pd.Series([0], dtype="int8") 34 | elif schema_type == "short": 35 | return pd.Series([0], dtype="int16") 36 | elif schema_type == "integer": 37 | return pd.Series([0], dtype="int32") 38 | elif schema_type == "long": 39 | return pd.Series([0], dtype="int64") 40 | elif schema_type == "float": 41 | return pd.Series([0], dtype="float32") 42 | elif schema_type == "double": 43 | return pd.Series([0], dtype="float64") 44 | elif isinstance(schema_type, str) and schema_type.startswith("decimal"): 45 | return pd.Series([0], dtype=np.dtype("O")) 46 | elif schema_type == "string": 47 | return pd.Series([0], dtype=np.dtype("O")) 48 | elif schema_type == "date": 49 | return pd.Series([pd.Timestamp(0).date()]) 50 | elif schema_type == "timestamp": 51 | return pd.Series([pd.Timestamp(0)], dtype=np.dtype("datetime64[ns]")) 52 | elif schema_type == "binary": 53 | return pd.Series([0], dtype=np.dtype("O")) 54 | elif isinstance(schema_type, dict) and schema_type["type"] in ("array", "struct", "map"): 55 | return pd.Series([0], dtype=np.dtype("O")) 56 | 57 | raise ValueError(f"Could not parse datatype: {schema_type}") 58 | 59 | 60 | def get_empty_table(schema_json: dict) -> pd.DataFrame: 61 | """ 62 | For empty tables, we use dummy columns from `_get_dummy_column` and then 63 | drop all rows to generate a table with the correct column names and 64 | data types. 
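    Illustrative example (the field name "id" is made up for this sketch):
        get_empty_table({"type": "struct", "fields": [
            {"name": "id", "type": "long", "nullable": True, "metadata": {}}]})
        returns an empty DataFrame with a single int64 column named "id".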
65 | :param schema_json: json object representing the table schema 66 | :return: empty table with columns specified in schema_json 67 | """ 68 | assert schema_json["type"] == "struct" 69 | 70 | dummy_table = pd.DataFrame( 71 | {field["name"]: _get_dummy_column(field["type"]) for field in schema_json["fields"]} 72 | ) 73 | return dummy_table.iloc[0:0] 74 | 75 | 76 | def to_converters(schema_json: dict) -> Dict[str, Callable[[str], Any]]: 77 | assert schema_json["type"] == "struct" 78 | 79 | return {field["name"]: to_converter(field["type"]) for field in schema_json["fields"]} 80 | 81 | 82 | def to_converter(schema_type) -> Callable[[str], Any]: 83 | """ 84 | For types that support partitioning, a lambda to parse data into the 85 | corresponding type is returned. For data types that cannot be partitioned 86 | on, we return None. The caller is expected to check if the value is None before using. 87 | :param schema_type: str or json representing a data type 88 | :return: converter function or None 89 | """ 90 | if schema_type == "boolean": 91 | return lambda x: None if (x is None or x == "") else (x is True or x == "true") 92 | elif schema_type == "byte": 93 | return lambda x: np.nan if (x is None or x == "") else np.int8(x) 94 | elif schema_type == "short": 95 | return lambda x: np.nan if (x is None or x == "") else np.int16(x) 96 | elif schema_type == "integer": 97 | return lambda x: np.nan if (x is None or x == "") else np.int32(x) 98 | elif schema_type == "long": 99 | return lambda x: np.nan if (x is None or x == "") else np.int64(x) 100 | elif schema_type == "float": 101 | return lambda x: np.nan if (x is None or x == "") else np.float32(x) 102 | elif schema_type == "double": 103 | return lambda x: np.nan if (x is None or x == "") else np.float64(x) 104 | elif isinstance(schema_type, str) and schema_type.startswith("decimal"): 105 | return lambda x: None if (x is None or x == "") else Decimal(x) 106 | elif schema_type == "string": 107 | return lambda x: None if (x is None or x == "") else str(x) 108 | elif schema_type == "date": 109 | return lambda x: None if (x is None or x == "") else pd.Timestamp(x).date() 110 | elif schema_type == "timestamp": 111 | return lambda x: pd.NaT if (x is None or x == "") else pd.Timestamp(x) 112 | elif schema_type == "binary": 113 | return None # partition on binary column not supported 114 | elif isinstance(schema_type, dict) and schema_type["type"] in ("array", "struct", "map"): 115 | return None # partition on complex column not supported 116 | 117 | raise ValueError(f"Could not parse datatype: {schema_type}") 118 | -------------------------------------------------------------------------------- /server/src/main/scala/io/delta/standalone/internal/PartitionFilterUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package io.delta.standalone.internal 18 | 19 | import scala.util.Try 20 | import scala.util.control.NonFatal 21 | 22 | import io.delta.standalone.internal.actions.AddFile 23 | import org.apache.spark.sql.Encoders 24 | import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} 25 | import org.apache.spark.sql.catalyst.analysis.{caseInsensitiveResolution, UnresolvedAttribute} 26 | import org.apache.spark.sql.catalyst.expressions._ 27 | import org.apache.spark.sql.execution.SparkSqlParser 28 | import org.apache.spark.sql.internal.SQLConf 29 | import org.apache.spark.sql.types.{DataType, StructField, StructType} 30 | import org.slf4j.LoggerFactory 31 | 32 | object PartitionFilterUtils { 33 | private val logger = LoggerFactory.getLogger(this.getClass) 34 | 35 | private lazy val sqlParser = new SparkSqlParser(new SQLConf) 36 | 37 | def evaluatePredicate( 38 | schemaString: String, 39 | partitionColumns: Seq[String], 40 | partitionFilters: Seq[String], 41 | addFiles: Seq[AddFile]): Seq[AddFile] = { 42 | try { 43 | val tableSchema = DataType.fromJson(schemaString).asInstanceOf[StructType] 44 | val partitionSchema = new StructType(partitionColumns.map(c => tableSchema(c)).toArray) 45 | val addSchema = Encoders.product[AddFile].schema 46 | val attrs = 47 | addSchema.map(f => AttributeReference(f.name, f.dataType, f.nullable, f.metadata)()) 48 | val exprs = 49 | rewritePartitionFilters( 50 | partitionSchema, 51 | attrs, 52 | partitionFilters.flatMap { f => 53 | Try(sqlParser.parseExpression(f)).toOption 54 | }.filter(f => isSupportedExpression(f, partitionSchema)) 55 | ) 56 | if (exprs.isEmpty) { 57 | addFiles 58 | } else { 59 | val predicate = InterpretedPredicate.create(exprs.reduce(And), attrs) 60 | predicate.initialize(0) 61 | addFiles.filter { addFile => 62 | val converter = CatalystTypeConverters.createToCatalystConverter(addSchema) 63 | predicate.eval(converter(addFile).asInstanceOf[InternalRow]) 64 | } 65 | } 66 | } catch { 67 | case NonFatal(e) => 68 | logger.error(e.getMessage, e) 69 | // Fail to evaluate the filters. Return all files as a fallback. 
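        // (For instance, if DataType.fromJson cannot parse schemaString or predicate
        // evaluation throws, the error is logged above and the unfiltered file list below
        // is returned, so a bad filter never fails the request.)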
70 | addFiles 71 | } 72 | } 73 | 74 | private def isSupportedExpression(e: Expression, partitionSchema: StructType): Boolean = { 75 | def isPartitionColumOrConstant(e: Expression): Boolean = { 76 | e match { 77 | case _: Literal => true 78 | case u: UnresolvedAttribute if u.nameParts.size == 1 => 79 | val unquoted = u.name.stripPrefix("`").stripSuffix("`") 80 | partitionSchema.exists(part => caseInsensitiveResolution(unquoted, part.name)) 81 | case c: Cast => isPartitionColumOrConstant(c.child) 82 | case _ => false 83 | } 84 | } 85 | 86 | e match { 87 | case EqualTo(left, right) 88 | if isPartitionColumOrConstant(left) && isPartitionColumOrConstant(right) => 89 | true 90 | case GreaterThan(left, right) 91 | if isPartitionColumOrConstant(left) && isPartitionColumOrConstant(right) => 92 | true 93 | case LessThan(left, right) 94 | if isPartitionColumOrConstant(left) && isPartitionColumOrConstant(right) => 95 | true 96 | case GreaterThanOrEqual(left, right) 97 | if isPartitionColumOrConstant(left) && isPartitionColumOrConstant(right) => 98 | true 99 | case LessThanOrEqual(left, right) 100 | if isPartitionColumOrConstant(left) && isPartitionColumOrConstant(right) => 101 | true 102 | case EqualNullSafe(left, right) 103 | if isPartitionColumOrConstant(left) && isPartitionColumOrConstant(right) => 104 | true 105 | case IsNull(e) if isPartitionColumOrConstant(e) => 106 | true 107 | case IsNotNull(e) if isPartitionColumOrConstant(e) => 108 | true 109 | case Not(e) if isSupportedExpression(e, partitionSchema) => 110 | true 111 | case _ => false 112 | } 113 | } 114 | 115 | private def rewritePartitionFilters( 116 | partitionSchema: StructType, 117 | attrs: Seq[Attribute], 118 | partitionFilters: Seq[Expression]): Seq[Expression] = { 119 | val partitionValuesAttr = attrs.find(_.name == "partitionValues").head 120 | partitionFilters.map(_.transformUp { 121 | case a: Attribute => 122 | // If we have a special column name, e.g. `a.a`, then an UnresolvedAttribute returns 123 | // the column name as '`a.a`' instead of 'a.a', therefore we need to strip the backticks. 124 | val unquoted = a.name.stripPrefix("`").stripSuffix("`") 125 | val partitionCol = partitionSchema.find { field => field.name == unquoted } 126 | partitionCol match { 127 | case Some(StructField(name, dataType, _, _)) => 128 | Cast( 129 | ExtractValue( 130 | partitionValuesAttr, 131 | Literal(name), 132 | org.apache.spark.sql.catalyst.analysis.caseInsensitiveResolution), 133 | dataType) 134 | case None => 135 | // This should not be able to happen, but the case was present in the original code so 136 | // we kept it to be safe. 137 | UnresolvedAttribute(Seq("partitionValues", a.name)) 138 | } 139 | }) 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /server/src/test/scala/io/delta/sharing/server/config/ServerConfigSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.server.config 18 | 19 | import java.io.File 20 | import java.nio.charset.StandardCharsets.UTF_8 21 | import java.nio.file.Files 22 | import java.util.Arrays 23 | 24 | import org.apache.commons.io.FileUtils 25 | import org.scalatest.FunSuite 26 | 27 | class ServerConfigSuite extends FunSuite { 28 | 29 | def testConfig(content: String, serverConfig: ServerConfig): Unit = { 30 | val tempFile = Files.createTempFile("delta-sharing-server", ".yaml").toFile 31 | try { 32 | FileUtils.writeStringToFile(tempFile, content, UTF_8) 33 | val loaded = ServerConfig.load(tempFile.getCanonicalPath) 34 | assert(serverConfig == loaded) 35 | } finally { 36 | tempFile.delete() 37 | } 38 | } 39 | 40 | test("empty config") { 41 | val serverConfig = new ServerConfig() 42 | serverConfig.setVersion(1) 43 | testConfig("version: 1", serverConfig) 44 | } 45 | 46 | test("template") { 47 | val tempFile = Files.createTempFile("delta-sharing-server", ".yaml").toFile 48 | try { 49 | FileUtils.copyFile( 50 | new File("src/universal/conf/delta-sharing-server.yaml.template"), 51 | tempFile) 52 | val loaded = ServerConfig.load(tempFile.getCanonicalPath) 53 | val sharesInTemplate = Arrays.asList( 54 | ShareConfig("share1", Arrays.asList( 55 | SchemaConfig("schema1", Arrays.asList( 56 | TableConfig("table1", "s3a:///"), 57 | TableConfig( 58 | "table2", 59 | "wasbs://@") 60 | )) 61 | )), 62 | ShareConfig("share2", Arrays.asList( 63 | SchemaConfig("schema2", Arrays.asList( 64 | TableConfig( 65 | "table3", 66 | "abfss://@", 67 | true) 68 | )) 69 | )), 70 | ShareConfig("share3", Arrays.asList( 71 | SchemaConfig("schema3", Arrays.asList( 72 | TableConfig( 73 | "table4", 74 | "gs:///") 75 | )) 76 | )) 77 | ) 78 | val serverConfig = new ServerConfig() 79 | serverConfig.setVersion(1) 80 | serverConfig.setShares(sharesInTemplate) 81 | serverConfig.setPort(8080) 82 | assert(loaded == serverConfig) 83 | } finally { 84 | tempFile.delete() 85 | } 86 | } 87 | 88 | test("accept unknown fields") { 89 | val serverConfig = new ServerConfig() 90 | serverConfig.setVersion(1) 91 | testConfig( 92 | """version: 1 93 | |unknown: "test" 94 | |""".stripMargin, serverConfig) 95 | } 96 | 97 | test("authorization token") { 98 | val serverConfig = new ServerConfig() 99 | serverConfig.setVersion(1) 100 | serverConfig.setAuthorization(Authorization("")) 101 | testConfig( 102 | """version: 1 103 | |authorization: 104 | | bearerToken: 105 | |""".stripMargin, serverConfig) 106 | } 107 | 108 | private def assertInvalidConfig(expectedErrorMessage: String)(func: => Unit): Unit = { 109 | assert(intercept[IllegalArgumentException] { 110 | func 111 | }.getMessage.contains(expectedErrorMessage)) 112 | } 113 | 114 | test("invalid version") { 115 | assertInvalidConfig("'version' must be greater than 0") { 116 | testConfig("version: 0", null) 117 | } 118 | } 119 | 120 | test("future version") { 121 | assertInvalidConfig("The 'version' in the server config is 100 which is too new.") { 122 | testConfig("version: 100", null) 123 | } 124 | } 125 | 126 | test("invalid ssl") { 127 | assertInvalidConfig("'certificateFile' in a SSL config must be provided") { 128 | testConfig( 129 | """version: 1 130 | |ssl: 131 | | selfSigned: false 132 | |""".stripMargin, null) 133 | } 134 | } 135 | 136 | test("Authorization") { 137 | assertInvalidConfig("'bearerToken' in 'authorization' must be provided") { 138 | new 
Authorization().checkConfig() 139 | } 140 | } 141 | 142 | test("SSLConfig") { 143 | assertInvalidConfig("'certificateFile' in a SSL config must be provided") { 144 | val s = new SSLConfig() 145 | assert(s.selfSigned == false) 146 | s.checkConfig() 147 | } 148 | assertInvalidConfig("'certificateKeyFile' in a SSL config must be provided") { 149 | val s = new SSLConfig() 150 | s.setCertificateFile("file") 151 | s.checkConfig() 152 | } 153 | val s = new SSLConfig() 154 | s.setSelfSigned(true) 155 | s.checkConfig() 156 | } 157 | 158 | test("ShareConfig") { 159 | assertInvalidConfig("'name' in a share must be provided") { 160 | new ShareConfig().checkConfig() 161 | } 162 | assertInvalidConfig("'name' in a schema must be provided") { 163 | val s = new ShareConfig() 164 | s.setName("name") 165 | s.setSchemas(Arrays.asList(new SchemaConfig())) 166 | s.checkConfig() 167 | } 168 | } 169 | 170 | test("SchemaConfig") { 171 | assertInvalidConfig("'name' in a schema must be provided") { 172 | new SchemaConfig().checkConfig() 173 | } 174 | assertInvalidConfig("'name' in a table must be provided") { 175 | val s = new SchemaConfig() 176 | s.setName("name") 177 | s.setTables(Arrays.asList(new TableConfig())) 178 | s.checkConfig() 179 | } 180 | } 181 | 182 | test("TableConfig") { 183 | assertInvalidConfig("'name' in a table must be provided") { 184 | new TableConfig().checkConfig() 185 | } 186 | assertInvalidConfig("'name' in a table must be provided") { 187 | val t = new TableConfig() 188 | t.setLocation("Location") 189 | t.checkConfig() 190 | } 191 | assertInvalidConfig("'location' in a table must be provided") { 192 | val t = new TableConfig() 193 | t.setName("name") 194 | t.checkConfig() 195 | } 196 | } 197 | } 198 | -------------------------------------------------------------------------------- /server/src/test/scala/io/delta/sharing/server/TestResource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package io.delta.sharing.server 18 | 19 | import java.io.File 20 | import java.nio.charset.StandardCharsets.UTF_8 21 | import java.nio.file.Files 22 | 23 | import org.apache.commons.io.FileUtils 24 | 25 | import io.delta.sharing.server.config._ 26 | 27 | object TestResource { 28 | def env(key: String): String = { 29 | sys.env.getOrElse(key, throw new IllegalArgumentException(s"Cannot find $key in sys env")) 30 | } 31 | 32 | object AWS { 33 | val bucket = "delta-exchange-test" 34 | } 35 | 36 | object Azure { 37 | val accountName = "deltasharingtest" 38 | val container = "delta-sharing-test-container" 39 | } 40 | 41 | object GCP { 42 | val bucket = "delta-sharing-dev" 43 | } 44 | 45 | val TEST_PORT = 12345 46 | 47 | val testAuthorizationToken = "dapi5e3574ec767ca1548ae5bbed1a2dc04d" 48 | 49 | def maybeSetupGoogleServiceAccountCredentials: Unit = { 50 | // Only setup Google Service Account credentials when it is provided through env variable. 51 | if (sys.env.get("GOOGLE_SERVICE_ACCOUNT_KEY").exists(_.length > 0) 52 | && sys.env.get("GOOGLE_APPLICATION_CREDENTIALS").exists(_.length > 0)) { 53 | val serviceAccountKey = sys.env("GOOGLE_SERVICE_ACCOUNT_KEY") 54 | val credFilePath = new File(sys.env("GOOGLE_APPLICATION_CREDENTIALS")) 55 | credFilePath.deleteOnExit() 56 | FileUtils.writeStringToFile(credFilePath, serviceAccountKey, UTF_8, false) 57 | } 58 | } 59 | 60 | def setupTestTables(): File = { 61 | val testConfigFile = Files.createTempFile("delta-sharing", ".yaml").toFile 62 | testConfigFile.deleteOnExit() 63 | maybeSetupGoogleServiceAccountCredentials 64 | val shares = java.util.Arrays.asList( 65 | ShareConfig("share1", 66 | java.util.Arrays.asList( 67 | SchemaConfig( 68 | "default", 69 | java.util.Arrays.asList( 70 | TableConfig("table1", s"s3a://${AWS.bucket}/delta-exchange-test/table1"), 71 | TableConfig("table3", s"s3a://${AWS.bucket}/delta-exchange-test/table3"), 72 | TableConfig("table7", s"s3a://${AWS.bucket}/delta-exchange-test/table7"), 73 | TableConfig( 74 | "cdf_table_cdf_enabled", 75 | s"s3a://${AWS.bucket}/delta-exchange-test/cdf_table_cdf_enabled", 76 | true 77 | ), 78 | TableConfig( 79 | "cdf_table_with_partition", 80 | s"s3a://${AWS.bucket}/delta-exchange-test/cdf_table_with_partition", 81 | true, 82 | 1 83 | ), 84 | TableConfig( 85 | "cdf_table_with_vacuum", 86 | s"s3a://${AWS.bucket}/delta-exchange-test/cdf_table_with_vacuum", 87 | true 88 | ), 89 | TableConfig( 90 | "cdf_table_missing_log", 91 | s"s3a://${AWS.bucket}/delta-exchange-test/cdf_table_missing_log", 92 | true 93 | ) 94 | ) 95 | ) 96 | ) 97 | ), 98 | ShareConfig("share2", 99 | java.util.Arrays.asList( 100 | SchemaConfig("default", java.util.Arrays.asList( 101 | TableConfig("table2", s"s3a://${AWS.bucket}/delta-exchange-test/table2") 102 | ) 103 | ) 104 | )), 105 | ShareConfig("share3", 106 | java.util.Arrays.asList( 107 | SchemaConfig( 108 | "default", 109 | java.util.Arrays.asList( 110 | TableConfig("table4", s"s3a://${AWS.bucket}/delta-exchange-test/table4"), 111 | TableConfig("table5", s"s3a://${AWS.bucket}/delta-exchange-test/table5") 112 | ) 113 | ) 114 | ) 115 | ), 116 | ShareConfig("share4", 117 | java.util.Arrays.asList( 118 | SchemaConfig( 119 | "default", 120 | java.util.Arrays.asList( 121 | // table made with spark.sql.parquet.compression.codec=gzip 122 | TableConfig("test_gzip", s"s3a://${AWS.bucket}/compress-test/table1") 123 | ) 124 | ) 125 | ) 126 | ), 127 | ShareConfig("share5", 128 | java.util.Arrays.asList( 129 | SchemaConfig( 130 | "default", // empty schema 131 | 
java.util.Arrays.asList() 132 | ) 133 | ) 134 | ), 135 | ShareConfig("share6", 136 | java.util.Arrays.asList() 137 | ), 138 | ShareConfig("share7", 139 | java.util.Arrays.asList( 140 | SchemaConfig( 141 | "schema1", 142 | java.util.Arrays.asList( 143 | TableConfig("table8", s"s3a://${AWS.bucket}/delta-exchange-test/table8") 144 | ) 145 | ), 146 | SchemaConfig( 147 | "schema2", 148 | java.util.Arrays.asList( 149 | TableConfig("table9", s"s3a://${AWS.bucket}/delta-exchange-test/table9") 150 | ) 151 | ) 152 | ) 153 | ), 154 | // scalastyle:off maxLineLength 155 | ShareConfig("share_azure", 156 | java.util.Arrays.asList( 157 | SchemaConfig( 158 | "default", 159 | java.util.Arrays.asList( 160 | TableConfig("table_wasb", s"wasbs://${Azure.container}@${Azure.accountName}.blob.core.windows.net/delta-sharing-test/table1"), 161 | TableConfig("table_abfs", s"abfss://${Azure.container}@${Azure.accountName}.dfs.core.windows.net/delta-sharing-test/table1") 162 | ) 163 | ) 164 | ) 165 | ), 166 | // scalastyle:on 167 | ShareConfig("share_gcp", 168 | java.util.Arrays.asList( 169 | SchemaConfig( 170 | "default", 171 | java.util.Arrays.asList( 172 | TableConfig("table_gcs", s"gs://${GCP.bucket}/delta-sharing-test/table1") 173 | ) 174 | ) 175 | ) 176 | ) 177 | ) 178 | 179 | val serverConfig = new ServerConfig() 180 | serverConfig.setVersion(1) 181 | serverConfig.setShares(shares) 182 | serverConfig.setAuthorization(Authorization(testAuthorizationToken)) 183 | serverConfig.setPort(TEST_PORT) 184 | serverConfig.setSsl(SSLConfig(selfSigned = true, null, null, null)) 185 | serverConfig.setEvaluatePredicateHints(true) 186 | 187 | serverConfig.save(testConfigFile.getCanonicalPath) 188 | testConfigFile 189 | } 190 | } 191 | -------------------------------------------------------------------------------- /spark/src/main/scala/io/delta/sharing/spark/model.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.spark.model 18 | 19 | import com.fasterxml.jackson.annotation.JsonInclude 20 | import org.apache.spark.sql.types.{DataType, LongType, StringType} 21 | import org.codehaus.jackson.annotate.JsonRawValue 22 | 23 | // Information about CDF columns. 24 | private[sharing] object CDFColumnInfo { 25 | // Internal CDF column names. 26 | val commit_version_col_name = "_commit_version" 27 | val commit_timestamp_col_name = "_commit_timestamp" 28 | val change_type_col_name = "_change_type" 29 | 30 | // Returns internal partition schema for internal columns for CDC actions. 31 | def getInternalPartitonSchemaForCDC(): Map[String, DataType] = 32 | Map(commit_version_col_name -> LongType, commit_timestamp_col_name -> LongType) 33 | 34 | // Returns internal partition schema for internal columns for CDF add/remove actions. 
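  // i.e. Map("_commit_version" -> LongType, "_commit_timestamp" -> LongType,
  // "_change_type" -> StringType), built from the column names defined above.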
35 | def getInternalPartitonSchemaForCDFAddRemoveFile(): Map[String, DataType] = 36 | getInternalPartitonSchemaForCDC() + (change_type_col_name -> StringType) 37 | } 38 | 39 | private[sharing] case class DeltaTableMetadata( 40 | version: Long, 41 | protocol: Protocol, 42 | metadata: Metadata) 43 | 44 | private[sharing] case class DeltaTableFiles( 45 | version: Long, 46 | protocol: Protocol, 47 | metadata: Metadata, 48 | files: Seq[AddFile] = Nil, 49 | addFilesForCdf: Seq[AddFileForCDF] = Nil, 50 | cdfFiles: Seq[AddCDCFile] = Nil, 51 | removeFiles: Seq[RemoveFile] = Nil) 52 | 53 | private[sharing] case class Share(name: String) 54 | 55 | private[sharing] case class Schema(name: String, share: String) 56 | 57 | private[sharing] case class Table(name: String, schema: String, share: String) 58 | 59 | private[sharing] case class SingleAction( 60 | file: AddFile = null, 61 | add: AddFileForCDF = null, 62 | cdf: AddCDCFile = null, 63 | remove: RemoveFile = null, 64 | metaData: Metadata = null, 65 | protocol: Protocol = null) { 66 | 67 | def unwrap: Action = { 68 | if (file != null) { 69 | file 70 | } else if (add != null) { 71 | add 72 | } else if (cdf != null) { 73 | cdf 74 | } else if (remove != null) { 75 | remove 76 | } else if (metaData != null) { 77 | metaData 78 | } else if (protocol != null) { 79 | protocol 80 | } else { 81 | null 82 | } 83 | } 84 | } 85 | 86 | private[sharing] case class Format(provider: String = "parquet") 87 | 88 | private[sharing] case class Metadata( 89 | id: String = null, 90 | name: String = null, 91 | description: String = null, 92 | format: Format = Format(), 93 | schemaString: String = null, 94 | configuration: Map[String, String] = Map.empty, 95 | partitionColumns: Seq[String] = Nil) extends Action { 96 | override def wrap: SingleAction = SingleAction(metaData = this) 97 | } 98 | 99 | private[sharing] sealed trait Action { 100 | /** Turn this object to the [[SingleAction]] wrap object. */ 101 | def wrap: SingleAction 102 | } 103 | 104 | private[sharing] case class Protocol(minReaderVersion: Int) extends Action { 105 | override def wrap: SingleAction = SingleAction(protocol = this) 106 | } 107 | 108 | // A common base class for all file actions. 109 | private[sharing] sealed abstract class FileAction( 110 | val url: String, 111 | val id: String, 112 | @JsonInclude(JsonInclude.Include.ALWAYS) 113 | val partitionValues: Map[String, String], 114 | val size: Long) extends Action { 115 | 116 | // Returns the partition values to be used in a data frame. 117 | // By default, we return the input partition values. 118 | // Derived class can override this and add internal partitions values as needed. 119 | // For example, internal CDF columns such as commit version are modeled as partitions. 
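  // Illustrative (values made up): an AddFileForCDF with partitionValues Map("date" -> "2021-01-01"),
  // version 3 and timestamp 1650000000000 reports
  // Map("date" -> "2021-01-01", "_commit_version" -> "3",
  //     "_commit_timestamp" -> "1650000000000", "_change_type" -> "insert")
  // through the overrides on the concrete file actions below.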
120 | def getPartitionValuesInDF(): Map[String, String] = partitionValues 121 | } 122 | 123 | private[sharing] case class AddFile( 124 | override val url: String, 125 | override val id: String, 126 | @JsonInclude(JsonInclude.Include.ALWAYS) 127 | override val partitionValues: Map[String, String], 128 | override val size: Long, 129 | @JsonRawValue 130 | stats: String = null) extends FileAction(url, id, partitionValues, size) { 131 | 132 | override def wrap: SingleAction = SingleAction(file = this) 133 | } 134 | 135 | private[sharing] case class AddFileForCDF( 136 | override val url: String, 137 | override val id: String, 138 | @JsonInclude(JsonInclude.Include.ALWAYS) 139 | override val partitionValues: Map[String, String], 140 | override val size: Long, 141 | version: Long, 142 | timestamp: Long, 143 | @JsonRawValue 144 | stats: String = null) extends FileAction(url, id, partitionValues, size) { 145 | 146 | override def wrap: SingleAction = SingleAction(add = this) 147 | 148 | override def getPartitionValuesInDF(): Map[String, String] = { 149 | partitionValues + 150 | (CDFColumnInfo.commit_version_col_name -> version.toString) + 151 | (CDFColumnInfo.commit_timestamp_col_name -> timestamp.toString) + 152 | (CDFColumnInfo.change_type_col_name -> "insert") 153 | } 154 | } 155 | 156 | private[sharing] case class AddCDCFile( 157 | override val url: String, 158 | override val id: String, 159 | @JsonInclude(JsonInclude.Include.ALWAYS) 160 | override val partitionValues: Map[String, String], 161 | override val size: Long, 162 | version: Long, 163 | timestamp: Long) extends FileAction(url, id, partitionValues, size) { 164 | 165 | override def wrap: SingleAction = SingleAction(cdf = this) 166 | 167 | override def getPartitionValuesInDF(): Map[String, String] = { 168 | partitionValues + 169 | (CDFColumnInfo.commit_version_col_name -> version.toString) + 170 | (CDFColumnInfo.commit_timestamp_col_name -> timestamp.toString) 171 | } 172 | } 173 | 174 | private[sharing] case class RemoveFile( 175 | override val url: String, 176 | override val id: String, 177 | @JsonInclude(JsonInclude.Include.ALWAYS) 178 | override val partitionValues: Map[String, String], 179 | override val size: Long, 180 | version: Long, 181 | timestamp: Long) extends FileAction(url, id, partitionValues, size) { 182 | 183 | override def wrap: SingleAction = SingleAction(remove = this) 184 | 185 | override def getPartitionValuesInDF(): Map[String, String] = { 186 | partitionValues + 187 | (CDFColumnInfo.commit_version_col_name -> version.toString) + 188 | (CDFColumnInfo.commit_timestamp_col_name -> timestamp.toString) + 189 | (CDFColumnInfo.change_type_col_name -> "delete") 190 | } 191 | } 192 | -------------------------------------------------------------------------------- /spark/src/main/scala/io/delta/sharing/spark/RemoteDeltaFileIndex.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.spark 18 | 19 | import java.lang.ref.WeakReference 20 | 21 | import org.apache.hadoop.fs.{FileStatus, Path} 22 | import org.apache.spark.delta.sharing.CachedTableManager 23 | import org.apache.spark.sql.SparkSession 24 | import org.apache.spark.sql.catalyst.InternalRow 25 | import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder 26 | import org.apache.spark.sql.catalyst.expressions.{ 27 | And, 28 | Attribute, 29 | Cast, 30 | Expression, 31 | GenericInternalRow, 32 | Literal, 33 | SubqueryExpression 34 | } 35 | import org.apache.spark.sql.execution.datasources.{ 36 | FileFormat, 37 | FileIndex, 38 | HadoopFsRelation, 39 | PartitionDirectory 40 | } 41 | import org.apache.spark.sql.types.{DataType, StructType} 42 | 43 | import io.delta.sharing.spark.model.{ 44 | AddCDCFile, 45 | AddFile, 46 | CDFColumnInfo, 47 | DeltaTableFiles, 48 | FileAction, 49 | Metadata, 50 | Protocol, 51 | Table => DeltaSharingTable 52 | } 53 | 54 | private[sharing] case class RemoteDeltaFileIndexParams( 55 | val spark: SparkSession, 56 | val snapshotAtAnalysis: RemoteSnapshot) { 57 | def path: Path = snapshotAtAnalysis.getTablePath 58 | } 59 | 60 | // A base class for all file indices for remote delta log. 61 | private[sharing] abstract class RemoteDeltaFileIndexBase( 62 | val params: RemoteDeltaFileIndexParams) extends FileIndex { 63 | override def refresh(): Unit = {} 64 | 65 | override def sizeInBytes: Long = params.snapshotAtAnalysis.sizeInBytes 66 | 67 | override def partitionSchema: StructType = params.snapshotAtAnalysis.partitionSchema 68 | 69 | override def rootPaths: Seq[Path] = params.path :: Nil 70 | 71 | protected def toDeltaSharingPath(f: FileAction): Path = { 72 | DeltaSharingFileSystem.encode(params.path, f) 73 | } 74 | 75 | // A helper function to create partition directories from the specified actions. 76 | protected def makePartitionDirectories(actions: Seq[FileAction]): Seq[PartitionDirectory] = { 77 | val timeZone = params.spark.sessionState.conf.sessionLocalTimeZone 78 | actions.groupBy(_.getPartitionValuesInDF()).map { 79 | case (partitionValues, files) => 80 | val rowValues: Array[Any] = partitionSchema.map { p => 81 | Cast(Literal(partitionValues(p.name)), p.dataType, Option(timeZone)).eval() 82 | }.toArray 83 | 84 | val fileStats = files.map { f => 85 | new FileStatus( 86 | /* length */ f.size, 87 | /* isDir */ false, 88 | /* blockReplication */ 0, 89 | /* blockSize */ 1, 90 | /* modificationTime */ 0, 91 | toDeltaSharingPath(f)) 92 | }.toArray 93 | 94 | try { 95 | // Databricks Runtime has a different `PartitionDirectory.apply` method. We need to use 96 | // Java Reflection to call it. 97 | classOf[PartitionDirectory].getMethod("apply", classOf[InternalRow], fileStats.getClass) 98 | .invoke(null, new GenericInternalRow(rowValues), fileStats) 99 | .asInstanceOf[PartitionDirectory] 100 | } catch { 101 | case _: NoSuchMethodException => 102 | // This is not in Databricks Runtime. We can call Spark's PartitionDirectory directly. 103 | PartitionDirectory(new GenericInternalRow(rowValues), fileStats) 104 | } 105 | }.toSeq 106 | } 107 | } 108 | 109 | // The index for processing files in a delta snapshot. 
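// limitHint is an optional row-count hint; None means the full snapshot is scanned. It is
// presumably populated by the connector's limit push-down, e.g. (illustrative) a query with
// LIMIT 10 could surface here as limitHint = Some(10), which is forwarded to filesForScan so
// the server can return only enough files to cover roughly that many rows.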
110 | private[sharing] case class RemoteDeltaSnapshotFileIndex( 111 | override val params: RemoteDeltaFileIndexParams, 112 | limitHint: Option[Long]) extends RemoteDeltaFileIndexBase(params) { 113 | 114 | override def inputFiles: Array[String] = { 115 | params.snapshotAtAnalysis.filesForScan(Nil, None, this) 116 | .map(f => toDeltaSharingPath(f).toString) 117 | .toArray 118 | } 119 | 120 | override def listFiles( 121 | partitionFilters: Seq[Expression], 122 | dataFilters: Seq[Expression]): Seq[PartitionDirectory] = { 123 | makePartitionDirectories(params.snapshotAtAnalysis.filesForScan( 124 | partitionFilters ++ dataFilters, 125 | limitHint, 126 | this 127 | )) 128 | } 129 | } 130 | 131 | // A base class for all file indices for CDF. 132 | private[sharing] abstract class RemoteDeltaCDFFileIndexBase( 133 | override val params: RemoteDeltaFileIndexParams, 134 | actions: Seq[FileAction], 135 | auxPartitionSchema: Map[String, DataType] = Map.empty) 136 | extends RemoteDeltaFileIndexBase(params) { 137 | 138 | override def partitionSchema: StructType = { 139 | DeltaTableUtils.updateSchema(params.snapshotAtAnalysis.partitionSchema, auxPartitionSchema) 140 | } 141 | 142 | override def inputFiles: Array[String] = { 143 | actions.map(f => toDeltaSharingPath(f).toString).toArray 144 | } 145 | 146 | override def listFiles( 147 | partitionFilters: Seq[Expression], 148 | dataFilters: Seq[Expression]): Seq[PartitionDirectory] = { 149 | // Register the files with the pre-signed url fetcher. 150 | CachedTableManager.INSTANCE 151 | .register(params.path.toString, getIdToUrlMap, new WeakReference(this), () => { 152 | getIdToUrlMap 153 | }) 154 | 155 | // We ignore partition filters for list files, since the server already 156 | // parforms this filtering for CDF. 157 | makePartitionDirectories(actions) 158 | } 159 | 160 | private[sharing] def getIdToUrlMap : Map[String, String] = { 161 | actions.map { action => 162 | action.id -> action.url 163 | }.toMap 164 | } 165 | } 166 | 167 | // The index classes for CDF file types. 168 | 169 | private[sharing] case class RemoteDeltaCDFAddFileIndex( 170 | override val params: RemoteDeltaFileIndexParams, 171 | deltaTableFiles: DeltaTableFiles) 172 | extends RemoteDeltaCDFFileIndexBase( 173 | params, 174 | deltaTableFiles.addFilesForCdf, 175 | CDFColumnInfo.getInternalPartitonSchemaForCDFAddRemoveFile) {} 176 | 177 | private[sharing] case class RemoteDeltaCDCFileIndex( 178 | override val params: RemoteDeltaFileIndexParams, 179 | deltaTableFiles: DeltaTableFiles) 180 | extends RemoteDeltaCDFFileIndexBase( 181 | params, 182 | deltaTableFiles.cdfFiles, 183 | CDFColumnInfo.getInternalPartitonSchemaForCDC) {} 184 | 185 | private[sharing] case class RemoteDeltaCDFRemoveFileIndex( 186 | override val params: RemoteDeltaFileIndexParams, 187 | deltaTableFiles: DeltaTableFiles) 188 | extends RemoteDeltaCDFFileIndexBase( 189 | params, 190 | deltaTableFiles.removeFiles, 191 | CDFColumnInfo.getInternalPartitonSchemaForCDFAddRemoveFile) {} 192 | -------------------------------------------------------------------------------- /spark/src/main/scala/io/delta/sharing/spark/DeltaSharingFileSystem.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.spark 18 | 19 | import java.net.{URI, URLDecoder, URLEncoder} 20 | import java.util.concurrent.TimeUnit 21 | 22 | import org.apache.hadoop.fs._ 23 | import org.apache.hadoop.fs.permission.FsPermission 24 | import org.apache.hadoop.util.Progressable 25 | import org.apache.http.client.config.RequestConfig 26 | import org.apache.http.impl.client.HttpClientBuilder 27 | import org.apache.spark.SparkEnv 28 | import org.apache.spark.delta.sharing.{PreSignedUrlCache, PreSignedUrlFetcher} 29 | import org.apache.spark.network.util.JavaUtils 30 | 31 | import io.delta.sharing.spark.model.FileAction 32 | 33 | /** Read-only file system for delta paths. */ 34 | private[sharing] class DeltaSharingFileSystem extends FileSystem { 35 | import DeltaSharingFileSystem._ 36 | 37 | lazy private val numRetries = { 38 | val numRetries = getConf.getInt("spark.delta.sharing.network.numRetries", 10) 39 | if (numRetries < 0) { 40 | throw new IllegalArgumentException( 41 | "spark.delta.sharing.network.numRetries must not be negative") 42 | } 43 | numRetries 44 | } 45 | 46 | lazy private val timeoutInSeconds = { 47 | val timeoutStr = getConf.get("spark.delta.sharing.network.timeout", "120s") 48 | val timeoutInSeconds = JavaUtils.timeStringAs(timeoutStr, TimeUnit.SECONDS) 49 | if (timeoutInSeconds < 0) { 50 | throw new IllegalArgumentException( 51 | "spark.delta.sharing.network.timeout must not be negative") 52 | } 53 | if (timeoutInSeconds > Int.MaxValue) { 54 | throw new IllegalArgumentException( 55 | s"spark.delta.sharing.network.timeout is too big: $timeoutStr") 56 | } 57 | timeoutInSeconds.toInt 58 | } 59 | 60 | lazy private val httpClient = { 61 | val maxConnections = getConf.getInt("spark.delta.sharing.network.maxConnections", 64) 62 | if (maxConnections < 0) { 63 | throw new IllegalArgumentException( 64 | "spark.delta.sharing.network.maxConnections must not be negative") 65 | } 66 | val config = RequestConfig.custom() 67 | .setConnectTimeout(timeoutInSeconds * 1000) 68 | .setConnectionRequestTimeout(timeoutInSeconds * 1000) 69 | .setSocketTimeout(timeoutInSeconds * 1000).build() 70 | HttpClientBuilder.create() 71 | .setMaxConnTotal(maxConnections) 72 | .setMaxConnPerRoute(maxConnections) 73 | .setDefaultRequestConfig(config) 74 | // Disable the default retry behavior because we have our own retry logic. 75 | // See `RetryUtils.runWithExponentialBackoff`. 
76 | .disableAutomaticRetries() 77 | .build() 78 | } 79 | 80 | private lazy val refreshThresholdMs = getConf.getLong( 81 | "spark.delta.sharing.executor.refreshThresholdMs", 82 | TimeUnit.MINUTES.toMillis(10)) 83 | 84 | private lazy val preSignedUrlCacheRef = PreSignedUrlCache.getEndpointRefInExecutor(SparkEnv.get) 85 | 86 | override def getScheme: String = SCHEME 87 | 88 | override def getUri(): URI = URI.create(s"$SCHEME:///") 89 | 90 | override def open(f: Path, bufferSize: Int): FSDataInputStream = { 91 | val path = DeltaSharingFileSystem.decode(f) 92 | val fetcher = 93 | new PreSignedUrlFetcher(preSignedUrlCacheRef, path.tablePath, path.fileId, refreshThresholdMs) 94 | if (getConf.getBoolean("spark.delta.sharing.loadDataFilesInMemory", false)) { 95 | // `InMemoryHttpInputStream` loads the content into the memory immediately, so we don't need 96 | // to refresh urls. 97 | new FSDataInputStream(new InMemoryHttpInputStream(new URI(fetcher.getUrl()))) 98 | } else { 99 | new FSDataInputStream( 100 | new RandomAccessHttpInputStream(httpClient, fetcher, path.fileSize, statistics, numRetries)) 101 | } 102 | } 103 | 104 | override def create( 105 | f: Path, 106 | permission: FsPermission, 107 | overwrite: Boolean, 108 | bufferSize: Int, 109 | replication: Short, 110 | blockSize: Long, 111 | progress: Progressable): FSDataOutputStream = 112 | throw new UnsupportedOperationException("create") 113 | 114 | override def append(f: Path, bufferSize: Int, progress: Progressable): FSDataOutputStream = 115 | throw new UnsupportedOperationException("append") 116 | 117 | override def rename(src: Path, dst: Path): Boolean = 118 | throw new UnsupportedOperationException("rename") 119 | 120 | override def delete(f: Path, recursive: Boolean): Boolean = 121 | throw new UnsupportedOperationException("delete") 122 | 123 | override def listStatus(f: Path): Array[FileStatus] = 124 | throw new UnsupportedOperationException("listStatus") 125 | 126 | override def setWorkingDirectory(new_dir: Path): Unit = 127 | throw new UnsupportedOperationException("setWorkingDirectory") 128 | 129 | override def getWorkingDirectory: Path = new Path(getUri) 130 | 131 | override def mkdirs(f: Path, permission: FsPermission): Boolean = 132 | throw new UnsupportedOperationException("mkdirs") 133 | 134 | override def getFileStatus(f: Path): FileStatus = { 135 | val resolved = makeQualified(f) 136 | new FileStatus(decode(resolved).fileSize, false, 0, 1, 0, f) 137 | } 138 | 139 | override def finalize(): Unit = { 140 | try super.finalize() finally close() 141 | } 142 | 143 | override def close(): Unit = { 144 | try super.close() finally httpClient.close() 145 | } 146 | } 147 | 148 | private[sharing] object DeltaSharingFileSystem { 149 | 150 | val SCHEME = "delta-sharing" 151 | 152 | case class DeltaSharingPath(tablePath: String, fileId: String, fileSize: Long) { 153 | 154 | /** 155 | * Convert `DeltaSharingPath` to a `Path` in the following format: 156 | * 157 | * ``` 158 | * delta-sharing:///// 159 | * ``` 160 | * 161 | * This format can be decoded by `DeltaSharingFileSystem.decode`. 
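   *
   * Concretely (the table URL and file id below are illustrative, not from this repository):
   *   delta-sharing:///<url-encoded table path>/<url-encoded file id>/<file size>
   *   e.g. delta-sharing:///https%3A%2F%2Fexample.com%2Fshares%2Ftable1/file-0001/1024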
162 | */ 163 | def toPath: Path = { 164 | val encodedTablePath = URLEncoder.encode(tablePath, "UTF-8") 165 | val encodedFileId = URLEncoder.encode(fileId, "UTF-8") 166 | new Path(s"$SCHEME:///$encodedTablePath/$encodedFileId/$fileSize") 167 | } 168 | } 169 | 170 | def encode(tablePath: Path, action: FileAction): Path = { 171 | DeltaSharingPath(tablePath.toString, action.id, action.size).toPath 172 | } 173 | 174 | def decode(path: Path): DeltaSharingPath = { 175 | val encodedPath = path.toString 176 | .stripPrefix(s"$SCHEME:///") 177 | .stripPrefix(s"$SCHEME:/") 178 | val Array(encodedTablePath, encodedFileId, sizeString) = encodedPath.split("/") 179 | DeltaSharingPath( 180 | URLDecoder.decode(encodedTablePath, "UTF-8"), 181 | URLDecoder.decode(encodedFileId, "UTF-8"), 182 | sizeString.toLong) 183 | } 184 | } 185 | -------------------------------------------------------------------------------- /server/src/main/scala/io/delta/sharing/server/SharedTableManager.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.server 18 | 19 | import java.io.IOException 20 | import java.nio.charset.StandardCharsets.UTF_8 21 | import java.util.Base64 22 | 23 | import scala.collection.JavaConverters._ 24 | 25 | import io.delta.sharing.server.config.{SchemaConfig, ServerConfig, ShareConfig, TableConfig} 26 | import io.delta.sharing.server.protocol.{PageToken, Schema, Share, Table} 27 | 28 | /** 29 | * Load the shared tables from `ServerConfig` and provide the pagination APIs to query 30 | * shares/schemas/tables. 
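 *
 * Hedged paging sketch (the counts are illustrative): listShares(None, Some(2)) returns the
 * first two shares plus a base64-encoded PageToken; passing that token back as nextPageToken
 * resumes from the third share. A maxResults outside 0..defaultMaxResults (500) is rejected
 * with a DeltaSharingIllegalArgumentException.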
31 | */ 32 | class SharedTableManager(serverConfig: ServerConfig) { 33 | 34 | private val caseInsensitiveComparer = (a: String, b: String) => a.equalsIgnoreCase(b) 35 | 36 | private val shares = serverConfig.getShares 37 | 38 | private val defaultMaxResults = 500 39 | 40 | private def encodePageToken(id: String, share: Option[String], schema: Option[String]): String = { 41 | val binary = PageToken(id = Option(id), share = share, schema = schema).toByteArray 42 | new String(Base64.getUrlEncoder().encode(binary), UTF_8) 43 | } 44 | 45 | private def decodePageToken( 46 | pageTokenString: String, 47 | expectedShare: Option[String], 48 | expectedSchema: Option[String]): String = { 49 | val pageToken = 50 | try { 51 | val binary = Base64.getUrlDecoder().decode(pageTokenString.getBytes(UTF_8)) 52 | PageToken.parseFrom(binary) 53 | } catch { 54 | case _: IllegalArgumentException | _: IOException => 55 | throw new DeltaSharingIllegalArgumentException("invalid 'nextPageToken'") 56 | } 57 | if (pageToken.id.isEmpty 58 | || pageToken.share != expectedShare 59 | || pageToken.schema != expectedSchema 60 | ) { 61 | throw new DeltaSharingIllegalArgumentException("invalid 'nextPageToken'") 62 | } 63 | pageToken.getId 64 | } 65 | 66 | private def getPage[T]( 67 | nextPageToken: Option[String], 68 | share: Option[String], 69 | schema: Option[String], 70 | maxResults: Option[Int], 71 | totalSize: Int)(func: (Int, Int) => Seq[T]): (Seq[T], Option[String]) = { 72 | assertMaxResults(maxResults) 73 | val start = nextPageToken.map { 74 | pageToken => decodePageToken(pageToken, share, schema).toInt 75 | }.getOrElse(0) 76 | if (start > totalSize) { 77 | throw new DeltaSharingIllegalArgumentException("invalid 'nextPageToken'") 78 | } 79 | val end = start + maxResults.getOrElse(defaultMaxResults) 80 | val results = func(start, end) 81 | val nextId = if (end < totalSize) Some(end) else None 82 | results -> nextId.map(id => encodePageToken(id.toString, share, schema)) 83 | } 84 | 85 | private def assertMaxResults(maxResults: Option[Int]): Unit = { 86 | maxResults.foreach { m => 87 | if (m < 0 || m > defaultMaxResults) { 88 | throw new DeltaSharingIllegalArgumentException( 89 | s"Acceptable values of 'maxResults' are 0 to $defaultMaxResults, inclusive. 
" + 90 | s"(Default: $defaultMaxResults)") 91 | } 92 | } 93 | } 94 | 95 | private def getShareInternal(share: String): ShareConfig = { 96 | shares.asScala.find(s => caseInsensitiveComparer(s.getName, share)) 97 | .getOrElse(throw new DeltaSharingNoSuchElementException(s"share '$share' not found")) 98 | } 99 | 100 | private def getSchema(shareConfig: ShareConfig, schema: String): SchemaConfig = { 101 | shareConfig.getSchemas.asScala.find(s => caseInsensitiveComparer(s.getName, schema)) 102 | .getOrElse(throw new DeltaSharingNoSuchElementException(s"schema '$schema' not found")) 103 | } 104 | 105 | def listShares( 106 | nextPageToken: Option[String] = None, 107 | maxResults: Option[Int] = None): (Seq[Share], Option[String]) = { 108 | getPage(nextPageToken, None, None, maxResults, shares.size) { (start, end) => 109 | shares.asScala.map { share => 110 | Share().withName(share.getName) 111 | }.slice(start, end) 112 | } 113 | } 114 | 115 | def getShare(share: String): Share = { 116 | val shareConfig = getShareInternal(share) 117 | Share().withName(shareConfig.getName) 118 | } 119 | 120 | def listSchemas( 121 | share: String, 122 | nextPageToken: Option[String] = None, 123 | maxResults: Option[Int] = None): (Seq[Schema], Option[String]) = { 124 | val shareConfig = getShareInternal(share) 125 | getPage(nextPageToken, Some(share), None, maxResults, shareConfig.getSchemas.size) { 126 | (start, end) => 127 | shareConfig.getSchemas.asScala.map { schemaConfig => 128 | Schema().withName(schemaConfig.getName).withShare(share) 129 | }.slice(start, end) 130 | } 131 | } 132 | 133 | def listTables( 134 | share: String, 135 | schema: String, 136 | nextPageToken: Option[String] = None, 137 | maxResults: Option[Int] = None): (Seq[Table], Option[String]) = { 138 | val schemaConfig = getSchema(getShareInternal(share), schema) 139 | getPage(nextPageToken, Some(share), Some(schema), maxResults, schemaConfig.getTables.size) { 140 | (start, end) => 141 | schemaConfig.getTables.asScala.map { tableConfig => 142 | Table().withName(tableConfig.getName).withSchema(schema).withShare(share) 143 | }.slice(start, end) 144 | } 145 | } 146 | 147 | def listAllTables( 148 | share: String, 149 | nextPageToken: Option[String] = None, 150 | maxResults: Option[Int] = None): (Seq[Table], Option[String]) = { 151 | val shareConfig = getShareInternal(share) 152 | val totalSize = shareConfig.schemas.asScala.map(_.tables.size).sum 153 | getPage(nextPageToken, Some(share), None, maxResults, totalSize) { 154 | (start, end) => 155 | shareConfig.schemas.asScala.flatMap { schema => 156 | schema.tables.asScala.map { 157 | table => 158 | Table( 159 | name = Some(table.getName), 160 | schema = Some(schema.name), 161 | share = Some(share) 162 | ) 163 | } 164 | }.slice(start, end) 165 | } 166 | } 167 | 168 | def getTable(share: String, schema: String, table: String): TableConfig = { 169 | val schemaConfig = 170 | try { 171 | getSchema(getShareInternal(share), schema) 172 | } catch { 173 | case _: DeltaSharingNoSuchElementException => 174 | throw new DeltaSharingNoSuchElementException( 175 | s"[Share/Schema/Table] '$share/$schema/$table' does not exist, " + 176 | s"please contact your share provider for further information.") 177 | } 178 | schemaConfig.getTables.asScala.find(t => caseInsensitiveComparer(t.getName, table)) 179 | .getOrElse(throw new DeltaSharingNoSuchElementException( 180 | s"[Share/Schema/Table] '$share/$schema/$table' does not exist, " + 181 | s"please contact your share provider for further information.")) 182 | } 183 | } 184 | 
-------------------------------------------------------------------------------- /spark/src/main/scala/io/delta/sharing/spark/RandomAccessHttpInputStream.scala: -------------------------------------------------------------------------------- 1 | // scalastyle:off headerCheck 2 | /** 3 | * Licensed to the Apache Software Foundation (ASF) under one 4 | * or more contributor license agreements. See the NOTICE file 5 | * distributed with this work for additional information 6 | * regarding copyright ownership. The ASF licenses this file 7 | * to you under the Apache License, Version 2.0 (the 8 | * "License"); you may not use this file except in compliance 9 | * with the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | /* 21 | * This file contains code from the Apache Hadoop project (original license above). 22 | * It contains modifications, which are licensed as follows: 23 | */ 24 | 25 | /* 26 | * Copyright (2021) The Delta Lake Project Authors. 27 | * 28 | * Licensed under the Apache License, Version 2.0 (the "License"); 29 | * you may not use this file except in compliance with the License. 30 | * You may obtain a copy of the License at 31 | * 32 | * http://www.apache.org/licenses/LICENSE-2.0 33 | * 34 | * Unless required by applicable law or agreed to in writing, software 35 | * distributed under the License is distributed on an "AS IS" BASIS, 36 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 37 | * See the License for the specific language governing permissions and 38 | * limitations under the License. 39 | */ 40 | package io.delta.sharing.spark 41 | 42 | import java.io.{EOFException, InputStream, IOException} 43 | import java.nio.charset.StandardCharsets.UTF_8 44 | 45 | import org.apache.commons.io.IOUtils 46 | import org.apache.hadoop.fs.{FileSystem, FSExceptionMessages, FSInputStream} 47 | import org.apache.http.HttpStatus 48 | import org.apache.http.client.HttpClient 49 | import org.apache.http.client.methods.{HttpGet, HttpRequestBase} 50 | import org.apache.http.conn.EofSensorInputStream 51 | import org.apache.spark.delta.sharing.PreSignedUrlFetcher 52 | import org.apache.spark.internal.Logging 53 | 54 | import io.delta.sharing.spark.util.{RetryUtils, UnexpectedHttpStatus} 55 | 56 | /** 57 | * This is a special input stream to provide random access over HTTP. This class requires the server 58 | * side to support HTTP Range header. 
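 *
 * Illustrative request (the offsets are made-up values): seek(1024) on a 4096-byte object
 * leads to a GET carrying the header "Range: bytes=1024-4095"; any response other than
 * 206 Partial Content is treated as a failure and retried with exponential backoff.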
59 | */ 60 | private[sharing] class RandomAccessHttpInputStream( 61 | client: HttpClient, 62 | fetcher: PreSignedUrlFetcher, 63 | contentLength: Long, 64 | stats: FileSystem.Statistics, 65 | numRetries: Int) extends FSInputStream with Logging { 66 | 67 | private var closed = false 68 | private var pos = 0L 69 | private var currentStream: InputStream = null 70 | private var uri: String = null 71 | 72 | private def assertNotClosed(): Unit = { 73 | if (closed) { 74 | throw new IOException(FSExceptionMessages.STREAM_IS_CLOSED) 75 | } 76 | val newUrl = fetcher.getUrl() 77 | if (uri != newUrl) { 78 | // Abort the current open stream so that we will re-open a new stream using the new url 79 | uri = newUrl 80 | abortCurrentStream() 81 | } 82 | } 83 | 84 | override def seek(pos: Long): Unit = synchronized { 85 | if (this.pos != pos) { 86 | assertNotClosed() 87 | reopen(pos) 88 | } 89 | } 90 | 91 | override def getPos: Long = synchronized { 92 | pos 93 | } 94 | 95 | override def seekToNewSource(targetPos: Long): Boolean = { 96 | // We don't support this feature 97 | false 98 | } 99 | 100 | override def read(): Int = synchronized { 101 | assertNotClosed() 102 | if (currentStream == null) { 103 | reopen(pos) 104 | } 105 | val byte = currentStream.read() 106 | if (byte >= 0) { 107 | pos += 1 108 | } 109 | if (stats != null && byte >= 0) { 110 | stats.incrementBytesRead(1) 111 | } 112 | byte 113 | } 114 | 115 | private def createHttpRequest(start: Long): HttpRequestBase = { 116 | val request = new HttpGet(uri) 117 | val rangeValue = s"bytes=$start-${contentLength - 1L}" 118 | request.addHeader("Range", rangeValue) 119 | request 120 | } 121 | 122 | override def read(buf: Array[Byte], off: Int, len: Int): Int = synchronized { 123 | assertNotClosed() 124 | if (currentStream == null) { 125 | reopen(pos) 126 | } 127 | val byteRead = currentStream.read(buf, off, len) 128 | if (byteRead > 0) { 129 | pos += byteRead 130 | } 131 | if (stats != null && byteRead > 0) { 132 | stats.incrementBytesRead(byteRead) 133 | } 134 | byteRead 135 | } 136 | 137 | private def reopen(pos: Long): Unit = { 138 | if (currentStream != null) { 139 | logDebug(s"Aborting old stream to open at pos $pos") 140 | abortCurrentStream() 141 | } 142 | if (pos < 0L) { 143 | throw new EOFException(FSExceptionMessages.NEGATIVE_SEEK + " " + pos) 144 | } else if (contentLength > 0L && pos > this.contentLength - 1L) { 145 | throw new EOFException(FSExceptionMessages.CANNOT_SEEK_PAST_EOF + " " + pos) 146 | } else { 147 | logDebug(s"Opening file $uri at pos $pos") 148 | 149 | val entity = RetryUtils.runWithExponentialBackoff(numRetries) { 150 | val httpRequest = createHttpRequest(pos) 151 | val response = client.execute(httpRequest) 152 | val status = response.getStatusLine() 153 | val entity = response.getEntity() 154 | val statusCode = status.getStatusCode 155 | if (statusCode != HttpStatus.SC_PARTIAL_CONTENT) { 156 | // Note: we will still fail if the server returns 200 because it means the server doesn't 157 | // support HTTP Range header. 
158 | val errorBody = if (entity == null) { 159 | "" 160 | } else { 161 | val input = entity.getContent() 162 | try { 163 | IOUtils.toString(input, UTF_8) 164 | } finally { 165 | input.close() 166 | } 167 | } 168 | throw new UnexpectedHttpStatus( 169 | s"HTTP request failed with status: $status $errorBody", 170 | statusCode) 171 | } 172 | entity 173 | } 174 | currentStream = entity.getContent() 175 | this.pos = pos 176 | } 177 | } 178 | 179 | override def available(): Int = synchronized { 180 | assertNotClosed() 181 | currentStream.available() 182 | } 183 | 184 | /** 185 | * Aborts `currentStream` without reading any more data. Apache `HttpClient` tries to read the 186 | * rest of bytes in `Close` in order to reuse the connection. However, it's not efficient when we 187 | * need to discard a lot of bytes. This method provides a way to not reuse the connection when the 188 | * remaining bytes are still a lot. See `EofSensorInputStream` for more details. 189 | */ 190 | private def abortCurrentStream(): Unit = { 191 | if (currentStream != null) { 192 | currentStream match { 193 | case e: EofSensorInputStream => e.abortConnection() 194 | case _ => currentStream.close() 195 | } 196 | currentStream = null 197 | } 198 | } 199 | 200 | override def close(): Unit = synchronized { 201 | if (!closed) { 202 | super.close() 203 | closed = true 204 | if (currentStream != null) { 205 | if (contentLength - pos <= 4096) { 206 | // Close, rather than abort, so that the http connection can be reused. 207 | currentStream.close() 208 | currentStream = null 209 | } else { 210 | // Abort, rather than just close, the underlying stream. Otherwise, the remaining bytes 211 | // are read while closing the stream. 212 | abortCurrentStream() 213 | } 214 | } 215 | } 216 | } 217 | } 218 | -------------------------------------------------------------------------------- /python/delta_sharing/reader.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2021 The Delta Lake Project Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | from typing import Any, Callable, Dict, Optional, Sequence 17 | from urllib.parse import urlparse 18 | from json import loads 19 | 20 | import fsspec 21 | import pandas as pd 22 | from pyarrow.dataset import dataset 23 | 24 | from delta_sharing.converter import to_converters, get_empty_table 25 | from delta_sharing.protocol import AddCdcFile, CdfOptions, FileAction, Table 26 | from delta_sharing.rest_client import DataSharingRestClient 27 | 28 | 29 | class DeltaSharingReader: 30 | def __init__( 31 | self, 32 | table: Table, 33 | rest_client: DataSharingRestClient, 34 | *, 35 | predicateHints: Optional[Sequence[str]] = None, 36 | limit: Optional[int] = None, 37 | version: Optional[int] = None, 38 | ): 39 | self._table = table 40 | self._rest_client = rest_client 41 | 42 | if predicateHints is not None: 43 | assert isinstance(predicateHints, Sequence) 44 | assert all(isinstance(predicateHint, str) for predicateHint in predicateHints) 45 | self._predicateHints = predicateHints 46 | 47 | if limit is not None: 48 | assert isinstance(limit, int) and limit >= 0, "'limit' must be a non-negative int" 49 | self._limit = limit 50 | self._version = version 51 | 52 | @property 53 | def table(self) -> Table: 54 | return self._table 55 | 56 | def predicateHints(self, predicateHints: Optional[Sequence[str]]) -> "DeltaSharingReader": 57 | return self._copy( 58 | predicateHints=predicateHints, 59 | limit=self._limit, 60 | version=self._version 61 | ) 62 | 63 | def limit(self, limit: Optional[int]) -> "DeltaSharingReader": 64 | return self._copy( 65 | predicateHints=self._predicateHints, 66 | limit=limit, 67 | version=self._version 68 | ) 69 | 70 | def to_pandas(self) -> pd.DataFrame: 71 | response = self._rest_client.list_files_in_table( 72 | self._table, 73 | predicateHints=self._predicateHints, 74 | limitHint=self._limit, 75 | version=self._version 76 | ) 77 | 78 | schema_json = loads(response.metadata.schema_string) 79 | 80 | if len(response.add_files) == 0 or self._limit == 0: 81 | return get_empty_table(schema_json) 82 | 83 | converters = to_converters(schema_json) 84 | 85 | if self._limit is None: 86 | pdfs = [ 87 | DeltaSharingReader._to_pandas( 88 | file, converters, False, None) for file in response.add_files 89 | ] 90 | else: 91 | left = self._limit 92 | pdfs = [] 93 | for file in response.add_files: 94 | pdf = DeltaSharingReader._to_pandas(file, converters, False, left) 95 | pdfs.append(pdf) 96 | left -= len(pdf) 97 | assert ( 98 | left >= 0 99 | ), f"'_to_pandas' returned too many rows. 
Required: {left}, returned: {len(pdf)}" 100 | if left == 0: 101 | break 102 | 103 | return pd.concat( 104 | pdfs, 105 | axis=0, 106 | ignore_index=True, 107 | copy=False, 108 | )[[field["name"] for field in schema_json["fields"]]] 109 | 110 | def table_changes_to_pandas(self, cdfOptions: CdfOptions) -> pd.DataFrame: 111 | response = self._rest_client.list_table_changes(self._table, cdfOptions) 112 | 113 | schema_json = loads(response.metadata.schema_string) 114 | 115 | if len(response.actions) == 0: 116 | return get_empty_table(self._add_special_cdf_schema(schema_json)) 117 | 118 | converters = to_converters(schema_json) 119 | pdfs = [] 120 | for action in response.actions: 121 | pdf = DeltaSharingReader._to_pandas(action, converters, True, None) 122 | pdfs.append(pdf) 123 | 124 | return pd.concat(pdfs, axis=0, ignore_index=True, copy=False) 125 | 126 | def _copy( 127 | self, 128 | *, 129 | predicateHints: Optional[Sequence[str]], 130 | limit: Optional[int], 131 | version: Optional[int] 132 | ) -> "DeltaSharingReader": 133 | return DeltaSharingReader( 134 | table=self._table, 135 | rest_client=self._rest_client, 136 | predicateHints=predicateHints, 137 | limit=limit, 138 | version=version 139 | ) 140 | 141 | @staticmethod 142 | def _to_pandas( 143 | action: FileAction, 144 | converters: Dict[str, Callable[[str], Any]], 145 | for_cdf: bool, 146 | limit: Optional[int] 147 | ) -> pd.DataFrame: 148 | url = urlparse(action.url) 149 | if "storage.googleapis.com" in (url.netloc.lower()): 150 | # Apply the yarl patch for GCS pre-signed urls 151 | import delta_sharing._yarl_patch # noqa: F401 152 | 153 | protocol = url.scheme 154 | filesystem = fsspec.filesystem(protocol) 155 | 156 | pa_dataset = dataset(source=action.url, format="parquet", filesystem=filesystem) 157 | pa_table = pa_dataset.head(limit) if limit is not None else pa_dataset.to_table() 158 | pdf = pa_table.to_pandas( 159 | date_as_object=True, use_threads=False, split_blocks=True, self_destruct=True 160 | ) 161 | 162 | for col, converter in converters.items(): 163 | if col not in pdf.columns: 164 | if col in action.partition_values: 165 | if converter is not None: 166 | pdf[col] = converter(action.partition_values[col]) 167 | else: 168 | raise ValueError("Cannot partition on binary or complex columns") 169 | else: 170 | pdf[col] = None 171 | 172 | if for_cdf: 173 | # Add the change type col name to non cdc actions. 174 | if type(action) != AddCdcFile: 175 | pdf[DeltaSharingReader._change_type_col_name()] = action.get_change_type_col_value() 176 | 177 | # If available, add timestamp and version columns from the action. 178 | # All rows of the dataframe will get the same value. 179 | if action.version is not None: 180 | assert DeltaSharingReader._commit_version_col_name() not in pdf.columns 181 | pdf[DeltaSharingReader._commit_version_col_name()] = action.version 182 | 183 | if action.timestamp is not None: 184 | assert DeltaSharingReader._commit_timestamp_col_name() not in pdf.columns 185 | pdf[DeltaSharingReader._commit_timestamp_col_name()] = action.timestamp 186 | return pdf 187 | 188 | # The names of special delta columns for cdf. 
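    # They are appended to the shared table's schema by _add_special_cdf_schema and
    # populated per file action in _to_pandas when reading a change data feed.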
189 | 190 | @staticmethod 191 | def _change_type_col_name(): 192 | return "_change_type" 193 | 194 | @staticmethod 195 | def _commit_timestamp_col_name(): 196 | return "_commit_timestamp" 197 | 198 | @staticmethod 199 | def _commit_version_col_name(): 200 | return "_commit_version" 201 | 202 | @staticmethod 203 | def _add_special_cdf_schema(schema_json: dict) -> dict: 204 | fields = schema_json["fields"] 205 | fields.append({"name" : DeltaSharingReader._change_type_col_name(), "type" : "string"}) 206 | fields.append({"name" : DeltaSharingReader._commit_version_col_name(), "type" : "long"}) 207 | fields.append({"name" : DeltaSharingReader._commit_timestamp_col_name(), "type" : "long"}) 208 | return schema_json 209 | -------------------------------------------------------------------------------- /server/src/main/scala/io/delta/sharing/server/config/ServerConfig.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.server.config 18 | 19 | import java.io.{File, IOException} 20 | import java.util.Collections 21 | 22 | import scala.beans.BeanProperty 23 | 24 | import com.fasterxml.jackson.annotation.JsonInclude 25 | import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper} 26 | import com.fasterxml.jackson.dataformat.yaml.YAMLFactory 27 | 28 | /** A trait that requires to implement */ 29 | trait ConfigItem { 30 | /** Verify whether the config is valid */ 31 | def checkConfig(): Unit 32 | } 33 | 34 | /** 35 | * The class for the server config yaml file. The yaml file will be loaded as this class. 36 | * 37 | * As `jackson-dataformat-yaml` only supports Java, we need to use `@BeanProperty var` to generate 38 | * Java bean classes. 39 | */ 40 | case class ServerConfig( 41 | @BeanProperty var version: java.lang.Integer, 42 | @BeanProperty var shares: java.util.List[ShareConfig], 43 | @BeanProperty var authorization: Authorization, 44 | @BeanProperty var ssl: SSLConfig, 45 | @BeanProperty var host: String, 46 | @BeanProperty var port: Int, 47 | @BeanProperty var endpoint: String, 48 | // The timeout of S3 presigned url in seconds 49 | @BeanProperty var preSignedUrlTimeoutSeconds: Long, 50 | // How many tables to cache in the memory. 51 | @BeanProperty var deltaTableCacheSize: Int, 52 | // Whether we can accept working with a stale version of the table. This is useful when sharing 53 | // static tables that will never be changed. 54 | @BeanProperty var stalenessAcceptable: Boolean, 55 | // Whether to evaluate user provided `predicateHints` 56 | @BeanProperty var evaluatePredicateHints: Boolean, 57 | // The timeout of an incoming web request in seconds. 
Set to 0 for no timeout 58 | @BeanProperty var requestTimeoutSeconds: Long 59 | ) extends ConfigItem { 60 | import ServerConfig._ 61 | 62 | def this() = { 63 | // Set default values here 64 | this( 65 | version = null, 66 | shares = Collections.emptyList(), 67 | authorization = null, 68 | ssl = null, 69 | host = "localhost", 70 | port = 80, 71 | endpoint = "/delta-sharing", 72 | preSignedUrlTimeoutSeconds = 3600, 73 | deltaTableCacheSize = 10, 74 | stalenessAcceptable = false, 75 | evaluatePredicateHints = false, 76 | requestTimeoutSeconds = 30 77 | ) 78 | } 79 | 80 | private def checkVersion(): Unit = { 81 | if (version == null) { 82 | throw new IllegalArgumentException("'version' must be provided") 83 | } 84 | if (version <= 0) { 85 | throw new IllegalArgumentException("'version' must be greater than 0") 86 | } 87 | if (version > CURRENT) { 88 | throw new IllegalArgumentException(s"The 'version' in the server config is $version which " + 89 | s"is too new. The current release supports version $CURRENT and below. " + 90 | s"Please upgrade to a newer release.") 91 | } 92 | } 93 | 94 | def save(configFile: String): Unit = { 95 | ServerConfig.save(this, configFile) 96 | } 97 | 98 | override def checkConfig(): Unit = { 99 | checkVersion() 100 | shares.forEach(_.checkConfig()) 101 | if (authorization != null) { 102 | authorization.checkConfig() 103 | } 104 | if (ssl != null) { 105 | ssl.checkConfig() 106 | } 107 | } 108 | } 109 | 110 | object ServerConfig{ 111 | /** The version that we understand */ 112 | private val CURRENT = 1 113 | 114 | private def createYamlObjectMapper = { 115 | new ObjectMapper(new YAMLFactory) 116 | .setSerializationInclusion(JsonInclude.Include.NON_ABSENT) 117 | .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) 118 | } 119 | 120 | /** 121 | * Load the configurations for the server from the config file. If the file name ends with 122 | * `.yaml` or `.yml`, load it using the YAML parser. Otherwise, throw an error. 123 | */ 124 | def load(configFile: String): ServerConfig = { 125 | if (configFile.endsWith(".yaml") || configFile.endsWith(".yml")) { 126 | val serverConfig = 127 | createYamlObjectMapper.readValue(new File(configFile), classOf[ServerConfig]) 128 | serverConfig.checkConfig() 129 | serverConfig 130 | } else { 131 | throw new IOException("The server config file must be a yml or yaml file") 132 | } 133 | } 134 | 135 | /** 136 | * Serialize the [[ServerConfig]] object to the config file. If the file name ends with `.yaml` 137 | * or `.yml`, save it as a YAML file. Otherwise, throw an error. 
138 | */ 139 | def save(config: ServerConfig, configFile: String): Unit = { 140 | if (configFile.endsWith(".yaml") || configFile.endsWith(".yml")) { 141 | createYamlObjectMapper.writeValue(new File(configFile), config) 142 | } else { 143 | throw new IOException("The server config file must be a yml or yaml file") 144 | } 145 | } 146 | } 147 | 148 | case class Authorization(@BeanProperty var bearerToken: String) extends ConfigItem { 149 | 150 | def this() { 151 | this(null) 152 | } 153 | 154 | override def checkConfig(): Unit = { 155 | if (bearerToken == null) { 156 | throw new IllegalArgumentException("'bearerToken' in 'authorization' must be provided") 157 | } 158 | } 159 | } 160 | 161 | case class SSLConfig( 162 | @BeanProperty var selfSigned: Boolean, 163 | // The file of the PEM-format certificate 164 | @BeanProperty var certificateFile: String, 165 | // The file of the certificate’s private key 166 | @BeanProperty var certificateKeyFile: String, 167 | // The file storing the password to access the above certificate’s private key if it's protected 168 | @BeanProperty var certificatePasswordFile: String) extends ConfigItem { 169 | 170 | def this() { 171 | this(selfSigned = false, null, null, null) 172 | } 173 | 174 | override def checkConfig(): Unit = { 175 | if (!selfSigned) { 176 | if (certificateFile == null) { 177 | throw new IllegalArgumentException("'certificateFile' in a SSL config must be provided") 178 | } 179 | if (certificateKeyFile == null) { 180 | throw new IllegalArgumentException("'certificateKeyFile' in a SSL config must be provided") 181 | } 182 | } 183 | } 184 | } 185 | 186 | case class ShareConfig( 187 | @BeanProperty var name: String, 188 | @BeanProperty var schemas: java.util.List[SchemaConfig]) extends ConfigItem { 189 | 190 | def this() { 191 | this(null, Collections.emptyList()) 192 | } 193 | 194 | override def checkConfig(): Unit = { 195 | if (name == null) { 196 | throw new IllegalArgumentException("'name' in a share must be provided") 197 | } 198 | schemas.forEach(_.checkConfig()) 199 | } 200 | } 201 | 202 | case class SchemaConfig( 203 | @BeanProperty var name: String, 204 | @BeanProperty var tables: java.util.List[TableConfig]) extends ConfigItem { 205 | 206 | def this() { 207 | this(null, Collections.emptyList()) 208 | } 209 | 210 | override def checkConfig(): Unit = { 211 | if (name == null) { 212 | throw new IllegalArgumentException("'name' in a schema must be provided") 213 | } 214 | tables.forEach(_.checkConfig()) 215 | } 216 | } 217 | 218 | case class TableConfig( 219 | @BeanProperty var name: String, 220 | @BeanProperty var location: String, 221 | @BeanProperty var cdfEnabled: Boolean = false, 222 | @BeanProperty var startVersion: Long = 0) extends ConfigItem { 223 | 224 | def this() { 225 | this(null, null) 226 | } 227 | 228 | override def checkConfig(): Unit = { 229 | if (name == null) { 230 | throw new IllegalArgumentException("'name' in a table must be provided") 231 | } 232 | if (location == null) { 233 | throw new IllegalArgumentException("'location' in a table must be provided") 234 | } 235 | } 236 | } 237 | -------------------------------------------------------------------------------- /python/delta_sharing/protocol.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2021 The Delta Lake Project Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | from dataclasses import dataclass, field 17 | from json import loads 18 | from pathlib import Path 19 | from typing import ClassVar, Dict, IO, Optional, Sequence, Union 20 | 21 | import fsspec 22 | 23 | 24 | @dataclass(frozen=True) 25 | class DeltaSharingProfile: 26 | CURRENT: ClassVar[int] = 1 27 | 28 | share_credentials_version: int 29 | endpoint: str 30 | bearer_token: str 31 | expiration_time: Optional[str] = None 32 | 33 | def __post_init__(self): 34 | if self.share_credentials_version > DeltaSharingProfile.CURRENT: 35 | raise ValueError( 36 | "'shareCredentialsVersion' in the profile is " 37 | f"{self.share_credentials_version} which is too new. " 38 | f"The current release supports version {DeltaSharingProfile.CURRENT} and below. " 39 | "Please upgrade to a newer release." 40 | ) 41 | 42 | @staticmethod 43 | def read_from_file(profile: Union[str, IO, Path]) -> "DeltaSharingProfile": 44 | if isinstance(profile, str): 45 | infile = fsspec.open(profile).open() 46 | elif isinstance(profile, Path): 47 | infile = fsspec.open(profile.as_uri()).open() 48 | else: 49 | infile = profile 50 | try: 51 | return DeltaSharingProfile.from_json(infile.read()) 52 | finally: 53 | infile.close() 54 | 55 | @staticmethod 56 | def from_json(json) -> "DeltaSharingProfile": 57 | if isinstance(json, (str, bytes, bytearray)): 58 | json = loads(json) 59 | endpoint = json["endpoint"] 60 | if endpoint.endswith("/"): 61 | endpoint = endpoint[:-1] 62 | expiration_time = json.get("expirationTime") 63 | return DeltaSharingProfile( 64 | share_credentials_version=int(json["shareCredentialsVersion"]), 65 | endpoint=endpoint, 66 | bearer_token=json["bearerToken"], 67 | expiration_time=expiration_time, 68 | ) 69 | 70 | 71 | @dataclass(frozen=True) 72 | class Share: 73 | name: str 74 | 75 | @staticmethod 76 | def from_json(json) -> "Share": 77 | if isinstance(json, (str, bytes, bytearray)): 78 | json = loads(json) 79 | return Share(name=json["name"]) 80 | 81 | 82 | @dataclass(frozen=True) 83 | class Schema: 84 | name: str 85 | share: str 86 | 87 | @staticmethod 88 | def from_json(json) -> "Schema": 89 | if isinstance(json, (str, bytes, bytearray)): 90 | json = loads(json) 91 | return Schema(name=json["name"], share=json["share"]) 92 | 93 | 94 | @dataclass(frozen=True) 95 | class Table: 96 | name: str 97 | share: str 98 | schema: str 99 | 100 | @staticmethod 101 | def from_json(json) -> "Table": 102 | if isinstance(json, (str, bytes, bytearray)): 103 | json = loads(json) 104 | return Table(name=json["name"], share=json["share"], schema=json["schema"]) 105 | 106 | 107 | @dataclass(frozen=True) 108 | class Protocol: 109 | CURRENT: ClassVar[int] = 1 110 | 111 | min_reader_version: int 112 | 113 | def __post_init__(self): 114 | if self.min_reader_version > Protocol.CURRENT: 115 | raise ValueError( 116 | f"The table requires a newer version {self.min_reader_version} to read. " 117 | f"But the current release supports version {Protocol.CURRENT} and below. " 118 | f"Please upgrade to a newer release." 
119 | ) 120 | 121 | @staticmethod 122 | def from_json(json) -> "Protocol": 123 | if isinstance(json, (str, bytes, bytearray)): 124 | json = loads(json) 125 | return Protocol(min_reader_version=int(json["minReaderVersion"])) 126 | 127 | 128 | @dataclass(frozen=True) 129 | class Format: 130 | provider: str = "parquet" 131 | options: Dict[str, str] = field(default_factory=dict) 132 | 133 | @staticmethod 134 | def from_json(json) -> "Format": 135 | if isinstance(json, (str, bytes, bytearray)): 136 | json = loads(json) 137 | return Format(provider=json.get("provider", "parquet"), options=json.get("options", {})) 138 | 139 | 140 | @dataclass(frozen=True) 141 | class Metadata: 142 | id: Optional[str] = None 143 | name: Optional[str] = None 144 | description: Optional[str] = None 145 | format: Format = field(default_factory=Format) 146 | schema_string: Optional[str] = None 147 | configuration: Dict[str, str] = field(default_factory=dict) 148 | partition_columns: Sequence[str] = field(default_factory=list) 149 | 150 | @staticmethod 151 | def from_json(json) -> "Metadata": 152 | if isinstance(json, (str, bytes, bytearray)): 153 | json = loads(json) 154 | if "configuration" in json: 155 | configuration = json["configuration"] 156 | else: 157 | configuration = {} 158 | return Metadata( 159 | id=json["id"], 160 | name=json.get("name", None), 161 | description=json.get("description", None), 162 | format=Format.from_json(json["format"]), 163 | schema_string=json["schemaString"], 164 | configuration=configuration, 165 | partition_columns=json["partitionColumns"], 166 | ) 167 | 168 | 169 | @dataclass(frozen=True) 170 | class FileAction: 171 | url: str 172 | id: str 173 | partition_values: Dict[str, str] 174 | size: int 175 | timestamp: Optional[int] = None 176 | version: Optional[int] = None 177 | 178 | def get_change_type_col_value(self) -> str: 179 | raise ValueError(f"_change_type not supported for {self.url}") 180 | 181 | @staticmethod 182 | def from_json(action_json) -> "FileAction": 183 | if "add" in action_json: 184 | return AddFile.from_json(action_json["add"]) 185 | elif "cdf" in action_json: 186 | return AddCdcFile.from_json(action_json["cdf"]) 187 | elif "remove" in action_json: 188 | return RemoveFile.from_json(action_json["remove"]) 189 | else: 190 | return None 191 | 192 | 193 | @dataclass(frozen=True) 194 | class AddFile(FileAction): 195 | stats: Optional[str] = None 196 | 197 | @staticmethod 198 | def from_json(json) -> "AddFile": 199 | if isinstance(json, (str, bytes, bytearray)): 200 | json = loads(json) 201 | return AddFile( 202 | url=json["url"], 203 | id=json["id"], 204 | partition_values=json["partitionValues"], 205 | size=int(json["size"]), 206 | stats=json.get("stats", None), 207 | timestamp=json.get("timestamp", None), 208 | version=json.get("version", None), 209 | ) 210 | 211 | def get_change_type_col_value(self) -> str: 212 | return "insert" 213 | 214 | 215 | @dataclass(frozen=True) 216 | class AddCdcFile(FileAction): 217 | @staticmethod 218 | def from_json(json) -> "AddCdcFile": 219 | if isinstance(json, (str, bytes, bytearray)): 220 | json = loads(json) 221 | return AddCdcFile( 222 | url=json["url"], 223 | id=json["id"], 224 | partition_values=json["partitionValues"], 225 | size=int(json["size"]), 226 | timestamp=json["timestamp"], 227 | version=json["version"], 228 | ) 229 | 230 | 231 | @dataclass(frozen=True) 232 | class RemoveFile(FileAction): 233 | @staticmethod 234 | def from_json(json) -> "RemoveFile": 235 | if isinstance(json, (str, bytes, bytearray)): 236 | json = 
loads(json) 237 | return RemoveFile( 238 | url=json["url"], 239 | id=json["id"], 240 | partition_values=json["partitionValues"], 241 | size=int(json["size"]), 242 | timestamp=json.get("timestamp", None), 243 | version=json.get("version", None), 244 | ) 245 | 246 | def get_change_type_col_value(self) -> str: 247 | return "delete" 248 | 249 | 250 | @dataclass(frozen=True) 251 | class CdfOptions: 252 | starting_version: Optional[int] = None 253 | ending_version: Optional[int] = None 254 | starting_timestamp: Optional[str] = None 255 | ending_timestamp: Optional[str] = None 256 | -------------------------------------------------------------------------------- /server/src/main/scala/io/delta/sharing/server/CloudFileSigner.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (2021) The Delta Lake Project Authors. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package io.delta.sharing.server 18 | 19 | import java.net.URI 20 | import java.util.Date 21 | import java.util.concurrent.TimeUnit.SECONDS 22 | 23 | import com.amazonaws.HttpMethod 24 | import com.amazonaws.services.s3.model.GeneratePresignedUrlRequest 25 | import com.google.cloud.hadoop.gcsio.StorageResourceId 26 | import com.google.cloud.storage.BlobId 27 | import com.google.cloud.storage.BlobInfo 28 | import com.google.cloud.storage.Storage 29 | import com.google.cloud.storage.StorageOptions 30 | import com.microsoft.azure.storage.{CloudStorageAccount, SharedAccessProtocols, StorageCredentialsSharedAccessSignature} 31 | import com.microsoft.azure.storage.blob.{SharedAccessBlobPermissions, SharedAccessBlobPolicy} 32 | import org.apache.hadoop.conf.Configuration 33 | import org.apache.hadoop.fs.Path 34 | import org.apache.hadoop.fs.azure.{AzureNativeFileSystemStore, NativeAzureFileSystem} 35 | import org.apache.hadoop.fs.azurebfs.{AzureBlobFileSystem, AzureBlobFileSystemStore} 36 | import org.apache.hadoop.fs.azurebfs.services.AuthType 37 | import org.apache.hadoop.fs.s3a.DefaultS3ClientFactory 38 | import org.apache.hadoop.util.ReflectionUtils 39 | 40 | 41 | trait CloudFileSigner { 42 | def sign(path: Path): String 43 | } 44 | 45 | class S3FileSigner( 46 | name: URI, 47 | conf: Configuration, 48 | preSignedUrlTimeoutSeconds: Long) extends CloudFileSigner { 49 | 50 | private val s3Client = ReflectionUtils.newInstance(classOf[DefaultS3ClientFactory], conf) 51 | .createS3Client(name) 52 | 53 | override def sign(path: Path): String = { 54 | val absPath = path.toUri 55 | val bucketName = absPath.getHost 56 | val objectKey = absPath.getPath.stripPrefix("/") 57 | val expiration = 58 | new Date(System.currentTimeMillis() + SECONDS.toMillis(preSignedUrlTimeoutSeconds)) 59 | assert(objectKey.nonEmpty, s"cannot get object key from $path") 60 | val request = new GeneratePresignedUrlRequest(bucketName, objectKey) 61 | .withMethod(HttpMethod.GET) 62 | .withExpiration(expiration) 63 | s3Client.generatePresignedUrl(request).toString 64 | } 65 
| } 66 | 67 | class AzureFileSigner( 68 | accountName: String, 69 | storageKey: String, 70 | container: String, 71 | preSignedUrlTimeoutSeconds: Long, 72 | objectKeyExtractor: Path => String) extends CloudFileSigner { 73 | 74 | private val (rawAccountName, endpointSuffix) = { 75 | val splits = accountName.split("\\.", 3) 76 | if (splits.length != 3) { 77 | throw new IllegalArgumentException(s"Incorrect account name: $accountName") 78 | } 79 | (splits(0), splits(2)) 80 | } 81 | 82 | private def getCloudStorageAccount: CloudStorageAccount = { 83 | val connectionString = Seq( 84 | "DefaultEndpointsProtocol=https", 85 | s"AccountName=$rawAccountName", 86 | s"AccountKey=$storageKey", 87 | s"EndpointSuffix=$endpointSuffix" 88 | ).mkString(";") 89 | CloudStorageAccount.parse(connectionString) 90 | } 91 | 92 | private val cloudStorageAccount = getCloudStorageAccount 93 | 94 | private val blobClient = cloudStorageAccount.createCloudBlobClient() 95 | 96 | private def getAccessPolicy: SharedAccessBlobPolicy = { 97 | val expiration = 98 | new Date(System.currentTimeMillis() + SECONDS.toMillis(preSignedUrlTimeoutSeconds)) 99 | val sharedAccessPolicy = new SharedAccessBlobPolicy() 100 | sharedAccessPolicy.setPermissions(java.util.EnumSet.of(SharedAccessBlobPermissions.READ)) 101 | sharedAccessPolicy.setSharedAccessExpiryTime(expiration) 102 | sharedAccessPolicy 103 | } 104 | 105 | override def sign(path: Path): String = { 106 | val containerRef = blobClient.getContainerReference(container) 107 | val objectKey = objectKeyExtractor(path) 108 | assert(objectKey.nonEmpty, s"cannot get object key from $path") 109 | val blobRef = containerRef.getBlockBlobReference(objectKey) 110 | val accessPolicy = getAccessPolicy 111 | val sasToken = blobRef.generateSharedAccessSignature( 112 | accessPolicy, 113 | /* headers */ null, 114 | /* groupPolicyIdentifier */ null, 115 | /* ipRange */ null, 116 | SharedAccessProtocols.HTTPS_ONLY 117 | ) 118 | val sasTokenCredentials = new StorageCredentialsSharedAccessSignature(sasToken) 119 | sasTokenCredentials.transformUri(blobRef.getUri).toString 120 | } 121 | } 122 | 123 | object WasbFileSigner { 124 | private def getAccountFromAuthority(store: AzureNativeFileSystemStore, uri: URI): String = { 125 | val getAccountFromAuthorityMethod = classOf[AzureNativeFileSystemStore] 126 | .getDeclaredMethod("getAccountFromAuthority", classOf[URI]) 127 | getAccountFromAuthorityMethod.setAccessible(true) 128 | getAccountFromAuthorityMethod.invoke(store, uri).asInstanceOf[String] 129 | } 130 | 131 | private def getContainerFromAuthority(store: AzureNativeFileSystemStore, uri: URI): String = { 132 | val getContainerFromAuthorityMethod = classOf[AzureNativeFileSystemStore] 133 | .getDeclaredMethod("getContainerFromAuthority", classOf[URI]) 134 | getContainerFromAuthorityMethod.setAccessible(true) 135 | getContainerFromAuthorityMethod.invoke(store, uri).asInstanceOf[String] 136 | } 137 | 138 | def apply( 139 | fs: NativeAzureFileSystem, 140 | uri: URI, 141 | conf: Configuration, 142 | preSignedUrlTimeoutSeconds: Long): CloudFileSigner = { 143 | val accountName = getAccountFromAuthority(fs.getStore, uri) 144 | val accountKey = AzureNativeFileSystemStore.getAccountKeyFromConfiguration(accountName, conf) 145 | val container = getContainerFromAuthority(fs.getStore, uri) 146 | new AzureFileSigner( 147 | accountName, 148 | accountKey, 149 | container, 150 | preSignedUrlTimeoutSeconds, 151 | fs.pathToKey) 152 | } 153 | } 154 | 155 | object AbfsFileSigner { 156 | private def getAbfsStore(fs: 
AzureBlobFileSystem): AzureBlobFileSystemStore = { 157 | val getAbfsStoreMethod = classOf[AzureBlobFileSystem].getDeclaredMethod("getAbfsStore") 158 | getAbfsStoreMethod.setAccessible(true) 159 | getAbfsStoreMethod.invoke(fs).asInstanceOf[AzureBlobFileSystemStore] 160 | } 161 | 162 | private def getRelativePath(abfsStore: AzureBlobFileSystemStore, path: Path): String = { 163 | val getRelativePathMethod = classOf[AzureBlobFileSystemStore] 164 | .getDeclaredMethod("getRelativePath", classOf[Path]) 165 | getRelativePathMethod.setAccessible(true) 166 | getRelativePathMethod.invoke(abfsStore, path).asInstanceOf[String] 167 | } 168 | 169 | private def authorityParts(abfsStore: AzureBlobFileSystemStore, uri: URI): Array[String] = { 170 | val authorityPartsMethod = classOf[AzureBlobFileSystemStore] 171 | .getDeclaredMethod("authorityParts", classOf[URI]) 172 | authorityPartsMethod.setAccessible(true) 173 | authorityPartsMethod.invoke(abfsStore, uri).asInstanceOf[Array[String]] 174 | } 175 | 176 | def apply( 177 | fs: AzureBlobFileSystem, 178 | uri: URI, 179 | preSignedUrlTimeoutSeconds: Long): CloudFileSigner = { 180 | val abfsStore = getAbfsStore(fs) 181 | val abfsConfiguration = abfsStore.getAbfsConfiguration 182 | val accountName = abfsConfiguration.accountConf("dummy").stripPrefix("dummy.") 183 | val authType = abfsConfiguration.getAuthType(accountName) 184 | if (authType != AuthType.SharedKey) { 185 | throw new UnsupportedOperationException(s"unsupported auth type: $authType") 186 | } 187 | val accountKey = abfsConfiguration.getStorageAccountKey 188 | val container = authorityParts(abfsStore, uri)(0) 189 | new AzureFileSigner( 190 | accountName, 191 | accountKey, 192 | container, 193 | preSignedUrlTimeoutSeconds, 194 | getRelativePath(abfsStore, _)) 195 | } 196 | } 197 | 198 | class GCSFileSigner( 199 | name: URI, 200 | conf: Configuration, 201 | preSignedUrlTimeoutSeconds: Long) extends CloudFileSigner { 202 | 203 | private val storage = StorageOptions.newBuilder.build.getService 204 | 205 | override def sign(path: Path): String = { 206 | val (bucketName, objectName) = GCSFileSigner.getBucketAndObjectNames(path) 207 | assert(objectName.nonEmpty, s"cannot get object key from $path") 208 | val blobInfo = BlobInfo.newBuilder(BlobId.of(bucketName, objectName)).build 209 | storage.signUrl( 210 | blobInfo, preSignedUrlTimeoutSeconds, SECONDS, Storage.SignUrlOption.withV4Signature()) 211 | .toString 212 | } 213 | } 214 | 215 | object GCSFileSigner { 216 | def getBucketAndObjectNames(path: Path): (String, String) = { 217 | val resourceId = StorageResourceId.fromUriPath(path.toUri, false /* = allowEmptyObjectName */) 218 | (resourceId.getBucketName, resourceId.getObjectName) 219 | } 220 | } 221 | -------------------------------------------------------------------------------- /python/dev/lint-python: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Copyright (C) 2021 The Delta Lake Project Authors. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | FLAKE8_BUILD="flake8" 19 | MINIMUM_FLAKE8="3.5.0" 20 | 21 | PYCODESTYLE_BUILD="pycodestyle" 22 | MINIMUM_PYCODESTYLE="2.7.0" 23 | 24 | SPHINX_BUILD="sphinx-build" 25 | 26 | PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE:-python3}" 27 | 28 | MYPY_BUILD="mypy" 29 | 30 | BLACK_BUILD="$PYTHON_EXECUTABLE -m black" 31 | 32 | function satisfies_min_version { 33 | local provided_version="$1" 34 | local expected_version="$2" 35 | echo "$( 36 | "$PYTHON_EXECUTABLE" << EOM 37 | from setuptools.extern.packaging import version 38 | print(version.parse('$provided_version') >= version.parse('$expected_version')) 39 | EOM 40 | )" 41 | } 42 | 43 | function compile_python_test { 44 | local COMPILE_STATUS= 45 | local COMPILE_REPORT= 46 | 47 | if [[ ! "$1" ]]; then 48 | echo "No python files found! Something is very wrong -- exiting." 49 | exit 1; 50 | fi 51 | 52 | # compileall: https://docs.python.org/3/library/compileall.html 53 | echo "starting python compilation test..." 54 | COMPILE_REPORT=$( ("$PYTHON_EXECUTABLE" -B -mcompileall -q -l -x "[/\\\\][.]git" $1) 2>&1) 55 | COMPILE_STATUS=$? 56 | 57 | if [ $COMPILE_STATUS -ne 0 ]; then 58 | echo "Python compilation failed with the following errors:" 59 | echo "$COMPILE_REPORT" 60 | echo "$COMPILE_STATUS" 61 | exit "$COMPILE_STATUS" 62 | else 63 | echo "python compilation succeeded." 64 | echo 65 | fi 66 | } 67 | 68 | function pycodestyle_test { 69 | local PYCODESTYLE_STATUS= 70 | local PYCODESTYLE_REPORT= 71 | local RUN_LOCAL_PYCODESTYLE= 72 | local PYCODESTYLE_VERSION= 73 | local EXPECTED_PYCODESTYLE= 74 | local PYCODESTYLE_SCRIPT_PATH="$SPARK_ROOT_DIR/dev/pycodestyle-$MINIMUM_PYCODESTYLE.py" 75 | local PYCODESTYLE_SCRIPT_REMOTE_PATH="https://raw.githubusercontent.com/PyCQA/pycodestyle/$MINIMUM_PYCODESTYLE/pycodestyle.py" 76 | 77 | if [[ ! "$1" ]]; then 78 | echo "No python files found! Something is very wrong -- exiting." 79 | exit 1; 80 | fi 81 | 82 | # check for locally installed pycodestyle & version 83 | RUN_LOCAL_PYCODESTYLE="False" 84 | if hash "$PYCODESTYLE_BUILD" 2> /dev/null; then 85 | PYCODESTYLE_VERSION="$($PYCODESTYLE_BUILD --version)" 86 | EXPECTED_PYCODESTYLE="$(satisfies_min_version $PYCODESTYLE_VERSION $MINIMUM_PYCODESTYLE)" 87 | if [ "$EXPECTED_PYCODESTYLE" == "True" ]; then 88 | RUN_LOCAL_PYCODESTYLE="True" 89 | fi 90 | fi 91 | 92 | # download the right version or run locally 93 | if [ $RUN_LOCAL_PYCODESTYLE == "False" ]; then 94 | # Get pycodestyle at runtime so that we don't rely on it being installed on the build server. 95 | # See: https://github.com/apache/spark/pull/1744#issuecomment-50982162 96 | # Updated to the latest official version of pep8. pep8 is formally renamed to pycodestyle. 97 | echo "downloading pycodestyle from $PYCODESTYLE_SCRIPT_REMOTE_PATH..." 98 | if [ ! -e "$PYCODESTYLE_SCRIPT_PATH" ]; then 99 | curl --silent -o "$PYCODESTYLE_SCRIPT_PATH" "$PYCODESTYLE_SCRIPT_REMOTE_PATH" 100 | local curl_status="$?" 101 | 102 | if [ "$curl_status" -ne 0 ]; then 103 | echo "Failed to download pycodestyle.py from $PYCODESTYLE_SCRIPT_REMOTE_PATH" 104 | exit "$curl_status" 105 | fi 106 | fi 107 | 108 | echo "starting pycodestyle test..." 109 | PYCODESTYLE_REPORT=$( ("$PYTHON_EXECUTABLE" "$PYCODESTYLE_SCRIPT_PATH" --config=dev/tox.ini $1) 2>&1) 110 | PYCODESTYLE_STATUS=$? 111 | else 112 | # we have the right version installed, so run locally 113 | echo "starting pycodestyle test..." 
114 | PYCODESTYLE_REPORT=$( ($PYCODESTYLE_BUILD --config=dev/tox.ini $1) 2>&1) 115 | PYCODESTYLE_STATUS=$? 116 | fi 117 | 118 | if [ $PYCODESTYLE_STATUS -ne 0 ]; then 119 | echo "pycodestyle checks failed:" 120 | echo "$PYCODESTYLE_REPORT" 121 | exit "$PYCODESTYLE_STATUS" 122 | else 123 | echo "pycodestyle checks passed." 124 | echo 125 | fi 126 | } 127 | 128 | function flake8_test { 129 | local FLAKE8_VERSION= 130 | local EXPECTED_FLAKE8= 131 | local FLAKE8_REPORT= 132 | local FLAKE8_STATUS= 133 | 134 | if ! hash "$FLAKE8_BUILD" 2> /dev/null; then 135 | echo "The flake8 command was not found." 136 | echo "flake8 checks failed." 137 | exit 1 138 | fi 139 | 140 | _FLAKE8_VERSION=($($FLAKE8_BUILD --version)) 141 | FLAKE8_VERSION="${_FLAKE8_VERSION[0]}" 142 | EXPECTED_FLAKE8="$(satisfies_min_version $FLAKE8_VERSION $MINIMUM_FLAKE8)" 143 | 144 | if [[ "$EXPECTED_FLAKE8" == "False" ]]; then 145 | echo "\ 146 | The minimum flake8 version needs to be $MINIMUM_FLAKE8. Your current version is $FLAKE8_VERSION 147 | 148 | flake8 checks failed." 149 | exit 1 150 | fi 151 | 152 | echo "starting $FLAKE8_BUILD test..." 153 | FLAKE8_REPORT=$( ($FLAKE8_BUILD . --count --select=E901,E999,F821,F822,F823,F401,F405 \ 154 | --exclude="docs/build/html/reference/api/*.py","build" \ 155 | --max-line-length=100 --show-source --statistics) 2>&1) 156 | FLAKE8_STATUS=$? 157 | 158 | if [ "$FLAKE8_STATUS" -ne 0 ]; then 159 | echo "flake8 checks failed:" 160 | echo "$FLAKE8_REPORT" 161 | echo "$FLAKE8_STATUS" 162 | exit "$FLAKE8_STATUS" 163 | else 164 | echo "flake8 checks passed." 165 | echo 166 | fi 167 | } 168 | 169 | function mypy_test { 170 | local MYPY_REPORT= 171 | local MYPY_STATUS= 172 | 173 | # Skip check if mypy is not installed. 174 | if ! hash "$MYPY_BUILD" 2> /dev/null; then 175 | echo "The $MYPY_BUILD command was not found. Skipping mypy checks for now." 176 | echo 177 | return 178 | fi 179 | 180 | echo "starting mypy test..." 181 | MYPY_REPORT=$( ($MYPY_BUILD --package delta_sharing --show-error-context --no-strict-optional --ignore-missing-imports) 2>&1) 182 | MYPY_STATUS=$? 183 | 184 | if [ "$MYPY_STATUS" -ne 0 ]; then 185 | echo "mypy checks failed:" 186 | echo "$MYPY_REPORT" 187 | echo "$MYPY_STATUS" 188 | exit "$MYPY_STATUS" 189 | else 190 | echo "mypy checks passed." 191 | echo 192 | fi 193 | } 194 | 195 | function sphinx_test { 196 | local SPHINX_REPORT= 197 | local SPHINX_STATUS= 198 | 199 | python -c "import sys; assert sys.version_info >= (3, 6), 'Sphinx build requires Python 3.6+, skipping for now.'" 200 | exit_code=$? 201 | if [ $exit_code -ne 0 ]; then 202 | return 203 | fi 204 | 205 | # Check that the documentation builds acceptably, skip check if sphinx is not installed. 206 | if ! hash "$SPHINX_BUILD" 2> /dev/null; then 207 | echo "The $SPHINX_BUILD command was not found. Skipping pydoc checks for now." 208 | echo 209 | return 210 | fi 211 | 212 | echo "starting $SPHINX_BUILD tests..." 213 | pushd docs &> /dev/null 214 | make clean &> /dev/null 215 | # Treat warnings as errors so we stop correctly 216 | SPHINX_REPORT=$( (SPHINXOPTS="-a -W" make html) 2>&1) 217 | SPHINX_STATUS=$? 218 | 219 | if [ "$SPHINX_STATUS" -ne 0 ]; then 220 | echo "$SPHINX_BUILD checks failed:" 221 | echo "$SPHINX_REPORT" 222 | echo 223 | echo "re-running make html to print full warning list:" 224 | make clean &> /dev/null 225 | SPHINX_REPORT=$( (SPHINXOPTS="-a" make html) 2>&1) 226 | echo "$SPHINX_REPORT" 227 | exit "$SPHINX_STATUS" 228 | else 229 | echo "$SPHINX_BUILD checks passed." 
230 | echo 231 | fi 232 | 233 | popd &> /dev/null 234 | } 235 | 236 | function black_test { 237 | local BLACK_REPORT= 238 | local BLACK_STATUS= 239 | 240 | # Skip check if black is not installed. 241 | $BLACK_BUILD 2> /dev/null 242 | if [ $? -ne 0 ]; then 243 | echo "The $BLACK_BUILD command was not found. Skipping black checks for now." 244 | echo 245 | return 246 | fi 247 | 248 | echo "starting black test..." 249 | BLACK_REPORT=$( ($BLACK_BUILD delta_sharing --line-length 100 --check --diff) 2>&1) 250 | BLACK_STATUS=$? 251 | 252 | if [ "$BLACK_STATUS" -ne 0 ]; then 253 | echo "black checks failed:" 254 | echo "$BLACK_REPORT" 255 | echo "Please run 'dev/reformat' script." 256 | echo "$BLACK_STATUS" 257 | exit "$BLACK_STATUS" 258 | else 259 | echo "black checks passed." 260 | echo 261 | fi 262 | } 263 | 264 | SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" 265 | SPARK_ROOT_DIR="$(dirname "${SCRIPT_DIR}")" 266 | 267 | pushd "$SPARK_ROOT_DIR" &> /dev/null 268 | 269 | PYTHON_SOURCE="$(find . -name "*.py")" 270 | 271 | compile_python_test "$PYTHON_SOURCE" 272 | black_test 273 | pycodestyle_test "$PYTHON_SOURCE" 274 | flake8_test 275 | mypy_test 276 | sphinx_test 277 | 278 | echo 279 | echo "all lint-python tests passed!" 280 | 281 | popd &> /dev/null 282 | --------------------------------------------------------------------------------
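
Illustrative usage (not part of the repository): a minimal sketch of how DeltaSharingProfile (python/delta_sharing/protocol.py) and DeltaSharingReader (python/delta_sharing/reader.py) listed above fit together for a filtered, limited read. It assumes that DataSharingRestClient — imported by reader.py but not included in this listing — can be constructed directly from a profile object; the profile path, predicate hint, and share/schema/table names are placeholders.

from delta_sharing.protocol import DeltaSharingProfile, Table
from delta_sharing.reader import DeltaSharingReader
# Assumption: DataSharingRestClient(profile) is a valid constructor; the class lives in
# delta_sharing/rest_client.py, which is not part of this listing.
from delta_sharing.rest_client import DataSharingRestClient

# Load a recipient profile file (shareCredentialsVersion, endpoint, bearerToken).
profile = DeltaSharingProfile.read_from_file("/path/to/profile.share")  # placeholder path

# Point the reader at a shared table; the coordinates below are placeholders.
table = Table(name="my_table", share="my_share", schema="my_schema")
reader = DeltaSharingReader(table=table, rest_client=DataSharingRestClient(profile))

# predicateHints() and limit() return new readers via _copy(); to_pandas() downloads the
# pre-signed Parquet files returned by the server and concatenates them into one DataFrame.
pdf = reader.predicateHints(["date >= '2021-01-01'"]).limit(1000).to_pandas()
print(pdf.head())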