├── version.sbt
├── .gitattributes
├── images
│   └── delta-sharing.png
├── spark
│   └── src
│       ├── main
│       │   ├── resources
│       │   │   └── META-INF
│       │   │       └── services
│       │   │           ├── org.apache.hadoop.fs.FileSystem
│       │   │           └── org.apache.spark.sql.sources.DataSourceRegister
│       │   └── scala
│       │       ├── org
│       │       │   └── apache
│       │       │       └── spark
│       │       │           └── sql
│       │       │               └── DeltaSharingScanUtils.scala
│       │       └── io
│       │           └── delta
│       │               └── sharing
│       │                   └── spark
│       │                       ├── util
│       │                       │   ├── JsonUtils.scala
│       │                       │   └── RetryUtils.scala
│       │                       ├── perf
│       │                       │   └── DeltaSharingLimitPushDown.scala
│       │                       ├── InMemoryHttpInputStream.scala
│       │                       ├── DeltaSharingProfileProvider.scala
│       │                       ├── RemoteDeltaCDFRelation.scala
│       │                       ├── DeltaSharingDataSource.scala
│       │                       ├── model.scala
│       │                       ├── RemoteDeltaFileIndex.scala
│       │                       ├── DeltaSharingFileSystem.scala
│       │                       └── RandomAccessHttpInputStream.scala
│       └── test
│           ├── scala
│           │   ├── io
│           │   │   └── delta
│           │   │       └── sharing
│           │   │           └── spark
│           │   │               ├── util
│           │   │               │   └── RetryUtilsSuite.scala
│           │   │               ├── DeltaSharingFileSystemSuite.scala
│           │   │               ├── TestDeltaSharingClient.scala
│           │   │               ├── DeltaSharingFileProfileProviderSuite.scala
│           │   │               └── DeltaSharingIntegrationTest.scala
│           │   └── org
│           │       └── apache
│           │           └── spark
│           │               └── delta
│           │                   └── sharing
│           │                       └── CachedTableManagerSuite.scala
│           └── resources
│               └── log4j.properties
├── examples
│   ├── open-datasets.share
│   ├── README.md
│   └── python
│       ├── quickstart_pandas.py
│       └── quickstart_spark.py
├── python
│   ├── delta_sharing
│   │   ├── tests
│   │   │   ├── test_profile.json
│   │   │   ├── __init__.py
│   │   │   ├── test_converter.py
│   │   │   └── conftest.py
│   │   ├── version.py
│   │   ├── _yarl_patch.py
│   │   ├── __init__.py
│   │   ├── converter.py
│   │   ├── reader.py
│   │   └── protocol.py
│   ├── requirements-dev.txt
│   ├── dev
│   │   ├── tox.ini
│   │   ├── reformat
│   │   ├── pytest
│   │   └── lint-python
│   ├── README.md
│   └── setup.py
├── project
│   ├── build.properties
│   └── plugins.sbt
├── server
│   └── src
│       ├── test
│       │   ├── resources
│       │   │   └── core-site.xml
│       │   └── scala
│       │       └── io
│       │           └── delta
│       │               ├── sharing
│       │               │   └── server
│       │               │       ├── CloudFileSignerSuite.scala
│       │               │       ├── TestDeltaSharingServer.scala
│       │               │       ├── config
│       │               │       │   └── ServerConfigSuite.scala
│       │               │       └── TestResource.scala
│       │               └── standalone
│       │                   └── internal
│       │                       └── PartitionFilterUtilsSuite.scala
│       ├── main
│       │   ├── scala
│       │   │   └── io
│       │   │       └── delta
│       │   │           ├── standalone
│       │   │           │   └── internal
│       │   │           │       ├── DeltaDataSource.scala
│       │   │           │       ├── DeltaCDFErrors.scala
│       │   │           │       ├── DeltaSharingHistoryManager.scala
│       │   │           │       └── PartitionFilterUtils.scala
│       │   │           └── sharing
│       │   │               └── server
│       │   │                   ├── util
│       │   │                   │   └── JsonUtils.scala
│       │   │                   ├── exceptions.scala
│       │   │                   ├── model.scala
│       │   │                   ├── SharedTableManager.scala
│       │   │                   ├── config
│       │   │                   │   └── ServerConfig.scala
│       │   │                   └── CloudFileSigner.scala
│       │   └── protobuf
│       │       └── protocol.proto
│       └── universal
│           └── conf
│               └── delta-sharing-server.yaml.template
├── dev
│   └── release.sh
├── .gitignore
├── .github
│   └── workflows
│       └── build-and-test.yml
└── CONTRIBUTING.md
/version.sbt:
--------------------------------------------------------------------------------
1 | version in ThisBuild := "0.6.0-SNAPSHOT"
2 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.bat text eol=crlf
2 | *.cmd text eol=crlf
3 | *.bin binary
4 |
--------------------------------------------------------------------------------
/images/delta-sharing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexott/delta-sharing/main/images/delta-sharing.png
--------------------------------------------------------------------------------
/spark/src/main/resources/META-INF/services/org.apache.hadoop.fs.FileSystem:
--------------------------------------------------------------------------------
1 | io.delta.sharing.spark.DeltaSharingFileSystem
--------------------------------------------------------------------------------
/spark/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister:
--------------------------------------------------------------------------------
1 | io.delta.sharing.spark.DeltaSharingDataSource
--------------------------------------------------------------------------------
/examples/open-datasets.share:
--------------------------------------------------------------------------------
1 | {
2 | "shareCredentialsVersion": 1,
3 | "endpoint": "https://sharing.delta.io/delta-sharing/",
4 | "bearerToken": "faaie590d541265bcab1f2de9813274bf233"
5 | }
--------------------------------------------------------------------------------
/python/delta_sharing/tests/test_profile.json:
--------------------------------------------------------------------------------
1 | {
2 | "shareCredentialsVersion": 1,
3 | "endpoint": "https://localhost:12345/delta-sharing/",
4 | "bearerToken": "dapi5e3574ec767ca1548ae5bbed1a2dc04d",
5 | "expirationTime": "2021-11-12T00:12:29.0Z"
6 | }
7 |
--------------------------------------------------------------------------------
/python/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | # Dependencies. When you update don't forget to update setup.py.
2 | pandas
3 | pyarrow>=4.0.0
4 | fsspec>=0.7.4
5 | requests
6 | aiohttp
7 | yarl>=1.6.0
8 |
9 | # Linter
10 | mypy==0.812
11 | flake8
12 |
13 | # Code formatter. Only support Python 3.6+
14 | black==21.12b0
15 |
16 | # Test
17 | pytest
18 |
--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (2021) The Delta Lake Project Authors.
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | # http://www.apache.org/licenses/LICENSE-2.0
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | #
13 |
14 | sbt.version=1.5.0
15 |
--------------------------------------------------------------------------------
/python/delta_sharing/tests/__init__.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (C) 2021 The Delta Lake Project Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 |
--------------------------------------------------------------------------------
/python/delta_sharing/version.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (C) 2021 The Delta Lake Project Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 |
17 | __version__ = "0.5.0"
18 |
--------------------------------------------------------------------------------
/server/src/test/resources/core-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <configuration>
3 |   <property>
4 |     <name>fs.azure.account.key.deltasharingtest.blob.core.windows.net</name>
5 |     <value>${azure.account.key}</value>
6 |   </property>
7 |   <property>
8 |     <name>fs.azure.account.auth.type.deltasharingtest.dfs.core.windows.net</name>
9 |     <value>SharedKey</value>
10 |   </property>
11 |   <property>
12 |     <name>fs.azure.account.key.deltasharingtest.dfs.core.windows.net</name>
13 |     <value>${azure.account.key}</value>
14 |   </property>
15 | </configuration>
16 |
--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
1 | ## Delta Sharing examples
2 | This folder contains examples taken from the delta.io/delta-sharing quickstart guide and docs. They are written in Python and can be run once the prerequisites are satisfied.
3 | The profile file for the open example Delta Sharing Server is included in this folder.
4 |
5 | ### Prerequisites
6 | * For the Python examples, Python 3.6+, the Delta Sharing Python Connector, and PySpark need to be installed; see [the project docs](https://github.com/delta-io/delta-sharing) for details.
7 |
8 | ### Instructions
9 | * To run the PySpark example, run `spark-submit --packages io.delta:delta-sharing-spark_2.12:0.1.0 ./python/quickstart_spark.py`
10 | * To run the pandas DataFrame example, run `python3 ./python/quickstart_pandas.py` (a condensed sketch of what it does is shown below)
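11 |
12 | Both quickstart scripts build the table URL the same way: the profile file path, followed by `#` and the fully qualified table name (`<share>.<schema>.<table>`). A condensed sketch of the pandas example, assuming it is run from this folder:
13 |
14 | ```python
15 | import delta_sharing
16 |
17 | # Profile file shipped in this folder, plus "#" and share.schema.table.
18 | table_url = "./open-datasets.share#delta_sharing.default.owid-covid-data"
19 |
20 | # Fetch the first 10 rows as a pandas DataFrame.
21 | data = delta_sharing.load_as_pandas(table_url, limit=10)
22 | print(data)
23 | ```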
--------------------------------------------------------------------------------
/python/dev/tox.ini:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (C) 2021 The Delta Lake Project Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 |
17 | [pycodestyle]
18 | ignore=E203,E226,E231,E241,E305,E402,E722,E731,E741,W503,W504
19 | max-line-length=100
20 | exclude=.git/*,docs/build/*
21 |
--------------------------------------------------------------------------------
/python/delta_sharing/_yarl_patch.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (C) 2021 The Delta Lake Project Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | try:
17 | from yarl import URL
18 | from yarl._quoting import _Quoter
19 |
20 | # Patch yarl.URL to not replace '%3D' with '=' which would break GCS pre-signed urls
21 | URL._PATH_REQUOTER = _Quoter(safe="@:", protected="/+=") # type: ignore
22 | except:
23 | pass
24 |
--------------------------------------------------------------------------------
/server/src/main/scala/io/delta/standalone/internal/DeltaDataSource.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package io.delta.standalone.internal
17 |
18 | /** DeltaDataSource constants. */
19 | object DeltaDataSource {
20 | // Constants for cdf parameters
21 | final val CDF_START_VERSION_KEY = "startingVersion"
22 |
23 | final val CDF_START_TIMESTAMP_KEY = "startingTimestamp"
24 |
25 | final val CDF_END_VERSION_KEY = "endingVersion"
26 |
27 | final val CDF_END_TIMESTAMP_KEY = "endingTimestamp"
28 | }
29 |
--------------------------------------------------------------------------------
/spark/src/main/scala/org/apache/spark/sql/DeltaSharingScanUtils.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package org.apache.spark.sql
18 |
19 | import org.apache.spark.sql.execution.datasources.LogicalRelation
20 |
21 | object DeltaSharingScanUtils {
22 | // A wrapper to expose Dataset.ofRows function.
23 | // This is needed because Dataset object is in private[sql] scope.
24 | def ofRows(spark: SparkSession, plan: LogicalRelation): DataFrame = {
25 | Dataset.ofRows(spark, plan)
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/python/README.md:
--------------------------------------------------------------------------------
1 | # Delta Sharing
2 |
3 | [Delta Sharing](https://delta.io/sharing) is an open protocol for secure real-time exchange of large datasets, which enables secure data sharing across different computing platforms. It lets organizations share access to existing [Delta Lake](https://delta.io) and [Apache Parquet](https://parquet.apache.org) tables with other organizations, who can then directly read the table in Pandas, Apache Spark, or any other software that implements the open protocol.
4 |
5 | This is the Python client library for Delta Sharing, which lets you load shared tables as [pandas](https://pandas.pydata.org/) DataFrames or as [Apache Spark](http://spark.apache.org/) DataFrames if running in PySpark with the [Apache Spark Connector library](https://github.com/delta-io/delta-sharing#set-up-apache-spark).
6 |
7 | ## Installation and Usage
8 |
9 | 1. Install using `pip install delta-sharing`.
10 | 2. To use the Python Connector, see [the project docs](https://github.com/delta-io/delta-sharing) for details.
11 |
12 | ## Documentation
13 |
14 | This README only contains basic information about the Delta Sharing Python Connector. Please read [the project documentation](https://github.com/delta-io/delta-sharing) for full usage details.
15 |
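16 | ## Example
17 |
18 | A minimal sketch of typical usage (assuming a profile file saved as `profile.share` and a shared table named `share.schema.table`; substitute the names from your own share):
19 |
20 | ```python
21 | import delta_sharing
22 |
23 | # Path to a Delta Sharing profile file (a local path or a remote URL).
24 | profile_file = "profile.share"
25 |
26 | # List all tables the provider has shared with you.
27 | client = delta_sharing.SharingClient(profile_file)
28 | print(client.list_all_tables())
29 |
30 | # A table URL is the profile file path followed by '#' and share.schema.table.
31 | table_url = profile_file + "#share.schema.table"
32 |
33 | # Load the shared table (or just its first rows via `limit`) as a pandas DataFrame.
34 | df = delta_sharing.load_as_pandas(table_url, limit=10)
35 |
36 | # When running in PySpark with the Spark connector, load it as a Spark DataFrame instead.
37 | # spark_df = delta_sharing.load_as_spark(table_url)
38 | ```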
--------------------------------------------------------------------------------
/python/delta_sharing/__init__.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (C) 2021 The Delta Lake Project Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 |
17 | from delta_sharing.delta_sharing import SharingClient, load_as_pandas, load_as_spark
18 | from delta_sharing.delta_sharing import load_table_changes_as_pandas, load_table_changes_as_spark
19 | from delta_sharing.protocol import Share, Schema, Table
20 | from delta_sharing.version import __version__
21 |
22 |
23 | __all__ = [
24 | "SharingClient",
25 | "Share",
26 | "Schema",
27 | "Table",
28 | "load_as_pandas",
29 | "load_as_spark",
30 | "load_table_changes_as_pandas",
31 | "load_table_changes_as_spark",
32 | "__version__",
33 | ]
34 |
--------------------------------------------------------------------------------
/server/src/test/scala/io/delta/sharing/server/CloudFileSignerSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.delta.sharing.server
18 |
19 | import org.apache.hadoop.fs.Path
20 | import org.scalatest.FunSuite
21 |
22 | class CloudFileSignerSuite extends FunSuite {
23 |
24 | test("GCSFileSigner.getBucketAndObjectNames") {
25 | assert(GCSFileSigner.getBucketAndObjectNames(new Path("gs://delta-sharing-test/foo"))
26 | == ("delta-sharing-test", "foo"))
27 | assert(GCSFileSigner.getBucketAndObjectNames(new Path("gs://delta_sharing_test/foo"))
28 | == ("delta_sharing_test", "foo"))
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/python/dev/reformat:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #
3 | # Copyright (C) 2021 The Delta Lake Project Authors.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | # The current directory of the script.
19 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
20 | FWDIR="$( cd "$DIR"/.. && pwd )"
21 | cd "$FWDIR"
22 |
23 | PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE:-python}"
24 |
25 | BLACK_BUILD="$PYTHON_EXECUTABLE -m black"
26 | BLACK_VERSION="21.12b0"
27 | $BLACK_BUILD 2> /dev/null
28 | if [ $? -ne 0 ]; then
29 | echo "The '$BLACK_BUILD' command was not found. Please install Black, for example, via 'pip install black==$BLACK_VERSION'."
30 | exit 1
31 | fi
32 |
33 | $BLACK_BUILD delta_sharing --line-length 100
34 |
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | resolvers += Resolver.url("artifactory", url("https://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns)
18 |
19 | resolvers += "Typesafe Repository" at "https://repo.typesafe.com/typesafe/releases/"
20 |
21 | resolvers += Resolver.url(
22 | "typesafe sbt-plugins",
23 | url("https://dl.bintray.com/typesafe/sbt-plugins"))(Resolver.ivyStylePatterns)
24 |
25 | addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.13")
26 |
27 | addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "2.3")
28 |
29 | addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0")
30 |
31 | addSbtPlugin("com.thesamet" % "sbt-protoc" % "1.0.2")
32 |
33 | addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.7.6")
34 |
35 | addSbtPlugin("com.github.sbt" % "sbt-pgp" % "2.1.2")
36 |
37 | libraryDependencies += "com.thesamet.scalapb" %% "compilerplugin" % "0.11.1"
38 |
--------------------------------------------------------------------------------
/spark/src/main/scala/io/delta/sharing/spark/util/JsonUtils.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.delta.sharing.spark.util
18 |
19 | import com.fasterxml.jackson.annotation.JsonInclude.Include
20 | import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
21 | import com.fasterxml.jackson.module.scala.{DefaultScalaModule, ScalaObjectMapper}
22 |
23 | private[sharing] object JsonUtils {
24 | /** Used to convert between classes and JSON. */
25 | lazy val mapper = {
26 | val _mapper = new ObjectMapper with ScalaObjectMapper
27 | _mapper.setSerializationInclusion(Include.NON_ABSENT)
28 | _mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
29 | _mapper.registerModule(DefaultScalaModule)
30 | _mapper
31 | }
32 |
33 | def toJson[T: Manifest](obj: T): String = {
34 | mapper.writeValueAsString(obj)
35 | }
36 |
37 | def fromJson[T: Manifest](json: String): T = {
38 | mapper.readValue[T](json)
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/python/dev/pytest:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | #
4 | # Copyright (C) 2021 The Delta Lake Project Authors.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE:-python}"
20 |
21 | set -o pipefail
22 | set -e
23 |
24 | if ! hash pytest 2> /dev/null; then
25 | echo "The pytest command was not found. Please install 'pytest' Python package."
26 | exit 1
27 | fi
28 |
29 | # The current directory of the script.
30 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
31 |
32 | FWDIR="$( cd "$DIR"/.. && pwd )"
33 | cd "$FWDIR"
34 |
35 | if [ -n "$AWS_ACCESS_KEY_ID" ]; then
36 | logopts=(-o log_cli=true -s)
37 | fi
38 |
39 | # Runs both doctests and unit tests by default, otherwise hands arguments over to pytest.
40 | if [ "$#" = 0 ]; then
41 | # delta_sharing/_yarl_patch.py is a hack to support GCS pre-signed urls. Ask pytest to not
42 | # import it automatically so that we can verify we are importing it on demand.
43 | $PYTHON_EXECUTABLE -m pytest --ignore=delta_sharing/_yarl_patch.py --verbose --showlocals --color=yes --doctest-modules delta_sharing "${logopts[@]}"
44 | else
45 | $PYTHON_EXECUTABLE -m pytest "$@"
46 | fi
47 |
--------------------------------------------------------------------------------
/dev/release.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -e
2 | set -o pipefail
3 | export GPG_TTY=$(tty)
4 |
5 | # Switch to the project root directory
6 | cd $( dirname $0 )
7 | cd ..
8 |
9 | # Clean up uncommitted files
10 | git clean -fdx
11 |
12 | # Clean existing artifacts
13 | build/sbt clean
14 | cd python
15 | python3 setup.py clean --all
16 | rm -rf delta_sharing.egg-info dist
17 | cd ..
18 |
19 | printf "Please type the release version: "
20 | read VERSION
21 | echo $VERSION
22 |
23 | # Update the Python connector version
24 | sed -i '' "s/__version__ = \".*\"/__version__ = \"$VERSION\"/g" python/delta_sharing/version.py
25 | git add python/delta_sharing/version.py
26 | # Use --allow-empty so that we can re-run this script even if the Python connector version has been updated
27 | git commit -m "Update Python connector version to $VERSION" --allow-empty
28 |
29 | build/sbt "release skip-tests"
30 |
31 | # Switch to the release commit
32 | git checkout v$VERSION
33 |
34 | # Generate Python artifacts
35 | cd python/
36 | python3 setup.py sdist bdist_wheel
37 | cd ..
38 |
39 | # Generate the pre-built server package and sign files
40 | build/sbt server/universal:packageBin
41 | cd server/target/universal
42 | gpg --detach-sign --armor --sign delta-sharing-server-$VERSION.zip
43 | gpg --verify delta-sharing-server-$VERSION.zip.asc
44 | sha256sum delta-sharing-server-$VERSION.zip > delta-sharing-server-$VERSION.zip.sha256
45 | sha256sum -c delta-sharing-server-$VERSION.zip.sha256
46 | sha256sum delta-sharing-server-$VERSION.zip.asc > delta-sharing-server-$VERSION.zip.asc.sha256
47 | sha256sum -c delta-sharing-server-$VERSION.zip.asc.sha256
48 | cd -
49 |
50 | # Build the docker image
51 | build/sbt server/docker:publish
52 |
53 | git checkout main
54 |
55 | echo "=== Generated all release artifacts ==="
56 |
--------------------------------------------------------------------------------
/server/src/main/scala/io/delta/sharing/server/util/JsonUtils.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.delta.sharing.server.util
18 |
19 | import java.io.OutputStream
20 |
21 | import com.fasterxml.jackson.annotation.JsonInclude.Include
22 | import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
23 | import com.fasterxml.jackson.module.scala.DefaultScalaModule
24 | import com.fasterxml.jackson.module.scala.experimental.ScalaObjectMapper
25 |
26 | object JsonUtils {
27 | /** Used to convert between classes and JSON. */
28 | lazy val mapper = {
29 | val _mapper = new ObjectMapper with ScalaObjectMapper
30 | _mapper.setSerializationInclusion(Include.NON_ABSENT)
31 | _mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
32 | _mapper.registerModule(DefaultScalaModule)
33 | _mapper
34 | }
35 |
36 | def toJson[T](obj: T): String = {
37 | mapper.writeValueAsString(obj)
38 | }
39 |
40 | def toJson[T](out: OutputStream, obj: T): Unit = {
41 | mapper.writeValue(out, obj)
42 | }
43 |
44 | def fromJson[T: Manifest](json: String): T = {
45 | mapper.readValue[T](json)
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/server/src/universal/conf/delta-sharing-server.yaml.template:
--------------------------------------------------------------------------------
1 | # The format version of this config file
2 | version: 1
3 | # Config shares/schemas/tables to share
4 | shares:
5 | - name: "share1"
6 |   schemas:
7 |   - name: "schema1"
8 |     tables:
9 |     - name: "table1"
10 |       # S3. See https://github.com/delta-io/delta-sharing#s3 for how to config the credentials
11 |       location: "s3a://<bucket-name>/<the-table-path>"
12 |     - name: "table2"
13 |       # Azure Blob Storage. See https://github.com/delta-io/delta-sharing#azure-blob-storage for how to config the credentials
14 |       location: "wasbs://<container-name>@<account-name>.blob.core.windows.net/<the-table-path>"
15 | - name: "share2"
16 |   schemas:
17 |   - name: "schema2"
18 |     tables:
19 |     - name: "table3"
20 |       # Azure Data Lake Storage Gen2. See https://github.com/delta-io/delta-sharing#azure-data-lake-storage-gen2 for how to config the credentials
21 |       location: "abfss://<container-name>@<account-name>.dfs.core.windows.net/<the-table-path>"
22 |       cdfEnabled: true
23 | - name: "share3"
24 |   schemas:
25 |   - name: "schema3"
26 |     tables:
27 |     - name: "table4"
28 |       # Google Cloud Storage (GCS). See https://github.com/delta-io/delta-sharing#google-cloud-storage for how to config the credentials
29 |       location: "gs://<bucket-name>/<the-table-path>"
30 | # Set the host name that the server will use
31 | host: "localhost"
32 | # Set the port that the server will listen on. Note: using ports below 1024
33 | # may require a privileged user in some operating systems.
34 | port: 8080
35 | # Set the url prefix for the REST APIs
36 | endpoint: "/delta-sharing"
37 | # Set the timeout of S3 presigned url in seconds
38 | preSignedUrlTimeoutSeconds: 3600
39 | # How many tables to cache in the server
40 | deltaTableCacheSize: 10
41 | # Whether we can accept working with a stale version of the table. This is useful when sharing
42 | # static tables that will never be changed.
43 | stalenessAcceptable: false
44 | # Whether to evaluate user provided `predicateHints`
45 | evaluatePredicateHints: false
46 |
--------------------------------------------------------------------------------
/server/src/test/scala/io/delta/sharing/server/TestDeltaSharingServer.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.delta.sharing.server
18 |
19 | import java.io.File
20 | import java.lang.management.ManagementFactory
21 |
22 | import org.apache.commons.io.FileUtils
23 |
24 | import io.delta.sharing.server.config.ServerConfig
25 |
26 | /**
27 | * This is a special test class for the client projects to test end-to-end experience. It will
28 | * generate configs for testing and start the server.
29 | */
30 | object TestDeltaSharingServer {
31 | def main(args: Array[String]): Unit = {
32 | val pid = ManagementFactory.getRuntimeMXBean().getName().split("@")(0)
33 | val pidFile = new File(args(0))
34 | // scalastyle:off println
35 | println(s"Writing pid $pid to $pidFile")
36 | // scalastyle:on println
37 | FileUtils.writeStringToFile(pidFile, pid)
38 | if (sys.env.get("AWS_ACCESS_KEY_ID").exists(_.length > 0)) {
39 | val serverConfigPath = TestResource.setupTestTables().getCanonicalPath
40 | val serverConfig = ServerConfig.load(serverConfigPath)
41 | val server = DeltaSharingService.start(serverConfig)
42 | // Run at most 240 seconds and exit. This is to ensure we can exit even if the parent process
43 | // hits any error.
44 | Thread.sleep(240000)
45 | server.stop()
46 | } else {
47 | throw new IllegalArgumentException("Cannot find AWS_ACCESS_KEY_ID in sys.env")
48 | }
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/server/src/main/scala/io/delta/sharing/server/exceptions.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.delta.sharing.server
18 |
19 | /**
20 | * A special exception for invalid requests happening in Delta Sharing Server. We define a special
21 | * class rather than reusing `IllegalArgumentException` so that we can ensure that the message in
22 | * `IllegalArgumentException` thrown from other libraries won't be returned to users.
23 | *
24 | * @note `message` will be in the response. Please make sure it doesn't contain any sensitive info.
25 | */
26 | class DeltaSharingIllegalArgumentException(message: String)
27 | extends IllegalArgumentException(message)
28 |
29 | /**
30 | * A special exception for resource not found in Delta Sharing Server. We define a special
31 | * class rather than reusing `NoSuchElementException` so that we can ensure that the message in
32 | * `NoSuchElementException` thrown from other libraries won't be returned to users.
33 | *
34 | * @note `message` will be in the response. Please make sure it doesn't contain any sensitive info.
35 | */
36 | class DeltaSharingNoSuchElementException(message: String)
37 | extends NoSuchElementException(message)
38 |
39 |
40 | /**
41 | * A special exception that wraps an unhandled exception when processing a request.
42 | * `DeltaInternalException` should never be exposed to users as an unhandled exception may contain
43 | * sensitive information.
44 | */
45 | class DeltaInternalException(e: Throwable) extends RuntimeException(e)
46 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *#*#
2 | *.#*
3 | *.iml
4 | *.ipr
5 | *.iws
6 | *.pyc
7 | *.pyo
8 | *.swp
9 | *~
10 | .DS_Store
11 | .bsp
12 | .cache
13 | .classpath
14 | .ensime
15 | .ensime_cache/
16 | .ensime_lucene
17 | .generated-mima*
18 | .idea/
19 | .idea_modules/
20 | .project
21 | .pydevproject
22 | .scala_dependencies
23 | .settings
24 | /lib/
25 | R-unit-tests.log
26 | R/unit-tests.out
27 | R/cran-check.out
28 | R/pkg/vignettes/sparkr-vignettes.html
29 | R/pkg/tests/fulltests/Rplots.pdf
30 | build/*.jar
31 | build/apache-maven*
32 | build/scala*
33 | build/zinc*
34 | cache
35 | conf/*.cmd
36 | conf/*.conf
37 | conf/*.properties
38 | conf/*.sh
39 | conf/*.xml
40 | conf/java-opts
41 | conf/slaves
42 | dependency-reduced-pom.xml
43 | derby.log
44 | dev/create-release/*final
45 | dev/create-release/*txt
46 | dev/pr-deps/
47 | dist/
48 | docs/_site
49 | docs/api
50 | sql/docs
51 | sql/site
52 | lib_managed/
53 | lint-r-report.log
54 | log/
55 | logs/
56 | out/
57 | project/boot/
58 | project/build/target/
59 | project/plugins/lib_managed/
60 | project/plugins/project/build.properties
61 | project/plugins/src_managed/
62 | project/plugins/target/
63 | python/lib/pyspark.zip
64 | python/deps
65 | docs/python/_static/
66 | docs/python/_templates/
67 | docs/python/_build/
68 | python/build/
69 | python/test_coverage/coverage_data
70 | python/test_coverage/htmlcov
71 | python/pyspark/python
72 | reports/
73 | scalastyle-on-compile.generated.xml
74 | scalastyle-output.xml
75 | scalastyle.txt
76 | spark-*-bin-*.tgz
77 | spark-tests.log
78 | src_managed/
79 | streaming-tests.log
80 | target/
81 | unit-tests.log
82 | work/
83 | docs/.jekyll-metadata
84 |
85 | # For Hive
86 | TempStatsStore/
87 | metastore/
88 | metastore_db/
89 | sql/hive-thriftserver/test_warehouses
90 | warehouse/
91 | spark-warehouse/
92 |
93 | # For R session data
94 | .RData
95 | .RHistory
96 | .Rhistory
97 | *.Rproj
98 | *.Rproj.*
99 |
100 | .Rproj.user
101 |
102 | **/src/main/resources/js
103 |
104 | # For SBT
105 | .jvmopts
106 |
107 | # For Python
108 | *.egg-info
109 |
110 | # For VSCode
111 | *.vscode
112 |
113 | # For Metals
114 | *.metals
115 |
116 | # For venv
117 | *.venv
118 |
--------------------------------------------------------------------------------
/examples/python/quickstart_pandas.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (2021) The Delta Lake Project Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 |
17 | import os
18 | import delta_sharing
19 |
20 | # Point to the profile file. It can be a file on the local file system or a file on remote storage.
21 | profile_file = os.path.dirname(__file__) + "/../open-datasets.share"
22 |
23 | # Create a SharingClient.
24 | client = delta_sharing.SharingClient(profile_file)
25 |
26 | # List all shared tables.
27 | print("########### All Available Tables #############")
28 | print(client.list_all_tables())
29 |
30 | # Create a url to access a shared table.
31 | # A table path is the profile file path followed by `#` and the fully qualified name of a table (`<share>.<schema>.<table>`).
32 | table_url = profile_file + "#delta_sharing.default.owid-covid-data"
33 |
34 | # Fetch 10 rows from a table and convert them to a pandas DataFrame. This can be used to read sample data from a table that cannot fit in memory.
35 | print("########### Loading 10 rows from delta_sharing.default.owid-covid-data as a Pandas DataFrame #############")
36 | data = delta_sharing.load_as_pandas(table_url, limit=10)
37 |
38 | # Print the sample.
39 | print("########### Show the fetched 10 rows #############")
40 | print(data)
41 |
42 | # Load a table as a pandas DataFrame. This can be used to process tables that fit in memory.
43 | print("########### Loading delta_sharing.default.owid-covid-data as a Pandas DataFrame #############")
44 | data = delta_sharing.load_as_pandas(table_url)
45 |
46 | # Do whatever you want with your shared data!
47 | print("########### Show Data #############")
48 | print(data[data["iso_code"] == "USA"].head(10))
49 |
--------------------------------------------------------------------------------
/spark/src/test/scala/io/delta/sharing/spark/util/RetryUtilsSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.delta.sharing.spark.util
18 |
19 | import java.io.{InterruptedIOException, IOException}
20 |
21 | import scala.collection.mutable.ArrayBuffer
22 |
23 | import org.apache.spark.SparkFunSuite
24 |
25 | class RetryUtilsSuite extends SparkFunSuite {
26 | import RetryUtils._
27 |
28 | test("shouldRetry") {
29 | assert(shouldRetry(new UnexpectedHttpStatus("error", 429)))
30 | assert(shouldRetry(new UnexpectedHttpStatus("error", 500)))
31 | assert(!shouldRetry(new UnexpectedHttpStatus("error", 404)))
32 | assert(!shouldRetry(new InterruptedException))
33 | assert(!shouldRetry(new InterruptedIOException))
34 | assert(shouldRetry(new IOException))
35 | assert(!shouldRetry(new RuntimeException))
36 | }
37 |
38 | test("runWithExponentialBackoff") {
39 | val sleeps = new ArrayBuffer[Long]()
40 | RetryUtils.sleeper = (sleepMs: Long) => sleeps += sleepMs
41 | // Retry case
42 | intercept[UnexpectedHttpStatus] {
43 | runWithExponentialBackoff(10) {
44 | throw new UnexpectedHttpStatus("error", 429)
45 | }
46 | }
47 | // Run 11 times should sleep 10 times
48 | assert(sleeps.length == 10)
49 | assert(sleeps == Seq(100, 200, 400, 800, 1600, 3200, 6400, 12800, 25600, 51200))
50 | // No retry case
51 | sleeps.clear()
52 | intercept[RuntimeException] {
53 | runWithExponentialBackoff(10) {
54 | throw new RuntimeException
55 | }
56 | }
57 | assert(sleeps == Seq())
58 | RetryUtils.sleeper = (sleepMs: Long) => Thread.sleep(sleepMs)
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
/examples/python/quickstart_spark.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (2021) The Delta Lake Project Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 |
17 | import os
18 | import delta_sharing
19 | from pyspark.sql import SparkSession
20 |
21 | # Point to the profile file. It can be a file on the local file system or a file on remote storage.
22 | profile_file = os.path.dirname(__file__) + "/../open-datasets.share"
23 |
24 | # Create a SharingClient.
25 | client = delta_sharing.SharingClient(profile_file)
26 |
27 | # List all shared tables.
28 | print("########### All Available Tables #############")
29 | print(client.list_all_tables())
30 |
31 | # Create a url to access a shared table.
32 | # A table path is the profile file path followed by `#` and the fully qualified name of a table (`<share>.<schema>.<table>`).
33 | table_url = profile_file + "#delta_sharing.default.owid-covid-data"
34 |
35 | # Create Spark with delta sharing connector
36 | spark = SparkSession.builder \
37 | .appName("delta-sharing-demo") \
38 | .master("local[*]") \
39 | .getOrCreate()
40 |
41 | # Read data using format "deltaSharing"
42 | print("########### Loading delta_sharing.default.owid-covid-data with Spark #############")
43 | spark.read.format("deltaSharing").load(table_url) \
44 | .where("iso_code == 'USA'") \
45 | .select("iso_code", "total_cases", "human_development_index") \
46 | .show()
47 |
48 | # Or, if the code is running in PySpark, you can use `load_as_spark` to load the table as a Spark DataFrame.
49 | print("########### Loading delta_sharing.default.owid-covid-data with Spark #############")
50 | data = delta_sharing.load_as_spark(table_url)
51 | data.where("iso_code == 'USA'") \
52 | .select("iso_code", "total_cases", "human_development_index").show()
53 |
--------------------------------------------------------------------------------
/spark/src/main/scala/io/delta/sharing/spark/util/RetryUtils.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.delta.sharing.spark.util
18 |
19 | import java.io.{InterruptedIOException, IOException}
20 |
21 | import scala.util.control.NonFatal
22 |
23 | import org.apache.spark.internal.Logging
24 |
25 | private[sharing] object RetryUtils extends Logging {
26 |
27 | // Expose it for testing
28 | @volatile var sleeper: Long => Unit = (sleepMs: Long) => Thread.sleep(sleepMs)
29 |
30 | def runWithExponentialBackoff[T](numRetries: Int)(func: => T): T = {
31 | var times = 0
32 | var sleepMs = 100
33 | while (true) {
34 | times += 1
35 | try {
36 | return func
37 | } catch {
38 | case NonFatal(e) if shouldRetry(e) && times <= numRetries =>
39 | logWarning(s"Sleeping $sleepMs ms to retry because of error: ${e.getMessage}", e)
40 | sleeper(sleepMs)
41 | sleepMs *= 2
42 | }
43 | }
44 | throw new IllegalStateException("Should not happen")
45 | }
46 |
47 | def shouldRetry(t: Throwable): Boolean = {
48 | t match {
49 | case e: UnexpectedHttpStatus =>
50 | if (e.statusCode == 429) { // Too Many Requests
51 | true
52 | } else if (e.statusCode >= 500 && e.statusCode < 600) { // Internal Error
53 | true
54 | } else {
55 | false
56 | }
57 | case _: InterruptedException => false
58 | case _: InterruptedIOException => false
59 | case _: IOException => true
60 | case _ => false
61 | }
62 | }
63 | }
64 |
65 | private[sharing] class UnexpectedHttpStatus(message: String, val statusCode: Int)
66 | extends IllegalStateException(message)
67 |
--------------------------------------------------------------------------------
/server/src/main/scala/io/delta/standalone/internal/DeltaCDFErrors.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.delta.standalone.internal
18 |
19 | class DeltaCDFIllegalArgumentException(message: String)
20 | extends IllegalArgumentException(message)
21 |
22 | object DeltaCDFErrors {
23 | def multipleCDFBoundary(position: String): Throwable = {
24 | new DeltaCDFIllegalArgumentException(s"Multiple $position arguments provided for CDF read. " +
25 | s"Please provide one of either ${position}Timestamp or ${position}Version."
26 | )
27 | }
28 |
29 | def noStartVersionForCDF: Throwable = {
30 | new DeltaCDFIllegalArgumentException("No startingVersion or startingTimestamp provided for " +
31 | "CDF read.")
32 | }
33 |
34 | def startVersionAfterLatestVersion(start: Long, latest: Long): Throwable = {
35 | new DeltaCDFIllegalArgumentException(s"Provided Start version($start) for reading change " +
36 | "data is invalid. Start version cannot be greater than the latest version of the " +
37 | s"table($latest)."
38 | )
39 | }
40 |
41 | def endBeforeStartVersionInCDF(start: Long, end: Long): Throwable = {
42 | new DeltaCDFIllegalArgumentException(
43 | s"CDF range from start $start to end $end was invalid. End cannot be before start."
44 | )
45 | }
46 |
47 | def invalidTimestamp(field: String, message: String): Throwable = {
48 | new DeltaCDFIllegalArgumentException(s"Invalid $field: $message")
49 | }
50 |
51 | def changeDataNotRecordedException(version: Long, start: Long, end: Long): Throwable = {
52 | new DeltaCDFIllegalArgumentException(s"Error getting change data for range [$start, $end] " +
53 | s"as change data was not recorded for version [$version]"
54 | )
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/spark/src/main/scala/io/delta/sharing/spark/perf/DeltaSharingLimitPushDown.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.delta.sharing.spark.perf
18 |
19 | import org.apache.spark.sql.SparkSession
20 | import org.apache.spark.sql.catalyst.expressions.IntegerLiteral
21 | import org.apache.spark.sql.catalyst.plans.logical.{LocalLimit, LogicalPlan}
22 | import org.apache.spark.sql.catalyst.rules.Rule
23 | import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation}
24 |
25 | import io.delta.sharing.spark.RemoteDeltaSnapshotFileIndex
26 |
27 | object DeltaSharingLimitPushDown extends Rule[LogicalPlan] {
28 |
29 | def setup(spark: SparkSession): Unit = synchronized {
30 | if (!spark.experimental.extraOptimizations.contains(DeltaSharingLimitPushDown) ) {
31 | spark.experimental.extraOptimizations ++= Seq(DeltaSharingLimitPushDown)
32 | }
33 | }
34 |
35 | def apply(p: LogicalPlan): LogicalPlan = {
36 | if (p.conf.getConfString("spark.delta.sharing.limitPushdown.enabled", "true").toBoolean) {
37 | p transform {
38 | case localLimit @ LocalLimit(
39 | literalExpr @ IntegerLiteral(limit),
40 | l @ LogicalRelation(
41 | r @ HadoopFsRelation(remoteIndex: RemoteDeltaSnapshotFileIndex, _, _, _, _, _),
42 | _, _, _)
43 | ) =>
44 | if (remoteIndex.limitHint.isEmpty) {
45 | val spark = SparkSession.active
46 | LocalLimit(literalExpr,
47 | l.copy(
48 | relation = r.copy(
49 | location = remoteIndex.copy(limitHint = Some(limit)))(spark)
50 | )
51 | )
52 | } else {
53 | localLimit
54 | }
55 | }
56 | } else {
57 | p
58 | }
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
/spark/src/test/scala/io/delta/sharing/spark/DeltaSharingFileSystemSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.delta.sharing.spark
18 |
19 | import org.apache.hadoop.conf.Configuration
20 | import org.apache.hadoop.fs.Path
21 | import org.apache.spark.SparkFunSuite
22 |
23 | import io.delta.sharing.spark.model.{AddCDCFile, AddFile, AddFileForCDF, FileAction, RemoveFile}
24 |
25 | class DeltaSharingFileSystemSuite extends SparkFunSuite {
26 | import DeltaSharingFileSystem._
27 |
28 | test("encode and decode") {
29 | val tablePath = new Path("https://delta.io/foo")
30 |
31 | val actions: Seq[FileAction] = Seq(
32 | AddFile("unused", "id", Map.empty, 100),
33 | AddFileForCDF("unused_cdf", "id_cdf", Map.empty, 200, 1, 2),
34 | AddCDCFile("unused_cdc", "id_cdc", Map.empty, 300, 1, 2),
35 | RemoveFile("unused_rem", "id_rem", Map.empty, 400, 1, 2)
36 | )
37 |
38 | actions.foreach ( action => {
39 | assert(decode(encode(tablePath, action)) ==
40 | DeltaSharingPath("https://delta.io/foo", action.id, action.size))
41 | })
42 | }
43 |
44 | test("file system should be cached") {
45 | val tablePath = new Path("https://delta.io/foo")
46 | val actions: Seq[FileAction] = Seq(
47 | AddFile("unused", "id", Map.empty, 100),
48 | AddFileForCDF("unused_cdf", "id_cdf", Map.empty, 200, 1, 2),
49 | AddCDCFile("unused_cdc", "id_cdc", Map.empty, 300, 1, 2),
50 | RemoveFile("unused_rem", "id_rem", Map.empty, 400, 1, 2)
51 | )
52 |
53 | actions.foreach( action => {
54 | val path = encode(tablePath, action)
55 | val conf = new Configuration
56 | val fs = path.getFileSystem(conf)
57 | assert(fs.isInstanceOf[DeltaSharingFileSystem])
58 | assert(fs eq path.getFileSystem(conf))
59 | })
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/server/src/test/scala/io/delta/standalone/internal/PartitionFilterUtilsSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.delta.standalone.internal
18 |
19 | import io.delta.standalone.internal.actions.AddFile
20 | import org.apache.spark.sql.types.StructType
21 | import org.scalatest.FunSuite
22 |
23 | class PartitionFilterUtilsSuite extends FunSuite {
24 |
25 | import PartitionFilterUtils._
26 |
27 | test("evaluatePredicate") {
28 | val schema = StructType.fromDDL("c1 INT, c2 INT").json
29 | val add1 = AddFile("foo1", Map("c2" -> "0"), 1, 1, true)
30 | val add2 = AddFile("foo2", Map("c2" -> "1"), 1, 1, true)
31 | val addFiles = add1 :: add2 :: Nil
32 | assert(add1 :: Nil == evaluatePredicate(schema, "c2" :: Nil, "c2 = 0" :: Nil, addFiles))
33 | assert(add2 :: Nil == evaluatePredicate(schema, "c2" :: Nil, "c2 = 1" :: Nil, addFiles))
34 | assert(add2 :: Nil == evaluatePredicate(schema, "c2" :: Nil, "c2 > 0" :: Nil, addFiles))
35 | assert(add1 :: Nil == evaluatePredicate(schema, "c2" :: Nil, "c2 < 1" :: Nil, addFiles))
36 | assert(add2 :: Nil == evaluatePredicate(schema, "c2" :: Nil, "c2 >= 1" :: Nil, addFiles))
37 | assert(add1 :: Nil == evaluatePredicate(schema, "c2" :: Nil, "c2 <= 0" :: Nil, addFiles))
38 | assert(add2 :: Nil == evaluatePredicate(schema, "c2" :: Nil, "c2 <> 0" :: Nil, addFiles))
39 | assert(add1 :: Nil == evaluatePredicate(schema, "c2" :: Nil, "c2 <> 1" :: Nil, addFiles))
40 | assert(Nil == evaluatePredicate(schema, "c2" :: Nil, "c2 is null" :: Nil, addFiles))
41 | assert(addFiles == evaluatePredicate(schema, "c2" :: Nil, "c2 is not null" :: Nil, addFiles))
42 | assert(addFiles == evaluatePredicate(schema, "c2" :: Nil, "c2 is not null" :: Nil, addFiles))
43 |
44 | // Unsupported expression
45 | assert(addFiles == evaluatePredicate(schema, "c2" :: Nil, "c2 = 0 + 1" :: Nil, addFiles))
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/spark/src/main/scala/io/delta/sharing/spark/InMemoryHttpInputStream.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.delta.sharing.spark
18 |
19 | import java.io.{ByteArrayInputStream, EOFException}
20 | import java.net.URI
21 |
22 | import org.apache.commons.io.IOUtils
23 | import org.apache.hadoop.fs.{PositionedReadable, Seekable}
24 |
25 | /** An input stream that holds the entire content in memory to provide random access. */
26 | private[sharing] class InMemoryHttpInputStream(uri: URI)
27 | extends ByteArrayInputStream(IOUtils.toByteArray(uri)) with Seekable with PositionedReadable {
28 |
29 | override def seek(pos: Long): Unit = synchronized {
30 | this.pos = pos.toInt
31 | }
32 |
33 | override def getPos: Long = synchronized {
34 | pos
35 | }
36 |
37 | override def seekToNewSource(targetPos: Long): Boolean = {
38 | // We don't support this feature
39 | false
40 | }
41 |
42 | override def read(
43 | position: Long,
44 | buffer: Array[Byte],
45 | offset: Int,
46 | length: Int): Int = synchronized {
47 | val oldPos = getPos()
48 | var nread = -1
49 | try {
50 | seek(position)
51 | nread = read(buffer, offset, length)
52 | } finally {
53 | seek(oldPos)
54 | }
55 | return nread
56 | }
57 |
58 | override def readFully(
59 | position: Long,
60 | buffer: Array[Byte],
61 | offset: Int,
62 | length: Int): Unit = synchronized {
63 | var nread = 0
64 | while (nread < length) {
65 | val nbytes = read(position + nread, buffer, offset + nread, length - nread)
66 | if (nbytes < 0) {
67 | throw new EOFException("End of file reached before reading fully.");
68 | }
69 | nread += nbytes
70 | }
71 | }
72 |
73 | override def readFully(position: Long, buffer: Array[Byte]): Unit = {
74 | readFully(position, buffer, 0, buffer.length)
75 | }
76 | }
77 |
--------------------------------------------------------------------------------
/spark/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | # This file contains code from the Apache Hadoop project (original license above).
18 | # It contains modifications, which are licensed as follows:
19 | #
20 | # Copyright (2021) The Delta Lake Project Authors.
21 | # Licensed under the Apache License, Version 2.0 (the "License");
22 | # you may not use this file except in compliance with the License.
23 | # You may obtain a copy of the License at
24 | # http://www.apache.org/licenses/LICENSE-2.0
25 | # Unless required by applicable law or agreed to in writing, software
26 | # distributed under the License is distributed on an "AS IS" BASIS,
27 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
28 | # See the License for the specific language governing permissions and
29 | # limitations under the License.
30 | #
31 |
32 | # Set everything to be logged to the file target/unit-tests.log
33 | test.appender=file
34 | log4j.rootCategory=INFO, ${test.appender}
35 | log4j.appender.file=org.apache.log4j.FileAppender
36 | log4j.appender.file.append=true
37 | log4j.appender.file.file=target/unit-tests.log
38 | log4j.appender.file.layout=org.apache.log4j.PatternLayout
39 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
40 |
41 | # Tests that launch java subprocesses can set the "test.appender" system property to
42 | # "console" to avoid having the child process's logs overwrite the unit test's
43 | # log file.
44 | log4j.appender.console=org.apache.log4j.ConsoleAppender
45 | log4j.appender.console.target=System.err
46 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
47 | log4j.appender.console.layout.ConversionPattern=%t: %m%n
48 |
49 | # Ignore messages below warning level from Jetty, because it's a bit verbose
50 | log4j.logger.org.spark_project.jetty=WARN
51 |
--------------------------------------------------------------------------------
/server/src/main/protobuf/protocol.proto:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | syntax = "proto2";
17 |
18 | package io.delta.sharing.server.protocol;
19 |
20 | import "scalapb/scalapb.proto";
21 |
22 | option java_package = "io.delta.sharing.server.protocol";
23 |
24 | option java_generate_equals_and_hash = true;
25 | option (scalapb.options).flat_package = true;
26 |
27 | // Define the JSON objects used by REST APIs. The table metadata format is not defined in this file
28 | // because it requires Map type which is not supported by Protocol Buffers Version 2.
29 |
30 | message Share {
31 | optional string name = 1;
32 | }
33 |
34 | message Schema {
35 | optional string name = 1;
36 | optional string share = 2;
37 | }
38 |
39 | message Table {
40 | optional string name = 1;
41 | optional string schema = 2;
42 | optional string share = 3;
43 | }
44 |
45 | message QueryTableRequest {
46 | repeated string predicateHints = 1;
47 | optional int64 limitHint = 2;
48 |
49 | // The table version being queried.
50 | // If not specified, the query is assumed to be for the latest version.
51 | optional int64 version = 3;
52 | }
53 |
54 | message ListSharesResponse {
55 | repeated Share items = 1;
56 | optional string next_page_token = 2;
57 | }
58 |
59 | message GetShareResponse {
60 | optional Share share = 1;
61 | }
62 |
63 | message ListSchemasResponse {
64 | repeated Schema items = 1;
65 | optional string next_page_token = 2;
66 | }
67 |
68 | message ListTablesResponse {
69 | repeated Table items = 1;
70 | optional string next_page_token = 2;
71 | }
72 |
73 | message ListAllTablesResponse {
74 | repeated Table items = 1;
75 | optional string next_page_token = 2;
76 | }
77 |
78 | // Define a special class to generate the page token for pagination. It includes the information we
80 | // need to know where the next query should start, and to check whether the page token comes from
81 | // the right result. For example, we would like to throw an error when the user takes a page token
82 | // returned from ListShares and uses it in the ListSchemas REST API.
82 | message PageToken {
83 | optional string id = 1;
84 | optional string share = 2;
85 | optional string schema = 3;
86 | }
87 |
--------------------------------------------------------------------------------
/python/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | #
4 | # Copyright (C) 2021 The Delta Lake Project Authors.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | from io import open
19 | from os import path
20 | from setuptools import setup
21 | import sys
22 |
23 | DESCRIPTION = "Python Connector for Delta Sharing"
24 |
25 | this_directory = path.abspath(path.dirname(__file__))
26 | with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f:
27 | LONG_DESCRIPTION = f.read()
28 |
29 | try:
30 | exec(open('delta_sharing/version.py').read())
31 | except IOError:
32 | print("Failed to load Delta Sharing version file for packaging.",
33 | file=sys.stderr)
34 | sys.exit(-1)
35 | VERSION = __version__ # noqa
36 |
37 | setup(
38 | name='delta-sharing',
39 | version=VERSION,
40 | packages=[
41 | 'delta_sharing',
42 | ],
43 | python_requires='>=3.7',
44 | install_requires=[
45 | 'pandas',
46 | 'pyarrow>=4.0.0',
47 | 'fsspec>=0.7.4',
48 | 'requests',
49 | 'aiohttp',
50 | 'dataclasses;python_version<"3.7"',
51 | 'yarl>=1.6.0',
52 | ],
53 | extras_require={
54 | 's3': ['s3fs'],
55 | 'abfs': ['adlfs'],
56 | 'adl': ['adlfs'],
57 | 'gcs': ['gcsfs'],
58 | 'gs': ['gcsfs'],
59 | },
60 | author="The Delta Lake Project Authors",
61 | author_email="delta-users@googlegroups.com",
62 | license="Apache-2.0",
63 | description=DESCRIPTION,
64 | long_description=LONG_DESCRIPTION,
65 | long_description_content_type='text/markdown',
66 | url="https://github.com/delta-io/delta-sharing/",
67 | project_urls={
68 | 'Source': 'https://github.com/delta-io/delta-sharing',
69 | 'Documentation': 'https://github.com/delta-io/delta-sharing',
70 | 'Issues': 'https://github.com/delta-io/delta-sharing/issues'
71 | },
72 | classifiers=[
73 | "Development Status :: 5 - Production/Stable",
74 | "Intended Audience :: Developers",
75 | "License :: OSI Approved :: Apache Software License",
76 | "Operating System :: OS Independent",
77 | "Topic :: Software Development :: Libraries :: Python Modules",
78 | 'Programming Language :: Python :: 3.7',
79 | 'Programming Language :: Python :: 3.8',
80 | 'Programming Language :: Python :: 3.9',
81 | ],
82 | )
83 |
--------------------------------------------------------------------------------
/python/delta_sharing/tests/test_converter.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (C) 2021 The Delta Lake Project Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | from datetime import date
17 | from decimal import Decimal
18 | from json import loads
19 | from typing import Any
20 |
21 | import numpy as np
22 | import pandas as pd
23 | import pytest
24 |
25 | from delta_sharing.converter import to_converter, get_empty_table
26 |
27 |
28 | def test_to_converter_boolean():
29 | converter = to_converter("boolean")
30 | assert converter("true") is True
31 | assert converter("false") is False
32 | assert converter("") is None
33 |
34 |
35 | @pytest.mark.parametrize(
36 | "type_str,expected",
37 | [
38 | pytest.param("byte", np.int8(1), id="byte"),
39 | pytest.param("short", np.int16(1), id="short"),
40 | pytest.param("integer", np.int32(1), id="integer"),
41 | pytest.param("long", np.int64(1), id="long"),
42 | pytest.param("float", np.float32(1), id="float"),
43 | pytest.param("double", np.float64(1), id="double"),
44 | ],
45 | )
46 | def test_to_converter_numeric(type_str: str, expected: Any):
47 | converter = to_converter(type_str)
48 | assert converter("1") == expected
49 | assert np.isnan(converter(""))
50 |
51 |
52 | def test_to_converter_decimal():
53 | converter = to_converter("decimal(10,0)")
54 | assert converter("1") == Decimal(1)
55 | assert converter("") is None
56 |
57 |
58 | def test_to_converter_string():
59 | converter = to_converter("string")
60 | assert converter("abc") == "abc"
61 | assert converter("") is None
62 |
63 |
64 | def test_to_converter_date():
65 | converter = to_converter("date")
66 | assert converter("2021-01-01") == date(2021, 1, 1)
67 | assert converter("") is None
68 |
69 |
70 | def test_to_converter_timestamp():
71 | converter = to_converter("timestamp")
72 | assert converter("2021-04-28 23:36:47.599") == pd.Timestamp("2021-04-28 23:36:47.599")
73 | assert converter("") is pd.NaT
74 |
75 |
76 | def test_get_empty_table():
77 | schema_string = (
78 | '{"fields": ['
79 | '{"metadata": {},"name": "a","nullable": true,"type": "long"},'
80 | '{"metadata": {},"name": "b","nullable": true,"type": "string"}'
81 | '],"type":"struct"}'
82 | )
83 | schema_json = loads(schema_string)
84 | pdf = get_empty_table(schema_json)
85 | assert pdf.empty
86 | assert pdf.columns.values.size == 2
87 | assert pdf.columns.values[0] == "a"
88 | assert pdf.columns.values[1] == "b"
89 |
--------------------------------------------------------------------------------
/spark/src/main/scala/io/delta/sharing/spark/DeltaSharingProfileProvider.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.delta.sharing.spark
18 |
19 | import java.nio.charset.StandardCharsets.UTF_8
20 |
21 | import org.apache.commons.io.IOUtils
22 | import org.apache.hadoop.conf.Configuration
23 | import org.apache.hadoop.fs.Path
24 |
25 | import io.delta.sharing.spark.util.JsonUtils
26 |
27 | case class DeltaSharingProfile(
28 | shareCredentialsVersion: Option[Int] = Some(DeltaSharingProfile.CURRENT),
29 | endpoint: String = null,
30 | bearerToken: String = null,
31 | expirationTime: String = null)
32 |
33 | object DeltaSharingProfile {
34 | val CURRENT = 1
35 | }
36 |
37 | /**
38 | * A provider that provides Delta Sharing profile for data recipient to access the shared data.
39 | * https://github.com/delta-io/delta-sharing/blob/main/PROTOCOL.md#profile-file-format.
40 | */
41 | trait DeltaSharingProfileProvider {
42 | def getProfile: DeltaSharingProfile
43 | }
44 |
45 | /**
46 | * Load [[DeltaSharingProfile]] from a file. `conf` should be provided to load the file from remote
47 | * file systems.
48 | */
49 | private[sharing] class DeltaSharingFileProfileProvider(
50 | conf: Configuration,
51 | file: String) extends DeltaSharingProfileProvider {
52 |
53 | val profile = {
54 | val input = new Path(file).getFileSystem(conf).open(new Path(file))
55 | val profile = try {
56 | JsonUtils.fromJson[DeltaSharingProfile](IOUtils.toString(input, UTF_8))
57 | } finally {
58 | input.close()
59 | }
60 | if (profile.shareCredentialsVersion.isEmpty) {
61 | throw new IllegalArgumentException(
62 | "Cannot find the 'shareCredentialsVersion' field in the profile file")
63 | }
64 |
65 | if (profile.shareCredentialsVersion.get > DeltaSharingProfile.CURRENT) {
66 | throw new IllegalArgumentException(
67 | s"'shareCredentialsVersion' in the profile is " +
68 | s"${profile.shareCredentialsVersion.get} which is too new. The current release " +
69 | s"supports version ${DeltaSharingProfile.CURRENT} and below. Please upgrade to a newer " +
70 | s"release.")
71 | }
72 | if (profile.endpoint == null) {
73 | throw new IllegalArgumentException("Cannot find the 'endpoint' field in the profile file")
74 | }
75 | if (profile.bearerToken == null) {
76 | throw new IllegalArgumentException("Cannot find the 'bearerToken' field in the profile file")
77 | }
78 | profile
79 | }
80 |
81 | override def getProfile: DeltaSharingProfile = profile
82 | }
83 |
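
For illustration, here is a minimal sketch (not part of this file) of how a recipient could load a profile with the provider above. The path and the JSON contents in the comment are illustrative assumptions; the sketch sits in the `io.delta.sharing.spark` package because `DeltaSharingFileProfileProvider` is package-private.

```scala
// Illustrative sketch only. Assumes /tmp/example.share exists locally and contains JSON like:
//   {"shareCredentialsVersion": 1,
//    "endpoint": "https://sharing.example.com/delta-sharing",
//    "bearerToken": "<token>"}
package io.delta.sharing.spark

import org.apache.hadoop.conf.Configuration

object ProfileProviderSketch {
  def main(args: Array[String]): Unit = {
    // Load and validate the profile file; throws IllegalArgumentException if required fields are missing.
    val provider = new DeltaSharingFileProfileProvider(new Configuration(), "/tmp/example.share")
    val profile: DeltaSharingProfile = provider.getProfile
    // Prints the endpoint parsed from the profile file.
    println(profile.endpoint)
  }
}
```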
--------------------------------------------------------------------------------
/spark/src/main/scala/io/delta/sharing/spark/RemoteDeltaCDFRelation.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.delta.sharing.spark
18 |
19 | import scala.collection.mutable.ListBuffer
20 |
21 | import org.apache.spark.rdd.RDD
22 | import org.apache.spark.sql.{DataFrame, DeltaSharingScanUtils, Row, SparkSession, SQLContext}
23 | import org.apache.spark.sql.execution.LogicalRDD
24 | import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation}
25 | import org.apache.spark.sql.sources.{BaseRelation, Filter, PrunedFilteredScan}
26 | import org.apache.spark.sql.types.StructType
27 |
28 | import io.delta.sharing.spark.model.{CDFColumnInfo, Metadata, Table => DeltaSharingTable}
29 |
30 | case class RemoteDeltaCDFRelation(
31 | spark: SparkSession,
32 | snapshotToUse: RemoteSnapshot,
33 | client: DeltaSharingClient,
34 | table: DeltaSharingTable,
35 | cdfOptions: Map[String, String]) extends BaseRelation with PrunedFilteredScan {
36 |
37 | override def schema: StructType = DeltaTableUtils.addCdcSchema(snapshotToUse.schema)
38 |
39 | override def sqlContext: SQLContext = spark.sqlContext
40 |
41 | override def buildScan(
42 | requiredColumns: Array[String],
43 | filters: Array[Filter]): RDD[Row] = {
45 |     val deltaTableFiles = client.getCDFFiles(table, cdfOptions)
46 |     val metadata = deltaTableFiles.metadata
47 |     val params = RemoteDeltaFileIndexParams(spark, snapshotToUse)
48 |     val dfs = ListBuffer[DataFrame]()
49 |
50 |     // We unconditionally add all types of files.
51 |     // We will get empty data frames for empty ones, which will get combined later.
52 |     dfs.append(scanIndex(new RemoteDeltaCDFAddFileIndex(params, deltaTableFiles), metadata))
53 |     dfs.append(scanIndex(new RemoteDeltaCDCFileIndex(params, deltaTableFiles), metadata))
54 |     dfs.append(scanIndex(new RemoteDeltaCDFRemoveFileIndex(params, deltaTableFiles), metadata))
54 |
55 | dfs.reduce((df1, df2) => df1.unionAll(df2)).rdd
56 | }
57 |
58 | /**
59 | * Build a dataframe from the specified file index. We can't use a DataFrame scan directly on the
60 | * file names because that scan wouldn't include partition columns.
61 | */
62 | private def scanIndex(fileIndex: RemoteDeltaCDFFileIndexBase, metadata: Metadata): DataFrame = {
63 | val relation = HadoopFsRelation(
64 | fileIndex,
65 | fileIndex.partitionSchema,
66 | DeltaTableUtils.addCdcSchema(metadata.schemaString),
67 | bucketSpec = None,
68 | snapshotToUse.fileFormat,
69 | Map.empty)(spark)
70 | val plan = LogicalRelation(relation)
71 | DeltaSharingScanUtils.ofRows(spark, plan)
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/python/delta_sharing/tests/conftest.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (C) 2021 The Delta Lake Project Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | import os
17 | from pathlib import Path
18 | import subprocess
19 | import threading
20 | from typing import Iterator, Optional
21 |
22 | import pytest
23 | from pytest import TempPathFactory
24 |
25 | from delta_sharing.delta_sharing import SharingClient
26 | from delta_sharing.protocol import DeltaSharingProfile
27 | from delta_sharing.rest_client import DataSharingRestClient
28 |
29 |
30 | ENABLE_INTEGRATION = len(os.environ.get("AWS_ACCESS_KEY_ID", "")) > 0
31 | SKIP_MESSAGE = "The integration tests are disabled."
32 |
33 |
34 | @pytest.fixture
35 | def profile_path() -> str:
36 | return os.path.join(os.path.dirname(__file__), "test_profile.json")
37 |
38 |
39 | @pytest.fixture
40 | def profile(profile_path) -> DeltaSharingProfile:
41 | return DeltaSharingProfile.read_from_file(profile_path)
42 |
43 |
44 | @pytest.fixture
45 | def rest_client(profile) -> DataSharingRestClient:
46 | return DataSharingRestClient(profile)
47 |
48 |
49 | @pytest.fixture
50 | def sharing_client(profile) -> SharingClient:
51 | return SharingClient(profile)
52 |
53 |
54 | @pytest.fixture(scope="session", autouse=ENABLE_INTEGRATION)
55 | def test_server(tmp_path_factory: TempPathFactory) -> Iterator[None]:
56 | pid_file: Optional[Path] = None
57 | proc: Optional[subprocess.Popen] = None
58 | try:
59 | if ENABLE_INTEGRATION:
60 | pid_file = tmp_path_factory.getbasetemp() / "delta-sharing-server.pid"
61 | proc = subprocess.Popen(
62 | [
63 | "./build/sbt",
64 | (
65 | "server/test:runMain io.delta.sharing.server.TestDeltaSharingServer "
66 | + str(pid_file)
67 | ),
68 | ],
69 | stdout=subprocess.PIPE,
70 | stderr=subprocess.PIPE,
71 | cwd="..",
72 | )
73 |
74 | ready = threading.Event()
75 |
76 | def wait_for_server() -> None:
77 | for line in proc.stdout:
78 | print(line.decode("utf-8").strip())
79 | if b"https://127.0.0.1:12345/" in line:
80 | ready.set()
81 |
82 | threading.Thread(target=wait_for_server, daemon=True).start()
83 |
84 | if not ready.wait(timeout=120):
85 | raise TimeoutError("the server didn't start in 120 seconds")
86 | yield
87 | finally:
88 | if ENABLE_INTEGRATION:
89 | if pid_file is not None and pid_file.exists():
90 | pid = pid_file.read_text()
91 | subprocess.run(["kill", "-9", pid])
92 | if proc is not None and proc.poll() is None:
93 | proc.kill()
94 |
--------------------------------------------------------------------------------
/.github/workflows/build-and-test.yml:
--------------------------------------------------------------------------------
1 | name: Build and Test
2 | on: [push, pull_request]
3 | jobs:
4 | build-and-test:
5 | runs-on: ubuntu-20.04
6 | env:
7 | SPARK_LOCAL_IP: localhost
8 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
9 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
10 | AZURE_TEST_ACCOUNT_KEY: ${{ secrets.AZURE_TEST_ACCOUNT_KEY }}
11 | GOOGLE_APPLICATION_CREDENTIALS: /tmp/google_service_account_key.json
12 | GOOGLE_SERVICE_ACCOUNT_KEY: ${{ secrets.GOOGLE_SERVICE_ACCOUNT_KEY }}
13 | steps:
14 | - name: Checkout repository
15 | uses: actions/checkout@v2
16 | - name: Cache Scala, SBT
17 | uses: actions/cache@v2
18 | with:
19 | path: |
20 | ~/.sbt
21 | ~/.ivy2
22 | ~/.cache/coursier
23 | key: build-and-test-scala
24 | - name: Install Java 8
25 | uses: actions/setup-java@v1
26 | with:
27 | java-version: '8'
28 | - run: ./build/sbt test
29 |
30 | python:
31 | runs-on: ubuntu-20.04
32 | strategy:
33 | fail-fast: false
34 | matrix:
35 | python-version: [3.7, 3.8, 3.9]
36 | include:
37 | - pandas-version: 1.2.4
38 | pyarrow-version: 4.0.0
39 | env:
40 | PYTHON_VERSION: ${{ matrix.python-version }}
41 | PANDAS_VERSION: ${{ matrix.pandas-version }}
42 | PYARROW_VERSION: ${{ matrix.pyarrow-version }}
43 | SPARK_LOCAL_IP: localhost
44 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
45 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
46 | AZURE_TEST_ACCOUNT_KEY: ${{ secrets.AZURE_TEST_ACCOUNT_KEY }}
47 | GOOGLE_APPLICATION_CREDENTIALS: /tmp/google_service_account_key.json
48 | GOOGLE_SERVICE_ACCOUNT_KEY: ${{ secrets.GOOGLE_SERVICE_ACCOUNT_KEY }}
49 |       # GitHub Actions' default miniconda
50 | CONDA_PREFIX: /usr/share/miniconda
51 | steps:
52 | - name: Checkout repository
53 | uses: actions/checkout@v2
54 | - name: Cache Scala, SBT
55 | uses: actions/cache@v2
56 | with:
57 | path: |
58 | ~/.sbt
59 | ~/.ivy2
60 | ~/.cache/coursier
61 | key: build-and-test-python
62 | - name: Install Java 8
63 | uses: actions/setup-java@v1
64 | with:
65 | java-version: '8'
66 | - name: Install dependencies
67 | run: |
68 | # See also https://github.com/conda/conda/issues/7980
69 | source "$CONDA_PREFIX/etc/profile.d/conda.sh"
70 | conda update -q conda
71 | conda create -c conda-forge -q -n test-environment python=$PYTHON_VERSION
72 | conda activate test-environment
73 | conda config --env --add pinned_packages python=$PYTHON_VERSION
74 | conda config --env --add pinned_packages pandas==$PANDAS_VERSION
75 | conda config --env --add pinned_packages pyarrow==$PYARROW_VERSION
76 | conda install -c conda-forge --yes pandas==$PANDAS_VERSION pyarrow==$PYARROW_VERSION
77 | sed -i -e "/pandas/d" -e "/pyarrow/d" python/requirements-dev.txt
78 | conda install -c conda-forge --yes --file python/requirements-dev.txt
79 | conda list
80 | - name: Build Server
81 | run: ./build/sbt package
82 | - name: Run tests
83 | run: |
84 | # See also https://github.com/conda/conda/issues/7980
85 | source "$CONDA_PREFIX/etc/profile.d/conda.sh"
86 | conda activate test-environment
87 | ./python/dev/lint-python
88 | ./python/dev/pytest
89 |
--------------------------------------------------------------------------------
/spark/src/test/scala/org/apache/spark/delta/sharing/CachedTableManagerSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package org.apache.spark.delta.sharing
18 |
19 | import java.lang.ref.WeakReference
20 |
21 | import org.apache.spark.SparkFunSuite
22 | import org.scalatest.concurrent.Eventually._
23 | import org.scalatest.time.SpanSugar._
24 |
25 | class CachedTableManagerSuite extends SparkFunSuite {
26 |
27 | test("cache") {
28 | val manager = new CachedTableManager(
29 | preSignedUrlExpirationMs = 10,
30 | refreshCheckIntervalMs = 10,
31 | refreshThresholdMs = 10,
32 | expireAfterAccessMs = 60000
33 | )
34 | try {
35 | val ref = new AnyRef
36 | manager.register(
37 | "test-table-path",
38 | Map("id1" -> "url1", "id2" -> "url2"),
39 | new WeakReference(ref),
40 | () => {
41 | Map("id1" -> "url1", "id2" -> "url2")
42 | })
43 | assert(manager.getPreSignedUrl("test-table-path", "id1")._1 == "url1")
44 | assert(manager.getPreSignedUrl("test-table-path", "id2")._1 == "url2")
45 |
46 | manager.register(
47 | "test-table-path2",
48 | Map("id1" -> "url1", "id2" -> "url2"),
49 | new WeakReference(ref),
50 | () => {
51 | Map("id1" -> "url3", "id2" -> "url4")
52 | })
53 | // We should get the new urls eventually
54 | eventually(timeout(10.seconds)) {
55 | assert(manager.getPreSignedUrl("test-table-path2", "id1")._1 == "url3")
56 | assert(manager.getPreSignedUrl("test-table-path2", "id2")._1 == "url4")
57 | }
58 |
59 | manager.register(
60 | "test-table-path3",
61 | Map("id1" -> "url1", "id2" -> "url2"),
62 | new WeakReference(new AnyRef),
63 | () => {
64 | Map("id1" -> "url3", "id2" -> "url4")
65 | })
66 | // We should remove the cached table eventually
67 | eventually(timeout(10.seconds)) {
68 | System.gc()
69 | intercept[IllegalStateException](manager.getPreSignedUrl("test-table-path3", "id1"))
70 | intercept[IllegalStateException](manager.getPreSignedUrl("test-table-path3", "id1"))
71 | }
72 | } finally {
73 | manager.stop()
74 | }
75 | }
76 |
77 | test("expireAfterAccessMs") {
78 | val manager = new CachedTableManager(
79 | preSignedUrlExpirationMs = 10,
80 | refreshCheckIntervalMs = 10,
81 | refreshThresholdMs = 10,
82 | expireAfterAccessMs = 10
83 | )
84 | try {
85 | val ref = new AnyRef
86 | manager.register(
87 | "test-table-path",
88 | Map("id1" -> "url1", "id2" -> "url2"),
89 | new WeakReference(ref),
90 | () => {
91 | Map("id1" -> "url1", "id2" -> "url2")
92 | })
93 | Thread.sleep(1000)
94 | // We should remove the cached table when it's not accessed
95 | intercept[IllegalStateException](manager.getPreSignedUrl("test-table-path", "id1"))
96 | } finally {
97 | manager.stop()
98 | }
99 | }
100 | }
101 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | We happily welcome contributions to Delta Sharing. We use [GitHub Issues](/../../issues/) to track community-reported issues and [GitHub Pull Requests](/../../pulls/) for accepting changes.
2 |
3 | # Governance
4 | Delta Sharing governance is conducted by the Technical Steering Committee (TSC), which is currently composed of the following members:
5 | - Michael Armbrust (michael.armbrust@gmail.com)
6 | - Reynold Xin (reynoldx@gmail.com)
7 | - Matei Zaharia (matei@cs.stanford.edu)
8 |
9 | The founding technical charter can be found [here](https://delta.io/wp-content/uploads/2019/12/delta-charter.pdf).
10 |
11 | # Communication
12 | Before starting work on a major feature, please reach out to us via GitHub, Slack, email, etc. We will make sure no one else is already working on it and ask you to open a GitHub issue.
13 | A "major feature" is defined as any change that is > 100 LOC altered (not including tests), or changes any user-facing behavior.
14 | We will use the GitHub issue to discuss the feature and come to agreement.
15 | This is to prevent your time being wasted, as well as ours.
16 | The GitHub review process for major features is also important so that organizations with commit access can come to agreement on design.
17 | If it is appropriate to write a design document, the document must be hosted either in the GitHub tracking issue, or linked to from the issue and hosted in a world-readable location.
18 | Specifically, if the goal is to add a new extension, please read the extension policy.
19 | Small patches and bug fixes don't need prior communication.
20 |
21 | # Coding style
22 | We generally follow the Apache Spark Scala Style Guide.
23 |
24 | # Sign your work
25 | The sign-off is a simple line at the end of the explanation for the patch. Your signature certifies that you wrote the patch or otherwise have the right to pass it on as an open-source patch. The rules are pretty simple: if you can certify the below (from developercertificate.org):
26 |
27 | ```
28 | Developer Certificate of Origin
29 | Version 1.1
30 |
31 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
32 | 1 Letterman Drive
33 | Suite D4700
34 | San Francisco, CA, 94129
35 |
36 | Everyone is permitted to copy and distribute verbatim copies of this
37 | license document, but changing it is not allowed.
38 |
39 |
40 | Developer's Certificate of Origin 1.1
41 |
42 | By making a contribution to this project, I certify that:
43 |
44 | (a) The contribution was created in whole or in part by me and I
45 | have the right to submit it under the open source license
46 | indicated in the file; or
47 |
48 | (b) The contribution is based upon previous work that, to the best
49 | of my knowledge, is covered under an appropriate open source
50 | license and I have the right under that license to submit that
51 | work with modifications, whether created in whole or in part
52 | by me, under the same open source license (unless I am
53 | permitted to submit under a different license), as indicated
54 | in the file; or
55 |
56 | (c) The contribution was provided directly to me by some other
57 | person who certified (a), (b) or (c) and I have not modified
58 | it.
59 |
60 | (d) I understand and agree that this project and the contribution
61 | are public and that a record of the contribution (including all
62 | personal information I submit with it, including my sign-off) is
63 | maintained indefinitely and may be redistributed consistent with
64 | this project or the open source license(s) involved.
65 | ```
66 |
67 | Then you just add a line to every git commit message:
68 |
69 | ```
70 | Signed-off-by: Joe Smith <joe.smith@email.com>
71 | Use your real name (sorry, no pseudonyms or anonymous contributions.)
72 | ```
73 |
74 | If you set your `user.name` and `user.email` git configs, you can sign your commit automatically with `git commit -s`.
75 |
--------------------------------------------------------------------------------
/server/src/main/scala/io/delta/sharing/server/model.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.delta.sharing.server.model
18 |
19 | import com.fasterxml.jackson.annotation.JsonInclude
20 | import org.codehaus.jackson.annotate.JsonRawValue
21 |
22 | case class SingleAction(
23 | file: AddFile = null,
24 | add: AddFileForCDF = null,
25 | cdf: AddCDCFile = null,
26 | remove: RemoveFile = null,
27 | metaData: Metadata = null,
28 | protocol: Protocol = null) {
29 |
30 | def unwrap: Action = {
31 | if (file != null) {
32 | file
33 | } else if (add != null) {
34 | add
35 | } else if (cdf != null) {
36 | cdf
37 | } else if (remove != null) {
38 | remove
39 | } else if (metaData != null) {
40 | metaData
41 | } else if (protocol != null) {
42 | protocol
43 | } else {
44 | null
45 | }
46 | }
47 | }
48 |
49 | case class Format(provider: String = "parquet")
50 |
51 | case class Metadata(
52 | id: String = null,
53 | name: String = null,
54 | description: String = null,
55 | format: Format = Format(),
56 | schemaString: String = null,
57 | configuration: Map[String, String] = Map.empty,
58 | partitionColumns: Seq[String] = Nil) extends Action {
59 |
60 | override def wrap: SingleAction = SingleAction(metaData = this)
61 | }
62 |
63 | sealed trait Action {
64 | /** Turn this object to the [[SingleAction]] wrap object. */
65 | def wrap: SingleAction
66 | }
67 |
68 | case class Protocol(minReaderVersion: Int) extends Action {
69 | override def wrap: SingleAction = SingleAction(protocol = this)
70 | }
71 |
72 | sealed abstract class AddFileBase(
73 | url: String,
74 | id: String,
75 | @JsonInclude(JsonInclude.Include.ALWAYS)
76 | partitionValues: Map[String, String],
77 | size: Long,
78 | @JsonRawValue
79 | stats: String = null)
80 | extends Action {}
81 |
82 | case class AddFile(
83 | url: String,
84 | id: String,
85 | @JsonInclude(JsonInclude.Include.ALWAYS)
86 | partitionValues: Map[String, String],
87 | size: Long,
88 | @JsonRawValue
89 | stats: String = null) extends AddFileBase(url, id, partitionValues, size, stats) {
90 |
91 | override def wrap: SingleAction = SingleAction(file = this)
92 | }
93 |
94 | case class AddFileForCDF(
95 | url: String,
96 | id: String,
97 | @JsonInclude(JsonInclude.Include.ALWAYS)
98 | partitionValues: Map[String, String],
99 | size: Long,
100 | version: Long,
101 | timestamp: Long,
102 | @JsonRawValue
103 | stats: String = null)
104 | extends AddFileBase(url, id, partitionValues, size, stats) {
105 |
106 | override def wrap: SingleAction = SingleAction(add = this)
107 | }
108 |
109 | case class AddCDCFile(
110 | url: String,
111 | id: String,
112 | @JsonInclude(JsonInclude.Include.ALWAYS)
113 | partitionValues: Map[String, String],
114 | size: Long,
115 | timestamp: Long,
116 | version: Long)
117 | extends Action {
118 |
119 | override def wrap: SingleAction = SingleAction(cdf = this)
120 | }
121 |
122 | case class RemoveFile(
123 | url: String,
124 | id: String,
125 | @JsonInclude(JsonInclude.Include.ALWAYS)
126 | partitionValues: Map[String, String],
127 | size: Long,
128 | timestamp: Long,
129 | version: Long)
130 | extends Action {
131 |
132 | override def wrap: SingleAction = SingleAction(remove = this)
133 | }
134 |
--------------------------------------------------------------------------------
/spark/src/test/scala/io/delta/sharing/spark/TestDeltaSharingClient.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.delta.sharing.spark
18 |
19 | import io.delta.sharing.spark.model.{
20 | AddCDCFile,
21 | AddFile,
22 | AddFileForCDF,
23 | DeltaTableFiles,
24 | DeltaTableMetadata,
25 | Metadata,
26 | Protocol,
27 | RemoveFile,
28 | SingleAction,
29 | Table
30 | }
31 | import io.delta.sharing.spark.util.JsonUtils
32 |
33 | class TestDeltaSharingClient(
34 | profileProvider: DeltaSharingProfileProvider = null,
35 | timeoutInSeconds: Int = 120,
36 | numRetries: Int = 10,
37 | sslTrustAll: Boolean = false) extends DeltaSharingClient {
38 |
39 | private val metadataString =
40 | """{"metaData":{"id":"93351cf1-c931-4326-88f0-d10e29e71b21","format":
41 | |{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",
42 | |\"fields\":[{\"name\":\"col1\",\"type\":\"integer\",\"nullable\":true,
43 | |\"metadata\":{}},{\"name\":\"col2\",\"type\":\"string\",\"nullable\":true,
44 | |\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1603723967515}}"""
45 | .stripMargin.replaceAll("\n", "")
46 | private val metadata = JsonUtils.fromJson[SingleAction](metadataString).metaData
47 |
48 | override def listAllTables(): Seq[Table] = Nil
49 |
50 | override def getMetadata(table: Table): DeltaTableMetadata = {
51 | DeltaTableMetadata(0, Protocol(0), metadata)
52 | }
53 |
54 | override def getTableVersion(table: Table): Long = 0
55 |
56 | override def getFiles(
57 | table: Table,
58 | predicates: Seq[String],
59 | limit: Option[Long],
60 | versionAsOf: Option[Long]): DeltaTableFiles = {
61 | limit.foreach(lim => TestDeltaSharingClient.limits = TestDeltaSharingClient.limits :+ lim)
62 |
63 | val addFiles: Seq[AddFile] = Seq(
64 | AddFile("f1.parquet", "f1", Map.empty, 0),
65 | AddFile("f2.parquet", "f2", Map.empty, 0),
66 | AddFile("f3.parquet", "f3", Map.empty, 0),
67 | AddFile("f4.parquet", "f4", Map.empty, 0)
68 | ).take(limit.getOrElse(4L).toInt)
69 |
70 | DeltaTableFiles(0, Protocol(0), metadata, addFiles)
71 | }
72 |
73 | override def getCDFFiles(table: Table, cdfOptions: Map[String, String]): DeltaTableFiles = {
74 | val addFiles: Seq[AddFileForCDF] = Seq(
75 | AddFileForCDF("cdf_add1.parquet", "cdf_add1", Map.empty, 100, 1, 1000)
76 | )
77 | val cdcFiles: Seq[AddCDCFile] = Seq(
78 | // Return one cdc file from version 2, and two files with version 3.
79 | // This should result in two partition directories.
80 | AddCDCFile("cdf_cdc1.parquet", "cdf_cdc1", Map.empty, 200, 2, 2000),
81 | AddCDCFile("cdf_cdc2.parquet", "cdf_cdc2", Map.empty, 300, 3, 3000),
82 | AddCDCFile("cdf_cdc2.parquet", "cdf_cdc3", Map.empty, 310, 3, 3000)
83 | )
84 | val removeFiles: Seq[RemoveFile] = Seq(
85 | // Return files with same version but different timestamps.
86 | // This should result in two partition directories.
87 | RemoveFile("cdf_rem1.parquet", "cdf_rem1", Map.empty, 400, 4, 4000),
88 | RemoveFile("cdf_rem2.parquet", "cdf_rem2", Map.empty, 420, 4, 4200)
89 | )
90 | DeltaTableFiles(0, Protocol(0), metadata, Nil, addFiles, cdcFiles, removeFiles)
91 | }
92 |
93 | def clear(): Unit = {
94 | TestDeltaSharingClient.limits = Nil
95 | }
96 | }
97 |
98 | object TestDeltaSharingClient {
99 | var limits = Seq.empty[Long]
100 | }
101 |
--------------------------------------------------------------------------------
/server/src/main/scala/io/delta/standalone/internal/DeltaSharingHistoryManager.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | // Putting these classes in this package to access Delta Standalone internal APIs
18 | package io.delta.standalone.internal
19 |
20 | import io.delta.standalone.internal.actions.CommitMarker
21 | import io.delta.standalone.internal.util.FileNames
22 | import io.delta.standalone.storage.LogStore
23 | import org.apache.hadoop.conf.Configuration
24 | import org.apache.hadoop.fs.Path
25 | import scala.collection.JavaConverters._
26 |
27 | object DeltaSharingHistoryManager {
28 | /**
29 | * DeltaHistoryManager.getCommits is not a public method, so we need to make local copies here.
30 | * When calling getCommits, the initial few timestamp values may be wrong because they are not
31 | * properly monotonized. To compensate, getCommitsSafe moves the start version back by
32 | * POTENTIALLY_UNMONOTONIZED_TIMESTAMPS commits so that the commits the caller cares about get correct values.
33 | * TODO(https://github.com/delta-io/delta-sharing/issues/144): Clean this up once
34 | * DeltaHistoryManager.getCommits is public
35 | */
36 | private val POTENTIALLY_UNMONOTONIZED_TIMESTAMPS = 100
37 |
38 | private[internal] def getCommitsSafe(
39 | logStore: LogStore,
40 | logPath: Path,
41 | start: Long,
42 | end: Long,
43 | conf: Configuration): Array[Commit] = {
44 | val monotonizationStart =
45 | Seq(start - POTENTIALLY_UNMONOTONIZED_TIMESTAMPS, 0).max
46 | getCommits(logStore, logPath, monotonizationStart, end, conf)
47 | }
48 |
49 | /**
50 | * Returns the commit version and timestamps of all commits in `[start, end)`. If `end` is not
51 | * specified, will return all commits that exist after `start`. Will guarantee that the commits
52 | * returned will have both monotonically increasing versions as well as timestamps.
53 | * Exposed for tests.
54 | */
55 | private def getCommits(
56 | logStore: LogStore,
57 | logPath: Path,
58 | start: Long,
59 | end: Long,
60 | conf: Configuration): Array[Commit] = {
61 | val commits = logStore
62 | .listFrom(FileNames.deltaFile(logPath, start), conf)
63 | .asScala
64 | .filter(f => FileNames.isDeltaFile(f.getPath))
65 | .map { fileStatus =>
66 | Commit(FileNames.deltaVersion(fileStatus.getPath), fileStatus.getModificationTime)
67 | }
68 | .takeWhile(_.version < end)
69 |
70 | monotonizeCommitTimestamps(commits.toArray)
71 | }
72 |
73 | /**
74 | * Makes sure that the commit timestamps are monotonically increasing with respect to commit
75 | * versions. Requires the input commits to be sorted by the commit version.
76 | */
77 | private def monotonizeCommitTimestamps[T <: CommitMarker](
78 | commits: Array[T]): Array[T] = {
79 | var i = 0
80 | val length = commits.length
81 | while (i < length - 1) {
82 | val prevTimestamp = commits(i).getTimestamp
83 | assert(commits(i).getVersion < commits(i + 1).getVersion, "Unordered commits provided.")
84 | if (prevTimestamp >= commits(i + 1).getTimestamp) {
85 | commits(i + 1) = commits(i + 1).withTimestamp(prevTimestamp + 1).asInstanceOf[T]
86 | }
87 | i += 1
88 | }
89 | commits
90 | }
91 |
92 | /** A helper class to represent the timestamp and version of a commit. */
93 | case class Commit(version: Long, timestamp: Long) extends CommitMarker {
94 | override def withTimestamp(timestamp: Long): Commit = this.copy(timestamp = timestamp)
95 |
96 | override def getTimestamp: Long = timestamp
97 |
98 | override def getVersion: Long = version
99 | }
100 | }
101 |
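
As a small, self-contained sketch (not repository code) of the monotonization rule described in the comment above: whenever a commit's timestamp is not strictly greater than its predecessor's, it is bumped to the predecessor's timestamp plus one, so versions and timestamps both end up strictly increasing. The `Commit` class below is a local stand-in, not the one defined in this file.

```scala
object MonotonizeSketch {
  final case class Commit(version: Long, timestamp: Long)

  // Same idea as monotonizeCommitTimestamps above, written against the local Commit type.
  def monotonize(commits: Array[Commit]): Array[Commit] = {
    var i = 0
    while (i < commits.length - 1) {
      if (commits(i).timestamp >= commits(i + 1).timestamp) {
        // Bump the later commit so its timestamp stays strictly increasing.
        commits(i + 1) = commits(i + 1).copy(timestamp = commits(i).timestamp + 1)
      }
      i += 1
    }
    commits
  }

  def main(args: Array[String]): Unit = {
    val commits = Array(Commit(1, 100L), Commit(2, 90L), Commit(3, 95L))
    // Versions 1, 2, 3 come out with timestamps 100, 101, 102.
    monotonize(commits).foreach(c => println(s"v=${c.version} ts=${c.timestamp}"))
  }
}
```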
--------------------------------------------------------------------------------
/spark/src/test/scala/io/delta/sharing/spark/DeltaSharingFileProfileProviderSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.delta.sharing.spark
18 |
19 | import java.nio.charset.StandardCharsets.UTF_8
20 | import java.nio.file.Files
21 |
22 | import org.apache.commons.io.FileUtils
23 | import org.apache.hadoop.conf.Configuration
24 | import org.apache.spark.SparkFunSuite
25 |
26 | class DeltaSharingFileProfileProviderSuite extends SparkFunSuite {
27 |
28 | private def testProfile(profile: String, expected: DeltaSharingProfile): Unit = {
29 | val temp = Files.createTempFile("test", ".share").toFile
30 | try {
31 | FileUtils.writeStringToFile(temp, profile, UTF_8)
32 | assert(new DeltaSharingFileProfileProvider(new Configuration, temp.getCanonicalPath)
33 | .getProfile == expected)
34 | } finally {
35 | temp.delete()
36 | }
37 | }
38 |
39 | test("parse") {
40 | testProfile(
41 | """{
42 | | "shareCredentialsVersion": 1,
43 | | "endpoint": "foo",
44 | | "bearerToken": "bar",
45 | | "expirationTime": "2021-11-12T00:12:29.0Z"
46 | |}
47 | |""".stripMargin,
48 | DeltaSharingProfile(
49 | shareCredentialsVersion = Some(1),
50 | endpoint = "foo",
51 | bearerToken = "bar",
52 | expirationTime = "2021-11-12T00:12:29.0Z"
53 | )
54 | )
55 | }
56 |
57 | test("expirationTime is optional") {
58 | testProfile(
59 | """{
60 | | "shareCredentialsVersion": 1,
61 | | "endpoint": "foo",
62 | | "bearerToken": "bar"
63 | |}
64 | |""".stripMargin,
65 | DeltaSharingProfile(
66 | shareCredentialsVersion = Some(1),
67 | endpoint = "foo",
68 | bearerToken = "bar"
69 | )
70 | )
71 | }
72 |
73 | test("version is missing") {
74 | val e = intercept[IllegalArgumentException] {
75 | testProfile(
76 | """{
77 | | "endpoint": "foo",
78 | | "bearerToken": "bar"
79 | |}
80 | |""".stripMargin,
81 | null
82 | )
83 | }
84 | assert(e.getMessage.contains(
85 | "Cannot find the 'shareCredentialsVersion' field in the profile file"))
86 | }
87 |
88 | test("shareCredentialsVersion is not supported") {
89 | val e = intercept[IllegalArgumentException] {
90 | testProfile(
91 | """{
92 | | "shareCredentialsVersion": 100
93 | |}
94 | |""".stripMargin,
95 | null
96 | )
97 | }
98 | assert(e.getMessage.contains(
99 | "'shareCredentialsVersion' in the profile is 100 which is too new."))
100 | }
101 |
102 | test("endpoint is missing") {
103 | val e = intercept[IllegalArgumentException] {
104 | testProfile(
105 | """{
106 | | "shareCredentialsVersion": 1,
107 | | "bearerToken": "bar"
108 | |}
109 | |""".stripMargin,
110 | null
111 | )
112 | }
113 | assert(e.getMessage.contains("Cannot find the 'endpoint' field in the profile file"))
114 | }
115 |
116 | test("bearerToken is missing") {
117 | val e = intercept[IllegalArgumentException] {
118 | testProfile(
119 | """{
120 | | "shareCredentialsVersion": 1,
121 | | "endpoint": "foo"
122 | |}
123 | |""".stripMargin,
124 | null
125 | )
126 | }
127 | assert(e.getMessage.contains("Cannot find the 'bearerToken' field in the profile file"))
128 | }
129 |
130 | test("unknown field should be ignored") {
131 | testProfile(
132 | """{
133 | | "shareCredentialsVersion": 1,
134 | | "endpoint": "foo",
135 | | "bearerToken": "bar",
136 | | "expirationTime": "2021-11-12T00:12:29.0Z",
137 | | "futureField": "xyz"
138 | |}
139 | |""".stripMargin,
140 | DeltaSharingProfile(
141 | shareCredentialsVersion = Some(1),
142 | endpoint = "foo",
143 | bearerToken = "bar",
144 | expirationTime = "2021-11-12T00:12:29.0Z"
145 | )
146 | )
147 | }
148 | }
149 |
--------------------------------------------------------------------------------
/spark/src/test/scala/io/delta/sharing/spark/DeltaSharingIntegrationTest.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.delta.sharing.spark
18 |
19 | import java.io.File
20 | import java.nio.charset.StandardCharsets.UTF_8
21 | import java.nio.file.Files
22 | import java.util.concurrent.{CountDownLatch, TimeUnit}
23 |
24 | import scala.sys.process._
25 | import scala.util.Try
26 |
27 | import org.apache.commons.io.FileUtils
28 | import org.apache.hadoop.conf.Configuration
29 | import org.apache.spark.SparkFunSuite
30 | import org.scalatest.BeforeAndAfterAll
31 |
32 | trait DeltaSharingIntegrationTest extends SparkFunSuite with BeforeAndAfterAll {
33 |
34 | def shouldRunIntegrationTest: Boolean = {
35 | sys.env.get("AWS_ACCESS_KEY_ID").exists(_.length > 0) &&
36 | sys.env.get("AZURE_TEST_ACCOUNT_KEY").exists(_.length > 0) &&
37 | sys.env.get("GOOGLE_APPLICATION_CREDENTIALS").exists(_.length > 0)
38 | }
39 |
40 | @volatile private var process: Process = _
41 | @volatile private var pidFile: File = _
42 | var testProfileFile: File = _
43 |
44 | val TEST_PORT = 12345
45 |
46 | override def beforeAll(): Unit = {
47 | super.beforeAll()
48 | if (shouldRunIntegrationTest) {
49 | pidFile = Files.createTempFile("delta-sharing-server", ".pid").toFile
50 | testProfileFile = Files.createTempFile("delta-test", ".share").toFile
51 | FileUtils.writeStringToFile(testProfileFile,
52 | s"""{
53 | | "shareCredentialsVersion": 1,
54 | | "endpoint": "https://localhost:$TEST_PORT/delta-sharing",
55 | | "bearerToken": "dapi5e3574ec767ca1548ae5bbed1a2dc04d"
56 | |}""".stripMargin, UTF_8)
57 |
58 | val startLatch = new CountDownLatch(1)
59 | new Thread("Run TestDeltaSharingServer") {
60 | setDaemon(true)
61 |
62 | override def run(): Unit = {
63 | val processLogger = ProcessLogger { stdout =>
64 | // scalastyle:off println
65 | println(stdout)
66 | // scalastyle:on println
67 | if (stdout.contains(s"https://127.0.0.1:$TEST_PORT/")) {
68 | startLatch.countDown()
69 | }
70 | }
71 | process =
72 | Seq(
73 | "/bin/bash",
74 | "-c",
75 | s"cd .. && build/sbt 'server / Test / runMain " +
76 | s"io.delta.sharing.server.TestDeltaSharingServer ${pidFile.getCanonicalPath}'")
77 | .run(processLogger)
78 | process.exitValue()
79 | process = null
80 | startLatch.countDown()
81 | }
82 | }.start()
83 | try {
84 | assert(startLatch.await(120, TimeUnit.SECONDS), "the server didn't start in 120 seconds")
85 | if (process == null) {
86 | fail("the process exited with an error")
87 | }
88 | } catch {
89 | case e: Throwable =>
90 | if (process != null) {
91 | process.destroy()
92 | process = null
93 | }
94 | throw e
95 | }
96 | }
97 | }
98 |
99 | override def afterAll(): Unit = {
100 | if (shouldRunIntegrationTest) {
101 | try {
102 | if (process != null) {
103 | process.destroy()
104 | process = null
105 | }
106 | if (pidFile != null) {
107 | val pid = FileUtils.readFileToString(pidFile)
108 | Try(pid.toLong).foreach { pid =>
109 | // scalastyle:off println
110 | println(s"Killing $pid")
111 | // scalastyle:on println
112 | s"kill -9 $pid".!
113 | }
114 | pidFile.delete()
115 | }
116 | if (testProfileFile != null) {
117 | testProfileFile.delete()
118 | }
119 | } finally {
120 | super.afterAll()
121 | }
122 | }
123 | }
124 |
125 | def testProfileProvider: DeltaSharingProfileProvider = {
126 | new DeltaSharingFileProfileProvider(new Configuration, testProfileFile.getCanonicalPath)
127 | }
128 |
129 | def integrationTest(testName: String)(func: => Unit): Unit = {
130 | test(testName) {
131 | assume(shouldRunIntegrationTest)
132 | func
133 | }
134 | }
135 | }
136 |
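A concrete suite would mix this trait in and wrap its cases in `integrationTest`, so they run only when the cloud credentials checked by `shouldRunIntegrationTest` are present. The sketch below is an editor's illustration, not part of the repository; the suite name and the `share1.default.table1` coordinates are assumptions (they mirror the shares defined in `TestResource`), and the local SparkSession is just one way to set the test up.

    import io.delta.sharing.spark.DeltaSharingIntegrationTest

    // Editor's sketch: a hypothetical suite built on the trait above.
    class MyDeltaSharingSuite extends DeltaSharingIntegrationTest {
      integrationTest("read a shared table") {
        // `testProfileFile` is the profile written in `beforeAll` above.
        val tablePath = testProfileFile.getCanonicalPath + "#share1.default.table1"
        val spark = org.apache.spark.sql.SparkSession.builder()
          .master("local[2]")
          .getOrCreate()
        val df = spark.read.format("deltaSharing").load(tablePath)
        assert(df.schema.nonEmpty)
      }
    }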
--------------------------------------------------------------------------------
/spark/src/main/scala/io/delta/sharing/spark/DeltaSharingDataSource.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.delta.sharing.spark
18 |
19 | import java.util.Collections
20 |
21 | import scala.collection.JavaConverters._
22 | import scala.collection.mutable
23 |
24 | import org.apache.spark.SparkEnv
25 | import org.apache.spark.delta.sharing.PreSignedUrlCache
26 | import org.apache.spark.sql.{SparkSession, SQLContext}
27 | import org.apache.spark.sql.connector.catalog.{Table, TableCapability, TableProvider}
28 | import org.apache.spark.sql.connector.expressions.Transform
29 | import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister, RelationProvider}
30 | import org.apache.spark.sql.types.StructType
31 | import org.apache.spark.sql.util.CaseInsensitiveStringMap
32 |
33 | /** A DataSource V1 for integrating Delta Sharing tables into Spark SQL batch APIs. */
34 | private[sharing] class DeltaSharingDataSource extends RelationProvider with DataSourceRegister {
35 |
36 | override def createRelation(
37 | sqlContext: SQLContext,
38 | parameters: Map[String, String]): BaseRelation = {
39 | DeltaSharingDataSource.setupFileSystem(sqlContext)
40 | val path = parameters.getOrElse("path", throw new IllegalArgumentException(
41 | "'path' is not specified. If you use SQL to create a Delta Sharing table, " +
42 | "LOCATION must be specified"))
43 |
44 | var cdfOptions: mutable.Map[String, String] = mutable.Map.empty
45 | val caseInsensitiveParams = new CaseInsensitiveStringMap(parameters.asJava)
46 | if (DeltaSharingDataSource.isCDFRead(caseInsensitiveParams)) {
47 | cdfOptions = mutable.Map[String, String](DeltaSharingDataSource.CDF_ENABLED_KEY -> "true")
48 | if (caseInsensitiveParams.containsKey(DeltaSharingDataSource.CDF_START_VERSION_KEY)) {
49 | cdfOptions(DeltaSharingDataSource.CDF_START_VERSION_KEY) = caseInsensitiveParams.get(
50 | DeltaSharingDataSource.CDF_START_VERSION_KEY)
51 | }
52 | if (caseInsensitiveParams.containsKey(DeltaSharingDataSource.CDF_START_TIMESTAMP_KEY)) {
53 | cdfOptions(DeltaSharingDataSource.CDF_START_TIMESTAMP_KEY) = caseInsensitiveParams.get(
54 | DeltaSharingDataSource.CDF_START_TIMESTAMP_KEY)
55 | }
56 | if (caseInsensitiveParams.containsKey(DeltaSharingDataSource.CDF_END_VERSION_KEY)) {
57 | cdfOptions(DeltaSharingDataSource.CDF_END_VERSION_KEY) = caseInsensitiveParams.get(
58 | DeltaSharingDataSource.CDF_END_VERSION_KEY)
59 | }
60 | if (caseInsensitiveParams.containsKey(DeltaSharingDataSource.CDF_END_TIMESTAMP_KEY)) {
61 | cdfOptions(DeltaSharingDataSource.CDF_END_TIMESTAMP_KEY) = caseInsensitiveParams.get(
62 | DeltaSharingDataSource.CDF_END_TIMESTAMP_KEY)
63 | }
64 | }
65 |
66 | var versionAsOf: Option[Long] = None
67 | if (parameters.get("versionAsOf").isDefined) {
68 | try {
69 | versionAsOf = Some(parameters.get("versionAsOf").get.toLong)
70 | } catch {
71 | case _: NumberFormatException =>
72 | throw new IllegalArgumentException("versionAsOf is not a valid number.")
73 | }
74 | }
75 | val deltaLog = RemoteDeltaLog(path)
76 | deltaLog.createRelation(versionAsOf, cdfOptions = cdfOptions.toMap)
77 | }
78 |
79 | override def shortName: String = "deltaSharing"
80 | }
81 |
82 |
83 | private[sharing] object DeltaSharingDataSource {
84 |
85 | def setupFileSystem(sqlContext: SQLContext): Unit = {
86 | // We have put our class name in the `org.apache.hadoop.fs.FileSystem` resource file. However,
87 | // this file will be loaded only if the class `FileSystem` is loaded. Hence, it won't work when
88 | // we add the library after starting Spark. Therefore we change the global `hadoopConfiguration`
89 | // to make sure we set up `DeltaSharingFileSystem` correctly.
90 | sqlContext.sparkContext.hadoopConfiguration
91 | .setIfUnset("fs.delta-sharing.impl", "io.delta.sharing.spark.DeltaSharingFileSystem")
92 | PreSignedUrlCache.registerIfNeeded(SparkEnv.get)
93 | }
94 |
95 |   // Based on the read options passed in, indicates whether the read is a CDF read.
96 | def isCDFRead(options: CaseInsensitiveStringMap): Boolean = {
97 | options.containsKey(DeltaSharingDataSource.CDF_ENABLED_KEY) &&
98 | options.get(DeltaSharingDataSource.CDF_ENABLED_KEY) == "true"
99 | }
100 |
101 | // Constants for cdf parameters
102 | final val CDF_ENABLED_KEY = "readChangeFeed"
103 |
104 | final val CDF_START_VERSION_KEY = "startingVersion"
105 |
106 | final val CDF_START_TIMESTAMP_KEY = "startingTimestamp"
107 |
108 | final val CDF_END_VERSION_KEY = "endingVersion"
109 |
110 | final val CDF_END_TIMESTAMP_KEY = "endingTimestamp"
111 | }
112 |
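The option keys handled above surface directly as reader options. The following is a minimal sketch, not part of the source tree; the profile path and table coordinates are hypothetical, and the option names are the constants defined in the companion object above.

    import org.apache.spark.sql.SparkSession

    object DeltaSharingReadExamples {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().master("local[2]").getOrCreate()
        val table = "/tmp/profile.share#share1.default.table1" // hypothetical

        // Snapshot read at a fixed version, handled by the `versionAsOf` branch above.
        val snapshot = spark.read.format("deltaSharing")
          .option("versionAsOf", "3")
          .load(table)

        // CDF read, handled by `isCDFRead` and the cdfOptions map above.
        val changes = spark.read.format("deltaSharing")
          .option("readChangeFeed", "true")
          .option("startingVersion", "0")
          .option("endingVersion", "5")
          .load(table)

        snapshot.show()
        changes.show()
      }
    }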
--------------------------------------------------------------------------------
/python/delta_sharing/converter.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (C) 2021 The Delta Lake Project Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | from decimal import Decimal
17 | from typing import Any, Callable, Dict
18 |
19 | import numpy as np
20 | import pandas as pd
21 |
22 |
23 | def _get_dummy_column(schema_type):
24 | """
25 | Return a dummy column with the data type specified in schema_type.
26 | The dummy column is used to populate the dtype fields in empty tables.
27 | :param schema_type: str or json representing a data type
28 | :return: dummy pandas Series to be inserted into an empty table
29 | """
30 | if schema_type == "boolean":
31 | return pd.Series([False])
32 | elif schema_type == "byte":
33 | return pd.Series([0], dtype="int8")
34 | elif schema_type == "short":
35 | return pd.Series([0], dtype="int16")
36 | elif schema_type == "integer":
37 | return pd.Series([0], dtype="int32")
38 | elif schema_type == "long":
39 | return pd.Series([0], dtype="int64")
40 | elif schema_type == "float":
41 | return pd.Series([0], dtype="float32")
42 | elif schema_type == "double":
43 | return pd.Series([0], dtype="float64")
44 | elif isinstance(schema_type, str) and schema_type.startswith("decimal"):
45 | return pd.Series([0], dtype=np.dtype("O"))
46 | elif schema_type == "string":
47 | return pd.Series([0], dtype=np.dtype("O"))
48 | elif schema_type == "date":
49 | return pd.Series([pd.Timestamp(0).date()])
50 | elif schema_type == "timestamp":
51 | return pd.Series([pd.Timestamp(0)], dtype=np.dtype("datetime64[ns]"))
52 | elif schema_type == "binary":
53 | return pd.Series([0], dtype=np.dtype("O"))
54 | elif isinstance(schema_type, dict) and schema_type["type"] in ("array", "struct", "map"):
55 | return pd.Series([0], dtype=np.dtype("O"))
56 |
57 | raise ValueError(f"Could not parse datatype: {schema_type}")
58 |
59 |
60 | def get_empty_table(schema_json: dict) -> pd.DataFrame:
61 | """
62 | For empty tables, we use dummy columns from `_get_dummy_column` and then
63 | drop all rows to generate a table with the correct column names and
64 | data types.
65 | :param schema_json: json object representing the table schema
66 | :return: empty table with columns specified in schema_json
67 | """
68 | assert schema_json["type"] == "struct"
69 |
70 | dummy_table = pd.DataFrame(
71 | {field["name"]: _get_dummy_column(field["type"]) for field in schema_json["fields"]}
72 | )
73 | return dummy_table.iloc[0:0]
74 |
75 |
76 | def to_converters(schema_json: dict) -> Dict[str, Callable[[str], Any]]:
77 | assert schema_json["type"] == "struct"
78 |
79 | return {field["name"]: to_converter(field["type"]) for field in schema_json["fields"]}
80 |
81 |
82 | def to_converter(schema_type) -> Callable[[str], Any]:
83 | """
84 | For types that support partitioning, a lambda to parse data into the
85 | corresponding type is returned. For data types that cannot be partitioned
86 |     on, we return None. The caller is expected to check whether the returned converter is None before using it.
87 | :param schema_type: str or json representing a data type
88 | :return: converter function or None
89 | """
90 | if schema_type == "boolean":
91 | return lambda x: None if (x is None or x == "") else (x is True or x == "true")
92 | elif schema_type == "byte":
93 | return lambda x: np.nan if (x is None or x == "") else np.int8(x)
94 | elif schema_type == "short":
95 | return lambda x: np.nan if (x is None or x == "") else np.int16(x)
96 | elif schema_type == "integer":
97 | return lambda x: np.nan if (x is None or x == "") else np.int32(x)
98 | elif schema_type == "long":
99 | return lambda x: np.nan if (x is None or x == "") else np.int64(x)
100 | elif schema_type == "float":
101 | return lambda x: np.nan if (x is None or x == "") else np.float32(x)
102 | elif schema_type == "double":
103 | return lambda x: np.nan if (x is None or x == "") else np.float64(x)
104 | elif isinstance(schema_type, str) and schema_type.startswith("decimal"):
105 | return lambda x: None if (x is None or x == "") else Decimal(x)
106 | elif schema_type == "string":
107 | return lambda x: None if (x is None or x == "") else str(x)
108 | elif schema_type == "date":
109 | return lambda x: None if (x is None or x == "") else pd.Timestamp(x).date()
110 | elif schema_type == "timestamp":
111 | return lambda x: pd.NaT if (x is None or x == "") else pd.Timestamp(x)
112 | elif schema_type == "binary":
113 | return None # partition on binary column not supported
114 | elif isinstance(schema_type, dict) and schema_type["type"] in ("array", "struct", "map"):
115 | return None # partition on complex column not supported
116 |
117 | raise ValueError(f"Could not parse datatype: {schema_type}")
118 |
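A quick sketch of how the helpers above fit together (editor's illustration; the two-column schema is made up, and the values in the comments follow directly from the converters defined above).

    from delta_sharing.converter import get_empty_table, to_converters

    schema_json = {
        "type": "struct",
        "fields": [
            {"name": "id", "type": "long", "nullable": True, "metadata": {}},
            {"name": "day", "type": "date", "nullable": True, "metadata": {}},
        ],
    }

    empty = get_empty_table(schema_json)    # 0 rows, but dtypes are already correct
    converters = to_converters(schema_json)
    print(converters["id"]("42"))           # np.int64(42)
    print(converters["day"]("2021-04-28"))  # datetime.date(2021, 4, 28)
    print(converters["id"](""))             # nan for an empty partition value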
--------------------------------------------------------------------------------
/server/src/main/scala/io/delta/standalone/internal/PartitionFilterUtils.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.delta.standalone.internal
18 |
19 | import scala.util.Try
20 | import scala.util.control.NonFatal
21 |
22 | import io.delta.standalone.internal.actions.AddFile
23 | import org.apache.spark.sql.Encoders
24 | import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
25 | import org.apache.spark.sql.catalyst.analysis.{caseInsensitiveResolution, UnresolvedAttribute}
26 | import org.apache.spark.sql.catalyst.expressions._
27 | import org.apache.spark.sql.execution.SparkSqlParser
28 | import org.apache.spark.sql.internal.SQLConf
29 | import org.apache.spark.sql.types.{DataType, StructField, StructType}
30 | import org.slf4j.LoggerFactory
31 |
32 | object PartitionFilterUtils {
33 | private val logger = LoggerFactory.getLogger(this.getClass)
34 |
35 | private lazy val sqlParser = new SparkSqlParser(new SQLConf)
36 |
37 | def evaluatePredicate(
38 | schemaString: String,
39 | partitionColumns: Seq[String],
40 | partitionFilters: Seq[String],
41 | addFiles: Seq[AddFile]): Seq[AddFile] = {
42 | try {
43 | val tableSchema = DataType.fromJson(schemaString).asInstanceOf[StructType]
44 | val partitionSchema = new StructType(partitionColumns.map(c => tableSchema(c)).toArray)
45 | val addSchema = Encoders.product[AddFile].schema
46 | val attrs =
47 | addSchema.map(f => AttributeReference(f.name, f.dataType, f.nullable, f.metadata)())
48 | val exprs =
49 | rewritePartitionFilters(
50 | partitionSchema,
51 | attrs,
52 | partitionFilters.flatMap { f =>
53 | Try(sqlParser.parseExpression(f)).toOption
54 | }.filter(f => isSupportedExpression(f, partitionSchema))
55 | )
56 | if (exprs.isEmpty) {
57 | addFiles
58 | } else {
59 | val predicate = InterpretedPredicate.create(exprs.reduce(And), attrs)
60 | predicate.initialize(0)
61 | addFiles.filter { addFile =>
62 | val converter = CatalystTypeConverters.createToCatalystConverter(addSchema)
63 | predicate.eval(converter(addFile).asInstanceOf[InternalRow])
64 | }
65 | }
66 | } catch {
67 | case NonFatal(e) =>
68 | logger.error(e.getMessage, e)
69 | // Fail to evaluate the filters. Return all files as a fallback.
70 | addFiles
71 | }
72 | }
73 |
74 | private def isSupportedExpression(e: Expression, partitionSchema: StructType): Boolean = {
75 | def isPartitionColumOrConstant(e: Expression): Boolean = {
76 | e match {
77 | case _: Literal => true
78 | case u: UnresolvedAttribute if u.nameParts.size == 1 =>
79 | val unquoted = u.name.stripPrefix("`").stripSuffix("`")
80 | partitionSchema.exists(part => caseInsensitiveResolution(unquoted, part.name))
81 | case c: Cast => isPartitionColumOrConstant(c.child)
82 | case _ => false
83 | }
84 | }
85 |
86 | e match {
87 | case EqualTo(left, right)
88 | if isPartitionColumOrConstant(left) && isPartitionColumOrConstant(right) =>
89 | true
90 | case GreaterThan(left, right)
91 | if isPartitionColumOrConstant(left) && isPartitionColumOrConstant(right) =>
92 | true
93 | case LessThan(left, right)
94 | if isPartitionColumOrConstant(left) && isPartitionColumOrConstant(right) =>
95 | true
96 | case GreaterThanOrEqual(left, right)
97 | if isPartitionColumOrConstant(left) && isPartitionColumOrConstant(right) =>
98 | true
99 | case LessThanOrEqual(left, right)
100 | if isPartitionColumOrConstant(left) && isPartitionColumOrConstant(right) =>
101 | true
102 | case EqualNullSafe(left, right)
103 | if isPartitionColumOrConstant(left) && isPartitionColumOrConstant(right) =>
104 | true
105 | case IsNull(e) if isPartitionColumOrConstant(e) =>
106 | true
107 | case IsNotNull(e) if isPartitionColumOrConstant(e) =>
108 | true
109 | case Not(e) if isSupportedExpression(e, partitionSchema) =>
110 | true
111 | case _ => false
112 | }
113 | }
114 |
115 | private def rewritePartitionFilters(
116 | partitionSchema: StructType,
117 | attrs: Seq[Attribute],
118 | partitionFilters: Seq[Expression]): Seq[Expression] = {
119 | val partitionValuesAttr = attrs.find(_.name == "partitionValues").head
120 | partitionFilters.map(_.transformUp {
121 | case a: Attribute =>
122 | // If we have a special column name, e.g. `a.a`, then an UnresolvedAttribute returns
123 | // the column name as '`a.a`' instead of 'a.a', therefore we need to strip the backticks.
124 | val unquoted = a.name.stripPrefix("`").stripSuffix("`")
125 | val partitionCol = partitionSchema.find { field => field.name == unquoted }
126 | partitionCol match {
127 | case Some(StructField(name, dataType, _, _)) =>
128 | Cast(
129 | ExtractValue(
130 | partitionValuesAttr,
131 | Literal(name),
132 | org.apache.spark.sql.catalyst.analysis.caseInsensitiveResolution),
133 | dataType)
134 | case None =>
135 |             // This should not happen, but the case was present in the original code,
136 |             // so we keep it to be safe.
137 | UnresolvedAttribute(Seq("partitionValues", a.name))
138 | }
139 | })
140 | }
141 | }
142 |
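For illustration, a hedged sketch of calling `evaluatePredicate` with a single partition-filter hint. It is not part of the repository: it assumes delta-standalone's internal `AddFile(path, partitionValues, size, modificationTime, dataChange, ...)` constructor shape and lives in the `io.delta.standalone.internal` package, as the test suite for this file does.

    package io.delta.standalone.internal

    import io.delta.standalone.internal.actions.AddFile

    object PartitionFilterExample {
      def main(args: Array[String]): Unit = {
        // Spark-style schema JSON for a table partitioned by `date`.
        val schemaJson =
          """{"type":"struct","fields":[
            |{"name":"id","type":"long","nullable":true,"metadata":{}},
            |{"name":"date","type":"string","nullable":true,"metadata":{}}]}""".stripMargin
        // Assumed constructor shape of delta-standalone's internal AddFile action.
        val files = Seq(
          AddFile("part-0", Map("date" -> "2021-01-01"), 100L, 0L, dataChange = false),
          AddFile("part-1", Map("date" -> "2021-01-02"), 100L, 0L, dataChange = false))
        // Keeps only files whose partition value satisfies the hint; unparsable or
        // unsupported hints fall back to returning all files (see the catch block above).
        val filtered = PartitionFilterUtils.evaluatePredicate(
          schemaJson, Seq("date"), Seq("date = '2021-01-01'"), files)
        assert(filtered.map(_.path) == Seq("part-0"))
      }
    }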
--------------------------------------------------------------------------------
/server/src/test/scala/io/delta/sharing/server/config/ServerConfigSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.delta.sharing.server.config
18 |
19 | import java.io.File
20 | import java.nio.charset.StandardCharsets.UTF_8
21 | import java.nio.file.Files
22 | import java.util.Arrays
23 |
24 | import org.apache.commons.io.FileUtils
25 | import org.scalatest.FunSuite
26 |
27 | class ServerConfigSuite extends FunSuite {
28 |
29 | def testConfig(content: String, serverConfig: ServerConfig): Unit = {
30 | val tempFile = Files.createTempFile("delta-sharing-server", ".yaml").toFile
31 | try {
32 | FileUtils.writeStringToFile(tempFile, content, UTF_8)
33 | val loaded = ServerConfig.load(tempFile.getCanonicalPath)
34 | assert(serverConfig == loaded)
35 | } finally {
36 | tempFile.delete()
37 | }
38 | }
39 |
40 | test("empty config") {
41 | val serverConfig = new ServerConfig()
42 | serverConfig.setVersion(1)
43 | testConfig("version: 1", serverConfig)
44 | }
45 |
46 | test("template") {
47 | val tempFile = Files.createTempFile("delta-sharing-server", ".yaml").toFile
48 | try {
49 | FileUtils.copyFile(
50 | new File("src/universal/conf/delta-sharing-server.yaml.template"),
51 | tempFile)
52 | val loaded = ServerConfig.load(tempFile.getCanonicalPath)
53 | val sharesInTemplate = Arrays.asList(
54 | ShareConfig("share1", Arrays.asList(
55 | SchemaConfig("schema1", Arrays.asList(
56 | TableConfig("table1", "s3a:///"),
57 | TableConfig(
58 | "table2",
59 | "wasbs://@")
60 | ))
61 | )),
62 | ShareConfig("share2", Arrays.asList(
63 | SchemaConfig("schema2", Arrays.asList(
64 | TableConfig(
65 | "table3",
66 | "abfss://@",
67 | true)
68 | ))
69 | )),
70 | ShareConfig("share3", Arrays.asList(
71 | SchemaConfig("schema3", Arrays.asList(
72 | TableConfig(
73 | "table4",
74 | "gs:///")
75 | ))
76 | ))
77 | )
78 | val serverConfig = new ServerConfig()
79 | serverConfig.setVersion(1)
80 | serverConfig.setShares(sharesInTemplate)
81 | serverConfig.setPort(8080)
82 | assert(loaded == serverConfig)
83 | } finally {
84 | tempFile.delete()
85 | }
86 | }
87 |
88 | test("accept unknown fields") {
89 | val serverConfig = new ServerConfig()
90 | serverConfig.setVersion(1)
91 | testConfig(
92 | """version: 1
93 | |unknown: "test"
94 | |""".stripMargin, serverConfig)
95 | }
96 |
97 | test("authorization token") {
98 | val serverConfig = new ServerConfig()
99 | serverConfig.setVersion(1)
100 | serverConfig.setAuthorization(Authorization(""))
101 | testConfig(
102 | """version: 1
103 | |authorization:
104 | | bearerToken:
105 | |""".stripMargin, serverConfig)
106 | }
107 |
108 | private def assertInvalidConfig(expectedErrorMessage: String)(func: => Unit): Unit = {
109 | assert(intercept[IllegalArgumentException] {
110 | func
111 | }.getMessage.contains(expectedErrorMessage))
112 | }
113 |
114 | test("invalid version") {
115 | assertInvalidConfig("'version' must be greater than 0") {
116 | testConfig("version: 0", null)
117 | }
118 | }
119 |
120 | test("future version") {
121 | assertInvalidConfig("The 'version' in the server config is 100 which is too new.") {
122 | testConfig("version: 100", null)
123 | }
124 | }
125 |
126 | test("invalid ssl") {
127 | assertInvalidConfig("'certificateFile' in a SSL config must be provided") {
128 | testConfig(
129 | """version: 1
130 | |ssl:
131 | | selfSigned: false
132 | |""".stripMargin, null)
133 | }
134 | }
135 |
136 | test("Authorization") {
137 | assertInvalidConfig("'bearerToken' in 'authorization' must be provided") {
138 | new Authorization().checkConfig()
139 | }
140 | }
141 |
142 | test("SSLConfig") {
143 | assertInvalidConfig("'certificateFile' in a SSL config must be provided") {
144 | val s = new SSLConfig()
145 | assert(s.selfSigned == false)
146 | s.checkConfig()
147 | }
148 | assertInvalidConfig("'certificateKeyFile' in a SSL config must be provided") {
149 | val s = new SSLConfig()
150 | s.setCertificateFile("file")
151 | s.checkConfig()
152 | }
153 | val s = new SSLConfig()
154 | s.setSelfSigned(true)
155 | s.checkConfig()
156 | }
157 |
158 | test("ShareConfig") {
159 | assertInvalidConfig("'name' in a share must be provided") {
160 | new ShareConfig().checkConfig()
161 | }
162 | assertInvalidConfig("'name' in a schema must be provided") {
163 | val s = new ShareConfig()
164 | s.setName("name")
165 | s.setSchemas(Arrays.asList(new SchemaConfig()))
166 | s.checkConfig()
167 | }
168 | }
169 |
170 | test("SchemaConfig") {
171 | assertInvalidConfig("'name' in a schema must be provided") {
172 | new SchemaConfig().checkConfig()
173 | }
174 | assertInvalidConfig("'name' in a table must be provided") {
175 | val s = new SchemaConfig()
176 | s.setName("name")
177 | s.setTables(Arrays.asList(new TableConfig()))
178 | s.checkConfig()
179 | }
180 | }
181 |
182 | test("TableConfig") {
183 | assertInvalidConfig("'name' in a table must be provided") {
184 | new TableConfig().checkConfig()
185 | }
186 | assertInvalidConfig("'name' in a table must be provided") {
187 | val t = new TableConfig()
188 | t.setLocation("Location")
189 | t.checkConfig()
190 | }
191 | assertInvalidConfig("'location' in a table must be provided") {
192 | val t = new TableConfig()
193 | t.setName("name")
194 | t.checkConfig()
195 | }
196 | }
197 | }
198 |
--------------------------------------------------------------------------------
/server/src/test/scala/io/delta/sharing/server/TestResource.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.delta.sharing.server
18 |
19 | import java.io.File
20 | import java.nio.charset.StandardCharsets.UTF_8
21 | import java.nio.file.Files
22 |
23 | import org.apache.commons.io.FileUtils
24 |
25 | import io.delta.sharing.server.config._
26 |
27 | object TestResource {
28 | def env(key: String): String = {
29 | sys.env.getOrElse(key, throw new IllegalArgumentException(s"Cannot find $key in sys env"))
30 | }
31 |
32 | object AWS {
33 | val bucket = "delta-exchange-test"
34 | }
35 |
36 | object Azure {
37 | val accountName = "deltasharingtest"
38 | val container = "delta-sharing-test-container"
39 | }
40 |
41 | object GCP {
42 | val bucket = "delta-sharing-dev"
43 | }
44 |
45 | val TEST_PORT = 12345
46 |
47 | val testAuthorizationToken = "dapi5e3574ec767ca1548ae5bbed1a2dc04d"
48 |
49 | def maybeSetupGoogleServiceAccountCredentials: Unit = {
50 | // Only setup Google Service Account credentials when it is provided through env variable.
51 | if (sys.env.get("GOOGLE_SERVICE_ACCOUNT_KEY").exists(_.length > 0)
52 | && sys.env.get("GOOGLE_APPLICATION_CREDENTIALS").exists(_.length > 0)) {
53 | val serviceAccountKey = sys.env("GOOGLE_SERVICE_ACCOUNT_KEY")
54 | val credFilePath = new File(sys.env("GOOGLE_APPLICATION_CREDENTIALS"))
55 | credFilePath.deleteOnExit()
56 | FileUtils.writeStringToFile(credFilePath, serviceAccountKey, UTF_8, false)
57 | }
58 | }
59 |
60 | def setupTestTables(): File = {
61 | val testConfigFile = Files.createTempFile("delta-sharing", ".yaml").toFile
62 | testConfigFile.deleteOnExit()
63 | maybeSetupGoogleServiceAccountCredentials
64 | val shares = java.util.Arrays.asList(
65 | ShareConfig("share1",
66 | java.util.Arrays.asList(
67 | SchemaConfig(
68 | "default",
69 | java.util.Arrays.asList(
70 | TableConfig("table1", s"s3a://${AWS.bucket}/delta-exchange-test/table1"),
71 | TableConfig("table3", s"s3a://${AWS.bucket}/delta-exchange-test/table3"),
72 | TableConfig("table7", s"s3a://${AWS.bucket}/delta-exchange-test/table7"),
73 | TableConfig(
74 | "cdf_table_cdf_enabled",
75 | s"s3a://${AWS.bucket}/delta-exchange-test/cdf_table_cdf_enabled",
76 | true
77 | ),
78 | TableConfig(
79 | "cdf_table_with_partition",
80 | s"s3a://${AWS.bucket}/delta-exchange-test/cdf_table_with_partition",
81 | true,
82 | 1
83 | ),
84 | TableConfig(
85 | "cdf_table_with_vacuum",
86 | s"s3a://${AWS.bucket}/delta-exchange-test/cdf_table_with_vacuum",
87 | true
88 | ),
89 | TableConfig(
90 | "cdf_table_missing_log",
91 | s"s3a://${AWS.bucket}/delta-exchange-test/cdf_table_missing_log",
92 | true
93 | )
94 | )
95 | )
96 | )
97 | ),
98 | ShareConfig("share2",
99 | java.util.Arrays.asList(
100 | SchemaConfig("default", java.util.Arrays.asList(
101 | TableConfig("table2", s"s3a://${AWS.bucket}/delta-exchange-test/table2")
102 | )
103 | )
104 | )),
105 | ShareConfig("share3",
106 | java.util.Arrays.asList(
107 | SchemaConfig(
108 | "default",
109 | java.util.Arrays.asList(
110 | TableConfig("table4", s"s3a://${AWS.bucket}/delta-exchange-test/table4"),
111 | TableConfig("table5", s"s3a://${AWS.bucket}/delta-exchange-test/table5")
112 | )
113 | )
114 | )
115 | ),
116 | ShareConfig("share4",
117 | java.util.Arrays.asList(
118 | SchemaConfig(
119 | "default",
120 | java.util.Arrays.asList(
121 | // table made with spark.sql.parquet.compression.codec=gzip
122 | TableConfig("test_gzip", s"s3a://${AWS.bucket}/compress-test/table1")
123 | )
124 | )
125 | )
126 | ),
127 | ShareConfig("share5",
128 | java.util.Arrays.asList(
129 | SchemaConfig(
130 | "default", // empty schema
131 | java.util.Arrays.asList()
132 | )
133 | )
134 | ),
135 | ShareConfig("share6",
136 | java.util.Arrays.asList()
137 | ),
138 | ShareConfig("share7",
139 | java.util.Arrays.asList(
140 | SchemaConfig(
141 | "schema1",
142 | java.util.Arrays.asList(
143 | TableConfig("table8", s"s3a://${AWS.bucket}/delta-exchange-test/table8")
144 | )
145 | ),
146 | SchemaConfig(
147 | "schema2",
148 | java.util.Arrays.asList(
149 | TableConfig("table9", s"s3a://${AWS.bucket}/delta-exchange-test/table9")
150 | )
151 | )
152 | )
153 | ),
154 | // scalastyle:off maxLineLength
155 | ShareConfig("share_azure",
156 | java.util.Arrays.asList(
157 | SchemaConfig(
158 | "default",
159 | java.util.Arrays.asList(
160 | TableConfig("table_wasb", s"wasbs://${Azure.container}@${Azure.accountName}.blob.core.windows.net/delta-sharing-test/table1"),
161 | TableConfig("table_abfs", s"abfss://${Azure.container}@${Azure.accountName}.dfs.core.windows.net/delta-sharing-test/table1")
162 | )
163 | )
164 | )
165 | ),
166 | // scalastyle:on
167 | ShareConfig("share_gcp",
168 | java.util.Arrays.asList(
169 | SchemaConfig(
170 | "default",
171 | java.util.Arrays.asList(
172 | TableConfig("table_gcs", s"gs://${GCP.bucket}/delta-sharing-test/table1")
173 | )
174 | )
175 | )
176 | )
177 | )
178 |
179 | val serverConfig = new ServerConfig()
180 | serverConfig.setVersion(1)
181 | serverConfig.setShares(shares)
182 | serverConfig.setAuthorization(Authorization(testAuthorizationToken))
183 | serverConfig.setPort(TEST_PORT)
184 | serverConfig.setSsl(SSLConfig(selfSigned = true, null, null, null))
185 | serverConfig.setEvaluatePredicateHints(true)
186 |
187 | serverConfig.save(testConfigFile.getCanonicalPath)
188 | testConfigFile
189 | }
190 | }
191 |
--------------------------------------------------------------------------------
/spark/src/main/scala/io/delta/sharing/spark/model.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.delta.sharing.spark.model
18 |
19 | import com.fasterxml.jackson.annotation.JsonInclude
20 | import org.apache.spark.sql.types.{DataType, LongType, StringType}
21 | import org.codehaus.jackson.annotate.JsonRawValue
22 |
23 | // Information about CDF columns.
24 | private[sharing] object CDFColumnInfo {
25 | // Internal CDF column names.
26 | val commit_version_col_name = "_commit_version"
27 | val commit_timestamp_col_name = "_commit_timestamp"
28 | val change_type_col_name = "_change_type"
29 |
30 | // Returns internal partition schema for internal columns for CDC actions.
31 | def getInternalPartitonSchemaForCDC(): Map[String, DataType] =
32 | Map(commit_version_col_name -> LongType, commit_timestamp_col_name -> LongType)
33 |
34 | // Returns internal partition schema for internal columns for CDF add/remove actions.
35 | def getInternalPartitonSchemaForCDFAddRemoveFile(): Map[String, DataType] =
36 | getInternalPartitonSchemaForCDC() + (change_type_col_name -> StringType)
37 | }
38 |
39 | private[sharing] case class DeltaTableMetadata(
40 | version: Long,
41 | protocol: Protocol,
42 | metadata: Metadata)
43 |
44 | private[sharing] case class DeltaTableFiles(
45 | version: Long,
46 | protocol: Protocol,
47 | metadata: Metadata,
48 | files: Seq[AddFile] = Nil,
49 | addFilesForCdf: Seq[AddFileForCDF] = Nil,
50 | cdfFiles: Seq[AddCDCFile] = Nil,
51 | removeFiles: Seq[RemoveFile] = Nil)
52 |
53 | private[sharing] case class Share(name: String)
54 |
55 | private[sharing] case class Schema(name: String, share: String)
56 |
57 | private[sharing] case class Table(name: String, schema: String, share: String)
58 |
59 | private[sharing] case class SingleAction(
60 | file: AddFile = null,
61 | add: AddFileForCDF = null,
62 | cdf: AddCDCFile = null,
63 | remove: RemoveFile = null,
64 | metaData: Metadata = null,
65 | protocol: Protocol = null) {
66 |
67 | def unwrap: Action = {
68 | if (file != null) {
69 | file
70 | } else if (add != null) {
71 | add
72 | } else if (cdf != null) {
73 | cdf
74 | } else if (remove != null) {
75 | remove
76 | } else if (metaData != null) {
77 | metaData
78 | } else if (protocol != null) {
79 | protocol
80 | } else {
81 | null
82 | }
83 | }
84 | }
85 |
86 | private[sharing] case class Format(provider: String = "parquet")
87 |
88 | private[sharing] case class Metadata(
89 | id: String = null,
90 | name: String = null,
91 | description: String = null,
92 | format: Format = Format(),
93 | schemaString: String = null,
94 | configuration: Map[String, String] = Map.empty,
95 | partitionColumns: Seq[String] = Nil) extends Action {
96 | override def wrap: SingleAction = SingleAction(metaData = this)
97 | }
98 |
99 | private[sharing] sealed trait Action {
100 | /** Turn this object to the [[SingleAction]] wrap object. */
101 | def wrap: SingleAction
102 | }
103 |
104 | private[sharing] case class Protocol(minReaderVersion: Int) extends Action {
105 | override def wrap: SingleAction = SingleAction(protocol = this)
106 | }
107 |
108 | // A common base class for all file actions.
109 | private[sharing] sealed abstract class FileAction(
110 | val url: String,
111 | val id: String,
112 | @JsonInclude(JsonInclude.Include.ALWAYS)
113 | val partitionValues: Map[String, String],
114 | val size: Long) extends Action {
115 |
116 | // Returns the partition values to be used in a data frame.
117 | // By default, we return the input partition values.
118 |   // Derived classes can override this and add internal partition values as needed.
119 | // For example, internal CDF columns such as commit version are modeled as partitions.
120 | def getPartitionValuesInDF(): Map[String, String] = partitionValues
121 | }
122 |
123 | private[sharing] case class AddFile(
124 | override val url: String,
125 | override val id: String,
126 | @JsonInclude(JsonInclude.Include.ALWAYS)
127 | override val partitionValues: Map[String, String],
128 | override val size: Long,
129 | @JsonRawValue
130 | stats: String = null) extends FileAction(url, id, partitionValues, size) {
131 |
132 | override def wrap: SingleAction = SingleAction(file = this)
133 | }
134 |
135 | private[sharing] case class AddFileForCDF(
136 | override val url: String,
137 | override val id: String,
138 | @JsonInclude(JsonInclude.Include.ALWAYS)
139 | override val partitionValues: Map[String, String],
140 | override val size: Long,
141 | version: Long,
142 | timestamp: Long,
143 | @JsonRawValue
144 | stats: String = null) extends FileAction(url, id, partitionValues, size) {
145 |
146 | override def wrap: SingleAction = SingleAction(add = this)
147 |
148 | override def getPartitionValuesInDF(): Map[String, String] = {
149 | partitionValues +
150 | (CDFColumnInfo.commit_version_col_name -> version.toString) +
151 | (CDFColumnInfo.commit_timestamp_col_name -> timestamp.toString) +
152 | (CDFColumnInfo.change_type_col_name -> "insert")
153 | }
154 | }
155 |
156 | private[sharing] case class AddCDCFile(
157 | override val url: String,
158 | override val id: String,
159 | @JsonInclude(JsonInclude.Include.ALWAYS)
160 | override val partitionValues: Map[String, String],
161 | override val size: Long,
162 | version: Long,
163 | timestamp: Long) extends FileAction(url, id, partitionValues, size) {
164 |
165 | override def wrap: SingleAction = SingleAction(cdf = this)
166 |
167 | override def getPartitionValuesInDF(): Map[String, String] = {
168 | partitionValues +
169 | (CDFColumnInfo.commit_version_col_name -> version.toString) +
170 | (CDFColumnInfo.commit_timestamp_col_name -> timestamp.toString)
171 | }
172 | }
173 |
174 | private[sharing] case class RemoveFile(
175 | override val url: String,
176 | override val id: String,
177 | @JsonInclude(JsonInclude.Include.ALWAYS)
178 | override val partitionValues: Map[String, String],
179 | override val size: Long,
180 | version: Long,
181 | timestamp: Long) extends FileAction(url, id, partitionValues, size) {
182 |
183 | override def wrap: SingleAction = SingleAction(remove = this)
184 |
185 | override def getPartitionValuesInDF(): Map[String, String] = {
186 | partitionValues +
187 | (CDFColumnInfo.commit_version_col_name -> version.toString) +
188 | (CDFColumnInfo.commit_timestamp_col_name -> timestamp.toString) +
189 | (CDFColumnInfo.change_type_col_name -> "delete")
190 | }
191 | }
192 |
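A small sketch (editor's illustration, placed in the same package because the model classes are `private[sharing]`) of how `AddFileForCDF.getPartitionValuesInDF` layers the internal CDF columns on top of the table's partition values. All values are hypothetical.

    package io.delta.sharing.spark.model

    object CDFPartitionValuesExample {
      def main(args: Array[String]): Unit = {
        val add = AddFileForCDF(
          url = "https://example.com/presigned-url", // hypothetical
          id = "file-1",
          partitionValues = Map("date" -> "2021-01-01"),
          size = 1024L,
          version = 2L,
          timestamp = 1651272000000L)

        // The original partition values plus _commit_version, _commit_timestamp and
        // _change_type = "insert", exposed as partition columns in the CDF data frame.
        assert(add.getPartitionValuesInDF() == Map(
          "date" -> "2021-01-01",
          CDFColumnInfo.commit_version_col_name -> "2",
          CDFColumnInfo.commit_timestamp_col_name -> "1651272000000",
          CDFColumnInfo.change_type_col_name -> "insert"))
      }
    }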
--------------------------------------------------------------------------------
/spark/src/main/scala/io/delta/sharing/spark/RemoteDeltaFileIndex.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.delta.sharing.spark
18 |
19 | import java.lang.ref.WeakReference
20 |
21 | import org.apache.hadoop.fs.{FileStatus, Path}
22 | import org.apache.spark.delta.sharing.CachedTableManager
23 | import org.apache.spark.sql.SparkSession
24 | import org.apache.spark.sql.catalyst.InternalRow
25 | import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
26 | import org.apache.spark.sql.catalyst.expressions.{
27 | And,
28 | Attribute,
29 | Cast,
30 | Expression,
31 | GenericInternalRow,
32 | Literal,
33 | SubqueryExpression
34 | }
35 | import org.apache.spark.sql.execution.datasources.{
36 | FileFormat,
37 | FileIndex,
38 | HadoopFsRelation,
39 | PartitionDirectory
40 | }
41 | import org.apache.spark.sql.types.{DataType, StructType}
42 |
43 | import io.delta.sharing.spark.model.{
44 | AddCDCFile,
45 | AddFile,
46 | CDFColumnInfo,
47 | DeltaTableFiles,
48 | FileAction,
49 | Metadata,
50 | Protocol,
51 | Table => DeltaSharingTable
52 | }
53 |
54 | private[sharing] case class RemoteDeltaFileIndexParams(
55 | val spark: SparkSession,
56 | val snapshotAtAnalysis: RemoteSnapshot) {
57 | def path: Path = snapshotAtAnalysis.getTablePath
58 | }
59 |
60 | // A base class for all file indices for remote delta log.
61 | private[sharing] abstract class RemoteDeltaFileIndexBase(
62 | val params: RemoteDeltaFileIndexParams) extends FileIndex {
63 | override def refresh(): Unit = {}
64 |
65 | override def sizeInBytes: Long = params.snapshotAtAnalysis.sizeInBytes
66 |
67 | override def partitionSchema: StructType = params.snapshotAtAnalysis.partitionSchema
68 |
69 | override def rootPaths: Seq[Path] = params.path :: Nil
70 |
71 | protected def toDeltaSharingPath(f: FileAction): Path = {
72 | DeltaSharingFileSystem.encode(params.path, f)
73 | }
74 |
75 | // A helper function to create partition directories from the specified actions.
76 | protected def makePartitionDirectories(actions: Seq[FileAction]): Seq[PartitionDirectory] = {
77 | val timeZone = params.spark.sessionState.conf.sessionLocalTimeZone
78 | actions.groupBy(_.getPartitionValuesInDF()).map {
79 | case (partitionValues, files) =>
80 | val rowValues: Array[Any] = partitionSchema.map { p =>
81 | Cast(Literal(partitionValues(p.name)), p.dataType, Option(timeZone)).eval()
82 | }.toArray
83 |
84 | val fileStats = files.map { f =>
85 | new FileStatus(
86 | /* length */ f.size,
87 | /* isDir */ false,
88 | /* blockReplication */ 0,
89 | /* blockSize */ 1,
90 | /* modificationTime */ 0,
91 | toDeltaSharingPath(f))
92 | }.toArray
93 |
94 | try {
95 | // Databricks Runtime has a different `PartitionDirectory.apply` method. We need to use
96 | // Java Reflection to call it.
97 | classOf[PartitionDirectory].getMethod("apply", classOf[InternalRow], fileStats.getClass)
98 | .invoke(null, new GenericInternalRow(rowValues), fileStats)
99 | .asInstanceOf[PartitionDirectory]
100 | } catch {
101 | case _: NoSuchMethodException =>
102 | // This is not in Databricks Runtime. We can call Spark's PartitionDirectory directly.
103 | PartitionDirectory(new GenericInternalRow(rowValues), fileStats)
104 | }
105 | }.toSeq
106 | }
107 | }
108 |
109 | // The index for processing files in a delta snapshot.
110 | private[sharing] case class RemoteDeltaSnapshotFileIndex(
111 | override val params: RemoteDeltaFileIndexParams,
112 | limitHint: Option[Long]) extends RemoteDeltaFileIndexBase(params) {
113 |
114 | override def inputFiles: Array[String] = {
115 | params.snapshotAtAnalysis.filesForScan(Nil, None, this)
116 | .map(f => toDeltaSharingPath(f).toString)
117 | .toArray
118 | }
119 |
120 | override def listFiles(
121 | partitionFilters: Seq[Expression],
122 | dataFilters: Seq[Expression]): Seq[PartitionDirectory] = {
123 | makePartitionDirectories(params.snapshotAtAnalysis.filesForScan(
124 | partitionFilters ++ dataFilters,
125 | limitHint,
126 | this
127 | ))
128 | }
129 | }
130 |
131 | // A base class for all file indices for CDF.
132 | private[sharing] abstract class RemoteDeltaCDFFileIndexBase(
133 | override val params: RemoteDeltaFileIndexParams,
134 | actions: Seq[FileAction],
135 | auxPartitionSchema: Map[String, DataType] = Map.empty)
136 | extends RemoteDeltaFileIndexBase(params) {
137 |
138 | override def partitionSchema: StructType = {
139 | DeltaTableUtils.updateSchema(params.snapshotAtAnalysis.partitionSchema, auxPartitionSchema)
140 | }
141 |
142 | override def inputFiles: Array[String] = {
143 | actions.map(f => toDeltaSharingPath(f).toString).toArray
144 | }
145 |
146 | override def listFiles(
147 | partitionFilters: Seq[Expression],
148 | dataFilters: Seq[Expression]): Seq[PartitionDirectory] = {
149 | // Register the files with the pre-signed url fetcher.
150 | CachedTableManager.INSTANCE
151 | .register(params.path.toString, getIdToUrlMap, new WeakReference(this), () => {
152 | getIdToUrlMap
153 | })
154 |
155 |     // We ignore partition filters when listing files, since the server already
156 |     // performs this filtering for CDF.
157 | makePartitionDirectories(actions)
158 | }
159 |
160 | private[sharing] def getIdToUrlMap : Map[String, String] = {
161 | actions.map { action =>
162 | action.id -> action.url
163 | }.toMap
164 | }
165 | }
166 |
167 | // The index classes for CDF file types.
168 |
169 | private[sharing] case class RemoteDeltaCDFAddFileIndex(
170 | override val params: RemoteDeltaFileIndexParams,
171 | deltaTableFiles: DeltaTableFiles)
172 | extends RemoteDeltaCDFFileIndexBase(
173 | params,
174 | deltaTableFiles.addFilesForCdf,
175 | CDFColumnInfo.getInternalPartitonSchemaForCDFAddRemoveFile) {}
176 |
177 | private[sharing] case class RemoteDeltaCDCFileIndex(
178 | override val params: RemoteDeltaFileIndexParams,
179 | deltaTableFiles: DeltaTableFiles)
180 | extends RemoteDeltaCDFFileIndexBase(
181 | params,
182 | deltaTableFiles.cdfFiles,
183 | CDFColumnInfo.getInternalPartitonSchemaForCDC) {}
184 |
185 | private[sharing] case class RemoteDeltaCDFRemoveFileIndex(
186 | override val params: RemoteDeltaFileIndexParams,
187 | deltaTableFiles: DeltaTableFiles)
188 | extends RemoteDeltaCDFFileIndexBase(
189 | params,
190 | deltaTableFiles.removeFiles,
191 | CDFColumnInfo.getInternalPartitonSchemaForCDFAddRemoveFile) {}
192 |
--------------------------------------------------------------------------------
/spark/src/main/scala/io/delta/sharing/spark/DeltaSharingFileSystem.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.delta.sharing.spark
18 |
19 | import java.net.{URI, URLDecoder, URLEncoder}
20 | import java.util.concurrent.TimeUnit
21 |
22 | import org.apache.hadoop.fs._
23 | import org.apache.hadoop.fs.permission.FsPermission
24 | import org.apache.hadoop.util.Progressable
25 | import org.apache.http.client.config.RequestConfig
26 | import org.apache.http.impl.client.HttpClientBuilder
27 | import org.apache.spark.SparkEnv
28 | import org.apache.spark.delta.sharing.{PreSignedUrlCache, PreSignedUrlFetcher}
29 | import org.apache.spark.network.util.JavaUtils
30 |
31 | import io.delta.sharing.spark.model.FileAction
32 |
33 | /** Read-only file system for delta paths. */
34 | private[sharing] class DeltaSharingFileSystem extends FileSystem {
35 | import DeltaSharingFileSystem._
36 |
37 | lazy private val numRetries = {
38 | val numRetries = getConf.getInt("spark.delta.sharing.network.numRetries", 10)
39 | if (numRetries < 0) {
40 | throw new IllegalArgumentException(
41 | "spark.delta.sharing.network.numRetries must not be negative")
42 | }
43 | numRetries
44 | }
45 |
46 | lazy private val timeoutInSeconds = {
47 | val timeoutStr = getConf.get("spark.delta.sharing.network.timeout", "120s")
48 | val timeoutInSeconds = JavaUtils.timeStringAs(timeoutStr, TimeUnit.SECONDS)
49 | if (timeoutInSeconds < 0) {
50 | throw new IllegalArgumentException(
51 | "spark.delta.sharing.network.timeout must not be negative")
52 | }
53 | if (timeoutInSeconds > Int.MaxValue) {
54 | throw new IllegalArgumentException(
55 | s"spark.delta.sharing.network.timeout is too big: $timeoutStr")
56 | }
57 | timeoutInSeconds.toInt
58 | }
59 |
60 | lazy private val httpClient = {
61 | val maxConnections = getConf.getInt("spark.delta.sharing.network.maxConnections", 64)
62 | if (maxConnections < 0) {
63 | throw new IllegalArgumentException(
64 | "spark.delta.sharing.network.maxConnections must not be negative")
65 | }
66 | val config = RequestConfig.custom()
67 | .setConnectTimeout(timeoutInSeconds * 1000)
68 | .setConnectionRequestTimeout(timeoutInSeconds * 1000)
69 | .setSocketTimeout(timeoutInSeconds * 1000).build()
70 | HttpClientBuilder.create()
71 | .setMaxConnTotal(maxConnections)
72 | .setMaxConnPerRoute(maxConnections)
73 | .setDefaultRequestConfig(config)
74 | // Disable the default retry behavior because we have our own retry logic.
75 | // See `RetryUtils.runWithExponentialBackoff`.
76 | .disableAutomaticRetries()
77 | .build()
78 | }
79 |
80 | private lazy val refreshThresholdMs = getConf.getLong(
81 | "spark.delta.sharing.executor.refreshThresholdMs",
82 | TimeUnit.MINUTES.toMillis(10))
83 |
84 | private lazy val preSignedUrlCacheRef = PreSignedUrlCache.getEndpointRefInExecutor(SparkEnv.get)
85 |
86 | override def getScheme: String = SCHEME
87 |
88 | override def getUri(): URI = URI.create(s"$SCHEME:///")
89 |
90 | override def open(f: Path, bufferSize: Int): FSDataInputStream = {
91 | val path = DeltaSharingFileSystem.decode(f)
92 | val fetcher =
93 | new PreSignedUrlFetcher(preSignedUrlCacheRef, path.tablePath, path.fileId, refreshThresholdMs)
94 | if (getConf.getBoolean("spark.delta.sharing.loadDataFilesInMemory", false)) {
 95 |       // `InMemoryHttpInputStream` loads the content into memory immediately, so we don't need
 96 |       // to refresh URLs.
97 | new FSDataInputStream(new InMemoryHttpInputStream(new URI(fetcher.getUrl())))
98 | } else {
99 | new FSDataInputStream(
100 | new RandomAccessHttpInputStream(httpClient, fetcher, path.fileSize, statistics, numRetries))
101 | }
102 | }
103 |
104 | override def create(
105 | f: Path,
106 | permission: FsPermission,
107 | overwrite: Boolean,
108 | bufferSize: Int,
109 | replication: Short,
110 | blockSize: Long,
111 | progress: Progressable): FSDataOutputStream =
112 | throw new UnsupportedOperationException("create")
113 |
114 | override def append(f: Path, bufferSize: Int, progress: Progressable): FSDataOutputStream =
115 | throw new UnsupportedOperationException("append")
116 |
117 | override def rename(src: Path, dst: Path): Boolean =
118 | throw new UnsupportedOperationException("rename")
119 |
120 | override def delete(f: Path, recursive: Boolean): Boolean =
121 | throw new UnsupportedOperationException("delete")
122 |
123 | override def listStatus(f: Path): Array[FileStatus] =
124 | throw new UnsupportedOperationException("listStatus")
125 |
126 | override def setWorkingDirectory(new_dir: Path): Unit =
127 | throw new UnsupportedOperationException("setWorkingDirectory")
128 |
129 | override def getWorkingDirectory: Path = new Path(getUri)
130 |
131 | override def mkdirs(f: Path, permission: FsPermission): Boolean =
132 | throw new UnsupportedOperationException("mkdirs")
133 |
134 | override def getFileStatus(f: Path): FileStatus = {
135 | val resolved = makeQualified(f)
136 | new FileStatus(decode(resolved).fileSize, false, 0, 1, 0, f)
137 | }
138 |
139 | override def finalize(): Unit = {
140 | try super.finalize() finally close()
141 | }
142 |
143 | override def close(): Unit = {
144 | try super.close() finally httpClient.close()
145 | }
146 | }
147 |
148 | private[sharing] object DeltaSharingFileSystem {
149 |
150 | val SCHEME = "delta-sharing"
151 |
152 | case class DeltaSharingPath(tablePath: String, fileId: String, fileSize: Long) {
153 |
154 | /**
155 | * Convert `DeltaSharingPath` to a `Path` in the following format:
156 | *
157 | * ```
158 |    * delta-sharing:///<url encoded table path>/<url encoded file id>/<file size>
159 | * ```
160 | *
161 | * This format can be decoded by `DeltaSharingFileSystem.decode`.
162 | */
163 | def toPath: Path = {
164 | val encodedTablePath = URLEncoder.encode(tablePath, "UTF-8")
165 | val encodedFileId = URLEncoder.encode(fileId, "UTF-8")
166 | new Path(s"$SCHEME:///$encodedTablePath/$encodedFileId/$fileSize")
167 | }
168 | }
169 |
170 | def encode(tablePath: Path, action: FileAction): Path = {
171 | DeltaSharingPath(tablePath.toString, action.id, action.size).toPath
172 | }
173 |
174 | def decode(path: Path): DeltaSharingPath = {
175 | val encodedPath = path.toString
176 | .stripPrefix(s"$SCHEME:///")
177 | .stripPrefix(s"$SCHEME:/")
178 | val Array(encodedTablePath, encodedFileId, sizeString) = encodedPath.split("/")
179 | DeltaSharingPath(
180 | URLDecoder.decode(encodedTablePath, "UTF-8"),
181 | URLDecoder.decode(encodedFileId, "UTF-8"),
182 | sizeString.toLong)
183 | }
184 | }
185 |
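An encode/decode round trip through the helpers above (editor's sketch in the same package, since the companion object is `private[sharing]`; the table path and file id are hypothetical).

    package io.delta.sharing.spark

    import org.apache.hadoop.fs.Path

    object DeltaSharingPathExample {
      def main(args: Array[String]): Unit = {
        val p = DeltaSharingFileSystem.DeltaSharingPath(
          tablePath = "https://localhost:12345/delta-sharing#share1.default.table1",
          fileId = "file-0001",
          fileSize = 573L)
        val hadoopPath: Path = p.toPath
        // Prints something like:
        // delta-sharing:///https%3A%2F%2Flocalhost%3A12345%2F...%23share1.default.table1/file-0001/573
        println(hadoopPath)
        assert(DeltaSharingFileSystem.decode(hadoopPath) == p)
      }
    }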
--------------------------------------------------------------------------------
/server/src/main/scala/io/delta/sharing/server/SharedTableManager.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.delta.sharing.server
18 |
19 | import java.io.IOException
20 | import java.nio.charset.StandardCharsets.UTF_8
21 | import java.util.Base64
22 |
23 | import scala.collection.JavaConverters._
24 |
25 | import io.delta.sharing.server.config.{SchemaConfig, ServerConfig, ShareConfig, TableConfig}
26 | import io.delta.sharing.server.protocol.{PageToken, Schema, Share, Table}
27 |
28 | /**
29 | * Load the shared tables from `ServerConfig` and provide the pagination APIs to query
30 | * shares/schemas/tables.
31 | */
32 | class SharedTableManager(serverConfig: ServerConfig) {
33 |
34 | private val caseInsensitiveComparer = (a: String, b: String) => a.equalsIgnoreCase(b)
35 |
36 | private val shares = serverConfig.getShares
37 |
38 | private val defaultMaxResults = 500
39 |
40 | private def encodePageToken(id: String, share: Option[String], schema: Option[String]): String = {
41 | val binary = PageToken(id = Option(id), share = share, schema = schema).toByteArray
42 | new String(Base64.getUrlEncoder().encode(binary), UTF_8)
43 | }
44 |
45 | private def decodePageToken(
46 | pageTokenString: String,
47 | expectedShare: Option[String],
48 | expectedSchema: Option[String]): String = {
49 | val pageToken =
50 | try {
51 | val binary = Base64.getUrlDecoder().decode(pageTokenString.getBytes(UTF_8))
52 | PageToken.parseFrom(binary)
53 | } catch {
54 | case _: IllegalArgumentException | _: IOException =>
55 | throw new DeltaSharingIllegalArgumentException("invalid 'nextPageToken'")
56 | }
57 | if (pageToken.id.isEmpty
58 | || pageToken.share != expectedShare
59 | || pageToken.schema != expectedSchema
60 | ) {
61 | throw new DeltaSharingIllegalArgumentException("invalid 'nextPageToken'")
62 | }
63 | pageToken.getId
64 | }
65 |
66 | private def getPage[T](
67 | nextPageToken: Option[String],
68 | share: Option[String],
69 | schema: Option[String],
70 | maxResults: Option[Int],
71 | totalSize: Int)(func: (Int, Int) => Seq[T]): (Seq[T], Option[String]) = {
72 | assertMaxResults(maxResults)
73 | val start = nextPageToken.map {
74 | pageToken => decodePageToken(pageToken, share, schema).toInt
75 | }.getOrElse(0)
76 | if (start > totalSize) {
77 | throw new DeltaSharingIllegalArgumentException("invalid 'nextPageToken'")
78 | }
79 | val end = start + maxResults.getOrElse(defaultMaxResults)
80 | val results = func(start, end)
81 | val nextId = if (end < totalSize) Some(end) else None
82 | results -> nextId.map(id => encodePageToken(id.toString, share, schema))
83 | }
84 |
85 | private def assertMaxResults(maxResults: Option[Int]): Unit = {
86 | maxResults.foreach { m =>
87 | if (m < 0 || m > defaultMaxResults) {
88 | throw new DeltaSharingIllegalArgumentException(
89 | s"Acceptable values of 'maxResults' are 0 to $defaultMaxResults, inclusive. " +
90 | s"(Default: $defaultMaxResults)")
91 | }
92 | }
93 | }
94 |
95 | private def getShareInternal(share: String): ShareConfig = {
96 | shares.asScala.find(s => caseInsensitiveComparer(s.getName, share))
97 | .getOrElse(throw new DeltaSharingNoSuchElementException(s"share '$share' not found"))
98 | }
99 |
100 | private def getSchema(shareConfig: ShareConfig, schema: String): SchemaConfig = {
101 | shareConfig.getSchemas.asScala.find(s => caseInsensitiveComparer(s.getName, schema))
102 | .getOrElse(throw new DeltaSharingNoSuchElementException(s"schema '$schema' not found"))
103 | }
104 |
105 | def listShares(
106 | nextPageToken: Option[String] = None,
107 | maxResults: Option[Int] = None): (Seq[Share], Option[String]) = {
108 | getPage(nextPageToken, None, None, maxResults, shares.size) { (start, end) =>
109 | shares.asScala.map { share =>
110 | Share().withName(share.getName)
111 | }.slice(start, end)
112 | }
113 | }
114 |
115 | def getShare(share: String): Share = {
116 | val shareConfig = getShareInternal(share)
117 | Share().withName(shareConfig.getName)
118 | }
119 |
120 | def listSchemas(
121 | share: String,
122 | nextPageToken: Option[String] = None,
123 | maxResults: Option[Int] = None): (Seq[Schema], Option[String]) = {
124 | val shareConfig = getShareInternal(share)
125 | getPage(nextPageToken, Some(share), None, maxResults, shareConfig.getSchemas.size) {
126 | (start, end) =>
127 | shareConfig.getSchemas.asScala.map { schemaConfig =>
128 | Schema().withName(schemaConfig.getName).withShare(share)
129 | }.slice(start, end)
130 | }
131 | }
132 |
133 | def listTables(
134 | share: String,
135 | schema: String,
136 | nextPageToken: Option[String] = None,
137 | maxResults: Option[Int] = None): (Seq[Table], Option[String]) = {
138 | val schemaConfig = getSchema(getShareInternal(share), schema)
139 | getPage(nextPageToken, Some(share), Some(schema), maxResults, schemaConfig.getTables.size) {
140 | (start, end) =>
141 | schemaConfig.getTables.asScala.map { tableConfig =>
142 | Table().withName(tableConfig.getName).withSchema(schema).withShare(share)
143 | }.slice(start, end)
144 | }
145 | }
146 |
147 | def listAllTables(
148 | share: String,
149 | nextPageToken: Option[String] = None,
150 | maxResults: Option[Int] = None): (Seq[Table], Option[String]) = {
151 | val shareConfig = getShareInternal(share)
152 | val totalSize = shareConfig.schemas.asScala.map(_.tables.size).sum
153 | getPage(nextPageToken, Some(share), None, maxResults, totalSize) {
154 | (start, end) =>
155 | shareConfig.schemas.asScala.flatMap { schema =>
156 | schema.tables.asScala.map {
157 | table =>
158 | Table(
159 | name = Some(table.getName),
160 | schema = Some(schema.name),
161 | share = Some(share)
162 | )
163 | }
164 | }.slice(start, end)
165 | }
166 | }
167 |
168 | def getTable(share: String, schema: String, table: String): TableConfig = {
169 | val schemaConfig =
170 | try {
171 | getSchema(getShareInternal(share), schema)
172 | } catch {
173 | case _: DeltaSharingNoSuchElementException =>
174 | throw new DeltaSharingNoSuchElementException(
175 | s"[Share/Schema/Table] '$share/$schema/$table' does not exist, " +
176 | s"please contact your share provider for further information.")
177 | }
178 | schemaConfig.getTables.asScala.find(t => caseInsensitiveComparer(t.getName, table))
179 | .getOrElse(throw new DeltaSharingNoSuchElementException(
180 | s"[Share/Schema/Table] '$share/$schema/$table' does not exist, " +
181 | s"please contact your share provider for further information."))
182 | }
183 | }
184 |
--------------------------------------------------------------------------------
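A minimal sketch of how a caller might drain the paginated `listShares` API above. The `manager` value is an assumed, already-constructed `SharedTableManager`, and the page size of 100 is arbitrary (it must not exceed the server's `defaultMaxResults`):

    import scala.collection.mutable.ArrayBuffer

    val allShares = ArrayBuffer[Share]()   // the protobuf-generated Share used above
    var pageToken: Option[String] = None
    do {
      // Each call returns one page plus an opaque token pointing at the next page, if any.
      val (page, nextToken) = manager.listShares(nextPageToken = pageToken, maxResults = Some(100))
      allShares ++= page
      pageToken = nextToken
    } while (pageToken.isDefined)
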
/spark/src/main/scala/io/delta/sharing/spark/RandomAccessHttpInputStream.scala:
--------------------------------------------------------------------------------
1 | // scalastyle:off headerCheck
2 | /**
3 | * Licensed to the Apache Software Foundation (ASF) under one
4 | * or more contributor license agreements. See the NOTICE file
5 | * distributed with this work for additional information
6 | * regarding copyright ownership. The ASF licenses this file
7 | * to you under the Apache License, Version 2.0 (the
8 | * "License"); you may not use this file except in compliance
9 | * with the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 | /*
21 | * This file contains code from the Apache Hadoop project (original license above).
22 | * It contains modifications, which are licensed as follows:
23 | */
24 |
25 | /*
26 | * Copyright (2021) The Delta Lake Project Authors.
27 | *
28 | * Licensed under the Apache License, Version 2.0 (the "License");
29 | * you may not use this file except in compliance with the License.
30 | * You may obtain a copy of the License at
31 | *
32 | * http://www.apache.org/licenses/LICENSE-2.0
33 | *
34 | * Unless required by applicable law or agreed to in writing, software
35 | * distributed under the License is distributed on an "AS IS" BASIS,
36 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
37 | * See the License for the specific language governing permissions and
38 | * limitations under the License.
39 | */
40 | package io.delta.sharing.spark
41 |
42 | import java.io.{EOFException, InputStream, IOException}
43 | import java.nio.charset.StandardCharsets.UTF_8
44 |
45 | import org.apache.commons.io.IOUtils
46 | import org.apache.hadoop.fs.{FileSystem, FSExceptionMessages, FSInputStream}
47 | import org.apache.http.HttpStatus
48 | import org.apache.http.client.HttpClient
49 | import org.apache.http.client.methods.{HttpGet, HttpRequestBase}
50 | import org.apache.http.conn.EofSensorInputStream
51 | import org.apache.spark.delta.sharing.PreSignedUrlFetcher
52 | import org.apache.spark.internal.Logging
53 |
54 | import io.delta.sharing.spark.util.{RetryUtils, UnexpectedHttpStatus}
55 |
56 | /**
57 |  * A special input stream that provides random access over HTTP. This class requires the
58 |  * server side to support the HTTP Range header.
59 | */
60 | private[sharing] class RandomAccessHttpInputStream(
61 | client: HttpClient,
62 | fetcher: PreSignedUrlFetcher,
63 | contentLength: Long,
64 | stats: FileSystem.Statistics,
65 | numRetries: Int) extends FSInputStream with Logging {
66 |
67 | private var closed = false
68 | private var pos = 0L
69 | private var currentStream: InputStream = null
70 | private var uri: String = null
71 |
72 | private def assertNotClosed(): Unit = {
73 | if (closed) {
74 | throw new IOException(FSExceptionMessages.STREAM_IS_CLOSED)
75 | }
76 | val newUrl = fetcher.getUrl()
77 | if (uri != newUrl) {
78 | // Abort the current open stream so that we will re-open a new stream using the new url
79 | uri = newUrl
80 | abortCurrentStream()
81 | }
82 | }
83 |
84 | override def seek(pos: Long): Unit = synchronized {
85 | if (this.pos != pos) {
86 | assertNotClosed()
87 | reopen(pos)
88 | }
89 | }
90 |
91 | override def getPos: Long = synchronized {
92 | pos
93 | }
94 |
95 | override def seekToNewSource(targetPos: Long): Boolean = {
96 | // We don't support this feature
97 | false
98 | }
99 |
100 | override def read(): Int = synchronized {
101 | assertNotClosed()
102 | if (currentStream == null) {
103 | reopen(pos)
104 | }
105 | val byte = currentStream.read()
106 | if (byte >= 0) {
107 | pos += 1
108 | }
109 | if (stats != null && byte >= 0) {
110 | stats.incrementBytesRead(1)
111 | }
112 | byte
113 | }
114 |
115 | private def createHttpRequest(start: Long): HttpRequestBase = {
116 | val request = new HttpGet(uri)
117 | val rangeValue = s"bytes=$start-${contentLength - 1L}"
118 | request.addHeader("Range", rangeValue)
119 | request
120 | }
121 |
122 | override def read(buf: Array[Byte], off: Int, len: Int): Int = synchronized {
123 | assertNotClosed()
124 | if (currentStream == null) {
125 | reopen(pos)
126 | }
127 | val byteRead = currentStream.read(buf, off, len)
128 | if (byteRead > 0) {
129 | pos += byteRead
130 | }
131 | if (stats != null && byteRead > 0) {
132 | stats.incrementBytesRead(byteRead)
133 | }
134 | byteRead
135 | }
136 |
137 | private def reopen(pos: Long): Unit = {
138 | if (currentStream != null) {
139 | logDebug(s"Aborting old stream to open at pos $pos")
140 | abortCurrentStream()
141 | }
142 | if (pos < 0L) {
143 | throw new EOFException(FSExceptionMessages.NEGATIVE_SEEK + " " + pos)
144 | } else if (contentLength > 0L && pos > this.contentLength - 1L) {
145 | throw new EOFException(FSExceptionMessages.CANNOT_SEEK_PAST_EOF + " " + pos)
146 | } else {
147 | logDebug(s"Opening file $uri at pos $pos")
148 |
149 | val entity = RetryUtils.runWithExponentialBackoff(numRetries) {
150 | val httpRequest = createHttpRequest(pos)
151 | val response = client.execute(httpRequest)
152 | val status = response.getStatusLine()
153 | val entity = response.getEntity()
154 | val statusCode = status.getStatusCode
155 | if (statusCode != HttpStatus.SC_PARTIAL_CONTENT) {
156 |             // Note: we still fail if the server returns 200 because that means the server
157 |             // doesn't support the HTTP Range header.
158 | val errorBody = if (entity == null) {
159 | ""
160 | } else {
161 | val input = entity.getContent()
162 | try {
163 | IOUtils.toString(input, UTF_8)
164 | } finally {
165 | input.close()
166 | }
167 | }
168 | throw new UnexpectedHttpStatus(
169 | s"HTTP request failed with status: $status $errorBody",
170 | statusCode)
171 | }
172 | entity
173 | }
174 | currentStream = entity.getContent()
175 | this.pos = pos
176 | }
177 | }
178 |
179 | override def available(): Int = synchronized {
180 | assertNotClosed()
181 | currentStream.available()
182 | }
183 |
184 | /**
185 |    * Aborts `currentStream` without reading any more data. Apache `HttpClient` tries to read the
186 |    * remaining bytes in `close` in order to reuse the connection, which is inefficient when a lot
187 |    * of bytes still have to be discarded. This method avoids reusing the connection when many
188 |    * bytes remain. See `EofSensorInputStream` for more details.
189 | */
190 | private def abortCurrentStream(): Unit = {
191 | if (currentStream != null) {
192 | currentStream match {
193 | case e: EofSensorInputStream => e.abortConnection()
194 | case _ => currentStream.close()
195 | }
196 | currentStream = null
197 | }
198 | }
199 |
200 | override def close(): Unit = synchronized {
201 | if (!closed) {
202 | super.close()
203 | closed = true
204 | if (currentStream != null) {
205 | if (contentLength - pos <= 4096) {
206 | // Close, rather than abort, so that the http connection can be reused.
207 | currentStream.close()
208 | currentStream = null
209 | } else {
210 | // Abort, rather than just close, the underlying stream. Otherwise, the remaining bytes
211 | // are read while closing the stream.
212 | abortCurrentStream()
213 | }
214 | }
215 | }
216 | }
217 | }
218 |
--------------------------------------------------------------------------------
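For reference, a small sketch of the kind of ranged request `RandomAccessHttpInputStream` issues in `createHttpRequest` above. The URL and byte offsets are placeholders; a compliant server answers with 206 (`SC_PARTIAL_CONTENT`), while a 200 response means the `Range` header was ignored, which the stream treats as an error:

    import org.apache.http.client.methods.HttpGet
    import org.apache.http.impl.client.HttpClientBuilder

    val client = HttpClientBuilder.create().build()
    val request = new HttpGet("https://example.com/presigned-file")  // placeholder pre-signed URL
    request.addHeader("Range", "bytes=1024-4095")                    // ask for bytes 1024..4095
    val response = client.execute(request)
    println(response.getStatusLine.getStatusCode)                    // expect 206 for partial content
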
/python/delta_sharing/reader.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (C) 2021 The Delta Lake Project Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | from typing import Any, Callable, Dict, Optional, Sequence
17 | from urllib.parse import urlparse
18 | from json import loads
19 |
20 | import fsspec
21 | import pandas as pd
22 | from pyarrow.dataset import dataset
23 |
24 | from delta_sharing.converter import to_converters, get_empty_table
25 | from delta_sharing.protocol import AddCdcFile, CdfOptions, FileAction, Table
26 | from delta_sharing.rest_client import DataSharingRestClient
27 |
28 |
29 | class DeltaSharingReader:
30 | def __init__(
31 | self,
32 | table: Table,
33 | rest_client: DataSharingRestClient,
34 | *,
35 | predicateHints: Optional[Sequence[str]] = None,
36 | limit: Optional[int] = None,
37 | version: Optional[int] = None,
38 | ):
39 | self._table = table
40 | self._rest_client = rest_client
41 |
42 | if predicateHints is not None:
43 | assert isinstance(predicateHints, Sequence)
44 | assert all(isinstance(predicateHint, str) for predicateHint in predicateHints)
45 | self._predicateHints = predicateHints
46 |
47 | if limit is not None:
48 | assert isinstance(limit, int) and limit >= 0, "'limit' must be a non-negative int"
49 | self._limit = limit
50 | self._version = version
51 |
52 | @property
53 | def table(self) -> Table:
54 | return self._table
55 |
56 | def predicateHints(self, predicateHints: Optional[Sequence[str]]) -> "DeltaSharingReader":
57 | return self._copy(
58 | predicateHints=predicateHints,
59 | limit=self._limit,
60 | version=self._version
61 | )
62 |
63 | def limit(self, limit: Optional[int]) -> "DeltaSharingReader":
64 | return self._copy(
65 | predicateHints=self._predicateHints,
66 | limit=limit,
67 | version=self._version
68 | )
69 |
70 | def to_pandas(self) -> pd.DataFrame:
71 | response = self._rest_client.list_files_in_table(
72 | self._table,
73 | predicateHints=self._predicateHints,
74 | limitHint=self._limit,
75 | version=self._version
76 | )
77 |
78 | schema_json = loads(response.metadata.schema_string)
79 |
80 | if len(response.add_files) == 0 or self._limit == 0:
81 | return get_empty_table(schema_json)
82 |
83 | converters = to_converters(schema_json)
84 |
85 | if self._limit is None:
86 | pdfs = [
87 | DeltaSharingReader._to_pandas(
88 | file, converters, False, None) for file in response.add_files
89 | ]
90 | else:
91 | left = self._limit
92 | pdfs = []
93 | for file in response.add_files:
94 | pdf = DeltaSharingReader._to_pandas(file, converters, False, left)
95 | pdfs.append(pdf)
96 | left -= len(pdf)
97 | assert (
98 | left >= 0
99 | ), f"'_to_pandas' returned too many rows. Required: {left}, returned: {len(pdf)}"
100 | if left == 0:
101 | break
102 |
103 | return pd.concat(
104 | pdfs,
105 | axis=0,
106 | ignore_index=True,
107 | copy=False,
108 | )[[field["name"] for field in schema_json["fields"]]]
109 |
110 | def table_changes_to_pandas(self, cdfOptions: CdfOptions) -> pd.DataFrame:
111 | response = self._rest_client.list_table_changes(self._table, cdfOptions)
112 |
113 | schema_json = loads(response.metadata.schema_string)
114 |
115 | if len(response.actions) == 0:
116 | return get_empty_table(self._add_special_cdf_schema(schema_json))
117 |
118 | converters = to_converters(schema_json)
119 | pdfs = []
120 | for action in response.actions:
121 | pdf = DeltaSharingReader._to_pandas(action, converters, True, None)
122 | pdfs.append(pdf)
123 |
124 | return pd.concat(pdfs, axis=0, ignore_index=True, copy=False)
125 |
126 | def _copy(
127 | self,
128 | *,
129 | predicateHints: Optional[Sequence[str]],
130 | limit: Optional[int],
131 | version: Optional[int]
132 | ) -> "DeltaSharingReader":
133 | return DeltaSharingReader(
134 | table=self._table,
135 | rest_client=self._rest_client,
136 | predicateHints=predicateHints,
137 | limit=limit,
138 | version=version
139 | )
140 |
141 | @staticmethod
142 | def _to_pandas(
143 | action: FileAction,
144 | converters: Dict[str, Callable[[str], Any]],
145 | for_cdf: bool,
146 | limit: Optional[int]
147 | ) -> pd.DataFrame:
148 | url = urlparse(action.url)
149 | if "storage.googleapis.com" in (url.netloc.lower()):
150 | # Apply the yarl patch for GCS pre-signed urls
151 | import delta_sharing._yarl_patch # noqa: F401
152 |
153 | protocol = url.scheme
154 | filesystem = fsspec.filesystem(protocol)
155 |
156 | pa_dataset = dataset(source=action.url, format="parquet", filesystem=filesystem)
157 | pa_table = pa_dataset.head(limit) if limit is not None else pa_dataset.to_table()
158 | pdf = pa_table.to_pandas(
159 | date_as_object=True, use_threads=False, split_blocks=True, self_destruct=True
160 | )
161 |
162 | for col, converter in converters.items():
163 | if col not in pdf.columns:
164 | if col in action.partition_values:
165 | if converter is not None:
166 | pdf[col] = converter(action.partition_values[col])
167 | else:
168 | raise ValueError("Cannot partition on binary or complex columns")
169 | else:
170 | pdf[col] = None
171 |
172 | if for_cdf:
173 |                 # Add the change type column to non-cdc actions.
174 | if type(action) != AddCdcFile:
175 | pdf[DeltaSharingReader._change_type_col_name()] = action.get_change_type_col_value()
176 |
177 | # If available, add timestamp and version columns from the action.
178 | # All rows of the dataframe will get the same value.
179 | if action.version is not None:
180 | assert DeltaSharingReader._commit_version_col_name() not in pdf.columns
181 | pdf[DeltaSharingReader._commit_version_col_name()] = action.version
182 |
183 | if action.timestamp is not None:
184 | assert DeltaSharingReader._commit_timestamp_col_name() not in pdf.columns
185 | pdf[DeltaSharingReader._commit_timestamp_col_name()] = action.timestamp
186 | return pdf
187 |
188 | # The names of special delta columns for cdf.
189 |
190 | @staticmethod
191 | def _change_type_col_name():
192 | return "_change_type"
193 |
194 | @staticmethod
195 | def _commit_timestamp_col_name():
196 | return "_commit_timestamp"
197 |
198 | @staticmethod
199 | def _commit_version_col_name():
200 | return "_commit_version"
201 |
202 | @staticmethod
203 | def _add_special_cdf_schema(schema_json: dict) -> dict:
204 | fields = schema_json["fields"]
205 | fields.append({"name" : DeltaSharingReader._change_type_col_name(), "type" : "string"})
206 | fields.append({"name" : DeltaSharingReader._commit_version_col_name(), "type" : "long"})
207 | fields.append({"name" : DeltaSharingReader._commit_timestamp_col_name(), "type" : "long"})
208 | return schema_json
209 |
--------------------------------------------------------------------------------
/server/src/main/scala/io/delta/sharing/server/config/ServerConfig.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.delta.sharing.server.config
18 |
19 | import java.io.{File, IOException}
20 | import java.util.Collections
21 |
22 | import scala.beans.BeanProperty
23 |
24 | import com.fasterxml.jackson.annotation.JsonInclude
25 | import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
26 | import com.fasterxml.jackson.dataformat.yaml.YAMLFactory
27 |
28 | /** A trait that config classes implement so that their contents can be validated. */
29 | trait ConfigItem {
30 | /** Verify whether the config is valid */
31 | def checkConfig(): Unit
32 | }
33 |
34 | /**
35 |  * The class for the server config YAML file; the file is deserialized into this class.
36 | *
37 | * As `jackson-dataformat-yaml` only supports Java, we need to use `@BeanProperty var` to generate
38 | * Java bean classes.
39 | */
40 | case class ServerConfig(
41 | @BeanProperty var version: java.lang.Integer,
42 | @BeanProperty var shares: java.util.List[ShareConfig],
43 | @BeanProperty var authorization: Authorization,
44 | @BeanProperty var ssl: SSLConfig,
45 | @BeanProperty var host: String,
46 | @BeanProperty var port: Int,
47 | @BeanProperty var endpoint: String,
48 | // The timeout of S3 presigned url in seconds
49 | @BeanProperty var preSignedUrlTimeoutSeconds: Long,
50 |   // How many tables to cache in memory.
51 | @BeanProperty var deltaTableCacheSize: Int,
52 | // Whether we can accept working with a stale version of the table. This is useful when sharing
53 | // static tables that will never be changed.
54 | @BeanProperty var stalenessAcceptable: Boolean,
55 | // Whether to evaluate user provided `predicateHints`
56 | @BeanProperty var evaluatePredicateHints: Boolean,
57 | // The timeout of an incoming web request in seconds. Set to 0 for no timeout
58 | @BeanProperty var requestTimeoutSeconds: Long
59 | ) extends ConfigItem {
60 | import ServerConfig._
61 |
62 | def this() = {
63 | // Set default values here
64 | this(
65 | version = null,
66 | shares = Collections.emptyList(),
67 | authorization = null,
68 | ssl = null,
69 | host = "localhost",
70 | port = 80,
71 | endpoint = "/delta-sharing",
72 | preSignedUrlTimeoutSeconds = 3600,
73 | deltaTableCacheSize = 10,
74 | stalenessAcceptable = false,
75 | evaluatePredicateHints = false,
76 | requestTimeoutSeconds = 30
77 | )
78 | }
79 |
80 | private def checkVersion(): Unit = {
81 | if (version == null) {
82 | throw new IllegalArgumentException("'version' must be provided")
83 | }
84 | if (version <= 0) {
85 | throw new IllegalArgumentException("'version' must be greater than 0")
86 | }
87 | if (version > CURRENT) {
88 | throw new IllegalArgumentException(s"The 'version' in the server config is $version which " +
89 | s"is too new. The current release supports version $CURRENT and below. " +
90 | s"Please upgrade to a newer release.")
91 | }
92 | }
93 |
94 | def save(configFile: String): Unit = {
95 | ServerConfig.save(this, configFile)
96 | }
97 |
98 | override def checkConfig(): Unit = {
99 | checkVersion()
100 | shares.forEach(_.checkConfig())
101 | if (authorization != null) {
102 | authorization.checkConfig()
103 | }
104 | if (ssl != null) {
105 | ssl.checkConfig()
106 | }
107 | }
108 | }
109 |
110 | object ServerConfig {
111 | /** The version that we understand */
112 | private val CURRENT = 1
113 |
114 | private def createYamlObjectMapper = {
115 | new ObjectMapper(new YAMLFactory)
116 | .setSerializationInclusion(JsonInclude.Include.NON_ABSENT)
117 | .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
118 | }
119 |
120 | /**
121 | * Load the configurations for the server from the config file. If the file name ends with
122 | * `.yaml` or `.yml`, load it using the YAML parser. Otherwise, throw an error.
123 | */
124 | def load(configFile: String): ServerConfig = {
125 | if (configFile.endsWith(".yaml") || configFile.endsWith(".yml")) {
126 | val serverConfig =
127 | createYamlObjectMapper.readValue(new File(configFile), classOf[ServerConfig])
128 | serverConfig.checkConfig()
129 | serverConfig
130 | } else {
131 | throw new IOException("The server config file must be a yml or yaml file")
132 | }
133 | }
134 |
135 | /**
136 | * Serialize the [[ServerConfig]] object to the config file. If the file name ends with `.yaml`
137 | * or `.yml`, save it as a YAML file. Otherwise, throw an error.
138 | */
139 | def save(config: ServerConfig, configFile: String): Unit = {
140 | if (configFile.endsWith(".yaml") || configFile.endsWith(".yml")) {
141 | createYamlObjectMapper.writeValue(new File(configFile), config)
142 | } else {
143 | throw new IOException("The server config file must be a yml or yaml file")
144 | }
145 | }
146 | }
147 |
148 | case class Authorization(@BeanProperty var bearerToken: String) extends ConfigItem {
149 |
150 | def this() {
151 | this(null)
152 | }
153 |
154 | override def checkConfig(): Unit = {
155 | if (bearerToken == null) {
156 | throw new IllegalArgumentException("'bearerToken' in 'authorization' must be provided")
157 | }
158 | }
159 | }
160 |
161 | case class SSLConfig(
162 | @BeanProperty var selfSigned: Boolean,
163 | // The file of the PEM-format certificate
164 | @BeanProperty var certificateFile: String,
165 | // The file of the certificate’s private key
166 | @BeanProperty var certificateKeyFile: String,
167 | // The file storing the password to access the above certificate’s private key if it's protected
168 | @BeanProperty var certificatePasswordFile: String) extends ConfigItem {
169 |
170 | def this() {
171 | this(selfSigned = false, null, null, null)
172 | }
173 |
174 | override def checkConfig(): Unit = {
175 | if (!selfSigned) {
176 | if (certificateFile == null) {
177 | throw new IllegalArgumentException("'certificateFile' in a SSL config must be provided")
178 | }
179 | if (certificateKeyFile == null) {
180 | throw new IllegalArgumentException("'certificateKeyFile' in a SSL config must be provided")
181 | }
182 | }
183 | }
184 | }
185 |
186 | case class ShareConfig(
187 | @BeanProperty var name: String,
188 | @BeanProperty var schemas: java.util.List[SchemaConfig]) extends ConfigItem {
189 |
190 | def this() {
191 | this(null, Collections.emptyList())
192 | }
193 |
194 | override def checkConfig(): Unit = {
195 | if (name == null) {
196 | throw new IllegalArgumentException("'name' in a share must be provided")
197 | }
198 | schemas.forEach(_.checkConfig())
199 | }
200 | }
201 |
202 | case class SchemaConfig(
203 | @BeanProperty var name: String,
204 | @BeanProperty var tables: java.util.List[TableConfig]) extends ConfigItem {
205 |
206 | def this() {
207 | this(null, Collections.emptyList())
208 | }
209 |
210 | override def checkConfig(): Unit = {
211 | if (name == null) {
212 | throw new IllegalArgumentException("'name' in a schema must be provided")
213 | }
214 | tables.forEach(_.checkConfig())
215 | }
216 | }
217 |
218 | case class TableConfig(
219 | @BeanProperty var name: String,
220 | @BeanProperty var location: String,
221 | @BeanProperty var cdfEnabled: Boolean = false,
222 | @BeanProperty var startVersion: Long = 0) extends ConfigItem {
223 |
224 | def this() {
225 | this(null, null)
226 | }
227 |
228 | override def checkConfig(): Unit = {
229 | if (name == null) {
230 | throw new IllegalArgumentException("'name' in a table must be provided")
231 | }
232 | if (location == null) {
233 | throw new IllegalArgumentException("'location' in a table must be provided")
234 | }
235 | }
236 | }
237 |
--------------------------------------------------------------------------------
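A minimal sketch of building the config above programmatically and round-tripping it through YAML with the `save`/`load` helpers. The share, schema, and table names, the token, and the file path are placeholders; `load` also runs `checkConfig()`:

    import java.util.Arrays

    val table  = TableConfig("customers", "s3a://example-bucket/delta/customers")
    val schema = SchemaConfig("default", Arrays.asList(table))
    val share  = ShareConfig("example_share", Arrays.asList(schema))

    val config = new ServerConfig()           // starts from the defaults in the no-arg constructor
    config.setVersion(1)
    config.setShares(Arrays.asList(share))
    config.setAuthorization(Authorization("<bearer-token>"))

    config.save("/tmp/delta-sharing-server.yaml")
    val reloaded = ServerConfig.load("/tmp/delta-sharing-server.yaml")
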
/python/delta_sharing/protocol.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (C) 2021 The Delta Lake Project Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | from dataclasses import dataclass, field
17 | from json import loads
18 | from pathlib import Path
19 | from typing import ClassVar, Dict, IO, Optional, Sequence, Union
20 |
21 | import fsspec
22 |
23 |
24 | @dataclass(frozen=True)
25 | class DeltaSharingProfile:
26 | CURRENT: ClassVar[int] = 1
27 |
28 | share_credentials_version: int
29 | endpoint: str
30 | bearer_token: str
31 | expiration_time: Optional[str] = None
32 |
33 | def __post_init__(self):
34 | if self.share_credentials_version > DeltaSharingProfile.CURRENT:
35 | raise ValueError(
36 | "'shareCredentialsVersion' in the profile is "
37 | f"{self.share_credentials_version} which is too new. "
38 | f"The current release supports version {DeltaSharingProfile.CURRENT} and below. "
39 | "Please upgrade to a newer release."
40 | )
41 |
42 | @staticmethod
43 | def read_from_file(profile: Union[str, IO, Path]) -> "DeltaSharingProfile":
44 | if isinstance(profile, str):
45 | infile = fsspec.open(profile).open()
46 | elif isinstance(profile, Path):
47 | infile = fsspec.open(profile.as_uri()).open()
48 | else:
49 | infile = profile
50 | try:
51 | return DeltaSharingProfile.from_json(infile.read())
52 | finally:
53 | infile.close()
54 |
55 | @staticmethod
56 | def from_json(json) -> "DeltaSharingProfile":
57 | if isinstance(json, (str, bytes, bytearray)):
58 | json = loads(json)
59 | endpoint = json["endpoint"]
60 | if endpoint.endswith("/"):
61 | endpoint = endpoint[:-1]
62 | expiration_time = json.get("expirationTime")
63 | return DeltaSharingProfile(
64 | share_credentials_version=int(json["shareCredentialsVersion"]),
65 | endpoint=endpoint,
66 | bearer_token=json["bearerToken"],
67 | expiration_time=expiration_time,
68 | )
69 |
70 |
71 | @dataclass(frozen=True)
72 | class Share:
73 | name: str
74 |
75 | @staticmethod
76 | def from_json(json) -> "Share":
77 | if isinstance(json, (str, bytes, bytearray)):
78 | json = loads(json)
79 | return Share(name=json["name"])
80 |
81 |
82 | @dataclass(frozen=True)
83 | class Schema:
84 | name: str
85 | share: str
86 |
87 | @staticmethod
88 | def from_json(json) -> "Schema":
89 | if isinstance(json, (str, bytes, bytearray)):
90 | json = loads(json)
91 | return Schema(name=json["name"], share=json["share"])
92 |
93 |
94 | @dataclass(frozen=True)
95 | class Table:
96 | name: str
97 | share: str
98 | schema: str
99 |
100 | @staticmethod
101 | def from_json(json) -> "Table":
102 | if isinstance(json, (str, bytes, bytearray)):
103 | json = loads(json)
104 | return Table(name=json["name"], share=json["share"], schema=json["schema"])
105 |
106 |
107 | @dataclass(frozen=True)
108 | class Protocol:
109 | CURRENT: ClassVar[int] = 1
110 |
111 | min_reader_version: int
112 |
113 | def __post_init__(self):
114 | if self.min_reader_version > Protocol.CURRENT:
115 | raise ValueError(
116 | f"The table requires a newer version {self.min_reader_version} to read. "
117 | f"But the current release supports version {Protocol.CURRENT} and below. "
118 | f"Please upgrade to a newer release."
119 | )
120 |
121 | @staticmethod
122 | def from_json(json) -> "Protocol":
123 | if isinstance(json, (str, bytes, bytearray)):
124 | json = loads(json)
125 | return Protocol(min_reader_version=int(json["minReaderVersion"]))
126 |
127 |
128 | @dataclass(frozen=True)
129 | class Format:
130 | provider: str = "parquet"
131 | options: Dict[str, str] = field(default_factory=dict)
132 |
133 | @staticmethod
134 | def from_json(json) -> "Format":
135 | if isinstance(json, (str, bytes, bytearray)):
136 | json = loads(json)
137 | return Format(provider=json.get("provider", "parquet"), options=json.get("options", {}))
138 |
139 |
140 | @dataclass(frozen=True)
141 | class Metadata:
142 | id: Optional[str] = None
143 | name: Optional[str] = None
144 | description: Optional[str] = None
145 | format: Format = field(default_factory=Format)
146 | schema_string: Optional[str] = None
147 | configuration: Dict[str, str] = field(default_factory=dict)
148 | partition_columns: Sequence[str] = field(default_factory=list)
149 |
150 | @staticmethod
151 | def from_json(json) -> "Metadata":
152 | if isinstance(json, (str, bytes, bytearray)):
153 | json = loads(json)
154 | if "configuration" in json:
155 | configuration = json["configuration"]
156 | else:
157 | configuration = {}
158 | return Metadata(
159 | id=json["id"],
160 | name=json.get("name", None),
161 | description=json.get("description", None),
162 | format=Format.from_json(json["format"]),
163 | schema_string=json["schemaString"],
164 | configuration=configuration,
165 | partition_columns=json["partitionColumns"],
166 | )
167 |
168 |
169 | @dataclass(frozen=True)
170 | class FileAction:
171 | url: str
172 | id: str
173 | partition_values: Dict[str, str]
174 | size: int
175 | timestamp: Optional[int] = None
176 | version: Optional[int] = None
177 |
178 | def get_change_type_col_value(self) -> str:
179 | raise ValueError(f"_change_type not supported for {self.url}")
180 |
181 | @staticmethod
182 | def from_json(action_json) -> "FileAction":
183 | if "add" in action_json:
184 | return AddFile.from_json(action_json["add"])
185 | elif "cdf" in action_json:
186 | return AddCdcFile.from_json(action_json["cdf"])
187 | elif "remove" in action_json:
188 | return RemoveFile.from_json(action_json["remove"])
189 | else:
190 | return None
191 |
192 |
193 | @dataclass(frozen=True)
194 | class AddFile(FileAction):
195 | stats: Optional[str] = None
196 |
197 | @staticmethod
198 | def from_json(json) -> "AddFile":
199 | if isinstance(json, (str, bytes, bytearray)):
200 | json = loads(json)
201 | return AddFile(
202 | url=json["url"],
203 | id=json["id"],
204 | partition_values=json["partitionValues"],
205 | size=int(json["size"]),
206 | stats=json.get("stats", None),
207 | timestamp=json.get("timestamp", None),
208 | version=json.get("version", None),
209 | )
210 |
211 | def get_change_type_col_value(self) -> str:
212 | return "insert"
213 |
214 |
215 | @dataclass(frozen=True)
216 | class AddCdcFile(FileAction):
217 | @staticmethod
218 | def from_json(json) -> "AddCdcFile":
219 | if isinstance(json, (str, bytes, bytearray)):
220 | json = loads(json)
221 | return AddCdcFile(
222 | url=json["url"],
223 | id=json["id"],
224 | partition_values=json["partitionValues"],
225 | size=int(json["size"]),
226 | timestamp=json["timestamp"],
227 | version=json["version"],
228 | )
229 |
230 |
231 | @dataclass(frozen=True)
232 | class RemoveFile(FileAction):
233 | @staticmethod
234 | def from_json(json) -> "RemoveFile":
235 | if isinstance(json, (str, bytes, bytearray)):
236 | json = loads(json)
237 | return RemoveFile(
238 | url=json["url"],
239 | id=json["id"],
240 | partition_values=json["partitionValues"],
241 | size=int(json["size"]),
242 | timestamp=json.get("timestamp", None),
243 | version=json.get("version", None),
244 | )
245 |
246 | def get_change_type_col_value(self) -> str:
247 | return "delete"
248 |
249 |
250 | @dataclass(frozen=True)
251 | class CdfOptions:
252 | starting_version: Optional[int] = None
253 | ending_version: Optional[int] = None
254 | starting_timestamp: Optional[str] = None
255 | ending_timestamp: Optional[str] = None
256 |
--------------------------------------------------------------------------------
/server/src/main/scala/io/delta/sharing/server/CloudFileSigner.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (2021) The Delta Lake Project Authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package io.delta.sharing.server
18 |
19 | import java.net.URI
20 | import java.util.Date
21 | import java.util.concurrent.TimeUnit.SECONDS
22 |
23 | import com.amazonaws.HttpMethod
24 | import com.amazonaws.services.s3.model.GeneratePresignedUrlRequest
25 | import com.google.cloud.hadoop.gcsio.StorageResourceId
26 | import com.google.cloud.storage.BlobId
27 | import com.google.cloud.storage.BlobInfo
28 | import com.google.cloud.storage.Storage
29 | import com.google.cloud.storage.StorageOptions
30 | import com.microsoft.azure.storage.{CloudStorageAccount, SharedAccessProtocols, StorageCredentialsSharedAccessSignature}
31 | import com.microsoft.azure.storage.blob.{SharedAccessBlobPermissions, SharedAccessBlobPolicy}
32 | import org.apache.hadoop.conf.Configuration
33 | import org.apache.hadoop.fs.Path
34 | import org.apache.hadoop.fs.azure.{AzureNativeFileSystemStore, NativeAzureFileSystem}
35 | import org.apache.hadoop.fs.azurebfs.{AzureBlobFileSystem, AzureBlobFileSystemStore}
36 | import org.apache.hadoop.fs.azurebfs.services.AuthType
37 | import org.apache.hadoop.fs.s3a.DefaultS3ClientFactory
38 | import org.apache.hadoop.util.ReflectionUtils
39 |
40 |
41 | trait CloudFileSigner {
42 | def sign(path: Path): String
43 | }
44 |
45 | class S3FileSigner(
46 | name: URI,
47 | conf: Configuration,
48 | preSignedUrlTimeoutSeconds: Long) extends CloudFileSigner {
49 |
50 | private val s3Client = ReflectionUtils.newInstance(classOf[DefaultS3ClientFactory], conf)
51 | .createS3Client(name)
52 |
53 | override def sign(path: Path): String = {
54 | val absPath = path.toUri
55 | val bucketName = absPath.getHost
56 | val objectKey = absPath.getPath.stripPrefix("/")
57 | val expiration =
58 | new Date(System.currentTimeMillis() + SECONDS.toMillis(preSignedUrlTimeoutSeconds))
59 | assert(objectKey.nonEmpty, s"cannot get object key from $path")
60 | val request = new GeneratePresignedUrlRequest(bucketName, objectKey)
61 | .withMethod(HttpMethod.GET)
62 | .withExpiration(expiration)
63 | s3Client.generatePresignedUrl(request).toString
64 | }
65 | }
66 |
67 | class AzureFileSigner(
68 | accountName: String,
69 | storageKey: String,
70 | container: String,
71 | preSignedUrlTimeoutSeconds: Long,
72 | objectKeyExtractor: Path => String) extends CloudFileSigner {
73 |
74 | private val (rawAccountName, endpointSuffix) = {
75 | val splits = accountName.split("\\.", 3)
76 | if (splits.length != 3) {
77 | throw new IllegalArgumentException(s"Incorrect account name: $accountName")
78 | }
79 | (splits(0), splits(2))
80 | }
81 |
82 | private def getCloudStorageAccount: CloudStorageAccount = {
83 | val connectionString = Seq(
84 | "DefaultEndpointsProtocol=https",
85 | s"AccountName=$rawAccountName",
86 | s"AccountKey=$storageKey",
87 | s"EndpointSuffix=$endpointSuffix"
88 | ).mkString(";")
89 | CloudStorageAccount.parse(connectionString)
90 | }
91 |
92 | private val cloudStorageAccount = getCloudStorageAccount
93 |
94 | private val blobClient = cloudStorageAccount.createCloudBlobClient()
95 |
96 | private def getAccessPolicy: SharedAccessBlobPolicy = {
97 | val expiration =
98 | new Date(System.currentTimeMillis() + SECONDS.toMillis(preSignedUrlTimeoutSeconds))
99 | val sharedAccessPolicy = new SharedAccessBlobPolicy()
100 | sharedAccessPolicy.setPermissions(java.util.EnumSet.of(SharedAccessBlobPermissions.READ))
101 | sharedAccessPolicy.setSharedAccessExpiryTime(expiration)
102 | sharedAccessPolicy
103 | }
104 |
105 | override def sign(path: Path): String = {
106 | val containerRef = blobClient.getContainerReference(container)
107 | val objectKey = objectKeyExtractor(path)
108 | assert(objectKey.nonEmpty, s"cannot get object key from $path")
109 | val blobRef = containerRef.getBlockBlobReference(objectKey)
110 | val accessPolicy = getAccessPolicy
111 | val sasToken = blobRef.generateSharedAccessSignature(
112 | accessPolicy,
113 | /* headers */ null,
114 | /* groupPolicyIdentifier */ null,
115 | /* ipRange */ null,
116 | SharedAccessProtocols.HTTPS_ONLY
117 | )
118 | val sasTokenCredentials = new StorageCredentialsSharedAccessSignature(sasToken)
119 | sasTokenCredentials.transformUri(blobRef.getUri).toString
120 | }
121 | }
122 |
123 | object WasbFileSigner {
124 | private def getAccountFromAuthority(store: AzureNativeFileSystemStore, uri: URI): String = {
125 | val getAccountFromAuthorityMethod = classOf[AzureNativeFileSystemStore]
126 | .getDeclaredMethod("getAccountFromAuthority", classOf[URI])
127 | getAccountFromAuthorityMethod.setAccessible(true)
128 | getAccountFromAuthorityMethod.invoke(store, uri).asInstanceOf[String]
129 | }
130 |
131 | private def getContainerFromAuthority(store: AzureNativeFileSystemStore, uri: URI): String = {
132 | val getContainerFromAuthorityMethod = classOf[AzureNativeFileSystemStore]
133 | .getDeclaredMethod("getContainerFromAuthority", classOf[URI])
134 | getContainerFromAuthorityMethod.setAccessible(true)
135 | getContainerFromAuthorityMethod.invoke(store, uri).asInstanceOf[String]
136 | }
137 |
138 | def apply(
139 | fs: NativeAzureFileSystem,
140 | uri: URI,
141 | conf: Configuration,
142 | preSignedUrlTimeoutSeconds: Long): CloudFileSigner = {
143 | val accountName = getAccountFromAuthority(fs.getStore, uri)
144 | val accountKey = AzureNativeFileSystemStore.getAccountKeyFromConfiguration(accountName, conf)
145 | val container = getContainerFromAuthority(fs.getStore, uri)
146 | new AzureFileSigner(
147 | accountName,
148 | accountKey,
149 | container,
150 | preSignedUrlTimeoutSeconds,
151 | fs.pathToKey)
152 | }
153 | }
154 |
155 | object AbfsFileSigner {
156 | private def getAbfsStore(fs: AzureBlobFileSystem): AzureBlobFileSystemStore = {
157 | val getAbfsStoreMethod = classOf[AzureBlobFileSystem].getDeclaredMethod("getAbfsStore")
158 | getAbfsStoreMethod.setAccessible(true)
159 | getAbfsStoreMethod.invoke(fs).asInstanceOf[AzureBlobFileSystemStore]
160 | }
161 |
162 | private def getRelativePath(abfsStore: AzureBlobFileSystemStore, path: Path): String = {
163 | val getRelativePathMethod = classOf[AzureBlobFileSystemStore]
164 | .getDeclaredMethod("getRelativePath", classOf[Path])
165 | getRelativePathMethod.setAccessible(true)
166 | getRelativePathMethod.invoke(abfsStore, path).asInstanceOf[String]
167 | }
168 |
169 | private def authorityParts(abfsStore: AzureBlobFileSystemStore, uri: URI): Array[String] = {
170 | val authorityPartsMethod = classOf[AzureBlobFileSystemStore]
171 | .getDeclaredMethod("authorityParts", classOf[URI])
172 | authorityPartsMethod.setAccessible(true)
173 | authorityPartsMethod.invoke(abfsStore, uri).asInstanceOf[Array[String]]
174 | }
175 |
176 | def apply(
177 | fs: AzureBlobFileSystem,
178 | uri: URI,
179 | preSignedUrlTimeoutSeconds: Long): CloudFileSigner = {
180 | val abfsStore = getAbfsStore(fs)
181 | val abfsConfiguration = abfsStore.getAbfsConfiguration
182 | val accountName = abfsConfiguration.accountConf("dummy").stripPrefix("dummy.")
183 | val authType = abfsConfiguration.getAuthType(accountName)
184 | if (authType != AuthType.SharedKey) {
185 | throw new UnsupportedOperationException(s"unsupported auth type: $authType")
186 | }
187 | val accountKey = abfsConfiguration.getStorageAccountKey
188 | val container = authorityParts(abfsStore, uri)(0)
189 | new AzureFileSigner(
190 | accountName,
191 | accountKey,
192 | container,
193 | preSignedUrlTimeoutSeconds,
194 | getRelativePath(abfsStore, _))
195 | }
196 | }
197 |
198 | class GCSFileSigner(
199 | name: URI,
200 | conf: Configuration,
201 | preSignedUrlTimeoutSeconds: Long) extends CloudFileSigner {
202 |
203 | private val storage = StorageOptions.newBuilder.build.getService
204 |
205 | override def sign(path: Path): String = {
206 | val (bucketName, objectName) = GCSFileSigner.getBucketAndObjectNames(path)
207 | assert(objectName.nonEmpty, s"cannot get object key from $path")
208 | val blobInfo = BlobInfo.newBuilder(BlobId.of(bucketName, objectName)).build
209 | storage.signUrl(
210 | blobInfo, preSignedUrlTimeoutSeconds, SECONDS, Storage.SignUrlOption.withV4Signature())
211 | .toString
212 | }
213 | }
214 |
215 | object GCSFileSigner {
216 | def getBucketAndObjectNames(path: Path): (String, String) = {
217 | val resourceId = StorageResourceId.fromUriPath(path.toUri, false /* = allowEmptyObjectName */)
218 | (resourceId.getBucketName, resourceId.getObjectName)
219 | }
220 | }
221 |
--------------------------------------------------------------------------------
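A small sketch of the `GCSFileSigner.getBucketAndObjectNames` helper above splitting a `gs://` path into bucket and object names (the path is a placeholder; producing an actual signed URL additionally requires GCS credentials to be available to `StorageOptions`):

    import org.apache.hadoop.fs.Path

    val (bucket, objectName) =
      GCSFileSigner.getBucketAndObjectNames(new Path("gs://example-bucket/delta/part-00000.parquet"))
    // expected: bucket == "example-bucket", objectName == "delta/part-00000.parquet"
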
/python/dev/lint-python:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #
3 | # Copyright (C) 2021 The Delta Lake Project Authors.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | FLAKE8_BUILD="flake8"
19 | MINIMUM_FLAKE8="3.5.0"
20 |
21 | PYCODESTYLE_BUILD="pycodestyle"
22 | MINIMUM_PYCODESTYLE="2.7.0"
23 |
24 | SPHINX_BUILD="sphinx-build"
25 |
26 | PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE:-python3}"
27 |
28 | MYPY_BUILD="mypy"
29 |
30 | BLACK_BUILD="$PYTHON_EXECUTABLE -m black"
31 |
32 | function satisfies_min_version {
33 | local provided_version="$1"
34 | local expected_version="$2"
35 | echo "$(
36 | "$PYTHON_EXECUTABLE" << EOM
37 | from setuptools.extern.packaging import version
38 | print(version.parse('$provided_version') >= version.parse('$expected_version'))
39 | EOM
40 | )"
41 | }
42 |
43 | function compile_python_test {
44 | local COMPILE_STATUS=
45 | local COMPILE_REPORT=
46 |
47 | if [[ ! "$1" ]]; then
48 | echo "No python files found! Something is very wrong -- exiting."
49 | exit 1;
50 | fi
51 |
52 | # compileall: https://docs.python.org/3/library/compileall.html
53 | echo "starting python compilation test..."
54 | COMPILE_REPORT=$( ("$PYTHON_EXECUTABLE" -B -mcompileall -q -l -x "[/\\\\][.]git" $1) 2>&1)
55 | COMPILE_STATUS=$?
56 |
57 | if [ $COMPILE_STATUS -ne 0 ]; then
58 | echo "Python compilation failed with the following errors:"
59 | echo "$COMPILE_REPORT"
60 | echo "$COMPILE_STATUS"
61 | exit "$COMPILE_STATUS"
62 | else
63 | echo "python compilation succeeded."
64 | echo
65 | fi
66 | }
67 |
68 | function pycodestyle_test {
69 | local PYCODESTYLE_STATUS=
70 | local PYCODESTYLE_REPORT=
71 | local RUN_LOCAL_PYCODESTYLE=
72 | local PYCODESTYLE_VERSION=
73 | local EXPECTED_PYCODESTYLE=
74 | local PYCODESTYLE_SCRIPT_PATH="$SPARK_ROOT_DIR/dev/pycodestyle-$MINIMUM_PYCODESTYLE.py"
75 | local PYCODESTYLE_SCRIPT_REMOTE_PATH="https://raw.githubusercontent.com/PyCQA/pycodestyle/$MINIMUM_PYCODESTYLE/pycodestyle.py"
76 |
77 | if [[ ! "$1" ]]; then
78 | echo "No python files found! Something is very wrong -- exiting."
79 | exit 1;
80 | fi
81 |
82 | # check for locally installed pycodestyle & version
83 | RUN_LOCAL_PYCODESTYLE="False"
84 | if hash "$PYCODESTYLE_BUILD" 2> /dev/null; then
85 | PYCODESTYLE_VERSION="$($PYCODESTYLE_BUILD --version)"
86 | EXPECTED_PYCODESTYLE="$(satisfies_min_version $PYCODESTYLE_VERSION $MINIMUM_PYCODESTYLE)"
87 | if [ "$EXPECTED_PYCODESTYLE" == "True" ]; then
88 | RUN_LOCAL_PYCODESTYLE="True"
89 | fi
90 | fi
91 |
92 | # download the right version or run locally
93 | if [ $RUN_LOCAL_PYCODESTYLE == "False" ]; then
94 | # Get pycodestyle at runtime so that we don't rely on it being installed on the build server.
95 | # See: https://github.com/apache/spark/pull/1744#issuecomment-50982162
96 | # Updated to the latest official version of pep8. pep8 is formally renamed to pycodestyle.
97 | echo "downloading pycodestyle from $PYCODESTYLE_SCRIPT_REMOTE_PATH..."
98 | if [ ! -e "$PYCODESTYLE_SCRIPT_PATH" ]; then
99 | curl --silent -o "$PYCODESTYLE_SCRIPT_PATH" "$PYCODESTYLE_SCRIPT_REMOTE_PATH"
100 | local curl_status="$?"
101 |
102 | if [ "$curl_status" -ne 0 ]; then
103 | echo "Failed to download pycodestyle.py from $PYCODESTYLE_SCRIPT_REMOTE_PATH"
104 | exit "$curl_status"
105 | fi
106 | fi
107 |
108 | echo "starting pycodestyle test..."
109 | PYCODESTYLE_REPORT=$( ("$PYTHON_EXECUTABLE" "$PYCODESTYLE_SCRIPT_PATH" --config=dev/tox.ini $1) 2>&1)
110 | PYCODESTYLE_STATUS=$?
111 | else
112 | # we have the right version installed, so run locally
113 | echo "starting pycodestyle test..."
114 | PYCODESTYLE_REPORT=$( ($PYCODESTYLE_BUILD --config=dev/tox.ini $1) 2>&1)
115 | PYCODESTYLE_STATUS=$?
116 | fi
117 |
118 | if [ $PYCODESTYLE_STATUS -ne 0 ]; then
119 | echo "pycodestyle checks failed:"
120 | echo "$PYCODESTYLE_REPORT"
121 | exit "$PYCODESTYLE_STATUS"
122 | else
123 | echo "pycodestyle checks passed."
124 | echo
125 | fi
126 | }
127 |
128 | function flake8_test {
129 | local FLAKE8_VERSION=
130 | local EXPECTED_FLAKE8=
131 | local FLAKE8_REPORT=
132 | local FLAKE8_STATUS=
133 |
134 | if ! hash "$FLAKE8_BUILD" 2> /dev/null; then
135 | echo "The flake8 command was not found."
136 | echo "flake8 checks failed."
137 | exit 1
138 | fi
139 |
140 | _FLAKE8_VERSION=($($FLAKE8_BUILD --version))
141 | FLAKE8_VERSION="${_FLAKE8_VERSION[0]}"
142 | EXPECTED_FLAKE8="$(satisfies_min_version $FLAKE8_VERSION $MINIMUM_FLAKE8)"
143 |
144 | if [[ "$EXPECTED_FLAKE8" == "False" ]]; then
145 | echo "\
146 | The minimum flake8 version needs to be $MINIMUM_FLAKE8. Your current version is $FLAKE8_VERSION
147 |
148 | flake8 checks failed."
149 | exit 1
150 | fi
151 |
152 | echo "starting $FLAKE8_BUILD test..."
153 | FLAKE8_REPORT=$( ($FLAKE8_BUILD . --count --select=E901,E999,F821,F822,F823,F401,F405 \
154 | --exclude="docs/build/html/reference/api/*.py","build" \
155 | --max-line-length=100 --show-source --statistics) 2>&1)
156 | FLAKE8_STATUS=$?
157 |
158 | if [ "$FLAKE8_STATUS" -ne 0 ]; then
159 | echo "flake8 checks failed:"
160 | echo "$FLAKE8_REPORT"
161 | echo "$FLAKE8_STATUS"
162 | exit "$FLAKE8_STATUS"
163 | else
164 | echo "flake8 checks passed."
165 | echo
166 | fi
167 | }
168 |
169 | function mypy_test {
170 | local MYPY_REPORT=
171 | local MYPY_STATUS=
172 |
173 | # Skip check if mypy is not installed.
174 | if ! hash "$MYPY_BUILD" 2> /dev/null; then
175 | echo "The $MYPY_BUILD command was not found. Skipping mypy checks for now."
176 | echo
177 | return
178 | fi
179 |
180 | echo "starting mypy test..."
181 | MYPY_REPORT=$( ($MYPY_BUILD --package delta_sharing --show-error-context --no-strict-optional --ignore-missing-imports) 2>&1)
182 | MYPY_STATUS=$?
183 |
184 | if [ "$MYPY_STATUS" -ne 0 ]; then
185 | echo "mypy checks failed:"
186 | echo "$MYPY_REPORT"
187 | echo "$MYPY_STATUS"
188 | exit "$MYPY_STATUS"
189 | else
190 | echo "mypy checks passed."
191 | echo
192 | fi
193 | }
194 |
195 | function sphinx_test {
196 | local SPHINX_REPORT=
197 | local SPHINX_STATUS=
198 |
199 | python -c "import sys; assert sys.version_info >= (3, 6), 'Sphinx build requires Python 3.6+, skipping for now.'"
200 | exit_code=$?
201 | if [ $exit_code -ne 0 ]; then
202 | return
203 | fi
204 |
205 | # Check that the documentation builds acceptably, skip check if sphinx is not installed.
206 | if ! hash "$SPHINX_BUILD" 2> /dev/null; then
207 | echo "The $SPHINX_BUILD command was not found. Skipping pydoc checks for now."
208 | echo
209 | return
210 | fi
211 |
212 | echo "starting $SPHINX_BUILD tests..."
213 | pushd docs &> /dev/null
214 | make clean &> /dev/null
215 | # Treat warnings as errors so we stop correctly
216 | SPHINX_REPORT=$( (SPHINXOPTS="-a -W" make html) 2>&1)
217 | SPHINX_STATUS=$?
218 |
219 | if [ "$SPHINX_STATUS" -ne 0 ]; then
220 | echo "$SPHINX_BUILD checks failed:"
221 | echo "$SPHINX_REPORT"
222 | echo
223 | echo "re-running make html to print full warning list:"
224 | make clean &> /dev/null
225 | SPHINX_REPORT=$( (SPHINXOPTS="-a" make html) 2>&1)
226 | echo "$SPHINX_REPORT"
227 | exit "$SPHINX_STATUS"
228 | else
229 | echo "$SPHINX_BUILD checks passed."
230 | echo
231 | fi
232 |
233 | popd &> /dev/null
234 | }
235 |
236 | function black_test {
237 | local BLACK_REPORT=
238 | local BLACK_STATUS=
239 |
240 | # Skip check if black is not installed.
241 | $BLACK_BUILD 2> /dev/null
242 | if [ $? -ne 0 ]; then
243 | echo "The $BLACK_BUILD command was not found. Skipping black checks for now."
244 | echo
245 | return
246 | fi
247 |
248 | echo "starting black test..."
249 | BLACK_REPORT=$( ($BLACK_BUILD delta_sharing --line-length 100 --check --diff) 2>&1)
250 | BLACK_STATUS=$?
251 |
252 | if [ "$BLACK_STATUS" -ne 0 ]; then
253 | echo "black checks failed:"
254 | echo "$BLACK_REPORT"
255 | echo "Please run 'dev/reformat' script."
256 | echo "$BLACK_STATUS"
257 | exit "$BLACK_STATUS"
258 | else
259 | echo "black checks passed."
260 | echo
261 | fi
262 | }
263 |
264 | SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
265 | SPARK_ROOT_DIR="$(dirname "${SCRIPT_DIR}")"
266 |
267 | pushd "$SPARK_ROOT_DIR" &> /dev/null
268 |
269 | PYTHON_SOURCE="$(find . -name "*.py")"
270 |
271 | compile_python_test "$PYTHON_SOURCE"
272 | black_test
273 | pycodestyle_test "$PYTHON_SOURCE"
274 | flake8_test
275 | mypy_test
276 | sphinx_test
277 |
278 | echo
279 | echo "all lint-python tests passed!"
280 |
281 | popd &> /dev/null
282 |
--------------------------------------------------------------------------------