├── python
├── .gitignore
├── requirements.txt
├── run_tests.sh
└── spannergraph
│ ├── __init__.py
│ └── tests.py
├── spark-3.1-spanner-lib
├── src
│ ├── test
│ │ ├── resources
│ │ │ ├── spark-spanner-connector.properties
│ │ │ ├── META-INF
│ │ │ │ └── services
│ │ │ │ │ └── org.apache.spark.sql.sources.DataSourceRegister
│ │ │ ├── acceptance
│ │ │ │ └── read_test_table.py
│ │ │ └── db
│ │ │ │ ├── populate_ddl_pg.sql
│ │ │ │ ├── populate_ddl.sql
│ │ │ │ ├── insert_data_pg.sql
│ │ │ │ ├── insert_data_graph.sql
│ │ │ │ ├── populate_ddl_graph.sql
│ │ │ │ └── insert_data.sql
│ │ └── java
│ │ │ └── com
│ │ │ └── google
│ │ │ └── cloud
│ │ │ └── spark
│ │ │ └── spanner
│ │ │ ├── SpannerTestUtils.java
│ │ │ ├── graph
│ │ │ ├── GraphReadIntegrationTestBase.java
│ │ │ └── GraphErrorHandlingTest.java
│ │ │ ├── acceptance
│ │ │ ├── DataprocServerlessImage11AcceptanceTest.java
│ │ │ ├── DataprocServerlessImage20AcceptanceTest.java
│ │ │ ├── DataprocServerlessImage21AcceptanceTest.java
│ │ │ ├── DataprocImage20AcceptanceTest.java
│ │ │ ├── DataprocImage21AcceptanceTest.java
│ │ │ ├── DataprocImage22AcceptanceTest.java
│ │ │ ├── AcceptanceTestContext.java
│ │ │ └── AcceptanceTestUtils.java
│ │ │ ├── SparkSpannerIntegrationTestBase.java
│ │ │ ├── SpannerTableSchemaTest.java
│ │ │ ├── WriteIntegrationTestBase.java
│ │ │ ├── SpannerScannerTest.java
│ │ │ ├── TestData.java
│ │ │ ├── OpenLineageIntegrationTestBase.java
│ │ │ └── SpannerTableTest.java
│ ├── build
│ │ └── resources
│ │ │ └── spark-spanner-connector.properties
│ └── main
│ │ └── java
│ │ └── com
│ │ └── google
│ │ └── cloud
│ │ └── spark
│ │ └── spanner
│ │ ├── SpannerRowConverter.java
│ │ ├── DefaultSource.java
│ │ ├── SpannerRowConverterDirect.java
│ │ ├── InputPartitionReaderContext.java
│ │ ├── InputPartitionContext.java
│ │ ├── graph
│ │ ├── query
│ │ │ ├── GraphSubQuery.java
│ │ │ ├── SelectField.java
│ │ │ ├── NodeElementTableQuery.java
│ │ │ ├── DirectGraphQuery.java
│ │ │ ├── EdgeElementTableQuery.java
│ │ │ └── SpannerGraphQuery.java
│ │ ├── SpannerGraphScanBuilder.java
│ │ ├── SpannerRowConverterWithSchema.java
│ │ ├── SpannerGraph.java
│ │ ├── SpannerGraphBuilder.java
│ │ ├── SpannerGraphConfigs.java
│ │ ├── PropertyGraph.java
│ │ └── SpannerGraphScanner.java
│ │ ├── SpannerErrorCode.java
│ │ ├── SpannerPartitionReader.java
│ │ ├── SpannerConnectorException.java
│ │ ├── SpannerPartition.java
│ │ ├── BatchClientWithCloser.java
│ │ ├── SpannerInputPartitionContext.java
│ │ ├── SpannerPartitionReaderFactory.java
│ │ ├── SpannerTableSchema.java
│ │ ├── SpannerInputPartitionReaderContext.java
│ │ ├── Spark31SpannerTableProvider.java
│ │ ├── SpannerScanBuilder.java
│ │ └── SpannerScanner.java
└── pom.xml
├── spark-3.1-spanner
├── src
│ └── main
│ │ └── resources
│ │ └── META-INF
│ │ └── services
│ │ └── org.apache.spark.sql.sources.DataSourceRegister
└── pom.xml
├── spark-3.2-spanner
├── src
│ └── main
│ │ └── resources
│ │ └── META-INF
│ │ └── services
│ │ └── org.apache.spark.sql.sources.DataSourceRegister
└── pom.xml
├── spark-3.3-spanner
├── src
│ └── main
│ │ └── resources
│ │ └── META-INF
│ │ └── services
│ │ └── org.apache.spark.sql.sources.DataSourceRegister
└── pom.xml
├── spark-3.2-spanner-lib
├── src
│ ├── test
│ │ └── resources
│ │ │ └── META-INF
│ │ │ └── services
│ │ │ └── org.apache.spark.sql.sources.DataSourceRegister
│ └── main
│ │ └── java
│ │ └── com
│ │ └── google
│ │ └── cloud
│ │ └── spark
│ │ └── spanner
│ │ └── Spark32SpannerTableProvider.java
└── pom.xml
├── spark-3.3-spanner-lib
├── src
│ ├── test
│ │ └── resources
│ │ │ └── META-INF
│ │ │ └── services
│ │ │ └── org.apache.spark.sql.sources.DataSourceRegister
│ └── main
│ │ └── java
│ │ └── com
│ │ └── google
│ │ └── cloud
│ │ └── spark
│ │ └── spanner
│ │ └── Spark33SpannerTableProvider.java
└── pom.xml
├── .mvn
└── wrapper
│ ├── maven-wrapper.properties
│ └── MavenWrapperDownloader.java
├── cloudbuild
├── gcp-settings.xml
├── Dockerfile
├── cloudbuild.yaml
└── presubmit.sh
├── CHANGES.md
├── .gitignore
├── examples
├── SpannerSpark.py
└── SpannerSpark.java
├── spark-spanner-lib-parent
└── pom.xml
├── pom.xml
├── CONTRIBUTING.md
└── mvnw.cmd
/python/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/test/resources/spark-spanner-connector.properties:
--------------------------------------------------------------------------------
1 | connector.version=test
2 |
--------------------------------------------------------------------------------
/python/requirements.txt:
--------------------------------------------------------------------------------
1 | typing_extensions
2 | pyspark
3 | unittest
4 | numpy
5 | pycodestyle
6 | pandas
7 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/build/resources/spark-spanner-connector.properties:
--------------------------------------------------------------------------------
1 | connector.version=${project.version}
2 |
--------------------------------------------------------------------------------
/python/run_tests.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | pycodestyle spannergraph --exclude=tests_gold.py && python3 -m spannergraph.tests
4 |
--------------------------------------------------------------------------------
/python/spannergraph/__init__.py:
--------------------------------------------------------------------------------
1 | """This module enables exporting Spanner Graphs as GraphFrames graphs."""
2 |
3 | from ._connector import SpannerGraphConnector
4 |
--------------------------------------------------------------------------------
/spark-3.1-spanner/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister:
--------------------------------------------------------------------------------
1 | com.google.cloud.spark.spanner.Spark31SpannerTableProvider
2 |
--------------------------------------------------------------------------------
/spark-3.2-spanner/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister:
--------------------------------------------------------------------------------
1 | com.google.cloud.spark.spanner.Spark32SpannerTableProvider
2 |
--------------------------------------------------------------------------------
/spark-3.3-spanner/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister:
--------------------------------------------------------------------------------
1 | com.google.cloud.spark.spanner.Spark33SpannerTableProvider
2 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/test/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister:
--------------------------------------------------------------------------------
1 | com.google.cloud.spark.spanner.Spark31SpannerTableProvider
2 |
--------------------------------------------------------------------------------
/spark-3.2-spanner-lib/src/test/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister:
--------------------------------------------------------------------------------
1 | com.google.cloud.spark.spanner.Spark32SpannerTableProvider
2 |
--------------------------------------------------------------------------------
/spark-3.3-spanner-lib/src/test/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister:
--------------------------------------------------------------------------------
1 | com.google.cloud.spark.spanner.Spark33SpannerTableProvider
2 |
--------------------------------------------------------------------------------
/.mvn/wrapper/maven-wrapper.properties:
--------------------------------------------------------------------------------
1 | distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.8.4/apache-maven-3.8.4-bin.zip
2 | wrapperUrl=https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/0.5.6/maven-wrapper-0.5.6.jar
--------------------------------------------------------------------------------
/cloudbuild/gcp-settings.xml:
--------------------------------------------------------------------------------
1 |
5 |
6 |
--------------------------------------------------------------------------------
/CHANGES.md:
--------------------------------------------------------------------------------
1 | # Release Notes
2 |
3 | ## Next
4 |
5 | ## 1.1.1 - 2025-12-18
6 |
7 | * Prefix column names with table name to avoid ambiguity
8 | * Make table name lookup case-insensitive for GoogleSQL
9 |
10 | ## 1.1.0 - 2024-12-20
11 |
12 | * Add support for exporting graphs from Spanner
13 |
14 | ## 1.0.0 - 2023-11-13
15 |
16 | * Initial release.
17 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/SpannerRowConverter.java:
--------------------------------------------------------------------------------
1 | package com.google.cloud.spark.spanner;
2 |
3 | import com.google.cloud.spanner.Struct;
4 | import org.apache.spark.sql.catalyst.InternalRow;
5 |
6 | /** Converts rows from Spanner query outputs to rows in a Spark DataFrame. */
7 | public interface SpannerRowConverter {
8 |
9 | /** Generates a Spark row based on the Spanner row */
10 | InternalRow convert(Struct spannerRow);
11 | }
12 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 | *.log
3 |
4 | # sbt/maven specific
5 | .cache/
6 | .history/
7 | .lib/
8 | .flattened-pom.xml
9 | dist/*
10 | target/
11 | lib_managed/
12 | src_managed/
13 | project/boot/
14 | project/plugins/project/
15 | dependency-reduced-pom.xml
16 | *.versionsBackup
17 | .mvn/wrapper/maven-wrapper.jar
18 |
19 | # Scala-IDE specific
20 | .scala_dependencies
21 | .worksheet
22 | .idea
23 | *.iml
24 |
25 | # Eclipse IDE Specific files
26 | .classpath
27 | .project
28 | .settings/
29 |
30 | # Mac
31 | .DS_Store
32 |
--------------------------------------------------------------------------------
/cloudbuild/Dockerfile:
--------------------------------------------------------------------------------
1 | # This Dockerfile creates an image for running presubmit tests.
2 | FROM openjdk:8
3 |
4 | RUN \
5 | echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | \
6 | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \
7 | curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | \
8 | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \
9 | apt-get update -y && \
10 | apt-get install google-cloud-cli -y
11 |
12 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/DefaultSource.java:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.cloud.spark.spanner;
16 |
17 | public class DefaultSource extends Spark31SpannerTableProvider {}
18 |
--------------------------------------------------------------------------------
/spark-3.2-spanner-lib/src/main/java/com/google/cloud/spark/spanner/Spark32SpannerTableProvider.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2023 Google LLC
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package com.google.cloud.spark.spanner;
17 |
18 | public class Spark32SpannerTableProvider extends Spark31SpannerTableProvider {}
19 |
--------------------------------------------------------------------------------
/spark-3.3-spanner-lib/src/main/java/com/google/cloud/spark/spanner/Spark33SpannerTableProvider.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2023 Google LLC
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package com.google.cloud.spark.spanner;
17 |
18 | public class Spark33SpannerTableProvider extends Spark32SpannerTableProvider {}
19 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/SpannerTestUtils.java:
--------------------------------------------------------------------------------
1 | package com.google.cloud.spark.spanner;
2 |
3 | import java.time.ZonedDateTime;
4 | import java.util.ArrayList;
5 | import java.util.List;
6 | import org.apache.spark.sql.catalyst.util.GenericArrayData;
7 |
8 | public class SpannerTestUtils {
9 |
10 | public static GenericArrayData zonedDateTimeIterToSparkDates(Iterable<ZonedDateTime> tsIt) {
11 | List<Integer> dest = new ArrayList<>();
12 | tsIt.forEach((ts) -> dest.add(SpannerUtils.zonedDateTimeToSparkDate(ts)));
13 | return new GenericArrayData(dest.toArray(new Integer[0]));
14 | }
15 |
16 | public static GenericArrayData zonedDateTimeIterToSparkTimestamps(Iterable<ZonedDateTime> tsIt) {
17 | List<Long> dest = new ArrayList<>();
18 | tsIt.forEach((ts) -> dest.add(SpannerUtils.zonedDateTimeToSparkTimestamp(ts)));
19 | return new GenericArrayData(dest.toArray(new Long[0]));
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/SpannerRowConverterDirect.java:
--------------------------------------------------------------------------------
1 | package com.google.cloud.spark.spanner;
2 |
3 | import com.google.cloud.spanner.Struct;
4 | import java.io.Serializable;
5 | import org.apache.spark.sql.catalyst.InternalRow;
6 |
7 | /** Converts rows from Spanner query outputs to rows in a Spark DataFrame with 1:1 field mapping. */
8 | public class SpannerRowConverterDirect implements SpannerRowConverter, Serializable {
9 |
10 | /**
11 | * Converts a spanner row to a Spark DataFrame row with 1:1 field mapping.
12 | *
13 | * @param spannerRow the row from Spanner to convert.
14 | * @return a Spark DataFrame row with the same length as the input. Each field in the output is
15 | * converted directly from the field at the same position in the input.
16 | */
17 | @Override
18 | public InternalRow convert(Struct spannerRow) {
19 | return SpannerUtils.spannerStructToInternalRow(spannerRow);
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/InputPartitionReaderContext.java:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.cloud.spark.spanner;
16 |
17 | import java.io.Closeable;
18 | import java.io.IOException;
19 |
20 | public interface InputPartitionReaderContext<T> extends Closeable {
21 |
22 | boolean next() throws IOException;
23 |
24 | T get();
25 | }
26 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/InputPartitionContext.java:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.cloud.spark.spanner;
16 |
17 | import java.io.Serializable;
18 |
19 | public interface InputPartitionContext<T> extends Serializable {
20 |
21 | InputPartitionReaderContext<T> createPartitionReaderContext();
22 |
23 | boolean supportsColumnarReads();
24 | }
25 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/graph/query/GraphSubQuery.java:
--------------------------------------------------------------------------------
1 | package com.google.cloud.spark.spanner.graph.query;
2 |
3 | import com.google.cloud.Tuple;
4 | import com.google.cloud.spanner.Statement;
5 | import com.google.cloud.spark.spanner.SpannerRowConverter;
6 | import java.util.List;
7 | import org.apache.spark.sql.types.StructField;
8 | import org.apache.spark.sql.types.StructType;
9 |
10 | /** Handles a single SQL query (e.g., for an individual element table, for a GQL query) */
11 | public interface GraphSubQuery {
12 |
13 | /**
14 | * Get the statement for this sub-query and a row converter that converts outputs to a row in the
15 | * DataFrame
16 | *
17 | * @param dataframeSchema schema of the DataFrame that will store the outputs
18 | * @return the statement and the row converter
19 | */
20 | Tuple<Statement, SpannerRowConverter> getQueryAndConverter(StructType dataframeSchema);
21 |
22 | /** Get a list of fields that this sub-query will output */
23 | List<StructField> getOutputSparkFields();
24 | }
25 |
--------------------------------------------------------------------------------
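Note: as a minimal sketch (not code from this repository), the snippet below shows how the two halves returned by getQueryAndConverter fit together: the Statement is executed against Spanner, and the SpannerRowConverter maps each result Struct onto a row of the target DataFrame schema. The helper name runSubQuery and the use of a bare ReadContext are assumptions for illustration only.

    import com.google.cloud.Tuple;
    import com.google.cloud.spanner.ReadContext;
    import com.google.cloud.spanner.ResultSet;
    import com.google.cloud.spanner.Statement;
    import com.google.cloud.spark.spanner.SpannerRowConverter;
    import com.google.cloud.spark.spanner.graph.query.GraphSubQuery;
    import java.util.ArrayList;
    import java.util.List;
    import org.apache.spark.sql.catalyst.InternalRow;
    import org.apache.spark.sql.types.StructType;

    final class GraphSubQueryExample {
      // Runs one sub-query and converts its output into DataFrame rows.
      static List<InternalRow> runSubQuery(
          GraphSubQuery subQuery, StructType dataframeSchema, ReadContext readContext) {
        Tuple<Statement, SpannerRowConverter> queryAndConverter =
            subQuery.getQueryAndConverter(dataframeSchema);
        List<InternalRow> rows = new ArrayList<>();
        try (ResultSet resultSet = readContext.executeQuery(queryAndConverter.x())) {
          while (resultSet.next()) {
            rows.add(queryAndConverter.y().convert(resultSet.getCurrentRowAsStruct()));
          }
        }
        return rows;
      }
    }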
/spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/graph/query/SelectField.java:
--------------------------------------------------------------------------------
1 | package com.google.cloud.spark.spanner.graph.query;
2 |
3 | import java.util.Collection;
4 | import java.util.stream.Collectors;
5 |
6 | /** Represents a field in the SELECT clause */
7 | public class SelectField {
8 |
9 | public final String inputExpression;
10 | public final String outputName;
11 |
12 | public SelectField(String columnName) {
13 | this(columnName, columnName);
14 | }
15 |
16 | public SelectField(String inputExpression, String outputName) {
17 | this.inputExpression = inputExpression.trim();
18 | this.outputName = outputName.trim();
19 | }
20 |
21 | @Override
22 | public String toString() {
23 | if (inputExpression.equals(outputName)) {
24 | return outputName;
25 | } else {
26 | return inputExpression + " AS " + outputName;
27 | }
28 | }
29 |
30 | public static String join(Collection<SelectField> selectFields) {
31 | return selectFields.stream().map(SelectField::toString).collect(Collectors.joining(", "));
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
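Note: a small illustration of the class above. SelectField.join renders the SELECT list, adding an AS alias only when the output name differs from the input expression.

    import java.util.Arrays;

    String selectList =
        SelectField.join(
            Arrays.asList(
                new SelectField("id"),               // renders as "id"
                new SelectField("n.name", "name"))); // renders as "n.name AS name"
    // selectList == "id, n.name AS name"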
/spark-3.2-spanner/pom.xml:
--------------------------------------------------------------------------------
1 |
4 | 4.0.0
5 |
6 | com.google.cloud.spark.spanner
7 | spark-spanner-parent
8 | ${revision}
9 | ../spark-spanner-parent
10 |
11 |
12 | spark-3.2-spanner
13 | ${revision}
14 | spanner DataSource v2 for Spark 3.2
15 |
16 | 3.2.0
17 | false
18 |
19 |
20 |
21 | Apache License, Version 2.0
22 | http://www.apache.org/licenses/LICENSE-2.0.txt
23 | repo
24 |
25 |
26 |
27 |
28 | ${project.groupId}
29 | spark-3.2-spanner-lib
30 | ${project.version}
31 |
32 |
33 |
34 |
--------------------------------------------------------------------------------
/spark-3.3-spanner/pom.xml:
--------------------------------------------------------------------------------
1 |
4 | 4.0.0
5 |
6 | com.google.cloud.spark.spanner
7 | spark-spanner-parent
8 | ${revision}
9 | ../spark-spanner-parent
10 |
11 |
12 | spark-3.3-spanner
13 | ${revision}
14 | spanner DataSource v2 for Spark 3.3
15 |
16 | 3.3.0
17 | false
18 |
19 |
20 |
21 | Apache License, Version 2.0
22 | http://www.apache.org/licenses/LICENSE-2.0.txt
23 | repo
24 |
25 |
26 |
27 |
28 | ${project.groupId}
29 | spark-3.3-spanner-lib
30 | ${project.version}
31 |
32 |
33 |
34 |
--------------------------------------------------------------------------------
/spark-3.2-spanner-lib/pom.xml:
--------------------------------------------------------------------------------
1 |
4 | 4.0.0
5 |
6 | com.google.cloud.spark.spanner
7 | spark-spanner-lib-parent
8 | ${revision}
9 | ../spark-spanner-lib-parent
10 |
11 |
12 | spark-3.2-spanner-lib
13 | ${revision}
14 | Connector code for spanner DataSource v2 for Spark 3.2
15 |
16 | 3.2.0
17 | true
18 |
19 |
20 |
21 | Apache License, Version 2.0
22 | http://www.apache.org/licenses/LICENSE-2.0.txt
23 | repo
24 |
25 |
26 |
27 |
28 | ${project.groupId}
29 | spark-3.1-spanner
30 | ${project.version}
31 |
32 |
33 |
34 |
--------------------------------------------------------------------------------
/spark-3.3-spanner-lib/pom.xml:
--------------------------------------------------------------------------------
1 |
4 | 4.0.0
5 |
6 | com.google.cloud.spark.spanner
7 | spark-spanner-lib-parent
8 | ${revision}
9 | ../spark-spanner-lib-parent
10 |
11 |
12 | spark-3.3-spanner-lib
13 | ${revision}
14 | Connector code for spanner DataSource v2 for Spark 3.3
15 |
16 | 3.3.0
17 | true
18 |
19 |
20 |
21 | Apache License, Version 2.0
22 | http://www.apache.org/licenses/LICENSE-2.0.txt
23 | repo
24 |
25 |
26 |
27 |
28 | ${project.groupId}
29 | spark-3.2-spanner
30 | ${project.version}
31 |
32 |
33 |
34 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/SpannerErrorCode.java:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.cloud.spark.spanner;
16 |
17 | public enum SpannerErrorCode {
18 | SPANNER_FAILED_TO_EXECUTE_QUERY(0),
19 | SPANNER_FAILED_TO_PARSE_OPTIONS(1),
20 | COLUMNAR_READS_NOT_SUPPORTED(2),
21 | WRITES_NOT_SUPPORTED(3),
22 | RESOURCE_EXHAUSTED_ON_SPANNER(4),
23 | DATABASE_DIALECT_NOT_SUPPORTED(5),
24 | DECIMAL_OUT_OF_RANGE(6),
25 |
26 | // Should be last
27 | UNSUPPORTED(9998),
28 | UNKNOWN(9999);
29 |
30 | final int code;
31 |
32 | SpannerErrorCode(int code) {
33 | this.code = code;
34 | }
35 |
36 | public int getCode() {
37 | return code;
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/examples/SpannerSpark.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Copyright 2023 Google LLC. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from pyspark.sql import SparkSession
18 |
19 | def main():
20 | table = "TABLE_NAME"
21 | spark = SparkSession.builder.appName("SparkSpannerDemo").getOrCreate()
22 | df = spark.read.format('cloud-spanner') \
23 | .option("projectId", "") \
24 | .option("instanceId", "") \
25 | .option("databaseId", "") \
26 | .option("enableDataBoost", "true") \
27 | .option("table", "") \
28 | .load()
29 | df.printSchema()
30 | df.show()
31 |
32 | if __name__ == '__main__':
33 | main()
34 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/graph/GraphReadIntegrationTestBase.java:
--------------------------------------------------------------------------------
1 | package com.google.cloud.spark.spanner.graph;
2 |
3 | import com.google.cloud.spark.spanner.SparkSpannerIntegrationTestBase;
4 | import com.google.gson.Gson;
5 | import org.apache.spark.sql.DataFrameReader;
6 | import org.apache.spark.sql.Dataset;
7 | import org.apache.spark.sql.Row;
8 |
9 | public class GraphReadIntegrationTestBase extends SparkSpannerIntegrationTestBase {
10 | public DataFrameReader flexibleGraphReader(SpannerGraphConfigs configs) {
11 | DataFrameReader reader =
12 | reader().option("enableDataBoost", "true").option("graph", "FlexibleGraph");
13 | return configs == null ? reader : reader.option("configs", new Gson().toJson(configs));
14 | }
15 |
16 | public DataFrameReader musicGraphReader(SpannerGraphConfigs configs) {
17 | DataFrameReader reader =
18 | reader().option("enableDataBoost", "true").option("graph", "MusicGraph");
19 | return configs == null ? reader : reader.option("configs", new Gson().toJson(configs));
20 | }
21 |
22 | public static Dataset<Row> readNodes(DataFrameReader reader) {
23 | return reader.option("type", "node").load();
24 | }
25 |
26 | public static Dataset<Row> readEdges(DataFrameReader reader) {
27 | return reader.option("type", "edge").load();
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
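Note: the test base above exercises the graph read path through DataFrameReader options ("graph", "type", "enableDataBoost", and optionally "configs"). Purely as a hedged sketch, with the project, instance, database, and graph names as placeholders, a standalone job reading node and edge DataFrames might look like this:

    import org.apache.spark.sql.DataFrameReader;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;

    public class GraphReadExample {
      public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("GraphReadExample").getOrCreate();
        DataFrameReader reader =
            spark.read().format("cloud-spanner")
                .option("projectId", "my-project")
                .option("instanceId", "my-instance")
                .option("databaseId", "my-database")
                .option("enableDataBoost", "true")
                .option("graph", "FlexibleGraph");
        Dataset<Row> nodes = reader.option("type", "node").load(); // node element tables
        Dataset<Row> edges = reader.option("type", "edge").load(); // edge element tables
        nodes.printSchema();
        edges.printSchema();
      }
    }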
/spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/acceptance/DataprocServerlessImage11AcceptanceTest.java:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.cloud.spark.spanner.acceptance;
16 |
17 | import org.junit.runner.RunWith;
18 | import org.junit.runners.JUnit4;
19 |
20 | /**
21 | * The acceptance test on Dataproc Serverless. The test has to run in a project with
22 | * requireOsLogin disabled; otherwise, an org policy violation error will be thrown.
23 | */
24 | @RunWith(JUnit4.class)
25 | public final class DataprocServerlessImage11AcceptanceTest
26 | extends DataprocServerlessAcceptanceTestBase {
27 |
28 | private static AcceptanceTestContext context;
29 |
30 | public DataprocServerlessImage11AcceptanceTest() {
31 | super("spark-3.1-spanner", "1.1");
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/acceptance/DataprocServerlessImage20AcceptanceTest.java:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.cloud.spark.spanner.acceptance;
16 |
17 | import org.junit.runner.RunWith;
18 | import org.junit.runners.JUnit4;
19 |
20 | /**
21 | * The acceptance test on Dataproc Serverless. The test has to run in a project with
22 | * requireOsLogin disabled; otherwise, an org policy violation error will be thrown.
23 | */
24 | @RunWith(JUnit4.class)
25 | public final class DataprocServerlessImage20AcceptanceTest
26 | extends DataprocServerlessAcceptanceTestBase {
27 |
28 | private static AcceptanceTestContext context;
29 |
30 | public DataprocServerlessImage20AcceptanceTest() {
31 | super("spark-3.1-spanner", "2.0");
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/acceptance/DataprocServerlessImage21AcceptanceTest.java:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.cloud.spark.spanner.acceptance;
16 |
17 | import org.junit.runner.RunWith;
18 | import org.junit.runners.JUnit4;
19 |
20 | /**
21 | * The acceptance test on Dataproc Serverless. The test has to run in a project with
22 | * requireOsLogin disabled; otherwise, an org policy violation error will be thrown.
23 | */
24 | @RunWith(JUnit4.class)
25 | public final class DataprocServerlessImage21AcceptanceTest
26 | extends DataprocServerlessAcceptanceTestBase {
27 |
28 | private static AcceptanceTestContext context;
29 |
30 | public DataprocServerlessImage21AcceptanceTest() {
31 | super("spark-3.1-spanner", "2.1");
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/SpannerPartitionReader.java:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.cloud.spark.spanner;
16 |
17 | import java.io.IOException;
18 | import org.apache.spark.sql.connector.read.PartitionReader;
19 |
20 | public class SpannerPartitionReader<T> implements PartitionReader<T> {
21 |
22 | private InputPartitionReaderContext<T> context;
23 |
24 | public SpannerPartitionReader(InputPartitionReaderContext<T> context) {
25 | this.context = context;
26 | }
27 |
28 | @Override
29 | public boolean next() throws IOException {
30 | return this.context.next();
31 | }
32 |
33 | @Override
34 | public T get() {
35 | return this.context.get();
36 | }
37 |
38 | @Override
39 | public void close() throws IOException {
40 | this.context.close();
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/spark-3.1-spanner/pom.xml:
--------------------------------------------------------------------------------
1 |
4 | 4.0.0
5 |
6 | com.google.cloud.spark.spanner
7 | spark-spanner-parent
8 | ${revision}
9 | ../spark-spanner-parent
10 |
11 |
12 | spark-3.1-spanner
13 | ${revision}
14 | spanner DataSource v2 for Spark 3.1
15 |
16 | 3.1.0
17 | false
18 |
19 |
20 |
21 | Apache License, Version 2.0
22 | http://www.apache.org/licenses/LICENSE-2.0.txt
23 | repo
24 |
25 |
26 |
27 |
28 | ${project.groupId}
29 | spark-3.1-spanner-lib
30 | ${project.version}
31 |
32 |
33 |
34 | org.apache.spark
35 | spark-sql_2.12
36 | 3.1.1
37 | provided
38 |
39 |
40 |
41 |
42 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/test/resources/acceptance/read_test_table.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Copyright 2023 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | import os
17 | import sys
18 | from pyspark.sql import SparkSession
19 | from pyspark.sql.functions import col
20 |
21 |
22 | def main():
23 | spark = SparkSession.builder.appName('Acceptance Test on Spark').getOrCreate()
24 |
25 | table = 'ATable'
26 | df = spark.read.format('cloud-spanner') \
27 | .option("projectId", sys.argv[2]) \
28 | .option("instanceId", sys.argv[3]) \
29 | .option("databaseId", sys.argv[4]) \
30 | .option("table", table) \
31 | .load(table)
32 |
33 | print('The resulting schema is')
34 | df.printSchema()
35 |
36 | df = df.select("A", "B", "D", "E")
37 | df = df.groupBy().sum('A')
38 |
39 | print('Table:')
40 | df.show()
41 |
42 | df.write.csv(sys.argv[1])
43 |
44 | if __name__ == '__main__':
45 | main()
46 |
--------------------------------------------------------------------------------
/examples/SpannerSpark.java:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.cloud.spark.spanner.examples;
16 |
17 | import org.apache.spark.sql.Dataset;
18 | import org.apache.spark.sql.Row;
19 | import org.apache.spark.sql.SparkSession;
20 |
21 | public class SpannerSpark {
22 | public static void main(String[] args) {
23 | SparkSession spark = SparkSession
24 | .builder()
25 | .appName("cloud spanner for census 2020")
26 | .getOrCreate();
27 |
28 |
29 | Dataset<Row> df = spark.read()
30 | .format("cloud-spanner")
31 | .option("table", "people")
32 | .option("projectId", System.getenv("SPANNER_SPARK_PROJECT"))
33 | .option("instanceId", System.getenv("SPANNER_SPARK_INSTANCE"))
34 | .option("database", System.getenv("SPANNER_SPARK_DATABASE"))
35 | .load();
36 | df.show();
37 | df.printSchema();
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/graph/SpannerGraphScanBuilder.java:
--------------------------------------------------------------------------------
1 | package com.google.cloud.spark.spanner.graph;
2 |
3 | import com.google.cloud.spark.spanner.SpannerUtils;
4 | import com.google.common.collect.ImmutableSet;
5 | import java.util.Set;
6 | import javax.annotation.Nullable;
7 | import org.apache.spark.sql.connector.read.Scan;
8 | import org.apache.spark.sql.connector.read.ScanBuilder;
9 | import org.apache.spark.sql.connector.read.SupportsPushDownRequiredColumns;
10 | import org.apache.spark.sql.types.StructType;
11 |
12 | /** Builder for {@link SpannerGraphScanner} */
13 | public class SpannerGraphScanBuilder implements ScanBuilder, SupportsPushDownRequiredColumns {
14 |
15 | private final SpannerGraph spannerGraph;
16 | private @Nullable Set<String> requiredColumns;
17 |
18 | public SpannerGraphScanBuilder(SpannerGraph spannerGraph) {
19 | this.spannerGraph = spannerGraph;
20 | }
21 |
22 | @Override
23 | public Scan build() {
24 | return new SpannerGraphScanner(
25 | spannerGraph.options,
26 | spannerGraph.configs.extraHeaders,
27 | spannerGraph.readTimestamp,
28 | spannerGraph.configs.partitionSizeBytes,
29 | spannerGraph.dataBoostEnabled,
30 | spannerGraph.spannerGraphQuery,
31 | requiredColumns,
32 | SpannerUtils.pruneSchema(spannerGraph.schema(), requiredColumns));
33 | }
34 |
35 | @Override
36 | public void pruneColumns(StructType requiredSchema) {
37 | this.requiredColumns = ImmutableSet.copyOf(requiredSchema.fieldNames());
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/SpannerConnectorException.java:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.cloud.spark.spanner;
16 |
17 | public class SpannerConnectorException extends RuntimeException {
18 |
19 | final SpannerErrorCode errorCode;
20 |
21 | public SpannerConnectorException(String message) {
22 | this(SpannerErrorCode.UNKNOWN, message);
23 | }
24 |
25 | public SpannerConnectorException(String message, Throwable cause) {
26 | this(SpannerErrorCode.UNKNOWN, message, cause);
27 | }
28 |
29 | public SpannerConnectorException(SpannerErrorCode errorCode, String message) {
30 | super(message);
31 | this.errorCode = errorCode;
32 | }
33 |
34 | public SpannerConnectorException(SpannerErrorCode errorCode, String message, Throwable cause) {
35 | super(message, cause);
36 | this.errorCode = errorCode;
37 | }
38 |
39 | public SpannerErrorCode getErrorCode() {
40 | return errorCode;
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
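Note: a brief hypothetical usage of the exception type above; connector code attaches a SpannerErrorCode so that callers can branch on the failure category instead of parsing messages. The message text here is illustrative only.

    try {
      // e.g. thrown while validating read options
      throw new SpannerConnectorException(
          SpannerErrorCode.SPANNER_FAILED_TO_PARSE_OPTIONS, "missing required option: table");
    } catch (SpannerConnectorException e) {
      if (e.getErrorCode() == SpannerErrorCode.SPANNER_FAILED_TO_PARSE_OPTIONS) {
        // surface a configuration error to the caller
      }
    }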
/spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/acceptance/DataprocImage20AcceptanceTest.java:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.cloud.spark.spanner.acceptance;
16 |
17 | import com.google.common.collect.ImmutableList;
18 | import org.junit.AfterClass;
19 | import org.junit.BeforeClass;
20 | import org.junit.runner.RunWith;
21 | import org.junit.runners.JUnit4;
22 |
23 | @RunWith(JUnit4.class)
24 | public final class DataprocImage20AcceptanceTest extends DataprocAcceptanceTestBase {
25 |
26 | private static AcceptanceTestContext context;
27 |
28 | public DataprocImage20AcceptanceTest() {
29 | super(context);
30 | }
31 |
32 | @BeforeClass
33 | public static void setup() throws Exception {
34 | context =
35 | DataprocAcceptanceTestBase.setup("2.0-debian10", "spark-3.1-spanner", ImmutableList.of());
36 | }
37 |
38 | @AfterClass
39 | public static void tearDown() throws Exception {
40 | DataprocAcceptanceTestBase.tearDown(context);
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/acceptance/DataprocImage21AcceptanceTest.java:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.cloud.spark.spanner.acceptance;
16 |
17 | import com.google.common.collect.ImmutableList;
18 | import org.junit.AfterClass;
19 | import org.junit.BeforeClass;
20 | import org.junit.runner.RunWith;
21 | import org.junit.runners.JUnit4;
22 |
23 | @RunWith(JUnit4.class)
24 | public final class DataprocImage21AcceptanceTest extends DataprocAcceptanceTestBase {
25 |
26 | private static AcceptanceTestContext context;
27 |
28 | public DataprocImage21AcceptanceTest() {
29 | super(context);
30 | }
31 |
32 | @BeforeClass
33 | public static void setup() throws Exception {
34 | context =
35 | DataprocAcceptanceTestBase.setup("2.1-debian11", "spark-3.1-spanner", ImmutableList.of());
36 | }
37 |
38 | @AfterClass
39 | public static void tearDown() throws Exception {
40 | DataprocAcceptanceTestBase.tearDown(context);
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/acceptance/DataprocImage22AcceptanceTest.java:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.cloud.spark.spanner.acceptance;
16 |
17 | import com.google.common.collect.ImmutableList;
18 | import org.junit.AfterClass;
19 | import org.junit.BeforeClass;
20 | import org.junit.runner.RunWith;
21 | import org.junit.runners.JUnit4;
22 |
23 | @RunWith(JUnit4.class)
24 | public final class DataprocImage22AcceptanceTest extends DataprocAcceptanceTestBase {
25 |
26 | private static AcceptanceTestContext context;
27 |
28 | public DataprocImage22AcceptanceTest() {
29 | super(context);
30 | }
31 |
32 | @BeforeClass
33 | public static void setup() throws Exception {
34 | context =
35 | DataprocAcceptanceTestBase.setup("2.2-debian12", "spark-3.1-spanner", ImmutableList.of());
36 | }
37 |
38 | @AfterClass
39 | public static void tearDown() throws Exception {
40 | DataprocAcceptanceTestBase.tearDown(context);
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/cloudbuild/cloudbuild.yaml:
--------------------------------------------------------------------------------
1 | steps:
2 | # 1. Create a Docker image for running the presubmit tests
3 | - name: 'gcr.io/cloud-builders/docker'
4 | id: 'docker-build'
5 | args: ['build', '--tag=gcr.io/$PROJECT_ID/dataproc-spark-spanner-connector-presubmit', '-f', 'cloudbuild/Dockerfile', '.']
6 |
7 | # 2. Fetch maven and dependencies
8 | - name: 'gcr.io/$PROJECT_ID/dataproc-spark-spanner-connector-presubmit'
9 | id: 'init'
10 | waitFor: ['docker-build']
11 | entrypoint: 'bash'
12 | args: ['/workspace/cloudbuild/presubmit.sh', 'init']
13 |
14 | # 3. Run integration tests with real Spanner databases
15 | - name: 'gcr.io/$PROJECT_ID/dataproc-spark-spanner-connector-presubmit'
16 | id: 'integration-real-spanner'
17 | waitFor: ['init']
18 | entrypoint: 'bash'
19 | args: ['/workspace/cloudbuild/presubmit.sh', 'integrationtest-real-spanner']
20 | env:
21 | - 'SPANNER_PROJECT_ID=$PROJECT_ID'
22 | - 'SPANNER_INSTANCE_ID=test-instance'
23 | - 'SPANNER_DATABASE_ID=testdb'
24 |
25 | # 4. Run acceptance tests by creating real Dataproc clusters.
26 | # TODO: Make the acceptance test run in parallel with integration-real-spanner.
27 | - name: 'gcr.io/$PROJECT_ID/dataproc-spark-spanner-connector-presubmit'
28 | id: 'acceptance-test'
29 | waitFor: ['init']
30 | entrypoint: 'bash'
31 | args: ['/workspace/cloudbuild/presubmit.sh', 'acceptance-test']
32 | env:
33 | - 'SPANNER_PROJECT_ID=$PROJECT_ID'
34 | - 'GOOGLE_CLOUD_PROJECT=$PROJECT_ID'
35 | - 'SPANNER_INSTANCE_ID=accept-testins'
36 | - 'SPANNER_DATABASE_ID=accept-testdb'
37 | - 'ACCEPTANCE_TEST_BUCKET=spark-spanner-connector-acceptance-test'
38 |
39 |
40 | timeout: 3600s
41 | options:
42 | machineType: 'N1_HIGHCPU_32'
43 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/acceptance/AcceptanceTestContext.java:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.cloud.spark.spanner.acceptance;
16 |
17 | public class AcceptanceTestContext {
18 | final String testId;
19 | final String clusterId;
20 | final String connectorJarUri;
21 | final String testBaseGcsDir;
22 | final String spannerDataset;
23 | final String spannerTable;
24 |
25 | public AcceptanceTestContext(
26 | String testId, String clusterId, String testBaseGcsDir, String connectorJarUri) {
27 | this.testId = testId;
28 | this.clusterId = clusterId;
29 | this.testBaseGcsDir = testBaseGcsDir;
30 | this.connectorJarUri = connectorJarUri;
31 | this.spannerDataset = "spanner_acceptance_test_dataset_" + testId.replace("-", "_");
32 | this.spannerTable = "spanner_acceptance_test_table_" + testId.replace("-", "_");
33 | }
34 |
35 | public String getScriptUri(String testName) {
36 | return testBaseGcsDir + "/" + testName + "/script.py";
37 | }
38 |
39 | public String getResultsDirUri(String testName) {
40 | return testBaseGcsDir + "/" + testName + "/results";
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/SpannerPartition.java:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.cloud.spark.spanner;
16 |
17 | import java.io.Serializable;
18 | import org.apache.spark.Partition;
19 | import org.apache.spark.sql.catalyst.InternalRow;
20 | import org.apache.spark.sql.connector.read.InputPartition;
21 |
22 | public class SpannerPartition implements Partition, InputPartition, Serializable {
23 |
24 | private final com.google.cloud.spanner.Partition partition;
25 | private final int index;
26 | private final InputPartitionContext<InternalRow> ctx;
27 |
28 | public SpannerPartition(
29 | com.google.cloud.spanner.Partition partition,
30 | int index,
31 | InputPartitionContext<InternalRow> ctx) {
32 | this.index = index;
33 | this.partition = partition;
34 | this.ctx = ctx;
35 | }
36 |
37 | @Override
38 | public int index() {
39 | return this.index;
40 | }
41 |
42 | @Override
43 | public String toString() {
44 | return "SpannerPartition{index=" + this.index + ", stream=" + this.partition + "}";
45 | }
46 |
47 | public InputPartitionContext<InternalRow> getContext() {
48 | return this.ctx;
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
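Note: a sketch (assumed method name, not the repository's factory code; the real wiring presumably lives in SpannerPartitionReaderFactory, listed later) of how the partition types compose: a SpannerPartition carries an InputPartitionContext, whose reader context is wrapped in a SpannerPartitionReader that Spark drives through next()/get()/close().

    import org.apache.spark.sql.catalyst.InternalRow;
    import org.apache.spark.sql.connector.read.InputPartition;
    import org.apache.spark.sql.connector.read.PartitionReader;

    PartitionReader<InternalRow> createReader(InputPartition partition) {
      InputPartitionContext<InternalRow> ctx = ((SpannerPartition) partition).getContext();
      return new SpannerPartitionReader<>(ctx.createPartitionReaderContext());
    }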
/cloudbuild/presubmit.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2023 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the 'License');
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an 'AS IS' BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | set -euxo pipefail
17 |
18 | readonly MVN="./mvnw -B -e -s /workspace/cloudbuild/gcp-settings.xml -Dmaven.repo.local=/workspace/.repository"
19 | readonly STEP=$1
20 |
21 | cd /workspace
22 |
23 | case $STEP in
24 | # Download maven and all the dependencies
25 | init)
26 | $MVN install -DskipTests -P3.1
27 | exit
28 | ;;
29 |
30 | # Run integration tests against a real Spanner instance.
31 | integrationtest-real-spanner)
32 | # The Spanner project, instance, and database used by the tests are
33 | # supplied via environment variables set in cloudbuild.yaml.
34 | $MVN test -T 1C "-Dtest=SpannerTableTest,SpannerScanBuilderTest,SpannerInputPartitionReaderContextTest,SparkFilterUtilsTest,ReadIntegrationTestBase,WriteIntegrationTestBase,FunctionsAndExpressionsTest,ReadIntegrationTestPg,GraphReadIntegrationTest,GraphErrorHandlingTest"
35 | ;;
36 |
37 | acceptance-test)
38 | $MVN test -T 1C -Dtest=DataprocImage20AcceptanceTest
39 | $MVN test -T 1C -Dtest=DataprocImage21AcceptanceTest
40 | $MVN test -T 1C -Dtest=DataprocImage22AcceptanceTest
41 | ;;
42 |
43 | *)
44 | echo "Unknown step $STEP"
45 | exit 1
46 | ;;
47 | esac
48 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/BatchClientWithCloser.java:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.cloud.spark.spanner;
16 |
17 | import com.google.cloud.spanner.BatchClient;
18 | import com.google.cloud.spanner.DatabaseClient;
19 | import com.google.cloud.spanner.Spanner;
20 |
21 | public class BatchClientWithCloser implements AutoCloseable {
22 | public BatchClient batchClient;
23 | public DatabaseClient databaseClient;
24 | private Spanner spanner;
25 |
26 | public BatchClientWithCloser(
27 | Spanner spanner, BatchClient batchClient, DatabaseClient databaseClient) {
28 | this.spanner = spanner;
29 | this.batchClient = batchClient;
30 | this.databaseClient = databaseClient;
31 | }
32 |
33 | /*
34 | * close is a runtime hook for AutoCloseable to properly shut down resources
35 | * before this object is garbage collected. It is useful in scenarios such as
36 | * asynchronous processing for which we won't have a deterministic time/scope
37 | * for when a Spanner object will be closed.
38 | */
39 | @Override
40 | public void close() {
41 | if (this.spanner != null) {
42 | this.spanner.close();
43 | this.spanner = null;
44 | }
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/pom.xml:
--------------------------------------------------------------------------------
1 |
4 | 4.0.0
5 |
6 | com.google.cloud.spark.spanner
7 | spark-spanner-lib-parent
8 | ${revision}
9 | ../spark-spanner-lib-parent
10 |
11 |
12 | spark-3.1-spanner-lib
13 | ${revision}
14 | Connector code for spanner DataSource v2 for Spark 3.1
15 |
16 | 3.1.0
17 | true
18 |
19 |
20 |
21 | Apache License, Version 2.0
22 | http://www.apache.org/licenses/LICENSE-2.0.txt
23 | repo
24 |
25 |
26 |
27 |
28 | com.google.cloud
29 | google-cloud-spanner
30 |
31 |
32 |
33 | com.google.cloud
34 | google-cloud-dataproc
35 | test
36 |
37 |
38 |
39 | com.google.cloud
40 | google-cloud-storage
41 | test
42 |
43 |
44 |
45 | com.google.code.gson
46 | gson
47 | 2.10.1
48 |
49 |
50 |
51 | com.fasterxml.jackson.core
52 | jackson-databind
53 |
54 |
55 |
56 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/SpannerInputPartitionContext.java:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.cloud.spark.spanner;
16 |
17 | import com.google.cloud.spanner.BatchTransactionId;
18 | import com.google.cloud.spanner.Partition;
19 | import java.io.Serializable;
20 | import org.apache.spark.sql.catalyst.InternalRow;
21 |
22 | public class SpannerInputPartitionContext
23 | implements InputPartitionContext<InternalRow>, Serializable {
24 |
25 | private final BatchTransactionId batchTransactionId;
26 | private final Partition partition;
27 | private final String mapAsJSONStr;
28 | private final SpannerRowConverter rowConverter;
29 |
30 | public SpannerInputPartitionContext(
31 | Partition partition,
32 | BatchTransactionId batchTransactionId,
33 | String mapAsJSONStr,
34 | SpannerRowConverter rowConverter) {
35 | this.mapAsJSONStr = mapAsJSONStr;
36 | this.partition = partition;
37 | this.batchTransactionId = batchTransactionId;
38 | this.rowConverter = rowConverter;
39 | }
40 |
41 | @Override
42 | public InputPartitionReaderContext<InternalRow> createPartitionReaderContext() {
43 | return new SpannerInputPartitionReaderContext(
44 | partition, batchTransactionId, mapAsJSONStr, rowConverter);
45 | }
46 |
47 | @Override
48 | public boolean supportsColumnarReads() {
49 | return false;
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/SpannerPartitionReaderFactory.java:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.cloud.spark.spanner;
16 |
17 | import org.apache.spark.sql.catalyst.InternalRow;
18 | import org.apache.spark.sql.connector.read.InputPartition;
19 | import org.apache.spark.sql.connector.read.PartitionReader;
20 | import org.apache.spark.sql.connector.read.PartitionReaderFactory;
21 | import org.apache.spark.sql.vectorized.ColumnarBatch;
22 |
23 | /*
24 | * SpannerPartitionReaderFactory is an entry to implement PartitionReaderFactory.
25 | */
26 | public class SpannerPartitionReaderFactory implements PartitionReaderFactory {
27 |
28 | public SpannerPartitionReaderFactory() {}
29 |
30 | @Override
31 | public PartitionReader<ColumnarBatch> createColumnarReader(InputPartition partition) {
32 | throw new SpannerConnectorException(
33 | SpannerErrorCode.COLUMNAR_READS_NOT_SUPPORTED,
34 | "Columnar reads are not supported by the Spark Spanner Connector.");
35 | }
36 |
37 | @Override
38 | public PartitionReader<InternalRow> createReader(InputPartition partition) {
39 | InputPartitionContext<InternalRow> ctx = ((SpannerPartition) partition).getContext();
40 | return new SpannerPartitionReader<>(ctx.createPartitionReaderContext());
41 | }
42 |
43 | @Override
44 | public boolean supportColumnarReads(InputPartition partition) {
45 | return false;
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/test/resources/db/populate_ddl_pg.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE composite_table (
2 | id int NOT NULL,
3 | charvCol character varying(1024),
4 | textCol text,
5 | varcharCol varchar(1024),
6 | boolCol bool,
7 | booleanCol boolean,
8 | bigintCol bigint,
9 | int8Col int8,
10 | intCol int,
11 | doubleCol double precision,
12 | floatCol float8,
13 | byteCol bytea,
14 | dateCol date,
15 | numericCol numeric,
16 | decimalCol decimal,
17 | timeWithZoneCol timestamp with time zone,
18 | timestampCol timestamptz,
19 | jsonCol jsonb,
20 | PRIMARY KEY(id)
21 | );
22 |
23 | CREATE TABLE integration_composite_table (
24 | id int NOT NULL,
25 | charvCol character varying(1024),
26 | textCol text,
27 | varcharCol varchar(1024),
28 | boolCol bool,
29 | booleanCol boolean,
30 | bigintCol bigint,
31 | int8Col int8,
32 | intCol int,
33 | doubleCol double precision,
34 | floatCol float8,
35 | byteCol bytea,
36 | dateCol date,
37 | numericCol numeric,
38 | decimalCol decimal,
39 | timeWithZoneCol timestamp with time zone,
40 | timestampCol timestamptz,
41 | jsonCol jsonb,
42 | PRIMARY KEY(id)
43 | );
44 |
45 | CREATE TABLE numeric_table (
46 | id int NOT NULL,
47 | numericCol numeric,
48 | PRIMARY KEY(id)
49 | );
50 |
51 | CREATE TABLE array_table (
52 | id int NOT NULL,
53 | charvArray character varying(1024)[],
54 | boolArray bool[3],
55 | bigintArray bigint[],
56 | doubleArray double precision[],
57 | byteArray bytea[],
58 | dateArray date[],
59 | numericArray numeric[],
60 | timestampArray timestamptz[],
61 | jsonArray jsonb[],
62 | PRIMARY KEY(id)
63 | );
64 |
65 | CREATE TABLE Shakespeare (
66 | id int,
67 | word character varying(1024),
68 | word_count int,
69 | corpus character varying(1024),
70 | corpus_date int,
71 | PRIMARY KEY(id)
72 | );
73 |
74 | CREATE TABLE string_table (
75 | id bigint NOT NULL,
76 | charvCol character varying(1024),
77 | textCol text,
78 | varcharCol varchar(1024),
79 | smallCol character varying(1),
80 | PRIMARY KEY(id)
81 | );
82 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/SparkSpannerIntegrationTestBase.java:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.cloud.spark.spanner;
16 |
17 | import java.util.Map;
18 | import org.apache.spark.sql.DataFrameReader;
19 | import org.apache.spark.sql.SparkSession;
20 | import org.junit.ClassRule;
21 | import org.junit.rules.ExternalResource;
22 |
23 | public class SparkSpannerIntegrationTestBase extends SpannerTestBase {
24 |
25 | @ClassRule public static SparkFactory sparkFactory = new SparkFactory();
26 |
27 | protected SparkSession spark;
28 |
29 | public SparkSpannerIntegrationTestBase() {
30 | this.spark = sparkFactory.spark;
31 | }
32 |
33 | public DataFrameReader reader() {
34 | Map<String, String> props = connectionProperties();
35 | DataFrameReader reader =
36 | spark
37 | .read()
38 | .format("cloud-spanner")
39 | .option("viewsEnabled", true)
40 | .option("projectId", props.get("projectId"))
41 | .option("instanceId", props.get("instanceId"))
42 | .option("databaseId", props.get("databaseId"));
43 | String emulatorHost = props.get("emulatorHost");
44 | if (emulatorHost != null) reader = reader.option("emulatorHost", props.get("emulatorHost"));
45 | return reader;
46 | }
47 |
48 | protected static class SparkFactory extends ExternalResource {
49 | SparkSession spark;
50 |
51 | @Override
52 | protected void before() throws Throwable {
53 | spark =
54 | SparkSession.builder()
55 | .master("local")
56 | .config("spark.ui.enabled", "false")
57 | .config("spark.default.parallelism", 20)
58 | .getOrCreate();
59 | // reducing test's logs
60 | spark.sparkContext().setLogLevel("WARN");
61 | }
62 | }
63 | }
64 |
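The reader() helper above mirrors how an application would configure the connector. A hedged, self-contained sketch (the project, instance, and database identifiers are placeholders, not values from this repo):

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

class ConnectorReadSketch {
  // Reads one Spanner table through the connector's "cloud-spanner" format.
  static Dataset<Row> readShakespeare(SparkSession spark) {
    return spark
        .read()
        .format("cloud-spanner")
        .option("projectId", "my-project")     // placeholder project id
        .option("instanceId", "my-instance")   // placeholder instance id
        .option("databaseId", "my-database")   // placeholder database id
        .option("table", "Shakespeare")        // table created by populate_ddl.sql
        .load();
  }
}
```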
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/test/resources/db/populate_ddl.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE game_items (
2 | itemUUID STRING(36) NOT NULL,
3 | item_name STRING(MAX) NOT NULL,
4 | item_value NUMERIC NOT NULL,
5 | available_time TIMESTAMP NOT NULL,
6 | duration INT64,
7 | ) PRIMARY KEY(itemUUID);
8 |
9 | CREATE TABLE games (
10 | gameUUID STRING(36) NOT NULL,
11 | players ARRAY NOT NULL,
12 | winner STRING(36),
13 | created TIMESTAMP,
14 | finished TIMESTAMP,
15 | max_date DATE,
16 | ) PRIMARY KEY(gameUUID);
17 |
18 | CREATE TABLE players (
19 | playerUUID STRING(36) NOT NULL,
20 | player_name STRING(64) NOT NULL,
21 | email STRING(MAX) NOT NULL,
22 | password_hash BYTES(60) NOT NULL,
23 | created TIMESTAMP,
24 | updated TIMESTAMP,
25 | stats JSON,
26 | account_balance NUMERIC NOT NULL DEFAULT (0),
27 | is_logged_in BOOL NOT NULL DEFAULT (FALSE),
28 | last_login TIMESTAMP,
29 | valid_email BOOL,
30 | current_game STRING(36),
31 | dob DATE,
32 | FOREIGN KEY(current_game) REFERENCES games(gameUUID),
33 | ) PRIMARY KEY(playerUUID);
34 |
35 | CREATE TABLE simpleTable(
36 | A INT64 NOT NULL,
37 | B STRING(100),
38 | C FLOAT64
39 | ) PRIMARY KEY(A);
40 |
41 | CREATE TABLE ATable(
42 | A INT64 NOT NULL,
43 | B STRING(100),
44 | C BYTES(MAX),
45 | D TIMESTAMP,
46 | E NUMERIC,
47 | F ARRAY,
48 | G JSON
49 | ) PRIMARY KEY(A);
50 |
51 | CREATE TABLE compositeTable (
52 | id STRING(36) NOT NULL,
53 | A ARRAY,
54 | B ARRAY,
55 | C STRING(30),
56 | D NUMERIC,
57 | E DATE,
58 | F TIMESTAMP,
59 | G BOOL,
60 | H ARRAY,
61 | I ARRAY,
62 | J BYTES(20),
63 | K JSON,
64 | ) PRIMARY KEY(id);
65 |
66 | CREATE TABLE nullsTable (
67 | id INT64,
68 | A ARRAY,
69 | B ARRAY,
70 | C STRING(30),
71 | D NUMERIC,
72 | E DATE,
73 | F TIMESTAMP,
74 | G BOOL,
75 | H ARRAY,
76 | I ARRAY,
77 | J ARRAY,
78 | K ARRAY,
79 | M ARRAY,
80 | N ARRAY,
81 | O ARRAY,
82 | ) PRIMARY KEY(id);
83 |
84 | CREATE TABLE Shakespeare (
85 | id INT64,
86 | word STRING(MAX),
87 | word_count INT64,
88 | corpus STRING(MAX),
89 | corpus_date INT64,
90 | ) PRIMARY KEY(id);
91 |
92 | CREATE TABLE bytesTable (
93 | id INT64,
94 | A BYTES(MAX),
95 | ) PRIMARY KEY(id);
96 |
97 | CREATE TABLE valueLimitsTable (
98 | A INT64,
99 | B FLOAT64,
100 | C NUMERIC,
101 | D DATE,
102 | E TIMESTAMP,
103 | ) PRIMARY KEY(A);
104 |
--------------------------------------------------------------------------------
/spark-spanner-lib-parent/pom.xml:
--------------------------------------------------------------------------------
1 |
4 | 4.0.0
5 |
6 | com.google.cloud.spark.spanner
7 | spark-spanner-parent
8 | ${revision}
9 | ../spark-spanner-parent
10 |
11 |
12 | spark-spanner-lib-parent
13 | ${revision}
14 | pom
15 | Common Spark Spanner library setting
16 |
17 | 2.12
18 | 3.1.0
19 |
20 |
21 |
22 | Apache License, Version 2.0
23 | http://www.apache.org/licenses/LICENSE-2.0.txt
24 | repo
25 |
26 |
27 |
28 |
29 | org.apache.spark
30 | spark-sql_${scala.version}
31 | ${spark.version}
32 | provided
33 |
34 |
35 |
36 |
37 | integration
38 |
39 | false
40 |
41 |
42 |
43 |
44 | org.apache.maven.plugins
45 | maven-failsafe-plugin
46 |
47 | ${argLine}
48 | 7
49 | false
50 |
51 | **/*IntegrationTest.java
52 |
53 |
54 |
55 |
56 | integration-test
57 |
58 | integration-test
59 |
60 |
61 |
62 | verify
63 |
64 | verify
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/SpannerTableSchemaTest.java:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.cloud.spark.spanner;
16 |
17 | import static com.google.common.truth.Truth.assertThat;
18 |
19 | import com.google.cloud.spanner.Statement;
20 | import org.junit.Test;
21 | import org.junit.runner.RunWith;
22 | import org.junit.runners.JUnit4;
23 |
24 | /** Unit tests for SpannerTableSchema.buildSchemaQuery() */
25 | @RunWith(JUnit4.class)
26 | public class SpannerTableSchemaTest {
27 |
28 | @Test
29 | public void testBuildSchemaQuery_googleSql_usesCaseInsensitiveComparison() {
30 | Statement stmt = SpannerTableSchema.buildSchemaQuery("MyTable", false);
31 | String query = stmt.getSql();
32 |
33 | // Verify GoogleSQL uses UPPER() for case-insensitive table name comparison
34 | assertThat(query).contains("UPPER(TABLE_NAME)=UPPER(@tableName)");
35 | }
36 |
37 | @Test
38 | public void testBuildSchemaQuery_googleSql_differentCasing() {
39 | // Test with different table name casings
40 | Statement stmt1 = SpannerTableSchema.buildSchemaQuery("mytable", false);
41 | Statement stmt2 = SpannerTableSchema.buildSchemaQuery("MyTable", false);
42 | Statement stmt3 = SpannerTableSchema.buildSchemaQuery("MYTABLE", false);
43 |
44 | // All should generate the same query structure with UPPER()
45 | assertThat(stmt1.getSql()).contains("UPPER(TABLE_NAME)=UPPER(@tableName)");
46 | assertThat(stmt2.getSql()).contains("UPPER(TABLE_NAME)=UPPER(@tableName)");
47 | assertThat(stmt3.getSql()).contains("UPPER(TABLE_NAME)=UPPER(@tableName)");
48 | }
49 |
50 | @Test
51 | public void testBuildSchemaQuery_postgreSql_usesDirectComparison() {
52 | Statement stmt = SpannerTableSchema.buildSchemaQuery("myTable", true);
53 | String query = stmt.getSql();
54 |
55 | // Verify PostgreSQL uses direct comparison without UPPER()
56 | assertThat(query).contains("columns.table_name=$1");
57 | assertThat(query).doesNotContain("UPPER");
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
4 | 4.0.0
5 |
6 | com.google.cloud.spark.spanner
7 | spark-spanner-parent
8 | ${revision}
9 | spark-spanner-parent
10 |
11 |
12 | spark-spanner-reactor
13 | pom
14 | Spark Spanner Connector Reactor
15 |
16 |
17 |
18 | Apache License, Version 2.0
19 | http://www.apache.org/licenses/LICENSE-2.0.txt
20 | repo
21 |
22 |
23 |
24 |
25 |
26 | Google Inc.
27 | http://www.google.com
28 |
29 |
30 |
31 |
32 |
33 | scm:git:git@github.com:GoogleCloudDataproc/spark-spanner-connector.git
34 |
35 |
36 | scm:git:git@github.com:GoogleCloudDataproc/spark-spanner-connector.git
37 |
38 | git@github.com:GoogleCloudDataproc/spark-spanner-connector.git
39 |
40 |
41 |
42 | GitHub Issues
43 |
44 | https://github.com/GoogleCloudDataproc/spark-spanner-connector/issues
45 |
46 |
47 |
48 |
49 | spark-spanner-parent
50 | spark-spanner-lib-parent
51 | spark-3.1-spanner-lib
52 |
53 |
54 |
55 |
56 | 3.1
57 | false
58 |
59 | spark-3.1-spanner
60 |
61 |
62 |
63 | 3.2
64 | false
65 |
66 | spark-3.2-spanner-lib
67 | spark-3.2-spanner
68 |
69 |
70 |
71 | 3.3
72 | false
73 |
74 | spark-3.2-spanner-lib
75 | spark-3.2-spanner
76 | spark-3.3-spanner-lib
77 | spark-3.3-spanner
78 |
79 |
80 |
81 |
82 |
83 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/WriteIntegrationTestBase.java:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.cloud.spark.spanner;
16 |
17 | import static com.google.common.truth.Truth.assertThat;
18 | import static org.junit.Assert.assertThrows;
19 |
20 | import java.util.Map;
21 | import org.apache.spark.SparkException;
22 | import org.apache.spark.sql.DataFrameWriter;
23 | import org.apache.spark.sql.Dataset;
24 | import org.apache.spark.sql.Row;
25 | import org.junit.Test;
26 |
27 | public class WriteIntegrationTestBase extends SparkSpannerIntegrationTestBase {
28 |
29 | private static ReadIntegrationTestBase readIntegration = new ReadIntegrationTestBase();
30 | private Map<String, String> props = this.connectionProperties();
31 |
32 | public DataFrameWriter<Row> writerToTable(Dataset<Row> df, String table) {
33 | return df.write()
34 | .format("cloud-spanner")
35 | .option("viewsEnabled", true)
36 | .option("projectId", props.get("projectId"))
37 | .option("instanceId", props.get("instanceId"))
38 | .option("databaseId", props.get("databaseId"))
39 | .option("emulatorHost", props.get("emulatorHost"))
40 | .option("table", table);
41 | }
42 |
43 | @Test
44 | public void testWritesToTableFail() {
45 | String table = "compositeTable";
46 | Dataset<Row> drf = readIntegration.readFromTable(table);
47 | SparkException e =
48 | assertThrows(
49 | SparkException.class,
50 | () -> {
51 | DataFrameWriter<Row> dwf = writerToTable(drf.select("id"), table);
52 | dwf.saveAsTable(table);
53 | });
54 | assertThat(e)
55 | .hasMessageThat()
56 | .isEqualTo("Table implementation does not support writes: default.compositeTable");
57 | }
58 |
59 | @Test
60 | public void testCreateTableFail() {
61 | String table = "compositeTable";
62 | Dataset<Row> drf = readIntegration.readFromTable(table);
63 | SpannerConnectorException e =
64 | assertThrows(
65 | SpannerConnectorException.class,
66 | () -> {
67 | DataFrameWriter<Row> dwf = writerToTable(drf.select("id"), table);
68 | dwf.save();
69 | });
70 | assertThat(e)
71 | .hasMessageThat()
72 | .isEqualTo("writes are not supported in the Spark Spanner Connector");
73 | }
74 | }
75 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/graph/query/NodeElementTableQuery.java:
--------------------------------------------------------------------------------
1 | package com.google.cloud.spark.spanner.graph.query;
2 |
3 | import com.google.cloud.spark.spanner.SpannerTableSchema;
4 | import com.google.cloud.spark.spanner.graph.PropertyGraph;
5 | import com.google.cloud.spark.spanner.graph.PropertyGraph.GraphElementTable;
6 | import com.google.cloud.spark.spanner.graph.SpannerGraphConfigs;
7 | import com.google.cloud.spark.spanner.graph.SpannerGraphConfigs.LabelConfig;
8 | import java.util.List;
9 |
10 | /** Query for a node table */
11 | public class NodeElementTableQuery extends ElementTableQuery {
12 |
13 | /**
14 | * Construct a query for a node element table
15 | *
16 | * @param graphSchema schema of the graph
17 | * @param elementTable the element table to construct a query for
18 | * @param configs user configs for exporting the graph
19 | * @param exportIdColumnDirectly export the key column directly as the "id" column to avoid the
20 | * need for downstream ID translation. Should be true only when there is only one key column in
21 | * this table.
22 | * @return a {@link NodeElementTableQuery} for the element table.
23 | */
24 | public static NodeElementTableQuery create(
25 | PropertyGraph graphSchema,
26 | GraphElementTable elementTable,
27 | SpannerTableSchema baseTableSchema,
28 | SpannerGraphConfigs configs,
29 | boolean exportIdColumnDirectly) {
30 |
31 | List<LabelConfig> matchedLabels = getMatchedLabels(elementTable, configs.nodeLabelConfigs);
32 |
33 | return new NodeElementTableQuery(
34 | graphSchema,
35 | elementTable,
36 | baseTableSchema,
37 | configs.outputIndividualKeys,
38 | exportIdColumnDirectly,
39 | mergeProperties(elementTable, matchedLabels),
40 | mergeWhereClauses(matchedLabels));
41 | }
42 |
43 | private NodeElementTableQuery(
44 | PropertyGraph graphSchema,
45 | GraphElementTable elementTable,
46 | SpannerTableSchema baseTableSchema,
47 | boolean outputIndividualKeys,
48 | boolean exportIdColumnDirectly,
49 | List<String> properties,
50 | String whereClause) {
51 | super(baseTableSchema, whereClause);
52 | if (!PropertyGraph.GRAPH_ELEMENT_TABLE_KIND_NODE.equalsIgnoreCase(elementTable.kind)) {
53 | throw new IllegalArgumentException("Invalid elementTable kind: " + elementTable.kind);
54 | }
55 |
56 | if (exportIdColumnDirectly) {
57 | if (elementTable.keyColumns.size() != 1) {
58 | throw new IllegalArgumentException(
59 | "Cannot export multiple key columns directly as one ID column.");
60 | }
61 | addDirectField(elementTable.keyColumns.get(0), "id");
62 | } else {
63 | if (outputIndividualKeys) {
64 | addNodeTableColumn("id", graphSchema.getTableId(elementTable.name));
65 | addIndividualKeyColumns("id", elementTable.keyColumns, elementTable.keyColumns);
66 | } else {
67 | addCombinedId("id", graphSchema.getTableId(elementTable.name), elementTable.keyColumns);
68 | }
69 | }
70 |
71 | addInnerProperties(elementTable.propertyDefinitions);
72 | addOutputProperties(graphSchema, properties);
73 | }
74 | }
75 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/test/resources/db/insert_data_pg.sql:
--------------------------------------------------------------------------------
1 | DELETE FROM composite_table WHERE 1=1;
2 |
3 | INSERT INTO composite_table (id, charvcol, textcol, varcharcol, boolcol, booleancol, bigintcol, int8col, intcol, doublecol, floatcol, bytecol, datecol, numericcol, decimalcol, timewithzonecol, timestampcol, jsoncol)
4 | VALUES (1, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL),
5 | (2, 'charvcol', 'textcol', 'varcharcol', true, false, 1, -1, 0, 0.00000001, 0.00000001, 'beefdead', '1999-01-08', NUMERIC '1.23456e05', NUMERIC '9e23', '2003-04-12 04:05:06 America/Los_Angeles', '2003-04-12 05:05:06 America/Los_Angeles', '{"tags": ["multi-cuisine", "open-seating"], "rating": 4.5}');
6 |
7 | DELETE FROM array_table WHERE 1=1;
8 |
9 | INSERT INTO array_table (id, charvarray, boolarray, bigintarray, doublearray, bytearray, datearray, numericarray, timestamparray, jsonarray)
10 | VALUES (1, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL),
11 | (2, '{NULL, "charvarray"}', '{NULL, true}', '{NULL, 1024}', '{NULL, 0.00000001}', '{NULL, "beefdead"}', '{NULL, "1999-01-08"}', '{NULL, "1.2345e05"}', '{NULL, "2003-04-12 04:05:06 America/Los_Angeles"}', ARRAY[NULL, CAST('{"tags": ["multi-cuisine", "open-seating"], "rating": 4.5}' as JSONB)]),
12 | (3, '{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}');
13 |
14 | DELETE FROM integration_composite_table WHERE 1=1;
15 |
16 | INSERT INTO integration_composite_table (id, charvcol, textcol, varcharcol, boolcol, booleancol, bigintcol, int8col, intcol, doublecol, floatcol, bytecol, datecol, numericcol, decimalcol, timewithzonecol, timestampcol, jsoncol)
17 | VALUES (1, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL),
18 | (2, 'charvcol', 'textcol', 'varcharcol', true, false, 1, -1, 0, 0.00000001, 0.00000001, 'beefdead', '1999-01-08', NUMERIC '1.23456e05', NUMERIC '9e23', '2003-04-12 04:05:06 America/Los_Angeles', '2003-04-12 05:05:06 America/Los_Angeles', '{"tags": ["multi-cuisine", "open-seating"], "rating": 4.5}'),
19 | (3, NULL, NULL, NULL, NULL, NULL, 9223372036854775807, -9223372036854775808, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL),
20 | (4, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'NaN', 'NaN', NULL, NULL, NULL, NULL, NULL, NULL, NULL),
21 | (5, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, '9999-12-31', NULL, NULL, NULL, NULL, NULL),
22 | (6, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, '1700-01-01', NULL, NULL, NULL, NULL, NULL),
23 | (7, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 99999999999999999999999999999.999999999, -99999999999999999999999999999.999999999, NULL, NULL, NULL),
24 | (8, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, '0001-01-01 23:00:00 America/Los_Angeles', '9999-12-30 01:00:00 America/Los_Angeles', NULL),
25 | (9, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'NaN', 'NaN', NULL, NULL, NULL),
26 | (10, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'inf', '-inf', NULL, NULL, NULL, NULL, NULL, NULL, NULL);
27 |
28 |
29 | DELETE FROM numeric_table WHERE 1=1;
30 |
31 | INSERT INTO numeric_table (id, numericcol)
32 | VALUES (1, 9999999999999999999999999999999999999999999.9999999999999999999999);
33 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to Contribute
2 |
3 | We'd love to accept your patches and contributions to this project. There are
4 | just a few small guidelines you need to follow.
5 |
6 | ## Contributor License Agreement
7 |
8 | Contributions to this project must be accompanied by a Contributor License
9 | Agreement. You (or your employer) retain the copyright to your contribution;
10 | this simply gives us permission to use and redistribute your contributions as
11 | part of the project. Head over to <https://cla.developers.google.com/> to see
12 | your current agreements on file or to sign a new one.
13 |
14 | You generally only need to submit a CLA once, so if you've already submitted one
15 | (even if it was for a different project), you probably don't need to do it
16 | again.
17 |
18 | ## Code reviews
19 |
20 | All submissions, including submissions by project members, require review. We
21 | use GitHub pull requests for this purpose. Consult
22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
23 | information on using pull requests.
24 |
25 | ## Community Guidelines
26 |
27 | This project follows
28 | [Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
29 |
30 | ## Building the project
31 |
32 | To build, package, and run all unit tests, run the command
33 |
34 | ```
35 | mvn clean verify
36 | ```
37 |
38 | ### Running Integration tests
39 |
40 | To include integration tests when building the project, you need access to
41 | a GCP Project with a valid service account.
42 |
43 | For instructions on how to generate a service account and corresponding
44 | credentials JSON see: [Creating a Service Account][1].
45 |
46 | Then run the following to build, package, run all unit tests and run all
47 | integration tests.
48 |
49 | ```bash
50 | export GOOGLE_APPLICATION_CREDENTIALS=/path/to/service/account.json
51 | mvn -Penable-integration-tests clean verify
52 | ```
53 |
54 | ## Code Samples
55 |
56 | All code samples must be in compliance with the [java sample formatting guide][3].
57 | Code Samples must be bundled in separate Maven modules.
58 |
59 | The samples must be separate from the primary project for a few reasons:
60 | 1. Primary projects have a minimum Java version of Java 8, whereas samples can have a
61 | minimum Java version of Java 11. Due to this, we need the ability to
62 | selectively exclude samples from a build run.
63 | 2. Many code samples depend on external GCP services and need
64 | credentials to access the service.
65 | 3. Code samples are not released as Maven artifacts and must be excluded from
66 | release builds.
67 |
68 | ### Building
69 |
70 | ```bash
71 | mvn clean verify
72 | ```
73 |
74 | Some samples require access to GCP services and require a service account:
75 |
76 | ```bash
77 | export GOOGLE_APPLICATION_CREDENTIALS=/path/to/service/account.json
78 | mvn clean verify
79 | ```
80 |
81 | ### Code Formatting
82 |
83 | Code in this repo is formatted with
84 | [google-java-format](https://github.com/google/google-java-format).
85 | To run formatting on your project, you can run:
86 | ```
87 | mvn com.coveo:fmt-maven-plugin:format
88 | ```
89 |
90 | [1]: https://cloud.google.com/docs/authentication/getting-started#creating_a_service_account
91 | [2]: https://maven.apache.org/settings.html#Active_Profiles
92 | [3]: https://github.com/GoogleCloudPlatform/java-docs-samples/blob/main/SAMPLE_FORMAT.md
93 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/graph/SpannerRowConverterWithSchema.java:
--------------------------------------------------------------------------------
1 | package com.google.cloud.spark.spanner.graph;
2 |
3 | import com.google.cloud.Tuple;
4 | import com.google.cloud.spanner.Struct;
5 | import com.google.cloud.spark.spanner.SpannerRowConverter;
6 | import com.google.cloud.spark.spanner.SpannerUtils;
7 | import com.google.common.collect.Streams;
8 | import java.io.Serializable;
9 | import java.util.ArrayList;
10 | import java.util.List;
11 | import java.util.Map;
12 | import java.util.stream.Collectors;
13 | import org.apache.spark.sql.catalyst.InternalRow;
14 | import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
15 | import org.apache.spark.sql.types.StructField;
16 | import org.apache.spark.sql.types.StructType;
17 |
18 | /** Converts rows from Spanner query outputs to rows in a Spark DataFrame with a specific schema. */
19 | public class SpannerRowConverterWithSchema implements SpannerRowConverter, Serializable {
20 |
21 | private final List<Field> sparkFields = new ArrayList<>();
22 |
23 | public SpannerRowConverterWithSchema(
24 | StructType dataframeSchema,
25 | List<String> queryOutputColumns,
26 | Map<String, Integer> fixedValues) {
27 | Map<String, Long> nameToQueryOutputColumnIndex =
28 | Streams.mapWithIndex(queryOutputColumns.stream(), Tuple::of)
29 | .collect(Collectors.toMap(Tuple::x, Tuple::y));
30 | for (StructField field : dataframeSchema.fields()) {
31 | Integer fixedValue = fixedValues.get(field.name());
32 | if (fixedValue != null) {
33 | sparkFields.add(new FixedIntField(fixedValue));
34 | continue;
35 | }
36 | Long spannerRowIndex = nameToQueryOutputColumnIndex.get(field.name());
37 | if (spannerRowIndex != null) {
38 | sparkFields.add(new ValueField(spannerRowIndex.intValue()));
39 | continue;
40 | }
41 | sparkFields.add(new NullField());
42 | }
43 | }
44 |
45 | @Override
46 | public InternalRow convert(Struct spannerRow) {
47 | GenericInternalRow sparkRow = new GenericInternalRow(sparkFields.size());
48 | for (int i = 0; i < sparkFields.size(); ++i) {
49 | sparkFields.get(i).update(sparkRow, spannerRow, i);
50 | }
51 | return sparkRow;
52 | }
53 |
54 | private abstract static class Field implements Serializable {
55 | public abstract void update(GenericInternalRow sparkRow, Struct spannerRow, int sparkRowIndex);
56 | }
57 |
58 | private static class NullField extends Field {
59 | @Override
60 | public void update(GenericInternalRow sparkRow, Struct spannerRow, int sparkRowIndex) {
61 | sparkRow.update(sparkRowIndex, null);
62 | }
63 | }
64 |
65 | private static class FixedIntField extends Field {
66 |
67 | private final int value;
68 |
69 | FixedIntField(int value) {
70 | this.value = value;
71 | }
72 |
73 | @Override
74 | public void update(GenericInternalRow sparkRow, Struct spannerRow, int sparkRowIndex) {
75 | sparkRow.setInt(sparkRowIndex, value);
76 | }
77 | }
78 |
79 | private static class ValueField extends Field {
80 |
81 | private final int spannerRowIndex;
82 |
83 | ValueField(int spannerRowIndex) {
84 | this.spannerRowIndex = spannerRowIndex;
85 | }
86 |
87 | @Override
88 | public void update(GenericInternalRow sparkRow, Struct spannerRow, int sparkRowIndex) {
89 | SpannerUtils.convertRowAt(spannerRow, spannerRowIndex, sparkRow, sparkRowIndex);
90 | }
91 | }
92 | }
93 |
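As a hedged illustration of the mapping logic above (the column names and the fixed table id below are made up for the example):

```java
import com.google.cloud.spark.spanner.graph.SpannerRowConverterWithSchema;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

class RowConverterSketch {
  // "name" is taken from the first Spanner query output column, "table_id" is
  // pinned to the constant 1, and "age" has no source so it converts to null.
  static SpannerRowConverterWithSchema build() {
    StructType dataframeSchema =
        new StructType(
            new StructField[] {
              new StructField("name", DataTypes.StringType, true, Metadata.empty()),
              new StructField("table_id", DataTypes.IntegerType, false, Metadata.empty()),
              new StructField("age", DataTypes.LongType, true, Metadata.empty()),
            });
    return new SpannerRowConverterWithSchema(
        dataframeSchema,
        ImmutableList.of("name"),         // Spanner query output columns, in order
        ImmutableMap.of("table_id", 1));  // fixed integer value per column name
  }
}
```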
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/SpannerScannerTest.java:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.cloud.spark.spanner;
16 |
17 | import static com.google.common.truth.Truth.assertThat;
18 |
19 | import java.util.Arrays;
20 | import java.util.HashSet;
21 | import java.util.Set;
22 | import org.junit.Test;
23 | import org.junit.runner.RunWith;
24 | import org.junit.runners.JUnit4;
25 |
26 | /** Unit tests for SpannerScanner.buildColumnsWithTablePrefix() */
27 | @RunWith(JUnit4.class)
28 | public class SpannerScannerTest {
29 |
30 | @Test
31 | public void testBuildColumnsWithTablePrefix_googleSql_singleColumn() {
32 | Set<String> columns = new HashSet<>(Arrays.asList("id"));
33 | String result = SpannerScanner.buildColumnsWithTablePrefix("users", columns, false);
34 | assertThat(result).isEqualTo("`users`.`id`");
35 | }
36 |
37 | @Test
38 | public void testBuildColumnsWithTablePrefix_googleSql_multipleColumns() {
39 | Set<String> columns = new HashSet<>(Arrays.asList("id", "name"));
40 | String result = SpannerScanner.buildColumnsWithTablePrefix("users", columns, false);
41 | assertThat(result).contains("`users`.`id`");
42 | assertThat(result).contains("`users`.`name`");
43 | }
44 |
45 | @Test
46 | public void testBuildColumnsWithTablePrefix_googleSql_columnMatchingTableName() {
47 | Set<String> columns = new HashSet<>(Arrays.asList("users", "id"));
48 | String result = SpannerScanner.buildColumnsWithTablePrefix("users", columns, false);
49 | assertThat(result).contains("`users`.`users`");
50 | assertThat(result).contains("`users`.`id`");
51 | }
52 |
53 | @Test
54 | public void testBuildColumnsWithTablePrefix_postgreSql_singleColumn() {
55 | Set<String> columns = new HashSet<>(Arrays.asList("id"));
56 | String result = SpannerScanner.buildColumnsWithTablePrefix("users", columns, true);
57 | assertThat(result).isEqualTo("\"users\".\"id\"");
58 | }
59 |
60 | @Test
61 | public void testBuildColumnsWithTablePrefix_postgreSql_multipleColumns() {
62 | Set<String> columns = new HashSet<>(Arrays.asList("id", "name"));
63 | String result = SpannerScanner.buildColumnsWithTablePrefix("users", columns, true);
64 | assertThat(result).contains("\"users\".\"id\"");
65 | assertThat(result).contains("\"users\".\"name\"");
66 | }
67 |
68 | @Test
69 | public void testBuildColumnsWithTablePrefix_postgreSql_columnMatchingTableName() {
70 | Set<String> columns = new HashSet<>(Arrays.asList("users", "id"));
71 | String result = SpannerScanner.buildColumnsWithTablePrefix("users", columns, true);
72 | assertThat(result).contains("\"users\".\"users\"");
73 | assertThat(result).contains("\"users\".\"id\"");
74 | }
75 |
76 | @Test
77 | public void testBuildColumnsWithTablePrefix_emptyColumns() {
78 | Set<String> columns = new HashSet<>();
79 | String result = SpannerScanner.buildColumnsWithTablePrefix("users", columns, false);
80 | assertThat(result).isEmpty();
81 | }
82 | }
83 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/graph/GraphErrorHandlingTest.java:
--------------------------------------------------------------------------------
1 | package com.google.cloud.spark.spanner.graph;
2 |
3 | import com.google.cloud.spark.spanner.graph.SpannerGraphConfigs.LabelConfig;
4 | import com.google.common.collect.ImmutableList;
5 | import com.google.gson.Gson;
6 | import java.util.Collection;
7 | import java.util.Collections;
8 | import org.apache.spark.sql.DataFrameReader;
9 | import org.junit.Assert;
10 | import org.junit.Test;
11 |
12 | public class GraphErrorHandlingTest extends GraphReadIntegrationTestBase {
13 |
14 | @Test
15 | public void testDirectQueryNonRootPartitionable() {
16 | String nodeQuery =
17 | "SELECT * FROM GRAPH_TABLE (MusicGraph MATCH (n:SINGER|ALBUM) RETURN n.id AS id)";
18 | DataFrameReader reader =
19 | musicGraphReader(null).option("graphQuery", nodeQuery).option("type", "node");
20 | Exception e = Assert.assertThrows(Exception.class, reader::load);
21 | Assert.assertTrue(e.getMessage().contains("root-partitionable"));
22 | }
23 |
24 | @Test
25 | public void testDirectQueryNoId() {
26 | String nodeQuery =
27 | "SELECT * FROM GRAPH_TABLE (MusicGraph MATCH (n:SINGER) RETURN n.id AS no_id)";
28 | DataFrameReader reader =
29 | musicGraphReader(null).option("graphQuery", nodeQuery).option("type", "node");
30 | Exception e = Assert.assertThrows(IllegalArgumentException.class, reader::load);
31 | Assert.assertTrue(e.getMessage().contains("id missing"));
32 | }
33 |
34 | @Test
35 | public void testWildcardLabelMixedWithOtherLabel() {
36 | SpannerGraphConfigs configs = new SpannerGraphConfigs();
37 | configs.nodeLabelConfigs.add(new LabelConfig("*", Collections.emptyList(), null));
38 | configs.nodeLabelConfigs.add(new LabelConfig("SINGER", Collections.emptyList(), null));
39 | configs.edgeLabelConfigs.add(new LabelConfig("*", Collections.emptyList(), null));
40 | configs.edgeLabelConfigs.add(new LabelConfig("KNOWN", Collections.emptyList(), null));
41 |
42 | DataFrameReader reader = musicGraphReader(null).option("configs", new Gson().toJson(configs));
43 |
44 | Assert.assertThrows(IllegalArgumentException.class, () -> reader.option("type", "node").load());
45 | Assert.assertThrows(IllegalArgumentException.class, () -> reader.option("type", "edge").load());
46 | }
47 |
48 | @Test
49 | public void testEdgeReferencingFilteredOutNodes() {
50 | SpannerGraphConfigs configs = new SpannerGraphConfigs();
51 | configs.nodeLabelConfigs.add(new LabelConfig("SINGER", Collections.emptyList(), null));
52 | Assert.assertThrows(IllegalArgumentException.class, () -> readEdges(musicGraphReader(configs)));
53 | }
54 |
55 | private void testNonExistentProperties(Collection<LabelConfig> labels, boolean node) {
56 | SpannerGraphConfigs configs = new SpannerGraphConfigs();
57 | if (node) {
58 | configs.nodeLabelConfigs.addAll(labels);
59 | } else {
60 | configs.edgeLabelConfigs.addAll(labels);
61 | }
62 | Exception e =
63 | Assert.assertThrows(
64 | IllegalArgumentException.class,
65 | () -> musicGraphReader(configs).option("type", node ? "node" : "edge").load());
66 | Assert.assertTrue(e.getMessage().contains("property"));
67 | }
68 |
69 | @Test
70 | public void testNonExistentProperties() {
71 | testNonExistentProperties(
72 | ImmutableList.of(new LabelConfig("*", ImmutableList.of("FriendId"), null)), true);
73 | testNonExistentProperties(
74 | ImmutableList.of(new LabelConfig("*", ImmutableList.of("AlbumTitle"), null)), false);
75 | testNonExistentProperties(
76 | ImmutableList.of(new LabelConfig("SINGER", ImmutableList.of("album_id"), null)), true);
77 | }
78 | }
79 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/SpannerTableSchema.java:
--------------------------------------------------------------------------------
1 | package com.google.cloud.spark.spanner;
2 |
3 | import com.google.cloud.spanner.ResultSet;
4 | import com.google.cloud.spanner.Statement;
5 | import com.google.cloud.spanner.Struct;
6 | import com.google.cloud.spanner.connection.Connection;
7 | import java.util.HashMap;
8 | import java.util.Map;
9 | import java.util.Objects;
10 | import org.apache.spark.sql.types.DataType;
11 | import org.apache.spark.sql.types.MetadataBuilder;
12 | import org.apache.spark.sql.types.StructField;
13 | import org.apache.spark.sql.types.StructType;
14 |
15 | public class SpannerTableSchema {
16 |
17 | private static final String QUERY_PREFIX =
18 | "SELECT COLUMN_NAME, IS_NULLABLE='YES' AS ISNULLABLE, SPANNER_TYPE "
19 | + "FROM INFORMATION_SCHEMA.COLUMNS WHERE ";
20 | private static final String QUERY_SUFFIX = " ORDER BY ORDINAL_POSITION";
21 | private static final String GOOGLESQL_SCHEMA =
22 | QUERY_PREFIX + "UPPER(TABLE_NAME)=UPPER(@tableName)" + QUERY_SUFFIX;
23 | private static final String POSTGRESQL_SCHEMA =
24 | QUERY_PREFIX + "columns.table_name=$1" + QUERY_SUFFIX;
25 |
26 | private final Map<String, StructField> columns;
27 |
28 | public final String name;
29 | public final StructType schema;
30 |
31 | static Statement buildSchemaQuery(String tableName, boolean isPostgreSql) {
32 | if (isPostgreSql) {
33 | return Statement.newBuilder(POSTGRESQL_SCHEMA).bind("p1").to(tableName).build();
34 | } else {
35 | return Statement.newBuilder(GOOGLESQL_SCHEMA).bind("tableName").to(tableName).build();
36 | }
37 | }
38 |
39 | public SpannerTableSchema(Connection conn, String tableName, boolean isPostgreSql) {
40 | this.name = tableName;
41 | this.columns = new HashMap<>();
42 | Statement stmt = buildSchemaQuery(tableName, isPostgreSql);
43 | try (final ResultSet rs = conn.executeQuery(stmt)) {
44 | // Expecting resultset columns in the ordering:
45 | // COLUMN_NAME, IS_NULLABLE, SPANNER_TYPE
46 | // row1:
47 | // ...
48 | // rowN:
49 | StructType schema = new StructType();
50 | while (rs.next()) {
51 | Struct row = rs.getCurrentRowAsStruct();
52 | String columnName = row.getString(0);
53 | StructField structField =
54 | getSparkStructField(columnName, row.getString(2), row.getBoolean(1), isPostgreSql);
55 | schema = schema.add(structField);
56 | this.columns.put(columnName, structField);
57 | }
58 | this.schema = schema;
59 | }
60 | }
61 |
62 | public static StructField getSparkStructField(
63 | String name, String spannerType, boolean isNullable, boolean isPostgreSql) {
64 | DataType catalogType =
65 | isPostgreSql
66 | ? SpannerTable.ofSpannerStrTypePg(spannerType, isNullable)
67 | : SpannerTable.ofSpannerStrType(spannerType, isNullable);
68 | MetadataBuilder metadataBuilder = new MetadataBuilder();
69 | if (isJson(spannerType)) {
70 | metadataBuilder.putString(SpannerUtils.COLUMN_TYPE, "json");
71 | } else if (isJsonb(spannerType)) {
72 | metadataBuilder.putString(SpannerUtils.COLUMN_TYPE, "jsonb");
73 | }
74 | return new StructField(name, catalogType, isNullable, metadataBuilder.build());
75 | }
76 |
77 | public StructField getStructFieldForColumn(String columnName) {
78 | return Objects.requireNonNull(columns.get(columnName));
79 | }
80 |
81 | public static boolean isJson(String spannerStrType) {
82 | return "json".equalsIgnoreCase(spannerStrType.trim());
83 | }
84 |
85 | public static boolean isJsonb(String spannerStrType) {
86 | return "jsonb".equalsIgnoreCase(spannerStrType.trim());
87 | }
88 | }
89 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/graph/SpannerGraph.java:
--------------------------------------------------------------------------------
1 | package com.google.cloud.spark.spanner.graph;
2 |
3 | import com.google.cloud.spanner.Options;
4 | import com.google.cloud.spanner.Statement;
5 | import com.google.cloud.spanner.TimestampBound;
6 | import com.google.cloud.spark.spanner.SpannerConnectorException;
7 | import com.google.cloud.spark.spanner.SpannerErrorCode;
8 | import com.google.cloud.spark.spanner.graph.query.SpannerGraphQuery;
9 | import com.google.common.collect.ImmutableList;
10 | import com.google.common.collect.ImmutableSet;
11 | import java.util.List;
12 | import java.util.Map;
13 | import java.util.Objects;
14 | import java.util.Set;
15 | import javax.annotation.Nullable;
16 | import org.apache.spark.sql.connector.catalog.SupportsRead;
17 | import org.apache.spark.sql.connector.catalog.SupportsWrite;
18 | import org.apache.spark.sql.connector.catalog.Table;
19 | import org.apache.spark.sql.connector.catalog.TableCapability;
20 | import org.apache.spark.sql.connector.read.ScanBuilder;
21 | import org.apache.spark.sql.connector.write.LogicalWriteInfo;
22 | import org.apache.spark.sql.connector.write.WriteBuilder;
23 | import org.apache.spark.sql.types.StructType;
24 | import org.apache.spark.sql.util.CaseInsensitiveStringMap;
25 |
26 | /** Represents the Spanner Graph data source in Spark */
27 | public class SpannerGraph implements Table, SupportsRead, SupportsWrite {
28 |
29 | static final List<String> requiredOptions =
30 | ImmutableList.of("projectId", "instanceId", "databaseId", "graph", "type");
31 |
32 | public final Map<String, String> options;
33 | public final Options.ReadAndQueryOption dataBoostEnabled;
34 | public final SpannerGraphConfigs configs;
35 | public final @Nullable Statement directQuery;
36 | public final boolean nodeDataframe;
37 | public final SpannerGraphQuery spannerGraphQuery;
38 | public final TimestampBound readTimestamp;
39 | public final String graphName;
40 |
41 | SpannerGraph(
42 | Map<String, String> options,
43 | String graphName,
44 | SpannerGraphConfigs configs,
45 | @Nullable Statement directQuery,
46 | boolean dataBoost,
47 | boolean node,
48 | TimestampBound readTimestamp,
49 | SpannerGraphQuery spannerGraphQuery) {
50 | checkOptions(options);
51 | this.graphName = graphName;
52 | this.options = new CaseInsensitiveStringMap(options);
53 | this.configs = Objects.requireNonNull(configs);
54 | this.directQuery = directQuery;
55 | this.dataBoostEnabled = Options.dataBoostEnabled(dataBoost);
56 | this.nodeDataframe = node;
57 | this.readTimestamp = readTimestamp;
58 | this.spannerGraphQuery = spannerGraphQuery;
59 | }
60 |
61 | static void checkOptions(Map<String, String> options) {
62 | for (String o : requiredOptions) {
63 | Objects.requireNonNull(options.get(o), "missing " + o + " in the options");
64 | }
65 | }
66 |
67 | @Override
68 | public ScanBuilder newScanBuilder(CaseInsensitiveStringMap options) {
69 | return new SpannerGraphScanBuilder(this);
70 | }
71 |
72 | @Override
73 | public WriteBuilder newWriteBuilder(LogicalWriteInfo info) {
74 | throw new SpannerConnectorException(
75 | SpannerErrorCode.WRITES_NOT_SUPPORTED,
76 | "writes are not supported in the Spark Spanner Connector");
77 | }
78 |
79 | @Override
80 | public String name() {
81 | return graphName;
82 | }
83 |
84 | /** Returns the schema of this table. */
85 | @Override
86 | public StructType schema() {
87 | return spannerGraphQuery.dataframeSchema;
88 | }
89 |
90 | /** Returns the set of capabilities for this table. */
91 | @Override
92 | public Set<TableCapability> capabilities() {
93 | return ImmutableSet.of(TableCapability.BATCH_READ);
94 | }
95 | }
96 |
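For context, a hedged sketch of a DataFrame read that supplies every entry in requiredOptions (all identifiers other than the option names and the "node"/"edge" values are placeholders):

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

class SpannerGraphReadSketch {
  // Exports the node DataFrame of a property graph; pass "edge" to get edges.
  static Dataset<Row> readNodes(SparkSession spark) {
    return spark
        .read()
        .format("cloud-spanner")
        .option("projectId", "my-project")     // placeholder
        .option("instanceId", "my-instance")   // placeholder
        .option("databaseId", "my-database")   // placeholder
        .option("graph", "MusicGraph")         // graph name from the test data
        .option("type", "node")                // "node" or "edge"
        .load();
  }
}
```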
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/graph/query/DirectGraphQuery.java:
--------------------------------------------------------------------------------
1 | package com.google.cloud.spark.spanner.graph.query;
2 |
3 | import com.google.cloud.Tuple;
4 | import com.google.cloud.spanner.ReadContext.QueryAnalyzeMode;
5 | import com.google.cloud.spanner.ResultSet;
6 | import com.google.cloud.spanner.Statement;
7 | import com.google.cloud.spanner.connection.Connection;
8 | import com.google.cloud.spark.spanner.SpannerRowConverter;
9 | import com.google.cloud.spark.spanner.SpannerRowConverterDirect;
10 | import com.google.cloud.spark.spanner.SpannerTableSchema;
11 | import com.google.common.collect.ImmutableSet;
12 | import com.google.spanner.v1.ResultSetMetadata;
13 | import java.util.ArrayList;
14 | import java.util.Arrays;
15 | import java.util.Collections;
16 | import java.util.List;
17 | import java.util.Set;
18 | import java.util.stream.Collectors;
19 | import org.apache.spark.sql.types.StructField;
20 | import org.apache.spark.sql.types.StructType;
21 |
22 | /** A user-provided GQL query for fetching nodes/edges */
23 | public class DirectGraphQuery implements GraphSubQuery {
24 |
25 | private final Statement query;
26 | private final List<StructField> outputSparkFields;
27 |
28 | public DirectGraphQuery(Connection conn, Statement query, boolean node) {
29 | this.query = query;
30 | this.outputSparkFields = Collections.unmodifiableList(getOutputSparkFields(conn, query, node));
31 | }
32 |
33 | private static List<StructField> getOutputSparkFields(
34 | Connection conn, Statement query, boolean node) {
35 | final Set<String> idColumns = node ? ImmutableSet.of("id") : ImmutableSet.of("src", "dst");
36 |
37 | List<StructField> fields;
38 | try (ResultSet rs = conn.analyzeQuery(query, QueryAnalyzeMode.PLAN)) {
39 | fields = resultSetMetadataToSchema(rs.getMetadata(), idColumns);
40 | }
41 | for (String idColumn : idColumns) {
42 | boolean hasField = fields.stream().map(StructField::name).anyMatch(n -> n.equals(idColumn));
43 | if (!hasField) {
44 | throw new IllegalArgumentException(
45 | String.format(
46 | "Column %s missing in the query output. Query: %s. Spark fields: %s",
47 | idColumn, query, fields));
48 | }
49 | }
50 | return fields;
51 | }
52 |
53 | private static List<StructField> resultSetMetadataToSchema(
54 | ResultSetMetadata metadata, Set<String> notNullableColumns) {
55 | List<StructField> fields = new ArrayList<>();
56 | for (com.google.spanner.v1.StructType.Field column : metadata.getRowType().getFieldsList()) {
57 | String name = column.getName();
58 | String type = column.getType().getCode().name();
59 | boolean isNullable = !notNullableColumns.contains(name);
60 | fields.add(SpannerTableSchema.getSparkStructField(name, type, isNullable, false));
61 | }
62 | return fields;
63 | }
64 |
65 | @Override
66 | public Tuple<Statement, SpannerRowConverter> getQueryAndConverter(StructType dataframeSchema) {
67 | if (Arrays.equals(outputSparkFields.toArray(new StructField[0]), dataframeSchema.fields())) {
68 | return Tuple.of(query, new SpannerRowConverterDirect());
69 | } else {
70 | String selectedColumnNames =
71 | Arrays.stream(dataframeSchema.fields())
72 | .map(StructField::name)
73 | .collect(Collectors.joining(", "));
74 | if (selectedColumnNames.isEmpty()) {
75 | selectedColumnNames = "null";
76 | }
77 | String prunedSql = String.format("SELECT %s FROM (%s)", selectedColumnNames, query.getSql());
78 | return Tuple.of(
79 | query.toBuilder().replace(prunedSql).build(), new SpannerRowConverterDirect());
80 | }
81 | }
82 |
83 | @Override
84 | public List<StructField> getOutputSparkFields() {
85 | return outputSparkFields;
86 | }
87 | }
88 |
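A hedged sketch of the user-facing side of a direct query, passing the GQL text through the graphQuery option as the error-handling tests earlier in this listing do (identifiers other than the option names are placeholders):

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

class DirectGraphQuerySketch {
  // The query must return an "id" column for node DataFrames
  // (or "src" and "dst" columns for edge DataFrames).
  static Dataset<Row> readSingerNodes(SparkSession spark) {
    String nodeQuery =
        "SELECT * FROM GRAPH_TABLE (MusicGraph MATCH (n:SINGER) RETURN n.id AS id)";
    return spark
        .read()
        .format("cloud-spanner")
        .option("projectId", "my-project")     // placeholder
        .option("instanceId", "my-instance")   // placeholder
        .option("databaseId", "my-database")   // placeholder
        .option("graph", "MusicGraph")
        .option("type", "node")
        .option("graphQuery", nodeQuery)
        .load();
  }
}
```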
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/SpannerInputPartitionReaderContext.java:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.cloud.spark.spanner;
16 |
17 | import com.fasterxml.jackson.core.JsonProcessingException;
18 | import com.google.cloud.spanner.BatchReadOnlyTransaction;
19 | import com.google.cloud.spanner.BatchTransactionId;
20 | import com.google.cloud.spanner.ErrorCode;
21 | import com.google.cloud.spanner.Partition;
22 | import com.google.cloud.spanner.ResultSet;
23 | import com.google.cloud.spanner.SpannerException;
24 | import java.io.IOException;
25 | import java.util.Map;
26 | import java.util.Objects;
27 | import org.apache.spark.sql.catalyst.InternalRow;
28 | import org.apache.spark.sql.util.CaseInsensitiveStringMap;
29 |
30 | public class SpannerInputPartitionReaderContext
31 | implements AutoCloseable, InputPartitionReaderContext<InternalRow> {
32 |
33 | private BatchClientWithCloser batchClientWithCloser;
34 | private ResultSet rs;
35 | private final SpannerRowConverter rowConverter;
36 |
37 | public SpannerInputPartitionReaderContext(
38 | Partition partition,
39 | BatchTransactionId batchTransactionId,
40 | String mapAsJSONStr,
41 | SpannerRowConverter rowConverter) {
42 | Map<String, String> opts;
43 | try {
44 | opts = SpannerUtils.deserializeMap(mapAsJSONStr);
45 | } catch (JsonProcessingException e) {
46 | throw new SpannerConnectorException(
47 | SpannerErrorCode.SPANNER_FAILED_TO_PARSE_OPTIONS, "Error parsing the input options.", e);
48 | }
49 | // The map might be case-insensitive when being serialized
50 | opts = new CaseInsensitiveStringMap(opts);
51 |
52 | // Please note that we are using BatchClientWithCloser to avoid resource leaks.
53 | // Since we do not have a deterministic scope or timeline for how long this
54 | // context's BatchClient/Spanner objects will be in use, we use this custom
55 | // client, whose AutoCloseable hook cleans up the resources when the context is closed.
56 | this.batchClientWithCloser = SpannerUtils.batchClientFromProperties(opts);
57 | try (BatchReadOnlyTransaction txn =
58 | batchClientWithCloser.batchClient.batchReadOnlyTransaction(batchTransactionId)) {
59 | this.rs = txn.execute(partition);
60 | }
61 | this.rowConverter = Objects.requireNonNull(rowConverter);
62 | }
63 |
64 | @Override
65 | public boolean next() throws IOException {
66 | try {
67 | return this.rs.next();
68 | } catch (SpannerException e) {
69 | if (e.getErrorCode() == ErrorCode.RESOURCE_EXHAUSTED) {
70 | throw new SpannerConnectorException(
71 | SpannerErrorCode.RESOURCE_EXHAUSTED_ON_SPANNER,
72 | e.getMessage().split("- Statement:")[0]
73 | + "You may receive the error message due to not enough quota on the project.");
74 | }
75 | throw e;
76 | }
77 | }
78 |
79 | @Override
80 | public InternalRow get() {
81 | return rowConverter.convert(this.rs.getCurrentRowAsStruct());
82 | }
83 |
84 | @Override
85 | public void close() throws IOException {
86 | if (this.rs != null) {
87 | this.rs.close();
88 | this.rs = null;
89 | }
90 | if (this.batchClientWithCloser != null) {
91 | this.batchClientWithCloser.close();
92 | this.batchClientWithCloser = null;
93 | }
94 | }
95 | }
96 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/test/resources/db/insert_data_graph.sql:
--------------------------------------------------------------------------------
1 | DELETE FROM FlexibleGraphNode WHERE TRUE;
2 | INSERT INTO FlexibleGraphNode(id, type, properties, cluster_id)
3 | VALUES (1, "Person", JSON '{"birthday":"1991-12-21T08:00:00Z","city":"Adelaide","country":"Australia","name":"Alex"}', 1),
4 | (2, "Person", JSON '{"birthday":"1980-10-31T08:00:00Z","city":"Moravia","country":"Czech_Republic","name":"Dana"}', 1),
5 | (3, "Person", JSON '{"birthday":"1986-12-07T08:00:00Z","city":"Kollam","country":"India","name":"Lee"}', 1),
6 | (7, "Account", JSON '{"create_time":"2020-01-10T14:22:20.222Z","is_blocked":false,"nick_name":"Vacation Fund"}', 1),
7 | (16, "Account", JSON '{"create_time":"2020-01-28T01:55:09.206Z","is_blocked":true,"nick_name":"Vacation Fund"}', 1),
8 | (20, "Account", JSON '{"create_time":"2020-02-18T13:44:20.655Z","is_blocked":false,"nick_name":"Rainy Day Fund"}', 1),
9 | (100, "Account", JSON '{"create_time":"2020-01-10T14:22:20.222Z","is_blocked":false,"nick_name":"Vacation Fund"}', 100),
10 | (101, "Account", JSON '{"create_time":"2020-01-28T01:55:09.206Z","is_blocked":true,"nick_name":"Vacation Fund"}',100);
11 |
12 | DELETE FROM FlexibleGraphEdge WHERE TRUE;
13 | INSERT INTO FlexibleGraphEdge(id, edge_type, to_id, edge_id, properties)
14 | VALUES (1, "Owns", 7, "2020-01-10T14:22:20.222Z",
15 | JSON '{"create_time":"2020-01-10T14:22:20.222Z"}'),
16 | (2, "Owns", 20, "2020-01-28T01:55:09.206Z",
17 | JSON '{"create_time":"2020-01-28T01:55:09.206Z"}'),
18 | (3, "Owns", 16, "2020-02-18T13:44:20.655Z",
19 | JSON '{"create_time":"2020-02-18T13:44:20.655Z"}'),
20 | (7, "Transfers", 16, "2020-08-29T22:28:58.647Z",
21 | JSON '{"amount":300,"create_time":"2020-08-29T22:28:58.647Z","order_number":"304330008004315"}'),
22 | (7, "Transfers", 16, "2020-10-04T23:55:05.342Z",
23 | JSON '{"amount":100,"create_time":"2020-10-04T23:55:05.342Z","order_number":"304120005529714"}'),
24 | (16, "Transfers", 20, "2020-09-25T09:36:14.926Z",
25 | JSON '{"amount":300,"create_time":"2020-09-25T09:36:14.926Z","order_number":"103650009791820"}'),
26 | (20, "Transfers", 7, "2020-10-04T23:55:05.342Z",
27 | JSON '{"amount":500,"create_time":"2020-10-04T23:55:05.342Z","order_number":"304120005529714"}'),
28 | (20, "Transfers", 16, "2020-10-17T10:59:40.247Z",
29 | JSON '{"amount":200,"create_time":"2020-10-17T10:59:40.247Z","order_number":"302290001255747"}'),
30 | (100, "Transfers", 101, "2020-08-29T22:28:58.647Z",
31 | JSON '{"amount":300,"create_time":"2020-08-29T22:28:58.647Z","order_number":"304330008004315"}');
32 |
33 | DELETE FROM ProductionCompanies WHERE TRUE;
34 | INSERT INTO ProductionCompanies (CompanyId, CompanyName, LocationCountry, FoundedYear) VALUES (1, 'Mellow Wave', 'U.S.A.', 1993), (2, 'Rolling Stow', 'Canada', 2002), (3, 'Picky Penang', 'Malaysia', 1984), (4, 'Ice Ice', 'Poland', 2012), (5, 'Oint Is Not An Ink', 'Peru', 2000);
35 |
36 | DELETE FROM Singers WHERE TRUE;
37 | INSERT INTO Singers (SingerId, FirstName, LastName, BirthDate) VALUES (1, 'Cruz', 'Richards', '1970-9-3'), (2, 'Tristan', 'Smith', '1990-8-17') ,(3, 'Izumi', 'Trentor', '1991-10-2'), (4, 'Ira', 'Martin', '1991-11-9'),(5, 'Mahan', 'Lomond', '1977-1-29');
38 |
39 | DELETE FROM Albums WHERE TRUE;
40 | INSERT INTO Albums (SingerId, AlbumId, AlbumTitle, ReleaseDate, CompanyId) VALUES (1, 1, 'Total Junk', '2014-3-2', 1), (1, 2, 'Go Go Go', '2011-2-9', 1), (2, 3, 'Green', '2012-9-17', 2), (2, 4, 'Forever Hold Your Peace', '2010-10-15', 3), (3, 5, 'Terrified', '2008-6-7', 3), (4, 6, 'Nothing To Do With Me', '2014-4-29', 4), (5, 7, 'Play', '2013-12-21', 5);
41 |
42 | DELETE FROM SingerFriends WHERE TRUE;
43 | INSERT INTO SingerFriends (SingerId, FriendId) VALUES (1, 2), (1, 3), (2, 1), (2, 4), (2, 5), (3, 1), (3, 5), (4, 2), (4, 5), (5, 2), (5, 3), (5, 4);
44 |
45 | DELETE FROM SingerContracts WHERE TRUE;
46 | INSERT INTO SingerContracts (SingerId, CompanyId) VALUES (1, 4), (2, 2), (3, 5), (4, 1), (5, 3);
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/Spark31SpannerTableProvider.java:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.cloud.spark.spanner;
16 |
17 | import com.google.cloud.spark.spanner.graph.SpannerGraphBuilder;
18 | import java.util.Map;
19 | import javax.annotation.Nullable;
20 | import org.apache.spark.sql.Dataset;
21 | import org.apache.spark.sql.Row;
22 | import org.apache.spark.sql.SQLContext;
23 | import org.apache.spark.sql.SaveMode;
24 | import org.apache.spark.sql.connector.catalog.Table;
25 | import org.apache.spark.sql.connector.catalog.TableProvider;
26 | import org.apache.spark.sql.connector.expressions.Transform;
27 | import org.apache.spark.sql.sources.BaseRelation;
28 | import org.apache.spark.sql.sources.CreatableRelationProvider;
29 | import org.apache.spark.sql.sources.DataSourceRegister;
30 | import org.apache.spark.sql.types.StructType;
31 | import org.apache.spark.sql.util.CaseInsensitiveStringMap;
32 |
33 | public class Spark31SpannerTableProvider
34 | implements DataSourceRegister, TableProvider, CreatableRelationProvider {
35 |
36 | private @Nullable Table table;
37 |
38 | /*
39 | * Infers the schema of the table identified by the given options.
40 | */
41 | @Override
42 | public StructType inferSchema(CaseInsensitiveStringMap options) {
43 | if (table == null) {
44 | table = getTable(options);
45 | }
46 | return table.schema();
47 | }
48 |
49 | /*
50 | * Returns a Table instance with the specified table schema,
51 | * partitioning and properties to perform a read or write.
52 | */
53 | @Override
54 | public Table getTable(
55 | StructType schema, Transform[] partitioning, Map<String, String> properties) {
56 | if (table == null) {
57 | table = getTable(properties);
58 | }
59 | return table;
60 | }
61 |
62 | /*
63 | * Returns true if the source has the ability of
64 | * accepting external table metadata when getting tables.
65 | */
66 | @Override
67 | public boolean supportsExternalMetadata() {
68 | return false;
69 | }
70 |
71 | /*
72 | * Implements DataSourceRegister.shortName(). This method allows Spark to match
73 | * the DataSource when spark.read(...).format("cloud-spanner") is invoked.
74 | */
75 | @Override
76 | public String shortName() {
77 | return "cloud-spanner";
78 | }
79 |
80 | /** Writes are not supported by the Spark Spanner Connector. */
81 | @Override
82 | public BaseRelation createRelation(
83 | SQLContext sqlContext,
84 | SaveMode mode,
85 | scala.collection.immutable.Map<String, String> parameters,
86 | Dataset<Row> data) {
87 | throw new SpannerConnectorException(
88 | SpannerErrorCode.WRITES_NOT_SUPPORTED,
89 | "writes are not supported in the Spark Spanner Connector");
90 | }
91 |
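/*
 * Builds either a SpannerTable (when the "table" option is present) or a Spanner graph table
 * (when the "graph" option is present); exactly one of the two must be set. As a rough sketch,
 * a table read is configured with .option("table", "<tableName>") while a graph read is
 * configured with .option("graph", "<graphName>") plus .option("type", "node" or "edge").
 */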
92 | private Table getTable(Map<String, String> properties) {
93 | boolean hasTable = properties.containsKey("table");
94 | boolean hasGraph = properties.containsKey("graph");
95 | if (hasTable && !hasGraph) {
96 | return new SpannerTable(properties);
97 | } else if (!hasTable && hasGraph) {
98 | return SpannerGraphBuilder.build(properties);
99 | } else {
100 | throw new SpannerConnectorException("properties must contain one of \"table\" or \"graph\"");
101 | }
102 | }
103 | }
104 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/SpannerScanBuilder.java:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.cloud.spark.spanner;
16 |
17 | import com.google.common.collect.ImmutableSet;
18 | import java.util.ArrayList;
19 | import java.util.LinkedHashMap;
20 | import java.util.List;
21 | import java.util.Map;
22 | import java.util.Set;
23 | import org.apache.spark.sql.connector.read.Scan;
24 | import org.apache.spark.sql.connector.read.ScanBuilder;
25 | import org.apache.spark.sql.connector.read.SupportsPushDownFilters;
26 | import org.apache.spark.sql.connector.read.SupportsPushDownRequiredColumns;
27 | import org.apache.spark.sql.sources.Filter;
28 | import org.apache.spark.sql.types.StructField;
29 | import org.apache.spark.sql.types.StructType;
30 | import org.apache.spark.sql.util.CaseInsensitiveStringMap;
31 | import org.slf4j.Logger;
32 | import org.slf4j.LoggerFactory;
33 |
34 | /*
35 | * Allows us to implement ScanBuilder.
36 | */
37 | public class SpannerScanBuilder
38 | implements ScanBuilder, SupportsPushDownFilters, SupportsPushDownRequiredColumns {
39 | private CaseInsensitiveStringMap opts;
40 | private List<Filter> pushedFilters;
41 | private Set<String> requiredColumns;
42 | private SpannerScanner scanner;
43 | private static final Logger log = LoggerFactory.getLogger(SpannerScanBuilder.class);
44 | private SpannerTable spannerTable;
45 | private Map<String, StructField> fields;
46 |
47 | public SpannerScanBuilder(CaseInsensitiveStringMap options) {
48 | this.opts = options;
49 | this.pushedFilters = new ArrayList<>();
50 | this.spannerTable = new SpannerTable(options);
51 | this.fields = new LinkedHashMap<>();
52 | for (StructField field : spannerTable.schema().fields()) {
53 | fields.put(field.name(), field);
54 | }
55 | }
56 |
57 | @Override
58 | public Scan build() {
59 | this.scanner =
60 | new SpannerScanner(
61 | this.opts.asCaseSensitiveMap(),
62 | this.spannerTable,
63 | this.fields,
64 | this.pushedFilters(),
65 | this.requiredColumns);
66 | return this.scanner;
67 | }
68 |
69 | @Override
70 | public Filter[] pushedFilters() {
71 | return this.pushedFilters.toArray(new Filter[0]);
72 | }
73 |
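/*
 * pushFilters splits the given filters into those the connector can evaluate on the Spanner
 * side and those Spark must evaluate itself; the handled filters are recorded for the scan and
 * only the unhandled ones are returned, per the SupportsPushDownFilters contract.
 */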
74 | @Override
75 | public Filter[] pushFilters(Filter[] filters) {
76 | List<Filter> handledFilters = new ArrayList<>();
77 | List<Filter> unhandledFilters = new ArrayList<>();
78 | for (Filter filter : filters) {
79 | if (SparkFilterUtils.isTopLevelFieldHandled(false, filter, fields)) {
80 | handledFilters.add(filter);
81 | } else {
82 | unhandledFilters.add(filter);
83 | }
84 | }
85 | this.pushedFilters.addAll(handledFilters);
86 | return unhandledFilters.stream().toArray(Filter[]::new);
87 | }
88 |
89 | /*
90 | * pruneColumns applies column pruning with respect to the requiredSchema.
91 | * The docs recommend implementing this method to push down required columns
92 | * to the data source and only read these columns during scan to
93 | * reduce the size of the data to be read.
94 | */
95 | @Override
96 | public void pruneColumns(StructType requiredSchema) {
97 | // A user could invoke: SELECT a, b, d, a FROM TABLE;
98 | // and we should still be able to serve them back their
99 | // query without deduplication.
100 | this.requiredColumns = ImmutableSet.copyOf(requiredSchema.fieldNames());
101 | }
102 | }
103 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/TestData.java:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.cloud.spark.spanner;
16 |
17 | import com.google.cloud.spanner.Mutation;
18 | import com.google.common.io.CharStreams;
19 | import java.io.IOException;
20 | import java.io.InputStream;
21 | import java.io.InputStreamReader;
22 | import java.util.ArrayList;
23 | import java.util.Arrays;
24 | import java.util.List;
25 | import java.util.Objects;
26 | import java.util.stream.Collectors;
27 |
28 | public final class TestData {
29 | public static List<String> initialDDL = createInitialDDL("/db/populate_ddl.sql");
30 | public static List<String> initialDDLPg = createInitialDDL("/db/populate_ddl_pg.sql");
31 | public static List<String> initialDDLGraph = createInitialDDL("/db/populate_ddl_graph.sql");
32 | public static List<String> initialDML = createInitialDML("/db/insert_data.sql");
33 | public static List<String> initialDMLPg = createInitialDML("/db/insert_data_pg.sql");
34 | public static List<String> initialDMLGraph = createInitialDML("/db/insert_data_graph.sql");
35 | public static List<Mutation> shakespearMutations = createShakespeareTableMutations();
36 |
37 | private TestData() {}
38 |
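/**
 * Reads the given SQL resource, drops "--" comment lines, and splits the remaining text on ';'
 * to produce the list of individual DDL/DML statements.
 */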
39 | private static List<String> readAndParseSQL(String filename) {
40 | String initialDDL = mustReadResource(filename);
41 | String[] statements =
42 | Arrays.stream(initialDDL.split("\\r?\\n"))
43 | .map(String::trim)
44 | .filter(l -> !l.startsWith("--"))
45 | .collect(Collectors.joining("\n"))
46 | .split(";");
47 | return Arrays.stream(statements)
48 | .map(String::trim)
49 | .filter(s -> !s.isEmpty())
50 | .collect(Collectors.toList());
51 | }
52 |
53 | private static List<String> createInitialDDL(String filePath) {
54 | return readAndParseSQL(filePath);
55 | }
56 |
57 | private static List<String> createInitialDML(String filePath) {
58 | return readAndParseSQL(filePath);
59 | }
60 |
61 | private static String mustReadResource(String path) {
62 | try (InputStream stream = TestData.class.getResourceAsStream(path)) {
63 | String data = CharStreams.toString(new InputStreamReader(Objects.requireNonNull(stream)));
64 | if (data == null || data.length() == 0) {
65 | throw new RuntimeException(path + " has no content");
66 | }
67 | return data;
68 | } catch (IOException e) {
69 | throw new RuntimeException("failed to read resource " + path, e);
70 | }
71 | }
72 |
73 | private static List<Mutation> createShakespeareTableMutations() {
74 | String csv = mustReadResource("/db/shakespeare_bq.csv");
75 | String[] csvLines = csv.trim().split("\n");
76 | Long id = 1L;
77 | List<Mutation> mutations = new ArrayList<>();
78 | for (String csvLine : csvLines) {
79 | csvLine = csvLine.trim();
80 | if (csvLine.equals("") || csvLine.equals("\n")) {
81 | continue;
82 | }
83 |
84 | String[] splits = csvLine.split(",");
85 |
86 | mutations.add(
87 | Mutation.newInsertBuilder("Shakespeare")
88 | .set("id")
89 | .to(id)
90 | .set("word")
91 | .to(splits[0])
92 | .set("word_count")
93 | .to(splits[1])
94 | .set("corpus")
95 | .to(splits[2])
96 | .set("corpus_date")
97 | .to(splits[3])
98 | .build());
99 | id++;
100 | }
101 | return mutations;
102 | }
103 | }
104 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/graph/query/EdgeElementTableQuery.java:
--------------------------------------------------------------------------------
1 | package com.google.cloud.spark.spanner.graph.query;
2 |
3 | import com.google.cloud.spark.spanner.SpannerTableSchema;
4 | import com.google.cloud.spark.spanner.graph.PropertyGraph;
5 | import com.google.cloud.spark.spanner.graph.PropertyGraph.GraphElementTable;
6 | import com.google.cloud.spark.spanner.graph.PropertyGraph.GraphNodeTableReference;
7 | import com.google.cloud.spark.spanner.graph.SpannerGraphConfigs;
8 | import com.google.cloud.spark.spanner.graph.SpannerGraphConfigs.LabelConfig;
9 | import java.util.List;
10 |
11 | /** Query for an edge table */
12 | public class EdgeElementTableQuery extends ElementTableQuery {
13 |
14 | /**
15 | * Construct a query for an edge element table
16 | *
17 | * @param graphSchema schema of the graph
18 | * @param elementTable the element table to construct a query for
19 | * @param configs user configs for exporting the graph
20 | * @param exportIdColumnDirectly export the key column for src/dst directly to avoid the need for
21 | * downstream ID translation. Should be true only when there is only one key column for src
22 | * and one key column for dst.
23 | * @return an {@link EdgeElementTableQuery} for the element table.
24 | */
25 | public static EdgeElementTableQuery create(
26 | PropertyGraph graphSchema,
27 | GraphElementTable elementTable,
28 | SpannerTableSchema baseTableSchema,
29 | SpannerGraphConfigs configs,
30 | boolean exportIdColumnDirectly) {
31 |
32 | List<LabelConfig> matchedLabels = getMatchedLabels(elementTable, configs.edgeLabelConfigs);
33 |
34 | return new EdgeElementTableQuery(
35 | graphSchema,
36 | elementTable,
37 | baseTableSchema,
38 | configs.outputIndividualKeys,
39 | exportIdColumnDirectly,
40 | mergeProperties(elementTable, matchedLabels),
41 | mergeWhereClauses(matchedLabels));
42 | }
43 |
44 | private EdgeElementTableQuery(
45 | PropertyGraph graphSchema,
46 | GraphElementTable elementTable,
47 | SpannerTableSchema baseTableSchema,
48 | boolean outputIndividualKeys,
49 | boolean exportIdColumnDirectly,
50 | List properties,
51 | String whereClause) {
52 | super(baseTableSchema, whereClause);
53 | if (!PropertyGraph.GRAPH_ELEMENT_TABLE_KIND_EDGE.equalsIgnoreCase(elementTable.kind)) {
54 | throw new IllegalArgumentException("Invalid elementTable kind: " + elementTable.kind);
55 | }
56 | graphSchema.checkEdgeReferenceKeyColumnsMatchNodeKeyColumns(elementTable);
57 |
58 | if (exportIdColumnDirectly) {
59 | if (elementTable.sourceNodeTable.edgeTableColumns.size() != 1
60 | || elementTable.destinationNodeTable.edgeTableColumns.size() != 1) {
61 | throw new IllegalArgumentException(
62 | "Cannot export multiple key columns directly as one SRC/DST column. ");
63 | }
64 | addDirectField(elementTable.sourceNodeTable.edgeTableColumns.get(0), "src");
65 | addDirectField(elementTable.destinationNodeTable.edgeTableColumns.get(0), "dst");
66 | } else {
67 | if (outputIndividualKeys) {
68 | addIndividualKeysForNodeTableReference("src", graphSchema, elementTable.sourceNodeTable);
69 | addIndividualKeysForNodeTableReference(
70 | "dst", graphSchema, elementTable.destinationNodeTable);
71 | } else {
72 | addCombinedId(
73 | "src",
74 | graphSchema.getTableId(elementTable.sourceNodeTable.nodeTableName),
75 | elementTable.sourceNodeTable.edgeTableColumns);
76 | addCombinedId(
77 | "dst",
78 | graphSchema.getTableId(elementTable.destinationNodeTable.nodeTableName),
79 | elementTable.destinationNodeTable.edgeTableColumns);
80 | }
81 | }
82 |
83 | addInnerProperties(elementTable.propertyDefinitions);
84 | addOutputProperties(graphSchema, properties);
85 | }
86 |
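/**
 * For one endpoint ("src" or "dst"), outputs the referenced node table's id together with one
 * column per edge-table/node-table key pair, instead of a single combined id column.
 */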
87 | private void addIndividualKeysForNodeTableReference(
88 | String type, PropertyGraph graphSchema, GraphNodeTableReference nodeTableReference) {
89 | addNodeTableColumn(type, graphSchema.getTableId(nodeTableReference.nodeTableName));
90 | addIndividualKeyColumns(
91 | type, nodeTableReference.edgeTableColumns, nodeTableReference.nodeTableColumns);
92 | }
93 | }
94 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/OpenLineageIntegrationTestBase.java:
--------------------------------------------------------------------------------
1 | package com.google.cloud.spark.spanner;
2 |
3 | import static com.google.common.truth.Truth.assertThat;
4 |
5 | import com.google.gson.JsonObject;
6 | import com.google.gson.JsonParser;
7 | import io.openlineage.spark.agent.OpenLineageSparkListener;
8 | import java.io.File;
9 | import java.util.ArrayList;
10 | import java.util.List;
11 | import java.util.Map;
12 | import java.util.Scanner;
13 | import org.apache.spark.sql.Dataset;
14 | import org.apache.spark.sql.Row;
15 | import org.apache.spark.sql.SaveMode;
16 | import org.apache.spark.sql.SparkSession;
17 | import org.junit.ClassRule;
18 | import org.junit.Test;
19 | import org.junit.rules.ExternalResource;
20 |
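/**
 * Verifies that reads through the connector emit OpenLineage events. The OpenLineage Spark
 * listener writes events to a temporary file, and the test asserts that each event's input
 * dataset carries the spanner://{projectId}/{instanceId} namespace and the
 * {databaseId}/{tableName} dataset name.
 */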
21 | public class OpenLineageIntegrationTestBase extends SpannerTestBase {
22 |
23 | @ClassRule public static OLSparkFactory sparkFactory = new OLSparkFactory();
24 |
25 | protected SparkSession spark;
26 | protected File lineageOutputFile;
27 |
28 | protected Map<String, String> connectionProperties;
29 |
30 | public OpenLineageIntegrationTestBase() {
31 | this.spark = sparkFactory.spark;
32 | this.lineageOutputFile = sparkFactory.lineageOutputFile;
33 | this.connectionProperties = connectionProperties();
34 | }
35 |
36 | protected static class OLSparkFactory extends ExternalResource {
37 | SparkSession spark;
38 |
39 | File lineageOutputFile;
40 |
41 | @Override
42 | protected void before() throws Throwable {
43 | lineageOutputFile = File.createTempFile("openlineage_test_" + System.nanoTime(), ".log");
44 | lineageOutputFile.deleteOnExit();
45 | spark =
46 | SparkSession.builder()
47 | .master("local")
48 | .config("spark.ui.enabled", "false")
49 | .config("spark.default.parallelism", 20)
50 | .config("spark.extraListeners", OpenLineageSparkListener.class.getCanonicalName())
51 | .config("spark.openlineage.transport.type", "file")
52 | .config("spark.openlineage.transport.location", lineageOutputFile.getAbsolutePath())
53 | .getOrCreate();
54 | spark.sparkContext().setLogLevel("WARN");
55 | }
56 | }
57 |
58 | public Dataset<Row> readFromTable(String table) {
59 | Map<String, String> props = this.connectionProperties();
60 | return spark
61 | .read()
62 | .format("cloud-spanner")
63 | .option("viewsEnabled", true)
64 | .option("projectId", props.get("projectId"))
65 | .option("instanceId", props.get("instanceId"))
66 | .option("databaseId", props.get("databaseId"))
67 | .option("emulatorHost", props.get("emulatorHost"))
68 | .option("table", table)
69 | .load();
70 | }
71 |
72 | @Test
73 | public void testOpenLineageEvents() throws Exception {
74 | File outputCsv = File.createTempFile("output_" + System.nanoTime(), ".csv");
75 | outputCsv.deleteOnExit();
76 | Dataset<Row> df = readFromTable("compositeTable");
77 | df.createOrReplaceTempView("tempview");
78 | Dataset<Row> outputDf =
79 | spark.sql(
80 | "SELECT word, count(*) AS count FROM (SELECT explode(split(C, ' ')) AS word FROM tempview) GROUP BY 1");
81 |
82 | outputDf
83 | .coalesce(1)
84 | .write()
85 | .format("csv")
86 | .mode(SaveMode.Overwrite)
87 | .save("file://" + outputCsv.getPath());
88 |
89 | List<JsonObject> jsonObjects = parseEventLog(lineageOutputFile);
90 | assertThat(jsonObjects).isNotEmpty();
91 |
92 | jsonObjects.forEach(
93 | jsonObject -> {
94 | JsonObject input = jsonObject.getAsJsonArray("inputs").get(0).getAsJsonObject();
95 | assertThat(input.get("namespace").getAsString())
96 | .isEqualTo(
97 | String.format(
98 | "spanner://%s/%s",
99 | connectionProperties.get("projectId"),
100 | connectionProperties.get("instanceId")));
101 | assertThat(input.get("name").getAsString())
102 | .isEqualTo(
103 | String.format("%s/%s", connectionProperties.get("databaseId"), "compositeTable"));
104 | });
105 | }
106 |
107 | private List<JsonObject> parseEventLog(File file) throws Exception {
108 | List<JsonObject> eventList;
109 | try (Scanner scanner = new Scanner(file)) {
110 | eventList = new ArrayList<>();
111 | while (scanner.hasNextLine()) {
112 | String line = scanner.nextLine();
113 | JsonObject event = JsonParser.parseString(line).getAsJsonObject();
114 | if (!event.getAsJsonArray("inputs").isEmpty()) {
115 | eventList.add(event);
116 | }
117 | }
118 | }
119 | return eventList;
120 | }
121 | }
122 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/graph/SpannerGraphBuilder.java:
--------------------------------------------------------------------------------
1 | package com.google.cloud.spark.spanner.graph;
2 |
3 | import com.google.cloud.Timestamp;
4 | import com.google.cloud.spanner.Dialect;
5 | import com.google.cloud.spanner.PartitionOptions;
6 | import com.google.cloud.spanner.Statement;
7 | import com.google.cloud.spanner.TimestampBound;
8 | import com.google.cloud.spanner.connection.AbstractStatementParser;
9 | import com.google.cloud.spanner.connection.Connection;
10 | import com.google.cloud.spark.spanner.SpannerConnectorException;
11 | import com.google.cloud.spark.spanner.SpannerErrorCode;
12 | import com.google.cloud.spark.spanner.SpannerUtils;
13 | import com.google.cloud.spark.spanner.graph.query.SpannerGraphQuery;
14 | import java.util.Map;
15 | import java.util.Objects;
16 |
17 | /** Builder for {@link SpannerGraph} */
18 | public class SpannerGraphBuilder {
19 |
20 | public static SpannerGraph build(Map<String, String> options) {
21 | SpannerGraph.checkOptions(options);
22 | String graphName = Objects.requireNonNull(options.get("graph"));
23 | String directQueryString = options.get("graphQuery");
24 | boolean dataBoost = getEnableDataBoost(options);
25 | String configsJson = options.get("configs");
26 | SpannerGraphConfigs configs =
27 | configsJson != null ? SpannerGraphConfigs.fromJson(configsJson) : new SpannerGraphConfigs();
28 | boolean node = getIsNodeDataframe(options);
29 | TimestampBound readTimestamp = getReadTimestamp(options);
30 | Statement directQuery = directQueryString != null ? Statement.of(directQueryString) : null;
31 |
32 | SpannerGraphQuery graphQuery;
33 | try (Connection conn = getConnection(options)) {
34 | // Ensure the version of the schema read matches the specified timestamp
35 | conn.setReadOnly(true);
36 | conn.setAutocommit(true);
37 | conn.setReadOnlyStaleness(readTimestamp);
38 |
39 | PropertyGraph graphSchema = PropertyGraph.Builder.getFromSpanner(conn, graphName);
40 | configs.validate(graphSchema, directQuery != null);
41 | if (directQuery != null) {
42 | checkQueryIsSql(directQuery);
43 | // Test if the provided query is root-partitionable
44 | // Will throw an exception if the query is not root-partitionable
45 | conn.partitionQuery(directQuery, PartitionOptions.getDefaultInstance()).close();
46 | graphQuery = new SpannerGraphQuery(conn, directQuery, node);
47 | } else {
48 | graphQuery = new SpannerGraphQuery(conn, graphSchema, configs, node);
49 | }
50 | }
51 |
52 | return new SpannerGraph(
53 | options, graphName, configs, directQuery, dataBoost, node, readTimestamp, graphQuery);
54 | }
55 |
56 | private static boolean getEnableDataBoost(Map<String, String> options) {
57 | final String dataBoostEnabledKey = "enableDataBoost";
58 | String dataBoost = options.getOrDefault(dataBoostEnabledKey, "false");
59 | if ("true".equalsIgnoreCase(dataBoost)) {
60 | return true;
61 | } else if ("false".equalsIgnoreCase(dataBoost)) {
62 | return false;
63 | } else {
64 | throw new IllegalArgumentException(dataBoostEnabledKey + " must be true or false");
65 | }
66 | }
67 |
68 | private static boolean getIsNodeDataframe(Map<String, String> options) {
69 | String type = Objects.requireNonNull(options.get("type"));
70 | if ("node".equalsIgnoreCase(type)) {
71 | return true;
72 | } else if ("edge".equalsIgnoreCase(type)) {
73 | return false;
74 | } else {
75 | throw new IllegalArgumentException("type must be node or edge");
76 | }
77 | }
78 |
79 | private static TimestampBound getReadTimestamp(Map<String, String> options) {
80 | String timestamp = options.get("timestamp");
81 | return TimestampBound.ofReadTimestamp(
82 | timestamp == null ? Timestamp.now() : Timestamp.parseTimestamp(timestamp));
83 | }
84 |
85 | private static Connection getConnection(Map<String, String> options) {
86 | Connection conn = SpannerUtils.connectionFromProperties(options);
87 | if (!conn.getDialect().equals(Dialect.GOOGLE_STANDARD_SQL)) {
88 | throw new SpannerConnectorException(
89 | SpannerErrorCode.DATABASE_DIALECT_NOT_SUPPORTED,
90 | "Expecting dialect: GOOGLE_STANDARD_SQL, but the actual dialect used is "
91 | + conn.getDialect());
92 | }
93 | return conn;
94 | }
95 |
96 | private static void checkQueryIsSql(Statement query) {
97 | AbstractStatementParser parser =
98 | AbstractStatementParser.getInstance(Dialect.GOOGLE_STANDARD_SQL);
99 | if (!parser.isQuery(parser.removeCommentsAndTrim(query.getSql()))) {
100 | throw new IllegalArgumentException(
101 | "Only SQL queries starting with SELECT are supported. Query provided: " + query);
102 | }
103 | }
104 | }
105 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/test/resources/db/populate_ddl_graph.sql:
--------------------------------------------------------------------------------
1 | -- FlexibleGraph
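-- A schemaless-style test graph: every node lives in FlexibleGraphNode and every edge in
-- FlexibleGraphEdge, with the element kind in the type/edge_type columns and the remaining
-- attributes in a JSON properties column.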
2 |
3 | CREATE TABLE FlexibleGraphNode
4 | (
5 | id INT64 NOT NULL,
6 | type STRING( MAX) NOT NULL,
7 | properties JSON,
8 | cluster_id INT64,
9 | ) PRIMARY KEY(id);
10 |
11 | CREATE INDEX NodeByType ON FlexibleGraphNode (type);
12 |
13 | CREATE TABLE FlexibleGraphEdge
14 | (
15 | id INT64 NOT NULL,
16 | edge_type STRING( MAX) NOT NULL,
17 | to_id INT64 NOT NULL,
18 | edge_id STRING( MAX),
19 | properties JSON,
20 | CONSTRAINT FK_ToNode FOREIGN KEY (to_id) REFERENCES FlexibleGraphNode (id),
21 | ) PRIMARY KEY(id, edge_type, to_id, edge_id),
22 | INTERLEAVE IN PARENT FlexibleGraphNode ON DELETE CASCADE;
23 |
24 | CREATE PROPERTY GRAPH FlexibleGraph
25 | NODE TABLES(
26 | FlexibleGraphNode
27 | KEY(id)
28 | LABEL Node PROPERTIES(
29 | cluster_id,
30 | id,
31 | properties,
32 | type)
33 | )
34 | EDGE TABLES(
35 | FlexibleGraphEdge
36 | KEY(id, edge_type, to_id, edge_id)
37 | SOURCE KEY(id) REFERENCES FlexibleGraphNode(id)
38 | DESTINATION KEY(to_id) REFERENCES FlexibleGraphNode(id)
39 | LABEL Edge PROPERTIES(
40 | edge_id,
41 | edge_type,
42 | id,
43 | properties,
44 | to_id)
45 | );
46 |
47 | -- MusicGraph
48 |
49 | CREATE TABLE ProductionCompanies
50 | (
51 | CompanyId INT64 NOT NULL,
52 | CompanyName STRING( MAX) NOT NULL,
53 | LocationCountry STRING( MAX) NOT NULL,
54 | FoundedYear INT64 NOT NULL,
55 | ) PRIMARY KEY(CompanyId);
56 |
57 | CREATE TABLE Singers
58 | (
59 | SingerId INT64 NOT NULL,
60 | FirstName STRING(1024),
61 | LastName STRING(1024),
62 | BirthDate DATE,
63 | ) PRIMARY KEY(SingerId);
64 |
65 | CREATE TABLE Albums
66 | (
67 | SingerId INT64 NOT NULL,
68 | AlbumId INT64 NOT NULL,
69 | AlbumTitle STRING( MAX),
70 | ReleaseDate DATE,
71 | CompanyId INT64 NOT NULL,
72 | CONSTRAINT FKProductionCompanyId FOREIGN KEY (CompanyId) REFERENCES ProductionCompanies (CompanyId),
73 | ) PRIMARY KEY(SingerId, AlbumId),
74 | INTERLEAVE IN PARENT Singers ON
75 | DELETE
76 | CASCADE;
77 |
78 | CREATE TABLE SingerContracts
79 | (
80 | SingerId INT64 NOT NULL,
81 | CompanyId INT64 NOT NULL,
82 | CONSTRAINT FKSingerCompanyId FOREIGN KEY (CompanyId) REFERENCES ProductionCompanies (CompanyId),
83 | ) PRIMARY KEY(SingerId, CompanyId),
84 | INTERLEAVE IN PARENT Singers ON
85 | DELETE
86 | CASCADE;
87 |
88 | CREATE TABLE SingerFriends
89 | (
90 | SingerId INT64 NOT NULL,
91 | FriendId INT64 NOT NULL,
92 | CONSTRAINT FKSingerFriendId FOREIGN KEY (FriendId) REFERENCES Singers (SingerId),
93 | ) PRIMARY KEY(SingerId, FriendId),
94 | INTERLEAVE IN PARENT Singers ON
95 | DELETE
96 | CASCADE;
97 |
98 | CREATE OR REPLACE PROPERTY GRAPH MusicGraph
99 | NODE TABLES(
100 | Albums AS Album
101 | KEY(SingerId, AlbumId)
102 | LABEL ALBUM PROPERTIES(
103 | AlbumId,
104 | AlbumTitle,
105 | CompanyId,
106 | ReleaseDate,
107 | SingerId),
108 |
109 | ProductionCompanies AS Company
110 | KEY(CompanyId)
111 | LABEL MUSIC_COMPANY PROPERTIES(
112 | FoundedYear AS founded_year,
113 | CompanyName AS name)
114 | LABEL MUSIC_CREATOR PROPERTIES(
115 | LocationCountry AS country_origin,
116 | CompanyName AS name),
117 |
118 | Singers AS Singer
119 | KEY(SingerId)
120 | LABEL MUSIC_CREATOR PROPERTIES(
121 | "US" AS country_origin,
122 | CONCAT(FirstName, " ", LastName) AS name)
123 | LABEL SINGER PROPERTIES(
124 | BirthDate AS birthday,
125 | SingerId AS id,
126 | CONCAT(FirstName, " ", LastName) AS singer_name)
127 | )
128 | EDGE TABLES(
129 | Albums AS COMPANY_PRODUCES_ALBUM
130 | KEY(CompanyId, SingerId, AlbumId)
131 | SOURCE KEY(CompanyId) REFERENCES Company(CompanyId)
132 | DESTINATION KEY(AlbumId, SingerId) REFERENCES Album(AlbumId, SingerId)
133 | LABEL CREATES_MUSIC PROPERTIES(
134 | AlbumId AS album_id,
135 | ReleaseDate AS release_date),
136 |
137 | Albums AS SINGER_CREATES_ALBUM
138 | KEY(SingerId, AlbumId)
139 | SOURCE KEY(SingerId) REFERENCES Singer(SingerId)
140 | DESTINATION KEY(AlbumId, SingerId) REFERENCES Album(AlbumId, SingerId)
141 | LABEL CREATES_MUSIC PROPERTIES(
142 | AlbumId AS album_id,
143 | ReleaseDate AS release_date),
144 |
145 | SingerFriends AS SINGER_HAS_FRIEND
146 | KEY(SingerId, FriendId)
147 | SOURCE KEY(SingerId) REFERENCES Singer(SingerId)
148 | DESTINATION KEY(FriendId) REFERENCES Singer(SingerId)
149 | LABEL KNOWS PROPERTIES(
150 | FriendId,
151 | SingerId),
152 |
153 | SingerContracts AS SINGER_SIGNED_BY_COMPANY
154 | KEY(SingerId, CompanyId)
155 | SOURCE KEY(SingerId) REFERENCES Singer(SingerId)
156 | DESTINATION KEY(CompanyId) REFERENCES Company(CompanyId)
157 | LABEL SIGNED_BY PROPERTIES(
158 | CompanyId,
159 | SingerId)
160 | );
161 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/test/resources/db/insert_data.sql:
--------------------------------------------------------------------------------
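-- Seed rows for the tables defined in populate_ddl.sql, covering edge cases such as
-- +inf/-inf/NaN floats, empty and NULL-containing arrays, and INT64/NUMERIC/DATE/TIMESTAMP
-- boundary values.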
1 | DELETE FROM ATable WHERE 1=1;
2 |
3 | INSERT INTO
4 | ATable(A, B, C, D, E)
5 | VALUES
6 | (1, "2", NULL, TIMESTAMP("2023-08-22T12:22:00Z"), 1000.282111401),
7 | (10, "20", NULL, TIMESTAMP("2023-08-22T12:23:00Z"), 10000.282111603),
8 | (30, "30", NULL, TIMESTAMP("2023-08-22T12:24:00Z"), 30000.282111805);
9 |
10 | DELETE FROM simpleTable WHERE 1=1;
11 |
12 | INSERT INTO
13 | simpleTable(A, B, C)
14 | VALUES
15 | (1, "1", 2.5),
16 | (2, "2", 5.0),
17 | (3, "3", CAST("+inf" AS FLOAT64)),
18 | (4, "4", CAST("-inf" AS FLOAT64)),
19 | (5, "5", CAST("NaN" AS FLOAT64)),
20 | (6, "6", 100000000017.100000000017),
21 | (7, "7", -0.1),
22 | (8, "8", +0.1),
23 | (9, "9", -19999997.9);
24 |
25 | DELETE FROM players WHERE 1=1;
26 | DELETE FROM games WHERE 1=1;
27 | INSERT INTO
28 | games(gameUUID, players, winner, created, finished, max_date)
29 | VALUES
30 | ("g1", ["p1", "p2", "p3"], "T1", TIMESTAMP("2023-08-26T12:22:00Z"), TIMESTAMP("2023-08-26T12:22:00Z"), DATE("2023-12-31T00:00:00Z")),
31 | ("g2", ["p4", "p5", "p6"], "T2", TIMESTAMP("2023-08-26T12:22:00Z"), TIMESTAMP("2023-08-26T12:22:00Z"), DATE("2023-12-31T00:00:00Z"));
32 |
33 | DELETE FROM game_items WHERE 1=1;
34 | INSERT INTO
35 | game_items(itemUUID, item_name, item_value, available_time, duration)
36 | VALUES
37 | ("gi_1", "powerup", 237, TIMESTAMP("2023-08-22T12:22:00Z"), 90),
38 | ("gi_2", "diff", 500, TIMESTAMP("2023-08-22T12:22:00Z"), 90);
39 |
40 | INSERT INTO
41 | players(playerUUID, player_name, email, password_hash, created, updated, stats, account_balance, is_logged_in, last_login, valid_email, current_game, dob)
42 | VALUES
43 | ("p1", "PLAYER 1", "p1@games.com", FROM_HEX("deadbeef"), TIMESTAMP("2023-08-26T12:22:00Z"), null, TO_JSON('{"a":"b"}'), 17517, true, TIMESTAMP("2023-08-26T12:22:00Z"), true, "g1", DATE("1999-06-06T00:00:00Z")),
44 | ("p2", "PLAYER 2", "p2@games.com", FROM_HEX("beefdead"), TIMESTAMP("2023-08-26T12:22:00Z"), null, TO_JSON('{"1":"2","k":291}'), 8519, false, TIMESTAMP("2023-08-26T12:22:00Z"), true, "g2", DATE("1997-12-06T00:00:00Z"));
45 |
46 |
47 | DELETE FROM compositeTable WHERE 1=1;
48 | INSERT INTO
49 | compositeTable(id, A, B, C, D, E, F, G, H, I, J, K)
50 | VALUES
51 | (
52 | "id1", [10, 100, 991, 567282], ["a", "b", "c"], "foobar", 2934, DATE(2023, 1, 1),
53 | TIMESTAMP("2023-08-26T12:22:05Z"), true, [DATE(2023, 1, 2), DATE(2023, 12, 31)],
54 | [TIMESTAMP("2023-08-26T12:11:10Z"), TIMESTAMP("2023-08-27T12:11:09Z")], FROM_HEX("beefdead"),
55 | JSON'{"a":1, "b":2}'
56 | ),
57 | (
58 | "id2", [20, 200, 2991, 888885], ["A", "B", "C"], "this one", 93411, DATE(2023, 9, 23),
59 | TIMESTAMP("2023-09-22T12:22:05Z"), false, [DATE(2023, 9, 2), DATE(2023, 12, 31)],
60 | [TIMESTAMP("2023-09-22T12:11:10Z"), TIMESTAMP("2023-09-23T12:11:09Z")], FROM_HEX("deadbeef"),
61 | JSON'{}'
62 | ),
63 | (
64 | "id3", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, b"deadbeef", NULL
65 | );
66 |
67 | DELETE FROM nullsTable WHERE 1=1;
68 | INSERT INTO
69 | nullsTable(id, A, B, C, D, E, F, G, H, I, J, K, M, N, O)
70 | VALUES
71 | (1, NULL, NULL, NULL, NULL, NULL, NULL, true, [NULL, DATE("2023-09-23T00:00:00Z")], NULL, [true, NULL, false], [23.67], NULL, NULL, [CAST(-99.37171 AS NUMERIC), NULL]),
72 | (2, [1, 2, NULL], NULL, NULL, 99.37171, NULL, NULL, NULL, [DATE("2022-10-02T00:00:00Z"), NULL], NULL, [NULL, NULL, true], [NULL, 198.1827], NULL, NULL, NULL),
73 | (3, [2, 3, NULL], ["a", "b", "FF", NULL], "😎🚨", NULL, NULL, TIMESTAMP("2023-09-23T12:11:09Z"), false, NULL, NULL, NULL, [-28888.8888, 0.12, NULL], NULL, NULL, [NULL, CAST(-55.7 AS NUMERIC), CAST(9.3 AS NUMERIC)]),
74 | (4, [NULL, 4, 57, 10], ["💡🚨", NULL, "b", "fg"], "🚨", 55.7, DATE(2023, 12, 31), NULL, false, NULL, [NULL, TIMESTAMP("2023-09-23T12:11:09Z")], [true, true], [0.71], [NULL, FROM_HEX("beefdead")], [NULL, JSON'{"a":1}'], [NULL, CAST(12 AS NUMERIC)]),
75 | (5, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL),
76 | (6, [NULL, 1234], [NULL, "stringarray"], NULL, NULL, NULL, NULL, NULL, [NULL, DATE(2023, 12, 31)], [NULL, TIMESTAMP("2023-09-23T12:11:09Z")], [NULL, true], [NULL, 0.000001], [NULL, b"beefdead"], [NULL, JSON'{"a":1}'], [NULL, CAST(123456 AS NUMERIC)]),
77 | (7, [], [], NULL, NULL, NULL, NULL, NULL, [], [], [], [], [], [], []);
78 |
79 |
80 | DELETE FROM bytesTable WHERE 1=1;
81 | INSERT INTO
82 | bytesTable(id, A)
83 | VALUES
84 | (1, B"ABCDEFGHIJKLMNOPQ"),
85 | (2, B"abcdefghijklmnopq"),
86 | (3, B"1234efghijklmnopq");
87 |
88 |
89 | DELETE FROM valueLimitsTable WHERE 1=1;
90 | INSERT INTO
91 | valueLimitsTable(A, B, C, D, E)
92 | VALUES
93 | (-9223372036854775808, CAST("NaN" AS FLOAT64), -9.9999999999999999999999999999999999999E+28, DATE("1700-01-01T00:00:00Z"), TIMESTAMP("9999-12-30T23:59:59.00Z")),
94 | (9223372036854775807, CAST("+inf" AS FLOAT64), +9.9999999999999999999999999999999999999E+28, DATE("4000-12-30T23:59:59Z"), TIMESTAMP("2222-02-22T22:22:22.999999Z")),
95 | (0, CAST("-inf" AS FLOAT64), 10.389, DATE("1900-12-30T23:59:59Z"), TIMESTAMP("2023-09-28 21:59:59", "America/Los_Angeles")),
96 | (1, CAST("0.657818" AS FLOAT64), -10.389, DATE("2023-09-28T00:00:00Z"), TIMESTAMP("0001-01-03T00:00:01Z"));
97 |
--------------------------------------------------------------------------------
/.mvn/wrapper/MavenWrapperDownloader.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2007-present the original author or authors.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | import java.net.*;
17 | import java.io.*;
18 | import java.nio.channels.*;
19 | import java.util.Properties;
20 |
21 | public class MavenWrapperDownloader {
22 |
23 | private static final String WRAPPER_VERSION = "0.5.6";
24 | /**
25 | * Default URL to download the maven-wrapper.jar from, if no 'downloadUrl' is provided.
26 | */
27 | private static final String DEFAULT_DOWNLOAD_URL = "https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/"
28 | + WRAPPER_VERSION + "/maven-wrapper-" + WRAPPER_VERSION + ".jar";
29 |
30 | /**
31 | * Path to the maven-wrapper.properties file, which might contain a downloadUrl property to
32 | * use instead of the default one.
33 | */
34 | private static final String MAVEN_WRAPPER_PROPERTIES_PATH =
35 | ".mvn/wrapper/maven-wrapper.properties";
36 |
37 | /**
38 | * Path where the maven-wrapper.jar will be saved to.
39 | */
40 | private static final String MAVEN_WRAPPER_JAR_PATH =
41 | ".mvn/wrapper/maven-wrapper.jar";
42 |
43 | /**
44 | * Name of the property which should be used to override the default download url for the wrapper.
45 | */
46 | private static final String PROPERTY_NAME_WRAPPER_URL = "wrapperUrl";
47 |
48 | public static void main(String args[]) {
49 | System.out.println("- Downloader started");
50 | File baseDirectory = new File(args[0]);
51 | System.out.println("- Using base directory: " + baseDirectory.getAbsolutePath());
52 |
53 | // If the maven-wrapper.properties exists, read it and check if it contains a custom
54 | // wrapperUrl parameter.
55 | File mavenWrapperPropertyFile = new File(baseDirectory, MAVEN_WRAPPER_PROPERTIES_PATH);
56 | String url = DEFAULT_DOWNLOAD_URL;
57 | if(mavenWrapperPropertyFile.exists()) {
58 | FileInputStream mavenWrapperPropertyFileInputStream = null;
59 | try {
60 | mavenWrapperPropertyFileInputStream = new FileInputStream(mavenWrapperPropertyFile);
61 | Properties mavenWrapperProperties = new Properties();
62 | mavenWrapperProperties.load(mavenWrapperPropertyFileInputStream);
63 | url = mavenWrapperProperties.getProperty(PROPERTY_NAME_WRAPPER_URL, url);
64 | } catch (IOException e) {
65 | System.out.println("- ERROR loading '" + MAVEN_WRAPPER_PROPERTIES_PATH + "'");
66 | } finally {
67 | try {
68 | if(mavenWrapperPropertyFileInputStream != null) {
69 | mavenWrapperPropertyFileInputStream.close();
70 | }
71 | } catch (IOException e) {
72 | // Ignore ...
73 | }
74 | }
75 | }
76 | System.out.println("- Downloading from: " + url);
77 |
78 | File outputFile = new File(baseDirectory.getAbsolutePath(), MAVEN_WRAPPER_JAR_PATH);
79 | if(!outputFile.getParentFile().exists()) {
80 | if(!outputFile.getParentFile().mkdirs()) {
81 | System.out.println(
82 | "- ERROR creating output directory '" + outputFile.getParentFile().getAbsolutePath() + "'");
83 | }
84 | }
85 | System.out.println("- Downloading to: " + outputFile.getAbsolutePath());
86 | try {
87 | downloadFileFromURL(url, outputFile);
88 | System.out.println("Done");
89 | System.exit(0);
90 | } catch (Throwable e) {
91 | System.out.println("- Error downloading");
92 | e.printStackTrace();
93 | System.exit(1);
94 | }
95 | }
96 |
97 | private static void downloadFileFromURL(String urlString, File destination) throws Exception {
98 | if (System.getenv("MVNW_USERNAME") != null && System.getenv("MVNW_PASSWORD") != null) {
99 | String username = System.getenv("MVNW_USERNAME");
100 | char[] password = System.getenv("MVNW_PASSWORD").toCharArray();
101 | Authenticator.setDefault(new Authenticator() {
102 | @Override
103 | protected PasswordAuthentication getPasswordAuthentication() {
104 | return new PasswordAuthentication(username, password);
105 | }
106 | });
107 | }
108 | URL website = new URL(urlString);
109 | ReadableByteChannel rbc;
110 | rbc = Channels.newChannel(website.openStream());
111 | FileOutputStream fos = new FileOutputStream(destination);
112 | fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE);
113 | fos.close();
114 | rbc.close();
115 | }
116 |
117 | }
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/graph/SpannerGraphConfigs.java:
--------------------------------------------------------------------------------
1 | package com.google.cloud.spark.spanner.graph;
2 |
3 | import com.google.cloud.spark.spanner.graph.PropertyGraph.GraphElementTable;
4 | import com.google.gson.Gson;
5 | import java.util.ArrayList;
6 | import java.util.HashSet;
7 | import java.util.LinkedHashSet;
8 | import java.util.List;
9 | import java.util.Map;
10 | import java.util.Set;
11 | import javax.annotation.Nonnull;
12 | import javax.annotation.Nullable;
13 |
14 | /** User-supplied configs for exporting graphs in Spanner */
15 | public class SpannerGraphConfigs {
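// Deserialized from the "configs" option with Gson. A rough example of the expected JSON
// (label and property names here are purely illustrative):
//   {
//     "outputIndividualKeys": true,
//     "nodeLabelConfigs": [{"label": "Person", "properties": ["name"]}],
//     "edgeLabelConfigs": [{"label": "*", "properties": []}],
//     "partitionSizeBytes": 100000000
//   }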
16 |
18 | /** Do not export ID columns directly, even when direct export would be possible */
18 | public boolean disableDirectIdExport = false;
19 |
20 | /** Output individual node element key columns instead of one column that concatenates all keys */
21 | public boolean outputIndividualKeys = false;
22 |
23 | /** Labels and properties to fetch for nodes */
24 | public List<LabelConfig> nodeLabelConfigs = new ArrayList<>();
25 |
26 | /** Labels and properties to fetch for edges */
27 | public List<LabelConfig> edgeLabelConfigs = new ArrayList<>();
28 |
29 | /**
30 | * Same as the partitionSizeBytes hint in PartitionOptions.
32 | */
33 | public Long partitionSizeBytes = null;
34 |
35 | /** Extra headers added to requests when fetching partitions of the graph */
36 | public Map<String, List<String>> extraHeaders = null;
37 |
38 | public static SpannerGraphConfigs fromJson(String json) {
39 | return new Gson().fromJson(json, SpannerGraphConfigs.class);
40 | }
41 |
42 | public void validate(PropertyGraph graphSchema, boolean directGqlQuery) {
43 | if (directGqlQuery) {
44 | if (!nodeLabelConfigs.isEmpty() || !edgeLabelConfigs.isEmpty()) {
45 | throw new IllegalArgumentException(
46 | "nodeLabelConfigs and edgeLabelConfigs are invalid "
47 | + "options when using GQL queries are provided.");
48 | }
49 | }
50 | checkExclusiveAnyLabel(nodeLabelConfigs);
51 | checkExclusiveAnyLabel(edgeLabelConfigs);
52 | for (LabelConfig labelConfig : nodeLabelConfigs) {
53 | labelConfig.validate(graphSchema, /*node=*/ true);
54 | }
55 | for (LabelConfig labelConfig : edgeLabelConfigs) {
56 | labelConfig.validate(graphSchema, /*node=*/ false);
57 | }
58 | if (partitionSizeBytes != null && partitionSizeBytes <= 0) {
59 | throw new IllegalArgumentException("partitionSizeBytes must be greater than 0");
60 | }
61 | }
62 |
63 | private void checkExclusiveAnyLabel(List<LabelConfig> labelConfigs) {
64 | boolean hasAnyLabel = labelConfigs.stream().anyMatch(lc -> "*".equals(lc.label));
65 | if (!hasAnyLabel) {
66 | return;
67 | }
68 | if (labelConfigs.size() > 1) {
69 | throw new IllegalArgumentException(
70 | "Label wildcard (\"*\") cannot be specified together with other label filters.");
71 | }
72 | }
73 |
74 | public static class LabelConfig {
75 |
76 | @Nonnull public String label;
77 | @Nonnull public List<String> properties;
78 | @Nullable public String filter;
79 |
80 | public LabelConfig(
81 | @Nonnull String label, @Nullable List<String> properties, @Nullable String filter) {
82 | this.label = label;
83 | this.filter = filter;
84 | this.properties = properties != null ? properties : new ArrayList<>();
85 | }
86 |
87 | private void validate(PropertyGraph graphSchema, boolean node) {
88 | if (label == null) {
89 | throw new IllegalArgumentException("label must be specified");
90 | }
91 |
92 | // Ensure label and properties exist in the graph
93 | if (label.equals("*")) {
94 | List<GraphElementTable> elementTables =
95 | node ? graphSchema.nodeTables : graphSchema.edgeTables;
96 | Set<String> availableProperties = new LinkedHashSet<>();
97 | elementTables.stream()
98 | .flatMap(t -> t.propertyDefinitions.stream())
99 | .map(d -> d.propertyDeclarationName)
100 | .forEach(availableProperties::add);
101 | for (String property : properties) {
102 | if (!availableProperties.contains(property)) {
103 | throw new IllegalArgumentException(
104 | String.format(
105 | "Cannot find %s property %s in the graph schema. Existing properties: %s",
106 | node ? "node" : "edge", property, availableProperties));
107 | }
108 | }
109 | } else {
110 | Set<String> availableProperties =
111 | new HashSet<>(
112 | graphSchema.labels.stream()
113 | .filter(l -> l.name.equalsIgnoreCase(label))
114 | .findFirst()
115 | .orElseThrow(
116 | () ->
117 | new IllegalArgumentException(
118 | String.format("Cannot find label %s in the graph schema.", label)))
119 | .propertyDeclarationNames);
120 | for (String property : properties) {
121 | if (!availableProperties.contains(property)) {
122 | throw new IllegalArgumentException(
123 | String.format("Cannot find property %s in label %s", property, label));
124 | }
125 | }
126 | }
127 | }
128 | }
129 | }
130 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/SpannerTableTest.java:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.cloud.spark.spanner;
16 |
17 | import static org.junit.Assert.assertEquals;
18 |
19 | import java.util.Arrays;
20 | import java.util.Map;
21 | import org.apache.spark.sql.types.DataTypes;
22 | import org.apache.spark.sql.types.MetadataBuilder;
23 | import org.apache.spark.sql.types.StructField;
24 | import org.apache.spark.sql.types.StructType;
25 | import org.junit.Test;
26 | import org.junit.runner.RunWith;
27 | import org.junit.runners.JUnit4;
28 |
29 | @RunWith(JUnit4.class)
30 | public class SpannerTableTest extends SpannerTestBase {
31 |
32 | @Test
33 | public void querySchemaShouldSucceedInSpannerTable() {
34 | Map<String, String> props = this.connectionProperties();
35 | SpannerTable spannerTable = new SpannerTable(props);
36 | StructType actualSchema = spannerTable.schema();
37 | MetadataBuilder jsonMetaBuilder = new MetadataBuilder();
38 | jsonMetaBuilder.putString(SpannerUtils.COLUMN_TYPE, "json");
39 | StructType expectSchema =
40 | new StructType(
41 | Arrays.asList(
42 | new StructField("A", DataTypes.LongType, false, null),
43 | new StructField("B", DataTypes.StringType, true, null),
44 | new StructField("C", DataTypes.BinaryType, true, null),
45 | new StructField("D", DataTypes.TimestampType, true, null),
46 | new StructField("E", DataTypes.createDecimalType(38, 9), true, null),
47 | new StructField(
48 | "F", DataTypes.createArrayType(DataTypes.StringType, true), true, null),
49 | new StructField("G", DataTypes.StringType, true, jsonMetaBuilder.build()))
50 | .toArray(new StructField[0]));
51 |
52 | // Object.equals fails for StructType with fields so we'll
53 | // firstly compare lengths, then fieldNames then the simpleString.
54 | assertEquals(expectSchema.length(), actualSchema.length());
55 | assertEquals(expectSchema.fieldNames(), actualSchema.fieldNames());
56 | assertEquals(expectSchema.simpleString(), actualSchema.simpleString());
57 | }
58 |
59 | @Test
60 | public void queryPgSchemaShouldSucceedInSpannerTable() {
61 | if (SpannerTableTest.emulatorHost != null && !SpannerTableTest.emulatorHost.isEmpty()) {
62 | // Spanner emulator doesn't support the PostgreSql dialect interface.
63 | // If the emulator is set, we return immediately here.
64 | // TODO: Use logger instead of System out once logger configuration is set.
65 | System.out.println(
66 | "queryPgSchemaShouldSucceedInSpannerTable is skipped since the PostgreSQL dialect is not supported in the Spanner emulator");
67 | return;
68 | }
69 | Map<String, String> props = this.connectionProperties(/* usePostgreSql= */ true);
70 | SpannerTable spannerTable = new SpannerTable(props);
71 | StructType actualSchema = spannerTable.schema();
72 | MetadataBuilder jsonMetaBuilder = new MetadataBuilder();
73 | jsonMetaBuilder.putString(SpannerUtils.COLUMN_TYPE, "jsonb");
74 | StructType expectSchema =
75 | new StructType(
76 | Arrays.asList(
77 | new StructField("id", DataTypes.LongType, false, null),
78 | new StructField("charvcol", DataTypes.StringType, true, null),
79 | new StructField("textcol", DataTypes.StringType, true, null),
80 | new StructField("varcharcol", DataTypes.StringType, true, null),
81 | new StructField("boolcol", DataTypes.BooleanType, true, null),
82 | new StructField("booleancol", DataTypes.BooleanType, true, null),
83 | new StructField("bigintcol", DataTypes.LongType, true, null),
84 | new StructField("int8col", DataTypes.LongType, true, null),
85 | new StructField("intcol", DataTypes.LongType, true, null),
86 | new StructField("doublecol", DataTypes.DoubleType, true, null),
87 | new StructField("floatcol", DataTypes.DoubleType, true, null),
88 | new StructField("bytecol", DataTypes.BinaryType, true, null),
89 | new StructField("datecol", DataTypes.DateType, true, null),
90 | new StructField("numericcol", DataTypes.createDecimalType(38, 9), true, null),
91 | new StructField("decimalcol", DataTypes.createDecimalType(38, 9), true, null),
92 | new StructField("timewithzonecol", DataTypes.TimestampType, true, null),
93 | new StructField("timestampcol", DataTypes.TimestampType, true, null),
94 | new StructField("jsoncol", DataTypes.StringType, true, jsonMetaBuilder.build()))
95 | .toArray(new StructField[0]));
96 |
97 | // Object.equals fails for StructType with fields so we'll
98 | // firstly compare lengths, then fieldNames then the simpleString.
99 | assertEquals(expectSchema.length(), actualSchema.length());
100 | assertEquals(expectSchema.fieldNames(), actualSchema.fieldNames());
101 | assertEquals(expectSchema.simpleString(), actualSchema.simpleString());
102 | }
103 | }
104 |
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/acceptance/AcceptanceTestUtils.java:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.cloud.spark.spanner.acceptance;
16 |
17 | import com.google.cloud.WriteChannel;
18 | import com.google.cloud.storage.*;
19 | import com.google.common.base.Preconditions;
20 | import com.google.common.io.ByteStreams;
21 | import java.io.*;
22 | import java.net.URI;
23 | import java.nio.ByteBuffer;
24 | import java.nio.MappedByteBuffer;
25 | import java.nio.channels.FileChannel;
26 | import java.nio.charset.StandardCharsets;
27 | import java.nio.file.Files;
28 | import java.nio.file.Path;
29 | import java.nio.file.Paths;
30 | import java.nio.file.attribute.FileTime;
31 | import java.util.Comparator;
32 | import java.util.function.Predicate;
33 | import java.util.stream.StreamSupport;
34 |
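/**
 * GCS helpers for the acceptance tests: locating the built connector jar, uploading artifacts
 * to the bucket named by the ACCEPTANCE_TEST_BUCKET environment variable, reading back CSV
 * results, and cleaning up per-test directories.
 */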
35 | public final class AcceptanceTestUtils {
36 | static final String BUCKET =
37 | Preconditions.checkNotNull(
38 | System.getenv("ACCEPTANCE_TEST_BUCKET"),
39 | "Please set the 'ACCEPTANCE_TEST_BUCKET' environment variable");
40 | static Storage storage =
41 | new StorageOptions.DefaultStorageFactory().create(StorageOptions.getDefaultInstance());
42 |
43 | public static Path getArtifact(Path targetDir, String prefix, String suffix) {
44 | Predicate<Path> prefixSuffixChecker = prefixSuffixChecker(prefix, suffix);
45 | try {
46 | return Files.list(targetDir)
47 | .filter(Files::isRegularFile)
48 | .filter(prefixSuffixChecker)
49 | .max(Comparator.comparing(AcceptanceTestUtils::lastModifiedTime))
50 | .get();
51 | } catch (IOException e) {
52 | throw new UncheckedIOException(e.getMessage(), e);
53 | }
54 | }
55 |
56 | public static String getCsv(String resultsDirUri) throws Exception {
57 | URI uri = new URI(resultsDirUri);
58 | Blob csvBlob =
59 | StreamSupport.stream(
60 | storage
61 | .list(
62 | uri.getAuthority(),
63 | Storage.BlobListOption.prefix(uri.getPath().substring(1)))
64 | .iterateAll()
65 | .spliterator(),
66 | false)
67 | .filter(blob -> blob.getName().endsWith("csv"))
68 | .findFirst()
69 | .get();
70 | return new String(storage.readAllBytes(csvBlob.getBlobId()), StandardCharsets.UTF_8);
71 | }
72 |
73 |   private static Predicate<Path> prefixSuffixChecker(final String prefix, final String suffix) {
74 | return path -> {
75 | String name = path.toFile().getName();
76 | return name.startsWith(prefix) && name.endsWith(suffix) && name.indexOf("-javadoc") == -1;
77 | };
78 | }
79 |
80 | private static FileTime lastModifiedTime(Path path) {
81 | try {
82 | return Files.getLastModifiedTime(path);
83 | } catch (IOException e) {
84 | throw new UncheckedIOException(e.getMessage(), e);
85 | }
86 | }
87 |
88 | public static BlobId copyToGcs(Path source, String destinationUri, String contentType)
89 | throws Exception {
90 | File sourceFile = source.toFile();
91 | try (FileInputStream sourceInputStream = new FileInputStream(sourceFile)) {
92 | FileChannel sourceFileChannel = sourceInputStream.getChannel();
93 | MappedByteBuffer sourceContent =
94 | sourceFileChannel.map(FileChannel.MapMode.READ_ONLY, 0, sourceFile.length());
95 | return uploadToGcs(sourceContent, destinationUri, contentType);
96 | } catch (IOException e) {
97 | throw new UncheckedIOException(
98 | String.format("Failed to write '%s' to '%s'", source, destinationUri), e);
99 | }
100 | }
101 |
102 | public static BlobId uploadToGcs(InputStream source, String destinationUri, String contentType)
103 | throws Exception {
104 | try {
105 | ByteBuffer sourceContent = ByteBuffer.wrap(ByteStreams.toByteArray(source));
106 | return uploadToGcs(sourceContent, destinationUri, contentType);
107 | } catch (IOException e) {
108 | throw new UncheckedIOException(String.format("Failed to write to '%s'", destinationUri), e);
109 | }
110 | }
111 |
112 | public static BlobId uploadToGcs(ByteBuffer content, String destinationUri, String contentType)
113 | throws Exception {
114 | URI uri = new URI(destinationUri);
115 | BlobId blobId = BlobId.of(uri.getAuthority(), uri.getPath().substring(1));
116 | BlobInfo blobInfo = BlobInfo.newBuilder(blobId).setContentType(contentType).build();
117 | try (WriteChannel writer = storage.writer(blobInfo)) {
118 | writer.write(content);
119 | } catch (IOException e) {
120 | throw new UncheckedIOException(String.format("Failed to write to '%s'", destinationUri), e);
121 | }
122 | return blobId;
123 | }
124 |
125 | public static String createTestBaseGcsDir(String testId) {
126 | return String.format("gs://%s/tests/%s", BUCKET, testId);
127 | }
128 |
129 | static void uploadConnectorJar(String targetDir, String prefix, String connectorJarUri)
130 | throws Exception {
131 | Path targetDirPath = Paths.get(targetDir);
132 | Path assemblyJar = AcceptanceTestUtils.getArtifact(targetDirPath, prefix, ".jar");
133 | AcceptanceTestUtils.copyToGcs(assemblyJar, connectorJarUri, "application/java-archive");
134 | }
135 |
136 | public static void deleteGcsDir(String testBaseGcsDir) throws Exception {
137 | URI uri = new URI(testBaseGcsDir);
138 | BlobId[] blobIds =
139 | StreamSupport.stream(
140 | storage
141 | .list(
142 | uri.getAuthority(),
143 | Storage.BlobListOption.prefix(uri.getPath().substring(1)))
144 | .iterateAll()
145 | .spliterator(),
146 | false)
147 | .map(Blob::getBlobId)
148 | .toArray(BlobId[]::new);
149 |     if (blobIds.length > 0) {
150 | storage.delete(blobIds);
151 | }
152 | }
153 | }
154 |
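A minimal sketch of how these utilities compose in a typical setup/teardown flow, assuming ACCEPTANCE_TEST_BUCKET is set and a connector JAR has already been built; the class name, testId, and target directory path are placeholders, and the sketch sits in the same package so it can reach the package-private uploadConnectorJar.

package com.google.cloud.spark.spanner.acceptance;

// Sketch only: composes the helpers above; testId and the target directory are made up.
public class AcceptanceGcsFlowSketch {
  public static void main(String[] args) throws Exception {
    String testId = "example-" + System.currentTimeMillis();
    String testBaseGcsDir = AcceptanceTestUtils.createTestBaseGcsDir(testId);
    String connectorJarUri = testBaseGcsDir + "/connector.jar";

    // Upload the newest matching JAR from the build output (javadoc JARs are skipped).
    AcceptanceTestUtils.uploadConnectorJar(
        "spark-3.1-spanner/target", "spark-3.1-spanner", connectorJarUri);

    // ... submit the Dataproc job, then read its output, e.g. via AcceptanceTestUtils.getCsv(...) ...

    // Remove everything written under the per-test GCS directory.
    AcceptanceTestUtils.deleteGcsDir(testBaseGcsDir);
  }
}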
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/SpannerScanner.java:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.cloud.spark.spanner;
16 |
17 | import com.fasterxml.jackson.core.JsonProcessingException;
18 | import com.google.cloud.Timestamp;
19 | import com.google.cloud.spanner.BatchReadOnlyTransaction;
20 | import com.google.cloud.spanner.Dialect;
21 | import com.google.cloud.spanner.Options;
22 | import com.google.cloud.spanner.PartitionOptions;
23 | import com.google.cloud.spanner.Statement;
24 | import com.google.cloud.spanner.TimestampBound;
25 | import com.google.common.collect.Streams;
26 | import java.util.List;
27 | import java.util.Map;
28 | import java.util.Optional;
29 | import java.util.Set;
30 | import java.util.stream.Collectors;
31 | import org.apache.spark.Partition;
32 | import org.apache.spark.sql.connector.read.Batch;
33 | import org.apache.spark.sql.connector.read.InputPartition;
34 | import org.apache.spark.sql.connector.read.PartitionReaderFactory;
35 | import org.apache.spark.sql.connector.read.Scan;
36 | import org.apache.spark.sql.sources.Filter;
37 | import org.apache.spark.sql.types.StructField;
38 | import org.apache.spark.sql.types.StructType;
39 | import org.slf4j.Logger;
40 | import org.slf4j.LoggerFactory;
41 |
42 | /**
43 |  * Implements both {@link Scan} and {@link Batch} for batch reads from a Spanner table.
44 |  */
45 | public class SpannerScanner implements Batch, Scan {
46 | private final SpannerTable spannerTable;
47 | private final Filter[] filters;
48 |   private final Set<String> requiredColumns;
49 |   private final Map<String, String> opts;
50 |   private static final Logger log = LoggerFactory.getLogger(SpannerScanner.class);
51 |   private final Timestamp INIT_TIME = Timestamp.now();
52 |   private final Map<String, StructField> fields;
53 | private final StructType readSchema;
54 |
55 |   public SpannerScanner(
56 |       Map<String, String> opts,
57 |       SpannerTable spannerTable,
58 |       Map<String, StructField> fields,
59 |       Filter[] filters,
60 |       Set<String> requiredColumns) {
61 | this.opts = opts;
62 | this.spannerTable = spannerTable;
63 | this.fields = fields;
64 | this.filters = filters;
65 | this.requiredColumns = requiredColumns;
66 | this.readSchema = SpannerUtils.pruneSchema(spannerTable.schema(), requiredColumns);
67 | }
68 |
69 | @Override
70 | public StructType readSchema() {
71 | return readSchema;
72 | }
73 |
74 | @Override
75 | public Batch toBatch() {
76 | return this;
77 | }
78 |
79 | @Override
80 | public PartitionReaderFactory createReaderFactory() {
81 | return new SpannerPartitionReaderFactory();
82 | }
83 |
84 |   static String buildColumnsWithTablePrefix(
85 |       String tableName, Set<String> columns, boolean isPostgreSql) {
86 | String quotedTableName = isPostgreSql ? "\"" + tableName + "\"" : "`" + tableName + "`";
87 | return columns.stream()
88 | .map(col -> isPostgreSql ? "\"" + col + "\"" : "`" + col + "`")
89 | .map(quotedCol -> quotedTableName + "." + quotedCol)
90 | .collect(Collectors.joining(", "));
91 | }
92 |
93 | @Override
94 | public InputPartition[] planInputPartitions() {
95 | BatchClientWithCloser batchClient = SpannerUtils.batchClientFromProperties(this.opts);
96 |
97 |     // 1. Use SELECT * if no required columns were requested; otherwise select only those columns.
98 |     String selectPrefix = "SELECT *";
99 |     if (this.requiredColumns != null && !this.requiredColumns.isEmpty()) {
100 | // Prefix each column with the table name to avoid ambiguity when column name
101 | // matches table name
102 | boolean isPostgreSql = batchClient.databaseClient.getDialect().equals(Dialect.POSTGRESQL);
103 | String columnsWithTablePrefix =
104 | buildColumnsWithTablePrefix(this.spannerTable.name(), this.requiredColumns, isPostgreSql);
105 | selectPrefix = "SELECT " + columnsWithTablePrefix;
106 | }
107 | String sqlStmt = selectPrefix + " FROM " + this.spannerTable.name();
108 | if (this.filters.length > 0) {
109 | sqlStmt +=
110 | " WHERE "
111 | + SparkFilterUtils.getCompiledFilter(
112 | true,
113 | Optional.empty(),
114 | batchClient.databaseClient.getDialect().equals(Dialect.POSTGRESQL),
115 | fields,
116 | this.filters);
117 | }
118 |
119 |     boolean enableDataboost = false;
120 | if (this.opts.containsKey("enableDataBoost")) {
121 | enableDataboost = this.opts.get("enableDataBoost").equalsIgnoreCase("true");
122 | }
123 |
124 | try (BatchReadOnlyTransaction txn =
125 | batchClient.batchClient.batchReadOnlyTransaction(
126 | TimestampBound.ofReadTimestamp(INIT_TIME))) {
127 | String mapAsJSON = SpannerUtils.serializeMap(this.opts);
128 |       List<com.google.cloud.spanner.Partition> rawPartitions =
129 | txn.partitionQuery(
130 | PartitionOptions.getDefaultInstance(),
131 | Statement.of(sqlStmt),
132 | Options.dataBoostEnabled(enableDataboost));
133 |
134 |       List<SpannerPartition> parts =
135 | Streams.mapWithIndex(
136 | rawPartitions.stream(),
137 | (part, index) ->
138 | new SpannerPartition(
139 | part,
140 | Math.toIntExact(index),
141 | new SpannerInputPartitionContext(
142 | part,
143 | txn.getBatchTransactionId(),
144 | mapAsJSON,
145 | new SpannerRowConverterDirect())))
146 | .collect(Collectors.toList());
147 |
148 | return parts.toArray(new InputPartition[0]);
149 | } catch (JsonProcessingException e) {
150 | throw new SpannerConnectorException(
151 | SpannerErrorCode.SPANNER_FAILED_TO_PARSE_OPTIONS, "Error parsing the input options.", e);
152 | } finally {
153 | batchClient.close();
154 | }
155 | }
156 | }
157 |
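A small sketch of the dialect-aware column quoting used by planInputPartitions above; the class name and the Singers table/column names are made up, and the sketch sits in the same package so it can call the package-private helper.

package com.google.cloud.spark.spanner;

import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.Set;

// Sketch only: shows how required columns are table-prefixed and quoted per dialect.
public class ColumnPrefixSketch {
  public static void main(String[] args) {
    Set<String> columns = new LinkedHashSet<>(Arrays.asList("id", "name"));
    // GoogleSQL: `Singers`.`id`, `Singers`.`name`
    System.out.println(SpannerScanner.buildColumnsWithTablePrefix("Singers", columns, false));
    // PostgreSQL dialect: "Singers"."id", "Singers"."name"
    System.out.println(SpannerScanner.buildColumnsWithTablePrefix("Singers", columns, true));
  }
}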
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/graph/PropertyGraph.java:
--------------------------------------------------------------------------------
1 | package com.google.cloud.spark.spanner.graph;
2 |
3 | import com.google.cloud.spanner.ResultSet;
4 | import com.google.cloud.spanner.Statement;
5 | import com.google.cloud.spanner.connection.Connection;
6 | import com.google.cloud.spark.spanner.SpannerConnectorException;
7 | import com.google.gson.Gson;
8 | import java.util.HashMap;
9 | import java.util.HashSet;
10 | import java.util.List;
11 | import java.util.Map;
12 | import java.util.Set;
13 | import java.util.stream.Collectors;
14 |
15 | /**
16 | * Parses INFORMATION_SCHEMA.PROPERTY_GRAPHS as defined in
17 | * https://cloud.google.com/spanner/docs/information-schema#property-graphs.
18 | */
19 | public class PropertyGraph {
20 |
21 |   public static final String GRAPH_ELEMENT_TABLE_KIND_NODE = "NODE";
22 |   public static final String GRAPH_ELEMENT_TABLE_KIND_EDGE = "EDGE";
23 |
24 | private static final String GRAPH_SCHEMA_QUERY =
25 | "SELECT PROPERTY_GRAPH_METADATA_JSON FROM "
26 | + "INFORMATION_SCHEMA.PROPERTY_GRAPHS WHERE PROPERTY_GRAPH_NAME = @graph";
27 |
28 | public String catalog;
29 | public String schema;
30 | public String name;
31 |   public List<GraphElementTable> nodeTables;
32 |   public List<GraphElementTable> edgeTables;
33 |   public List<GraphElementLabel> labels;
34 |   public List<GraphPropertyDeclaration> propertyDeclarations;
35 |   private Map<String, Integer> tableIdMapping;
36 |
37 | private PropertyGraph() {}
38 |
39 | public int getTableId(String elementTableName) {
40 | Integer tableId = tableIdMapping.get(elementTableName);
41 | if (tableId == null) {
42 | throw new IllegalArgumentException(
43 | String.format("Cannot find tableId for table with name=%s", elementTableName));
44 | }
45 | return tableId;
46 | }
47 |
48 | public String getPropertyType(String propertyName) {
49 | for (GraphPropertyDeclaration gpd : propertyDeclarations) {
50 | if (gpd.name.equals(propertyName)) {
51 | return gpd.type;
52 | }
53 | }
54 | throw new IllegalArgumentException("Cannot find property: " + propertyName);
55 | }
56 |
57 | public void checkEdgeReferenceKeyColumnsMatchNodeKeyColumns(GraphElementTable edgeTable) {
58 | if (!edgeTable.kind.equalsIgnoreCase(GRAPH_ELEMENT_TABLE_KIND_EDGE)) {
59 | throw new IllegalArgumentException();
60 | }
61 |     Map<String, Set<String>> nodeTableKeyColumns = new HashMap<>();
62 | for (GraphElementTable nodeTable : nodeTables) {
63 | nodeTableKeyColumns.put(nodeTable.name, new HashSet<>(nodeTable.keyColumns));
64 | }
65 | throwIfNodeTableKeyColumnsMismatch(
66 | nodeTableKeyColumns, edgeTable.sourceNodeTable, edgeTable.name, "source");
67 | throwIfNodeTableKeyColumnsMismatch(
68 | nodeTableKeyColumns, edgeTable.destinationNodeTable, edgeTable.name, "destination");
69 | }
70 |
71 | private static void throwIfNodeTableKeyColumnsMismatch(
72 |       Map<String, Set<String>> nodeTableKeyColumns,
73 | GraphNodeTableReference nodeTableReference,
74 | String edgeTableName,
75 | String type) {
76 | String nodeTableName = nodeTableReference.nodeTableName;
77 |     Set<String> expected = nodeTableKeyColumns.get(nodeTableReference.nodeTableName);
78 | if (!expected.equals(new HashSet<>(nodeTableReference.nodeTableColumns))) {
79 | throw new UnsupportedOperationException(
80 | String.format(
81 | "%s of edge table %s references node table %s using column(s) [%s], "
82 | + "while key column(s) of node table %s are [%s]. "
83 |                 + "Currently, the connector expects the key columns an edge table uses to reference "
84 | + "source/destination nodes to match the key columns of the node table.",
85 | type,
86 | edgeTableName,
87 | nodeTableName,
88 | String.join(", ", nodeTableReference.nodeTableColumns),
89 | nodeTableName,
90 | String.join(", ", expected)));
91 | }
92 | }
93 |
94 | public static class GraphElementTable {
95 |
96 | public String name;
97 | public String kind;
98 | public String baseCatalogName;
99 | public String baseSchemaName;
100 | public String baseTableName;
101 |     public List<String> keyColumns;
102 |     public List<String> labelNames;
103 |     public List<GraphPropertyDefinition> propertyDefinitions;
104 | public GraphNodeTableReference sourceNodeTable;
105 | public GraphNodeTableReference destinationNodeTable;
106 | }
107 |
108 | public static class GraphNodeTableReference {
109 |
110 | public String nodeTableName;
111 |     public List<String> edgeTableColumns;
112 |     public List<String> nodeTableColumns;
113 | }
114 |
115 | public static class GraphElementLabel {
116 |
117 | public String name;
118 |     public List<String> propertyDeclarationNames;
119 | }
120 |
121 | public static class GraphPropertyDeclaration {
122 |
123 | public String name;
124 | public String type;
125 | }
126 |
127 | public static class GraphPropertyDefinition {
128 |
129 | public String propertyDeclarationName;
130 | public String valueExpressionSql;
131 | }
132 |
133 | public static class Builder {
134 |
135 | public static PropertyGraph getFromSpanner(Connection conn, String graph) {
136 | Statement schemaQuery =
137 | Statement.newBuilder(GRAPH_SCHEMA_QUERY).bind("graph").to(graph).build();
138 | try (ResultSet rs = conn.executeQuery(schemaQuery)) {
139 | if (!rs.next()) {
140 | throw new SpannerConnectorException(
141 | String.format(
142 | "Unable to find the schema for graph %s. Query: %s", graph, schemaQuery));
143 | }
144 | String schemaJson = rs.getCurrentRowAsStruct().getJson(0);
145 | if (rs.next()) {
146 | throw new SpannerConnectorException(
147 | String.format(
148 | "Found more than one schema for graph %s. Query: %s", graph, schemaQuery));
149 | }
150 | return fromJson(schemaJson);
151 | }
152 | }
153 |
154 | public static PropertyGraph fromJson(String json) {
155 | PropertyGraph propertyGraph = new Gson().fromJson(json, PropertyGraph.class);
156 | propertyGraph.tableIdMapping =
157 | getTableIdMapping(propertyGraph.nodeTables, propertyGraph.edgeTables);
158 | return propertyGraph;
159 | }
160 |
161 |     private static Map<String, Integer> getTableIdMapping(
162 |         List<GraphElementTable> nodeTables, List<GraphElementTable> edgeTables) {
163 |       int tableCount = 0;
164 |       Map<String, Integer> tableIdMapping = new HashMap<>();
165 | for (String tableName :
166 | nodeTables.stream().map(t -> t.name).sorted().collect(Collectors.toList())) {
167 | tableIdMapping.put(tableName, ++tableCount);
168 | }
169 | for (String tableName :
170 | edgeTables.stream().map(t -> t.name).sorted().collect(Collectors.toList())) {
171 | tableIdMapping.put(tableName, ++tableCount);
172 | }
173 | return tableIdMapping;
174 | }
175 | }
176 | }
177 |
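For illustration, a hedged sketch showing how Builder.fromJson assigns table ids: node tables first, then edge tables, each group sorted by name and numbered from 1. The JSON below is hand-written and its keys simply mirror the public fields of PropertyGraph; it is not real INFORMATION_SCHEMA output, and the class name is made up.

package com.google.cloud.spark.spanner.graph;

// Sketch only: hand-written JSON for illustration; keys mirror PropertyGraph's fields.
public class PropertyGraphIdSketch {
  public static void main(String[] args) {
    String json =
        "{\"name\": \"ExampleGraph\","
            + " \"nodeTables\": [{\"name\": \"Singer\", \"kind\": \"NODE\"},"
            + " {\"name\": \"Album\", \"kind\": \"NODE\"}],"
            + " \"edgeTables\": [{\"name\": \"Knows\", \"kind\": \"EDGE\"}]}";
    PropertyGraph graph = PropertyGraph.Builder.fromJson(json);
    System.out.println(graph.getTableId("Album")); // 1 (node tables, sorted by name)
    System.out.println(graph.getTableId("Singer")); // 2
    System.out.println(graph.getTableId("Knows")); // 3 (edge tables follow node tables)
  }
}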
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/graph/query/SpannerGraphQuery.java:
--------------------------------------------------------------------------------
1 | package com.google.cloud.spark.spanner.graph.query;
2 |
3 | import com.google.cloud.spanner.Statement;
4 | import com.google.cloud.spanner.connection.Connection;
5 | import com.google.cloud.spark.spanner.SpannerTable;
6 | import com.google.cloud.spark.spanner.SpannerTableSchema;
7 | import com.google.cloud.spark.spanner.graph.PropertyGraph;
8 | import com.google.cloud.spark.spanner.graph.PropertyGraph.GraphElementTable;
9 | import com.google.cloud.spark.spanner.graph.SpannerGraphConfigs;
10 | import com.google.cloud.spark.spanner.graph.SpannerGraphConfigs.LabelConfig;
11 | import com.google.common.collect.ImmutableList;
12 | import com.google.common.collect.Iterables;
13 | import java.util.ArrayList;
14 | import java.util.Collections;
15 | import java.util.HashMap;
16 | import java.util.List;
17 | import java.util.Map;
18 | import java.util.Objects;
19 | import java.util.Set;
20 | import java.util.stream.Collectors;
21 | import org.apache.spark.sql.types.IntegralType;
22 | import org.apache.spark.sql.types.StructField;
23 | import org.apache.spark.sql.types.StructType;
24 | import org.slf4j.Logger;
25 | import org.slf4j.LoggerFactory;
26 |
27 | /**
28 | * Handles naming, schema, and mapping of columns across layers of the queries for a Spanner Graph
29 | */
30 | public class SpannerGraphQuery {
31 |
32 |   private static final Logger log = LoggerFactory.getLogger(SpannerGraphQuery.class);
33 |
34 | public final StructType dataframeSchema;
35 |   public final List<GraphSubQuery> graphSubqueries;
36 |
37 | /** Constructor for user-provided graph query */
38 | public SpannerGraphQuery(Connection conn, Statement query, boolean node) {
39 | DirectGraphQuery directGraphQuery = new DirectGraphQuery(conn, query, node);
40 | this.graphSubqueries = ImmutableList.of(directGraphQuery);
41 | this.dataframeSchema = fieldsToStruct(directGraphQuery.getOutputSparkFields());
42 | }
43 |
44 | public SpannerGraphQuery(
45 | Connection conn, PropertyGraph graphSchema, SpannerGraphConfigs configs, boolean node) {
46 |     List<GraphElementTable> nodeTables =
47 |         getMatchedElementTables(graphSchema.nodeTables, configs.nodeLabelConfigs);
48 |     List<GraphElementTable> edgeTables =
49 |         getMatchedElementTables(graphSchema.edgeTables, configs.edgeLabelConfigs);
50 |     Map<String, SpannerTableSchema> baseTableSchemas =
51 | getBaseTableSchemas(conn, Iterables.concat(nodeTables, edgeTables));
52 | boolean idColumnsExist =
53 | !configs.disableDirectIdExport && getIdColumnsExist(nodeTables, baseTableSchemas);
54 |
55 | List subQueries = new ArrayList<>();
56 | if (node) {
57 |       if (nodeTables.isEmpty()) {
58 |         throw new IllegalArgumentException("No node tables remain after label filtering.");
59 | }
60 | for (GraphElementTable table : nodeTables) {
61 | subQueries.add(
62 | NodeElementTableQuery.create(
63 | graphSchema,
64 | table,
65 | baseTableSchemas.get(table.baseTableName),
66 | configs,
67 | idColumnsExist));
68 | }
69 | } else {
70 |       if (edgeTables.isEmpty()) {
71 |         throw new IllegalArgumentException("No edge tables remain after label filtering.");
72 | }
73 | checkValidTableReference(nodeTables, edgeTables);
74 | for (GraphElementTable table : edgeTables) {
75 | subQueries.add(
76 | EdgeElementTableQuery.create(
77 | graphSchema,
78 | table,
79 | baseTableSchemas.get(table.baseTableName),
80 | configs,
81 | idColumnsExist));
82 | }
83 | }
84 | this.graphSubqueries = Collections.unmodifiableList(subQueries);
85 | this.dataframeSchema =
86 | ElementTableQuery.mergeDataframeSchema(
87 | subQueries, node ? configs.nodeLabelConfigs : configs.edgeLabelConfigs);
88 | }
89 |
90 |   private static boolean getIdColumnsExist(
91 |       List<GraphElementTable> nodeTables, Map<String, SpannerTableSchema> baseTableSchemas) {
92 | if (nodeTables.size() != 1 || nodeTables.get(0).keyColumns.size() != 1) {
93 | return false;
94 | }
95 | GraphElementTable nodeTable = nodeTables.get(0);
96 | if (nodeTable.keyColumns.size() != 1) {
97 | return false;
98 | }
99 | SpannerTableSchema tableSchema =
100 | Objects.requireNonNull(baseTableSchemas.get(nodeTable.baseTableName));
101 | String keyColumn = nodeTables.get(0).keyColumns.get(0);
102 | return tableSchema.getStructFieldForColumn(keyColumn).dataType() instanceof IntegralType;
103 | }
104 |
105 |   private static Map<String, SpannerTableSchema> getBaseTableSchemas(
106 |       Connection conn, Iterable<GraphElementTable> elementTables) {
107 |     Map<String, SpannerTableSchema> schemas = new HashMap<>();
108 | for (GraphElementTable t : elementTables) {
109 | schemas.put(t.baseTableName, new SpannerTableSchema(conn, t.baseTableName, false));
110 | }
111 | return schemas;
112 | }
113 |
114 | /**
115 | * Filters a list of element tables based on label filters
116 | *
117 | * @param elementTables the list of element tables to filter
118 | * @param labelConfigs the label config specified by the user
119 | * @return a list of element tables that have a matched label config
120 | */
121 |   private List<GraphElementTable> getMatchedElementTables(
122 |       List<GraphElementTable> elementTables, List<LabelConfig> labelConfigs) {
123 | if (labelConfigs == null || labelConfigs.isEmpty() || labelConfigs.get(0).label.equals("*")) {
124 | return new ArrayList<>(elementTables);
125 | }
126 |
127 |     Set<String> targetLabels =
128 | labelConfigs.stream().map(lc -> lc.label.trim().toLowerCase()).collect(Collectors.toSet());
129 | return elementTables.stream()
130 | .filter(
131 | t -> {
132 | for (String label : t.labelNames) {
133 | if (targetLabels.contains(label.toLowerCase())) {
134 | return true;
135 | }
136 | }
137 | return false;
138 | })
139 | .collect(Collectors.toList());
140 | }
141 |
142 |   private void checkValidTableReference(
143 |       List<GraphElementTable> nodeTables, List<GraphElementTable> edgeTables) {
144 |     Set<String> nodeTableNames = nodeTables.stream().map(t -> t.name).collect(Collectors.toSet());
145 | for (GraphElementTable t : edgeTables) {
146 | String srcTable = t.sourceNodeTable.nodeTableName;
147 | String dstTable = t.destinationNodeTable.nodeTableName;
148 | if (!nodeTableNames.contains(srcTable) || !nodeTableNames.contains(dstTable)) {
149 | throw new IllegalArgumentException(
150 | String.format(
151 | "One or both of the referenced node tables (%s, %s) of edge table %s are filtered"
152 | + " out. Existing node tables: %s.",
153 | srcTable, dstTable, t.name, nodeTableNames));
154 | }
155 | }
156 | }
157 |
158 |   private StructType fieldsToStruct(List<StructField> fields) {
159 | StructType result = new StructType();
160 | for (StructField field : fields) {
161 | result = result.add(field);
162 | }
163 | return result;
164 | }
165 | }
166 |
--------------------------------------------------------------------------------
/mvnw.cmd:
--------------------------------------------------------------------------------
1 | @REM ----------------------------------------------------------------------------
2 | @REM Licensed to the Apache Software Foundation (ASF) under one
3 | @REM or more contributor license agreements. See the NOTICE file
4 | @REM distributed with this work for additional information
5 | @REM regarding copyright ownership. The ASF licenses this file
6 | @REM to you under the Apache License, Version 2.0 (the
7 | @REM "License"); you may not use this file except in compliance
8 | @REM with the License. You may obtain a copy of the License at
9 | @REM
10 | @REM http://www.apache.org/licenses/LICENSE-2.0
11 | @REM
12 | @REM Unless required by applicable law or agreed to in writing,
13 | @REM software distributed under the License is distributed on an
14 | @REM "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | @REM KIND, either express or implied. See the License for the
16 | @REM specific language governing permissions and limitations
17 | @REM under the License.
18 | @REM ----------------------------------------------------------------------------
19 |
20 | @REM ----------------------------------------------------------------------------
21 | @REM Maven Start Up Batch script
22 | @REM
23 | @REM Required ENV vars:
24 | @REM JAVA_HOME - location of a JDK home dir
25 | @REM
26 | @REM Optional ENV vars
27 | @REM M2_HOME - location of maven2's installed home dir
28 | @REM MAVEN_BATCH_ECHO - set to 'on' to enable the echoing of the batch commands
29 | @REM MAVEN_BATCH_PAUSE - set to 'on' to wait for a keystroke before ending
30 | @REM MAVEN_OPTS - parameters passed to the Java VM when running Maven
31 | @REM e.g. to debug Maven itself, use
32 | @REM set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000
33 | @REM MAVEN_SKIP_RC - flag to disable loading of mavenrc files
34 | @REM ----------------------------------------------------------------------------
35 |
36 | @REM Begin all REM lines with '@' in case MAVEN_BATCH_ECHO is 'on'
37 | @echo off
38 | @REM set title of command window
39 | title %0
40 | @REM enable echoing by setting MAVEN_BATCH_ECHO to 'on'
41 | @if "%MAVEN_BATCH_ECHO%" == "on" echo %MAVEN_BATCH_ECHO%
42 |
43 | @REM set %HOME% to equivalent of $HOME
44 | if "%HOME%" == "" (set "HOME=%HOMEDRIVE%%HOMEPATH%")
45 |
46 | @REM Execute a user defined script before this one
47 | if not "%MAVEN_SKIP_RC%" == "" goto skipRcPre
48 | @REM check for pre script, once with legacy .bat ending and once with .cmd ending
49 | if exist "%HOME%\mavenrc_pre.bat" call "%HOME%\mavenrc_pre.bat"
50 | if exist "%HOME%\mavenrc_pre.cmd" call "%HOME%\mavenrc_pre.cmd"
51 | :skipRcPre
52 |
53 | @setlocal
54 |
55 | set ERROR_CODE=0
56 |
57 | @REM To isolate internal variables from possible post scripts, we use another setlocal
58 | @setlocal
59 |
60 | @REM ==== START VALIDATION ====
61 | if not "%JAVA_HOME%" == "" goto OkJHome
62 |
63 | echo.
64 | echo Error: JAVA_HOME not found in your environment. >&2
65 | echo Please set the JAVA_HOME variable in your environment to match the >&2
66 | echo location of your Java installation. >&2
67 | echo.
68 | goto error
69 |
70 | :OkJHome
71 | if exist "%JAVA_HOME%\bin\java.exe" goto init
72 |
73 | echo.
74 | echo Error: JAVA_HOME is set to an invalid directory. >&2
75 | echo JAVA_HOME = "%JAVA_HOME%" >&2
76 | echo Please set the JAVA_HOME variable in your environment to match the >&2
77 | echo location of your Java installation. >&2
78 | echo.
79 | goto error
80 |
81 | @REM ==== END VALIDATION ====
82 |
83 | :init
84 |
85 | @REM Find the project base dir, i.e. the directory that contains the folder ".mvn".
86 | @REM Fallback to current working directory if not found.
87 |
88 | set MAVEN_PROJECTBASEDIR=%MAVEN_BASEDIR%
89 | IF NOT "%MAVEN_PROJECTBASEDIR%"=="" goto endDetectBaseDir
90 |
91 | set EXEC_DIR=%CD%
92 | set WDIR=%EXEC_DIR%
93 | :findBaseDir
94 | IF EXIST "%WDIR%"\.mvn goto baseDirFound
95 | cd ..
96 | IF "%WDIR%"=="%CD%" goto baseDirNotFound
97 | set WDIR=%CD%
98 | goto findBaseDir
99 |
100 | :baseDirFound
101 | set MAVEN_PROJECTBASEDIR=%WDIR%
102 | cd "%EXEC_DIR%"
103 | goto endDetectBaseDir
104 |
105 | :baseDirNotFound
106 | set MAVEN_PROJECTBASEDIR=%EXEC_DIR%
107 | cd "%EXEC_DIR%"
108 |
109 | :endDetectBaseDir
110 |
111 | IF NOT EXIST "%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config" goto endReadAdditionalConfig
112 |
113 | @setlocal EnableExtensions EnableDelayedExpansion
114 | for /F "usebackq delims=" %%a in ("%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config") do set JVM_CONFIG_MAVEN_PROPS=!JVM_CONFIG_MAVEN_PROPS! %%a
115 | @endlocal & set JVM_CONFIG_MAVEN_PROPS=%JVM_CONFIG_MAVEN_PROPS%
116 |
117 | :endReadAdditionalConfig
118 |
119 | SET MAVEN_JAVA_EXE="%JAVA_HOME%\bin\java.exe"
120 | set WRAPPER_JAR="%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.jar"
121 | set WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain
122 |
123 | set DOWNLOAD_URL="https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/0.5.6/maven-wrapper-0.5.6.jar"
124 |
125 | FOR /F "tokens=1,2 delims==" %%A IN ("%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.properties") DO (
126 | IF "%%A"=="wrapperUrl" SET DOWNLOAD_URL=%%B
127 | )
128 |
129 | @REM Extension to allow automatically downloading the maven-wrapper.jar from Maven-central
130 | @REM This allows using the maven wrapper in projects that prohibit checking in binary data.
131 | if exist %WRAPPER_JAR% (
132 | if "%MVNW_VERBOSE%" == "true" (
133 | echo Found %WRAPPER_JAR%
134 | )
135 | ) else (
136 | if not "%MVNW_REPOURL%" == "" (
137 | SET DOWNLOAD_URL="%MVNW_REPOURL%/io/takari/maven-wrapper/0.5.6/maven-wrapper-0.5.6.jar"
138 | )
139 | if "%MVNW_VERBOSE%" == "true" (
140 | echo Couldn't find %WRAPPER_JAR%, downloading it ...
141 | echo Downloading from: %DOWNLOAD_URL%
142 | )
143 |
144 | powershell -Command "&{"^
145 | "$webclient = new-object System.Net.WebClient;"^
146 | "if (-not ([string]::IsNullOrEmpty('%MVNW_USERNAME%') -and [string]::IsNullOrEmpty('%MVNW_PASSWORD%'))) {"^
147 | "$webclient.Credentials = new-object System.Net.NetworkCredential('%MVNW_USERNAME%', '%MVNW_PASSWORD%');"^
148 | "}"^
149 | "[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12; $webclient.DownloadFile('%DOWNLOAD_URL%', '%WRAPPER_JAR%')"^
150 | "}"
151 | if "%MVNW_VERBOSE%" == "true" (
152 | echo Finished downloading %WRAPPER_JAR%
153 | )
154 | )
155 | @REM End of extension
156 |
157 | @REM Provide a "standardized" way to retrieve the CLI args that will
158 | @REM work with both Windows and non-Windows executions.
159 | set MAVEN_CMD_LINE_ARGS=%*
160 |
161 | %MAVEN_JAVA_EXE% %JVM_CONFIG_MAVEN_PROPS% %MAVEN_OPTS% %MAVEN_DEBUG_OPTS% -classpath %WRAPPER_JAR% "-Dmaven.multiModuleProjectDirectory=%MAVEN_PROJECTBASEDIR%" %WRAPPER_LAUNCHER% %MAVEN_CONFIG% %*
162 | if ERRORLEVEL 1 goto error
163 | goto end
164 |
165 | :error
166 | set ERROR_CODE=1
167 |
168 | :end
169 | @endlocal & set ERROR_CODE=%ERROR_CODE%
170 |
171 | if not "%MAVEN_SKIP_RC%" == "" goto skipRcPost
172 | @REM check for post script, once with legacy .bat ending and once with .cmd ending
173 | if exist "%HOME%\mavenrc_post.bat" call "%HOME%\mavenrc_post.bat"
174 | if exist "%HOME%\mavenrc_post.cmd" call "%HOME%\mavenrc_post.cmd"
175 | :skipRcPost
176 |
177 | @REM pause the script if MAVEN_BATCH_PAUSE is set to 'on'
178 | if "%MAVEN_BATCH_PAUSE%" == "on" pause
179 |
180 | if "%MAVEN_TERMINATE_CMD%" == "on" exit %ERROR_CODE%
181 |
182 | exit /B %ERROR_CODE%
--------------------------------------------------------------------------------
/spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/graph/SpannerGraphScanner.java:
--------------------------------------------------------------------------------
1 | package com.google.cloud.spark.spanner.graph;
2 |
3 | import com.fasterxml.jackson.core.JsonProcessingException;
4 | import com.google.api.gax.grpc.GrpcCallContext;
5 | import com.google.api.gax.rpc.ApiCallContext;
6 | import com.google.cloud.Tuple;
7 | import com.google.cloud.spanner.BatchReadOnlyTransaction;
8 | import com.google.cloud.spanner.BatchTransactionId;
9 | import com.google.cloud.spanner.Options;
10 | import com.google.cloud.spanner.Partition;
11 | import com.google.cloud.spanner.PartitionOptions;
12 | import com.google.cloud.spanner.SpannerOptions;
13 | import com.google.cloud.spanner.SpannerOptions.CallContextConfigurator;
14 | import com.google.cloud.spanner.Statement;
15 | import com.google.cloud.spanner.TimestampBound;
16 | import com.google.cloud.spark.spanner.BatchClientWithCloser;
17 | import com.google.cloud.spark.spanner.SpannerInputPartitionContext;
18 | import com.google.cloud.spark.spanner.SpannerPartition;
19 | import com.google.cloud.spark.spanner.SpannerPartitionReaderFactory;
20 | import com.google.cloud.spark.spanner.SpannerRowConverter;
21 | import com.google.cloud.spark.spanner.SpannerScanner;
22 | import com.google.cloud.spark.spanner.SpannerUtils;
23 | import com.google.cloud.spark.spanner.graph.query.SpannerGraphQuery;
24 | import com.google.common.collect.ImmutableSet;
25 | import io.grpc.Context;
26 | import io.grpc.MethodDescriptor;
27 | import java.util.ArrayList;
28 | import java.util.List;
29 | import java.util.Map;
30 | import java.util.Set;
31 | import java.util.stream.Collectors;
32 | import javax.annotation.Nullable;
33 | import org.apache.spark.sql.connector.read.Batch;
34 | import org.apache.spark.sql.connector.read.PartitionReaderFactory;
35 | import org.apache.spark.sql.connector.read.Scan;
36 | import org.apache.spark.sql.types.StructType;
37 | import org.apache.spark.sql.util.CaseInsensitiveStringMap;
38 | import org.slf4j.Logger;
39 | import org.slf4j.LoggerFactory;
40 |
41 | /** Logically and physically represents a scan of a graph in Spanner */
42 | public class SpannerGraphScanner implements Batch, Scan {
43 |
44 |   private static final Logger log = LoggerFactory.getLogger(SpannerGraphScanner.class);
45 |
46 |   private final Map<String, String> options;
47 |   private final @Nullable Map<String, List<String>> extraHeaders;
48 |   private final TimestampBound readTimestamp;
49 |   private final @Nullable Long partitionSizeBytes;
50 |   private final Options.ReadAndQueryOption dataBoostEnabled;
51 |   private final @Nullable ImmutableSet<String> requiredColumns;
52 |   private final StructType readSchema;
53 |   private final List<Tuple<Statement, SpannerRowConverter>> queryAndRowConverters;
54 |   private final List<SpannerPartition> partitions;
55 |
56 | public SpannerGraphScanner(
57 |       Map<String, String> options,
58 |       @Nullable Map<String, List<String>> extraHeaders,
59 | TimestampBound readTimestamp,
60 | @Nullable Long partitionSizeBytes,
61 | Options.ReadAndQueryOption dataBoostEnabled,
62 | SpannerGraphQuery graphQuery,
63 |       @Nullable Set<String> requiredColumns,
64 | StructType readSchema) {
65 | // Potential improvement: support filter pushdown.
66 | this.options = new CaseInsensitiveStringMap(options);
67 | this.extraHeaders = extraHeaders;
68 | this.readTimestamp = readTimestamp;
69 | this.partitionSizeBytes = partitionSizeBytes;
70 | this.dataBoostEnabled = dataBoostEnabled;
71 | this.requiredColumns = requiredColumns == null ? null : ImmutableSet.copyOf(requiredColumns);
72 | this.readSchema = readSchema;
73 | this.queryAndRowConverters =
74 | graphQuery.graphSubqueries.stream()
75 | .map(q -> q.getQueryAndConverter(readSchema))
76 | .collect(Collectors.toList());
77 | this.partitions = new ArrayList<>(); // Filled later
78 | }
79 |
80 | /**
81 | * Returns a list of {@link SpannerPartition input partitions}. Each {@link SpannerPartition}
82 | * represents a data split that can be processed by one Spark task. The number of input partitions
83 | * returned here is the same as the number of RDD partitions this scan outputs.
84 | */
85 | @Override
86 | public SpannerPartition[] planInputPartitions() {
87 | if (extraHeaders == null || extraHeaders.isEmpty()) {
88 | return doPlanInputPartitions();
89 | }
90 | Context context =
91 | Context.current()
92 | .withValue(
93 | SpannerOptions.CALL_CONTEXT_CONFIGURATOR_KEY,
94 | new CallContextConfigurator() {
95 | @Override
96 |                   public <ReqT, RespT> ApiCallContext configure(
97 |                       ApiCallContext context, ReqT request, MethodDescriptor<ReqT, RespT> method) {
98 | return GrpcCallContext.createDefault().withExtraHeaders(extraHeaders);
99 | }
100 | });
101 | try {
102 | return context.call(this::doPlanInputPartitions);
103 | } catch (Exception e) {
104 | throw new RuntimeException(e);
105 | }
106 | }
107 |
108 | private SpannerPartition[] doPlanInputPartitions() {
109 | String optionsJson;
110 | try {
111 | optionsJson = SpannerUtils.serializeMap(options);
112 | } catch (JsonProcessingException e) {
113 | throw new RuntimeException(e);
114 | }
115 | try (BatchClientWithCloser batchClient = SpannerUtils.batchClientFromProperties(options)) {
116 | try (BatchReadOnlyTransaction txn =
117 | batchClient.batchClient.batchReadOnlyTransaction(readTimestamp)) {
118 | BatchTransactionId txnId = txn.getBatchTransactionId();
119 |         PartitionOptions partitionOptions = PartitionOptions.getDefaultInstance();
120 |         if (partitionSizeBytes != null) {
121 |           partitionOptions = PartitionOptions.newBuilder().setPartitionSizeBytes(partitionSizeBytes).build();
122 | }
123 |
124 | partitions.clear();
125 |         for (Tuple<Statement, SpannerRowConverter> queryAndRowConverter : queryAndRowConverters) {
126 |           List<Partition> rawPartitions =
127 |               txn.partitionQuery(partitionOptions, queryAndRowConverter.x(), dataBoostEnabled);
128 |
129 | for (Partition rawPartition : rawPartitions) {
130 | SpannerInputPartitionContext context =
131 | new SpannerInputPartitionContext(
132 | rawPartition, txnId, optionsJson, queryAndRowConverter.y());
133 | int index = partitions.size();
134 | partitions.add(new SpannerPartition(rawPartition, index, context));
135 | }
136 | }
137 | log.info("Number of partitions: " + partitions.size());
138 | return partitions.toArray(new SpannerPartition[0]);
139 | }
140 | }
141 | }
142 |
143 | @Override
144 | public PartitionReaderFactory createReaderFactory() {
145 | return new SpannerPartitionReaderFactory();
146 | }
147 |
148 | @Override
149 | public StructType readSchema() {
150 | return readSchema;
151 | }
152 |
153 | @Override
154 | public Batch toBatch() {
155 | return this;
156 | }
157 |
158 | @Override
159 | public String description() {
160 | return String.format(
161 | "%s\nRequired Columns: %s\nRead Timestamp: %s"
162 | + "\nStatements (%d):\n%s\nNumber of Partitions: %d",
163 | this.getClass(),
164 | requiredColumns,
165 | readTimestamp,
166 | queryAndRowConverters.size(),
167 | queryAndRowConverters.stream()
168 | .map(qc -> qc.x().toString())
169 | .collect(Collectors.joining("\n")),
170 | partitions.size());
171 | }
172 | }
173 |
--------------------------------------------------------------------------------
/python/spannergraph/tests.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tempfile
3 | import unittest
4 | import logging
5 |
6 | from datetime import datetime, timedelta
7 |
8 | from pyspark.sql import DataFrame
9 | from pyspark.sql.session import SparkSession
10 | from pyspark.sql import functions as sf
11 |
12 | from ._connector import SpannerGraphConnector
13 |
14 | from .tests_gold import *
15 |
16 |
17 | logging.basicConfig(level=logging.INFO)
18 |
19 |
20 | def get_connector_jar() -> str:
21 | jar_path = os.path.abspath(
22 | "../spark-3.1-spanner/target/spark-3.1-spanner-0.0.1-SNAPSHOT.jar"
23 | )
24 | assert os.path.exists(jar_path), (
25 | f"Cannot find connector JAR at {jar_path}. "
26 | "Please build the connector JAR first."
27 | )
28 | return jar_path
29 |
30 |
31 | def _as_pandas_str(df: DataFrame) -> str:
32 | df_pd = df.toPandas()
33 | return df_pd.sort_values(by=df_pd.columns.to_list()) \
34 | .reset_index(drop=True).to_string()
35 |
36 |
37 | # These tests rely on the graphs named FlexibleGraph and MusicGraph
38 | # in the databases named flexible-graph and music-graph, respectively.
39 | # See the following files for the definitions of the graphs:
40 | # spark-3.1-spanner-lib/src/test/resources/db/populate_ddl_graph.sql
41 | # spark-3.1-spanner-lib/src/test/resources/db/insert_data_graph.sql
42 |
43 |
44 | class TestGraphConnector(unittest.TestCase):
45 |
46 | def __init__(self, *args, **kwargs) -> None:
47 | super().__init__(*args, **kwargs)
48 | self.check_point_dir = tempfile.TemporaryDirectory()
49 | self.spark = (
50 | SparkSession.builder.appName("spanner-spark-connector-test")
51 | .config(
52 | "spark.jars.packages",
53 | "graphframes:graphframes:0.8.4-spark3.5-s_2.12",
54 | )
55 | .config("spark.jars", get_connector_jar())
56 | .getOrCreate()
57 | )
58 | self.spark.sparkContext.setCheckpointDir(self.check_point_dir.name)
59 |
60 | def test_flexible_graph_cc(self) -> None:
61 | connector = (
62 | SpannerGraphConnector()
63 | .spark(self.spark)
64 | .project(os.getenv("SPANNER_PROJECT_ID"))
65 | .instance(os.getenv("SPANNER_INSTANCE_ID"))
66 | .database("flexible-graph")
67 | .graph("FlexibleGraph")
68 | .data_boost()
69 | .node_label("*", properties=["id"])
70 | .edge_label("*", properties=["to_id"],
71 | where="id < 100 AND to_id < 100")
72 | .partition_size_bytes(1) # hint only
73 | .repartition(3)
74 | .read_timestamp(datetime.now() - timedelta(minutes=10))
75 | )
76 |
77 | g = connector.load_graph()
78 |
79 | vertices_str = self._df_to_str(g.vertices)
80 | edges_str = self._df_to_str(g.edges)
81 | self.assertEqual(
82 | vertices_str,
83 | "['id', 'property_id'] - "
84 | + "[[1, 1], [2, 2], [3, 3], [7, 7], [16, 16], "
85 | + "[20, 20], [100, 100], [101, 101]]",
86 | )
87 | self.assertEqual(
88 | edges_str,
89 | "['src', 'dst', 'property_to_id'] - "
90 | + "[[1, 7, 7], [2, 20, 20], [3, 16, 16], "
91 | + "[7, 16, 16], [7, 16, 16], [16, 20, 20], "
92 | + "[20, 7, 7], [20, 16, 16]]",
93 | )
94 |
95 | cc = g.connectedComponents()
96 | cc_str = self._df_to_str(cc)
97 | self.assertEqual(
98 | cc_str,
99 | "['id', 'property_id', 'component'] - "
100 | + "[[1, 1, 1], [2, 2, 1], [3, 3, 1], [7, 7, 1], "
101 | + "[16, 16, 1], [20, 20, 1], [100, 100, 100], "
102 | + "[101, 101, 101]]",
103 | )
104 |
105 | self.assertEqual(g.vertices.rdd.getNumPartitions(), 3)
106 | self.assertEqual(g.edges.rdd.getNumPartitions(), 3)
107 |
108 | def test_music_graph_cc(self) -> None:
109 | connector = (
110 | SpannerGraphConnector()
111 | .spark(self.spark)
112 | .project(os.getenv("SPANNER_PROJECT_ID"))
113 | .instance(os.getenv("SPANNER_INSTANCE_ID"))
114 | .database("music-graph")
115 | .graph("MusicGraph")
116 | .data_boost(True)
117 | .repartition(6)
118 | .node_label(
119 | "*",
120 | properties=[
121 | "name", "country_origin", "birthday", "ReleaseDate"
122 | ]
123 | )
124 | .edge_label(
125 | "*",
126 | properties=["SingerId", "release_date", "FriendId"]
127 | )
128 | )
129 |
130 | df_nodes, df_edges, df_mapping = connector.load_dfs()
131 | df_nodes = df_nodes.join(df_mapping, "id").drop("id")
132 | df_edges_src = df_edges \
133 | .join(df_mapping, sf.expr("src <=> id")).drop("id", "src", "dst")
134 | df_edges_dst = df_edges \
135 | .join(df_mapping, sf.expr("dst <=> id")).drop("id", "src", "dst")
136 |
137 | vertices_str = _as_pandas_str(df_nodes)
138 | edges_str_src = _as_pandas_str(df_edges_src)
139 | edges_str_dst = _as_pandas_str(df_edges_dst)
140 |
141 | self.assertEqual(vertices_str, TEST_MUSIC_GRAPH_CC_VERTICES)
142 | self.assertEqual(edges_str_src, TEST_MUSIC_GRAPH_CC_EDGES_SRC)
143 | self.assertEqual(edges_str_dst, TEST_MUSIC_GRAPH_CC_EDGES_DST)
144 |
145 | def test_flexible_graph_undirected(self) -> None:
146 | connector = (
147 | SpannerGraphConnector()
148 | .spark(self.spark)
149 | .project(os.getenv("SPANNER_PROJECT_ID"))
150 | .instance(os.getenv("SPANNER_INSTANCE_ID"))
151 | .database("flexible-graph")
152 | .graph("FlexibleGraph")
153 | .symmetrize_graph()
154 | .edge_label("*", properties=["to_id"])
155 | .repartition(7)
156 | )
157 |
158 | g = connector.load_graph()
159 |
160 | vertices_str = self._df_to_str(g.vertices)
161 | edges_str = self._df_to_str(g.edges)
162 | self.assertEqual(
163 | vertices_str,
164 | "['id'] - [[1], [2], [3], [7], [16], [20], [100], [101]]",
165 | )
166 | self.assertEqual(
167 | edges_str,
168 | "['src', 'dst', 'property_to_id'] - "
169 | "[[1, 7, 7], [2, 20, 20], [3, 16, 16], [7, 1, 7], [7, 16, 16], "
170 | "[7, 20, 7], [16, 3, 16], [16, 7, 16], [16, 20, 16], "
171 | "[16, 20, 20], [20, 2, 20], [20, 7, 7], [20, 16, 16], "
172 | "[20, 16, 20], [100, 101, 101], [101, 100, 101]]",
173 | )
174 |
175 | self.assertEqual(g.vertices.rdd.getNumPartitions(), 7)
176 | self.assertEqual(g.edges.rdd.getNumPartitions(), 7)
177 |
178 | def test_music_graph_direct_queries(self) -> None:
179 | node_query = "SELECT * FROM GRAPH_TABLE " \
180 | "(MusicGraph MATCH (n:SINGER) RETURN n.id AS id)"
181 | edge_query = "SELECT * FROM GRAPH_TABLE " \
182 | "(MusicGraph MATCH -[e:KNOWS]-> " \
183 | "RETURN e.SingerId AS src, e.FriendId AS dst)"
184 |
185 | connector = (
186 | SpannerGraphConnector()
187 | .spark(self.spark)
188 | .project(os.getenv("SPANNER_PROJECT_ID"))
189 | .instance(os.getenv("SPANNER_INSTANCE_ID"))
190 | .database("music-graph")
191 | .graph("MusicGraph")
192 | .node_query(node_query)
193 | .edge_query(edge_query)
194 | .data_boost()
195 | )
196 |
197 | g = connector.load_graph()
198 |
199 | vertices_str = self._df_to_str(g.vertices)
200 | edges_str = self._df_to_str(g.edges)
201 | self.assertEqual(
202 | vertices_str,
203 | "['id'] - [[1], [2], [3], [4], [5]]",
204 | )
205 | self.assertEqual(
206 | edges_str,
207 | "['src', 'dst'] - "
208 | "[[1, 2], [1, 3], [2, 1], [2, 4], [2, 5], [3, 1], [3, 5], "
209 | "[4, 2], [4, 5], [5, 2], [5, 3], [5, 4]]",
210 | )
211 |
212 | def _df_to_str(self, df: DataFrame) -> str:
213 | rows = sorted([list(r.asDict().values()) for r in df.collect()])
214 | return f"{df.columns} - {rows}"
215 |
216 |
217 | if __name__ == "__main__":
218 | unittest.main()
219 |
--------------------------------------------------------------------------------