├── python ├── .gitignore ├── requirements.txt ├── run_tests.sh └── spannergraph │ ├── __init__.py │ └── tests.py ├── spark-3.1-spanner-lib ├── src │ ├── test │ │ ├── resources │ │ │ ├── spark-spanner-connector.properties │ │ │ ├── META-INF │ │ │ │ └── services │ │ │ │ │ └── org.apache.spark.sql.sources.DataSourceRegister │ │ │ ├── acceptance │ │ │ │ └── read_test_table.py │ │ │ └── db │ │ │ │ ├── populate_ddl_pg.sql │ │ │ │ ├── populate_ddl.sql │ │ │ │ ├── insert_data_pg.sql │ │ │ │ ├── insert_data_graph.sql │ │ │ │ ├── populate_ddl_graph.sql │ │ │ │ └── insert_data.sql │ │ └── java │ │ │ └── com │ │ │ └── google │ │ │ └── cloud │ │ │ └── spark │ │ │ └── spanner │ │ │ ├── SpannerTestUtils.java │ │ │ ├── graph │ │ │ ├── GraphReadIntegrationTestBase.java │ │ │ └── GraphErrorHandlingTest.java │ │ │ ├── acceptance │ │ │ ├── DataprocServerlessImage11AcceptanceTest.java │ │ │ ├── DataprocServerlessImage20AcceptanceTest.java │ │ │ ├── DataprocServerlessImage21AcceptanceTest.java │ │ │ ├── DataprocImage20AcceptanceTest.java │ │ │ ├── DataprocImage21AcceptanceTest.java │ │ │ ├── DataprocImage22AcceptanceTest.java │ │ │ ├── AcceptanceTestContext.java │ │ │ └── AcceptanceTestUtils.java │ │ │ ├── SparkSpannerIntegrationTestBase.java │ │ │ ├── SpannerTableSchemaTest.java │ │ │ ├── WriteIntegrationTestBase.java │ │ │ ├── SpannerScannerTest.java │ │ │ ├── TestData.java │ │ │ ├── OpenLineageIntegrationTestBase.java │ │ │ └── SpannerTableTest.java │ ├── build │ │ └── resources │ │ │ └── spark-spanner-connector.properties │ └── main │ │ └── java │ │ └── com │ │ └── google │ │ └── cloud │ │ └── spark │ │ └── spanner │ │ ├── SpannerRowConverter.java │ │ ├── DefaultSource.java │ │ ├── SpannerRowConverterDirect.java │ │ ├── InputPartitionReaderContext.java │ │ ├── InputPartitionContext.java │ │ ├── graph │ │ ├── query │ │ │ ├── GraphSubQuery.java │ │ │ ├── SelectField.java │ │ │ ├── NodeElementTableQuery.java │ │ │ ├── DirectGraphQuery.java │ │ │ ├── EdgeElementTableQuery.java │ │ │ └── SpannerGraphQuery.java │ │ ├── SpannerGraphScanBuilder.java │ │ ├── SpannerRowConverterWithSchema.java │ │ ├── SpannerGraph.java │ │ ├── SpannerGraphBuilder.java │ │ ├── SpannerGraphConfigs.java │ │ ├── PropertyGraph.java │ │ └── SpannerGraphScanner.java │ │ ├── SpannerErrorCode.java │ │ ├── SpannerPartitionReader.java │ │ ├── SpannerConnectorException.java │ │ ├── SpannerPartition.java │ │ ├── BatchClientWithCloser.java │ │ ├── SpannerInputPartitionContext.java │ │ ├── SpannerPartitionReaderFactory.java │ │ ├── SpannerTableSchema.java │ │ ├── SpannerInputPartitionReaderContext.java │ │ ├── Spark31SpannerTableProvider.java │ │ ├── SpannerScanBuilder.java │ │ └── SpannerScanner.java └── pom.xml ├── spark-3.1-spanner ├── src │ └── main │ │ └── resources │ │ └── META-INF │ │ └── services │ │ └── org.apache.spark.sql.sources.DataSourceRegister └── pom.xml ├── spark-3.2-spanner ├── src │ └── main │ │ └── resources │ │ └── META-INF │ │ └── services │ │ └── org.apache.spark.sql.sources.DataSourceRegister └── pom.xml ├── spark-3.3-spanner ├── src │ └── main │ │ └── resources │ │ └── META-INF │ │ └── services │ │ └── org.apache.spark.sql.sources.DataSourceRegister └── pom.xml ├── spark-3.2-spanner-lib ├── src │ ├── test │ │ └── resources │ │ │ └── META-INF │ │ │ └── services │ │ │ └── org.apache.spark.sql.sources.DataSourceRegister │ └── main │ │ └── java │ │ └── com │ │ └── google │ │ └── cloud │ │ └── spark │ │ └── spanner │ │ └── Spark32SpannerTableProvider.java └── pom.xml ├── spark-3.3-spanner-lib ├── src │ ├── test │ │ └── 
resources │ │ │ └── META-INF │ │ │ └── services │ │ │ └── org.apache.spark.sql.sources.DataSourceRegister │ └── main │ │ └── java │ │ └── com │ │ └── google │ │ └── cloud │ │ └── spark │ │ └── spanner │ │ └── Spark33SpannerTableProvider.java └── pom.xml ├── .mvn └── wrapper │ ├── maven-wrapper.properties │ └── MavenWrapperDownloader.java ├── cloudbuild ├── gcp-settings.xml ├── Dockerfile ├── cloudbuild.yaml └── presubmit.sh ├── CHANGES.md ├── .gitignore ├── examples ├── SpannerSpark.py └── SpannerSpark.java ├── spark-spanner-lib-parent └── pom.xml ├── pom.xml ├── CONTRIBUTING.md └── mvnw.cmd /python/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/test/resources/spark-spanner-connector.properties: -------------------------------------------------------------------------------- 1 | connector.version=test 2 | -------------------------------------------------------------------------------- /python/requirements.txt: -------------------------------------------------------------------------------- 1 | typing_extensions 2 | pyspark 3 | unittest 4 | numpy 5 | pycodestyle 6 | pandas 7 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/build/resources/spark-spanner-connector.properties: -------------------------------------------------------------------------------- 1 | connector.version=${project.version} 2 | -------------------------------------------------------------------------------- /python/run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | pycodestyle spannergraph --exclude=tests_gold.py && python3 -m spannergraph.tests 4 | -------------------------------------------------------------------------------- /python/spannergraph/__init__.py: -------------------------------------------------------------------------------- 1 | """This module enables exporting Spanner Graphs as GraphFrames graphs.""" 2 | 3 | from ._connector import SpannerGraphConnector 4 | -------------------------------------------------------------------------------- /spark-3.1-spanner/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister: -------------------------------------------------------------------------------- 1 | com.google.cloud.spark.spanner.Spark31SpannerTableProvider 2 | -------------------------------------------------------------------------------- /spark-3.2-spanner/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister: -------------------------------------------------------------------------------- 1 | com.google.cloud.spark.spanner.Spark32SpannerTableProvider 2 | -------------------------------------------------------------------------------- /spark-3.3-spanner/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister: -------------------------------------------------------------------------------- 1 | com.google.cloud.spark.spanner.Spark33SpannerTableProvider 2 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/test/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister: -------------------------------------------------------------------------------- 1 | com.google.cloud.spark.spanner.Spark31SpannerTableProvider 2 | 
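The service registration files above expose the connector to Spark under the short name "cloud-spanner", and the python/spannergraph package is documented as exporting Spanner Graph data for GraphFrames. A minimal PySpark sketch of how those pieces fit together follows; the reader options mirror the ones exercised in GraphReadIntegrationTestBase and the bundled examples, while the project, instance, database, and graph names are placeholders, and the GraphFrame construction (including the assumed id/src/dst column layout) is an illustrative assumption rather than something this repository guarantees.

from pyspark.sql import SparkSession
from graphframes import GraphFrame  # assumes the optional GraphFrames package is installed

spark = SparkSession.builder.appName("spanner-graph-sketch").getOrCreate()

def spanner_graph_reader(element_type):
    # "cloud-spanner" is the short name registered through the DataSourceRegister files above.
    return (spark.read.format("cloud-spanner")
            .option("projectId", "my-project")      # placeholder
            .option("instanceId", "my-instance")    # placeholder
            .option("databaseId", "my-database")    # placeholder
            .option("enableDataBoost", "true")
            .option("graph", "FlexibleGraph")       # graph name used by the integration tests
            .option("type", element_type))          # "node" or "edge"

nodes = spanner_graph_reader("node").load()
edges = spanner_graph_reader("edge").load()

# Assumption: the exported DataFrames carry GraphFrames-compatible
# "id" (nodes) and "src"/"dst" (edges) columns.
graph = GraphFrame(nodes, edges)
graph.degrees.show()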
-------------------------------------------------------------------------------- /spark-3.2-spanner-lib/src/test/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister: -------------------------------------------------------------------------------- 1 | com.google.cloud.spark.spanner.Spark32SpannerTableProvider 2 | -------------------------------------------------------------------------------- /spark-3.3-spanner-lib/src/test/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister: -------------------------------------------------------------------------------- 1 | com.google.cloud.spark.spanner.Spark33SpannerTableProvider 2 | -------------------------------------------------------------------------------- /.mvn/wrapper/maven-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.8.4/apache-maven-3.8.4-bin.zip 2 | wrapperUrl=https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/0.5.6/maven-wrapper-0.5.6.jar -------------------------------------------------------------------------------- /cloudbuild/gcp-settings.xml: -------------------------------------------------------------------------------- 1 | 5 | 6 | -------------------------------------------------------------------------------- /CHANGES.md: -------------------------------------------------------------------------------- 1 | # Release Notes 2 | 3 | ## Next 4 | 5 | ## 1.1.1 - 2025-12-18 6 | 7 | * Prefix column names with table name to avoid ambiguity 8 | * Make table name lookup case-insensitive for GoogleSQL 9 | 10 | ## 1.1.0 - 2024-12-20 11 | 12 | * Add support for exporting graphs from Spanner 13 | 14 | ## 1.0.0 - 2023-11-13 15 | 16 | * Initial release. 17 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/SpannerRowConverter.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.spark.spanner; 2 | 3 | import com.google.cloud.spanner.Struct; 4 | import org.apache.spark.sql.catalyst.InternalRow; 5 | 6 | /** Converts rows from Spanner query outputs to rows in Spark DataFrame */ 7 | public interface SpannerRowConverter { 8 | 9 | /** Generates a Spark row based on the Spanner row */ 10 | InternalRow convert(Struct spannerRow); 11 | } 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt/maven specific 5 | .cache/ 6 | .history/ 7 | .lib/ 8 | .flattened-pom.xml 9 | dist/* 10 | target/ 11 | lib_managed/ 12 | src_managed/ 13 | project/boot/ 14 | project/plugins/project/ 15 | dependency-reduced-pom.xml 16 | *.versionsBackup 17 | .mvn/wrapper/maven-wrapper.jar 18 | 19 | # Scala-IDE specific 20 | .scala_dependencies 21 | .worksheet 22 | .idea 23 | *.iml 24 | 25 | # Eclipse IDE Specific files 26 | .classpath 27 | .project 28 | .settings/ 29 | 30 | # Mac 31 | .DS_Store 32 | -------------------------------------------------------------------------------- /cloudbuild/Dockerfile: -------------------------------------------------------------------------------- 1 | # This Dockerfile creates an image for running presubmit tests. 
2 | FROM openjdk:8 3 | 4 | RUN \ 5 | echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | \ 6 | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \ 7 | curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | \ 8 | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \ 9 | apt-get update -y && \ 10 | apt-get install google-cloud-cli -y 11 | 12 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/DefaultSource.java: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.spark.spanner; 16 | 17 | public class DefaultSource extends Spark31SpannerTableProvider {} 18 | -------------------------------------------------------------------------------- /spark-3.2-spanner-lib/src/main/java/com/google/cloud/spark/spanner/Spark32SpannerTableProvider.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.google.cloud.spark.spanner; 17 | 18 | public class Spark32SpannerTableProvider extends Spark31SpannerTableProvider {} 19 | -------------------------------------------------------------------------------- /spark-3.3-spanner-lib/src/main/java/com/google/cloud/spark/spanner/Spark33SpannerTableProvider.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package com.google.cloud.spark.spanner; 17 | 18 | public class Spark33SpannerTableProvider extends Spark32SpannerTableProvider {} 19 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/SpannerTestUtils.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.spark.spanner; 2 | 3 | import java.time.ZonedDateTime; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | import org.apache.spark.sql.catalyst.util.GenericArrayData; 7 | 8 | public class SpannerTestUtils { 9 | 10 | public static GenericArrayData zonedDateTimeIterToSparkDates(Iterable tsIt) { 11 | List dest = new ArrayList<>(); 12 | tsIt.forEach((ts) -> dest.add(SpannerUtils.zonedDateTimeToSparkDate(ts))); 13 | return new GenericArrayData(dest.toArray(new Integer[0])); 14 | } 15 | 16 | public static GenericArrayData zonedDateTimeIterToSparkTimestamps(Iterable tsIt) { 17 | List dest = new ArrayList<>(); 18 | tsIt.forEach((ts) -> dest.add(SpannerUtils.zonedDateTimeToSparkTimestamp(ts))); 19 | return new GenericArrayData(dest.toArray(new Long[0])); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/SpannerRowConverterDirect.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.spark.spanner; 2 | 3 | import com.google.cloud.spanner.Struct; 4 | import java.io.Serializable; 5 | import org.apache.spark.sql.catalyst.InternalRow; 6 | 7 | /** Converts rows from Spanner query outputs to rows in a Spark DataFrame with 1:1 field mapping. */ 8 | public class SpannerRowConverterDirect implements SpannerRowConverter, Serializable { 9 | 10 | /** 11 | * Converts a spanner row to a Spark DataFrame row with 1:1 field mapping. 12 | * 13 | * @param spannerRow the row from Spanner to convert. 14 | * @return a Spark DataFrame row with the same length as the input. Each field in the output is 15 | * converted directly from the field at the same position in the input. 16 | */ 17 | @Override 18 | public InternalRow convert(Struct spannerRow) { 19 | return SpannerUtils.spannerStructToInternalRow(spannerRow); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/InputPartitionReaderContext.java: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package com.google.cloud.spark.spanner; 16 | 17 | import java.io.Closeable; 18 | import java.io.IOException; 19 | 20 | public interface InputPartitionReaderContext<T> extends Closeable { 21 | 22 | boolean next() throws IOException; 23 | 24 | T get(); 25 | } 26 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/InputPartitionContext.java: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.spark.spanner; 16 | 17 | import java.io.Serializable; 18 | 19 | public interface InputPartitionContext<T> extends Serializable { 20 | 21 | InputPartitionReaderContext<T> createPartitionReaderContext(); 22 | 23 | boolean supportsColumnarReads(); 24 | } 25 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/graph/query/GraphSubQuery.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.spark.spanner.graph.query; 2 | 3 | import com.google.cloud.Tuple; 4 | import com.google.cloud.spanner.Statement; 5 | import com.google.cloud.spark.spanner.SpannerRowConverter; 6 | import java.util.List; 7 | import org.apache.spark.sql.types.StructField; 8 | import org.apache.spark.sql.types.StructType; 9 | 10 | /** Handles a single SQL query (e.g., for an individual element table, for a GQL query) */ 11 | public interface GraphSubQuery { 12 | 13 | /** 14 | * Get the statement for this sub-query and a row converter that converts outputs to a row in the 15 | * DataFrame 16 | * 17 | * @param dataframeSchema schema of the DataFrame that will store the outputs 18 | * @return the statement and the row converter 19 | */ 20 | Tuple<Statement, SpannerRowConverter> getQueryAndConverter(StructType dataframeSchema); 21 | 22 | /** Get a list of fields that this sub-query will output */ 23 | List<StructField> getOutputSparkFields(); 24 | } 25 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/graph/query/SelectField.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.spark.spanner.graph.query; 2 | 3 | import java.util.Collection; 4 | import java.util.stream.Collectors; 5 | 6 | /** Represents a field in the SELECT clause */ 7 | public class SelectField { 8 | 9 | public final String inputExpression; 10 | public final String outputName; 11 | 12 | public SelectField(String columnName) { 13 | this(columnName, columnName); 14 | } 15 | 16 | public SelectField(String inputExpression, String outputName) { 17 | this.inputExpression = inputExpression.trim(); 18 | this.outputName = outputName.trim(); 19 | } 20 | 21 | @Override 22 | public String toString() { 23 | if
(inputExpression.equals(outputName)) { 24 | return outputName; 25 | } else { 26 | return inputExpression + " AS " + outputName; 27 | } 28 | } 29 | 30 | public static String join(Collection selectFields) { 31 | return selectFields.stream().map(SelectField::toString).collect(Collectors.joining(", ")); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /spark-3.2-spanner/pom.xml: -------------------------------------------------------------------------------- 1 | 4 | 4.0.0 5 | 6 | com.google.cloud.spark.spanner 7 | spark-spanner-parent 8 | ${revision} 9 | ../spark-spanner-parent 10 | 11 | 12 | spark-3.2-spanner 13 | ${revision} 14 | spanner DataSource v2 for Spark 3.2 15 | 16 | 3.2.0 17 | false 18 | 19 | 20 | 21 | Apache License, Version 2.0 22 | http://www.apache.org/licenses/LICENSE-2.0.txt 23 | repo 24 | 25 | 26 | 27 | 28 | ${project.groupId} 29 | spark-3.2-spanner-lib 30 | ${project.version} 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /spark-3.3-spanner/pom.xml: -------------------------------------------------------------------------------- 1 | 4 | 4.0.0 5 | 6 | com.google.cloud.spark.spanner 7 | spark-spanner-parent 8 | ${revision} 9 | ../spark-spanner-parent 10 | 11 | 12 | spark-3.3-spanner 13 | ${revision} 14 | spanner DataSource v2 for Spark 3.3 15 | 16 | 3.3.0 17 | false 18 | 19 | 20 | 21 | Apache License, Version 2.0 22 | http://www.apache.org/licenses/LICENSE-2.0.txt 23 | repo 24 | 25 | 26 | 27 | 28 | ${project.groupId} 29 | spark-3.3-spanner-lib 30 | ${project.version} 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /spark-3.2-spanner-lib/pom.xml: -------------------------------------------------------------------------------- 1 | 4 | 4.0.0 5 | 6 | com.google.cloud.spark.spanner 7 | spark-spanner-lib-parent 8 | ${revision} 9 | ../spark-spanner-lib-parent 10 | 11 | 12 | spark-3.2-spanner-lib 13 | ${revision} 14 | Connector code for spanner DataSource v2 for Spark 3.2 15 | 16 | 3.2.0 17 | true 18 | 19 | 20 | 21 | Apache License, Version 2.0 22 | http://www.apache.org/licenses/LICENSE-2.0.txt 23 | repo 24 | 25 | 26 | 27 | 28 | ${project.groupId} 29 | spark-3.1-spanner 30 | ${project.version} 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /spark-3.3-spanner-lib/pom.xml: -------------------------------------------------------------------------------- 1 | 4 | 4.0.0 5 | 6 | com.google.cloud.spark.spanner 7 | spark-spanner-lib-parent 8 | ${revision} 9 | ../spark-spanner-lib-parent 10 | 11 | 12 | spark-3.3-spanner-lib 13 | ${revision} 14 | Connector code for spanner DataSource v2 for Spark 3.3 15 | 16 | 3.3.0 17 | true 18 | 19 | 20 | 21 | Apache License, Version 2.0 22 | http://www.apache.org/licenses/LICENSE-2.0.txt 23 | repo 24 | 25 | 26 | 27 | 28 | ${project.groupId} 29 | spark-3.2-spanner 30 | ${project.version} 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/SpannerErrorCode.java: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.spark.spanner; 16 | 17 | public enum SpannerErrorCode { 18 | SPANNER_FAILED_TO_EXECUTE_QUERY(0), 19 | SPANNER_FAILED_TO_PARSE_OPTIONS(1), 20 | COLUMNAR_READS_NOT_SUPPORTED(2), 21 | WRITES_NOT_SUPPORTED(3), 22 | RESOURCE_EXHAUSTED_ON_SPANNER(4), 23 | DATABASE_DIALECT_NOT_SUPPORTED(5), 24 | DECIMAL_OUT_OF_RANGE(6), 25 | 26 | // Should be last 27 | UNSUPPORTED(9998), 28 | UNKNOWN(9999); 29 | 30 | final int code; 31 | 32 | SpannerErrorCode(int code) { 33 | this.code = code; 34 | } 35 | 36 | public int getCode() { 37 | return code; 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /examples/SpannerSpark.py: -------------------------------------------------------------------------------- 1 | #/usr/bin/env python 2 | 3 | # Copyright 2023 Google LLC. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from pyspark.sql import SparkSession 18 | 19 | def main(): 20 | table = "TABLE_NAME" 21 | spark = SparkSession.builder.appName("SparkSpannerDemo").getOrCreate() 22 | df = spark.read.format('cloud-spanner') \ 23 | .option("projectId", "") \ 24 | .option("instanceId", "") \ 25 | .option("databaseId", "") \ 26 | .option("enableDataBoost", "true") \ 27 | .option("table", "") \ 28 | .load() 29 | df.printSchema() 30 | df.show() 31 | 32 | if __name__ == '__main__': 33 | main() 34 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/graph/GraphReadIntegrationTestBase.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.spark.spanner.graph; 2 | 3 | import com.google.cloud.spark.spanner.SparkSpannerIntegrationTestBase; 4 | import com.google.gson.Gson; 5 | import org.apache.spark.sql.DataFrameReader; 6 | import org.apache.spark.sql.Dataset; 7 | import org.apache.spark.sql.Row; 8 | 9 | public class GraphReadIntegrationTestBase extends SparkSpannerIntegrationTestBase { 10 | public DataFrameReader flexibleGraphReader(SpannerGraphConfigs configs) { 11 | DataFrameReader reader = 12 | reader().option("enableDataBoost", "true").option("graph", "FlexibleGraph"); 13 | return configs == null ? 
reader : reader.option("configs", new Gson().toJson(configs)); 14 | } 15 | 16 | public DataFrameReader musicGraphReader(SpannerGraphConfigs configs) { 17 | DataFrameReader reader = 18 | reader().option("enableDataBoost", "true").option("graph", "MusicGraph"); 19 | return configs == null ? reader : reader.option("configs", new Gson().toJson(configs)); 20 | } 21 | 22 | public static Dataset<Row> readNodes(DataFrameReader reader) { 23 | return reader.option("type", "node").load(); 24 | } 25 | 26 | public static Dataset<Row> readEdges(DataFrameReader reader) { 27 | return reader.option("type", "edge").load(); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/acceptance/DataprocServerlessImage11AcceptanceTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.spark.spanner.acceptance; 16 | 17 | import org.junit.runner.RunWith; 18 | import org.junit.runners.JUnit4; 19 | 20 | /** 21 | * The acceptance test on Dataproc Serverless. The test has to run in a project with 22 | * requireOsLogin disabled; otherwise, an org policy violation error will be thrown. 23 | */ 24 | @RunWith(JUnit4.class) 25 | public final class DataprocServerlessImage11AcceptanceTest 26 | extends DataprocServerlessAcceptanceTestBase { 27 | 28 | private static AcceptanceTestContext context; 29 | 30 | public DataprocServerlessImage11AcceptanceTest() { 31 | super("spark-3.1-spanner", "1.1"); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/acceptance/DataprocServerlessImage20AcceptanceTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.spark.spanner.acceptance; 16 | 17 | import org.junit.runner.RunWith; 18 | import org.junit.runners.JUnit4; 19 | 20 | /** 21 | * The acceptance test on Dataproc Serverless. The test has to run in a project with 22 | * requireOsLogin disabled; otherwise, an org policy violation error will be thrown.
23 | */ 24 | @RunWith(JUnit4.class) 25 | public final class DataprocServerlessImage20AcceptanceTest 26 | extends DataprocServerlessAcceptanceTestBase { 27 | 28 | private static AcceptanceTestContext context; 29 | 30 | public DataprocServerlessImage20AcceptanceTest() { 31 | super("spark-3.1-spanner", "2.0"); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/acceptance/DataprocServerlessImage21AcceptanceTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.spark.spanner.acceptance; 16 | 17 | import org.junit.runner.RunWith; 18 | import org.junit.runners.JUnit4; 19 | 20 | /** 21 | * The acceptance test on Dataproc Serverless. The test has to run in a project with 22 | * requireOsLogin disabled; otherwise, an org policy violation error will be thrown. 23 | */ 24 | @RunWith(JUnit4.class) 25 | public final class DataprocServerlessImage21AcceptanceTest 26 | extends DataprocServerlessAcceptanceTestBase { 27 | 28 | private static AcceptanceTestContext context; 29 | 30 | public DataprocServerlessImage21AcceptanceTest() { 31 | super("spark-3.1-spanner", "2.1"); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/SpannerPartitionReader.java: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | package com.google.cloud.spark.spanner; 16 | 17 | import java.io.IOException; 18 | import org.apache.spark.sql.connector.read.PartitionReader; 19 | 20 | public class SpannerPartitionReader implements PartitionReader { 21 | 22 | private InputPartitionReaderContext context; 23 | 24 | public SpannerPartitionReader(InputPartitionReaderContext context) { 25 | this.context = context; 26 | } 27 | 28 | @Override 29 | public boolean next() throws IOException { 30 | return this.context.next(); 31 | } 32 | 33 | @Override 34 | public T get() { 35 | return this.context.get(); 36 | } 37 | 38 | @Override 39 | public void close() throws IOException { 40 | this.context.close(); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /spark-3.1-spanner/pom.xml: -------------------------------------------------------------------------------- 1 | 4 | 4.0.0 5 | 6 | com.google.cloud.spark.spanner 7 | spark-spanner-parent 8 | ${revision} 9 | ../spark-spanner-parent 10 | 11 | 12 | spark-3.1-spanner 13 | ${revision} 14 | spanner DataSource v2 for Spark 3.1 15 | 16 | 3.1.0 17 | false 18 | 19 | 20 | 21 | Apache License, Version 2.0 22 | http://www.apache.org/licenses/LICENSE-2.0.txt 23 | repo 24 | 25 | 26 | 27 | 28 | ${project.groupId} 29 | spark-3.1-spanner-lib 30 | ${project.version} 31 | 32 | 33 | 34 | org.apache.spark 35 | spark-sql_2.12 36 | 3.1.1 37 | provided 38 | 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/test/resources/acceptance/read_test_table.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2023 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import os 17 | import sys 18 | from pyspark.sql import SparkSession 19 | from pyspark.sql.functions import col 20 | 21 | 22 | def main(): 23 | spark = SparkSession.builder.appName('Acceptance Test on Spark').getOrCreate() 24 | 25 | table = 'ATable' 26 | df = spark.read.format('cloud-spanner') \ 27 | .option("projectId", sys.argv[2]) \ 28 | .option("instanceId", sys.argv[3]) \ 29 | .option("databaseId", sys.argv[4]) \ 30 | .option("table", table) \ 31 | .load(table) 32 | 33 | print('The resulting schema is') 34 | df.printSchema() 35 | 36 | df = df.select("A", "B", "D", "E") 37 | df = df.groupBy().sum('A') 38 | 39 | print('Table:') 40 | df.show() 41 | 42 | df.write.csv(sys.argv[1]) 43 | 44 | if __name__ == '__main__': 45 | main() 46 | -------------------------------------------------------------------------------- /examples/SpannerSpark.java: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.spark.spanner.examples; 16 | 17 | import org.apache.spark.sql.Dataset; 18 | import org.apache.spark.sql.Row; 19 | import org.apache.spark.sql.SparkSession; 20 | 21 | public class SpannerSpark { 22 | public static void main(String[] args) { 23 | SparkSession spark = SparkSession 24 | .builder() 25 | .appName("cloud spanner for census 2020") 26 | .getOrCreate(); 27 | 28 | 29 | Dataset df = spark.read() 30 | .format("cloud-spanner") 31 | .option("table", "people") 32 | .option("projectId", System.getenv("SPANNER_SPARK_PROJECT")) 33 | .option("instanceId", System.getenv("SPANNER_SPARK_INSTANCE")) 34 | .option("database", System.getenv("SPANNER_SPARK_DATABASE")) 35 | .load(); 36 | df.show(); 37 | df.printSchema(); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/graph/SpannerGraphScanBuilder.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.spark.spanner.graph; 2 | 3 | import com.google.cloud.spark.spanner.SpannerUtils; 4 | import com.google.common.collect.ImmutableSet; 5 | import java.util.Set; 6 | import javax.annotation.Nullable; 7 | import org.apache.spark.sql.connector.read.Scan; 8 | import org.apache.spark.sql.connector.read.ScanBuilder; 9 | import org.apache.spark.sql.connector.read.SupportsPushDownRequiredColumns; 10 | import org.apache.spark.sql.types.StructType; 11 | 12 | /** Builder for {@link SpannerGraphScanner} */ 13 | public class SpannerGraphScanBuilder implements ScanBuilder, SupportsPushDownRequiredColumns { 14 | 15 | private final SpannerGraph spannerGraph; 16 | private @Nullable Set requiredColumns; 17 | 18 | public SpannerGraphScanBuilder(SpannerGraph spannerGraph) { 19 | this.spannerGraph = spannerGraph; 20 | } 21 | 22 | @Override 23 | public Scan build() { 24 | return new SpannerGraphScanner( 25 | spannerGraph.options, 26 | spannerGraph.configs.extraHeaders, 27 | spannerGraph.readTimestamp, 28 | spannerGraph.configs.partitionSizeBytes, 29 | spannerGraph.dataBoostEnabled, 30 | spannerGraph.spannerGraphQuery, 31 | requiredColumns, 32 | SpannerUtils.pruneSchema(spannerGraph.schema(), requiredColumns)); 33 | } 34 | 35 | @Override 36 | public void pruneColumns(StructType requiredSchema) { 37 | this.requiredColumns = ImmutableSet.copyOf(requiredSchema.fieldNames()); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/SpannerConnectorException.java: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.spark.spanner; 16 | 17 | public class SpannerConnectorException extends RuntimeException { 18 | 19 | final SpannerErrorCode errorCode; 20 | 21 | public SpannerConnectorException(String message) { 22 | this(SpannerErrorCode.UNKNOWN, message); 23 | } 24 | 25 | public SpannerConnectorException(String message, Throwable cause) { 26 | this(SpannerErrorCode.UNKNOWN, message, cause); 27 | } 28 | 29 | public SpannerConnectorException(SpannerErrorCode errorCode, String message) { 30 | super(message); 31 | this.errorCode = errorCode; 32 | } 33 | 34 | public SpannerConnectorException(SpannerErrorCode errorCode, String message, Throwable cause) { 35 | super(message, cause); 36 | this.errorCode = errorCode; 37 | } 38 | 39 | public SpannerErrorCode getErrorCode() { 40 | return errorCode; 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/acceptance/DataprocImage20AcceptanceTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.spark.spanner.acceptance; 16 | 17 | import com.google.common.collect.ImmutableList; 18 | import org.junit.AfterClass; 19 | import org.junit.BeforeClass; 20 | import org.junit.runner.RunWith; 21 | import org.junit.runners.JUnit4; 22 | 23 | @RunWith(JUnit4.class) 24 | public final class DataprocImage20AcceptanceTest extends DataprocAcceptanceTestBase { 25 | 26 | private static AcceptanceTestContext context; 27 | 28 | public DataprocImage20AcceptanceTest() { 29 | super(context); 30 | } 31 | 32 | @BeforeClass 33 | public static void setup() throws Exception { 34 | context = 35 | DataprocAcceptanceTestBase.setup("2.0-debian10", "spark-3.1-spanner", ImmutableList.of()); 36 | } 37 | 38 | @AfterClass 39 | public static void tearDown() throws Exception { 40 | DataprocAcceptanceTestBase.tearDown(context); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/acceptance/DataprocImage21AcceptanceTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.spark.spanner.acceptance; 16 | 17 | import com.google.common.collect.ImmutableList; 18 | import org.junit.AfterClass; 19 | import org.junit.BeforeClass; 20 | import org.junit.runner.RunWith; 21 | import org.junit.runners.JUnit4; 22 | 23 | @RunWith(JUnit4.class) 24 | public final class DataprocImage21AcceptanceTest extends DataprocAcceptanceTestBase { 25 | 26 | private static AcceptanceTestContext context; 27 | 28 | public DataprocImage21AcceptanceTest() { 29 | super(context); 30 | } 31 | 32 | @BeforeClass 33 | public static void setup() throws Exception { 34 | context = 35 | DataprocAcceptanceTestBase.setup("2.1-debian11", "spark-3.1-spanner", ImmutableList.of()); 36 | } 37 | 38 | @AfterClass 39 | public static void tearDown() throws Exception { 40 | DataprocAcceptanceTestBase.tearDown(context); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/acceptance/DataprocImage22AcceptanceTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.spark.spanner.acceptance; 16 | 17 | import com.google.common.collect.ImmutableList; 18 | import org.junit.AfterClass; 19 | import org.junit.BeforeClass; 20 | import org.junit.runner.RunWith; 21 | import org.junit.runners.JUnit4; 22 | 23 | @RunWith(JUnit4.class) 24 | public final class DataprocImage22AcceptanceTest extends DataprocAcceptanceTestBase { 25 | 26 | private static AcceptanceTestContext context; 27 | 28 | public DataprocImage22AcceptanceTest() { 29 | super(context); 30 | } 31 | 32 | @BeforeClass 33 | public static void setup() throws Exception { 34 | context = 35 | DataprocAcceptanceTestBase.setup("2.2-debian12", "spark-3.1-spanner", ImmutableList.of()); 36 | } 37 | 38 | @AfterClass 39 | public static void tearDown() throws Exception { 40 | DataprocAcceptanceTestBase.tearDown(context); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /cloudbuild/cloudbuild.yaml: -------------------------------------------------------------------------------- 1 | steps: 2 | # 1. 
Create a Docker image for running the presubmit tests 3 | - name: 'gcr.io/cloud-builders/docker' 4 | id: 'docker-build' 5 | args: ['build', '--tag=gcr.io/$PROJECT_ID/dataproc-spark-spanner-connector-presubmit', '-f', 'cloudbuild/Dockerfile', '.'] 6 | 7 | # 2. Fetch maven and dependencies 8 | - name: 'gcr.io/$PROJECT_ID/dataproc-spark-spanner-connector-presubmit' 9 | id: 'init' 10 | waitFor: ['docker-build'] 11 | entrypoint: 'bash' 12 | args: ['/workspace/cloudbuild/presubmit.sh', 'init'] 13 | 14 | # 3. Run integration tests with real Spanner databases 15 | - name: 'gcr.io/$PROJECT_ID/dataproc-spark-spanner-connector-presubmit' 16 | id: 'integration-real-spanner' 17 | waitFor: ['init'] 18 | entrypoint: 'bash' 19 | args: ['/workspace/cloudbuild/presubmit.sh', 'integrationtest-real-spanner'] 20 | env: 21 | - 'SPANNER_PROJECT_ID=$PROJECT_ID' 22 | - 'SPANNER_INSTANCE_ID=test-instance' 23 | - 'SPANNER_DATABASE_ID=testdb' 24 | 25 | # 4. Run acceptance tests by creating real Dataproc clusters. 26 | # TODO: Make the acceptance test run in parallel with integration-real-spanner. 27 | - name: 'gcr.io/$PROJECT_ID/dataproc-spark-spanner-connector-presubmit' 28 | id: 'acceptance-test' 29 | waitFor: ['init'] 30 | entrypoint: 'bash' 31 | args: ['/workspace/cloudbuild/presubmit.sh', 'acceptance-test'] 32 | env: 33 | - 'SPANNER_PROJECT_ID=$PROJECT_ID' 34 | - 'GOOGLE_CLOUD_PROJECT=$PROJECT_ID' 35 | - 'SPANNER_INSTANCE_ID=accept-testins' 36 | - 'SPANNER_DATABASE_ID=accept-testdb' 37 | - 'ACCEPTANCE_TEST_BUCKET=spark-spanner-connector-acceptance-test' 38 | 39 | 40 | timeout: 3600s 41 | options: 42 | machineType: 'N1_HIGHCPU_32' 43 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/acceptance/AcceptanceTestContext.java: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | package com.google.cloud.spark.spanner.acceptance; 16 | 17 | public class AcceptanceTestContext { 18 | final String testId; 19 | final String clusterId; 20 | final String connectorJarUri; 21 | final String testBaseGcsDir; 22 | final String spannerDataset; 23 | final String spannerTable; 24 | 25 | public AcceptanceTestContext( 26 | String testId, String clusterId, String testBaseGcsDir, String connectorJarUri) { 27 | this.testId = testId; 28 | this.clusterId = clusterId; 29 | this.testBaseGcsDir = testBaseGcsDir; 30 | this.connectorJarUri = connectorJarUri; 31 | this.spannerDataset = "spanner_acceptance_test_dataset_" + testId.replace("-", "_"); 32 | this.spannerTable = "spanner_acceptance_test_table_" + testId.replace("-", "_"); 33 | } 34 | 35 | public String getScriptUri(String testName) { 36 | return testBaseGcsDir + "/" + testName + "/script.py"; 37 | } 38 | 39 | public String getResultsDirUri(String testName) { 40 | return testBaseGcsDir + "/" + testName + "/results"; 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/SpannerPartition.java: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.spark.spanner; 16 | 17 | import java.io.Serializable; 18 | import org.apache.spark.Partition; 19 | import org.apache.spark.sql.catalyst.InternalRow; 20 | import org.apache.spark.sql.connector.read.InputPartition; 21 | 22 | public class SpannerPartition implements Partition, InputPartition, Serializable { 23 | 24 | private final com.google.cloud.spanner.Partition partition; 25 | private final int index; 26 | private final InputPartitionContext ctx; 27 | 28 | public SpannerPartition( 29 | com.google.cloud.spanner.Partition partition, 30 | int index, 31 | InputPartitionContext ctx) { 32 | this.index = index; 33 | this.partition = partition; 34 | this.ctx = ctx; 35 | } 36 | 37 | @Override 38 | public int index() { 39 | return this.index; 40 | } 41 | 42 | @Override 43 | public String toString() { 44 | return "SpannerPartition{index=" + this.index + ", stream=" + this.partition + "}"; 45 | } 46 | 47 | public InputPartitionContext getContext() { 48 | return this.ctx; 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /cloudbuild/presubmit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2023 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the 'License'); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an 'AS IS' BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -euxo pipefail 17 | 18 | readonly MVN="./mvnw -B -e -s /workspace/cloudbuild/gcp-settings.xml -Dmaven.repo.local=/workspace/.repository" 19 | readonly STEP=$1 20 | 21 | cd /workspace 22 | 23 | case $STEP in 24 | # Download maven and all the dependencies 25 | init) 26 | $MVN install -DskipTests -P3.1 27 | exit 28 | ;; 29 | 30 | # Run integration tests against real Spanner databases. 31 | integrationtest-real-spanner) 32 | # Uses the Spanner project, instance, and database supplied through the 33 | # environment variables set in cloudbuild.yaml. 34 | $MVN test -T 1C "-Dtest=SpannerTableTest,SpannerScanBuilderTest,SpannerInputPartitionReaderContextTest,SparkFilterUtilsTest,ReadIntegrationTestBase,WriteIntegrationTestBase,FunctionsAndExpressionsTest,ReadIntegrationTestPg,GraphReadIntegrationTest,GraphErrorHandlingTest" 35 | ;; 36 | 37 | acceptance-test) 38 | $MVN test -T 1C -Dtest=DataprocImage20AcceptanceTest 39 | $MVN test -T 1C -Dtest=DataprocImage21AcceptanceTest 40 | $MVN test -T 1C -Dtest=DataprocImage22AcceptanceTest 41 | ;; 42 | 43 | *) 44 | echo "Unknown step $STEP" 45 | exit 1 46 | ;; 47 | esac 48 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/BatchClientWithCloser.java: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.spark.spanner; 16 | 17 | import com.google.cloud.spanner.BatchClient; 18 | import com.google.cloud.spanner.DatabaseClient; 19 | import com.google.cloud.spanner.Spanner; 20 | 21 | public class BatchClientWithCloser implements AutoCloseable { 22 | public BatchClient batchClient; 23 | public DatabaseClient databaseClient; 24 | private Spanner spanner; 25 | 26 | public BatchClientWithCloser( 27 | Spanner spanner, BatchClient batchClient, DatabaseClient databaseClient) { 28 | this.spanner = spanner; 29 | this.batchClient = batchClient; 30 | this.databaseClient = databaseClient; 31 | } 32 | 33 | /* 34 | * close is a runtime hook for AutoCloseable to properly shut down resources 35 | * before this object is garbage collected. It is useful in scenarios such as 36 | * asynchronous processing for which we won't have a deterministic time/scope 37 | * for when a Spanner object will be closed.
38 | */ 39 | @Override 40 | public void close() { 41 | if (this.spanner != null) { 42 | this.spanner.close(); 43 | this.spanner = null; 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/pom.xml: -------------------------------------------------------------------------------- 1 | 4 | 4.0.0 5 | 6 | com.google.cloud.spark.spanner 7 | spark-spanner-lib-parent 8 | ${revision} 9 | ../spark-spanner-lib-parent 10 | 11 | 12 | spark-3.1-spanner-lib 13 | ${revision} 14 | Connector code for spanner DataSource v2 for Spark 3.1 15 | 16 | 3.1.0 17 | true 18 | 19 | 20 | 21 | Apache License, Version 2.0 22 | http://www.apache.org/licenses/LICENSE-2.0.txt 23 | repo 24 | 25 | 26 | 27 | 28 | com.google.cloud 29 | google-cloud-spanner 30 | 31 | 32 | 33 | com.google.cloud 34 | google-cloud-dataproc 35 | test 36 | 37 | 38 | 39 | com.google.cloud 40 | google-cloud-storage 41 | test 42 | 43 | 44 | 45 | com.google.code.gson 46 | gson 47 | 2.10.1 48 | 49 | 50 | 51 | com.fasterxml.jackson.core 52 | jackson-databind 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/SpannerInputPartitionContext.java: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package com.google.cloud.spark.spanner; 16 | 17 | import com.google.cloud.spanner.BatchTransactionId; 18 | import com.google.cloud.spanner.Partition; 19 | import java.io.Serializable; 20 | import org.apache.spark.sql.catalyst.InternalRow; 21 | 22 | public class SpannerInputPartitionContext 23 | implements InputPartitionContext, Serializable { 24 | 25 | private final BatchTransactionId batchTransactionId; 26 | private final Partition partition; 27 | private final String mapAsJSONStr; 28 | private final SpannerRowConverter rowConverter; 29 | 30 | public SpannerInputPartitionContext( 31 | Partition partition, 32 | BatchTransactionId batchTransactionId, 33 | String mapAsJSONStr, 34 | SpannerRowConverter rowConverter) { 35 | this.mapAsJSONStr = mapAsJSONStr; 36 | this.partition = partition; 37 | this.batchTransactionId = batchTransactionId; 38 | this.rowConverter = rowConverter; 39 | } 40 | 41 | @Override 42 | public InputPartitionReaderContext createPartitionReaderContext() { 43 | return new SpannerInputPartitionReaderContext( 44 | partition, batchTransactionId, mapAsJSONStr, rowConverter); 45 | } 46 | 47 | @Override 48 | public boolean supportsColumnarReads() { 49 | return false; 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/SpannerPartitionReaderFactory.java: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.spark.spanner; 16 | 17 | import org.apache.spark.sql.catalyst.InternalRow; 18 | import org.apache.spark.sql.connector.read.InputPartition; 19 | import org.apache.spark.sql.connector.read.PartitionReader; 20 | import org.apache.spark.sql.connector.read.PartitionReaderFactory; 21 | import org.apache.spark.sql.vectorized.ColumnarBatch; 22 | 23 | /* 24 | * SpannerPartitionReaderFactory is an entry to implement PartitionReaderFactory. 
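 * For each SpannerPartition it unwraps the InputPartitionContext prepared on the driver and returns a row-based SpannerPartitionReader; columnar reads are rejected because the connector does not support them.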
25 | */ 26 | public class SpannerPartitionReaderFactory implements PartitionReaderFactory { 27 | 28 | public SpannerPartitionReaderFactory() {} 29 | 30 | @Override 31 | public PartitionReader createColumnarReader(InputPartition partition) { 32 | throw new SpannerConnectorException( 33 | SpannerErrorCode.COLUMNAR_READS_NOT_SUPPORTED, 34 | "Columnar reads are not supported by the Spark Spanner Connector."); 35 | } 36 | 37 | @Override 38 | public PartitionReader createReader(InputPartition partition) { 39 | InputPartitionContext ctx = ((SpannerPartition) partition).getContext(); 40 | return new SpannerPartitionReader<>(ctx.createPartitionReaderContext()); 41 | } 42 | 43 | @Override 44 | public boolean supportColumnarReads(InputPartition partition) { 45 | return false; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/test/resources/db/populate_ddl_pg.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE composite_table ( 2 | id int NOT NULL, 3 | charvCol character varying(1024), 4 | textCol text, 5 | varcharCol varchar(1024), 6 | boolCol bool, 7 | booleanCol boolean, 8 | bigintCol bigint, 9 | int8Col int8, 10 | intCol int, 11 | doubleCol double precision, 12 | floatCol float8, 13 | byteCol bytea, 14 | dateCol date, 15 | numericCol numeric, 16 | decimalCol decimal, 17 | timeWithZoneCol timestamp with time zone, 18 | timestampCol timestamptz, 19 | jsonCol jsonb, 20 | PRIMARY KEY(id) 21 | ); 22 | 23 | CREATE TABLE integration_composite_table ( 24 | id int NOT NULL, 25 | charvCol character varying(1024), 26 | textCol text, 27 | varcharCol varchar(1024), 28 | boolCol bool, 29 | booleanCol boolean, 30 | bigintCol bigint, 31 | int8Col int8, 32 | intCol int, 33 | doubleCol double precision, 34 | floatCol float8, 35 | byteCol bytea, 36 | dateCol date, 37 | numericCol numeric, 38 | decimalCol decimal, 39 | timeWithZoneCol timestamp with time zone, 40 | timestampCol timestamptz, 41 | jsonCol jsonb, 42 | PRIMARY KEY(id) 43 | ); 44 | 45 | CREATE TABLE numeric_table ( 46 | id int NOT NULL, 47 | numericCol numeric, 48 | PRIMARY KEY(id) 49 | ); 50 | 51 | CREATE TABLE array_table ( 52 | id int NOT NULL, 53 | charvArray character varying(1024)[], 54 | boolArray bool[3], 55 | bigintArray bigint[], 56 | doubleArray double precision[], 57 | byteArray bytea[], 58 | dateArray date[], 59 | numericArray numeric[], 60 | timestampArray timestamptz[], 61 | jsonArray jsonb[], 62 | PRIMARY KEY(id) 63 | ); 64 | 65 | CREATE TABLE Shakespeare ( 66 | id int, 67 | word character varying(1024), 68 | word_count int, 69 | corpus character varying(1024), 70 | corpus_date int, 71 | PRIMARY KEY(id) 72 | ); 73 | 74 | CREATE TABLE string_table ( 75 | id bigint NOT NULL, 76 | charvCol character varying(1024), 77 | textCol text, 78 | varcharCol varchar(1024), 79 | smallCol character varying(1), 80 | PRIMARY KEY(id) 81 | ); 82 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/SparkSpannerIntegrationTestBase.java: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.spark.spanner; 16 | 17 | import java.util.Map; 18 | import org.apache.spark.sql.DataFrameReader; 19 | import org.apache.spark.sql.SparkSession; 20 | import org.junit.ClassRule; 21 | import org.junit.rules.ExternalResource; 22 | 23 | public class SparkSpannerIntegrationTestBase extends SpannerTestBase { 24 | 25 | @ClassRule public static SparkFactory sparkFactory = new SparkFactory(); 26 | 27 | protected SparkSession spark; 28 | 29 | public SparkSpannerIntegrationTestBase() { 30 | this.spark = sparkFactory.spark; 31 | } 32 | 33 | public DataFrameReader reader() { 34 | Map props = connectionProperties(); 35 | DataFrameReader reader = 36 | spark 37 | .read() 38 | .format("cloud-spanner") 39 | .option("viewsEnabled", true) 40 | .option("projectId", props.get("projectId")) 41 | .option("instanceId", props.get("instanceId")) 42 | .option("databaseId", props.get("databaseId")); 43 | String emulatorHost = props.get("emulatorHost"); 44 | if (emulatorHost != null) reader = reader.option("emulatorHost", props.get("emulatorHost")); 45 | return reader; 46 | } 47 | 48 | protected static class SparkFactory extends ExternalResource { 49 | SparkSession spark; 50 | 51 | @Override 52 | protected void before() throws Throwable { 53 | spark = 54 | SparkSession.builder() 55 | .master("local") 56 | .config("spark.ui.enabled", "false") 57 | .config("spark.default.parallelism", 20) 58 | .getOrCreate(); 59 | // reducing test's logs 60 | spark.sparkContext().setLogLevel("WARN"); 61 | } 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/test/resources/db/populate_ddl.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE game_items ( 2 | itemUUID STRING(36) NOT NULL, 3 | item_name STRING(MAX) NOT NULL, 4 | item_value NUMERIC NOT NULL, 5 | available_time TIMESTAMP NOT NULL, 6 | duration INT64, 7 | ) PRIMARY KEY(itemUUID); 8 | 9 | CREATE TABLE games ( 10 | gameUUID STRING(36) NOT NULL, 11 | players ARRAY NOT NULL, 12 | winner STRING(36), 13 | created TIMESTAMP, 14 | finished TIMESTAMP, 15 | max_date DATE, 16 | ) PRIMARY KEY(gameUUID); 17 | 18 | CREATE TABLE players ( 19 | playerUUID STRING(36) NOT NULL, 20 | player_name STRING(64) NOT NULL, 21 | email STRING(MAX) NOT NULL, 22 | password_hash BYTES(60) NOT NULL, 23 | created TIMESTAMP, 24 | updated TIMESTAMP, 25 | stats JSON, 26 | account_balance NUMERIC NOT NULL DEFAULT (0), 27 | is_logged_in BOOL NOT NULL DEFAULT (FALSE), 28 | last_login TIMESTAMP, 29 | valid_email BOOL, 30 | current_game STRING(36), 31 | dob DATE, 32 | FOREIGN KEY(current_game) REFERENCES games(gameUUID), 33 | ) PRIMARY KEY(playerUUID); 34 | 35 | CREATE TABLE simpleTable( 36 | A INT64 NOT NULL, 37 | B STRING(100), 38 | C FLOAT64 39 | ) PRIMARY KEY(A); 40 | 41 | CREATE TABLE ATable( 42 | A INT64 NOT NULL, 43 | B STRING(100), 44 | C BYTES(MAX), 45 | D TIMESTAMP, 46 | E NUMERIC, 47 | F ARRAY, 48 | G JSON 49 | ) PRIMARY KEY(A); 50 | 51 | CREATE TABLE compositeTable ( 52 | id STRING(36) NOT NULL, 53 | 
A ARRAY, 54 | B ARRAY, 55 | C STRING(30), 56 | D NUMERIC, 57 | E DATE, 58 | F TIMESTAMP, 59 | G BOOL, 60 | H ARRAY, 61 | I ARRAY, 62 | J BYTES(20), 63 | K JSON, 64 | ) PRIMARY KEY(id); 65 | 66 | CREATE TABLE nullsTable ( 67 | id INT64, 68 | A ARRAY, 69 | B ARRAY, 70 | C STRING(30), 71 | D NUMERIC, 72 | E DATE, 73 | F TIMESTAMP, 74 | G BOOL, 75 | H ARRAY, 76 | I ARRAY, 77 | J ARRAY, 78 | K ARRAY, 79 | M ARRAY, 80 | N ARRAY, 81 | O ARRAY, 82 | ) PRIMARY KEY(id); 83 | 84 | CREATE TABLE Shakespeare ( 85 | id INT64, 86 | word STRING(MAX), 87 | word_count INT64, 88 | corpus STRING(MAX), 89 | corpus_date INT64, 90 | ) PRIMARY KEY(id); 91 | 92 | CREATE TABLE bytesTable ( 93 | id INT64, 94 | A BYTES(MAX), 95 | ) PRIMARY KEY(id); 96 | 97 | CREATE TABLE valueLimitsTable ( 98 | A INT64, 99 | B FLOAT64, 100 | C NUMERIC, 101 | D DATE, 102 | E TIMESTAMP, 103 | ) PRIMARY KEY(A); 104 | -------------------------------------------------------------------------------- /spark-spanner-lib-parent/pom.xml: -------------------------------------------------------------------------------- 1 | 4 | 4.0.0 5 | 6 | com.google.cloud.spark.spanner 7 | spark-spanner-parent 8 | ${revision} 9 | ../spark-spanner-parent 10 | 11 | 12 | spark-spanner-lib-parent 13 | ${revision} 14 | pom 15 | Common Spark Spanner library setting 16 | 17 | 2.12 18 | 3.1.0 19 | 20 | 21 | 22 | Apache License, Version 2.0 23 | http://www.apache.org/licenses/LICENSE-2.0.txt 24 | repo 25 | 26 | 27 | 28 | 29 | org.apache.spark 30 | spark-sql_${scala.version} 31 | ${spark.version} 32 | provided 33 | 34 | 35 | 36 | 37 | integration 38 | 39 | false 40 | 41 | 42 | 43 | 44 | org.apache.maven.plugins 45 | maven-failsafe-plugin 46 | 47 | ${argLine} 48 | 7 49 | false 50 | 51 | **/*IntegrationTest.java 52 | 53 | 54 | 55 | 56 | integration-test 57 | 58 | integration-test 59 | 60 | 61 | 62 | verify 63 | 64 | verify 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/SpannerTableSchemaTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
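// The tests below pin down the SQL emitted by SpannerTableSchema.buildSchemaQuery(): the GoogleSQL dialect matches table names case-insensitively via UPPER(TABLE_NAME)=UPPER(@tableName), while the PostgreSQL dialect compares columns.table_name against the positional parameter $1 and never uses UPPER().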
14 | 15 | package com.google.cloud.spark.spanner; 16 | 17 | import static com.google.common.truth.Truth.assertThat; 18 | 19 | import com.google.cloud.spanner.Statement; 20 | import org.junit.Test; 21 | import org.junit.runner.RunWith; 22 | import org.junit.runners.JUnit4; 23 | 24 | /** Unit tests for SpannerTableSchema.buildSchemaQuery() */ 25 | @RunWith(JUnit4.class) 26 | public class SpannerTableSchemaTest { 27 | 28 | @Test 29 | public void testBuildSchemaQuery_googleSql_usesCaseInsensitiveComparison() { 30 | Statement stmt = SpannerTableSchema.buildSchemaQuery("MyTable", false); 31 | String query = stmt.getSql(); 32 | 33 | // Verify GoogleSQL uses UPPER() for case-insensitive table name comparison 34 | assertThat(query).contains("UPPER(TABLE_NAME)=UPPER(@tableName)"); 35 | } 36 | 37 | @Test 38 | public void testBuildSchemaQuery_googleSql_differentCasing() { 39 | // Test with different table name casings 40 | Statement stmt1 = SpannerTableSchema.buildSchemaQuery("mytable", false); 41 | Statement stmt2 = SpannerTableSchema.buildSchemaQuery("MyTable", false); 42 | Statement stmt3 = SpannerTableSchema.buildSchemaQuery("MYTABLE", false); 43 | 44 | // All should generate the same query structure with UPPER() 45 | assertThat(stmt1.getSql()).contains("UPPER(TABLE_NAME)=UPPER(@tableName)"); 46 | assertThat(stmt2.getSql()).contains("UPPER(TABLE_NAME)=UPPER(@tableName)"); 47 | assertThat(stmt3.getSql()).contains("UPPER(TABLE_NAME)=UPPER(@tableName)"); 48 | } 49 | 50 | @Test 51 | public void testBuildSchemaQuery_postgreSql_usesDirectComparison() { 52 | Statement stmt = SpannerTableSchema.buildSchemaQuery("myTable", true); 53 | String query = stmt.getSql(); 54 | 55 | // Verify PostgreSQL uses direct comparison without UPPER() 56 | assertThat(query).contains("columns.table_name=$1"); 57 | assertThat(query).doesNotContain("UPPER"); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 4 | 4.0.0 5 | 6 | com.google.cloud.spark.spanner 7 | spark-spanner-parent 8 | ${revision} 9 | spark-spanner-parent 10 | 11 | 12 | spark-spanner-reactor 13 | pom 14 | Spark Spanner Connector Reactor 15 | 16 | 17 | 18 | Apache License, Version 2.0 19 | http://www.apache.org/licenses/LICENSE-2.0.txt 20 | repo 21 | 22 | 23 | 24 | 25 | 26 | Google Inc. 
27 | http://www.google.com 28 | 29 | 30 | 31 | 32 | 33 | scm:git:git@github.com:GoogleCloudDataproc/spark-spanner-connector.git 34 | 35 | 36 | scm:git:git@github.com:GoogleCloudDataproc/spark-spanner-connector.git 37 | 38 | git@github.com:GoogleCloudDataproc/spark-spanner-connector.git 39 | 40 | 41 | 42 | GitHub Issues 43 | 44 | https://github.com/GoogleCloudDataproc/spark-spanner-connector/issues 45 | 46 | 47 | 48 | 49 | spark-spanner-parent 50 | spark-spanner-lib-parent 51 | spark-3.1-spanner-lib 52 | 53 | 54 | 55 | 56 | 3.1 57 | false 58 | 59 | spark-3.1-spanner 60 | 61 | 62 | 63 | 3.2 64 | false 65 | 66 | spark-3.2-spanner-lib 67 | spark-3.2-spanner 68 | 69 | 70 | 71 | 3.3 72 | false 73 | 74 | spark-3.2-spanner-lib 75 | spark-3.2-spanner 76 | spark-3.3-spanner-lib 77 | spark-3.3-spanner 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/WriteIntegrationTestBase.java: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.spark.spanner; 16 | 17 | import static com.google.common.truth.Truth.assertThat; 18 | import static org.junit.Assert.assertThrows; 19 | 20 | import java.util.Map; 21 | import org.apache.spark.SparkException; 22 | import org.apache.spark.sql.DataFrameWriter; 23 | import org.apache.spark.sql.Dataset; 24 | import org.apache.spark.sql.Row; 25 | import org.junit.Test; 26 | 27 | public class WriteIntegrationTestBase extends SparkSpannerIntegrationTestBase { 28 | 29 | private static ReadIntegrationTestBase readIntegration = new ReadIntegrationTestBase(); 30 | private Map props = this.connectionProperties(); 31 | 32 | public DataFrameWriter writerToTable(Dataset df, String table) { 33 | return df.write() 34 | .format("cloud-spanner") 35 | .option("viewsEnabled", true) 36 | .option("projectId", props.get("projectId")) 37 | .option("instanceId", props.get("instanceId")) 38 | .option("databaseId", props.get("databaseId")) 39 | .option("emulatorHost", props.get("emulatorHost")) 40 | .option("table", table); 41 | } 42 | 43 | @Test 44 | public void testWritesToTableFail() { 45 | String table = "compositeTable"; 46 | Dataset drf = readIntegration.readFromTable(table); 47 | SparkException e = 48 | assertThrows( 49 | SparkException.class, 50 | () -> { 51 | DataFrameWriter dwf = writerToTable(drf.select("id"), table); 52 | dwf.saveAsTable(table); 53 | }); 54 | assertThat(e) 55 | .hasMessageThat() 56 | .isEqualTo("Table implementation does not support writes: default.compositeTable"); 57 | } 58 | 59 | @Test 60 | public void testCreateTableFail() { 61 | String table = "compositeTable"; 62 | Dataset drf = readIntegration.readFromTable(table); 63 | SpannerConnectorException e = 64 | assertThrows( 65 | SpannerConnectorException.class, 66 | () -> { 67 | DataFrameWriter dwf = 
writerToTable(drf.select("id"), table); 68 | dwf.save(); 69 | }); 70 | assertThat(e) 71 | .hasMessageThat() 72 | .isEqualTo("writes are not supported in the Spark Spanner Connector"); 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/graph/query/NodeElementTableQuery.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.spark.spanner.graph.query; 2 | 3 | import com.google.cloud.spark.spanner.SpannerTableSchema; 4 | import com.google.cloud.spark.spanner.graph.PropertyGraph; 5 | import com.google.cloud.spark.spanner.graph.PropertyGraph.GraphElementTable; 6 | import com.google.cloud.spark.spanner.graph.SpannerGraphConfigs; 7 | import com.google.cloud.spark.spanner.graph.SpannerGraphConfigs.LabelConfig; 8 | import java.util.List; 9 | 10 | /** Query for a node table */ 11 | public class NodeElementTableQuery extends ElementTableQuery { 12 | 13 | /** 14 | * Construct a query for a node element table 15 | * 16 | * @param graphSchema schema of the graph 17 | * @param elementTable the element table to construct a query for 18 | * @param configs user configs for exporting the graph 19 | * @param exportIdColumnDirectly export the key column directly as the "id" column to avoid the 20 | * need of downstream ID translation. Should be true only when there is only one key column in 21 | * this table. 22 | * @return a {@link EdgeElementTableQuery} for the element table. 23 | */ 24 | public static NodeElementTableQuery create( 25 | PropertyGraph graphSchema, 26 | GraphElementTable elementTable, 27 | SpannerTableSchema baseTableSchema, 28 | SpannerGraphConfigs configs, 29 | boolean exportIdColumnDirectly) { 30 | 31 | List matchedLabels = getMatchedLabels(elementTable, configs.nodeLabelConfigs); 32 | 33 | return new NodeElementTableQuery( 34 | graphSchema, 35 | elementTable, 36 | baseTableSchema, 37 | configs.outputIndividualKeys, 38 | exportIdColumnDirectly, 39 | mergeProperties(elementTable, matchedLabels), 40 | mergeWhereClauses(matchedLabels)); 41 | } 42 | 43 | private NodeElementTableQuery( 44 | PropertyGraph graphSchema, 45 | GraphElementTable elementTable, 46 | SpannerTableSchema baseTableSchema, 47 | boolean outputIndividualKeys, 48 | boolean exportIdColumnDirectly, 49 | List properties, 50 | String whereClause) { 51 | super(baseTableSchema, whereClause); 52 | if (!PropertyGraph.GRAPH_ELEMENT_TABLE_KIND_NODE.equalsIgnoreCase(elementTable.kind)) { 53 | throw new IllegalArgumentException("Invalid elementTable kind: " + elementTable.kind); 54 | } 55 | 56 | if (exportIdColumnDirectly) { 57 | if (elementTable.keyColumns.size() != 1) { 58 | throw new IllegalArgumentException( 59 | "Cannot export multiple key columns directly as one ID column."); 60 | } 61 | addDirectField(elementTable.keyColumns.get(0), "id"); 62 | } else { 63 | if (outputIndividualKeys) { 64 | addNodeTableColumn("id", graphSchema.getTableId(elementTable.name)); 65 | addIndividualKeyColumns("id", elementTable.keyColumns, elementTable.keyColumns); 66 | } else { 67 | addCombinedId("id", graphSchema.getTableId(elementTable.name), elementTable.keyColumns); 68 | } 69 | } 70 | 71 | addInnerProperties(elementTable.propertyDefinitions); 72 | addOutputProperties(graphSchema, properties); 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/test/resources/db/insert_data_pg.sql: 
-------------------------------------------------------------------------------- 1 | DELETE FROM composite_table WHERE 1=1; 2 | 3 | INSERT INTO composite_table (id, charvcol, textcol, varcharcol, boolcol, booleancol, bigintcol, int8col, intcol, doublecol, floatcol, bytecol, datecol, numericcol, decimalcol, timewithzonecol, timestampcol, jsoncol) 4 | VALUES (1, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL), 5 | (2, 'charvcol', 'textcol', 'varcharcol', true, false, 1, -1, 0, 0.00000001, 0.00000001, 'beefdead', '1999-01-08', NUMERIC '1.23456e05', NUMERIC '9e23', '2003-04-12 04:05:06 America/Los_Angeles', '2003-04-12 05:05:06 America/Los_Angeles', '{"tags": ["multi-cuisine", "open-seating"], "rating": 4.5}'); 6 | 7 | DELETE FROM array_table WHERE 1=1; 8 | 9 | INSERT INTO array_table (id, charvarray, boolarray, bigintarray, doublearray, bytearray, datearray, numericarray, timestamparray, jsonarray) 10 | VALUES (1, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL), 11 | (2, '{NULL, "charvarray"}', '{NULL, true}', '{NULL, 1024}', '{NULL, 0.00000001}', '{NULL, "beefdead"}', '{NULL, "1999-01-08"}', '{NULL, "1.2345e05"}', '{NULL, "2003-04-12 04:05:06 America/Los_Angeles"}', ARRAY[NULL, CAST('{"tags": ["multi-cuisine", "open-seating"], "rating": 4.5}' as JSONB)]), 12 | (3, '{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}'); 13 | 14 | DELETE FROM integration_composite_table WHERE 1=1; 15 | 16 | INSERT INTO integration_composite_table (id, charvcol, textcol, varcharcol, boolcol, booleancol, bigintcol, int8col, intcol, doublecol, floatcol, bytecol, datecol, numericcol, decimalcol, timewithzonecol, timestampcol, jsoncol) 17 | VALUES (1, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL), 18 | (2, 'charvcol', 'textcol', 'varcharcol', true, false, 1, -1, 0, 0.00000001, 0.00000001, 'beefdead', '1999-01-08', NUMERIC '1.23456e05', NUMERIC '9e23', '2003-04-12 04:05:06 America/Los_Angeles', '2003-04-12 05:05:06 America/Los_Angeles', '{"tags": ["multi-cuisine", "open-seating"], "rating": 4.5}'), 19 | (3, NULL, NULL, NULL, NULL, NULL, 9223372036854775807, -9223372036854775808, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL), 20 | (4, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'NaN', 'NaN', NULL, NULL, NULL, NULL, NULL, NULL, NULL), 21 | (5, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, '9999-12-31', NULL, NULL, NULL, NULL, NULL), 22 | (6, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, '1700-01-01', NULL, NULL, NULL, NULL, NULL), 23 | (7, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 99999999999999999999999999999.999999999, -99999999999999999999999999999.999999999, NULL, NULL, NULL), 24 | (8, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, '0001-01-01 23:00:00 America/Los_Angeles', '9999-12-30 01:00:00 America/Los_Angeles', NULL), 25 | (9, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'NaN', 'NaN', NULL, NULL, NULL), 26 | (10, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'inf', '-inf', NULL, NULL, NULL, NULL, NULL, NULL, NULL); 27 | 28 | 29 | DELETE FROM numeric_table WHERE 1=1; 30 | 31 | INSERT INTO numeric_table (id, numericcol) 32 | VALUES (1, 9999999999999999999999999999999999999999999.9999999999999999999999); 33 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: 
-------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement. You (or your employer) retain the copyright to your contribution; 10 | this simply gives us permission to use and redistribute your contributions as 11 | part of the project. Head over to to see 12 | your current agreements on file or to sign a new one. 13 | 14 | You generally only need to submit a CLA once, so if you've already submitted one 15 | (even if it was for a different project), you probably don't need to do it 16 | again. 17 | 18 | ## Code reviews 19 | 20 | All submissions, including submissions by project members, require review. We 21 | use GitHub pull requests for this purpose. Consult 22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 23 | information on using pull requests. 24 | 25 | ## Community Guidelines 26 | 27 | This project follows 28 | [Google's Open Source Community Guidelines](https://opensource.google.com/conduct/). 29 | 30 | ## Building the project 31 | 32 | To build, package, and run all unit tests run the command 33 | 34 | ``` 35 | mvn clean verify 36 | ``` 37 | 38 | ### Running Integration tests 39 | 40 | To include integration tests when building the project, you need access to 41 | a GCP Project with a valid service account. 42 | 43 | For instructions on how to generate a service account and corresponding 44 | credentials JSON see: [Creating a Service Account][1]. 45 | 46 | Then run the following to build, package, run all unit tests and run all 47 | integration tests. 48 | 49 | ```bash 50 | export GOOGLE_APPLICATION_CREDENTIALS=/path/to/service/account.json 51 | mvn -Penable-integration-tests clean verify 52 | ``` 53 | 54 | ## Code Samples 55 | 56 | All code samples must be in compliance with the [java sample formatting guide][3]. 57 | Code Samples must be bundled in separate Maven modules. 58 | 59 | The samples must be separate from the primary project for a few reasons: 60 | 1. Primary projects have a minimum Java version of Java 8 whereas samples can have 61 | Java version of Java 11. Due to this we need the ability to 62 | selectively exclude samples from a build run. 63 | 2. Many code samples depend on external GCP services and need 64 | credentials to access the service. 65 | 3. Code samples are not released as Maven artifacts and must be excluded from 66 | release builds. 67 | 68 | ### Building 69 | 70 | ```bash 71 | mvn clean verify 72 | ``` 73 | 74 | Some samples require access to GCP services and require a service account: 75 | 76 | ```bash 77 | export GOOGLE_APPLICATION_CREDENTIALS=/path/to/service/account.json 78 | mvn clean verify 79 | ``` 80 | 81 | ### Code Formatting 82 | 83 | Code in this repo is formatted with 84 | [google-java-format](https://github.com/google/google-java-format). 
85 | To run formatting on your project, you can run: 86 | ``` 87 | mvn com.coveo:fmt-maven-plugin:format 88 | ``` 89 | 90 | [1]: https://cloud.google.com/docs/authentication/getting-started#creating_a_service_account 91 | [2]: https://maven.apache.org/settings.html#Active_Profiles 92 | [3]: https://github.com/GoogleCloudPlatform/java-docs-samples/blob/main/SAMPLE_FORMAT.md 93 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/graph/SpannerRowConverterWithSchema.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.spark.spanner.graph; 2 | 3 | import com.google.cloud.Tuple; 4 | import com.google.cloud.spanner.Struct; 5 | import com.google.cloud.spark.spanner.SpannerRowConverter; 6 | import com.google.cloud.spark.spanner.SpannerUtils; 7 | import com.google.common.collect.Streams; 8 | import java.io.Serializable; 9 | import java.util.ArrayList; 10 | import java.util.List; 11 | import java.util.Map; 12 | import java.util.stream.Collectors; 13 | import org.apache.spark.sql.catalyst.InternalRow; 14 | import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; 15 | import org.apache.spark.sql.types.StructField; 16 | import org.apache.spark.sql.types.StructType; 17 | 18 | /** Converts rows from Spanner query outputs to rows in a Spark DataFrame with specific schema. */ 19 | public class SpannerRowConverterWithSchema implements SpannerRowConverter, Serializable { 20 | 21 | private final List sparkFields = new ArrayList<>(); 22 | 23 | public SpannerRowConverterWithSchema( 24 | StructType dataframeSchema, 25 | List queryOutputColumns, 26 | Map fixedValues) { 27 | Map nameToQueryOutputColumnIndex = 28 | Streams.mapWithIndex(queryOutputColumns.stream(), Tuple::of) 29 | .collect(Collectors.toMap(Tuple::x, Tuple::y)); 30 | for (StructField field : dataframeSchema.fields()) { 31 | Integer fixedValue = fixedValues.get(field.name()); 32 | if (fixedValue != null) { 33 | sparkFields.add(new FixedIntField(fixedValue)); 34 | continue; 35 | } 36 | Long spannerRowIndex = nameToQueryOutputColumnIndex.get(field.name()); 37 | if (spannerRowIndex != null) { 38 | sparkFields.add(new ValueField(spannerRowIndex.intValue())); 39 | continue; 40 | } 41 | sparkFields.add(new NullField()); 42 | } 43 | } 44 | 45 | @Override 46 | public InternalRow convert(Struct spannerRow) { 47 | GenericInternalRow sparkRow = new GenericInternalRow(sparkFields.size()); 48 | for (int i = 0; i < sparkFields.size(); ++i) { 49 | sparkFields.get(i).update(sparkRow, spannerRow, i); 50 | } 51 | return sparkRow; 52 | } 53 | 54 | private abstract static class Field implements Serializable { 55 | public abstract void update(GenericInternalRow sparkRow, Struct spannerRow, int sparkRowIndex); 56 | } 57 | 58 | private static class NullField extends Field { 59 | @Override 60 | public void update(GenericInternalRow sparkRow, Struct spannerRow, int sparkRowIndex) { 61 | sparkRow.update(sparkRowIndex, null); 62 | } 63 | } 64 | 65 | private static class FixedIntField extends Field { 66 | 67 | private final int value; 68 | 69 | FixedIntField(int value) { 70 | this.value = value; 71 | } 72 | 73 | @Override 74 | public void update(GenericInternalRow sparkRow, Struct spannerRow, int sparkRowIndex) { 75 | sparkRow.setInt(sparkRowIndex, value); 76 | } 77 | } 78 | 79 | private static class ValueField extends Field { 80 | 81 | private final int spannerRowIndex; 82 | 83 | ValueField(int 
spannerRowIndex) { 84 | this.spannerRowIndex = spannerRowIndex; 85 | } 86 | 87 | @Override 88 | public void update(GenericInternalRow sparkRow, Struct spannerRow, int sparkRowIndex) { 89 | SpannerUtils.convertRowAt(spannerRow, spannerRowIndex, sparkRow, sparkRowIndex); 90 | } 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/SpannerScannerTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.spark.spanner; 16 | 17 | import static com.google.common.truth.Truth.assertThat; 18 | 19 | import java.util.Arrays; 20 | import java.util.HashSet; 21 | import java.util.Set; 22 | import org.junit.Test; 23 | import org.junit.runner.RunWith; 24 | import org.junit.runners.JUnit4; 25 | 26 | /** Unit tests for SpannerScanner.buildColumnsWithTablePrefix() */ 27 | @RunWith(JUnit4.class) 28 | public class SpannerScannerTest { 29 | 30 | @Test 31 | public void testBuildColumnsWithTablePrefix_googleSql_singleColumn() { 32 | Set columns = new HashSet<>(Arrays.asList("id")); 33 | String result = SpannerScanner.buildColumnsWithTablePrefix("users", columns, false); 34 | assertThat(result).isEqualTo("`users`.`id`"); 35 | } 36 | 37 | @Test 38 | public void testBuildColumnsWithTablePrefix_googleSql_multipleColumns() { 39 | Set columns = new HashSet<>(Arrays.asList("id", "name")); 40 | String result = SpannerScanner.buildColumnsWithTablePrefix("users", columns, false); 41 | assertThat(result).contains("`users`.`id`"); 42 | assertThat(result).contains("`users`.`name`"); 43 | } 44 | 45 | @Test 46 | public void testBuildColumnsWithTablePrefix_googleSql_columnMatchingTableName() { 47 | Set columns = new HashSet<>(Arrays.asList("users", "id")); 48 | String result = SpannerScanner.buildColumnsWithTablePrefix("users", columns, false); 49 | assertThat(result).contains("`users`.`users`"); 50 | assertThat(result).contains("`users`.`id`"); 51 | } 52 | 53 | @Test 54 | public void testBuildColumnsWithTablePrefix_postgreSql_singleColumn() { 55 | Set columns = new HashSet<>(Arrays.asList("id")); 56 | String result = SpannerScanner.buildColumnsWithTablePrefix("users", columns, true); 57 | assertThat(result).isEqualTo("\"users\".\"id\""); 58 | } 59 | 60 | @Test 61 | public void testBuildColumnsWithTablePrefix_postgreSql_multipleColumns() { 62 | Set columns = new HashSet<>(Arrays.asList("id", "name")); 63 | String result = SpannerScanner.buildColumnsWithTablePrefix("users", columns, true); 64 | assertThat(result).contains("\"users\".\"id\""); 65 | assertThat(result).contains("\"users\".\"name\""); 66 | } 67 | 68 | @Test 69 | public void testBuildColumnsWithTablePrefix_postgreSql_columnMatchingTableName() { 70 | Set columns = new HashSet<>(Arrays.asList("users", "id")); 71 | String result = 
SpannerScanner.buildColumnsWithTablePrefix("users", columns, true); 72 | assertThat(result).contains("\"users\".\"users\""); 73 | assertThat(result).contains("\"users\".\"id\""); 74 | } 75 | 76 | @Test 77 | public void testBuildColumnsWithTablePrefix_emptyColumns() { 78 | Set columns = new HashSet<>(); 79 | String result = SpannerScanner.buildColumnsWithTablePrefix("users", columns, false); 80 | assertThat(result).isEmpty(); 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/graph/GraphErrorHandlingTest.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.spark.spanner.graph; 2 | 3 | import com.google.cloud.spark.spanner.graph.SpannerGraphConfigs.LabelConfig; 4 | import com.google.common.collect.ImmutableList; 5 | import com.google.gson.Gson; 6 | import java.util.Collection; 7 | import java.util.Collections; 8 | import org.apache.spark.sql.DataFrameReader; 9 | import org.junit.Assert; 10 | import org.junit.Test; 11 | 12 | public class GraphErrorHandlingTest extends GraphReadIntegrationTestBase { 13 | 14 | @Test 15 | public void testDirectQueryNonRootPartitionable() { 16 | String nodeQuery = 17 | "SELECT * FROM GRAPH_TABLE (MusicGraph MATCH (n:SINGER|ALBUM) RETURN n.id AS id)"; 18 | DataFrameReader reader = 19 | musicGraphReader(null).option("graphQuery", nodeQuery).option("type", "node"); 20 | Exception e = Assert.assertThrows(Exception.class, reader::load); 21 | Assert.assertTrue(e.getMessage().contains("root-partitionable")); 22 | } 23 | 24 | @Test 25 | public void testDirectQueryNoId() { 26 | String nodeQuery = 27 | "SELECT * FROM GRAPH_TABLE (MusicGraph MATCH (n:SINGER) RETURN n.id AS no_id)"; 28 | DataFrameReader reader = 29 | musicGraphReader(null).option("graphQuery", nodeQuery).option("type", "node"); 30 | Exception e = Assert.assertThrows(IllegalArgumentException.class, reader::load); 31 | Assert.assertTrue(e.getMessage().contains("id missing")); 32 | } 33 | 34 | @Test 35 | public void testWildcardLabelMixedWithOtherLabel() { 36 | SpannerGraphConfigs configs = new SpannerGraphConfigs(); 37 | configs.nodeLabelConfigs.add(new LabelConfig("*", Collections.emptyList(), null)); 38 | configs.nodeLabelConfigs.add(new LabelConfig("SINGER", Collections.emptyList(), null)); 39 | configs.edgeLabelConfigs.add(new LabelConfig("*", Collections.emptyList(), null)); 40 | configs.edgeLabelConfigs.add(new LabelConfig("KNOWN", Collections.emptyList(), null)); 41 | 42 | DataFrameReader reader = musicGraphReader(null).option("configs", new Gson().toJson(configs)); 43 | 44 | Assert.assertThrows(IllegalArgumentException.class, () -> reader.option("type", "node").load()); 45 | Assert.assertThrows(IllegalArgumentException.class, () -> reader.option("type", "edge").load()); 46 | } 47 | 48 | @Test 49 | public void testEdgeReferencingFilteredOutNodes() { 50 | SpannerGraphConfigs configs = new SpannerGraphConfigs(); 51 | configs.nodeLabelConfigs.add(new LabelConfig("SINGER", Collections.emptyList(), null)); 52 | Assert.assertThrows(IllegalArgumentException.class, () -> readEdges(musicGraphReader(configs))); 53 | } 54 | 55 | private void testNonExistentProperties(Collection labels, boolean node) { 56 | SpannerGraphConfigs configs = new SpannerGraphConfigs(); 57 | if (node) { 58 | configs.nodeLabelConfigs.addAll(labels); 59 | } else { 60 | configs.edgeLabelConfigs.addAll(labels); 61 | } 62 | Exception e = 63 | Assert.assertThrows( 64 
| IllegalArgumentException.class, 65 | () -> musicGraphReader(configs).option("type", node ? "node" : "edge").load()); 66 | Assert.assertTrue(e.getMessage().contains("property")); 67 | } 68 | 69 | @Test 70 | public void testNonExistentProperties() { 71 | testNonExistentProperties( 72 | ImmutableList.of(new LabelConfig("*", ImmutableList.of("FriendId"), null)), true); 73 | testNonExistentProperties( 74 | ImmutableList.of(new LabelConfig("*", ImmutableList.of("AlbumTitle"), null)), false); 75 | testNonExistentProperties( 76 | ImmutableList.of(new LabelConfig("SINGER", ImmutableList.of("album_id"), null)), true); 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/SpannerTableSchema.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.spark.spanner; 2 | 3 | import com.google.cloud.spanner.ResultSet; 4 | import com.google.cloud.spanner.Statement; 5 | import com.google.cloud.spanner.Struct; 6 | import com.google.cloud.spanner.connection.Connection; 7 | import java.util.HashMap; 8 | import java.util.Map; 9 | import java.util.Objects; 10 | import org.apache.spark.sql.types.DataType; 11 | import org.apache.spark.sql.types.MetadataBuilder; 12 | import org.apache.spark.sql.types.StructField; 13 | import org.apache.spark.sql.types.StructType; 14 | 15 | public class SpannerTableSchema { 16 | 17 | private static final String QUERY_PREFIX = 18 | "SELECT COLUMN_NAME, IS_NULLABLE='YES' AS ISNULLABLE, SPANNER_TYPE " 19 | + "FROM INFORMATION_SCHEMA.COLUMNS WHERE "; 20 | private static final String QUERY_SUFFIX = " ORDER BY ORDINAL_POSITION"; 21 | private static final String GOOGLESQL_SCHEMA = 22 | QUERY_PREFIX + "UPPER(TABLE_NAME)=UPPER(@tableName)" + QUERY_SUFFIX; 23 | private static final String POSTGRESQL_SCHEMA = 24 | QUERY_PREFIX + "columns.table_name=$1" + QUERY_SUFFIX; 25 | 26 | private final Map columns; 27 | 28 | public final String name; 29 | public final StructType schema; 30 | 31 | static Statement buildSchemaQuery(String tableName, boolean isPostgreSql) { 32 | if (isPostgreSql) { 33 | return Statement.newBuilder(POSTGRESQL_SCHEMA).bind("p1").to(tableName).build(); 34 | } else { 35 | return Statement.newBuilder(GOOGLESQL_SCHEMA).bind("tableName").to(tableName).build(); 36 | } 37 | } 38 | 39 | public SpannerTableSchema(Connection conn, String tableName, boolean isPostgreSql) { 40 | this.name = tableName; 41 | this.columns = new HashMap<>(); 42 | Statement stmt = buildSchemaQuery(tableName, isPostgreSql); 43 | try (final ResultSet rs = conn.executeQuery(stmt)) { 44 | // Expecting resultset columns in the ordering: 45 | // COLUMN_NAME, IS_NULLABLE, SPANNER_TYPE 46 | // row1: 47 | // ... 48 | // rowN: 49 | StructType schema = new StructType(); 50 | while (rs.next()) { 51 | Struct row = rs.getCurrentRowAsStruct(); 52 | String columnName = row.getString(0); 53 | StructField structField = 54 | getSparkStructField(columnName, row.getString(2), row.getBoolean(1), isPostgreSql); 55 | schema = schema.add(structField); 56 | this.columns.put(columnName, structField); 57 | } 58 | this.schema = schema; 59 | } 60 | } 61 | 62 | public static StructField getSparkStructField( 63 | String name, String spannerType, boolean isNullable, boolean isPostgreSql) { 64 | DataType catalogType = 65 | isPostgreSql 66 | ? 
SpannerTable.ofSpannerStrTypePg(spannerType, isNullable) 67 | : SpannerTable.ofSpannerStrType(spannerType, isNullable); 68 | MetadataBuilder metadataBuilder = new MetadataBuilder(); 69 | if (isJson(spannerType)) { 70 | metadataBuilder.putString(SpannerUtils.COLUMN_TYPE, "json"); 71 | } else if (isJsonb(spannerType)) { 72 | metadataBuilder.putString(SpannerUtils.COLUMN_TYPE, "jsonb"); 73 | } 74 | return new StructField(name, catalogType, isNullable, metadataBuilder.build()); 75 | } 76 | 77 | public StructField getStructFieldForColumn(String columnName) { 78 | return Objects.requireNonNull(columns.get(columnName)); 79 | } 80 | 81 | public static boolean isJson(String spannerStrType) { 82 | return "json".equalsIgnoreCase(spannerStrType.trim()); 83 | } 84 | 85 | public static boolean isJsonb(String spannerStrType) { 86 | return "jsonb".equalsIgnoreCase(spannerStrType.trim()); 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/graph/SpannerGraph.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.spark.spanner.graph; 2 | 3 | import com.google.cloud.spanner.Options; 4 | import com.google.cloud.spanner.Statement; 5 | import com.google.cloud.spanner.TimestampBound; 6 | import com.google.cloud.spark.spanner.SpannerConnectorException; 7 | import com.google.cloud.spark.spanner.SpannerErrorCode; 8 | import com.google.cloud.spark.spanner.graph.query.SpannerGraphQuery; 9 | import com.google.common.collect.ImmutableList; 10 | import com.google.common.collect.ImmutableSet; 11 | import java.util.List; 12 | import java.util.Map; 13 | import java.util.Objects; 14 | import java.util.Set; 15 | import javax.annotation.Nullable; 16 | import org.apache.spark.sql.connector.catalog.SupportsRead; 17 | import org.apache.spark.sql.connector.catalog.SupportsWrite; 18 | import org.apache.spark.sql.connector.catalog.Table; 19 | import org.apache.spark.sql.connector.catalog.TableCapability; 20 | import org.apache.spark.sql.connector.read.ScanBuilder; 21 | import org.apache.spark.sql.connector.write.LogicalWriteInfo; 22 | import org.apache.spark.sql.connector.write.WriteBuilder; 23 | import org.apache.spark.sql.types.StructType; 24 | import org.apache.spark.sql.util.CaseInsensitiveStringMap; 25 | 26 | /** Represents the Spanner Graph data source in Spark */ 27 | public class SpannerGraph implements Table, SupportsRead, SupportsWrite { 28 | 29 | static final List requiredOptions = 30 | ImmutableList.of("projectId", "instanceId", "databaseId", "graph", "type"); 31 | 32 | public final Map options; 33 | public final Options.ReadAndQueryOption dataBoostEnabled; 34 | public final SpannerGraphConfigs configs; 35 | public final @Nullable Statement directQuery; 36 | public final boolean nodeDataframe; 37 | public final SpannerGraphQuery spannerGraphQuery; 38 | public final TimestampBound readTimestamp; 39 | public final String graphName; 40 | 41 | SpannerGraph( 42 | Map options, 43 | String graphName, 44 | SpannerGraphConfigs configs, 45 | @Nullable Statement directQuery, 46 | boolean dataBoost, 47 | boolean node, 48 | TimestampBound readTimestamp, 49 | SpannerGraphQuery spannerGraphQuery) { 50 | checkOptions(options); 51 | this.graphName = graphName; 52 | this.options = new CaseInsensitiveStringMap(options); 53 | this.configs = Objects.requireNonNull(configs); 54 | this.directQuery = directQuery; 55 | this.dataBoostEnabled = 
Options.dataBoostEnabled(dataBoost); 56 | this.nodeDataframe = node; 57 | this.readTimestamp = readTimestamp; 58 | this.spannerGraphQuery = spannerGraphQuery; 59 | } 60 | 61 | static void checkOptions(Map options) { 62 | for (String o : requiredOptions) { 63 | Objects.requireNonNull(options.get(o), "missing " + o + " in the options"); 64 | } 65 | } 66 | 67 | @Override 68 | public ScanBuilder newScanBuilder(CaseInsensitiveStringMap options) { 69 | return new SpannerGraphScanBuilder(this); 70 | } 71 | 72 | @Override 73 | public WriteBuilder newWriteBuilder(LogicalWriteInfo info) { 74 | throw new SpannerConnectorException( 75 | SpannerErrorCode.WRITES_NOT_SUPPORTED, 76 | "writes are not supported in the Spark Spanner Connector"); 77 | } 78 | 79 | @Override 80 | public String name() { 81 | return graphName; 82 | } 83 | 84 | /** Returns the schema of this table. */ 85 | @Override 86 | public StructType schema() { 87 | return spannerGraphQuery.dataframeSchema; 88 | } 89 | 90 | /** Returns the set of capabilities for this table. */ 91 | @Override 92 | public Set capabilities() { 93 | return ImmutableSet.of(TableCapability.BATCH_READ); 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/graph/query/DirectGraphQuery.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.spark.spanner.graph.query; 2 | 3 | import com.google.cloud.Tuple; 4 | import com.google.cloud.spanner.ReadContext.QueryAnalyzeMode; 5 | import com.google.cloud.spanner.ResultSet; 6 | import com.google.cloud.spanner.Statement; 7 | import com.google.cloud.spanner.connection.Connection; 8 | import com.google.cloud.spark.spanner.SpannerRowConverter; 9 | import com.google.cloud.spark.spanner.SpannerRowConverterDirect; 10 | import com.google.cloud.spark.spanner.SpannerTableSchema; 11 | import com.google.common.collect.ImmutableSet; 12 | import com.google.spanner.v1.ResultSetMetadata; 13 | import java.util.ArrayList; 14 | import java.util.Arrays; 15 | import java.util.Collections; 16 | import java.util.List; 17 | import java.util.Set; 18 | import java.util.stream.Collectors; 19 | import org.apache.spark.sql.types.StructField; 20 | import org.apache.spark.sql.types.StructType; 21 | 22 | /** A user-provided GQL query for fetching nodes/edges */ 23 | public class DirectGraphQuery implements GraphSubQuery { 24 | 25 | private final Statement query; 26 | private final List outputSparkFields; 27 | 28 | public DirectGraphQuery(Connection conn, Statement query, boolean node) { 29 | this.query = query; 30 | this.outputSparkFields = Collections.unmodifiableList(getOutputSparkFields(conn, query, node)); 31 | } 32 | 33 | private static List getOutputSparkFields( 34 | Connection conn, Statement query, boolean node) { 35 | final Set idColumns = node ? ImmutableSet.of("id") : ImmutableSet.of("src", "dst"); 36 | 37 | List fields; 38 | try (ResultSet rs = conn.analyzeQuery(query, QueryAnalyzeMode.PLAN)) { 39 | fields = resultSetMetadataToSchema(rs.getMetadata(), idColumns); 40 | } 41 | for (String idColumn : idColumns) { 42 | boolean hasField = fields.stream().map(StructField::name).anyMatch(n -> n.equals(idColumn)); 43 | if (!hasField) { 44 | throw new IllegalArgumentException( 45 | String.format( 46 | "Column %s missing in the query output. Query: %s. 
Spark fields: %s", 47 | idColumn, query, fields)); 48 | } 49 | } 50 | return fields; 51 | } 52 | 53 | private static List resultSetMetadataToSchema( 54 | ResultSetMetadata metadata, Set notNullableColumns) { 55 | List fields = new ArrayList<>(); 56 | for (com.google.spanner.v1.StructType.Field column : metadata.getRowType().getFieldsList()) { 57 | String name = column.getName(); 58 | String type = column.getType().getCode().name(); 59 | boolean isNullable = !notNullableColumns.contains(name); 60 | fields.add(SpannerTableSchema.getSparkStructField(name, type, isNullable, false)); 61 | } 62 | return fields; 63 | } 64 | 65 | @Override 66 | public Tuple getQueryAndConverter(StructType dataframeSchema) { 67 | if (Arrays.equals(outputSparkFields.toArray(new StructField[0]), dataframeSchema.fields())) { 68 | return Tuple.of(query, new SpannerRowConverterDirect()); 69 | } else { 70 | String selectedColumnNames = 71 | Arrays.stream(dataframeSchema.fields()) 72 | .map(StructField::name) 73 | .collect(Collectors.joining(", ")); 74 | if (selectedColumnNames.isEmpty()) { 75 | selectedColumnNames = "null"; 76 | } 77 | String prunedSql = String.format("SELECT %s FROM (%s)", selectedColumnNames, query.getSql()); 78 | return Tuple.of( 79 | query.toBuilder().replace(prunedSql).build(), new SpannerRowConverterDirect()); 80 | } 81 | } 82 | 83 | @Override 84 | public List getOutputSparkFields() { 85 | return outputSparkFields; 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/SpannerInputPartitionReaderContext.java: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package com.google.cloud.spark.spanner; 16 | 17 | import com.fasterxml.jackson.core.JsonProcessingException; 18 | import com.google.cloud.spanner.BatchReadOnlyTransaction; 19 | import com.google.cloud.spanner.BatchTransactionId; 20 | import com.google.cloud.spanner.ErrorCode; 21 | import com.google.cloud.spanner.Partition; 22 | import com.google.cloud.spanner.ResultSet; 23 | import com.google.cloud.spanner.SpannerException; 24 | import java.io.IOException; 25 | import java.util.Map; 26 | import java.util.Objects; 27 | import org.apache.spark.sql.catalyst.InternalRow; 28 | import org.apache.spark.sql.util.CaseInsensitiveStringMap; 29 | 30 | public class SpannerInputPartitionReaderContext 31 | implements AutoCloseable, InputPartitionReaderContext { 32 | 33 | private BatchClientWithCloser batchClientWithCloser; 34 | private ResultSet rs; 35 | private final SpannerRowConverter rowConverter; 36 | 37 | public SpannerInputPartitionReaderContext( 38 | Partition partition, 39 | BatchTransactionId batchTransactionId, 40 | String mapAsJSONStr, 41 | SpannerRowConverter rowConverter) { 42 | Map opts; 43 | try { 44 | opts = SpannerUtils.deserializeMap(mapAsJSONStr); 45 | } catch (JsonProcessingException e) { 46 | throw new SpannerConnectorException( 47 | SpannerErrorCode.SPANNER_FAILED_TO_PARSE_OPTIONS, "Error parsing the input options.", e); 48 | } 49 | // The map might be case-insensitive when being serialized 50 | opts = new CaseInsensitiveStringMap(opts); 51 | 52 | // Please note that we are using BatchClientWithCloser to avoid resource leaks. 53 | // That is because, since we do have a deterministic scope and timeline for how long 54 | // SpannerInputPartitionReaderContext's BatchClient.Spanner will execute, we use this 55 | // custom client with an AutoCloser that'll clean up resources as it is garbage collected. 
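    // Rebuilding the BatchReadOnlyTransaction from the serialized BatchTransactionId means every executor reads at the snapshot chosen once on the driver, so all partitions observe a consistent view of the database.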
56 | this.batchClientWithCloser = SpannerUtils.batchClientFromProperties(opts); 57 | try (BatchReadOnlyTransaction txn = 58 | batchClientWithCloser.batchClient.batchReadOnlyTransaction(batchTransactionId)) { 59 | this.rs = txn.execute(partition); 60 | } 61 | this.rowConverter = Objects.requireNonNull(rowConverter); 62 | } 63 | 64 | @Override 65 | public boolean next() throws IOException { 66 | try { 67 | return this.rs.next(); 68 | } catch (SpannerException e) { 69 | if (e.getErrorCode() == ErrorCode.RESOURCE_EXHAUSTED) { 70 | throw new SpannerConnectorException( 71 | SpannerErrorCode.RESOURCE_EXHAUSTED_ON_SPANNER, 72 | e.getMessage().split("- Statement:")[0] 73 | + "You may receive the error message due to not enough quota on the project."); 74 | } 75 | throw e; 76 | } 77 | } 78 | 79 | @Override 80 | public InternalRow get() { 81 | return rowConverter.convert(this.rs.getCurrentRowAsStruct()); 82 | } 83 | 84 | @Override 85 | public void close() throws IOException { 86 | if (this.rs != null) { 87 | this.rs.close(); 88 | this.rs = null; 89 | } 90 | if (this.batchClientWithCloser != null) { 91 | this.batchClientWithCloser.close(); 92 | this.batchClientWithCloser = null; 93 | } 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/test/resources/db/insert_data_graph.sql: -------------------------------------------------------------------------------- 1 | DELETE FROM FlexibleGraphNode WHERE TRUE; 2 | INSERT INTO FlexibleGraphNode(id, type, properties, cluster_id) 3 | VALUES (1, "Person", JSON '{"birthday":"1991-12-21T08:00:00Z","city":"Adelaide","country":"Australia","name":"Alex"}', 1), 4 | (2, "Person", JSON '{"birthday":"1980-10-31T08:00:00Z","city":"Moravia","country":"Czech_Republic","name":"Dana"}', 1), 5 | (3, "Person", JSON '{"birthday":"1986-12-07T08:00:00Z","city":"Kollam","country":"India","name":"Lee"}', 1), 6 | (7, "Account", JSON '{"create_time":"2020-01-10T14:22:20.222Z","is_blocked":false,"nick_name":"Vacation Fund"}', 1), 7 | (16, "Account", JSON '{"create_time":"2020-01-28T01:55:09.206Z","is_blocked":true,"nick_name":"Vacation Fund"}', 1), 8 | (20, "Account", JSON '{"create_time":"2020-02-18T13:44:20.655Z","is_blocked":false,"nick_name":"Rainy Day Fund"}', 1), 9 | (100, "Account", JSON '{"create_time":"2020-01-10T14:22:20.222Z","is_blocked":false,"nick_name":"Vacation Fund"}', 100), 10 | (101, "Account", JSON '{"create_time":"2020-01-28T01:55:09.206Z","is_blocked":true,"nick_name":"Vacation Fund"}',100); 11 | 12 | DELETE FROM FlexibleGraphEdge WHERE TRUE; 13 | INSERT INTO FlexibleGraphEdge(id, edge_type, to_id, edge_id, properties) 14 | VALUES (1, "Owns", 7, "2020-01-10T14:22:20.222Z", 15 | JSON '{"create_time":"2020-01-10T14:22:20.222Z"}'), 16 | (2, "Owns", 20, "2020-01-28T01:55:09.206Z", 17 | JSON '{"create_time":"2020-01-28T01:55:09.206Z"}'), 18 | (3, "Owns", 16, "2020-02-18T13:44:20.655Z", 19 | JSON '{"create_time":"2020-02-18T13:44:20.655Z"}'), 20 | (7, "Transfers", 16, "2020-08-29T22:28:58.647Z", 21 | JSON '{"amount":300,"create_time":"2020-08-29T22:28:58.647Z","order_number":"304330008004315"}'), 22 | (7, "Transfers", 16, "2020-10-04T23:55:05.342Z", 23 | JSON '{"amount":100,"create_time":"2020-10-04T23:55:05.342Z","order_number":"304120005529714"}'), 24 | (16, "Transfers", 20, "2020-09-25T09:36:14.926Z", 25 | JSON '{"amount":300,"create_time":"2020-09-25T09:36:14.926Z","order_number":"103650009791820"}'), 26 | (20, "Transfers", 7, "2020-10-04T23:55:05.342Z", 27 | JSON 
'{"amount":500,"create_time":"2020-10-04T23:55:05.342Z","order_number":"304120005529714"}'), 28 | (20, "Transfers", 16, "2020-10-17T10:59:40.247Z", 29 | JSON '{"amount":200,"create_time":"2020-10-17T10:59:40.247Z","order_number":"302290001255747"}'), 30 | (100, "Transfers", 101, "2020-08-29T22:28:58.647Z", 31 | JSON '{"amount":300,"create_time":"2020-08-29T22:28:58.647Z","order_number":"304330008004315"}'); 32 | 33 | DELETE FROM ProductionCompanies WHERE TRUE; 34 | INSERT INTO ProductionCompanies (CompanyId, CompanyName, LocationCountry, FoundedYear) VALUES (1, 'Mellow Wave', 'U.S.A.', 1993), (2, 'Rolling Stow', 'Canada', 2002), (3, 'Picky Penang', 'Malaysia', 1984), (4, 'Ice Ice', 'Poland', 2012), (5, 'Oint Is Not An Ink', 'Peru', 2000); 35 | 36 | DELETE FROM Singers WHERE TRUE; 37 | INSERT INTO Singers (SingerId, FirstName, LastName, BirthDate) VALUES (1, 'Cruz', 'Richards', '1970-9-3'), (2, 'Tristan', 'Smith', '1990-8-17') ,(3, 'Izumi', 'Trentor', '1991-10-2'), (4, 'Ira', 'Martin', '1991-11-9'),(5, 'Mahan', 'Lomond', '1977-1-29'); 38 | 39 | DELETE FROM Albums WHERE TRUE; 40 | INSERT INTO Albums (SingerId, AlbumId, AlbumTitle, ReleaseDate, CompanyId) VALUES (1, 1, 'Total Junk', '2014-3-2', 1), (1, 2, 'Go Go Go', '2011-2-9', 1), (2, 3, 'Green', '2012-9-17', 2), (2, 4, 'Forever Hold Your Peace', '2010-10-15', 3), (3, 5, 'Terrified', '2008-6-7', 3), (4, 6, 'Nothing To Do With Me', '2014-4-29', 4), (5, 7, 'Play', '2013-12-21', 5); 41 | 42 | DELETE FROM SingerFriends WHERE TRUE; 43 | INSERT INTO SingerFriends (SingerId, FriendId) VALUES (1, 2), (1, 3), (2, 1), (2, 4), (2, 5), (3, 1), (3, 5), (4, 2), (4, 5), (5, 2), (5, 3), (5, 4); 44 | 45 | DELETE FROM SingerContracts WHERE TRUE; 46 | INSERT INTO SingerContracts (SingerId, CompanyId) VALUES (1, 4), (2, 2), (3, 5), (4, 1), (5, 3); -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/Spark31SpannerTableProvider.java: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package com.google.cloud.spark.spanner; 16 | 17 | import com.google.cloud.spark.spanner.graph.SpannerGraphBuilder; 18 | import java.util.Map; 19 | import javax.annotation.Nullable; 20 | import org.apache.spark.sql.Dataset; 21 | import org.apache.spark.sql.Row; 22 | import org.apache.spark.sql.SQLContext; 23 | import org.apache.spark.sql.SaveMode; 24 | import org.apache.spark.sql.connector.catalog.Table; 25 | import org.apache.spark.sql.connector.catalog.TableProvider; 26 | import org.apache.spark.sql.connector.expressions.Transform; 27 | import org.apache.spark.sql.sources.BaseRelation; 28 | import org.apache.spark.sql.sources.CreatableRelationProvider; 29 | import org.apache.spark.sql.sources.DataSourceRegister; 30 | import org.apache.spark.sql.types.StructType; 31 | import org.apache.spark.sql.util.CaseInsensitiveStringMap; 32 | 33 | public class Spark31SpannerTableProvider 34 | implements DataSourceRegister, TableProvider, CreatableRelationProvider { 35 | 36 | private @Nullable Table table; 37 | 38 | /* 39 | * Infers the schema of the table identified by the given options. 40 | */ 41 | @Override 42 | public StructType inferSchema(CaseInsensitiveStringMap options) { 43 | if (table == null) { 44 | table = getTable(options); 45 | } 46 | return table.schema(); 47 | } 48 | 49 | /* 50 | * Returns a Table instance with the specified table schema, 51 | * partitioning and properties to perform a read or write. 52 | */ 53 | @Override 54 | public Table getTable( 55 | StructType schema, Transform[] partitioning, Map<String, String> properties) { 56 | if (table == null) { 57 | table = getTable(properties); 58 | } 59 | return table; 60 | } 61 | 62 | /* 63 | * Returns true if the source has the ability to 64 | * accept external table metadata when getting tables. 65 | */ 66 | @Override 67 | public boolean supportsExternalMetadata() { 68 | return false; 69 | } 70 | 71 | /* 72 | * Implements DataSourceRegister.shortName(). This method allows Spark to match 73 | * the DataSource when spark.read(...).format("cloud-spanner") is invoked. 74 | */ 75 | @Override 76 | public String shortName() { 77 | return "cloud-spanner"; 78 | } 79 | 80 | /** Writes are not supported by the Spark Spanner Connector. */ 81 | @Override 82 | public BaseRelation createRelation( 83 | SQLContext sqlContext, 84 | SaveMode mode, 85 | scala.collection.immutable.Map<String, String> parameters, 86 | Dataset<Row> data) { 87 | throw new SpannerConnectorException( 88 | SpannerErrorCode.WRITES_NOT_SUPPORTED, 89 | "writes are not supported in the Spark Spanner Connector"); 90 | } 91 | 92 | private Table getTable(Map<String, String> properties) { 93 | boolean hasTable = properties.containsKey("table"); 94 | boolean hasGraph = properties.containsKey("graph"); 95 | if (hasTable && !hasGraph) { 96 | return new SpannerTable(properties); 97 | } else if (!hasTable && hasGraph) { 98 | return SpannerGraphBuilder.build(properties); 99 | } else { 100 | throw new SpannerConnectorException("properties must contain exactly one of \"table\" or \"graph\""); 101 | } 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/SpannerScanBuilder.java: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.spark.spanner; 16 | 17 | import com.google.common.collect.ImmutableSet; 18 | import java.util.ArrayList; 19 | import java.util.LinkedHashMap; 20 | import java.util.List; 21 | import java.util.Map; 22 | import java.util.Set; 23 | import org.apache.spark.sql.connector.read.Scan; 24 | import org.apache.spark.sql.connector.read.ScanBuilder; 25 | import org.apache.spark.sql.connector.read.SupportsPushDownFilters; 26 | import org.apache.spark.sql.connector.read.SupportsPushDownRequiredColumns; 27 | import org.apache.spark.sql.sources.Filter; 28 | import org.apache.spark.sql.types.StructField; 29 | import org.apache.spark.sql.types.StructType; 30 | import org.apache.spark.sql.util.CaseInsensitiveStringMap; 31 | import org.slf4j.Logger; 32 | import org.slf4j.LoggerFactory; 33 | 34 | /* 35 | * ScanBuilder implementation for Spanner, supporting filter push-down and column pruning. 36 | */ 37 | public class SpannerScanBuilder 38 | implements ScanBuilder, SupportsPushDownFilters, SupportsPushDownRequiredColumns { 39 | private CaseInsensitiveStringMap opts; 40 | private List<Filter> pushedFilters; 41 | private Set<String> requiredColumns; 42 | private SpannerScanner scanner; 43 | private static final Logger log = LoggerFactory.getLogger(SpannerScanBuilder.class); 44 | private SpannerTable spannerTable; 45 | private Map<String, StructField> fields; 46 | 47 | public SpannerScanBuilder(CaseInsensitiveStringMap options) { 48 | this.opts = options; 49 | this.pushedFilters = new ArrayList<>(); 50 | this.spannerTable = new SpannerTable(options); 51 | this.fields = new LinkedHashMap<>(); 52 | for (StructField field : spannerTable.schema().fields()) { 53 | fields.put(field.name(), field); 54 | } 55 | } 56 | 57 | @Override 58 | public Scan build() { 59 | this.scanner = 60 | new SpannerScanner( 61 | this.opts.asCaseSensitiveMap(), 62 | this.spannerTable, 63 | this.fields, 64 | this.pushedFilters(), 65 | this.requiredColumns); 66 | return this.scanner; 67 | } 68 | 69 | @Override 70 | public Filter[] pushedFilters() { 71 | return this.pushedFilters.toArray(new Filter[0]); 72 | } 73 | 74 | @Override 75 | public Filter[] pushFilters(Filter[] filters) { 76 | List<Filter> handledFilters = new ArrayList<>(); 77 | List<Filter> unhandledFilters = new ArrayList<>(); 78 | for (Filter filter : filters) { 79 | if (SparkFilterUtils.isTopLevelFieldHandled(false, filter, fields)) { 80 | handledFilters.add(filter); 81 | } else { 82 | unhandledFilters.add(filter); 83 | } 84 | } 85 | this.pushedFilters.addAll(handledFilters); 86 | return unhandledFilters.stream().toArray(Filter[]::new); 87 | } 88 | 89 | /* 90 | * pruneColumns applies column pruning with respect to the requiredSchema. 91 | * The docs recommend implementing this method to push down required columns 92 | * to the data source and only read these columns during scan to 93 | * reduce the size of the data to be read. 94 | */ 95 | @Override 96 | public void pruneColumns(StructType requiredSchema) { 97 | // A user could invoke: SELECT a, b, d, a FROM TABLE; 98 | // and we should still be able to serve them back their 99 | // query without deduplication.
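// For example, for SELECT a, b, d, a the columns kept here collapse to the set
// {a, b, d}; the duplicated projection is still served because Spark reapplies it
// on top of the pruned scan.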
100 | this.requiredColumns = ImmutableSet.copyOf(requiredSchema.fieldNames()); 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/TestData.java: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.spark.spanner; 16 | 17 | import com.google.cloud.spanner.Mutation; 18 | import com.google.common.io.CharStreams; 19 | import java.io.IOException; 20 | import java.io.InputStream; 21 | import java.io.InputStreamReader; 22 | import java.util.ArrayList; 23 | import java.util.Arrays; 24 | import java.util.List; 25 | import java.util.Objects; 26 | import java.util.stream.Collectors; 27 | 28 | public final class TestData { 29 | public static List initialDDL = createInitialDDL("/db/populate_ddl.sql"); 30 | public static List initialDDLPg = createInitialDDL("/db/populate_ddl_pg.sql"); 31 | public static List initialDDLGraph = createInitialDDL("/db/populate_ddl_graph.sql"); 32 | public static List initialDML = createInitialDML("/db/insert_data.sql"); 33 | public static List initialDMLPg = createInitialDML("/db/insert_data_pg.sql"); 34 | public static List initialDMLGraph = createInitialDML("/db/insert_data_graph.sql"); 35 | public static List shakespearMutations = createShakespeareTableMutations(); 36 | 37 | private TestData() {} 38 | 39 | private static List readAndParseSQL(String filename) { 40 | String initialDDL = mustReadResource(filename); 41 | String[] statements = 42 | Arrays.stream(initialDDL.split("\\r?\\n")) 43 | .map(String::trim) 44 | .filter(l -> !l.startsWith("--")) 45 | .collect(Collectors.joining("\n")) 46 | .split(";"); 47 | return Arrays.stream(statements) 48 | .map(String::trim) 49 | .filter(s -> !s.isEmpty()) 50 | .collect(Collectors.toList()); 51 | } 52 | 53 | private static List createInitialDDL(String filePath) { 54 | return readAndParseSQL(filePath); 55 | } 56 | 57 | private static List createInitialDML(String filePath) { 58 | return readAndParseSQL(filePath); 59 | } 60 | 61 | private static String mustReadResource(String path) { 62 | try (InputStream stream = TestData.class.getResourceAsStream(path)) { 63 | String data = CharStreams.toString(new InputStreamReader(Objects.requireNonNull(stream))); 64 | if (data == null || data.length() == 0) { 65 | throw new RuntimeException(path + " has no content"); 66 | } 67 | return data; 68 | } catch (IOException e) { 69 | throw new RuntimeException("failed to read resource " + path, e); 70 | } 71 | } 72 | 73 | private static List createShakespeareTableMutations() { 74 | String csv = mustReadResource("/db/shakespeare_bq.csv"); 75 | String[] csvLines = csv.trim().split("\n"); 76 | Long id = 1L; 77 | List mutations = new ArrayList<>(); 78 | for (String csvLine : csvLines) { 79 | csvLine = csvLine.trim(); 80 | if (csvLine.equals("") || 
csvLine.equals("\n")) { 81 | continue; 82 | } 83 | 84 | String[] splits = csvLine.split(","); 85 | 86 | mutations.add( 87 | Mutation.newInsertBuilder("Shakespeare") 88 | .set("id") 89 | .to(id) 90 | .set("word") 91 | .to(splits[0]) 92 | .set("word_count") 93 | .to(splits[1]) 94 | .set("corpus") 95 | .to(splits[2]) 96 | .set("corpus_date") 97 | .to(splits[3]) 98 | .build()); 99 | id++; 100 | } 101 | return mutations; 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/graph/query/EdgeElementTableQuery.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.spark.spanner.graph.query; 2 | 3 | import com.google.cloud.spark.spanner.SpannerTableSchema; 4 | import com.google.cloud.spark.spanner.graph.PropertyGraph; 5 | import com.google.cloud.spark.spanner.graph.PropertyGraph.GraphElementTable; 6 | import com.google.cloud.spark.spanner.graph.PropertyGraph.GraphNodeTableReference; 7 | import com.google.cloud.spark.spanner.graph.SpannerGraphConfigs; 8 | import com.google.cloud.spark.spanner.graph.SpannerGraphConfigs.LabelConfig; 9 | import java.util.List; 10 | 11 | /** Query for an edge table */ 12 | public class EdgeElementTableQuery extends ElementTableQuery { 13 | 14 | /** 15 | * Construct a query for an edge element table 16 | * 17 | * @param graphSchema schema of the graph 18 | * @param elementTable the element table to construct a query for 19 | * @param configs user configs for exporting the graph 20 | * @param exportIdColumnDirectly export the key column for src/dst directly to avoid the need of 21 | * downstream ID translation. Should be true only when there is only one key column for src 22 | * and one key column for dst. 23 | * @return a {@link EdgeElementTableQuery} for the element table. 24 | */ 25 | public static EdgeElementTableQuery create( 26 | PropertyGraph graphSchema, 27 | GraphElementTable elementTable, 28 | SpannerTableSchema baseTableSchema, 29 | SpannerGraphConfigs configs, 30 | boolean exportIdColumnDirectly) { 31 | 32 | List matchedLabels = getMatchedLabels(elementTable, configs.edgeLabelConfigs); 33 | 34 | return new EdgeElementTableQuery( 35 | graphSchema, 36 | elementTable, 37 | baseTableSchema, 38 | configs.outputIndividualKeys, 39 | exportIdColumnDirectly, 40 | mergeProperties(elementTable, matchedLabels), 41 | mergeWhereClauses(matchedLabels)); 42 | } 43 | 44 | private EdgeElementTableQuery( 45 | PropertyGraph graphSchema, 46 | GraphElementTable elementTable, 47 | SpannerTableSchema baseTableSchema, 48 | boolean outputIndividualKeys, 49 | boolean exportIdColumnDirectly, 50 | List properties, 51 | String whereClause) { 52 | super(baseTableSchema, whereClause); 53 | if (!PropertyGraph.GRAPH_ELEMENT_TABLE_KIND_EDGE.equalsIgnoreCase(elementTable.kind)) { 54 | throw new IllegalArgumentException("Invalid elementTable kind: " + elementTable.kind); 55 | } 56 | graphSchema.checkEdgeReferenceKeyColumnsMatchNodeKeyColumns(elementTable); 57 | 58 | if (exportIdColumnDirectly) { 59 | if (elementTable.sourceNodeTable.edgeTableColumns.size() != 1 60 | || elementTable.destinationNodeTable.edgeTableColumns.size() != 1) { 61 | throw new IllegalArgumentException( 62 | "Cannot export multiple key columns directly as one SRC/DST column. 
"); 63 | } 64 | addDirectField(elementTable.sourceNodeTable.edgeTableColumns.get(0), "src"); 65 | addDirectField(elementTable.destinationNodeTable.edgeTableColumns.get(0), "dst"); 66 | } else { 67 | if (outputIndividualKeys) { 68 | addIndividualKeysForNodeTableReference("src", graphSchema, elementTable.sourceNodeTable); 69 | addIndividualKeysForNodeTableReference( 70 | "dst", graphSchema, elementTable.destinationNodeTable); 71 | } else { 72 | addCombinedId( 73 | "src", 74 | graphSchema.getTableId(elementTable.sourceNodeTable.nodeTableName), 75 | elementTable.sourceNodeTable.edgeTableColumns); 76 | addCombinedId( 77 | "dst", 78 | graphSchema.getTableId(elementTable.destinationNodeTable.nodeTableName), 79 | elementTable.destinationNodeTable.edgeTableColumns); 80 | } 81 | } 82 | 83 | addInnerProperties(elementTable.propertyDefinitions); 84 | addOutputProperties(graphSchema, properties); 85 | } 86 | 87 | private void addIndividualKeysForNodeTableReference( 88 | String type, PropertyGraph graphSchema, GraphNodeTableReference nodeTableReference) { 89 | addNodeTableColumn(type, graphSchema.getTableId(nodeTableReference.nodeTableName)); 90 | addIndividualKeyColumns( 91 | type, nodeTableReference.edgeTableColumns, nodeTableReference.nodeTableColumns); 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/OpenLineageIntegrationTestBase.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.spark.spanner; 2 | 3 | import static com.google.common.truth.Truth.assertThat; 4 | 5 | import com.google.gson.JsonObject; 6 | import com.google.gson.JsonParser; 7 | import io.openlineage.spark.agent.OpenLineageSparkListener; 8 | import java.io.File; 9 | import java.util.ArrayList; 10 | import java.util.List; 11 | import java.util.Map; 12 | import java.util.Scanner; 13 | import org.apache.spark.sql.Dataset; 14 | import org.apache.spark.sql.Row; 15 | import org.apache.spark.sql.SaveMode; 16 | import org.apache.spark.sql.SparkSession; 17 | import org.junit.ClassRule; 18 | import org.junit.Test; 19 | import org.junit.rules.ExternalResource; 20 | 21 | public class OpenLineageIntegrationTestBase extends SpannerTestBase { 22 | 23 | @ClassRule public static OLSparkFactory sparkFactory = new OLSparkFactory(); 24 | 25 | protected SparkSession spark; 26 | protected File lineageOutputFile; 27 | 28 | protected Map connectionProperties; 29 | 30 | public OpenLineageIntegrationTestBase() { 31 | this.spark = sparkFactory.spark; 32 | this.lineageOutputFile = sparkFactory.lineageOutputFile; 33 | this.connectionProperties = connectionProperties(); 34 | } 35 | 36 | protected static class OLSparkFactory extends ExternalResource { 37 | SparkSession spark; 38 | 39 | File lineageOutputFile; 40 | 41 | @Override 42 | protected void before() throws Throwable { 43 | lineageOutputFile = File.createTempFile("openlineage_test_" + System.nanoTime(), ".log"); 44 | lineageOutputFile.deleteOnExit(); 45 | spark = 46 | SparkSession.builder() 47 | .master("local") 48 | .config("spark.ui.enabled", "false") 49 | .config("spark.default.parallelism", 20) 50 | .config("spark.extraListeners", OpenLineageSparkListener.class.getCanonicalName()) 51 | .config("spark.openlineage.transport.type", "file") 52 | .config("spark.openlineage.transport.location", lineageOutputFile.getAbsolutePath()) 53 | .getOrCreate(); 54 | spark.sparkContext().setLogLevel("WARN"); 55 | } 56 | } 57 | 58 | public 
Dataset readFromTable(String table) { 59 | Map props = this.connectionProperties(); 60 | return spark 61 | .read() 62 | .format("cloud-spanner") 63 | .option("viewsEnabled", true) 64 | .option("projectId", props.get("projectId")) 65 | .option("instanceId", props.get("instanceId")) 66 | .option("databaseId", props.get("databaseId")) 67 | .option("emulatorHost", props.get("emulatorHost")) 68 | .option("table", table) 69 | .load(); 70 | } 71 | 72 | @Test 73 | public void testOpenLineageEvents() throws Exception { 74 | File outputCsv = File.createTempFile("output_" + System.nanoTime(), ".csv"); 75 | outputCsv.deleteOnExit(); 76 | Dataset df = readFromTable("compositeTable"); 77 | df.createOrReplaceTempView("tempview"); 78 | Dataset outputDf = 79 | spark.sql( 80 | "SELECT word, count(*) AS count FROM (SELECT explode(split(C, ' ')) AS word FROM tempview) GROUP BY 1"); 81 | 82 | outputDf 83 | .coalesce(1) 84 | .write() 85 | .format("csv") 86 | .mode(SaveMode.Overwrite) 87 | .save("file://" + outputCsv.getPath()); 88 | 89 | List jsonObjects = parseEventLog(lineageOutputFile); 90 | assertThat(jsonObjects).isNotEmpty(); 91 | 92 | jsonObjects.forEach( 93 | jsonObject -> { 94 | JsonObject input = jsonObject.getAsJsonArray("inputs").get(0).getAsJsonObject(); 95 | assertThat(input.get("namespace").getAsString()) 96 | .isEqualTo( 97 | String.format( 98 | "spanner://%s/%s", 99 | connectionProperties.get("projectId"), 100 | connectionProperties.get("instanceId"))); 101 | assertThat(input.get("name").getAsString()) 102 | .isEqualTo( 103 | String.format("%s/%s", connectionProperties.get("databaseId"), "compositeTable")); 104 | }); 105 | } 106 | 107 | private List parseEventLog(File file) throws Exception { 108 | List eventList; 109 | try (Scanner scanner = new Scanner(file)) { 110 | eventList = new ArrayList<>(); 111 | while (scanner.hasNextLine()) { 112 | String line = scanner.nextLine(); 113 | JsonObject event = JsonParser.parseString(line).getAsJsonObject(); 114 | if (!event.getAsJsonArray("inputs").isEmpty()) { 115 | eventList.add(event); 116 | } 117 | } 118 | } 119 | return eventList; 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/graph/SpannerGraphBuilder.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.spark.spanner.graph; 2 | 3 | import com.google.cloud.Timestamp; 4 | import com.google.cloud.spanner.Dialect; 5 | import com.google.cloud.spanner.PartitionOptions; 6 | import com.google.cloud.spanner.Statement; 7 | import com.google.cloud.spanner.TimestampBound; 8 | import com.google.cloud.spanner.connection.AbstractStatementParser; 9 | import com.google.cloud.spanner.connection.Connection; 10 | import com.google.cloud.spark.spanner.SpannerConnectorException; 11 | import com.google.cloud.spark.spanner.SpannerErrorCode; 12 | import com.google.cloud.spark.spanner.SpannerUtils; 13 | import com.google.cloud.spark.spanner.graph.query.SpannerGraphQuery; 14 | import java.util.Map; 15 | import java.util.Objects; 16 | 17 | /** Builder for {@link SpannerGraph} */ 18 | public class SpannerGraphBuilder { 19 | 20 | public static SpannerGraph build(Map options) { 21 | SpannerGraph.checkOptions(options); 22 | String graphName = Objects.requireNonNull(options.get("graph")); 23 | String directQueryString = options.get("graphQuery"); 24 | boolean dataBoost = getEnableDataBoost(options); 25 | String configsJson = 
options.get("configs"); 26 | SpannerGraphConfigs configs = 27 | configsJson != null ? SpannerGraphConfigs.fromJson(configsJson) : new SpannerGraphConfigs(); 28 | boolean node = getIsNodeDataframe(options); 29 | TimestampBound readTimestamp = getReadTimestamp(options); 30 | Statement directQuery = directQueryString != null ? Statement.of(directQueryString) : null; 31 | 32 | SpannerGraphQuery graphQuery; 33 | try (Connection conn = getConnection(options)) { 34 | // Ensure the version of the schema read matches the specified timestamp 35 | conn.setReadOnly(true); 36 | conn.setAutocommit(true); 37 | conn.setReadOnlyStaleness(readTimestamp); 38 | 39 | PropertyGraph graphSchema = PropertyGraph.Builder.getFromSpanner(conn, graphName); 40 | configs.validate(graphSchema, directQuery != null); 41 | if (directQuery != null) { 42 | checkQueryIsSql(directQuery); 43 | // Test if the provided query is root-partitionable 44 | // Will throw an exception if the query is not root-partitionable 45 | conn.partitionQuery(directQuery, PartitionOptions.getDefaultInstance()).close(); 46 | graphQuery = new SpannerGraphQuery(conn, directQuery, node); 47 | } else { 48 | graphQuery = new SpannerGraphQuery(conn, graphSchema, configs, node); 49 | } 50 | } 51 | 52 | return new SpannerGraph( 53 | options, graphName, configs, directQuery, dataBoost, node, readTimestamp, graphQuery); 54 | } 55 | 56 | private static boolean getEnableDataBoost(Map options) { 57 | final String dataBoostEnabledKey = "enableDataBoost"; 58 | String dataBoost = options.getOrDefault(dataBoostEnabledKey, "false"); 59 | if ("true".equalsIgnoreCase(dataBoost)) { 60 | return true; 61 | } else if ("false".equalsIgnoreCase(dataBoost)) { 62 | return false; 63 | } else { 64 | throw new IllegalArgumentException(dataBoostEnabledKey + " must be true or false"); 65 | } 66 | } 67 | 68 | private static boolean getIsNodeDataframe(Map options) { 69 | String type = Objects.requireNonNull(options.get("type")); 70 | if ("node".equalsIgnoreCase(type)) { 71 | return true; 72 | } else if ("edge".equalsIgnoreCase(type)) { 73 | return false; 74 | } else { 75 | throw new IllegalArgumentException("type must be node or edge"); 76 | } 77 | } 78 | 79 | private static TimestampBound getReadTimestamp(Map options) { 80 | String timestamp = options.get("timestamp"); 81 | return TimestampBound.ofReadTimestamp( 82 | timestamp == null ? Timestamp.now() : Timestamp.parseTimestamp(timestamp)); 83 | } 84 | 85 | private static Connection getConnection(Map options) { 86 | Connection conn = SpannerUtils.connectionFromProperties(options); 87 | if (!conn.getDialect().equals(Dialect.GOOGLE_STANDARD_SQL)) { 88 | throw new SpannerConnectorException( 89 | SpannerErrorCode.DATABASE_DIALECT_NOT_SUPPORTED, 90 | "Expecting dialect: GOOGLE_STANDARD_SQL, but the actual dialect used is " 91 | + conn.getDialect()); 92 | } 93 | return conn; 94 | } 95 | 96 | private static void checkQueryIsSql(Statement query) { 97 | AbstractStatementParser parser = 98 | AbstractStatementParser.getInstance(Dialect.GOOGLE_STANDARD_SQL); 99 | if (!parser.isQuery(parser.removeCommentsAndTrim(query.getSql()))) { 100 | throw new IllegalArgumentException( 101 | "Only SQL queries starting with SELECT are supported. 
Query provided: " + query); 102 | } 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/test/resources/db/populate_ddl_graph.sql: -------------------------------------------------------------------------------- 1 | -- FlexibleGraph 2 | 3 | CREATE TABLE FlexibleGraphNode 4 | ( 5 | id INT64 NOT NULL, 6 | type STRING( MAX) NOT NULL, 7 | properties JSON, 8 | cluster_id INT64, 9 | ) PRIMARY KEY(id); 10 | 11 | CREATE INDEX NodeByType ON FlexibleGraphNode (type); 12 | 13 | CREATE TABLE FlexibleGraphEdge 14 | ( 15 | id INT64 NOT NULL, 16 | edge_type STRING( MAX) NOT NULL, 17 | to_id INT64 NOT NULL, 18 | edge_id STRING( MAX), 19 | properties JSON, 20 | CONSTRAINT FK_ToNode FOREIGN KEY (to_id) REFERENCES FlexibleGraphNode (id), 21 | ) PRIMARY KEY(id, edge_type, to_id, edge_id), 22 | INTERLEAVE IN PARENT FlexibleGraphNode ON DELETE CASCADE; 23 | 24 | CREATE PROPERTY GRAPH FlexibleGraph 25 | NODE TABLES( 26 | FlexibleGraphNode 27 | KEY(id) 28 | LABEL Node PROPERTIES( 29 | cluster_id, 30 | id, 31 | properties, 32 | type) 33 | ) 34 | EDGE TABLES( 35 | FlexibleGraphEdge 36 | KEY(id, edge_type, to_id, edge_id) 37 | SOURCE KEY(id) REFERENCES FlexibleGraphNode(id) 38 | DESTINATION KEY(to_id) REFERENCES FlexibleGraphNode(id) 39 | LABEL Edge PROPERTIES( 40 | edge_id, 41 | edge_type, 42 | id, 43 | properties, 44 | to_id) 45 | ); 46 | 47 | -- MusicGraph 48 | 49 | CREATE TABLE ProductionCompanies 50 | ( 51 | CompanyId INT64 NOT NULL, 52 | CompanyName STRING( MAX) NOT NULL, 53 | LocationCountry STRING( MAX) NOT NULL, 54 | FoundedYear INT64 NOT NULL, 55 | ) PRIMARY KEY(CompanyId); 56 | 57 | CREATE TABLE Singers 58 | ( 59 | SingerId INT64 NOT NULL, 60 | FirstName STRING(1024), 61 | LastName STRING(1024), 62 | BirthDate DATE, 63 | ) PRIMARY KEY(SingerId); 64 | 65 | CREATE TABLE Albums 66 | ( 67 | SingerId INT64 NOT NULL, 68 | AlbumId INT64 NOT NULL, 69 | AlbumTitle STRING( MAX), 70 | ReleaseDate DATE, 71 | CompanyId INT64 NOT NULL, 72 | CONSTRAINT FKProductionCompanyId FOREIGN KEY (CompanyId) REFERENCES ProductionCompanies (CompanyId), 73 | ) PRIMARY KEY(SingerId, AlbumId), 74 | INTERLEAVE IN PARENT Singers ON 75 | DELETE 76 | CASCADE; 77 | 78 | CREATE TABLE SingerContracts 79 | ( 80 | SingerId INT64 NOT NULL, 81 | CompanyId INT64 NOT NULL, 82 | CONSTRAINT FKSingerCompanyId FOREIGN KEY (CompanyId) REFERENCES ProductionCompanies (CompanyId), 83 | ) PRIMARY KEY(SingerId, CompanyId), 84 | INTERLEAVE IN PARENT Singers ON 85 | DELETE 86 | CASCADE; 87 | 88 | CREATE TABLE SingerFriends 89 | ( 90 | SingerId INT64 NOT NULL, 91 | FriendId INT64 NOT NULL, 92 | CONSTRAINT FKSingerFriendId FOREIGN KEY (FriendId) REFERENCES Singers (SingerId), 93 | ) PRIMARY KEY(SingerId, FriendId), 94 | INTERLEAVE IN PARENT Singers ON 95 | DELETE 96 | CASCADE; 97 | 98 | CREATE OR REPLACE PROPERTY GRAPH MusicGraph 99 | NODE TABLES( 100 | Albums AS Album 101 | KEY(SingerId, AlbumId) 102 | LABEL ALBUM PROPERTIES( 103 | AlbumId, 104 | AlbumTitle, 105 | CompanyId, 106 | ReleaseDate, 107 | SingerId), 108 | 109 | ProductionCompanies AS Company 110 | KEY(CompanyId) 111 | LABEL MUSIC_COMPANY PROPERTIES( 112 | FoundedYear AS founded_year, 113 | CompanyName AS name) 114 | LABEL MUSIC_CREATOR PROPERTIES( 115 | LocationCountry AS country_origin, 116 | CompanyName AS name), 117 | 118 | Singers AS Singer 119 | KEY(SingerId) 120 | LABEL MUSIC_CREATOR PROPERTIES( 121 | "US" AS country_origin, 122 | CONCAT(FirstName, " ", LastName) AS name) 123 | LABEL SINGER PROPERTIES( 124 | BirthDate 
AS birthday, 125 | SingerId AS id, 126 | CONCAT(FirstName, " ", LastName) AS singer_name) 127 | ) 128 | EDGE TABLES( 129 | Albums AS COMPANY_PRODUCES_ALBUM 130 | KEY(CompanyId, SingerId, AlbumId) 131 | SOURCE KEY(CompanyId) REFERENCES Company(CompanyId) 132 | DESTINATION KEY(AlbumId, SingerId) REFERENCES Album(AlbumId, SingerId) 133 | LABEL CREATES_MUSIC PROPERTIES( 134 | AlbumId AS album_id, 135 | ReleaseDate AS release_date), 136 | 137 | Albums AS SINGER_CREATES_ALBUM 138 | KEY(SingerId, AlbumId) 139 | SOURCE KEY(SingerId) REFERENCES Singer(SingerId) 140 | DESTINATION KEY(AlbumId, SingerId) REFERENCES Album(AlbumId, SingerId) 141 | LABEL CREATES_MUSIC PROPERTIES( 142 | AlbumId AS album_id, 143 | ReleaseDate AS release_date), 144 | 145 | SingerFriends AS SINGER_HAS_FRIEND 146 | KEY(SingerId, FriendId) 147 | SOURCE KEY(SingerId) REFERENCES Singer(SingerId) 148 | DESTINATION KEY(FriendId) REFERENCES Singer(SingerId) 149 | LABEL KNOWS PROPERTIES( 150 | FriendId, 151 | SingerId), 152 | 153 | SingerContracts AS SINGER_SIGNED_BY_COMPANY 154 | KEY(SingerId, CompanyId) 155 | SOURCE KEY(SingerId) REFERENCES Singer(SingerId) 156 | DESTINATION KEY(CompanyId) REFERENCES Company(CompanyId) 157 | LABEL SIGNED_BY PROPERTIES( 158 | CompanyId, 159 | SingerId) 160 | ); 161 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/test/resources/db/insert_data.sql: -------------------------------------------------------------------------------- 1 | DELETE FROM ATable WHERE 1=1; 2 | 3 | INSERT INTO 4 | ATable(A, B, C, D, E) 5 | VALUES 6 | (1, "2", NULL, TIMESTAMP("2023-08-22T12:22:00Z"), 1000.282111401), 7 | (10, "20", NULL, TIMESTAMP("2023-08-22T12:23:00Z"), 10000.282111603), 8 | (30, "30", NULL, TIMESTAMP("2023-08-22T12:24:00Z"), 30000.282111805); 9 | 10 | DELETE FROM simpleTable WHERE 1=1; 11 | 12 | INSERT INTO 13 | simpleTable(A, B, C) 14 | VALUES 15 | (1, "1", 2.5), 16 | (2, "2", 5.0), 17 | (3, "3", CAST("+inf" AS FLOAT64)), 18 | (4, "4", CAST("-inf" AS FLOAT64)), 19 | (5, "5", CAST("NaN" AS FLOAT64)), 20 | (6, "6", 100000000017.100000000017), 21 | (7, "7", -0.1), 22 | (8, "8", +0.1), 23 | (9, "9", -19999997.9); 24 | 25 | DELETE FROM players WHERE 1=1; 26 | DELETE FROM games WHERE 1=1; 27 | INSERT INTO 28 | games(gameUUID, players, winner, created, finished, max_date) 29 | VALUES 30 | ("g1", ["p1", "p2", "p3"], "T1", TIMESTAMP("2023-08-26T12:22:00Z"), TIMESTAMP("2023-08-26T12:22:00Z"), DATE("2023-12-31T00:00:00Z")), 31 | ("g2", ["p4", "p5", "p6"], "T2", TIMESTAMP("2023-08-26T12:22:00Z"), TIMESTAMP("2023-08-26T12:22:00Z"), DATE("2023-12-31T00:00:00Z")); 32 | 33 | DELETE FROM game_items WHERE 1=1; 34 | INSERT INTO 35 | game_items(itemUUID, item_name, item_value, available_time, duration) 36 | VALUES 37 | ("gi_1", "powerup", 237, TIMESTAMP("2023-08-22T12:22:00Z"), 90), 38 | ("gi_2", "diff", 500, TIMESTAMP("2023-08-22T12:22:00Z"), 90); 39 | 40 | INSERT INTO 41 | players(playerUUID, player_name, email, password_hash, created, updated, stats, account_balance, is_logged_in, last_login, valid_email, current_game, dob) 42 | VALUES 43 | ("p1", "PLAYER 1", "p1@games.com", FROM_HEX("deadbeef"), TIMESTAMP("2023-08-26T12:22:00Z"), null, TO_JSON('{"a":"b"}'), 17517, true, TIMESTAMP("2023-08-26T12:22:00Z"), true, "g1", DATE("1999-06-06T00:00:00Z")), 44 | ("p2", "PLAYER 2", "p2@games.com", FROM_HEX("beefdead"), TIMESTAMP("2023-08-26T12:22:00Z"), null, TO_JSON('{"1":"2","k":291}'), 8519, false, TIMESTAMP("2023-08-26T12:22:00Z"), true, "g2", 
DATE("1997-12-06T00:00:00Z")); 45 | 46 | 47 | DELETE FROM compositeTable WHERE 1=1; 48 | INSERT INTO 49 | compositeTable(id, A, B, C, D, E, F, G, H, I, J, K) 50 | VALUES 51 | ( 52 | "id1", [10, 100, 991, 567282], ["a", "b", "c"], "foobar", 2934, DATE(2023, 1, 1), 53 | TIMESTAMP("2023-08-26T12:22:05Z"), true, [DATE(2023, 1, 2), DATE(2023, 12, 31)], 54 | [TIMESTAMP("2023-08-26T12:11:10Z"), TIMESTAMP("2023-08-27T12:11:09Z")], FROM_HEX("beefdead"), 55 | JSON'{"a":1, "b":2}' 56 | ), 57 | ( 58 | "id2", [20, 200, 2991, 888885], ["A", "B", "C"], "this one", 93411, DATE(2023, 9, 23), 59 | TIMESTAMP("2023-09-22T12:22:05Z"), false, [DATE(2023, 9, 2), DATE(2023, 12, 31)], 60 | [TIMESTAMP("2023-09-22T12:11:10Z"), TIMESTAMP("2023-09-23T12:11:09Z")], FROM_HEX("deadbeef"), 61 | JSON'{}' 62 | ), 63 | ( 64 | "id3", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, b"deadbeef", NULL 65 | ); 66 | 67 | DELETE FROM nullsTable WHERE 1=1; 68 | INSERT INTO 69 | nullsTable(id, A, B, C, D, E, F, G, H, I, J, K, M, N, O) 70 | VALUES 71 | (1, NULL, NULL, NULL, NULL, NULL, NULL, true, [NULL, DATE("2023-09-23T00:00:00Z")], NULL, [true, NULL, false], [23.67], NULL, NULL, [CAST(-99.37171 AS NUMERIC), NULL]), 72 | (2, [1, 2, NULL], NULL, NULL, 99.37171, NULL, NULL, NULL, [DATE("2022-10-02T00:00:00Z"), NULL], NULL, [NULL, NULL, true], [NULL, 198.1827], NULL, NULL, NULL), 73 | (3, [2, 3, NULL], ["a", "b", "FF", NULL], "😎🚨", NULL, NULL, TIMESTAMP("2023-09-23T12:11:09Z"), false, NULL, NULL, NULL, [-28888.8888, 0.12, NULL], NULL, NULL, [NULL, CAST(-55.7 AS NUMERIC), CAST(9.3 AS NUMERIC)]), 74 | (4, [NULL, 4, 57, 10], ["💡🚨", NULL, "b", "fg"], "🚨", 55.7, DATE(2023, 12, 31), NULL, false, NULL, [NULL, TIMESTAMP("2023-09-23T12:11:09Z")], [true, true], [0.71], [NULL, FROM_HEX("beefdead")], [NULL, JSON'{"a":1}'], [NULL, CAST(12 AS NUMERIC)]), 75 | (5, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL), 76 | (6, [NULL, 1234], [NULL, "stringarray"], NULL, NULL, NULL, NULL, NULL, [NULL, DATE(2023, 12, 31)], [NULL, TIMESTAMP("2023-09-23T12:11:09Z")], [NULL, true], [NULL, 0.000001], [NULL, b"beefdead"], [NULL, JSON'{"a":1}'], [NULL, CAST(123456 AS NUMERIC)]), 77 | (7, [], [], NULL, NULL, NULL, NULL, NULL, [], [], [], [], [], [], []); 78 | 79 | 80 | DELETE FROM bytesTable WHERE 1=1; 81 | INSERT INTO 82 | bytesTable(id, A) 83 | VALUES 84 | (1, B"ABCDEFGHIJKLMNOPQ"), 85 | (2, B"abcdefghijklmnopq"), 86 | (3, B"1234efghijklmnopq"); 87 | 88 | 89 | DELETE FROM valueLimitsTable WHERE 1=1; 90 | INSERT INTO 91 | valueLimitsTable(A, B, C, D, E) 92 | VALUES 93 | (-9223372036854775808, CAST("NaN" AS FLOAT64), -9.9999999999999999999999999999999999999E+28, DATE("1700-01-01T00:00:00Z"), TIMESTAMP("9999-12-30T23:59:59.00Z")), 94 | (9223372036854775807, CAST("+inf" AS FLOAT64), +9.9999999999999999999999999999999999999E+28, DATE("4000-12-30T23:59:59Z"), TIMESTAMP("2222-02-22T22:22:22.999999Z")), 95 | (0, CAST("-inf" AS FLOAT64), 10.389, DATE("1900-12-30T23:59:59Z"), TIMESTAMP("2023-09-28 21:59:59", "America/Los_Angeles")), 96 | (1, CAST("0.657818" AS FLOAT64), -10.389, DATE("2023-09-28T00:00:00Z"), TIMESTAMP("0001-01-03T00:00:01Z")); 97 | -------------------------------------------------------------------------------- /.mvn/wrapper/MavenWrapperDownloader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2007-present the original author or authors. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | import java.net.*; 17 | import java.io.*; 18 | import java.nio.channels.*; 19 | import java.util.Properties; 20 | 21 | public class MavenWrapperDownloader { 22 | 23 | private static final String WRAPPER_VERSION = "0.5.6"; 24 | /** 25 | * Default URL to download the maven-wrapper.jar from, if no 'downloadUrl' is provided. 26 | */ 27 | private static final String DEFAULT_DOWNLOAD_URL = "https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/" 28 | + WRAPPER_VERSION + "/maven-wrapper-" + WRAPPER_VERSION + ".jar"; 29 | 30 | /** 31 | * Path to the maven-wrapper.properties file, which might contain a downloadUrl property to 32 | * use instead of the default one. 33 | */ 34 | private static final String MAVEN_WRAPPER_PROPERTIES_PATH = 35 | ".mvn/wrapper/maven-wrapper.properties"; 36 | 37 | /** 38 | * Path where the maven-wrapper.jar will be saved to. 39 | */ 40 | private static final String MAVEN_WRAPPER_JAR_PATH = 41 | ".mvn/wrapper/maven-wrapper.jar"; 42 | 43 | /** 44 | * Name of the property which should be used to override the default download url for the wrapper. 45 | */ 46 | private static final String PROPERTY_NAME_WRAPPER_URL = "wrapperUrl"; 47 | 48 | public static void main(String args[]) { 49 | System.out.println("- Downloader started"); 50 | File baseDirectory = new File(args[0]); 51 | System.out.println("- Using base directory: " + baseDirectory.getAbsolutePath()); 52 | 53 | // If the maven-wrapper.properties exists, read it and check if it contains a custom 54 | // wrapperUrl parameter. 55 | File mavenWrapperPropertyFile = new File(baseDirectory, MAVEN_WRAPPER_PROPERTIES_PATH); 56 | String url = DEFAULT_DOWNLOAD_URL; 57 | if(mavenWrapperPropertyFile.exists()) { 58 | FileInputStream mavenWrapperPropertyFileInputStream = null; 59 | try { 60 | mavenWrapperPropertyFileInputStream = new FileInputStream(mavenWrapperPropertyFile); 61 | Properties mavenWrapperProperties = new Properties(); 62 | mavenWrapperProperties.load(mavenWrapperPropertyFileInputStream); 63 | url = mavenWrapperProperties.getProperty(PROPERTY_NAME_WRAPPER_URL, url); 64 | } catch (IOException e) { 65 | System.out.println("- ERROR loading '" + MAVEN_WRAPPER_PROPERTIES_PATH + "'"); 66 | } finally { 67 | try { 68 | if(mavenWrapperPropertyFileInputStream != null) { 69 | mavenWrapperPropertyFileInputStream.close(); 70 | } 71 | } catch (IOException e) { 72 | // Ignore ... 
73 | } 74 | } 75 | } 76 | System.out.println("- Downloading from: " + url); 77 | 78 | File outputFile = new File(baseDirectory.getAbsolutePath(), MAVEN_WRAPPER_JAR_PATH); 79 | if(!outputFile.getParentFile().exists()) { 80 | if(!outputFile.getParentFile().mkdirs()) { 81 | System.out.println( 82 | "- ERROR creating output directory '" + outputFile.getParentFile().getAbsolutePath() + "'"); 83 | } 84 | } 85 | System.out.println("- Downloading to: " + outputFile.getAbsolutePath()); 86 | try { 87 | downloadFileFromURL(url, outputFile); 88 | System.out.println("Done"); 89 | System.exit(0); 90 | } catch (Throwable e) { 91 | System.out.println("- Error downloading"); 92 | e.printStackTrace(); 93 | System.exit(1); 94 | } 95 | } 96 | 97 | private static void downloadFileFromURL(String urlString, File destination) throws Exception { 98 | if (System.getenv("MVNW_USERNAME") != null && System.getenv("MVNW_PASSWORD") != null) { 99 | String username = System.getenv("MVNW_USERNAME"); 100 | char[] password = System.getenv("MVNW_PASSWORD").toCharArray(); 101 | Authenticator.setDefault(new Authenticator() { 102 | @Override 103 | protected PasswordAuthentication getPasswordAuthentication() { 104 | return new PasswordAuthentication(username, password); 105 | } 106 | }); 107 | } 108 | URL website = new URL(urlString); 109 | ReadableByteChannel rbc; 110 | rbc = Channels.newChannel(website.openStream()); 111 | FileOutputStream fos = new FileOutputStream(destination); 112 | fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE); 113 | fos.close(); 114 | rbc.close(); 115 | } 116 | 117 | } -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/graph/SpannerGraphConfigs.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.spark.spanner.graph; 2 | 3 | import com.google.cloud.spark.spanner.graph.PropertyGraph.GraphElementTable; 4 | import com.google.gson.Gson; 5 | import java.util.ArrayList; 6 | import java.util.HashSet; 7 | import java.util.LinkedHashSet; 8 | import java.util.List; 9 | import java.util.Map; 10 | import java.util.Set; 11 | import javax.annotation.Nonnull; 12 | import javax.annotation.Nullable; 13 | 14 | /** User-supplied configs for exporting graphs in Spanner */ 15 | public class SpannerGraphConfigs { 16 | 17 | /** Do not export ID columns directly even when it can */ 18 | public boolean disableDirectIdExport = false; 19 | 20 | /** Output individual node element key columns instead of one column that concatenate all keys */ 21 | public boolean outputIndividualKeys = false; 22 | 23 | /** Labels and properties to fetch for nodes */ 24 | public List nodeLabelConfigs = new ArrayList<>(); 25 | 26 | /** Labels and properties to fetch for edges */ 27 | public List edgeLabelConfigs = new ArrayList<>(); 28 | 29 | /** 30 | * Same as PartitionOptions 32 | */ 33 | public Long partitionSizeBytes = null; 34 | 35 | /** Extra headers added to requests when fetching partitions of the graph */ 36 | public Map> extraHeaders = null; 37 | 38 | public static SpannerGraphConfigs fromJson(String json) { 39 | return new Gson().fromJson(json, SpannerGraphConfigs.class); 40 | } 41 | 42 | public void validate(PropertyGraph graphSchema, boolean directGqlQuery) { 43 | if (directGqlQuery) { 44 | if (!nodeLabelConfigs.isEmpty() || !edgeLabelConfigs.isEmpty()) { 45 | throw new IllegalArgumentException( 46 | "nodeLabelConfigs and edgeLabelConfigs are invalid " 47 | + "options when 
using GQL queries are provided."); 48 | } 49 | } 50 | checkExclusiveAnyLabel(nodeLabelConfigs); 51 | checkExclusiveAnyLabel(edgeLabelConfigs); 52 | for (LabelConfig labelConfig : nodeLabelConfigs) { 53 | labelConfig.validate(graphSchema, /*node=*/ true); 54 | } 55 | for (LabelConfig labelConfig : edgeLabelConfigs) { 56 | labelConfig.validate(graphSchema, /*node=*/ false); 57 | } 58 | if (partitionSizeBytes != null && partitionSizeBytes <= 0) { 59 | throw new IllegalArgumentException("partitionSize must be greater than 0"); 60 | } 61 | } 62 | 63 | private void checkExclusiveAnyLabel(List labelConfigs) { 64 | boolean hasAnyLabel = labelConfigs.stream().anyMatch(lc -> "*".equals(lc.label)); 65 | if (!hasAnyLabel) { 66 | return; 67 | } 68 | if (labelConfigs.size() > 1) { 69 | throw new IllegalArgumentException( 70 | "Label wildcard (\"*\") cannot be specified together with other label filters."); 71 | } 72 | } 73 | 74 | public static class LabelConfig { 75 | 76 | @Nonnull public String label; 77 | @Nonnull public List properties; 78 | @Nullable public String filter; 79 | 80 | public LabelConfig( 81 | @Nonnull String label, @Nullable List properties, @Nullable String filter) { 82 | this.label = label; 83 | this.filter = filter; 84 | this.properties = properties != null ? properties : new ArrayList<>(); 85 | } 86 | 87 | private void validate(PropertyGraph graphSchema, boolean node) { 88 | if (label == null) { 89 | throw new IllegalArgumentException("label must be specified"); 90 | } 91 | 92 | // Ensure label and properties exist in the graph 93 | if (label.equals("*")) { 94 | List elementTables = 95 | node ? graphSchema.nodeTables : graphSchema.edgeTables; 96 | Set availableProperties = new LinkedHashSet<>(); 97 | elementTables.stream() 98 | .flatMap(t -> t.propertyDefinitions.stream()) 99 | .map(d -> d.propertyDeclarationName) 100 | .forEach(availableProperties::add); 101 | for (String property : properties) { 102 | if (!availableProperties.contains(property)) { 103 | throw new IllegalArgumentException( 104 | String.format( 105 | "Cannot find %s property %s in the graph schema. Existing properties: %s", 106 | node ? "node" : "edge", property, availableProperties)); 107 | } 108 | } 109 | } else { 110 | Set availableProperties = 111 | new HashSet<>( 112 | graphSchema.labels.stream() 113 | .filter(l -> l.name.equalsIgnoreCase(label)) 114 | .findFirst() 115 | .orElseThrow( 116 | () -> 117 | new IllegalArgumentException( 118 | String.format("Cannot find label %s in the graph schema.", label))) 119 | .propertyDeclarationNames); 120 | for (String property : properties) { 121 | if (!availableProperties.contains(property)) { 122 | throw new IllegalArgumentException( 123 | String.format("Cannot find property %s in label %s", property, label)); 124 | } 125 | } 126 | } 127 | } 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/SpannerTableTest.java: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.cloud.spark.spanner; 16 | 17 | import static org.junit.Assert.assertEquals; 18 | 19 | import java.util.Arrays; 20 | import java.util.Map; 21 | import org.apache.spark.sql.types.DataTypes; 22 | import org.apache.spark.sql.types.MetadataBuilder; 23 | import org.apache.spark.sql.types.StructField; 24 | import org.apache.spark.sql.types.StructType; 25 | import org.junit.Test; 26 | import org.junit.runner.RunWith; 27 | import org.junit.runners.JUnit4; 28 | 29 | @RunWith(JUnit4.class) 30 | public class SpannerTableTest extends SpannerTestBase { 31 | 32 | @Test 33 | public void querySchemaShouldSuccessInSpannerTable() { 34 | Map props = this.connectionProperties(); 35 | SpannerTable spannerTable = new SpannerTable(props); 36 | StructType actualSchema = spannerTable.schema(); 37 | MetadataBuilder jsonMetaBuilder = new MetadataBuilder(); 38 | jsonMetaBuilder.putString(SpannerUtils.COLUMN_TYPE, "json"); 39 | StructType expectSchema = 40 | new StructType( 41 | Arrays.asList( 42 | new StructField("A", DataTypes.LongType, false, null), 43 | new StructField("B", DataTypes.StringType, true, null), 44 | new StructField("C", DataTypes.BinaryType, true, null), 45 | new StructField("D", DataTypes.TimestampType, true, null), 46 | new StructField("E", DataTypes.createDecimalType(38, 9), true, null), 47 | new StructField( 48 | "F", DataTypes.createArrayType(DataTypes.StringType, true), true, null), 49 | new StructField("G", DataTypes.StringType, true, jsonMetaBuilder.build())) 50 | .toArray(new StructField[0])); 51 | 52 | // Object.equals fails for StructType with fields so we'll 53 | // firstly compare lengths, then fieldNames then the simpleString. 54 | assertEquals(expectSchema.length(), actualSchema.length()); 55 | assertEquals(expectSchema.fieldNames(), actualSchema.fieldNames()); 56 | assertEquals(expectSchema.simpleString(), actualSchema.simpleString()); 57 | } 58 | 59 | @Test 60 | public void queryPgSchemaShouldSucceedInSpannerTable() { 61 | if (SpannerTableTest.emulatorHost != null && !SpannerTableTest.emulatorHost.isEmpty()) { 62 | // Spanner emulator doesn't support the PostgreSql dialect interface. 63 | // If the emulator is set. We return immediately here. 64 | // TODO: Use logger instead of System out once logger configuration is set. 
65 | System.out.println( 66 | "queryPgSchemaShouldSuccessInSpannerTable is skipped since pg is not supported in Spanner emulator"); 67 | return; 68 | } 69 | Map props = this.connectionProperties(/* usePostgreSql= */ true); 70 | SpannerTable spannerTable = new SpannerTable(props); 71 | StructType actualSchema = spannerTable.schema(); 72 | MetadataBuilder jsonMetaBuilder = new MetadataBuilder(); 73 | jsonMetaBuilder.putString(SpannerUtils.COLUMN_TYPE, "jsonb"); 74 | StructType expectSchema = 75 | new StructType( 76 | Arrays.asList( 77 | new StructField("id", DataTypes.LongType, false, null), 78 | new StructField("charvcol", DataTypes.StringType, true, null), 79 | new StructField("textcol", DataTypes.StringType, true, null), 80 | new StructField("varcharcol", DataTypes.StringType, true, null), 81 | new StructField("boolcol", DataTypes.BooleanType, true, null), 82 | new StructField("booleancol", DataTypes.BooleanType, true, null), 83 | new StructField("bigintcol", DataTypes.LongType, true, null), 84 | new StructField("int8col", DataTypes.LongType, true, null), 85 | new StructField("intcol", DataTypes.LongType, true, null), 86 | new StructField("doublecol", DataTypes.DoubleType, true, null), 87 | new StructField("floatcol", DataTypes.DoubleType, true, null), 88 | new StructField("bytecol", DataTypes.BinaryType, true, null), 89 | new StructField("datecol", DataTypes.DateType, true, null), 90 | new StructField("numericcol", DataTypes.createDecimalType(38, 9), true, null), 91 | new StructField("decimalcol", DataTypes.createDecimalType(38, 9), true, null), 92 | new StructField("timewithzonecol", DataTypes.TimestampType, true, null), 93 | new StructField("timestampcol", DataTypes.TimestampType, true, null), 94 | new StructField("jsoncol", DataTypes.StringType, true, jsonMetaBuilder.build())) 95 | .toArray(new StructField[0])); 96 | 97 | // Object.equals fails for StructType with fields so we'll 98 | // firstly compare lengths, then fieldNames then the simpleString. 99 | assertEquals(expectSchema.length(), actualSchema.length()); 100 | assertEquals(expectSchema.fieldNames(), actualSchema.fieldNames()); 101 | assertEquals(expectSchema.simpleString(), actualSchema.simpleString()); 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/test/java/com/google/cloud/spark/spanner/acceptance/AcceptanceTestUtils.java: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
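// Sketch of how these helpers are typically chained by the acceptance tests (hedged;
// the test id and artifact prefix below are hypothetical):
//   String baseDir = AcceptanceTestUtils.createTestBaseGcsDir("test-123");
//   AcceptanceTestUtils.uploadConnectorJar("target", "spark-3.1-spanner", baseDir + "/connector.jar");
//   String csv = AcceptanceTestUtils.getCsv(baseDir + "/output");
//   AcceptanceTestUtils.deleteGcsDir(baseDir);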
14 | 15 | package com.google.cloud.spark.spanner.acceptance; 16 | 17 | import com.google.cloud.WriteChannel; 18 | import com.google.cloud.storage.*; 19 | import com.google.common.base.Preconditions; 20 | import com.google.common.io.ByteStreams; 21 | import java.io.*; 22 | import java.net.URI; 23 | import java.nio.ByteBuffer; 24 | import java.nio.MappedByteBuffer; 25 | import java.nio.channels.FileChannel; 26 | import java.nio.charset.StandardCharsets; 27 | import java.nio.file.Files; 28 | import java.nio.file.Path; 29 | import java.nio.file.Paths; 30 | import java.nio.file.attribute.FileTime; 31 | import java.util.Comparator; 32 | import java.util.function.Predicate; 33 | import java.util.stream.StreamSupport; 34 | 35 | public final class AcceptanceTestUtils { 36 | static final String BUCKET = 37 | Preconditions.checkNotNull( 38 | System.getenv("ACCEPTANCE_TEST_BUCKET"), 39 | "Please set the 'ACCEPTANCE_TEST_BUCKET' environment variable"); 40 | static Storage storage = 41 | new StorageOptions.DefaultStorageFactory().create(StorageOptions.getDefaultInstance()); 42 | 43 | public static Path getArtifact(Path targetDir, String prefix, String suffix) { 44 | Predicate prefixSuffixChecker = prefixSuffixChecker(prefix, suffix); 45 | try { 46 | return Files.list(targetDir) 47 | .filter(Files::isRegularFile) 48 | .filter(prefixSuffixChecker) 49 | .max(Comparator.comparing(AcceptanceTestUtils::lastModifiedTime)) 50 | .get(); 51 | } catch (IOException e) { 52 | throw new UncheckedIOException(e.getMessage(), e); 53 | } 54 | } 55 | 56 | public static String getCsv(String resultsDirUri) throws Exception { 57 | URI uri = new URI(resultsDirUri); 58 | Blob csvBlob = 59 | StreamSupport.stream( 60 | storage 61 | .list( 62 | uri.getAuthority(), 63 | Storage.BlobListOption.prefix(uri.getPath().substring(1))) 64 | .iterateAll() 65 | .spliterator(), 66 | false) 67 | .filter(blob -> blob.getName().endsWith("csv")) 68 | .findFirst() 69 | .get(); 70 | return new String(storage.readAllBytes(csvBlob.getBlobId()), StandardCharsets.UTF_8); 71 | } 72 | 73 | private static Predicate prefixSuffixChecker(final String prefix, final String suffix) { 74 | return path -> { 75 | String name = path.toFile().getName(); 76 | return name.startsWith(prefix) && name.endsWith(suffix) && name.indexOf("-javadoc") == -1; 77 | }; 78 | } 79 | 80 | private static FileTime lastModifiedTime(Path path) { 81 | try { 82 | return Files.getLastModifiedTime(path); 83 | } catch (IOException e) { 84 | throw new UncheckedIOException(e.getMessage(), e); 85 | } 86 | } 87 | 88 | public static BlobId copyToGcs(Path source, String destinationUri, String contentType) 89 | throws Exception { 90 | File sourceFile = source.toFile(); 91 | try (FileInputStream sourceInputStream = new FileInputStream(sourceFile)) { 92 | FileChannel sourceFileChannel = sourceInputStream.getChannel(); 93 | MappedByteBuffer sourceContent = 94 | sourceFileChannel.map(FileChannel.MapMode.READ_ONLY, 0, sourceFile.length()); 95 | return uploadToGcs(sourceContent, destinationUri, contentType); 96 | } catch (IOException e) { 97 | throw new UncheckedIOException( 98 | String.format("Failed to write '%s' to '%s'", source, destinationUri), e); 99 | } 100 | } 101 | 102 | public static BlobId uploadToGcs(InputStream source, String destinationUri, String contentType) 103 | throws Exception { 104 | try { 105 | ByteBuffer sourceContent = ByteBuffer.wrap(ByteStreams.toByteArray(source)); 106 | return uploadToGcs(sourceContent, destinationUri, contentType); 107 | } catch (IOException e) { 108 | throw 
new UncheckedIOException(String.format("Failed to write to '%s'", destinationUri), e); 109 | } 110 | } 111 | 112 | public static BlobId uploadToGcs(ByteBuffer content, String destinationUri, String contentType) 113 | throws Exception { 114 | URI uri = new URI(destinationUri); 115 | BlobId blobId = BlobId.of(uri.getAuthority(), uri.getPath().substring(1)); 116 | BlobInfo blobInfo = BlobInfo.newBuilder(blobId).setContentType(contentType).build(); 117 | try (WriteChannel writer = storage.writer(blobInfo)) { 118 | writer.write(content); 119 | } catch (IOException e) { 120 | throw new UncheckedIOException(String.format("Failed to write to '%s'", destinationUri), e); 121 | } 122 | return blobId; 123 | } 124 | 125 | public static String createTestBaseGcsDir(String testId) { 126 | return String.format("gs://%s/tests/%s", BUCKET, testId); 127 | } 128 | 129 | static void uploadConnectorJar(String targetDir, String prefix, String connectorJarUri) 130 | throws Exception { 131 | Path targetDirPath = Paths.get(targetDir); 132 | Path assemblyJar = AcceptanceTestUtils.getArtifact(targetDirPath, prefix, ".jar"); 133 | AcceptanceTestUtils.copyToGcs(assemblyJar, connectorJarUri, "application/java-archive"); 134 | } 135 | 136 | public static void deleteGcsDir(String testBaseGcsDir) throws Exception { 137 | URI uri = new URI(testBaseGcsDir); 138 | BlobId[] blobIds = 139 | StreamSupport.stream( 140 | storage 141 | .list( 142 | uri.getAuthority(), 143 | Storage.BlobListOption.prefix(uri.getPath().substring(1))) 144 | .iterateAll() 145 | .spliterator(), 146 | false) 147 | .map(Blob::getBlobId) 148 | .toArray(BlobId[]::new); 149 | if (blobIds.length > 1) { 150 | storage.delete(blobIds); 151 | } 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/SpannerScanner.java: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package com.google.cloud.spark.spanner; 16 | 17 | import com.fasterxml.jackson.core.JsonProcessingException; 18 | import com.google.cloud.Timestamp; 19 | import com.google.cloud.spanner.BatchReadOnlyTransaction; 20 | import com.google.cloud.spanner.Dialect; 21 | import com.google.cloud.spanner.Options; 22 | import com.google.cloud.spanner.PartitionOptions; 23 | import com.google.cloud.spanner.Statement; 24 | import com.google.cloud.spanner.TimestampBound; 25 | import com.google.common.collect.Streams; 26 | import java.util.List; 27 | import java.util.Map; 28 | import java.util.Optional; 29 | import java.util.Set; 30 | import java.util.stream.Collectors; 31 | import org.apache.spark.Partition; 32 | import org.apache.spark.sql.connector.read.Batch; 33 | import org.apache.spark.sql.connector.read.InputPartition; 34 | import org.apache.spark.sql.connector.read.PartitionReaderFactory; 35 | import org.apache.spark.sql.connector.read.Scan; 36 | import org.apache.spark.sql.sources.Filter; 37 | import org.apache.spark.sql.types.StructField; 38 | import org.apache.spark.sql.types.StructType; 39 | import org.slf4j.Logger; 40 | import org.slf4j.LoggerFactory; 41 | 42 | /* 43 | * SpannerScanner implements Scan. 44 | */ 45 | public class SpannerScanner implements Batch, Scan { 46 | private final SpannerTable spannerTable; 47 | private final Filter[] filters; 48 | private final Set requiredColumns; 49 | private final Map opts; 50 | private static final Logger log = LoggerFactory.getLogger(SpannerScanner.class); 51 | private final Timestamp INIT_TIME = Timestamp.now(); 52 | private final Map fields; 53 | private final StructType readSchema; 54 | 55 | public SpannerScanner( 56 | Map opts, 57 | SpannerTable spannerTable, 58 | Map fields, 59 | Filter[] filters, 60 | Set requiredColumns) { 61 | this.opts = opts; 62 | this.spannerTable = spannerTable; 63 | this.fields = fields; 64 | this.filters = filters; 65 | this.requiredColumns = requiredColumns; 66 | this.readSchema = SpannerUtils.pruneSchema(spannerTable.schema(), requiredColumns); 67 | } 68 | 69 | @Override 70 | public StructType readSchema() { 71 | return readSchema; 72 | } 73 | 74 | @Override 75 | public Batch toBatch() { 76 | return this; 77 | } 78 | 79 | @Override 80 | public PartitionReaderFactory createReaderFactory() { 81 | return new SpannerPartitionReaderFactory(); 82 | } 83 | 84 | static String buildColumnsWithTablePrefix( 85 | String tableName, Set columns, boolean isPostgreSql) { 86 | String quotedTableName = isPostgreSql ? "\"" + tableName + "\"" : "`" + tableName + "`"; 87 | return columns.stream() 88 | .map(col -> isPostgreSql ? "\"" + col + "\"" : "`" + col + "`") 89 | .map(quotedCol -> quotedTableName + "." + quotedCol) 90 | .collect(Collectors.joining(", ")); 91 | } 92 | 93 | @Override 94 | public InputPartition[] planInputPartitions() { 95 | BatchClientWithCloser batchClient = SpannerUtils.batchClientFromProperties(this.opts); 96 | 97 | // 1. Use * if no requiredColumns were requested else select them. 
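    // Illustrative sketch only (the table and column names below are hypothetical): with
    // requiredColumns = {SingerId, FirstName} on table Singers, the GoogleSQL dialect yields
    //   SELECT `Singers`.`SingerId`, `Singers`.`FirstName` FROM Singers
    // whereas the PostgreSQL dialect quotes identifiers with double quotes instead of backticks.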
98 | String selectPrefix = "SELECT *"; 99 | if (this.requiredColumns != null && this.requiredColumns.size() > 0) { 100 | // Prefix each column with the table name to avoid ambiguity when column name 101 | // matches table name 102 | boolean isPostgreSql = batchClient.databaseClient.getDialect().equals(Dialect.POSTGRESQL); 103 | String columnsWithTablePrefix = 104 | buildColumnsWithTablePrefix(this.spannerTable.name(), this.requiredColumns, isPostgreSql); 105 | selectPrefix = "SELECT " + columnsWithTablePrefix; 106 | } 107 | String sqlStmt = selectPrefix + " FROM " + this.spannerTable.name(); 108 | if (this.filters.length > 0) { 109 | sqlStmt += 110 | " WHERE " 111 | + SparkFilterUtils.getCompiledFilter( 112 | true, 113 | Optional.empty(), 114 | batchClient.databaseClient.getDialect().equals(Dialect.POSTGRESQL), 115 | fields, 116 | this.filters); 117 | } 118 | 119 | Boolean enableDataboost = false; 120 | if (this.opts.containsKey("enableDataBoost")) { 121 | enableDataboost = this.opts.get("enableDataBoost").equalsIgnoreCase("true"); 122 | } 123 | 124 | try (BatchReadOnlyTransaction txn = 125 | batchClient.batchClient.batchReadOnlyTransaction( 126 | TimestampBound.ofReadTimestamp(INIT_TIME))) { 127 | String mapAsJSON = SpannerUtils.serializeMap(this.opts); 128 | List rawPartitions = 129 | txn.partitionQuery( 130 | PartitionOptions.getDefaultInstance(), 131 | Statement.of(sqlStmt), 132 | Options.dataBoostEnabled(enableDataboost)); 133 | 134 | List parts = 135 | Streams.mapWithIndex( 136 | rawPartitions.stream(), 137 | (part, index) -> 138 | new SpannerPartition( 139 | part, 140 | Math.toIntExact(index), 141 | new SpannerInputPartitionContext( 142 | part, 143 | txn.getBatchTransactionId(), 144 | mapAsJSON, 145 | new SpannerRowConverterDirect()))) 146 | .collect(Collectors.toList()); 147 | 148 | return parts.toArray(new InputPartition[0]); 149 | } catch (JsonProcessingException e) { 150 | throw new SpannerConnectorException( 151 | SpannerErrorCode.SPANNER_FAILED_TO_PARSE_OPTIONS, "Error parsing the input options.", e); 152 | } finally { 153 | batchClient.close(); 154 | } 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/graph/PropertyGraph.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.spark.spanner.graph; 2 | 3 | import com.google.cloud.spanner.ResultSet; 4 | import com.google.cloud.spanner.Statement; 5 | import com.google.cloud.spanner.connection.Connection; 6 | import com.google.cloud.spark.spanner.SpannerConnectorException; 7 | import com.google.gson.Gson; 8 | import java.util.HashMap; 9 | import java.util.HashSet; 10 | import java.util.List; 11 | import java.util.Map; 12 | import java.util.Set; 13 | import java.util.stream.Collectors; 14 | 15 | /** 16 | * Parses INFORMATION_SCHEMA.PROPERTY_GRAPHS as defined in 17 | * https://cloud.google.com/spanner/docs/information-schema#property-graphs. 
18 | */ 19 | public class PropertyGraph { 20 | 21 | public static String GRAPH_ELEMENT_TABLE_KIND_NODE = "NODE"; 22 | public static String GRAPH_ELEMENT_TABLE_KIND_EDGE = "EDGE"; 23 | 24 | private static final String GRAPH_SCHEMA_QUERY = 25 | "SELECT PROPERTY_GRAPH_METADATA_JSON FROM " 26 | + "INFORMATION_SCHEMA.PROPERTY_GRAPHS WHERE PROPERTY_GRAPH_NAME = @graph"; 27 | 28 | public String catalog; 29 | public String schema; 30 | public String name; 31 | public List nodeTables; 32 | public List edgeTables; 33 | public List labels; 34 | public List propertyDeclarations; 35 | private Map tableIdMapping; 36 | 37 | private PropertyGraph() {} 38 | 39 | public int getTableId(String elementTableName) { 40 | Integer tableId = tableIdMapping.get(elementTableName); 41 | if (tableId == null) { 42 | throw new IllegalArgumentException( 43 | String.format("Cannot find tableId for table with name=%s", elementTableName)); 44 | } 45 | return tableId; 46 | } 47 | 48 | public String getPropertyType(String propertyName) { 49 | for (GraphPropertyDeclaration gpd : propertyDeclarations) { 50 | if (gpd.name.equals(propertyName)) { 51 | return gpd.type; 52 | } 53 | } 54 | throw new IllegalArgumentException("Cannot find property: " + propertyName); 55 | } 56 | 57 | public void checkEdgeReferenceKeyColumnsMatchNodeKeyColumns(GraphElementTable edgeTable) { 58 | if (!edgeTable.kind.equalsIgnoreCase(GRAPH_ELEMENT_TABLE_KIND_EDGE)) { 59 | throw new IllegalArgumentException(); 60 | } 61 | Map> nodeTableKeyColumns = new HashMap<>(); 62 | for (GraphElementTable nodeTable : nodeTables) { 63 | nodeTableKeyColumns.put(nodeTable.name, new HashSet<>(nodeTable.keyColumns)); 64 | } 65 | throwIfNodeTableKeyColumnsMismatch( 66 | nodeTableKeyColumns, edgeTable.sourceNodeTable, edgeTable.name, "source"); 67 | throwIfNodeTableKeyColumnsMismatch( 68 | nodeTableKeyColumns, edgeTable.destinationNodeTable, edgeTable.name, "destination"); 69 | } 70 | 71 | private static void throwIfNodeTableKeyColumnsMismatch( 72 | Map> nodeTableKeyColumns, 73 | GraphNodeTableReference nodeTableReference, 74 | String edgeTableName, 75 | String type) { 76 | String nodeTableName = nodeTableReference.nodeTableName; 77 | Set expected = nodeTableKeyColumns.get(nodeTableReference.nodeTableName); 78 | if (!expected.equals(new HashSet<>(nodeTableReference.nodeTableColumns))) { 79 | throw new UnsupportedOperationException( 80 | String.format( 81 | "%s of edge table %s references node table %s using column(s) [%s], " 82 | + "while key column(s) of node table %s are [%s]. 
" 83 | + "Currently, the connector expects the key columns an edge table used to reference " 84 | + "source/destination nodes to match the key columns of the node table.", 85 | type, 86 | edgeTableName, 87 | nodeTableName, 88 | String.join(", ", nodeTableReference.nodeTableColumns), 89 | nodeTableName, 90 | String.join(", ", expected))); 91 | } 92 | } 93 | 94 | public static class GraphElementTable { 95 | 96 | public String name; 97 | public String kind; 98 | public String baseCatalogName; 99 | public String baseSchemaName; 100 | public String baseTableName; 101 | public List keyColumns; 102 | public List labelNames; 103 | public List propertyDefinitions; 104 | public GraphNodeTableReference sourceNodeTable; 105 | public GraphNodeTableReference destinationNodeTable; 106 | } 107 | 108 | public static class GraphNodeTableReference { 109 | 110 | public String nodeTableName; 111 | public List edgeTableColumns; 112 | public List nodeTableColumns; 113 | } 114 | 115 | public static class GraphElementLabel { 116 | 117 | public String name; 118 | public List propertyDeclarationNames; 119 | } 120 | 121 | public static class GraphPropertyDeclaration { 122 | 123 | public String name; 124 | public String type; 125 | } 126 | 127 | public static class GraphPropertyDefinition { 128 | 129 | public String propertyDeclarationName; 130 | public String valueExpressionSql; 131 | } 132 | 133 | public static class Builder { 134 | 135 | public static PropertyGraph getFromSpanner(Connection conn, String graph) { 136 | Statement schemaQuery = 137 | Statement.newBuilder(GRAPH_SCHEMA_QUERY).bind("graph").to(graph).build(); 138 | try (ResultSet rs = conn.executeQuery(schemaQuery)) { 139 | if (!rs.next()) { 140 | throw new SpannerConnectorException( 141 | String.format( 142 | "Unable to find the schema for graph %s. Query: %s", graph, schemaQuery)); 143 | } 144 | String schemaJson = rs.getCurrentRowAsStruct().getJson(0); 145 | if (rs.next()) { 146 | throw new SpannerConnectorException( 147 | String.format( 148 | "Found more than one schema for graph %s. 
Query: %s", graph, schemaQuery)); 149 | } 150 | return fromJson(schemaJson); 151 | } 152 | } 153 | 154 | public static PropertyGraph fromJson(String json) { 155 | PropertyGraph propertyGraph = new Gson().fromJson(json, PropertyGraph.class); 156 | propertyGraph.tableIdMapping = 157 | getTableIdMapping(propertyGraph.nodeTables, propertyGraph.edgeTables); 158 | return propertyGraph; 159 | } 160 | 161 | private static Map getTableIdMapping( 162 | List nodeTables, List edgeTables) { 163 | int tableCount = 0; 164 | Map tableIdMapping = new HashMap<>(); 165 | for (String tableName : 166 | nodeTables.stream().map(t -> t.name).sorted().collect(Collectors.toList())) { 167 | tableIdMapping.put(tableName, ++tableCount); 168 | } 169 | for (String tableName : 170 | edgeTables.stream().map(t -> t.name).sorted().collect(Collectors.toList())) { 171 | tableIdMapping.put(tableName, ++tableCount); 172 | } 173 | return tableIdMapping; 174 | } 175 | } 176 | } 177 | -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/graph/query/SpannerGraphQuery.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.spark.spanner.graph.query; 2 | 3 | import com.google.cloud.spanner.Statement; 4 | import com.google.cloud.spanner.connection.Connection; 5 | import com.google.cloud.spark.spanner.SpannerTable; 6 | import com.google.cloud.spark.spanner.SpannerTableSchema; 7 | import com.google.cloud.spark.spanner.graph.PropertyGraph; 8 | import com.google.cloud.spark.spanner.graph.PropertyGraph.GraphElementTable; 9 | import com.google.cloud.spark.spanner.graph.SpannerGraphConfigs; 10 | import com.google.cloud.spark.spanner.graph.SpannerGraphConfigs.LabelConfig; 11 | import com.google.common.collect.ImmutableList; 12 | import com.google.common.collect.Iterables; 13 | import java.util.ArrayList; 14 | import java.util.Collections; 15 | import java.util.HashMap; 16 | import java.util.List; 17 | import java.util.Map; 18 | import java.util.Objects; 19 | import java.util.Set; 20 | import java.util.stream.Collectors; 21 | import org.apache.spark.sql.types.IntegralType; 22 | import org.apache.spark.sql.types.StructField; 23 | import org.apache.spark.sql.types.StructType; 24 | import org.slf4j.Logger; 25 | import org.slf4j.LoggerFactory; 26 | 27 | /** 28 | * Handles naming, schema, and mapping of columns across layers of the queries for a Spanner Graph 29 | */ 30 | public class SpannerGraphQuery { 31 | 32 | private static final Logger log = LoggerFactory.getLogger(SpannerTable.class); 33 | 34 | public final StructType dataframeSchema; 35 | public final List graphSubqueries; 36 | 37 | /** Constructor for user-provided graph query */ 38 | public SpannerGraphQuery(Connection conn, Statement query, boolean node) { 39 | DirectGraphQuery directGraphQuery = new DirectGraphQuery(conn, query, node); 40 | this.graphSubqueries = ImmutableList.of(directGraphQuery); 41 | this.dataframeSchema = fieldsToStruct(directGraphQuery.getOutputSparkFields()); 42 | } 43 | 44 | public SpannerGraphQuery( 45 | Connection conn, PropertyGraph graphSchema, SpannerGraphConfigs configs, boolean node) { 46 | List nodeTables = 47 | getMatchedElementTables(graphSchema.nodeTables, configs.nodeLabelConfigs); 48 | List edgeTables = 49 | getMatchedElementTables(graphSchema.edgeTables, configs.edgeLabelConfigs); 50 | Map baseTableSchemas = 51 | getBaseTableSchemas(conn, Iterables.concat(nodeTables, edgeTables)); 52 | boolean 
idColumnsExist = 53 | !configs.disableDirectIdExport && getIdColumnsExist(nodeTables, baseTableSchemas); 54 | 55 | List subQueries = new ArrayList<>(); 56 | if (node) { 57 | if (nodeTables.size() == 0) { 58 | throw new IllegalArgumentException("No node table left."); 59 | } 60 | for (GraphElementTable table : nodeTables) { 61 | subQueries.add( 62 | NodeElementTableQuery.create( 63 | graphSchema, 64 | table, 65 | baseTableSchemas.get(table.baseTableName), 66 | configs, 67 | idColumnsExist)); 68 | } 69 | } else { 70 | if (edgeTables.size() == 0) { 71 | throw new IllegalArgumentException("No edge table left."); 72 | } 73 | checkValidTableReference(nodeTables, edgeTables); 74 | for (GraphElementTable table : edgeTables) { 75 | subQueries.add( 76 | EdgeElementTableQuery.create( 77 | graphSchema, 78 | table, 79 | baseTableSchemas.get(table.baseTableName), 80 | configs, 81 | idColumnsExist)); 82 | } 83 | } 84 | this.graphSubqueries = Collections.unmodifiableList(subQueries); 85 | this.dataframeSchema = 86 | ElementTableQuery.mergeDataframeSchema( 87 | subQueries, node ? configs.nodeLabelConfigs : configs.edgeLabelConfigs); 88 | } 89 | 90 | private static boolean getIdColumnsExist( 91 | List nodeTables, Map baseTableSchemas) { 92 | if (nodeTables.size() != 1 || nodeTables.get(0).keyColumns.size() != 1) { 93 | return false; 94 | } 95 | GraphElementTable nodeTable = nodeTables.get(0); 96 | if (nodeTable.keyColumns.size() != 1) { 97 | return false; 98 | } 99 | SpannerTableSchema tableSchema = 100 | Objects.requireNonNull(baseTableSchemas.get(nodeTable.baseTableName)); 101 | String keyColumn = nodeTables.get(0).keyColumns.get(0); 102 | return tableSchema.getStructFieldForColumn(keyColumn).dataType() instanceof IntegralType; 103 | } 104 | 105 | private static Map getBaseTableSchemas( 106 | Connection conn, Iterable elementTables) { 107 | Map schemas = new HashMap<>(); 108 | for (GraphElementTable t : elementTables) { 109 | schemas.put(t.baseTableName, new SpannerTableSchema(conn, t.baseTableName, false)); 110 | } 111 | return schemas; 112 | } 113 | 114 | /** 115 | * Filters a list of element tables based on label filters 116 | * 117 | * @param elementTables the list of element tables to filter 118 | * @param labelConfigs the label config specified by the user 119 | * @return a list of element tables that have a matched label config 120 | */ 121 | private List getMatchedElementTables( 122 | List elementTables, List labelConfigs) { 123 | if (labelConfigs == null || labelConfigs.isEmpty() || labelConfigs.get(0).label.equals("*")) { 124 | return new ArrayList<>(elementTables); 125 | } 126 | 127 | Set targetLabels = 128 | labelConfigs.stream().map(lc -> lc.label.trim().toLowerCase()).collect(Collectors.toSet()); 129 | return elementTables.stream() 130 | .filter( 131 | t -> { 132 | for (String label : t.labelNames) { 133 | if (targetLabels.contains(label.toLowerCase())) { 134 | return true; 135 | } 136 | } 137 | return false; 138 | }) 139 | .collect(Collectors.toList()); 140 | } 141 | 142 | private void checkValidTableReference( 143 | List nodeTables, List edgeTables) { 144 | Set nodeTableNames = nodeTables.stream().map(t -> t.name).collect(Collectors.toSet()); 145 | for (GraphElementTable t : edgeTables) { 146 | String srcTable = t.sourceNodeTable.nodeTableName; 147 | String dstTable = t.destinationNodeTable.nodeTableName; 148 | if (!nodeTableNames.contains(srcTable) || !nodeTableNames.contains(dstTable)) { 149 | throw new IllegalArgumentException( 150 | String.format( 151 | "One or both of the referenced 
node tables (%s, %s) of edge table %s are filtered" 152 | + " out. Existing node tables: %s.", 153 | srcTable, dstTable, t.name, nodeTableNames)); 154 | } 155 | } 156 | } 157 | 158 | private StructType fieldsToStruct(List fields) { 159 | StructType result = new StructType(); 160 | for (StructField field : fields) { 161 | result = result.add(field); 162 | } 163 | return result; 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /mvnw.cmd: -------------------------------------------------------------------------------- 1 | @REM ---------------------------------------------------------------------------- 2 | @REM Licensed to the Apache Software Foundation (ASF) under one 3 | @REM or more contributor license agreements. See the NOTICE file 4 | @REM distributed with this work for additional information 5 | @REM regarding copyright ownership. The ASF licenses this file 6 | @REM to you under the Apache License, Version 2.0 (the 7 | @REM "License"); you may not use this file except in compliance 8 | @REM with the License. You may obtain a copy of the License at 9 | @REM 10 | @REM http://www.apache.org/licenses/LICENSE-2.0 11 | @REM 12 | @REM Unless required by applicable law or agreed to in writing, 13 | @REM software distributed under the License is distributed on an 14 | @REM "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | @REM KIND, either express or implied. See the License for the 16 | @REM specific language governing permissions and limitations 17 | @REM under the License. 18 | @REM ---------------------------------------------------------------------------- 19 | 20 | @REM ---------------------------------------------------------------------------- 21 | @REM Maven Start Up Batch script 22 | @REM 23 | @REM Required ENV vars: 24 | @REM JAVA_HOME - location of a JDK home dir 25 | @REM 26 | @REM Optional ENV vars 27 | @REM M2_HOME - location of maven2's installed home dir 28 | @REM MAVEN_BATCH_ECHO - set to 'on' to enable the echoing of the batch commands 29 | @REM MAVEN_BATCH_PAUSE - set to 'on' to wait for a keystroke before ending 30 | @REM MAVEN_OPTS - parameters passed to the Java VM when running Maven 31 | @REM e.g. to debug Maven itself, use 32 | @REM set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000 33 | @REM MAVEN_SKIP_RC - flag to disable loading of mavenrc files 34 | @REM ---------------------------------------------------------------------------- 35 | 36 | @REM Begin all REM lines with '@' in case MAVEN_BATCH_ECHO is 'on' 37 | @echo off 38 | @REM set title of command window 39 | title %0 40 | @REM enable echoing by setting MAVEN_BATCH_ECHO to 'on' 41 | @if "%MAVEN_BATCH_ECHO%" == "on" echo %MAVEN_BATCH_ECHO% 42 | 43 | @REM set %HOME% to equivalent of $HOME 44 | if "%HOME%" == "" (set "HOME=%HOMEDRIVE%%HOMEPATH%") 45 | 46 | @REM Execute a user defined script before this one 47 | if not "%MAVEN_SKIP_RC%" == "" goto skipRcPre 48 | @REM check for pre script, once with legacy .bat ending and once with .cmd ending 49 | if exist "%HOME%\mavenrc_pre.bat" call "%HOME%\mavenrc_pre.bat" 50 | if exist "%HOME%\mavenrc_pre.cmd" call "%HOME%\mavenrc_pre.cmd" 51 | :skipRcPre 52 | 53 | @setlocal 54 | 55 | set ERROR_CODE=0 56 | 57 | @REM To isolate internal variables from possible post scripts, we use another setlocal 58 | @setlocal 59 | 60 | @REM ==== START VALIDATION ==== 61 | if not "%JAVA_HOME%" == "" goto OkJHome 62 | 63 | echo. 64 | echo Error: JAVA_HOME not found in your environment. 
>&2 65 | echo Please set the JAVA_HOME variable in your environment to match the >&2 66 | echo location of your Java installation. >&2 67 | echo. 68 | goto error 69 | 70 | :OkJHome 71 | if exist "%JAVA_HOME%\bin\java.exe" goto init 72 | 73 | echo. 74 | echo Error: JAVA_HOME is set to an invalid directory. >&2 75 | echo JAVA_HOME = "%JAVA_HOME%" >&2 76 | echo Please set the JAVA_HOME variable in your environment to match the >&2 77 | echo location of your Java installation. >&2 78 | echo. 79 | goto error 80 | 81 | @REM ==== END VALIDATION ==== 82 | 83 | :init 84 | 85 | @REM Find the project base dir, i.e. the directory that contains the folder ".mvn". 86 | @REM Fallback to current working directory if not found. 87 | 88 | set MAVEN_PROJECTBASEDIR=%MAVEN_BASEDIR% 89 | IF NOT "%MAVEN_PROJECTBASEDIR%"=="" goto endDetectBaseDir 90 | 91 | set EXEC_DIR=%CD% 92 | set WDIR=%EXEC_DIR% 93 | :findBaseDir 94 | IF EXIST "%WDIR%"\.mvn goto baseDirFound 95 | cd .. 96 | IF "%WDIR%"=="%CD%" goto baseDirNotFound 97 | set WDIR=%CD% 98 | goto findBaseDir 99 | 100 | :baseDirFound 101 | set MAVEN_PROJECTBASEDIR=%WDIR% 102 | cd "%EXEC_DIR%" 103 | goto endDetectBaseDir 104 | 105 | :baseDirNotFound 106 | set MAVEN_PROJECTBASEDIR=%EXEC_DIR% 107 | cd "%EXEC_DIR%" 108 | 109 | :endDetectBaseDir 110 | 111 | IF NOT EXIST "%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config" goto endReadAdditionalConfig 112 | 113 | @setlocal EnableExtensions EnableDelayedExpansion 114 | for /F "usebackq delims=" %%a in ("%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config") do set JVM_CONFIG_MAVEN_PROPS=!JVM_CONFIG_MAVEN_PROPS! %%a 115 | @endlocal & set JVM_CONFIG_MAVEN_PROPS=%JVM_CONFIG_MAVEN_PROPS% 116 | 117 | :endReadAdditionalConfig 118 | 119 | SET MAVEN_JAVA_EXE="%JAVA_HOME%\bin\java.exe" 120 | set WRAPPER_JAR="%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.jar" 121 | set WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain 122 | 123 | set DOWNLOAD_URL="https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/0.5.6/maven-wrapper-0.5.6.jar" 124 | 125 | FOR /F "tokens=1,2 delims==" %%A IN ("%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.properties") DO ( 126 | IF "%%A"=="wrapperUrl" SET DOWNLOAD_URL=%%B 127 | ) 128 | 129 | @REM Extension to allow automatically downloading the maven-wrapper.jar from Maven-central 130 | @REM This allows using the maven wrapper in projects that prohibit checking in binary data. 131 | if exist %WRAPPER_JAR% ( 132 | if "%MVNW_VERBOSE%" == "true" ( 133 | echo Found %WRAPPER_JAR% 134 | ) 135 | ) else ( 136 | if not "%MVNW_REPOURL%" == "" ( 137 | SET DOWNLOAD_URL="%MVNW_REPOURL%/io/takari/maven-wrapper/0.5.6/maven-wrapper-0.5.6.jar" 138 | ) 139 | if "%MVNW_VERBOSE%" == "true" ( 140 | echo Couldn't find %WRAPPER_JAR%, downloading it ... 
141 | echo Downloading from: %DOWNLOAD_URL% 142 | ) 143 | 144 | powershell -Command "&{"^ 145 | "$webclient = new-object System.Net.WebClient;"^ 146 | "if (-not ([string]::IsNullOrEmpty('%MVNW_USERNAME%') -and [string]::IsNullOrEmpty('%MVNW_PASSWORD%'))) {"^ 147 | "$webclient.Credentials = new-object System.Net.NetworkCredential('%MVNW_USERNAME%', '%MVNW_PASSWORD%');"^ 148 | "}"^ 149 | "[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12; $webclient.DownloadFile('%DOWNLOAD_URL%', '%WRAPPER_JAR%')"^ 150 | "}" 151 | if "%MVNW_VERBOSE%" == "true" ( 152 | echo Finished downloading %WRAPPER_JAR% 153 | ) 154 | ) 155 | @REM End of extension 156 | 157 | @REM Provide a "standardized" way to retrieve the CLI args that will 158 | @REM work with both Windows and non-Windows executions. 159 | set MAVEN_CMD_LINE_ARGS=%* 160 | 161 | %MAVEN_JAVA_EXE% %JVM_CONFIG_MAVEN_PROPS% %MAVEN_OPTS% %MAVEN_DEBUG_OPTS% -classpath %WRAPPER_JAR% "-Dmaven.multiModuleProjectDirectory=%MAVEN_PROJECTBASEDIR%" %WRAPPER_LAUNCHER% %MAVEN_CONFIG% %* 162 | if ERRORLEVEL 1 goto error 163 | goto end 164 | 165 | :error 166 | set ERROR_CODE=1 167 | 168 | :end 169 | @endlocal & set ERROR_CODE=%ERROR_CODE% 170 | 171 | if not "%MAVEN_SKIP_RC%" == "" goto skipRcPost 172 | @REM check for post script, once with legacy .bat ending and once with .cmd ending 173 | if exist "%HOME%\mavenrc_post.bat" call "%HOME%\mavenrc_post.bat" 174 | if exist "%HOME%\mavenrc_post.cmd" call "%HOME%\mavenrc_post.cmd" 175 | :skipRcPost 176 | 177 | @REM pause the script if MAVEN_BATCH_PAUSE is set to 'on' 178 | if "%MAVEN_BATCH_PAUSE%" == "on" pause 179 | 180 | if "%MAVEN_TERMINATE_CMD%" == "on" exit %ERROR_CODE% 181 | 182 | exit /B %ERROR_CODE% -------------------------------------------------------------------------------- /spark-3.1-spanner-lib/src/main/java/com/google/cloud/spark/spanner/graph/SpannerGraphScanner.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.spark.spanner.graph; 2 | 3 | import com.fasterxml.jackson.core.JsonProcessingException; 4 | import com.google.api.gax.grpc.GrpcCallContext; 5 | import com.google.api.gax.rpc.ApiCallContext; 6 | import com.google.cloud.Tuple; 7 | import com.google.cloud.spanner.BatchReadOnlyTransaction; 8 | import com.google.cloud.spanner.BatchTransactionId; 9 | import com.google.cloud.spanner.Options; 10 | import com.google.cloud.spanner.Partition; 11 | import com.google.cloud.spanner.PartitionOptions; 12 | import com.google.cloud.spanner.SpannerOptions; 13 | import com.google.cloud.spanner.SpannerOptions.CallContextConfigurator; 14 | import com.google.cloud.spanner.Statement; 15 | import com.google.cloud.spanner.TimestampBound; 16 | import com.google.cloud.spark.spanner.BatchClientWithCloser; 17 | import com.google.cloud.spark.spanner.SpannerInputPartitionContext; 18 | import com.google.cloud.spark.spanner.SpannerPartition; 19 | import com.google.cloud.spark.spanner.SpannerPartitionReaderFactory; 20 | import com.google.cloud.spark.spanner.SpannerRowConverter; 21 | import com.google.cloud.spark.spanner.SpannerScanner; 22 | import com.google.cloud.spark.spanner.SpannerUtils; 23 | import com.google.cloud.spark.spanner.graph.query.SpannerGraphQuery; 24 | import com.google.common.collect.ImmutableSet; 25 | import io.grpc.Context; 26 | import io.grpc.MethodDescriptor; 27 | import java.util.ArrayList; 28 | import java.util.List; 29 | import java.util.Map; 30 | import java.util.Set; 31 | import java.util.stream.Collectors; 32 | 
import javax.annotation.Nullable; 33 | import org.apache.spark.sql.connector.read.Batch; 34 | import org.apache.spark.sql.connector.read.PartitionReaderFactory; 35 | import org.apache.spark.sql.connector.read.Scan; 36 | import org.apache.spark.sql.types.StructType; 37 | import org.apache.spark.sql.util.CaseInsensitiveStringMap; 38 | import org.slf4j.Logger; 39 | import org.slf4j.LoggerFactory; 40 | 41 | /** Logically and physically represents a scan of a graph in Spanner */ 42 | public class SpannerGraphScanner implements Batch, Scan { 43 | 44 | private static final Logger log = LoggerFactory.getLogger(SpannerScanner.class); 45 | 46 | private final Map options; 47 | private final @Nullable Map> extraHeaders; 48 | private final TimestampBound readTimestamp; 49 | private final @Nullable Long partitionSizeBytes; 50 | private final Options.ReadAndQueryOption dataBoostEnabled; 51 | private final @Nullable ImmutableSet requiredColumns; 52 | private final StructType readSchema; 53 | private final List> queryAndRowConverters; 54 | private final List partitions; 55 | 56 | public SpannerGraphScanner( 57 | Map options, 58 | @Nullable Map> extraHeaders, 59 | TimestampBound readTimestamp, 60 | @Nullable Long partitionSizeBytes, 61 | Options.ReadAndQueryOption dataBoostEnabled, 62 | SpannerGraphQuery graphQuery, 63 | @Nullable Set requiredColumns, 64 | StructType readSchema) { 65 | // Potential improvement: support filter pushdown. 66 | this.options = new CaseInsensitiveStringMap(options); 67 | this.extraHeaders = extraHeaders; 68 | this.readTimestamp = readTimestamp; 69 | this.partitionSizeBytes = partitionSizeBytes; 70 | this.dataBoostEnabled = dataBoostEnabled; 71 | this.requiredColumns = requiredColumns == null ? null : ImmutableSet.copyOf(requiredColumns); 72 | this.readSchema = readSchema; 73 | this.queryAndRowConverters = 74 | graphQuery.graphSubqueries.stream() 75 | .map(q -> q.getQueryAndConverter(readSchema)) 76 | .collect(Collectors.toList()); 77 | this.partitions = new ArrayList<>(); // Filled later 78 | } 79 | 80 | /** 81 | * Returns a list of {@link SpannerPartition input partitions}. Each {@link SpannerPartition} 82 | * represents a data split that can be processed by one Spark task. The number of input partitions 83 | * returned here is the same as the number of RDD partitions this scan outputs. 
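   *
   * <p>Partitions are planned per graph sub-query: each sub-query statement is partitioned
   * independently, and when extra gRPC headers are configured the planning calls run inside a
   * gRPC Context that attaches those headers to the Spanner RPCs issued during planning.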
84 | */ 85 | @Override 86 | public SpannerPartition[] planInputPartitions() { 87 | if (extraHeaders == null || extraHeaders.isEmpty()) { 88 | return doPlanInputPartitions(); 89 | } 90 | Context context = 91 | Context.current() 92 | .withValue( 93 | SpannerOptions.CALL_CONTEXT_CONFIGURATOR_KEY, 94 | new CallContextConfigurator() { 95 | @Override 96 | public ApiCallContext configure( 97 | ApiCallContext context, ReqT request, MethodDescriptor method) { 98 | return GrpcCallContext.createDefault().withExtraHeaders(extraHeaders); 99 | } 100 | }); 101 | try { 102 | return context.call(this::doPlanInputPartitions); 103 | } catch (Exception e) { 104 | throw new RuntimeException(e); 105 | } 106 | } 107 | 108 | private SpannerPartition[] doPlanInputPartitions() { 109 | String optionsJson; 110 | try { 111 | optionsJson = SpannerUtils.serializeMap(options); 112 | } catch (JsonProcessingException e) { 113 | throw new RuntimeException(e); 114 | } 115 | try (BatchClientWithCloser batchClient = SpannerUtils.batchClientFromProperties(options)) { 116 | try (BatchReadOnlyTransaction txn = 117 | batchClient.batchClient.batchReadOnlyTransaction(readTimestamp)) { 118 | BatchTransactionId txnId = txn.getBatchTransactionId(); 119 | PartitionOptions options = PartitionOptions.getDefaultInstance(); 120 | if (partitionSizeBytes != null) { 121 | options = PartitionOptions.newBuilder().setPartitionSizeBytes(partitionSizeBytes).build(); 122 | } 123 | 124 | partitions.clear(); 125 | for (Tuple queryAndRowConverter : queryAndRowConverters) { 126 | List rawPartitions = 127 | txn.partitionQuery(options, queryAndRowConverter.x(), dataBoostEnabled); 128 | 129 | for (Partition rawPartition : rawPartitions) { 130 | SpannerInputPartitionContext context = 131 | new SpannerInputPartitionContext( 132 | rawPartition, txnId, optionsJson, queryAndRowConverter.y()); 133 | int index = partitions.size(); 134 | partitions.add(new SpannerPartition(rawPartition, index, context)); 135 | } 136 | } 137 | log.info("Number of partitions: " + partitions.size()); 138 | return partitions.toArray(new SpannerPartition[0]); 139 | } 140 | } 141 | } 142 | 143 | @Override 144 | public PartitionReaderFactory createReaderFactory() { 145 | return new SpannerPartitionReaderFactory(); 146 | } 147 | 148 | @Override 149 | public StructType readSchema() { 150 | return readSchema; 151 | } 152 | 153 | @Override 154 | public Batch toBatch() { 155 | return this; 156 | } 157 | 158 | @Override 159 | public String description() { 160 | return String.format( 161 | "%s\nRequired Columns: %s\nRead Timestamp: %s" 162 | + "\nStatements (%d):\n%s\nNumber of Partitions: %d", 163 | this.getClass(), 164 | requiredColumns, 165 | readTimestamp, 166 | queryAndRowConverters.size(), 167 | queryAndRowConverters.stream() 168 | .map(qc -> qc.x().toString()) 169 | .collect(Collectors.joining("\n")), 170 | partitions.size()); 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /python/spannergraph/tests.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import unittest 4 | import logging 5 | 6 | from datetime import datetime, timedelta 7 | 8 | from pyspark.sql import DataFrame 9 | from pyspark.sql.session import SparkSession 10 | from pyspark.sql import functions as sf 11 | 12 | from ._connector import SpannerGraphConnector 13 | 14 | from .tests_gold import * 15 | 16 | 17 | logging.basicConfig(level=logging.INFO) 18 | 19 | 20 | def get_connector_jar() -> str: 21 | 
jar_path = os.path.abspath( 22 | "../spark-3.1-spanner/target/spark-3.1-spanner-0.0.1-SNAPSHOT.jar" 23 | ) 24 | assert os.path.exists(jar_path), ( 25 | f"Cannot find connector JAR at {jar_path}. " 26 | "Please build the connector JAR first." 27 | ) 28 | return jar_path 29 | 30 | 31 | def _as_pandas_str(df: DataFrame) -> str: 32 | df_pd = df.toPandas() 33 | return df_pd.sort_values(by=df_pd.columns.to_list()) \ 34 | .reset_index(drop=True).to_string() 35 | 36 | 37 | # These tests rely on the graphs named FlexibleGraph and MusicGraph 38 | # in the databases named flexible-graph and music-graph, respectively. 39 | # See the following files for the definitions of the graph: 40 | # spark-3.1-spanner-lib/src/test/resources/db/populate_ddl_graph.sql 41 | # spark-3.1-spanner-lib/src/test/resources/db/insert_data_graph.sql 42 | 43 | 44 | class TestGraphConnector(unittest.TestCase): 45 | 46 | def __init__(self, *args, **kwargs) -> None: 47 | super().__init__(*args, **kwargs) 48 | self.check_point_dir = tempfile.TemporaryDirectory() 49 | self.spark = ( 50 | SparkSession.builder.appName("spanner-spark-connector-test") 51 | .config( 52 | "spark.jars.packages", 53 | "graphframes:graphframes:0.8.4-spark3.5-s_2.12", 54 | ) 55 | .config("spark.jars", get_connector_jar()) 56 | .getOrCreate() 57 | ) 58 | self.spark.sparkContext.setCheckpointDir(self.check_point_dir.name) 59 | 60 | def test_flexible_graph_cc(self) -> None: 61 | connector = ( 62 | SpannerGraphConnector() 63 | .spark(self.spark) 64 | .project(os.getenv("SPANNER_PROJECT_ID")) 65 | .instance(os.getenv("SPANNER_INSTANCE_ID")) 66 | .database("flexible-graph") 67 | .graph("FlexibleGraph") 68 | .data_boost() 69 | .node_label("*", properties=["id"]) 70 | .edge_label("*", properties=["to_id"], 71 | where="id < 100 AND to_id < 100") 72 | .partition_size_bytes(1) # hint only 73 | .repartition(3) 74 | .read_timestamp(datetime.now() - timedelta(minutes=10)) 75 | ) 76 | 77 | g = connector.load_graph() 78 | 79 | vertices_str = self._df_to_str(g.vertices) 80 | edges_str = self._df_to_str(g.edges) 81 | self.assertEqual( 82 | vertices_str, 83 | "['id', 'property_id'] - " 84 | + "[[1, 1], [2, 2], [3, 3], [7, 7], [16, 16], " 85 | + "[20, 20], [100, 100], [101, 101]]", 86 | ) 87 | self.assertEqual( 88 | edges_str, 89 | "['src', 'dst', 'property_to_id'] - " 90 | + "[[1, 7, 7], [2, 20, 20], [3, 16, 16], " 91 | + "[7, 16, 16], [7, 16, 16], [16, 20, 20], " 92 | + "[20, 7, 7], [20, 16, 16]]", 93 | ) 94 | 95 | cc = g.connectedComponents() 96 | cc_str = self._df_to_str(cc) 97 | self.assertEqual( 98 | cc_str, 99 | "['id', 'property_id', 'component'] - " 100 | + "[[1, 1, 1], [2, 2, 1], [3, 3, 1], [7, 7, 1], " 101 | + "[16, 16, 1], [20, 20, 1], [100, 100, 100], " 102 | + "[101, 101, 101]]", 103 | ) 104 | 105 | self.assertEqual(g.vertices.rdd.getNumPartitions(), 3) 106 | self.assertEqual(g.edges.rdd.getNumPartitions(), 3) 107 | 108 | def test_music_graph_cc(self) -> None: 109 | connector = ( 110 | SpannerGraphConnector() 111 | .spark(self.spark) 112 | .project(os.getenv("SPANNER_PROJECT_ID")) 113 | .instance(os.getenv("SPANNER_INSTANCE_ID")) 114 | .database("music-graph") 115 | .graph("MusicGraph") 116 | .data_boost(True) 117 | .repartition(6) 118 | .node_label( 119 | "*", 120 | properties=[ 121 | "name", "country_origin", "birthday", "ReleaseDate" 122 | ] 123 | ) 124 | .edge_label( 125 | "*", 126 | properties=["SingerId", "release_date", "FriendId"] 127 | ) 128 | ) 129 | 130 | df_nodes, df_edges, df_mapping = connector.load_dfs() 131 | df_nodes = df_nodes.join(df_mapping, 
"id").drop("id") 132 | df_edges_src = df_edges \ 133 | .join(df_mapping, sf.expr("src <=> id")).drop("id", "src", "dst") 134 | df_edges_dst = df_edges \ 135 | .join(df_mapping, sf.expr("dst <=> id")).drop("id", "src", "dst") 136 | 137 | vertices_str = _as_pandas_str(df_nodes) 138 | edges_str_src = _as_pandas_str(df_edges_src) 139 | edges_str_dst = _as_pandas_str(df_edges_dst) 140 | 141 | self.assertEqual(vertices_str, TEST_MUSIC_GRAPH_CC_VERTICES) 142 | self.assertEqual(edges_str_src, TEST_MUSIC_GRAPH_CC_EDGES_SRC) 143 | self.assertEqual(edges_str_dst, TEST_MUSIC_GRAPH_CC_EDGES_DST) 144 | 145 | def test_flexible_graph_undirected(self) -> None: 146 | connector = ( 147 | SpannerGraphConnector() 148 | .spark(self.spark) 149 | .project(os.getenv("SPANNER_PROJECT_ID")) 150 | .instance(os.getenv("SPANNER_INSTANCE_ID")) 151 | .database("flexible-graph") 152 | .graph("FlexibleGraph") 153 | .symmetrize_graph() 154 | .edge_label("*", properties=["to_id"]) 155 | .repartition(7) 156 | ) 157 | 158 | g = connector.load_graph() 159 | 160 | vertices_str = self._df_to_str(g.vertices) 161 | edges_str = self._df_to_str(g.edges) 162 | self.assertEqual( 163 | vertices_str, 164 | "['id'] - [[1], [2], [3], [7], [16], [20], [100], [101]]", 165 | ) 166 | self.assertEqual( 167 | edges_str, 168 | "['src', 'dst', 'property_to_id'] - " 169 | "[[1, 7, 7], [2, 20, 20], [3, 16, 16], [7, 1, 7], [7, 16, 16], " 170 | "[7, 20, 7], [16, 3, 16], [16, 7, 16], [16, 20, 16], " 171 | "[16, 20, 20], [20, 2, 20], [20, 7, 7], [20, 16, 16], " 172 | "[20, 16, 20], [100, 101, 101], [101, 100, 101]]", 173 | ) 174 | 175 | self.assertEqual(g.vertices.rdd.getNumPartitions(), 7) 176 | self.assertEqual(g.edges.rdd.getNumPartitions(), 7) 177 | 178 | def test_music_graph_direct_queries(self) -> None: 179 | node_query = "SELECT * FROM GRAPH_TABLE " \ 180 | "(MusicGraph MATCH (n:SINGER) RETURN n.id AS id)" 181 | edge_query = "SELECT * FROM GRAPH_TABLE " \ 182 | "(MusicGraph MATCH -[e:KNOWS]-> " \ 183 | "RETURN e.SingerId AS src, e.FriendId AS dst)" 184 | 185 | connector = ( 186 | SpannerGraphConnector() 187 | .spark(self.spark) 188 | .project(os.getenv("SPANNER_PROJECT_ID")) 189 | .instance(os.getenv("SPANNER_INSTANCE_ID")) 190 | .database("music-graph") 191 | .graph("MusicGraph") 192 | .node_query(node_query) 193 | .edge_query(edge_query) 194 | .data_boost() 195 | ) 196 | 197 | g = connector.load_graph() 198 | 199 | vertices_str = self._df_to_str(g.vertices) 200 | edges_str = self._df_to_str(g.edges) 201 | self.assertEqual( 202 | vertices_str, 203 | "['id'] - [[1], [2], [3], [4], [5]]", 204 | ) 205 | self.assertEqual( 206 | edges_str, 207 | "['src', 'dst'] - " 208 | "[[1, 2], [1, 3], [2, 1], [2, 4], [2, 5], [3, 1], [3, 5], " 209 | "[4, 2], [4, 5], [5, 2], [5, 3], [5, 4]]", 210 | ) 211 | 212 | def _df_to_str(self, df: DataFrame) -> str: 213 | rows = sorted([list(r.asDict().values()) for r in df.collect()]) 214 | return f"{df.columns} - {rows}" 215 | 216 | 217 | if __name__ == "__main__": 218 | unittest.main() 219 | --------------------------------------------------------------------------------