├── docs ├── Conf.md ├── ConfigLoader.md ├── Deliverable.md ├── _config.yml ├── Logging.md ├── img │ ├── logo_setl.png │ ├── logo_setl_1280_640.png │ └── old_logo │ │ ├── logo_setl.png │ │ └── logo_setl_1280_640.png ├── Architecture.md ├── Condition.md ├── vocabulary.md ├── StructAnalyser.md ├── SparkSessionBuilder.md ├── SchemaConverter.md ├── utils │ └── Compressor_Archiver.md ├── Stage.md ├── Transformer.md ├── data_access_layer │ ├── SparkRepositoryBuilder.md │ ├── CustomConnector.md │ ├── Structured-Streaming-Connector.md │ ├── ConnectorBuilder.md │ └── SparkRepositoryAdapter.md ├── Factory.md ├── index.md └── SparkRepository-caching.md ├── src ├── test │ ├── resources │ │ ├── myconf.conf │ │ ├── test_base_path.csv │ │ ├── test-archiver │ │ │ ├── test-input-file.txt │ │ │ └── test-input │ │ │ │ ├── col3=c │ │ │ │ ├── file1-1-1.csv │ │ │ │ ├── file1-2-1.csv │ │ │ │ └── file1-2-2.csv │ │ │ │ └── col3=cc │ │ │ │ └── file2-1.csv │ │ ├── test-list-files │ │ │ ├── file1.csv │ │ │ ├── subdir2 │ │ │ │ └── file2-1.csv │ │ │ └── subdir1 │ │ │ │ ├── subsubdir1 │ │ │ │ ├── file1-1-1.csv │ │ │ │ └── wrongfile1-1-1.csv │ │ │ │ └── subsubdir2 │ │ │ │ ├── file1-2-1.csv │ │ │ │ └── file1-2-2.csv │ │ ├── test-list-files2 │ │ │ ├── col3=cc │ │ │ │ └── file2-1.csv │ │ │ └── col3=c │ │ │ │ ├── file1-1-1.csv │ │ │ │ ├── file1-2-1.csv │ │ │ │ └── file1-2-2.csv │ │ ├── streaming_test_resources │ │ │ ├── input2 │ │ │ │ └── input2.csv │ │ │ ├── streaming.conf │ │ │ └── input │ │ │ │ └── text.txt │ │ ├── test_schema_converter.csv │ │ ├── test_connector_builder.conf │ │ ├── dynamodb.conf │ │ ├── test_priority.conf │ │ ├── test-json.json │ │ ├── log4j.properties │ │ └── local.conf │ └── scala │ │ └── io │ │ └── github │ │ └── setl │ │ ├── workflow │ │ ├── package.scala │ │ └── FlowSuite.scala │ │ ├── config │ │ ├── DeltaConnectorConfSuite.scala │ │ ├── DynamoDBConnectorConfSuite.scala │ │ ├── PropertiesSuite.scala │ │ ├── HudiConnectorConfSuite.scala │ │ ├── StructuredStreamingConnectorConfSuite.scala │ │ ├── Properties.scala │ │ ├── ConfLoaderSuite.scala │ │ ├── FileConnectorConfSuite.scala │ │ └── JDBCConnectorConfSuite.scala │ │ ├── storage │ │ ├── connector │ │ │ ├── ConnectorSuite.scala │ │ │ ├── SparkSQLConnectorSuite.scala │ │ │ ├── HudiConnectorSuite.scala │ │ │ └── StructuredStreamingConnectorSuite.scala │ │ ├── repository │ │ │ ├── streaming │ │ │ │ └── StreamingRepositorySuite.scala │ │ │ ├── package.scala │ │ │ └── RepositoryAdapterSuite.scala │ │ ├── XZCompressorSuite.scala │ │ ├── SnappyCompressorSuite.scala │ │ ├── GZIPCompressorSuite.scala │ │ └── ConditionSuite.scala │ │ ├── SparkTestUtils.scala │ │ ├── factory │ │ └── FactoryDeliveryMetadataSuite.scala │ │ ├── TestObject.scala │ │ ├── util │ │ ├── TypesafeConfigUtilsSuite.scala │ │ └── IOUtils.scala │ │ └── internal │ │ ├── BenchmarkInvocationHandlerSuite.scala │ │ ├── TestClasses.scala │ │ └── StructAnalyserSuite.scala └── main │ ├── java │ └── io │ │ └── github │ │ └── setl │ │ ├── enums │ │ ├── PathFormat.java │ │ ├── ValueType.java │ │ └── Storage.java │ │ ├── exception │ │ ├── ConfException.java │ │ ├── AlreadyExistsException.java │ │ ├── BaseException.java │ │ ├── ConnectorException.java │ │ ├── RepositoryException.java │ │ ├── InvalidSchemaException.java │ │ ├── InvalidConnectorException.java │ │ ├── InvalidDeliveryException.java │ │ └── UnknownException.java │ │ ├── annotation │ │ ├── Experimental.java │ │ ├── Benchmark.java │ │ ├── InterfaceStability.java │ │ ├── Compress.java │ │ └── Delivery.java │ │ ├── storage │ │ ├── 
SnappyCompressor.java │ │ ├── XZCompressor.java │ │ └── GZIPCompressor.java │ │ └── internal │ │ └── BenchmarkInvocationHandler.java │ └── scala │ └── io │ └── github │ └── setl │ ├── internal │ ├── HasReaderWriter.scala │ ├── HasReader.scala │ ├── HasWriter.scala │ ├── Configurable.scala │ ├── HasType.scala │ ├── CanDrop.scala │ ├── HasDescription.scala │ ├── CanDelete.scala │ ├── Identifiable.scala │ ├── Writable.scala │ ├── CanUpdate.scala │ ├── CanCreate.scala │ ├── HasDiagram.scala │ ├── HasBenchmark.scala │ ├── CanWait.scala │ ├── CanPartition.scala │ ├── CanVacuum.scala │ ├── Logging.scala │ └── HasRegistry.scala │ ├── storage │ ├── connector │ │ ├── StreamingConnector.scala │ │ ├── DBConnector.scala │ │ ├── ACIDConnector.scala │ │ ├── ConnectorInterface.scala │ │ ├── ParquetConnector.scala │ │ ├── SparkSQLConnector.scala │ │ ├── Connector.scala │ │ ├── HudiConnector.scala │ │ └── StructuredStreamingConnector.scala │ ├── DatasetConverter.scala │ ├── Compressor.scala │ ├── repository │ │ ├── RepositoryAdapter.scala │ │ ├── ImplicitRepositoryAdapter.scala │ │ └── Repository.scala │ └── Archiver.scala │ ├── config │ ├── ConnectorConf.scala │ ├── HudiConnectorConf.scala │ ├── DeltaConnectorConf.scala │ ├── StructuredStreamingConnectorConf.scala │ └── DynamoDBConnectorConf.scala │ ├── transformation │ ├── AbstractFactory.scala │ ├── Transformer.scala │ ├── MLTransformer.scala │ ├── FactoryInput.scala │ ├── Factory.scala │ └── FactoryOutput.scala │ ├── workflow │ ├── PipelineOptimizer.scala │ ├── External.scala │ ├── Flow.scala │ ├── DAG.scala │ └── SimplePipelineOptimizer.scala │ ├── annotation │ ├── ColumnName.scala │ └── CompoundKey.scala │ ├── util │ ├── ExpectedDeliverable.scala │ ├── ReflectUtils.scala │ ├── HasSparkSession.scala │ ├── FilterImplicits.scala │ ├── MermaidUtils.scala │ ├── SparkUtils.scala │ └── TypesafeConfigUtils.scala │ ├── Builder.scala │ ├── BenchmarkResult.scala │ └── Converter.scala ├── dev ├── test.sh ├── deploy-snapshot.sh ├── deploy-release.sh ├── docker-compose.yml └── change-scala-version.sh ├── .github ├── dependabot.yml ├── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── stale.yml └── workflows │ ├── test.yml │ ├── snapshot.yml │ └── release.yml ├── .gitignore ├── CONTRIBUTING.md └── CODE_OF_CONDUCT.md /docs/Conf.md: -------------------------------------------------------------------------------- 1 | ## Definition -------------------------------------------------------------------------------- /docs/ConfigLoader.md: -------------------------------------------------------------------------------- 1 | ## Definition -------------------------------------------------------------------------------- /docs/Deliverable.md: -------------------------------------------------------------------------------- 1 | ## Definition -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /docs/Logging.md: -------------------------------------------------------------------------------- 1 | ## Definition 2 | Logging module -------------------------------------------------------------------------------- /src/test/resources/myconf.conf: -------------------------------------------------------------------------------- 1 | my_test_variable = "haha" -------------------------------------------------------------------------------- 
/src/test/resources/test_base_path.csv: -------------------------------------------------------------------------------- 1 | col1,col2 2 | a,b 3 | c,d -------------------------------------------------------------------------------- /src/test/resources/test-archiver/test-input-file.txt: -------------------------------------------------------------------------------- 1 | Hello, world! -------------------------------------------------------------------------------- /src/test/resources/test-list-files/file1.csv: -------------------------------------------------------------------------------- 1 | col1, col2 2 | "D", "d" 3 | -------------------------------------------------------------------------------- /src/test/resources/test-list-files/subdir2/file2-1.csv: -------------------------------------------------------------------------------- 1 | col1, col2 2 | "D", "d" 3 | -------------------------------------------------------------------------------- /src/test/resources/test-list-files2/col3=cc/file2-1.csv: -------------------------------------------------------------------------------- 1 | col1, col2 2 | "D", "d" 3 | -------------------------------------------------------------------------------- /src/test/resources/test-list-files2/col3=c/file1-1-1.csv: -------------------------------------------------------------------------------- 1 | col1, col2 2 | "A", "a" 3 | -------------------------------------------------------------------------------- /src/test/resources/test-list-files2/col3=c/file1-2-1.csv: -------------------------------------------------------------------------------- 1 | col1, col2 2 | "B", "b" 3 | -------------------------------------------------------------------------------- /src/test/resources/test-list-files2/col3=c/file1-2-2.csv: -------------------------------------------------------------------------------- 1 | col1, col2 2 | "C", "c" 3 | -------------------------------------------------------------------------------- /src/test/resources/test-archiver/test-input/col3=c/file1-1-1.csv: -------------------------------------------------------------------------------- 1 | col1,col2 2 | "A","a" 3 | -------------------------------------------------------------------------------- /src/test/resources/test-archiver/test-input/col3=c/file1-2-1.csv: -------------------------------------------------------------------------------- 1 | col1,col2 2 | "B","b" 3 | -------------------------------------------------------------------------------- /src/test/resources/test-archiver/test-input/col3=c/file1-2-2.csv: -------------------------------------------------------------------------------- 1 | col1,col2 2 | "C","c" 3 | -------------------------------------------------------------------------------- /src/test/resources/test-archiver/test-input/col3=cc/file2-1.csv: -------------------------------------------------------------------------------- 1 | col1,col2 2 | "D","d" 3 | -------------------------------------------------------------------------------- /src/test/resources/streaming_test_resources/input2/input2.csv: -------------------------------------------------------------------------------- 1 | text 2 | "hello" 3 | "world" 4 | -------------------------------------------------------------------------------- /src/test/resources/test-list-files/subdir1/subsubdir1/file1-1-1.csv: -------------------------------------------------------------------------------- 1 | col1, col2 2 | "A", "a" 3 | -------------------------------------------------------------------------------- 
/src/test/resources/test-list-files/subdir1/subsubdir2/file1-2-1.csv: -------------------------------------------------------------------------------- 1 | col1, col2 2 | "B", "b" 3 | -------------------------------------------------------------------------------- /src/test/resources/test-list-files/subdir1/subsubdir2/file1-2-2.csv: -------------------------------------------------------------------------------- 1 | col1, col2 2 | "C", "c" 3 | -------------------------------------------------------------------------------- /docs/img/logo_setl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SETL-Framework/setl/HEAD/docs/img/logo_setl.png -------------------------------------------------------------------------------- /src/test/resources/test-list-files/subdir1/subsubdir1/wrongfile1-1-1.csv: -------------------------------------------------------------------------------- 1 | col1, col2 2 | "A", "a" 3 | -------------------------------------------------------------------------------- /src/test/resources/test_schema_converter.csv: -------------------------------------------------------------------------------- 1 | col1,col2,col3,col4 2 | 1,"1","A","a" 3 | 2,"2","B","b" -------------------------------------------------------------------------------- /docs/img/logo_setl_1280_640.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SETL-Framework/setl/HEAD/docs/img/logo_setl_1280_640.png -------------------------------------------------------------------------------- /docs/img/old_logo/logo_setl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SETL-Framework/setl/HEAD/docs/img/old_logo/logo_setl.png -------------------------------------------------------------------------------- /docs/img/old_logo/logo_setl_1280_640.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SETL-Framework/setl/HEAD/docs/img/old_logo/logo_setl_1280_640.png -------------------------------------------------------------------------------- /src/test/resources/test_connector_builder.conf: -------------------------------------------------------------------------------- 1 | customConnector { 2 | storage = "OTHER" 3 | class = "io.github.setl.CustomConnector" 4 | } 5 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/enums/PathFormat.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.enums; 2 | 3 | public enum PathFormat { 4 | WILDCARD, 5 | REGEX; 6 | } 7 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/HasReaderWriter.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | trait HasReaderWriter extends HasReader with HasWriter { Connector => 4 | 5 | } 6 | -------------------------------------------------------------------------------- /src/test/resources/dynamodb.conf: -------------------------------------------------------------------------------- 1 | dynamodb { 2 | connector { 3 | storage = "DYNAMODB" 4 | region = "eu-west-1" 5 | table = "test-table" 6 | saveMode = "Overwrite" 7 | } 8 | } 9 | -------------------------------------------------------------------------------- 
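The `dynamodb.conf` fixture above uses the same HOCON layout as the other connector configurations under `src/test/resources`. A minimal sketch of inspecting such a resource with plain Typesafe Config (only `ConfigFactory` and `Config` from `com.typesafe.config` are used here; no SETL-specific builder API is assumed):

```scala
import com.typesafe.config.{Config, ConfigFactory}

// Parse the classpath resource and read the connector block.
val conf: Config = ConfigFactory.parseResources("dynamodb.conf")
val connector: Config = conf.getConfig("dynamodb.connector")

println(connector.getString("storage"))   // DYNAMODB
println(connector.getString("region"))    // eu-west-1
println(connector.getString("table"))     // test-table
println(connector.getString("saveMode"))  // Overwrite
```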
/docs/Architecture.md: -------------------------------------------------------------------------------- 1 |  2 | 3 |  4 | 5 |  -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/HasReader.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import org.apache.spark.sql.DataFrameReader 4 | 5 | trait HasReader { Connector => 6 | 7 | protected val reader: DataFrameReader 8 | 9 | } 10 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/storage/connector/StreamingConnector.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage.connector 2 | 3 | import io.github.setl.internal.CanWait 4 | 5 | abstract class StreamingConnector extends Connector 6 | with CanWait { 7 | 8 | } 9 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/config/ConnectorConf.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.config 2 | 3 | abstract class ConnectorConf extends Conf { 4 | 5 | def getReaderConf: Map[String, String] 6 | 7 | def getWriterConf: Map[String, String] 8 | 9 | } 10 | -------------------------------------------------------------------------------- /dev/test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | export AWS_ACCESS_KEY_ID="fakeAccess" 6 | export AWS_SECRET_ACCESS_KEY="fakeSecret" 7 | export AWS_REGION="eu-west-1" 8 | 9 | mvn -B -ntp clean:clean scoverage:report -P snapshot,spark_${SPARK_VER} 10 | -------------------------------------------------------------------------------- /dev/deploy-snapshot.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | echo ${MVN_SETTINGS} | base64 -d > ${HOME}/.m2/settings.xml 6 | echo ${MVN_SECURITY} | base64 -d > ${HOME}/.m2/settings-security.xml 7 | 8 | mvn clean deploy scala:doc -ntp -B -DskipTests -P snapshot,spark_${SPARK_VER} 9 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/HasWriter.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import org.apache.spark.sql.{DataFrame, DataFrameWriter, Row} 4 | 5 | trait HasWriter { Connector => 6 | 7 | protected val writer: DataFrame => DataFrameWriter[Row] 8 | 9 | } 10 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/transformation/AbstractFactory.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.transformation 2 | 3 | trait AbstractFactory[A] { 4 | 5 | def read(): this.type 6 | 7 | def process(): this.type 8 | 9 | def write(): this.type 10 | 11 | def get(): A 12 | 13 | } 14 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/workflow/PipelineOptimizer.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.workflow 2 | 3 | trait PipelineOptimizer { 4 | 5 | def setExecutionPlan(dag: DAG): this.type 6 | 7 | def optimize(stages: Iterable[Stage]): Array[Stage] 8 | 9 | 
def getOptimizedExecutionPlan: DAG 10 | 11 | } 12 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/Configurable.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | 5 | @InterfaceStability.Evolving 6 | trait Configurable { 7 | 8 | def set(key: String, value: String): this.type 9 | 10 | def get(key: String): Option[String] 11 | 12 | } 13 | -------------------------------------------------------------------------------- /docs/Condition.md: -------------------------------------------------------------------------------- 1 | ## Definition 2 | 3 | **Condition** is used by the `findBy` method of a **Repository** 4 | 5 | ```scala 6 | val cond = Set( 7 | Condition("column1", ">", 100), 8 | Condition("column2", "=", "value2") 9 | ) 10 | 11 | myRepository.findBy(cond) 12 | ``` 13 | 14 | ## Operation 15 | - `>` 16 | - `<` 17 | - `>=` 18 | - `<=` 19 | - `=` -------------------------------------------------------------------------------- /dev/deploy-release.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | echo ${MVN_SETTINGS} | base64 -d > ${HOME}/.m2/settings.xml 6 | echo ${MVN_SECURITY} | base64 -d > ${HOME}/.m2/settings-security.xml 7 | echo ${GPG_KEY} | base64 -d | gpg --import --batch > /dev/null 2>&1 8 | 9 | mvn clean deploy scala:doc -ntp -B -DskipTests -P release,spark_${SPARK_VER} 10 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/annotation/ColumnName.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.annotation 2 | 3 | import scala.annotation.StaticAnnotation 4 | 5 | /** 6 | * Define an alias for the current field in the table 7 | * 8 | * @param name alias of the current field name 9 | */ 10 | @InterfaceStability.Stable 11 | final case class ColumnName(name: String) extends StaticAnnotation 12 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/HasType.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | 5 | import scala.reflect.runtime 6 | 7 | /** 8 | * HasType should be used on classed having a payload 9 | */ 10 | @InterfaceStability.Evolving 11 | trait HasType { 12 | 13 | val runtimeType: runtime.universe.Type 14 | 15 | } 16 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/CanDrop.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.storage.connector.Connector 4 | 5 | /** 6 | * Connectors that inherit CanDrop should be able to drop the entire data table 7 | */ 8 | trait CanDrop { 9 | self: Connector => 10 | 11 | /** 12 | * Drop the entire table. 
13 | */ 14 | def drop(): Unit 15 | 16 | } 17 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/storage/connector/DBConnector.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage.connector 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | import io.github.setl.internal.{CanCreate, CanDelete, CanDrop} 5 | 6 | @InterfaceStability.Evolving 7 | abstract class DBConnector extends Connector 8 | with CanCreate 9 | with CanDrop 10 | with CanDelete { 11 | 12 | } 13 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/util/ExpectedDeliverable.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.util 2 | 3 | import io.github.setl.transformation.Factory 4 | 5 | case class ExpectedDeliverable(deliverableType: String, 6 | deliveryId: String, 7 | producer: Class[_], 8 | consumer: Class[_ <: Factory[_]]) { 9 | 10 | } 11 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/workflow/package.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl 2 | 3 | package object workflow { 4 | 5 | case class Product1(x: String) 6 | 7 | case class Product2(x: String, y: String) 8 | 9 | case class Product(x: String) 10 | 11 | case class Product23(x: String) 12 | 13 | case class Container[T](content: T) 14 | 15 | case class Container2[T](content: T) 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/storage/connector/ACIDConnector.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage.connector 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | import io.github.setl.internal.{CanDelete, CanDrop, CanUpdate, CanVacuum} 5 | 6 | @InterfaceStability.Evolving 7 | abstract class ACIDConnector extends Connector 8 | with CanUpdate 9 | with CanDrop 10 | with CanDelete 11 | with CanVacuum { 12 | 13 | } 14 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/HasDescription.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | import io.github.setl.util.ReflectUtils 5 | 6 | 7 | @InterfaceStability.Evolving 8 | trait HasDescription { 9 | 10 | def getPrettyName: String = ReflectUtils.getPrettyName(this.getClass) 11 | 12 | /** Describe the current class */ 13 | def describe(): this.type 14 | 15 | } 16 | -------------------------------------------------------------------------------- /docs/vocabulary.md: -------------------------------------------------------------------------------- 1 | #### Data access layer 2 | Data access layer is a layer of a computer program which provides simplified access (saving and retrieving) data to data stored in persistent storage. 3 | 4 | #### Business logic layer 5 | Business logic layer contains code which works with the data, processing it according to the rules of the business logic. 6 | 7 | #### Persistence storage 8 | A storage of data, *e.g* a database, a distributed filesystem, etc. 
9 | 10 | 11 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/enums/ValueType.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.enums; 2 | 3 | public enum ValueType { 4 | STRING("string"), 5 | DATETIME("timestamp"), 6 | DATE("date"), 7 | NUMBER("number"), 8 | SET("set"), 9 | COLUMN("column"); 10 | 11 | private final String value; 12 | 13 | ValueType(String value) { 14 | this.value = value; 15 | } 16 | 17 | public String value() { 18 | return value; 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/util/ReflectUtils.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.util 2 | 3 | import scala.reflect.runtime 4 | 5 | object ReflectUtils { 6 | 7 | def getPrettyName(tpe: runtime.universe.Type): String = tpe.toString.split("\\[").map(getPrettyName).mkString("[") 8 | 9 | def getPrettyName(cls: Class[_]): String = getPrettyName(cls.getCanonicalName) 10 | 11 | def getPrettyName(canonicalName: String): String = canonicalName.split("\\.").last 12 | 13 | } 14 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/CanDelete.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.storage.connector.Connector 4 | 5 | /** 6 | * Connectors that inherit CanDelete should be able to delete records for a given query string 7 | */ 8 | trait CanDelete { 9 | self: Connector => 10 | 11 | /** 12 | * Delete rows according to the query 13 | * 14 | * @param query a query string 15 | */ 16 | def delete(query: String): Unit 17 | 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/workflow/External.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.workflow 2 | 3 | import java.util.UUID 4 | 5 | import io.github.setl.transformation.Factory 6 | 7 | sealed abstract class External private extends Factory[External] 8 | 9 | /** 10 | * Singleton for external data source 11 | */ 12 | object External { 13 | val NODE: Node = Node( 14 | classOf[External], 15 | UUID.fromString("00000000-0000-0000-0000-000000000000"), 16 | -1, 17 | List(), 18 | null 19 | ) 20 | } 21 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/exception/ConfException.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.exception; 2 | 3 | public class ConfException extends BaseException { 4 | 5 | public ConfException(String errorMessage) { 6 | super(errorMessage); 7 | } 8 | 9 | public static class Format extends ConfException { 10 | /** 11 | * @param errorMessage error message 12 | */ 13 | public Format(String errorMessage) { 14 | super(errorMessage); 15 | } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/exception/AlreadyExistsException.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.exception; 2 | 3 | public class AlreadyExistsException extends BaseException { 4 | public AlreadyExistsException() { 5 | } 6 | 7 | public 
AlreadyExistsException(String message) { 8 | super(message); 9 | } 10 | 11 | public AlreadyExistsException(String message, Throwable cause) { 12 | super(message, cause); 13 | } 14 | 15 | public AlreadyExistsException(Throwable cause) { 16 | super(cause); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/Identifiable.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import java.util.UUID 4 | 5 | import io.github.setl.annotation.InterfaceStability 6 | 7 | /** 8 | * Identifiable generates an UUID for any object that implement the trait 9 | */ 10 | @InterfaceStability.Evolving 11 | trait Identifiable { 12 | 13 | private[this] val _uuid: UUID = UUID.randomUUID 14 | 15 | private[this] val _name: String = getClass.getCanonicalName 16 | 17 | def getUUID: UUID = _uuid 18 | 19 | def getCanonicalName: String = _name 20 | 21 | } 22 | -------------------------------------------------------------------------------- /dev/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.2' 2 | services: 3 | psql: 4 | image: "postgres" 5 | container_name: "postgres-unit-test" 6 | environment: 7 | - POSTGRES_USER=postgres 8 | - POSTGRES_PASSWORD=postgres 9 | - POSTGRES_DB=framework_dev 10 | ports: 11 | - "5432:5432" 12 | 13 | cassandra: 14 | image: "cassandra" 15 | container_name: "cassandra-unit-test" 16 | ports: 17 | - "9042:9042" 18 | 19 | dynamodb: 20 | image: "amazon/dynamodb-local" 21 | container_name: "dynamodb-unit-test" 22 | ports: 23 | - "8000:8000" 24 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/Builder.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | import io.github.setl.internal.Logging 5 | 6 | /** 7 | * Builder could be used to build or initialize objects 8 | * 9 | * @tparam A the type of object that the builder is supposed to produce 10 | */ 11 | @InterfaceStability.Evolving 12 | trait Builder[+A] extends Logging { 13 | 14 | /** 15 | * Build an object 16 | * 17 | * @return 18 | */ 19 | def build(): this.type 20 | 21 | def get(): A 22 | 23 | def getOrCreate(): A = this.build().get() 24 | } 25 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options: 4 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "maven" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "daily" 12 | ignore: 13 | - dependency-name: "scala*" 14 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/annotation/Experimental.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.annotation; 2 | 3 | import java.lang.annotation.ElementType; 4 | import java.lang.annotation.Retention; 5 | import java.lang.annotation.RetentionPolicy; 6 | import java.lang.annotation.Target; 7 | 8 | /** 9 | * The Experimental annotation indicate that the annotated class/method/field is supposed to be an experimental feature, 10 | * thus the stability can't be guaranteed. 11 | */ 12 | @Retention(RetentionPolicy.CLASS) 13 | @Target({ElementType.FIELD, ElementType.METHOD, ElementType.TYPE}) 14 | public @interface Experimental { 15 | } 16 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/storage/DatasetConverter.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage 2 | 3 | import io.github.setl.Converter 4 | import io.github.setl.annotation.InterfaceStability 5 | import org.apache.spark.sql.Dataset 6 | 7 | /** 8 | * DatasetConverter inherits from a Converter. It can convert between two Dataset: Dataset[A] and Dataset[B] 9 | * 10 | * @tparam A Type of Dataset[A] 11 | * @tparam B Type of Dataset[B] 12 | */ 13 | @InterfaceStability.Evolving 14 | abstract class DatasetConverter[A, B] extends Converter { 15 | 16 | override type T1 = Dataset[A] 17 | override type T2 = Dataset[B] 18 | 19 | } 20 | -------------------------------------------------------------------------------- /docs/StructAnalyser.md: -------------------------------------------------------------------------------- 1 | ## Definition 2 | 3 | **StructAnalyser** provides functionalities to retrieve annotation information from a class. 4 | 5 | It scans the class' metadata and returns a **StructType** so that the **SchemaConverter** could use to transform the schema of a DataFrame/Dataset. 
6 | 7 | You can access the metadata of your class by getting the metadata of **StructField** 8 | 9 | ### Demo 10 | 11 | ```scala 12 | case class MyClass(col1: String, @ColumnName("column_2") col2: String) 13 | 14 | // analyseSchema will return a StructType of MyClass 15 | val structType = StructAnalyser.analyseSchema[MyClass] 16 | ``` 17 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/storage/SnappyCompressor.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage; 2 | 3 | import org.xerial.snappy.Snappy; 4 | 5 | import java.io.IOException; 6 | import java.nio.charset.StandardCharsets; 7 | 8 | public class SnappyCompressor implements Compressor { 9 | 10 | @Override 11 | public byte[] compress(String input) throws IOException { 12 | return Snappy.compress(input, StandardCharsets.UTF_8); 13 | } 14 | 15 | @Override 16 | public String decompress(byte[] bytes) throws IOException { 17 | return Snappy.uncompressString(bytes, StandardCharsets.UTF_8); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/BenchmarkResult.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl 2 | 3 | case class BenchmarkResult(cls: String, read: Double, process: Double, write: Double, get: Double, total: Double) { 4 | 5 | override def toString: String = { 6 | 7 | val formatter = java.text.NumberFormat.getNumberInstance 8 | 9 | s"Benchmark class: $cls\n" + 10 | s"Total elapsed time: ${formatter.format(total)} s\n" + 11 | s"read: ${formatter.format(read)} s\n" + 12 | s"process: ${formatter.format(process)} s\n" + 13 | s"write: ${formatter.format(write)} s\n" + 14 | "=================" 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/transformation/Transformer.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.transformation 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | import io.github.setl.internal.{Identifiable, Logging} 5 | 6 | /** 7 | * A transformer can transform data into a type A 8 | * 9 | * @tparam T : Type of output data 10 | */ 11 | @InterfaceStability.Evolving 12 | trait Transformer[T] extends Logging with Identifiable { 13 | 14 | /** 15 | * Get the transformed data 16 | * 17 | * @return 18 | */ 19 | def transformed: T 20 | 21 | /** 22 | * Transform the current data 23 | */ 24 | def transform(): this.type 25 | } 26 | -------------------------------------------------------------------------------- /docs/SparkSessionBuilder.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | The class `SparkSessionBuilder` is used to configure and build new spark session for the given usage(s). 
4 | 5 | ## Code Example 6 | 7 | ```scala 8 | import com.jcdecaux.datacorp.spark.SparkSessionBuilder 9 | 10 | // Auto-configure 11 | val spark1: SparkSession = new SparkSessionBuilder("cassandra") 12 | .setAppName("myApp") 13 | .setEnv("dev") // or AppEnv.DEV 14 | .setCassandraHost("localhost") 15 | .build() 16 | .get() 17 | 18 | // Build with your own SparkConf 19 | val spark2: SparkSession = new SparkSessionBuilder() 20 | .configure(yourSparkConf) 21 | .build() 22 | .get() 23 | 24 | ``` 25 | 26 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/Writable.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | /** 4 | * Indicate that users can activate or deactivate the write of the class 5 | */ 6 | trait Writable { 7 | 8 | protected var _write: Boolean = true 9 | 10 | /** 11 | * Whether invoke the write method or not 12 | * 13 | * @param write if set to true, then the write method of the factory will be invoked 14 | * @return 15 | */ 16 | def writable(write: Boolean): this.type = { 17 | this._write = write 18 | this 19 | } 20 | 21 | /** Return true if the write method will be invoked by the pipeline */ 22 | def writable: Boolean = this._write 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/storage/Compressor.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage 2 | 3 | import java.io.IOException 4 | 5 | import io.github.setl.annotation.InterfaceStability 6 | 7 | /** 8 | * A Compressor is able to compress an input string into a byte array and vice versa. 9 | */ 10 | @InterfaceStability.Evolving 11 | trait Compressor extends Serializable { 12 | 13 | /** 14 | * Compress an input string into a byte array 15 | */ 16 | @throws[IOException] 17 | def compress(input: String): Array[Byte] 18 | 19 | /** 20 | * Decompress a byte array into an input string 21 | */ 22 | @throws[IOException] 23 | def decompress(bytes: Array[Byte]): String 24 | 25 | } 26 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/exception/BaseException.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.exception; 2 | 3 | public class BaseException extends RuntimeException { 4 | 5 | public BaseException() { 6 | } 7 | 8 | public BaseException(String message) { 9 | super(message); 10 | } 11 | 12 | public BaseException(String message, Throwable cause) { 13 | super(message, cause); 14 | } 15 | 16 | public BaseException(Throwable cause) { 17 | super(cause); 18 | } 19 | 20 | public BaseException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { 21 | super(message, cause, enableSuppression, writableStackTrace); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /docs/SchemaConverter.md: -------------------------------------------------------------------------------- 1 | ## Definition 2 | 3 | **SchemaConverter** can: 4 | - Convert a Dataset[A] to a DataFrame with the metadata of class **A** (extracted by **StructAnalyser**) 5 | - Convert a DataFrame to a Dataset[A] 6 | 7 | For each of the three annotations: ColumnName, CompoundKey and Compress, SchemaConverter will 8 | - rename the column 9 | - create/drop the compound key column(s) 10 
| - compress/decompress the column(s) having Compress annotation. 11 | 12 | ## Demo 13 | 14 | ### Dataset to DataFrame 15 | ```scala 16 | val ds: Dataset[MyClass] = ... 17 | SchemaConverter.toDF(ds) 18 | ``` 19 | 20 | ### DataFrame to Dataset 21 | ```scala 22 | val df: DataFrame = ... 23 | SchemaConverter.fromDF[MyClass](df) 24 | ``` -------------------------------------------------------------------------------- /src/test/resources/test_priority.conf: -------------------------------------------------------------------------------- 1 | my.value = "haha" 2 | 3 | setl.config { 4 | spark { 5 | spark.master = "local" 6 | spark.app.name = "my_app_2" 7 | spark.sql.shuffle.partitions = "1000" 8 | } 9 | } 10 | 11 | setl.config_2 { 12 | spark { 13 | spark.master = "local" 14 | spark.app.name = "my_app_context_2" 15 | spark.sql.shuffle.partitions = "2000" 16 | } 17 | } 18 | 19 | test { 20 | string = "abc" 21 | int = 1 22 | long = 2 23 | float = 3.1 24 | float2 = "3.1" 25 | double = 4.4 26 | boolean = false 27 | boolean2 = "true" 28 | list = [1,2,3] 29 | listFloat = [1.2,2,3] 30 | listString = ["1.2","2","3"] 31 | 32 | map { 33 | v1 = "a" 34 | v2 = "b" 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "Feature request title" 5 | labels: feature 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/exception/ConnectorException.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.exception; 2 | 3 | public class ConnectorException extends BaseException { 4 | public ConnectorException() { 5 | } 6 | 7 | public ConnectorException(String message) { 8 | super(message); 9 | } 10 | 11 | public ConnectorException(String message, Throwable cause) { 12 | super(message, cause); 13 | } 14 | 15 | public ConnectorException(Throwable cause) { 16 | super(cause); 17 | } 18 | 19 | public ConnectorException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { 20 | super(message, cause, enableSuppression, writableStackTrace); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "Issue title" 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 16 | **Expected behavior** 17 | A clear and concise description of what you expected to happen. 
18 | 19 | **Screenshots** 20 | If applicable, add screenshots to help explain your problem. 21 | 22 | **Environment (please complete the following information):** 23 | - OS: [e.g. iOS] 24 | - Version [e.g. 22] 25 | - Dependencies: 26 | 27 | **Additional context** 28 | Add any other context about the problem here. 29 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/exception/RepositoryException.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.exception; 2 | 3 | public class RepositoryException extends BaseException { 4 | public RepositoryException() { 5 | } 6 | 7 | public RepositoryException(String message) { 8 | super(message); 9 | } 10 | 11 | public RepositoryException(String message, Throwable cause) { 12 | super(message, cause); 13 | } 14 | 15 | public RepositoryException(Throwable cause) { 16 | super(cause); 17 | } 18 | 19 | public RepositoryException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { 20 | super(message, cause, enableSuppression, writableStackTrace); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/Converter.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | 5 | /** 6 | * A converter should be able to convert between two types T1 and T2. 7 | */ 8 | @InterfaceStability.Evolving 9 | trait Converter { 10 | type T1 11 | type T2 12 | 13 | /** 14 | * Convert from an object of type T2 to an object of type T1 15 | * 16 | * @param t2 object of type T2 17 | * @return an object of type T1 18 | */ 19 | def convertFrom(t2: T2): T1 20 | 21 | /** 22 | * Convert an object of type T1 to an object of type T2 23 | * 24 | * @param t1 object of type T1 to be convert to T2 25 | * @return an object of type T2 26 | */ 27 | def convertTo(t1: T1): T2 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/CanUpdate.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.storage.connector.Connector 4 | import org.apache.spark.sql.DataFrame 5 | 6 | /** 7 | * Connectors that inherit CanUpdate should be able to update the data store with a new data frame and a given matching 8 | * columns. 9 | */ 10 | trait CanUpdate { 11 | self: Connector => 12 | 13 | /** 14 | * Update the data store with a new data frame and the given matching columns. 
15 | * 16 | * All the matched data will be updated, the non-matched data will be inserted 17 | * 18 | * @param df new data 19 | * @param columns other columns to be matched 20 | */ 21 | def update(df: DataFrame, columns: String*): Unit 22 | 23 | } 24 | -------------------------------------------------------------------------------- /.github/stale.yml: -------------------------------------------------------------------------------- 1 | # Number of days of inactivity before an issue becomes stale 2 | daysUntilStale: 60 3 | # Number of days of inactivity before a stale issue is closed 4 | daysUntilClose: 7 5 | # Issues with these labels will never be considered stale 6 | exemptLabels: 7 | - pinned 8 | - security 9 | # Label to use when marking an issue as stale 10 | staleLabel: stale 11 | # Comment to post when marking an issue as stale. Set to `false` to disable 12 | markComment: > 13 | This issue has been automatically marked as stale because it has not had 14 | recent activity. It will be closed if no further activity occurs. Thank you 15 | for your contributions. 16 | # Comment to post when closing a stale issue. Set to `false` to disable 17 | closeComment: false 18 | -------------------------------------------------------------------------------- /docs/utils/Compressor_Archiver.md: -------------------------------------------------------------------------------- 1 | # Compressor 2 | 3 | A [compressor](https://github.com/SETL-Developers/setl/blob/master/src/main/scala/com/jcdecaux/setl/storage/Compressor.scala) 4 | can: 5 | - compress a string to a byte array 6 | - decompress a byte array to a string 7 | 8 | ## Example: 9 | 10 | ```scala 11 | import io.github.setl.storage.GZIPCompressor 12 | 13 | val compressor = new GZIPCompressor() 14 | 15 | val compressed = compressor.compress("data to be compressed") 16 | val data = compressor.decompress(compressed) 17 | ``` 18 | 19 | # Archiver 20 | 21 | An [Archiver](https://github.com/SETL-Developers/setl/blob/master/src/main/scala/com/jcdecaux/setl/storage/Archiver.scala) can 22 | package files and directories into a single data archive file. 
23 | 24 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/exception/InvalidSchemaException.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.exception; 2 | 3 | public class InvalidSchemaException extends BaseException { 4 | public InvalidSchemaException() { 5 | } 6 | 7 | public InvalidSchemaException(String message) { 8 | super(message); 9 | } 10 | 11 | public InvalidSchemaException(String message, Throwable cause) { 12 | super(message, cause); 13 | } 14 | 15 | public InvalidSchemaException(Throwable cause) { 16 | super(cause); 17 | } 18 | 19 | public InvalidSchemaException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { 20 | super(message, cause, enableSuppression, writableStackTrace); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/exception/InvalidConnectorException.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.exception; 2 | 3 | public class InvalidConnectorException extends BaseException { 4 | public InvalidConnectorException() { 5 | } 6 | 7 | public InvalidConnectorException(String message) { 8 | super(message); 9 | } 10 | 11 | public InvalidConnectorException(String message, Throwable cause) { 12 | super(message, cause); 13 | } 14 | 15 | public InvalidConnectorException(Throwable cause) { 16 | super(cause); 17 | } 18 | 19 | public InvalidConnectorException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { 20 | super(message, cause, enableSuppression, writableStackTrace); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/exception/InvalidDeliveryException.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.exception; 2 | 3 | public class InvalidDeliveryException extends BaseException { 4 | 5 | public InvalidDeliveryException() { 6 | } 7 | 8 | public InvalidDeliveryException(String message) { 9 | super(message); 10 | } 11 | 12 | public InvalidDeliveryException(String message, Throwable cause) { 13 | super(message, cause); 14 | } 15 | 16 | public InvalidDeliveryException(Throwable cause) { 17 | super(cause); 18 | } 19 | 20 | public InvalidDeliveryException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { 21 | super(message, cause, enableSuppression, writableStackTrace); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/util/HasSparkSession.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.util 2 | 3 | import org.apache.spark.SparkException 4 | import org.apache.spark.sql.SparkSession 5 | 6 | trait HasSparkSession { 7 | 8 | val spark: SparkSession = SparkSession.getActiveSession match { 9 | case Some(ss) => ss 10 | case _ => throw new SparkException("No active Spark session") 11 | } 12 | 13 | def setJobDescription(desc: String): Unit = spark.sparkContext.setJobDescription(desc) 14 | 15 | def setJobGroup(group: String): Unit = spark.sparkContext.setJobGroup(group, null) 16 | 17 | def setJobGroup(group: String, description: String): Unit = spark.sparkContext.setJobGroup(group, description) 18 | 19 | def 
clearJobGroup(): Unit = spark.sparkContext.clearJobGroup() 20 | 21 | } 22 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/config/DeltaConnectorConfSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.config 2 | 3 | import org.apache.spark.sql.SaveMode 4 | import org.scalatest.funsuite.AnyFunSuite 5 | 6 | class DeltaConnectorConfSuite extends AnyFunSuite { 7 | 8 | val conf = new DeltaConnectorConf() 9 | 10 | test("Set DeltaConnectorConf") { 11 | assert(conf.get("path") === None) 12 | assert(conf.get("saveMode") === None) 13 | conf.setPath("./path") 14 | conf.setSaveMode(SaveMode.Overwrite) 15 | 16 | assert(conf.get("path").get === "./path") 17 | assert(conf.get("saveMode").get === "Overwrite") 18 | } 19 | 20 | test("Getters of DynamoDBConnectorConf") { 21 | assert(conf.getPath === "./path") 22 | assert(conf.getSaveMode === SaveMode.Overwrite) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/storage/connector/ConnectorSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage.connector 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.sql.SparkSession 5 | import org.scalatest.BeforeAndAfterAll 6 | import org.scalatest.funsuite.AnyFunSuite 7 | 8 | class ConnectorSuite extends AnyFunSuite with BeforeAndAfterAll { 9 | 10 | test("Connector object") { 11 | val spark: SparkSession = SparkSession.builder().config(new SparkConf()).master("local[*]").getOrCreate() 12 | 13 | val df = spark.emptyDataFrame 14 | 15 | assert(Connector.empty.spark === null) 16 | assert(Connector.empty.storage === null) 17 | assert(Connector.empty.read() === null) 18 | Connector.empty.write(df) 19 | Connector.empty.write(df, Some("suffix")) 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/config/DynamoDBConnectorConfSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.config 2 | 3 | import org.scalatest.funsuite.AnyFunSuite 4 | 5 | class DynamoDBConnectorConfSuite extends AnyFunSuite { 6 | 7 | val conf = new DynamoDBConnectorConf() 8 | 9 | test("Set DynamoDBConnectorConf") { 10 | assert(conf.get("table") === None) 11 | assert(conf.get("readPartitions") === None) 12 | conf.setTable("realTable") 13 | conf.setReadPartitions("realReadPartitions") 14 | 15 | assert(conf.get("table").get === "realTable") 16 | assert(conf.get("readPartitions").get === "realReadPartitions") 17 | } 18 | 19 | test("Getters of DynamoDBConnectorConf") { 20 | assert(conf.getTable === Some("realTable")) 21 | assert(conf.getReadPartitions === Some("realReadPartitions")) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/test/resources/test-json.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "col1": "a", 4 | "col2": 1, 5 | "col3": { 6 | "col3-1": "haha", 7 | "col3-2": "hehe" 8 | }, 9 | "col4": true, 10 | "col5": 1.1 11 | }, 12 | { 13 | "col1": "b", 14 | "col2": 2, 15 | "col3": { 16 | "col3-1": "hahahaha", 17 | "col3-2": "hehehehe" 18 | }, 19 | "col4": true, 20 | "col5": 1.2 21 | }, 22 | { 23 | "col1": "c", 24 | "col2": 3, 25 | "col3": { 26 | "col3-1": "hahahahahaha", 27 | "col3-2": "hehehehehehe" 
28 | }, 29 | "col4": false, 30 | "col5": 1.3 31 | }, 32 | { 33 | "col1": "d", 34 | "col2": 4, 35 | "col3": { 36 | "col3-1": "hahahahahahahaha", 37 | "col3-2": "hehehehehehehehe" 38 | }, 39 | "col4": false, 40 | "col5": 1.4 41 | } 42 | ] -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/SparkTestUtils.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl 2 | 3 | import io.github.setl.util.SparkUtils 4 | import org.apache.spark.SparkContext 5 | 6 | private[setl] object SparkTestUtils { 7 | 8 | def getActiveSparkContext: Option[SparkContext] = { 9 | val method = SparkContext.getClass.getDeclaredMethod("getActive") 10 | method.setAccessible(true) 11 | method.invoke(SparkContext).asInstanceOf[Option[SparkContext]] 12 | } 13 | 14 | def checkSparkVersion(requiredVersion: String): Boolean = SparkUtils.checkSparkVersion(requiredVersion) 15 | 16 | def testConsolePrint(test: => Any, expected: String): Boolean = { 17 | val stream = new java.io.ByteArrayOutputStream() 18 | Console.withOut(stream)(test) 19 | val result = stream.toString().trim() 20 | result == expected 21 | } 22 | 23 | } 24 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/annotation/Benchmark.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.annotation; 2 | 3 | import java.lang.annotation.ElementType; 4 | import java.lang.annotation.Retention; 5 | import java.lang.annotation.RetentionPolicy; 6 | import java.lang.annotation.Target; 7 | 8 | 9 | /** 10 | *
The Benchmark annotation should be put on any class of Factory[T] to enable the benchmark process. 11 | * The total elapsed time of the factory will then be recorded.
12 | * 13 | *In addition, user can also put it onto any the "read", "process" or "write" methods that are defined 14 | * in AbstractFactory[T], and the elapsed time of each method will be recorded as well.
15 | */ 16 | @InterfaceStability.Evolving 17 | @Retention(RetentionPolicy.RUNTIME) 18 | @Target({ElementType.METHOD, ElementType.TYPE}) 19 | public @interface Benchmark { 20 | 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/transformation/MLTransformer.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.transformation 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | import org.apache.hadoop.fs.Path 5 | import org.apache.spark.ml.Model 6 | 7 | /** 8 | * A MLTransformer is a basic transformer with a ML model and ML-related functionality. 9 | * 10 | * @tparam T Data type of the transformer 11 | */ 12 | @InterfaceStability.Evolving 13 | trait MLTransformer[T, M <: Model[_]] extends Transformer[T] { 14 | 15 | var model: M = _ 16 | val modelPath: Path 17 | var overwriteModel: Boolean = false 18 | 19 | /** Fit a model with the current data */ 20 | def fit(): MLTransformer.this.type 21 | 22 | /** Load a model from a given path */ 23 | def loadModel(): MLTransformer.this.type 24 | 25 | /** Save the current model */ 26 | def saveModel(): MLTransformer.this.type 27 | } 28 | -------------------------------------------------------------------------------- /docs/Stage.md: -------------------------------------------------------------------------------- 1 | ## Definition 2 | A **Stage** is a collection of independent **Factories**. All the stages of a pipeline will be executed sequentially at runtime. Within a stage, all factories could be executed parallelly or sequentially. 3 | 4 | ## Demo 5 | 6 | You could instantiate a stage like the follows: 7 | ```scala 8 | val stage = new Stage() 9 | ``` 10 | 11 | Run in sequential mode: 12 | ```scala 13 | stage.parallel(false) 14 | ``` 15 | 16 | Add a factory into this stage: 17 | ```scala 18 | // Add an already existed instance of factory 19 | val myFactory = new MyFactory() 20 | stage.addFactory(myFactory) 21 | 22 | // Or let the framework handle the instantiation 23 | stage.addFactory(classOf[MyFactory], constructorArguments...) 24 | ``` 25 | 26 | Describe the current stage: 27 | ```scala 28 | stage.describe() 29 | ``` 30 | 31 | Run the current stage: 32 | ```scala 33 | stage.run() 34 | ``` -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/CanCreate.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.storage.connector.Connector 4 | import org.apache.spark.sql.DataFrame 5 | 6 | /** 7 | * Connectors that inherit CanCreate should be able to create a table in a database or a file/folder in a file system 8 | */ 9 | trait CanCreate { 10 | self: Connector => 11 | 12 | /** 13 | * Create a data storage (e.g. table in a database or file/folder in a file system) with a suffix 14 | * 15 | * @param t data frame to be written 16 | * @param suffix suffix to be appended at the end of the data storage name 17 | */ 18 | def create(t: DataFrame, suffix: Option[String]): Unit 19 | 20 | /** 21 | * Create a data storage (e.g. 
table in a database or file/folder in a file system) 22 | * 23 | * @param t data frame to be written 24 | */ 25 | def create(t: DataFrame): Unit 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/HasDiagram.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import scala.reflect.runtime 4 | 5 | trait HasDiagram { 6 | 7 | /** Generate the diagram */ 8 | def toDiagram: String 9 | 10 | /** Get the diagram ID */ 11 | def diagramId: String 12 | 13 | protected def getTypeArgList(tpe: runtime.universe.Type): List[runtime.universe.Symbol] = { 14 | tpe 15 | .baseClasses.head 16 | .asClass 17 | .primaryConstructor 18 | .typeSignature 19 | .paramLists 20 | .head 21 | } 22 | 23 | protected def formatDiagramId(prettyName: String, 24 | deliveryId: String, 25 | suffix: String): String = { 26 | prettyName.replaceAll("[\\[\\]]", "") + deliveryId.capitalize + suffix 27 | } 28 | 29 | /** Display the diagram */ 30 | def showDiagram(): Unit = println(toDiagram) 31 | 32 | } 33 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/factory/FactoryDeliveryMetadataSuite.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.factory 2 | 3 | import io.github.setl.internal.TestClasses.TestFactory 4 | import io.github.setl.transformation.FactoryDeliveryMetadata 5 | import io.github.setl.workflow.External 6 | import org.scalatest.funsuite.AnyFunSuite 7 | 8 | class FactoryDeliveryMetadataSuite extends AnyFunSuite { 9 | 10 | val fac = new TestFactory 11 | 12 | test("Test FactoryDeliveryMetadata Builder") { 13 | 14 | val setters = FactoryDeliveryMetadata.builder().setFactory(fac).getOrCreate() 15 | 16 | setters.foreach(println) 17 | 18 | assert(setters.size === 4) 19 | assert(setters.map(_.factoryUUID).toSet.size === 1) 20 | assert(setters.find(_.name == "inputInt").get.producer === classOf[External]) 21 | assert(setters.find(_.name == "setInputs").get.argTypes.size === 2) 22 | assert(setters.find(_.isDataset.contains(true)).size === 0) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/storage/repository/RepositoryAdapter.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage.repository 2 | 3 | import io.github.setl.Converter 4 | import io.github.setl.annotation.InterfaceStability 5 | import io.github.setl.storage.Condition 6 | 7 | /** 8 | * RepositoryAdapter could be used when one wants to save a `Dataset[A]` to a data store of type `B`. 
9 | * 10 | * A `Repository[A]` and a `DatasetConverter[A, B]` must be provided (either explicitly or implicitly) 11 | * 12 | * @tparam A Type of the Repository 13 | * @tparam B Target data store type 14 | */ 15 | @InterfaceStability.Evolving 16 | trait RepositoryAdapter[A, B] { 17 | 18 | val repository: Repository[A] 19 | 20 | val converter: Converter 21 | 22 | def findAllAndConvert(): A 23 | 24 | def findByAndConvert(conditions: Set[Condition]): A 25 | 26 | def findByAndConvert(condition: Condition): A 27 | 28 | def convertAndSave(data: A, suffix: Option[String]): this.type 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/HasBenchmark.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.BenchmarkResult 4 | 5 | /** 6 | * HasBenchmark should be used for object having an aggregated benchmark. Typically a Pipeline or a Stage 7 | */ 8 | trait HasBenchmark { 9 | 10 | protected var _benchmark: Option[Boolean] = None 11 | 12 | /** 13 | * True if the benchmark will be measured, otherwise false 14 | * 15 | * @return boolean 16 | */ 17 | def benchmark: Option[Boolean] = _benchmark 18 | 19 | /** 20 | * Set to true to enable the benchmarking 21 | * 22 | * @param boo true to enable benchmarking 23 | * @return this object 24 | */ 25 | def benchmark(boo: Boolean): this.type = { 26 | _benchmark = Option(boo) 27 | this 28 | } 29 | 30 | /** 31 | * Get the aggregated benchmark result. 32 | * 33 | * @return an array of BenchmarkResult 34 | */ 35 | def getBenchmarkResult: Array[BenchmarkResult] 36 | 37 | } 38 | -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=warn, stdout 3 | # Captures all logs inside jcdecaux airport package 4 | log4j.logger.com.jcdecaux=DEBUG, stdout 5 | log4j.additivity.com.jcdecaux=false 6 | # Decrease the verbosity of external libraries logging 7 | log4j.logger.org.apache=WARN, stdout 8 | log4j.additivity.org.apache=false 9 | log4j.logger.com.datastax=INFO, stdout 10 | log4j.additivity.com.datastax=false 11 | log4j.logger.io.netty=WARN, stdout 12 | log4j.additivity.io.netty=false 13 | log4j.logger.org.apache.spark.sql=WARN, stdout 14 | log4j.additivity.org.apache.spark.sql=false 15 | log4j.logger.org.apache.spark.core=WARN, stdout 16 | log4j.additivity.org.apache.spark.core=false 17 | # Direct log messages to stdout 18 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 19 | log4j.appender.stdout.Target=System.out 20 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 21 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{3}:%L - %m%n 22 | -------------------------------------------------------------------------------- /docs/Transformer.md: -------------------------------------------------------------------------------- 1 | The notion of the transformer is preliminary. 2 | 3 | # Definition 4 | **Transformer** is the atomic class for data transformation. A `transformer[T]` will transform some input data into an object of type **T**. 5 | 6 | 7 | ## When should I use a transformer 8 | The original idea of the transformer is to decouple a complex data processing procedure of a **Factory**. Generally, a transformer should be placed inside a **Factory**. 
A factory can have multiple transformers. 9 | 10 | A transformer should be simple (in terms of task, for example, transform an object of type A to type B) and stateless (which means it should minimize its dependence on the application context). 11 | 12 | Another use case would be to implement several different data transformation logic for one factory (for example, there may be several different ML models for one single prediction job). In this case, there should be a way to select the most appropriate transformer according to their performance in a specific environment. 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/exception/UnknownException.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.exception; 2 | 3 | /** 4 | * UnknownException 5 | */ 6 | public class UnknownException extends BaseException { 7 | 8 | public UnknownException(String errorMessage) { 9 | super(errorMessage); 10 | } 11 | 12 | public static class Storage extends UnknownException { 13 | public Storage(String errorMessage) { 14 | super(errorMessage); 15 | } 16 | } 17 | 18 | public static class Format extends UnknownException { 19 | public Format(String errorMessage) { 20 | super(errorMessage); 21 | } 22 | } 23 | 24 | public static class Environment extends UnknownException { 25 | public Environment(String errorMessage) { 26 | super(errorMessage); 27 | } 28 | } 29 | 30 | public static class ValueType extends UnknownException { 31 | public ValueType(String errorMessage) { 32 | super(errorMessage); 33 | } 34 | } 35 | } 36 | 37 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/storage/connector/ConnectorInterface.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage.connector 2 | 3 | import io.github.setl.config.Conf 4 | import io.github.setl.enums.Storage 5 | import com.typesafe.config.Config 6 | 7 | /** 8 | * ConnectorInterface provides the abstraction of a pluggable connector that could be used by [[io.github.setl.storage.ConnectorBuilder]]. 9 | * Users can implement their customized data source connector by extending this trait. 10 | */ 11 | trait ConnectorInterface extends Connector { 12 | 13 | /** 14 | * By default, the custom connector's storage type should be OTHER. 15 | */ 16 | override val storage: Storage = Storage.OTHER 17 | 18 | /** 19 | * Configure the connector with the given [[Conf]] 20 | * @param conf an object of [[Conf]] 21 | */ 22 | def setConf(conf: Conf): Unit 23 | 24 | /** 25 | * Configure the connector with the given [[Config]] 26 | * @param config an object of [[Config]] 27 | */ 28 | def setConfig(config: Config): Unit = this.setConf(Conf.fromConfig(config)) 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/transformation/FactoryInput.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.transformation 2 | 3 | import io.github.setl.internal.HasType 4 | 5 | import scala.language.existentials 6 | import scala.reflect.runtime 7 | 8 | /** 9 | * Metadata of an input of a Factory. 10 | * 11 | * If a `FactoryDeliveryMetadata` represents a method, then it may be converted to multiple FactoryInputs as each of its 12 | * arguments will be abstracted as a `FactoryInput`. 
13 | * 14 | * @param runtimeType runtime type of the input 15 | * @param producer producer of the input 16 | * @param deliveryId delivery id of the input 17 | */ 18 | private[setl] case class FactoryInput(override val runtimeType: runtime.universe.Type, 19 | producer: Class[_], 20 | deliveryId: String = Deliverable.DEFAULT_ID, 21 | autoLoad: Boolean, 22 | optional: Boolean, 23 | consumer: Class[_ <: Factory[_]]) extends HasType 24 | -------------------------------------------------------------------------------- /docs/data_access_layer/SparkRepositoryBuilder.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | Based on the same idea of [**ConnectorBuilder**](ConnectorBuilder), [**SparkRepositoryBuilder**](https://github.com/SETL-Developers/setl/tree/master/src/main/scala/com/jcdecaux/setl/storage/SparkRepositoryBuilder.scala) helps you create your **SparkRepository** :ok_hand: 4 | 5 | ## Usage 6 | Firstly, you should create a configuration file in your project's resources directory. 7 | 8 | In this case, let's call it `application.conf`. 9 | 10 | ```text 11 | csvConfiguration { 12 | storage = "CSV" 13 | path = "your/path/to/file.csv" 14 | inferSchema = "true" 15 | delimiter = ";" 16 | header = "true" 17 | saveMode = "Append" 18 | } 19 | ``` 20 | 21 | Then you can use **ConfigLoader** to load your configuration file. By default it loads `application.conf`. 22 | ```scala 23 | val repo = new SparkRepositoryBuilder[MyClass](setl.configLoader.getConfig("csvConfiguration")).getOrCreate() 24 | 25 | repo.findAll() 26 | repo.save(dataset) 27 | ``` 28 | 29 | ## Parameters 30 | Please refer to [Connector documentation](Connector) -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/storage/connector/ParquetConnector.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.storage.connector 2 | 3 | import io.github.setl.annotation.InterfaceStability 4 | import io.github.setl.config.{Conf, FileConnectorConf} 5 | import io.github.setl.enums.Storage 6 | import io.github.setl.util.TypesafeConfigUtils 7 | import com.typesafe.config.Config 8 | import org.apache.spark.sql._ 9 | 10 | /** 11 | * ParquetConnector contains functionality for transforming [[DataFrame]] into parquet files 12 | */ 13 | @InterfaceStability.Evolving 14 | class ParquetConnector(override val options: FileConnectorConf) extends FileConnector(options) { 15 | 16 | def this(options: Map[String, String]) = this(FileConnectorConf.fromMap(options)) 17 | 18 | def this(path: String, saveMode: SaveMode) = this(Map("path" -> path, "saveMode" -> saveMode.toString)) 19 | 20 | def this(config: Config) = this(TypesafeConfigUtils.getMap(config)) 21 | 22 | def this(conf: Conf) = this(conf.toMap) 23 | 24 | override val storage: Storage = Storage.PARQUET 25 | 26 | this.options.setStorage(storage) 27 | } 28 | -------------------------------------------------------------------------------- /src/main/scala/io/github/setl/internal/CanWait.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl.internal 2 | 3 | import io.github.setl.storage.connector.Connector 4 | import io.github.setl.storage.connector.Connector 5 | 6 | /** 7 | * Connectors that inherit CanWait should be able to wait for the execution to stop 8 | */ 9 | trait CanWait { 10 | self: Connector => 11 | 12 | /** 13 | * Wait for the execution to stop. 
Any exceptions that occurs during the execution 14 | * will be thrown in this thread. 15 | */ 16 | def awaitTermination(): Unit 17 | 18 | /** 19 | * Wait for the execution to stop. Any exceptions that occurs during the execution 20 | * will be thrown in this thread. 21 | * 22 | * @param timeout time to wait in milliseconds 23 | * @return `true` if it's stopped; or throw the reported error during the execution; or `false` 24 | * if the waiting time elapsed before returning from the method. 25 | */ 26 | def awaitTerminationOrTimeout(timeout: Long): Boolean 27 | 28 | /** 29 | * Stops the execution of this query if it is running. 30 | */ 31 | def stop(): Unit 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/enums/Storage.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.enums; 2 | 3 | /** 4 | * StorageType 5 | */ 6 | public enum Storage { 7 | CSV("io.github.setl.storage.connector.CSVConnector"), 8 | EXCEL("io.github.setl.storage.connector.ExcelConnector"), 9 | PARQUET("io.github.setl.storage.connector.ParquetConnector"), 10 | DELTA("io.github.setl.storage.connector.DeltaConnector"), 11 | CASSANDRA("io.github.setl.storage.connector.CassandraConnector"), 12 | DYNAMODB("io.github.setl.storage.connector.DynamoDBConnector"), 13 | JSON("io.github.setl.storage.connector.JSONConnector"), 14 | JDBC("io.github.setl.storage.connector.JDBCConnector"), 15 | STRUCTURED_STREAMING("io.github.setl.storage.connector.StructuredStreamingConnector"), 16 | HUDI("io.github.setl.storage.connector.HudiConnector"), 17 | SPARK_SQL("io.github.setl.storage.connector.SparkSQLConnector"), 18 | OTHER(null); 19 | 20 | private String connectorName; 21 | 22 | Storage(String cls) { 23 | this.connectorName = cls; 24 | } 25 | 26 | public String connectorName() { 27 | return connectorName; 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/annotation/InterfaceStability.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.annotation; 2 | 3 | import java.lang.annotation.Documented; 4 | 5 | /** 6 | * Annotation to inform users of how much to rely on a particular package, 7 | * class or method not changing over time. 8 | */ 9 | public class InterfaceStability { 10 | 11 | /** 12 | * Stable APIs that retain source and binary compatibility within a major release. 13 | * These interfaces can change from one major release to another major release 14 | * (e.g. from 1.0 to 2.0). 15 | */ 16 | @Documented 17 | public @interface Stable { 18 | } 19 | 20 | /** 21 | * APIs that are meant to evolve towards becoming stable APIs, but are not stable APIs yet. 22 | * Evolving interfaces can change from one feature release to another release (i.e. 2.1 to 2.2). 23 | */ 24 | @Documented 25 | public @interface Evolving { 26 | } 27 | 28 | /** 29 | * Unstable APIs, with no guarantee on stability. 30 | * Classes that are unannotated are considered Unstable. 
31 | */ 32 | @Documented 33 | public @interface Unstable { 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/test/resources/streaming_test_resources/streaming.conf: -------------------------------------------------------------------------------- 1 | structured_streaming_connector_input { 2 | storage = "STRUCTURED_STREAMING" 3 | format = "text" 4 | path = "src/test/resources/streaming_test_resources/input" 5 | } 6 | 7 | structured_streaming_connector_output { 8 | storage = "STRUCTURED_STREAMING" 9 | format = "csv" 10 | header = "false" 11 | outputMode = "append" 12 | checkpointLocation = "src/test/resources/streaming_test_resources/output/checkpoint_2" 13 | path = "src/test/resources/streaming_test_resources/output/2" 14 | } 15 | 16 | structured_streaming_connector_input_repository { 17 | storage = "STRUCTURED_STREAMING" 18 | format = "csv" 19 | schema = "text STRING" // must be provided for streaming 20 | header = "true" 21 | path = "src/test/resources/streaming_test_resources/input2" 22 | } 23 | 24 | structured_streaming_connector_output_repository { 25 | storage = "STRUCTURED_STREAMING" 26 | format = "csv" 27 | header = "true" 28 | outputMode = "append" 29 | checkpointLocation = "src/test/resources/streaming_test_resources/output/checkpoint_3" 30 | path = "src/test/resources/streaming_test_resources/output/3" 31 | } 32 | -------------------------------------------------------------------------------- /src/test/scala/io/github/setl/TestObject.scala: -------------------------------------------------------------------------------- 1 | package io.github.setl 2 | 3 | import java.sql.{Date, Timestamp} 4 | 5 | import io.github.setl.config.Conf 6 | import io.github.setl.internal.CanDrop 7 | import io.github.setl.storage.connector.ConnectorInterface 8 | import com.typesafe.config.Config 9 | import org.apache.spark.sql.DataFrame 10 | 11 | 12 | case class TestObject(partition1: Int, partition2: String, clustering1: String, value: Long) 13 | 14 | case class TestObject3(partition1: Int, partition2: String, clustering1: String, value: Long, value2: String) 15 | 16 | case class TestObject2(col1: String, col2: Int, col3: Double, col4: Timestamp, col5: Date, col6: Long) 17 | 18 | class CustomConnector extends ConnectorInterface with CanDrop { 19 | override def setConf(conf: Conf): Unit = null 20 | 21 | override def read(): DataFrame = { 22 | import spark.implicits._ 23 | Seq(1, 2, 3).toDF("id") 24 | } 25 | 26 | override def write(t: DataFrame, suffix: Option[String]): Unit = logDebug("Write with suffix") 27 | 28 | override def write(t: DataFrame): Unit = logDebug("Write") 29 | 30 | /** 31 | * Drop the entire table. 
32 | */ 33 | override def drop(): Unit = logDebug("drop") 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/io/github/setl/annotation/Compress.java: -------------------------------------------------------------------------------- 1 | package io.github.setl.annotation; 2 | 3 | import io.github.setl.internal.SchemaConverter; 4 | import io.github.setl.internal.StructAnalyser; 5 | import io.github.setl.storage.Compressor; 6 | import io.github.setl.storage.XZCompressor; 7 | 8 | import java.lang.annotation.ElementType; 9 | import java.lang.annotation.Retention; 10 | import java.lang.annotation.RetentionPolicy; 11 | import java.lang.annotation.Target; 12 | 13 | /** 14 | *15 | * The annotation Compress indicates {@link StructAnalyser} to save the metadata of corresponding fields 16 | * into the output StructType object. All annotated columns will be compressed by {@link SchemaConverter} 17 | * during the saving process in SparkRepository 18 | *
19 | * 20 | *
21 | * By default, the compression algorithm is XZ with the default compression level (=6). You can define other compressor
22 | * by implementing com.jcdecaux.datacorp.storage.Compressor interface.
23 | *
14 | * If multiple {@link io.github.setl.transformation.Deliverable}s of the same type are found in the delivery pool of DispatchManager, then
15 | * it will try to compare the producers of the Deliverables to select the right one
16 | */
17 | @InterfaceStability.Evolving
18 | @Retention(RetentionPolicy.RUNTIME)
19 | @Target({ElementType.FIELD, ElementType.METHOD})
20 | public @interface Delivery {
21 |
22 | /**
23 | * Producer of the current delivery that will be used by DispatchManager in order to find the corresponding delivery
24 | */
25 | Class<?> producer() default External.class;
26 |
27 | /**
28 | * Indicates whether the current Delivery is optional or not
29 | */
30 | boolean optional() default false;
31 |
32 | boolean autoLoad() default false;
33 |
34 | String condition() default "";
35 |
36 | String id() default "";
37 | }
38 |
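39 | // Usage sketch (hypothetical, mirroring docs/Factory.md): in a Scala factory, a delivery field can be annotated as
40 | //
41 | //   @Delivery(producer = classOf[UpstreamFactory], optional = true)
42 | //   var input: String = _
43 | //
44 | // where UpstreamFactory is a hypothetical factory producing this String; the Pipeline then injects the matching Deliverable.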
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/storage/SnappyCompressorSuite.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.storage
2 |
3 | import org.scalatest.funsuite.AnyFunSuite
4 |
5 | class SnappyCompressorSuite extends AnyFunSuite {
6 |
7 | val compressor = new SnappyCompressor
8 |
9 | test("SnappyCompressor should be able to compress a string to a Byte[]") {
10 | println(s"String1: ${str.getBytes().length} -> ${compressor.compress(str).length}")
11 | println(s"String2: ${str2.getBytes().length} -> ${compressor.compress(str2).length}")
12 | println(s"String3: ${str3.getBytes().length} -> ${compressor.compress(str3).length}")
13 | println(s"String4: ${str4.getBytes().length} -> ${compressor.compress(str4).length}")
14 |
15 | assert(str.getBytes().length >= compressor.compress(str).length)
16 | assert(str2.getBytes().length >= compressor.compress(str2).length)
17 | assert(str3.getBytes().length >= compressor.compress(str3).length)
18 | assert(str4.getBytes().length >= compressor.compress(str4).length)
19 |
20 | }
21 |
22 | test("SnappyCompressor should be able to decompress a Byte array to string") {
23 | assert(compressor.decompress(compressor.compress(str)) === str)
24 | assert(compressor.decompress(compressor.compress(str2)) === str2)
25 | assert(compressor.decompress(compressor.compress(str3)) === str3)
26 | assert(compressor.decompress(compressor.compress(str4)) === str4)
27 | }
28 |
29 | }
30 |
--------------------------------------------------------------------------------
/src/main/scala/io/github/setl/internal/Logging.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.internal
2 |
3 | import io.github.setl.annotation.InterfaceStability
4 | import org.apache.log4j.{LogManager, Logger}
5 |
6 | /**
7 | * Logging provides logging features for the class that extends this trait
8 | */
9 | @InterfaceStability.Evolving
10 | private[setl] trait Logging {
11 |
12 | // Make the log field transient so that objects with Logging can
13 | // be serialized and used on another machine
14 | @transient private var logger: Logger = _
15 |
16 | // Method to get or create the logger for this object
17 | protected def log: Logger = {
18 | if (logger == null) {
19 | logger = LogManager.getLogger(logName)
20 | }
21 | logger
22 | }
23 |
24 | // Method to get the logger name for this object
25 | protected def logName: String = {
26 | // Ignore trailing $'s in the class names for Scala objects
27 | this.getClass.getName.stripSuffix("$")
28 | }
29 |
30 | protected def logInfo(msg: => String): Unit = {
31 | if (log.isInfoEnabled) log.info(msg)
32 | }
33 |
34 | protected def logDebug(msg: => String): Unit = {
35 | if (log.isDebugEnabled) log.debug(msg)
36 | }
37 |
38 | protected def logTrace(msg: => String): Unit = {
39 | if (log.isTraceEnabled) log.trace(msg)
40 | }
41 |
42 | protected def logWarning(msg: => String): Unit = log.warn(msg)
43 |
44 | protected def logError(msg: => String): Unit = log.error(msg)
45 |
46 | }
47 |
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/storage/GZIPCompressorSuite.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.storage
2 |
3 | import org.scalatest.funsuite.AnyFunSuite
4 |
5 | class GZIPCompressorSuite extends AnyFunSuite {
6 |
7 | val compressor = new GZIPCompressor
8 |
9 | test("GZIPCompressor should be able to compress a string to a Byte[]") {
10 | println(s"String1: ${str.getBytes().length} -> ${compressor.compress(str).length}")
11 | println(s"String2: ${str2.getBytes().length} -> ${compressor.compress(str2).length}")
12 | println(s"String3: ${str3.getBytes().length} -> ${compressor.compress(str3).length}")
13 | println(s"String4: ${str4.getBytes().length} -> ${compressor.compress(str4).length}")
14 | assert(str.getBytes().length >= compressor.compress(str).length)
15 | assert(str2.getBytes().length >= compressor.compress(str2).length)
16 | assert(str3.getBytes().length >= compressor.compress(str3).length)
17 | assert(str4.getBytes().length >= compressor.compress(str4).length)
18 |
19 | }
20 |
21 | test("GZIPCompressor should be able to decompress a Byte array to string") {
22 | assert(compressor.decompress(compressor.compress(str)) === str)
23 | assert(compressor.decompress(compressor.compress(str2)) === str2)
24 | assert(compressor.decompress(compressor.compress(str3)) === str3)
25 | assert(compressor.decompress(compressor.compress(str4)) === str4)
26 | assert(compressor.decompress("testtesttest".getBytes()) === "testtesttest")
27 |
28 | }
29 |
30 | }
31 |
--------------------------------------------------------------------------------
/docs/data_access_layer/CustomConnector.md:
--------------------------------------------------------------------------------
1 | ## Custom Connector
2 |
3 | You can implement your own data source connector by implementing the `ConnectorInterface`.
4 |
5 | ```scala
6 | import io.github.setl.storage.connector.ConnectorInterface
7 | import io.github.setl.internal.CanDrop
8 | import io.github.setl.config.Conf
9 | import org.apache.spark.sql.DataFrame
10 |
11 | class CustomConnector extends ConnectorInterface with CanDrop {
12 | override def setConf(conf: Conf): Unit = {
13 | // configuration
14 | }
15 |
16 | override def read(): DataFrame = {
17 | import spark.implicits._
18 | Seq(1, 2, 3).toDF("id")
19 | }
20 |
21 | override def write(t: DataFrame, suffix: Option[String]): Unit = logDebug("Write with suffix")
22 |
23 | override def write(t: DataFrame): Unit = logDebug("Write")
24 |
25 | override def drop(): Unit = logDebug("drop")
26 | }
27 | ```
28 |
29 | ### Functionalities
30 |
31 | As in the example above, extend your connector class with functionality traits (such as `CanDrop`)
32 | and implement their abstract methods; SparkRepository will then be able to use these specific
33 | functionalities.
34 |
35 | ### Use the custom connector
36 |
37 | To use this connector, set the storage to **OTHER** and provide the class reference of your connector:
38 |
39 | ```txt
40 | myConnector {
41 | storage = "OTHER"
42 | class = "com.example.CustomConnector" // class reference of your connector
43 | yourParam = "some parameter" // put your parameters here
44 | }
45 | ```
46 |
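47 | A minimal usage sketch (assuming the `myConnector` block above lives in `application.conf`, and following the **ConfigLoader**/**ConnectorBuilder** pattern described in [ConnectorBuilder](ConnectorBuilder)):
48 | 
49 | ```scala
50 | import io.github.setl.config.ConfigLoader
51 | import io.github.setl.storage.ConnectorBuilder
52 | 
53 | object Properties extends ConfigLoader // loads application.conf by default
54 | 
55 | // `spark` is an active SparkSession; with storage = "OTHER" the builder
56 | // instantiates the class referenced by the `class` key
57 | val connector = new ConnectorBuilder(spark, Properties.getConfig("myConnector")).getOrCreate()
58 | 
59 | connector.read().show()
60 | ```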
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/config/PropertiesSuite.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.config
2 |
3 | import io.github.setl.util.TypesafeConfigUtils
4 | import org.scalatest.BeforeAndAfterAll
5 | import org.scalatest.funsuite.AnyFunSuite
6 |
7 | class PropertiesSuite extends AnyFunSuite with BeforeAndAfterAll {
8 |
9 | override protected def beforeAll(): Unit = {
10 | System.setProperty("myvalue", "test-my-value")
11 | }
12 |
13 | System.setProperty("myvalue", "test-my-value")
14 |
15 | override protected def afterAll(): Unit = {
16 | System.clearProperty("myvalue")
17 | }
18 |
19 | // test("ConfigLoader beforeAll") {
20 | // assert(Properties.cl.get("myValue") === "test-my-value")
21 | // assert(Properties.cl.get("test.myValue2") === "test-my-value-loaded")
22 | // }
23 |
24 | test("Cassandra config") {
25 | assert(TypesafeConfigUtils.getAs[String](Properties.cassandraConfig, "storage").get === "CASSANDRA")
26 | assert(TypesafeConfigUtils.getAs[String](Properties.cassandraConfig, "keyspace").get === "test_space")
27 | assert(TypesafeConfigUtils.getAs[String](Properties.cassandraConfig, "table").get === "test_spark_connector2")
28 | assert(TypesafeConfigUtils.getList(Properties.cassandraConfig, "partitionKeyColumns").get === Array("partition1", "partition2"))
29 | assert(TypesafeConfigUtils.getList(Properties.cassandraConfig, "clusteringKeyColumns").get === Array("clustering1"))
30 | assert(TypesafeConfigUtils.getList(Properties.cassandraConfig, "doesntExist") === None)
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/docs/data_access_layer/Structured-Streaming-Connector.md:
--------------------------------------------------------------------------------
1 | **StructuredStreamingConnector** is a connector added in version 0.4.3. It brings the Spark Structured Streaming API together with the Connector API, allowing users to manipulate streaming data in the same way as any other static connector.
2 |
3 | Here is an implementation of the [word count program](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#quick-example) from the Spark structured streaming documentation:
4 |
5 | ```scala
6 | // Configuration
7 | val input = Map(
8 | "storage" -> "STRUCTURED_STREAMING",
9 | "format" -> "socket",
10 | "host" -> "localhost",
11 | "port" -> "9999"
12 | )
13 |
14 | val output = Map(
15 | "storage" -> "STRUCTURED_STREAMING",
16 | "outputMode" -> "complete",
17 | "format" -> "console"
18 | )
19 |
20 | val spark = SparkSession
21 | .builder
22 | .appName("StructuredNetworkWordCount")
23 | .master("local")
24 | .getOrCreate()
25 |
26 | import spark.implicits._
27 |
28 | val inputConnector = new ConnectorBuilder(Conf.fromMap(input)).getOrCreate()
29 | val outputConnector = new ConnectorBuilder(Conf.fromMap(output)).getOrCreate().asInstanceOf[StructuredStreamingConnector]
30 |
31 | // read lines
32 | val lines = inputConnector.read()
33 | // Split the lines into words
34 | val words = lines.as[String].flatMap(_.split(" "))
35 | // Generate running word count
36 | val wordCounts = words.groupBy("value").count()
37 | // Show the output
38 | outputConnector.write(wordCounts)
39 | outputConnector.awaitTermination()
40 | ```
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: test
2 |
3 | on:
4 | pull_request:
5 | branches: [ master ]
6 | paths-ignore:
7 | - 'README.md'
8 | - 'docs/**'
9 | - '.github/ISSUE_TEMPLATE/**'
10 |
11 | jobs:
12 | test_setl:
13 | runs-on: ubuntu-latest
14 | strategy:
15 | fail-fast: false
16 | matrix:
17 | SCALA_VER: ["2.12", "2.11"]
18 | SPARK_VER: ["3.2", "3.0", "2.4", "2.3"]
19 | exclude:
20 | - SCALA_VER: 2.12
21 | SPARK_VER: 2.3
22 | - SCALA_VER: 2.11
23 | SPARK_VER: 3.0
24 | - SCALA_VER: 2.11
25 | SPARK_VER: 3.2
26 | steps:
27 | - name: Checkout
28 | uses: actions/checkout@v2
29 |
30 | - name: Set up JDK 1.8
31 | uses: actions/setup-java@v1
32 | with:
33 | java-version: 1.8
34 |
35 | - name: Before all
36 | run: |
37 | chmod +x ./dev/change-scala-version.sh
38 | ./dev/change-scala-version.sh ${{ matrix.SCALA_VER }}
39 | docker-compose -f ./dev/docker-compose.yml up -d
40 |
41 | - name: Run tests
42 | run: |
43 | set -e
44 | export AWS_ACCESS_KEY_ID="fakeAccess"
45 | export AWS_SECRET_ACCESS_KEY="fakeSecret"
46 | export AWS_REGION="eu-west-1"
47 | mvn -B -ntp clean:clean scoverage:report -P snapshot,spark_${{ matrix.SPARK_VER }}
48 |
49 | - name: Upload coverage to Codecov
50 | uses: codecov/codecov-action@v1
51 | with:
52 | flags: pr_${{ matrix.SCALA_VER }}_${{ matrix.SPARK_VER }}
53 | name: codecov-pull-request
54 |
55 |
--------------------------------------------------------------------------------
/docs/Factory.md:
--------------------------------------------------------------------------------
1 | ## Description
2 | A **Factory[A]** is a complete data transformation job to produce an object of type A.
3 |
4 | ## Difference with *Transformer*
5 | A **Factory** is more complex than a **Transformer**. In addition to data transformation, a **Factory** also contains the logic for reading and writing data.
6 |
7 | ## Demo
8 | You could implement your own factory by extending the class **Factory[A]**.
9 |
10 | ```scala
11 | case class MyProduct()
12 |
13 | // MyFactory will produce MyProduct
14 | class MyFactory extends Factory[MyProduct] {
15 | override def read(): this.type = ...
16 | override def process(): this.type = ...
17 | override def write(): this.type = ...
18 | override def get(): MyProduct = ...
19 | }
20 | ```
21 |
22 | To run **MyFactory**:
23 | ```scala
24 | new MyFactory().read().process().write().get()
25 | ```
26 |
27 | ## Dependency Handling
28 | The dependencies of a **Factory** can be handled by a **Pipeline** if the corresponding fields carry the **Delivery** annotation.
29 | For the previous **MyFactory** class:
30 |
31 | ```scala
32 | case class MyProduct()
33 |
34 | // MyFactory will produce MyProduct
35 | class MyFactory extends Factory[MyProduct] {
36 |
37 | @Delivery
38 | var input: String = _
39 |
40 | override def read(): this.type = ...
41 | override def process(): this.type = ...
42 | override def write(): this.type = ...
43 | override def get(): MyProduct = ...
44 | }
45 | ```
46 |
47 | By adding `@Delivery` to the variable **input**, the value of **input** will be automatically injected by **Pipeline**.
48 |
49 | For more information about dependency handling, read the [doc of **Pipeline**](Pipeline).
50 |
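51 | Below is a hypothetical sketch of such an injection. The `Pipeline` class and its `setInput`, `addStage` and `run` methods are assumptions here; see the [Pipeline](Pipeline) documentation for the exact API:
52 | 
53 | ```scala
54 | import io.github.setl.workflow.Pipeline // assumed package, see the Pipeline doc
55 | 
56 | new Pipeline()
57 |   .setInput[String]("some input") // injected into the @Delivery field `input` of MyFactory
58 |   .addStage(new MyFactory())
59 |   .run()
60 | ```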
--------------------------------------------------------------------------------
/src/test/resources/local.conf:
--------------------------------------------------------------------------------
1 | include "application.conf"
2 |
3 | test.string = "foo"
4 | test.variable = ${?myJvmProperty}
5 |
6 | setl.config {
7 | spark {
8 | spark.app.name = "my_app"
9 | spark.sql.shuffle.partitions = "1000"
10 | }
11 | }
12 |
13 | setl.config_2 {
14 | spark.app.name = "my_app_2"
15 | spark.sql.shuffle.partitions = "2000"
16 | }
17 |
18 | usages.config {
19 | spark {
20 | spark.app.name = "usages_app"
21 | spark.cassandra.connection.host = "cassandraHost"
22 | }
23 | usages = ["cassandra"]
24 | }
25 |
26 | context.spark.spark.sql.shuffle.partitions = 600
27 |
28 | csv_dc_context2 {
29 | storage = "CSV"
30 | path = "src/test/resources/test_config_csv_dc_context2"
31 | inferSchema = "true"
32 | delimiter = ";"
33 | header = "true"
34 | saveMode = "Append"
35 | }
36 |
37 | csv_dc_context {
38 | storage = "CSV"
39 | path = "src/test/resources/test_config_csv_dc_context"
40 | inferSchema = "true"
41 | delimiter = ";"
42 | header = "true"
43 | saveMode = "Append"
44 | }
45 |
46 | parquet_dc_context {
47 | storage = "PARQUET"
48 | path = "src/test/resources/test_parquet_dc_context" // must be absolute path
49 | table = "test_config2222"
50 | saveMode = "Append"
51 | }
52 |
53 | csv_dc_context_consumer {
54 | storage = "CSV"
55 | path = "src/test/resources/test_config_csv_dc_context_consumer"
56 | inferSchema = "true"
57 | delimiter = ";"
58 | header = "true"
59 | saveMode = "Overwrite"
60 | }
61 |
62 | parquet_dc_context_consumer {
63 | storage = "PARQUET"
64 | path = "src/test/resources/test_parquet_dc_context_consumer" // must be absolute path
65 | saveMode = "Append"
66 | }
67 |
--------------------------------------------------------------------------------
/src/main/java/io/github/setl/storage/XZCompressor.java:
--------------------------------------------------------------------------------
1 | package io.github.setl.storage;
2 |
3 | import org.tukaani.xz.LZMA2Options;
4 | import org.tukaani.xz.XZInputStream;
5 | import org.tukaani.xz.XZOutputStream;
6 |
7 | import java.io.ByteArrayInputStream;
8 | import java.io.ByteArrayOutputStream;
9 | import java.io.IOException;
10 | import java.nio.charset.StandardCharsets;
11 |
12 | /**
13 | * XZCompressor implement {@link Compressor}'s interface with the XZ compression algorithm
14 | */
15 | public class XZCompressor implements Compressor {
16 |
17 | @Override
18 | public byte[] compress(String input) throws IOException {
19 | if ((input == null) || (input.length() == 0)) {
20 | return null;
21 | }
22 | ByteArrayOutputStream xzOutput = new ByteArrayOutputStream();
23 | XZOutputStream xzStream = new XZOutputStream(xzOutput, new LZMA2Options(LZMA2Options.PRESET_DEFAULT));
24 | xzStream.write(input.getBytes(StandardCharsets.UTF_8));
25 | xzStream.close();
26 | return xzOutput.toByteArray();
27 | }
28 |
29 | @Override
30 | public String decompress(byte[] bytes) throws IOException {
31 | if ((bytes == null) || (bytes.length == 0)) {
32 | return "";
33 | }
34 | XZInputStream xzInputStream = new XZInputStream(new ByteArrayInputStream(bytes));
35 | byte firstByte = (byte) xzInputStream.read();
36 | byte[] buffer = new byte[xzInputStream.available() + 1];
37 | buffer[0] = firstByte;
38 | xzInputStream.read(buffer, 1, buffer.length - 1);
39 | xzInputStream.close();
40 | return new String(buffer);
41 | }
42 |
43 | }
44 |
--------------------------------------------------------------------------------
/src/main/scala/io/github/setl/storage/connector/SparkSQLConnector.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.storage.connector
2 |
3 | import com.typesafe.config.Config
4 | import io.github.setl.config.Conf
5 | import io.github.setl.enums.Storage
6 | import io.github.setl.util.TypesafeConfigUtils
7 | import org.apache.spark.sql.DataFrame
8 |
9 | class SparkSQLConnector(val query: String) extends Connector {
10 | override val storage: Storage = Storage.SPARK_SQL
11 |
12 | def this(conf: Conf) = this(conf.get("query", ""))
13 | def this(config: Config) = this(
14 | query = TypesafeConfigUtils.getAs[String](config, "query").getOrElse("")
15 | )
16 |
17 | require(query.nonEmpty, "query is not defined")
18 |
19 | /**
20 | * Read data from the data source
21 | *
22 | * @return a [[DataFrame]]
23 | */
24 | @throws[org.apache.spark.sql.AnalysisException](s"$query is invalid")
25 | override def read(): DataFrame = spark.sql(query)
26 |
27 | /**
28 | * Write a [[DataFrame]] into the data storage
29 | *
30 | * @param t a [[DataFrame]] to be saved
31 | * @param suffix for data connectors that support suffix (e.g. [[FileConnector]]),
32 | * add the given suffix to the save path
33 | */
34 | override def write(t: DataFrame, suffix: Option[String]): Unit = {
35 | if (suffix.isDefined) logWarning("suffix is not supported in SparkSQLConnector")
36 | write(t)
37 | }
38 |
39 | /**
40 | * Write a [[DataFrame]] into the data storage
41 | *
42 | * @param t a [[DataFrame]] to be saved
43 | */
44 | override def write(t: DataFrame): Unit = {
45 | logWarning("write is not supported in SparkSQLConnector")
46 | }
47 | }
48 |
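49 | // Usage sketch: wrap a Spark SQL query and read its result as a DataFrame.
50 | // val connector = new SparkSQLConnector("SELECT 1 AS id")
51 | // val df = connector.read()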
--------------------------------------------------------------------------------
/docs/data_access_layer/ConnectorBuilder.md:
--------------------------------------------------------------------------------
1 | ## Definition
2 | [**ConnectorBuilder**](https://github.com/SETL-Developers/setl/tree/master/src/main/scala/com/jcdecaux/setl/storage/ConnectorBuilder.scala) provides a simplified way to create a **Connector**.
3 |
4 | ## Usage
5 | You have two ways to instantiate a **ConnectorBuilder**:
6 | - with a *Typesafe* [**Config**](https://github.com/lightbend/config) object from a configuration file
7 | - with a [**Conf**](https://github.com/SETL-Developers/setl/tree/master/src/main/scala/com/jcdecaux/setl/config/Conf.scala) object from a `Map[String, String]`.
8 |
9 | ### With Typesafe Config
10 | Firstly, you should create a configuration file in your project's resources directory.
11 |
12 | In this case, let's call it `application.conf`.
13 |
14 | ```text
15 | csvConfiguration {
16 | storage = "CSV"
17 | path = "your/path/to/file.csv"
18 | inferSchema = "true"
19 | delimiter = ";"
20 | header = "true"
21 | saveMode = "Append"
22 | }
23 | ```
24 |
25 | Then you can use **ConfigLoader** to load your configuration file. By default, it loads `application.conf`.
26 | ```scala
27 | object Properties extends ConfigLoader
28 |
29 | val connector = new ConnectorBuilder(spark, Properties.getConfig("csvConfiguration")).getOrCreate()
30 |
31 | connector.read()
32 | connector.write(df)
33 | ```
34 |
35 | ### With Conf
36 | You can create a **Conf** object from a **Map**.
37 | ```scala
38 | val conf = Conf.fromMap(
39 | Map(
40 | "storage" -> "PARQUET",
41 | "path" -> "path/to/your/file",
42 | ...
43 | )
44 | )
45 |
46 | val connector = new ConnectorBuilder(spark, conf).getOrCreate()
47 |
48 | connector.read()
49 | connector.write(df)
50 |
51 | ```
52 |
53 | ## Parameters
54 | Please refer to [Connector documentation](Connector)
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/config/HudiConnectorConfSuite.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.config
2 |
3 | import io.github.setl.exception.ConfException
4 | import org.scalatest.funsuite.AnyFunSuite
5 | import org.apache.spark.sql.SaveMode
6 |
7 | class HudiConnectorConfSuite extends AnyFunSuite {
8 | val conf = new HudiConnectorConf
9 |
10 | test("Get/Set HudiConnectorConf") {
11 | assert(conf.get("saveMode") === None)
12 | conf.setSaveMode("Append")
13 | assert(conf.getSaveMode === SaveMode.Append)
14 | conf.setSaveMode("Overwrite")
15 | assert(conf.getSaveMode === SaveMode.Overwrite)
16 | conf.setSaveMode(SaveMode.Overwrite)
17 | assert(conf.getSaveMode === SaveMode.Overwrite)
18 |
19 | assert(conf.get("path") === None)
20 | assertThrows[ConfException](conf.getPath)
21 |
22 | conf.setPath("path")
23 | assert(conf.getPath === "path")
24 | }
25 |
26 | test("Init HudiConnectorConf from options") {
27 | val options : Map[String, String] = Map(
28 | "path" -> "path",
29 | "saveMode" -> "Append",
30 | "hoodie.table.name" -> "test_object",
31 | "hoodie.datasource.write.recordkey.field" -> "col1",
32 | "hoodie.datasource.write.precombine.field" -> "col4",
33 | "hoodie.datasource.write.table.type" -> "MERGE_ON_READ"
34 | )
35 |
36 | val confFromOpts: HudiConnectorConf = HudiConnectorConf.fromMap(options)
37 | assert(confFromOpts.getPath === "path")
38 | assert(confFromOpts.getSaveMode === SaveMode.Append)
39 |
40 | val readerOpts = confFromOpts.getReaderConf
41 | val writerOpts = confFromOpts.getWriterConf
42 |
43 | // Config should not contain path & save mode
44 | assert(!readerOpts.contains("path"))
45 | assert(!writerOpts.contains("path"))
46 | assert(!writerOpts.contains("saveMode"))
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/src/test/resources/streaming_test_resources/input/text.txt:
--------------------------------------------------------------------------------
1 | Structured Streaming is a scalable and fault-tolerant stream processing engine built on the Spark SQL engine. You can express your streaming computation the same way you would express a batch computation on static data. The Spark SQL engine will take care of running it incrementally and continuously and updating the final result as streaming data continues to arrive. You can use the Dataset/DataFrame API in Scala, Java, Python or R to express streaming aggregations, event-time windows, stream-to-batch joins, etc. The computation is executed on the same optimized Spark SQL engine. Finally, the system ensures end-to-end exactly-once fault-tolerance guarantees through checkpointing and Write-Ahead Logs. In short, Structured Streaming provides fast, scalable, fault-tolerant, end-to-end exactly-once stream processing without the user having to reason about streaming.
2 | Internally, by default, Structured Streaming queries are processed using a micro-batch processing engine, which processes data streams as a series of small batch jobs thereby achieving end-to-end latencies as low as 100 milliseconds and exactly-once fault-tolerance guarantees. However, since Spark 2.3, we have introduced a new low-latency processing mode called Continuous Processing, which can achieve end-to-end latencies as low as 1 millisecond with at-least-once guarantees. Without changing the Dataset/DataFrame operations in your queries, you will be able to choose the mode based on your application requirements.
3 | In this guide, we are going to walk you through the programming model and the APIs. We are going to explain the concepts mostly using the default micro-batch processing model, and then later discuss Continuous Processing model. First, let’s start with a simple example of a Structured Streaming query - a streaming word count.
4 |
--------------------------------------------------------------------------------
/src/main/scala/io/github/setl/config/StructuredStreamingConnectorConf.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.config
2 |
3 | /**
4 | * Configuration parameters:
5 | * see the Spark Structured Streaming documentation
6 | */
7 | class StructuredStreamingConnectorConf extends ConnectorConf {
8 |
9 | import StructuredStreamingConnectorConf._
10 |
11 | def setFormat(format: String): this.type = set(FORMAT.toLowerCase(), format)
12 |
13 | def getFormat: String = getWithException(FORMAT).toLowerCase()
14 |
15 | def setSchema(schema: String): this.type = set(SCHEMA, schema)
16 |
17 | def getSchema: String = getWithException(SCHEMA)
18 |
19 | def setOutputMode(mode: String): this.type = set(OUTPUT_MODE, mode)
20 |
21 | def getOutputMode: String = getWithException(OUTPUT_MODE)
22 |
23 | def setPath(path: String): this.type = set(PATH, path)
24 |
25 | def getPath: String = getWithException(PATH)
26 |
27 | override def getReaderConf: Map[String, String] = removePrivateConf()
28 |
29 | override def getWriterConf: Map[String, String] = removePrivateConf()
30 |
31 | private[this] def getWithException(key: String): String = {
32 | get(key).getOrElse(throw new IllegalArgumentException(s"Can't find $key"))
33 | }
34 |
35 | private[this] def removePrivateConf(): Map[String, String] = {
36 | import scala.collection.JavaConverters._
37 | settings.asScala.toMap - FORMAT - SCHEMA - OUTPUT_MODE
38 | }
39 | }
40 |
41 | object StructuredStreamingConnectorConf {
42 | def fromMap(options: Map[String, String]): StructuredStreamingConnectorConf =
43 | new StructuredStreamingConnectorConf().set(options)
44 |
45 | val FORMAT: String = "format"
46 | val SCHEMA: String = "schema"
47 | val OUTPUT_MODE: String = "outputMode"
48 | val PATH: String = "path"
49 |
50 | }
51 |
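52 | // Usage sketch: build a conf from a Map, using the same keys as streaming_test_resources/streaming.conf
53 | // val conf = StructuredStreamingConnectorConf.fromMap(Map(
54 | //   "format" -> "csv",
55 | //   "schema" -> "text STRING", // must be provided for streaming reads
56 | //   "outputMode" -> "append",
57 | //   "path" -> "src/test/resources/streaming_test_resources/input2"
58 | // ))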
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/util/TypesafeConfigUtilsSuite.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.util
2 |
3 | import com.typesafe.config.ConfigFactory
4 | import org.scalatest.funsuite.AnyFunSuite
5 | import org.scalatest.matchers.should.Matchers
6 |
7 | class TypesafeConfigUtilsSuite extends AnyFunSuite with Matchers {
8 |
9 | val config = ConfigFactory.load("test_priority.conf")
10 | import TypesafeConfigUtils._
11 |
12 | test("TypesafeConfigUtils should handle implicit type conversion") {
13 | assert(getAs[String](config, "test.string") === Option("abc"))
14 | assert(getAs[Int](config, "test.int") === Option(1))
15 | assert(getAs[Long](config, "test.long") === Option(2L))
16 | assert(getAs[Float](config, "test.float") === Option(3.1F))
17 | assert(getAs[Float](config, "test.float2") === Option(3.1F))
18 | assert(getAs[Double](config, "test.double") === Option(4.4D))
19 | assert(getAs[Boolean](config, "test.boolean") === Option(false))
20 | assert(getAs[Boolean](config, "test.boolean2") === Option(true))
21 | assert(getAs[Int](config, "test.non_existing") === None)
22 | assert(isDefined(config, "test.non_existing") === false)
23 | assert(isDefined(config, "test.string"))
24 | }
25 |
26 | test("TypesafeConfigUtils should handle list") {
27 | getList(config, "test.list").get should equal (Array(1, 2, 3))
28 | val expected = Array(1.2, 2, 3)
29 | getList(config, "test.listFloat").get should equal (expected)
30 | getList(config, "test.listString").get should equal (Array("1.2", "2", "3"))
31 | }
32 |
33 | test("TypesafeConfigUtils should handle map") {
34 | getMap(config.getConfig("test.map")) should equal (Map("v1" -> "a", "v2" -> "b"))
35 |
36 | }
37 |
38 | test("TypesafeConfigUtils exceptions") {
39 | assertThrows[com.typesafe.config.ConfigException.WrongType](getAs[Int](config, "test.string"))
40 | }
41 |
42 | }
43 |
--------------------------------------------------------------------------------
/src/main/scala/io/github/setl/storage/connector/Connector.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.storage.connector
2 |
3 | import io.github.setl.annotation.InterfaceStability
4 | import io.github.setl.enums.Storage
5 | import io.github.setl.internal.Logging
6 | import io.github.setl.util.HasSparkSession
7 | import org.apache.spark.sql._
8 |
9 | /**
10 | * Connector is a non-typed data access layer (DAL) abstraction that provides read/write functionalities.
11 | *
12 | *
13 | * A basic data storage connector has two main functionalities:
14 | *
addConnector(connector, Some("new_name")), the structure in the compressed zip file will be:
24 | *
25 | * {{{
26 | * outputPath.zip // outputPath.zip is given during the instantiation of FileConsolidator
27 | * |--new_name
28 | * |-- dir_1
29 | * | |-- file1
30 | * |-- dir_2
31 | * |-- file2
32 | * }}}
33 | *
34 | * @param repository Repository that will be used to load data
35 | * @param name name of the directory in the zip output. default is the name of the base directory of the connector
36 | * @return
37 | */
38 | @throws[InvalidConnectorException]
39 | def addRepository(repository: SparkRepository[_], name: Option[String] = None): this.type
40 |
41 | /**
42 | * Add the connector's data to the consolidator. For a directory with the following structure:
43 | *
44 | * {{{
45 | * base_path
46 | * |-- dir_1
47 | * | |-- file1
48 | * |-- dir_2
49 | * |-- file2
50 | * }}}
51 | *
52 | * After calling addConnector(connector, Some("new_name")), the structure in the compressed zip file will be:
53 | *
54 | * {{{
55 | * outputPath.zip // outputPath.zip is given during the instantiation of FileConsolidator
56 | * |--new_name
57 | * |-- dir_1
58 | * | |-- file1
59 | * |-- dir_2
60 | * |-- file2
61 | * }}}
62 | *
63 | * @param connector FileConnector that will be used to load data
64 | * @param name name of the directory in the zip output. default is the name of the base directory of the connector
65 | * @return
66 | */
67 | def addConnector(connector: FileConnector, name: Option[String] = None): this.type
68 |
69 | def archive(outputPath: Path): this.type
70 |
71 | }
72 |
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/config/Properties.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.config
2 |
3 | import com.typesafe.config.Config
4 |
5 | object Properties {
6 |
7 | // override def beforeAll(): Unit = {
8 | // System.setProperty("myvalue", "test-my-value")
9 | // }
10 | //
11 | val cl: ConfigLoader = ConfigLoader
12 | .builder()
13 | .setProperty("myvalue", "test-my-value")
14 | .setConfigPath("application.conf").getOrCreate()
15 |
16 | val excelConfig: Config = cl.getConfig("test.excel")
17 | val excelConfigWithoutSchema: Config = cl.getConfig("test.excelWithoutSchema")
18 | val cassandraConfig: Config = cl.getConfig("test.cassandra")
19 | val cassandraConfigWithoutClustering: Config = cl.getConfig("test.cassandraWithoutClustering")
20 |
21 | val csvConfig: Config = cl.getConfig("test.csv")
22 | val parquetConfig: Config = cl.getConfig("test.parquet")
23 |
24 | val jsonConfig: Config = cl.getConfig("test.json")
25 |
26 | val jdbcConfig: Config = cl.getConfig("psql.test")
27 |
28 | val hudiConfig : Config = cl.getConfig("hudi.test")
29 | val sparkSQLConfig : Config = cl.getConfig("sparkSQL.test")
30 |
31 | val excelConfigConnector: Config = cl.getConfig("connector.excel")
32 | val cassandraConfigConnector: Config = cl.getConfig("connector.cassandra")
33 | val csvConfigConnector: Config = cl.getConfig("connector.csv")
34 | val parquetConfigConnector: Config = cl.getConfig("connector.parquet")
35 | val dynamoDbConfigConnector: Config = cl.getConfig("connector.dynamo")
36 |
37 | val excelConfigConnectorBuilder: Config = cl.getConfig("connectorBuilder.excel")
38 | val cassandraConfigConnectorBuilder: Config = cl.getConfig("connectorBuilder.cassandra")
39 | val csvConfigConnectorBuilder: Config = cl.getConfig("connectorBuilder.csv")
40 | val jsonConfigConnectorBuilder: Config = cl.getConfig("connectorBuilder.json")
41 | val deltaConfigConnectorBuilder: Config = cl.getConfig("connectorBuilder.delta")
42 |
43 |
44 | val wrongCsvConfigConnectorBuilder: Config = cl.getConfig("connectorBuilder.wrong_csv")
45 | val customConnectorWithoutRef: Config = cl.getConfig("connectorBuilder.wrong_csv2")
46 | val parquetConfigConnectorBuilder: Config = cl.getConfig("connectorBuilder.parquet")
47 |
48 |
49 | val excelConfigRepoBuilder: Config = cl.getConfig("repoBuilder.excel")
50 | val cassandraConfigRepoBuilder: Config = cl.getConfig("repoBuilder.cassandra")
51 | val csvConfigRepoBuilder: Config = cl.getConfig("repoBuilder.csv")
52 | val parquetConfigRepoBuilder: Config = cl.getConfig("repoBuilder.parquet")
53 | val deltaConfigRepoBuilder: Config = cl.getConfig("repoBuilder.delta")
54 |
55 | }
56 |
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/storage/ConditionSuite.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.storage
2 |
3 | import java.time.{LocalDate, LocalDateTime}
4 |
5 | import io.github.setl.enums.ValueType
6 | import org.scalatest.funsuite.AnyFunSuite
7 |
8 | class ConditionSuite extends AnyFunSuite {
9 |
10 | test("Condition could be converted to sql request") {
11 |
12 | val strCond = Condition("col1", "=", "haha")
13 | assert(strCond.toSqlRequest === "(`col1` = 'haha')")
14 |
15 | val intCond = Condition("col1", "=", 1)
16 | assert(intCond.toSqlRequest === "(`col1` = 1)")
17 |
18 | val floatCond = Condition("col1", "=", 1F)
19 | assert(floatCond.toSqlRequest === "(`col1` = 1.0)")
20 |
21 | val date = LocalDate.parse("1990-01-01")
22 | val dateCond = Condition("date", "=", date)
23 | assert(dateCond.toSqlRequest === "(`date` = cast('1990-01-01' as date))")
24 |
25 | val datetime = LocalDateTime.parse("1990-01-01T00:00:00")
26 | val datetimeCond = Condition("datetime", "=", datetime)
27 | assert(datetimeCond.toSqlRequest === "(`datetime` = cast('1990-01-01 00:00:00' as timestamp))")
28 |
29 | val strSetCond = Condition("str_set", "in", Set("a", "b"))
30 | assert(strSetCond.toSqlRequest === "(`str_set` IN ('a', 'b'))")
31 |
32 | val floatSetCond = Condition("float_set", "in", Set(1.343F, 2.445F))
33 | assert(floatSetCond.toSqlRequest === "(`float_set` IN (1.343, 2.445))")
34 |
35 | val strCondWithType = Condition("col1", "=", "hehe", ValueType.STRING)
36 | assert(strCondWithType.toSqlRequest === "(`col1` = 'hehe')")
37 | }
38 |
39 | test("Condition should return null if value is not defined") {
40 | val cond = Condition("a", "=", None, ValueType.STRING)
41 | assert(cond.toSqlRequest === null)
42 | }
43 |
44 | test("Null sql request should be ignored in a condition set") {
45 |
46 | val conds = Set(
47 | Condition("a", "=", None, ValueType.STRING),
48 | Condition("b", "=", 1.5),
49 | Condition("c", "in", Set("x", "y"))
50 | )
51 |
52 | import io.github.setl.util.FilterImplicits._
53 | assert(conds.toSqlRequest === "(`b` = 1.5) AND (`c` IN ('x', 'y'))")
54 |
55 | }
56 |
57 | test("Condition should handle Column") {
58 | import org.apache.spark.sql.functions._
59 | val condition = Condition(
60 | col("test col").isin(1, 2, 3)
61 | )
62 |
63 | assert(condition.toSqlRequest === Condition("test col", "IN", Set(1, 2, 3)).toSqlRequest)
64 |
65 | val condition2 = Condition(
66 | col("test col").isin(1, 2, 3) && col("test col 2") === "A"
67 | )
68 | assert(condition2.toSqlRequest === "((`test col` IN (1, 2, 3)) AND (`test col 2` = 'A'))")
69 | }
70 | }
71 |
--------------------------------------------------------------------------------
/src/main/scala/io/github/setl/storage/repository/ImplicitRepositoryAdapter.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.storage.repository
2 |
3 | import io.github.setl.internal.{SchemaConverter, StructAnalyser}
4 | import io.github.setl.storage.{Condition, DatasetConverter}
5 | import org.apache.spark.sql.Dataset
6 | import org.apache.spark.sql.types.StructType
7 |
8 | import scala.reflect.runtime.universe.TypeTag
9 |
10 | object ImplicitRepositoryAdapter {
11 |
12 | /**
13 |  * SparkRepositoryAdapter is an implicit RepositoryAdapter implementation that adds four methods to an
14 |  * existing `SparkRepository[A]`.
15 | *
16 | * {{{
17 | * // Example:
18 | *
19 | * implicit val converter = new DatasetConverter[A, B] {
20 | * // implementation
21 | * }
22 | *
23 | * val defaultRepository: SparkRepository[A] // a default repository that can save a Dataset[A]
24 | *
25 | * import io.github.setl.storage.repository.ImplicitRepositoryAdapter._
26 | *
27 | * // This will convert dsOfTypeA (a Dataset[A]) to a Dataset[B] by using the previous implicit converter, then
28 | * // save the converted dataset into the data store
29 | * defaultRepository.convertAndSave(dsOfTypeA)
30 | *
31 | * defaultRepository.findAllAndConvert()
32 | * }}}
33 | *
34 | * @param repository an existing repository
35 | * @param converter a DatasetConverter (should be implemented by user)
36 | * @tparam A source type
37 | * @tparam B target type
38 | */
39 | implicit class SparkRepositoryAdapter[A: TypeTag, B: TypeTag]
40 | (override val repository: SparkRepository[A])
41 | (override implicit val converter: DatasetConverter[A, B]) extends RepositoryAdapter[Dataset[A], Dataset[B]] {
42 |
43 | private[this] val DBTypeSchema: StructType = StructAnalyser.analyseSchema[B]
44 |
45 | def findAllAndConvert(): Dataset[A] = {
46 | val data = repository.readDataFrame()
47 | converter.convertFrom(SchemaConverter.fromDF[B](data))
48 | }
49 |
50 | def findByAndConvert(conditions: Set[Condition]): Dataset[A] = {
51 | val data = repository.readDataFrame(SparkRepository.handleConditions(conditions, DBTypeSchema))
52 | converter.convertFrom(SchemaConverter.fromDF[B](data))
53 | }
54 |
55 | def findByAndConvert(condition: Condition): Dataset[A] = {
56 | findByAndConvert(Set(condition))
57 | }
58 |
59 | def convertAndSave(data: Dataset[A], suffix: Option[String] = None): SparkRepositoryAdapter.this.type = {
60 | val dsToSave = converter.convertTo(data)
61 | repository.configureConnector(dsToSave.toDF(), suffix)
62 | repository.writeDataFrame(SchemaConverter.toDF[B](dsToSave))
63 | this
64 | }
65 | }
66 |
67 | }
68 |
--------------------------------------------------------------------------------
/docs/data_access_layer/SparkRepositoryAdapter.md:
--------------------------------------------------------------------------------
1 | # RepositoryAdapter
2 |
3 | In some situations, the data format defined in the data source doesn't match the case class defined in our project, and we want to hide
4 | the conversion details (which may be irrelevant to the business logic). We can achieve this by using the
5 | [SparkRepositoryAdapter](https://github.com/SETL-Developers/setl/blob/master/src/main/scala/com/jcdecaux/setl/storage/repository/ImplicitRepositoryAdapter.scala)
6 | and [DatasetConverter](https://github.com/SETL-Developers/setl/blob/master/src/main/scala/com/jcdecaux/setl/storage/DatasetConverter.scala).
7 |
8 | ## Example
9 |
10 | Imagine our data source has a format that matches the following case class:
11 |
12 | ```scala
13 | case class DataSourceFormat(col1: String, col2: Int, col3: String)
14 |
15 | // col1, col2, col3
16 | // r1, 1, r1-1
17 | // r2, 2, r1-2
18 | ```
19 |
20 | The column `col3` is not necessary (it's only a concatenation of `col1` and `col2`), so we can ignore it and use this
21 | case class in our project:
22 |
23 | ```scala
24 | case class ProjectFormat(col1: String, col2: Int)
25 | ```
26 |
27 | So the data conversions that we want to hide are:
28 | - during reading, implicitly drop `col3`
29 | - during writing, implicitly create `col3` by concatenating `col1` and `col2`
30 |
31 | Let's implement our dataset converter (the snippet assumes that `org.apache.spark.sql.Dataset` and `org.apache.spark.sql.catalyst.encoders.ExpressionEncoder` are imported):
32 | ```scala
33 | import io.github.setl.storage.DatasetConverter
34 |
35 | implicit val myConverter = new DatasetConverter[ProjectFormat, DataSourceFormat] {
36 | override def convertFrom(t2: Dataset[DataSourceFormat]): Dataset[ProjectFormat] = {
37 | t2.drop("col3")
38 | .as[ProjectFormat](ExpressionEncoder[ProjectFormat])
39 | }
40 |
41 | override def convertTo(t1: Dataset[ProjectFormat]): Dataset[DataSourceFormat] = {
42 | import org.apache.spark.sql.functions._
43 |
44 | t1.withColumn("col3", concat(col("col1"), lit("-"), col("col2")))
45 | .as[DataSourceFormat](ExpressionEncoder[DataSourceFormat])
46 | }
47 | }
48 | ```
49 |
50 | To use this converter:
51 | ```scala
52 | import io.github.setl.storage.repository.ImplicitRepositoryAdapter._
53 |
54 | // Suppose that we have a repository of type ProjectFormat.
55 | // After the import, several new methods will be added to the SparkRepository
56 | // For example: convertAndSave and findAllAndConvert
57 | val projectFormatRepo = SparkRepository[ProjectFormat]
58 |
59 | // This will convert a Dataset[ProjectFormat] to a Dataset[DataSourceFormat] and save it
60 | projectFormatRepo.convertAndSave(projectFormatDataset)
61 |
62 | // This will load a Dataset[DataSourceFormat] and automatically convert it to a Dataset[ProjectFormat]
63 | val loaded = projectFormatRepo.findAllAndConvert()
64 | ```
65 |
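The adapter also adds `findByAndConvert`, which applies a `Condition` to the stored data before converting it back.
A minimal sketch, assuming the `projectFormatRepo` and the implicit `myConverter` defined above:

```scala
import io.github.setl.storage.Condition

// Load only the rows whose col1 equals "r1", then convert the result
// from Dataset[DataSourceFormat] back to Dataset[ProjectFormat]
val filtered = projectFormatRepo.findByAndConvert(Condition("col1", "=", "r1"))
```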
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/storage/connector/SparkSQLConnectorSuite.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.storage.connector
2 |
3 | import io.github.setl.config.{Conf, Properties}
4 | import io.github.setl.{SparkSessionBuilder, TestObject}
5 | import org.apache.spark.SparkConf
6 | import org.apache.spark.sql.SparkSession
7 | import org.scalatest.funsuite.AnyFunSuite
8 |
9 | class SparkSQLConnectorSuite extends AnyFunSuite {
10 |
11 | val query : String =
12 | """
13 | | SELECT (ones.n1 + tens.n2 * 10) as user_id
14 | | FROM (
15 | | SELECT 0 AS n1
16 | | UNION SELECT 1 AS n1
17 | | UNION SELECT 2 AS n1
18 | | UNION SELECT 3 AS n1
19 | | UNION SELECT 4 AS n1
20 | | UNION SELECT 5 AS n1
21 | | UNION SELECT 6 AS n1
22 | | UNION SELECT 7 AS n1
23 | | UNION SELECT 8 AS n1
24 | | UNION SELECT 9 AS n1
25 | | ) ones
26 | | CROSS JOIN
27 | | (
28 | | SELECT 0 AS n2
29 | | UNION SELECT 1 AS n2
30 | | UNION SELECT 2 AS n2
31 | | UNION SELECT 3 AS n2
32 | | UNION SELECT 4 AS n2
33 | | UNION SELECT 5 AS n2
34 | | UNION SELECT 6 AS n2
35 | | UNION SELECT 7 AS n2
36 | | UNION SELECT 8 AS n2
37 | | UNION SELECT 9 AS n2
38 | | ) tens
39 | |""".stripMargin
40 |
41 | val testTable: Seq[TestObject] = Seq(
42 | TestObject(1, "p1", "c1", 1L),
43 | TestObject(2, "p2", "c2", 2L),
44 | TestObject(3, "p3", "c3", 3L)
45 | )
46 |
47 | val options : Map[String, String] = Map(
48 | "query" -> query
49 | )
50 |
51 |
52 | test("Instantiation of constructors") {
53 | val connector = new SparkSQLConnector(query)
54 | assert(connector.query === query)
55 |
56 | val testConfig = Properties.sparkSQLConfig
57 | val connector2 = new SparkSQLConnector(testConfig)
58 | assert(connector2.query === "SELECT * FROM schema.table")
59 |
60 | val connector3 = new SparkSQLConnector(Conf.fromMap(options))
61 | assert(connector3.query === query)
62 |
63 | assertThrows[IllegalArgumentException](new SparkSQLConnector(""))
64 | assertThrows[IllegalArgumentException](new SparkSQLConnector(Conf.fromMap(Map.empty)))
65 | assertThrows[IllegalArgumentException](new SparkSQLConnector(testConfig.withoutPath("query")))
66 | }
67 |
68 | test("Read/Write of SparkSQLConnector") {
69 | val spark: SparkSession = SparkSession.builder().config(new SparkConf()).master("local[*]").getOrCreate()
70 | import spark.implicits._
71 |
72 | val connector = new SparkSQLConnector(query)
73 | assert(connector.read().collect().length == 100)
74 |
75 | // Should log warning & do nothing
76 | val testDF = testTable.toDF()
77 | connector.write(testDF)
78 | connector.write(testDF, Some("any_"))
79 | }
80 | }
81 |
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/internal/BenchmarkInvocationHandlerSuite.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.internal
2 |
3 | import io.github.setl.annotation.Benchmark
4 | import io.github.setl.transformation.{AbstractFactory, Factory}
5 | import io.github.setl.workflow.Pipeline
6 | import org.scalatest.funsuite.AnyFunSuite
7 |
8 | class BenchmarkInvocationHandlerSuite extends AnyFunSuite {
9 |
10 | import BenchmarkInvocationHandlerSuite._
11 |
12 | test("BenchmarkInvocationHandler should log execution time") {
13 | val factory = new BenchmarkFactory
14 | val benchmarkHandler = new BenchmarkInvocationHandler(factory)
15 |
16 | val proxyFactory = java.lang.reflect.Proxy.newProxyInstance(
17 | getClass.getClassLoader,
18 | Array(classOf[AbstractFactory[_]]),
19 | benchmarkHandler
20 | ).asInstanceOf[AbstractFactory[_]]
21 |
22 | proxyFactory.read()
23 | proxyFactory.process()
24 | proxyFactory.write()
25 |
26 | assert(classOf[BenchmarkFactory].isAnnotationPresent(classOf[Benchmark]))
27 | assert(factory.get() === proxyFactory.get())
28 |
29 | import scala.collection.JavaConverters._
30 | benchmarkHandler.getBenchmarkResult.asScala.foreach {
31 | x => assert(x._2 >=0)
32 | }
33 |
34 | assert(benchmarkHandler.getBenchmarkResult.size() === 2)
35 |
36 | }
37 |
38 | test("Benchmark should be handled in pipeline") {
39 |
40 | val pipeline = new Pipeline()
41 |
42 | val result = pipeline
43 | .addStage[BenchmarkFactory]()
44 | .benchmark(true)
45 | .run()
46 | .getBenchmarkResult
47 |
48 | assert(result.length === 1)
49 |
50 | val result2 = new Pipeline()
51 | .addStage[BenchmarkFactory]()
52 | .run()
53 | .getBenchmarkResult
54 |
55 | assert(result2.isEmpty)
56 |
57 | val result3 = new Pipeline()
58 | .addStage[BenchmarkFactory]()
59 | .benchmark(false)
60 | .run()
61 | .getBenchmarkResult
62 |
63 | assert(result3.isEmpty)
64 | }
65 |
66 | }
67 |
68 | object BenchmarkInvocationHandlerSuite {
69 |
70 | @Benchmark
71 | class BenchmarkFactory extends Factory[String] {
72 |
73 | private[this] var data = ""
74 |
75 | override def read(): BenchmarkFactory.this.type = {
76 | data = s"testing ${this.getClass.getSimpleName}... "
77 | this
78 | }
79 |
80 | @Benchmark
81 | override def process(): BenchmarkFactory.this.type = {
82 | data = data + data
83 | this
84 | }
85 |
86 | @Benchmark
87 | override def write(): BenchmarkFactory.this.type = {
88 | println(data)
89 | sleep()
90 | this
91 | }
92 |
93 | override def get(): String = data
94 |
95 | def sleep(): Unit = Thread.sleep(1000L)
96 |
97 | }
98 |
99 | }
100 |
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/internal/TestClasses.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.internal
2 |
3 | import io.github.setl.annotation.{ColumnName, CompoundKey, Compress, Delivery}
4 | import io.github.setl.storage.Compressor
5 | import io.github.setl.transformation.Factory
6 |
7 | object TestClasses {
8 |
9 | case class WrongClass(@ColumnName("1") @ColumnName("2") col1: String)
10 |
11 | case class MultipleCompoundKeyTest(@CompoundKey("sort", "1") @CompoundKey("part", "1") col1: String,
12 | @CompoundKey("sort", "2") col2: String,
13 | @CompoundKey("part", "2") @ColumnName("COLUMN_3") col3: String)
14 |
15 | case class InnerClass(innerCol1: String, innerCol2: String)
16 |
17 | case class TestCompression(@ColumnName("dqsf") col1: String,
18 | @CompoundKey("test", "1") col2: String,
19 | @Compress col3: Seq[InnerClass],
20 | @Compress col4: Seq[String]) {
21 | }
22 |
23 | case class TestStructAnalyser(@ColumnName("alias1") col1: String,
24 | @CompoundKey("test", "1") col2: String,
25 | @CompoundKey("test", "2") col22: String,
26 | @Compress col3: Seq[InnerClass],
27 | @Compress(compressor = classOf[Compressor]) col4: Seq[String]) {
28 | }
29 |
30 | class Producer1
31 |
32 | class Producer2
33 |
34 | class TestFactory extends Factory[String] {
35 |
36 | var input3: Double = _
37 | var input4: Boolean = _
38 |
39 | @Delivery(producer = classOf[Producer1])
40 | var inputString1: String = _
41 |
42 | @Delivery(producer = classOf[Producer2])
43 | var inputString2: String = _
44 |
45 | @Delivery(optional = true)
46 | var inputInt: Int = _
47 |
48 | @Delivery
49 | def setInputs(d: Double, boo: Boolean): this.type = {
50 | input3 = d
51 | input4 = boo
52 | this
53 | }
54 |
55 | /**
56 | * Read data
57 | */
58 | override def read(): TestFactory.this.type = this
59 |
60 | /**
61 | * Process data
62 | */
63 | override def process(): TestFactory.this.type = this
64 |
65 | /**
66 | * Write data
67 | */
68 | override def write(): TestFactory.this.type = this
69 |
70 | /**
71 | * Get the processed data
72 | */
73 | override def get(): String = "Product of TestFactory " + inputString1 + inputString2
74 | }
75 |
76 |
77 | case class MyObject(@ColumnName("col1") column1: String, column2: String)
78 |
79 | case class TestCompoundKey(@CompoundKey("primary", "1") a: String, @CompoundKey("primary", "2") b: Int, @CompoundKey("sort", "1") c: String)
80 |
81 | case class TestNullableColumn(@CompoundKey("primary", "1") col1: String, col2: String, col3: Option[Int], col4: Double)
82 |
83 | }
84 |
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/storage/repository/RepositoryAdapterSuite.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.storage.repository
2 |
3 | import io.github.setl.SparkSessionBuilder
4 | import io.github.setl.storage.Condition
5 | import io.github.setl.storage.connector.CSVConnector
6 | import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
7 | import org.apache.spark.sql.{Dataset, SparkSession}
8 | import org.scalatest.funsuite.AnyFunSuite
9 |
10 | class RepositoryAdapterSuite extends AnyFunSuite {
11 |
12 | val path: String = "src/test/resources/test_repository_adapter"
13 |
14 | val data: Seq[RepoAdapterTesterA] = Seq(
15 | RepoAdapterTesterA("a", "A"),
16 | RepoAdapterTesterA("b", "B")
17 | )
18 |
19 | test("RepositoryAdapter should implicitly convert two dataset") {
20 | val spark: SparkSession = new SparkSessionBuilder().setEnv("local").build().get()
21 | val ds: Dataset[RepoAdapterTesterA] = spark.createDataset(data)(ExpressionEncoder[RepoAdapterTesterA])
22 |
23 | import io.github.setl.storage.repository.ImplicitConverter.a2b
24 | import io.github.setl.storage.repository.ImplicitRepositoryAdapter._
25 |
26 | val options: Map[String, String] = Map[String, String](
27 | "path" -> path,
28 | "inferSchema" -> "true",
29 | "delimiter" -> ",",
30 | "header" -> "true",
31 | "saveMode" -> "Overwrite"
32 | )
33 |
34 | val csvConnector = new CSVConnector(options)
35 |
36 | val repo: SparkRepository[RepoAdapterTesterA] =
37 | new SparkRepository[RepoAdapterTesterA]().setConnector(csvConnector)
38 |
39 | repo.convertAndSave(ds)
40 | val ds2 = repo.findAllAndConvert()
41 | val df = csvConnector.read()
42 |
43 | assert(ds2.columns === ds.columns)
44 | assert(df.columns === Array("column1", "col2", "col3"))
45 | csvConnector.delete()
46 | }
47 |
48 | test("RepositoryAdapter should be able to handle filter") {
49 | val spark: SparkSession = new SparkSessionBuilder().setEnv("local").build().get()
50 | val ds: Dataset[RepoAdapterTesterA] = spark.createDataset(data)(ExpressionEncoder[RepoAdapterTesterA])
51 |
52 | import io.github.setl.storage.repository.ImplicitConverter.a2b
53 | import io.github.setl.storage.repository.ImplicitRepositoryAdapter._
54 |
55 | val options: Map[String, String] = Map[String, String](
56 | "path" -> (path + "_filter"),
57 | "inferSchema" -> "true",
58 | "delimiter" -> ",",
59 | "header" -> "true",
60 | "saveMode" -> "Overwrite"
61 | )
62 |
63 | val csvConnector = new CSVConnector(options)
64 |
65 | val repo: SparkRepository[RepoAdapterTesterA] =
66 | new SparkRepository[RepoAdapterTesterA]().setConnector(csvConnector)
67 |
68 | repo.convertAndSave(ds)
69 |
70 | val conditions = Condition("column1", "=", "a")
71 |
72 | assert(repo.findByAndConvert(conditions).count() === 1)
73 | csvConnector.delete()
74 | }
75 |
76 | }
77 |
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/config/ConfLoaderSuite.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.config
2 |
3 | import com.typesafe.config.ConfigFactory
4 | import org.scalatest.funsuite.AnyFunSuite
5 |
6 | class ConfLoaderSuite extends AnyFunSuite {
7 |
8 | test("ConfigLoader builder should build ConfigLoader") {
9 | System.setProperty("app.environment", "test")
10 | System.setProperty("myvalue", "test-my-value")
11 |
12 | val cl = ConfigLoader.builder()
13 | .setAppEnv("local")
14 | .setAppName("TestConfigLoaderBuilder")
15 | .setProperty("myJvmProperty", "myJvmPropertyValue")
16 | .getOrCreate()
17 |
18 | assert(cl.get("test.string") === "foo")
19 | assert(cl.get("test.variable") === "myJvmPropertyValue")
20 | assert(cl.appName === "TestConfigLoaderBuilder")
21 |
22 | System.clearProperty("app.environment")
23 | System.clearProperty("myvalue")
24 | }
25 |
26 | test("Getters of ConfigLoader") {
27 | System.setProperty("app.environment", "test")
28 | val cl = ConfigLoader.builder()
29 | .setAppEnv("local")
30 | .setConfigPath("test_priority.conf")
31 | .getOrCreate()
32 |
33 | assert(cl.get("my.value") === "haha")
34 | assert(cl.getOption("my.value") === Some("haha"))
35 | assert(cl.getOption("notExisting") === None)
36 | assert(cl.getArray("test.list") === Array("1","2","3"))
37 | assert(cl.getObject("setl.config") === cl.config.getObject("setl.config"))
38 | }
39 |
40 | test("ConfigLoader builder should prioritize setConfigPath than setAppEnv and jvm property and pom") {
41 | System.setProperty("app.environment", "test")
42 | val cl = ConfigLoader.builder()
43 | .setAppEnv("local")
44 | .setConfigPath("test_priority.conf")
45 | .getOrCreate()
46 |
47 | assert(cl.get("my.value") === "haha")
48 | System.clearProperty("app.environment")
49 | }
50 |
51 | test("ConfigLoader builder should take into account the app.environment property in pom") {
52 | System.clearProperty("app.environment")
53 | val configLoader = ConfigLoader.builder().getOrCreate()
54 | assert(configLoader.appEnv === ConfigFactory.load().getString("setl.environment"))
55 | System.clearProperty("app.environment")
56 | }
57 |
58 | test("ConfigLoader builder should prioritize setAppEnv than jvm property and pom") {
59 | System.setProperty("app.environment", "test")
60 |
61 | val cl = ConfigLoader.builder()
62 | .setAppEnv("test_priority")
63 | .getOrCreate()
64 |
65 | assert(cl.get("my.value") === "haha")
66 | System.clearProperty("app.environment")
67 | }
68 |
69 | test("ConfigLoader builder should prioritize jvm property than pom") {
70 | System.setProperty("app.environment", "test_priority")
71 |
72 | val cl = ConfigLoader.builder()
73 | .getOrCreate()
74 |
75 | assert(cl.get("my.value") === "haha")
76 | System.clearProperty("app.environment")
77 | }
78 | }
79 |
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/storage/connector/HudiConnectorSuite.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.storage.connector
2 |
3 | import io.github.setl.config.{Conf, HudiConnectorConf, Properties}
4 | import io.github.setl.{SparkSessionBuilder, SparkTestUtils, TestObject2}
5 | import org.apache.spark.sql.{SaveMode, SparkSession}
6 | import org.scalatest.funsuite.AnyFunSuite
7 |
8 | import java.nio.file.Paths
9 | import java.sql.{Date, Timestamp}
10 |
11 | class HudiConnectorSuite extends AnyFunSuite {
12 |
13 | val path: String = Paths.get("src", "test", "resources", "test_hudi").toFile.getAbsolutePath
14 | val saveMode = SaveMode.Overwrite
15 |
16 | val options: Map[String, String] = Map[String, String](
17 | "path" -> path,
18 | "saveMode" -> saveMode.toString,
19 | "hoodie.table.name" -> "test_object",
20 | "hoodie.datasource.write.recordkey.field" -> "col1",
21 | "hoodie.datasource.write.precombine.field" -> "col4",
22 | "hoodie.datasource.write.table.type" -> "MERGE_ON_READ"
23 | )
24 |
25 | val testTable: Seq[TestObject2] = Seq(
26 | TestObject2("string", 5, 0.000000001685400132103450D, new Timestamp(1557153268000L), new Date(1557100800000L), 999999999999999999L),
27 | TestObject2("string2", 5, 0.000000001685400132103450D, new Timestamp(1557153268000L), new Date(1557100800000L), 999999999999999999L),
28 | TestObject2("string3", 5, 0.000000001685400132103450D, new Timestamp(1557153268000L), new Date(1557100800000L), 999999999999999999L)
29 | )
30 |
31 | test("Instantiation of constructors") {
32 |
33 | // New spark session here since Hudi only supports KryoSerializer
34 | val spark: SparkSession = new SparkSessionBuilder().setEnv("local")
35 | .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
36 | .build()
37 | .get()
38 | assume(SparkTestUtils.checkSparkVersion("2.4"))
39 |
40 | import spark.implicits._
41 |
42 | val connector = new HudiConnector(HudiConnectorConf.fromMap(options))
43 | connector.write(testTable.toDF)
44 | assert(connector.read().collect().length == testTable.length)
45 |
46 | val path2: String = Paths.get("src", "test", "resources", "test_hudi_2").toFile.getAbsolutePath
47 | val options2 = options + ("path" -> path2)
48 | val connector2 = new HudiConnector(options2)
49 | connector2.write(testTable.toDF)
50 | assert(connector2.read().collect().length == testTable.length)
51 |
52 | val path3: String = Paths.get("src", "test", "resources", "test_hudi_3").toFile.getAbsolutePath
53 | val options3 = options + ("path" -> path3)
54 | val connector3 = new HudiConnector(Conf.fromMap(options3))
55 | connector3.write(testTable.toDF, Some("any_"))
56 | assert(connector3.read().collect().length == testTable.length)
57 |
58 | val connector7 = new HudiConnector(Properties.hudiConfig)
59 | connector7.write(testTable.toDF)
60 | assert(connector7.read().collect().length == testTable.length)
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/config/FileConnectorConfSuite.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.config
2 |
3 | import io.github.setl.enums.{PathFormat, Storage}
4 | import io.github.setl.exception.ConfException
5 | import org.apache.spark.sql.SaveMode
6 | import org.scalatest.funsuite.AnyFunSuite
7 |
8 | class FileConnectorConfSuite extends AnyFunSuite {
9 |
10 | val conf = new FileConnectorConf()
11 |
12 |
13 | test("Set FileConnectorConf") {
14 | assert(conf.get("storage") === None)
15 | conf.setStorage("CSV")
16 | assert(conf.get("storage").get === "CSV")
17 | conf.setStorage(Storage.EXCEL)
18 | assert(conf.get("storage").get === "EXCEL")
19 |
20 | assert(conf.get("encoding") === None)
21 | conf.setEncoding("latin-1")
22 | assert(conf.get("encoding").get === "latin-1")
23 |
24 | assert(conf.get("saveMode") === None)
25 | conf.setSaveMode("Append")
26 | assert(conf.get("saveMode").get === "Append")
27 | conf.setSaveMode(SaveMode.Overwrite)
28 | assert(conf.get("saveMode").get === "Overwrite")
29 |
30 | assert(conf.get("path") === None)
31 | conf.setPath("path")
32 | assert(conf.get("path").get === "path")
33 |
34 | assert(conf.get("pathFormat") === None)
35 | conf.setPathFormat(PathFormat.WILDCARD)
36 | assert(conf.get("pathFormat").get === "WILDCARD")
37 |
38 | assert(conf.get("credentialsProvider") === None)
39 | conf.setS3CredentialsProvider("credentialsProvider")
40 | assert(conf.get("fs.s3a.aws.credentials.provider").get === "credentialsProvider")
41 |
42 | assert(conf.get("accessKey") === None)
43 | conf.setS3AccessKey("accessKey")
44 | assert(conf.get("fs.s3a.access.key").get === "accessKey")
45 |
46 | assert(conf.get("secretKey") === None)
47 | conf.setS3SecretKey("secretKey")
48 | assert(conf.get("fs.s3a.secret.key").get === "secretKey")
49 |
50 | assert(conf.get("sessionToken") === None)
51 | conf.setS3SessionToken("sessionToken")
52 | assert(conf.get("fs.s3a.session.token").get === "sessionToken")
53 |
54 | assert(conf.get("filenamePattern") === None)
55 | conf.setFilenamePattern("(file)(.*)(\\.csv)")
56 | assert(conf.get("filenamePattern").get === "(file)(.*)(\\.csv)")
57 | }
58 |
59 | test("Getters FileConnectorConf") {
60 | assert(conf.getEncoding === "latin-1")
61 | assert(conf.getSaveMode === SaveMode.Overwrite)
62 | assert(conf.getStorage === Storage.EXCEL)
63 | assert(conf.getPath === "path")
64 | assert(conf.getPathFormat === "WILDCARD")
65 | assert(conf.getSchema === None)
66 | assert(conf.getS3CredentialsProvider === Some("credentialsProvider"))
67 | assert(conf.getS3AccessKey === Some("accessKey"))
68 | assert(conf.getS3SecretKey === Some("secretKey"))
69 | assert(conf.getS3SessionToken === Some("sessionToken"))
70 | assert(conf.getFilenamePattern === Some("(file)(.*)(\\.csv)"))
71 |
72 | val newConf = new FileConnectorConf()
73 | assertThrows[ConfException](newConf.getStorage)
74 | assertThrows[ConfException](newConf.getPath)
75 | }
76 | }
77 |
--------------------------------------------------------------------------------
/src/main/scala/io/github/setl/internal/HasRegistry.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.internal
2 |
3 | import java.util.UUID
4 |
5 | import io.github.setl.annotation.InterfaceStability
6 | import io.github.setl.exception.AlreadyExistsException
7 |
8 | import scala.collection.immutable.ListMap
9 |
10 | /**
11 |  * HasRegistry provides a UUID registry and methods to check whether an
12 | * [[io.github.setl.internal.Identifiable]] object already
13 | * exists in its registry
14 | */
15 | @InterfaceStability.Evolving
16 | trait HasRegistry[T <: Identifiable] {
17 |
18 | /**
19 |  * Registry is a ListMap that keeps registered objects indexed by their UUID
20 | */
21 | private[this] var registry: ListMap[UUID, T] = ListMap.empty
22 |
23 | /**
24 |  * Register a new [[io.github.setl.internal.Identifiable]] in the registry
25 |  *
26 |  * @param item an object that inherits [[io.github.setl.internal.Identifiable]]
27 |  * @throws AlreadyExistsException if the given item has already been registered
28 | */
29 | @throws[AlreadyExistsException]
30 | protected def registerNewItem(item: T): Unit = {
31 | if (hasRegisteredItem(item)) {
32 | throw new AlreadyExistsException(s"The current item ${item.getUUID} of type ${item.getCanonicalName} already exists")
33 | } else {
34 | registry += (item.getUUID -> item)
35 | }
36 | }
37 |
38 | /** Clear the registry */
39 | protected def clearRegistry(): Unit = {
40 | registry = ListMap.empty
41 | }
42 |
43 | /**
44 | * Register multiple items
45 | *
46 |  * @param items a collection of [[io.github.setl.internal.Identifiable]] objects
47 | */
48 | protected def registerNewItems(items: Iterable[T]): Unit = items.foreach(this.registerNewItem)
49 |
50 | /**
51 | * Check if the Identifiable exists in the registry
52 | *
53 |  * @param item an object that inherits [[io.github.setl.internal.Identifiable]]
54 | * @return true if it already exists in the registry, false otherwise
55 | */
56 | def hasRegisteredItem(item: Identifiable): Boolean = this.hasRegisteredItem(item.getUUID)
57 |
58 | /**
59 | * Check if the UUID exists in the registry
60 | *
61 |  * @param uuid a UUID
62 | * @return true if it already exists in the registry, false otherwise
63 | */
64 | def hasRegisteredItem(uuid: UUID): Boolean = registry.contains(uuid)
65 |
66 | /** Return the registry */
67 | def getRegistry: ListMap[UUID, T] = this.registry
68 |
69 | /**
70 | * For a given UUID, return the corresponding registered item
71 | *
72 | * @param uuid uuid
73 | * @return
74 | */
75 | def getRegisteredItem(uuid: UUID): Option[T] = registry.get(uuid)
76 |
77 | /** Return the number of items in the current registry */
78 | def getRegistryLength: Long = registry.size
79 |
80 | /** Return true if the registry is empty, false otherwise */
81 | def isRegistryEmpty: Boolean = registry.isEmpty
82 |
83 | /**
84 | * Return the last registered item
85 | *
86 | * @return if the registry is empty, None will be returned
87 | */
88 | def lastRegisteredItem: Option[T] = if (isRegistryEmpty) {
89 | None
90 | } else {
91 | Option(registry.last._2)
92 | }
93 |
94 | }
95 |
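// Illustrative sketch (hypothetical subclass; `T` stands for any type extending Identifiable in this project):
//
//   class MyRegistry[T <: Identifiable] extends HasRegistry[T] {
//     def add(item: T): Unit = registerNewItem(item) // throws AlreadyExistsException on duplicates
//   }
//
//   // After add(item):
//   //   hasRegisteredItem(item)          -> true
//   //   lastRegisteredItem               -> Some(item)
//   //   getRegisteredItem(item.getUUID)  -> Some(item)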
--------------------------------------------------------------------------------
/src/main/scala/io/github/setl/storage/connector/StructuredStreamingConnector.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.storage.connector
2 |
3 | import io.github.setl.annotation.{Experimental, InterfaceStability}
4 | import io.github.setl.config.{Conf, StructuredStreamingConnectorConf}
5 | import io.github.setl.enums.Storage
6 | import io.github.setl.util.TypesafeConfigUtils
7 | import com.typesafe.config.Config
8 | import org.apache.spark.sql.streaming.{DataStreamReader, DataStreamWriter, StreamingQuery}
9 | import org.apache.spark.sql.{DataFrame, Row}
10 |
11 | /**
12 | * :: Experimental ::
13 | *
14 | * Spark Structured Streaming connector
15 | *
16 | * @param conf configuration, see
17 | * Spark structured streaming documentation for details
18 | */
19 | @Experimental
20 | @InterfaceStability.Unstable
21 | class StructuredStreamingConnector(val conf: StructuredStreamingConnectorConf) extends StreamingConnector {
22 |
23 | private[this] var streamingQuery: StreamingQuery = _
24 |
25 | def this(options: Map[String, String]) = this(StructuredStreamingConnectorConf.fromMap(options))
26 |
27 | def this(config: Config) = this(TypesafeConfigUtils.getMap(config))
28 |
29 | def this(config: Conf) = this(config.toMap)
30 |
31 | override val storage: Storage = Storage.STRUCTURED_STREAMING
32 |
33 | @inline protected val streamReader: DataStreamReader = spark.readStream
34 | .format(conf.getFormat)
35 | .options(conf.getReaderConf)
36 |
37 | protected val streamWriter: DataFrame => DataStreamWriter[Row] = (df: DataFrame) => {
38 | df.writeStream
39 | .outputMode(conf.getOutputMode)
40 | .format(conf.getFormat)
41 | .options(conf.getWriterConf)
42 | }
43 |
44 | override def read(): DataFrame = {
45 | if (conf.has(StructuredStreamingConnectorConf.SCHEMA)) {
46 | logInfo("Apply user-defined schema")
47 | streamReader
48 | .schema(conf.getSchema)
49 | .load()
50 | } else {
51 | streamReader.load()
52 | }
53 | }
54 |
55 | override def write(t: DataFrame, suffix: Option[String]): Unit = {
56 | logWarning("Suffix will be ignored by StructuredStreamingConnector")
57 | write(t)
58 | }
59 |
60 | override def write(t: DataFrame): Unit = {
61 | streamingQuery = streamWriter(t).start()
62 | }
63 |
64 | /**
65 |  * Wait for the execution to stop. Any exceptions that occur during the execution
66 | * will be thrown in this thread.
67 | */
68 | override def awaitTermination(): Unit = streamingQuery.awaitTermination()
69 |
70 | /**
71 |  * Wait for the execution to stop. Any exceptions that occur during the execution
72 | * will be thrown in this thread.
73 | *
74 | * @param timeout time to wait in milliseconds
75 |  * @return `true` if it's stopped; or throws the reported error during the execution; or `false`
76 | * if the waiting time elapsed before returning from the method.
77 | */
78 | override def awaitTerminationOrTimeout(timeout: Long): Boolean = streamingQuery.awaitTermination(timeout)
79 |
80 | /**
81 | * Stops the execution of this query if it is running.
82 | */
83 | override def stop(): Unit = streamingQuery.stop()
84 | }
85 |
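// A minimal configuration sketch mirroring the options used in StructuredStreamingConnectorSuite
// (the paths below are placeholders, not values from the original project):
//
//   val input  = new StructuredStreamingConnector(Map("format" -> "text", "path" -> "data/input"))
//   val output = new StructuredStreamingConnector(Map(
//     "format" -> "parquet",
//     "outputMode" -> "append",
//     "checkpointLocation" -> "data/checkpoint",
//     "path" -> "data/output"
//   ))
//   output.write(input.read())              // starts the streaming query
//   output.awaitTerminationOrTimeout(10000) // wait up to 10 seconds for the query to stop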
--------------------------------------------------------------------------------
/src/main/scala/io/github/setl/workflow/SimplePipelineOptimizer.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.workflow
2 |
3 | import io.github.setl.annotation.InterfaceStability
4 | import io.github.setl.internal.Logging
5 |
6 | import scala.annotation.tailrec
7 |
8 | @InterfaceStability.Unstable
9 | class SimplePipelineOptimizer(val parallelism: Int = 4) extends PipelineOptimizer with Logging {
10 |
11 | private[this] var _executionPlan: DAG = _
12 | lazy val optExecutionPlan: DAG = optimize()
13 |
14 | override def getOptimizedExecutionPlan: DAG = optExecutionPlan
15 |
16 | override def setExecutionPlan(dag: DAG): SimplePipelineOptimizer.this.type = {
17 | this._executionPlan = dag
18 | this
19 | }
20 |
21 | private[this] def optimize(): DAG = {
22 | val nodes = _executionPlan.nodes.toList.sortBy(_.stage)
23 | val oldDag = _executionPlan.copy()
24 | nodes.foldLeft[DAG](oldDag) {
25 | case (dag, node) => updateNode(node, dag)
26 | }
27 | }
28 |
29 | override def optimize(stages: Iterable[Stage]): Array[Stage] = {
30 | val factories = stages.flatMap(_.factories)
31 |
32 | optExecutionPlan.nodes.groupBy(_.stage).map {
33 | case (id, nodes) =>
34 | val stage = new Stage().setStageId(id)
35 |
36 | val factoryUUIDs = nodes.map(_.factoryUUID)
37 |
38 | factories
39 | .filter(f => factoryUUIDs.contains(f.getUUID))
40 | .foreach(stage.addFactory)
41 |
42 | stage
43 | }.toArray.sortBy(_.stageId)
44 | }
45 |
46 | private[this] def flowsOf(node: Node, dag: DAG): Set[Flow] = {
47 | dag.flows.filter(_.to.factoryUUID == node.factoryUUID)
48 | }
49 |
50 | private[this] def updateDag(newNode: Node, dag: DAG): DAG = {
51 | logDebug(s"Update DAG for node ${newNode.getPrettyName}")
52 | val oldNode = dag.nodes.find(_.factoryUUID == newNode.factoryUUID).get
53 |
54 | val startingFlows = dag.flows
55 | .filter(_.from == oldNode)
56 | .map(_.copy(from = newNode))
57 |
58 | val endingFlows = dag.flows
59 | .filter(_.to == oldNode)
60 | .map(_.copy(to = newNode))
61 |
62 | val otherFlows = dag.flows.filter(_.from != oldNode).filter(_.to != oldNode)
63 |
64 | val otherNodes = dag.nodes.filter(_ != oldNode)
65 |
66 | DAG(otherNodes + newNode, startingFlows ++ endingFlows ++ otherFlows)
67 | }
68 |
69 | @tailrec
70 | private[this] def validateStage(newStageID: Int, dag: DAG): Int = {
71 | val nodeCount = dag.nodes.count(_.stage == newStageID)
72 | if (nodeCount < parallelism) {
73 | logDebug(s"Valid stage ID: $newStageID")
74 | newStageID
75 | } else {
76 | validateStage(newStageID + 1, dag)
77 | }
78 | }
79 |
80 | private[this] def updateNode(oldNode: Node, dag: DAG): DAG = {
81 | logDebug(s"Optimize node: ${oldNode.getPrettyName} of stage ${oldNode.stage}")
82 | val currentDag = dag.copy()
83 | val flows = flowsOf(oldNode, dag)
84 |
85 | val maxInputStage = flows.size match {
86 | case 0 => 0
87 | case _ => flows.map(_.stage).max + 1
88 | }
89 |
90 | logDebug(s"Max input stage of ${oldNode.getPrettyName}: $maxInputStage")
91 |
92 | val validStage = validateStage(maxInputStage, dag)
93 |
94 | val newNode = oldNode.copy(stage = validStage)
95 |
96 | updateDag(newNode, currentDag)
97 | }
98 | }
99 |
--------------------------------------------------------------------------------
/src/main/scala/io/github/setl/util/TypesafeConfigUtils.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.util
2 |
3 | import io.github.setl.enums.Storage
4 | import com.typesafe.config.{Config, ConfigException}
5 |
6 | object TypesafeConfigUtils {
7 |
8 | @throws[com.typesafe.config.ConfigException]
9 | def getAs[T](config: Config, path: String)(implicit getter: ConfigGetter[T]): Option[T] = getter.get(config, path)
10 |
11 | private[this] def _get[T](path: String): (String => T) => Option[T] = (fun: String => T) => {
12 | try {
13 | Option(fun(path))
14 | } catch {
15 | case _: ConfigException.Missing => None
16 | case e: ConfigException.WrongType => throw e
17 | }
18 | }
19 |
20 | private[setl] implicit val stringGetter: ConfigGetter[String] = new ConfigGetter[String] {
21 | override def get(config: Config, path: String): Option[String] = {
22 | _get[String](path)(config.getString)
23 | }
24 | }
25 |
26 | private[setl] implicit val intGetter: ConfigGetter[Int] = new ConfigGetter[Int] {
27 | override def get(config: Config, path: String): Option[Int] = {
28 | _get[Int](path)(config.getInt)
29 | }
30 | }
31 |
32 | private[setl] implicit val longGetter: ConfigGetter[Long] = new ConfigGetter[Long] {
33 | override def get(config: Config, path: String): Option[Long] = {
34 | _get[Long](path)(config.getLong)
35 | }
36 | }
37 |
38 | private[setl] implicit val floatGetter: ConfigGetter[Float] = new ConfigGetter[Float] {
39 | override def get(config: Config, path: String): Option[Float] = {
40 | _get[Float](path)(x => config.getString(x).toFloat)
41 | }
42 | }
43 |
44 | private[setl] implicit val doubleGetter: ConfigGetter[Double] = new ConfigGetter[Double] {
45 | override def get(config: Config, path: String): Option[Double] = {
46 | _get[Double](path)(config.getDouble)
47 | }
48 | }
49 |
50 | private[setl] implicit val booleanGetter: ConfigGetter[Boolean] = new ConfigGetter[Boolean] {
51 | override def get(config: Config, path: String): Option[Boolean] = {
52 | _get[Boolean](path)(config.getBoolean)
53 | }
54 | }
55 |
56 | private[setl] implicit val listGetter: ConfigGetter[Array[AnyRef]] = new ConfigGetter[Array[AnyRef]] {
57 | override def get(config: Config, path: String): Option[Array[AnyRef]] = {
58 | _get[Array[AnyRef]](path)(x => config.getList(x).unwrapped().toArray())
59 | }
60 | }
61 |
62 | private[setl] implicit val StorageGetter: ConfigGetter[Storage] = new ConfigGetter[Storage] {
63 | override def get(config: Config, path: String): Option[Storage] = {
64 | _get[Storage](path)(x => Storage.valueOf(config.getString(x)))
65 | }
66 | }
67 |
68 | def getList(config: Config, path: String): Option[Array[AnyRef]] = {
69 | listGetter.get(config, path)
70 | }
71 |
72 | def getMap(config: Config): Map[String, String] = {
73 | import scala.collection.JavaConverters._
74 | config.entrySet().asScala.map(x => x.getKey -> x.getValue.unwrapped().toString).toMap
75 | }
76 |
77 | def isDefined(config: Config, path: String): Boolean = {
78 | try {
79 | config.getAnyRef(path) != null
80 | } catch {
81 | case _: ConfigException => false
82 | }
83 | }
84 |
85 | private[setl] trait ConfigGetter[T] {
86 | def get(config: Config, path: String): Option[T]
87 | }
88 |
89 | }
90 |
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/config/JDBCConnectorConfSuite.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.config
2 |
3 | import org.apache.spark.sql.SaveMode
4 | import org.scalatest.funsuite.AnyFunSuite
5 |
6 | class JDBCConnectorConfSuite extends AnyFunSuite {
7 |
8 | val conf = new JDBCConnectorConf()
9 | val url = "url"
10 | val dbTable = "dbtable"
11 | val user = "user"
12 | val password = "password"
13 | val numPartitions = "numPartitions"
14 | val partitionColumn = "partitionColumn"
15 | val lowerBound = "lowerBound"
16 | val upperBound = "upperBound"
17 | val fetchSize = "fetchsize"
18 | val batchSize = "batchsize"
19 | val truncate = "truncate"
20 | val driver = "driver"
21 |
22 | test("Set JDBCConnectorConf") {
23 | assert(conf.get(url) === None)
24 | conf.setUrl(url)
25 | assert(conf.get(url).get === url)
26 |
27 | assert(conf.get(dbTable) === None)
28 | conf.setDbTable(dbTable)
29 | assert(conf.get(dbTable).get === dbTable)
30 |
31 | assert(conf.get(user) === None)
32 | conf.setUser(user)
33 | assert(conf.get(user).get === user)
34 |
35 | assert(conf.get(password) === None)
36 | conf.setPassword(password)
37 | assert(conf.get(password).get === password)
38 |
39 | assert(conf.get("saveMode") === None)
40 | conf.setSaveMode("Overwrite")
41 | assert(conf.get("saveMode").get === "Overwrite")
42 |
43 | conf.setSaveMode(SaveMode.Append)
44 | assert(conf.get("saveMode").get === "Append")
45 |
46 | assert(conf.get(numPartitions) === None)
47 | conf.setNumPartitions(numPartitions)
48 | assert(conf.get(numPartitions).get === numPartitions)
49 |
50 | assert(conf.get(partitionColumn) === None)
51 | conf.setPartitionColumn(partitionColumn)
52 | assert(conf.get(partitionColumn).get === partitionColumn)
53 |
54 | assert(conf.get(lowerBound) === None)
55 | conf.setLowerBound(lowerBound)
56 | assert(conf.get(lowerBound).get === lowerBound)
57 |
58 | assert(conf.get(upperBound) === None)
59 | conf.setUpperBound(upperBound)
60 | assert(conf.get(upperBound).get === upperBound)
61 |
62 | assert(conf.get(fetchSize) === None)
63 | conf.setFetchSize(fetchSize)
64 | assert(conf.get(fetchSize).get === fetchSize)
65 |
66 | assert(conf.get(batchSize) === None)
67 | conf.setBatchSize(batchSize)
68 | assert(conf.get(batchSize).get === batchSize)
69 |
70 | assert(conf.get(truncate) === None)
71 | conf.setTruncate(truncate)
72 | assert(conf.get(truncate).get === truncate)
73 |
74 | assert(conf.get(driver) === None)
75 | conf.setDriver(driver)
76 | assert(conf.get(driver).get === driver)
77 | }
78 |
79 | test("Getters of JDBCConnectorConf") {
80 | assert(conf.getUrl === Some(url))
81 | assert(conf.getDbTable === Some(dbTable))
82 | assert(conf.getUser === Some(user))
83 | assert(conf.getPassword === Some(password))
84 | assert(conf.getSaveMode === Some("Append"))
85 | assert(conf.getNumPartitions === Some(numPartitions))
86 | assert(conf.getPartitionColumn === Some(partitionColumn))
87 | assert(conf.getLowerBound === Some(lowerBound))
88 | assert(conf.getUpperBound === Some(upperBound))
89 | assert(conf.getFetchSize === Some(fetchSize))
90 | assert(conf.getBatchSize === Some(batchSize))
91 | assert(conf.getTruncate === Some(truncate))
92 | assert(conf.getDriver === Some(driver))
93 | }
94 | }
95 |
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/internal/StructAnalyserSuite.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.internal
2 |
3 | import io.github.setl.annotation.{ColumnName, CompoundKey, Compress}
4 | import io.github.setl.internal.TestClasses.TestStructAnalyser
5 | import io.github.setl.storage.{Compressor, XZCompressor}
6 | import org.apache.spark.sql.types.StructType
7 | import org.scalatest.funsuite.AnyFunSuite
8 |
9 | class StructAnalyserSuite extends AnyFunSuite {
10 |
11 | val schema: StructType = StructAnalyser.analyseSchema[TestStructAnalyser]
12 |
13 | test("StructAnalyser should be able to handle @ColumnName") {
14 | val fields = schema.filter(_.metadata.contains(classOf[ColumnName].getCanonicalName))
15 |
16 | assert(fields.length === 1)
17 | assert(fields.head.name === "col1")
18 | assert(fields.head.metadata.getStringArray(classOf[ColumnName].getCanonicalName) === Array("alias1"))
19 |
20 | }
21 |
22 | test("StructAnalyser should be able to handle @CompoundKey") {
23 | val fields = schema.filter(_.metadata.contains(classOf[CompoundKey].getCanonicalName))
24 |
25 | assert(fields.length === 2)
26 | assert(fields.map(_.name) === Array("col2", "col22"))
27 | assert(fields.map(_.metadata.getStringArray(classOf[CompoundKey].getCanonicalName)).map(_ (0)) === List("test!@1", "test!@2"))
28 | }
29 |
30 | test("StructAnalyser should be able to handle @Compress") {
31 | val fields = schema.filter(_.metadata.contains(classOf[Compress].getCanonicalName))
32 |
33 | assert(fields.length === 2)
34 | assert(fields.map(_.name) === Array("col3", "col4"))
35 |
36 | assert(
37 | fields
38 | .find(_.name == "col3")
39 | .get.metadata
40 | .getStringArray(classOf[Compress].getCanonicalName)(0) === classOf[XZCompressor].getCanonicalName
41 | )
42 |
43 | assert(
44 | fields
45 | .find(_.name == "col4")
46 | .get.metadata
47 | .getStringArray(classOf[Compress].getCanonicalName)(0) === classOf[Compressor].getCanonicalName
48 | )
49 | }
50 |
51 | test("[SETL-34] StructAnalyser should handle multiple @CompoundKey annotations") {
52 | val structType = StructAnalyser.analyseSchema[TestClasses.MultipleCompoundKeyTest]
53 | structType.foreach { x =>
54 | println(s"name: ${x.name}, type: ${x.dataType}, meta: ${x.metadata}")
55 | }
56 |
57 | assert(structType.find(_.name == "col1").get.metadata.getStringArray(classOf[CompoundKey].getCanonicalName) === Array("sort!@1","part!@1"))
58 | assert(structType.find(_.name == "col2").get.metadata.getStringArray(classOf[CompoundKey].getCanonicalName) === Array("sort!@2"))
59 | assert(structType.find(_.name == "col3").get.metadata.getStringArray(classOf[CompoundKey].getCanonicalName) === Array("part!@2"))
60 | }
61 |
62 |
63 | test("StructAnalyser should be able to find columns with @CompoundKey") {
64 | val primaryColumns1 = StructAnalyser.findCompoundColumns[TestClasses.MultipleCompoundKeyTest]
65 | val primaryColumns2 = StructAnalyser.findCompoundColumns[TestClasses.MyObject]
66 |
67 | assert(primaryColumns1.length == 3)
68 | assert(primaryColumns1 === Array("col1", "col2", "COLUMN_3"))
69 | assert(primaryColumns2.isEmpty)
70 | assert(primaryColumns2 === Array())
71 | }
72 |
73 | test("[SETL-34] StructAnalyser should throw exception when there are more than one ColumnName annotation") {
74 | assertThrows[IllegalArgumentException](StructAnalyser.analyseSchema[TestClasses.WrongClass])
75 | }
76 |
77 | }
78 |
--------------------------------------------------------------------------------
/src/main/scala/io/github/setl/storage/repository/Repository.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.storage.repository
2 |
3 | import io.github.setl.annotation.InterfaceStability
4 | import io.github.setl.storage.Condition
5 | import org.apache.spark.sql.{Column, DataFrame, Dataset}
6 |
7 | /**
8 | * The goal of Repository is to significantly reduce the amount of boilerplate code required to
9 | * implement data access layers for various persistence stores.
10 | *
11 | * @tparam DT data type
12 | */
13 | @InterfaceStability.Evolving
14 | trait Repository[DT] {
15 |
16 | /**
17 | * Find data by giving a set of conditions
18 | *
19 | * @param conditions Set of [[Condition]]
20 | * @return
21 | */
22 | def findBy(conditions: Set[Condition]): DT
23 |
24 | /**
25 | * Find data by giving a single condition
26 | *
27 | * @param condition a [[Condition]]
28 | * @return
29 | */
30 | def findBy(condition: Condition): DT = this.findBy(Set(condition))
31 |
32 | /**
33 | * Find data by giving a Spark sql column
34 | *
35 | * @param column a column object (could be chained)
36 | * @return
37 | */
38 | def findBy(column: Column): DT = this.findBy(Condition(column))
39 |
40 | /**
41 | * Retrieve all data
42 | *
43 | * @return
44 | */
45 | def findAll(): DT
46 |
47 | /**
48 | * Save a [[Dataset]] into a data persistence store
49 | *
50 | * @param data data to be saved
51 | * @param suffix an optional string to separate data
52 | * @return this repository instance
53 | */
54 | def save(data: DT, suffix: Option[String]): this.type
55 |
56 |
57 | /**
58 | * Update/Insert a [[Dataset]] into a data persistence store
59 | *
60 | * @param data data to be saved
61 | * @return this repository instance
62 | */
63 | def update(data: DT): this.type
64 |
65 | /**
66 | * Drop the entire table/file/directory
67 | * @return this repository instance
68 | */
69 | def drop(): this.type
70 |
71 | def delete(query: String): this.type
72 |
73 | /**
74 | * Create a data storage (e.g. table in a database or file/folder in a file system) with a suffix
75 | *
76 | * @param t data frame to be written
77 | * @param suffix suffix to be appended at the end of the data storage name
78 | */
79 | def create(t: DataFrame, suffix: Option[String]): this.type
80 |
81 | /**
82 | * Create a data storage (e.g. table in a database or file/folder in a file system)
83 | *
84 | * @param t data frame to be written
85 | */
86 | def create(t: DataFrame): this.type
87 |
88 | def vacuum(retentionHours: Double): this.type
89 |
90 | def vacuum(): this.type
91 |
92 | /**
93 |  * Wait for the execution to stop. Any exceptions that occur during the execution
94 | * will be thrown in this thread.
95 | */
96 | def awaitTermination(): Unit
97 |
98 | /**
99 |  * Wait for the execution to stop. Any exceptions that occur during the execution
100 | * will be thrown in this thread.
101 | *
102 | * @param timeout time to wait in milliseconds
103 |  * @return `true` if it's stopped; or throws the reported error during the execution; or `false`
104 | * if the waiting time elapsed before returning from the method.
105 | */
106 | def awaitTerminationOrTimeout(timeout: Long): Boolean
107 |
108 | /**
109 | * Stops the execution of this query if it is running.
110 | */
111 | def stopStreaming(): this.type
112 |
113 | }
114 |
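// Illustrative usage sketch (assuming a concrete implementation such as SparkRepository[T] and the
// Condition class from io.github.setl.storage; the variable names are hypothetical):
//
//   val all      = repository.findAll()                           // load everything
//   val filtered = repository.findBy(Condition("col1", "=", "a")) // load rows matching a condition
//   repository.save(filtered, suffix = None)                      // persist the data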
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 | address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 | professional setting
33 |
34 | ## Our Responsibilities
35 |
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 |
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 |
46 | ## Scope
47 |
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 |
55 | ## Enforcement
56 |
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at setl@qinxuzhou.com. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 |
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 |
68 | ## Attribution
69 |
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 |
73 | [homepage]: https://www.contributor-covenant.org
74 |
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 |
--------------------------------------------------------------------------------
/src/test/scala/io/github/setl/storage/connector/StructuredStreamingConnectorSuite.scala:
--------------------------------------------------------------------------------
1 | package io.github.setl.storage.connector
2 |
3 | import io.github.setl.SparkSessionBuilder
4 | import io.github.setl.config.Conf
5 | import org.apache.spark.sql.SparkSession
6 | import org.scalatest.funsuite.AnyFunSuite
7 |
8 | class StructuredStreamingConnectorSuite extends AnyFunSuite {
9 |
10 | val inputConf: Map[String, String] = Map(
11 | "format" -> "text",
12 | "path" -> "src/test/resources/streaming_test_resources/input"
13 | )
14 |
15 | val consoleOutputConf: Map[String, String] = Map(
16 | "format" -> "console",
17 | "outputMode" -> "append"
18 | )
19 |
20 | val parquetOutputConf: Map[String, String] = Map(
21 | "format" -> "PARQUET",
22 | "outputMode" -> "append",
23 | "checkpointLocation" -> "src/test/resources/streaming_test_resources/output/checkpoint_1",
24 | "path" -> "src/test/resources/streaming_test_resources/output/1"
25 | )
26 |
27 | test("StructuredStreamingConnector instantiation") {
28 | val spark: SparkSession = new SparkSessionBuilder().setEnv("local").build().get()
29 | import spark.implicits._
30 |
31 | val _conf = Conf.fromMap(parquetOutputConf) // same sink options wrapped in a Conf, to exercise the Conf-based constructor
32 |
33 | val connector = new StructuredStreamingConnector(inputConf) // streaming source built from a Map: reads the text input directory
34 | val outputConnector = new StructuredStreamingConnector(_conf) // streaming sink built from a Conf: writes parquet with a checkpoint location
35 | val parquetConnector = new ParquetConnector(parquetOutputConf) // batch connector on the same output path, used to verify the result
36 |
37 | val input = connector.read() // returns a streaming DataFrame over the text source
38 |
39 | outputConnector.write(input, Option("suffix_should_be_ignored")) // the suffix argument does not apply to a streaming sink and is ignored
40 | outputConnector.awaitTerminationOrTimeout(10000) // wait up to 10000 ms for the streaming query to process the input
41 |
42 | parquetConnector.read().show() // re-read the streamed output in batch mode before asserting on its content
43 | assert(parquetConnector.read().as[String].collect().mkString(" ") === StructuredStreamingConnectorSuite.text)
44 | }
45 |
46 | }
47 |
48 | object StructuredStreamingConnectorSuite {
49 | val text = "Structured Streaming is a scalable and fault-tolerant stream processing engine built on the Spark SQL engine. You can express your streaming computation the same way you would express a batch computation on static data. The Spark SQL engine will take care of running it incrementally and continuously and updating the final result as streaming data continues to arrive. You can use the Dataset/DataFrame API in Scala, Java, Python or R to express streaming aggregations, event-time windows, stream-to-batch joins, etc. The computation is executed on the same optimized Spark SQL engine. Finally, the system ensures end-to-end exactly-once fault-tolerance guarantees through checkpointing and Write-Ahead Logs. In short, Structured Streaming provides fast, scalable, fault-tolerant, end-to-end exactly-once stream processing without the user having to reason about streaming. Internally, by default, Structured Streaming queries are processed using a micro-batch processing engine, which processes data streams as a series of small batch jobs thereby achieving end-to-end latencies as low as 100 milliseconds and exactly-once fault-tolerance guarantees. However, since Spark 2.3, we have introduced a new low-latency processing mode called Continuous Processing, which can achieve end-to-end latencies as low as 1 millisecond with at-least-once guarantees. Without changing the Dataset/DataFrame operations in your queries, you will be able to choose the mode based on your application requirements. In this guide, we are going to walk you through the programming model and the APIs. We are going to explain the concepts mostly using the default micro-batch processing model, and then later discuss Continuous Processing model. First, let’s start with a simple example of a Structured Streaming query - a streaming word count."
50 | }
51 |
--------------------------------------------------------------------------------