├── project
│   ├── build.properties
│   └── plugins.sbt
├── src
│   ├── test
│   │   ├── resources
│   │   │   ├── parquet
│   │   │   │   ├── good
│   │   │   │   │   ├── _temporary
│   │   │   │   │   │   └── .placeholder
│   │   │   │   │   └── year=2017
│   │   │   │   │       └── .placeholder
│   │   │   │   └── with_backup_dir
│   │   │   │       └── _backup
│   │   │   │           └── .placeholder
│   │   │   ├── parquet-roundtrip
│   │   │   │   ├── transform
│   │   │   │   │   └── t.sql
│   │   │   │   └── app.yaml
│   │   │   ├── uri-loader
│   │   │   │   ├── without_env_vars
│   │   │   │   ├── with_env_vars
│   │   │   │   ├── with_bogus_includes
│   │   │   │   └── with_includes
│   │   │   ├── runtime-ctx
│   │   │   │   └── spark
│   │   │   │       ├── transform
│   │   │   │       │   ├── client_all.sql
│   │   │   │       │   ├── client_spending.sql
│   │   │   │       │   ├── minor_purchase.sql
│   │   │   │       │   └── item_purchase.sql
│   │   │   │       ├── extract-check
│   │   │   │       │   ├── client.sql
│   │   │   │       │   ├── transaction.sql
│   │   │   │       │   └── item.sql
│   │   │   │       └── transform-check
│   │   │   │           ├── item_purchase.sql
│   │   │   │           ├── minor_purchase.sql
│   │   │   │           └── client_spending.sql
│   │   │   ├── main-utils
│   │   │   │   ├── spark
│   │   │   │   │   ├── extract-check
│   │   │   │   │   │   ├── client.sql
│   │   │   │   │   │   ├── transaction.sql
│   │   │   │   │   │   └── item.sql
│   │   │   │   │   ├── transform-check
│   │   │   │   │   │   ├── item_purchase.sql
│   │   │   │   │   │   ├── minor_purchase.sql
│   │   │   │   │   │   └── client_spending.sql
│   │   │   │   │   └── transform
│   │   │   │   │       ├── client_spending.sql
│   │   │   │   │       ├── item_purchase.sql
│   │   │   │   │       └── minor_purchase.sql
│   │   │   │   └── config
│   │   │   │       └── app.yaml
│   │   │   └── log4j.properties
│   │   └── scala
│   │       └── spark_etl
│   │           ├── parser
│   │           │   └── ParserSpec.scala
│   │           ├── util
│   │           │   ├── DeaultEnvSpec.scala
│   │           │   ├── SparkParserSpec.scala
│   │           │   ├── DepTreeSpec.scala
│   │           │   ├── UriLoaderSpec.scala
│   │           │   └── ValidationSpec.scala
│   │           ├── parquet
│   │           │   ├── PathValidatorSpec.scala
│   │           │   └── WriteReadRoundtripSpec.scala
│   │           ├── CLIOpsSpec.scala
│   │           ├── model
│   │           │   ├── ConfigSpec.scala
│   │           │   └── RuntimeContextSpec.scala
│   │           └── oracle
│   │               └── OracleLoadAppenderSpec.scala
│   └── main
│       ├── scala
│       │   └── spark_etl
│       │       ├── ConfigError.scala
│       │       ├── util
│       │       │   ├── Files.scala
│       │       │   ├── DefaultEnv.scala
│       │       │   ├── BAHelper.scala
│       │       │   ├── SparkParser.scala
│       │       │   ├── UriLoader.scala
│       │       │   ├── DepTree.scala
│       │       │   └── Validation.scala
│       │       ├── model
│       │       │   ├── Load.scala
│       │       │   ├── Extract.scala
│       │       │   ├── Transform.scala
│       │       │   ├── ParametrizedConstructor.scala
│       │       │   ├── Config.scala
│       │       │   ├── Persist.scala
│       │       │   └── RuntimeContext.scala
│       │       ├── LoadWriter.scala
│       │       ├── ExtractReader.scala
│       │       ├── parquet
│       │       │   ├── ParquetLoadWriter.scala
│       │       │   ├── ParquetExtractReader.scala
│       │       │   └── PathValidator.scala
│       │       ├── parser
│       │       │   └── Parser.scala
│       │       ├── oracle
│       │       │   ├── OracleValidator.scala
│       │       │   └── OracleLoadAppender.scala
│       │       ├── CLI.scala
│       │       └── CLIOps.scala
│       └── resources
│           ├── monkey-patch.sh
│           ├── log4j.properties
│           └── run.sh
├── .gitignore
├── .travis.yml
└── README.md

/project/build.properties:
--------------------------------------------------------------------------------
 1 | sbt.version = 0.13.9
--------------------------------------------------------------------------------
/src/test/resources/parquet/good/_temporary/.placeholder:
--------------------------------------------------------------------------------
 1 |
--------------------------------------------------------------------------------
/src/test/resources/parquet/good/year=2017/.placeholder:
--------------------------------------------------------------------------------
 1 |
--------------------------------------------------------------------------------
/src/test/resources/parquet/with_backup_dir/_backup/.placeholder:
--------------------------------------------------------------------------------
 1 |
--------------------------------------------------------------------------------
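The listing above is a standard sbt project: spark_etl is a Spark ETL runner whose pipelines are described in a YAML config (app.yaml) wiring extracts, SQL transforms and loads together, with spark_etl.CLI as the command-line entry point. As a quick orientation before the individual files, a minimal invocation in the style of the CLIOpsSpec shown further down might look like the sketch below; the config path and the env value are illustrative placeholders, not taken from the repository.

import spark_etl.CLI

object ValidateLocalExample {
  def main(args: Array[String]): Unit =
    // -Denv.* values are substituted into ${...} tokens in app.yaml and in the SQL files it references;
    // "validate-local" runs the local config/SQL checks (cf. CLIOps.validateLocal), no live data sources needed.
    CLI.main(Array(
      "-Denv.engine=spark",                        // illustrative env var, mirroring the test configs
      "--conf-uri=file:/path/to/config/app.yaml",  // illustrative path
      "validate-local"))
}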
/src/test/resources/parquet-roundtrip/transform/t.sql: -------------------------------------------------------------------------------- 1 | select s from x -------------------------------------------------------------------------------- /src/test/resources/uri-loader/without_env_vars: -------------------------------------------------------------------------------- 1 | hello there 2 | 123 3 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.3.5") 2 | -------------------------------------------------------------------------------- /src/test/resources/uri-loader/with_env_vars: -------------------------------------------------------------------------------- 1 | 111 ${var1} 222 ${var2} 333 ${var1} 444 2 | -------------------------------------------------------------------------------- /src/test/resources/uri-loader/with_bogus_includes: -------------------------------------------------------------------------------- 1 | === 2 | #include<__bogus_include__> 3 | --- -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | project/target 3 | project/project 4 | **/.DS_Store 5 | .idea/ 6 | *~ 7 | -------------------------------------------------------------------------------- /src/test/resources/runtime-ctx/spark/transform/client_all.sql: -------------------------------------------------------------------------------- 1 | -- transform - client - get all 2 | SELECT * from client -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | scala: 3 | - 2.11.8 4 | script: 5 | - sbt ++$TRAVIS_SCALA_VERSION coverage test coverageReport 6 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/ConfigError.scala: -------------------------------------------------------------------------------- 1 | package spark_etl 2 | 3 | case class ConfigError(msg: String, exc: Option[Throwable] = None) 4 | -------------------------------------------------------------------------------- /src/test/resources/uri-loader/with_includes: -------------------------------------------------------------------------------- 1 | === 2 | #include 3 | --- 4 | #include 5 | +++ -------------------------------------------------------------------------------- /src/main/scala/spark_etl/util/Files.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.util 2 | 3 | import java.io.File 4 | 5 | object Files { 6 | def pwd = new File(".").getCanonicalPath 7 | def rootResource = getClass.getResource("/").getFile 8 | } 9 | -------------------------------------------------------------------------------- /src/test/resources/parquet-roundtrip/app.yaml: -------------------------------------------------------------------------------- 1 | extracts: 2 | - name: x 3 | uri: "${path}/x" 4 | 5 | transforms: 6 | - name: transform 7 | sql: "/parquet-roundtrip/transform/t.sql" 8 | 9 | loads: 10 | - name: y 11 | source: transform 12 | uri: "${path}/y" 13 | -------------------------------------------------------------------------------- /src/test/resources/runtime-ctx/spark/extract-check/client.sql: 
-------------------------------------------------------------------------------- 1 | -- pre-check - client 2 | SELECT -- null checks 3 | (SELECT count(1) FROM client WHERE id IS NULL) = 0 AS id_null_less, 4 | (SELECT count(1) FROM client WHERE name IS NULL) = 0 AS name_null_less, 5 | (SELECT count(1) FROM client WHERE age IS NULL) = 0 AS age_null_less -------------------------------------------------------------------------------- /src/main/scala/spark_etl/model/Load.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.model 2 | 3 | import net.jcazevedo.moultingyaml.DefaultYamlProtocol 4 | 5 | case class Load(name: String, source: String, uri: String, partition_by: Option[List[String]] = None) 6 | 7 | object Load extends DefaultYamlProtocol { 8 | implicit val yamlFormat = yamlFormat4(Load.apply) 9 | } 10 | -------------------------------------------------------------------------------- /src/test/resources/main-utils/spark/extract-check/client.sql: -------------------------------------------------------------------------------- 1 | -- pre-check - client 2 | SELECT -- null checks 3 | (SELECT ${count_fun}(1) FROM client WHERE id IS NULL) = 0 AS id_null_less, 4 | (SELECT ${count_fun}(1) FROM client WHERE name IS NULL) = 0 AS name_null_less, 5 | (SELECT ${count_fun}(1) FROM client WHERE age IS NULL) = 0 AS age_null_less -------------------------------------------------------------------------------- /src/main/resources/monkey-patch.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | usage() { 4 | echo " Usage:" 5 | echo " " 6 | exit 1 7 | } 8 | 9 | if [[ $# -lt 2 ]]; then usage; fi 10 | jar=$1 11 | file=$2 12 | 13 | rm -rf jar_exploded 14 | mkdir jar_exploded 15 | pushd jar_exploded 16 | unzip -q ../$jar 17 | vi $file 18 | zip -ur ../$jar $file 19 | popd 20 | rm -rf jar_exploded -------------------------------------------------------------------------------- /src/main/scala/spark_etl/model/Extract.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.model 2 | 3 | import net.jcazevedo.moultingyaml.DefaultYamlProtocol 4 | 5 | case class Extract(name: String, uri: String, cache: Option[Boolean] = None, persist: Option[Persist] = None, check: Option[String] = None) 6 | 7 | object Extract extends DefaultYamlProtocol { 8 | implicit val yamlFormat = yamlFormat5(Extract.apply) 9 | } 10 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/model/Transform.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.model 2 | 3 | import net.jcazevedo.moultingyaml.DefaultYamlProtocol 4 | 5 | case class Transform(name: String, sql: String, cache: Option[Boolean] = None, persist: Option[Persist] = None, check: Option[String] = None) 6 | 7 | object Transform extends DefaultYamlProtocol { 8 | implicit val yamlFormat = yamlFormat5(Transform.apply) 9 | } 10 | -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootCategory=INFO, console 2 | log4j.logger.org.apache.spark=WARN 3 | log4j.logger.org.spark_project=WARN 4 | log4j.appender.console=org.apache.log4j.ConsoleAppender 5 | log4j.appender.console.target=System.out 6 | 
log4j.appender.console.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.console.layout.ConversionPattern=%d{ISO8601} logLevel=%p thread=%t class=%C line_number=%L %m%n 8 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/LoadWriter.scala: -------------------------------------------------------------------------------- 1 | package spark_etl 2 | 3 | import org.apache.spark.sql.DataFrame 4 | import spark_etl.model.Load 5 | import spark_etl.util.Validation 6 | 7 | abstract class LoadWriter(params: Map[String, Any]) { 8 | def write(loadsAndDfs: Seq[(Load, DataFrame)]): Unit 9 | def checkLocal(loads: Seq[Load]): Validation[ConfigError, Unit] 10 | def checkRemote(loads: Seq[Load]): Validation[ConfigError, Unit] 11 | } 12 | -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootCategory=WARN, console 2 | log4j.logger.org.apache.hadoop=ERROR 3 | log4j.logger.org.apache.spark=ERROR 4 | log4j.logger.org.spark_project=ERROR 5 | log4j.appender.console=org.apache.log4j.ConsoleAppender 6 | log4j.appender.console.target=System.out 7 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.console.layout.ConversionPattern=%d{ISO8601} logLevel=%p thread=%t class=%C line_number=%L %m%n 9 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/ExtractReader.scala: -------------------------------------------------------------------------------- 1 | package spark_etl 2 | 3 | import org.apache.spark.sql.{DataFrame, SparkSession} 4 | import spark_etl.model.Extract 5 | import spark_etl.util.Validation 6 | 7 | abstract class ExtractReader(params: Map[String, Any]) { 8 | def checkLocal(extracts: Seq[Extract]): Validation[ConfigError, Unit] 9 | def checkRemote(extracts: Seq[Extract]): Validation[ConfigError, Unit] 10 | def read(extracts: Seq[Extract])(implicit spark: SparkSession): Seq[(Extract, DataFrame)] 11 | } 12 | -------------------------------------------------------------------------------- /src/test/resources/main-utils/spark/transform-check/item_purchase.sql: -------------------------------------------------------------------------------- 1 | SELECT -- null checks 2 | (SELECT count(1) FROM item_purchase WHERE id IS NULL) = 0 AS id_null_less, 3 | (SELECT count(1) FROM item_purchase WHERE name IS NULL) = 0 AS name_null_less, 4 | -- min checks 5 | min(id) > 0 AS id_positive_ok, 6 | min(total_purchase) > 0 AS total_purchase_ok, 7 | -- col width checks 8 | max(${length_fun}(name)) <= 10 AS name_ok 9 | FROM item_purchase -------------------------------------------------------------------------------- /src/test/resources/runtime-ctx/spark/transform-check/item_purchase.sql: -------------------------------------------------------------------------------- 1 | SELECT -- null checks 2 | (SELECT count(1) FROM item_purchase WHERE id IS NULL) = 0 AS id_null_less, 3 | (SELECT count(1) FROM item_purchase WHERE name IS NULL) = 0 AS name_null_less, 4 | -- min checks 5 | min(id) > 0 AS id_positive_ok, 6 | min(total_purchase) > 0 AS total_purchase_ok, 7 | -- col width checks 8 | max(length(name)) <= 10 AS name_ok 9 | FROM item_purchase -------------------------------------------------------------------------------- /src/test/resources/main-utils/spark/transform-check/minor_purchase.sql: 
-------------------------------------------------------------------------------- 1 | SELECT -- null checks 2 | (SELECT count(1) FROM minor_purchase WHERE id IS NULL) = 0 AS id_null_less, 3 | (SELECT count(1) FROM minor_purchase WHERE name IS NULL) = 0 AS name_null_less, 4 | -- min checks 5 | min(id) > 0 AS id_positive_ok, 6 | min(sold_to_minors) > 0 AS sold_to_minors_ok, 7 | -- col width checks 8 | max(length(name)) <= 10 AS name_ok 9 | FROM minor_purchase -------------------------------------------------------------------------------- /src/test/resources/runtime-ctx/spark/transform-check/minor_purchase.sql: -------------------------------------------------------------------------------- 1 | SELECT -- null checks 2 | (SELECT count(1) FROM minor_purchase WHERE id IS NULL) = 0 AS id_null_less, 3 | (SELECT count(1) FROM minor_purchase WHERE name IS NULL) = 0 AS name_null_less, 4 | -- min checks 5 | min(id) > 0 AS id_positive_ok, 6 | min(sold_to_minors) > 0 AS sold_to_minors_ok, 7 | -- col width checks 8 | max(length(name)) <= 10 AS name_ok 9 | FROM minor_purchase -------------------------------------------------------------------------------- /src/test/resources/main-utils/spark/transform-check/client_spending.sql: -------------------------------------------------------------------------------- 1 | SELECT -- null checks 2 | (SELECT count(1) FROM client_spending WHERE id IS NULL) = 0 AS id_null_less, 3 | (SELECT count(1) FROM client_spending WHERE name IS NULL) = 0 AS name_null_less, 4 | -- min checks 5 | min(id) > 0 AS id_positive_ok, 6 | min(total_spending) > 0 AS total_spending_ok, 7 | -- col width checks 8 | max(length(name)) <= 10 AS name_ok 9 | FROM client_spending -------------------------------------------------------------------------------- /src/test/resources/runtime-ctx/spark/transform-check/client_spending.sql: -------------------------------------------------------------------------------- 1 | SELECT -- null checks 2 | (SELECT count(1) FROM client_spending WHERE id IS NULL) = 0 AS id_null_less, 3 | (SELECT count(1) FROM client_spending WHERE name IS NULL) = 0 AS name_null_less, 4 | -- min checks 5 | min(id) > 0 AS id_positive_ok, 6 | min(total_spending) > 0 AS total_spending_ok, 7 | -- col width checks 8 | max(length(name)) <= 10 AS name_ok 9 | FROM client_spending -------------------------------------------------------------------------------- /src/test/scala/spark_etl/parser/ParserSpec.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.parser 2 | 3 | import org.scalatest.{FlatSpec, Inside, Matchers} 4 | import spark_etl.parser.Parser._ 5 | 6 | class ParserSpec extends FlatSpec with Matchers with Inside { 7 | "Parser" should "resolve all UnresolvedRelations" in { 8 | getDsos("SELECT a from b.b").toSet shouldBe Set("b.b") 9 | getDsos("SELECT a from b.c").toSet shouldBe Set("b.c") 10 | getDsos("SELECT a from b" ).toSet shouldBe Set("b") 11 | getDsos("SELECT a.x from b").toSet shouldBe Set("b") 12 | getDsos("SELECT z.x from b").toSet shouldBe Set("b") 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /src/test/resources/main-utils/spark/extract-check/transaction.sql: -------------------------------------------------------------------------------- 1 | -- pre-check - transaction 2 | SELECT -- null checks 3 | (SELECT count(1) FROM transaction WHERE client_id IS NULL) = 0 AS client_id_null_less, 4 | (SELECT count(1) FROM transaction WHERE item_id IS NULL) = 0 AS 
item_id_null_less, 5 | (SELECT count(1) FROM transaction WHERE quantity IS NULL) = 0 AS quantity_null_less, 6 | -- positive price checks 7 | (SELECT count(1) FROM (SELECT CAST(quantity AS INTEGER) FROM transaction) WHERE quantity < 0) = 0 AS has_positive_quantity -------------------------------------------------------------------------------- /src/test/resources/runtime-ctx/spark/extract-check/transaction.sql: -------------------------------------------------------------------------------- 1 | -- pre-check - transaction 2 | SELECT -- null checks 3 | (SELECT count(1) FROM transaction WHERE client_id IS NULL) = 0 AS client_id_null_less, 4 | (SELECT count(1) FROM transaction WHERE item_id IS NULL) = 0 AS item_id_null_less, 5 | (SELECT count(1) FROM transaction WHERE quantity IS NULL) = 0 AS quantity_null_less, 6 | -- positive price checks 7 | (SELECT count(1) FROM (SELECT CAST(quantity AS INTEGER) FROM transaction) WHERE quantity < 0) = 0 AS has_positive_quantity -------------------------------------------------------------------------------- /src/test/resources/main-utils/spark/extract-check/item.sql: -------------------------------------------------------------------------------- 1 | -- pre-check - item 2 | SELECT -- null checks 3 | (SELECT count(1) FROM item WHERE id IS NULL) = 0 AS id_null_less, 4 | (SELECT count(1) FROM item WHERE name IS NULL) = 0 AS name_null_less, 5 | (SELECT count(1) FROM item WHERE price IS NULL) = 0 AS age_null_less, 6 | (SELECT count(1) FROM item WHERE for_adults IS NULL) = 0 AS for_adults_null_less, 7 | -- positive price checks 8 | (SELECT count(1) FROM (SELECT CAST(price AS INTEGER) FROM item) WHERE price < 0) = 0 AS has_positive_prices -------------------------------------------------------------------------------- /src/test/resources/runtime-ctx/spark/extract-check/item.sql: -------------------------------------------------------------------------------- 1 | -- pre-check - item 2 | SELECT -- null checks 3 | (SELECT count(1) FROM item WHERE id IS NULL) = 0 AS id_null_less, 4 | (SELECT count(1) FROM item WHERE name IS NULL) = 0 AS name_null_less, 5 | (SELECT count(1) FROM item WHERE price IS NULL) = 0 AS age_null_less, 6 | (SELECT count(1) FROM item WHERE for_adults IS NULL) = 0 AS for_adults_null_less, 7 | -- positive price checks 8 | (SELECT count(1) FROM (SELECT CAST(price AS INTEGER) FROM item) WHERE price < 0) = 0 AS has_positive_prices -------------------------------------------------------------------------------- /src/main/scala/spark_etl/parquet/ParquetLoadWriter.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.parquet 2 | 3 | import org.apache.spark.sql.DataFrame 4 | import spark_etl.model.Load 5 | import spark_etl.util.Validation 6 | import spark_etl.util.Validation._ 7 | import spark_etl.{ConfigError, LoadWriter} 8 | 9 | class ParquetLoadWriter(params: Map[String, String]) extends LoadWriter(params) { 10 | override def write(loadsAndDfs: Seq[(Load, DataFrame)]): Unit = { 11 | loadsAndDfs.foreach { 12 | case (Load(_, _, uri, Some(partitionBy)), df) => df.write.partitionBy(partitionBy:_*).parquet(uri) 13 | case (Load(_, _, uri, None), df) => df.write.parquet(uri) 14 | } 15 | } 16 | 17 | // nothing to validate 18 | override def checkLocal(loads: Seq[Load]): Validation[ConfigError, Unit] = 19 | ().success[ConfigError] 20 | 21 | override def checkRemote(loads: Seq[Load]): Validation[ConfigError, Unit] = 22 | ().success[ConfigError] 23 | } 24 | 
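// Usage sketch (illustrative, not part of the original source): one parquet output is written per
// Load entry, partitioned when partition_by is set. Assumes an active local SparkSession named `spark`;
// names and paths are placeholders.
//
//   import spark_etl.model.Load
//   import spark.implicits._
//
//   val df     = Seq(("a", 1), ("b", 2)).toDF("s", "i")
//   val writer = new ParquetLoadWriter(Map.empty)
//   // with partition_by = Some(List("s")), rows land under /tmp/out/x/s=a and /tmp/out/x/s=b
//   writer.write(Seq((Load("x_out", "some_transform", "/tmp/out/x", Some(List("s"))), df)))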
-------------------------------------------------------------------------------- /src/test/resources/main-utils/spark/transform/client_spending.sql: -------------------------------------------------------------------------------- 1 | -- transform - client_spending 2 | SELECT c.id, 3 | c.name, 4 | totals.total_spending 5 | FROM client c 6 | INNER JOIN 7 | (SELECT id, 8 | SUM(spent) AS total_spending 9 | FROM (SELECT typed_client.id, 10 | (typed_item.price * typed_transaction.quantity) AS spent 11 | FROM (SELECT id, name FROM client) AS typed_client 12 | LEFT OUTER JOIN 13 | (SELECT client_id, item_id, CAST(quantity AS INTEGER) FROM transaction) AS typed_transaction 14 | ON typed_transaction.client_id = typed_client.id 15 | LEFT OUTER JOIN 16 | (SELECT id, CAST(price AS INTEGER) FROM item) AS typed_item 17 | ON typed_transaction.item_id = typed_item.id 18 | ) GROUP BY id 19 | ) AS totals 20 | ON c.id = totals.id 21 | -------------------------------------------------------------------------------- /src/test/resources/runtime-ctx/spark/transform/client_spending.sql: -------------------------------------------------------------------------------- 1 | -- transform - client_spending 2 | SELECT c.id, 3 | c.name, 4 | totals.total_spending 5 | FROM client c 6 | INNER JOIN 7 | (SELECT id, 8 | SUM(spent) AS total_spending 9 | FROM (SELECT typed_client.id, 10 | (typed_item.price * typed_transaction.quantity) AS spent 11 | FROM (SELECT id, name FROM client) AS typed_client 12 | LEFT OUTER JOIN 13 | (SELECT client_id, item_id, CAST(quantity AS INTEGER) FROM transaction) AS typed_transaction 14 | ON typed_transaction.client_id = typed_client.id 15 | LEFT OUTER JOIN 16 | (SELECT id, CAST(price AS INTEGER) FROM item) AS typed_item 17 | ON typed_transaction.item_id = typed_item.id 18 | ) GROUP BY id 19 | ) AS totals 20 | ON c.id = totals.id 21 | -------------------------------------------------------------------------------- /src/test/resources/main-utils/spark/transform/item_purchase.sql: -------------------------------------------------------------------------------- 1 | -- transform - item_purchase 2 | SELECT i.id, 3 | i.name, 4 | totals.total_purchase 5 | FROM item i 6 | INNER JOIN 7 | (SELECT id, 8 | SUM(purchase) AS total_purchase 9 | FROM (SELECT typed_client.id, 10 | (typed_item.price * typed_transaction.quantity) AS purchase 11 | FROM (SELECT id, name, CAST(age AS INTEGER) FROM client) AS typed_client 12 | ${join_type} 13 | (SELECT client_id, item_id, CAST(quantity AS INTEGER) FROM transaction) AS typed_transaction 14 | ON typed_transaction.client_id = typed_client.id 15 | ${join_type} 16 | (SELECT id, CAST(price AS INTEGER) FROM item) AS typed_item 17 | ON typed_transaction.item_id = typed_item.id 18 | ) GROUP BY id 19 | ) AS totals 20 | ON i.id = totals.id 21 | -------------------------------------------------------------------------------- /src/test/resources/main-utils/spark/transform/minor_purchase.sql: -------------------------------------------------------------------------------- 1 | -- transform - minor_purchase 2 | SELECT i.id, 3 | i.name, 4 | totals.sold_to_minors 5 | FROM item i 6 | INNER JOIN 7 | (SELECT id, 8 | SUM(sold_to_minors) AS sold_to_minors 9 | FROM (SELECT typed_client.id, 10 | positive_transaction.sold_to_minors 11 | FROM (SELECT id, CAST(age AS INTEGER) FROM client) AS typed_client 12 | INNER JOIN 13 | (SELECT client_id, item_id, quantity AS sold_to_minors FROM (SELECT client_id, item_id, CAST(quantity AS INTEGER) FROM transaction) WHERE quantity > 0) AS positive_transaction 
14 | ON positive_transaction.client_id = typed_client.id 15 | INNER JOIN 16 | item 17 | ON positive_transaction.item_id = item.id 18 | ) GROUP BY id 19 | ) AS totals 20 | ON i.id = totals.id 21 | -------------------------------------------------------------------------------- /src/test/resources/runtime-ctx/spark/transform/minor_purchase.sql: -------------------------------------------------------------------------------- 1 | -- transform - minor_purchase 2 | SELECT i.id, 3 | i.name, 4 | totals.sold_to_minors 5 | FROM item i 6 | INNER JOIN 7 | (SELECT id, 8 | SUM(sold_to_minors) AS sold_to_minors 9 | FROM (SELECT typed_client.id, 10 | positive_transaction.sold_to_minors 11 | FROM (SELECT id, CAST(age AS INTEGER) FROM client) AS typed_client 12 | INNER JOIN 13 | (SELECT client_id, item_id, quantity AS sold_to_minors FROM (SELECT client_id, item_id, CAST(quantity AS INTEGER) FROM transaction) WHERE quantity > 0) AS positive_transaction 14 | ON positive_transaction.client_id = typed_client.id 15 | INNER JOIN 16 | item 17 | ON positive_transaction.item_id = item.id 18 | ) GROUP BY id 19 | ) AS totals 20 | ON i.id = totals.id 21 | -------------------------------------------------------------------------------- /src/test/resources/runtime-ctx/spark/transform/item_purchase.sql: -------------------------------------------------------------------------------- 1 | -- transform - item_purchase 2 | SELECT i.id, 3 | i.name, 4 | totals.total_purchase 5 | FROM item i 6 | INNER JOIN 7 | (SELECT id, 8 | SUM(purchase) AS total_purchase 9 | FROM (SELECT typed_client.id, 10 | (typed_item.price * typed_transaction.quantity) AS purchase 11 | FROM (SELECT id, name, CAST(age AS INTEGER) FROM client) AS typed_client 12 | LEFT OUTER JOIN 13 | (SELECT client_id, item_id, CAST(quantity AS INTEGER) FROM transaction) AS typed_transaction 14 | ON typed_transaction.client_id = typed_client.id 15 | LEFT OUTER JOIN 16 | (SELECT id, CAST(price AS INTEGER) FROM item) AS typed_item 17 | ON typed_transaction.item_id = typed_item.id 18 | ) GROUP BY id 19 | ) AS totals 20 | ON i.id = totals.id 21 | -------------------------------------------------------------------------------- /src/test/scala/spark_etl/util/DeaultEnvSpec.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.util 2 | 3 | import org.joda.time.DateTime 4 | import org.scalatest.{FlatSpec, Inside, Matchers} 5 | 6 | class DeaultEnvSpec extends FlatSpec with Matchers with Inside { 7 | "DefaultEnv" should "obtain dates for start of epoch" in { 8 | DefaultEnv.getAll(new DateTime(0)).toList should contain allElementsOf Seq( 9 | // t-1d 10 | "yyyy-MM-1d" -> "1969-12", 11 | "yyyy-MM-dd-1d" -> "1969-12-31", 12 | "sod-1d" -> "1969-12-31 00:00:00", 13 | "eod-1d" -> "1969-12-31 23:59:59", 14 | "y-1d" -> "1969", 15 | "m-1d" -> "12", 16 | "d-1d" -> "31", 17 | // utc-1d 18 | "utc-yyyy-MM-1d" -> "1969-12", 19 | "utc-yyyy-MM-dd-1d" -> "1969-12-31", 20 | "utc-sod-1d" -> "1969-12-31 00:00:00", 21 | "utc-eod-1d" -> "1969-12-31 23:59:59", 22 | "utc-y-1d" -> "1969", 23 | "utc-m-1d" -> "12", 24 | "utc-d-1d" -> "31" 25 | ) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/test/scala/spark_etl/util/SparkParserSpec.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.util 2 | 3 | import spark_etl.util.SparkParser.QfDep 4 | import org.scalatest._ 5 | 6 | class SparkParserSpec extends FlatSpec with Matchers { 7 | val complexSql 
= 8 | """ 9 | |-- select all from client, transaction, item 10 | |SELECT * 11 | | FROM namespace1.client c, 12 | | namespace2.transaction t, 13 | | item i 14 | | WHERE c.id = t.c_id AND t.i_id = i.id""".stripMargin 15 | 16 | "SparkParser" should "fetch deps" in { 17 | SparkParser.getDeps(complexSql) should contain allOf( 18 | QfDep("client", Some("namespace1")), 19 | QfDep("transaction", Some("namespace2")), 20 | QfDep("item") 21 | ) 22 | } 23 | 24 | it should "strip db prefixes" in { 25 | SparkParser.stripDbs(complexSql) shouldBe 26 | """ 27 | |-- select all from client, transaction, item 28 | |SELECT * 29 | | FROM client c, 30 | | transaction t, 31 | | item i 32 | | WHERE c.id = t.c_id AND t.i_id = i.id""".stripMargin 33 | } 34 | } -------------------------------------------------------------------------------- /src/main/scala/spark_etl/parser/Parser.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.parser 2 | 3 | import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation 4 | import org.apache.spark.sql.catalyst.parser.CatalystSqlParser 5 | import org.apache.spark.sql.catalyst.plans.logical.{BinaryNode, LogicalPlan, UnaryNode, Union} 6 | 7 | object Parser { 8 | def getDsos(sql: String): List[String] = { 9 | def toStr(r: UnresolvedRelation) = 10 | r.tableIdentifier.database 11 | .map(db => s"$db.${r.tableIdentifier.table}") 12 | .getOrElse(r.tableIdentifier.table) 13 | 14 | def getDsoNames(plan: LogicalPlan, soFar: List[String] = List.empty): List[String] = { 15 | plan match { 16 | case un: UnaryNode => getDsoNames(un.child, soFar) 17 | case bn: BinaryNode => getDsoNames(bn.right, getDsoNames(bn.left, soFar)) 18 | case ur: UnresolvedRelation => toStr(ur) :: soFar 19 | case u: Union => u.children.foldLeft(soFar) { case (soFar2, c) => getDsoNames(c, soFar2)} 20 | case _ => soFar 21 | } 22 | } 23 | val plan = CatalystSqlParser.parsePlan(sql) 24 | getDsoNames(plan) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/parquet/ParquetExtractReader.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.parquet 2 | 3 | import org.apache.spark.sql.{DataFrame, SparkSession} 4 | import spark_etl.model.Extract 5 | import spark_etl.util.Validation 6 | import spark_etl.util.Validation._ 7 | import spark_etl.{ConfigError, ExtractReader} 8 | 9 | import scala.util.Try 10 | 11 | class ParquetExtractReader(params: Map[String, Any]) extends ExtractReader(params) { 12 | val checkChildren = Try(params("check-children").asInstanceOf[Boolean]).getOrElse(false) 13 | val expectPartition = Try(params("expect-partition").asInstanceOf[Boolean]).getOrElse(false) 14 | 15 | // nothing to validate 16 | override def checkLocal(extracts: Seq[Extract]): Validation[ConfigError, Unit] = 17 | ().success[ConfigError] 18 | 19 | override def checkRemote(extracts: Seq[Extract]): Validation[ConfigError, Unit] = { 20 | val parquetUris = extracts.map(_.uri) 21 | PathValidator.validate(checkChildren, expectPartition, parquetUris: _*).map(_ => ()) 22 | } 23 | 24 | override def read(extracts: Seq[Extract])(implicit spark: SparkSession): Seq[(Extract, DataFrame)] = { 25 | extracts.map { 26 | e => 27 | val df = spark.read.parquet(e.uri) 28 | (e, df) 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/test/resources/main-utils/config/app.yaml: 
-------------------------------------------------------------------------------- 1 | extracts: 2 | - name: client 3 | uri: "data/dev/client_2017" 4 | check: "file:../${engine}/extract-check/client.sql" 5 | - name: item 6 | uri: "data/dev/item_2017" 7 | check: "file:../${engine}/extract-check/item.sql" 8 | - name: transaction 9 | uri: "data/dev/transaction_2017" 10 | check: "file:../${engine}/extract-check/transaction.sql" 11 | 12 | transforms: 13 | - name: client_spending 14 | check: "file:../${engine}/transform-check/client_spending.sql" 15 | sql: "file:../${engine}/transform/client_spending.sql" 16 | - name: item_purchase 17 | check: "file:../${engine}/transform-check/item_purchase.sql" 18 | sql: "file:../${engine}/transform/item_purchase.sql" 19 | - name: minor_purchase 20 | check: "file:../${engine}/transform-check/minor_purchase.sql" 21 | sql: "file:../${engine}/transform/minor_purchase.sql" 22 | 23 | loads: 24 | - name: client_spending_out 25 | source: client_spending 26 | uri: "/tmp/out/client_spending" 27 | - name: item_purchase_out 28 | source: item_purchase 29 | uri: "/tmp/out/item_purchase" 30 | - name: minor_purchase_out 31 | source: minor_purchase 32 | uri: "/tmp/out/minor_purchase" 33 | -------------------------------------------------------------------------------- /src/test/scala/spark_etl/parquet/PathValidatorSpec.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.parquet 2 | 3 | import java.io.File 4 | 5 | import org.scalatest.{FlatSpec, Inside, Matchers} 6 | import spark_etl.util._ 7 | 8 | class PathValidatorSpec extends FlatSpec with Matchers with Inside { 9 | val root = Files.rootResource 10 | 11 | // create an empty dir, without a placeholder 12 | val emptyDir = new File(s"$root/parquet/empty") 13 | if (! emptyDir.exists) 14 | emptyDir.mkdir() 15 | 16 | "PathValidator" should "validate local `good` path" in { 17 | PathValidator.validate( 18 | true, 19 | true, 20 | s"$root/parquet/good" 21 | ) shouldBe Success(List(s"$root/parquet/good")) 22 | } 23 | 24 | it should "validate local `bad` path" in { 25 | val res = PathValidator.validate( 26 | true, 27 | true, 28 | s"$root/parquet/empty", 29 | s"$root/parquet/with_backup_dir", 30 | s"$root/parquet/__bogus_dir__" 31 | ) 32 | inside(res) { 33 | case Failure(errs) => 34 | val errMsgs = errs.toList.map(_.msg).sorted 35 | errMsgs.length shouldBe 3 36 | errMsgs(0) should startWith("Local path doesn't exist") 37 | errMsgs(1) should startWith("Local path is empty for") 38 | errMsgs(2) should startWith("Unexpected local children for") 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/util/DefaultEnv.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.util 2 | 3 | import org.joda.time.{DateTime, DateTimeZone} 4 | import org.joda.time.format.DateTimeFormat 5 | 6 | /** 7 | * Default env vars, for token substitution in queries/paths. 
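 * For example, getAll(new DateTime(0)) yields entries such as "yyyy-MM-dd-1d" -> "1969-12-31" and
 * "eod-1d" -> "1969-12-31 23:59:59", plus the same keys with a "utc-" prefix.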
8 | */ 9 | object DefaultEnv { 10 | private val `yyyy-MM` = DateTimeFormat.forPattern("yyyy-MM") 11 | private val `yyyy-MM-dd` = DateTimeFormat.forPattern("yyyy-MM-dd") 12 | private val `yyyy-MM-dd HH:mm:ss` = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss") 13 | 14 | def getAll(date: DateTime, prefix: String = ""): Map[String, String] = { 15 | val `t-1` = date.minusDays(1) 16 | val `utc-1` = date.withZone(DateTimeZone.UTC).minusDays(1) 17 | get(`t-1`, prefix, "-1d") ++ 18 | get(`utc-1`, prefix + "utc-", "-1d") 19 | 20 | } 21 | 22 | def get(date: DateTime, prefix: String = "", suffix: String = ""): Map[String, String] = { 23 | Map( 24 | "yyyy-MM" -> `yyyy-MM`.print(date), 25 | "yyyy-MM-dd" -> `yyyy-MM-dd`.print(date), 26 | "sod" -> `yyyy-MM-dd HH:mm:ss`.print(date.withHourOfDay(0).withMinuteOfHour(0).withSecondOfMinute(0).withMillisOfSecond(0)), 27 | "eod" -> `yyyy-MM-dd HH:mm:ss`.print(date.withHourOfDay(23).withMinuteOfHour(59).withSecondOfMinute(59).withMillisOfSecond(999)), 28 | "y" -> date.getYear.toString, 29 | "m" -> date.getMonthOfYear.toString, 30 | "d" -> date.getDayOfMonth.toString 31 | ).map { case (k,v) => s"$prefix$k$suffix" -> v } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/util/BAHelper.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.util 2 | 3 | import java.io.{File, PrintWriter} 4 | 5 | import scala.io.Source 6 | 7 | object BAHelper { 8 | def copySqls(sourceDir: File, targetDir: File, rmTargetDir: Boolean): Seq[(String, String)] = { 9 | if (rmTargetDir) 10 | rmdir(targetDir) 11 | val sqlFiles = descendants(sourceDir).filter(_.getName.toLowerCase.endsWith(".sql")) 12 | sqlFiles.map { 13 | f => 14 | val fPerms = java.nio.file.Files.getPosixFilePermissions(f.toPath) 15 | val contents = Source.fromFile(f).mkString 16 | val target = new File(targetDir, f.getAbsolutePath.replace(sourceDir.getAbsolutePath, "")) 17 | val targetParent = target.getParentFile 18 | targetParent.mkdirs() 19 | new PrintWriter(target) { 20 | val stripped = SparkParser.stripDbs(contents) 21 | write(stripped) 22 | close() 23 | } 24 | java.nio.file.Files.setPosixFilePermissions(target.toPath, fPerms) 25 | (f.getPath, target.getPath) 26 | } 27 | } 28 | 29 | private def descendants(f: File): Seq[File] = { 30 | val children = f.listFiles 31 | if (children == null) 32 | Nil 33 | else 34 | children ++ children.filter(_.isDirectory).flatMap(descendants) 35 | } 36 | 37 | private def rmdir(file: File): Unit = { 38 | if (file.isDirectory) 39 | file.listFiles.foreach(rmdir) 40 | if (file.exists && !file.delete) 41 | throw new Exception(s"Unable to delete ${file.getAbsolutePath}") 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/util/SparkParser.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.util 2 | 3 | import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation 4 | import org.apache.spark.sql.catalyst.parser.CatalystSqlParser 5 | import org.apache.spark.sql.catalyst.plans.logical.{BinaryNode, LogicalPlan, UnaryNode, Union} 6 | 7 | object SparkParser { 8 | /** 9 | * Get a list of qualified dependencies. 
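 * For example, "SELECT * FROM namespace1.client c, item i" yields
 * QfDep("client", Some("namespace1")) and QfDep("item").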
10 | */ 11 | def getDeps(sql: String): List[QfDep] = { 12 | def toDep(r: UnresolvedRelation) = 13 | r.tableIdentifier.database 14 | .map(db => QfDep(r.tableIdentifier.table, Some(db))) 15 | .getOrElse(QfDep(r.tableIdentifier.table)) 16 | 17 | def getDepNames(plan: LogicalPlan, soFar: List[QfDep] = List.empty): List[QfDep] = { 18 | plan match { 19 | case un: UnaryNode => getDepNames(un.child, soFar) 20 | case bn: BinaryNode => getDepNames(bn.right, getDepNames(bn.left, soFar)) 21 | case ur: UnresolvedRelation => toDep(ur) :: soFar 22 | case u: Union => u.children.foldLeft(soFar) { case (soFar2, c) => getDepNames(c, soFar2) } 23 | case _ => soFar 24 | } 25 | } 26 | val plan = CatalystSqlParser.parsePlan(sql) 27 | getDepNames(plan).distinct 28 | } 29 | 30 | /** 31 | * Strip db prefixes. 32 | */ 33 | def stripDbs(sql: String): String = 34 | getDeps(sql).foldLeft(sql) { 35 | case (soFar, dep) => 36 | soFar.replace(dep.qfStr, dep.dep) 37 | } 38 | 39 | case class QfDep(dep: String, prefix: Option[String] = None) { 40 | def qfStr: String = prefix.map(dbId => s"$dbId.$dep").getOrElse(dep) 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/test/scala/spark_etl/util/DepTreeSpec.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.util 2 | 3 | import org.scalatest.{FlatSpec, Inside, Matchers} 4 | 5 | class DepTreeSpec extends FlatSpec with Matchers with Inside { 6 | val initVertices = Seq( 7 | Vertex("l1", L), 8 | Vertex("l2", L), 9 | Vertex("t1", T), 10 | Vertex("t2", T), 11 | Vertex("e1", E), 12 | Vertex("e2", E), 13 | Vertex("e3", E) 14 | ) 15 | 16 | "DepTree" should "validate simple tree with no dangling" in { 17 | val tree = new DepTree(initVertices) 18 | 19 | // add actual deps 20 | tree.addEdge("e1", Vertex("t1", T)) 21 | tree.addEdge("e2", Vertex("t1", T)) 22 | tree.addEdge("t1", Vertex("t2", T)) 23 | tree.addEdge("e3", Vertex("t2", T)) 24 | tree.addEdge("t1", Vertex("l1", L), true) 25 | tree.addEdge("t2", Vertex("l2", L), true) 26 | 27 | // validate 28 | tree.dangling shouldBe Nil 29 | tree.forType(L) shouldBe Seq( 30 | Vertex("l1", L), 31 | Vertex("l2", L) 32 | ) 33 | tree.forType(T) shouldBe Seq( 34 | Vertex("t1", T), 35 | Vertex("t2", T) 36 | ) 37 | } 38 | 39 | it should "find dangling" in { 40 | val tree = new DepTree(initVertices) 41 | 42 | // add actual deps 43 | tree.addEdge("e1", Vertex("t1", T)) 44 | tree.addEdge("__bogus_e__", Vertex("t1", T)) 45 | tree.addEdge("t1", Vertex("l1", L), true) 46 | 47 | // validate 48 | tree.dangling shouldBe Seq(Edge(Vertex("__bogus_e__", Dangling), Vertex("t1", T), false)) 49 | 50 | tree.rootless shouldBe Seq(Vertex("t2", T), Vertex("e2", E), Vertex("e3", E), Vertex("__bogus_e__", Dangling)) 51 | 52 | tree.forType(T) shouldBe Seq(Vertex("t1", T)) 53 | tree.forType(E) shouldBe Seq(Vertex("e1", E)) 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/test/scala/spark_etl/CLIOpsSpec.scala: -------------------------------------------------------------------------------- 1 | package spark_etl 2 | 3 | import org.scalatest.{FlatSpec, Inside, Matchers} 4 | import spark_etl.util._ 5 | 6 | class CLIOpsSpec extends FlatSpec with Matchers with Inside { 7 | val root = Files.rootResource 8 | 9 | "CLI" should "validate-local complex file specs" in { 10 | CLI.main(Array("-Denv.engine=spark", "-Denv.length_fun=length", "-Denv.count_fun=count", "-Denv.join_type=LEFT OUTER JOIN", 
s"--conf-uri=file:$root/main-utils/config/app.yaml", "validate-local")) 11 | } 12 | 13 | "CLIOps" should "validate-local complex file specs" in { 14 | val envVars = Map("engine" -> "spark", "length_fun" -> "length", "count_fun" -> "count", "join_type" -> "LEFT OUTER JOIN") 15 | CLIOps.validateLocal("file:main-utils/config/app.yaml", root, envVars) shouldBe Success(()) 16 | } 17 | 18 | it should "fail on missing app.yaml env vars" in { 19 | val envVars = Map("length_fun" -> "length", "count_fun" -> "count", "join_type" -> "LEFT OUTER JOIN") 20 | inside(CLIOps.validateLocal("file:main-utils/config/app.yaml", root, envVars)) { 21 | case Failure(errs) => 22 | errs.length shouldBe 1 23 | errs.head.msg shouldBe "Unresolved env vars in file:main-utils/config/app.yaml: ${engine}" 24 | } 25 | } 26 | 27 | it should "fail on missing SQL env vars" in { 28 | val envVars = Map("engine" -> "spark") 29 | inside(CLIOps.validateLocal("file:main-utils/config/app.yaml", root, envVars)) { 30 | case Failure(errs) => 31 | errs.toList.map(_.msg).sorted shouldBe List( 32 | "Unresolved env vars in file:../spark/extract-check/client.sql: ${count_fun}", 33 | "Unresolved env vars in file:../spark/transform-check/item_purchase.sql: ${length_fun}", 34 | "Unresolved env vars in file:../spark/transform/item_purchase.sql: ${join_type}" 35 | ) 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/model/ParametrizedConstructor.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.model 2 | 3 | import net.jcazevedo.moultingyaml._ 4 | 5 | case class ParametrizedConstructor(`class`: String, params: Option[Map[String, Any]] = Some(Map.empty)) 6 | 7 | object ParametrizedConstructor extends DefaultYamlProtocol { 8 | implicit val mapFormat: YamlFormat[Map[String, Any]] = new YamlFormat[Map[String, Any]] { 9 | override def read(v: YamlValue): Map[String, Any] = v match { 10 | case o: YamlObject => readValue(o).asInstanceOf[Map[String, Any]] 11 | case other => deserializationError(s"Map like object expected, got $other") 12 | } 13 | 14 | def readValue: (YamlValue) => Any = { 15 | case x: YamlBoolean => x.boolean 16 | case x: YamlDate => x.date.toDate 17 | case x: YamlNumber => x.value.toInt 18 | case x: YamlString => x.value 19 | case x: YamlObject => x.fields.map { case (k, v) => asStr(k) -> readValue(v)} 20 | case x: YamlArray => x.elements.map(readValue) 21 | case x: YamlSet => x.set.map(readValue) 22 | case YamlNull => null 23 | case YamlNaN => Double.NaN 24 | case YamlNegativeInf => Double.NegativeInfinity 25 | case YamlPositiveInf => Double.PositiveInfinity 26 | } 27 | 28 | def asStr: (YamlValue) => String = { 29 | case x: YamlBoolean => x.boolean.toString 30 | case x: YamlDate => x.date.toString 31 | case x: YamlNumber => x.value.toString 32 | case x: YamlString => x.value 33 | case YamlNull => null 34 | case YamlNaN => "nan" 35 | case YamlNegativeInf => "-∞" 36 | case YamlPositiveInf => "∞" 37 | case other => deserializationError(s"Failed to stringify map key: $other") 38 | } 39 | 40 | override def write(obj: Map[String, Any]): YamlValue = ??? 
41 | } 42 | 43 | implicit val yamlFormat = yamlFormat2(ParametrizedConstructor.apply) 44 | } 45 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/model/Config.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.model 2 | 3 | import net.jcazevedo.moultingyaml._ 4 | import spark_etl.ConfigError 5 | import spark_etl.parquet.{ParquetExtractReader, ParquetLoadWriter} 6 | import spark_etl.util.Validation._ 7 | import spark_etl.util.{UriLoader, Validation} 8 | 9 | import scala.util.{Failure, Success, Try} 10 | 11 | case class Config( 12 | extracts: List[Extract], 13 | transforms: List[Transform], 14 | loads: List[Load], 15 | extract_reader: Option[ParametrizedConstructor] = Some(ParametrizedConstructor(classOf[ParquetExtractReader].getName, Some(Map.empty))), 16 | load_writer: Option[ParametrizedConstructor] = Some(ParametrizedConstructor(classOf[ParquetLoadWriter].getName, Some(Map.empty)))) 17 | 18 | object Config extends DefaultYamlProtocol { 19 | implicit val yamlFormat = yamlFormat5(Config.apply) 20 | 21 | /** 22 | * Load Config from resource/file Uri 23 | */ 24 | def load(resourceUri: String, filePathRoot: String, env: Map[String, String]): Validation[ConfigError, Config] = 25 | UriLoader.load(resourceUri, filePathRoot, env).flatMap(parse(_, env)) 26 | 27 | def parse(configStr: String, env: Map[String, String] = Map.empty): Validation[ConfigError, Config] = 28 | Try(configStr.parseYaml.convertTo[Config]) match { 29 | case Success(conf) => 30 | // yaml parser does not populate with defaults - force them 31 | val defaultExtractReader = Config(Nil, Nil, Nil).extract_reader 32 | val defaultLoadWriter = Config(Nil, Nil, Nil).load_writer 33 | val conf2 = conf.copy( 34 | extract_reader = conf.extract_reader.orElse(defaultExtractReader), 35 | load_writer = conf.load_writer.orElse(defaultLoadWriter) 36 | ) 37 | conf2.success[ConfigError] 38 | case Failure(e: DeserializationException) => 39 | ConfigError(s"Failed to deserialize config body, exception: ${e.getMessage}").failure[Config] 40 | case Failure(e) => 41 | ConfigError(s"Failed to parse config body", Some(e)).failure[Config] 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/model/Persist.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.model 2 | 3 | import net.jcazevedo.moultingyaml._ 4 | import org.apache.spark.storage.StorageLevel 5 | 6 | sealed trait Persist { def asSpark: StorageLevel } 7 | 8 | object Persist { 9 | object NONE extends Persist { def asSpark = StorageLevel.NONE } 10 | object DISK_ONLY extends Persist { def asSpark = StorageLevel.DISK_ONLY } 11 | object DISK_ONLY_2 extends Persist { def asSpark = StorageLevel.DISK_ONLY_2 } 12 | object MEMORY_ONLY extends Persist { def asSpark = StorageLevel.MEMORY_ONLY } 13 | object MEMORY_ONLY_2 extends Persist { def asSpark = StorageLevel.MEMORY_ONLY_2 } 14 | object MEMORY_ONLY_SER extends Persist { def asSpark = StorageLevel.MEMORY_ONLY_SER } 15 | object MEMORY_ONLY_SER_2 extends Persist { def asSpark = StorageLevel.MEMORY_ONLY_SER_2 } 16 | object MEMORY_AND_DISK extends Persist { def asSpark = StorageLevel.MEMORY_AND_DISK } 17 | object MEMORY_AND_DISK_2 extends Persist { def asSpark = StorageLevel.MEMORY_AND_DISK_2 } 18 | object MEMORY_AND_DISK_SER extends Persist { def asSpark = StorageLevel.MEMORY_AND_DISK_SER } 19 | object 
MEMORY_AND_DISK_SER_2 extends Persist { def asSpark = StorageLevel.MEMORY_AND_DISK_SER_2 } 20 | object OFF_HEAP extends Persist { def asSpark = StorageLevel.OFF_HEAP } 21 | 22 | implicit val typeFormat = new YamlFormat[Persist] { 23 | def read(value: YamlValue): Persist = value match { 24 | case YamlString(x) => 25 | x.toUpperCase match { 26 | case "NONE" => NONE 27 | case "DISK_ONLY" => DISK_ONLY 28 | case "DISK_ONLY_2" => DISK_ONLY_2 29 | case "MEMORY_ONLY" => MEMORY_ONLY 30 | case "MEMORY_ONLY_2" => MEMORY_ONLY_2 31 | case "MEMORY_ONLY_SER" => MEMORY_ONLY_SER 32 | case "MEMORY_ONLY_SER_2" => MEMORY_ONLY_SER_2 33 | case "MEMORY_AND_DISK" => MEMORY_AND_DISK 34 | case "MEMORY_AND_DISK_2" => MEMORY_AND_DISK_2 35 | case "MEMORY_AND_DISK_SER" => MEMORY_AND_DISK_SER 36 | case "MEMORY_AND_DISK_SER_2" => MEMORY_AND_DISK_SER_2 37 | case "OFF_HEAP" => OFF_HEAP 38 | } 39 | case _ => deserializationError("Invalid Persist mode, see Spark StorageLevel options") 40 | } 41 | def write(g: Persist) = ??? 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/test/scala/spark_etl/model/ConfigSpec.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.model 2 | 3 | import org.scalatest.{FlatSpec, Inside, Matchers} 4 | 5 | import spark_etl.util._ 6 | 7 | class ConfigSpec extends FlatSpec with Matchers with Inside { 8 | "Config" should "fail to parse" in { 9 | val bogusConfig = "NOT A CONFIG" 10 | inside(Config.parse(bogusConfig)) { 11 | case Failure(Seq(err)) => 12 | err.msg should startWith("Failed to deserialize") 13 | } 14 | } 15 | 16 | it should "read simple config" in { 17 | val simpleConfig = 18 | s"""extracts: 19 | | - name: e1 20 | | uri: e1_uri 21 | | cache: true 22 | | persist: MEMORY_ONLY 23 | | 24 | |transforms: 25 | | - name: t1 26 | | cache: false 27 | | sql: t1_uri 28 | | persist: DISK_ONLY 29 | | 30 | |loads: 31 | | - name: l1 32 | | source: t1 33 | | uri: l1_uri 34 | """.stripMargin 35 | Config.parse(simpleConfig) shouldBe Success(Config( 36 | List(Extract("e1", "e1_uri", Some(true), Some(Persist.MEMORY_ONLY))), 37 | List(Transform("t1", "t1_uri", Some(false), Some(Persist.DISK_ONLY))), 38 | List(Load("l1", "t1", "l1_uri")) 39 | )) 40 | } 41 | 42 | it should "read reader/writer constructors" in { 43 | val simpleConfig = 44 | s"""extracts: 45 | | - name: e1 46 | | uri: e1_uri 47 | | 48 | |transforms: 49 | | - name: t1 50 | | sql: t1_uri 51 | | 52 | |loads: 53 | | - name: l1 54 | | source: t1 55 | | uri: l1_uri 56 | | 57 | |extract_reader: 58 | | class: DummyExtractReader 59 | | params: 60 | | x: 11 61 | | y: aa 62 | | 63 | |load_writer: 64 | | class: DummyLoadWriter 65 | | params: 66 | | b: false 67 | | a: [1, xxx] 68 | """.stripMargin 69 | Config.parse(simpleConfig) shouldBe Success(Config( 70 | List(Extract("e1", "e1_uri")), 71 | List(Transform("t1", "t1_uri")), 72 | List(Load("l1", "t1", "l1_uri")), 73 | Some(ParametrizedConstructor("DummyExtractReader", Some(Map("x" -> 11d, "y" -> "aa")))), 74 | Some(ParametrizedConstructor("DummyLoadWriter", Some(Map("b" -> false, "a" -> List(1d, "xxx"))))) 75 | )) 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/oracle/OracleValidator.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.oracle 2 | 3 | import java.sql.Connection 4 | 5 | import org.apache.log4j.Logger 6 | import spark_etl.ConfigError 7 | 8 | import 
scala.util.Try 9 | 10 | import spark_etl.util.Validation 11 | import spark_etl.util.Validation._ 12 | 13 | /** 14 | * Oracle Validator, checks connectivity and existance of tables. 15 | */ 16 | // FIXME: needs tests!!! 17 | object OracleValidator { 18 | private val log = Logger.getLogger(getClass) 19 | 20 | def validateOracle(connStr: String, user: String, pwd: String, requiredTables: Seq[String]): Validation[ConfigError, Unit] = 21 | for { 22 | // open connection 23 | conn <- Try { 24 | java.sql.DriverManager.getConnection(connStr, user, pwd) 25 | } match { 26 | case scala.util.Success(conn) => conn.success[ConfigError] 27 | case scala.util.Failure(e) => ConfigError(s"Failed to access Oracle @ $connStr, as user $user", Some(e)).failure[Connection] 28 | } 29 | // check all tables 30 | _ <- { 31 | val tableVs = requiredTables.map { 32 | tableName => 33 | Try { 34 | val ps = conn.prepareStatement("SELECT COUNT(1) FROM USER_TABLES WHERE LOWER(TABLE_NAME) = ?") 35 | ps.setString(1, tableName.toLowerCase) 36 | val rs = ps.executeQuery() 37 | rs.next() 38 | val tableCount = rs.getInt(1) 39 | if (tableCount == 0) 40 | throw new Exception(s"Failed to find table $tableName") 41 | else 42 | log.info(s"Validated Oracle table: $tableName") 43 | } match { 44 | case scala.util.Success(_) => ().success[ConfigError] 45 | case scala.util.Failure(e) => ConfigError(s"Failed to access Oracle table: $tableName", Some(e)).failure[Unit] 46 | } 47 | } 48 | // validate all tables (allowing for empty set) 49 | tableVs.foldLeft(().success[ConfigError]) { 50 | case (v1, v2) => v1 +++ v2 51 | } 52 | } 53 | // close the connection 54 | _ <- Try(conn.close()) match { 55 | case scala.util.Success(_) => 56 | log.info(s"Validated Oracle connection: $connStr as user: $user") 57 | ().success[ConfigError] 58 | case scala.util.Failure(e) => 59 | ConfigError(s"Failed to close the Oracle connection", Some(e)).failure[Unit] 60 | } 61 | } yield () 62 | } 63 | -------------------------------------------------------------------------------- /src/test/scala/spark_etl/parquet/WriteReadRoundtripSpec.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.parquet 2 | 3 | import java.io.File 4 | 5 | import org.apache.commons.io.FileUtils 6 | import org.apache.spark.sql.SparkSession 7 | import org.scalatest.{FlatSpec, Inside, Matchers} 8 | import spark_etl.CLI 9 | import spark_etl.model.{Extract, Load} 10 | import spark_etl.util.Files 11 | 12 | class WriteReadRoundtripSpec extends FlatSpec with Matchers with Inside { 13 | val root = Files.rootResource 14 | 15 | "Reader and Writer" should "roundtrip" in { 16 | testRoundtrip(None) 17 | } 18 | 19 | it should "roundtrip partitioned" in { 20 | testRoundtrip(Some(List("s"))) 21 | } 22 | 23 | "CLI" should "transform" in { 24 | // cleanup dir if exists 25 | FileUtils.deleteDirectory(new File(s"$root/parquet-roundtrip/x")) 26 | 27 | implicit val spark = SparkSession.builder.appName("test") 28 | .master("local[1]") 29 | .config("spark.ui.port", 4046).config("spark.history.ui.port", 18086) 30 | .getOrCreate 31 | try { 32 | import spark.implicits._ 33 | val in = Seq(ParquetTestLoad("a", 1), ParquetTestLoad("b", 2)) 34 | val dfIn = in.toDF() 35 | val writer = new ParquetLoadWriter(Map.empty) 36 | writer.write(Seq( 37 | (Load("x", "x", s"$root/parquet-roundtrip/x", Some(List("s"))), dfIn) 38 | )) 39 | } finally { 40 | spark.stop 41 | } 42 | 43 | CLI.main(Array(s"-Denv.path=$root/parquet-roundtrip", "--conf-uri", s"/parquet-roundtrip/app.yaml", 
"validate-remote")) 44 | } 45 | 46 | private def testRoundtrip(partitionBy: Option[List[String]]) = { 47 | // cleanup dir if exists 48 | FileUtils.deleteDirectory(new File(s"$root/x")) 49 | 50 | // run the test 51 | implicit val spark = SparkSession.builder.appName("test") 52 | .master("local[1]") 53 | .config("spark.ui.port", 4045).config("spark.history.ui.port", 18085) 54 | .getOrCreate 55 | try { 56 | import spark.implicits._ 57 | val in = Seq(ParquetTestLoad("a", 1), ParquetTestLoad("b", 2)) 58 | val dfIn = in.toDF() 59 | val writer = new ParquetLoadWriter(Map.empty) 60 | writer.write(Seq( 61 | (Load("x", "x", s"$root/x", partitionBy), dfIn) 62 | )) 63 | val reader = new ParquetExtractReader(Map.empty) 64 | val (_, dfOut) :: Nil = reader.read(Seq(Extract("x", s"$root/x"))) 65 | 66 | dfOut.as[ParquetTestLoad].collect() should contain allElementsOf(in) 67 | } finally { 68 | spark.stop 69 | } 70 | } 71 | } 72 | 73 | case class ParquetTestLoad(s: String, i: Int) 74 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/oracle/OracleLoadAppender.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.oracle 2 | 3 | import org.apache.log4j.Logger 4 | import org.apache.spark.sql.{DataFrame, SaveMode} 5 | import spark_etl.model.Load 6 | import spark_etl.util.Validation 7 | import spark_etl.util.Validation._ 8 | import spark_etl.{ConfigError, LoadWriter} 9 | 10 | import scala.util.Try 11 | 12 | /** 13 | * Sample Oracle appender. 14 | */ 15 | class OracleLoadAppender(params: Map[String, Any]) extends LoadWriter(params) { 16 | private val log = Logger.getLogger(getClass) 17 | 18 | override def write(loadsAndDfs: Seq[(Load, DataFrame)]): Unit = { 19 | val oracleUri = params("oracle_uri").toString 20 | val username = params("oracle_user").toString 21 | val password = params("oracle_password").toString 22 | val driver = params.get("oracle_driver").map(_.toString).getOrElse("oracle.jdbc.driver.OracleDriver") 23 | val batchSize = Try(params("oracle_batch").asInstanceOf[Int]).getOrElse(1000) 24 | val props = { 25 | val p = new java.util.Properties() 26 | p.setProperty("user", username) 27 | p.setProperty("password", password) 28 | p.setProperty("fetchsize", batchSize.toString) 29 | p.setProperty("batchsize", batchSize.toString) 30 | p.setProperty("isolationLevel", "READ_COMMITTED") 31 | p.setProperty("driver", driver) 32 | p 33 | } 34 | 35 | log.info(s"Load writing to $oracleUri, with jdbc driver: $driver") 36 | loadsAndDfs.foreach { 37 | case (Load(_, _, tableName, _), df) => 38 | df.write.mode(SaveMode.Append).jdbc(oracleUri, tableName, props) 39 | } 40 | } 41 | 42 | override def checkLocal(loads: Seq[Load]): Validation[ConfigError, Unit] = { 43 | merge(toVal[String]("oracle_uri"), 44 | toVal[String]("oracle_user"), 45 | toVal[String]("oracle_password"), 46 | toVal[Int]("oracle_batch")) { (_, _, _, _) => () } 47 | } 48 | 49 | override def checkRemote(loads: Seq[Load]): Validation[ConfigError, Unit] = { 50 | val oracleUri = params("oracle_uri").toString 51 | val username = params("oracle_user").toString 52 | val password = params("oracle_password").toString 53 | val requiredTables = loads.map(_.uri) 54 | OracleValidator.validateOracle(oracleUri, username, password, requiredTables) 55 | } 56 | 57 | private def toVal[T](key: String): Validation[ConfigError, T] = 58 | params.get(key) match { 59 | case Some(v) => 60 | Try(v.asInstanceOf[T]) match { 61 | case scala.util.Success(v2) => 
v2.success[ConfigError] 62 | case scala.util.Failure(_) => ConfigError(s"Invalid type of the env_var: $key").failure[T] 63 | } 64 | case None => 65 | ConfigError(s"Missing ${getClass.getSimpleName} param: $key").failure[T] 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/util/UriLoader.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.util 2 | 3 | import spark_etl.ConfigError 4 | import spark_etl.util.Validation._ 5 | 6 | import scala.io.Source 7 | 8 | object UriLoader { 9 | private val fileProtocol = "file:" 10 | private val resourceProtocol = "resource:" 11 | 12 | def load(uri: String, filePathRoot: String, env: Map[String, String]): Validation[ConfigError, String] = 13 | for { 14 | contents <- 15 | if (uri.startsWith(fileProtocol)) { 16 | val filePath = uri.substring(fileProtocol.length) 17 | val fqFilePath = if (filePath.startsWith("/")) 18 | filePath 19 | else if (filePathRoot.endsWith("/")) 20 | s"$filePathRoot$filePath" 21 | else 22 | s"$filePathRoot/$filePath" 23 | loadFile(fqFilePath, env) 24 | } else if (uri.startsWith(resourceProtocol)) 25 | loadResource(uri.substring(resourceProtocol.length), env) 26 | else 27 | loadResource(uri, env) 28 | withIncludes <- { 29 | // load #include 30 | val includePattern = "(?m)^\\s*#include\\s*<(.+)>.*$".r 31 | val includeUris = includePattern.findAllIn(contents).matchData.map(_.group(1)) 32 | if (includeUris.isEmpty) 33 | contents.success[ConfigError] 34 | else { 35 | val byUriIncludes = getIncludes(includeUris, filePathRoot, env) 36 | byUriIncludes.map(includes => includePattern.replaceAllIn(contents, m => includes(m.group(1)))) 37 | } 38 | } 39 | withEnvVars <- envVarSub(uri, withIncludes, env) 40 | } yield withEnvVars 41 | 42 | private def loadResource(uri: String, env: Map[String, String]): Validation[ConfigError, String] = { 43 | val fqUri = getClass.getResource(uri) 44 | if (fqUri != null) 45 | Source.fromURL(fqUri).mkString.success[ConfigError] 46 | else 47 | ConfigError(s"Failed to read resource $uri").failure[String] 48 | } 49 | 50 | private def loadFile(uri: String, env: Map[String, String]): Validation[ConfigError, String] = { 51 | val file = new java.io.File(uri) 52 | if (file.canRead) 53 | scala.io.Source.fromFile(file).mkString.success[ConfigError] 54 | else 55 | ConfigError(s"Failed to read file $uri").failure[String] 56 | } 57 | 58 | private def envVarSub(uri: String, contents: String, env: Map[String, String]): Validation[ConfigError, String] = { 59 | // replace all ${k}, with v, ensuring v can contain '$' 60 | val contents2 = env.foldLeft(contents) { case (soFar, (k, v)) => soFar.replaceAll("\\$\\{" + k + "\\}", v.replaceAll("\\$", "\\\\\\$")) } 61 | val remainingVars = "\\$\\{.*?\\}".r.findAllIn(contents2) 62 | if (remainingVars.isEmpty) 63 | contents2.success[ConfigError] 64 | else { 65 | val varNames = remainingVars.toList.distinct 66 | ConfigError(s"Unresolved env vars in $uri: ${varNames.mkString(", ")}").failure[String] 67 | } 68 | } 69 | 70 | private def getIncludes(uris: Iterator[String], filePathRoot: String, env: Map[String, String]): Validation[ConfigError, Map[String, String]] = { 71 | uris.map(uri => load(uri, filePathRoot, env).map(contents => Map(uri -> contents))).reduce(_ +++ _) 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/test/scala/spark_etl/oracle/OracleLoadAppenderSpec.scala: 
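A minimal sketch of the spec that the "FIXME: needs tests" in OracleValidator asks for, reusing the in-memory H2 stand-in that OracleLoadAppenderSpec (below) sets up; the class name, DB URI and table names here are illustrative only:

package spark_etl.oracle

import java.sql.DriverManager

import org.scalatest.{FlatSpec, Matchers}

class OracleValidatorSpec extends FlatSpec with Matchers {
  val dbUri = "jdbc:h2:mem:validatorDB;user=user;password=pwd"
  // keep one connection open so the in-memory DB outlives validateOracle's own open/close
  val conn = DriverManager.getConnection(dbUri)
  conn.createStatement().execute("CREATE TABLE user_tables(table_name VARCHAR(20))")
  conn.prepareStatement("INSERT INTO user_tables(table_name) values ('test_table')").executeUpdate()

  "OracleValidator" should "accept existing tables and reject missing ones" in {
    OracleValidator.validateOracle(dbUri, "user", "pwd", Seq("test_table")).isSuccess shouldBe true
    OracleValidator.validateOracle(dbUri, "user", "pwd", Seq("missing_table")).isSuccess shouldBe false
  }
}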
-------------------------------------------------------------------------------- 1 | package spark_etl.oracle 2 | 3 | import java.sql.DriverManager 4 | import java.util.Properties 5 | 6 | import org.apache.spark.sql.SparkSession 7 | import org.scalatest._ 8 | import spark_etl.model.Load 9 | 10 | class OracleLoadAppenderSpec extends FlatSpec with Matchers with Inside with BeforeAndAfterAll { 11 | val dbUri = "jdbc:h2:mem:testDB;user=user;password=pwd" 12 | val props = new Properties() 13 | val conn = DriverManager.getConnection(s"$dbUri;create=true", props) 14 | 15 | conn.createStatement().execute("CREATE TABLE user_tables(table_name VARCHAR(20))") 16 | conn.createStatement().execute("CREATE TABLE test_table(loadid VARCHAR(36), name VARCHAR(20), age INT)") 17 | conn.prepareStatement("INSERT INTO user_tables(table_name) values ('test_table')").executeUpdate() 18 | conn.commit() 19 | 20 | val loads = Seq(Load("test_load", "test_transform", "test_table")) 21 | 22 | "OracleLoadAppender" should "validate-local" in { 23 | new OracleLoadAppender(Map("oracle_uri" -> dbUri, "oracle_user" -> "user", "oracle_password" -> "pwd", "oracle_batch" -> 100)).checkLocal(loads).isSuccess shouldBe true 24 | new OracleLoadAppender(Map("oracle_user" -> "user", "oracle_password" -> "pwd", "oracle_batch" -> 100)).checkLocal(loads).isSuccess shouldBe false 25 | new OracleLoadAppender(Map("oracle_uri" -> dbUri, "oracle_password" -> "pwd", "oracle_batch" -> 100)).checkLocal(loads).isSuccess shouldBe false 26 | new OracleLoadAppender(Map("oracle_uri" -> dbUri, "oracle_user" -> "user", "oracle_batch" -> 100)).checkLocal(loads).isSuccess shouldBe false 27 | new OracleLoadAppender(Map("oracle_uri" -> dbUri, "oracle_user" -> "user", "oracle_password" -> "pwd")).checkLocal(loads).isSuccess shouldBe false 28 | } 29 | 30 | it should "validate-remote" in { 31 | new OracleLoadAppender(Map("oracle_uri" -> dbUri, "oracle_user" -> "user", "oracle_password" -> "pwd", "oracle_batch" -> 100)).checkRemote(loads).isSuccess shouldBe true 32 | new OracleLoadAppender(Map("oracle_uri" -> "jdbc:h2:mem:__bogus_db__", "oracle_user" -> "user", "oracle_password" -> "pwd", "oracle_batch" -> 100)).checkRemote(loads).isSuccess shouldBe false 33 | } 34 | 35 | it should "write" in { 36 | implicit val spark = SparkSession.builder.appName("test") 37 | .master("local[1]") 38 | .config("spark.ui.port", 4050).config("spark.history.ui.port", 18090) 39 | .getOrCreate 40 | try { 41 | import spark.implicits._ 42 | val inDF = Seq(PersonRow("Joe", 50), PersonRow("Jane", 23)).toDF() 43 | // mocking driver for db 44 | new OracleLoadAppender(Map("oracle_uri" -> dbUri, "oracle_user" -> "user", "oracle_password" -> "pwd", "oracle_driver" -> "org.h2.Driver")).write(Seq( 45 | (Load("test_load", "test_transform", "test_table"), inDF) 46 | )) 47 | } finally { 48 | spark.stop() 49 | } 50 | 51 | conn.commit() 52 | 53 | // validate load write 54 | val rs = conn.prepareStatement("SELECT count(1) FROM test_table").executeQuery() 55 | rs.next() 56 | rs.getInt(1) shouldBe 2 57 | } 58 | 59 | override def afterAll: Unit = conn.createStatement().execute("SHUTDOWN") 60 | } 61 | 62 | case class PersonRow(NAME: String, AGE: Int) // Note, fields need to be uppercase, otherwise it confuses spark.jdbc 63 | -------------------------------------------------------------------------------- /src/test/scala/spark_etl/util/UriLoaderSpec.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.util 2 | 3 | import org.scalatest.{FlatSpec, 
Inside, Matchers} 4 | import spark_etl.ConfigError 5 | import spark_etl.util.Validation._ 6 | 7 | class UriLoaderSpec extends FlatSpec with Matchers with Inside { 8 | "UriLoader" should "read resource without tokens" in { 9 | validateWithoutVars("/uri-loader/without_env_vars", "__ignore__") 10 | validateWithoutVars("resource:/uri-loader/without_env_vars", "__ignore__") 11 | } 12 | 13 | it should "read resource with tokens" in { 14 | validateWithVars("/uri-loader/with_env_vars", "__ignore__") 15 | validateWithVars("resource:/uri-loader/with_env_vars", "__ignore__") 16 | } 17 | 18 | it should "read file without tokens" in { 19 | val asFilename = getClass.getResource("/uri-loader/without_env_vars").toString 20 | asFilename should startWith("file:") 21 | validateWithoutVars(asFilename, "__ignore__") 22 | } 23 | 24 | it should "read file with tokens" in { 25 | val asFilename = getClass.getResource("/uri-loader/with_env_vars").toString 26 | asFilename should startWith("file:") 27 | validateWithVars(asFilename, "__ignore__") 28 | validateWithVars("file:uri-loader/with_env_vars", getClass.getResource("/").getFile) 29 | } 30 | 31 | it should "fail to read bogus files/resources" in { 32 | validateNotExists("/bogus_uri", "__ignore__") 33 | validateNotExists("/bogus_uri", Files.pwd) 34 | validateNotExists("resource:/bogus_uri", "__ignore__") 35 | validateNotExists("resource:/bogus_uri", Files.pwd) 36 | validateNotExists("file:/bogus_uri", "__ignore__") 37 | validateNotExists("file:/bogus_uri", Files.pwd) 38 | } 39 | 40 | it should "read #includes" in { 41 | val expected = 42 | """|=== 43 | |hello there 44 | |123 45 | | 46 | |--- 47 | |111 val1 222 val2 333 val1 444 48 | | 49 | |+++""".stripMargin 50 | UriLoader.load("/uri-loader/with_includes", "__ignore__", Map("var1" -> "val1", "var2" -> "val2")) shouldBe expected.success[ConfigError] 51 | } 52 | 53 | it should "read fail on bogus #include" in { 54 | inside(UriLoader.load("/uri-loader/with_bogus_includes", "__ignore__", Map("var1" -> "val1", "var2" -> "val2"))) { 55 | case Failure(err) => 56 | err.toList shouldBe List(ConfigError("Failed to read resource __bogus_include__")) 57 | } 58 | } 59 | 60 | private def validateWithoutVars(uri: String, fileRoot: String) = { 61 | UriLoader.load(uri, fileRoot, Map("var1" -> "val1")) shouldBe "hello there\n123\n".success[ConfigError] 62 | } 63 | 64 | private def validateWithVars(uri: String, fileRoot: String) = { 65 | UriLoader.load(uri, fileRoot, Map("var1" -> "val1", "var2" -> "val2")) shouldBe "111 val1 222 val2 333 val1 444\n".success[ConfigError] 66 | inside(UriLoader.load(uri, fileRoot, Map.empty)) { 67 | case Failure(err) => 68 | err.toList.length shouldBe 1 69 | err.head.msg should startWith("Unresolved env vars in") 70 | err.head.msg should endWith("${var1}, ${var2}") 71 | } 72 | } 73 | 74 | private def validateNotExists(uri: String, fileRoot: String) = { 75 | inside(UriLoader.load(uri, fileRoot, Map.empty)) { 76 | case Failure(err) => 77 | err.toList.length shouldBe 1 78 | err.head.msg should startWith("Failed to read") 79 | } 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/parquet/PathValidator.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.parquet 2 | 3 | import java.io.File 4 | 5 | import org.apache.hadoop.conf.Configuration 6 | import org.apache.hadoop.fs.{FileSystem, RemoteIterator} 7 | import spark_etl.ConfigError 8 | import spark_etl.util.Validation 9 | 
import spark_etl.util.Validation._ 10 | 11 | /** 12 | * Validate parquet paths on hdfs and local. 13 | * - validates path 14 | * - (optionally) validates path children 15 | * - expects children to be _temporary, _SUCCESS, ._SUCCESS.crc 16 | * - or if expecting partitions, children to contain "=" 17 | */ 18 | object PathValidator { 19 | def validate(checkChildren: Boolean, expectPartition: Boolean, paths: String*): Validation[ConfigError, List[String]] = { 20 | val validated = paths.map { p => 21 | if (p.toLowerCase.startsWith("hdfs://")) { 22 | val hadoopConf = new Configuration() 23 | val fs = org.apache.hadoop.fs.FileSystem.get(hadoopConf) 24 | val fsPath = new org.apache.hadoop.fs.Path(p) 25 | if (!fs.exists(fsPath)) 26 | ConfigError(s"hdfs path doesn't exist: $p").failure[String] 27 | else if (checkChildren) { 28 | val children = hdfsList(fs, fsPath) 29 | if (areValid(children, expectPartition)) 30 | p.success[ConfigError] 31 | else if (children.isEmpty) 32 | ConfigError(s"hdfs path is empty for $p").failure[String] 33 | else 34 | ConfigError(s"Unexpected hdfs children for $p: ${children.map(trimRoot(p)).mkString(", ")} ").failure[String] 35 | } 36 | else 37 | p.success[ConfigError] 38 | } else { 39 | val ioPath = new File(p) 40 | if (! ioPath.exists) 41 | ConfigError(s"Local path doesn't exist: $p").failure[String] 42 | else if (checkChildren) { 43 | val children = ioPath.list().toSeq 44 | if (areValid(children, expectPartition)) 45 | p.success[ConfigError] 46 | else if (children.isEmpty) 47 | ConfigError(s"Local path is empty for $p").failure[String] 48 | else 49 | ConfigError(s"Unexpected local children for $p: ${children.map(trimRoot(p)).mkString(", ")}").failure[String] 50 | } 51 | else 52 | p.success[ConfigError] 53 | } 54 | } 55 | val res = validated.map(_.map(List(_))).reduce(_ +++ _) 56 | res 57 | } 58 | 59 | def hdfsList(fs: FileSystem, parent: org.apache.hadoop.fs.Path): Seq[String] = { 60 | // as per: https://issues.apache.org/jira/browse/HDFS-7921 61 | // listing recursively, including all files, then filtering distinct immediate children 62 | val parentStr = parent.toUri.toString 63 | toIterator(fs.listFiles(parent, true)) 64 | .map { 65 | f => 66 | val descendant = f.getPath.toUri.toString 67 | val relativeDescendant = descendant.substring(parentStr.length + 1) 68 | val immediateChild = relativeDescendant.split("/").head 69 | s"$parent/$immediateChild" 70 | }.toSeq.distinct 71 | } 72 | 73 | private def trimRoot(root: String)(path: String): String = { 74 | val root2 = if (root.endsWith("/")) 75 | root 76 | else 77 | root + "/" 78 | if (path.startsWith(root2)) 79 | path.substring(root2.length) 80 | else 81 | path 82 | } 83 | 84 | private def toIterator[T](iter: RemoteIterator[T]) = 85 | new Iterator[T] { 86 | def hasNext = iter.hasNext 87 | def next = iter.next 88 | } 89 | 90 | private def areValid(paths: Seq[String], expectPartition: Boolean) = { 91 | lazy val hasValidChildren = paths.forall { 92 | path => 93 | val lastElem = path.split("/").last 94 | lastElem == "_temporary" || lastElem == "_SUCCESS" || lastElem == "._SUCCESS.crc" || (expectPartition && lastElem.contains("=")) 95 | } 96 | paths.nonEmpty && hasValidChildren 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | spark-etl 2 | ========== 3 | 4 | Tooling for configuration and SQL transform driven Spark ETLs. 
For a usage example, see [spark-etl-demo](https://github.com/konrads/spark-etl-demo). 5 | 6 | Build status (master): [![Build Status](https://travis-ci.org/konrads/spark-etl.svg?branch=master)](https://travis-ci.org/konrads/spark-etl) 7 | 8 | Philosophy 9 | ---------- 10 | This library facilitates productionizing configuration/SQL-driven Spark ETL pipelines. Emphasis is on: 11 | * configuration and SQLs treated as first-class citizens 12 | * build-time validation comprising syntactical checks of config and SQL, ensuring that SQL datasources map to configured `extract`s and `transform`s 13 | * run-time validation comprising verification of data source (`extract`) URIs and connectivity to the [LoadWriter](src/main/scala/spark_etl/LoadWriter.scala) 14 | * optional validation of `extract` datasources 15 | * optional validation of `transform` outputs (before `load` writing) 16 | * config and SQL parametrization via `${var}`-style variables, configured at runtime via `-Denv.var=value`. Some default env vars (e.g. `${yyyy-MM-dd-1d}`, `${utc-eod-1d}`) are supplied by [DefaultEnv](src/main/scala/spark_etl/util/DefaultEnv.scala) 17 | * CLI support for commands: `validate-local`, `validate-remote`, `extract-check`, `transform-check`, `transform-load` 18 | 19 | Sample setup 20 | ------------ 21 | Set up `src/main/resources/app.yaml`: 22 | ``` 23 | extracts: 24 | - name: client 25 | uri: "hdfs://${path}/client_2017" 26 | check: "/spark/extract-check/client.sql" 27 | cache: true 28 | - name: item 29 | uri: "hdfs://${path}/item_2017" 30 | - name: transaction 31 | uri: "hdfs://${path}/transaction_2017" 32 | 33 | transforms: 34 | - name: client_spending 35 | sql: "/spark/transform/client_spending.sql" 36 | - name: item_purchase 37 | sql: "/spark/transform/item_purchase.sql" 38 | - name: minor_purchase 39 | check: "/spark/transform-check/minor_purchase.sql" 40 | sql: "/spark/transform/minor_purchase.sql" 41 | cache: true 42 | 43 | loads: 44 | - name: client_spending_out 45 | source: client_spending 46 | uri: "hdfs://out/client_spending" 47 | partition_by: ["col1", "col2"] 48 | - name: item_purchase_out 49 | source: item_purchase 50 | uri: "hdfs://out/item_purchase" 51 | - name: minor_purchase_out 52 | source: minor_purchase 53 | uri: "hdfs://out/minor_purchase" 54 | 55 | load_writer: 56 | class: "spark_etl.JdbcLoadWriter" 57 | params: 58 | jdbc_uri: ${jdbc_uri} 59 | jdbc_user: ${jdbc_user} 60 | jdbc_password: ${jdbc_password} 61 | ``` 62 | 63 | Set up your SQLs as per the layout below. All SQLs are `SELECT` statements: `transform`s produce potentially sizable `DataFrame`s to be persisted as `load`s, while `extract-check` and `transform-check` produce smaller `DataFrame`s, which are logged out for visual inspection: 64 | ``` 65 | src -+ 66 | | 67 | +- spark 68 | | 69 | +- extract-check 70 | | | 71 | | +- client.sql # NOTE: optional extract validation! 72 | | 73 | +- transform 74 | | | 75 | | +- client_spending.sql 76 | | | 77 | | +- item_purchase.sql 78 | | | 79 | | +- minor_purchase.sql 80 | | 81 | +- transform-check 82 | | 83 | +- minor_purchase.sql # NOTE: optional transform validation! 84 | ``` 85 | 86 | Generate lineage in dot format: 87 | ``` 88 | sbt "run-main spark_etl.CLI -Denv.path=some_path lineage-dot" 89 | ``` 90 | 91 | Validate local config/SQLs. The suggested use is to run this as part of the build, with a validation failure stopping the build: 92 | ``` 93 | sbt "run-main spark_etl.CLI -Denv.path=some_path validate-local" 94 | ``` 95 | 96 | Deploy to the cluster, with read access to `hdfs://some_path` and write access to `hdfs://out`.
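All of the commands above are handled by `spark_etl.CLI`, which can also be invoked programmatically, e.g. from a test (the bundled parquet roundtrip spec drives the CLI this way); a minimal sketch, with illustrative arguments (`--conf-uri` defaults to `/app.yaml`):
```
import spark_etl.CLI

// programmatic equivalent of: sbt "run-main spark_etl.CLI -Denv.path=some_path validate-local"
CLI.main(Array("-Denv.path=some_path", "--conf-uri", "/app.yaml", "validate-local"))
```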
If using yarn, utilize: [run.sh](src/main/resources/run.sh) 97 | ``` 98 | run.sh -Denv.path=some_path validate-remote 99 | ``` 100 | 101 | Run extract and transform validations on the cluster. The following will fail *only* if any of the return set rows contains a `false`: 102 | ``` 103 | run.sh -Denv.path=some_path extract-check 104 | run.sh -Denv.path=some_path transform-check 105 | ``` 106 | 107 | Run transformation and persist loads: 108 | ``` 109 | run.sh -Denv.path=some_path transform-load 110 | ``` 111 | 112 | If env `PACKAGE_LOGS=true`, `run.sh`'s cluster operations (`transform-load`, `extract-check`, `transform-check`) capture both driver and yarn logs under `logs/$app_id/logs_$app_id.zip`. 113 | -------------------------------------------------------------------------------- /src/main/resources/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # configurable env vars 4 | RUN_DIR=${RUN_DIR:-.} 5 | MAIN_CLASS=${MAIN_CLASS:-spark_etl.CLI} 6 | HADOOP_VSN=${HADOOP_VSN:-2.7.3} 7 | SPARK_JARS=${SPARK_JARS:-/opt/spark/spark-2.1.0-bin-hadoop2.7/jars} 8 | HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-/etc/hadoop/conf} 9 | PACKAGE_LOGS=${PACKAGE_LOGS:-false} 10 | 11 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 12 | cd $DIR 13 | 14 | OTHER_CP=$SPARK_JARS/hadoop-hdfs-${HADOOP_VSN}.jar:$SPARK_JARS/hadoop-common-${HADOOP_VSN}.jar:$HADOOP_CONF_DIR 15 | SPARK_STORAGE_LEVEL=MEMORY_AND_DISK_SER_2 16 | SPARK_NUM_EXECUTORS=250 17 | SPARK_EXECUTOR_MEMORY=7G 18 | SPARK_EXECUTOR_CORES=2 19 | SPARK_HOME=/opt/spark/spark-2.1.0-bin-hadoop2.7 20 | #export YARN_CONF_DIR=$HADOOP_CONF_DIR 21 | JAR=$(ls $RUN_DIR/*-assembly*.jar) 22 | RT=$(date --date="6:00 today" --iso-8601=seconds | cut -f1 -d'+') 23 | TIMESTAMP="${RT}Australia/Sydney" 24 | 25 | CMD="$SPARK_HOME/bin/spark-submit \ 26 | --conf spark.debug.maxToStringFields=1024 \ 27 | --conf spark.driver.extraJavaOptions='-XX:PermSize=512m -XX:MaxPermSize=512m' \ 28 | --conf spark.yarn.maxAppAttempts=1 \ 29 | --conf spark.yarn.max.executor.failures=200 \ 30 | --conf spark.driver.memory=7G \ 31 | --conf spark.driver.maxResultSize=4G \ 32 | --conf spark.sql.warehouse.dir=hdfs://nameservice1/user/hive/warehouse \ 33 | --conf spark.locality.wait=0 \ 34 | --conf spark.io.compression.codec=org.apache.spark.io.SnappyCompressionCodec \ 35 | --conf spark.rdd.compress=true \ 36 | --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ 37 | --conf spark.sql.parquet.compression.codec=snappy \ 38 | --conf spark.sql.inMemoryColumnarStorage.compressed=true \ 39 | --conf spark.sql.inMemoryColumnarStorage.batchSize=100000 \ 40 | --conf spark.sql.crossJoin.enabled=true \ 41 | --conf spark.task.maxFailures=20 \ 42 | --master yarn \ 43 | --deploy-mode cluster \ 44 | --num-executors $SPARK_NUM_EXECUTORS \ 45 | --executor-memory $SPARK_EXECUTOR_MEMORY \ 46 | --executor-cores $SPARK_EXECUTOR_CORES \ 47 | --class $MAIN_CLASS \ 48 | $JAR" 49 | 50 | green="\033[32m" 51 | red="\033[31m" 52 | bold="\033[1m" 53 | reset="\033[0m" 54 | 55 | log_green() { 56 | echo -e "${green}$@${reset}" 57 | } 58 | 59 | log_bold() { 60 | echo -e "${bold}$@${reset}" 61 | } 62 | 63 | log_red() { 64 | echo -e "${red}$@${reset}" 65 | } 66 | 67 | check_package_logs() { 68 | if [ "$PACKAGE_LOGS" == "true" ] 69 | then 70 | log_bold "...Log packaging enabled" 71 | else 72 | log_bold "...Log packaging disabled" 73 | fi 74 | } 75 | 76 | package_logs() { 77 | if [ "$PACKAGE_LOGS" == "true" ] 78 | then 79 | local_log_file=$1 80 | 
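    # extract the YARN application id (application_XXX_YYY) from the locally captured spark-submit output,
    # so the matching cluster logs can be fetched via `yarn logs` and zipped alongside the local log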
app_id=`cat $local_log_file | grep 'Submitting application application_[0-9]*_[0-9]*' | sed -r 's/.*(application_[0-9]*_[0-9]*).*/\1/g'` 81 | if [ -z "$app_id" ] 82 | then 83 | log_red "No application_XXX_YYY found in local log $local_log_file!" 84 | exit 11 85 | else 86 | log_bold "Packaging logs for $app_id after a 10 sec sleep" 87 | sleep 10 88 | mkdir -p logs/$app_id 89 | rm -rf logs/current 90 | ln -s $app_id logs/current 91 | cp $local_log_file logs/$app_id/$app_id.local.log 92 | yarn logs --applicationId $app_id > logs/$app_id/$app_id.remote.log 93 | cd logs/$app_id 94 | zip logs_$app_id.zip *.log 95 | cd ../.. 96 | log_bold "Logs available at logs/$app_id/logs_$app_id.zip" 97 | fi 98 | fi 99 | } 100 | 101 | usage() { 102 | log_bold " Usage:" 103 | log_bold " help" 104 | log_bold " validate-local" 105 | log_bold " validate-remote" 106 | log_bold " transform" 107 | log_bold " extract-check" 108 | log_bold " transform-check" 109 | exit 1 110 | } 111 | 112 | set -e 113 | trail_arg="${@: -1}" 114 | if [[ $# -lt 1 || "$trail_arg" == "help" ]]; then usage; fi 115 | case "$trail_arg" in 116 | "validate-local") 117 | log_bold "Validating local configuration..." 118 | java -cp $JAR:$OTHER_CP $MAIN_CLASS $@ 119 | ;; 120 | "validate-remote") 121 | log_bold "Validating remote aspects..." 122 | java -cp $JAR:$OTHER_CP $MAIN_CLASS $@ 123 | ;; 124 | "transform") 125 | log_bold "Run and persist transform..." 126 | check_package_logs 127 | YARN_CONF_DIR=$HADOOP_CONF_DIR eval $CMD $@ 2>&1 | tee .current_local.log 128 | package_logs .current_local.log 129 | ;; 130 | "extract-check") 131 | log_bold "Run extract check..." 132 | check_package_logs 133 | YARN_CONF_DIR=$HADOOP_CONF_DIR eval $CMD $@ 2>&1 | tee .current_local.log 134 | package_logs .current_local.log 135 | ;; 136 | "transform-check") 137 | log_bold "Run transform check..." 138 | check_package_logs 139 | YARN_CONF_DIR=$HADOOP_CONF_DIR eval $CMD $@ 2>&1 | tee .current_local.log 140 | package_logs .current_local.log 141 | ;; 142 | *) 143 | log_red "Not a valid command: $@" 144 | usage 145 | ;; 146 | esac 147 | log_green "$trail_arg done!" 148 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/util/DepTree.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.util 2 | 3 | import scala.annotation.tailrec 4 | 5 | /** 6 | * Constructs dependency graph consisting of Vertices and Edges: 7 | * - Vertex has id and VertexType 8 | * - VertexType classification: 9 | * 10 | * VertexType 11 | * | 12 | * +-----------+----------+ 13 | * | | 14 | * RootType OtherType 15 | * +----+-------+ +-+-+--+----+ 16 | * | | | | | | 17 | * L LCheck ECheck E T Dangling 18 | * 19 | * - Edge has source and target Vertices 20 | * 21 | * Process: 22 | * - go through all E|ECheck|T|TCheck|L 23 | * - add all Edges, by first looking up all the Vertices, then linking them. If Vertex doesn't exist, mark it as Dangling 24 | * 25 | * Note: Order of Vertices and Edges is preserved by the LinkedHashSet 26 | */ 27 | class DepTree(knownVertices : Seq[Vertex]) { 28 | private val vertices = collection.mutable.LinkedHashSet[Vertex](knownVertices:_*) 29 | private val edges = collection.mutable.LinkedHashSet.empty[Edge] 30 | private val allTypes = Set(E, T, L, Echeck, Tcheck, Dangling) 31 | private val nonDangling = allTypes - Dangling 32 | 33 | /** 34 | * Add Edge, fetching from either a list of known, or marking it as Dangling. 
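   * e.g. addEdge("client", Vertex("client_spending", T)) records that transform "client_spending"
   * reads from "client"; if "client" is not among the known vertices it is registered as Dangling.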
35 | */ 36 | def addEdge(sourceId: String, target: Vertex, isExplicit: Boolean = false): Unit = { 37 | val source = vertices 38 | .collectFirst { case v @ Vertex(id, _:OtherType) if sourceId == id => v } 39 | .getOrElse(Vertex(sourceId, Dangling)) 40 | vertices += source // add if dangling 41 | edges += Edge(source, target, isExplicit) 42 | } 43 | 44 | def dangling: Seq[Edge] = 45 | edges.filter(_.source.`type` == Dangling).toSeq 46 | 47 | /** 48 | * Walk down the graph from root objects, collect all encountered Vertices. 49 | */ 50 | def rootfull(types: Set[VertexType] = nonDangling): Seq[Vertex] = { 51 | val vs = collect(vertices.collect { case v @ Vertex(_, _: RootType) => v }.toSeq) 52 | val orderedVs = vertices.intersect(vs.toSet).toSeq // re-order as per original 53 | orderedVs.filter(v => types.contains(v.`type`)) 54 | } 55 | 56 | /** 57 | * All vertices -- rootfull 58 | */ 59 | def rootless: Seq[Vertex] = 60 | (vertices -- rootfull()).toSeq 61 | 62 | def forType(`type`: VertexType): Seq[Vertex] = 63 | rootfull().filter(_.`type` == `type`) 64 | 65 | def asDot(name: String = "Lineage", fontSize: Int = 12): String = { 66 | val plottableVertices = rootfull(Set(E, T, L)) 67 | val plottableEdges = edges.filter(e => plottableVertices.contains(e.source) && plottableVertices.contains(e.target)) 68 | val edgeStrs = plottableEdges.collect { 69 | case Edge(srcV, targetV, false) => 70 | s"${srcV.id} -> ${targetV.id} [style=dotted]" 71 | case Edge(srcV, targetV, true) => 72 | s"${srcV.id} -> ${targetV.id}" 73 | } 74 | val verticeStrs = plottableVertices.collect { 75 | case Vertex(id, E) => s"$id" 76 | case Vertex(id, T) => s"""$id [shape=component]""" 77 | case Vertex(id, L) => s"""$id [shape=cylinder]""" 78 | } 79 | val rankStrs = plottableVertices.groupBy(_.`type`).flatMap { 80 | case (E, vs) => Seq(s"""{ rank=same; ${vs.map(_.id).mkString(" ")} }""") 81 | case (L, vs) => Seq(s"""{ rank=same; ${vs.map(_.id).mkString(" ")} }""") 82 | case _ => Nil 83 | } 84 | s"""digraph $name { 85 | | rankdir=LR 86 | | node [fontsize=$fontSize] 87 | | 88 | | # vertices 89 | | ${verticeStrs.mkString("\n ")} 90 | | 91 | | # edges 92 | | ${edgeStrs.mkString("\n ")} 93 | | 94 | | # ranks 95 | | ${rankStrs.toList.sorted.mkString("\n ")} 96 | |}""".stripMargin 97 | } 98 | 99 | @tailrec 100 | private def collect( 101 | roots: Seq[Vertex], 102 | types: Set[VertexType] = nonDangling, 103 | soFar: Seq[Vertex] = Nil): Seq[Vertex] = { 104 | val sources = roots.flatMap(v => edges.collect { 105 | case e if e.target == v && types.contains(e.source.`type`) && ! 
soFar.contains(e.source) => e.source 106 | }) 107 | val soFar2 = (soFar ++ roots).distinct 108 | if (sources.nonEmpty) 109 | collect(sources, types, soFar2) 110 | else 111 | soFar2 112 | } 113 | } 114 | 115 | case class Vertex(id: String, `type`: VertexType) 116 | case class Edge(source: Vertex, target: Vertex, isExplicit: Boolean) 117 | 118 | sealed trait VertexType { def asStr: String } 119 | sealed trait RootType extends VertexType 120 | sealed trait OtherType extends VertexType 121 | object E extends OtherType { override def asStr = "extract" } 122 | object T extends OtherType { override def asStr = "transform" } 123 | object L extends RootType { override def asStr = "load" } 124 | object Echeck extends RootType { override def asStr = "extract-check" } 125 | object Tcheck extends RootType { override def asStr = "transform-check" } 126 | object Dangling extends OtherType { override def asStr = "dangling" } 127 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/CLI.scala: -------------------------------------------------------------------------------- 1 | package spark_etl 2 | 3 | import java.io.{File, PrintWriter, StringWriter} 4 | 5 | import org.apache.log4j.Logger 6 | import org.apache.spark.sql._ 7 | import org.joda.time.DateTime 8 | import org.rogach.scallop._ 9 | import spark_etl.util.{DefaultEnv, Failure, Files, Success} 10 | 11 | import scala.util.Random 12 | 13 | object CLI { 14 | val log = Logger.getLogger(getClass) 15 | 16 | sealed trait CliCommand 17 | object LineageDot extends CliCommand 18 | object ValidateLocal extends CliCommand 19 | object ValidateRemote extends CliCommand 20 | object TransformLoad extends CliCommand 21 | object ExtractCheck extends CliCommand 22 | object TransformCheck extends CliCommand 23 | object StripPrefixes extends CliCommand 24 | object CliCommand { 25 | implicit val cliCommandConverter = singleArgConverter[CliCommand] { 26 | case "lineage-dot" => LineageDot 27 | case "validate-local" => ValidateLocal 28 | case "validate-remote" => ValidateRemote 29 | case "transform-load" => TransformLoad 30 | case "extract-check" => ExtractCheck 31 | case "transform-check" => TransformCheck 32 | case "strip-prefixes" => StripPrefixes 33 | } 34 | } 35 | 36 | val className = getClass.getSimpleName 37 | class CliConf(args: Seq[String]) extends ScallopConf(args) { 38 | banner(s"""Usage: $className [OPTIONS] (all options required unless otherwise indicated)\n\tOptions:""") 39 | val extraProps = props[String]() 40 | val confUri = opt[String](name = "conf-uri", descr = "configuration resource uri", default = Some("/app.yaml")) 41 | val lineageFile = opt[String](name = "lineage-file", descr = "target lineage dot file", default = Some("lineage.dot")) 42 | val baSqlDir = opt[File](name = "ba-sql-dir", descr = "dir with BA sql", default = Some(new File("src/main/resources/spark"))) 43 | val devSqlDir = opt[File](name = "dev-sql-dir", descr = "dir with DEV sql", default = Some(new File("src/main/resources/spark"))) 44 | val rmDevSqlDir = opt[Boolean](name = "rm-dev-sql-dir", descr = "should remove dev sql dir?", default = Some(false)) 45 | val count = toggle(name = "count", descrYes = "enable transform counts", default = Some(false)) 46 | val command = trailArg[CliCommand](name = "command", descr = "command") 47 | verify() 48 | } 49 | 50 | type ErrorHandler = List[ConfigError] => Unit 51 | 52 | def main(args: Array[String]): Unit = { 53 | val conf = new CliConf(args) 54 | main(conf.command(), conf.confUri(), 
conf.extraProps, conf.count(), conf.lineageFile(), conf.baSqlDir(), conf.devSqlDir(), conf.rmDevSqlDir()) 55 | } 56 | 57 | def main(command: CliCommand, confUri: String, extraProps: Map[String, String], shouldCount: Boolean, lineageFile: String, baSqlDir: File, devSqlDir: File, rmDevSqlDir: Boolean, errorHandler: ErrorHandler = die): Unit = { 58 | def createSpark(name: String, props: Map[String, String], isMaster: Boolean): SparkSession = { 59 | val builder = if (isMaster) 60 | SparkSession.builder.appName(name).master("local[1]").config("spark.ui.port", random(4041, 4999)).config("spark.history.ui.port", random(18080, 19000)) 61 | else 62 | SparkSession.builder.appName(name) 63 | if (isMaster) 64 | props.collect { case (k, v) if k.startsWith("spark.") => builder.config(k, v) } 65 | builder.getOrCreate 66 | } 67 | 68 | val pwd = Files.pwd 69 | val defaultEnv = DefaultEnv.getAll(DateTime.now) 70 | val paramEnv = extraProps.collect { case (k, v) if k.startsWith("env.") => k.substring("env.".length) -> v } 71 | val env = defaultEnv ++ paramEnv 72 | val res = command match { 73 | case LineageDot => 74 | CLIOps.dotLineage(confUri, pwd, env, lineageFile) 75 | case ValidateLocal => 76 | CLIOps.validateLocal(confUri, pwd, env) 77 | case ValidateRemote => 78 | implicit val spark = createSpark(className, extraProps, true) 79 | try { 80 | CLIOps.validateRemote(confUri, pwd, env) 81 | } finally { 82 | spark.stop() 83 | } 84 | case TransformLoad => 85 | implicit val spark = createSpark(className, extraProps, false) 86 | try { 87 | CLIOps.transformAndLoad(confUri, pwd, env, extraProps, shouldCount) 88 | } finally { 89 | spark.stop() 90 | } 91 | case ExtractCheck => 92 | implicit val spark = createSpark(className, extraProps, false) 93 | try { 94 | CLIOps.extractCheck(confUri, pwd, env) 95 | } finally { 96 | spark.stop() 97 | } 98 | case TransformCheck => 99 | implicit val spark = createSpark(className, extraProps, false) 100 | try { 101 | CLIOps.transformCheck(confUri, pwd, env, shouldCount) 102 | } finally { 103 | spark.stop() 104 | } 105 | case StripPrefixes => 106 | CLIOps.stripPrefixes(baSqlDir, devSqlDir, rmDevSqlDir) 107 | } 108 | 109 | res match { 110 | case Success(_) => 111 | log.info("Success!") 112 | case Failure(errors) => 113 | val errorStr = errors.map(e => e.exc.map(exc => s"• ${e.msg}, exception: $exc\n${stacktrace(exc)}").getOrElse(s"• ${e.msg}")).toList.mkString("\n") 114 | log.error(s"Failed due to:\n$errorStr") 115 | errorHandler(errors.toList) 116 | } 117 | } 118 | 119 | private def die(errors: List[ConfigError]): Unit = 120 | System.exit(1) 121 | 122 | private def stacktrace(t: Throwable) = { 123 | val w = new StringWriter 124 | t.printStackTrace(new PrintWriter(w)) 125 | w.toString 126 | } 127 | 128 | private val rand = new Random(System.currentTimeMillis) 129 | private def random(min: Int, max: Int) = min + rand.nextInt(max - min) 130 | } 131 | -------------------------------------------------------------------------------- /src/test/scala/spark_etl/util/ValidationSpec.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.util 2 | 3 | import org.scalatest.{FlatSpec, Inside, Matchers} 4 | import spark_etl.util.Validation._ 5 | 6 | class ValidationSpec extends FlatSpec with Matchers with Inside { 7 | val unitSuccesses = (0 to 9).map(_ => ().success[String]) 8 | val errorStrs = (0 to 9).map(i => s"error:$i") 9 | val failures = errorStrs.map(i => i.failure[Unit]) 10 | 11 | "Validation" should "merge up to 10 successes" in { 12 | 
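    // +++ combines two validations (successes merge via a ValidationMerger, failures accumulate all errors);
    // merge(...) does the same for up to 10 validations at once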
(2 to 9).foreach(i => unitSuccesses.take(i).reduce(_ +++ _) shouldBe Success(())) 13 | merge("00".success[String], 11.success[String]) { 14 | (r0, r1) => s"$r0:$r1" 15 | } shouldBe Success("00:11") 16 | merge("00".success[String], 11.success[String], "22".success[String]) { 17 | (r0, r1, r2) => s"$r0:$r1:$r2" 18 | } shouldBe Success("00:11:22") 19 | merge("00".success[String], 11.success[String], "22".success[String], 33.success[String]) { 20 | (r0, r1, r2, r3) => s"$r0:$r1:$r2:$r3" 21 | } shouldBe Success("00:11:22:33") 22 | merge("00".success[String], 11.success[String], "22".success[String], 33.success[String], "44".success[String]) { 23 | (r0, r1, r2, r3, r4) => s"$r0:$r1:$r2:$r3:$r4" 24 | } shouldBe Success("00:11:22:33:44") 25 | merge("00".success[String], 11.success[String], "22".success[String], 33.success[String], "44".success[String], 55.success[String]) { 26 | (r0, r1, r2, r3, r4, r5) => s"$r0:$r1:$r2:$r3:$r4:$r5" 27 | } shouldBe Success("00:11:22:33:44:55") 28 | merge("00".success[String], 11.success[String], "22".success[String], 33.success[String], "44".success[String], 55.success[String], "66".success[String]) { 29 | (r0, r1, r2, r3, r4, r5, r6) => s"$r0:$r1:$r2:$r3:$r4:$r5:$r6" 30 | } shouldBe Success("00:11:22:33:44:55:66") 31 | merge("00".success[String], 11.success[String], "22".success[String], 33.success[String], "44".success[String], 55.success[String], "66".success[String], 77.success[String]) { 32 | (r0, r1, r2, r3, r4, r5, r6, r7) => s"$r0:$r1:$r2:$r3:$r4:$r5:$r6:$r7" 33 | } shouldBe Success("00:11:22:33:44:55:66:77") 34 | merge("00".success[String], 11.success[String], "22".success[String], 33.success[String], "44".success[String], 55.success[String], "66".success[String], 77.success[String], "88".success[String]) { 35 | (r0, r1, r2, r3, r4, r5, r6, r7, r8) => s"$r0:$r1:$r2:$r3:$r4:$r5:$r6:$r7:$r8" 36 | } shouldBe Success("00:11:22:33:44:55:66:77:88") 37 | merge("00".success[String], 11.success[String], "22".success[String], 33.success[String], "44".success[String], 55.success[String], "66".success[String], 77.success[String], "88".success[String], 99.success[String]) { 38 | (r0, r1, r2, r3, r4, r5, r6, r7, r8, r9) => s"$r0:$r1:$r2:$r3:$r4:$r5:$r6:$r7:$r8:$r9" 39 | } shouldBe Success("00:11:22:33:44:55:66:77:88:99") 40 | } 41 | 42 | it should "merge up to 10 failures" in { 43 | (2 to 9).foreach(i => failures.take(i).reduce(_ +++ _) shouldBe Failure(errorStrs.take(i))) 44 | merge("00".success[String], "err".failure[String]) { 45 | (r0, r1) => s"$r0:$r1" 46 | } shouldBe Failure(List("err")) 47 | merge("00".success[String], 11.success[String], "err".failure[String]) { 48 | (r0, r1, r2) => s"$r0:$r1:$r2" 49 | } shouldBe Failure(List("err")) 50 | merge("00".success[String], 11.success[String], "22".success[String], "err".failure[String]) { 51 | (r0, r1, r2, r3) => s"$r0:$r1:$r2:$r3" 52 | } shouldBe Failure(List("err")) 53 | merge("00".success[String], 11.success[String], "22".success[String], 33.success[String], "err".failure[String]) { 54 | (r0, r1, r2, r3, r4) => s"$r0:$r1:$r2:$r3:$r4" 55 | } shouldBe Failure(List("err")) 56 | merge("00".success[String], 11.success[String], "22".success[String], 33.success[String], "44".success[String], "err".failure[String]) { 57 | (r0, r1, r2, r3, r4, r5) => s"$r0:$r1:$r2:$r3:$r4:$r5" 58 | } shouldBe Failure(List("err")) 59 | merge("00".success[String], 11.success[String], "22".success[String], 33.success[String], "44".success[String], 55.success[String], "err".failure[String]) { 60 | (r0, r1, r2, r3, r4, r5, r6) => 
s"$r0:$r1:$r2:$r3:$r4:$r5:$r6" 61 | } shouldBe Failure(List("err")) 62 | merge("00".success[String], 11.success[String], "22".success[String], 33.success[String], "44".success[String], 55.success[String], "66".success[String], "err".failure[String]) { 63 | (r0, r1, r2, r3, r4, r5, r6, r7) => s"$r0:$r1:$r2:$r3:$r4:$r5:$r6:$r7" 64 | } shouldBe Failure(List("err")) 65 | merge("00".success[String], 11.success[String], "22".success[String], 33.success[String], "44".success[String], 55.success[String], "66".success[String], 77.success[String], "err".failure[String]) { 66 | (r0, r1, r2, r3, r4, r5, r6, r7, r8) => s"$r0:$r1:$r2:$r3:$r4:$r5:$r6:$r7:$r8" 67 | } shouldBe Failure(List("err")) 68 | merge("00".success[String], 11.success[String], "22".success[String], 33.success[String], "44".success[String], 55.success[String], "66".success[String], 77.success[String], "88".success[String], "err".failure[String]) { 69 | (r0, r1, r2, r3, r4, r5, r6, r7, r8, r9) => s"$r0:$r1:$r2:$r3:$r4:$r5:$r6:$r7:$r8:$r9" 70 | } shouldBe Failure(List("err")) 71 | } 72 | 73 | it should "merge with success: unit, list, seq, set, map" in { 74 | ().success[String] +++ ().success[String] shouldBe Success(()) 75 | List(1, 2).success[String] +++ List(1, 3).success[String] shouldBe Success(List(1, 2, 1, 3)) 76 | Seq(1, 2).success[String] +++ Seq(1, 3).success[String] shouldBe Success(Seq(1, 2, 1, 3)) 77 | Set(1, 2).success[String] +++ Set(1, 3).success[String] shouldBe Success(Set(1, 2, 3)) 78 | Map("a" -> 1, "b" -> 2).success[String] +++ Map("a" -> 11, "c" -> 33).success[String] shouldBe Success(Map("a" -> 11, "b" -> 2, "c" -> 33)) 79 | } 80 | 81 | it should "map" in { 82 | 11.success[Int].map(_ * 2) shouldBe Success(22) 83 | "err".failure[Int].map(_ * 2) shouldBe Failure(List("err")) 84 | } 85 | 86 | it should "flatMap" in { 87 | (for { 88 | x1 <- 11.success[Int] 89 | x2 <- 22.success[Int] 90 | } yield x1 + x2) shouldBe Success(33) 91 | (for { 92 | x1 <- "err".failure[Int] 93 | x2 <- 22.success[String] 94 | } yield x1 + x2) shouldBe Failure(List("err")) 95 | } 96 | 97 | it should "foldl" in { 98 | 11.success[Double].foldl( 99 | { 100 | _ => false 101 | }, 102 | { 103 | _ => true 104 | } 105 | ) shouldBe true 106 | 107 | (-99.5).failure[Int].foldl( 108 | { 109 | _ => false 110 | }, 111 | { 112 | _ => true 113 | } 114 | ) shouldBe false 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/util/Validation.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.util 2 | 3 | /** 4 | * Limited scalaz's Validation. Supports map(), flatMap() and +++ (eg. 
for reduce()) 5 | */ 6 | trait Validation[Err, S] { 7 | def +++[S2, Out](v: Validation[Err, S2])(implicit merger: ValidationMerger[S, S2, Out]): Validation[Err, Out] = { 8 | (this, v) match { 9 | case (Failure(ls1), Failure(ls2)) => Failure(ls1 ++ ls2) 10 | case (Failure(ls), Success(_)) => Failure(ls) 11 | case (Success(_), Failure(ls)) => Failure(ls) 12 | case (Success(r1), Success(r2)) => Success(merger.merge(r1, r2)) 13 | } 14 | } 15 | 16 | def map[Out](f: S => Out): Validation[Err, Out] = this match { 17 | case Success(r) => Success(f(r)) 18 | case Failure(ls) => Failure(ls) 19 | } 20 | 21 | def flatMap[Out](f: S => Validation[Err, Out]): Validation[Err, Out] = this match { 22 | case Success(r) => f(r) 23 | case Failure(ls) => Failure(ls) 24 | } 25 | 26 | def foldl[Out](failure: Seq[Err] => Out, success: S => Out): Out = this match { 27 | case Failure(err) => failure(err) 28 | case Success(s) => success(s) 29 | } 30 | 31 | def isSuccess: Boolean 32 | 33 | protected def errs: Seq[Err] 34 | } 35 | 36 | object Validation { 37 | implicit val unitMapper = new ValidationMerger[Unit, Unit, Unit] { def merge(in1: Unit, in2: Unit) = () } 38 | implicit def listMerger[X] = new ValidationMerger[List[X], List[X], List[X]] { def merge(in1: List[X], in2: List[X]) = in1 ++ in2 } 39 | implicit def seqMerger[X] = new ValidationMerger[Seq[X], Seq[X], Seq[X]] { def merge(in1: Seq[X], in2: Seq[X]) = in1 ++ in2 } 40 | implicit def setMerger[X] = new ValidationMerger[Set[X], Set[X], Set[X]] { def merge(in1: Set[X], in2: Set[X]) = in1 ++ in2 } 41 | implicit def mapMerger[K, V] = new ValidationMerger[Map[K, V], Map[K, V], Map[K, V]] { def merge(in1: Map[K, V], in2: Map[K, V]) = in1 ++ in2 } 42 | 43 | import scala.language.implicitConversions 44 | implicit def validationOps[A](a: A): ValidationOps[A] = new ValidationOps(a) 45 | 46 | def merge[Err, S0, S1, Out]( 47 | v0: Validation[Err, S0], 48 | v1: Validation[Err, S1])(map: (S0, S1) => Out): Validation[Err, Out] = 49 | (v0, v1) match { 50 | case (Success(s0), Success(s1)) => 51 | val out = map(s0, s1) 52 | Success[Err, Out](out) 53 | case _ => 54 | Failure(v0.errs ++ v1.errs) 55 | } 56 | 57 | def merge[Err, S0, S1, S2, Out]( 58 | v0: Validation[Err, S0], 59 | v1: Validation[Err, S1], 60 | v2: Validation[Err, S2])(merge: (S0, S1, S2) => Out): Validation[Err, Out] = 61 | (v0, v1, v2) match { 62 | case (Success(s0), Success(s1), Success(s2)) => 63 | val out = merge(s0, s1, s2) 64 | Success[Err, Out](out) 65 | case _ => 66 | Failure(v0.errs ++ v1.errs ++ v2.errs) 67 | } 68 | 69 | def merge[Err, S0, S1, S2, S3, Out]( 70 | v0: Validation[Err, S0], 71 | v1: Validation[Err, S1], 72 | v2: Validation[Err, S2], 73 | v3: Validation[Err, S3])(merge: (S0, S1, S2, S3) => Out): Validation[Err, Out] = 74 | (v0, v1, v2, v3) match { 75 | case (Success(s0), Success(s1), Success(s2), Success(s3)) => 76 | val out = merge(s0, s1, s2, s3) 77 | Success[Err, Out](out) 78 | case _ => 79 | Failure(v0.errs ++ v1.errs ++ v2.errs ++ v3.errs) 80 | } 81 | 82 | def merge[Err, S0, S1, S2, S3, S4, Out]( 83 | v0: Validation[Err, S0], 84 | v1: Validation[Err, S1], 85 | v2: Validation[Err, S2], 86 | v3: Validation[Err, S3], 87 | v4: Validation[Err, S4])(merge: (S0, S1, S2, S3, S4) => Out): Validation[Err, Out] = 88 | (v0, v1, v2, v3, v4) match { 89 | case (Success(s0), Success(s1), Success(s2), Success(s3), Success(s4)) => 90 | val out = merge(s0, s1, s2, s3, s4) 91 | Success[Err, Out](out) 92 | case _ => 93 | Failure(v0.errs ++ v1.errs ++ v2.errs ++ v3.errs ++ v4.errs) 94 | } 95 | 96 | def 
merge[Err, S0, S1, S2, S3, S4, S5, Out]( 97 | v0: Validation[Err, S0], 98 | v1: Validation[Err, S1], 99 | v2: Validation[Err, S2], 100 | v3: Validation[Err, S3], 101 | v4: Validation[Err, S4], 102 | v5: Validation[Err, S5])(merge: (S0, S1, S2, S3, S4, S5) => Out): Validation[Err, Out] = 103 | (v0, v1, v2, v3, v4, v5) match { 104 | case (Success(s0), Success(s1), Success(s2), Success(s3), Success(s4), Success(s5)) => 105 | val out = merge(s0, s1, s2, s3, s4, s5) 106 | Success[Err, Out](out) 107 | case _ => 108 | Failure(v0.errs ++ v1.errs ++ v2.errs ++ v3.errs ++ v4.errs ++ v5.errs) 109 | } 110 | 111 | def merge[Err, S0, S1, S2, S3, S4, S5, S6, Out]( 112 | v0: Validation[Err, S0], 113 | v1: Validation[Err, S1], 114 | v2: Validation[Err, S2], 115 | v3: Validation[Err, S3], 116 | v4: Validation[Err, S4], 117 | v5: Validation[Err, S5], 118 | v6: Validation[Err, S6])(merge: (S0, S1, S2, S3, S4, S5, S6) => Out): Validation[Err, Out] = 119 | (v0, v1, v2, v3, v4, v5, v6) match { 120 | case (Success(s0), Success(s1), Success(s2), Success(s3), Success(s4), Success(s5), Success(s6)) => 121 | val out = merge(s0, s1, s2, s3, s4, s5, s6) 122 | Success[Err, Out](out) 123 | case _ => 124 | Failure(v0.errs ++ v1.errs ++ v2.errs ++ v3.errs ++ v4.errs ++ v5.errs ++ v6.errs) 125 | } 126 | 127 | def merge[Err, S0, S1, S2, S3, S4, S5, S6, S7, Out]( 128 | v0: Validation[Err, S0], 129 | v1: Validation[Err, S1], 130 | v2: Validation[Err, S2], 131 | v3: Validation[Err, S3], 132 | v4: Validation[Err, S4], 133 | v5: Validation[Err, S5], 134 | v6: Validation[Err, S6], 135 | v7: Validation[Err, S7])(merge: (S0, S1, S2, S3, S4, S5, S6, S7) => Out): Validation[Err, Out] = 136 | (v0, v1, v2, v3, v4, v5, v6, v7) match { 137 | case (Success(s0), Success(s1), Success(s2), Success(s3), Success(s4), Success(s5), Success(s6), Success(s7)) => 138 | val out = merge(s0, s1, s2, s3, s4, s5, s6, s7) 139 | Success[Err, Out](out) 140 | case _ => 141 | Failure(v0.errs ++ v1.errs ++ v2.errs ++ v3.errs ++ v4.errs ++ v5.errs ++ v6.errs ++ v7.errs) 142 | } 143 | 144 | def merge[Err, S0, S1, S2, S3, S4, S5, S6, S7, S8, Out]( 145 | v0: Validation[Err, S0], 146 | v1: Validation[Err, S1], 147 | v2: Validation[Err, S2], 148 | v3: Validation[Err, S3], 149 | v4: Validation[Err, S4], 150 | v5: Validation[Err, S5], 151 | v6: Validation[Err, S6], 152 | v7: Validation[Err, S7], 153 | v8: Validation[Err, S8])(merge: (S0, S1, S2, S3, S4, S5, S6, S7, S8) => Out): Validation[Err, Out] = 154 | (v0, v1, v2, v3, v4, v5, v6, v7, v8) match { 155 | case (Success(s0), Success(s1), Success(s2), Success(s3), Success(s4), Success(s5), Success(s6), Success(s7), Success(s8)) => 156 | val out = merge(s0, s1, s2, s3, s4, s5, s6, s7, s8) 157 | Success[Err, Out](out) 158 | case _ => 159 | Failure(v0.errs ++ v1.errs ++ v2.errs ++ v3.errs ++ v4.errs ++ v5.errs ++ v6.errs ++ v7.errs ++ v8.errs) 160 | } 161 | 162 | def merge[Err, S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, Out]( 163 | v0: Validation[Err, S0], 164 | v1: Validation[Err, S1], 165 | v2: Validation[Err, S2], 166 | v3: Validation[Err, S3], 167 | v4: Validation[Err, S4], 168 | v5: Validation[Err, S5], 169 | v6: Validation[Err, S6], 170 | v7: Validation[Err, S7], 171 | v8: Validation[Err, S8], 172 | v9: Validation[Err, S9])(merge: (S0, S1, S2, S3, S4, S5, S6, S7, S8, S9) => Out): Validation[Err, Out] = 173 | (v0, v1, v2, v3, v4, v5, v6, v7, v8, v9) match { 174 | case (Success(s0), Success(s1), Success(s2), Success(s3), Success(s4), Success(s5), Success(s6), Success(s7), Success(s8), Success(s9)) => 175 | val out = 
merge(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9) 176 | Success[Err, Out](out) 177 | case _ => 178 | Failure(v0.errs ++ v1.errs ++ v2.errs ++ v3.errs ++ v4.errs ++ v5.errs ++ v6.errs ++ v7.errs ++ v8.errs ++ v9.errs) 179 | } 180 | } 181 | 182 | case class Failure[Err, R](errs: Seq[Err]) extends Validation[Err, R] { 183 | override def isSuccess: Boolean = false 184 | } 185 | 186 | case class Success[Err, R](r: R) extends Validation[Err, R] { 187 | override def isSuccess: Boolean = true 188 | protected def errs = Nil 189 | } 190 | 191 | trait ValidationMerger[In1, In2, Out] { 192 | def merge(in1: In1, in2: In2): Out 193 | } 194 | 195 | final class ValidationOps[A](val self: A) extends AnyVal { 196 | def success[X]: Validation[X, A] = Success[X, A](self) 197 | def failure[X]: Validation[A, X] = Failure[A, X](List(self)) 198 | } -------------------------------------------------------------------------------- /src/main/scala/spark_etl/model/RuntimeContext.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.model 2 | 3 | import net.jcazevedo.moultingyaml._ 4 | import org.apache.spark.sql.catalyst.parser._ 5 | import spark_etl.util.Validation._ 6 | import spark_etl.util._ 7 | import spark_etl.{ConfigError, ExtractReader, LoadWriter} 8 | 9 | import scala.util.{Failure, Success, Try} 10 | 11 | case class RuntimeExtract(org: Extract, checkContents: Option[String]) 12 | 13 | case class RuntimeTransform(org: Transform, sqlContents: String, checkContents: Option[String]) 14 | 15 | class RuntimeContext(extracts: List[RuntimeExtract], transforms: List[RuntimeTransform], val loads: List[Load], val extractReader: ExtractReader, val loadWriter: LoadWriter, depTree: DepTree, conf: Config) { 16 | def allExtracts: List[RuntimeExtract] = 17 | (for { 18 | r <- depTree.forType(E) 19 | e <- extracts 20 | if r.id == e.org.name 21 | } yield e).toList 22 | 23 | def allTransforms: List[RuntimeTransform] = 24 | (for { 25 | r <- depTree.forType(T) 26 | t <- transforms 27 | if r.id == t.org.name 28 | } yield t).toList 29 | 30 | def allLoads: List[Load] = 31 | (for { 32 | r <- depTree.forType(L) 33 | l <- loads 34 | if r.id == l.name 35 | } yield l).toList 36 | 37 | def asDot = depTree.asDot() 38 | } 39 | 40 | object RuntimeContext extends DefaultYamlProtocol { 41 | 42 | /** 43 | * Emphasis on *maximum* validation. 
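   * Checks duplicate names/URIs, parses each transform/check SQL and registers its dependencies,
   * flags dangling (unresolvable) references, and instantiates the configured ExtractReader and
   * LoadWriter, accumulating all errors rather than failing on the first one.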
44 | */ 45 | def load(_conf: Config, filePathRoot: String, env: Map[String, String]): Validation[ConfigError, RuntimeContext] = { 46 | // depTree, with the known universe 47 | val conf = toLowerCase(_conf) 48 | val depTree = new DepTree( 49 | conf.extracts.map(e => Vertex(e.name, E)) ++ 50 | conf.transforms.map(t => Vertex(t.name, T)) ++ 51 | conf.loads.map(l => Vertex(l.name, L)) 52 | ) 53 | 54 | // read in entities and add their deps 55 | val regExtracts = conf.extracts 56 | .map(e => registerExtractDeps(e, depTree, filePathRoot, env)) 57 | .map(_.map(List(_))).reduce(_ +++ _) 58 | 59 | val regTransforms = conf.transforms 60 | .map(t => registerTransformDeps(t, depTree, filePathRoot, env)) 61 | .map(_.map(List(_))).reduce(_ +++ _) 62 | 63 | conf.loads.foreach(l => depTree.addEdge(l.source, Vertex(l.name, L), true)) 64 | 65 | val validatedDuplicates = validateDuplicates(conf) 66 | 67 | val validatedDepTree = validateDepTree(depTree) 68 | 69 | val validatedExtractor = instantiate[ExtractReader](conf.extract_reader.get, classOf[ExtractReader]) 70 | 71 | val validatedTransformer = instantiate[LoadWriter](conf.load_writer.get, classOf[LoadWriter]) 72 | 73 | merge(validatedDuplicates, regExtracts, regTransforms, validatedExtractor, validatedTransformer, validatedDepTree) { (dups, es, ts, e, t, dt) => new RuntimeContext(es, ts, conf.loads, e, t, depTree, conf) } 74 | } 75 | 76 | private def toLowerCase(conf: Config): Config = 77 | conf.copy( 78 | extracts = conf.extracts.map(e => e.copy(name = e.name.toLowerCase)), 79 | transforms = conf.transforms.map(t => t.copy(name = t.name.toLowerCase)), 80 | loads = conf.loads.map(l => l.copy(source = l.source.toLowerCase)) 81 | ) 82 | 83 | private def validateDuplicates(conf: Config): Validation[ConfigError, Unit] = { 84 | def valDups(desc: String, candidates: Seq[String]): Validation[ConfigError, Unit] = 85 | (candidates diff candidates.distinct).distinct match { 86 | case Nil => ().success[ConfigError] 87 | case other => ConfigError(s"Duplicates found for $desc: ${other.sorted.mkString(", ")}").failure[Unit] 88 | } 89 | 90 | valDups("extract names", conf.extracts.map(_.name)) +++ 91 | valDups("extract uris", conf.extracts.map(_.uri)) +++ 92 | valDups("extract check", conf.extracts.collect { case e if e.check.isDefined => e.check.get }) +++ 93 | valDups("transform names", conf.transforms.map(_.name)) +++ 94 | valDups("transform sqls", conf.transforms.map(_.sql)) +++ 95 | valDups("transform check", conf.extracts.collect { case t if t.check.isDefined => t.check.get }) +++ 96 | valDups("load names", conf.loads.map(_.name)) +++ 97 | valDups("load uris", conf.loads.map(_.uri)) 98 | } 99 | 100 | /** 101 | * Load & parse check, if specified 102 | * Note, extract check is only dependant on the extract 103 | */ 104 | private def registerExtractDeps(extract: Extract, depTree: DepTree, filePathRoot: String, env: Map[String, String]): Validation[ConfigError, RuntimeExtract] = 105 | extract.check match { 106 | case Some(checkUri) => 107 | UriLoader.load(checkUri, filePathRoot, env) 108 | .flatMap(validateResolvedDsos(depTree, extract.name, Echeck, s"extract check ${extract.name} (uri $checkUri)")) 109 | .map(checkTxt => RuntimeExtract(extract, Some(checkTxt))) 110 | case None => 111 | RuntimeExtract(extract, None).success[ConfigError] 112 | } 113 | 114 | /** 115 | * Load & parse sql 116 | * Load & parse pre_check, if specified 117 | * Load & parse post_check, if specified 118 | * Check dso dependencies 119 | */ 120 | private def registerTransformDeps(transform: 
Transform, depTree: DepTree, filePathRoot: String, env: Map[String, String]): Validation[ConfigError, RuntimeTransform] = { 121 | // load resources 122 | val validatedSql = UriLoader.load(transform.sql, filePathRoot, env) 123 | .flatMap(validateResolvedDsos(depTree, transform.name, T, s"Unparsable sql of transform ${transform.name}")) 124 | val validatedCheck = liftOpt(transform.check)(r => UriLoader.load(r, filePathRoot, env) 125 | .flatMap(validateResolvedDsos(depTree, transform.name, Tcheck, s"Unparsable sql of transform check ${transform.name}"))) 126 | 127 | merge(validatedSql, validatedCheck) { (sql, check) => RuntimeTransform(transform, sql, check) } 128 | } 129 | 130 | private def liftOpt[T1, T2](opt: Option[T1])(toVal: T1 => Validation[ConfigError, T2]): Validation[ConfigError, Option[T2]] = 131 | opt match { 132 | case Some(r) => toVal(r).map(Some(_)) 133 | case None => (None:Option[T2]).success[ConfigError] 134 | } 135 | 136 | private def validateResolvedDsos(depTree: DepTree, name: String, `type`: VertexType, errMsgPrefix: String)(contents: String): Validation[ConfigError, String] = 137 | Try(SparkParser.getDeps(contents)) match { 138 | case Success(usedDsos) => 139 | val withPrefixes = usedDsos.filter(_.prefix.isDefined) 140 | if (withPrefixes.nonEmpty) 141 | ConfigError(s"$errMsgPrefix: contains prefixed dsos: ${withPrefixes.map(_.qfStr).mkString(", ")}").failure[String] 142 | else { 143 | usedDsos.map(_.qfStr).foreach(d => depTree.addEdge(d, Vertex(name, `type`))) 144 | contents.success[ConfigError] 145 | } 146 | case Failure(e: ParseException) => 147 | ConfigError(s"$errMsgPrefix: failed to parse, error: ${e.getMessage}").failure[String] 148 | case Failure(e) => 149 | ConfigError(s"$errMsgPrefix: failed to parse", Some(e)).failure[String] 150 | } 151 | 152 | private def validateDepTree(depTree: DepTree): Validation[ConfigError, Unit] = { 153 | val danglingDeps = depTree.dangling 154 | if (danglingDeps.isEmpty) { 155 | ().success[ConfigError] 156 | } else { 157 | val errors = for { 158 | dangling <- danglingDeps 159 | } yield ConfigError(s"Unresolved dependency ${dangling.source.id} for ${dangling.target.`type`.asStr} ${dangling.target.id}").failure[Unit] 160 | errors.reduce(_ +++ _) 161 | } 162 | } 163 | 164 | private def instantiate[T](paramConstr: ParametrizedConstructor, parentClass: Class[_]): Validation[ConfigError, T] = { 165 | Try { 166 | val clazz = Class.forName(paramConstr.`class`) 167 | if (parentClass.isAssignableFrom(clazz)) { 168 | val constructor = clazz.getConstructors()(0) 169 | constructor.newInstance(paramConstr.params.get).asInstanceOf[T].success[ConfigError] 170 | } else { 171 | ConfigError(s"Failed to cast class ${paramConstr.`class`} to ${parentClass.getName}").failure[T] 172 | } 173 | } match { 174 | case scala.util.Success(validated) => validated 175 | case scala.util.Failure(e) => ConfigError(s"Failed to instantiate class ${paramConstr.`class`} with params: ${paramConstr.params}", Some(e)).failure[T] 176 | } 177 | } 178 | } 179 | -------------------------------------------------------------------------------- /src/test/scala/spark_etl/model/RuntimeContextSpec.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.model 2 | 3 | import org.apache.spark.sql.{DataFrame, SparkSession} 4 | import org.scalatest.{FlatSpec, Inside, Matchers} 5 | import spark_etl.util.Validation._ 6 | import spark_etl.util._ 7 | import spark_etl.{ConfigError, ExtractReader, LoadWriter} 8 | 9 | class RuntimeContextSpec 
extends FlatSpec with Matchers with Inside { 10 | val extractsAndTransformsStr = 11 | """ 12 | |extracts: 13 | | - name: client 14 | | uri: "data/dev/client_2017" 15 | | check: "/runtime-ctx/spark/extract-check/client.sql" 16 | | - name: item 17 | | uri: "data/dev/item_2017" 18 | | check: "/runtime-ctx/spark/extract-check/item.sql" 19 | | - name: transaction 20 | | uri: "data/dev/transaction_2017" 21 | | check: "/runtime-ctx/spark/extract-check/transaction.sql" 22 | | # unused extract 23 | | - name: ____bogus_extract_not_loaded____ 24 | | uri: "hdfs://aaa.bbb" 25 | | 26 | |transforms: 27 | | - name: client_spending 28 | | check: "/runtime-ctx/spark/transform-check/client_spending.sql" 29 | | sql: "/runtime-ctx/spark/transform/client_spending.sql" 30 | | - name: item_purchase 31 | | check: "/runtime-ctx/spark/transform-check/item_purchase.sql" 32 | | sql: "/runtime-ctx/spark/transform/item_purchase.sql" 33 | | - name: minor_purchase 34 | | check: "/runtime-ctx/spark/transform-check/minor_purchase.sql" 35 | | sql: "/runtime-ctx/spark/transform/minor_purchase.sql" 36 | | 37 | |loads: 38 | | - name: client_spending_out 39 | | source: client_spending 40 | | uri: "/tmp/out/client_spending" 41 | | # no partition_by 42 | | - name: item_purchase_out 43 | | source: item_purchase 44 | | uri: "/tmp/out/item_purchase" 45 | | # no partition_by 46 | | - name: minor_purchase_out 47 | | source: minor_purchase 48 | | uri: "/tmp/out/minor_purchase" 49 | | # no partition_by 50 | | """.stripMargin 51 | 52 | "RuntimeContext" should "validate ok extract_reader/load_writer" in { 53 | val confStr = extractsAndTransformsStr + 54 | """ 55 | |extract_reader: 56 | | class: spark_etl.model.OkExtractReader 57 | | params: 58 | | x: 11 59 | | y: aa 60 | | 61 | |load_writer: 62 | | class: spark_etl.model.OkLoadWriter 63 | | params: 64 | | b: false 65 | | a: [1, xxx] 66 | """.stripMargin 67 | Config.parse(confStr) match { 68 | case Success(conf) => 69 | RuntimeContext.load(conf, ".", Map.empty) match { 70 | case Success(ctx) => 71 | ctx.extractReader.asInstanceOf[OkExtractReader].params shouldBe Map("x" -> 11d, "y" -> "aa") 72 | ctx.loadWriter.asInstanceOf[OkLoadWriter].params shouldBe Map("b" -> false, "a" -> List(1d, "xxx")) 73 | } 74 | } 75 | } 76 | 77 | it should "fail on incorrect inheritance of extract_reader/load_writer" in { 78 | val confStr = extractsAndTransformsStr + 79 | """ 80 | |extract_reader: 81 | | class: spark_etl.model.BogusExtractReader1 82 | | 83 | |load_writer: 84 | | class: spark_etl.model.BogusLoadWriter1 85 | """.stripMargin 86 | Config.parse(confStr) match { 87 | case Success(conf) => 88 | RuntimeContext.load(conf, ".", Map.empty) match { 89 | case Failure(errs) => 90 | errs.toList.length shouldBe 2 91 | errs.toList.forall(_.msg.startsWith("Failed to cast class")) shouldBe true 92 | } 93 | } 94 | } 95 | 96 | it should "fail on parameterless constructors extract_reader/load_writer" in { 97 | val confStr = extractsAndTransformsStr + 98 | """ 99 | |extract_reader: 100 | | class: spark_etl.model.BogusExtractReader2 101 | | 102 | |load_writer: 103 | | class: spark_etl.model.BogusLoadWriter2 104 | """.stripMargin 105 | Config.parse(confStr) match { 106 | case Success(conf) => 107 | RuntimeContext.load(conf, ".", Map.empty) match { 108 | case Failure(errs) => 109 | errs.toList.length shouldBe 2 110 | errs.toList.forall(_.msg.startsWith("Failed to instantiate class")) shouldBe true 111 | } 112 | } 113 | } 114 | 115 | it should "fail on duplicates" in { 116 | val confStr = 117 | """ 118 | |extracts: 119 | | 
- name: client 120 | | uri: "data/dev/client_2017" 121 | | check: "/runtime-ctx/spark/extract-check/client.sql" 122 | | - name: client 123 | | uri: "data/dev/client_2017" 124 | | check: "/runtime-ctx/spark/extract-check/client.sql" 125 | | 126 | |transforms: 127 | | - name: client_spending 128 | | check: "/runtime-ctx/spark/transform-check/client_spending.sql" 129 | | sql: "/runtime-ctx/spark/transform/client_all.sql" 130 | | - name: client_spending 131 | | check: "/runtime-ctx/spark/transform-check/client_spending.sql" 132 | | sql: "/runtime-ctx/spark/transform/client_all.sql" 133 | | 134 | |loads: 135 | | - name: client_spending_out 136 | | source: client_spending 137 | | uri: "/tmp/out/client_spending" 138 | | - name: client_spending_out 139 | | source: client_spending 140 | | uri: "/tmp/out/client_spending" 141 | |""".stripMargin 142 | 143 | Config.parse(confStr) match { 144 | case Success(conf) => 145 | RuntimeContext.load(conf, ".", Map.empty) match { 146 | case Failure(errs) => 147 | val errList = errs.toList 148 | errList.length shouldBe 8 149 | errList.forall(_.msg.startsWith("Duplicates found for")) shouldBe true 150 | } 151 | } 152 | } 153 | 154 | it should "produce dot file" in { 155 | Config.parse(extractsAndTransformsStr) match { 156 | case Success(conf) => 157 | RuntimeContext.load(conf, ".", Map.empty) match { 158 | case Success(ctx) => 159 | val v = ctx.asDot 160 | ctx.asDot shouldBe 161 | """digraph Lineage { 162 | | rankdir=LR 163 | | node [fontsize=12] 164 | | 165 | | # vertices 166 | | client 167 | | item 168 | | transaction 169 | | client_spending [shape=component] 170 | | item_purchase [shape=component] 171 | | minor_purchase [shape=component] 172 | | client_spending_out [shape=cylinder] 173 | | item_purchase_out [shape=cylinder] 174 | | minor_purchase_out [shape=cylinder] 175 | | 176 | | # edges 177 | | item -> client_spending [style=dotted] 178 | | transaction -> client_spending [style=dotted] 179 | | client -> client_spending [style=dotted] 180 | | item -> item_purchase [style=dotted] 181 | | transaction -> item_purchase [style=dotted] 182 | | client -> item_purchase [style=dotted] 183 | | item -> minor_purchase [style=dotted] 184 | | transaction -> minor_purchase [style=dotted] 185 | | client -> minor_purchase [style=dotted] 186 | | client_spending -> client_spending_out 187 | | item_purchase -> item_purchase_out 188 | | minor_purchase -> minor_purchase_out 189 | | 190 | | # ranks 191 | | { rank=same; client item transaction } 192 | | { rank=same; client_spending_out item_purchase_out minor_purchase_out } 193 | |}""".stripMargin 194 | } 195 | } 196 | 197 | } 198 | } 199 | 200 | class OkExtractReader(val params: Map[String, Any]) extends ExtractReader(params) { 201 | override def checkLocal(extracts: Seq[Extract]): Validation[ConfigError, Unit] = ().success[ConfigError] 202 | override def checkRemote(extracts: Seq[Extract]): Validation[ConfigError, Unit] = ??? 203 | override def read(extracts: Seq[Extract])(implicit spark: SparkSession): Seq[(Extract, DataFrame)] = ??? 204 | } 205 | 206 | class OkLoadWriter(val params: Map[String, Any]) extends LoadWriter(params) { 207 | override def write(loadsAndDfs: Seq[(Load, DataFrame)]): Unit = ??? 208 | override def checkLocal(loads: Seq[Load]): Validation[ConfigError, Unit] = ().success[ConfigError] 209 | override def checkRemote(loads: Seq[Load]): Validation[ConfigError, Unit] = ??? 
210 | } 211 | 212 | class BogusExtractReader1(params: Map[String, Any]) 213 | 214 | class BogusLoadWriter1(params: Map[String, Any]) 215 | 216 | class BogusExtractReader2 extends ExtractReader(Map.empty) { 217 | override def checkLocal(extracts: Seq[Extract]): Validation[ConfigError, Unit] = ().success[ConfigError] 218 | override def checkRemote(extracts: Seq[Extract]): Validation[ConfigError, Unit] = ??? 219 | override def read(extracts: Seq[Extract])(implicit spark: SparkSession): Seq[(Extract, DataFrame)] = ??? 220 | } 221 | 222 | class BogusLoadWriter2 extends LoadWriter(Map.empty) { 223 | override def write(loadsAndDfs: Seq[(Load, DataFrame)]): Unit = ??? 224 | override def checkLocal(loads: Seq[Load]): Validation[ConfigError, Unit] = ().success[ConfigError] 225 | override def checkRemote(loads: Seq[Load]): Validation[ConfigError, Unit] = ??? 226 | } 227 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/CLIOps.scala: -------------------------------------------------------------------------------- 1 | package spark_etl 2 | 3 | import java.io.{File, PrintWriter} 4 | 5 | import org.apache.log4j.Logger 6 | import org.apache.spark.sql.{AnalysisException, DataFrame, SparkSession} 7 | import spark_etl.model._ 8 | import spark_etl.util.{BAHelper, Validation} 9 | import spark_etl.util.Validation._ 10 | 11 | import scala.util.Try 12 | 13 | object CLIOps { 14 | val log = Logger.getLogger(getClass) 15 | 16 | def dotLineage(confUri: String, filePathRoot: String, env: Map[String, String], filename: String): Validation[ConfigError, Unit] = 17 | withCtx(confUri, filePathRoot, env) { 18 | ctx => 19 | new PrintWriter(filename) { 20 | write(ctx.asDot) 21 | close 22 | } 23 | ().success[ConfigError] 24 | } 25 | 26 | def validateLocal(confUri: String, filePathRoot: String, env: Map[String, String]): Validation[ConfigError, Unit] = 27 | withCtx(confUri, filePathRoot, env) { 28 | ctx => 29 | val orgExtracts = ctx.allExtracts.map(_.org) 30 | val extractReaderValidation = ctx.extractReader.checkLocal(orgExtracts) 31 | val loadWriterValidation = ctx.loadWriter.checkLocal(ctx.loads) 32 | (extractReaderValidation +++ loadWriterValidation).map(_ => 33 | log.info( 34 | s"""Local context validated! 35 | | 36 | |ExtractReader validated! 37 | | 38 | |LoadWriter validated!""".stripMargin)) 39 | } 40 | 41 | def validateRemote(confUri: String, filePathRoot: String, env: Map[String, String])(implicit spark: SparkSession): Validation[ConfigError, Unit] = 42 | withCtx(confUri, filePathRoot, env) { 43 | ctx => 44 | val orgExtracts = ctx.allExtracts.map(_.org) 45 | val extractReaderValidation = ctx.extractReader.checkRemote(orgExtracts) 46 | val loadWriterValidation = ctx.loadWriter.checkRemote(ctx.loads) 47 | for { 48 | _ <- extractReaderValidation +++ loadWriterValidation 49 | _ <- { 50 | // for validation - do not persist 51 | val withoutCacheOrPersist = ctx.allExtracts.map(e => e.copy(org = e.org.copy(cache = None, persist = None))) 52 | readExtracts(ctx.extractReader, withoutCacheOrPersist) 53 | } 54 | _ <- { 55 | // for validation - do not persist 56 | val withoutCacheOrPersist = ctx.allTransforms.map(t => t.copy(org = t.org.copy(cache = None, persist = None))) 57 | loadTransforms(withoutCacheOrPersist) 58 | } 59 | } yield { 60 | log.info( 61 | s"""Remote context validated! 62 | | 63 | |ExtractReader validated! 64 | | 65 | |LoadWriter validated! 
66 | | 67 | |Transforms loaded in session!""".stripMargin) 68 | } 69 | } 70 | 71 | def transformAndLoad(confUri: String, filePathRoot: String, env: Map[String, String], props: Map[String, String], showCounts: Boolean)(implicit spark: SparkSession): Validation[ConfigError, Unit] = 72 | withCtx(confUri, filePathRoot, env) { 73 | ctx => 74 | for { 75 | _ <- readExtracts(ctx.extractReader, ctx.allExtracts) 76 | transformed <- loadTransforms(ctx.allTransforms) 77 | _ <- runCounts(transformed, showCounts) 78 | written <- Try { 79 | val loadsAndDfs = for { 80 | (t, df) <- transformed 81 | l <- ctx.allLoads 82 | if t.org.name == l.source 83 | } yield (l, df) 84 | ctx.loadWriter.write(loadsAndDfs) 85 | } match { 86 | case scala.util.Success(_) => ().success[ConfigError] 87 | case scala.util.Failure(exc:AnalysisException) => ConfigError(s"Failed to write out transform due to AnalysisException, ${exc.getMessage}").failure[Unit] 88 | case scala.util.Failure(e) => ConfigError("Failed to write out transform", Some(e)).failure[Unit] 89 | } 90 | } yield () 91 | } 92 | 93 | def extractCheck(confUri: String, filePathRoot: String, env: Map[String, String])(implicit spark: SparkSession): Validation[ConfigError, Unit] = 94 | withCtx(confUri, filePathRoot, env) { 95 | ctx => 96 | for { 97 | _ <- readExtracts(ctx.extractReader, ctx.allExtracts) 98 | _ <- { 99 | // run only the checks that have contents defined 100 | val runnableChecks = ctx.allExtracts.collect { case e if e.checkContents.isDefined => e.org.name -> e.checkContents.get } 101 | runAndReport("Extract check results", runnableChecks) 102 | } 103 | } yield () 104 | } 105 | 106 | def transformCheck(confUri: String, filePathRoot: String, env: Map[String, String], showCounts: Boolean)(implicit spark: SparkSession): Validation[ConfigError, Unit] = 107 | withCtx(confUri, filePathRoot, env) { 108 | ctx => 109 | for { 110 | _ <- readExtracts(ctx.extractReader, ctx.allExtracts) 111 | transformed <- loadTransforms(ctx.allTransforms) 112 | _ <- runCounts(transformed, showCounts) 113 | _ <- { 114 | val runnableChecks = ctx.allTransforms.collect { case t if t.checkContents.isDefined => t.org.name -> t.checkContents.get } 115 | runAndReport("Transform check results", runnableChecks) 116 | } 117 | } yield () 118 | } 119 | 120 | private def withCtx(confUri: String, filePathRoot: String, env: Map[String, String])(run: (RuntimeContext) => Validation[ConfigError, Unit]): Validation[ConfigError, Unit] = { 121 | val validatedCtx = for { 122 | conf <- Config.load(confUri, filePathRoot, env) 123 | ctx <- { 124 | val relFilePath = 125 | if (confUri.startsWith("file:/")) 126 | new File(confUri.substring("file:".length)).getParent 127 | else if (confUri.startsWith("file:")) 128 | new File(confUri.substring("file:".length)).getParent match { 129 | case null => filePathRoot 130 | case confParent => new File(filePathRoot, confParent).getAbsolutePath 131 | } 132 | else 133 | filePathRoot 134 | RuntimeContext.load(conf, relFilePath, env) 135 | } 136 | } yield { 137 | val ctxDesc = 138 | s"""|Validated runtime context 139 | |========================= 140 | | 141 | |Extracts: 142 | |${toBullets(ctx.allExtracts.map(e => e.org.name -> e.org.uri))} 143 | |Extract checks: 144 | |${toBullets(ctx.allExtracts.flatMap(e => e.org.check.map(checkUri => e.org.name -> checkUri)))} 145 | |Transforms: 146 | |${toBullets(ctx.allTransforms.map(t => t.org.name -> t.org.sql))} 147 | |Transform checks: 148 | |${toBullets(ctx.allTransforms.flatMap(t => t.org.check.map(checkUri => t.org.name ->
checkUri)))} 149 | |Loads: 150 | |${toBullets(ctx.loads.map(l => l.name -> l.uri))} 151 | """.stripMargin 152 | 153 | log.info(ctxDesc) 154 | ctx 155 | } 156 | 157 | validatedCtx.flatMap(run) 158 | } 159 | 160 | private def readExtracts(extractor: ExtractReader, extracts: Seq[RuntimeExtract])(implicit spark: SparkSession): Validation[ConfigError, Unit] = { 161 | val orgExtracts = extracts.map(_.org) 162 | Try { 163 | extractor.read(orgExtracts).foreach { 164 | case (e, df) => 165 | e.cache.foreach(c => if (c) df.cache()) 166 | e.persist.foreach(p => df.persist(p.asSpark)) 167 | df.createOrReplaceTempView(e.name) 168 | } 169 | } match { 170 | case scala.util.Success(res) => res.success[ConfigError] 171 | case scala.util.Failure(exc) => ConfigError("Failed to load extracts", Some(exc)).failure[Unit] 172 | } 173 | } 174 | 175 | private def loadTransforms(transforms: Seq[RuntimeTransform])(implicit spark: SparkSession): Validation[ConfigError, Seq[(RuntimeTransform, DataFrame)]] = 176 | Try { 177 | transforms.map { 178 | t => 179 | val df = spark.sql(t.sqlContents) 180 | t.org.cache.foreach(c => if (c) df.cache()) 181 | t.org.persist.foreach(p => df.persist(p.asSpark)) 182 | df.createOrReplaceTempView(t.org.name) 183 | (t, df) 184 | } 185 | } match { 186 | case scala.util.Success(res) => res.success[ConfigError] 187 | case scala.util.Failure(exc:AnalysisException) => ConfigError(s"Failed to run transforms due to AnalysisException, ${exc.getMessage}").failure[Seq[(RuntimeTransform, DataFrame)]] 188 | case scala.util.Failure(exc) => ConfigError("Failed to run transforms", Some(exc)).failure[Seq[(RuntimeTransform, DataFrame)]] 189 | } 190 | 191 | private def runCounts(transformsAndDfs: Seq[(RuntimeTransform, DataFrame)], showCounts: Boolean): Validation[ConfigError, Unit] = 192 | if (! 
showCounts) 193 | ().success[ConfigError] 194 | else 195 | Try { 196 | val countDescrs = toBullets(transformsAndDfs.map { case (t, df) => t.org.name -> df.count.toString }, ": ") 197 | log.info(s"Transform counts:\n$countDescrs") 198 | } match { 199 | case scala.util.Success(_) => ().success[ConfigError] 200 | case scala.util.Failure(exc) => ConfigError("Failed to run counts", Some(exc)).failure[Unit] 201 | } 202 | 203 | protected def runAndReport(desc: String, descAndSql: Seq[(String, String)])(implicit spark: SparkSession): Validation[ConfigError, Unit] = 204 | Try { 205 | val outputs = descAndSql.map { 206 | case (sqlDesc, sql) => 207 | val df = spark.sql(sql) 208 | val valFieldDesc = df.take(100).map(r => df.schema.fields zip r.toSeq).flatMap(_.map { 209 | // Fail on false only, succeed and report on all others 210 | case (f, false) => 211 | ConfigError(s"$desc: $sqlDesc's check ${f.name} returned false!").failure[(String, String)] 212 | case (f, value) => 213 | (f.name -> value.toString).success[ConfigError] 214 | }) 215 | val valRes = valFieldDesc.map(_.map(List(_))).reduce(_ +++ _) 216 | valRes.map(fieldDesc => s"$sqlDesc:\n${toBullets(fieldDesc)}") 217 | } 218 | outputs.map(_.map(List(_))).reduce(_ +++ _) 219 | } match { 220 | case scala.util.Success(res) => res.map(outputs => log.info(s"$desc:\n${outputs.mkString("\n")}")) 221 | case scala.util.Failure(exc) => ConfigError(s"Failed to load $desc", Some(exc)).failure[Unit] 222 | } 223 | 224 | def stripPrefixes(srcDir: File, targetDir: File, rmTargetDir: Boolean): Validation[ConfigError, Unit] = 225 | Try(BAHelper.copySqls(srcDir, targetDir, rmTargetDir)) match { 226 | case scala.util.Success(descs) => 227 | log.info(s"""Copied BA sql to DEV:\n${CLIOps.toBullets(descs)}""").success[ConfigError] 228 | case scala.util.Failure(e) => 229 | ConfigError(s"Failed to copy SQL from $srcDir to $targetDir", Some(e)).failure[Unit] 230 | } 231 | 232 | private def toBullets(kvs: Seq[(String, String)], sep: String = " -> ") = 233 | if (kvs.isEmpty) 234 | " NA" 235 | else { 236 | val maxKLen = kvs.map(_._1.length).max 237 | kvs.map { case (k, v) => s"• ${k.padTo(maxKLen, ' ')}$sep$v" }.mkString("\n") 238 | } 239 | } 240 | --------------------------------------------------------------------------------
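
To illustrate how the pieces above fit together — RuntimeContext.load reflectively instantiates the classes named under extract_reader/load_writer and passes them the params map through their single Map[String, Any] constructor argument — here is a minimal sketch of a custom LoadWriter. Everything named in it (the spark_etl.example package, the CsvLoadWriter class, and its delimiter parameter) is hypothetical and only mirrors the OkLoadWriter/BogusLoadWriter2 fixtures from RuntimeContextSpec; it is a sketch, not part of the library.

package spark_etl.example

import org.apache.spark.sql.DataFrame
import spark_etl.{ConfigError, LoadWriter}
import spark_etl.model.Load
import spark_etl.util.Validation
import spark_etl.util.Validation._

// Hypothetical writer: persists each load's DataFrame as CSV under the load's uri.
class CsvLoadWriter(params: Map[String, Any]) extends LoadWriter(params) {
  // "delimiter" is an illustrative param, not one the framework defines
  private val delimiter = params.getOrElse("delimiter", ",").toString

  override def write(loadsAndDfs: Seq[(Load, DataFrame)]): Unit =
    loadsAndDfs.foreach { case (l, df) =>
      df.write.option("sep", delimiter).csv(l.uri)
    }

  // nothing to verify before a run in this sketch
  override def checkLocal(loads: Seq[Load]): Validation[ConfigError, Unit] = ().success[ConfigError]
  override def checkRemote(loads: Seq[Load]): Validation[ConfigError, Unit] = ().success[ConfigError]
}

Wired into app.yaml the same way the specs above wire OkLoadWriter, the config would look roughly like:

load_writer:
  class: spark_etl.example.CsvLoadWriter
  params:
    delimiter: "|"

RuntimeContext's instantiate then resolves the class by name, confirms it is assignable to LoadWriter, and invokes its first constructor with the params map, so a wiring mistake surfaces as the "Failed to cast class" / "Failed to instantiate class" ConfigErrors asserted in the tests.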