├── project
│   ├── build.properties
│   └── plugins.sbt
├── src
│   ├── test
│   │   ├── resources
│   │   │   ├── parquet
│   │   │   │   ├── good
│   │   │   │   │   ├── _temporary
│   │   │   │   │   │   └── .placeholder
│   │   │   │   │   └── year=2017
│   │   │   │   │       └── .placeholder
│   │   │   │   └── with_backup_dir
│   │   │   │       └── _backup
│   │   │   │           └── .placeholder
│   │   │   ├── parquet-roundtrip
│   │   │   │   ├── transform
│   │   │   │   │   └── t.sql
│   │   │   │   └── app.yaml
│   │   │   ├── uri-loader
│   │   │   │   ├── without_env_vars
│   │   │   │   ├── with_env_vars
│   │   │   │   ├── with_bogus_includes
│   │   │   │   └── with_includes
│   │   │   ├── runtime-ctx
│   │   │   │   └── spark
│   │   │   │       ├── transform
│   │   │   │       │   ├── client_all.sql
│   │   │   │       │   ├── client_spending.sql
│   │   │   │       │   ├── minor_purchase.sql
│   │   │   │       │   └── item_purchase.sql
│   │   │   │       ├── extract-check
│   │   │   │       │   ├── client.sql
│   │   │   │       │   ├── transaction.sql
│   │   │   │       │   └── item.sql
│   │   │   │       └── transform-check
│   │   │   │           ├── item_purchase.sql
│   │   │   │           ├── minor_purchase.sql
│   │   │   │           └── client_spending.sql
│   │   │   ├── main-utils
│   │   │   │   ├── spark
│   │   │   │   │   ├── extract-check
│   │   │   │   │   │   ├── client.sql
│   │   │   │   │   │   ├── transaction.sql
│   │   │   │   │   │   └── item.sql
│   │   │   │   │   ├── transform-check
│   │   │   │   │   │   ├── item_purchase.sql
│   │   │   │   │   │   ├── minor_purchase.sql
│   │   │   │   │   │   └── client_spending.sql
│   │   │   │   │   └── transform
│   │   │   │   │       ├── client_spending.sql
│   │   │   │   │       ├── item_purchase.sql
│   │   │   │   │       └── minor_purchase.sql
│   │   │   │   └── config
│   │   │   │       └── app.yaml
│   │   │   └── log4j.properties
│   │   └── scala
│   │       └── spark_etl
│   │           ├── parser
│   │           │   └── ParserSpec.scala
│   │           ├── util
│   │           │   ├── DeaultEnvSpec.scala
│   │           │   ├── SparkParserSpec.scala
│   │           │   ├── DepTreeSpec.scala
│   │           │   ├── UriLoaderSpec.scala
│   │           │   └── ValidationSpec.scala
│   │           ├── parquet
│   │           │   ├── PathValidatorSpec.scala
│   │           │   └── WriteReadRoundtripSpec.scala
│   │           ├── CLIOpsSpec.scala
│   │           ├── model
│   │           │   ├── ConfigSpec.scala
│   │           │   └── RuntimeContextSpec.scala
│   │           └── oracle
│   │               └── OracleLoadAppenderSpec.scala
│   └── main
│       ├── scala
│       │   └── spark_etl
│       │       ├── ConfigError.scala
│       │       ├── util
│       │       │   ├── Files.scala
│       │       │   ├── DefaultEnv.scala
│       │       │   ├── BAHelper.scala
│       │       │   ├── SparkParser.scala
│       │       │   ├── UriLoader.scala
│       │       │   ├── DepTree.scala
│       │       │   └── Validation.scala
│       │       ├── model
│       │       │   ├── Load.scala
│       │       │   ├── Extract.scala
│       │       │   ├── Transform.scala
│       │       │   ├── ParametrizedConstructor.scala
│       │       │   ├── Config.scala
│       │       │   ├── Persist.scala
│       │       │   └── RuntimeContext.scala
│       │       ├── LoadWriter.scala
│       │       ├── ExtractReader.scala
│       │       ├── parquet
│       │       │   ├── ParquetLoadWriter.scala
│       │       │   ├── ParquetExtractReader.scala
│       │       │   └── PathValidator.scala
│       │       ├── parser
│       │       │   └── Parser.scala
│       │       ├── oracle
│       │       │   ├── OracleValidator.scala
│       │       │   └── OracleLoadAppender.scala
│       │       ├── CLI.scala
│       │       └── CLIOps.scala
│       └── resources
│           ├── monkey-patch.sh
│           ├── log4j.properties
│           └── run.sh
├── .gitignore
├── .travis.yml
└── README.md

/project/build.properties:
--------------------------------------------------------------------------------
 1 | sbt.version = 0.13.9
--------------------------------------------------------------------------------
/src/test/resources/parquet/good/_temporary/.placeholder:
--------------------------------------------------------------------------------
 1 |
--------------------------------------------------------------------------------
/src/test/resources/parquet/good/year=2017/.placeholder:
--------------------------------------------------------------------------------
 1 |
--------------------------------------------------------------------------------
/src/test/resources/parquet/with_backup_dir/_backup/.placeholder:
--------------------------------------------------------------------------------
 1 |
--------------------------------------------------------------------------------
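The listing above is a standard sbt project: spark_etl is a Spark ETL runner whose pipelines are described in a YAML config (app.yaml) wiring extracts, SQL transforms and loads together, with spark_etl.CLI as the command-line entry point. As a quick orientation before the individual files, a minimal invocation in the style of the CLIOpsSpec shown further down might look like the sketch below; the config path and the env value are illustrative placeholders, not taken from the repository.

import spark_etl.CLI

object ValidateLocalExample {
  def main(args: Array[String]): Unit =
    // -Denv.* values are substituted into ${...} tokens in app.yaml and in the SQL files it references;
    // "validate-local" runs the local config/SQL checks (cf. CLIOps.validateLocal), no live data sources needed.
    CLI.main(Array(
      "-Denv.engine=spark",                        // illustrative env var, mirroring the test configs
      "--conf-uri=file:/path/to/config/app.yaml",  // illustrative path
      "validate-local"))
}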
/src/test/resources/parquet-roundtrip/transform/t.sql: -------------------------------------------------------------------------------- 1 | select s from x -------------------------------------------------------------------------------- /src/test/resources/uri-loader/without_env_vars: -------------------------------------------------------------------------------- 1 | hello there 2 | 123 3 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.3.5") 2 | -------------------------------------------------------------------------------- /src/test/resources/uri-loader/with_env_vars: -------------------------------------------------------------------------------- 1 | 111 ${var1} 222 ${var2} 333 ${var1} 444 2 | -------------------------------------------------------------------------------- /src/test/resources/uri-loader/with_bogus_includes: -------------------------------------------------------------------------------- 1 | === 2 | #include<__bogus_include__> 3 | --- -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | project/target 3 | project/project 4 | **/.DS_Store 5 | .idea/ 6 | *~ 7 | -------------------------------------------------------------------------------- /src/test/resources/runtime-ctx/spark/transform/client_all.sql: -------------------------------------------------------------------------------- 1 | -- transform - client - get all 2 | SELECT * from client -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | scala: 3 | - 2.11.8 4 | script: 5 | - sbt ++$TRAVIS_SCALA_VERSION coverage test coverageReport 6 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/ConfigError.scala: -------------------------------------------------------------------------------- 1 | package spark_etl 2 | 3 | case class ConfigError(msg: String, exc: Option[Throwable] = None) 4 | -------------------------------------------------------------------------------- /src/test/resources/uri-loader/with_includes: -------------------------------------------------------------------------------- 1 | === 2 | #include 3 | --- 4 | #include 5 | +++ -------------------------------------------------------------------------------- /src/main/scala/spark_etl/util/Files.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.util 2 | 3 | import java.io.File 4 | 5 | object Files { 6 | def pwd = new File(".").getCanonicalPath 7 | def rootResource = getClass.getResource("/").getFile 8 | } 9 | -------------------------------------------------------------------------------- /src/test/resources/parquet-roundtrip/app.yaml: -------------------------------------------------------------------------------- 1 | extracts: 2 | - name: x 3 | uri: "${path}/x" 4 | 5 | transforms: 6 | - name: transform 7 | sql: "/parquet-roundtrip/transform/t.sql" 8 | 9 | loads: 10 | - name: y 11 | source: transform 12 | uri: "${path}/y" 13 | -------------------------------------------------------------------------------- /src/test/resources/runtime-ctx/spark/extract-check/client.sql: 
-------------------------------------------------------------------------------- 1 | -- pre-check - client 2 | SELECT -- null checks 3 | (SELECT count(1) FROM client WHERE id IS NULL) = 0 AS id_null_less, 4 | (SELECT count(1) FROM client WHERE name IS NULL) = 0 AS name_null_less, 5 | (SELECT count(1) FROM client WHERE age IS NULL) = 0 AS age_null_less -------------------------------------------------------------------------------- /src/main/scala/spark_etl/model/Load.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.model 2 | 3 | import net.jcazevedo.moultingyaml.DefaultYamlProtocol 4 | 5 | case class Load(name: String, source: String, uri: String, partition_by: Option[List[String]] = None) 6 | 7 | object Load extends DefaultYamlProtocol { 8 | implicit val yamlFormat = yamlFormat4(Load.apply) 9 | } 10 | -------------------------------------------------------------------------------- /src/test/resources/main-utils/spark/extract-check/client.sql: -------------------------------------------------------------------------------- 1 | -- pre-check - client 2 | SELECT -- null checks 3 | (SELECT ${count_fun}(1) FROM client WHERE id IS NULL) = 0 AS id_null_less, 4 | (SELECT ${count_fun}(1) FROM client WHERE name IS NULL) = 0 AS name_null_less, 5 | (SELECT ${count_fun}(1) FROM client WHERE age IS NULL) = 0 AS age_null_less -------------------------------------------------------------------------------- /src/main/resources/monkey-patch.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | usage() { 4 | echo " Usage:" 5 | echo " " 6 | exit 1 7 | } 8 | 9 | if [[ $# -lt 2 ]]; then usage; fi 10 | jar=$1 11 | file=$2 12 | 13 | rm -rf jar_exploded 14 | mkdir jar_exploded 15 | pushd jar_exploded 16 | unzip -q ../$jar 17 | vi $file 18 | zip -ur ../$jar $file 19 | popd 20 | rm -rf jar_exploded -------------------------------------------------------------------------------- /src/main/scala/spark_etl/model/Extract.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.model 2 | 3 | import net.jcazevedo.moultingyaml.DefaultYamlProtocol 4 | 5 | case class Extract(name: String, uri: String, cache: Option[Boolean] = None, persist: Option[Persist] = None, check: Option[String] = None) 6 | 7 | object Extract extends DefaultYamlProtocol { 8 | implicit val yamlFormat = yamlFormat5(Extract.apply) 9 | } 10 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/model/Transform.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.model 2 | 3 | import net.jcazevedo.moultingyaml.DefaultYamlProtocol 4 | 5 | case class Transform(name: String, sql: String, cache: Option[Boolean] = None, persist: Option[Persist] = None, check: Option[String] = None) 6 | 7 | object Transform extends DefaultYamlProtocol { 8 | implicit val yamlFormat = yamlFormat5(Transform.apply) 9 | } 10 | -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootCategory=INFO, console 2 | log4j.logger.org.apache.spark=WARN 3 | log4j.logger.org.spark_project=WARN 4 | log4j.appender.console=org.apache.log4j.ConsoleAppender 5 | log4j.appender.console.target=System.out 6 | 
log4j.appender.console.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.console.layout.ConversionPattern=%d{ISO8601} logLevel=%p thread=%t class=%C line_number=%L %m%n 8 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/LoadWriter.scala: -------------------------------------------------------------------------------- 1 | package spark_etl 2 | 3 | import org.apache.spark.sql.DataFrame 4 | import spark_etl.model.Load 5 | import spark_etl.util.Validation 6 | 7 | abstract class LoadWriter(params: Map[String, Any]) { 8 | def write(loadsAndDfs: Seq[(Load, DataFrame)]): Unit 9 | def checkLocal(loads: Seq[Load]): Validation[ConfigError, Unit] 10 | def checkRemote(loads: Seq[Load]): Validation[ConfigError, Unit] 11 | } 12 | -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootCategory=WARN, console 2 | log4j.logger.org.apache.hadoop=ERROR 3 | log4j.logger.org.apache.spark=ERROR 4 | log4j.logger.org.spark_project=ERROR 5 | log4j.appender.console=org.apache.log4j.ConsoleAppender 6 | log4j.appender.console.target=System.out 7 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.console.layout.ConversionPattern=%d{ISO8601} logLevel=%p thread=%t class=%C line_number=%L %m%n 9 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/ExtractReader.scala: -------------------------------------------------------------------------------- 1 | package spark_etl 2 | 3 | import org.apache.spark.sql.{DataFrame, SparkSession} 4 | import spark_etl.model.Extract 5 | import spark_etl.util.Validation 6 | 7 | abstract class ExtractReader(params: Map[String, Any]) { 8 | def checkLocal(extracts: Seq[Extract]): Validation[ConfigError, Unit] 9 | def checkRemote(extracts: Seq[Extract]): Validation[ConfigError, Unit] 10 | def read(extracts: Seq[Extract])(implicit spark: SparkSession): Seq[(Extract, DataFrame)] 11 | } 12 | -------------------------------------------------------------------------------- /src/test/resources/main-utils/spark/transform-check/item_purchase.sql: -------------------------------------------------------------------------------- 1 | SELECT -- null checks 2 | (SELECT count(1) FROM item_purchase WHERE id IS NULL) = 0 AS id_null_less, 3 | (SELECT count(1) FROM item_purchase WHERE name IS NULL) = 0 AS name_null_less, 4 | -- min checks 5 | min(id) > 0 AS id_positive_ok, 6 | min(total_purchase) > 0 AS total_purchase_ok, 7 | -- col width checks 8 | max(${length_fun}(name)) <= 10 AS name_ok 9 | FROM item_purchase -------------------------------------------------------------------------------- /src/test/resources/runtime-ctx/spark/transform-check/item_purchase.sql: -------------------------------------------------------------------------------- 1 | SELECT -- null checks 2 | (SELECT count(1) FROM item_purchase WHERE id IS NULL) = 0 AS id_null_less, 3 | (SELECT count(1) FROM item_purchase WHERE name IS NULL) = 0 AS name_null_less, 4 | -- min checks 5 | min(id) > 0 AS id_positive_ok, 6 | min(total_purchase) > 0 AS total_purchase_ok, 7 | -- col width checks 8 | max(length(name)) <= 10 AS name_ok 9 | FROM item_purchase -------------------------------------------------------------------------------- /src/test/resources/main-utils/spark/transform-check/minor_purchase.sql: 
-------------------------------------------------------------------------------- 1 | SELECT -- null checks 2 | (SELECT count(1) FROM minor_purchase WHERE id IS NULL) = 0 AS id_null_less, 3 | (SELECT count(1) FROM minor_purchase WHERE name IS NULL) = 0 AS name_null_less, 4 | -- min checks 5 | min(id) > 0 AS id_positive_ok, 6 | min(sold_to_minors) > 0 AS sold_to_minors_ok, 7 | -- col width checks 8 | max(length(name)) <= 10 AS name_ok 9 | FROM minor_purchase -------------------------------------------------------------------------------- /src/test/resources/runtime-ctx/spark/transform-check/minor_purchase.sql: -------------------------------------------------------------------------------- 1 | SELECT -- null checks 2 | (SELECT count(1) FROM minor_purchase WHERE id IS NULL) = 0 AS id_null_less, 3 | (SELECT count(1) FROM minor_purchase WHERE name IS NULL) = 0 AS name_null_less, 4 | -- min checks 5 | min(id) > 0 AS id_positive_ok, 6 | min(sold_to_minors) > 0 AS sold_to_minors_ok, 7 | -- col width checks 8 | max(length(name)) <= 10 AS name_ok 9 | FROM minor_purchase -------------------------------------------------------------------------------- /src/test/resources/main-utils/spark/transform-check/client_spending.sql: -------------------------------------------------------------------------------- 1 | SELECT -- null checks 2 | (SELECT count(1) FROM client_spending WHERE id IS NULL) = 0 AS id_null_less, 3 | (SELECT count(1) FROM client_spending WHERE name IS NULL) = 0 AS name_null_less, 4 | -- min checks 5 | min(id) > 0 AS id_positive_ok, 6 | min(total_spending) > 0 AS total_spending_ok, 7 | -- col width checks 8 | max(length(name)) <= 10 AS name_ok 9 | FROM client_spending -------------------------------------------------------------------------------- /src/test/resources/runtime-ctx/spark/transform-check/client_spending.sql: -------------------------------------------------------------------------------- 1 | SELECT -- null checks 2 | (SELECT count(1) FROM client_spending WHERE id IS NULL) = 0 AS id_null_less, 3 | (SELECT count(1) FROM client_spending WHERE name IS NULL) = 0 AS name_null_less, 4 | -- min checks 5 | min(id) > 0 AS id_positive_ok, 6 | min(total_spending) > 0 AS total_spending_ok, 7 | -- col width checks 8 | max(length(name)) <= 10 AS name_ok 9 | FROM client_spending -------------------------------------------------------------------------------- /src/test/scala/spark_etl/parser/ParserSpec.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.parser 2 | 3 | import org.scalatest.{FlatSpec, Inside, Matchers} 4 | import spark_etl.parser.Parser._ 5 | 6 | class ParserSpec extends FlatSpec with Matchers with Inside { 7 | "Parser" should "resolve all UnresolvedRelations" in { 8 | getDsos("SELECT a from b.b").toSet shouldBe Set("b.b") 9 | getDsos("SELECT a from b.c").toSet shouldBe Set("b.c") 10 | getDsos("SELECT a from b" ).toSet shouldBe Set("b") 11 | getDsos("SELECT a.x from b").toSet shouldBe Set("b") 12 | getDsos("SELECT z.x from b").toSet shouldBe Set("b") 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /src/test/resources/main-utils/spark/extract-check/transaction.sql: -------------------------------------------------------------------------------- 1 | -- pre-check - transaction 2 | SELECT -- null checks 3 | (SELECT count(1) FROM transaction WHERE client_id IS NULL) = 0 AS client_id_null_less, 4 | (SELECT count(1) FROM transaction WHERE item_id IS NULL) = 0 AS 
item_id_null_less, 5 | (SELECT count(1) FROM transaction WHERE quantity IS NULL) = 0 AS quantity_null_less, 6 | -- positive price checks 7 | (SELECT count(1) FROM (SELECT CAST(quantity AS INTEGER) FROM transaction) WHERE quantity < 0) = 0 AS has_positive_quantity -------------------------------------------------------------------------------- /src/test/resources/runtime-ctx/spark/extract-check/transaction.sql: -------------------------------------------------------------------------------- 1 | -- pre-check - transaction 2 | SELECT -- null checks 3 | (SELECT count(1) FROM transaction WHERE client_id IS NULL) = 0 AS client_id_null_less, 4 | (SELECT count(1) FROM transaction WHERE item_id IS NULL) = 0 AS item_id_null_less, 5 | (SELECT count(1) FROM transaction WHERE quantity IS NULL) = 0 AS quantity_null_less, 6 | -- positive price checks 7 | (SELECT count(1) FROM (SELECT CAST(quantity AS INTEGER) FROM transaction) WHERE quantity < 0) = 0 AS has_positive_quantity -------------------------------------------------------------------------------- /src/test/resources/main-utils/spark/extract-check/item.sql: -------------------------------------------------------------------------------- 1 | -- pre-check - item 2 | SELECT -- null checks 3 | (SELECT count(1) FROM item WHERE id IS NULL) = 0 AS id_null_less, 4 | (SELECT count(1) FROM item WHERE name IS NULL) = 0 AS name_null_less, 5 | (SELECT count(1) FROM item WHERE price IS NULL) = 0 AS age_null_less, 6 | (SELECT count(1) FROM item WHERE for_adults IS NULL) = 0 AS for_adults_null_less, 7 | -- positive price checks 8 | (SELECT count(1) FROM (SELECT CAST(price AS INTEGER) FROM item) WHERE price < 0) = 0 AS has_positive_prices -------------------------------------------------------------------------------- /src/test/resources/runtime-ctx/spark/extract-check/item.sql: -------------------------------------------------------------------------------- 1 | -- pre-check - item 2 | SELECT -- null checks 3 | (SELECT count(1) FROM item WHERE id IS NULL) = 0 AS id_null_less, 4 | (SELECT count(1) FROM item WHERE name IS NULL) = 0 AS name_null_less, 5 | (SELECT count(1) FROM item WHERE price IS NULL) = 0 AS age_null_less, 6 | (SELECT count(1) FROM item WHERE for_adults IS NULL) = 0 AS for_adults_null_less, 7 | -- positive price checks 8 | (SELECT count(1) FROM (SELECT CAST(price AS INTEGER) FROM item) WHERE price < 0) = 0 AS has_positive_prices -------------------------------------------------------------------------------- /src/main/scala/spark_etl/parquet/ParquetLoadWriter.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.parquet 2 | 3 | import org.apache.spark.sql.DataFrame 4 | import spark_etl.model.Load 5 | import spark_etl.util.Validation 6 | import spark_etl.util.Validation._ 7 | import spark_etl.{ConfigError, LoadWriter} 8 | 9 | class ParquetLoadWriter(params: Map[String, String]) extends LoadWriter(params) { 10 | override def write(loadsAndDfs: Seq[(Load, DataFrame)]): Unit = { 11 | loadsAndDfs.foreach { 12 | case (Load(_, _, uri, Some(partitionBy)), df) => df.write.partitionBy(partitionBy:_*).parquet(uri) 13 | case (Load(_, _, uri, None), df) => df.write.parquet(uri) 14 | } 15 | } 16 | 17 | // nothing to validate 18 | override def checkLocal(loads: Seq[Load]): Validation[ConfigError, Unit] = 19 | ().success[ConfigError] 20 | 21 | override def checkRemote(loads: Seq[Load]): Validation[ConfigError, Unit] = 22 | ().success[ConfigError] 23 | } 24 | 
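// Usage sketch (illustrative, not part of the original source): one parquet output is written per
// Load entry, partitioned when partition_by is set. Assumes an active local SparkSession named `spark`;
// names and paths are placeholders.
//
//   import spark_etl.model.Load
//   import spark.implicits._
//
//   val df     = Seq(("a", 1), ("b", 2)).toDF("s", "i")
//   val writer = new ParquetLoadWriter(Map.empty)
//   // with partition_by = Some(List("s")), rows land under /tmp/out/x/s=a and /tmp/out/x/s=b
//   writer.write(Seq((Load("x_out", "some_transform", "/tmp/out/x", Some(List("s"))), df)))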
-------------------------------------------------------------------------------- /src/test/resources/main-utils/spark/transform/client_spending.sql: -------------------------------------------------------------------------------- 1 | -- transform - client_spending 2 | SELECT c.id, 3 | c.name, 4 | totals.total_spending 5 | FROM client c 6 | INNER JOIN 7 | (SELECT id, 8 | SUM(spent) AS total_spending 9 | FROM (SELECT typed_client.id, 10 | (typed_item.price * typed_transaction.quantity) AS spent 11 | FROM (SELECT id, name FROM client) AS typed_client 12 | LEFT OUTER JOIN 13 | (SELECT client_id, item_id, CAST(quantity AS INTEGER) FROM transaction) AS typed_transaction 14 | ON typed_transaction.client_id = typed_client.id 15 | LEFT OUTER JOIN 16 | (SELECT id, CAST(price AS INTEGER) FROM item) AS typed_item 17 | ON typed_transaction.item_id = typed_item.id 18 | ) GROUP BY id 19 | ) AS totals 20 | ON c.id = totals.id 21 | -------------------------------------------------------------------------------- /src/test/resources/runtime-ctx/spark/transform/client_spending.sql: -------------------------------------------------------------------------------- 1 | -- transform - client_spending 2 | SELECT c.id, 3 | c.name, 4 | totals.total_spending 5 | FROM client c 6 | INNER JOIN 7 | (SELECT id, 8 | SUM(spent) AS total_spending 9 | FROM (SELECT typed_client.id, 10 | (typed_item.price * typed_transaction.quantity) AS spent 11 | FROM (SELECT id, name FROM client) AS typed_client 12 | LEFT OUTER JOIN 13 | (SELECT client_id, item_id, CAST(quantity AS INTEGER) FROM transaction) AS typed_transaction 14 | ON typed_transaction.client_id = typed_client.id 15 | LEFT OUTER JOIN 16 | (SELECT id, CAST(price AS INTEGER) FROM item) AS typed_item 17 | ON typed_transaction.item_id = typed_item.id 18 | ) GROUP BY id 19 | ) AS totals 20 | ON c.id = totals.id 21 | -------------------------------------------------------------------------------- /src/test/resources/main-utils/spark/transform/item_purchase.sql: -------------------------------------------------------------------------------- 1 | -- transform - item_purchase 2 | SELECT i.id, 3 | i.name, 4 | totals.total_purchase 5 | FROM item i 6 | INNER JOIN 7 | (SELECT id, 8 | SUM(purchase) AS total_purchase 9 | FROM (SELECT typed_client.id, 10 | (typed_item.price * typed_transaction.quantity) AS purchase 11 | FROM (SELECT id, name, CAST(age AS INTEGER) FROM client) AS typed_client 12 | ${join_type} 13 | (SELECT client_id, item_id, CAST(quantity AS INTEGER) FROM transaction) AS typed_transaction 14 | ON typed_transaction.client_id = typed_client.id 15 | ${join_type} 16 | (SELECT id, CAST(price AS INTEGER) FROM item) AS typed_item 17 | ON typed_transaction.item_id = typed_item.id 18 | ) GROUP BY id 19 | ) AS totals 20 | ON i.id = totals.id 21 | -------------------------------------------------------------------------------- /src/test/resources/main-utils/spark/transform/minor_purchase.sql: -------------------------------------------------------------------------------- 1 | -- transform - minor_purchase 2 | SELECT i.id, 3 | i.name, 4 | totals.sold_to_minors 5 | FROM item i 6 | INNER JOIN 7 | (SELECT id, 8 | SUM(sold_to_minors) AS sold_to_minors 9 | FROM (SELECT typed_client.id, 10 | positive_transaction.sold_to_minors 11 | FROM (SELECT id, CAST(age AS INTEGER) FROM client) AS typed_client 12 | INNER JOIN 13 | (SELECT client_id, item_id, quantity AS sold_to_minors FROM (SELECT client_id, item_id, CAST(quantity AS INTEGER) FROM transaction) WHERE quantity > 0) AS positive_transaction 
14 | ON positive_transaction.client_id = typed_client.id 15 | INNER JOIN 16 | item 17 | ON positive_transaction.item_id = item.id 18 | ) GROUP BY id 19 | ) AS totals 20 | ON i.id = totals.id 21 | -------------------------------------------------------------------------------- /src/test/resources/runtime-ctx/spark/transform/minor_purchase.sql: -------------------------------------------------------------------------------- 1 | -- transform - minor_purchase 2 | SELECT i.id, 3 | i.name, 4 | totals.sold_to_minors 5 | FROM item i 6 | INNER JOIN 7 | (SELECT id, 8 | SUM(sold_to_minors) AS sold_to_minors 9 | FROM (SELECT typed_client.id, 10 | positive_transaction.sold_to_minors 11 | FROM (SELECT id, CAST(age AS INTEGER) FROM client) AS typed_client 12 | INNER JOIN 13 | (SELECT client_id, item_id, quantity AS sold_to_minors FROM (SELECT client_id, item_id, CAST(quantity AS INTEGER) FROM transaction) WHERE quantity > 0) AS positive_transaction 14 | ON positive_transaction.client_id = typed_client.id 15 | INNER JOIN 16 | item 17 | ON positive_transaction.item_id = item.id 18 | ) GROUP BY id 19 | ) AS totals 20 | ON i.id = totals.id 21 | -------------------------------------------------------------------------------- /src/test/resources/runtime-ctx/spark/transform/item_purchase.sql: -------------------------------------------------------------------------------- 1 | -- transform - item_purchase 2 | SELECT i.id, 3 | i.name, 4 | totals.total_purchase 5 | FROM item i 6 | INNER JOIN 7 | (SELECT id, 8 | SUM(purchase) AS total_purchase 9 | FROM (SELECT typed_client.id, 10 | (typed_item.price * typed_transaction.quantity) AS purchase 11 | FROM (SELECT id, name, CAST(age AS INTEGER) FROM client) AS typed_client 12 | LEFT OUTER JOIN 13 | (SELECT client_id, item_id, CAST(quantity AS INTEGER) FROM transaction) AS typed_transaction 14 | ON typed_transaction.client_id = typed_client.id 15 | LEFT OUTER JOIN 16 | (SELECT id, CAST(price AS INTEGER) FROM item) AS typed_item 17 | ON typed_transaction.item_id = typed_item.id 18 | ) GROUP BY id 19 | ) AS totals 20 | ON i.id = totals.id 21 | -------------------------------------------------------------------------------- /src/test/scala/spark_etl/util/DeaultEnvSpec.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.util 2 | 3 | import org.joda.time.DateTime 4 | import org.scalatest.{FlatSpec, Inside, Matchers} 5 | 6 | class DeaultEnvSpec extends FlatSpec with Matchers with Inside { 7 | "DefaultEnv" should "obtain dates for start of epoch" in { 8 | DefaultEnv.getAll(new DateTime(0)).toList should contain allElementsOf Seq( 9 | // t-1d 10 | "yyyy-MM-1d" -> "1969-12", 11 | "yyyy-MM-dd-1d" -> "1969-12-31", 12 | "sod-1d" -> "1969-12-31 00:00:00", 13 | "eod-1d" -> "1969-12-31 23:59:59", 14 | "y-1d" -> "1969", 15 | "m-1d" -> "12", 16 | "d-1d" -> "31", 17 | // utc-1d 18 | "utc-yyyy-MM-1d" -> "1969-12", 19 | "utc-yyyy-MM-dd-1d" -> "1969-12-31", 20 | "utc-sod-1d" -> "1969-12-31 00:00:00", 21 | "utc-eod-1d" -> "1969-12-31 23:59:59", 22 | "utc-y-1d" -> "1969", 23 | "utc-m-1d" -> "12", 24 | "utc-d-1d" -> "31" 25 | ) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/test/scala/spark_etl/util/SparkParserSpec.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.util 2 | 3 | import spark_etl.util.SparkParser.QfDep 4 | import org.scalatest._ 5 | 6 | class SparkParserSpec extends FlatSpec with Matchers { 7 | val complexSql 
= 8 | """ 9 | |-- select all from client, transaction, item 10 | |SELECT * 11 | | FROM namespace1.client c, 12 | | namespace2.transaction t, 13 | | item i 14 | | WHERE c.id = t.c_id AND t.i_id = i.id""".stripMargin 15 | 16 | "SparkParser" should "fetch deps" in { 17 | SparkParser.getDeps(complexSql) should contain allOf( 18 | QfDep("client", Some("namespace1")), 19 | QfDep("transaction", Some("namespace2")), 20 | QfDep("item") 21 | ) 22 | } 23 | 24 | it should "strip db prefixes" in { 25 | SparkParser.stripDbs(complexSql) shouldBe 26 | """ 27 | |-- select all from client, transaction, item 28 | |SELECT * 29 | | FROM client c, 30 | | transaction t, 31 | | item i 32 | | WHERE c.id = t.c_id AND t.i_id = i.id""".stripMargin 33 | } 34 | } -------------------------------------------------------------------------------- /src/main/scala/spark_etl/parser/Parser.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.parser 2 | 3 | import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation 4 | import org.apache.spark.sql.catalyst.parser.CatalystSqlParser 5 | import org.apache.spark.sql.catalyst.plans.logical.{BinaryNode, LogicalPlan, UnaryNode, Union} 6 | 7 | object Parser { 8 | def getDsos(sql: String): List[String] = { 9 | def toStr(r: UnresolvedRelation) = 10 | r.tableIdentifier.database 11 | .map(db => s"$db.${r.tableIdentifier.table}") 12 | .getOrElse(r.tableIdentifier.table) 13 | 14 | def getDsoNames(plan: LogicalPlan, soFar: List[String] = List.empty): List[String] = { 15 | plan match { 16 | case un: UnaryNode => getDsoNames(un.child, soFar) 17 | case bn: BinaryNode => getDsoNames(bn.right, getDsoNames(bn.left, soFar)) 18 | case ur: UnresolvedRelation => toStr(ur) :: soFar 19 | case u: Union => u.children.foldLeft(soFar) { case (soFar2, c) => getDsoNames(c, soFar2)} 20 | case _ => soFar 21 | } 22 | } 23 | val plan = CatalystSqlParser.parsePlan(sql) 24 | getDsoNames(plan) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/parquet/ParquetExtractReader.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.parquet 2 | 3 | import org.apache.spark.sql.{DataFrame, SparkSession} 4 | import spark_etl.model.Extract 5 | import spark_etl.util.Validation 6 | import spark_etl.util.Validation._ 7 | import spark_etl.{ConfigError, ExtractReader} 8 | 9 | import scala.util.Try 10 | 11 | class ParquetExtractReader(params: Map[String, Any]) extends ExtractReader(params) { 12 | val checkChildren = Try(params("check-children").asInstanceOf[Boolean]).getOrElse(false) 13 | val expectPartition = Try(params("expect-partition").asInstanceOf[Boolean]).getOrElse(false) 14 | 15 | // nothing to validate 16 | override def checkLocal(extracts: Seq[Extract]): Validation[ConfigError, Unit] = 17 | ().success[ConfigError] 18 | 19 | override def checkRemote(extracts: Seq[Extract]): Validation[ConfigError, Unit] = { 20 | val parquetUris = extracts.map(_.uri) 21 | PathValidator.validate(checkChildren, expectPartition, parquetUris: _*).map(_ => ()) 22 | } 23 | 24 | override def read(extracts: Seq[Extract])(implicit spark: SparkSession): Seq[(Extract, DataFrame)] = { 25 | extracts.map { 26 | e => 27 | val df = spark.read.parquet(e.uri) 28 | (e, df) 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/test/resources/main-utils/config/app.yaml: 
-------------------------------------------------------------------------------- 1 | extracts: 2 | - name: client 3 | uri: "data/dev/client_2017" 4 | check: "file:../${engine}/extract-check/client.sql" 5 | - name: item 6 | uri: "data/dev/item_2017" 7 | check: "file:../${engine}/extract-check/item.sql" 8 | - name: transaction 9 | uri: "data/dev/transaction_2017" 10 | check: "file:../${engine}/extract-check/transaction.sql" 11 | 12 | transforms: 13 | - name: client_spending 14 | check: "file:../${engine}/transform-check/client_spending.sql" 15 | sql: "file:../${engine}/transform/client_spending.sql" 16 | - name: item_purchase 17 | check: "file:../${engine}/transform-check/item_purchase.sql" 18 | sql: "file:../${engine}/transform/item_purchase.sql" 19 | - name: minor_purchase 20 | check: "file:../${engine}/transform-check/minor_purchase.sql" 21 | sql: "file:../${engine}/transform/minor_purchase.sql" 22 | 23 | loads: 24 | - name: client_spending_out 25 | source: client_spending 26 | uri: "/tmp/out/client_spending" 27 | - name: item_purchase_out 28 | source: item_purchase 29 | uri: "/tmp/out/item_purchase" 30 | - name: minor_purchase_out 31 | source: minor_purchase 32 | uri: "/tmp/out/minor_purchase" 33 | -------------------------------------------------------------------------------- /src/test/scala/spark_etl/parquet/PathValidatorSpec.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.parquet 2 | 3 | import java.io.File 4 | 5 | import org.scalatest.{FlatSpec, Inside, Matchers} 6 | import spark_etl.util._ 7 | 8 | class PathValidatorSpec extends FlatSpec with Matchers with Inside { 9 | val root = Files.rootResource 10 | 11 | // create an empty dir, without a placeholder 12 | val emptyDir = new File(s"$root/parquet/empty") 13 | if (! emptyDir.exists) 14 | emptyDir.mkdir() 15 | 16 | "PathValidator" should "validate local `good` path" in { 17 | PathValidator.validate( 18 | true, 19 | true, 20 | s"$root/parquet/good" 21 | ) shouldBe Success(List(s"$root/parquet/good")) 22 | } 23 | 24 | it should "validate local `bad` path" in { 25 | val res = PathValidator.validate( 26 | true, 27 | true, 28 | s"$root/parquet/empty", 29 | s"$root/parquet/with_backup_dir", 30 | s"$root/parquet/__bogus_dir__" 31 | ) 32 | inside(res) { 33 | case Failure(errs) => 34 | val errMsgs = errs.toList.map(_.msg).sorted 35 | errMsgs.length shouldBe 3 36 | errMsgs(0) should startWith("Local path doesn't exist") 37 | errMsgs(1) should startWith("Local path is empty for") 38 | errMsgs(2) should startWith("Unexpected local children for") 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/util/DefaultEnv.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.util 2 | 3 | import org.joda.time.{DateTime, DateTimeZone} 4 | import org.joda.time.format.DateTimeFormat 5 | 6 | /** 7 | * Default env vars, for token substitution in queries/paths. 
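 * For example, getAll(new DateTime(0)) yields entries such as "yyyy-MM-dd-1d" -> "1969-12-31" and
 * "eod-1d" -> "1969-12-31 23:59:59", plus the same keys with a "utc-" prefix.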
8 | */ 9 | object DefaultEnv { 10 | private val `yyyy-MM` = DateTimeFormat.forPattern("yyyy-MM") 11 | private val `yyyy-MM-dd` = DateTimeFormat.forPattern("yyyy-MM-dd") 12 | private val `yyyy-MM-dd HH:mm:ss` = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss") 13 | 14 | def getAll(date: DateTime, prefix: String = ""): Map[String, String] = { 15 | val `t-1` = date.minusDays(1) 16 | val `utc-1` = date.withZone(DateTimeZone.UTC).minusDays(1) 17 | get(`t-1`, prefix, "-1d") ++ 18 | get(`utc-1`, prefix + "utc-", "-1d") 19 | 20 | } 21 | 22 | def get(date: DateTime, prefix: String = "", suffix: String = ""): Map[String, String] = { 23 | Map( 24 | "yyyy-MM" -> `yyyy-MM`.print(date), 25 | "yyyy-MM-dd" -> `yyyy-MM-dd`.print(date), 26 | "sod" -> `yyyy-MM-dd HH:mm:ss`.print(date.withHourOfDay(0).withMinuteOfHour(0).withSecondOfMinute(0).withMillisOfSecond(0)), 27 | "eod" -> `yyyy-MM-dd HH:mm:ss`.print(date.withHourOfDay(23).withMinuteOfHour(59).withSecondOfMinute(59).withMillisOfSecond(999)), 28 | "y" -> date.getYear.toString, 29 | "m" -> date.getMonthOfYear.toString, 30 | "d" -> date.getDayOfMonth.toString 31 | ).map { case (k,v) => s"$prefix$k$suffix" -> v } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/util/BAHelper.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.util 2 | 3 | import java.io.{File, PrintWriter} 4 | 5 | import scala.io.Source 6 | 7 | object BAHelper { 8 | def copySqls(sourceDir: File, targetDir: File, rmTargetDir: Boolean): Seq[(String, String)] = { 9 | if (rmTargetDir) 10 | rmdir(targetDir) 11 | val sqlFiles = descendants(sourceDir).filter(_.getName.toLowerCase.endsWith(".sql")) 12 | sqlFiles.map { 13 | f => 14 | val fPerms = java.nio.file.Files.getPosixFilePermissions(f.toPath) 15 | val contents = Source.fromFile(f).mkString 16 | val target = new File(targetDir, f.getAbsolutePath.replace(sourceDir.getAbsolutePath, "")) 17 | val targetParent = target.getParentFile 18 | targetParent.mkdirs() 19 | new PrintWriter(target) { 20 | val stripped = SparkParser.stripDbs(contents) 21 | write(stripped) 22 | close() 23 | } 24 | java.nio.file.Files.setPosixFilePermissions(target.toPath, fPerms) 25 | (f.getPath, target.getPath) 26 | } 27 | } 28 | 29 | private def descendants(f: File): Seq[File] = { 30 | val children = f.listFiles 31 | if (children == null) 32 | Nil 33 | else 34 | children ++ children.filter(_.isDirectory).flatMap(descendants) 35 | } 36 | 37 | private def rmdir(file: File): Unit = { 38 | if (file.isDirectory) 39 | file.listFiles.foreach(rmdir) 40 | if (file.exists && !file.delete) 41 | throw new Exception(s"Unable to delete ${file.getAbsolutePath}") 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/util/SparkParser.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.util 2 | 3 | import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation 4 | import org.apache.spark.sql.catalyst.parser.CatalystSqlParser 5 | import org.apache.spark.sql.catalyst.plans.logical.{BinaryNode, LogicalPlan, UnaryNode, Union} 6 | 7 | object SparkParser { 8 | /** 9 | * Get a list of qualified dependencies. 
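 * For example, "SELECT * FROM namespace1.client c, item i" yields
 * QfDep("client", Some("namespace1")) and QfDep("item").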
10 | */ 11 | def getDeps(sql: String): List[QfDep] = { 12 | def toDep(r: UnresolvedRelation) = 13 | r.tableIdentifier.database 14 | .map(db => QfDep(r.tableIdentifier.table, Some(db))) 15 | .getOrElse(QfDep(r.tableIdentifier.table)) 16 | 17 | def getDepNames(plan: LogicalPlan, soFar: List[QfDep] = List.empty): List[QfDep] = { 18 | plan match { 19 | case un: UnaryNode => getDepNames(un.child, soFar) 20 | case bn: BinaryNode => getDepNames(bn.right, getDepNames(bn.left, soFar)) 21 | case ur: UnresolvedRelation => toDep(ur) :: soFar 22 | case u: Union => u.children.foldLeft(soFar) { case (soFar2, c) => getDepNames(c, soFar2) } 23 | case _ => soFar 24 | } 25 | } 26 | val plan = CatalystSqlParser.parsePlan(sql) 27 | getDepNames(plan).distinct 28 | } 29 | 30 | /** 31 | * Strip db prefixes. 32 | */ 33 | def stripDbs(sql: String): String = 34 | getDeps(sql).foldLeft(sql) { 35 | case (soFar, dep) => 36 | soFar.replace(dep.qfStr, dep.dep) 37 | } 38 | 39 | case class QfDep(dep: String, prefix: Option[String] = None) { 40 | def qfStr: String = prefix.map(dbId => s"$dbId.$dep").getOrElse(dep) 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/test/scala/spark_etl/util/DepTreeSpec.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.util 2 | 3 | import org.scalatest.{FlatSpec, Inside, Matchers} 4 | 5 | class DepTreeSpec extends FlatSpec with Matchers with Inside { 6 | val initVertices = Seq( 7 | Vertex("l1", L), 8 | Vertex("l2", L), 9 | Vertex("t1", T), 10 | Vertex("t2", T), 11 | Vertex("e1", E), 12 | Vertex("e2", E), 13 | Vertex("e3", E) 14 | ) 15 | 16 | "DepTree" should "validate simple tree with no dangling" in { 17 | val tree = new DepTree(initVertices) 18 | 19 | // add actual deps 20 | tree.addEdge("e1", Vertex("t1", T)) 21 | tree.addEdge("e2", Vertex("t1", T)) 22 | tree.addEdge("t1", Vertex("t2", T)) 23 | tree.addEdge("e3", Vertex("t2", T)) 24 | tree.addEdge("t1", Vertex("l1", L), true) 25 | tree.addEdge("t2", Vertex("l2", L), true) 26 | 27 | // validate 28 | tree.dangling shouldBe Nil 29 | tree.forType(L) shouldBe Seq( 30 | Vertex("l1", L), 31 | Vertex("l2", L) 32 | ) 33 | tree.forType(T) shouldBe Seq( 34 | Vertex("t1", T), 35 | Vertex("t2", T) 36 | ) 37 | } 38 | 39 | it should "find dangling" in { 40 | val tree = new DepTree(initVertices) 41 | 42 | // add actual deps 43 | tree.addEdge("e1", Vertex("t1", T)) 44 | tree.addEdge("__bogus_e__", Vertex("t1", T)) 45 | tree.addEdge("t1", Vertex("l1", L), true) 46 | 47 | // validate 48 | tree.dangling shouldBe Seq(Edge(Vertex("__bogus_e__", Dangling), Vertex("t1", T), false)) 49 | 50 | tree.rootless shouldBe Seq(Vertex("t2", T), Vertex("e2", E), Vertex("e3", E), Vertex("__bogus_e__", Dangling)) 51 | 52 | tree.forType(T) shouldBe Seq(Vertex("t1", T)) 53 | tree.forType(E) shouldBe Seq(Vertex("e1", E)) 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/test/scala/spark_etl/CLIOpsSpec.scala: -------------------------------------------------------------------------------- 1 | package spark_etl 2 | 3 | import org.scalatest.{FlatSpec, Inside, Matchers} 4 | import spark_etl.util._ 5 | 6 | class CLIOpsSpec extends FlatSpec with Matchers with Inside { 7 | val root = Files.rootResource 8 | 9 | "CLI" should "validate-local complex file specs" in { 10 | CLI.main(Array("-Denv.engine=spark", "-Denv.length_fun=length", "-Denv.count_fun=count", "-Denv.join_type=LEFT OUTER JOIN", 
s"--conf-uri=file:$root/main-utils/config/app.yaml", "validate-local")) 11 | } 12 | 13 | "CLIOps" should "validate-local complex file specs" in { 14 | val envVars = Map("engine" -> "spark", "length_fun" -> "length", "count_fun" -> "count", "join_type" -> "LEFT OUTER JOIN") 15 | CLIOps.validateLocal("file:main-utils/config/app.yaml", root, envVars) shouldBe Success(()) 16 | } 17 | 18 | it should "fail on missing app.yaml env vars" in { 19 | val envVars = Map("length_fun" -> "length", "count_fun" -> "count", "join_type" -> "LEFT OUTER JOIN") 20 | inside(CLIOps.validateLocal("file:main-utils/config/app.yaml", root, envVars)) { 21 | case Failure(errs) => 22 | errs.length shouldBe 1 23 | errs.head.msg shouldBe "Unresolved env vars in file:main-utils/config/app.yaml: ${engine}" 24 | } 25 | } 26 | 27 | it should "fail on missing SQL env vars" in { 28 | val envVars = Map("engine" -> "spark") 29 | inside(CLIOps.validateLocal("file:main-utils/config/app.yaml", root, envVars)) { 30 | case Failure(errs) => 31 | errs.toList.map(_.msg).sorted shouldBe List( 32 | "Unresolved env vars in file:../spark/extract-check/client.sql: ${count_fun}", 33 | "Unresolved env vars in file:../spark/transform-check/item_purchase.sql: ${length_fun}", 34 | "Unresolved env vars in file:../spark/transform/item_purchase.sql: ${join_type}" 35 | ) 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/model/ParametrizedConstructor.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.model 2 | 3 | import net.jcazevedo.moultingyaml._ 4 | 5 | case class ParametrizedConstructor(`class`: String, params: Option[Map[String, Any]] = Some(Map.empty)) 6 | 7 | object ParametrizedConstructor extends DefaultYamlProtocol { 8 | implicit val mapFormat: YamlFormat[Map[String, Any]] = new YamlFormat[Map[String, Any]] { 9 | override def read(v: YamlValue): Map[String, Any] = v match { 10 | case o: YamlObject => readValue(o).asInstanceOf[Map[String, Any]] 11 | case other => deserializationError(s"Map like object expected, got $other") 12 | } 13 | 14 | def readValue: (YamlValue) => Any = { 15 | case x: YamlBoolean => x.boolean 16 | case x: YamlDate => x.date.toDate 17 | case x: YamlNumber => x.value.toInt 18 | case x: YamlString => x.value 19 | case x: YamlObject => x.fields.map { case (k, v) => asStr(k) -> readValue(v)} 20 | case x: YamlArray => x.elements.map(readValue) 21 | case x: YamlSet => x.set.map(readValue) 22 | case YamlNull => null 23 | case YamlNaN => Double.NaN 24 | case YamlNegativeInf => Double.NegativeInfinity 25 | case YamlPositiveInf => Double.PositiveInfinity 26 | } 27 | 28 | def asStr: (YamlValue) => String = { 29 | case x: YamlBoolean => x.boolean.toString 30 | case x: YamlDate => x.date.toString 31 | case x: YamlNumber => x.value.toString 32 | case x: YamlString => x.value 33 | case YamlNull => null 34 | case YamlNaN => "nan" 35 | case YamlNegativeInf => "-∞" 36 | case YamlPositiveInf => "∞" 37 | case other => deserializationError(s"Failed to stringify map key: $other") 38 | } 39 | 40 | override def write(obj: Map[String, Any]): YamlValue = ??? 
41 | } 42 | 43 | implicit val yamlFormat = yamlFormat2(ParametrizedConstructor.apply) 44 | } 45 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/model/Config.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.model 2 | 3 | import net.jcazevedo.moultingyaml._ 4 | import spark_etl.ConfigError 5 | import spark_etl.parquet.{ParquetExtractReader, ParquetLoadWriter} 6 | import spark_etl.util.Validation._ 7 | import spark_etl.util.{UriLoader, Validation} 8 | 9 | import scala.util.{Failure, Success, Try} 10 | 11 | case class Config( 12 | extracts: List[Extract], 13 | transforms: List[Transform], 14 | loads: List[Load], 15 | extract_reader: Option[ParametrizedConstructor] = Some(ParametrizedConstructor(classOf[ParquetExtractReader].getName, Some(Map.empty))), 16 | load_writer: Option[ParametrizedConstructor] = Some(ParametrizedConstructor(classOf[ParquetLoadWriter].getName, Some(Map.empty)))) 17 | 18 | object Config extends DefaultYamlProtocol { 19 | implicit val yamlFormat = yamlFormat5(Config.apply) 20 | 21 | /** 22 | * Load Config from resource/file Uri 23 | */ 24 | def load(resourceUri: String, filePathRoot: String, env: Map[String, String]): Validation[ConfigError, Config] = 25 | UriLoader.load(resourceUri, filePathRoot, env).flatMap(parse(_, env)) 26 | 27 | def parse(configStr: String, env: Map[String, String] = Map.empty): Validation[ConfigError, Config] = 28 | Try(configStr.parseYaml.convertTo[Config]) match { 29 | case Success(conf) => 30 | // yaml parser does not populate with defaults - force them 31 | val defaultExtractReader = Config(Nil, Nil, Nil).extract_reader 32 | val defaultLoadWriter = Config(Nil, Nil, Nil).load_writer 33 | val conf2 = conf.copy( 34 | extract_reader = conf.extract_reader.orElse(defaultExtractReader), 35 | load_writer = conf.load_writer.orElse(defaultLoadWriter) 36 | ) 37 | conf2.success[ConfigError] 38 | case Failure(e: DeserializationException) => 39 | ConfigError(s"Failed to deserialize config body, exception: ${e.getMessage}").failure[Config] 40 | case Failure(e) => 41 | ConfigError(s"Failed to parse config body", Some(e)).failure[Config] 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/model/Persist.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.model 2 | 3 | import net.jcazevedo.moultingyaml._ 4 | import org.apache.spark.storage.StorageLevel 5 | 6 | sealed trait Persist { def asSpark: StorageLevel } 7 | 8 | object Persist { 9 | object NONE extends Persist { def asSpark = StorageLevel.NONE } 10 | object DISK_ONLY extends Persist { def asSpark = StorageLevel.DISK_ONLY } 11 | object DISK_ONLY_2 extends Persist { def asSpark = StorageLevel.DISK_ONLY_2 } 12 | object MEMORY_ONLY extends Persist { def asSpark = StorageLevel.MEMORY_ONLY } 13 | object MEMORY_ONLY_2 extends Persist { def asSpark = StorageLevel.MEMORY_ONLY_2 } 14 | object MEMORY_ONLY_SER extends Persist { def asSpark = StorageLevel.MEMORY_ONLY_SER } 15 | object MEMORY_ONLY_SER_2 extends Persist { def asSpark = StorageLevel.MEMORY_ONLY_SER_2 } 16 | object MEMORY_AND_DISK extends Persist { def asSpark = StorageLevel.MEMORY_AND_DISK } 17 | object MEMORY_AND_DISK_2 extends Persist { def asSpark = StorageLevel.MEMORY_AND_DISK_2 } 18 | object MEMORY_AND_DISK_SER extends Persist { def asSpark = StorageLevel.MEMORY_AND_DISK_SER } 19 | object 
MEMORY_AND_DISK_SER_2 extends Persist { def asSpark = StorageLevel.MEMORY_AND_DISK_SER_2 } 20 | object OFF_HEAP extends Persist { def asSpark = StorageLevel.OFF_HEAP } 21 | 22 | implicit val typeFormat = new YamlFormat[Persist] { 23 | def read(value: YamlValue): Persist = value match { 24 | case YamlString(x) => 25 | x.toUpperCase match { 26 | case "NONE" => NONE 27 | case "DISK_ONLY" => DISK_ONLY 28 | case "DISK_ONLY_2" => DISK_ONLY_2 29 | case "MEMORY_ONLY" => MEMORY_ONLY 30 | case "MEMORY_ONLY_2" => MEMORY_ONLY_2 31 | case "MEMORY_ONLY_SER" => MEMORY_ONLY_SER 32 | case "MEMORY_ONLY_SER_2" => MEMORY_ONLY_SER_2 33 | case "MEMORY_AND_DISK" => MEMORY_AND_DISK 34 | case "MEMORY_AND_DISK_2" => MEMORY_AND_DISK_2 35 | case "MEMORY_AND_DISK_SER" => MEMORY_AND_DISK_SER 36 | case "MEMORY_AND_DISK_SER_2" => MEMORY_AND_DISK_SER_2 37 | case "OFF_HEAP" => OFF_HEAP 38 | } 39 | case _ => deserializationError("Invalid Persist mode, see Spark StorageLevel options") 40 | } 41 | def write(g: Persist) = ??? 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/test/scala/spark_etl/model/ConfigSpec.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.model 2 | 3 | import org.scalatest.{FlatSpec, Inside, Matchers} 4 | 5 | import spark_etl.util._ 6 | 7 | class ConfigSpec extends FlatSpec with Matchers with Inside { 8 | "Config" should "fail to parse" in { 9 | val bogusConfig = "NOT A CONFIG" 10 | inside(Config.parse(bogusConfig)) { 11 | case Failure(Seq(err)) => 12 | err.msg should startWith("Failed to deserialize") 13 | } 14 | } 15 | 16 | it should "read simple config" in { 17 | val simpleConfig = 18 | s"""extracts: 19 | | - name: e1 20 | | uri: e1_uri 21 | | cache: true 22 | | persist: MEMORY_ONLY 23 | | 24 | |transforms: 25 | | - name: t1 26 | | cache: false 27 | | sql: t1_uri 28 | | persist: DISK_ONLY 29 | | 30 | |loads: 31 | | - name: l1 32 | | source: t1 33 | | uri: l1_uri 34 | """.stripMargin 35 | Config.parse(simpleConfig) shouldBe Success(Config( 36 | List(Extract("e1", "e1_uri", Some(true), Some(Persist.MEMORY_ONLY))), 37 | List(Transform("t1", "t1_uri", Some(false), Some(Persist.DISK_ONLY))), 38 | List(Load("l1", "t1", "l1_uri")) 39 | )) 40 | } 41 | 42 | it should "read reader/writer constructors" in { 43 | val simpleConfig = 44 | s"""extracts: 45 | | - name: e1 46 | | uri: e1_uri 47 | | 48 | |transforms: 49 | | - name: t1 50 | | sql: t1_uri 51 | | 52 | |loads: 53 | | - name: l1 54 | | source: t1 55 | | uri: l1_uri 56 | | 57 | |extract_reader: 58 | | class: DummyExtractReader 59 | | params: 60 | | x: 11 61 | | y: aa 62 | | 63 | |load_writer: 64 | | class: DummyLoadWriter 65 | | params: 66 | | b: false 67 | | a: [1, xxx] 68 | """.stripMargin 69 | Config.parse(simpleConfig) shouldBe Success(Config( 70 | List(Extract("e1", "e1_uri")), 71 | List(Transform("t1", "t1_uri")), 72 | List(Load("l1", "t1", "l1_uri")), 73 | Some(ParametrizedConstructor("DummyExtractReader", Some(Map("x" -> 11d, "y" -> "aa")))), 74 | Some(ParametrizedConstructor("DummyLoadWriter", Some(Map("b" -> false, "a" -> List(1d, "xxx"))))) 75 | )) 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/oracle/OracleValidator.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.oracle 2 | 3 | import java.sql.Connection 4 | 5 | import org.apache.log4j.Logger 6 | import spark_etl.ConfigError 7 | 8 | import 
scala.util.Try 9 | 10 | import spark_etl.util.Validation 11 | import spark_etl.util.Validation._ 12 | 13 | /** 14 | * Oracle Validator, checks connectivity and existance of tables. 15 | */ 16 | // FIXME: needs tests!!! 17 | object OracleValidator { 18 | private val log = Logger.getLogger(getClass) 19 | 20 | def validateOracle(connStr: String, user: String, pwd: String, requiredTables: Seq[String]): Validation[ConfigError, Unit] = 21 | for { 22 | // open connection 23 | conn <- Try { 24 | java.sql.DriverManager.getConnection(connStr, user, pwd) 25 | } match { 26 | case scala.util.Success(conn) => conn.success[ConfigError] 27 | case scala.util.Failure(e) => ConfigError(s"Failed to access Oracle @ $connStr, as user $user", Some(e)).failure[Connection] 28 | } 29 | // check all tables 30 | _ <- { 31 | val tableVs = requiredTables.map { 32 | tableName => 33 | Try { 34 | val ps = conn.prepareStatement("SELECT COUNT(1) FROM USER_TABLES WHERE LOWER(TABLE_NAME) = ?") 35 | ps.setString(1, tableName.toLowerCase) 36 | val rs = ps.executeQuery() 37 | rs.next() 38 | val tableCount = rs.getInt(1) 39 | if (tableCount == 0) 40 | throw new Exception(s"Failed to find table $tableName") 41 | else 42 | log.info(s"Validated Oracle table: $tableName") 43 | } match { 44 | case scala.util.Success(_) => ().success[ConfigError] 45 | case scala.util.Failure(e) => ConfigError(s"Failed to access Oracle table: $tableName", Some(e)).failure[Unit] 46 | } 47 | } 48 | // validate all tables (allowing for empty set) 49 | tableVs.foldLeft(().success[ConfigError]) { 50 | case (v1, v2) => v1 +++ v2 51 | } 52 | } 53 | // close the connection 54 | _ <- Try(conn.close()) match { 55 | case scala.util.Success(_) => 56 | log.info(s"Validated Oracle connection: $connStr as user: $user") 57 | ().success[ConfigError] 58 | case scala.util.Failure(e) => 59 | ConfigError(s"Failed to close the Oracle connection", Some(e)).failure[Unit] 60 | } 61 | } yield () 62 | } 63 | -------------------------------------------------------------------------------- /src/test/scala/spark_etl/parquet/WriteReadRoundtripSpec.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.parquet 2 | 3 | import java.io.File 4 | 5 | import org.apache.commons.io.FileUtils 6 | import org.apache.spark.sql.SparkSession 7 | import org.scalatest.{FlatSpec, Inside, Matchers} 8 | import spark_etl.CLI 9 | import spark_etl.model.{Extract, Load} 10 | import spark_etl.util.Files 11 | 12 | class WriteReadRoundtripSpec extends FlatSpec with Matchers with Inside { 13 | val root = Files.rootResource 14 | 15 | "Reader and Writer" should "roundtrip" in { 16 | testRoundtrip(None) 17 | } 18 | 19 | it should "roundtrip partitioned" in { 20 | testRoundtrip(Some(List("s"))) 21 | } 22 | 23 | "CLI" should "transform" in { 24 | // cleanup dir if exists 25 | FileUtils.deleteDirectory(new File(s"$root/parquet-roundtrip/x")) 26 | 27 | implicit val spark = SparkSession.builder.appName("test") 28 | .master("local[1]") 29 | .config("spark.ui.port", 4046).config("spark.history.ui.port", 18086) 30 | .getOrCreate 31 | try { 32 | import spark.implicits._ 33 | val in = Seq(ParquetTestLoad("a", 1), ParquetTestLoad("b", 2)) 34 | val dfIn = in.toDF() 35 | val writer = new ParquetLoadWriter(Map.empty) 36 | writer.write(Seq( 37 | (Load("x", "x", s"$root/parquet-roundtrip/x", Some(List("s"))), dfIn) 38 | )) 39 | } finally { 40 | spark.stop 41 | } 42 | 43 | CLI.main(Array(s"-Denv.path=$root/parquet-roundtrip", "--conf-uri", s"/parquet-roundtrip/app.yaml", 
"validate-remote")) 44 | } 45 | 46 | private def testRoundtrip(partitionBy: Option[List[String]]) = { 47 | // cleanup dir if exists 48 | FileUtils.deleteDirectory(new File(s"$root/x")) 49 | 50 | // run the test 51 | implicit val spark = SparkSession.builder.appName("test") 52 | .master("local[1]") 53 | .config("spark.ui.port", 4045).config("spark.history.ui.port", 18085) 54 | .getOrCreate 55 | try { 56 | import spark.implicits._ 57 | val in = Seq(ParquetTestLoad("a", 1), ParquetTestLoad("b", 2)) 58 | val dfIn = in.toDF() 59 | val writer = new ParquetLoadWriter(Map.empty) 60 | writer.write(Seq( 61 | (Load("x", "x", s"$root/x", partitionBy), dfIn) 62 | )) 63 | val reader = new ParquetExtractReader(Map.empty) 64 | val (_, dfOut) :: Nil = reader.read(Seq(Extract("x", s"$root/x"))) 65 | 66 | dfOut.as[ParquetTestLoad].collect() should contain allElementsOf(in) 67 | } finally { 68 | spark.stop 69 | } 70 | } 71 | } 72 | 73 | case class ParquetTestLoad(s: String, i: Int) 74 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/oracle/OracleLoadAppender.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.oracle 2 | 3 | import org.apache.log4j.Logger 4 | import org.apache.spark.sql.{DataFrame, SaveMode} 5 | import spark_etl.model.Load 6 | import spark_etl.util.Validation 7 | import spark_etl.util.Validation._ 8 | import spark_etl.{ConfigError, LoadWriter} 9 | 10 | import scala.util.Try 11 | 12 | /** 13 | * Sample Oracle appender. 14 | */ 15 | class OracleLoadAppender(params: Map[String, Any]) extends LoadWriter(params) { 16 | private val log = Logger.getLogger(getClass) 17 | 18 | override def write(loadsAndDfs: Seq[(Load, DataFrame)]): Unit = { 19 | val oracleUri = params("oracle_uri").toString 20 | val username = params("oracle_user").toString 21 | val password = params("oracle_password").toString 22 | val driver = params.get("oracle_driver").map(_.toString).getOrElse("oracle.jdbc.driver.OracleDriver") 23 | val batchSize = Try(params("oracle_batch").asInstanceOf[Int]).getOrElse(1000) 24 | val props = { 25 | val p = new java.util.Properties() 26 | p.setProperty("user", username) 27 | p.setProperty("password", password) 28 | p.setProperty("fetchsize", batchSize.toString) 29 | p.setProperty("batchsize", batchSize.toString) 30 | p.setProperty("isolationLevel", "READ_COMMITTED") 31 | p.setProperty("driver", driver) 32 | p 33 | } 34 | 35 | log.info(s"Load writing to $oracleUri, with jdbc driver: $driver") 36 | loadsAndDfs.foreach { 37 | case (Load(_, _, tableName, _), df) => 38 | df.write.mode(SaveMode.Append).jdbc(oracleUri, tableName, props) 39 | } 40 | } 41 | 42 | override def checkLocal(loads: Seq[Load]): Validation[ConfigError, Unit] = { 43 | merge(toVal[String]("oracle_uri"), 44 | toVal[String]("oracle_user"), 45 | toVal[String]("oracle_password"), 46 | toVal[Int]("oracle_batch")) { (_, _, _, _) => () } 47 | } 48 | 49 | override def checkRemote(loads: Seq[Load]): Validation[ConfigError, Unit] = { 50 | val oracleUri = params("oracle_uri").toString 51 | val username = params("oracle_user").toString 52 | val password = params("oracle_password").toString 53 | val requiredTables = loads.map(_.uri) 54 | OracleValidator.validateOracle(oracleUri, username, password, requiredTables) 55 | } 56 | 57 | private def toVal[T](key: String): Validation[ConfigError, T] = 58 | params.get(key) match { 59 | case Some(v) => 60 | Try(v.asInstanceOf[T]) match { 61 | case scala.util.Success(v2) => 
v2.success[ConfigError] 62 | case scala.util.Failure(_) => ConfigError(s"Invalid type of the env_var: $key").failure[T] 63 | } 64 | case None => 65 | ConfigError(s"Missing ${getClass.getSimpleName} param: $key").failure[T] 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/util/UriLoader.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.util 2 | 3 | import spark_etl.ConfigError 4 | import spark_etl.util.Validation._ 5 | 6 | import scala.io.Source 7 | 8 | object UriLoader { 9 | private val fileProtocol = "file:" 10 | private val resourceProtocol = "resource:" 11 | 12 | def load(uri: String, filePathRoot: String, env: Map[String, String]): Validation[ConfigError, String] = 13 | for { 14 | contents <- 15 | if (uri.startsWith(fileProtocol)) { 16 | val filePath = uri.substring(fileProtocol.length) 17 | val fqFilePath = if (filePath.startsWith("/")) 18 | filePath 19 | else if (filePathRoot.endsWith("/")) 20 | s"$filePathRoot$filePath" 21 | else 22 | s"$filePathRoot/$filePath" 23 | loadFile(fqFilePath, env) 24 | } else if (uri.startsWith(resourceProtocol)) 25 | loadResource(uri.substring(resourceProtocol.length), env) 26 | else 27 | loadResource(uri, env) 28 | withIncludes <- { 29 | // load #include 30 | val includePattern = "(?m)^\\s*#include\\s*<(.+)>.*$".r 31 | val includeUris = includePattern.findAllIn(contents).matchData.map(_.group(1)) 32 | if (includeUris.isEmpty) 33 | contents.success[ConfigError] 34 | else { 35 | val byUriIncludes = getIncludes(includeUris, filePathRoot, env) 36 | byUriIncludes.map(includes => includePattern.replaceAllIn(contents, m => includes(m.group(1)))) 37 | } 38 | } 39 | withEnvVars <- envVarSub(uri, withIncludes, env) 40 | } yield withEnvVars 41 | 42 | private def loadResource(uri: String, env: Map[String, String]): Validation[ConfigError, String] = { 43 | val fqUri = getClass.getResource(uri) 44 | if (fqUri != null) 45 | Source.fromURL(fqUri).mkString.success[ConfigError] 46 | else 47 | ConfigError(s"Failed to read resource $uri").failure[String] 48 | } 49 | 50 | private def loadFile(uri: String, env: Map[String, String]): Validation[ConfigError, String] = { 51 | val file = new java.io.File(uri) 52 | if (file.canRead) 53 | scala.io.Source.fromFile(file).mkString.success[ConfigError] 54 | else 55 | ConfigError(s"Failed to read file $uri").failure[String] 56 | } 57 | 58 | private def envVarSub(uri: String, contents: String, env: Map[String, String]): Validation[ConfigError, String] = { 59 | // replace all ${k}, with v, ensuring v can contain '$' 60 | val contents2 = env.foldLeft(contents) { case (soFar, (k, v)) => soFar.replaceAll("\\$\\{" + k + "\\}", v.replaceAll("\\$", "\\\\\\$")) } 61 | val remainingVars = "\\$\\{.*?\\}".r.findAllIn(contents2) 62 | if (remainingVars.isEmpty) 63 | contents2.success[ConfigError] 64 | else { 65 | val varNames = remainingVars.toList.distinct 66 | ConfigError(s"Unresolved env vars in $uri: ${varNames.mkString(", ")}").failure[String] 67 | } 68 | } 69 | 70 | private def getIncludes(uris: Iterator[String], filePathRoot: String, env: Map[String, String]): Validation[ConfigError, Map[String, String]] = { 71 | uris.map(uri => load(uri, filePathRoot, env).map(contents => Map(uri -> contents))).reduce(_ +++ _) 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/test/scala/spark_etl/oracle/OracleLoadAppenderSpec.scala: 
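A minimal sketch of the spec that the "FIXME: needs tests" in OracleValidator asks for, reusing the in-memory H2 stand-in that OracleLoadAppenderSpec (below) sets up; the class name, DB URI and table names here are illustrative only:

package spark_etl.oracle

import java.sql.DriverManager

import org.scalatest.{FlatSpec, Matchers}

class OracleValidatorSpec extends FlatSpec with Matchers {
  val dbUri = "jdbc:h2:mem:validatorDB;user=user;password=pwd"
  // keep one connection open so the in-memory DB outlives validateOracle's own open/close
  val conn = DriverManager.getConnection(dbUri)
  conn.createStatement().execute("CREATE TABLE user_tables(table_name VARCHAR(20))")
  conn.prepareStatement("INSERT INTO user_tables(table_name) values ('test_table')").executeUpdate()

  "OracleValidator" should "accept existing tables and reject missing ones" in {
    OracleValidator.validateOracle(dbUri, "user", "pwd", Seq("test_table")).isSuccess shouldBe true
    OracleValidator.validateOracle(dbUri, "user", "pwd", Seq("missing_table")).isSuccess shouldBe false
  }
}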
-------------------------------------------------------------------------------- 1 | package spark_etl.oracle 2 | 3 | import java.sql.DriverManager 4 | import java.util.Properties 5 | 6 | import org.apache.spark.sql.SparkSession 7 | import org.scalatest._ 8 | import spark_etl.model.Load 9 | 10 | class OracleLoadAppenderSpec extends FlatSpec with Matchers with Inside with BeforeAndAfterAll { 11 | val dbUri = "jdbc:h2:mem:testDB;user=user;password=pwd" 12 | val props = new Properties() 13 | val conn = DriverManager.getConnection(s"$dbUri;create=true", props) 14 | 15 | conn.createStatement().execute("CREATE TABLE user_tables(table_name VARCHAR(20))") 16 | conn.createStatement().execute("CREATE TABLE test_table(loadid VARCHAR(36), name VARCHAR(20), age INT)") 17 | conn.prepareStatement("INSERT INTO user_tables(table_name) values ('test_table')").executeUpdate() 18 | conn.commit() 19 | 20 | val loads = Seq(Load("test_load", "test_transform", "test_table")) 21 | 22 | "OracleLoadAppender" should "validate-local" in { 23 | new OracleLoadAppender(Map("oracle_uri" -> dbUri, "oracle_user" -> "user", "oracle_password" -> "pwd", "oracle_batch" -> 100)).checkLocal(loads).isSuccess shouldBe true 24 | new OracleLoadAppender(Map("oracle_user" -> "user", "oracle_password" -> "pwd", "oracle_batch" -> 100)).checkLocal(loads).isSuccess shouldBe false 25 | new OracleLoadAppender(Map("oracle_uri" -> dbUri, "oracle_password" -> "pwd", "oracle_batch" -> 100)).checkLocal(loads).isSuccess shouldBe false 26 | new OracleLoadAppender(Map("oracle_uri" -> dbUri, "oracle_user" -> "user", "oracle_batch" -> 100)).checkLocal(loads).isSuccess shouldBe false 27 | new OracleLoadAppender(Map("oracle_uri" -> dbUri, "oracle_user" -> "user", "oracle_password" -> "pwd")).checkLocal(loads).isSuccess shouldBe false 28 | } 29 | 30 | it should "validate-remote" in { 31 | new OracleLoadAppender(Map("oracle_uri" -> dbUri, "oracle_user" -> "user", "oracle_password" -> "pwd", "oracle_batch" -> 100)).checkRemote(loads).isSuccess shouldBe true 32 | new OracleLoadAppender(Map("oracle_uri" -> "jdbc:h2:mem:__bogus_db__", "oracle_user" -> "user", "oracle_password" -> "pwd", "oracle_batch" -> 100)).checkRemote(loads).isSuccess shouldBe false 33 | } 34 | 35 | it should "write" in { 36 | implicit val spark = SparkSession.builder.appName("test") 37 | .master("local[1]") 38 | .config("spark.ui.port", 4050).config("spark.history.ui.port", 18090) 39 | .getOrCreate 40 | try { 41 | import spark.implicits._ 42 | val inDF = Seq(PersonRow("Joe", 50), PersonRow("Jane", 23)).toDF() 43 | // mocking driver for db 44 | new OracleLoadAppender(Map("oracle_uri" -> dbUri, "oracle_user" -> "user", "oracle_password" -> "pwd", "oracle_driver" -> "org.h2.Driver")).write(Seq( 45 | (Load("test_load", "test_transform", "test_table"), inDF) 46 | )) 47 | } finally { 48 | spark.stop() 49 | } 50 | 51 | conn.commit() 52 | 53 | // validate load write 54 | val rs = conn.prepareStatement("SELECT count(1) FROM test_table").executeQuery() 55 | rs.next() 56 | rs.getInt(1) shouldBe 2 57 | } 58 | 59 | override def afterAll: Unit = conn.createStatement().execute("SHUTDOWN") 60 | } 61 | 62 | case class PersonRow(NAME: String, AGE: Int) // Note, fields need to be uppercase, otherwise it confuses spark.jdbc 63 | -------------------------------------------------------------------------------- /src/test/scala/spark_etl/util/UriLoaderSpec.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.util 2 | 3 | import org.scalatest.{FlatSpec, 
Inside, Matchers} 4 | import spark_etl.ConfigError 5 | import spark_etl.util.Validation._ 6 | 7 | class UriLoaderSpec extends FlatSpec with Matchers with Inside { 8 | "UriLoader" should "read resource without tokens" in { 9 | validateWithoutVars("/uri-loader/without_env_vars", "__ignore__") 10 | validateWithoutVars("resource:/uri-loader/without_env_vars", "__ignore__") 11 | } 12 | 13 | it should "read resource with tokens" in { 14 | validateWithVars("/uri-loader/with_env_vars", "__ignore__") 15 | validateWithVars("resource:/uri-loader/with_env_vars", "__ignore__") 16 | } 17 | 18 | it should "read file without tokens" in { 19 | val asFilename = getClass.getResource("/uri-loader/without_env_vars").toString 20 | asFilename should startWith("file:") 21 | validateWithoutVars(asFilename, "__ignore__") 22 | } 23 | 24 | it should "read file with tokens" in { 25 | val asFilename = getClass.getResource("/uri-loader/with_env_vars").toString 26 | asFilename should startWith("file:") 27 | validateWithVars(asFilename, "__ignore__") 28 | validateWithVars("file:uri-loader/with_env_vars", getClass.getResource("/").getFile) 29 | } 30 | 31 | it should "fail to read bogus files/resources" in { 32 | validateNotExists("/bogus_uri", "__ignore__") 33 | validateNotExists("/bogus_uri", Files.pwd) 34 | validateNotExists("resource:/bogus_uri", "__ignore__") 35 | validateNotExists("resource:/bogus_uri", Files.pwd) 36 | validateNotExists("file:/bogus_uri", "__ignore__") 37 | validateNotExists("file:/bogus_uri", Files.pwd) 38 | } 39 | 40 | it should "read #includes" in { 41 | val expected = 42 | """|=== 43 | |hello there 44 | |123 45 | | 46 | |--- 47 | |111 val1 222 val2 333 val1 444 48 | | 49 | |+++""".stripMargin 50 | UriLoader.load("/uri-loader/with_includes", "__ignore__", Map("var1" -> "val1", "var2" -> "val2")) shouldBe expected.success[ConfigError] 51 | } 52 | 53 | it should "read fail on bogus #include" in { 54 | inside(UriLoader.load("/uri-loader/with_bogus_includes", "__ignore__", Map("var1" -> "val1", "var2" -> "val2"))) { 55 | case Failure(err) => 56 | err.toList shouldBe List(ConfigError("Failed to read resource __bogus_include__")) 57 | } 58 | } 59 | 60 | private def validateWithoutVars(uri: String, fileRoot: String) = { 61 | UriLoader.load(uri, fileRoot, Map("var1" -> "val1")) shouldBe "hello there\n123\n".success[ConfigError] 62 | } 63 | 64 | private def validateWithVars(uri: String, fileRoot: String) = { 65 | UriLoader.load(uri, fileRoot, Map("var1" -> "val1", "var2" -> "val2")) shouldBe "111 val1 222 val2 333 val1 444\n".success[ConfigError] 66 | inside(UriLoader.load(uri, fileRoot, Map.empty)) { 67 | case Failure(err) => 68 | err.toList.length shouldBe 1 69 | err.head.msg should startWith("Unresolved env vars in") 70 | err.head.msg should endWith("${var1}, ${var2}") 71 | } 72 | } 73 | 74 | private def validateNotExists(uri: String, fileRoot: String) = { 75 | inside(UriLoader.load(uri, fileRoot, Map.empty)) { 76 | case Failure(err) => 77 | err.toList.length shouldBe 1 78 | err.head.msg should startWith("Failed to read") 79 | } 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/parquet/PathValidator.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.parquet 2 | 3 | import java.io.File 4 | 5 | import org.apache.hadoop.conf.Configuration 6 | import org.apache.hadoop.fs.{FileSystem, RemoteIterator} 7 | import spark_etl.ConfigError 8 | import spark_etl.util.Validation 9 | 
import spark_etl.util.Validation._ 10 | 11 | /** 12 | * Validate parquet paths on hdfs and local. 13 | * - validates path 14 | * - (optionally) validates path children 15 | * - expects children to be _temporary, _SUCCESS, ._SUCCESS.crc 16 | * - or if expecting partitions, children to contain "=" 17 | */ 18 | object PathValidator { 19 | def validate(checkChildren: Boolean, expectPartition: Boolean, paths: String*): Validation[ConfigError, List[String]] = { 20 | val validated = paths.map { p => 21 | if (p.toLowerCase.startsWith("hdfs://")) { 22 | val hadoopConf = new Configuration() 23 | val fs = org.apache.hadoop.fs.FileSystem.get(hadoopConf) 24 | val fsPath = new org.apache.hadoop.fs.Path(p) 25 | if (!fs.exists(fsPath)) 26 | ConfigError(s"hdfs path doesn't exist: $p").failure[String] 27 | else if (checkChildren) { 28 | val children = hdfsList(fs, fsPath) 29 | if (areValid(children, expectPartition)) 30 | p.success[ConfigError] 31 | else if (children.isEmpty) 32 | ConfigError(s"hdfs path is empty for $p").failure[String] 33 | else 34 | ConfigError(s"Unexpected hdfs children for $p: ${children.map(trimRoot(p)).mkString(", ")} ").failure[String] 35 | } 36 | else 37 | p.success[ConfigError] 38 | } else { 39 | val ioPath = new File(p) 40 | if (! ioPath.exists) 41 | ConfigError(s"Local path doesn't exist: $p").failure[String] 42 | else if (checkChildren) { 43 | val children = ioPath.list().toSeq 44 | if (areValid(children, expectPartition)) 45 | p.success[ConfigError] 46 | else if (children.isEmpty) 47 | ConfigError(s"Local path is empty for $p").failure[String] 48 | else 49 | ConfigError(s"Unexpected local children for $p: ${children.map(trimRoot(p)).mkString(", ")}").failure[String] 50 | } 51 | else 52 | p.success[ConfigError] 53 | } 54 | } 55 | val res = validated.map(_.map(List(_))).reduce(_ +++ _) 56 | res 57 | } 58 | 59 | def hdfsList(fs: FileSystem, parent: org.apache.hadoop.fs.Path): Seq[String] = { 60 | // as per: https://issues.apache.org/jira/browse/HDFS-7921 61 | // listing recursively, including all files, then filtering distinct immediate children 62 | val parentStr = parent.toUri.toString 63 | toIterator(fs.listFiles(parent, true)) 64 | .map { 65 | f => 66 | val descendant = f.getPath.toUri.toString 67 | val relativeDescendant = descendant.substring(parentStr.length + 1) 68 | val immediateChild = relativeDescendant.split("/").head 69 | s"$parent/$immediateChild" 70 | }.toSeq.distinct 71 | } 72 | 73 | private def trimRoot(root: String)(path: String): String = { 74 | val root2 = if (root.endsWith("/")) 75 | root 76 | else 77 | root + "/" 78 | if (path.startsWith(root2)) 79 | path.substring(root2.length) 80 | else 81 | path 82 | } 83 | 84 | private def toIterator[T](iter: RemoteIterator[T]) = 85 | new Iterator[T] { 86 | def hasNext = iter.hasNext 87 | def next = iter.next 88 | } 89 | 90 | private def areValid(paths: Seq[String], expectPartition: Boolean) = { 91 | lazy val hasValidChildren = paths.forall { 92 | path => 93 | val lastElem = path.split("/").last 94 | lastElem == "_temporary" || lastElem == "_SUCCESS" || lastElem == "._SUCCESS.crc" || (expectPartition && lastElem.contains("=")) 95 | } 96 | paths.nonEmpty && hasValidChildren 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | spark-etl 2 | ========== 3 | 4 | Tooling for configuration and SQL transform driven Spark ETLs. 
For a usage example, see [spark-etl-demo](https://github.com/konrads/spark-etl-demo). 5 | 6 | Build status (master): [![Build Status](https://travis-ci.org/konrads/spark-etl.svg?branch=master)](https://travis-ci.org/konrads/spark-etl) 7 | 8 | Philosophy 9 | ---------- 10 | This library facilitates productionizing configuration/SQL-driven Spark ETL pipelines. Emphasis is on: 11 | * configuration and SQLs treated as first-class citizens 12 | * build-time validation comprising syntactical checks of config and SQL, ensuring that SQL datasources map to configured `extract`s and `transform`s 13 | * run-time validation comprising verification of data source (`extract`) URIs and connectivity to the [LoadWriter](src/main/scala/spark_etl/LoadWriter.scala) 14 | * optional validation of `extract` datasources 15 | * optional validation of `transform` outputs (before `load` writing) 16 | * config and SQL parametrization via `${var}`-style variables, configured at runtime via `-Denv.var=value`. Some default env vars (e.g. `${yyyy-MM-dd-1d}`, `${utc-eod-1d}`) are supplied by [DefaultEnv](src/main/scala/spark_etl/util/DefaultEnv.scala) 17 | * CLI support for commands: `validate-local`, `validate-remote`, `extract-check`, `transform-check`, `transform-load` 18 | 19 | Sample setup 20 | ------------ 21 | Set up `src/main/resources/app.yaml`: 22 | ``` 23 | extracts: 24 | - name: client 25 | uri: "hdfs://${path}/client_2017" 26 | check: "/spark/extract-check/client.sql" 27 | cache: true 28 | - name: item 29 | uri: "hdfs://${path}/item_2017" 30 | - name: transaction 31 | uri: "hdfs://${path}/transaction_2017" 32 | 33 | transforms: 34 | - name: client_spending 35 | sql: "/spark/transform/client_spending.sql" 36 | - name: item_purchase 37 | sql: "/spark/transform/item_purchase.sql" 38 | - name: minor_purchase 39 | check: "/spark/transform-check/minor_purchase.sql" 40 | sql: "/spark/transform/minor_purchase.sql" 41 | cache: true 42 | 43 | loads: 44 | - name: client_spending_out 45 | source: client_spending 46 | uri: "hdfs://out/client_spending" 47 | partition_by: ["col1", "col2"] 48 | - name: item_purchase_out 49 | source: item_purchase 50 | uri: "hdfs://out/item_purchase" 51 | - name: minor_purchase_out 52 | source: minor_purchase 53 | uri: "hdfs://out/minor_purchase" 54 | 55 | load_writer: 56 | class: "spark_etl.JdbcLoadWriter" 57 | params: 58 | jdbc_uri: ${jdbc_uri} 59 | jdbc_user: ${jdbc_user} 60 | jdbc_password: ${jdbc_password} 61 | ``` 62 | 63 | Set up your SQLs as per the layout below. All SQLs are `SELECT` statements: `transform`s produce potentially sizable `DataFrame`s to be persisted as `load`s, while `extract-check` and `transform-check` produce smaller `DataFrame`s, which are logged out for visual inspection: 64 | ``` 65 | src -+ 66 | | 67 | +- spark 68 | | 69 | +- extract-check 70 | | | 71 | | +- client.sql # NOTE: optional extract validation! 72 | | 73 | +- transform 74 | | | 75 | | +- client_spending.sql 76 | | | 77 | | +- item_purchase.sql 78 | | | 79 | | +- minor_purchase.sql 80 | | 81 | +- transform-check 82 | | 83 | +- minor_purchase.sql # NOTE: optional transform validation! 84 | ``` 85 | 86 | Generate lineage in dot format: 87 | ``` 88 | sbt "run-main spark_etl.CLI -Denv.path=some_path lineage-dot" 89 | ``` 90 | 91 | Validate local config/SQLs. The suggested use is to run this as part of the build, with a validation failure stopping the build: 92 | ``` 93 | sbt "run-main spark_etl.CLI -Denv.path=some_path validate-local" 94 | ``` 95 | 96 | Deploy to the cluster, with read access to `hdfs://some_path` and write access to `hdfs://out`.
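All of the commands above are handled by `spark_etl.CLI`, which can also be invoked programmatically, e.g. from a test (the bundled parquet roundtrip spec drives the CLI this way); a minimal sketch, with illustrative arguments (`--conf-uri` defaults to `/app.yaml`):
```
import spark_etl.CLI

// programmatic equivalent of: sbt "run-main spark_etl.CLI -Denv.path=some_path validate-local"
CLI.main(Array("-Denv.path=some_path", "--conf-uri", "/app.yaml", "validate-local"))
```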
If using yarn, utilize: [run.sh](src/main/resources/run.sh) 97 | ``` 98 | run.sh -Denv.path=some_path validate-remote 99 | ``` 100 | 101 | Run extract and transform validations on the cluster. The following will fail *only* if any of the return set rows contains a `false`: 102 | ``` 103 | run.sh -Denv.path=some_path extract-check 104 | run.sh -Denv.path=some_path transform-check 105 | ``` 106 | 107 | Run transformation and persist loads: 108 | ``` 109 | run.sh -Denv.path=some_path transform-load 110 | ``` 111 | 112 | If env `PACKAGE_LOGS=true`, `run.sh`'s cluster operations (`transform-load`, `extract-check`, `transform-check`) capture both driver and yarn logs under `logs/$app_id/logs_$app_id.zip`. 113 | -------------------------------------------------------------------------------- /src/main/resources/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # configurable env vars 4 | RUN_DIR=${RUN_DIR:-.} 5 | MAIN_CLASS=${MAIN_CLASS:-spark_etl.CLI} 6 | HADOOP_VSN=${HADOOP_VSN:-2.7.3} 7 | SPARK_JARS=${SPARK_JARS:-/opt/spark/spark-2.1.0-bin-hadoop2.7/jars} 8 | HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-/etc/hadoop/conf} 9 | PACKAGE_LOGS=${PACKAGE_LOGS:-false} 10 | 11 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 12 | cd $DIR 13 | 14 | OTHER_CP=$SPARK_JARS/hadoop-hdfs-${HADOOP_VSN}.jar:$SPARK_JARS/hadoop-common-${HADOOP_VSN}.jar:$HADOOP_CONF_DIR 15 | SPARK_STORAGE_LEVEL=MEMORY_AND_DISK_SER_2 16 | SPARK_NUM_EXECUTORS=250 17 | SPARK_EXECUTOR_MEMORY=7G 18 | SPARK_EXECUTOR_CORES=2 19 | SPARK_HOME=/opt/spark/spark-2.1.0-bin-hadoop2.7 20 | #export YARN_CONF_DIR=$HADOOP_CONF_DIR 21 | JAR=$(ls $RUN_DIR/*-assembly*.jar) 22 | RT=$(date --date="6:00 today" --iso-8601=seconds | cut -f1 -d'+') 23 | TIMESTAMP="${RT}Australia/Sydney" 24 | 25 | CMD="$SPARK_HOME/bin/spark-submit \ 26 | --conf spark.debug.maxToStringFields=1024 \ 27 | --conf spark.driver.extraJavaOptions='-XX:PermSize=512m -XX:MaxPermSize=512m' \ 28 | --conf spark.yarn.maxAppAttempts=1 \ 29 | --conf spark.yarn.max.executor.failures=200 \ 30 | --conf spark.driver.memory=7G \ 31 | --conf spark.driver.maxResultSize=4G \ 32 | --conf spark.sql.warehouse.dir=hdfs://nameservice1/user/hive/warehouse \ 33 | --conf spark.locality.wait=0 \ 34 | --conf spark.io.compression.codec=org.apache.spark.io.SnappyCompressionCodec \ 35 | --conf spark.rdd.compress=true \ 36 | --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ 37 | --conf spark.sql.parquet.compression.codec=snappy \ 38 | --conf spark.sql.inMemoryColumnarStorage.compressed=true \ 39 | --conf spark.sql.inMemoryColumnarStorage.batchSize=100000 \ 40 | --conf spark.sql.crossJoin.enabled=true \ 41 | --conf spark.task.maxFailures=20 \ 42 | --master yarn \ 43 | --deploy-mode cluster \ 44 | --num-executors $SPARK_NUM_EXECUTORS \ 45 | --executor-memory $SPARK_EXECUTOR_MEMORY \ 46 | --executor-cores $SPARK_EXECUTOR_CORES \ 47 | --class $MAIN_CLASS \ 48 | $JAR" 49 | 50 | green="\033[32m" 51 | red="\033[31m" 52 | bold="\033[1m" 53 | reset="\033[0m" 54 | 55 | log_green() { 56 | echo -e "${green}$@${reset}" 57 | } 58 | 59 | log_bold() { 60 | echo -e "${bold}$@${reset}" 61 | } 62 | 63 | log_red() { 64 | echo -e "${red}$@${reset}" 65 | } 66 | 67 | check_package_logs() { 68 | if [ "$PACKAGE_LOGS" == "true" ] 69 | then 70 | log_bold "...Log packaging enabled" 71 | else 72 | log_bold "...Log packaging disabled" 73 | fi 74 | } 75 | 76 | package_logs() { 77 | if [ "$PACKAGE_LOGS" == "true" ] 78 | then 79 | local_log_file=$1 80 | 
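    # extract the YARN application id (application_XXX_YYY) from the locally captured spark-submit output,
    # so the matching cluster logs can be fetched via `yarn logs` and zipped alongside the local log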
app_id=`cat $local_log_file | grep 'Submitting application application_[0-9]*_[0-9]*' | sed -r 's/.*(application_[0-9]*_[0-9]*).*/\1/g'` 81 | if [ -z "$app_id" ] 82 | then 83 | log_red "No application_XXX_YYY found in local log $local_log_file!" 84 | exit 11 85 | else 86 | log_bold "Packaging logs for $app_id after a 10 sec sleep" 87 | sleep 10 88 | mkdir -p logs/$app_id 89 | rm -rf logs/current 90 | ln -s $app_id logs/current 91 | cp $local_log_file logs/$app_id/$app_id.local.log 92 | yarn logs --applicationId $app_id > logs/$app_id/$app_id.remote.log 93 | cd logs/$app_id 94 | zip logs_$app_id.zip *.log 95 | cd ../.. 96 | log_bold "Logs available at logs/$app_id/logs_$app_id.zip" 97 | fi 98 | fi 99 | } 100 | 101 | usage() { 102 | log_bold " Usage:" 103 | log_bold " help" 104 | log_bold " validate-local" 105 | log_bold " validate-remote" 106 | log_bold " transform" 107 | log_bold " extract-check" 108 | log_bold " transform-check" 109 | exit 1 110 | } 111 | 112 | set -e 113 | trail_arg="${@: -1}" 114 | if [[ $# -lt 1 || "$trail_arg" == "help" ]]; then usage; fi 115 | case "$trail_arg" in 116 | "validate-local") 117 | log_bold "Validating local configuration..." 118 | java -cp $JAR:$OTHER_CP $MAIN_CLASS $@ 119 | ;; 120 | "validate-remote") 121 | log_bold "Validating remote aspects..." 122 | java -cp $JAR:$OTHER_CP $MAIN_CLASS $@ 123 | ;; 124 | "transform") 125 | log_bold "Run and persist transform..." 126 | check_package_logs 127 | YARN_CONF_DIR=$HADOOP_CONF_DIR eval $CMD $@ 2>&1 | tee .current_local.log 128 | package_logs .current_local.log 129 | ;; 130 | "extract-check") 131 | log_bold "Run extract check..." 132 | check_package_logs 133 | YARN_CONF_DIR=$HADOOP_CONF_DIR eval $CMD $@ 2>&1 | tee .current_local.log 134 | package_logs .current_local.log 135 | ;; 136 | "transform-check") 137 | log_bold "Run transform check..." 138 | check_package_logs 139 | YARN_CONF_DIR=$HADOOP_CONF_DIR eval $CMD $@ 2>&1 | tee .current_local.log 140 | package_logs .current_local.log 141 | ;; 142 | *) 143 | log_red "Not a valid command: $@" 144 | usage 145 | ;; 146 | esac 147 | log_green "$trail_arg done!" 148 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/util/DepTree.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.util 2 | 3 | import scala.annotation.tailrec 4 | 5 | /** 6 | * Constructs dependency graph consisting of Vertices and Edges: 7 | * - Vertex has id and VertexType 8 | * - VertexType classification: 9 | * 10 | * VertexType 11 | * | 12 | * +-----------+----------+ 13 | * | | 14 | * RootType OtherType 15 | * +----+-------+ +-+-+--+----+ 16 | * | | | | | | 17 | * L LCheck ECheck E T Dangling 18 | * 19 | * - Edge has source and target Vertices 20 | * 21 | * Process: 22 | * - go through all E|ECheck|T|TCheck|L 23 | * - add all Edges, by first looking up all the Vertices, then linking them. If Vertex doesn't exist, mark it as Dangling 24 | * 25 | * Note: Order of Vertices and Edges is preserved by the LinkedHashSet 26 | */ 27 | class DepTree(knownVertices : Seq[Vertex]) { 28 | private val vertices = collection.mutable.LinkedHashSet[Vertex](knownVertices:_*) 29 | private val edges = collection.mutable.LinkedHashSet.empty[Edge] 30 | private val allTypes = Set(E, T, L, Echeck, Tcheck, Dangling) 31 | private val nonDangling = allTypes - Dangling 32 | 33 | /** 34 | * Add Edge, fetching from either a list of known, or marking it as Dangling. 
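   * e.g. addEdge("client", Vertex("client_spending", T)) records that transform "client_spending"
   * reads from "client"; if "client" is not among the known vertices it is registered as Dangling.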
35 | */ 36 | def addEdge(sourceId: String, target: Vertex, isExplicit: Boolean = false): Unit = { 37 | val source = vertices 38 | .collectFirst { case v @ Vertex(id, _:OtherType) if sourceId == id => v } 39 | .getOrElse(Vertex(sourceId, Dangling)) 40 | vertices += source // add if dangling 41 | edges += Edge(source, target, isExplicit) 42 | } 43 | 44 | def dangling: Seq[Edge] = 45 | edges.filter(_.source.`type` == Dangling).toSeq 46 | 47 | /** 48 | * Walk down the graph from root objects, collect all encountered Vertices. 49 | */ 50 | def rootfull(types: Set[VertexType] = nonDangling): Seq[Vertex] = { 51 | val vs = collect(vertices.collect { case v @ Vertex(_, _: RootType) => v }.toSeq) 52 | val orderedVs = vertices.intersect(vs.toSet).toSeq // re-order as per original 53 | orderedVs.filter(v => types.contains(v.`type`)) 54 | } 55 | 56 | /** 57 | * All vertices -- rootfull 58 | */ 59 | def rootless: Seq[Vertex] = 60 | (vertices -- rootfull()).toSeq 61 | 62 | def forType(`type`: VertexType): Seq[Vertex] = 63 | rootfull().filter(_.`type` == `type`) 64 | 65 | def asDot(name: String = "Lineage", fontSize: Int = 12): String = { 66 | val plottableVertices = rootfull(Set(E, T, L)) 67 | val plottableEdges = edges.filter(e => plottableVertices.contains(e.source) && plottableVertices.contains(e.target)) 68 | val edgeStrs = plottableEdges.collect { 69 | case Edge(srcV, targetV, false) => 70 | s"${srcV.id} -> ${targetV.id} [style=dotted]" 71 | case Edge(srcV, targetV, true) => 72 | s"${srcV.id} -> ${targetV.id}" 73 | } 74 | val verticeStrs = plottableVertices.collect { 75 | case Vertex(id, E) => s"$id" 76 | case Vertex(id, T) => s"""$id [shape=component]""" 77 | case Vertex(id, L) => s"""$id [shape=cylinder]""" 78 | } 79 | val rankStrs = plottableVertices.groupBy(_.`type`).flatMap { 80 | case (E, vs) => Seq(s"""{ rank=same; ${vs.map(_.id).mkString(" ")} }""") 81 | case (L, vs) => Seq(s"""{ rank=same; ${vs.map(_.id).mkString(" ")} }""") 82 | case _ => Nil 83 | } 84 | s"""digraph $name { 85 | | rankdir=LR 86 | | node [fontsize=$fontSize] 87 | | 88 | | # vertices 89 | | ${verticeStrs.mkString("\n ")} 90 | | 91 | | # edges 92 | | ${edgeStrs.mkString("\n ")} 93 | | 94 | | # ranks 95 | | ${rankStrs.toList.sorted.mkString("\n ")} 96 | |}""".stripMargin 97 | } 98 | 99 | @tailrec 100 | private def collect( 101 | roots: Seq[Vertex], 102 | types: Set[VertexType] = nonDangling, 103 | soFar: Seq[Vertex] = Nil): Seq[Vertex] = { 104 | val sources = roots.flatMap(v => edges.collect { 105 | case e if e.target == v && types.contains(e.source.`type`) && ! 
soFar.contains(e.source) => e.source 106 | }) 107 | val soFar2 = (soFar ++ roots).distinct 108 | if (sources.nonEmpty) 109 | collect(sources, types, soFar2) 110 | else 111 | soFar2 112 | } 113 | } 114 | 115 | case class Vertex(id: String, `type`: VertexType) 116 | case class Edge(source: Vertex, target: Vertex, isExplicit: Boolean) 117 | 118 | sealed trait VertexType { def asStr: String } 119 | sealed trait RootType extends VertexType 120 | sealed trait OtherType extends VertexType 121 | object E extends OtherType { override def asStr = "extract" } 122 | object T extends OtherType { override def asStr = "transform" } 123 | object L extends RootType { override def asStr = "load" } 124 | object Echeck extends RootType { override def asStr = "extract-check" } 125 | object Tcheck extends RootType { override def asStr = "transform-check" } 126 | object Dangling extends OtherType { override def asStr = "dangling" } 127 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/CLI.scala: -------------------------------------------------------------------------------- 1 | package spark_etl 2 | 3 | import java.io.{File, PrintWriter, StringWriter} 4 | 5 | import org.apache.log4j.Logger 6 | import org.apache.spark.sql._ 7 | import org.joda.time.DateTime 8 | import org.rogach.scallop._ 9 | import spark_etl.util.{DefaultEnv, Failure, Files, Success} 10 | 11 | import scala.util.Random 12 | 13 | object CLI { 14 | val log = Logger.getLogger(getClass) 15 | 16 | sealed trait CliCommand 17 | object LineageDot extends CliCommand 18 | object ValidateLocal extends CliCommand 19 | object ValidateRemote extends CliCommand 20 | object TransformLoad extends CliCommand 21 | object ExtractCheck extends CliCommand 22 | object TransformCheck extends CliCommand 23 | object StripPrefixes extends CliCommand 24 | object CliCommand { 25 | implicit val cliCommandConverter = singleArgConverter[CliCommand] { 26 | case "lineage-dot" => LineageDot 27 | case "validate-local" => ValidateLocal 28 | case "validate-remote" => ValidateRemote 29 | case "transform-load" => TransformLoad 30 | case "extract-check" => ExtractCheck 31 | case "transform-check" => TransformCheck 32 | case "strip-prefixes" => StripPrefixes 33 | } 34 | } 35 | 36 | val className = getClass.getSimpleName 37 | class CliConf(args: Seq[String]) extends ScallopConf(args) { 38 | banner(s"""Usage: $className [OPTIONS] (all options required unless otherwise indicated)\n\tOptions:""") 39 | val extraProps = props[String]() 40 | val confUri = opt[String](name = "conf-uri", descr = "configuration resource uri", default = Some("/app.yaml")) 41 | val lineageFile = opt[String](name = "lineage-file", descr = "target lineage dot file", default = Some("lineage.dot")) 42 | val baSqlDir = opt[File](name = "ba-sql-dir", descr = "dir with BA sql", default = Some(new File("src/main/resources/spark"))) 43 | val devSqlDir = opt[File](name = "dev-sql-dir", descr = "dir with DEV sql", default = Some(new File("src/main/resources/spark"))) 44 | val rmDevSqlDir = opt[Boolean](name = "rm-dev-sql-dir", descr = "should remove dev sql dir?", default = Some(false)) 45 | val count = toggle(name = "count", descrYes = "enable transform counts", default = Some(false)) 46 | val command = trailArg[CliCommand](name = "command", descr = "command") 47 | verify() 48 | } 49 | 50 | type ErrorHandler = List[ConfigError] => Unit 51 | 52 | def main(args: Array[String]): Unit = { 53 | val conf = new CliConf(args) 54 | main(conf.command(), conf.confUri(), 
conf.extraProps, conf.count(), conf.lineageFile(), conf.baSqlDir(), conf.devSqlDir(), conf.rmDevSqlDir()) 55 | } 56 | 57 | def main(command: CliCommand, confUri: String, extraProps: Map[String, String], shouldCount: Boolean, lineageFile: String, baSqlDir: File, devSqlDir: File, rmDevSqlDir: Boolean, errorHandler: ErrorHandler = die): Unit = { 58 | def createSpark(name: String, props: Map[String, String], isMaster: Boolean): SparkSession = { 59 | val builder = if (isMaster) 60 | SparkSession.builder.appName(name).master("local[1]").config("spark.ui.port", random(4041, 4999)).config("spark.history.ui.port", random(18080, 19000)) 61 | else 62 | SparkSession.builder.appName(name) 63 | if (isMaster) 64 | props.collect { case (k, v) if k.startsWith("spark.") => builder.config(k, v) } 65 | builder.getOrCreate 66 | } 67 | 68 | val pwd = Files.pwd 69 | val defaultEnv = DefaultEnv.getAll(DateTime.now) 70 | val paramEnv = extraProps.collect { case (k, v) if k.startsWith("env.") => k.substring("env.".length) -> v } 71 | val env = defaultEnv ++ paramEnv 72 | val res = command match { 73 | case LineageDot => 74 | CLIOps.dotLineage(confUri, pwd, env, lineageFile) 75 | case ValidateLocal => 76 | CLIOps.validateLocal(confUri, pwd, env) 77 | case ValidateRemote => 78 | implicit val spark = createSpark(className, extraProps, true) 79 | try { 80 | CLIOps.validateRemote(confUri, pwd, env) 81 | } finally { 82 | spark.stop() 83 | } 84 | case TransformLoad => 85 | implicit val spark = createSpark(className, extraProps, false) 86 | try { 87 | CLIOps.transformAndLoad(confUri, pwd, env, extraProps, shouldCount) 88 | } finally { 89 | spark.stop() 90 | } 91 | case ExtractCheck => 92 | implicit val spark = createSpark(className, extraProps, false) 93 | try { 94 | CLIOps.extractCheck(confUri, pwd, env) 95 | } finally { 96 | spark.stop() 97 | } 98 | case TransformCheck => 99 | implicit val spark = createSpark(className, extraProps, false) 100 | try { 101 | CLIOps.transformCheck(confUri, pwd, env, shouldCount) 102 | } finally { 103 | spark.stop() 104 | } 105 | case StripPrefixes => 106 | CLIOps.stripPrefixes(baSqlDir, devSqlDir, rmDevSqlDir) 107 | } 108 | 109 | res match { 110 | case Success(_) => 111 | log.info("Success!") 112 | case Failure(errors) => 113 | val errorStr = errors.map(e => e.exc.map(exc => s"• ${e.msg}, exception: $exc\n${stacktrace(exc)}").getOrElse(s"• ${e.msg}")).toList.mkString("\n") 114 | log.error(s"Failed due to:\n$errorStr") 115 | errorHandler(errors.toList) 116 | } 117 | } 118 | 119 | private def die(errors: List[ConfigError]): Unit = 120 | System.exit(1) 121 | 122 | private def stacktrace(t: Throwable) = { 123 | val w = new StringWriter 124 | t.printStackTrace(new PrintWriter(w)) 125 | w.toString 126 | } 127 | 128 | private val rand = new Random(System.currentTimeMillis) 129 | private def random(min: Int, max: Int) = min + rand.nextInt(max - min) 130 | } 131 | -------------------------------------------------------------------------------- /src/test/scala/spark_etl/util/ValidationSpec.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.util 2 | 3 | import org.scalatest.{FlatSpec, Inside, Matchers} 4 | import spark_etl.util.Validation._ 5 | 6 | class ValidationSpec extends FlatSpec with Matchers with Inside { 7 | val unitSuccesses = (0 to 9).map(_ => ().success[String]) 8 | val errorStrs = (0 to 9).map(i => s"error:$i") 9 | val failures = errorStrs.map(i => i.failure[Unit]) 10 | 11 | "Validation" should "merge up to 10 successes" in { 12 | 
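    // +++ combines two validations (successes merge via a ValidationMerger, failures accumulate all errors);
    // merge(...) does the same for up to 10 validations at once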
(2 to 9).foreach(i => unitSuccesses.take(i).reduce(_ +++ _) shouldBe Success(())) 13 | merge("00".success[String], 11.success[String]) { 14 | (r0, r1) => s"$r0:$r1" 15 | } shouldBe Success("00:11") 16 | merge("00".success[String], 11.success[String], "22".success[String]) { 17 | (r0, r1, r2) => s"$r0:$r1:$r2" 18 | } shouldBe Success("00:11:22") 19 | merge("00".success[String], 11.success[String], "22".success[String], 33.success[String]) { 20 | (r0, r1, r2, r3) => s"$r0:$r1:$r2:$r3" 21 | } shouldBe Success("00:11:22:33") 22 | merge("00".success[String], 11.success[String], "22".success[String], 33.success[String], "44".success[String]) { 23 | (r0, r1, r2, r3, r4) => s"$r0:$r1:$r2:$r3:$r4" 24 | } shouldBe Success("00:11:22:33:44") 25 | merge("00".success[String], 11.success[String], "22".success[String], 33.success[String], "44".success[String], 55.success[String]) { 26 | (r0, r1, r2, r3, r4, r5) => s"$r0:$r1:$r2:$r3:$r4:$r5" 27 | } shouldBe Success("00:11:22:33:44:55") 28 | merge("00".success[String], 11.success[String], "22".success[String], 33.success[String], "44".success[String], 55.success[String], "66".success[String]) { 29 | (r0, r1, r2, r3, r4, r5, r6) => s"$r0:$r1:$r2:$r3:$r4:$r5:$r6" 30 | } shouldBe Success("00:11:22:33:44:55:66") 31 | merge("00".success[String], 11.success[String], "22".success[String], 33.success[String], "44".success[String], 55.success[String], "66".success[String], 77.success[String]) { 32 | (r0, r1, r2, r3, r4, r5, r6, r7) => s"$r0:$r1:$r2:$r3:$r4:$r5:$r6:$r7" 33 | } shouldBe Success("00:11:22:33:44:55:66:77") 34 | merge("00".success[String], 11.success[String], "22".success[String], 33.success[String], "44".success[String], 55.success[String], "66".success[String], 77.success[String], "88".success[String]) { 35 | (r0, r1, r2, r3, r4, r5, r6, r7, r8) => s"$r0:$r1:$r2:$r3:$r4:$r5:$r6:$r7:$r8" 36 | } shouldBe Success("00:11:22:33:44:55:66:77:88") 37 | merge("00".success[String], 11.success[String], "22".success[String], 33.success[String], "44".success[String], 55.success[String], "66".success[String], 77.success[String], "88".success[String], 99.success[String]) { 38 | (r0, r1, r2, r3, r4, r5, r6, r7, r8, r9) => s"$r0:$r1:$r2:$r3:$r4:$r5:$r6:$r7:$r8:$r9" 39 | } shouldBe Success("00:11:22:33:44:55:66:77:88:99") 40 | } 41 | 42 | it should "merge up to 10 failures" in { 43 | (2 to 9).foreach(i => failures.take(i).reduce(_ +++ _) shouldBe Failure(errorStrs.take(i))) 44 | merge("00".success[String], "err".failure[String]) { 45 | (r0, r1) => s"$r0:$r1" 46 | } shouldBe Failure(List("err")) 47 | merge("00".success[String], 11.success[String], "err".failure[String]) { 48 | (r0, r1, r2) => s"$r0:$r1:$r2" 49 | } shouldBe Failure(List("err")) 50 | merge("00".success[String], 11.success[String], "22".success[String], "err".failure[String]) { 51 | (r0, r1, r2, r3) => s"$r0:$r1:$r2:$r3" 52 | } shouldBe Failure(List("err")) 53 | merge("00".success[String], 11.success[String], "22".success[String], 33.success[String], "err".failure[String]) { 54 | (r0, r1, r2, r3, r4) => s"$r0:$r1:$r2:$r3:$r4" 55 | } shouldBe Failure(List("err")) 56 | merge("00".success[String], 11.success[String], "22".success[String], 33.success[String], "44".success[String], "err".failure[String]) { 57 | (r0, r1, r2, r3, r4, r5) => s"$r0:$r1:$r2:$r3:$r4:$r5" 58 | } shouldBe Failure(List("err")) 59 | merge("00".success[String], 11.success[String], "22".success[String], 33.success[String], "44".success[String], 55.success[String], "err".failure[String]) { 60 | (r0, r1, r2, r3, r4, r5, r6) => 
s"$r0:$r1:$r2:$r3:$r4:$r5:$r6" 61 | } shouldBe Failure(List("err")) 62 | merge("00".success[String], 11.success[String], "22".success[String], 33.success[String], "44".success[String], 55.success[String], "66".success[String], "err".failure[String]) { 63 | (r0, r1, r2, r3, r4, r5, r6, r7) => s"$r0:$r1:$r2:$r3:$r4:$r5:$r6:$r7" 64 | } shouldBe Failure(List("err")) 65 | merge("00".success[String], 11.success[String], "22".success[String], 33.success[String], "44".success[String], 55.success[String], "66".success[String], 77.success[String], "err".failure[String]) { 66 | (r0, r1, r2, r3, r4, r5, r6, r7, r8) => s"$r0:$r1:$r2:$r3:$r4:$r5:$r6:$r7:$r8" 67 | } shouldBe Failure(List("err")) 68 | merge("00".success[String], 11.success[String], "22".success[String], 33.success[String], "44".success[String], 55.success[String], "66".success[String], 77.success[String], "88".success[String], "err".failure[String]) { 69 | (r0, r1, r2, r3, r4, r5, r6, r7, r8, r9) => s"$r0:$r1:$r2:$r3:$r4:$r5:$r6:$r7:$r8:$r9" 70 | } shouldBe Failure(List("err")) 71 | } 72 | 73 | it should "merge with success: unit, list, seq, set, map" in { 74 | ().success[String] +++ ().success[String] shouldBe Success(()) 75 | List(1, 2).success[String] +++ List(1, 3).success[String] shouldBe Success(List(1, 2, 1, 3)) 76 | Seq(1, 2).success[String] +++ Seq(1, 3).success[String] shouldBe Success(Seq(1, 2, 1, 3)) 77 | Set(1, 2).success[String] +++ Set(1, 3).success[String] shouldBe Success(Set(1, 2, 3)) 78 | Map("a" -> 1, "b" -> 2).success[String] +++ Map("a" -> 11, "c" -> 33).success[String] shouldBe Success(Map("a" -> 11, "b" -> 2, "c" -> 33)) 79 | } 80 | 81 | it should "map" in { 82 | 11.success[Int].map(_ * 2) shouldBe Success(22) 83 | "err".failure[Int].map(_ * 2) shouldBe Failure(List("err")) 84 | } 85 | 86 | it should "flatMap" in { 87 | (for { 88 | x1 <- 11.success[Int] 89 | x2 <- 22.success[Int] 90 | } yield x1 + x2) shouldBe Success(33) 91 | (for { 92 | x1 <- "err".failure[Int] 93 | x2 <- 22.success[String] 94 | } yield x1 + x2) shouldBe Failure(List("err")) 95 | } 96 | 97 | it should "foldl" in { 98 | 11.success[Double].foldl( 99 | { 100 | _ => false 101 | }, 102 | { 103 | _ => true 104 | } 105 | ) shouldBe true 106 | 107 | (-99.5).failure[Int].foldl( 108 | { 109 | _ => false 110 | }, 111 | { 112 | _ => true 113 | } 114 | ) shouldBe false 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/util/Validation.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.util 2 | 3 | /** 4 | * Limited scalaz's Validation. Supports map(), flatMap() and +++ (eg. 
for reduce()) 5 | */ 6 | trait Validation[Err, S] { 7 | def +++[S2, Out](v: Validation[Err, S2])(implicit merger: ValidationMerger[S, S2, Out]): Validation[Err, Out] = { 8 | (this, v) match { 9 | case (Failure(ls1), Failure(ls2)) => Failure(ls1 ++ ls2) 10 | case (Failure(ls), Success(_)) => Failure(ls) 11 | case (Success(_), Failure(ls)) => Failure(ls) 12 | case (Success(r1), Success(r2)) => Success(merger.merge(r1, r2)) 13 | } 14 | } 15 | 16 | def map[Out](f: S => Out): Validation[Err, Out] = this match { 17 | case Success(r) => Success(f(r)) 18 | case Failure(ls) => Failure(ls) 19 | } 20 | 21 | def flatMap[Out](f: S => Validation[Err, Out]): Validation[Err, Out] = this match { 22 | case Success(r) => f(r) 23 | case Failure(ls) => Failure(ls) 24 | } 25 | 26 | def foldl[Out](failure: Seq[Err] => Out, success: S => Out): Out = this match { 27 | case Failure(err) => failure(err) 28 | case Success(s) => success(s) 29 | } 30 | 31 | def isSuccess: Boolean 32 | 33 | protected def errs: Seq[Err] 34 | } 35 | 36 | object Validation { 37 | implicit val unitMapper = new ValidationMerger[Unit, Unit, Unit] { def merge(in1: Unit, in2: Unit) = () } 38 | implicit def listMerger[X] = new ValidationMerger[List[X], List[X], List[X]] { def merge(in1: List[X], in2: List[X]) = in1 ++ in2 } 39 | implicit def seqMerger[X] = new ValidationMerger[Seq[X], Seq[X], Seq[X]] { def merge(in1: Seq[X], in2: Seq[X]) = in1 ++ in2 } 40 | implicit def setMerger[X] = new ValidationMerger[Set[X], Set[X], Set[X]] { def merge(in1: Set[X], in2: Set[X]) = in1 ++ in2 } 41 | implicit def mapMerger[K, V] = new ValidationMerger[Map[K, V], Map[K, V], Map[K, V]] { def merge(in1: Map[K, V], in2: Map[K, V]) = in1 ++ in2 } 42 | 43 | import scala.language.implicitConversions 44 | implicit def validationOps[A](a: A): ValidationOps[A] = new ValidationOps(a) 45 | 46 | def merge[Err, S0, S1, Out]( 47 | v0: Validation[Err, S0], 48 | v1: Validation[Err, S1])(map: (S0, S1) => Out): Validation[Err, Out] = 49 | (v0, v1) match { 50 | case (Success(s0), Success(s1)) => 51 | val out = map(s0, s1) 52 | Success[Err, Out](out) 53 | case _ => 54 | Failure(v0.errs ++ v1.errs) 55 | } 56 | 57 | def merge[Err, S0, S1, S2, Out]( 58 | v0: Validation[Err, S0], 59 | v1: Validation[Err, S1], 60 | v2: Validation[Err, S2])(merge: (S0, S1, S2) => Out): Validation[Err, Out] = 61 | (v0, v1, v2) match { 62 | case (Success(s0), Success(s1), Success(s2)) => 63 | val out = merge(s0, s1, s2) 64 | Success[Err, Out](out) 65 | case _ => 66 | Failure(v0.errs ++ v1.errs ++ v2.errs) 67 | } 68 | 69 | def merge[Err, S0, S1, S2, S3, Out]( 70 | v0: Validation[Err, S0], 71 | v1: Validation[Err, S1], 72 | v2: Validation[Err, S2], 73 | v3: Validation[Err, S3])(merge: (S0, S1, S2, S3) => Out): Validation[Err, Out] = 74 | (v0, v1, v2, v3) match { 75 | case (Success(s0), Success(s1), Success(s2), Success(s3)) => 76 | val out = merge(s0, s1, s2, s3) 77 | Success[Err, Out](out) 78 | case _ => 79 | Failure(v0.errs ++ v1.errs ++ v2.errs ++ v3.errs) 80 | } 81 | 82 | def merge[Err, S0, S1, S2, S3, S4, Out]( 83 | v0: Validation[Err, S0], 84 | v1: Validation[Err, S1], 85 | v2: Validation[Err, S2], 86 | v3: Validation[Err, S3], 87 | v4: Validation[Err, S4])(merge: (S0, S1, S2, S3, S4) => Out): Validation[Err, Out] = 88 | (v0, v1, v2, v3, v4) match { 89 | case (Success(s0), Success(s1), Success(s2), Success(s3), Success(s4)) => 90 | val out = merge(s0, s1, s2, s3, s4) 91 | Success[Err, Out](out) 92 | case _ => 93 | Failure(v0.errs ++ v1.errs ++ v2.errs ++ v3.errs ++ v4.errs) 94 | } 95 | 96 | def 
merge[Err, S0, S1, S2, S3, S4, S5, Out]( 97 | v0: Validation[Err, S0], 98 | v1: Validation[Err, S1], 99 | v2: Validation[Err, S2], 100 | v3: Validation[Err, S3], 101 | v4: Validation[Err, S4], 102 | v5: Validation[Err, S5])(merge: (S0, S1, S2, S3, S4, S5) => Out): Validation[Err, Out] = 103 | (v0, v1, v2, v3, v4, v5) match { 104 | case (Success(s0), Success(s1), Success(s2), Success(s3), Success(s4), Success(s5)) => 105 | val out = merge(s0, s1, s2, s3, s4, s5) 106 | Success[Err, Out](out) 107 | case _ => 108 | Failure(v0.errs ++ v1.errs ++ v2.errs ++ v3.errs ++ v4.errs ++ v5.errs) 109 | } 110 | 111 | def merge[Err, S0, S1, S2, S3, S4, S5, S6, Out]( 112 | v0: Validation[Err, S0], 113 | v1: Validation[Err, S1], 114 | v2: Validation[Err, S2], 115 | v3: Validation[Err, S3], 116 | v4: Validation[Err, S4], 117 | v5: Validation[Err, S5], 118 | v6: Validation[Err, S6])(merge: (S0, S1, S2, S3, S4, S5, S6) => Out): Validation[Err, Out] = 119 | (v0, v1, v2, v3, v4, v5, v6) match { 120 | case (Success(s0), Success(s1), Success(s2), Success(s3), Success(s4), Success(s5), Success(s6)) => 121 | val out = merge(s0, s1, s2, s3, s4, s5, s6) 122 | Success[Err, Out](out) 123 | case _ => 124 | Failure(v0.errs ++ v1.errs ++ v2.errs ++ v3.errs ++ v4.errs ++ v5.errs ++ v6.errs) 125 | } 126 | 127 | def merge[Err, S0, S1, S2, S3, S4, S5, S6, S7, Out]( 128 | v0: Validation[Err, S0], 129 | v1: Validation[Err, S1], 130 | v2: Validation[Err, S2], 131 | v3: Validation[Err, S3], 132 | v4: Validation[Err, S4], 133 | v5: Validation[Err, S5], 134 | v6: Validation[Err, S6], 135 | v7: Validation[Err, S7])(merge: (S0, S1, S2, S3, S4, S5, S6, S7) => Out): Validation[Err, Out] = 136 | (v0, v1, v2, v3, v4, v5, v6, v7) match { 137 | case (Success(s0), Success(s1), Success(s2), Success(s3), Success(s4), Success(s5), Success(s6), Success(s7)) => 138 | val out = merge(s0, s1, s2, s3, s4, s5, s6, s7) 139 | Success[Err, Out](out) 140 | case _ => 141 | Failure(v0.errs ++ v1.errs ++ v2.errs ++ v3.errs ++ v4.errs ++ v5.errs ++ v6.errs ++ v7.errs) 142 | } 143 | 144 | def merge[Err, S0, S1, S2, S3, S4, S5, S6, S7, S8, Out]( 145 | v0: Validation[Err, S0], 146 | v1: Validation[Err, S1], 147 | v2: Validation[Err, S2], 148 | v3: Validation[Err, S3], 149 | v4: Validation[Err, S4], 150 | v5: Validation[Err, S5], 151 | v6: Validation[Err, S6], 152 | v7: Validation[Err, S7], 153 | v8: Validation[Err, S8])(merge: (S0, S1, S2, S3, S4, S5, S6, S7, S8) => Out): Validation[Err, Out] = 154 | (v0, v1, v2, v3, v4, v5, v6, v7, v8) match { 155 | case (Success(s0), Success(s1), Success(s2), Success(s3), Success(s4), Success(s5), Success(s6), Success(s7), Success(s8)) => 156 | val out = merge(s0, s1, s2, s3, s4, s5, s6, s7, s8) 157 | Success[Err, Out](out) 158 | case _ => 159 | Failure(v0.errs ++ v1.errs ++ v2.errs ++ v3.errs ++ v4.errs ++ v5.errs ++ v6.errs ++ v7.errs ++ v8.errs) 160 | } 161 | 162 | def merge[Err, S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, Out]( 163 | v0: Validation[Err, S0], 164 | v1: Validation[Err, S1], 165 | v2: Validation[Err, S2], 166 | v3: Validation[Err, S3], 167 | v4: Validation[Err, S4], 168 | v5: Validation[Err, S5], 169 | v6: Validation[Err, S6], 170 | v7: Validation[Err, S7], 171 | v8: Validation[Err, S8], 172 | v9: Validation[Err, S9])(merge: (S0, S1, S2, S3, S4, S5, S6, S7, S8, S9) => Out): Validation[Err, Out] = 173 | (v0, v1, v2, v3, v4, v5, v6, v7, v8, v9) match { 174 | case (Success(s0), Success(s1), Success(s2), Success(s3), Success(s4), Success(s5), Success(s6), Success(s7), Success(s8), Success(s9)) => 175 | val out = 
merge(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9) 176 | Success[Err, Out](out) 177 | case _ => 178 | Failure(v0.errs ++ v1.errs ++ v2.errs ++ v3.errs ++ v4.errs ++ v5.errs ++ v6.errs ++ v7.errs ++ v8.errs ++ v9.errs) 179 | } 180 | } 181 | 182 | case class Failure[Err, R](errs: Seq[Err]) extends Validation[Err, R] { 183 | override def isSuccess: Boolean = false 184 | } 185 | 186 | case class Success[Err, R](r: R) extends Validation[Err, R] { 187 | override def isSuccess: Boolean = true 188 | protected def errs = Nil 189 | } 190 | 191 | trait ValidationMerger[In1, In2, Out] { 192 | def merge(in1: In1, in2: In2): Out 193 | } 194 | 195 | final class ValidationOps[A](val self: A) extends AnyVal { 196 | def success[X]: Validation[X, A] = Success[X, A](self) 197 | def failure[X]: Validation[A, X] = Failure[A, X](List(self)) 198 | } -------------------------------------------------------------------------------- /src/main/scala/spark_etl/model/RuntimeContext.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.model 2 | 3 | import net.jcazevedo.moultingyaml._ 4 | import org.apache.spark.sql.catalyst.parser._ 5 | import spark_etl.util.Validation._ 6 | import spark_etl.util._ 7 | import spark_etl.{ConfigError, ExtractReader, LoadWriter} 8 | 9 | import scala.util.{Failure, Success, Try} 10 | 11 | case class RuntimeExtract(org: Extract, checkContents: Option[String]) 12 | 13 | case class RuntimeTransform(org: Transform, sqlContents: String, checkContents: Option[String]) 14 | 15 | class RuntimeContext(extracts: List[RuntimeExtract], transforms: List[RuntimeTransform], val loads: List[Load], val extractReader: ExtractReader, val loadWriter: LoadWriter, depTree: DepTree, conf: Config) { 16 | def allExtracts: List[RuntimeExtract] = 17 | (for { 18 | r <- depTree.forType(E) 19 | e <- extracts 20 | if r.id == e.org.name 21 | } yield e).toList 22 | 23 | def allTransforms: List[RuntimeTransform] = 24 | (for { 25 | r <- depTree.forType(T) 26 | t <- transforms 27 | if r.id == t.org.name 28 | } yield t).toList 29 | 30 | def allLoads: List[Load] = 31 | (for { 32 | r <- depTree.forType(L) 33 | l <- loads 34 | if r.id == l.name 35 | } yield l).toList 36 | 37 | def asDot = depTree.asDot() 38 | } 39 | 40 | object RuntimeContext extends DefaultYamlProtocol { 41 | 42 | /** 43 | * Emphasis on *maximum* validation. 
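   * Checks duplicate names/URIs, parses each transform/check SQL and registers its dependencies,
   * flags dangling (unresolvable) references, and instantiates the configured ExtractReader and
   * LoadWriter, accumulating all errors rather than failing on the first one.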
44 | */ 45 | def load(_conf: Config, filePathRoot: String, env: Map[String, String]): Validation[ConfigError, RuntimeContext] = { 46 | // depTree, with the known universe 47 | val conf = toLowerCase(_conf) 48 | val depTree = new DepTree( 49 | conf.extracts.map(e => Vertex(e.name, E)) ++ 50 | conf.transforms.map(t => Vertex(t.name, T)) ++ 51 | conf.loads.map(l => Vertex(l.name, L)) 52 | ) 53 | 54 | // read in entities and add their deps 55 | val regExtracts = conf.extracts 56 | .map(e => registerExtractDeps(e, depTree, filePathRoot, env)) 57 | .map(_.map(List(_))).reduce(_ +++ _) 58 | 59 | val regTransforms = conf.transforms 60 | .map(t => registerTransformDeps(t, depTree, filePathRoot, env)) 61 | .map(_.map(List(_))).reduce(_ +++ _) 62 | 63 | conf.loads.foreach(l => depTree.addEdge(l.source, Vertex(l.name, L), true)) 64 | 65 | val validatedDuplicates = validateDuplicates(conf) 66 | 67 | val validatedDepTree = validateDepTree(depTree) 68 | 69 | val validatedExtractor = instantiate[ExtractReader](conf.extract_reader.get, classOf[ExtractReader]) 70 | 71 | val validatedTransformer = instantiate[LoadWriter](conf.load_writer.get, classOf[LoadWriter]) 72 | 73 | merge(validatedDuplicates, regExtracts, regTransforms, validatedExtractor, validatedTransformer, validatedDepTree) { (dups, es, ts, e, t, dt) => new RuntimeContext(es, ts, conf.loads, e, t, depTree, conf) } 74 | } 75 | 76 | private def toLowerCase(conf: Config): Config = 77 | conf.copy( 78 | extracts = conf.extracts.map(e => e.copy(name = e.name.toLowerCase)), 79 | transforms = conf.transforms.map(t => t.copy(name = t.name.toLowerCase)), 80 | loads = conf.loads.map(l => l.copy(source = l.source.toLowerCase)) 81 | ) 82 | 83 | private def validateDuplicates(conf: Config): Validation[ConfigError, Unit] = { 84 | def valDups(desc: String, candidates: Seq[String]): Validation[ConfigError, Unit] = 85 | (candidates diff candidates.distinct).distinct match { 86 | case Nil => ().success[ConfigError] 87 | case other => ConfigError(s"Duplicates found for $desc: ${other.sorted.mkString(", ")}").failure[Unit] 88 | } 89 | 90 | valDups("extract names", conf.extracts.map(_.name)) +++ 91 | valDups("extract uris", conf.extracts.map(_.uri)) +++ 92 | valDups("extract check", conf.extracts.collect { case e if e.check.isDefined => e.check.get }) +++ 93 | valDups("transform names", conf.transforms.map(_.name)) +++ 94 | valDups("transform sqls", conf.transforms.map(_.sql)) +++ 95 | valDups("transform check", conf.extracts.collect { case t if t.check.isDefined => t.check.get }) +++ 96 | valDups("load names", conf.loads.map(_.name)) +++ 97 | valDups("load uris", conf.loads.map(_.uri)) 98 | } 99 | 100 | /** 101 | * Load & parse check, if specified 102 | * Note, extract check is only dependant on the extract 103 | */ 104 | private def registerExtractDeps(extract: Extract, depTree: DepTree, filePathRoot: String, env: Map[String, String]): Validation[ConfigError, RuntimeExtract] = 105 | extract.check match { 106 | case Some(checkUri) => 107 | UriLoader.load(checkUri, filePathRoot, env) 108 | .flatMap(validateResolvedDsos(depTree, extract.name, Echeck, s"extract check ${extract.name} (uri $checkUri)")) 109 | .map(checkTxt => RuntimeExtract(extract, Some(checkTxt))) 110 | case None => 111 | RuntimeExtract(extract, None).success[ConfigError] 112 | } 113 | 114 | /** 115 | * Load & parse sql 116 | * Load & parse pre_check, if specified 117 | * Load & parse post_check, if specified 118 | * Check dso dependencies 119 | */ 120 | private def registerTransformDeps(transform: 
Transform, depTree: DepTree, filePathRoot: String, env: Map[String, String]): Validation[ConfigError, RuntimeTransform] = { 121 | // load resources 122 | val validatedSql = UriLoader.load(transform.sql, filePathRoot, env) 123 | .flatMap(validateResolvedDsos(depTree, transform.name, T, s"Unparsable sql of transform ${transform.name}")) 124 | val validatedCheck = liftOpt(transform.check)(r => UriLoader.load(r, filePathRoot, env) 125 | .flatMap(validateResolvedDsos(depTree, transform.name, Tcheck, s"Unparsable sql of transform check ${transform.name}"))) 126 | 127 | merge(validatedSql, validatedCheck) { (sql, check) => RuntimeTransform(transform, sql, check) } 128 | } 129 | 130 | private def liftOpt[T1, T2](opt: Option[T1])(toVal: T1 => Validation[ConfigError, T2]): Validation[ConfigError, Option[T2]] = 131 | opt match { 132 | case Some(r) => toVal(r).map(Some(_)) 133 | case None => (None:Option[T2]).success[ConfigError] 134 | } 135 | 136 | private def validateResolvedDsos(depTree: DepTree, name: String, `type`: VertexType, errMsgPrefix: String)(contents: String): Validation[ConfigError, String] = 137 | Try(SparkParser.getDeps(contents)) match { 138 | case Success(usedDsos) => 139 | val withPrefixes = usedDsos.filter(_.prefix.isDefined) 140 | if (withPrefixes.nonEmpty) 141 | ConfigError(s"$errMsgPrefix: contains prefixed dsos: ${withPrefixes.map(_.qfStr).mkString(", ")}").failure[String] 142 | else { 143 | usedDsos.map(_.qfStr).foreach(d => depTree.addEdge(d, Vertex(name, `type`))) 144 | contents.success[ConfigError] 145 | } 146 | case Failure(e: ParseException) => 147 | ConfigError(s"$errMsgPrefix: failed to parse, error: ${e.getMessage}").failure[String] 148 | case Failure(e) => 149 | ConfigError(s"$errMsgPrefix: failed to parse", Some(e)).failure[String] 150 | } 151 | 152 | private def validateDepTree(depTree: DepTree): Validation[ConfigError, Unit] = { 153 | val danglingDeps = depTree.dangling 154 | if (danglingDeps.isEmpty) { 155 | ().success[ConfigError] 156 | } else { 157 | val errors = for { 158 | dangling <- danglingDeps 159 | } yield ConfigError(s"Unresolved dependency ${dangling.source.id} for ${dangling.target.`type`.asStr} ${dangling.target.id}").failure[Unit] 160 | errors.reduce(_ +++ _) 161 | } 162 | } 163 | 164 | private def instantiate[T](paramConstr: ParametrizedConstructor, parentClass: Class[_]): Validation[ConfigError, T] = { 165 | Try { 166 | val clazz = Class.forName(paramConstr.`class`) 167 | if (parentClass.isAssignableFrom(clazz)) { 168 | val constructor = clazz.getConstructors()(0) 169 | constructor.newInstance(paramConstr.params.get).asInstanceOf[T].success[ConfigError] 170 | } else { 171 | ConfigError(s"Failed to cast class ${paramConstr.`class`} to ${parentClass.getName}").failure[T] 172 | } 173 | } match { 174 | case scala.util.Success(validated) => validated 175 | case scala.util.Failure(e) => ConfigError(s"Failed to instantiate class ${paramConstr.`class`} with params: ${paramConstr.params}", Some(e)).failure[T] 176 | } 177 | } 178 | } 179 | -------------------------------------------------------------------------------- /src/test/scala/spark_etl/model/RuntimeContextSpec.scala: -------------------------------------------------------------------------------- 1 | package spark_etl.model 2 | 3 | import org.apache.spark.sql.{DataFrame, SparkSession} 4 | import org.scalatest.{FlatSpec, Inside, Matchers} 5 | import spark_etl.util.Validation._ 6 | import spark_etl.util._ 7 | import spark_etl.{ConfigError, ExtractReader, LoadWriter} 8 | 9 | class RuntimeContextSpec 
extends FlatSpec with Matchers with Inside { 10 | val extractsAndTransformsStr = 11 | """ 12 | |extracts: 13 | | - name: client 14 | | uri: "data/dev/client_2017" 15 | | check: "/runtime-ctx/spark/extract-check/client.sql" 16 | | - name: item 17 | | uri: "data/dev/item_2017" 18 | | check: "/runtime-ctx/spark/extract-check/item.sql" 19 | | - name: transaction 20 | | uri: "data/dev/transaction_2017" 21 | | check: "/runtime-ctx/spark/extract-check/transaction.sql" 22 | | # unused extract 23 | | - name: ____bogus_extract_not_loaded____ 24 | | uri: "hdfs://aaa.bbb" 25 | | 26 | |transforms: 27 | | - name: client_spending 28 | | check: "/runtime-ctx/spark/transform-check/client_spending.sql" 29 | | sql: "/runtime-ctx/spark/transform/client_spending.sql" 30 | | - name: item_purchase 31 | | check: "/runtime-ctx/spark/transform-check/item_purchase.sql" 32 | | sql: "/runtime-ctx/spark/transform/item_purchase.sql" 33 | | - name: minor_purchase 34 | | check: "/runtime-ctx/spark/transform-check/minor_purchase.sql" 35 | | sql: "/runtime-ctx/spark/transform/minor_purchase.sql" 36 | | 37 | |loads: 38 | | - name: client_spending_out 39 | | source: client_spending 40 | | uri: "/tmp/out/client_spending" 41 | | # no partition_by 42 | | - name: item_purchase_out 43 | | source: item_purchase 44 | | uri: "/tmp/out/item_purchase" 45 | | # no partition_by 46 | | - name: minor_purchase_out 47 | | source: minor_purchase 48 | | uri: "/tmp/out/minor_purchase" 49 | | # no partition_by 50 | | """.stripMargin 51 | 52 | "RuntimeContext" should "validate ok extract_reader/load_writer" in { 53 | val confStr = extractsAndTransformsStr + 54 | """ 55 | |extract_reader: 56 | | class: spark_etl.model.OkExtractReader 57 | | params: 58 | | x: 11 59 | | y: aa 60 | | 61 | |load_writer: 62 | | class: spark_etl.model.OkLoadWriter 63 | | params: 64 | | b: false 65 | | a: [1, xxx] 66 | """.stripMargin 67 | Config.parse(confStr) match { 68 | case Success(conf) => 69 | RuntimeContext.load(conf, ".", Map.empty) match { 70 | case Success(ctx) => 71 | ctx.extractReader.asInstanceOf[OkExtractReader].params shouldBe Map("x" -> 11d, "y" -> "aa") 72 | ctx.loadWriter.asInstanceOf[OkLoadWriter].params shouldBe Map("b" -> false, "a" -> List(1d, "xxx")) 73 | } 74 | } 75 | } 76 | 77 | it should "fail on incorrect inheritance of extract_reader/load_writer" in { 78 | val confStr = extractsAndTransformsStr + 79 | """ 80 | |extract_reader: 81 | | class: spark_etl.model.BogusExtractReader1 82 | | 83 | |load_writer: 84 | | class: spark_etl.model.BogusLoadWriter1 85 | """.stripMargin 86 | Config.parse(confStr) match { 87 | case Success(conf) => 88 | RuntimeContext.load(conf, ".", Map.empty) match { 89 | case Failure(errs) => 90 | errs.toList.length shouldBe 2 91 | errs.toList.forall(_.msg.startsWith("Failed to cast class")) shouldBe true 92 | } 93 | } 94 | } 95 | 96 | it should "fail on parameterless constructors extract_reader/load_writer" in { 97 | val confStr = extractsAndTransformsStr + 98 | """ 99 | |extract_reader: 100 | | class: spark_etl.model.BogusExtractReader2 101 | | 102 | |load_writer: 103 | | class: spark_etl.model.BogusLoadWriter2 104 | """.stripMargin 105 | Config.parse(confStr) match { 106 | case Success(conf) => 107 | RuntimeContext.load(conf, ".", Map.empty) match { 108 | case Failure(errs) => 109 | errs.toList.length shouldBe 2 110 | errs.toList.forall(_.msg.startsWith("Failed to instantiate class")) shouldBe true 111 | } 112 | } 113 | } 114 | 115 | it should "fail on duplicates" in { 116 | val confStr = 117 | """ 118 | |extracts: 119 | | 
- name: client 120 | | uri: "data/dev/client_2017" 121 | | check: "/runtime-ctx/spark/extract-check/client.sql" 122 | | - name: client 123 | | uri: "data/dev/client_2017" 124 | | check: "/runtime-ctx/spark/extract-check/client.sql" 125 | | 126 | |transforms: 127 | | - name: client_spending 128 | | check: "/runtime-ctx/spark/transform-check/client_spending.sql" 129 | | sql: "/runtime-ctx/spark/transform/client_all.sql" 130 | | - name: client_spending 131 | | check: "/runtime-ctx/spark/transform-check/client_spending.sql" 132 | | sql: "/runtime-ctx/spark/transform/client_all.sql" 133 | | 134 | |loads: 135 | | - name: client_spending_out 136 | | source: client_spending 137 | | uri: "/tmp/out/client_spending" 138 | | - name: client_spending_out 139 | | source: client_spending 140 | | uri: "/tmp/out/client_spending" 141 | |""".stripMargin 142 | 143 | Config.parse(confStr) match { 144 | case Success(conf) => 145 | RuntimeContext.load(conf, ".", Map.empty) match { 146 | case Failure(errs) => 147 | val errList = errs.toList 148 | errList.length shouldBe 8 149 | errList.forall(_.msg.startsWith("Duplicates found for")) shouldBe true 150 | } 151 | } 152 | } 153 | 154 | it should "produce dot file" in { 155 | Config.parse(extractsAndTransformsStr) match { 156 | case Success(conf) => 157 | RuntimeContext.load(conf, ".", Map.empty) match { 158 | case Success(ctx) => 159 | val v = ctx.asDot 160 | ctx.asDot shouldBe 161 | """digraph Lineage { 162 | | rankdir=LR 163 | | node [fontsize=12] 164 | | 165 | | # vertices 166 | | client 167 | | item 168 | | transaction 169 | | client_spending [shape=component] 170 | | item_purchase [shape=component] 171 | | minor_purchase [shape=component] 172 | | client_spending_out [shape=cylinder] 173 | | item_purchase_out [shape=cylinder] 174 | | minor_purchase_out [shape=cylinder] 175 | | 176 | | # edges 177 | | item -> client_spending [style=dotted] 178 | | transaction -> client_spending [style=dotted] 179 | | client -> client_spending [style=dotted] 180 | | item -> item_purchase [style=dotted] 181 | | transaction -> item_purchase [style=dotted] 182 | | client -> item_purchase [style=dotted] 183 | | item -> minor_purchase [style=dotted] 184 | | transaction -> minor_purchase [style=dotted] 185 | | client -> minor_purchase [style=dotted] 186 | | client_spending -> client_spending_out 187 | | item_purchase -> item_purchase_out 188 | | minor_purchase -> minor_purchase_out 189 | | 190 | | # ranks 191 | | { rank=same; client item transaction } 192 | | { rank=same; client_spending_out item_purchase_out minor_purchase_out } 193 | |}""".stripMargin 194 | } 195 | } 196 | 197 | } 198 | } 199 | 200 | class OkExtractReader(val params: Map[String, Any]) extends ExtractReader(params) { 201 | override def checkLocal(extracts: Seq[Extract]): Validation[ConfigError, Unit] = ().success[ConfigError] 202 | override def checkRemote(extracts: Seq[Extract]): Validation[ConfigError, Unit] = ??? 203 | override def read(extracts: Seq[Extract])(implicit spark: SparkSession): Seq[(Extract, DataFrame)] = ??? 204 | } 205 | 206 | class OkLoadWriter(val params: Map[String, Any]) extends LoadWriter(params) { 207 | override def write(loadsAndDfs: Seq[(Load, DataFrame)]): Unit = ??? 208 | override def checkLocal(loads: Seq[Load]): Validation[ConfigError, Unit] = ().success[ConfigError] 209 | override def checkRemote(loads: Seq[Load]): Validation[ConfigError, Unit] = ??? 
210 | } 211 | 212 | class BogusExtractReader1(params: Map[String, Any]) 213 | 214 | class BogusLoadWriter1(params: Map[String, Any]) 215 | 216 | class BogusExtractReader2 extends ExtractReader(Map.empty) { 217 | override def checkLocal(extracts: Seq[Extract]): Validation[ConfigError, Unit] = ().success[ConfigError] 218 | override def checkRemote(extracts: Seq[Extract]): Validation[ConfigError, Unit] = ??? 219 | override def read(extracts: Seq[Extract])(implicit spark: SparkSession): Seq[(Extract, DataFrame)] = ??? 220 | } 221 | 222 | class BogusLoadWriter2 extends LoadWriter(Map.empty) { 223 | override def write(loadsAndDfs: Seq[(Load, DataFrame)]): Unit = ??? 224 | override def checkLocal(loads: Seq[Load]): Validation[ConfigError, Unit] = ().success[ConfigError] 225 | override def checkRemote(loads: Seq[Load]): Validation[ConfigError, Unit] = ??? 226 | } 227 | -------------------------------------------------------------------------------- /src/main/scala/spark_etl/CLIOps.scala: -------------------------------------------------------------------------------- 1 | package spark_etl 2 | 3 | import java.io.{File, PrintWriter} 4 | 5 | import org.apache.log4j.Logger 6 | import org.apache.spark.sql.{AnalysisException, DataFrame, SparkSession} 7 | import spark_etl.model._ 8 | import spark_etl.util.{BAHelper, Validation} 9 | import spark_etl.util.Validation._ 10 | 11 | import scala.util.Try 12 | 13 | object CLIOps { 14 | val log = Logger.getLogger(getClass) 15 | 16 | def dotLineage(confUri: String, filePathRoot: String, env: Map[String, String], filename: String): Validation[ConfigError, Unit] = 17 | withCtx(confUri, filePathRoot, env) { 18 | ctx => 19 | new PrintWriter(filename) { 20 | write(ctx.asDot) 21 | close 22 | } 23 | ().success[ConfigError] 24 | } 25 | 26 | def validateLocal(confUri: String, filePathRoot: String, env: Map[String, String]): Validation[ConfigError, Unit] = 27 | withCtx(confUri, filePathRoot, env) { 28 | ctx => 29 | val orgExtracts = ctx.allExtracts.map(_.org) 30 | val extractReaderValidation = ctx.extractReader.checkLocal(orgExtracts) 31 | val loadWriterValidation = ctx.loadWriter.checkLocal(ctx.loads) 32 | (extractReaderValidation +++ loadWriterValidation).map(_ => 33 | log.info( 34 | s"""Local context validated! 35 | | 36 | |ExtractReader validated! 37 | | 38 | |LoadWriter validated!""".stripMargin)) 39 | } 40 | 41 | def validateRemote(confUri: String, filePathRoot: String, env: Map[String, String])(implicit spark: SparkSession): Validation[ConfigError, Unit] = 42 | withCtx(confUri, filePathRoot, env) { 43 | ctx => 44 | val orgExtracts = ctx.allExtracts.map(_.org) 45 | val extractReaderValidation = ctx.extractReader.checkRemote(orgExtracts) 46 | val loadWriterValidation = ctx.loadWriter.checkRemote(ctx.loads) 47 | for { 48 | _ <- extractReaderValidation +++ loadWriterValidation 49 | _ <- { 50 | // for validation - do not persist 51 | val withoutCacheOrPersist = ctx.allExtracts.map(e => e.copy(org = e.org.copy(cache = None, persist = None))) 52 | readExtracts(ctx.extractReader, withoutCacheOrPersist) 53 | } 54 | _ <- { 55 | // for validation - do not persist 56 | val withoutCacheOrPersist = ctx.allTransforms.map(t => t.copy(org = t.org.copy(cache = None, persist = None))) 57 | loadTransforms(withoutCacheOrPersist) 58 | } 59 | } yield { 60 | log.info( 61 | s"""Remote context validated! 62 | | 63 | |ExtractReader validated! 64 | | 65 | |LoadWriter validated! 
66 | | 67 | |Transforms loaded in session!""".stripMargin) 68 | } 69 | } 70 | 71 | def transformAndLoad(confUri: String, filePathRoot: String, env: Map[String, String], props: Map[String, String], showCounts: Boolean)(implicit spark: SparkSession): Validation[ConfigError, Unit] = 72 | withCtx(confUri, filePathRoot, env) { 73 | ctx => 74 | for { 75 | _ <- readExtracts(ctx.extractReader, ctx.allExtracts) 76 | transformed <- loadTransforms(ctx.allTransforms) 77 | _ <- runCounts(transformed, showCounts) 78 | written <- Try { 79 | val loadsAndDfs = for { 80 | (t, df) <- transformed 81 | l <- ctx.allLoads 82 | if t.org.name == l.source 83 | } yield (l, df) 84 | ctx.loadWriter.write(loadsAndDfs) 85 | } match { 86 | case scala.util.Success(_) => ().success[ConfigError] 87 | case scala.util.Failure(exc:AnalysisException) => ConfigError(s"Failed to write out transform due to AnalysisException, ${exc.getMessage}").failure[Unit] 88 | case scala.util.Failure(e) => ConfigError("Failed to write out transform", Some(e)).failure[Unit] 89 | } 90 | } yield () 91 | } 92 | 93 | def extractCheck(confUri: String, filePathRoot: String, env: Map[String, String])(implicit spark: SparkSession): Validation[ConfigError, Unit] = 94 | withCtx(confUri, filePathRoot, env) { 95 | ctx => 96 | for { 97 | _ <- readExtracts(ctx.extractReader, ctx.allExtracts) 98 | _ <- { 99 | // run only the checks that have contents defined 100 | val runnableChecks = ctx.allExtracts.collect { case e if e.checkContents.isDefined => e.org.name -> e.checkContents.get } 101 | runAndReport("Extract check results", runnableChecks) 102 | } 103 | } yield () 104 | } 105 | 106 | def transformCheck(confUri: String, filePathRoot: String, env: Map[String, String], showCounts: Boolean)(implicit spark: SparkSession): Validation[ConfigError, Unit] = 107 | withCtx(confUri, filePathRoot, env) { 108 | ctx => 109 | for { 110 | _ <- readExtracts(ctx.extractReader, ctx.allExtracts) 111 | transformed <- loadTransforms(ctx.allTransforms) 112 | _ <- runCounts(transformed, showCounts) 113 | _ <- { 114 | val runnableChecks = ctx.allTransforms.collect { case t if t.checkContents.isDefined => t.org.name -> t.checkContents.get } 115 | runAndReport("Transform check results", runnableChecks) 116 | } 117 | } yield () 118 | } 119 | 120 | private def withCtx(confUri: String, filePathRoot: String, env: Map[String, String])(run: (RuntimeContext) => Validation[ConfigError, Unit]): Validation[ConfigError, Unit] = { 121 | val validatedCtx = for { 122 | conf <- Config.load(confUri, filePathRoot, env) 123 | ctx <- { 124 | val relFilePath = 125 | if (confUri.startsWith("file:/")) 126 | new File(confUri.substring("file:".length)).getParent 127 | else if (confUri.startsWith("file:")) 128 | new File(confUri.substring("file:".length)).getParent match { 129 | case null => filePathRoot 130 | case confParent => new File(filePathRoot, confParent).getAbsolutePath 131 | } 132 | else 133 | filePathRoot 134 | RuntimeContext.load(conf, relFilePath, env) 135 | } 136 | } yield { 137 | val ctxDesc = 138 | s"""|Validated runtime context 139 | |========================= 140 | | 141 | |Extracts: 142 | |${toBullets(ctx.allExtracts.map(e => e.org.name -> e.org.uri))} 143 | |Extract checks: 144 | |${toBullets(ctx.allExtracts.flatMap(e => e.org.check.map(checkUri => e.org.name -> checkUri)))} 145 | |Transforms: 146 | |${toBullets(ctx.allTransforms.map(t => t.org.name -> t.org.sql))} 147 | |Transform checks: 148 | |${toBullets(ctx.allTransforms.flatMap(t => t.org.check.map(checkUri => t.org.name ->
checkUri)))} 149 | |Loads: 150 | |${toBullets(ctx.loads.map(l => l.name -> l.uri))} 151 | """.stripMargin 152 | 153 | log.info(ctxDesc) 154 | ctx 155 | } 156 | 157 | validatedCtx.flatMap(run) 158 | } 159 | 160 | private def readExtracts(extractor: ExtractReader, extracts: Seq[RuntimeExtract])(implicit spark: SparkSession): Validation[ConfigError, Unit] = { 161 | val orgExtracts = extracts.map(_.org) 162 | Try { 163 | extractor.read(orgExtracts).foreach { 164 | case (e, df) => 165 | e.cache.foreach(c => if (c) df.cache()) 166 | e.persist.foreach(p => df.persist(p.asSpark)) 167 | df.createOrReplaceTempView(e.name) 168 | } 169 | } match { 170 | case scala.util.Success(res) => res.success[ConfigError] 171 | case scala.util.Failure(exc) => ConfigError("Failed to load extracts", Some(exc)).failure[Unit] 172 | } 173 | } 174 | 175 | private def loadTransforms(transforms: Seq[RuntimeTransform])(implicit spark: SparkSession): Validation[ConfigError, Seq[(RuntimeTransform, DataFrame)]] = 176 | Try { 177 | transforms.map { 178 | t => 179 | val df = spark.sql(t.sqlContents) 180 | t.org.cache.foreach(c => if (c) df.cache()) 181 | t.org.persist.foreach(p => df.persist(p.asSpark)) 182 | df.createOrReplaceTempView(t.org.name) 183 | (t, df) 184 | } 185 | } match { 186 | case scala.util.Success(res) => res.success[ConfigError] 187 | case scala.util.Failure(exc:AnalysisException) => ConfigError(s"Failed to run transforms due to AnalysisException, ${exc.getMessage}").failure[Seq[(RuntimeTransform, DataFrame)]] 188 | case scala.util.Failure(exc) => ConfigError("Failed to run transforms", Some(exc)).failure[Seq[(RuntimeTransform, DataFrame)]] 189 | } 190 | 191 | private def runCounts(transformsAndDfs: Seq[(RuntimeTransform, DataFrame)], showCounts: Boolean): Validation[ConfigError, Unit] = 192 | if (! 
showCounts) 193 | ().success[ConfigError] 194 | else 195 | Try { 196 | val countDescrs = toBullets(transformsAndDfs.map { case (t, df) => t.org.name -> df.count.toString }, ": ") 197 | log.info(s"Transform counts:\n$countDescrs") 198 | } match { 199 | case scala.util.Success(_) => ().success[ConfigError] 200 | case scala.util.Failure(exc) => ConfigError("Failed to run counts", Some(exc)).failure[Unit] 201 | } 202 | 203 | protected def runAndReport(desc: String, descAndSql: Seq[(String, String)])(implicit spark: SparkSession): Validation[ConfigError, Unit] = 204 | Try { 205 | val outputs = descAndSql.map { 206 | case (sqlDesc, sql) => 207 | val df = spark.sql(sql) 208 | val valFieldDesc = df.take(100).map(r => df.schema.fields zip r.toSeq).flatMap(_.map { 209 | // Fail on false only, succeed and report on all others 210 | case (f, false) => 211 | ConfigError(s"$desc: $sqlDesc's check ${f.name} returned false!").failure[(String, String)] 212 | case (f, value) => 213 | (f.name -> value.toString).success[ConfigError] 214 | }) 215 | val valRes = valFieldDesc.map(_.map(List(_))).reduce(_ +++ _) 216 | valRes.map(fieldDesc => s"$sqlDesc:\n${toBullets(fieldDesc)}") 217 | } 218 | outputs.map(_.map(List(_))).reduce(_ +++ _) 219 | } match { 220 | case scala.util.Success(res) => res.map(outputs => log.info(s"$desc:\n${outputs.mkString("\n")}")) 221 | case scala.util.Failure(exc) => ConfigError(s"Failed to load $desc", Some(exc)).failure[Unit] 222 | } 223 | 224 | def stripPrefixes(srcDir: File, targetDir: File, rmTargetDir: Boolean): Validation[ConfigError, Unit] = 225 | Try(BAHelper.copySqls(srcDir, targetDir, rmTargetDir)) match { 226 | case scala.util.Success(descs) => 227 | log.info(s"""Copied BA sql to DEV:\n${CLIOps.toBullets(descs)}""").success[ConfigError] 228 | case scala.util.Failure(e) => 229 | ConfigError(s"Failed to copy SQL from $srcDir to $targetDir", Some(e)).failure[Unit] 230 | } 231 | 232 | private def toBullets(kvs: Seq[(String, String)], sep: String = " -> ") = 233 | if (kvs.isEmpty) 234 | " NA" 235 | else { 236 | val maxKLen = kvs.map(_._1.length).max 237 | kvs.map { case (k, v) => s"• ${k.padTo(maxKLen, ' ')}$sep$v" }.mkString("\n") 238 | } 239 | } 240 | --------------------------------------------------------------------------------
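
To illustrate how the pieces above fit together — RuntimeContext.load reflectively instantiates the classes named under extract_reader/load_writer and passes them the params map through their single Map[String, Any] constructor argument — here is a minimal sketch of a custom LoadWriter. Everything named in it (the spark_etl.example package, the CsvLoadWriter class, and its delimiter parameter) is hypothetical and only mirrors the OkLoadWriter/BogusLoadWriter2 fixtures from RuntimeContextSpec; it is a sketch, not part of the library.

package spark_etl.example

import org.apache.spark.sql.DataFrame
import spark_etl.{ConfigError, LoadWriter}
import spark_etl.model.Load
import spark_etl.util.Validation
import spark_etl.util.Validation._

// Hypothetical writer: persists each load's DataFrame as CSV under the load's uri.
class CsvLoadWriter(params: Map[String, Any]) extends LoadWriter(params) {
  // "delimiter" is an illustrative param, not one the framework defines
  private val delimiter = params.getOrElse("delimiter", ",").toString

  override def write(loadsAndDfs: Seq[(Load, DataFrame)]): Unit =
    loadsAndDfs.foreach { case (l, df) =>
      df.write.option("sep", delimiter).csv(l.uri)
    }

  // nothing to verify before a run in this sketch
  override def checkLocal(loads: Seq[Load]): Validation[ConfigError, Unit] = ().success[ConfigError]
  override def checkRemote(loads: Seq[Load]): Validation[ConfigError, Unit] = ().success[ConfigError]
}

Wired into app.yaml the same way the specs above wire OkLoadWriter, the config would look roughly like:

load_writer:
  class: spark_etl.example.CsvLoadWriter
  params:
    delimiter: "|"

RuntimeContext's instantiate then resolves the class by name, confirms it is assignable to LoadWriter, and invokes its first constructor with the params map, so a wiring mistake surfaces as the "Failed to cast class" / "Failed to instantiate class" ConfigErrors asserted in the tests.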