├── .gitignore ├── LICENSE ├── README.md ├── Supported Objects.md ├── build.sbt ├── project ├── assembly.sbt └── build.properties └── src ├── main └── scala │ └── datapipeline │ ├── compiler │ └── AwsDataPipelineCompiler.scala │ └── dsl │ ├── Action.scala │ ├── Activity.scala │ ├── AwsDataPipeline.scala │ ├── Database.scala │ ├── Defaults.scala │ ├── Eithers.scala │ ├── Json.scala │ ├── Precondition.scala │ ├── Resource.scala │ ├── Schedule.scala │ ├── Traits.scala │ └── package.scala └── test └── scala └── datapipeline └── dsl └── AwsDataPipelineSpec.scala /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | *.iml 3 | target -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AWS DataPipeline DSL for Scala 2 | 3 | A Scala domain-specific language and toolkit to help you build and maintain AWS DataPipeline definitions. 4 | 5 | This tool aims to ease the burden of maintaining a large suite of AWS DataPipelines. 
At Shazam, we use this tool to 6 | define our data pipelines in Scala code and avoid the boilerplate and maintenance headache of managing tens or hundreds of 7 | JSON pipeline configuration files. 8 | 9 | Benefits: 10 | - Write and maintain Scala code instead of JSON configuration 11 | - Use the DSL's `>>` syntax to clearly express dependencies between your pipeline's activities 12 | - Share code/configuration between your pipeline definitions 13 | - Never write `dependsOn` or `precondition` again; this library manages all ids and object references for you 14 | - Add your own wrapper around this library to predefine your most commonly-used data pipeline objects 15 | 16 | ## Tutorial 17 | 18 | Build the compiler using `sbt`: 19 | ``` 20 | $ sbt assembly 21 | ``` 22 | 23 | Create a "Hello World" AWS Data Pipeline definition Scala file: 24 | ```scala 25 | object HelloWorldPipeline { 26 | 27 | import datapipeline.dsl._ 28 | 29 | val pipeline = 30 | AwsDataPipeline(name = "HelloWorldPipeline") 31 | .withSchedule( 32 | frequency = Daily, 33 | startDateTimeIso = "2018-01-01T00:00:00" 34 | ) 35 | .withActivities( 36 | ShellCommandActivity( 37 | name = "Echo Hello World", 38 | workerGroup = "my-task-runner", 39 | commandOrScriptUri = Command("echo 'Hello AWS Data Pipeline World!'") 40 | ) 41 | ) 42 | 43 | } 44 | ``` 45 | 46 | Use the compiler to produce JSON from our Scala definition: 47 | 48 | ``` 49 | $ java -jar target/scala-2.12/datapipeline-compiler.jar HelloWorldPipeline HelloWorldPipeline.scala 50 | Writing pipeline definition to: ./HelloWorldPipeline.json 51 | ``` 52 | 53 | The output JSON file contains your pipeline definition, ready to deploy to AWS. 54 | 55 | ## Supported AWS DataPipeline Objects 56 | 57 | For details see [Supported Objects](Supported%20Objects.md). 58 | 59 | ## License 60 | 61 | This tool is licensed under [Apache License 2.0](LICENSE). 62 | -------------------------------------------------------------------------------- /Supported Objects.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | Below is a list of all AWS DataPipeline object types and the level of support provided by this library. 4 | 5 | ## General Support Notes 6 | 7 | * Object `id`s are inferred from the `name` of the object specified in the DAG, thus an object named `"My EMR Cluster"` 8 | will have the id `"My-EMR-Cluster"`. 9 | * `onFail`, `failureAndRerunMode`, `role`, `resourceRole` and `pipelineLogUri` are only supported in the `Default` 10 | object, meaning there can only be one global definition of these per pipeline. 11 | * `onSuccess` actions are not supported. 12 | * `maxActiveInstances` is not supported. 13 | * The fields `reportProgressTimeout`, `lateAfterTimeout` and `onLateAction` are not currently supported. 14 | * The `parent` parameter is not supported. `parent` is used to build object hierarchies in AWS Data Pipeline, but this 15 | is much better achieved via Scala hierarchies and/or Scala factory methods (see the sketch after this list). 16 | * User-defined fields are not supported.
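As a minimal sketch of that factory-method approach (the helper `shellStep` and the worker-group value are illustrative, not part of the library):

```scala
import datapipeline.dsl._

// Hypothetical factory that stamps out similarly-configured activities,
// serving the role that `parent` object hierarchies play in raw JSON.
def shellStep(name: String, command: String): Activity =
  ShellCommandActivity(
    name = name,
    workerGroup = "my-task-runner", // assumed worker group
    commandOrScriptUri = Command(command)
  )

val extract = shellStep("Extract Data", "./extract.sh")
val transform = shellStep("Transform Data", "./transform.sh")

// `>>` records the dependency; pass the most-downstream activity as the root.
val pipeline =
  AwsDataPipeline(name = "EtlPipeline")
    .withOnDemandSchedule
    .withActivities(extract >> transform)
```

Because ids are derived from names, `"Extract Data"` automatically becomes the id `"Extract-Data"`, and the generated `dependsOn` reference between the two activities is managed for you.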
17 | 18 | # Supported DataPipeline Objects 19 | 20 | ## Data Nodes 21 | 22 | ### DynamoDBDataNode 23 | 24 | * [ ] Not yet supported 25 | 26 | ### MySqlDataNode 27 | 28 | * [ ] Not yet supported 29 | 30 | ### RedshiftDataNode 31 | 32 | * [ ] Not yet supported 33 | 34 | ### S3DataNode 35 | 36 | * [ ] Not yet supported 37 | 38 | ### SqlDataNode 39 | 40 | * [ ] Not yet supported 41 | 42 | 43 | ## Activities 44 | 45 | ### CopyActivity 46 | 47 | * [ ] Not yet supported 48 | 49 | ### EmrActivity 50 | 51 | * [x] Supported 52 | * TODO: Support for various fields 53 | 54 | ### HadoopActivity 55 | 56 | * [ ] Not yet supported 57 | 58 | ### HiveActivity 59 | 60 | * [ ] Not yet supported 61 | 62 | ### HiveCopyActivity 63 | 64 | * [ ] Not yet supported 65 | 66 | ### PigActivity 67 | 68 | * [ ] Not yet supported 69 | 70 | ### RedshiftCopyActivity 71 | 72 | * [ ] Not yet supported 73 | 74 | ### ShellCommandActivity 75 | 76 | * [x] Supported 77 | * TODO: Support for `scriptArgument`, `runsOn` 78 | 79 | ### SqlActivity 80 | 81 | * [x] Supported 82 | * TODO: Support for `scriptUri`, `scriptArgument`, `runsOn` 83 | 84 | 85 | ## Resources 86 | 87 | ### Ec2Resource 88 | 89 | * [ ] Not yet supported 90 | 91 | ### EmrCluster 92 | 93 | * [x] Supported 94 | 95 | ### HttpProxy 96 | 97 | * [ ] Not yet supported 98 | 99 | 100 | ## Preconditions 101 | 102 | ### DynamoDBDataExists 103 | 104 | * [ ] Not yet supported 105 | 106 | ### DynamoDBTableExists 107 | 108 | * [ ] Not yet supported 109 | 110 | ### Exists 111 | 112 | * [ ] Not yet supported 113 | 114 | ### S3KeyExists 115 | 116 | * [x] Supported 117 | 118 | ### S3PrefixNotEmpty 119 | 120 | * [x] Supported 121 | 122 | ### ShellCommandPrecondition 123 | 124 | * [x] Supported 125 | * TODO: Support for `scriptArgument` and presumably `workerGroup` and `runsOn` but they are not documented 126 | 127 | 128 | ## Databases 129 | 130 | ### JdbcDatabase 131 | 132 | * [ ] Not yet supported 133 | 134 | ### RdsDatabase 135 | 136 | * [ ] Not yet supported 137 | 138 | ### RedshiftDatabase 139 | 140 | * [x] Supported 141 | 142 | ## Data Formats 143 | 144 | * [ ] Not yet supported 145 | 146 | 147 | ## Actions 148 | 149 | ### SnsAlarm 150 | 151 | * [x] Supported 152 | * Note that currently only one alarm per pipeline is supported 153 | 154 | ### Terminate 155 | 156 | * [ ] Not yet supported 157 | 158 | 159 | ## Schedule 160 | 161 | * [x] Supported 162 | * Both `ondemand` and `cron` schedules are supported 163 | * `timeseries` schedules are not supported 164 | * Note that currently only one schedule per pipeline is supported 165 | 166 | ## Utilities 167 | 168 | ### ShellScriptConfig 169 | 170 | * [ ] Not yet supported 171 | 172 | ### EmrConfiguration 173 | 174 | * [x] Supported 175 | 176 | ### Property 177 | 178 | * [x] Supported 179 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | name := "datapipeline-dsl" 2 | 3 | version := "0.1.1-SNAPSHOT" 4 | 5 | scalaVersion := "2.12.5" 6 | 7 | mainClass in assembly := Some("datapipeline.compiler.AwsDataPipelineCompiler") 8 | 9 | assemblyJarName in assembly := "datapipeline-compiler.jar" 10 | 11 | libraryDependencies += "org.scala-lang" % "scala-compiler" % "2.12.5" 12 | 13 | libraryDependencies += "org.json4s" %% "json4s-native" % "3.6.0-M2" 14 | 15 | libraryDependencies += "org.scalatest" %% "scalatest" % "3.0.5" % Test 16 | -------------------------------------------------------------------------------- 
/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6") 2 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 1.1.2 -------------------------------------------------------------------------------- /src/main/scala/datapipeline/compiler/AwsDataPipelineCompiler.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Shazam Entertainment Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * 7 | * You may obtain a copy of the License at 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, 11 | * software distributed under the License is distributed on 12 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 13 | * either express or implied. See the License for the specific 14 | * language governing permissions and limitations under the License 15 | */ 16 | 17 | package datapipeline.compiler 18 | 19 | import java.io.{File, FileOutputStream, FilenameFilter} 20 | import java.net.URLClassLoader 21 | import java.nio.file.Paths 22 | 23 | import datapipeline.dsl.PipelineBuilder 24 | 25 | import scala.tools.nsc._ 26 | 27 | object AwsDataPipelineCompiler extends App { 28 | 29 | import AwsDataPipelineCompilerHelpers._ 30 | 31 | if (args.length < 2) fail( 32 | """Usage: datapipeline-compiler <fqcn> <source> [source...] 33 | | 34 | |Where: 35 | | - fqcn is the fully-qualified class name of the pipeline definition singleton. E.g.
myorg.DataPipeline 36 | | - source is either the Scala source file for your pipeline or a directory containing Scala source files 37 | """.stripMargin 38 | ) 39 | 40 | val (className :: sourcePaths) = args.toList 41 | 42 | compileSources(sourcePaths) 43 | 44 | val pipelineBuilder: PipelineBuilder = reflectivelyLoadPipelineBuilder(className) 45 | 46 | val filename = s"$CurrentWorkingDir${pipelineBuilder.name}.json" 47 | writePipelineJsonToFile(pipelineBuilder, filename) 48 | 49 | } 50 | 51 | object AwsDataPipelineCompilerHelpers { 52 | 53 | def compileSources(sourcePaths: List[String]): Unit = { 54 | val sourceFiles = sourcePaths.map(new File(_)).flatMap { 55 | case directory if directory.isDirectory => directory.listFiles(ScalaFilenameFilter) 56 | case file if file.isFile => file :: Nil 57 | case other => sys.error(s"Unexpected input file/directory: $other") 58 | } 59 | 60 | val compiler = { 61 | val settings = new Settings() 62 | settings.usejavacp.value = true 63 | 64 | val global = new Global(settings) 65 | 66 | new global.Run 67 | } 68 | 69 | compiler.compile(sourceFiles.map(_.getAbsolutePath)) 70 | } 71 | 72 | def reflectivelyLoadPipelineBuilder(className: String): PipelineBuilder = { 73 | val classLoader = { 74 | val environmentalClasspath = Option(System.getenv("CLASSPATH")).toList.flatMap(_.split(":")) 75 | val classPath = (CurrentWorkingDir :: environmentalClasspath).map(new File(_).toURI.toURL) 76 | new URLClassLoader(classPath.toArray, this.getClass.getClassLoader) 77 | } 78 | 79 | val clazz = classLoader.loadClass(className + "$") 80 | 81 | if (!clazz.getDeclaredFields.map(_.getName).contains(PipelineField)) fail( 82 | s"""Error: The class $className does not have a field named '$PipelineField'. 83 | |Your pipeline definition singleton should include a field named '$PipelineField' of type datapipeline.dsl.PipelineBuilder, 84 | |e.g.: 85 | | 86 | |object MyDataPipeline { 87 | | 88 | | import datapipeline.dsl._ 89 | | 90 | | val $PipelineField = AwsDataPipeline(name = "MyDataPipeline", ...) 91 | | 92 | |} 93 | """.stripMargin 94 | ) 95 | 96 | val pipelineBuilderField = clazz.getDeclaredField(PipelineField) 97 | pipelineBuilderField.setAccessible(true) 98 | 99 | val obj = clazz.getField("MODULE$").get(null) // retrieve the Scala singleton instance 100 | 101 | val pipelineBuilder = pipelineBuilderField.get(obj).asInstanceOf[PipelineBuilder] 102 | 103 | if (pipelineBuilder == null && clazz.getDeclaredMethods.exists(_.getName == "delayedInit")) fail( 104 | s"Error: Class $className cannot be loaded because it extends either DelayedInit or App." 
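// (a Scala object extending App initializes its fields via delayedInit, so the 'pipeline' field reads back as null when accessed reflectively at load time)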
105 | ) 106 | 107 | pipelineBuilder 108 | } 109 | 110 | def writePipelineJsonToFile(pipelineBuilder: PipelineBuilder, filename: String): Unit = { 111 | println(s"Writing pipeline definition to: $filename") 112 | 113 | val os = new FileOutputStream(filename) 114 | try { 115 | os.write { 116 | import org.json4s.native.JsonMethods._ 117 | 118 | pretty(render(pipelineBuilder.json)).getBytes("UTF-8") 119 | } 120 | } finally { 121 | os.close() 122 | } 123 | } 124 | 125 | def fail(message: String): Unit = { 126 | System.err.println(message) 127 | System.exit(1) 128 | } 129 | 130 | val PipelineField = "pipeline" 131 | 132 | lazy val CurrentWorkingDir = s"${Paths.get("").toAbsolutePath}${File.separator}" 133 | 134 | lazy val ScalaFilenameFilter: FilenameFilter = (_: File, name: String) => name.toLowerCase.endsWith(".scala") 135 | 136 | } -------------------------------------------------------------------------------- /src/main/scala/datapipeline/dsl/Action.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Shazam Entertainment Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * 7 | * You may obtain a copy of the License at 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, 11 | * software distributed under the License is distributed on 12 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 13 | * either express or implied. See the License for the specific 14 | * language governing permissions and limitations under the License 15 | */ 16 | 17 | package datapipeline.dsl 18 | 19 | sealed trait Action extends DataPipelineObject 20 | 21 | case class SnsAlarm(topicArn: String, subject: String, message: String) extends Action { 22 | 23 | override val name = "SnsAlarm" 24 | 25 | override val objectType = "SnsAlarm" 26 | 27 | } -------------------------------------------------------------------------------- /src/main/scala/datapipeline/dsl/Activity.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Shazam Entertainment Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * 7 | * You may obtain a copy of the License at 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, 11 | * software distributed under the License is distributed on 12 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 13 | * either express or implied. 
See the License for the specific 14 | * language governing permissions and limitations under the License 15 | */ 16 | 17 | package datapipeline.dsl 18 | 19 | import scala.concurrent.duration.FiniteDuration 20 | 21 | sealed trait Activity extends DataPipelineObject with Retryable { 22 | 23 | val preconditions: Seq[Precondition] 24 | 25 | val dependsOn: Seq[Activity] = Nil 26 | 27 | def withDependencies(activities: Seq[Activity]): Activity 28 | 29 | def >>(activity: Activity): Activity = { 30 | // println(s"${activity.id} depends on ${this.id}") 31 | activity.withDependencies(Seq(this) ++ activity.dependsOn) 32 | } 33 | 34 | def >>(activity: Activity, activities: Activity*): Seq[Activity] = 35 | (Seq(activity) ++ activities).map(this >> _) 36 | } 37 | 38 | object Activity { 39 | 40 | def resolveDependencyTree(rootActivities: Seq[Activity]): Seq[Activity] = 41 | mergeActivities(flattenActivities(rootActivities).reverse) 42 | 43 | /** 44 | * Traverses dependent activities via 'dependsOn' relationships and returns a flat sequence of activities 45 | */ 46 | private def flattenActivities(activities: Seq[Activity], 47 | visitedIds: Set[String] = Set.empty): Seq[Activity] = { 48 | activities.reverse.flatMap { // FIXME double reverse (see also resolveDependencyTree) 49 | activity => 50 | // println(s"Visiting ${activity.id} => resolved deps ${activity.dependsOn}") 51 | if (visitedIds.contains(activity.id)) throw CyclicalActivitiesException(activity, visitedIds) 52 | Seq(activity) ++ flattenActivities(activity.dependsOn, visitedIds ++ Set(activity.id)) 53 | } 54 | } 55 | 56 | /** 57 | * Merges any activities that are declared more than once. Throws DuplicateActivityException if 58 | * multiple activities share the same id (but differ in their definition). Preserves order. 
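* e.g. if activity 'c' is declared via both 'a >> c' and 'b >> c', the two declarations of 'c' merge into a single 'c' whose dependsOn is (a, b).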
59 | */ 60 | private def mergeActivities(activities: Seq[Activity]): Seq[Activity] = { 61 | activities.zipWithIndex.groupBy(_._1.id).values.toSeq.map { 62 | activitiesWithIndex => 63 | val (as, idxs) = activitiesWithIndex.unzip 64 | 65 | if (as.map(_.withDependencies(Nil)).distinct.length > 1) throw DuplicateActivityException(id = as.head.id) 66 | 67 | val mergedActivity = as.head.withDependencies(as.flatMap(_.dependsOn).distinct) 68 | 69 | val index = idxs.min 70 | index -> mergedActivity 71 | }.sortBy(_._1).unzip._2 72 | } 73 | } 74 | 75 | case class EmrActivity(name: String, 76 | emrCluster: EmrCluster, 77 | steps: Seq[String], 78 | preconditions: Seq[Precondition] = Nil, 79 | attemptTimeout: Option[FiniteDuration] = None, 80 | maximumRetries: Option[Int] = None, 81 | retryDelay: Option[FiniteDuration] = None, 82 | override val dependsOn: Seq[Activity] = Nil) extends Activity { 83 | 84 | require(steps.nonEmpty, "EmrActivity must have at least one step") 85 | 86 | override val objectType = "EmrActivity" 87 | 88 | override def withDependencies(activities: Seq[Activity]): Activity = copy(dependsOn = activities) 89 | } 90 | 91 | case class SqlActivity(name: String, 92 | database: Database, 93 | workerGroup: String, 94 | script: String, 95 | preconditions: Seq[Precondition] = Nil, 96 | attemptTimeout: Option[FiniteDuration] = None, 97 | maximumRetries: Option[Int] = None, 98 | retryDelay: Option[FiniteDuration] = None, 99 | override val dependsOn: Seq[Activity] = Nil) extends Activity { 100 | 101 | override val objectType = "SqlActivity" 102 | 103 | override def withDependencies(activities: Seq[Activity]): Activity = copy(dependsOn = activities) 104 | } 105 | 106 | case class ShellCommandActivity(name: String, 107 | workerGroup: String, 108 | commandOrScriptUri: CommandOrScriptUri, 109 | stdout: Option[String] = None, 110 | stderr: Option[String] = None, 111 | preconditions: Seq[Precondition] = Nil, 112 | attemptTimeout: Option[FiniteDuration] = None, 113 | maximumRetries: Option[Int] = None, 114 | retryDelay: Option[FiniteDuration] = None, 115 | override val dependsOn: Seq[Activity] = Nil) extends Activity { 116 | 117 | override val objectType = "ShellCommandActivity" 118 | 119 | override def withDependencies(activities: Seq[Activity]): Activity = copy(dependsOn = activities) 120 | } 121 | 122 | case class CyclicalActivitiesException(activity: Activity, visitedIds: Set[String]) extends Exception( 123 | s"Cyclical DAG detected when visiting activity with id: '${activity.id}' with already visited activities: ${visitedIds.mkString("'", "', '", "'")}" 124 | ) 125 | 126 | case class DuplicateActivityException(id: String) extends Exception( 127 | s"Duplicate activities detected with id: '$id'" 128 | ) 129 | -------------------------------------------------------------------------------- /src/main/scala/datapipeline/dsl/AwsDataPipeline.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Shazam Entertainment Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * 7 | * You may obtain a copy of the License at 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, 11 | * software distributed under the License is distributed on 12 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 13 | * either express or implied. 
See the License for the specific 14 | * language governing permissions and limitations under the License 15 | */ 16 | 17 | package datapipeline.dsl 18 | 19 | import java.time.LocalDateTime 20 | import java.time.format.DateTimeFormatter.ISO_LOCAL_DATE_TIME 21 | 22 | import org.json4s._ 23 | 24 | object AwsDataPipeline { 25 | 26 | def apply(name: String): PipelineBuilder = PipelineBuilder(name) 27 | 28 | } 29 | 30 | case class PipelineBuilder(name: String, 31 | defaults: Defaults = Defaults(), 32 | schedule: Option[Schedule] = None, 33 | snsAlarm: Option[SnsAlarm] = None, 34 | rootActivities: Seq[Activity] = Nil) { 35 | 36 | def withDefaults(failureAndRerunMode: FailureAndRerunMode = FailureAndRerunMode.None, 37 | role: Option[String] = None, 38 | resourceRole: Option[String] = None, 39 | pipelineLogUri: Option[String] = None): PipelineBuilder = 40 | this.copy(defaults = Defaults(failureAndRerunMode = failureAndRerunMode, role = role, resourceRole = resourceRole, pipelineLogUri = pipelineLogUri)) 41 | 42 | def withSnsAlarm(topicArn: String, subject: String, message: String): PipelineBuilder = 43 | this.copy(snsAlarm = Some(SnsAlarm(topicArn, subject, message))) 44 | 45 | def withSchedule(frequency: PipelineFrequency, startDateTimeIso: String): PipelineBuilder = 46 | this.copy(schedule = Some(CronSchedule(frequency = frequency, startTime = LocalDateTime.parse(startDateTimeIso, ISO_LOCAL_DATE_TIME)))) 47 | 48 | def withOnDemandSchedule: PipelineBuilder = this.copy(schedule = Some(OnDemandSchedule)) 49 | 50 | def withActivities(activity: Activity, activities: Activity*): PipelineBuilder = 51 | withActivities(Seq(activity) ++ activities) 52 | 53 | def withActivities(activities: Seq[Activity], moreActivities: Seq[Activity]*): PipelineBuilder = 54 | this.copy(rootActivities = rootActivities ++ activities ++ moreActivities.flatten) 55 | 56 | /** 57 | * Flattened list of all activities 58 | */ 59 | lazy val activities: Seq[Activity] = Activity.resolveDependencyTree(rootActivities) 60 | 61 | /** 62 | * All databases 63 | */ 64 | lazy val databases: Seq[Database] = { 65 | activities 66 | .collect { 67 | case sqlActivity: SqlActivity => sqlActivity 68 | } 69 | .map(_.database) 70 | .distinct 71 | } 72 | 73 | /** 74 | * All preconditions 75 | */ 76 | lazy val preconditions: Seq[Precondition] = activities.flatMap(_.preconditions).distinct 77 | 78 | /** 79 | * All resources 80 | */ 81 | lazy val resources: Seq[Resource] = { 82 | activities 83 | .collect { 84 | case emrActivity: EmrActivity => emrActivity 85 | } 86 | .map(_.emrCluster) 87 | .distinct 88 | } 89 | 90 | /** 91 | * Data pipeline rendered as JSON 92 | */ 93 | lazy val json: JObject = Json.render(this) 94 | 95 | } 96 | -------------------------------------------------------------------------------- /src/main/scala/datapipeline/dsl/Database.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Shazam Entertainment Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * 7 | * You may obtain a copy of the License at 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, 11 | * software distributed under the License is distributed on 12 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 13 | * either express or implied. 
See the License for the specific 14 | * language governing permissions and limitations under the License 15 | */ 16 | 17 | package datapipeline.dsl 18 | 19 | sealed trait Database extends DataPipelineObject 20 | 21 | case class RedshiftDatabase(name: String, 22 | username: String, 23 | password: String, 24 | clusterId: String, 25 | region: Option[String] = None) extends Database { 26 | 27 | override val objectType = "RedshiftDatabase" 28 | } -------------------------------------------------------------------------------- /src/main/scala/datapipeline/dsl/Defaults.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Shazam Entertainment Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * 7 | * You may obtain a copy of the License at 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, 11 | * software distributed under the License is distributed on 12 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 13 | * either express or implied. See the License for the specific 14 | * language governing permissions and limitations under the License 15 | */ 16 | 17 | package datapipeline.dsl 18 | 19 | case class Defaults(failureAndRerunMode: FailureAndRerunMode = FailureAndRerunMode.None, 20 | role: Option[String] = None, 21 | resourceRole: Option[String] = None, 22 | pipelineLogUri: Option[String] = None) 23 | 24 | trait FailureAndRerunMode { 25 | val mode: String 26 | } 27 | 28 | object FailureAndRerunMode { 29 | 30 | object None extends FailureAndRerunMode { 31 | override val mode = "none" 32 | } 33 | 34 | object Cascade extends FailureAndRerunMode { 35 | override val mode = "cascade" 36 | } 37 | 38 | } -------------------------------------------------------------------------------- /src/main/scala/datapipeline/dsl/Eithers.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Shazam Entertainment Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * 7 | * You may obtain a copy of the License at 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, 11 | * software distributed under the License is distributed on 12 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 13 | * either express or implied. 
See the License for the specific 14 | * language governing permissions and limitations under the License 15 | */ 16 | 17 | package datapipeline.dsl 18 | 19 | import org.json4s.JsonDSL._ 20 | import org.json4s._ 21 | 22 | sealed trait CommandOrScriptUri { 23 | val json: JObject 24 | } 25 | 26 | case class Command(command: String, arguments: Seq[String] = Nil) extends CommandOrScriptUri { 27 | override val json: JObject = ("command" -> command) ~ ("scriptArgument" -> arguments) 28 | } 29 | 30 | case class ScriptUri(scriptUri: String) extends CommandOrScriptUri { 31 | override val json: JObject = "scriptUri" -> scriptUri 32 | } -------------------------------------------------------------------------------- /src/main/scala/datapipeline/dsl/Json.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Shazam Entertainment Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * 7 | * You may obtain a copy of the License at 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, 11 | * software distributed under the License is distributed on 12 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 13 | * either express or implied. See the License for the specific 14 | * language governing permissions and limitations under the License 15 | */ 16 | 17 | package datapipeline.dsl 18 | 19 | import java.time.format.DateTimeFormatter 20 | 21 | import org.json4s.JsonDSL._ 22 | import org.json4s._ 23 | 24 | object Json { 25 | 26 | def render(pipelineBuilder: PipelineBuilder): JObject = { 27 | import pipelineBuilder._ 28 | 29 | val objects = 30 | Seq(DefaultJsonRenderer(pipelineBuilder)) ++ 31 | ScheduleJsonRenderer(schedule) ++ 32 | ActivityJsonRenderer(activities) ++ 33 | ResourceJsonRenderer(resources, name) ++ 34 | PreconditionJsonRenderer(preconditions) ++ 35 | DatabaseJsonRenderer(databases) ++ 36 | SnsAlarmRenderer(snsAlarm) 37 | 38 | ("objects" -> objects) ~ ("parameters" -> Seq.empty[JObject]) 39 | } 40 | } 41 | 42 | object IdTypeAndNameJsonRenderer { 43 | def apply(objectType: String, dataPipelineObject: DataPipelineObject): JObject = { 44 | ("id" -> dataPipelineObject.id) ~ 45 | ("type" -> objectType) ~ 46 | ("name" -> dataPipelineObject.name) 47 | } 48 | } 49 | 50 | object RetryableJsonRenderer { 51 | def apply(retryable: Retryable): JObject = 52 | ("attemptTimeout" -> retryable.attemptTimeout.map(_.toString)) ~ 53 | ("maximumRetries" -> retryable.maximumRetries.map(_.toString)) ~ 54 | ("retryDelay" -> retryable.retryDelay.map(_.toString)) 55 | } 56 | 57 | object DefaultJsonRenderer { 58 | def apply(pipelineBuilder: PipelineBuilder): JObject = { 59 | 60 | import pipelineBuilder._ 61 | 62 | require(schedule.nonEmpty, "A schedule must be provided.") 63 | 64 | ("id" -> "Default") ~ 65 | ("name" -> "Default") ~ 66 | ("role" -> defaults.role) ~ 67 | ("resourceRole" -> defaults.resourceRole) ~ 68 | ("failureAndRerunMode" -> defaults.failureAndRerunMode.mode) ~ 69 | ("pipelineLogUri" -> defaults.pipelineLogUri) ~ 70 | ("scheduleType" -> schedule.map(_.scheduleType)) ~ 71 | ("schedule" -> schedule.filter(_.isInstanceOf[CronSchedule]).map(_ => "ref" -> "Schedule")) ~ 72 | ("onFail" -> snsAlarm.map { alarm => "ref" -> alarm.id }) 73 | } 74 | } 75 | 76 | object ScheduleJsonRenderer { 77 | def apply(maybeSchedule: Option[Schedule]): Option[JObject] = 78 | 
maybeSchedule.collect { 79 | case cron: CronSchedule => 80 | ("id" -> "Schedule") ~ 81 | ("type" -> "Schedule") ~ 82 | ("name" -> "Schedule") ~ 83 | ("period" -> cron.frequency.period) ~ 84 | ("startDateTime" -> DateTimeFormatter.ISO_DATE_TIME.format(cron.startTime)) 85 | } 86 | } 87 | 88 | object SnsAlarmRenderer { 89 | def apply(maybeAlarm: Option[SnsAlarm]): Option[JObject] = 90 | maybeAlarm.map { 91 | alarm => 92 | ("id" -> alarm.id) ~ 93 | ("type" -> "SnsAlarm") ~ 94 | ("name" -> alarm.name) ~ 95 | ("topicArn" -> alarm.topicArn) ~ 96 | ("subject" -> alarm.subject) ~ 97 | ("message" -> alarm.message) 98 | } 99 | } 100 | 101 | object ActivityJsonRenderer { 102 | 103 | def apply(activities: Seq[Activity]): Seq[JObject] = activities.map { 104 | activity => 105 | IdTypeAndNameJsonRenderer(activity.objectType, activity).merge { 106 | 107 | activity match { 108 | 109 | case emrActivity: EmrActivity => 110 | import emrActivity._ 111 | ("runsOn" -> ("ref" -> emrCluster.id)) ~ 112 | ("step" -> steps.map(step => step)) ~ 113 | ("precondition" -> preconditions.map(pre => "ref" -> pre.id)) ~ 114 | ("dependsOn" -> dependsOn.map(dep => "ref" -> dep.id)) 115 | 116 | case sqlActivity: SqlActivity => 117 | import sqlActivity._ 118 | ("database" -> ("ref" -> database.id)) ~ 119 | ("workerGroup" -> workerGroup) ~ 120 | ("script" -> script) ~ 121 | ("precondition" -> preconditions.map(pre => "ref" -> pre.id)) ~ 122 | ("dependsOn" -> dependsOn.map(dep => "ref" -> dep.id)) 123 | 124 | case shellCommandActivity: ShellCommandActivity => 125 | import shellCommandActivity._ 126 | commandOrScriptUri.json.merge( 127 | ("workerGroup" -> workerGroup) ~ 128 | ("stdout" -> stdout) ~ 129 | ("stderr" -> stderr) ~ 130 | ("precondition" -> preconditions.map(pre => "ref" -> pre.id)) ~ 131 | ("dependsOn" -> dependsOn.map(dep => "ref" -> dep.id)) 132 | ) 133 | } 134 | }.merge { 135 | RetryableJsonRenderer(activity) 136 | } 137 | } 138 | } 139 | 140 | object PreconditionJsonRenderer { 141 | def apply(preconditions: Seq[Precondition]): Seq[JObject] = preconditions.map { 142 | precondition => 143 | IdTypeAndNameJsonRenderer(precondition.objectType, precondition).merge { 144 | precondition match { 145 | case s3KeyExists: S3KeyExists => 146 | import s3KeyExists._ 147 | ("s3Key" -> s3Key) ~ 148 | ("preconditionTimeout" -> preconditionTimeout.map(_.toString)) 149 | 150 | case s3PrefixNotEmpty: S3PrefixNotEmpty => 151 | import s3PrefixNotEmpty._ 152 | ("s3Prefix" -> s3Prefix) ~ 153 | ("preconditionTimeout" -> preconditionTimeout.map(_.toString)) 154 | 155 | case shellCommandPrecondition: ShellCommandPrecondition => 156 | import shellCommandPrecondition._ 157 | 158 | commandOrScriptUri.json.merge( 159 | ("stdout" -> stdout) ~ 160 | ("stderr" -> stderr) ~ 161 | ("preconditionTimeout" -> preconditionTimeout.map(_.toString)) 162 | ) 163 | } 164 | }.merge { 165 | RetryableJsonRenderer(precondition) 166 | } 167 | } 168 | } 169 | 170 | object DatabaseJsonRenderer { 171 | def apply(databases: Seq[Database]): Seq[JObject] = databases.map { 172 | database => 173 | IdTypeAndNameJsonRenderer(database.objectType, database).merge { 174 | database match { 175 | case redshiftDatabase: RedshiftDatabase => 176 | import redshiftDatabase._ 177 | 178 | ("username" -> username) ~ 179 | ("*password" -> password) ~ 180 | ("clusterId" -> clusterId) ~ 181 | ("region" -> region) 182 | } 183 | } 184 | } 185 | } 186 | 187 | object ResourceJsonRenderer { 188 | def apply(resources: Seq[Resource], pipelineName: String): Seq[JObject] = resources.flatMap { 189 | 
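// an EmrCluster expands to several pipeline objects: the cluster itself, plus its EmrConfiguration and Property objects, cross-referenced by id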
case emrCluster: EmrCluster => 190 | import emrCluster._ 191 | 192 | val configurationJson = configuration.toSeq.map { 193 | conf => 194 | ("name" -> conf.name) ~ 195 | ("type" -> "EmrConfiguration") ~ 196 | ("id" -> conf.id) ~ 197 | ("classification" -> conf.classification) ~ 198 | ("property" -> conf.properties.toSeq.indices.map { index => "ref" -> s"${conf.id}-property-$index" }) 199 | } 200 | 201 | val propertiesJson = configuration.toSeq.flatMap(conf => conf.properties.toSeq.map(conf.name -> _)).zipWithIndex.map { 202 | case ((confName, (key, value)), index) => 203 | val propertyName = s"$confName property $index" 204 | ("id" -> propertyName.replaceAllLiterally(" ", "-")) ~ 205 | ("type" -> "Property") ~ 206 | ("name" -> propertyName) ~ 207 | ("key" -> key) ~ 208 | ("value" -> value) 209 | } 210 | 211 | val cluster = IdTypeAndNameJsonRenderer(emrCluster.objectType, emrCluster).merge { 212 | ("enableDebugging" -> enableDebugging.map(_.toString)) ~ 213 | ("releaseLabel" -> releaseLabel) ~ 214 | ("masterInstanceType" -> masterInstanceType) ~ 215 | ("coreInstanceCount" -> coreInstanceCount.map(_.toString)) ~ 216 | ("coreInstanceType" -> coreInstanceType) ~ 217 | ("coreInstanceBidPrice" -> coreInstanceBidPrice.map(_.toString)) ~ 218 | ("useOnDemandOnLastAttempt" -> useOnDemandOnLastAttempt.map(_.toString)) ~ 219 | ("terminateAfter" -> terminateAfter.map(_.toString)) ~ 220 | ("emrLogUri" -> emrLogUri) ~ 221 | ("keyPair" -> keyPair) ~ 222 | ("region" -> region) ~ 223 | ("applications" -> applications) ~ 224 | ("configuration" -> configuration.map { conf => "ref" -> conf.name.replaceAllLiterally(" ", "-") }) 225 | } 226 | Seq(cluster) ++ configurationJson ++ propertiesJson 227 | } 228 | } 229 | -------------------------------------------------------------------------------- /src/main/scala/datapipeline/dsl/Precondition.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Shazam Entertainment Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * 7 | * You may obtain a copy of the License at 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, 11 | * software distributed under the License is distributed on 12 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 13 | * either express or implied. 
See the License for the specific 14 | * language governing permissions and limitations under the License 15 | */ 16 | 17 | package datapipeline.dsl 18 | 19 | import scala.concurrent.duration.FiniteDuration 20 | 21 | sealed trait Precondition extends DataPipelineObject with Retryable 22 | 23 | case class S3KeyExists(name: String, 24 | s3Key: String, 25 | preconditionTimeout: Option[FiniteDuration] = None, 26 | attemptTimeout: Option[FiniteDuration] = None, 27 | maximumRetries: Option[Int] = None, 28 | retryDelay: Option[FiniteDuration] = None) extends Precondition { 29 | 30 | override val objectType = "S3KeyExists" 31 | } 32 | 33 | case class S3PrefixNotEmpty(name: String, 34 | s3Prefix: String, 35 | preconditionTimeout: Option[FiniteDuration] = None, 36 | attemptTimeout: Option[FiniteDuration] = None, 37 | maximumRetries: Option[Int] = None, 38 | retryDelay: Option[FiniteDuration] = None) extends Precondition { 39 | 40 | override val objectType = "S3PrefixNotEmpty" 41 | } 42 | 43 | case class ShellCommandPrecondition(name: String, 44 | commandOrScriptUri: CommandOrScriptUri, 45 | stdout: Option[String], 46 | stderr: Option[String], 47 | preconditionTimeout: Option[FiniteDuration] = None, 48 | attemptTimeout: Option[FiniteDuration] = None, 49 | maximumRetries: Option[Int] = None, 50 | retryDelay: Option[FiniteDuration] = None) extends Precondition { 51 | 52 | override val objectType = "ShellCommandPrecondition" 53 | } 54 | 55 | -------------------------------------------------------------------------------- /src/main/scala/datapipeline/dsl/Resource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Shazam Entertainment Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * 7 | * You may obtain a copy of the License at 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, 11 | * software distributed under the License is distributed on 12 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 13 | * either express or implied. 
See the License for the specific 14 | * language governing permissions and limitations under the License 15 | */ 16 | 17 | package datapipeline.dsl 18 | 19 | import scala.concurrent.duration.FiniteDuration 20 | 21 | sealed trait Resource extends DataPipelineObject 22 | 23 | case class EmrCluster(name: String, 24 | enableDebugging: Option[Boolean] = None, 25 | releaseLabel: Option[String] = None, 26 | masterInstanceType: Option[String] = None, 27 | coreInstanceCount: Option[Int] = None, 28 | coreInstanceType: Option[String] = None, 29 | coreInstanceBidPrice: Option[BigDecimal] = None, 30 | useOnDemandOnLastAttempt: Option[Boolean] = None, 31 | terminateAfter: Option[FiniteDuration] = None, 32 | emrLogUri: Option[String] = None, 33 | keyPair: Option[String] = None, 34 | region: Option[String] = None, 35 | applications: Seq[String] = Nil, 36 | configuration: Option[EmrConfiguration] = None) extends Resource { 37 | 38 | override val objectType = "EmrCluster" 39 | } 40 | 41 | case class EmrConfiguration(name: String, 42 | classification: String, 43 | properties: Map[String, String]) { 44 | val id = name.replaceAllLiterally(" ", "-") 45 | } 46 | 47 | -------------------------------------------------------------------------------- /src/main/scala/datapipeline/dsl/Schedule.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Shazam Entertainment Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * 7 | * You may obtain a copy of the License at 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, 11 | * software distributed under the License is distributed on 12 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 13 | * either express or implied. See the License for the specific 14 | * language governing permissions and limitations under the License 15 | */ 16 | 17 | package datapipeline.dsl 18 | 19 | import java.time.LocalDateTime 20 | 21 | import scala.concurrent.duration._ 22 | 23 | sealed trait Schedule { 24 | val scheduleType: String 25 | } 26 | 27 | object OnDemandSchedule extends Schedule { 28 | override val scheduleType = "ondemand" 29 | } 30 | 31 | case class CronSchedule(frequency: PipelineFrequency, startTime: LocalDateTime) extends Schedule { 32 | override val scheduleType = "cron" 33 | } 34 | 35 | sealed abstract class PipelineFrequency(val period: String) 36 | 37 | case object Hourly extends PipelineFrequency(1.hour.toString) 38 | 39 | case object Daily extends PipelineFrequency(1.day.toString) 40 | 41 | case object Weekly extends PipelineFrequency("1 week") 42 | 43 | case object Monthly extends PipelineFrequency("1 month") 44 | 45 | case class RunEvery(duration: FiniteDuration) extends PipelineFrequency(duration.toString) 46 | -------------------------------------------------------------------------------- /src/main/scala/datapipeline/dsl/Traits.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Shazam Entertainment Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * 7 | * You may obtain a copy of the License at 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, 11 | * software distributed under the License is distributed on 12 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 13 | * either express or implied. See the License for the specific 14 | * language governing permissions and limitations under the License 15 | */ 16 | 17 | package datapipeline.dsl 18 | 19 | import scala.concurrent.duration.FiniteDuration 20 | 21 | trait DataPipelineObject { 22 | val name: String 23 | 24 | lazy val id: String = name.map { 25 | case ' ' => '-' 26 | case c if DataPipelineObject.ValidIdCharacter(c) => c 27 | case _ => '_' 28 | } 29 | 30 | val objectType: String 31 | } 32 | 33 | object DataPipelineObject { 34 | val ValidIdCharacter: Set[Char] = Set('*', '.', '_', '-') ++ ('0' to '9') ++ ('a' to 'z') ++ ('A' to 'Z') 35 | } 36 | 37 | trait Retryable { 38 | val attemptTimeout: Option[FiniteDuration] 39 | val maximumRetries: Option[Int] 40 | val retryDelay: Option[FiniteDuration] 41 | } 42 | 43 | -------------------------------------------------------------------------------- /src/main/scala/datapipeline/dsl/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Shazam Entertainment Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * 7 | * You may obtain a copy of the License at 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, 11 | * software distributed under the License is distributed on 12 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 13 | * either express or implied. See the License for the specific 14 | * language governing permissions and limitations under the License 15 | */ 16 | 17 | package datapipeline 18 | 19 | package object dsl { 20 | 21 | /** 22 | * RichSeqOfActivities provides a shorthand for times when you have a set of activities that 23 | * collectively have a downstream dependency, 24 | * e.g. (a, b) >> c 25 | * or (a, b) >> (c, d) 26 | * 27 | */ 28 | implicit class RichSeqOfActivities(activities: Seq[Activity]) { 29 | def >>(activity: Activity): Activity = { 30 | // println(s"${activity.id} depends on ${activities.map(_.id)}") 31 | activity.withDependencies(activities ++ activity.dependsOn) 32 | } 33 | 34 | def >>(activity: Activity, activities: Activity*): Seq[Activity] = (Seq(activity) ++ activities).map(this >> _) 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /src/test/scala/datapipeline/dsl/AwsDataPipelineSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Shazam Entertainment Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * 7 | * You may obtain a copy of the License at 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, 11 | * software distributed under the License is distributed on 12 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 13 | * either express or implied. 
--------------------------------------------------------------------------------
/src/test/scala/datapipeline/dsl/AwsDataPipelineSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright 2018 Shazam Entertainment Limited
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  *
7 |  * You may obtain a copy of the License at
8 |  * http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing,
11 |  * software distributed under the License is distributed on
12 |  * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
13 |  * either express or implied. See the License for the specific
14 |  * language governing permissions and limitations under the License
15 |  */
16 | 
17 | package datapipeline.dsl
18 | 
19 | import datapipeline.dsl.FailureAndRerunMode.Cascade
20 | import org.json4s._
21 | import org.json4s.native.JsonMethods._
22 | import org.scalatest.{Matchers, WordSpec}
23 | 
24 | import scala.concurrent.duration.DurationLong
25 | 
26 | class AwsDataPipelineSpec extends WordSpec with Matchers {
27 | 
28 |   implicit val formats: Formats = DefaultFormats
29 | 
30 |   "An AWS DataPipeline" can {
31 | 
32 |     "return pipeline defaults" should {
33 | 
34 |       "Default" in {
35 |         val pipelineJson =
36 |           basePipeline
37 |             .withDefaults(
38 |               failureAndRerunMode = Cascade,
39 |               role = Some("role"),
40 |               resourceRole = Some("resource-role"),
41 |               pipelineLogUri = Some("s3://log-uri/")
42 |             )
43 |             .withSnsAlarm(topicArn = "", subject = "", message = "")
44 |             .json
45 | 
46 |         objectWithId(pipelineJson, "Default") shouldBe Some(parse(
47 |           """{
48 |             |  "id": "Default",
49 |             |  "name": "Default",
50 |             |  "role": "role",
51 |             |  "resourceRole": "resource-role",
52 |             |  "failureAndRerunMode": "cascade",
53 |             |  "pipelineLogUri": "s3://log-uri/",
54 |             |  "scheduleType": "cron",
55 |             |  "schedule": {
56 |             |    "ref": "Schedule"
57 |             |  },
58 |             |  "onFail": {
59 |             |    "ref": "SnsAlarm"
60 |             |  }
61 |             |}
62 |           """.stripMargin
63 |         ))
64 |       }
65 |     }
66 | 
67 |     "return JSON for schedules" should {
68 | 
69 |       "cron schedule" in {
70 |         val startDateTime = "2018-01-01T00:00:00"
71 |         val cronPipelineJson =
72 |           AwsDataPipeline(name = "pipeline").withSchedule(frequency = Daily, startDateTimeIso = startDateTime).json
73 | 
74 |         objectsWithType(cronPipelineJson, "Schedule") shouldBe Seq(parse(
75 |           s"""{
76 |             |  "id": "Schedule",
77 |             |  "type": "Schedule",
78 |             |  "name": "Schedule",
79 |             |  "period": "1 day",
80 |             |  "startDateTime": "$startDateTime"
81 |             |}
82 |           """.stripMargin
83 |         ))
84 |       }
85 | 
86 |       "on-demand schedule" in {
87 |         val onDemandPipelineJson = AwsDataPipeline(name = "pipeline").withOnDemandSchedule.json
88 | 
89 |         objectsWithType(onDemandPipelineJson, "Schedule") shouldBe empty
90 | 
91 |         objectWithId(onDemandPipelineJson, "Default").map(_ \ "scheduleType") shouldBe Some(JString("ondemand"))
92 |         objectWithId(onDemandPipelineJson, "Default").map(_ \ "schedule") shouldBe Some(JNothing)
93 |       }
94 |     }
95 | 
96 |     "return JSON for actions" should {
97 | 
98 |       "SnsAlarm" in {
99 |         val pipelineJson =
100 |           basePipeline
101 |             .withSnsAlarm(
102 |               topicArn = "arn:topic:arn",
103 |               subject = "It failed!",
104 |               message = "Oh noes"
105 |             )
106 |             .json
107 | 
108 |         objectWithId(pipelineJson, "SnsAlarm") shouldBe Some(parse(
109 |           """{
110 |             |  "id": "SnsAlarm",
111 |             |  "type": "SnsAlarm",
112 |             |  "name": "SnsAlarm",
113 |             |  "topicArn": "arn:topic:arn",
114 |             |  "subject": "It failed!",
115 |             |  "message": "Oh noes"
116 |             |}
117 |           """.stripMargin
118 |         ))
119 |       }
120 |     }
121 | 
122 |     "return JSON for activities" should {
123 | 
124 |       "EmrActivity" in {
125 |         val emrActivity =
126 |           EmrActivity(
127 |             name = "emr activity",
128 |             emrCluster = EmrCluster(
129 |               name = "emr cluster"
130 |             ),
131 |             steps = Seq(
132 |               "step-1",
133 |               "step-2"
134 |             ),
135 |             preconditions = twoPreconditions,
136 |             attemptTimeout = Some(1.hour),
137 |             maximumRetries = Some(2),
138 |             retryDelay = Some(10.minutes)
139 |           )
140 | 
141 |         val pipelineJson = basePipeline.withActivities(emrActivity).json
142 | 
143 |         objectWithId(pipelineJson, emrActivity.id) shouldBe Some(parse(
144 |           """{
145 |             |  "id": "emr-activity",
146 |             |  "type": "EmrActivity",
147 |             |  "name": "emr activity",
148 |             |  "runsOn": {
149 |             |    "ref": "emr-cluster"
150 |             |  },
151 |             |  "step": [
152 |             |    "step-1",
153 |             |    "step-2"
154 |             |  ],
155 |             |  "precondition": [
156 |             |    {"ref": "precondition-1"},
157 |             |    {"ref": "precondition-2"}
158 |             |  ],
159 |             |  "attemptTimeout": "1 hour",
160 |             |  "maximumRetries": "2",
161 |             |  "retryDelay": "10 minutes",
162 |             |  "dependsOn": []
163 |             |}
164 |           """.stripMargin
165 |         ))
166 |       }
167 | 
168 |       "SqlActivity" in {
169 |         val sqlActivity =
170 |           SqlActivity(
171 |             name = "sql activity",
172 |             database = redshiftDatabase(
173 |               name = "redshift database"
174 |             ),
175 |             workerGroup = "worker-group",
176 |             script = "sql script",
177 |             preconditions = twoPreconditions,
178 |             attemptTimeout = Some(1.hour),
179 |             maximumRetries = Some(2),
180 |             retryDelay = Some(10.minutes)
181 |           )
182 | 
183 |         val pipelineJson = basePipeline.withActivities(sqlActivity).json
184 | 
185 |         objectWithId(pipelineJson, sqlActivity.id) shouldBe Some(parse(
186 |           """{
187 |             |  "id": "sql-activity",
188 |             |  "type": "SqlActivity",
189 |             |  "name": "sql activity",
190 |             |  "database": {
191 |             |    "ref": "redshift-database"
192 |             |  },
193 |             |  "workerGroup": "worker-group",
194 |             |  "script": "sql script",
195 |             |  "precondition": [
196 |             |    {"ref": "precondition-1"},
197 |             |    {"ref": "precondition-2"}
198 |             |  ],
199 |             |  "attemptTimeout": "1 hour",
200 |             |  "maximumRetries": "2",
201 |             |  "retryDelay": "10 minutes",
202 |             |  "dependsOn": []
203 |             |}
204 |           """.stripMargin
205 |         ))
206 |       }
207 | 
208 |       "ShellCommandActivity" in {
209 |         val shellCommandActivity = ShellCommandActivity(
210 |           name = "shell command activity",
211 |           workerGroup = "worker-group",
212 |           commandOrScriptUri = Command(command = "echo Hello $1 world", arguments = Seq("datapipeline")),
213 |           stdout = Some("s3://stdout/"),
214 |           stderr = Some("s3://stderr/"),
215 |           preconditions = twoPreconditions,
216 |           attemptTimeout = Some(1.hour),
217 |           maximumRetries = Some(2),
218 |           retryDelay = Some(10.minutes)
219 |         )
220 | 
221 |         val pipelineJson = basePipeline.withActivities(shellCommandActivity).json
222 | 
223 |         objectWithId(pipelineJson, shellCommandActivity.id) shouldBe Some(parse(
224 |           """{
225 |             |  "id": "shell-command-activity",
226 |             |  "type": "ShellCommandActivity",
227 |             |  "name": "shell command activity",
228 |             |  "command": "echo Hello $1 world",
229 |             |  "scriptArgument": [ "datapipeline" ],
230 |             |  "workerGroup": "worker-group",
231 |             |  "stdout": "s3://stdout/",
232 |             |  "stderr": "s3://stderr/",
233 |             |  "precondition": [
234 |             |    {"ref": "precondition-1"},
235 |             |    {"ref": "precondition-2"}
236 |             |  ],
237 |             |  "attemptTimeout": "1 hour",
238 |             |  "maximumRetries": "2",
239 |             |  "retryDelay": "10 minutes",
240 |             |  "dependsOn": []
241 |             |}
242 |           """.stripMargin
243 |         ))
244 |       }
245 |     }
246 | 
247 |     "return JSON for resources" should {
248 | 
249 |       "EmrCluster" in {
250 | 
251 |         val emrCluster = EmrCluster(
252 |           name = "emr cluster",
253 |           enableDebugging = Some(true),
254 |           releaseLabel = Some("emr-5.9.0"),
255 |           masterInstanceType = Some("c3.xlarge"),
256 |           coreInstanceCount = Some(6),
257 |           coreInstanceType = Some("c3.xlarge"),
258 |           coreInstanceBidPrice = Some(0.66),
259 |           useOnDemandOnLastAttempt = Some(true),
260 |           terminateAfter = Some(5.hours),
261 |           keyPair = Some("data-engineering"),
262 |           region = Some("us-east-1"),
263 |           emrLogUri = Some("s3://s3-log-bucket"),
264 |           applications = Seq("Spark"),
265 |           configuration = Some(EmrConfiguration(
266 |             name = "optimize spark",
267 |             classification = "spark",
268 |             properties = Map()
269 |           ))
270 |         )
271 | 
272 |         val pipelineJson = basePipeline
273 |           .withActivities(
274 |             EmrActivity(
275 |               name = "don't care",
276 |               emrCluster = emrCluster,
277 |               steps = Seq("")
278 |             )
279 |           )
280 |           .json
281 | 
282 |         objectWithId(pipelineJson, emrCluster.id) shouldBe Some(parse(
283 |           """{
284 |             |  "id": "emr-cluster",
285 |             |  "type": "EmrCluster",
286 |             |  "name": "emr cluster",
287 |             |  "enableDebugging": "true",
288 |             |  "releaseLabel": "emr-5.9.0",
289 |             |  "masterInstanceType": "c3.xlarge",
290 |             |  "coreInstanceCount": "6",
291 |             |  "coreInstanceType": "c3.xlarge",
292 |             |  "coreInstanceBidPrice": "0.66",
293 |             |  "useOnDemandOnLastAttempt": "true",
294 |             |  "terminateAfter": "5 hours",
295 |             |  "emrLogUri": "s3://s3-log-bucket",
296 |             |  "keyPair": "data-engineering",
297 |             |  "region": "us-east-1",
298 |             |  "applications": ["Spark"],
299 |             |  "configuration": {
300 |             |    "ref": "optimize-spark"
301 |             |  }
302 |             |
303 |             |}
304 |           """.stripMargin
305 |         ))
306 |       }
307 | 
308 |       "EmrConfiguration and Properties" in {
309 | 
310 |         val emrCluster = EmrCluster(
311 |           name = "emr cluster id",
312 |           configuration = Some(EmrConfiguration(
313 |             name = "optimize spark",
314 |             classification = "spark",
315 |             properties = Map(
316 |               "maximizeResourceAllocation" -> "true",
317 |               "reifyMonads" -> "false"
318 |             )
319 |           ))
320 |         )
321 | 
322 |         val pipelineJson = basePipeline
323 |           .withActivities(
324 |             EmrActivity(
325 |               name = "don't care",
326 |               emrCluster = emrCluster,
327 |               steps = Seq("")
328 |             )
329 |           )
330 |           .json
331 | 
332 |         objectWithId(pipelineJson, emrCluster.configuration.get.id) shouldBe Some(parse(
333 |           """{
334 |             |  "id": "optimize-spark",
335 |             |  "type": "EmrConfiguration",
336 |             |  "name": "optimize spark",
337 |             |  "classification": "spark",
338 |             |  "property": [
339 |             |    {
340 |             |      "ref": "optimize-spark-property-0"
341 |             |    },
342 |             |    {
343 |             |      "ref": "optimize-spark-property-1"
344 |             |    }
345 |             |  ]
346 |             |}
347 |           """.stripMargin
348 |         ))
349 | 
350 |         objectWithId(pipelineJson, "optimize-spark-property-0") shouldBe Some(parse(
351 |           """{
352 |             |  "id": "optimize-spark-property-0",
353 |             |  "type": "Property",
354 |             |  "name": "optimize spark property 0",
355 |             |  "key": "maximizeResourceAllocation",
356 |             |  "value": "true"
357 |             |}
358 |           """.stripMargin
359 |         ))
360 | 
361 |         objectWithId(pipelineJson, "optimize-spark-property-1") shouldBe Some(parse(
362 |           """{
363 |             |  "id": "optimize-spark-property-1",
364 |             |  "type": "Property",
365 |             |  "name": "optimize spark property 1",
366 |             |  "key": "reifyMonads",
367 |             |  "value": "false"
368 |             |}
369 |           """.stripMargin
370 |         ))
371 |       }
372 | 
373 |     }
374 | 
375 |     "return JSON for preconditions" should {
376 | 
377 |       "S3KeyExists" in {
378 |         val s3KeyExists = S3KeyExists(
379 |           name = "s3 key exists",
380 |           s3Key = "s3://key/",
381 |           preconditionTimeout = Some(1.minute),
382 |           attemptTimeout = Some(1.hour),
383 |           maximumRetries = Some(2),
384 |           retryDelay = Some(10.minutes)
385 |         )
386 | 
387 |         val pipelineJson = basePipeline.withActivities(
388 |           EmrActivity(
389 |             name = "don't-care",
390 |             emrCluster = EmrCluster(
391 |               name = "don't-care"
392 |             ),
393 |             steps = Seq(""),
394 |             preconditions = Seq(
395 |               s3KeyExists
396 |             )
397 |           )
398 |         ).json
399 | 
400 |         objectWithId(pipelineJson, s3KeyExists.id) shouldBe Some(parse(
401 |           """{
402 |             |  "id": "s3-key-exists",
403 |             |  "type": "S3KeyExists",
404 |             |  "name": "s3 key exists",
405 |             |  "s3Key": "s3://key/",
406 |             |  "preconditionTimeout": "1 minute",
407 |             |  "attemptTimeout": "1 hour",
408 |             |  "maximumRetries": "2",
409 |             |  "retryDelay": "10 minutes"
410 |             |}
411 |           """.stripMargin
412 |         ))
413 |       }
414 | 
415 |       "S3PrefixNotEmpty" in {
416 |         val s3PrefixNotEmpty = S3PrefixNotEmpty(
417 |           name = "s3 prefix not empty",
418 |           s3Prefix = "s3://prefix/",
419 |           preconditionTimeout = Some(1.minute),
420 |           attemptTimeout = Some(1.hour),
421 |           maximumRetries = Some(2),
422 |           retryDelay = Some(10.minutes)
423 |         )
424 | 
425 |         val pipelineJson = basePipeline.withActivities(
426 |           EmrActivity(
427 |             name = "don't-care",
428 |             emrCluster = EmrCluster(
429 |               name = "don't-care"
430 |             ),
431 |             steps = Seq(""),
432 |             preconditions = Seq(
433 |               s3PrefixNotEmpty
434 |             )
435 |           )
436 |         ).json
437 | 
438 |         objectWithId(pipelineJson, s3PrefixNotEmpty.id) shouldBe Some(parse(
439 |           """{
440 |             |  "id": "s3-prefix-not-empty",
441 |             |  "type": "S3PrefixNotEmpty",
442 |             |  "name": "s3 prefix not empty",
443 |             |  "s3Prefix": "s3://prefix/",
444 |             |  "preconditionTimeout": "1 minute",
445 |             |  "attemptTimeout": "1 hour",
446 |             |  "maximumRetries": "2",
447 |             |  "retryDelay": "10 minutes"
448 |             |}
449 |           """.stripMargin
450 |         ))
451 |       }
452 | 
453 |       "ShellCommandPrecondition" in {
454 |         val shellCommandPrecondition = ShellCommandPrecondition(
455 |           name = "shell command precondition",
456 |           commandOrScriptUri = Command(command = "echo $1", arguments = Seq("hello")),
457 |           stdout = Some("s3://stdout/"),
458 |           stderr = Some("s3://stderr/"),
459 |           preconditionTimeout = Some(1.minute),
460 |           attemptTimeout = Some(1.hour),
461 |           maximumRetries = Some(2),
462 |           retryDelay = Some(10.minutes)
463 |         )
464 | 
465 |         val pipelineJson = basePipeline.withActivities(
466 |           EmrActivity(
467 |             name = "don't-care",
468 |             emrCluster = EmrCluster(
469 |               name = "don't-care"
470 |             ),
471 |             steps = Seq(""),
472 |             preconditions = Seq(
473 |               shellCommandPrecondition
474 |             )
475 |           )
476 |         ).json
477 | 
478 |         objectWithId(pipelineJson, shellCommandPrecondition.id) shouldBe Some(parse(
479 |           """{
480 |             |  "id": "shell-command-precondition",
481 |             |  "type": "ShellCommandPrecondition",
482 |             |  "name": "shell command precondition",
483 |             |  "command": "echo $1",
484 |             |  "scriptArgument": [ "hello" ],
485 |             |  "stdout": "s3://stdout/",
486 |             |  "stderr": "s3://stderr/",
487 |             |  "preconditionTimeout": "1 minute",
488 |             |  "attemptTimeout": "1 hour",
489 |             |  "maximumRetries": "2",
490 |             |  "retryDelay": "10 minutes"
491 |             |}
492 |           """.stripMargin
493 |         ))
494 |       }
495 |     }
496 | 
497 |     "return JSON for databases" should {
498 | 
499 |       "RedshiftDatabase" in {
500 |         val pipelineJson = basePipeline.withActivities(
501 |           SqlActivity(
502 |             name = "don't care",
503 |             script = "",
504 |             database = RedshiftDatabase(
505 |               name = "redshift database",
506 |               username = "user",
507 |               password = "pass",
508 |               clusterId = "cluster-id",
509 |               region = Some("us-east-1")
510 |             ),
511 |             workerGroup = "worker-group"
512 |           )
513 |         ).json
514 | 
515 |         objectWithId(pipelineJson, "redshift-database") shouldBe Some(parse(
516 |           """{
517 |             |  "id": "redshift-database",
518 |             |  "type": "RedshiftDatabase",
519 |             |  "name": "redshift database",
520 |             |  "username": "user",
521 |             |  "*password": "pass",
522 |             |  "clusterId": "cluster-id",
523 |             |  "region": "us-east-1"
524 |             |}
525 |           """.stripMargin
526 |         ))
527 |       }
528 | 
529 |     }
530 | 
531 |     "express dependencies between activities" should {
532 | 
533 |       "a simple A >> B dependency" in {
534 |         val pipeline =
535 |           basePipeline
536 |             .withActivities(
537 |               emrActivity("A") >> emrActivity("B")
538 |             )
539 | 
540 |         val activities = emrActivityObjects(pipeline)
541 | 
542 |         activities should have length 2
543 |         activities.head.dependsOn shouldBe Some(Nil)
544 |         activities.last.dependsOn shouldBe Some(Seq(DependsOn(ref = "A")))
545 |       }
546 | 
547 |       "an A >> B >> C dependency chain" in {
548 |         val pipeline =
549 |           basePipeline
550 |             .withActivities(
551 |               emrActivity("A") >> emrActivity("B") >> emrActivity("C")
552 |             )
553 | 
554 |         val activities = emrActivityObjects(pipeline)
555 | 
556 |         activities should have length 3
557 |         activities(0).dependsOn shouldBe Some(Nil)
558 |         activities(1).dependsOn shouldBe Some(Seq(DependsOn(ref = "A")))
559 |         activities(2).dependsOn shouldBe Some(Seq(DependsOn(ref = "B")))
560 |       }
561 | 
562 |       "an A >> (B,C) dependency chain" in {
563 |         val pipeline =
564 |           basePipeline
565 |             .withActivities(
566 |               emrActivity("A") >> (
567 |                 emrActivity("B"),
568 |                 emrActivity("C")
569 |               )
570 |             )
571 | 
572 |         val activities = emrActivityObjects(pipeline)
573 | 
574 |         activities should have length 3
575 |         activities.find(_.id == "A").get.dependsOn shouldBe Some(Nil)
576 |         activities.find(_.id == "B").get.dependsOn shouldBe Some(Seq(DependsOn(ref = "A")))
577 |         activities.find(_.id == "C").get.dependsOn shouldBe Some(Seq(DependsOn(ref = "A")))
578 |       }
579 | 
580 |       "an (A,B) >> C dependency chain" in {
581 |         val pipeline =
582 |           basePipeline
583 |             .withActivities(
584 |               Seq(
585 |                 emrActivity("A"),
586 |                 emrActivity("B")
587 |               ) >>
588 |                 emrActivity("C")
589 |             )
590 | 
591 |         val activities = emrActivityObjects(pipeline)
592 | 
593 |         activities should have length 3
594 |         activities.find(_.id == "A").get.dependsOn shouldBe Some(Nil)
595 |         activities.find(_.id == "B").get.dependsOn shouldBe Some(Nil)
596 |         activities.find(_.id == "C").get.dependsOn shouldBe Some(Seq(DependsOn(ref = "A"), DependsOn(ref = "B")))
597 |       }
598 | 
599 |       "an (A,B) >> (C,D) dependency chain" in {
600 |         val pipeline =
601 |           basePipeline
602 |             .withActivities(
603 |               Seq(
604 |                 emrActivity("A"),
605 |                 emrActivity("B")
606 |               ) >>
607 |                 (
608 |                   emrActivity("C"),
609 |                   emrActivity("D")
610 |                 )
611 |             )
612 | 
613 |         val activities = emrActivityObjects(pipeline)
614 | 
615 |         activities should have length 4
616 |         activities.find(_.id == "A").get.dependsOn shouldBe Some(Nil)
617 |         activities.find(_.id == "B").get.dependsOn shouldBe Some(Nil)
618 |         activities.find(_.id == "C").get.dependsOn shouldBe Some(Seq(DependsOn(ref = "A"), DependsOn(ref = "B")))
619 |         activities.find(_.id == "D").get.dependsOn shouldBe Some(Seq(DependsOn(ref = "A"), DependsOn(ref = "B")))
620 |       }
621 | 
622 |       "an (A,B) >> (C,D) dependency chain expressed long-hand" in {
623 |         val activityC = emrActivity("C")
624 |         val activityD = emrActivity("D")
625 |         val pipeline =
626 |           basePipeline
627 |             .withActivities(
628 |               emrActivity("A") >> (activityC, activityD),
629 |               emrActivity("B") >> (activityC, activityD)
630 |             )
631 | 
632 |         val activities = emrActivityObjects(pipeline)
633 | 
634 |         activities should have length 4
635 |         activities.find(_.id == "A").get.dependsOn shouldBe Some(Nil)
636 |         activities.find(_.id == "B").get.dependsOn shouldBe Some(Nil)
637 |         activities.find(_.id == "C").get.dependsOn shouldBe Some(Seq(DependsOn(ref = "A"), DependsOn(ref = "B")))
638 |         activities.find(_.id == "D").get.dependsOn shouldBe Some(Seq(DependsOn(ref = "A"), DependsOn(ref = "B")))
639 |       }
640 | 
641 |       "an (A,B) >> C dependency where each instance of C is an equal but distinct object" in {
642 |         val activityC = emrActivity("C")
643 |         val pipeline =
644 |           basePipeline
645 |             .withActivities(
646 |               emrActivity("A") >> activityC,
647 |               emrActivity("B") >> activityC.copy()
648 |             )
649 | 
650 |         val activities = emrActivityObjects(pipeline)
651 | 
652 |         activities should have length 3
653 |         activities.find(_.id == "A").get.dependsOn shouldBe Some(Nil)
654 |         activities.find(_.id == "B").get.dependsOn shouldBe Some(Nil)
655 |         activities.find(_.id == "C").get.dependsOn shouldBe Some(Seq(DependsOn(ref = "A"), DependsOn(ref = "B")))
656 |       }
657 | 
658 |       "fail on an A >> B >> A cyclical dependency chain" in {
659 |         val activityA = emrActivity("Cycle A")
660 |         val activityB = emrActivity("Cycle B")
661 |         val pipeline =
662 |           basePipeline
663 |             .withActivities(
664 |               activityA >>
665 |                 activityB >>
666 |                 activityA
667 |             )
668 | 
669 |         val thrown = the[CyclicalActivitiesException] thrownBy emrActivityObjects(pipeline)
670 | 
671 |         thrown.getMessage shouldBe "Cyclical DAG detected when visiting activity with id: 'Cycle-A' with already visited activities: 'Cycle-A', 'Cycle-B'"
672 |       }
673 | 
674 |       "fail if activities share an id" in {
675 |         val pipeline =
676 |           basePipeline
677 |             .withActivities(
678 |               emrActivity("Shared Id"),
679 |               emrActivity("Shared Id", emrClusterId = "alternate-emr-cluster-id")
680 |             )
681 | 
682 |         val thrown = the[DuplicateActivityException] thrownBy emrActivityObjects(pipeline)
683 | 
684 |         thrown.getMessage shouldBe "Duplicate activities detected with id: 'Shared-Id'"
685 |       }
686 | 
687 |       "allow a dependency declared more than once" in {
688 |         val activityA = emrActivity("A")
689 |         val activityB = emrActivity("B")
690 |         val pipeline =
691 |           basePipeline
692 |             .withActivities(
693 |               activityA >> activityB,
694 |               activityA >> activityB
695 |             )
696 | 
697 |         val activities = emrActivityObjects(pipeline)
698 | 
699 |         activities should have length 2
700 |         activities.find(_.id == "A").get.dependsOn shouldBe Some(Nil)
701 |         activities.find(_.id == "B").get.dependsOn shouldBe Some(Seq(DependsOn(ref = "A")))
702 |       }
703 | 
704 |     }
705 | 
706 |     "convert names to ids" should {
707 | 
708 |       "basic Latin letters and digits, as well as [*.-_], are permitted" in {
709 |         val validId = "ABC-YXZ_abc-xyz_0-9_.*"
710 |         emrActivity(name = validId).id shouldBe validId
711 |       }
712 | 
713 |       "spaces become dashes" in {
714 |         emrActivity(name = "EMR Activity").id shouldBe "EMR-Activity"
715 |       }
716 | 
717 |       "characters outside the valid set become underscores" in {
718 |         emrActivity(name = "\u0000\u001f\u007f").id shouldBe "___"
719 |       }
720 | 
721 |     }
722 | 
723 |   }
724 | 
725 |   val basePipeline: PipelineBuilder = AwsDataPipeline(name = "base-pipeline")
726 |     .withSchedule(frequency = Daily, startDateTimeIso = "2018-01-01T00:00:00")
727 | 
728 |   def objectWithId(pipelineJson: JObject, objectId: String): Option[JObject] = {
729 |     (pipelineJson \ "objects").find(_ \ "id" == JString(objectId)).map(_.asInstanceOf[JObject])
730 |   }
731 | 
732 |   def objectsWithType(pipelineJson: JObject, typeName: String): Seq[JObject] = {
733 |     (pipelineJson \ "objects").filter(_ \ "type" == JString(typeName)).map(_.asInstanceOf[JObject])
734 |   }
735 | 
736 |   def pipelineObjects(pipelineBuilder: PipelineBuilder): Seq[TestDataPipelineObject] = {
737 |     pipelineBuilder
738 |       .json
739 |       .extract[TestDataPipeline]
740 |       .objects
741 |   }
742 | 
743 |   def emrActivityObjects(pipelineBuilder: PipelineBuilder): Seq[TestDataPipelineObject] = {
744 |     pipelineObjects(pipelineBuilder).filter(_.`type`.contains("EmrActivity"))
745 |   }
746 | 
747 |   def emrActivity(name: String, emrClusterId: String = "emr-cluster-id", steps: Seq[String] = Seq("")): EmrActivity =
748 |     EmrActivity(name = name, emrCluster = EmrCluster(name = emrClusterId), steps = steps)
749 | 
750 |   def redshiftDatabase(name: String): RedshiftDatabase = RedshiftDatabase(name = name, username = "username", password = "password", clusterId = "cluster-id")
751 | 
752 |   def s3PrefixNotEmpty(name: String): S3PrefixNotEmpty = S3PrefixNotEmpty(name = name, s3Prefix = "s3://prefix/")
753 | 
754 |   val twoPreconditions = Seq(
755 |     s3PrefixNotEmpty(name = "precondition 1"),
756 |     s3PrefixNotEmpty(name = "precondition 2")
757 |   )
758 | 
759 | }
760 | 
761 | case class TestDataPipeline(objects: Seq[TestDataPipelineObject])
762 | 
763 | case class TestDataPipelineObject(id: String, `type`: Option[String], dependsOn: Option[Seq[DependsOn]])
764 | 
765 | case class DependsOn(ref: String)
766 | 
--------------------------------------------------------------------------------
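Putting the pieces together, the spec above implies end-to-end usage along the following lines. This is a minimal sketch: the builder calls (`AwsDataPipeline`, `withSchedule`, `withSnsAlarm`, `withActivities`, `.json`) and the `>>` operator are the library's own, as exercised in the tests, while the pipeline name, ARN and step strings are placeholders, and reusing one `EmrCluster` value across activities mirrors how the spec's helpers build equal cluster objects.

import datapipeline.dsl._
import org.json4s.JObject
import org.json4s.native.JsonMethods._

object NightlyReportPipeline {
  // Placeholder resource shared by both activities.
  private val cluster = EmrCluster(name = "report cluster")

  private val extract = EmrActivity(
    name = "extract",
    emrCluster = cluster,
    steps = Seq("example-extract-step") // placeholder EMR step
  )

  private val load = EmrActivity(
    name = "load",
    emrCluster = cluster,
    steps = Seq("example-load-step") // placeholder EMR step
  )

  // "load" runs only after "extract" has succeeded.
  val pipelineJson: JObject =
    AwsDataPipeline(name = "nightly-report")
      .withSchedule(frequency = Daily, startDateTimeIso = "2018-01-01T00:00:00")
      .withSnsAlarm(
        topicArn = "arn:aws:sns:us-east-1:000000000000:alerts", // placeholder ARN
        subject = "nightly-report failed",
        message = "See the pipeline logs"
      )
      .withActivities(extract >> load)
      .json

  // Render the definition as the JSON document AWS Data Pipeline expects.
  def main(args: Array[String]): Unit = println(pretty(render(pipelineJson)))
}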