├── .github
│   └── workflows
│       └── scala.yml
├── .gitignore
├── LICENSE
├── README.md
├── build.sbt
└── src
    ├── main
    │   └── scala
    │       └── glue
    │           └── ExampleJob.scala
    └── test
        └── scala
            └── ExampleSpec.scala

/.github/workflows/scala.yml:
--------------------------------------------------------------------------------
name: Scala CI

on:
  push:
    branches: [ master ]

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v2
    - name: Set up JDK 1.8
      uses: actions/setup-java@v1
      with:
        java-version: 1.8
    - name: Compile
      run: sbt clean compile

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
target
project
.metals
.bloop

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 Gamesight

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# AWS GLUE LOCAL SCALA

This is a tool for developing and testing AWS Glue scripts written in Scala. It uses sbt to manage the resources needed for local testing. It was inspired by the AWS documentation on testing Glue locally (https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-libraries.html), which suggests installing packages locally and running with Maven. At Gamesight, we wanted something that we could easily integrate into our CI/CD systems, and this tool enables us to do that.

### Dependencies
* Java 8 - later versions of Java will not work with AWS Glue
* sbt 1.3.10 - get it here: https://www.scala-sbt.org/index.html
* Scala 2.11.x - AWS Glue 1.0 is built against Scala 2.11; this project pins 2.11.1 in build.sbt

### Setup

1. Clone the repository.
2. Update the test.
   Edit the following portion of ExampleSpec.scala with real S3 bucket names and prefixes that you have privileges to use (List, Read, Write).
   ```
   io.gamesight.AWSGlue.ExampleJob.main(Array(
     "--JOB_NAME", "job",
     "--stage", "dev",
     "--inputBucket", "",
     "--outputBucket", "",
     "--inputPrefix", "",
     "--outputPrefix", ""
   ))
   ```
3. Compile the package.
   `sbt clean compile`
4. Verify that your AWS credentials are active.
5. Run the test example.
   `sbt test`

### Usage

We suggest that you start with ExampleJob.scala as the boilerplate for development. It includes some additional handling for stage separation that allows the script to run locally.
The calls to `Job` functions should only run during executions within the AWS Glue environment. Additionally, when `dev` is passed as the `--stage` argument, the example adds local configuration to the `SparkContext` without affecting deployed executions. The example can be executed either locally or deployed, as long as the correct arguments are passed.

While it is possible to run ExampleJob, or any user-defined job, from the command line with `sbt "run <args>"`, we suggest triggering executions from within a testing framework. The example uses ScalaTest (http://www.scalatest.org/), which allows easy control over the main class arguments and gives the ability to add assertions. This also fits our goal of integrating AWS Glue job scripts into a CI/CD flow.

NOTE: Testing locally should be done with a small data set. Local executions do not have the parallelism benefits of a true distributed cluster; instead, local testing runs on a single-node cluster.

### Deployment

To deploy your Glue job, either copy and paste the contents of your tested script using the AWS console, or upload the script to S3 using your preferred deployment tool.

The following YAML template has all of the necessary CloudFormation details for deployment; replace the `<...>` values with your own.

```
ExampleGlueJob:
  Type: AWS::Glue::Job
  Properties:
    Name: example-job
    Role: !Ref <GlueJobRole>
    GlueVersion: "1.0"
    ExecutionProperty:
      MaxConcurrentRuns: 1
    Command:
      Name: glueetl
      ScriptLocation:
        Fn::Join:
          - ""
          - - s3://
            - <bucket>
            - /
            - <prefix>
            - /
            - ExampleJob.scala
    DefaultArguments:
      "--stage": <stage>
      "--job-language": scala
      "--class": io.gamesight.AWSGlue.ExampleJob
      "--TempDir": s3://<bucket>/temp/
```

### How to Collaborate

If you have questions or ideas, feel free to post issues.

### About Us

Visit https://gamesight.io/ to learn more.
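As a footnote to the Usage section: the stage-separation rule described there reduces to a single predicate — `Job` calls run only for deployed stages, while `dev` selects local execution. A minimal sketch of that gate (the `StageGate` object and its names are illustrative, not part of this repository):

```scala
object StageGate {
  // Stages that correspond to real AWS Glue executions. Job.init/Job.commit
  // should run only for these; "dev" (used by the local test) skips them.
  val deployedStages: Set[String] = Set("prod", "staging")

  def isDeployedStage(stage: String): Boolean = deployedStages.contains(stage)
}
```

ExampleJob.scala inlines this check as `args("stage") == "prod" || args("stage") == "staging"`; a helper like this keeps the gate in one place if you add more deployed stages.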
--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
ThisBuild / scalaVersion := "2.11.1"
ThisBuild / organization := "io.gamesight"

lazy val glueetl = (project in file("."))
  .settings(
    name := "aws-glue-local-scala",
    resolvers ++= Seq(
      "aws-glue-etl-artifacts" at "https://aws-glue-etl-artifacts.s3.amazonaws.com/release/"
    ),
    libraryDependencies ++= Seq(
      "com.amazonaws" % "AWSGlueETL" % "1.0.0",
      "org.apache.logging.log4j" % "log4j-core" % "2.13.1",
      "org.apache.spark" %% "spark-core" % "2.4.3" % "provided",
      "org.apache.spark" %% "spark-mllib" % "2.4.3" % "provided",
      "org.apache.spark" %% "spark-sql" % "2.4.3" % "provided",
      "org.scalactic" %% "scalactic" % "3.1.1",
      "org.scalamock" %% "scalamock" % "4.4.0" % "test",
      "org.scalatest" %% "scalatest" % "3.1.1" % "test",
      "software.amazon.awssdk" % "aws-sdk-java" % "2.13.0"
    ),
    dependencyOverrides ++= Seq(
      "com.fasterxml.jackson.core" % "jackson-core" % "2.6.7",
      "com.fasterxml.jackson.core" % "jackson-databind" % "2.6.7",
      "com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.6.7.1"
    )
  )

--------------------------------------------------------------------------------
/src/main/scala/glue/ExampleJob.scala:
--------------------------------------------------------------------------------
/******************************************************************************\
* An example that includes all necessary boilerplate code for a deployable   *
* AWS Glue job script in Scala that can also be tested locally with          *
* scalatest and sbt.                                                         *
* The script reads JSON objects into a DynamicFrame from an S3 DataSource,   *
* inspects and prints the schema, and then writes it as parquet to another   *
* S3 location.                                                               *
*                                                                            *
* Org: Gamesight - https://gamesight.io                                      *
* Author: jeremy@gamesight.io                                                *
* License: MIT                                                               *
* Copyright (c) 2020 Gamesight                                               *
\******************************************************************************/

package io.gamesight.AWSGlue

import com.amazonaws.services.glue.util.JsonOptions
import com.amazonaws.services.glue.{DynamicFrame, GlueContext, DataSink, DataSource}
import org.apache.spark.{SparkContext, SparkConf}
import com.amazonaws.services.glue.util.Job
import com.amazonaws.services.glue.util.GlueArgParser
import scala.collection.JavaConverters._

object ExampleJob {

  def main(sysArgs: Array[String]): Unit = {
    // Read in the arguments:
    //   JOB_NAME - usually supplied automatically by AWS but must be included in the test.
    //              DO NOT OVERRIDE JOB_NAME IN DEPLOYED CODE
    //   stage - the stage of production. Suggested values: "dev", "staging", "prod"
    //   inputBucket - the bucket the DataSource will read from
    //   inputPrefix - the specific prefix (must end in "/") for the source
    //   outputBucket - the bucket the DataSink will write to
    //   outputPrefix - the prefix to prepend to files when writing
    val args = GlueArgParser.getResolvedOptions(
      sysArgs,
      Seq(
        "JOB_NAME", "stage", "inputBucket", "outputBucket", "inputPrefix", "outputPrefix"
      ).toArray)

    println("Initializing Spark and GlueContext")

    /**********************************************************************\
    * Here we initialize the SparkContext. If we are running locally,    *
    * we need a SparkConf that declares a local, single-node cluster.    *
    \**********************************************************************/
    val sc: SparkContext = if (args("stage") == "dev") {
      // For testing, we need to use local execution
      val conf = new SparkConf().setAppName("GlueExample").setMaster("local")
      new SparkContext(conf)
    } else {
      new SparkContext()
    }

    sc.setLogLevel("FATAL") // this can be changed to INFO, ERROR, or WARN
    val glueContext: GlueContext = new GlueContext(sc)

    /**********************************************************************\
    * Job actions should only happen when executed by AWS Glue, so we    *
    * check the stage first. These checks may need updating if you use   *
    * different names for your deployed stages. For this example, Job    *
    * commands run when either "prod" or "staging" is passed as the      *
    * "--stage" argument; the example test script uses "dev".            *
    \**********************************************************************/
    if (args("stage") == "prod" || args("stage") == "staging") {
      // args is a Map, so use getOrElse for the fallback rather than a
      // null check (which would never fire on a missing key)
      Job.init(args.getOrElse("JOB_NAME", "test"), glueContext, args.asJava)
    }

    // Set the connection options using the --inputBucket and --inputPrefix arguments
    val connectionOptions = JsonOptions(Map(
      "paths" -> Seq(s"s3://${args("inputBucket")}/${args("inputPrefix")}"),
      "compression" -> "gzip",
      "groupFiles" -> "inPartition",
      "groupSize" -> (1024 * 1024 * 64).toString()
    ))

    println("Getting Frame")

    // Create the DataSource
    val source: DataSource = glueContext.getSourceWithFormat(
      connectionType = "s3",
      options = connectionOptions,
      transformationContext = "",
      format = "json",
      formatOptions = JsonOptions.empty
    )

    // Convert the source to a DynamicFrame
    val frame: DynamicFrame = source.getDynamicFrame()

    println("Got Frame")

    // Print the schema of our data to the console
    frame.printSchema()

    println("Creating Sink")

    // Create the sink, using the --outputBucket and --outputPrefix arguments
    val sink: DataSink = glueContext.getSinkWithFormat(
      connectionType = "s3",
      options = JsonOptions(Map("path" -> s"s3://${args("outputBucket")}/${args("outputPrefix")}")),
      format = "parquet",
      transformationContext = ""
    )

    println("Writing to Sink")

    // Write the frame to our output destination
    sink.writeDynamicFrame(frame)

    println("Wrote Frame")

    // Job actions should only happen when executed by AWS Glue, so we ensure correct stage
    if (args("stage") == "prod" || args("stage") == "staging") {
      Job.commit()
    }

  }
}

--------------------------------------------------------------------------------
/src/test/scala/ExampleSpec.scala:
--------------------------------------------------------------------------------
/******************************************************************************\
* A class that runs a local execution of an AWS Glue job within a scalatest  *
* suite. Rather than running local executions directly, it is preferable to  *
* call them from a test framework, where we are able to add assertions for   *
* verification.                                                              *
*                                                                            *
* Org: Gamesight - https://gamesight.io                                      *
* Author: jeremy@gamesight.io                                                *
* License: MIT                                                               *
* Copyright (c) 2020 Gamesight                                               *
\******************************************************************************/

import org.scalatest.funspec.AnyFunSpec

class ExampleSpec extends AnyFunSpec {
  describe("Example") {
    it("should run the job") {

      println(s"Starting ExampleJob at ${new java.util.Date()}")

      // Trigger the execution by directly calling the main class and supplying
      // arguments.
      // AWS Glue job arguments always begin with "--" so that the resolver
      // can correctly convert them to a Map.
      io.gamesight.AWSGlue.ExampleJob.main(Array(
        "--JOB_NAME", "job",
        "--stage", "dev",
        "--inputBucket", "",
        "--outputBucket", "",
        "--inputPrefix", "",
        "--outputPrefix", ""
      ))

      println(s"ExampleJob Finished at ${new java.util.Date()}")

    }
  }
}

--------------------------------------------------------------------------------
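The `--` convention noted in the spec's comment can be illustrated with a tiny stand-in for the resolver. This is a simplified sketch, not the actual `GlueArgParser` implementation (it does no validation of missing values or unknown options):

```scala
object MiniArgResolver {
  // Walk the argument array two elements at a time, pairing each "--name"
  // flag with the value that follows it, and build a Map keyed without the
  // leading dashes -- a simplified version of what GlueArgParser does.
  def resolve(args: Array[String]): Map[String, String] =
    args.grouped(2).collect {
      case Array(flag, value) if flag.startsWith("--") =>
        flag.stripPrefix("--") -> value
    }.toMap
}
```

For example, `MiniArgResolver.resolve(Array("--stage", "dev"))` yields `Map("stage" -> "dev")`, which is why every argument in the test above, including `--JOB_NAME`, carries the `--` prefix.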