├── .gitignore ├── .scalafmt.conf ├── README.md ├── build.sbt ├── project ├── Dependencies.scala ├── build.properties ├── metals.sbt ├── plugins.sbt └── project │ ├── metals.sbt │ └── project │ └── metals.sbt ├── screenshots ├── 1-make-buckets.png ├── 1.5-change-bucekt-name.png ├── 10-running-step.png ├── 11-history-server.png ├── 12-complete-step.png ├── 13-download-data.png ├── 14-display-data.png ├── 2-sbt-assembly.png ├── 3-upload-jar.png ├── 4-create-cluster.png ├── 5-step-execution.png ├── 6-spark-application.png ├── 7-configure-step.png ├── 7.5-configured-step.png ├── 8-finish-create-cluster.png └── 9-steps-tab.png └── src └── main └── scala └── com └── revature └── commoncrawlemrdemo └── Runner.scala /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | .bloop 3 | .metals 4 | .vscode -------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | version = "2.7.4" 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # EMR Tutorial 2 | This guide walks you through submitting a Scala Spark application to EMR that queries 500k job urls from Common Crawl and saves the results to an S3 bucket in CSV format. 3 | 4 | Running the application on EMR will cost about 50 cents. 5 | 6 | ## Prerequisites 7 | This isn't required, but I highly reccomend this 6 minute intro to EMR. It's the simplest EMR hello world you could ask for: 8 | 9 | https://www.youtube.com/watch?v=gOT7El8rMws&ab_channel=JohnnyChivers 10 | 11 | ## S3 Setup 12 | Create two new S3 buckets. 13 | The first will used be to upload your Spark application jar file. 14 | The second will store the output data produced by your application. 15 | I'll refer to these as your "input" and "output" buckets. 
16 | 17 | I named my two buckets `input-bucket-revusf` and `output-bucket-revusf` 18 | 19 | ```shell 20 | s3cmd mb s3://input-bucket-revusf 21 | s3cmd mb s3://output-bucket-revusf 22 | ``` 23 | 24 | I use `s3cmd` for throughout this guide, but feel free to use AWS-cli or the S3 console. 25 | 26 | ## Clone the Repo 27 | We're going to build our application jar locally then submit it to EMR. 28 | 29 | To start, let's clone the repo: 30 | 31 | ```shell 32 | git clone https://github.com/haydenhw/commoncrawl-emr-tutorial 33 | ``` 34 | ```shell 35 | cd commoncrawl-emr-tutorial 36 | ``` 37 | 38 | ## Update the output bucket name 39 | You'll need to modifiy one line of the application Runner to tell Spark where to find the "output" bucket you created earlier: 40 | 41 | `src/main/scala/com/revature/commoncrawlemrdemo/Runner.scala` 42 | 43 | Replace YOUR-BUCKET-NAME with the name of your output bucket at the line shown below. 44 | 45 | ![](screenshots/1.5-change-bucekt-name.png) 46 | 47 | Since I named my output bucket `output-bucket-revusf` the string would become `"s3a://output-bucket-revusf/commoncrawl-demo-data"` 48 | 49 | When the application runs it will create a folder inside your bucket called `commoncrawl-demo-data` and store the results there. 50 | 51 | ## Create a jar file 52 | Now simply build the application as usual with sbt assembly: 53 | 54 | `sbt assembly` 55 | 56 | *Note: If you try to run this locally it will fail. I've ommitted the dependencies needed to connect to S3 locally to minimize the size of the jar file* 57 | 58 | ## Upload the jar file 59 | Once sbt assembly completes we need to upload our jar file to the "input" bucket we created earlier in S3. 60 | 61 | ```shell 62 | s3cmd put target/scala-2.11/commoncrawl-emr-demo-assembly-0.1.0-SNAPSHOT.jar s3://input-bucket-revusf 63 | ``` 64 | 65 | ![](screenshots/3-upload-jar.png) 66 | 67 | *Note: If you're uploading your jar file using the S3 console and it's taking a long time, try a CLI tool instead. 
I've found that to be much faster* 68 | 69 | ## Create an EMR Cluster 70 | Now open the AWS EMR conosole in your browser and click the **Create cluster** button 71 | 72 | ![](screenshots/4-create-cluster.png) 73 | 74 | ## Name Cluster and set Launch Mode 75 | 1. Give your cluster a name 76 | 2. Select the **Step execution** option for **Launch mode** 77 | 78 | ![](screenshots/5-step-execution.png) 79 | 80 | Step execution will automatically terminate the cluster after our application completes. This is nice beacause we won't need to worry about accidentally leaving the cluster running and racking up charges when we aren't using it. 81 | 82 | ## Set Step type and Configure 83 | 1.Select **Spark application** for **Step type** 84 | 2.Click **Configure** 85 | 86 | ![](screenshots/6-spark-application.png) 87 | 88 | ## Spark application configuration 89 | 1. For **Spark-submit options** provide the path to your Runner class: 90 | `--class com.revature.commoncrawlemrdemo.Runner` 91 | 2. Here we need to tell EMR where to find our jar file on S3. Click the folder icon then locate and select the jar file you uploaded earlier. 92 | 3. Select **Terminate cluster** for **Action on failure** 93 | 94 | ![](screenshots/7-configure-step.png) 95 | 96 | ## Finish Cluster Creation 97 | Leave the rest of the settings as default and click the **Create cluster** button 98 | 99 | ![](screenshots/8-finish-create-cluster.png) 100 | 101 | ## Monitor your application 102 | Open the **Steps** tab 103 | 104 | At first the application status will show Pending. After 5-10 minutes it will change to Running 105 | ![](screenshots/9-steps-tab.png) 106 | ![](screenshots/10-running-step.png) 107 | If a job you submit ever fails, click the **stderr** link to see debug logs. 108 | 109 | ## Check on your application progress 110 | Once the application is in a Running status you can monitor progress in the **Spark history server**. 
111 | ![](screenshots/11-history-server.png) 112 | 113 | ## Download your output data 114 | 115 | As you can see the job took 38 minutes to complete. This is about on par with other similar queries I've run on the columnar index. 116 | ![](screenshots/12-complete-step.png) 117 | 118 | After the job is finished, your ouput bucket should be populated with a nice CSV file packed full of job URLS 119 | 120 | ![](screenshots/13-download-data.png) 121 | ![](screenshots/14-display-data.png) 122 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | ThisBuild / scalaVersion := "2.11.12" 4 | ThisBuild / version := "0.1.0-SNAPSHOT" 5 | ThisBuild / organization := "com.revature" 6 | ThisBuild / organizationName := "revature" 7 | 8 | lazy val root = (project in file(".")) 9 | .settings( 10 | name := "commoncrawl-emr-demo", 11 | libraryDependencies += scalaTest % Test, 12 | libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.4.7" % "provided" 13 | // provided means the dep will be provided in the environment we run this project 14 | // Spark already has the spark depedencies, so we mark them as provided 15 | ) 16 | 17 | // See https://www.scala-sbt.org/1.x/docs/Using-Sonatype.html for instructions on how to publish to Sonatype. 
18 | -------------------------------------------------------------------------------- /project/Dependencies.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | 3 | object Dependencies { 4 | lazy val scalaTest = "org.scalatest" %% "scalatest" % "3.2.2" 5 | } 6 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.4.7 2 | -------------------------------------------------------------------------------- /project/metals.sbt: -------------------------------------------------------------------------------- 1 | // DO NOT EDIT! This file is auto-generated. 2 | // This file enables sbt-bloop to create bloop config files. 3 | 4 | addSbtPlugin("ch.epfl.scala" % "sbt-bloop" % "1.4.8") 5 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.15.0") -------------------------------------------------------------------------------- /project/project/metals.sbt: -------------------------------------------------------------------------------- 1 | // DO NOT EDIT! This file is auto-generated. 2 | // This file enables sbt-bloop to create bloop config files. 3 | 4 | addSbtPlugin("ch.epfl.scala" % "sbt-bloop" % "1.4.8") 5 | -------------------------------------------------------------------------------- /project/project/project/metals.sbt: -------------------------------------------------------------------------------- 1 | // DO NOT EDIT! This file is auto-generated. 2 | // This file enables sbt-bloop to create bloop config files. 
3 | 4 | addSbtPlugin("ch.epfl.scala" % "sbt-bloop" % "1.4.8") 5 | -------------------------------------------------------------------------------- /screenshots/1-make-buckets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haydenhw/commoncrawl-emr-tutorial/488ec9918327cec6ed0136189e994630de1a7dba/screenshots/1-make-buckets.png -------------------------------------------------------------------------------- /screenshots/1.5-change-bucekt-name.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haydenhw/commoncrawl-emr-tutorial/488ec9918327cec6ed0136189e994630de1a7dba/screenshots/1.5-change-bucekt-name.png -------------------------------------------------------------------------------- /screenshots/10-running-step.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haydenhw/commoncrawl-emr-tutorial/488ec9918327cec6ed0136189e994630de1a7dba/screenshots/10-running-step.png -------------------------------------------------------------------------------- /screenshots/11-history-server.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haydenhw/commoncrawl-emr-tutorial/488ec9918327cec6ed0136189e994630de1a7dba/screenshots/11-history-server.png -------------------------------------------------------------------------------- /screenshots/12-complete-step.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haydenhw/commoncrawl-emr-tutorial/488ec9918327cec6ed0136189e994630de1a7dba/screenshots/12-complete-step.png -------------------------------------------------------------------------------- /screenshots/13-download-data.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/haydenhw/commoncrawl-emr-tutorial/488ec9918327cec6ed0136189e994630de1a7dba/screenshots/13-download-data.png -------------------------------------------------------------------------------- /screenshots/14-display-data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haydenhw/commoncrawl-emr-tutorial/488ec9918327cec6ed0136189e994630de1a7dba/screenshots/14-display-data.png -------------------------------------------------------------------------------- /screenshots/2-sbt-assembly.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haydenhw/commoncrawl-emr-tutorial/488ec9918327cec6ed0136189e994630de1a7dba/screenshots/2-sbt-assembly.png -------------------------------------------------------------------------------- /screenshots/3-upload-jar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haydenhw/commoncrawl-emr-tutorial/488ec9918327cec6ed0136189e994630de1a7dba/screenshots/3-upload-jar.png -------------------------------------------------------------------------------- /screenshots/4-create-cluster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haydenhw/commoncrawl-emr-tutorial/488ec9918327cec6ed0136189e994630de1a7dba/screenshots/4-create-cluster.png -------------------------------------------------------------------------------- /screenshots/5-step-execution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haydenhw/commoncrawl-emr-tutorial/488ec9918327cec6ed0136189e994630de1a7dba/screenshots/5-step-execution.png -------------------------------------------------------------------------------- /screenshots/6-spark-application.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/haydenhw/commoncrawl-emr-tutorial/488ec9918327cec6ed0136189e994630de1a7dba/screenshots/6-spark-application.png -------------------------------------------------------------------------------- /screenshots/7-configure-step.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haydenhw/commoncrawl-emr-tutorial/488ec9918327cec6ed0136189e994630de1a7dba/screenshots/7-configure-step.png -------------------------------------------------------------------------------- /screenshots/7.5-configured-step.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haydenhw/commoncrawl-emr-tutorial/488ec9918327cec6ed0136189e994630de1a7dba/screenshots/7.5-configured-step.png -------------------------------------------------------------------------------- /screenshots/8-finish-create-cluster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haydenhw/commoncrawl-emr-tutorial/488ec9918327cec6ed0136189e994630de1a7dba/screenshots/8-finish-create-cluster.png -------------------------------------------------------------------------------- /screenshots/9-steps-tab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haydenhw/commoncrawl-emr-tutorial/488ec9918327cec6ed0136189e994630de1a7dba/screenshots/9-steps-tab.png -------------------------------------------------------------------------------- /src/main/scala/com/revature/commoncrawlemrdemo/Runner.scala: -------------------------------------------------------------------------------- 1 | package com.revature.commoncrawlemrdemo 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark._ 5 | 6 | /** 7 | * Spark job ready to be run on EMR 8 | * 
Finds 500k job urls from the common crawl columnar index
 * Stores the result as a CSV file on the S3 bucket of your choosing
 */
object Runner {

  /**
   * Entry point submitted to EMR via spark-submit
   * (`--class com.revature.commoncrawlemrdemo.Runner`).
   *
   * Reads the Common Crawl columnar index from S3, keeps WARC records from
   * the CC-MAIN-2020-05 crawl whose URL path contains "job", and writes up
   * to 500k (host, path) rows as CSV to the configured output bucket.
   *
   * @param args unused
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("commoncrawl emr demo")
      .getOrCreate()

    // Note: we're not providing any credentials or doing any s3 config here
    // EMR takes care of all of that for us

    import spark.implicits._
    spark.sparkContext.setLogLevel("WARN")

    // Common Crawl's columnar (Parquet) index of WARC captures.
    val df = spark.read.load("s3a://commoncrawl/cc-index/table/cc-main/warc/")

    val crawl = "CC-MAIN-2020-05"

    // Filter BEFORE projecting: `crawl` and `subset` are partition columns
    // that the original select() dropped, which forced Spark's analyzer to
    // resolve missing references behind the scenes. Filtering first is
    // robust across Spark versions and lets Spark prune partitions, so only
    // the matching crawl/subset data is ever read from S3.
    val jobUrls = df
      .filter($"crawl" === crawl)
      .filter($"subset" === "warc")
      .filter($"url_path".contains("job"))
      .select("url_host_name", "url_path")
      .limit(500000)

    // Change YOUR-BUCKET-NAME to the name of the output bucket you created on S3
    val s3OutputBucket = "s3a://YOUR-BUCKET-NAME/commoncrawl-demo-data"

    jobUrls.write.format("csv").mode("overwrite").save(s3OutputBucket)

    // stop() is the idiomatic shutdown call; written with parens since it
    // has a side effect.
    spark.stop()
  }
}
--------------------------------------------------------------------------------