├── .gitignore
├── README.md
├── src
│   └── main
│       └── scala
│           └── com
│               └── examples
│                   └── MainExample.scala
└── pom.xml

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.cache
target/
.settings/

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
### Instructions:

[Follow this article to find more detailed instructions.](https://nosqlnocry.wordpress.com/2015/02/27/how-to-build-a-spark-fat-jar-in-scala-and-submit-a-job/)

Modify the class ```MainExample.scala``` to write your Spark code, then compile the project with the command:

```mvn clean package```

Inside the ```/target``` folder you will find the resulting fat jar, called ```spark-scala-maven-project-0.0.1-SNAPSHOT-jar-with-dependencies.jar```. To launch the Spark job, use this command in a shell with a configured Spark environment:

    spark-submit --class com.examples.MainExample \
        --master yarn-cluster \
        spark-scala-maven-project-0.0.1-SNAPSHOT-jar-with-dependencies.jar \
        inputhdfspath \
        outputhdfspath

The parameters ```inputhdfspath``` and ```outputhdfspath``` do not need the ```hdfs://path/to/your/file``` form; write them directly as ```/path/to/your/files/```, because when a job is submitted the default file system is HDFS. To retrieve the result locally:

    hadoop fs -getmerge outputhdfspath resultSavedLocally

--------------------------------------------------------------------------------
/src/main/scala/com/examples/MainExample.scala:
--------------------------------------------------------------------------------
package com.examples

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.log4j.Logger

object MainExample {

  def main(arg: Array[String]) {

    val logger = Logger.getLogger(this.getClass)

    if (arg.length < 2) {
      logger.error("=> wrong parameters number")
      System.err.println("Usage: MainExample <path-to-files> <output-path>")
      System.exit(1)
    }

    val jobName = "MainExample"

    val conf = new SparkConf().setAppName(jobName)
    val sc = new SparkContext(conf)

    val pathToFiles = arg(0)
    val outputPath = arg(1)

    logger.info("=> jobName \"" + jobName + "\"")
    logger.info("=> pathToFiles \"" + pathToFiles + "\"")

    val files = sc.textFile(pathToFiles)

    // do your work here: this example replaces every space with a comma
    val rowsWithoutSpaces = files.map(_.replaceAll(" ", ","))

    // and save the result
    rowsWithoutSpaces.saveAsTextFile(outputPath)
  }
}
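The space-to-comma ```map``` above is only a placeholder. As an illustration of something more realistic that fits the same skeleton, here is a minimal word-count sketch; it assumes it replaces the "do your work here" block in ```MainExample.scala``` (the pair-RDD operations it uses come through the ```org.apache.spark.SparkContext._``` import already in the file):

```scala
// Hypothetical replacement for the "do your work here" block:
// count the occurrences of each word across the input files.
val wordCounts = files
  .flatMap(_.split("\\s+"))   // break each line into words
  .map(word => (word, 1))     // pair every word with an initial count of 1
  .reduceByKey(_ + _)         // sum the counts per distinct word

wordCounts.saveAsTextFile(outputPath)
```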
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>spark-scala-maven-project</groupId>
  <artifactId>spark-scala-maven-project</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <name>${project.artifactId}</name>
  <description>This is a boilerplate maven project to start using Spark in Scala</description>
  <inceptionYear>2010</inceptionYear>

  <properties>
    <maven.compiler.source>1.6</maven.compiler.source>
    <maven.compiler.target>1.6</maven.compiler.target>
    <encoding>UTF-8</encoding>
    <scala.tools.version>2.10</scala.tools.version>
    <scala.version>2.10.4</scala.version>
  </properties>

  <repositories>
    <repository>
      <id>cloudera-repo-releases</id>
      <url>https://repository.cloudera.com/artifactory/repo/</url>
    </repository>
  </repositories>

  <build>
    <sourceDirectory>src/main/scala</sourceDirectory>
    <testSourceDirectory>src/test/scala</testSourceDirectory>
    <plugins>
      <plugin>
        <!-- compiles the Scala sources -->
        <groupId>net.alchim31.maven</groupId>
        <artifactId>scala-maven-plugin</artifactId>
        <version>3.1.3</version>
        <executions>
          <execution>
            <goals>
              <goal>compile</goal>
              <goal>testCompile</goal>
            </goals>
            <configuration>
              <args>
                <arg>-make:transitive</arg>
                <arg>-dependencyfile</arg>
                <arg>${project.build.directory}/.scala_dependencies</arg>
              </args>
            </configuration>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-surefire-plugin</artifactId>
        <version>2.13</version>
        <configuration>
          <useFile>false</useFile>
          <disableXmlReport>true</disableXmlReport>
          <includes>
            <include>**/*Test.*</include>
            <include>**/*Suite.*</include>
          </includes>
        </configuration>
      </plugin>
      <plugin>
        <!-- builds the fat jar during "mvn package" -->
        <artifactId>maven-assembly-plugin</artifactId>
        <version>2.4.1</version>
        <configuration>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
        </configuration>
        <executions>
          <execution>
            <id>make-assembly</id>
            <phase>package</phase>
            <goals>
              <goal>single</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>

  <dependencies>
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.10</artifactId>
      <version>1.2.0-cdh5.3.1</version>
    </dependency>
  </dependencies>
</project>

--------------------------------------------------------------------------------
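The ```maven-assembly-plugin``` section of the POM is what produces the ```jar-with-dependencies``` artifact the README refers to. For a quick smoke test before deploying to YARN, the same fat jar can also be run with a local master; the ```local[2]``` master and the ```file://``` paths below are illustrative assumptions, not part of the project:

    spark-submit --class com.examples.MainExample \
        --master local[2] \
        target/spark-scala-maven-project-0.0.1-SNAPSHOT-jar-with-dependencies.jar \
        file:///tmp/input \
        file:///tmp/output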