├── .classpath
├── .gitignore
├── .project
├── README.md
├── beowulf.json
├── build.sbt
├── lib
│   └── mongo-hadoop-core_2.2.0-1.2.0.jar
├── project
│   └── build.properties
└── src
    └── main
        ├── java
        │   └── JavaWordCount.java
        └── scala
            └── ScalaWordCount.scala

/.classpath:
--------------------------------------------------------------------------------
(Eclipse classpath definition; the XML content was lost during extraction.)
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
target
lib_managed
bin
project/target
project/project/target

--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
	<name>Simple Project</name>
	<buildSpec>
		<buildCommand>
			<name>org.scala-ide.sdt.core.scalabuilder</name>
		</buildCommand>
	</buildSpec>
	<natures>
		<nature>org.scala-ide.sdt.core.scalanature</nature>
		<nature>org.eclipse.jdt.core.javanature</nature>
	</natures>
</projectDescription>
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
mongo-spark
===========

Example application showing how to use the [mongo-hadoop][1] connector with [Apache Spark][2].

Read more details at http://codeforhire.com/2014/02/18/using-spark-with-mongodb/

[1]: https://github.com/mongodb/mongo-hadoop
[2]: https://spark.incubator.apache.org/


Prerequisites
-------------

* MongoDB installed and running on localhost
* Scala 2.10 and SBT installed


Running
-------

Import the data into the database, run either `JavaWordCount` or `ScalaWordCount`, and print the results:

    mongoimport -d beowulf -c input beowulf.json
    sbt 'run-main JavaWordCount'
    sbt 'run-main ScalaWordCount'
    mongo beowulf --eval 'printjson(db.output.find().toArray())' | less


License
-------

The code itself is released into the public domain according to the [Creative Commons CC0][3].

The example files are based on [Beowulf][4] from Project Gutenberg and are under its corresponding license.

[3]: http://creativecommons.org/publicdomain/zero/1.0/
[4]: http://www.gutenberg.org/ebooks/981
--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
name := "mongo-spark"

version := "1.0"

scalaVersion := "2.10.3"

libraryDependencies += "org.apache.spark" %% "spark-core" % "0.9.0-incubating"

// Select which Hadoop version to use
libraryDependencies += "org.apache.hadoop" % "hadoop-client" % "2.2.0"

libraryDependencies += "org.mongodb" % "mongo-java-driver" % "2.11.4"

retrieveManaged := true

resolvers += "Akka Repository" at "http://repo.akka.io/releases/"

--------------------------------------------------------------------------------
/lib/mongo-hadoop-core_2.2.0-1.2.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/plaa/mongo-spark/0f7525343e79a365b827eb809f033f735c3fe779/lib/mongo-hadoop-core_2.2.0-1.2.0.jar
--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
sbt.version=0.12.4

--------------------------------------------------------------------------------
/src/main/java/JavaWordCount.java:
--------------------------------------------------------------------------------
/*
 * JavaWordCount.java
 * Written in 2014 by Sampo Niskanen / Mobile Wellness Solutions MWS Ltd
 *
 * To the extent possible under law, the author(s) have dedicated all copyright and
 * related and neighboring rights to this software to the public domain worldwide.
 * This software is distributed without any warranty.
 *
 * See <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
 */
import java.util.Arrays;
import java.util.Collections;

import org.apache.hadoop.conf.Configuration;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.bson.BSONObject;
import org.bson.BasicBSONObject;

import scala.Tuple2;

import com.mongodb.hadoop.MongoOutputFormat;

public class JavaWordCount {

    public static void main(String[] args) {

        JavaSparkContext sc = new JavaSparkContext("local", "Java Word Count");

        Configuration config = new Configuration();
        config.set("mongo.input.uri", "mongodb://127.0.0.1:27017/beowulf.input");
        config.set("mongo.output.uri", "mongodb://127.0.0.1:27017/beowulf.output");

        JavaPairRDD<Object, BSONObject> mongoRDD = sc.newAPIHadoopRDD(config, com.mongodb.hadoop.MongoInputFormat.class, Object.class, BSONObject.class);

        // Input contains tuples of (ObjectId, BSONObject)
        JavaRDD<String> words = mongoRDD.flatMap(new FlatMapFunction<Tuple2<Object, BSONObject>, String>() {
            @Override
            public Iterable<String> call(Tuple2<Object, BSONObject> arg) {
                Object o = arg._2.get("text");
                if (o instanceof String) {
                    String str = (String) o;
                    str = str.toLowerCase().replaceAll("[.,!?\n]", " ");
                    return Arrays.asList(str.split(" "));
                } else {
                    return Collections.emptyList();
                }
            }
        });
        JavaPairRDD<String, Integer> ones = words.map(new PairFunction<String, String, Integer>() {
            public Tuple2<String, Integer> call(String s) {
                return new Tuple2<>(s, 1);
            }
        });
        JavaPairRDD<String, Integer> counts = ones.reduceByKey(new Function2<Integer, Integer, Integer>() {
            public Integer call(Integer i1, Integer i2) {
                return i1 + i2;
            }
        });

        // Output contains tuples of (null, BSONObject) - ObjectId will be generated by Mongo driver if null
        JavaPairRDD<Object, BSONObject> save = counts.map(new PairFunction<Tuple2<String, Integer>, Object, BSONObject>() {
            @Override
            public Tuple2<Object, BSONObject> call(Tuple2<String, Integer> tuple) {
                BSONObject bson = new BasicBSONObject();
                bson.put("word", tuple._1);
                bson.put("count", tuple._2);
                return new Tuple2<>(null, bson);
            }
        });

        // Only MongoOutputFormat and config are relevant
        save.saveAsNewAPIHadoopFile("file:///bogus", Object.class, Object.class, MongoOutputFormat.class, config);
    }

}
--------------------------------------------------------------------------------
/src/main/scala/ScalaWordCount.scala:
--------------------------------------------------------------------------------
/*
 * ScalaWordCount.scala
 * Written in 2014 by Sampo Niskanen / Mobile Wellness Solutions MWS Ltd
 *
 * To the extent possible under law, the author(s) have dedicated all copyright and
 * related and neighboring rights to this software to the public domain worldwide.
 * This software is distributed without any warranty.
 *
 * See <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
 */
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.hadoop.conf.Configuration
import org.bson.BSONObject
import org.bson.BasicBSONObject

object ScalaWordCount {

  def main(args: Array[String]) {

    val sc = new SparkContext("local", "Scala Word Count")

    val config = new Configuration()
    config.set("mongo.input.uri", "mongodb://127.0.0.1:27017/beowulf.input")
    config.set("mongo.output.uri", "mongodb://127.0.0.1:27017/beowulf.output")

    val mongoRDD = sc.newAPIHadoopRDD(config, classOf[com.mongodb.hadoop.MongoInputFormat], classOf[Object], classOf[BSONObject])

    // Input contains tuples of (ObjectId, BSONObject)
    val countsRDD = mongoRDD.flatMap(arg => {
      var str = arg._2.get("text").toString
      str = str.toLowerCase().replaceAll("[.,!?\n]", " ")
      str.split(" ")
    })
      .map(word => (word, 1))
      .reduceByKey((a, b) => a + b)

    // Output contains tuples of (null, BSONObject) - ObjectId will be generated by Mongo driver if null
    val saveRDD = countsRDD.map(tuple => {
      val bson = new BasicBSONObject()
      bson.put("word", tuple._1)
      bson.put("count", tuple._2)
      (null, bson)
    })

    // Only MongoOutputFormat and config are relevant
    saveRDD.saveAsNewAPIHadoopFile("file:///bogus", classOf[Any], classOf[Any], classOf[com.mongodb.hadoop.MongoOutputFormat[Any, Any]], config)

  }
}
--------------------------------------------------------------------------------
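
A quick way to inspect the results without the mongo shell is to read the output collection back through the mongo-java-driver that build.sbt already pulls in. The sketch below is not part of the repository; `PrintResults` is a hypothetical helper assuming the same defaults as above (MongoDB on 127.0.0.1:27017, results written to `beowulf.output` by the word count jobs):

    import com.mongodb.{BasicDBObject, MongoClient}

    // Hypothetical helper, not part of the repo: prints the top word counts
    object PrintResults {
      def main(args: Array[String]) {
        val client = new MongoClient("127.0.0.1", 27017)
        try {
          // The word count jobs write documents of the form { word: ..., count: ... }
          val output = client.getDB("beowulf").getCollection("output")
          // Show the ten most frequent words, highest count first
          val cursor = output.find().sort(new BasicDBObject("count", -1)).limit(10)
          while (cursor.hasNext) {
            val doc = cursor.next()
            println(doc.get("word") + ": " + doc.get("count"))
          }
        } finally {
          client.close()
        }
      }
    }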