├── .classpath
├── .gitignore
├── .project
├── README.md
├── beowulf.json
├── build.sbt
├── lib
└── mongo-hadoop-core_2.2.0-1.2.0.jar
├── project
└── build.properties
└── src
└── main
├── java
└── JavaWordCount.java
└── scala
└── ScalaWordCount.scala
/.classpath:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | target
2 | lib_managed
3 | bin
4 | project/target
5 | project/project/target
6 |
7 |
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
1 |
2 | Simple Project
3 |
4 |
5 | org.scala-ide.sdt.core.scalabuilder
6 |
7 |
8 |
9 | org.scala-ide.sdt.core.scalanature
10 | org.eclipse.jdt.core.javanature
11 |
12 |
13 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | mongo-spark
2 | ===========
3 |
4 | Example application showing how to use the [mongo-hadoop][1] connector with [Apache Spark][2].
5 |
6 | Read more details at http://codeforhire.com/2014/02/18/using-spark-with-mongodb/
7 |
8 | [1]: https://github.com/mongodb/mongo-hadoop
9 | [2]: https://spark.incubator.apache.org/
10 |
11 |
12 | Prerequisites
13 | -------------
14 |
15 | * MongoDB installed and running on localhost
16 | * Scala 2.10 and SBT installed
17 |
18 |
19 | Running
20 | -------
21 |
22 | Import data into the database, run either `JavaWordCount` or `ScalaWordCount` and print the results.
23 |
24 | mongoimport -d beowulf -c input beowulf.json
25 | sbt 'run-main JavaWordCount'
26 | sbt 'run-main ScalaWordCount'
27 | mongo beowulf --eval 'printjson(db.output.find().toArray())' | less
28 |
29 |
30 | License
31 | -------
32 |
33 | The code itself is released to the public domain under the [Creative Commons CC0][3] dedication.
34 |
35 | The example files are based on [Beowulf][4] from Project Gutenberg and are under its corresponding license.
36 |
37 | [3]: http://creativecommons.org/publicdomain/zero/1.0/
38 | [4]: http://www.gutenberg.org/ebooks/981
39 |
--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
1 | name := "mongo-spark"
2 |
3 | version := "1.0"
4 |
5 | scalaVersion := "2.10.3"
6 |
7 | libraryDependencies += "org.apache.spark" %% "spark-core" % "0.9.0-incubating"
8 |
9 | // Select which Hadoop version to use
10 | libraryDependencies += "org.apache.hadoop" % "hadoop-client" % "2.2.0"
11 |
12 | libraryDependencies += "org.mongodb" % "mongo-java-driver" % "2.11.4"
13 |
14 | retrieveManaged := true
15 |
16 | resolvers += "Akka Repository" at "http://repo.akka.io/releases/"
17 |
18 |
19 |
--------------------------------------------------------------------------------
/lib/mongo-hadoop-core_2.2.0-1.2.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/plaa/mongo-spark/0f7525343e79a365b827eb809f033f735c3fe779/lib/mongo-hadoop-core_2.2.0-1.2.0.jar
--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version=0.12.4
2 |
3 |
--------------------------------------------------------------------------------
/src/main/java/JavaWordCount.java:
--------------------------------------------------------------------------------
1 | /*
2 | * JavaWordCount.java
3 | * Written in 2014 by Sampo Niskanen / Mobile Wellness Solutions MWS Ltd
4 | *
5 | * To the extent possible under law, the author(s) have dedicated all copyright and
6 | * related and neighboring rights to this software to the public domain worldwide.
7 | * This software is distributed without any warranty.
8 | *
9 | * See http://creativecommons.org/publicdomain/zero/1.0/ for full details.
10 | */
11 | import java.util.Arrays;
12 | import java.util.Collections;
13 |
14 | import org.apache.hadoop.conf.Configuration;
15 | import org.apache.spark.api.java.JavaPairRDD;
16 | import org.apache.spark.api.java.JavaRDD;
17 | import org.apache.spark.api.java.JavaSparkContext;
18 | import org.apache.spark.api.java.function.FlatMapFunction;
19 | import org.apache.spark.api.java.function.Function2;
20 | import org.apache.spark.api.java.function.PairFunction;
21 | import org.bson.BSONObject;
22 | import org.bson.BasicBSONObject;
23 |
24 | import scala.Tuple2;
25 |
26 | import com.mongodb.hadoop.MongoOutputFormat;
27 |
28 | public class JavaWordCount {
29 |
30 | public static void main(String[] args) {
31 |
32 | JavaSparkContext sc = new JavaSparkContext("local", "Java Word Count");
33 |
34 | Configuration config = new Configuration();
35 | config.set("mongo.input.uri", "mongodb://127.0.0.1:27017/beowulf.input");
36 | config.set("mongo.output.uri", "mongodb://127.0.0.1:27017/beowulf.output");
37 |
38 |
39 | JavaPairRDD