├── .gitignore
├── README.md
├── pom.xml
├── src
│   └── main
│       └── java
│           └── com
│               └── mongodb
│                   └── spark
│                       └── demo
│                           └── Recommender.java
└── LICENSE
/.gitignore:
--------------------------------------------------------------------------------
*.class

# Mobile Tools for Java (J2ME)
.mtj.tmp/

# Package Files #
*.jar
*.war
*.ear

# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
hs_err_pid*
/target

demo.iml
.classpath
.project
/.idea
.settings
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# MongoDB-Spark Demo

## Prerequisites

To build the MongoDB-Spark demo application, you'll need the following:

* [Maven](http://maven.apache.org)
* [MongoDB-Hadoop Connector](http://github.com/mongodb/mongo-hadoop)
* [Spark](http://spark.apache.org) (1.0 or greater, built for Hadoop 2.x)

### MongoDB-Hadoop

*Note*: you must build the MongoDB-Hadoop connector from source for your version of Hadoop and install the `core` JAR file to your local Maven repository under the coordinates that `pom.xml` expects (`org.mongodb:mongo-hadoop-core:1.3.0`). For example:

    $ git clone http://github.com/mongodb/mongo-hadoop.git
    $ cd mongo-hadoop
    $ ./gradlew jar -Phadoop_version='2.4'
    $ mvn install:install-file \
        -Dfile=core/build/lib/mongo-hadoop-core-1.2.1-SNAPSHOT-hadoop_2.4.jar \
        -DgroupId=org.mongodb \
        -DartifactId=mongo-hadoop-core \
        -Dversion=1.3.0 \
        -Dpackaging=jar

### Spark

Refer to the [Spark overview](http://spark.apache.org/docs/latest/index.html) to get started.

## Building

To build the MongoDB-Spark demo application, use Maven:

    $ mvn package

This will build the demo application and copy the `mongo-hadoop-core` and `mongo-java-driver` dependencies into `target/lib`. If instead you want a single JAR containing all of the dependencies, execute the `assembly:single` Maven goal:

    $ mvn compile assembly:single

## Running

Note that all `spark-submit` flags must appear *before* the application JAR; anything listed after it is passed to the application as arguments.

    $ cd your-spark-directory
    $ SPARK_JAR=assembly/target/scala-2.10/spark-assembly-1.0.0-hadoop2.4.0.jar \
      HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop \
      bin/spark-submit --master local --class com.mongodb.spark.demo.Recommender \
      --jars /path/to/mongo-java-driver-2.12.3.jar,/path/to/mongo-hadoop-core-1.3.0.jar \
      --executor-memory 4G \
      /path/to/demo-1.0-SNAPSHOT.jar \
      /movielens/ratings.bson /movielens/users.bson /movielens/movies.bson movielens.predictions

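Once the job completes, you can spot-check the output with the MongoDB Java driver. Below is a minimal sketch, assuming the defaults used above (a `mongod` on `127.0.0.1:27017` and the `movielens.predictions` output namespace); the `PredictionsCheck` class name is just illustrative:

    import com.mongodb.DB;
    import com.mongodb.DBCollection;
    import com.mongodb.DBCursor;
    import com.mongodb.MongoClient;

    public class PredictionsCheck {
        public static void main(String[] args) throws Exception {
            // Connect to the mongod the demo wrote to.
            MongoClient client = new MongoClient("127.0.0.1", 27017);
            DB db = client.getDB("movielens");
            DBCollection predictions = db.getCollection("predictions");

            // Report the number of stored predictions and show a sample.
            System.out.println("predictions: " + predictions.count());
            DBCursor cursor = predictions.find().limit(5);
            while (cursor.hasNext()) {
                System.out.println(cursor.next());
            }
            cursor.close();
            client.close();
        }
    }

The same check is a one-liner in the mongo shell: `use movielens`, then `db.predictions.findOne()`.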
## Notes

None at this time.
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.mongodb.spark</groupId>
    <artifactId>demo</artifactId>
    <packaging>jar</packaging>
    <version>1.0-SNAPSHOT</version>
    <name>demo</name>
    <url>http://maven.apache.org</url>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.10</artifactId>
            <version>1.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-mllib_2.10</artifactId>
            <version>1.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.mongodb</groupId>
            <artifactId>mongo-java-driver</artifactId>
            <version>2.12.3</version>
        </dependency>
        <dependency>
            <groupId>org.mongodb</groupId>
            <artifactId>mongo-hadoop-core</artifactId>
            <version>1.3.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.3.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.3.0</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>3.8.1</version>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <artifactId>maven-dependency-plugin</artifactId>
                <executions>
                    <execution>
                        <id>copy-dependencies</id>
                        <phase>prepare-package</phase>
                        <goals>
                            <goal>copy-dependencies</goal>
                        </goals>
                        <configuration>
                            <includeArtifactIds>mongo-hadoop-core,mongo-java-driver</includeArtifactIds>
                            <outputDirectory>${project.build.directory}/lib</outputDirectory>
                            <overWriteReleases>false</overWriteReleases>
                            <overWriteSnapshots>false</overWriteSnapshots>
                            <overWriteIfNewer>true</overWriteIfNewer>
                        </configuration>
                    </execution>
                    <execution>
                        <id>build-classpath</id>
                        <phase>generate-sources</phase>
                        <goals>
                            <goal>build-classpath</goal>
                        </goals>
                        <configuration>
                            <outputFile>${project.build.directory}/classpath.txt</outputFile>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
            <!-- Assembly configuration for the assembly:single goal described in the README -->
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>com.mongodb.spark.demo.Recommender</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
--------------------------------------------------------------------------------
/src/main/java/com/mongodb/spark/demo/Recommender.java:
--------------------------------------------------------------------------------
package com.mongodb.spark.demo;

import com.mongodb.hadoop.BSONFileInputFormat;
import com.mongodb.hadoop.MongoOutputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel;
import org.apache.spark.mllib.recommendation.Rating;
import org.apache.spark.mllib.recommendation.ALS;
import org.bson.BSONObject;
import org.bson.BasicBSONObject;
import org.slf4j.Logger;
import scala.Tuple2;

import java.util.*;

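/**
 * Computes movie recommendations from the MovieLens dataset using MLlib's
 * ALS (alternating least squares) matrix factorization, reading the BSON
 * dumps from HDFS and writing predictions back to MongoDB.
 */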
public class Recommender {

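    // NOTE: hardcoded for the author's environment; point these at your own
    // HDFS namenode and mongod before building.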
    private static String HDFS_HOST = "hdfs://crcsmnky.local:9000";
    private static String MONGODB_HOST = "mongodb://127.0.0.1:27017/";
//    private static int SCALE_MAX = 5;
//    private static int SCALE_MIN = 1;

//    public static Comparator<Rating> RatingComparator = new Comparator<Rating>() {
//        @Override
//        public int compare(Rating r1, Rating r2) {
//            return Double.valueOf(r1.rating() - r2.rating()).intValue();
//        }
//    };
    public static void main(String[] args) {
        if (args.length < 4) {
            System.err.println("Usage: Recommender <ratings.bson> <users.bson> <movies.bson> <output namespace>");
            System.err.println("Example: Recommender /movielens/ratings.bson /movielens/users.bson /movielens/movies.bson movielens.predictions");
            System.exit(-1);
        }

        String ratingsUri = HDFS_HOST + args[0];
        String usersUri = HDFS_HOST + args[1];
        String moviesUri = HDFS_HOST + args[2];
        String mongodbUri = MONGODB_HOST + args[3];

        SparkConf conf = new SparkConf().setAppName("SparkRecommender");
        JavaSparkContext sc = new JavaSparkContext(conf);
        Logger log = sc.sc().log();

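        // Read the raw BSON dumps (mongodump output) from HDFS instead of
        // connecting to a live MongoDB instance.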
        Configuration bsonDataConfig = new Configuration();
        bsonDataConfig.set("mongo.job.input.format", "com.mongodb.hadoop.BSONFileInputFormat");

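        // Write the computed predictions back to MongoDB (the namespace
        // supplied as the fourth command-line argument).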
        Configuration predictionsConfig = new Configuration();
        predictionsConfig.set("mongo.output.uri", mongodbUri);

        JavaPairRDD