├── .gitignore
├── README.md
├── build.sbt
├── project
│   ├── build.properties
│   └── plugins.sbt
└── src
    └── main
        ├── resources
        │   └── log4j.properties
        └── scala
            ├── DatasetExample.scala
            ├── HdfsExample.scala
            ├── JavaExample.java
            ├── KeyValueExample.scala
            ├── MultiBucketExample.scala
            ├── N1QLExample.scala
            ├── SparkSQLExample.scala
            ├── StreamingExample.scala
            ├── StructuredStreamingExample.scala
            ├── SubdocLookupExample.scala
            ├── SubdocMutationExample.scala
            ├── TransformationExample.scala
            └── Word2VecExample.scala

/.gitignore:
--------------------------------------------------------------------------------
target/
.idea/
reviews/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Couchbase Spark Samples
=======================

This repository contains small snippets that showcase the usage of
the [Couchbase Spark Connector](https://github.com/couchbase/couchbase-spark-connector)
with [Apache Spark](http://spark.apache.org/).

Please see the [documentation](http://docs.couchbase.com) for a quickstart and
the full reference. Also make sure to have at least the `travel-sample` and the
`default` buckets loaded. Some examples need additional setup, such as the
JDBC (MySQL) and Hadoop/HDFS examples.

All examples can be run from the IDE by calling their `main` methods, and you
can also run them from the command line, for example: `sbt "run-main DatasetExample"`
--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
name := "couchbase-spark-samples"

organization := "com.couchbase"

version := "1.0.0-SNAPSHOT"

scalaVersion := "2.12.0"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "2.4.0",
  "org.apache.spark" %% "spark-streaming" % "2.4.0",
  "org.apache.spark" %% "spark-sql" % "2.4.0",
  "com.couchbase.client" %% "spark-connector" % "2.4.0",
  "org.apache.spark" %% "spark-mllib" % "2.4.0",
  "mysql" % "mysql-connector-java" % "5.1.37"
)

resolvers += Resolver.mavenLocal
--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
sbt.version = 0.13.8
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
logLevel := Level.Warn
--------------------------------------------------------------------------------
/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
# Root logger option
log4j.rootLogger=INFO, stdout

# Direct log messages to stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target=System.out
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
--------------------------------------------------------------------------------
/src/main/scala/DatasetExample.scala:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2016 Couchbase, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.sources.EqualTo
import com.couchbase.spark.sql._

/** Airline has a subset of the fields that are in the database */
case class Airline(name: String, iata: String, icao: String, country: String)

/**
 * This example is very similar to [[SparkSQLExample]], but it shows how a DataFrame is converted
 * into a Dataset, which can then be used through regular Scala code instead of Spark SQL syntax
 * while still preserving the optimizations of the underlying Spark execution engine.
 *
 * @author Michael Nitschinger
 */
object DatasetExample {

  def main(args: Array[String]): Unit = {

    // The SparkSession is the main entry point into Spark
    val spark = SparkSession
      .builder()
      .appName("DatasetExample")
      .master("local[*]") // run Spark locally in this JVM, great for testing
      .config("spark.couchbase.nodes", "127.0.0.1") // connect to Couchbase on localhost
      .config("spark.couchbase.bucket.travel-sample", "") // open the travel-sample bucket with an empty password
      .config("com.couchbase.username", "Administrator")
      .config("com.couchbase.password", "password")
      .getOrCreate()

    // Import needed for case class conversions
    import spark.implicits._

    // Create the DataFrame and convert it into a Dataset through `as[Airline]`
    val airlines = spark.read.couchbase(EqualTo("type", "airline")).as[Airline]

    // Print the schema, just like on a DataFrame
    airlines.printSchema()

    // Print airlines that start with A, using the Scala API instead of Spark SQL syntax.
    airlines
      .map(_.name)
      .filter(_.toLowerCase.startsWith("a"))
      .foreach(println(_))
  }

}
--------------------------------------------------------------------------------
/src/main/scala/HdfsExample.scala:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2016 Couchbase, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import com.couchbase.client.java.query.N1qlQuery
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.sources.EqualTo
import com.couchbase.spark.sql._
import com.couchbase.spark._

/**
 * This example shows how to combine DataFrames from Couchbase and HDFS to perform useful queries.
 *
 * Note: the first time, run this script as-is; afterwards, comment out the first block, where the
 * records are initially imported into HDFS. That block is just the "data loading" phase for this
 * example.
 *
 * Make sure HDFS runs on port 9000 on localhost, or adapt the read/write paths in this example.
 *
 * @author Michael Nitschinger
 * @author Matt Ingenthron
 */
object HdfsExample {

  def main(args: Array[String]): Unit = {

    // The SparkSession is the main entry point into Spark
    val spark = SparkSession
      .builder()
      .appName("HdfsExample")
      .master("local[*]") // run Spark locally in this JVM, great for testing
      .config("spark.couchbase.nodes", "127.0.0.1") // connect to Couchbase on localhost
      .config("spark.couchbase.bucket.travel-sample", "") // open the travel-sample bucket with an empty password
      .config("com.couchbase.username", "Administrator")
      .config("com.couchbase.password", "password")
      .getOrCreate()

    // ---- Write Data into HDFS (! do once to load and then comment out ... !) ----
    val query = "SELECT `travel-sample`.* from `travel-sample` WHERE type = 'landmark'"
    spark.sparkContext
      .couchbaseQuery(N1qlQuery.simple(query))
      .map(_.value.toString)
      .saveAsTextFile("hdfs://127.0.0.1:9000/landmarks")

    // ---- Load data from HDFS and join with records in Couchbase ----

    // Load Landmarks from HDFS
    val landmarks = spark.read.json("hdfs://127.0.0.1:9000/landmarks/*")
    landmarks.createOrReplaceTempView("landmarks")

    // Load Airports from Couchbase
    val airports = spark.read.couchbase(schemaFilter = EqualTo("type", "airport"))

    // Find all landmarks in the same city as the given FAA code
    val toFind = "SFO" // try SFO or LAX
    airports
      .join(landmarks, airports("city") === landmarks("city"))
      .where(airports("faa") === toFind and landmarks("url").isNotNull)
      .select(landmarks("name"), landmarks("address"), airports("faa"))
      .orderBy(landmarks("name").asc)
      .show()
  }
}
--------------------------------------------------------------------------------
/src/main/scala/JavaExample.java:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2016 Couchbase, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import com.couchbase.client.java.document.JsonDocument;
import com.couchbase.client.java.document.json.JsonObject;
import com.couchbase.client.java.query.N1qlQuery;
import com.couchbase.spark.japi.CouchbaseSparkContext;
import com.couchbase.spark.rdd.CouchbaseQueryRow;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.sources.EqualTo;

import java.util.Arrays;
import java.util.List;

import static com.couchbase.spark.japi.CouchbaseDataFrameReader.couchbaseReader;
import static com.couchbase.spark.japi.CouchbaseDocumentRDD.couchbaseDocumentRDD;
import static com.couchbase.spark.japi.CouchbaseRDD.couchbaseRDD;
import static com.couchbase.spark.japi.CouchbaseSparkContext.couchbaseContext;

/**
 * This example shows how to use Spark and the Couchbase Connector from Java.
 *
 * @author Michael Nitschinger
 */
public class JavaExample {

    public static void main(String[] args) {

        SparkSession spark = SparkSession
            .builder()
            .appName("JavaExample")
            .master("local[*]") // run Spark locally in this JVM, great for testing
            .config("spark.couchbase.nodes", "127.0.0.1") // connect to Couchbase on localhost
            .config("spark.couchbase.bucket.travel-sample", "") // open the travel-sample bucket with an empty password
            .config("com.couchbase.username", "Administrator")
            .config("com.couchbase.password", "password")
            .getOrCreate();

        // The Java wrapper around the SparkContext
        JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

        // The Couchbase-enabled SparkContext
        CouchbaseSparkContext csc = couchbaseContext(jsc);

        // Load docs through K/V
        List<JsonDocument> docs = csc
            .couchbaseGet(Arrays.asList("airline_10226", "airline_10748"))
            .collect();
        System.out.println(docs);

        // Load docs through K/V from a mapped RDD
        JavaRDD<String> ids = jsc.parallelize(Arrays.asList("airline_10226", "airline_10748"));
        docs = couchbaseRDD(ids).couchbaseGet().collect();
        System.out.println(docs);

        // Perform a N1QL query
        List<CouchbaseQueryRow> results = csc
            .couchbaseQuery(N1qlQuery.simple("SELECT * FROM `travel-sample` LIMIT 10"))
            .collect();

        System.out.println(results);

        // Store an (empty) document
        couchbaseDocumentRDD(
            jsc.parallelize(Arrays.asList(JsonDocument.create("doc1", JsonObject.empty())))
        ).saveToCouchbase();

        // Wrap the reader and create the DataFrame from Couchbase.
        // Note that since Spark 2.0, a DataFrame == Dataset<Row>
        Dataset<Row> airlines = couchbaseReader(spark.read()).couchbase(new EqualTo("type", "airline"));

        // Print the inferred schema
        airlines.printSchema();

        // Print the number of airlines
        System.out.println("Number of Airlines: " + airlines.count());

    }
}
--------------------------------------------------------------------------------
/src/main/scala/KeyValueExample.scala:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2016 Couchbase, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import com.couchbase.client.java.document.JsonDocument
import org.apache.spark.sql.SparkSession
import com.couchbase.spark._

/**
 * This example fetches two documents by their document IDs and prints out their contents.
 *
 * This prints:
 *
 * {{{
 * JsonDocument{id='airline_10748', cas=312610725036034, expiry=0,
 * content={"country":"United States","iata":"ZQ","name":"Locair","callsign":"LOCAIR","icao":"LOC","id":10748,"type":"airline"},
 * mutationToken=null}
 *
 * JsonDocument{id='airline_10123', cas=312605445586946, expiry=0,
 * content={"country":"United States","iata":"TQ","name":"Texas Wings","callsign":"TXW","icao":"TXW","id":10123,"type":"airline"},
 * mutationToken=null}
 * }}}
 *
 * @author Michael Nitschinger
 */
object KeyValueExample {

  def main(args: Array[String]): Unit = {

    // The SparkSession is the main entry point into Spark
    val spark = SparkSession
      .builder()
      .appName("KeyValueExample")
      .master("local[*]") // run Spark locally in this JVM, great for testing
      .config("spark.couchbase.nodes", "127.0.0.1") // connect to Couchbase on localhost
      .config("spark.couchbase.bucket.travel-sample", "") // open the travel-sample bucket with an empty password
      .config("com.couchbase.username", "Administrator")
      .config("com.couchbase.password", "password")
      .getOrCreate()

    spark.sparkContext
      .couchbaseGet[JsonDocument](Seq("airline_10123", "airline_10748")) // load documents from Couchbase
      .collect() // collect all data from the Spark workers
      .foreach(println) // print each document's content
  }

}
--------------------------------------------------------------------------------
/src/main/scala/MultiBucketExample.scala:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2016 Couchbase, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import com.couchbase.client.java.document.JsonDocument
import org.apache.spark.sql.SparkSession
import com.couchbase.spark._

/**
 * This example shows how to connect to more than one bucket, fetching two documents from one
 * and saving them in the other.
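 *
 * To verify the copy, the documents can afterwards be read back from the target bucket. A
 * minimal sketch, assuming the same local setup that is configured below:
 *
 * {{{
 * spark.sparkContext
 *   .couchbaseGet[JsonDocument](Seq("airline_10123", "airline_10748"), "default")
 *   .collect()
 *   .foreach(println)
 * }}}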
 *
 * @author Michael Nitschinger
 */
object MultiBucketExample {

  def main(args: Array[String]): Unit = {

    // The SparkSession is the main entry point into Spark
    val spark = SparkSession
      .builder()
      .appName("MultiBucketExample")
      .master("local[*]") // run Spark locally in this JVM, great for testing
      .config("spark.couchbase.nodes", "127.0.0.1") // connect to Couchbase on localhost
      .config("spark.couchbase.bucket.travel-sample", "") // open the travel-sample bucket with an empty password
      .config("spark.couchbase.bucket.default", "") // open the default bucket with an empty password
      .config("com.couchbase.username", "Administrator")
      .config("com.couchbase.password", "password")
      .getOrCreate()

    spark.sparkContext
      .couchbaseGet[JsonDocument](Seq("airline_10123", "airline_10748"), "travel-sample")
      .saveToCouchbase("default") // write them into default
  }
}
--------------------------------------------------------------------------------
/src/main/scala/N1QLExample.scala:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2016 Couchbase, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import com.couchbase.client.java.query.N1qlQuery
import org.apache.spark.sql.{Row, SparkSession}
import com.couchbase.spark._
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

/**
 * This example shows how to perform a "raw" N1QL query generating an RDD.
 *
 * If you want to work with Spark SQL, please see [[SparkSQLExample]].
 *
 * This code prints:
 *
 * {{{
 * {"count":1560,"country":"United States"}
 * {"count":221,"country":"France"}
 * {"count":187,"country":"United Kingdom"}
 * }}}
 *
 * @author Michael Nitschinger
 */
object N1QLExample {

  def main(args: Array[String]): Unit = {

    // The SparkSession is the main entry point into Spark
    val spark = SparkSession
      .builder()
      .appName("N1QLExample")
      .master("local[*]") // run Spark locally in this JVM, great for testing
      .config("spark.couchbase.nodes", "127.0.0.1") // connect to Couchbase on localhost
      .config("spark.couchbase.bucket.travel-sample", "") // open the travel-sample bucket with an empty password
      .config("com.couchbase.username", "Administrator")
      .config("com.couchbase.password", "password")
      .getOrCreate()

    // This query groups airports by country and counts them.
    val query = N1qlQuery.simple("" +
      "select country, count(*) as count " +
      "from `travel-sample` " +
      "where type = 'airport' " +
      "group by country " +
      "order by count desc")

    // Perform the query and print the country name and the count.
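    // (couchbaseQuery returns an RDD of query rows; `.value` exposes each row
    // as a JsonObject, which prints as raw JSON.)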
    spark.sparkContext
      .couchbaseQuery(query)
      .map(_.value)
      .foreach(println)


    val schema = StructType(
      StructField("count", IntegerType) ::
      StructField("country", StringType) :: Nil
    )

    val rdd = spark.sparkContext
      .couchbaseQuery(query)
      .map(r => Row(r.value.getInt("count"), r.value.getString("country")))
    spark.createDataFrame(rdd, schema).show()
  }
}
--------------------------------------------------------------------------------
/src/main/scala/SparkSQLExample.scala:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2016 Couchbase, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.sources.EqualTo
import com.couchbase.spark.sql._

/**
 * This example shows how to work with Spark SQL / DataFrames.
 *
 * If you want to see how to use N1QL directly, please see [[N1QLExample]].
 * If you want to see Dataset conversions, check out [[DatasetExample]].
 *
 * Note that this code uses automatic schema inference based on the predicate provided when
 * creating the DataFrame.
 * It is also possible to provide a schema manually instead of inferring it:
 *
 * {{{
 * val airlines = spark.read.couchbase(schema = StructType(
 *   StructField("name", StringType) ::
 *   StructField("iata", StringType) ::
 *   StructField("type", StringType) :: Nil
 * ))
 * }}}
 *
 * @author Michael Nitschinger
 */
object SparkSQLExample {

  def main(args: Array[String]): Unit = {

    // The SparkSession is the main entry point into Spark
    val spark = SparkSession
      .builder()
      .appName("SparkSQLExample")
      .master("local[*]") // run Spark locally in this JVM, great for testing
      .config("spark.couchbase.nodes", "127.0.0.1") // connect to Couchbase on localhost
      .config("spark.couchbase.bucket.travel-sample", "") // open the travel-sample bucket with an empty password
      .config("com.couchbase.username", "Administrator")
      .config("com.couchbase.password", "password")
      .getOrCreate()

    // Create DataFrames with automatic schema inference based on a "type" predicate
    val airlines = spark.read.couchbase(EqualTo("type", "airline"))
    val airports = spark.read.couchbase(EqualTo("type", "airport"))

    // The inferred schemata can be nicely printed to cross-check
    println(s"Airline Schema: ${airlines.schema.treeString}")
    println(s"Airport Schema: ${airports.schema.treeString}")

    // Query the Airlines DataFrame through Spark SQL
    println("Name and callsign for 10 airlines:")
    airlines
      .select("name", "callsign")
      .sort(airlines("callsign").desc)
      .show(10)

    // Count all Airports
    println(s"Number of Airports: ${airports.count()}")

    // Group and count airports by country
    println("Airports by Country:")
    airports
      .groupBy("country")
      .count()
      .show()
  }

}
--------------------------------------------------------------------------------
/src/main/scala/StreamingExample.scala:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2016 Couchbase, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{StreamingContext, Seconds}
import com.couchbase.spark.streaming._

/**
 * This example shows how to perform raw Spark Streaming from a Couchbase DCP feed.
 *
 * If you are looking to stream structured data more easily, take a look at the newly introduced
 * [[StructuredStreamingExample]] instead, which is also easier to use and provides stronger
 * guarantees out of the box.
 *
 * @author Michael Nitschinger
 */
object StreamingExample {

  def main(args: Array[String]): Unit = {

    // Create the Spark config and instruct it to use the travel-sample bucket
    // with an empty bucket password.
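    // (Note: both the com.couchbase.* and the spark.couchbase.* key prefixes
    // appear throughout these samples; on a plain SparkConf the com.couchbase.*
    // form is used here.)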
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("StreamingExample")
      .set("com.couchbase.username", "Administrator")
      .set("com.couchbase.password", "password")
      .set("com.couchbase.bucket.travel-sample", "")

    // Initialize the StreamingContext with a batch interval of 5 seconds
    val ssc = new StreamingContext(conf, Seconds(5))

    // Consume the DCP stream from the beginning and never stop.
    // This counts the messages per interval and prints their count.
    ssc
      .couchbaseStream(from = FromBeginning, to = ToInfinity)
      .count()
      .print()

    // Start the stream and await termination
    ssc.start()
    ssc.awaitTermination()
  }

}
--------------------------------------------------------------------------------
/src/main/scala/StructuredStreamingExample.scala:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2016 Couchbase, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._

/**
 * This example shows how to utilize the new structured streaming approach together with the
 * change feed (DCP) from Couchbase.
 *
 * If no traffic runs on the Couchbase bucket at the start of this script, it prints:
 *
 * {{{
 * +--------+-----+
 * |    type|count|
 * +--------+-----+
 * |   hotel|  917|
 * |    null|    1|
 * |landmark| 4495|
 * | airline|  187|
 * | airport| 1968|
 * |   route|24024|
 * +--------+-----+
 * }}}
 *
 * Since it keeps the counts as a running total, if you then modify a document in the UI, for
 * example an airport, you'll see the airport type count increase by one.
 *
 * @author Michael Nitschinger
 */
object StructuredStreamingExample {

  // Very simple schema, feel free to add more properties here. Properties that do not
  // exist in a streamed document show up as null.
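  // (META_ID is the column under which the connector exposes the ID of each
  // streamed document.)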
  val schema = StructType(
    StructField("META_ID", StringType) ::
    StructField("type", StringType) ::
    StructField("name", StringType) :: Nil
  )

  def main(args: Array[String]): Unit = {

    // The SparkSession is the main entry point into Spark
    val spark = SparkSession
      .builder()
      .appName("StructuredStreamingExample")
      .master("local[*]") // run Spark locally in this JVM, great for testing
      .config("spark.couchbase.nodes", "127.0.0.1") // connect to Couchbase on localhost
      .config("spark.couchbase.bucket.travel-sample", "") // open the travel-sample bucket with an empty password
      .config("com.couchbase.username", "Administrator")
      .config("com.couchbase.password", "password")
      .getOrCreate()

    // Define the structured stream from Couchbase with the given schema
    val records = spark.readStream
      .format("com.couchbase.spark.sql")
      .schema(schema)
      .load()

    // Count per type and print to screen
    records
      .groupBy("type")
      .count()
      .writeStream
      .outputMode("complete")
      .format("console")
      .start()
      .awaitTermination()
  }

}
--------------------------------------------------------------------------------
/src/main/scala/SubdocLookupExample.scala:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2016 Couchbase, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import com.couchbase.spark._
import org.apache.spark.sql.SparkSession

/**
 * This example shows how to use the subdocument API, which was introduced with
 * Couchbase Server 4.5.0.
 *
 * @author Michael Nitschinger
 */
object SubdocLookupExample {

  def main(args: Array[String]): Unit = {

    // The SparkSession is the main entry point into Spark
    val spark = SparkSession
      .builder()
      .appName("SubdocExample")
      .master("local[*]") // run Spark locally in this JVM, great for testing
      .config("spark.couchbase.nodes", "127.0.0.1") // connect to Couchbase on localhost
      .config("spark.couchbase.bucket.travel-sample", "") // open the travel-sample bucket with an empty password
      .config("com.couchbase.username", "Administrator")
      .config("com.couchbase.password", "password")
      .getOrCreate()


    val result = spark.sparkContext.couchbaseSubdocLookup(
      Seq("airline_10123"), // fetch these document IDs
      Seq("name", "iata"),  // only fetch their name and iata code
      Seq("foobar")         // but also check if the foobar key exists in the doc
    ).collect()

    // Prints
    // SubdocLookupResult(
    //   airline_10123,0,Map(name -> Texas Wings, iata -> TQ),Map(foobar -> false)
    // )
    result.foreach(println)

    // Same as above, but omits the exists check and just looks up the fields.
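    // (couchbaseSubdocLookup returns an RDD of SubdocLookupResult values, so the
    // results are collected to the driver before printing, as above.)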
    val r2 = spark.sparkContext.couchbaseSubdocLookup(
      Seq("airline_10123"),
      Seq("name", "iata")
    ).collect()

    // Prints
    // SubdocLookupResult(
    //   airline_10123,0,Map(name -> Texas Wings, iata -> TQ),Map()
    // )
    r2.foreach(println)

  }
}
--------------------------------------------------------------------------------
/src/main/scala/SubdocMutationExample.scala:
--------------------------------------------------------------------------------
import org.apache.spark.sql.SparkSession
import com.couchbase.spark._
import com.couchbase.spark.connection._

/**
 * This example shows how to mutate documents through the subdocument API.
 *
 * Created by daschl on 13/07/17.
 */
object SubdocMutationExample {

  def main(args: Array[String]): Unit = {
    // The SparkSession is the main entry point into Spark
    val spark = SparkSession
      .builder()
      .appName("SubdocExample")
      .master("local[*]") // run Spark locally in this JVM, great for testing
      .config("spark.couchbase.nodes", "127.0.0.1") // connect to Couchbase on localhost
      .config("spark.couchbase.bucket.travel-sample", "") // open the travel-sample bucket with an empty password
      .config("spark.couchbase.username", "Administrator") // if you are using RBAC / Server 5.0
      .config("spark.couchbase.password", "password") // if you are using RBAC / Server 5.0
      .getOrCreate()

    // Change a field in a document that already exists
    spark.sparkContext
      .couchbaseSubdocMutate(Seq(SubdocReplace("airline_10", "name", "42-Mile-Air")))
      .collect()

    // Append an element to an array
    spark.sparkContext
      .couchbaseSubdocMutate(Seq(SubdocArrayAppend("airline_1191", "codes", 1, true)))
      .collect()

  }
}
--------------------------------------------------------------------------------
/src/main/scala/TransformationExample.scala:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2015 Couchbase, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import com.couchbase.client.java.document.JsonDocument
import com.couchbase.client.java.document.json.JsonObject
import com.couchbase.spark._
import com.couchbase.spark.streaming._
import org.apache.spark.sql.{DataFrameReader, SQLContext, SparkSession}
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * A sample Apache Spark program that shows how Couchbase may be used with Spark
 * when doing data transformations.
 *
 * Assuming a MySQL database and documents with this format:
 *
 * {{{
 * {
 *   "givenname": "Matt",
 *   "surname": "Ingenthron",
 *   "email": "matt@email.com"
 * }
 * }}}
 *
 * Stream out all documents, look them up in the data loaded from MySQL, join on
 * the email address, and add the entitlement token.
 *
 * If you run this in the IDE, make sure to set the master to local[*]!
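 *
 * After enrichment, a streamed document is written back in roughly this shape (a sketch, based
 * on the sample entitlements table shown further down in the code):
 *
 * {{{
 * {
 *   "givenname": "Matt",
 *   "surname": "Ingenthron",
 *   "email": "matt@email.com",
 *   "entitlementtoken": 11211
 * }
 * }}}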
 */
object TransformationExample {

  /** Returns a JsonDocument based on a tuple of two strings */
  def CreateDocument(s: (String, String)): JsonDocument = {
    JsonDocument.create(s._1, JsonObject.fromJson(s._2))
  }

  /** Returns a tuple keyed by the email address extracted from the document */
  def CreateMappableRdd(s: (String, String)): (String, JsonDocument) = {
    val return_doc = JsonDocument.create(s._1, JsonObject.fromJson(s._2))
    (return_doc.content().getString("email"), return_doc)
  }

  /** Returns a JsonDocument enriched with the entitlement token */
  def mergeIntoDoc(t: (String, (JsonDocument, Integer))): JsonDocument = {
    val jsonToEnrich = t._2._1.content()
    val entitlementFromJoin = t._2._2
    jsonToEnrich.put("entitlementtoken", entitlementFromJoin)
    t._2._1
  }

  def getMysqlReader(sqlctx: SQLContext): DataFrameReader = {

    // Now get set up to fetch things from MySQL.
    // The database name is ext_users, with the data we want in a table named profiles
    // and a read-only user named profiles.
    val mysql_connstr = "jdbc:mysql://localhost:3306/ext_users"
    val mysql_uname = "profiles"
    val mysql_password = "profiles"

    sqlctx.read.format("jdbc").options(
      Map("url" -> (mysql_connstr + "?user=" + mysql_uname + "&password=" + mysql_password),
        "dbtable" -> "ext_users.profiles"))
  }

  def main(args: Array[String]): Unit = {
    Class.forName("com.mysql.jdbc.Driver").newInstance // Load the MySQL Connector

    // Initialize the Spark session.
    val spark = SparkSession
      .builder()
      .appName("TransformationExample")
      // Configure for the Couchbase bucket "transformative"
      .config("spark.couchbase.bucket.transformative", "")
      .config("com.couchbase.username", "Administrator")
      .config("com.couchbase.password", "password")
      .getOrCreate()

    val mysqlReader = getMysqlReader(spark.sqlContext) // set up a MySQL reader

    // Note that if the database were quite large, you could push down other predicates to MySQL
    // or partition the DataFrame:
    // mysqlReader.load().filter("email = \"matt@email.com\"")

    // Load the DataFrame of all of the users from MySQL.
    // Note: appending .cache() may make sense here (or not), depending on the amount of data.
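    // A cached variant would look like this (a sketch; only worthwhile if the
    // table fits comfortably in memory):
    //   val entitlements = mysqlReader.load().cache()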
    val entitlements = mysqlReader.load()

    /* loading this:
    +---------+-----------+-----------------+----------------+
    |givenname|    surname|            email|entitlementtoken|
    +---------+-----------+-----------------+----------------+
    |     Matt| Ingenthron|   matt@email.com|           11211|
    |  Michael|Nitschinger|michael@email.com|           11210|
    +---------+-----------+-----------------+----------------+
    */

    val entitlementsSansSchema = entitlements.rdd
      .map[(String, Integer)](f => (f.getAs[String]("email"), f.getAs[Integer]("entitlementtoken")))

    val ssc = new StreamingContext(spark.sparkContext, Seconds(5))

    ssc.couchbaseStream("transformative")
      .filter(_.isInstanceOf[Mutation])
      .map(m => (new String(m.asInstanceOf[Mutation].key), new String(m.asInstanceOf[Mutation].content)))
      .map(s => CreateMappableRdd(s))
      .filter(_._2.content().get("entitlementtoken") == null)
      .foreachRDD(rdd => {
        rdd
          .join(entitlementsSansSchema)
          .map(mergeIntoDoc)
          //.foreach(println) // a good place to see the effect
          .saveToCouchbase("transformative")
      })

    ssc.start()
    ssc.awaitTermination()
  }

}
--------------------------------------------------------------------------------
/src/main/scala/Word2VecExample.scala:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2016 Couchbase, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import com.couchbase.client.java.query.N1qlQuery
import org.apache.spark.SparkContext
import com.couchbase.spark._
import java.io.File

import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}
import org.apache.spark.sql.SparkSession

/**
 * This example shows how to use the Word2Vec machine learning model together with Couchbase.
 *
 * It takes hotel reviews from the travel-sample bucket, trains the model, and then searches for
 * synonyms of "hotel".
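 *
 * A search word and a synonym count can also be passed as program arguments, for example
 * `sbt "run-main Word2VecExample airport 5"` (the defaults are "hotel" and 10).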
 *
 * @author Will Gardella
 * @author Michael Nitschinger
 */
object Word2VecExample {

  def main(args: Array[String]): Unit = {

    // The SparkSession is the main entry point into Spark
    val spark = SparkSession
      .builder()
      .appName("Word2VecExample")
      .master("local[*]") // run Spark locally in this JVM, great for testing
      .config("spark.couchbase.nodes", "127.0.0.1") // connect to Couchbase on localhost
      .config("spark.couchbase.bucket.travel-sample", "") // open the travel-sample bucket with an empty password
      .config("com.couchbase.username", "Administrator")
      .config("com.couchbase.password", "password")
      .getOrCreate()

    // Train the model if it is not trained already
    val model = trainAndLoadModel(spark.sparkContext)

    // Find the synonyms in the trained model and print them out
    model
      .findSynonyms(args.lift(0).getOrElse("hotel"), args.lift(1).map(_.toInt).getOrElse(10))
      .foreach { case (syn, sim) =>
        println(s"\t(•͡˘㇁•͡˘) --> \t[$syn] (with a similarity of $sim)")
      }

    spark.stop()
  }

  /**
   * Trains and saves the model if it has not been persisted at the given path yet (the training
   * data is fetched through a N1QL query); otherwise loads the previously saved model.
   */
  def trainAndLoadModel(sc: SparkContext): Word2VecModel = {
    val word2vec = new Word2Vec
    val path = "reviews"

    if (!new File(path).exists) {
      val reviews = "SELECT m.content from (" +
        "SELECT ELEMENT reviews FROM `travel-sample` WHERE type = 'hotel' AND ARRAY_LENGTH(reviews) > 0" +
        ") AS x UNNEST x AS m;"
      val input = sc.couchbaseQuery(N1qlQuery.simple(reviews))
        .map(_.value.getString("content").split(" ").map(_.toLowerCase.replaceAll("[^a-z0-9]", "")).toSeq)

      val m = word2vec.fit(input)
      m.save(sc, path)
      m
    } else {
      Word2VecModel.load(sc, path)
    }
  }
}
--------------------------------------------------------------------------------