├── .gitignore
├── README.md
├── build.sbt
├── project
│   ├── build.properties
│   └── plugins.sbt
└── src
    └── main
        ├── resources
        │   └── log4j.properties
        └── scala
            ├── DatasetExample.scala
            ├── HdfsExample.scala
            ├── JavaExample.java
            ├── KeyValueExample.scala
            ├── MultiBucketExample.scala
            ├── N1QLExample.scala
            ├── SparkSQLExample.scala
            ├── StreamingExample.scala
            ├── StructuredStreamingExample.scala
            ├── SubdocLookupExample.scala
            ├── SubdocMutationExample.scala
            ├── TransformationExample.scala
            └── Word2VecExample.scala

/.gitignore:
--------------------------------------------------------------------------------
target/
.idea/
reviews/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Couchbase Spark Samples
=======================

This repository contains small snippets that showcase the usage of
the [Couchbase Spark Connector](https://github.com/couchbase/couchbase-spark-connector)
with [Apache Spark](http://spark.apache.org/).

Please see the [documentation](http://docs.couchbase.com) for a quickstart and
the full reference. Also make sure to have at least the `travel-sample` and the
`default` buckets loaded. Some examples need additional setup, such as the
JDBC (MySQL) and Hadoop/HDFS examples.

All examples can be run from the IDE by calling their `main` methods, and you
can also run them from the command line, for example: `sbt "run-main DatasetExample"`
--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
name := "couchbase-spark-samples"

organization := "com.couchbase"

version := "1.0.0-SNAPSHOT"

scalaVersion := "2.12.0"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "2.4.0",
  "org.apache.spark" %% "spark-streaming" % "2.4.0",
  "org.apache.spark" %% "spark-sql" % "2.4.0",
  "com.couchbase.client" %% "spark-connector" % "2.4.0",
  "org.apache.spark" %% "spark-mllib" % "2.4.0",
  "mysql" % "mysql-connector-java" % "5.1.37"
)

resolvers += Resolver.mavenLocal
--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
sbt.version = 0.13.8
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
logLevel := Level.Warn
--------------------------------------------------------------------------------
/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
# Root logger option
log4j.rootLogger=INFO, stdout

# Direct log messages to stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target=System.out
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
--------------------------------------------------------------------------------
/src/main/scala/DatasetExample.scala:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2016 Couchbase, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.sources.EqualTo
import com.couchbase.spark.sql._

/** Airline has a subset of the fields that are in the database */
case class Airline(name: String, iata: String, icao: String, country: String)

/**
 * This example is very similar to [[SparkSQLExample]], but it shows how a DataFrame is converted
 * into a Dataset, which can then be used through regular Scala code instead of Spark SQL syntax
 * while still preserving the optimizations of the underlying Spark execution engine.
 *
 * @author Michael Nitschinger
 */
object DatasetExample {

  def main(args: Array[String]): Unit = {

    // The SparkSession is the main entry point into Spark
    val spark = SparkSession
      .builder()
      .appName("DatasetExample")
      .master("local[*]") // run Spark locally in this JVM, great for testing
      .config("spark.couchbase.nodes", "127.0.0.1") // connect to Couchbase on localhost
      .config("spark.couchbase.bucket.travel-sample", "") // open the travel-sample bucket with an empty password
      .config("com.couchbase.username", "Administrator")
      .config("com.couchbase.password", "password")
      .getOrCreate()

    // Import needed for case class conversions
    import spark.implicits._

    // Create the DataFrame and convert it into a Dataset through `as[Airline]`
    val airlines = spark.read.couchbase(EqualTo("type", "airline")).as[Airline]

    // Print the schema, just like on a DataFrame
    airlines.printSchema()

    // Print airlines that start with A, using the Scala API instead of Spark SQL syntax.
    airlines
      .map(_.name)
      .filter(_.toLowerCase.startsWith("a"))
      .foreach(println(_))
  }

}
--------------------------------------------------------------------------------
/src/main/scala/HdfsExample.scala:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2016 Couchbase, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import com.couchbase.client.java.query.N1qlQuery
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.sources.EqualTo
import com.couchbase.spark.sql._
import com.couchbase.spark._

/**
 * This example shows how to combine DataFrames from Couchbase and HDFS to perform useful queries.
 *
 * Note: the first time, run this script as-is; afterwards, comment out the first block, where the
 * records are initially imported into HDFS. That block is just the "data loading" phase for this
 * example.
 *
 * Make sure HDFS runs on port 9000 on localhost, or adapt the read/write paths in this example.
 *
 * @author Michael Nitschinger
 * @author Matt Ingenthron
 */
object HdfsExample {

  def main(args: Array[String]): Unit = {

    // The SparkSession is the main entry point into Spark
    val spark = SparkSession
      .builder()
      .appName("HdfsExample")
      .master("local[*]") // run Spark locally in this JVM, great for testing
      .config("spark.couchbase.nodes", "127.0.0.1") // connect to Couchbase on localhost
      .config("spark.couchbase.bucket.travel-sample", "") // open the travel-sample bucket with an empty password
      .config("com.couchbase.username", "Administrator")
      .config("com.couchbase.password", "password")
      .getOrCreate()

    // ---- Write Data into HDFS (! do once to load and then comment out ... !) ----
    val query = "SELECT `travel-sample`.* from `travel-sample` WHERE type = 'landmark'"
    spark.sparkContext
      .couchbaseQuery(N1qlQuery.simple(query))
      .map(_.value.toString)
      .saveAsTextFile("hdfs://127.0.0.1:9000/landmarks")

    // ---- Load data from HDFS and join with records in Couchbase ----

    // Load Landmarks from HDFS
    val landmarks = spark.read.json("hdfs://127.0.0.1:9000/landmarks/*")
    landmarks.createOrReplaceTempView("landmarks")

    // Load Airports from Couchbase
    val airports = spark.read.couchbase(schemaFilter = EqualTo("type", "airport"))

    // Find all landmarks in the same city as the given FAA code
    val toFind = "SFO" // try SFO or LAX
    airports
      .join(landmarks, airports("city") === landmarks("city"))
      .where(airports("faa") === toFind and landmarks("url").isNotNull)
      .select(landmarks("name"), landmarks("address"), airports("faa"))
      .orderBy(landmarks("name").asc)
      .show()
  }
}
--------------------------------------------------------------------------------
/src/main/scala/JavaExample.java:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2016 Couchbase, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import com.couchbase.client.java.document.JsonDocument;
import com.couchbase.client.java.document.json.JsonObject;
import com.couchbase.client.java.query.N1qlQuery;
import com.couchbase.spark.japi.CouchbaseSparkContext;
import com.couchbase.spark.rdd.CouchbaseQueryRow;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.sources.EqualTo;

import java.util.Arrays;
import java.util.List;

import static com.couchbase.spark.japi.CouchbaseDataFrameReader.couchbaseReader;
import static com.couchbase.spark.japi.CouchbaseDocumentRDD.couchbaseDocumentRDD;
import static com.couchbase.spark.japi.CouchbaseRDD.couchbaseRDD;
import static com.couchbase.spark.japi.CouchbaseSparkContext.couchbaseContext;

/**
 * This example shows how to use Spark and the Couchbase Connector from Java.
 *
 * @author Michael Nitschinger
 */
public class JavaExample {

    public static void main(String[] args) {

        SparkSession spark = SparkSession
            .builder()
            .appName("JavaExample")
            .master("local[*]") // run Spark locally in this JVM, great for testing
            .config("spark.couchbase.nodes", "127.0.0.1") // connect to Couchbase on localhost
            .config("spark.couchbase.bucket.travel-sample", "") // open the travel-sample bucket with an empty password
            .config("com.couchbase.username", "Administrator")
            .config("com.couchbase.password", "password")
            .getOrCreate();

        // The Java wrapper around the SparkContext
        JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

        // The Couchbase-enabled SparkContext
        CouchbaseSparkContext csc = couchbaseContext(jsc);

        // Load docs through K/V
        List<JsonDocument> docs = csc
            .couchbaseGet(Arrays.asList("airline_10226", "airline_10748"))
            .collect();
        System.out.println(docs);

        // Load docs through K/V from a mapped RDD
        JavaRDD<String> ids = jsc.parallelize(Arrays.asList("airline_10226", "airline_10748"));
        docs = couchbaseRDD(ids).couchbaseGet().collect();
        System.out.println(docs);

        // Perform a N1QL query
        List<CouchbaseQueryRow> results = csc
            .couchbaseQuery(N1qlQuery.simple("SELECT * FROM `travel-sample` LIMIT 10"))
            .collect();

        System.out.println(results);

        // Store an (empty) document
        couchbaseDocumentRDD(
            jsc.parallelize(Arrays.asList(JsonDocument.create("doc1", JsonObject.empty())))
        ).saveToCouchbase();

        // Wrap the reader and create the DataFrame from Couchbase.
        // Note that since Spark 2.0, a DataFrame == Dataset<Row>
        Dataset<Row> airlines = couchbaseReader(spark.read()).couchbase(new EqualTo("type", "airline"));

        // Print the inferred schema
        airlines.printSchema();

        // Print the number of airlines
        System.out.println("Number of Airlines: " + airlines.count());

    }
}
--------------------------------------------------------------------------------
/src/main/scala/KeyValueExample.scala:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2016 Couchbase, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import com.couchbase.client.java.document.JsonDocument
import org.apache.spark.sql.SparkSession
import com.couchbase.spark._

/**
 * This example fetches two documents by their document IDs and prints out their contents.
 *
 * This prints:
 *
 * {{{
 * JsonDocument{id='airline_10748', cas=312610725036034, expiry=0,
 * content={"country":"United States","iata":"ZQ","name":"Locair","callsign":"LOCAIR","icao":"LOC","id":10748,"type":"airline"},
 * mutationToken=null}
 *
 * JsonDocument{id='airline_10123', cas=312605445586946, expiry=0,
 * content={"country":"United States","iata":"TQ","name":"Texas Wings","callsign":"TXW","icao":"TXW","id":10123,"type":"airline"},
 * mutationToken=null}
 * }}}
 *
 * @author Michael Nitschinger
 */
object KeyValueExample {

  def main(args: Array[String]): Unit = {

    // The SparkSession is the main entry point into Spark
    val spark = SparkSession
      .builder()
      .appName("KeyValueExample")
      .master("local[*]") // run Spark locally in this JVM, great for testing
      .config("spark.couchbase.nodes", "127.0.0.1") // connect to Couchbase on localhost
      .config("spark.couchbase.bucket.travel-sample", "") // open the travel-sample bucket with an empty password
      .config("com.couchbase.username", "Administrator")
      .config("com.couchbase.password", "password")
      .getOrCreate()

    spark.sparkContext
      .couchbaseGet[JsonDocument](Seq("airline_10123", "airline_10748")) // load documents from Couchbase
      .collect() // collect all data from the Spark workers
      .foreach(println) // print each document's content
  }

}
--------------------------------------------------------------------------------
/src/main/scala/MultiBucketExample.scala:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2016 Couchbase, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import com.couchbase.client.java.document.JsonDocument
import org.apache.spark.sql.SparkSession
import com.couchbase.spark._

/**
 * This example shows how to connect to more than one bucket, fetching two documents from one
 * and saving them in the other.
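 *
 * To verify the copy, the documents can afterwards be read back from the target bucket. A
 * minimal sketch, assuming the same local setup that is configured below:
 *
 * {{{
 * spark.sparkContext
 *   .couchbaseGet[JsonDocument](Seq("airline_10123", "airline_10748"), "default")
 *   .collect()
 *   .foreach(println)
 * }}}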
 *
 * @author Michael Nitschinger
 */
object MultiBucketExample {

  def main(args: Array[String]): Unit = {

    // The SparkSession is the main entry point into Spark
    val spark = SparkSession
      .builder()
      .appName("MultiBucketExample")
      .master("local[*]") // run Spark locally in this JVM, great for testing
      .config("spark.couchbase.nodes", "127.0.0.1") // connect to Couchbase on localhost
      .config("spark.couchbase.bucket.travel-sample", "") // open the travel-sample bucket with an empty password
      .config("spark.couchbase.bucket.default", "") // open the default bucket with an empty password
      .config("com.couchbase.username", "Administrator")
      .config("com.couchbase.password", "password")
      .getOrCreate()

    spark.sparkContext
      .couchbaseGet[JsonDocument](Seq("airline_10123", "airline_10748"), "travel-sample")
      .saveToCouchbase("default") // write them into default
  }
}
--------------------------------------------------------------------------------
/src/main/scala/N1QLExample.scala:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2016 Couchbase, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import com.couchbase.client.java.query.N1qlQuery
import org.apache.spark.sql.{Row, SparkSession}
import com.couchbase.spark._
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

/**
 * This example shows how to perform a "raw" N1QL query generating an RDD.
 *
 * If you want to work with Spark SQL, please see [[SparkSQLExample]].
 *
 * This code prints:
 *
 * {{{
 * {"count":1560,"country":"United States"}
 * {"count":221,"country":"France"}
 * {"count":187,"country":"United Kingdom"}
 * }}}
 *
 * @author Michael Nitschinger
 */
object N1QLExample {

  def main(args: Array[String]): Unit = {

    // The SparkSession is the main entry point into Spark
    val spark = SparkSession
      .builder()
      .appName("N1QLExample")
      .master("local[*]") // run Spark locally in this JVM, great for testing
      .config("spark.couchbase.nodes", "127.0.0.1") // connect to Couchbase on localhost
      .config("spark.couchbase.bucket.travel-sample", "") // open the travel-sample bucket with an empty password
      .config("com.couchbase.username", "Administrator")
      .config("com.couchbase.password", "password")
      .getOrCreate()

    // This query groups airports by country and counts them.
    val query = N1qlQuery.simple("" +
      "select country, count(*) as count " +
      "from `travel-sample` " +
      "where type = 'airport' " +
      "group by country " +
      "order by count desc")

    // Perform the query and print the country name and the count.
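    // (couchbaseQuery returns an RDD of query rows; `.value` exposes each row
    // as a JsonObject, which prints as raw JSON.)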
    spark.sparkContext
      .couchbaseQuery(query)
      .map(_.value)
      .foreach(println)


    val schema = StructType(
      StructField("count", IntegerType) ::
      StructField("country", StringType) :: Nil
    )

    val rdd = spark.sparkContext
      .couchbaseQuery(query)
      .map(r => Row(r.value.getInt("count"), r.value.getString("country")))
    spark.createDataFrame(rdd, schema).show()
  }
}
--------------------------------------------------------------------------------
/src/main/scala/SparkSQLExample.scala:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2016 Couchbase, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.sources.EqualTo
import com.couchbase.spark.sql._

/**
 * This example shows how to work with Spark SQL / DataFrames.
 *
 * If you want to see how to use N1QL directly, please see [[N1QLExample]].
 * If you want to see Dataset conversions, check out [[DatasetExample]].
 *
 * Note that this code uses automatic schema inference based on the predicate provided when
 * creating the DataFrame.
 * It is also possible to provide a schema manually instead of inferring it:
 *
 * {{{
 * val airlines = spark.read.couchbase(schema = StructType(
 *   StructField("name", StringType) ::
 *   StructField("iata", StringType) ::
 *   StructField("type", StringType) :: Nil
 * ))
 * }}}
 *
 * @author Michael Nitschinger
 */
object SparkSQLExample {

  def main(args: Array[String]): Unit = {

    // The SparkSession is the main entry point into Spark
    val spark = SparkSession
      .builder()
      .appName("SparkSQLExample")
      .master("local[*]") // run Spark locally in this JVM, great for testing
      .config("spark.couchbase.nodes", "127.0.0.1") // connect to Couchbase on localhost
      .config("spark.couchbase.bucket.travel-sample", "") // open the travel-sample bucket with an empty password
      .config("com.couchbase.username", "Administrator")
      .config("com.couchbase.password", "password")
      .getOrCreate()

    // Create DataFrames with automatic schema inference based on a "type" predicate
    val airlines = spark.read.couchbase(EqualTo("type", "airline"))
    val airports = spark.read.couchbase(EqualTo("type", "airport"))

    // The inferred schemata can be nicely printed to cross-check
    println(s"Airline Schema: ${airlines.schema.treeString}")
    println(s"Airport Schema: ${airports.schema.treeString}")

    // Query the Airlines DataFrame through Spark SQL
    println("Name and callsign for 10 airlines:")
    airlines
      .select("name", "callsign")
      .sort(airlines("callsign").desc)
      .show(10)

    // Count all Airports
    println(s"Number of Airports: ${airports.count()}")

    // Group and count airports by country
    println("Airports by Country:")
    airports
      .groupBy("country")
      .count()
      .show()
  }

}
--------------------------------------------------------------------------------
/src/main/scala/StreamingExample.scala:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2016 Couchbase, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{StreamingContext, Seconds}
import com.couchbase.spark.streaming._

/**
 * This example shows how to perform raw Spark Streaming from a Couchbase DCP feed.
 *
 * If you are looking to stream structured data more easily, take a look at the newly introduced
 * [[StructuredStreamingExample]] instead, which is also easier to use and provides stronger
 * guarantees out of the box.
 *
 * @author Michael Nitschinger
 */
object StreamingExample {

  def main(args: Array[String]): Unit = {

    // Create the Spark config and instruct it to use the travel-sample bucket
    // with an empty bucket password.
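    // (Note: both the com.couchbase.* and the spark.couchbase.* key prefixes
    // appear throughout these samples; on a plain SparkConf the com.couchbase.*
    // form is used here.)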
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("StreamingExample")
      .set("com.couchbase.username", "Administrator")
      .set("com.couchbase.password", "password")
      .set("com.couchbase.bucket.travel-sample", "")

    // Initialize the StreamingContext with a batch interval of 5 seconds
    val ssc = new StreamingContext(conf, Seconds(5))

    // Consume the DCP stream from the beginning and never stop.
    // This counts the messages per interval and prints their count.
    ssc
      .couchbaseStream(from = FromBeginning, to = ToInfinity)
      .count()
      .print()

    // Start the stream and await termination
    ssc.start()
    ssc.awaitTermination()
  }

}
--------------------------------------------------------------------------------
/src/main/scala/StructuredStreamingExample.scala:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2016 Couchbase, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._

/**
 * This example shows how to utilize the new structured streaming approach together with the
 * change feed (DCP) from Couchbase.
 *
 * If no traffic runs on the Couchbase bucket at the start of this script, it prints:
 *
 * {{{
 * +--------+-----+
 * |    type|count|
 * +--------+-----+
 * |   hotel|  917|
 * |    null|    1|
 * |landmark| 4495|
 * | airline|  187|
 * | airport| 1968|
 * |   route|24024|
 * +--------+-----+
 * }}}
 *
 * Since it keeps the counts as a running total, if you then modify a document in the UI, for
 * example an airport, you'll see the airport type count increase by one.
 *
 * @author Michael Nitschinger
 */
object StructuredStreamingExample {

  // Very simple schema, feel free to add more properties here. Properties that do not
  // exist in a streamed document show up as null.
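  // (META_ID is the column under which the connector exposes the ID of each
  // streamed document.)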
  val schema = StructType(
    StructField("META_ID", StringType) ::
    StructField("type", StringType) ::
    StructField("name", StringType) :: Nil
  )

  def main(args: Array[String]): Unit = {

    // The SparkSession is the main entry point into Spark
    val spark = SparkSession
      .builder()
      .appName("StructuredStreamingExample")
      .master("local[*]") // run Spark locally in this JVM, great for testing
      .config("spark.couchbase.nodes", "127.0.0.1") // connect to Couchbase on localhost
      .config("spark.couchbase.bucket.travel-sample", "") // open the travel-sample bucket with an empty password
      .config("com.couchbase.username", "Administrator")
      .config("com.couchbase.password", "password")
      .getOrCreate()

    // Define the structured stream from Couchbase with the given schema
    val records = spark.readStream
      .format("com.couchbase.spark.sql")
      .schema(schema)
      .load()

    // Count per type and print to screen
    records
      .groupBy("type")
      .count()
      .writeStream
      .outputMode("complete")
      .format("console")
      .start()
      .awaitTermination()
  }

}
--------------------------------------------------------------------------------
/src/main/scala/SubdocLookupExample.scala:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2016 Couchbase, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import com.couchbase.spark._
import org.apache.spark.sql.SparkSession

/**
 * This example shows how to use the subdocument API, which was introduced with
 * Couchbase Server 4.5.0.
 *
 * @author Michael Nitschinger
 */
object SubdocLookupExample {

  def main(args: Array[String]): Unit = {

    // The SparkSession is the main entry point into Spark
    val spark = SparkSession
      .builder()
      .appName("SubdocExample")
      .master("local[*]") // run Spark locally in this JVM, great for testing
      .config("spark.couchbase.nodes", "127.0.0.1") // connect to Couchbase on localhost
      .config("spark.couchbase.bucket.travel-sample", "") // open the travel-sample bucket with an empty password
      .config("com.couchbase.username", "Administrator")
      .config("com.couchbase.password", "password")
      .getOrCreate()


    val result = spark.sparkContext.couchbaseSubdocLookup(
      Seq("airline_10123"), // fetch these document IDs
      Seq("name", "iata"),  // only fetch their name and iata code
      Seq("foobar")         // but also check if the foobar key exists in the doc
    ).collect()

    // Prints
    // SubdocLookupResult(
    //   airline_10123,0,Map(name -> Texas Wings, iata -> TQ),Map(foobar -> false)
    // )
    result.foreach(println)

    // Same as above, but omits the exists check and just looks up the fields.
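    // (couchbaseSubdocLookup returns an RDD of SubdocLookupResult values, so the
    // results are collected to the driver before printing, as above.)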
    val r2 = spark.sparkContext.couchbaseSubdocLookup(
      Seq("airline_10123"),
      Seq("name", "iata")
    ).collect()

    // Prints
    // SubdocLookupResult(
    //   airline_10123,0,Map(name -> Texas Wings, iata -> TQ),Map()
    // )
    r2.foreach(println)

  }
}
--------------------------------------------------------------------------------
/src/main/scala/SubdocMutationExample.scala:
--------------------------------------------------------------------------------
import org.apache.spark.sql.SparkSession
import com.couchbase.spark._
import com.couchbase.spark.connection._

/**
 * This example shows how to mutate documents through the subdocument API.
 *
 * Created by daschl on 13/07/17.
 */
object SubdocMutationExample {

  def main(args: Array[String]): Unit = {
    // The SparkSession is the main entry point into Spark
    val spark = SparkSession
      .builder()
      .appName("SubdocExample")
      .master("local[*]") // run Spark locally in this JVM, great for testing
      .config("spark.couchbase.nodes", "127.0.0.1") // connect to Couchbase on localhost
      .config("spark.couchbase.bucket.travel-sample", "") // open the travel-sample bucket with an empty password
      .config("spark.couchbase.username", "Administrator") // if you are using RBAC / Server 5.0
      .config("spark.couchbase.password", "password") // if you are using RBAC / Server 5.0
      .getOrCreate()

    // Change a field in a document that already exists
    spark.sparkContext
      .couchbaseSubdocMutate(Seq(SubdocReplace("airline_10", "name", "42-Mile-Air")))
      .collect()

    // Append an element to an array
    spark.sparkContext
      .couchbaseSubdocMutate(Seq(SubdocArrayAppend("airline_1191", "codes", 1, true)))
      .collect()

  }
}
--------------------------------------------------------------------------------
/src/main/scala/TransformationExample.scala:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2015 Couchbase, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import com.couchbase.client.java.document.JsonDocument
import com.couchbase.client.java.document.json.JsonObject
import com.couchbase.spark._
import com.couchbase.spark.streaming._
import org.apache.spark.sql.{DataFrameReader, SQLContext, SparkSession}
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * A sample Apache Spark program that shows how Couchbase may be used with Spark
 * when doing data transformations.
 *
 * Assuming a MySQL database and documents with this format:
 *
 * {{{
 * {
 *   "givenname": "Matt",
 *   "surname": "Ingenthron",
 *   "email": "matt@email.com"
 * }
 * }}}
 *
 * Stream out all documents, look them up in the data loaded from MySQL, join on
 * the email address, and add the entitlement token.
 *
 * If you run this in the IDE, make sure to set the master to local[*]!
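 *
 * After enrichment, a streamed document is written back in roughly this shape (a sketch, based
 * on the sample entitlements table shown further down in the code):
 *
 * {{{
 * {
 *   "givenname": "Matt",
 *   "surname": "Ingenthron",
 *   "email": "matt@email.com",
 *   "entitlementtoken": 11211
 * }
 * }}}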
 */
object TransformationExample {

  /** Returns a JsonDocument based on a tuple of two strings */
  def CreateDocument(s: (String, String)): JsonDocument = {
    JsonDocument.create(s._1, JsonObject.fromJson(s._2))
  }

  /** Returns a tuple keyed by the email address extracted from the document */
  def CreateMappableRdd(s: (String, String)): (String, JsonDocument) = {
    val return_doc = JsonDocument.create(s._1, JsonObject.fromJson(s._2))
    (return_doc.content().getString("email"), return_doc)
  }

  /** Returns a JsonDocument enriched with the entitlement token */
  def mergeIntoDoc(t: (String, (JsonDocument, Integer))): JsonDocument = {
    val jsonToEnrich = t._2._1.content()
    val entitlementFromJoin = t._2._2
    jsonToEnrich.put("entitlementtoken", entitlementFromJoin)
    t._2._1
  }

  def getMysqlReader(sqlctx: SQLContext): DataFrameReader = {

    // Now get set up to fetch things from MySQL.
    // The database name is ext_users, with the data we want in a table named profiles
    // and a read-only user named profiles.
    val mysql_connstr = "jdbc:mysql://localhost:3306/ext_users"
    val mysql_uname = "profiles"
    val mysql_password = "profiles"

    sqlctx.read.format("jdbc").options(
      Map("url" -> (mysql_connstr + "?user=" + mysql_uname + "&password=" + mysql_password),
        "dbtable" -> "ext_users.profiles"))
  }

  def main(args: Array[String]): Unit = {
    Class.forName("com.mysql.jdbc.Driver").newInstance // Load the MySQL Connector

    // Initialize the Spark session.
    val spark = SparkSession
      .builder()
      .appName("TransformationExample")
      // Configure for the Couchbase bucket "transformative"
      .config("spark.couchbase.bucket.transformative", "")
      .config("com.couchbase.username", "Administrator")
      .config("com.couchbase.password", "password")
      .getOrCreate()

    val mysqlReader = getMysqlReader(spark.sqlContext) // set up a MySQL reader

    // Note that if the database were quite large, you could push down other predicates to MySQL
    // or partition the DataFrame:
    // mysqlReader.load().filter("email = \"matt@email.com\"")

    // Load the DataFrame of all of the users from MySQL.
    // Note: appending .cache() may make sense here (or not), depending on the amount of data.
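    // A cached variant would look like this (a sketch; only worthwhile if the
    // table fits comfortably in memory):
    //   val entitlements = mysqlReader.load().cache()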
    val entitlements = mysqlReader.load()

    /* loading this:
    +---------+-----------+-----------------+----------------+
    |givenname|    surname|            email|entitlementtoken|
    +---------+-----------+-----------------+----------------+
    |     Matt| Ingenthron|   matt@email.com|           11211|
    |  Michael|Nitschinger|michael@email.com|           11210|
    +---------+-----------+-----------------+----------------+
    */

    val entitlementsSansSchema = entitlements.rdd
      .map[(String, Integer)](f => (f.getAs[String]("email"), f.getAs[Integer]("entitlementtoken")))

    val ssc = new StreamingContext(spark.sparkContext, Seconds(5))

    ssc.couchbaseStream("transformative")
      .filter(_.isInstanceOf[Mutation])
      .map(m => (new String(m.asInstanceOf[Mutation].key), new String(m.asInstanceOf[Mutation].content)))
      .map(s => CreateMappableRdd(s))
      .filter(_._2.content().get("entitlementtoken") == null)
      .foreachRDD(rdd => {
        rdd
          .join(entitlementsSansSchema)
          .map(mergeIntoDoc)
          //.foreach(println) // a good place to see the effect
          .saveToCouchbase("transformative")
      })

    ssc.start()
    ssc.awaitTermination()
  }

}
--------------------------------------------------------------------------------
/src/main/scala/Word2VecExample.scala:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2016 Couchbase, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import com.couchbase.client.java.query.N1qlQuery
import org.apache.spark.SparkContext
import com.couchbase.spark._
import java.io.File

import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}
import org.apache.spark.sql.SparkSession

/**
 * This example shows how to use the Word2Vec machine learning model together with Couchbase.
 *
 * It takes hotel reviews from the travel-sample bucket, trains the model, and then searches for
 * synonyms of "hotel".
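 *
 * A search word and a synonym count can also be passed as program arguments, for example
 * `sbt "run-main Word2VecExample airport 5"` (the defaults are "hotel" and 10).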
 *
 * @author Will Gardella
 * @author Michael Nitschinger
 */
object Word2VecExample {

  def main(args: Array[String]): Unit = {

    // The SparkSession is the main entry point into Spark
    val spark = SparkSession
      .builder()
      .appName("Word2VecExample")
      .master("local[*]") // run Spark locally in this JVM, great for testing
      .config("spark.couchbase.nodes", "127.0.0.1") // connect to Couchbase on localhost
      .config("spark.couchbase.bucket.travel-sample", "") // open the travel-sample bucket with an empty password
      .config("com.couchbase.username", "Administrator")
      .config("com.couchbase.password", "password")
      .getOrCreate()

    // Train the model if it is not trained already
    val model = trainAndLoadModel(spark.sparkContext)

    // Find the synonyms in the trained model and print them out
    model
      .findSynonyms(args.lift(0).getOrElse("hotel"), args.lift(1).map(_.toInt).getOrElse(10))
      .foreach { case (syn, sim) =>
        println(s"\t(•͡˘㇁•͡˘) --> \t[$syn] (with a similarity of $sim)")
      }

    spark.stop()
  }

  /**
   * Trains and saves the model if it has not been persisted at the given path yet (the training
   * data is fetched through a N1QL query); otherwise loads the previously saved model.
   */
  def trainAndLoadModel(sc: SparkContext): Word2VecModel = {
    val word2vec = new Word2Vec
    val path = "reviews"

    if (!new File(path).exists) {
      val reviews = "SELECT m.content from (" +
        "SELECT ELEMENT reviews FROM `travel-sample` WHERE type = 'hotel' AND ARRAY_LENGTH(reviews) > 0" +
        ") AS x UNNEST x AS m;"
      val input = sc.couchbaseQuery(N1qlQuery.simple(reviews))
        .map(_.value.getString("content").split(" ").map(_.toLowerCase.replaceAll("[^a-z0-9]", "")).toSeq)

      val m = word2vec.fit(input)
      m.save(sc, path)
      m
    } else {
      Word2VecModel.load(sc, path)
    }
  }
}
--------------------------------------------------------------------------------