├── project ├── build.properties ├── plugins.sbt └── Versions.scala ├── rootdoc.txt ├── spark-cassandra-connector └── src │ ├── it │ ├── resources │ │ ├── triggers │ │ │ └── README.txt │ │ └── log4j.properties │ └── scala │ │ └── com │ │ └── datastax │ │ └── spark │ │ └── connector │ │ ├── cql │ │ ├── CassandraAuthenticatedConnectorSpec.scala │ │ └── CassandraPartitionKeyWhereSpec.scala │ │ └── streaming │ │ ├── StreamingSpec.scala │ │ └── ActorStreamSpec.scala │ ├── main │ └── scala │ │ ├── org │ │ └── apache │ │ │ └── spark │ │ │ └── sql │ │ │ └── cassandra │ │ │ ├── package-info.java │ │ │ ├── package.scala │ │ │ ├── api │ │ │ └── java │ │ │ │ └── JavaCassandraSQLContext.scala │ │ │ ├── InsertIntoCassandraTable.scala │ │ │ ├── CassandraCatalog.scala │ │ │ └── CassandraSQLRow.scala │ │ └── com │ │ └── datastax │ │ └── spark │ │ └── connector │ │ ├── util │ │ ├── package.scala │ │ ├── CountingIterator.scala │ │ ├── MagicalTypeTricks.scala │ │ ├── ReflectionUtil.scala │ │ └── JavaApiHelper.scala │ │ ├── writer │ │ ├── package.scala │ │ ├── QueryExecutor.scala │ │ ├── RowWriter.scala │ │ ├── GenericRowWriter.scala │ │ ├── SqlRowWriter.scala │ │ ├── PropertyExtractor.scala │ │ ├── ConvertingPropertyExtractor.scala │ │ ├── AbstractRowWriter.scala │ │ ├── RowWriterFactory.scala │ │ ├── ObjectSizeEstimator.scala │ │ ├── WriteOption.scala │ │ ├── AsyncExecutor.scala │ │ └── WritableToCassandra.scala │ │ ├── rdd │ │ ├── package.scala │ │ ├── partitioner │ │ │ ├── package.scala │ │ │ ├── TokenRangeSplitter.scala │ │ │ ├── dht │ │ │ │ ├── Token.scala │ │ │ │ ├── TokenRange.scala │ │ │ │ └── TokenFactory.scala │ │ │ ├── CassandraRDDPartition.scala │ │ │ ├── Murmur3PartitionerTokenRangeSplitter.scala │ │ │ ├── RandomPartitionerTokenRangeSplitter.scala │ │ │ ├── ServerSideTokenRangeSplitter.scala │ │ │ └── TokenRangeClusterer.scala │ │ ├── reader │ │ │ ├── package.scala │ │ │ ├── PrefetchingResultSetIterator.scala │ │ │ ├── RowReader.scala │ │ │ ├── KeyValueRowReader.scala │ │ │ └── ValueRowReader.scala │ │ ├── ValidRDDType.scala │ │ ├── CqlWhereClause.scala │ │ └── ReadConf.scala │ │ ├── mapper │ │ ├── package.scala │ │ ├── ColumnMap.scala │ │ ├── TupleColumnMapper.scala │ │ ├── JavaBeanColumnMapper.scala │ │ ├── DefaultColumnMapper.scala │ │ └── ReflectionColumnMapper.scala │ │ ├── types │ │ ├── package.scala │ │ ├── TimestampFormatter.scala │ │ ├── TimestampParser.scala │ │ ├── CollectionColumnType.scala │ │ └── ColumnType.scala │ │ ├── cql │ │ ├── package.scala │ │ ├── MultipleRetryPolicy.scala │ │ ├── CassandraClientProxy.scala │ │ ├── PreparedStatementCache.scala │ │ ├── CassandraConnectorConf.scala │ │ ├── RefCountMap.scala │ │ ├── SessionProxy.scala │ │ └── AuthConf.scala │ │ ├── streaming │ │ ├── package.scala │ │ ├── CassandraStreamingRDD.scala │ │ ├── DStreamFunctions.scala │ │ └── StreamingContextFunctions.scala │ │ ├── ColumnSelector.scala │ │ ├── BatchSize.scala │ │ ├── RDDFunctions.scala │ │ ├── ColumnRef.scala │ │ ├── package.scala │ │ └── SparkContextFunctions.scala │ └── test │ ├── scala │ └── com │ │ └── datastax │ │ └── spark │ │ └── connector │ │ ├── testkit │ │ ├── package.scala │ │ └── SparkCassandraFixture.scala │ │ ├── writer │ │ ├── DefaultRowWriterTest.scala │ │ ├── PropertyExtractorTest.scala │ │ ├── ObjectSizeEstimatorTest.scala │ │ ├── ConvertingPropertyExtractorTest.scala │ │ ├── AsyncExecutorTest.scala │ │ ├── WriteConfTest.scala │ │ └── WriteOptionTest.scala │ │ ├── rdd │ │ ├── reader │ │ │ └── ClassBasedRowReaderTest.scala │ │ └── partitioner │ │ │ ├── 
RandomPartitionerTokenRangeSplitterTest.scala │ │ │ └── Murmur3PartitionerTokenRangeSplitterTest.scala │ │ ├── samples.scala │ │ ├── streaming │ │ └── TestProducer.scala │ │ ├── types │ │ ├── CanBuildFromTest.scala │ │ └── TypeSerializationTest.scala │ │ ├── mapper │ │ └── TupleColumnMapperTest.scala │ │ └── util │ │ └── ReflectionUtilSpec.scala │ └── java │ └── com │ └── datastax │ └── spark │ └── connector │ ├── SampleJavaBeanWithoutNoArgsCtor.java │ ├── SampleJavaBean.java │ ├── SampleJavaBeanWithMultipleCtors.java │ ├── SampleWithNestedJavaBean.java │ └── SampleWithDeeplyNestedJavaBean.java ├── .travis.yml ├── .gitignore ├── spark-cassandra-connector-embedded └── src │ └── main │ └── scala │ └── com │ └── datastax │ └── spark │ └── connector │ └── embedded │ ├── Event.scala │ ├── package.scala │ ├── SparkTemplate.scala │ ├── SparkRepl.scala │ ├── Assertions.scala │ ├── KafkaProducer.scala │ ├── KafkaConsumer.scala │ └── EmbeddedZookeeper.scala ├── spark-cassandra-connector-java └── src │ ├── main │ ├── scala │ │ └── com │ │ │ └── datastax │ │ │ └── spark │ │ │ └── connector │ │ │ └── japi │ │ │ └── types │ │ │ └── JavaTypeConverter.scala │ └── java │ │ └── com │ │ └── datastax │ │ └── spark │ │ └── connector │ │ └── japi │ │ ├── StreamingContextJavaFunctions.java │ │ ├── RDDJavaFunctions.java │ │ ├── DStreamJavaFunctions.java │ │ └── GenericJavaRowReaderFactory.java │ └── test │ └── java │ └── com │ └── datastax │ └── spark │ └── connector │ └── japi │ └── CustomTypeConverterTest.java ├── spark-cassandra-connector-demos ├── simple-demos │ └── src │ │ └── main │ │ ├── resources │ │ ├── application.conf │ │ ├── log4j.properties │ │ └── data │ │ │ └── words │ │ └── scala │ │ └── com │ │ └── datastax │ │ └── spark │ │ └── connector │ │ └── demo │ │ ├── DemoApp.scala │ │ ├── SparkCassandraSettings.scala │ │ ├── WordCountDemo.scala │ │ ├── TableCopyDemo.scala │ │ ├── BasicReadWriteDemo.scala │ │ └── SQLDemo.scala ├── twitter-streaming │ └── src │ │ └── main │ │ ├── resources │ │ ├── application.conf │ │ └── log4j.properties │ │ └── scala │ │ └── com │ │ └── datastax │ │ └── spark │ │ └── connector │ │ └── demo │ │ └── TwitterStreamingHashTagsByInterval.scala └── kafka-streaming │ └── src │ └── main │ └── resources │ ├── log4j.properties │ └── data │ └── words ├── scripts └── submit-demos ├── doc ├── 10_embedded.md ├── 3_selection.md └── 5_saving.md └── sbt └── sbt /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.6 2 | -------------------------------------------------------------------------------- /rootdoc.txt: -------------------------------------------------------------------------------- 1 | Cassandra connector for Apache Spark. 2 | See documentation of package [[com.datastax.spark.connector]]. -------------------------------------------------------------------------------- /spark-cassandra-connector/src/it/resources/triggers/README.txt: -------------------------------------------------------------------------------- 1 | Place triggers to be loaded in this directory, as jar files. 
2 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/org/apache/spark/sql/cassandra/package-info.java: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.cassandra; -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | jdk: oraclejdk7 3 | sudo: false 4 | scala: 5 | - 2.10.4 6 | script: 7 | - "sbt ++$TRAVIS_SCALA_VERSION test" 8 | 9 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/org/apache/spark/sql/cassandra/package.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql 2 | 3 | package object cassandra { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/util/package.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector 2 | 3 | /** Useful stuff that didn't fit elsewhere. */ 4 | package object util { 5 | 6 | } 7 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/package.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector 2 | 3 | /** Contains components for writing RDDs to Cassandra */ 4 | package object writer { 5 | 6 | } 7 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/package.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector 2 | 3 | 4 | /** Contains [[com.datastax.spark.connector.rdd.CassandraRDD]] class that is the main entry point for 5 | * analyzing Cassandra data from Spark. 
*/ 6 | package object rdd { 7 | 8 | } 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | .DS_Store 4 | # sbt specific 5 | .cache/ 6 | .history/ 7 | .lib/ 8 | dist/* 9 | target/ 10 | lib_managed/ 11 | src_managed/ 12 | project/boot/ 13 | project/plugins/project/ 14 | 15 | # Scala-IDE specific 16 | .scala_dependencies 17 | .worksheet 18 | .idea 19 | .idea_modules 20 | 21 | checkpoint 22 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/QueryExecutor.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.writer 2 | 3 | import com.datastax.driver.core.{Statement, Session} 4 | 5 | class QueryExecutor(session: Session, maxConcurrentQueries: Int) 6 | extends AsyncExecutor(session.executeAsync(_ : Statement), maxConcurrentQueries) 7 | 8 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/partitioner/package.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd 2 | 3 | /** Provides components for partitioning a Cassandra table into smaller parts of appropriate size. 4 | * Each partition can be processed locally on at least one cluster node. */ 5 | package object partitioner { 6 | 7 | } 8 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/mapper/package.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector 2 | 3 | /** Provides machinery for mapping Cassandra tables to user defined Scala classes or tuples. 4 | * The main class in this package is [[mapper.ColumnMapper]] responsible for matching Scala object's 5 | * properties with Cassandra column names.*/ 6 | package object mapper { 7 | 8 | } 9 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/types/package.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector 2 | 3 | /** Offers type conversion magic, so you can receive Cassandra column values in a form you like the most. 4 | * Simply specify the type you want to use on the Scala side, and the column value will be converted automatically. 5 | * Works also with complex objects like collections. */ 6 | package object types { 7 | 8 | } 9 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/reader/package.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd 2 | 3 | import com.datastax.spark.connector.CassandraRow 4 | 5 | /** Provides components for reading data rows from Cassandra and converting them to objects of desired type. 
6 | * Additionally provides a generic [[CassandraRow CassandraRow]] class which can represent any row.*/ 7 | package object reader { 8 | 9 | } 10 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.github.gseitz" % "sbt-release" % "0.7.1") 2 | 3 | addSbtPlugin("com.typesafe.sbt" % "sbt-scalariform" % "1.3.0") 4 | 5 | addSbtPlugin("com.typesafe.sbt" % "sbt-git" % "0.6.2") 6 | 7 | addSbtPlugin("com.typesafe" % "sbt-mima-plugin" % "0.1.6") 8 | 9 | addSbtPlugin("com.typesafe.sbt" % "sbt-pgp" % "0.8.3") 10 | 11 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 12 | 13 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.7.4") 14 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/cql/package.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector 2 | 3 | 4 | /** Contains a [[cql.CassandraConnector]] object which is used to connect 5 | * to a Cassandra cluster and to send CQL statements to it. `CassandraConnector` 6 | * provides a Scala-idiomatic way of working with `Cluster` and `Session` objects 7 | * and takes care of connection pooling and proper resource disposal.*/ 8 | package object cql { 9 | 10 | } 11 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/testkit/package.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector 2 | 3 | import scala.collection.immutable 4 | import scala.concurrent.duration._ 5 | import akka.util.Timeout 6 | 7 | package object testkit { 8 | 9 |   final val DefaultHost = "127.0.0.1" 10 | 11 |   implicit val DefaultTimeout = Timeout(5.seconds) 12 | 13 |   val data = immutable.Set("words ", "may ", "count ") 14 | 15 |   val actorName = "my-actor" 16 | 17 | } 18 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/util/CountingIterator.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.util 2 | 3 | /** Counts elements fetched from the underlying iterator. 
*/ 4 | class CountingIterator[T](iterator: Iterator[T]) extends Iterator[T] { 5 | private var _count = 0 6 | 7 | /** Returns the number of successful invocations of `next` */ 8 | def count = _count 9 | 10 | def hasNext = iterator.hasNext 11 | 12 | def next() = { 13 | val item = iterator.next() 14 | _count += 1 15 | item 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/writer/DefaultRowWriterTest.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.writer 2 | 3 | import com.datastax.spark.connector.cql.TableDef 4 | import org.apache.commons.lang3.SerializationUtils 5 | import org.junit.Test 6 | 7 | class DefaultRowWriterTest { 8 | 9 | @Test 10 | def testSerializability() { 11 | val table = TableDef("test", "table", Nil, Nil, Nil) 12 | val rowWriter = new DefaultRowWriter[DefaultRowWriterTest](table, Nil) 13 | SerializationUtils.roundtrip(rowWriter) 14 | } 15 | 16 | } 17 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/partitioner/TokenRangeSplitter.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd.partitioner 2 | 3 | import com.datastax.spark.connector.rdd.partitioner.dht.{Token, TokenRange} 4 | 5 | /** Splits a token range into smaller sub-ranges, 6 | * each with the desired approximate number of rows. */ 7 | trait TokenRangeSplitter[V, T <: Token[V]] { 8 | 9 | /** Splits given token range into n equal sub-ranges. */ 10 | def split(range: TokenRange[V, T], splitSize: Long): Seq[TokenRange[V, T]] 11 | } 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/streaming/package.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector 2 | 3 | import org.apache.spark.streaming.StreamingContext 4 | import org.apache.spark.streaming.dstream.DStream 5 | 6 | import scala.reflect.ClassTag 7 | 8 | package object streaming { 9 | 10 | implicit def toStreamingContextFunctions(ssc: StreamingContext): SparkContextFunctions = 11 | new StreamingContextFunctions(ssc) 12 | 13 | implicit def toDStreamFunctions[T: ClassTag](ds: DStream[T]): DStreamFunctions[T] = 14 | new DStreamFunctions[T](ds) 15 | 16 | } 17 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/ColumnSelector.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector 2 | 3 | import scala.language.implicitConversions 4 | 5 | sealed trait ColumnSelector 6 | case object AllColumns extends ColumnSelector 7 | case class SomeColumns(columns: NamedColumnRef*) extends ColumnSelector 8 | 9 | object SomeColumns { 10 | @deprecated("Use com.datastax.spark.connector.rdd.SomeColumns instead of Seq", "1.0") 11 | implicit def seqToSomeColumns(columns: Seq[String]): SomeColumns = 12 | SomeColumns(columns.map(x => x: NamedColumnRef): _*) 13 | } 14 | 15 | 16 | -------------------------------------------------------------------------------- 
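A ColumnSelector like the one just shown is what the reading and writing APIs accept to limit which columns are touched (CassandraStreamingRDD and saveToCassandra both default to AllColumns). A minimal, illustrative sketch — it assumes an existing SparkContext named sc and the demo.wordcount table created by WordCountDemo further below, and is not part of the dumped sources:

import com.datastax.spark.connector._

// Write only the two named columns; AllColumns (the default) would write
// every column the RDD elements can provide.
sc.parallelize(Seq(("cat", 10L), ("dog", 20L)))
  .saveToCassandra("demo", "wordcount", SomeColumns("word", "count"))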
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/BatchSize.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector 2 | 3 | import com.datastax.spark.connector.writer.WriteConf 4 | 5 | sealed trait BatchSize 6 | 7 | case class RowsInBatch(batchSize: Int) extends BatchSize 8 | case class BytesInBatch(batchSize: Int) extends BatchSize 9 | 10 | object BatchSize { 11 | @deprecated("Use com.datastax.spark.connector.FixedBatchSize instead of a number", "1.1") 12 | implicit def intToFixedBatchSize(batchSize: Int): RowsInBatch = RowsInBatch(batchSize) 13 | 14 | val Automatic = BytesInBatch(WriteConf.DefaultBatchSizeInBytes) 15 | } 16 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/partitioner/dht/Token.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd.partitioner.dht 2 | 3 | trait Token[T] extends Ordered[Token[T]] { 4 | def value: T 5 | } 6 | 7 | case class LongToken(value: Long) extends Token[Long] { 8 | override def compare(that: Token[Long]) = value.compareTo(that.value) 9 | override def toString = value.toString 10 | } 11 | 12 | case class BigIntToken(value: BigInt) extends Token[BigInt] { 13 | override def compare(that: Token[BigInt]) = value.compare(that.value) 14 | override def toString = value.toString() 15 | } 16 | 17 | 18 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/types/TimestampFormatter.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.types 2 | 3 | import java.util.Date 4 | 5 | import org.apache.cassandra.serializers.TimestampSerializer 6 | import org.joda.time.DateTime 7 | import org.joda.time.format.DateTimeFormat 8 | 9 | /** Formats timestamps and dates using CQL timestamp format `yyyy-MM-dd HH:mm:ssZ` */ 10 | object TimestampFormatter { 11 | 12 | private val TimestampPattern = "yyyy-MM-dd HH:mm:ssZ" 13 | 14 | def format(date: Date): String = 15 | DateTimeFormat.forPattern(TimestampPattern).print(new DateTime(date.getTime)) 16 | } 17 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/rdd/reader/ClassBasedRowReaderTest.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd.reader 2 | 3 | import com.datastax.spark.connector.cql.TableDef 4 | import org.apache.commons.lang3.SerializationUtils 5 | import org.junit.Test 6 | 7 | case class TestClass(a: String, b: Int, c: Option[Long]) 8 | 9 | class ClassBasedRowReaderTest { 10 | 11 | private val tableDef = TableDef("test", "table", Nil, Nil, Nil) 12 | 13 | @Test 14 | def testSerialize() { 15 | val reader = new ClassBasedRowReader[TestClass](tableDef) 16 | SerializationUtils.roundtrip(reader) 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /spark-cassandra-connector-embedded/src/main/scala/com/datastax/spark/connector/embedded/Event.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.embedded 2 | 3 | import akka.actor.ActorRef 4 | 5 | object Event { 6 | 7 | 
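// Status events reported by the embedded test and demo actors (receiver started, data pushed, stream completed).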
sealed trait Status extends Serializable 8 | 9 | case class ReceiverStarted(ref: ActorRef) extends Status 10 | 11 | case class Pushed(data: AnyRef) extends Status 12 | 13 | case object Completed extends Status 14 | 15 | case object Report extends Status 16 | 17 | sealed trait Task extends Serializable 18 | case object QueryTask extends Task 19 | 20 | case class WordCount(word: String, count: Int) extends Serializable 21 | 22 | } 23 | -------------------------------------------------------------------------------- /spark-cassandra-connector-embedded/src/main/scala/com/datastax/spark/connector/embedded/package.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector 2 | 3 | import java.net.InetAddress 4 | 5 | import scala.concurrent.duration.FiniteDuration 6 | 7 | package object embedded { 8 | 9 | implicit val ZookeeperConnectionString = s"${InetAddress.getLocalHost.getHostAddress}:2181" 10 | 11 | /* Factor by which to scale timeouts during tests, e.g. to account for shared build system load. */ 12 | implicit class SparkTestDuration(val duration: FiniteDuration) extends AnyVal { 13 | def dilated: FiniteDuration = (duration * 1.0).asInstanceOf[FiniteDuration] 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /spark-cassandra-connector-java/src/main/scala/com/datastax/spark/connector/japi/types/JavaTypeConverter.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.japi.types 2 | 3 | import akka.japi.JavaPartialFunction 4 | import com.datastax.spark.connector.types.NullableTypeConverter 5 | 6 | import scala.reflect.runtime.universe._ 7 | 8 | class JavaTypeConverter[T <: AnyRef](typeTag: TypeTag[T], convertFunction: JavaPartialFunction[Any, T]) 9 | extends NullableTypeConverter[T] { 10 | 11 | override def targetTypeTag: TypeTag[T] = typeTag 12 | 13 | override def convertPF: PartialFunction[Any, T] = convertFunction 14 | 15 | def noMatch() = JavaPartialFunction.noMatch() 16 | } 17 | -------------------------------------------------------------------------------- /spark-cassandra-connector-demos/simple-demos/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | #################################### 2 | # Streaming Demo Reference Config File # 3 | #################################### 4 | 5 | streaming-demo { 6 | 7 | # spark://127.0.0.1@7077,127.0.0.2@7077,127.0.0.3@7077 8 | # or a local spark://host@7077 9 | # This defaults to local 10 | spark.master = "local[12]" 11 | # Would normally be `ms` in config but Spark just wants the Long 12 | spark.streaming.batch.duration = 300 13 | spark.cleaner.ttl = 3600 14 | spark.cassandra.connection.host = "127.0.0.1" 15 | 16 | spark.cassandra.keyspace = "streaming_demo" 17 | spark.cassandra.table = "words" 18 | data = ["words ", "may ", "count "] 19 | } -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/ValidRDDType.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd 2 | 3 | import java.io.{Serializable => JavaSerializable} 4 | 5 | import com.datastax.spark.connector.types.TypeConverter 6 | 7 | import scala.annotation.implicitNotFound 8 | 9 | @implicitNotFound("Not a valid RDD type. 
There should exist either a type converter for the type or the type should implement Serializable") 10 | trait ValidRDDType[T] 11 | 12 | object ValidRDDType { 13 |   implicit def withTypeConverterAsValidRDDType[T](implicit tc: TypeConverter[T]): ValidRDDType[T] = null 14 | 15 |   implicit def javaSerializableAsValidRDDType[T <: JavaSerializable]: ValidRDDType[T] = null 16 | } 17 | -------------------------------------------------------------------------------- /spark-cassandra-connector-embedded/src/main/scala/com/datastax/spark/connector/embedded/SparkTemplate.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.embedded 2 | 3 | import org.apache.spark.{SparkEnv, SparkConf, SparkContext} 4 | 5 | trait SparkTemplate { 6 |   val conf = SparkTemplate.conf 7 |   val sc = SparkTemplate.sc 8 | } 9 | 10 | object SparkTemplate { 11 | 12 |   val conf = new SparkConf(true) 13 |     .set("spark.cassandra.connection.host", EmbeddedCassandra.cassandraHost.getHostAddress) 14 |     .set("spark.cleaner.ttl", "3600") 15 |     .setMaster(sys.env.getOrElse("IT_TEST_SPARK_MASTER", "local[*]")) 16 |     .setAppName(getClass.getSimpleName) 17 | 18 | 19 |   val sc = new SparkContext(conf) 20 | 21 |   lazy val actorSystem = SparkEnv.get.actorSystem 22 | 23 | } 24 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/mapper/ColumnMap.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.mapper 2 | 3 | import com.datastax.spark.connector.ColumnRef 4 | 5 | /** Associates constructor parameters and property accessors with table columns */ 6 | trait ColumnMap extends Serializable { 7 |   def constructor: Seq[ColumnRef] 8 | 9 |   def getters: Map[String, ColumnRef] 10 | 11 |   def setters: Map[String, ColumnRef] 12 | 13 |   def allowsNull: Boolean 14 | } 15 | 16 | case class SimpleColumnMap(constructor: Seq[ColumnRef], 17 |                            getters: Map[String, ColumnRef], 18 |                            setters: Map[String, ColumnRef], 19 |                            allowsNull: Boolean = false) extends ColumnMap 20 | 21 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/org/apache/spark/sql/cassandra/api/java/JavaCassandraSQLContext.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.cassandra.api.java 2 | 3 | import org.apache.spark.api.java.JavaSparkContext 4 | import org.apache.spark.sql.api.java.{JavaSQLContext, JavaSchemaRDD} 5 | import org.apache.spark.sql.cassandra.CassandraSQLContext 6 | 7 | class JavaCassandraSQLContext(sparkContext: JavaSparkContext) extends JavaSQLContext(sparkContext) { 8 | 9 |   override val sqlContext = new CassandraSQLContext(sparkContext) 10 | 11 |   /** 12 |    * Executes a query expressed in SQL, returning the result as a JavaSchemaRDD. 
13 | */ 14 | def cql(cqlQuery: String): JavaSchemaRDD = 15 | new JavaSchemaRDD(sqlContext, sqlContext.parseSql(cqlQuery)) 16 | } 17 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/testkit/SparkCassandraFixture.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.testkit 2 | 3 | import org.scalatest.{BeforeAndAfter, Matchers, WordSpecLike} 4 | import com.datastax.spark.connector.cql.CassandraConnector 5 | import com.datastax.spark.connector.embedded.EmbeddedCassandra 6 | 7 | /** Basic unit test abstraction. */ 8 | trait AbstractSpec extends WordSpecLike with Matchers with BeforeAndAfter 9 | 10 | /** Used for IT tests. */ 11 | trait SharedEmbeddedCassandra extends EmbeddedCassandra { 12 | 13 | def clearCache(): Unit = CassandraConnector.evictCache() 14 | 15 | } 16 | 17 | private[connector] object TestEvent { 18 | 19 | case object Stop 20 | 21 | case object Completed 22 | 23 | case class WordCount(word: String, count: Int) 24 | 25 | } 26 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/CqlWhereClause.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd 2 | 3 | /** Represents a logical conjunction of CQL predicates. 4 | * Each predicate can have placeholders denoted by '?' which get substituted by values from the `values` array. 5 | * The number of placeholders must match the size of the `values` array. */ 6 | case class CqlWhereClause(predicates: Seq[String], values: Seq[Any]) { 7 | 8 | /** Returns a conjunction of this clause and the given predicate. */ 9 | def and(other: CqlWhereClause) = 10 | CqlWhereClause(predicates ++ other.predicates, values ++ other.values) 11 | 12 | } 13 | 14 | object CqlWhereClause { 15 | 16 | /** Empty CQL WHERE clause selects all rows */ 17 | val empty = new CqlWhereClause(Nil, Nil) 18 | } 19 | 20 | 21 | -------------------------------------------------------------------------------- /spark-cassandra-connector-java/src/main/java/com/datastax/spark/connector/japi/StreamingContextJavaFunctions.java: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.japi; 2 | 3 | import org.apache.spark.streaming.StreamingContext; 4 | 5 | /** 6 | * Java API wrapper over {@link org.apache.spark.streaming.StreamingContext} to provide Spark Cassandra Connector 7 | * functionality. 8 | * 9 | *
To obtain an instance of this wrapper, use one of the factory methods in the {@link 10 | * com.datastax.spark.connector.japi.CassandraJavaUtil} class.
11 | */ 12 | @SuppressWarnings("UnusedDeclaration") 13 | public class StreamingContextJavaFunctions extends SparkContextJavaFunctions { 14 | public final StreamingContext ssc; 15 | 16 | StreamingContextJavaFunctions(StreamingContext ssc) { 17 | super(ssc.sparkContext()); 18 | this.ssc = ssc; 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /spark-cassandra-connector-demos/simple-demos/src/main/scala/com/datastax/spark/connector/demo/DemoApp.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.demo 2 | 3 | import com.datastax.spark.connector.util.Logging 4 | import org.apache.spark.{SparkContext, SparkConf} 5 | 6 | trait DemoApp extends App with Logging { 7 | 8 | val words = "./spark-cassandra-connector-demos/simple-demos/src/main/resources/data/words" 9 | 10 | val SparkMasterHost = "127.0.0.1" 11 | 12 | val CassandraHost = "127.0.0.1" 13 | 14 | // Tell Spark the address of one Cassandra node: 15 | val conf = new SparkConf(true) 16 | .set("spark.cassandra.connection.host", CassandraHost) 17 | .set("spark.cleaner.ttl", "3600") 18 | .setMaster("local[12]") 19 | .setAppName(getClass.getSimpleName) 20 | 21 | // Connect to the Spark cluster: 22 | lazy val sc = new SparkContext(conf) 23 | } 24 | 25 | object DemoApp { 26 | def apply(): DemoApp = new DemoApp {} 27 | } 28 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/java/com/datastax/spark/connector/SampleJavaBeanWithoutNoArgsCtor.java: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector; 2 | 3 | import java.io.Serializable; 4 | 5 | /** 6 | * This is a sample JavaBean style class. In order to test JavaAPI correctly, we cannot implement this in Scala because 7 | * Scala adds some additional accessors and mutators. 
8 | */ 9 | public class SampleJavaBeanWithoutNoArgsCtor implements Serializable { 10 | private Integer key; 11 | private String value; 12 | 13 | private SampleJavaBeanWithoutNoArgsCtor(Integer key, String value) { 14 | this.key = key; 15 | this.value = value; 16 | } 17 | 18 | public Integer getKey() { 19 | return key; 20 | } 21 | 22 | public void setKey(Integer key) { 23 | this.key = key; 24 | } 25 | 26 | public String getValue() { 27 | return value; 28 | } 29 | 30 | public void setValue(String value) { 31 | this.value = value; 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/samples.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector { 2 | 3 | case class SampleScalaCaseClass(key: Int, value: String) 4 | 5 | class SampleScalaClass(val key: Int, val value: String) extends Serializable 6 | 7 | class SampleScalaClassWithNoFields(key: Int, value: String) extends Serializable 8 | 9 | class SampleScalaClassWithMultipleCtors(var key: Int, var value: String) extends Serializable { 10 | def this(key: Int) = this(key, null) 11 | 12 | def this() = this(0, null) 13 | } 14 | 15 | class SampleWithNestedScalaCaseClass extends Serializable { 16 | 17 | case class InnerClass(key: Int, value: String) 18 | 19 | } 20 | 21 | class SampleWithDeeplyNestedScalaCaseClass extends Serializable { 22 | 23 | class IntermediateClass extends Serializable { 24 | 25 | case class InnerClass(key: Int, value: String) 26 | 27 | } 28 | 29 | } 30 | 31 | object SampleObject { 32 | 33 | case class ClassInObject(key: Int, value: String) 34 | 35 | } 36 | 37 | } -------------------------------------------------------------------------------- /spark-cassandra-connector-demos/simple-demos/src/main/scala/com/datastax/spark/connector/demo/SparkCassandraSettings.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.demo 2 | 3 | import com.typesafe.config.{Config, ConfigFactory} 4 | 5 | /* Initializes Akka, Cassandra and Spark settings. 
*/ 6 | final class SparkCassandraSettings(rootConfig: Config) { 7 | def this() = this(ConfigFactory.load) 8 | 9 | protected val config = rootConfig.getConfig("streaming-demo") 10 | 11 | val SparkMaster: String = config.getString("spark.master") 12 | 13 | val SparkCleanerTtl: Int = config.getInt("spark.cleaner.ttl") 14 | 15 | val SparkStreamingBatchDuration: Long = config.getLong("spark.streaming.batch.duration") 16 | 17 | val Data = akka.japi.Util.immutableSeq(config.getStringList("data")).toSet 18 | 19 | val CassandraSeed: String = config.getString("spark.cassandra.connection.host") 20 | 21 | val CassandraKeyspace = config.getString("spark.cassandra.keyspace") 22 | 23 | val CassandraTable = config.getString("spark.cassandra.table") 24 | } -------------------------------------------------------------------------------- /spark-cassandra-connector-demos/simple-demos/src/main/scala/com/datastax/spark/connector/demo/WordCountDemo.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.demo 2 | 3 | import org.apache.spark.SparkContext._ 4 | import com.datastax.spark.connector.cql.CassandraConnector 5 | import com.datastax.spark.connector._ 6 | 7 | object WordCountDemo extends DemoApp { 8 | 9 | CassandraConnector(conf).withSessionDo { session => 10 | session.execute(s"CREATE KEYSPACE IF NOT EXISTS demo WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 }") 11 | session.execute(s"CREATE TABLE IF NOT EXISTS demo.wordcount (word TEXT PRIMARY KEY, count COUNTER)") 12 | session.execute(s"TRUNCATE demo.wordcount") 13 | } 14 | 15 | sc.textFile(words) 16 | .flatMap(_.split("\\s+")) 17 | .map(word => (word.toLowerCase, 1)) 18 | .reduceByKey(_ + _) 19 | .saveToCassandra("demo", "wordcount") 20 | 21 | // print out the data saved from Spark to Cassandra 22 | sc.cassandraTable("demo", "wordcount").collect.foreach(println) 23 | sc.stop() 24 | } 25 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/java/com/datastax/spark/connector/SampleJavaBean.java: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector; 2 | 3 | import java.io.Serializable; 4 | 5 | /** 6 | * This is a sample JavaBean style class. In order to test JavaAPI correctly, we cannot implement this in Scala because 7 | * Scala adds some additional accessors and mutators. 
8 | */ 9 | public class SampleJavaBean implements Serializable { 10 | private Integer key; 11 | private String value; 12 | 13 | public static SampleJavaBean newInstance(Integer key, String value) { 14 | SampleJavaBean bean = new SampleJavaBean(); 15 | bean.setKey(key); 16 | bean.setValue(value); 17 | return bean; 18 | } 19 | 20 | public Integer getKey() { 21 | return key; 22 | } 23 | 24 | public void setKey(Integer key) { 25 | this.key = key; 26 | } 27 | 28 | public String getValue() { 29 | return value; 30 | } 31 | 32 | public void setValue(String value) { 33 | this.value = value; 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/RowWriter.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.writer 2 | 3 | import com.datastax.driver.core.{ProtocolVersion, BoundStatement, PreparedStatement} 4 | 5 | /** `RowWriter` knows how to write an object to Cassandra using the Java Cassandra driver. 6 | * */ 7 | trait RowWriter[T] extends Serializable { 8 | 9 | /** Extracts column values from `data` object and binds them to the given statement. 10 | * Variables of the prepared statement are named the same as column names to be saved. 11 | * This method must not rely on any particular order of variables.*/ 12 | def bind(data: T, stmt: PreparedStatement, protocolVersion: ProtocolVersion): BoundStatement 13 | 14 | /** Estimates serialized size in bytes of a data object. 15 | * Used for grouping statements into batches. */ 16 | def estimateSizeInBytes(data: T): Int 17 | 18 | /** List of columns this `RowWriter` is going to write. 19 | * Used to construct appropriate INSERT or UPDATE statement. */ 20 | def columnNames: Seq[String] 21 | 22 | } 23 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/reader/PrefetchingResultSetIterator.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd.reader 2 | 3 | import com.datastax.driver.core.{Row, ResultSet} 4 | 5 | /** Allows to efficiently iterate over a large, paged ResultSet, 6 | * asynchronously prefetching the next page. 
7 | * 8 | * @param resultSet result set obtained from the Java driver 9 | * @param prefetchWindowSize if there are less than this rows available without blocking, 10 | * initiates fetching the next page 11 | */ 12 | class PrefetchingResultSetIterator(resultSet: ResultSet, prefetchWindowSize: Int) extends Iterator[Row] { 13 | 14 | private[this] val iterator = resultSet.iterator() 15 | 16 | override def hasNext = iterator.hasNext 17 | 18 | private[this] def maybePrefetch(): Unit = { 19 | if (!resultSet.isFullyFetched && resultSet.getAvailableWithoutFetching < prefetchWindowSize) 20 | resultSet.fetchMoreResults() 21 | } 22 | 23 | override def next() = { 24 | maybePrefetch() 25 | iterator.next() 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/GenericRowWriter.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.writer 2 | 3 | import com.datastax.spark.connector.CassandraRow 4 | import com.datastax.spark.connector.cql.TableDef 5 | 6 | /** A [[RowWriter]] that can write [[CassandraRow]] objects.*/ 7 | class GenericRowWriter(table: TableDef, selectedColumns: Seq[String]) 8 | extends AbstractRowWriter[CassandraRow](table: TableDef, selectedColumns: Seq[String]) { 9 | 10 | override protected def getColumnValue(data: CassandraRow, columnName: String): AnyRef = { 11 | val index = data.indexOf(columnName) 12 | if (index >= 0) { 13 | val converter = table.columnByName(columnName).columnType.converterToCassandra 14 | val value = data.getRaw(index) 15 | converter.convert(value) 16 | } 17 | else 18 | null 19 | } 20 | } 21 | 22 | 23 | object GenericRowWriter { 24 | 25 | object Factory extends RowWriterFactory[CassandraRow] { 26 | override def rowWriter(table: TableDef, columnNames: Seq[String]) = 27 | new GenericRowWriter(table, columnNames) 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/mapper/TupleColumnMapper.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.mapper 2 | 3 | import com.datastax.spark.connector.{ColumnRef, ColumnIndex} 4 | import com.datastax.spark.connector.cql.TableDef 5 | 6 | import scala.reflect.ClassTag 7 | 8 | class TupleColumnMapper[T <: Product : ClassTag] extends ColumnMapper[T] { 9 | 10 | override def classTag: ClassTag[T] = implicitly[ClassTag[T]] 11 | 12 | private def indexedColumnRefs(n: Int) = 13 | (0 until n).map(ColumnIndex) 14 | 15 | override def columnMap(tableDef: TableDef): ColumnMap = { 16 | 17 | val GetterRegex = "_([0-9]+)".r 18 | val cls = implicitly[ClassTag[T]].runtimeClass 19 | 20 | val constructor = 21 | indexedColumnRefs(cls.getConstructors()(0).getParameterTypes.length) 22 | 23 | val getters = { 24 | for (name@GetterRegex(id) <- cls.getMethods.map(_.getName)) 25 | yield (name, ColumnIndex(id.toInt - 1)) 26 | }.toMap 27 | 28 | val setters = 29 | Map.empty[String, ColumnRef] 30 | 31 | SimpleColumnMap(constructor, getters, setters) 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/SqlRowWriter.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.writer 
2 | 3 | import com.datastax.spark.connector.cql.TableDef 4 | import org.apache.spark.sql.catalyst.expressions.Row 5 | 6 | /** A [[RowWriter]] that can write [[Row]] objects.*/ 7 | class SqlRowWriter(table: TableDef, selectedColumns: Seq[String]) extends AbstractRowWriter[Row](table: TableDef, selectedColumns: Seq[String]) { 8 | 9 | override protected def getColumnValue(data: Row, columnName: String): AnyRef = { 10 | val index = columnNames.indexOf(columnName) 11 | if (index >= 0 && index < data.size) { 12 | val converter = table.columnByName(columnName).columnType.converterToCassandra 13 | val value = data.apply(index) 14 | if (value == null) null else converter.convert(value).asInstanceOf[AnyRef] 15 | } 16 | else 17 | null 18 | } 19 | } 20 | 21 | 22 | object SqlRowWriter { 23 | 24 | object Factory extends RowWriterFactory[Row] { 25 | override def rowWriter(table: TableDef, columnNames: Seq[String]) = 26 | new SqlRowWriter(table, columnNames) 27 | } 28 | 29 | } -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/streaming/CassandraStreamingRDD.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.streaming 2 | 3 | import com.datastax.spark.connector.cql.CassandraConnector 4 | import com.datastax.spark.connector.{ColumnSelector, AllColumns} 5 | 6 | import scala.reflect.ClassTag 7 | import org.apache.spark.streaming.StreamingContext 8 | import com.datastax.spark.connector.rdd.{ReadConf, CassandraRDD, CqlWhereClause} 9 | import com.datastax.spark.connector.rdd.reader._ 10 | 11 | /** RDD representing a Cassandra table for Spark Streaming. 12 | * @see [[com.datastax.spark.connector.rdd.CassandraRDD]] */ 13 | class CassandraStreamingRDD[R] private[connector] ( 14 | sctx: StreamingContext, 15 | connector: CassandraConnector, 16 | keyspace: String, 17 | table: String, 18 | columns: ColumnSelector = AllColumns, 19 | where: CqlWhereClause = CqlWhereClause.empty, 20 | readConf: ReadConf = ReadConf())( 21 | implicit 22 | ct : ClassTag[R], 23 | @transient rrf: RowReaderFactory[R]) 24 | extends CassandraRDD[R](sctx.sparkContext, connector, keyspace, table, columns, where, readConf) 25 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/java/com/datastax/spark/connector/SampleJavaBeanWithMultipleCtors.java: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector; 2 | 3 | import java.io.Serializable; 4 | 5 | /** 6 | * This is a sample JavaBean style class. In order to test JavaAPI correctly, we cannot implement this in Scala because 7 | * Scala adds some additional accessors and mutators. 
8 | */ 9 | public class SampleJavaBeanWithMultipleCtors implements Serializable { 10 | private Integer key; 11 | private String value; 12 | 13 | public SampleJavaBeanWithMultipleCtors(Integer key) { 14 | this.key = key; 15 | } 16 | 17 | public SampleJavaBeanWithMultipleCtors() { 18 | } 19 | 20 | public SampleJavaBeanWithMultipleCtors(Integer key, String value) { 21 | this.key = key; 22 | this.value = value; 23 | } 24 | 25 | public Integer getKey() { 26 | return key; 27 | } 28 | 29 | public void setKey(Integer key) { 30 | this.key = key; 31 | } 32 | 33 | public String getValue() { 34 | return value; 35 | } 36 | 37 | public void setValue(String value) { 38 | this.value = value; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/streaming/TestProducer.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.streaming 2 | 3 | import scala.concurrent.duration._ 4 | import akka.actor.{PoisonPill, Actor, ActorRef} 5 | 6 | class TestProducer(data: Array[String], to: ActorRef) extends Counter { 7 | import scala.util.Random 8 | import context.dispatcher 9 | 10 | val rand = new Random() 11 | 12 | val task = context.system.scheduler.schedule(2.second, 1.millis) { 13 | if (count < scale) { // we need this test to avoid generating more than 'scale' messages 14 | to ! makeMessage() 15 | increment() 16 | } 17 | } 18 | 19 | def receive: Actor.Receive = { 20 | case _ => 21 | } 22 | 23 | def makeMessage(): String = { 24 | val x = rand.nextInt(3) 25 | data(x) + data(2 - x) 26 | } 27 | } 28 | 29 | trait CounterFixture { 30 | val scale = 30 31 | } 32 | 33 | // CountDownLatch is not Serializable, can't use in stream so we do this. 34 | trait Counter extends Actor with CounterFixture { 35 | 36 | var count = 0 37 | 38 | def increment(): Unit = { 39 | count += 1 40 | if (count == scale) self ! 
PoisonPill 41 | } 42 | } -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/types/CanBuildFromTest.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.types 2 | 3 | import org.apache.commons.lang3.SerializationUtils 4 | import org.junit.Assert._ 5 | import org.junit.Test 6 | 7 | class CanBuildFromTest { 8 | 9 | @Test 10 | def testBuild() { 11 | val bf = CanBuildFrom.setCanBuildFrom[Int] 12 | val builder = bf.apply() 13 | builder += 1 14 | builder += 2 15 | builder += 3 16 | assertEquals(Set(1,2,3), builder.result()) 17 | } 18 | 19 | @Test 20 | def testSerializeAndBuild() { 21 | val bf = CanBuildFrom.setCanBuildFrom[Int] 22 | val bf2 = SerializationUtils.roundtrip(bf) 23 | val builder = bf2.apply() 24 | builder += 1 25 | builder += 2 26 | builder += 3 27 | assertEquals(Set(1,2,3), builder.result()) 28 | } 29 | 30 | @Test 31 | def testSerializeAndBuildWithOrdering() { 32 | val bf = CanBuildFrom.treeSetCanBuildFrom[Int] 33 | val bf2 = SerializationUtils.roundtrip(bf) 34 | val builder = bf2.apply() 35 | builder += 1 36 | builder += 2 37 | builder += 3 38 | assertEquals(Set(1,2,3), builder.result()) 39 | } 40 | 41 | 42 | } 43 | -------------------------------------------------------------------------------- /spark-cassandra-connector-demos/twitter-streaming/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | #################################### 2 | # Spark Cassandra Connector (Twitter Demo App) Config File # 3 | #################################### 4 | 5 | # This is the reference config file that contains all the default settings. 6 | streaming-app { 7 | 8 | # These can be search terms to filter for, or hashtags 9 | # ["android", "iphone"] 10 | filters = ["#android", "#iphone"] 11 | 12 | spark { 13 | # The fallback Spark master, it auto-detection fails. 14 | # Can change to spark://127.0.0.1:7077 for example. 15 | master = "local[*]" 16 | 17 | # In seconds: Not using hcon 5s format until Spark 18 | # Upgrades their akka and thus config versions (to avoid a deprecation issue). 19 | streaming.batch.interval = 5 20 | 21 | # The default 22 | executor.memory = 2g 23 | cores.max = 2 24 | 25 | jars = [ 26 | "./spark-cassandra-connector-demos/twitter-streaming/target/scala-2.10/twitter-streaming-assembly-1.1.0-SNAPSHOT.jar" 27 | ] 28 | 29 | cassandra { 30 | connection.host = ["127.0.0.1"] 31 | keyspace = "twitter_stream" 32 | table = "hashtags_by_interval" 33 | } 34 | } 35 | } -------------------------------------------------------------------------------- /spark-cassandra-connector/src/it/scala/com/datastax/spark/connector/cql/CassandraAuthenticatedConnectorSpec.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.cql 2 | 3 | import com.datastax.spark.connector.testkit.SharedEmbeddedCassandra 4 | import org.scalatest.{Matchers, FlatSpec} 5 | 6 | class CassandraAuthenticatedConnectorSpec extends FlatSpec with Matchers with SharedEmbeddedCassandra { 7 | 8 | useCassandraConfig("cassandra-password-auth.yaml" + 9 | ".template") 10 | val conn = CassandraConnector(Set(cassandraHost), authConf = PasswordAuthConf("cassandra", "cassandra")) 11 | 12 | // Wait for the default user to be created in Cassandra. 
13 | Thread.sleep(1000) 14 | 15 | "A CassandraConnector" should "authenticate with username and password when using native protocol" in { 16 | conn.withSessionDo { session => 17 | assert(session !== null) 18 | assert(session.isClosed === false) 19 | assert(session.getCluster.getMetadata.getClusterName === "Test Cluster") 20 | } 21 | } 22 | 23 | it should "authenticate with username and password when using thrift" in { 24 | conn.withCassandraClientDo { client => 25 | assert(client.describe_cluster_name() === "Test Cluster") 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/partitioner/dht/TokenRange.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd.partitioner.dht 2 | 3 | import java.net.InetAddress 4 | 5 | 6 | case class CassandraNode(rpcAddress: InetAddress, localAddress: InetAddress) { 7 | require(rpcAddress != InetAddress.getByName("0.0.0.0"), "rpcAddress must not be 0.0.0.0") 8 | require(localAddress != InetAddress.getByName("0.0.0.0"), "localAddress must not be 0.0.0.0") 9 | def allAddresses = Set(rpcAddress, localAddress) 10 | } 11 | 12 | object CassandraNode { 13 | implicit def ordering: Ordering[CassandraNode] = Ordering.by(_.rpcAddress.toString) 14 | } 15 | 16 | case class TokenRange[V, T <: Token[V]] ( 17 | start: T, end: T, endpoints: Set[CassandraNode], rowCount: Option[Long]) { 18 | 19 | def isWrapAround: Boolean = 20 | start >= end 21 | 22 | def unwrap(implicit tokenFactory: TokenFactory[V, T]): Seq[TokenRange[V, T]] = { 23 | val minToken = tokenFactory.minToken 24 | if (isWrapAround) 25 | Seq( 26 | TokenRange(start, minToken, endpoints, rowCount.map(_ / 2)), 27 | TokenRange(minToken, end, endpoints, rowCount.map(_ / 2))) 28 | else 29 | Seq(this) 30 | } 31 | } -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/java/com/datastax/spark/connector/SampleWithNestedJavaBean.java: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector; 2 | 3 | import java.io.Serializable; 4 | 5 | /** 6 | * This is a sample JavaBean style class. In order to test JavaAPI correctly, we cannot implement this in Scala because 7 | * Scala adds some additional accessors and mutators. 
8 | */ 9 | public class SampleWithNestedJavaBean implements Serializable { 10 | public class InnerClass implements Serializable { 11 | private Integer key; 12 | private String value; 13 | 14 | public InnerClass(Integer key) { 15 | this.key = key; 16 | } 17 | 18 | public InnerClass() { 19 | } 20 | 21 | public InnerClass(Integer key, String value) { 22 | this.key = key; 23 | this.value = value; 24 | } 25 | 26 | public Integer getKey() { 27 | return key; 28 | } 29 | 30 | public void setKey(Integer key) { 31 | this.key = key; 32 | } 33 | 34 | public String getValue() { 35 | return value; 36 | } 37 | 38 | public void setValue(String value) { 39 | this.value = value; 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/writer/PropertyExtractorTest.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.writer 2 | 3 | import org.junit.Assert._ 4 | import org.junit.Test 5 | 6 | class PropertyExtractorTest { 7 | 8 | class TestClass(val field1: String, val field2: Int) 9 | 10 | @Test 11 | def testSimpleExtraction() { 12 | val testObject = new TestClass("a", 1) 13 | val propertyExtractor = new PropertyExtractor(classOf[TestClass], Seq("field1", "field2")) 14 | val result = propertyExtractor.extract(testObject) 15 | assertEquals(2, result.size) 16 | assertEquals("a", result(0)) 17 | assertEquals(1, result(1)) 18 | } 19 | 20 | @Test 21 | def testAvailableProperties() { 22 | val triedProperties = Seq("field1", "foo", "bar") 23 | val availableProperties = PropertyExtractor.availablePropertyNames(classOf[TestClass], triedProperties) 24 | assertEquals(Seq("field1"), availableProperties) 25 | } 26 | 27 | @Test(expected = classOf[NoSuchMethodException]) 28 | def testWrongPropertyName() { 29 | val testObject = new TestClass("a", 1) 30 | val propertyExtractor = new PropertyExtractor(classOf[TestClass], Seq("foo")) 31 | propertyExtractor.extract(testObject) 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/writer/ObjectSizeEstimatorTest.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.writer 2 | 3 | import java.nio.ByteBuffer 4 | import java.util.Date 5 | 6 | import org.junit.Assert._ 7 | import org.junit.Test 8 | 9 | class ObjectSizeEstimatorTest { 10 | 11 | @Test 12 | def testFunctionality() { 13 | val size0 = ObjectSizeEstimator.measureSerializedSize(Array(1)) 14 | val size1 = ObjectSizeEstimator.measureSerializedSize(Array(1, 2)) 15 | val size2 = ObjectSizeEstimator.measureSerializedSize(Array(1, 2, "abc", List("item1", "item2"), new Date())) 16 | assertTrue(size0 > 16) 17 | assertTrue(size1 > size0) 18 | assertTrue(size2 > size1) 19 | } 20 | 21 | @Test 22 | def testByteBuffers() { 23 | val buffer = ByteBuffer.allocate(100) 24 | val size0 = ObjectSizeEstimator.measureSerializedSize(Array(buffer)) 25 | val size1 = ObjectSizeEstimator.measureSerializedSize(Array(List(buffer))) 26 | val size2 = ObjectSizeEstimator.measureSerializedSize(Array(Set(buffer))) 27 | val size3 = ObjectSizeEstimator.measureSerializedSize(Array(Map(1 -> buffer))) 28 | assertTrue(size0 > 100) 29 | assertTrue(size1 > 100) 30 | assertTrue(size2 > 100) 31 | assertTrue(size3 > 100) 32 | } 33 | } 34 | 
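The assertions above only check the relative ordering of the size estimates; in the writer such an estimate is what lets byte-bounded batching work (see RowWriter.estimateSizeInBytes and BytesInBatch earlier). A rough sketch of that idea — groupIntoBatches and the 1024-byte limit are illustrative only, not connector API:

// Greedily packs items into batches, closing a batch once the accumulated
// size estimate reaches maxBytes (the item that crosses the limit stays in the batch).
def groupIntoBatches[T](items: Iterator[T], maxBytes: Int)(sizeOf: T => Int): Iterator[Seq[T]] =
  new Iterator[Seq[T]] {
    def hasNext = items.hasNext
    def next(): Seq[T] = {
      val batch = Seq.newBuilder[T]
      var bytes = 0
      do {
        val item = items.next()
        bytes += sizeOf(item)
        batch += item
      } while (items.hasNext && bytes < maxBytes)
      batch.result()
    }
  }

// Example: groupIntoBatches(rows, 1024)(estimateOf), where estimateOf is any per-item size estimate,
// e.g. one backed by the serialized-size measurement exercised in the test above.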
-------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/RDDFunctions.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector 2 | 3 | import com.datastax.spark.connector.cql.CassandraConnector 4 | import com.datastax.spark.connector.writer._ 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.rdd.RDD 7 | 8 | /** Provides Cassandra-specific methods on `RDD` */ 9 | class RDDFunctions[T](rdd: RDD[T]) extends WritableToCassandra[T] with Serializable { 10 | 11 | override val sparkContext: SparkContext = rdd.sparkContext 12 | 13 | /** 14 | * Saves the data from `RDD` to a Cassandra table. Uses the specified column names. 15 | * @see [[com.datastax.spark.connector.writer.WritableToCassandra]] 16 | */ 17 | def saveToCassandra(keyspaceName: String, 18 | tableName: String, 19 | columns: ColumnSelector = AllColumns, 20 | writeConf: WriteConf = WriteConf.fromSparkConf(sparkContext.getConf)) 21 | (implicit connector: CassandraConnector = CassandraConnector(sparkContext.getConf), 22 | rwf: RowWriterFactory[T]): Unit = { 23 | val writer = TableWriter(connector, keyspaceName, tableName, columns, writeConf) 24 | rdd.sparkContext.runJob(rdd, writer.write _) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/types/TypeSerializationTest.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.types 2 | 3 | import org.apache.commons.lang3.SerializationUtils 4 | import org.junit.Assert._ 5 | import org.junit.Test 6 | 7 | class TypeSerializationTest { 8 | 9 | private def testSerialization(t: ColumnType[_]) { 10 | assertEquals(t, SerializationUtils.roundtrip(t)) 11 | } 12 | 13 | @Test 14 | def testSerializationOfPrimitiveTypes() { 15 | testSerialization(AsciiType) 16 | testSerialization(TextType) 17 | testSerialization(IntType) 18 | testSerialization(BigIntType) 19 | testSerialization(DoubleType) 20 | testSerialization(FloatType) 21 | testSerialization(BooleanType) 22 | testSerialization(UUIDType) 23 | testSerialization(TimeUUIDType) 24 | testSerialization(TimestampType) 25 | testSerialization(DecimalType) 26 | testSerialization(BigIntType) 27 | testSerialization(InetType) 28 | testSerialization(CounterType) 29 | } 30 | 31 | @Test 32 | def testSerializationOfCollectionTypes() { 33 | testSerialization(ListType(IntType)) 34 | testSerialization(ListType(ListType(IntType))) 35 | testSerialization(SetType(TextType)) 36 | testSerialization(MapType(BigIntType, TimestampType)) 37 | } 38 | 39 | 40 | } 41 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/PropertyExtractor.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.writer 2 | 3 | import java.lang.reflect.Method 4 | 5 | import scala.util.Try 6 | 7 | /** Extracts values from fields of an object. 
*/ 8 | class PropertyExtractor[T](val cls: Class[T], val propertyNames: Seq[String]) extends Serializable { 9 | 10 | private def getter(name: String) = 11 | cls.getMethod(name) 12 | 13 | @transient 14 | private lazy val methods: Array[Method] = 15 | propertyNames.map(getter).toArray 16 | 17 | @transient 18 | private lazy val methodByName = 19 | methods.map(m => (m.getName, m)).toMap 20 | 21 | def extract(obj: T): Array[AnyRef] = 22 | extract(obj, Array.ofDim(methods.length)) 23 | 24 | def extract(obj: T, target: Array[AnyRef]): Array[AnyRef] = { 25 | for (i <- 0 until methods.length) 26 | target(i) = methods(i).invoke(obj) 27 | target 28 | } 29 | 30 | def extractProperty(obj: T, propertyName: String): AnyRef = { 31 | val m = methodByName(propertyName) 32 | m.invoke(obj) 33 | } 34 | } 35 | 36 | object PropertyExtractor { 37 | 38 | def availablePropertyNames(cls: Class[_], requestedPropertyNames: Seq[String]): Seq[String] = 39 | requestedPropertyNames.filter(name => Try(cls.getMethod(name)).isSuccess) 40 | 41 | } 42 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/partitioner/CassandraRDDPartition.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd.partitioner 2 | 3 | import java.net.InetAddress 4 | 5 | import org.apache.spark.Partition 6 | 7 | /** Stores a CQL `WHERE` predicate matching a range of tokens. */ 8 | case class CqlTokenRange(cql: String, values: Any*) 9 | 10 | /** Metadata describing Cassandra table partition processed by a single Spark task. 11 | * Beware the term "partition" is overloaded. Here, in the context of Spark, 12 | * it means an arbitrary collection of rows that can be processed locally on a single Cassandra cluster node. 13 | * A `CassandraPartition` typically contains multiple CQL partitions, i.e. rows identified by different values of 14 | * the CQL partitioning key. 
15 | * 16 | * @param index identifier of the partition, used internally by Spark 17 | * @param endpoints which nodes the data partition is located on 18 | * @param tokenRanges token ranges determining the row set to be fetched 19 | * @param rowCount estimated total row count in a partition 20 | */ 21 | case class CassandraPartition(index: Int, 22 | endpoints: Iterable[InetAddress], 23 | tokenRanges: Iterable[CqlTokenRange], 24 | rowCount: Long) extends Partition 25 | 26 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/ConvertingPropertyExtractor.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.writer 2 | 3 | import com.datastax.spark.connector.types.TypeConverter 4 | 5 | /** Extracts property values from an object and additionally converts them to desired types */ 6 | class ConvertingPropertyExtractor[T](val cls: Class[T], properties: Seq[(String, TypeConverter[_])]) 7 | extends Serializable { 8 | 9 | val (propertyNames, propertyTypes) = properties.toArray.unzip 10 | val propertyTypeByName = properties.toMap 11 | 12 | private val simpleExtractor = 13 | new PropertyExtractor[T](cls, propertyNames) 14 | 15 | def extract(obj: T): Array[AnyRef] = 16 | convert(simpleExtractor.extract(obj)) 17 | 18 | 19 | def extract(obj: T, target: Array[AnyRef]): Array[AnyRef] = 20 | convert(simpleExtractor.extract(obj, target)) 21 | 22 | def extractProperty(obj: T, propertyName: String): AnyRef = { 23 | val propertyValue = simpleExtractor.extractProperty(obj, propertyName) 24 | val converter = propertyTypeByName(propertyName) 25 | converter.convert(propertyValue).asInstanceOf[AnyRef] 26 | } 27 | 28 | def convert(data: Array[AnyRef]): Array[AnyRef] = { 29 | for (i <- 0 until data.length) 30 | data(i) = propertyTypes(i).convert(data(i)).asInstanceOf[AnyRef] 31 | data 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/AbstractRowWriter.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.writer 2 | 3 | import com.datastax.driver.core.{ProtocolVersion, PreparedStatement} 4 | import com.datastax.spark.connector.cql.TableDef 5 | import org.apache.spark.sql.catalyst.expressions.Row 6 | 7 | /** A [[RowWriter]] that can write SparkSQL [[Row]] objects or [[com.datastax.spark.connector.CassandraRow]] objects .*/ 8 | abstract class AbstractRowWriter[T](table: TableDef, selectedColumns: Seq[String]) extends RowWriter[T] { 9 | 10 | override def columnNames = 11 | selectedColumns.toIndexedSeq 12 | 13 | protected def getColumnValue(data: T, columnName: String): AnyRef 14 | 15 | @transient 16 | protected lazy val buffer = new ThreadLocal[Array[AnyRef]] { 17 | override def initialValue() = Array.ofDim[AnyRef](columnNames.size) 18 | } 19 | 20 | protected def fillBuffer(data: T): Array[AnyRef] = { 21 | val buf = buffer.get 22 | for (i <- 0 until columnNames.size) 23 | buf(i) = getColumnValue(data, columnNames(i)) 24 | buf 25 | } 26 | 27 | override def bind(data: T, stmt: PreparedStatement, protocolVersion: ProtocolVersion) = { 28 | stmt.bind(fillBuffer(data): _*) 29 | } 30 | 31 | override def estimateSizeInBytes(data: T) = { 32 | ObjectSizeEstimator.measureSerializedSize(fillBuffer(data)) 33 | } 34 | } 35 | 
-------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/writer/ConvertingPropertyExtractorTest.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.writer 2 | 3 | import com.datastax.spark.connector.types.TypeConverter.{StringConverter, OptionToNullConverter, IntConverter} 4 | 5 | import org.junit.Assert._ 6 | import org.junit.Test 7 | 8 | class ConvertingPropertyExtractorTest { 9 | 10 | class TestClass(val field1: String, val field2: Option[Int]) 11 | 12 | private def createExtractor: ConvertingPropertyExtractor[TestClass] = { 13 | new ConvertingPropertyExtractor[TestClass]( 14 | classOf[TestClass], Seq( 15 | ("field1", IntConverter), 16 | ("field2", new OptionToNullConverter(StringConverter)))) 17 | } 18 | 19 | @Test 20 | def testExtraction() { 21 | val obj = new TestClass("123", Some(5)) 22 | val extractor = createExtractor 23 | val data = extractor.extract(obj) 24 | assertNotNull(data) 25 | assertEquals(2, data.length) 26 | assertEquals(123, data(0)) 27 | assertEquals("5", data(1)) 28 | } 29 | 30 | @Test 31 | def testExtractionNoAlloc() { 32 | val obj = new TestClass("123", Some(5)) 33 | val extractor = createExtractor 34 | val data = Array.ofDim[AnyRef](extractor.propertyNames.size) 35 | extractor.extract(obj, data) 36 | assertEquals(123, data(0)) 37 | assertEquals("5", data(1)) 38 | 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/cql/MultipleRetryPolicy.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.cql 2 | 3 | import com.datastax.driver.core.policies.RetryPolicy 4 | import com.datastax.driver.core.policies.RetryPolicy.RetryDecision 5 | import com.datastax.driver.core.{ConsistencyLevel, Statement, WriteType} 6 | 7 | /** Always retries with the same CL, constant number of times, regardless of circumstances */ 8 | class MultipleRetryPolicy(maxRetryCount: Int) extends RetryPolicy { 9 | 10 | private def retryOrThrow(cl: ConsistencyLevel, nbRetry: Int): RetryDecision = { 11 | if (nbRetry < maxRetryCount) 12 | RetryDecision.retry(cl) 13 | else 14 | RetryDecision.rethrow() 15 | } 16 | 17 | override def onReadTimeout(stmt: Statement, cl: ConsistencyLevel, 18 | requiredResponses: Int, receivedResponses: Int, 19 | dataRetrieved: Boolean, nbRetry: Int) = retryOrThrow(cl, nbRetry) 20 | 21 | override def onUnavailable(stmt: Statement, cl: ConsistencyLevel, 22 | requiredReplica: Int, aliveReplica: Int, nbRetry: Int) = retryOrThrow(cl, nbRetry) 23 | 24 | override def onWriteTimeout(stmt: Statement, cl: ConsistencyLevel, writeType: WriteType, 25 | requiredAcks: Int, receivedAcks: Int, nbRetry: Int) = retryOrThrow(cl, nbRetry) 26 | 27 | } 28 | -------------------------------------------------------------------------------- /scripts/submit-demos: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This very basic script that submits the demos jar to a local spark master. 3 | # TODO add input validation and error handling. 4 | 5 | # ## 1. Build the demos assembly jar: 6 | # sbt -Dspark.cassandra.connector.demos.assembly=true assembly 7 | 8 | # ## 2. Run this script ## 9 | # Pass in 3 parameters: 10 | # 1. Path to /bin/spark-submit 11 | # 2. Spark master 12 | # 3. 
The FQCN of the demo class to run, e.g: com.datastax.spark.connector.demo.BasicReadWriteDemo 13 | # For further customization options see https://spark.apache.org/docs/latest/submitting-applications.html 14 | # Example: 15 | # sudo ./scripts/submit-demos /path/to/spark/bin spark://master:7077 com.datastax.spark.connector.demo.BasicReadWriteDemo 16 | # ## 17 | 18 | 19 | PATH_TO_SPARK_BIN_SCRIPTS=$1 20 | SPARK_MASTER=$2 21 | APP_TO_RUN=$3 22 | 23 | # TODO read from Settings.scala scalaVersion and version in ThisBuild: 24 | VERSION="1.0.0-SNAPSHOT" 25 | SCALA_VERSION="scala-2.10" 26 | DEMOS_ASSEMBLY_JAR="spark-cassandra-connector-demos-assembly-$VERSION.jar" 27 | PATH_TO_JAR="spark-cassandra-connector-demos/target/$SCALA_VERSION/$DEMOS_ASSEMBLY_JAR" 28 | SPARK_SUBMIT="$PATH_TO_SPARK_BIN_SCRIPTS/spark-submit" 29 | 30 | # Run on a Spark standalone cluster 31 | echo "Attempting to submit demo $SPARK_SUBMIT on $SPARK_MASTER with $PATH_TO_JAR" 32 | $SPARK_SUBMIT --class $APP_TO_RUN --master $SPARK_MASTER $PATH_TO_JAR 100 33 | 34 | 35 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/ReadConf.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd 2 | 3 | import org.apache.spark.SparkConf 4 | 5 | import com.datastax.driver.core.ConsistencyLevel 6 | 7 | /** Read settings for RDD 8 | * 9 | * @param splitSize number of Cassandra partitions to be read in a single Spark task 10 | * @param fetchSize number of CQL rows to fetch in a single round-trip to Cassandra 11 | * @param consistencyLevel consistency level for reads, default LOCAL_ONE; 12 | * higher consistency level will disable data-locality */ 13 | case class ReadConf( 14 | splitSize: Int = ReadConf.DefaultSplitSize, 15 | fetchSize: Int = ReadConf.DefaultFetchSize, 16 | consistencyLevel: ConsistencyLevel = ReadConf.DefaultConsistencyLevel) 17 | 18 | 19 | object ReadConf { 20 | val DefaultSplitSize = 100000 21 | val DefaultFetchSize = 1000 22 | val DefaultConsistencyLevel = ConsistencyLevel.LOCAL_ONE 23 | 24 | def fromSparkConf(conf: SparkConf): ReadConf = { 25 | ReadConf( 26 | fetchSize = conf.getInt("spark.cassandra.input.page.row.size", DefaultFetchSize), 27 | splitSize = conf.getInt("spark.cassandra.input.split.size", DefaultSplitSize), 28 | consistencyLevel = ConsistencyLevel.valueOf( 29 | conf.get("spark.cassandra.input.consistency.level", DefaultConsistencyLevel.name())) 30 | ) 31 | } 32 | 33 | } 34 | 35 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/java/com/datastax/spark/connector/SampleWithDeeplyNestedJavaBean.java: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector; 2 | 3 | import java.io.Serializable; 4 | 5 | /** 6 | * This is a sample JavaBean style class. In order to test JavaAPI correctly, we cannot implement this in Scala because 7 | * Scala adds some additional accessors and mutators. 
8 | */ 9 | public class SampleWithDeeplyNestedJavaBean implements Serializable { 10 | public class IntermediateClass implements Serializable { 11 | public class InnerClass implements Serializable { 12 | private Integer key; 13 | private String value; 14 | 15 | public InnerClass(Integer key) { 16 | this.key = key; 17 | } 18 | 19 | public InnerClass() { 20 | } 21 | 22 | public InnerClass(Integer key, String value) { 23 | this.key = key; 24 | this.value = value; 25 | } 26 | 27 | public Integer getKey() { 28 | return key; 29 | } 30 | 31 | public void setKey(Integer key) { 32 | this.key = key; 33 | } 34 | 35 | public String getValue() { 36 | return value; 37 | } 38 | 39 | public void setValue(String value) { 40 | this.value = value; 41 | } 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/streaming/DStreamFunctions.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.streaming 2 | 3 | import com.datastax.spark.connector._ 4 | import com.datastax.spark.connector.cql.CassandraConnector 5 | import com.datastax.spark.connector.writer.{TableWriter, WriteConf, RowWriterFactory, WritableToCassandra} 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.streaming.dstream.DStream 8 | 9 | class DStreamFunctions[T](dstream: DStream[T]) extends WritableToCassandra[T] with Serializable { 10 | 11 | override def sparkContext: SparkContext = dstream.context.sparkContext 12 | 13 | def conf = sparkContext.getConf 14 | 15 | /** 16 | * Performs [[com.datastax.spark.connector.writer.WritableToCassandra]] for each produced RDD. 17 | * Uses specific column names with an additional batch size. 18 | */ 19 | def saveToCassandra(keyspaceName: String, 20 | tableName: String, 21 | columnNames: ColumnSelector = AllColumns, 22 | writeConf: WriteConf = WriteConf.fromSparkConf(conf)) 23 | (implicit connector: CassandraConnector = CassandraConnector(conf), 24 | rwf: RowWriterFactory[T]): Unit = { 25 | val writer = TableWriter(connector, keyspaceName, tableName, columnNames, writeConf) 26 | dstream.foreachRDD(rdd => rdd.sparkContext.runJob(rdd, writer.write _)) 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/partitioner/Murmur3PartitionerTokenRangeSplitter.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd.partitioner 2 | 3 | import com.datastax.spark.connector.rdd.partitioner.dht.{LongToken, TokenFactory, TokenRange} 4 | 5 | import scala.math.BigDecimal.RoundingMode 6 | 7 | /** Fast token range splitter assuming that data are spread out evenly in the whole range. 
*/ 8 | class Murmur3PartitionerTokenRangeSplitter(cassandraPartitionsPerToken: Double) extends TokenRangeSplitter[Long, LongToken] { 9 | 10 | private val tokenFactory = 11 | TokenFactory.Murmur3TokenFactory 12 | 13 | def split(range: TokenRange[Long, LongToken], splitSize: Long) = { 14 | val left = range.start.value 15 | val right = range.end.value 16 | val rangeSize = 17 | if (right > left) BigDecimal(right) - BigDecimal(left) 18 | else BigDecimal(right) - BigDecimal(left) + BigDecimal(tokenFactory.totalTokenCount) 19 | val estimatedRows = rangeSize * cassandraPartitionsPerToken 20 | val n = math.max(1, (estimatedRows / splitSize).setScale(0, RoundingMode.HALF_UP).toInt) 21 | val splitPoints = 22 | (for (i <- 0 until n) yield left + (rangeSize * i.toDouble / n).toLong) :+ right 23 | for (Seq(l, r) <- splitPoints.sliding(2).toSeq) yield 24 | new TokenRange[Long, LongToken]( 25 | new LongToken(l), 26 | new LongToken(r), 27 | range.endpoints, 28 | Some((estimatedRows / n).toInt)) 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /spark-cassandra-connector-demos/simple-demos/src/main/scala/com/datastax/spark/connector/demo/TableCopyDemo.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.demo 2 | 3 | import com.datastax.spark.connector.cql.CassandraConnector 4 | 5 | object TableCopyDemo extends DemoApp { 6 | 7 | CassandraConnector(conf).withSessionDo { session => 8 | session.execute("CREATE KEYSPACE IF NOT EXISTS test WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 }") 9 | session.execute("CREATE TABLE IF NOT EXISTS test.source (key INT PRIMARY KEY, data VARCHAR)") 10 | session.execute("CREATE TABLE IF NOT EXISTS test.destination (key INT PRIMARY KEY, data VARCHAR)") 11 | session.execute("TRUNCATE test.source") 12 | session.execute("TRUNCATE test.destination") 13 | session.execute("INSERT INTO test.source(key, data) VALUES (1, 'first row')") 14 | session.execute("INSERT INTO test.source(key, data) VALUES (2, 'second row')") 15 | session.execute("INSERT INTO test.source(key, data) VALUES (3, 'third row')") 16 | } 17 | 18 | import com.datastax.spark.connector._ 19 | 20 | val src = sc.cassandraTable("test", "source") 21 | src.saveToCassandra("test", "destination") 22 | 23 | val dest = sc.cassandraTable("test", "destination") 24 | dest.collect().foreach(row => log.info(s"$row")) 25 | 26 | // Assert the rows were copied from test.source to test.destination table: 27 | assert(dest.collect().length == 3) 28 | 29 | log.info(s"Work completed, stopping the Spark context.") 30 | sc.stop() 31 | } 32 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/org/apache/spark/sql/cassandra/InsertIntoCassandraTable.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.cassandra 2 | 3 | import com.datastax.spark.connector._ 4 | import com.datastax.spark.connector.cql.CassandraConnector 5 | import com.datastax.spark.connector.writer.SqlRowWriter 6 | import org.apache.spark.annotation.DeveloperApi 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.catalyst.expressions.{Attribute, Row} 9 | import org.apache.spark.sql.execution.{SparkPlan, UnaryNode} 10 | 11 | @DeveloperApi 12 | case class InsertIntoCassandraTable(cassandraRelation: CassandraRelation, 13 | childPlan: SparkPlan, 14 | overwrite: Boolean) 15 | 
(@transient cc: CassandraSQLContext) extends UnaryNode { 16 | self: Product => 17 | 18 | override def output: Seq[Attribute] = childPlan.output 19 | 20 | override def execute(): RDD[Row] = result 21 | 22 | override def child: SparkPlan = childPlan 23 | 24 | override def otherCopyArgs = cc :: Nil 25 | 26 | /** 27 | * Insert RDD[[Row]] to Cassandra 28 | */ 29 | private lazy val result: RDD[Row] = { 30 | val childRdd = child.execute() 31 | 32 | //TODO: cluster level CassandraConnector, write configuration settings 33 | childRdd.saveToCassandra(cassandraRelation.keyspaceName, cassandraRelation.tableName)(CassandraConnector(sparkContext.getConf), SqlRowWriter.Factory) 34 | 35 | cc.sparkContext.makeRDD(Nil, 1) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /doc/10_embedded.md: -------------------------------------------------------------------------------- 1 | # Documentation 2 | ## The `spark-cassandra-connector-embedded` Artifact 3 | The `spark-cassandra-connector-embedded` artifact can be used as a test or prototype dependency to spin up embedded servers for testing ideas, quickly learning, integration, etc. 4 | Pulling this dependency in allows you to do 5 | 6 | - Integration Tests (IT) tests with an embedded Cassandra instance 7 | - if your sbt project is configured to [run IT configs](https://github.com/datastax/spark-cassandra-connector/blob/master/project/Settings.scala#L78-L94) 8 | - Easily write and run a Spark Streaming app using 9 | - Apache Kafka streams (including an embedded Zookeeper), all with no Ops work involved 10 | - Twitter streams (needs the 4 auth credentials required by twitter) 11 | - And of course Cassandra but you currently need to sping up a local instance: [Download Cassandra latest](http://cassandra.apache.org/download/), open the tar, and run `sudo ./apache-cassandra-2.1.0/bin/cassandra` 12 | 13 | ## The Code 14 | See: [https://github.com/datastax/spark-cassandra-connector/tree/master/spark-cassandra-connector-embedded/src/main/scala/com/datastax/spark/connector/embedded](https://github.com/datastax/spark-cassandra-connector/tree/master/spark-cassandra-connector-embedded/src/main/scala/com/datastax/spark/connector/embedded) 15 | 16 | ## How To Add The Dependency 17 | Simply add this to your SBT build, or in the appropriate format for a Maven build 18 | 19 | "com.datastax.spark" %% "spark-cassandra-connector-embedded" % {latest.verson} -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/reader/RowReader.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd.reader 2 | 3 | import com.datastax.driver.core.{ProtocolVersion, Row} 4 | 5 | /** Transforms a Cassandra Java driver `Row` into high-level row representation, e.g. a tuple 6 | * or a user-defined case class object. The target type `T` must be serializable. */ 7 | trait RowReader[T] extends Serializable { 8 | 9 | /** Reads column values from low-level `Row` and turns them into higher level representation. 10 | * @param row row fetched from Cassandra 11 | * @param columnNames column names available in the `row` 12 | * @param protocolVersion java driver protocol version to be used for deserialization */ 13 | def read(row: Row, columnNames: Array[String], protocolVersion: ProtocolVersion): T 14 | 15 | /** List of columns this `RowReader` is going to read. 
16 | * Useful to avoid fetching the columns that are not needed. */ 17 | def columnNames: Option[Seq[String]] 18 | 19 | /** The number of columns that need to be fetched from C*. */ 20 | def requiredColumns: Option[Int] 21 | 22 | /** This method should be implemented by those row readers which reads fields in the consecutive 23 | * positions from a CassandraRow. When a row reader implements it so that it returns a non-empty, 24 | * it denotes the number of columns this reader moves the column cursor forward for compound row 25 | * readers (such as [[KeyValueRowReader]]). */ 26 | def consumedColumns: Option[Int] = None 27 | 28 | } 29 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/RowWriterFactory.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.writer 2 | 3 | import com.datastax.spark.connector.cql.TableDef 4 | import com.datastax.spark.connector.mapper.ColumnMapper 5 | 6 | import scala.reflect.ClassTag 7 | 8 | /** Creates instances of [[RowWriter]] objects for the given row type `T`. 9 | * `RowWriterFactory` is the trait you need to implement if you want to support row representations 10 | * which cannot be simply mapped by a [[com.datastax.spark.connector.mapper.ColumnMapper ColumnMapper]].*/ 11 | trait RowWriterFactory[T] { 12 | 13 | /** Creates a new `RowWriter` instance. 14 | * @param table target table the user wants to write into 15 | * @param columnNames columns selected by the user; the user might wish to write only a subset of columns */ 16 | def rowWriter(table: TableDef, columnNames: Seq[String]): RowWriter[T] 17 | } 18 | 19 | /** Provides a low-priority implicit `RowWriterFactory` able to write objects of any class for which 20 | * a [[com.datastax.spark.connector.mapper.ColumnMapper ColumnMapper]] is defined.*/ 21 | trait LowPriorityRowWriterFactoryImplicits { 22 | implicit def defaultRowWriterFactory[T : ColumnMapper]: RowWriterFactory[T] = DefaultRowWriter.factory 23 | } 24 | 25 | /** Provides an implicit `RowWriterFactory` for saving [[com.datastax.spark.connector.CassandraRow CassandraRow]] objects.*/ 26 | object RowWriterFactory extends LowPriorityRowWriterFactoryImplicits { 27 | implicit val genericRowWriterFactory = GenericRowWriter.Factory 28 | } -------------------------------------------------------------------------------- /spark-cassandra-connector-embedded/src/main/scala/com/datastax/spark/connector/embedded/SparkRepl.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.embedded 2 | 3 | import java.io.{PrintWriter, StringWriter, StringReader, BufferedReader} 4 | import java.net.URLClassLoader 5 | 6 | import scala.collection.mutable.ArrayBuffer 7 | import org.apache.spark.repl.SparkILoop 8 | 9 | trait SparkRepl { 10 | def runInterpreter(master: String, input: String): String = { 11 | System.setProperty("spark.cassandra.connection.host", EmbeddedCassandra.cassandraHost.getHostAddress) 12 | val in = new BufferedReader(new StringReader(input + "\n")) 13 | val out = new StringWriter() 14 | val cl = getClass.getClassLoader 15 | var paths = new ArrayBuffer[String] 16 | cl match { 17 | case urlLoader: URLClassLoader => 18 | for (url <- urlLoader.getURLs) { 19 | if (url.getProtocol == "file") { 20 | paths += url.getFile 21 | } 22 | } 23 | case _ => 24 | } 25 | val interp = new SparkILoop(in, new 
PrintWriter(out), master) 26 | org.apache.spark.repl.Main.interp = interp 27 | val separator = System.getProperty("path.separator") 28 | interp.process(Array("-classpath", paths.mkString(separator))) 29 | org.apache.spark.repl.Main.interp = null 30 | if (interp.sparkContext != null) { 31 | interp.sparkContext.stop() 32 | } 33 | // To avoid Akka rebinding to the same port, since it doesn't unbind immediately on shutdown 34 | System.clearProperty("spark.driver.port") 35 | out.toString 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /spark-cassandra-connector-demos/simple-demos/src/main/scala/com/datastax/spark/connector/demo/BasicReadWriteDemo.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.demo 2 | 3 | import com.datastax.spark.connector.cql.CassandraConnector 4 | 5 | object BasicReadWriteDemo extends DemoApp { 6 | 7 | CassandraConnector(conf).withSessionDo { session => 8 | session.execute("CREATE KEYSPACE IF NOT EXISTS test WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 }") 9 | session.execute("CREATE TABLE IF NOT EXISTS test.key_value (key INT PRIMARY KEY, value VARCHAR)") 10 | session.execute("TRUNCATE test.key_value") 11 | session.execute("INSERT INTO test.key_value(key, value) VALUES (1, 'first row')") 12 | session.execute("INSERT INTO test.key_value(key, value) VALUES (2, 'second row')") 13 | session.execute("INSERT INTO test.key_value(key, value) VALUES (3, 'third row')") 14 | } 15 | 16 | import com.datastax.spark.connector._ 17 | 18 | // Read table test.kv and print its contents: 19 | val rdd = sc.cassandraTable("test", "key_value").select("key", "value") 20 | rdd.collect().foreach(row => log.info(s"Existing Data: $row")) 21 | 22 | // Write two new rows to the test.kv table: 23 | val col = sc.parallelize(Seq((4, "fourth row"), (5, "fifth row"))) 24 | col.saveToCassandra("test", "key_value", SomeColumns("key", "value")) 25 | 26 | // Assert the two new rows were stored in test.kv table: 27 | assert(col.collect().length == 2) 28 | 29 | col.collect().foreach(row => log.info(s"New Data: $row")) 30 | log.info(s"Work completed, stopping the Spark context.") 31 | sc.stop() 32 | } 33 | -------------------------------------------------------------------------------- /spark-cassandra-connector-demos/simple-demos/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # for production, you should probably set pattern to %c instead of %l. 18 | # (%l is slower.) 
19 | 20 | # output messages into a rolling log file as well as stdout 21 | log4j.rootLogger=WARN,stdout 22 | 23 | # stdout 24 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 25 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 26 | log4j.appender.stdout.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 27 | 28 | # Adding this to avoid thrift logging disconnect errors. 29 | log4j.logger.org.apache.thrift.server.TNonblockingServer=ERROR 30 | 31 | # Avoid "no host ID found" when starting a fresh node 32 | log4j.logger.org.apache.cassandra.db.SystemKeyspace=ERROR 33 | 34 | log4j.logger.com.datastax.spark.connector=INFO 35 | -------------------------------------------------------------------------------- /spark-cassandra-connector-demos/kafka-streaming/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # for production, you should probably set pattern to %c instead of %l. 18 | # (%l is slower.) 19 | 20 | # output messages into a rolling log file as well as stdout 21 | log4j.rootLogger=WARN,stdout 22 | 23 | # stdout 24 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 25 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 26 | log4j.appender.stdout.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 27 | 28 | # Adding this to avoid thrift logging disconnect errors. 
29 | log4j.logger.org.apache.thrift.server.TNonblockingServer=ERROR 30 | 31 | # Avoid "no host ID found" when starting a fresh node 32 | log4j.logger.org.apache.cassandra.db.SystemKeyspace=ERROR 33 | 34 | log4j.logger.com.datastax.spark.connector=INFO 35 | -------------------------------------------------------------------------------- /spark-cassandra-connector-demos/twitter-streaming/src/main/scala/com/datastax/spark/connector/demo/TwitterStreamingHashTagsByInterval.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.demo 2 | 3 | import scala.util.matching.Regex 4 | import org.apache.spark.storage.StorageLevel 5 | import org.apache.spark.streaming.{Time, Seconds, StreamingContext} 6 | import org.apache.spark.streaming.twitter.TwitterUtils 7 | import org.joda.time.{DateTimeZone, DateTime} 8 | import twitter4j.auth.Authorization 9 | import com.datastax.spark.connector.streaming._ 10 | import com.datastax.spark.connector.SomeColumns 11 | 12 | class TwitterStreamingHashTagsByInterval extends Serializable { 13 | 14 | def start(auth: Option[Authorization], ssc: StreamingContext, filters: Regex, keyspace: String, table: String): Unit = { 15 | 16 | val transform = (cruft: String) => filters.findAllIn(cruft).flatMap(_.stripPrefix("#")) 17 | 18 | val stream = TwitterUtils.createStream(ssc, auth, Nil, StorageLevel.MEMORY_ONLY_SER_2) 19 | 20 | /** Note that Cassandra is doing the sorting for you here. */ 21 | stream.flatMap(_.getText.toLowerCase.split("""\s+""")) 22 | .map(transform) 23 | .countByValueAndWindow(Seconds(5), Seconds(5)) 24 | .transform((rdd, time) => rdd.map { case (term, count) => (term, count, now(time))}) 25 | .saveToCassandra(keyspace, table, SomeColumns("hashtag", "mentions", "interval")) 26 | 27 | ssc.checkpoint("./checkpoint") 28 | ssc.start() 29 | ssc.awaitTermination() 30 | } 31 | 32 | private def now(time: Time): String = 33 | new DateTime(time.milliseconds, DateTimeZone.UTC).toString("yyyyMMddHH:mm:ss.SSS") 34 | } 35 | 36 | 37 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/it/scala/com/datastax/spark/connector/streaming/StreamingSpec.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.streaming 2 | 3 | import com.datastax.spark.connector.testkit._ 4 | import com.datastax.spark.connector.embedded._ 5 | 6 | /** 7 | * Usages: Create the [[org.apache.spark.streaming.StreamingContext]] then write async to the stream. 8 | * 9 | * val ssc = new StreamingContext(conf, Milliseconds(500)) 10 | * 11 | * Akka 12 | * {{{ 13 | * val stream = ssc.actorStream[String](Props[SimpleActor], actorName, StorageLevel.MEMORY_AND_DISK) 14 | * }}} 15 | * 16 | * On upgrade examples: 17 | * Kafka 18 | * {{{ 19 | * val stream: ReceiverInputDStream[(String, String)] = 20 | * KafkaUtils.createStream(ssc, kafkaParams, topics, StorageLevel.MEMORY_AND_DISK_SER_2) 21 | * }}} 22 | * 23 | * ZeroMQ 24 | * {{{ 25 | * val stream: ReceiverInputDStream[String] = ZeroMQUtils.createStream(ssc, publishUrl, subscribe, bytesToObjects) 26 | * }}} 27 | * 28 | * Twitter 29 | * {{{ 30 | * val stream: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None) 31 | * }}} 32 | * 33 | * etc. 
34 | */ 35 | trait StreamingSpec extends AbstractSpec with SharedEmbeddedCassandra with SparkTemplate { 36 | import org.apache.spark.streaming.StreamingContext 37 | import scala.concurrent.duration._ 38 | 39 | val duration = 10.seconds 40 | 41 | useCassandraConfig("cassandra-default.yaml.template") 42 | 43 | def ssc: StreamingContext 44 | 45 | after { 46 | // Spark Context is shared among all integration test so we don't want to stop it here 47 | ssc.stop(stopSparkContext = false, stopGracefully = true) 48 | } 49 | } -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/partitioner/RandomPartitionerTokenRangeSplitter.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd.partitioner 2 | 3 | import com.datastax.spark.connector.rdd.partitioner.dht.{BigIntToken, TokenFactory, TokenRange} 4 | 5 | import scala.math.BigDecimal.RoundingMode 6 | 7 | /** Fast token range splitter assuming that data are spread out evenly in the whole range. */ 8 | class RandomPartitionerTokenRangeSplitter(cassandraPartitionsPerToken: Double) extends TokenRangeSplitter[BigInt, BigIntToken] { 9 | 10 | private val tokenFactory = 11 | TokenFactory.RandomPartitionerTokenFactory 12 | 13 | private def wrap(token: BigInt): BigInt = { 14 | val max = tokenFactory.maxToken.value 15 | if (token <= max) token else token - max 16 | } 17 | 18 | def split(range: TokenRange[BigInt, BigIntToken], splitSize: Long) = { 19 | val left = range.start.value 20 | val right = range.end.value 21 | val rangeSize = 22 | if (right > left) BigDecimal(right - left) 23 | else BigDecimal(right - left + tokenFactory.totalTokenCount) 24 | val estimatedRows = rangeSize * cassandraPartitionsPerToken 25 | val n = math.max(1, (estimatedRows / splitSize).setScale(0, RoundingMode.HALF_UP).toInt) 26 | val splitPoints = 27 | (for (i <- 0 until n) yield wrap(left + (rangeSize * i.toDouble / n).toBigInt)) :+ right 28 | for (Seq(l, r) <- splitPoints.sliding(2).toSeq) yield 29 | new TokenRange[BigInt, BigIntToken]( 30 | new BigIntToken(l.bigInteger), 31 | new BigIntToken(r.bigInteger), 32 | range.endpoints, 33 | Some((estimatedRows / n).toInt)) 34 | } 35 | } -------------------------------------------------------------------------------- /project/Versions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | object Versions { 18 | val Akka = "2.2.3"//spark master on 2.3.4 https://github.com/apache/spark/blob/master/pom.xml#L113-L114 19 | val Cassandra = "2.1.2" 20 | val CassandraDriver = "2.1.3" 21 | val CommonsIO = "2.4" 22 | val CommonsLang3 = "3.3.2" 23 | val Config = "1.2.1" 24 | val Guava = "14.0.1" 25 | val JDK = "1.7" 26 | val JodaC = "1.2" 27 | val JodaT = "2.3" 28 | val JOpt = "3.2"//4.7 29 | val Kafka = "0.8.0"//https://github.com/apache/spark/pull/3631 30 | val Lzf = "0.8.4" 31 | val CodaHaleMetrics = "3.0.2" 32 | val Scala = "2.10.4" 33 | val ScalaTest = "2.2.2" 34 | val Scalactic = "2.2.2" 35 | val Slf4j = "1.7.7" 36 | val Spark = "1.1.1" 37 | 38 | } 39 | -------------------------------------------------------------------------------- /spark-cassandra-connector-demos/simple-demos/src/main/scala/com/datastax/spark/connector/demo/SQLDemo.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.demo 2 | 3 | import com.datastax.spark.connector.cql.CassandraConnector 4 | import org.apache.spark.sql.cassandra.CassandraSQLContext 5 | 6 | /** This demo creates a table in Cassandra, populates it with sample data, 7 | * then queries it using SparkSQL and finally displays the query results to the standard output. 8 | * You need to start Cassandra on local node prior to executing this demo. */ 9 | object SQLDemo extends DemoApp { 10 | 11 | val cc = new CassandraSQLContext(sc) 12 | 13 | CassandraConnector(conf).withSessionDo { session => 14 | session.execute("CREATE KEYSPACE IF NOT EXISTS test WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 }") 15 | session.execute("DROP TABLE IF EXISTS test.sql_demo") 16 | session.execute("CREATE TABLE test.sql_demo (key INT PRIMARY KEY, grp INT, value DOUBLE)") 17 | session.execute("INSERT INTO test.sql_demo(key, grp, value) VALUES (1, 1, 1.0)") 18 | session.execute("INSERT INTO test.sql_demo(key, grp, value) VALUES (2, 1, 2.5)") 19 | session.execute("INSERT INTO test.sql_demo(key, grp, value) VALUES (3, 1, 10.0)") 20 | session.execute("INSERT INTO test.sql_demo(key, grp, value) VALUES (4, 2, 4.0)") 21 | session.execute("INSERT INTO test.sql_demo(key, grp, value) VALUES (5, 2, 2.2)") 22 | session.execute("INSERT INTO test.sql_demo(key, grp, value) VALUES (6, 2, 2.8)") 23 | } 24 | 25 | val rdd = cc.cassandraSql("SELECT grp, max(value) AS mv FROM test.sql_demo GROUP BY grp ORDER BY mv") 26 | rdd.collect().foreach(println) // [2, 4.0] [1, 10.0] 27 | 28 | sc.stop() 29 | } 30 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/types/TimestampParser.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.types 2 | 3 | import java.util.Date 4 | 5 | import org.joda.time.format.DateTimeFormat 6 | 7 | import scala.util.{Success, Try} 8 | 9 | /** Parses CQL timestamps. 
10 | * 11 | * Supported formats: 12 | * - `yyyy-MM-dd HH:mm` 13 | * - `yyyy-MM-dd HH:mmZ` 14 | * - `yyyy-MM-dd HH:mm:ss` 15 | * - `yyyy-MM-dd HH:mm:ssZ` 16 | * - `yyyy-MM-dd HH:mm:ss.SSS` 17 | * - `yyyy-MM-dd HH:mm:ss.SSSZ` 18 | * - `yyyy-MM-dd'T'HH:mm` 19 | * - `yyyy-MM-dd'T'HH:mmZ` 20 | * - `yyyy-MM-dd'T'HH:mm:ss` 21 | * - `yyyy-MM-dd'T'HH:mm:ssZ` 22 | * - `yyyy-MM-dd'T'HH:mm:ss.SSS` 23 | * - `yyyy-MM-dd'T'HH:mm:ss.SSSZ` 24 | * - `yyyy-MM-dd` 25 | * - `yyyy-MM-ddZ` 26 | */ 27 | object TimestampParser { 28 | private val dateStringPatterns = Array[String]( 29 | "yyyy-MM-dd HH:mm", 30 | "yyyy-MM-dd HH:mmZ", 31 | "yyyy-MM-dd HH:mm:ss", 32 | "yyyy-MM-dd HH:mm:ssZ", 33 | "yyyy-MM-dd HH:mm:ss.SSS", 34 | "yyyy-MM-dd HH:mm:ss.SSSZ", 35 | "yyyy-MM-dd'T'HH:mm", 36 | "yyyy-MM-dd'T'HH:mmZ", 37 | "yyyy-MM-dd'T'HH:mm:ss", 38 | "yyyy-MM-dd'T'HH:mm:ssZ", 39 | "yyyy-MM-dd'T'HH:mm:ss.SSS", 40 | "yyyy-MM-dd'T'HH:mm:ss.SSSZ", 41 | "yyyy-MM-dd", 42 | "yyyy-MM-ddZ") 43 | 44 | private val parsers = 45 | dateStringPatterns.map(DateTimeFormat.forPattern) 46 | 47 | def parse(date: String): Date = { 48 | parsers.view.map(p => Try(p.parseDateTime(date))).find(_.isSuccess) match { 49 | case Some(Success(d)) => d.toDate 50 | case _ => throw new IllegalArgumentException(s"Invalid date: $date") 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/cql/CassandraClientProxy.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.cql 2 | 3 | import java.lang.reflect.{InvocationTargetException, Proxy, Method, InvocationHandler} 4 | 5 | import org.apache.cassandra.thrift.Cassandra 6 | import org.apache.thrift.transport.TTransport 7 | 8 | /** Extends `Cassandra.Iface` with `close` method to close the underlying thrift transport */ 9 | trait CassandraClientProxy extends Cassandra.Iface { 10 | def close() 11 | } 12 | 13 | private class ClientProxyHandler(client: Cassandra.Iface, transport: TTransport) extends InvocationHandler { 14 | 15 | override def invoke(proxy: scala.Any, method: Method, args: Array[AnyRef]): AnyRef = { 16 | if (method.getName == "close") { 17 | transport.close() 18 | null 19 | } 20 | else 21 | try { 22 | method.invoke(client, args: _*) 23 | } 24 | catch { 25 | case e: InvocationTargetException => 26 | throw e.getCause 27 | } 28 | } 29 | } 30 | 31 | object CassandraClientProxy { 32 | 33 | /** Returns a proxy to the thrift client that provides closing the underlying transport by calling `close` method. 34 | * Without this method we'd have to keep references to two objects: the client and the transport. */ 35 | def wrap(client: Cassandra.Iface, transport: TTransport): CassandraClientProxy = { 36 | val classLoader = getClass.getClassLoader 37 | val interfaces = Array[Class[_]](classOf[CassandraClientProxy]) 38 | val invocationHandler = new ClientProxyHandler(client, transport) 39 | Proxy.newProxyInstance(classLoader, interfaces, invocationHandler).asInstanceOf[CassandraClientProxy] 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/it/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. 
See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # for production, you should probably set pattern to %c instead of %l. 18 | # (%l is slower.) 19 | 20 | # output messages into a rolling log file as well as stdout 21 | log4j.rootLogger=WARN,stdout 22 | 23 | # stdout 24 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 25 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 26 | log4j.appender.stdout.layout.ConversionPattern=%5p %d{HH:mm:ss,SSS} %C (%F:%L) - %m%n 27 | 28 | # Adding this to avoid thrift logging disconnect errors. 29 | log4j.logger.org.apache.thrift.server.TNonblockingServer=ERROR 30 | 31 | # Avoid "no host ID found" when starting a fresh node 32 | log4j.logger.org.apache.cassandra.db.SystemKeyspace=ERROR 33 | 34 | # Avoid "address already in use" when starting multiple local Spark masters 35 | log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR 36 | -------------------------------------------------------------------------------- /spark-cassandra-connector-java/src/main/java/com/datastax/spark/connector/japi/RDDJavaFunctions.java: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.japi; 2 | 3 | import com.datastax.spark.connector.ColumnSelector; 4 | import com.datastax.spark.connector.RDDFunctions; 5 | import com.datastax.spark.connector.cql.CassandraConnector; 6 | import com.datastax.spark.connector.writer.RowWriterFactory; 7 | import com.datastax.spark.connector.writer.WriteConf; 8 | import org.apache.spark.SparkConf; 9 | import org.apache.spark.rdd.RDD; 10 | 11 | /** 12 | * A Java API wrapper over {@link org.apache.spark.rdd.RDD} to provide Spark Cassandra Connector functionality. 13 | * 14 | *To obtain an instance of this wrapper, use one of the factory methods in {@link 15 | * com.datastax.spark.connector.japi.CassandraJavaUtil} class.
16 | */ 17 | @SuppressWarnings("UnusedDeclaration") 18 | public class RDDJavaFunctionsTo obtain an instance of this wrapper, use one of the factory methods in {@link 16 | * com.datastax.spark.connector.japi.CassandraJavaUtil} class.
17 | */ 18 | @SuppressWarnings("UnusedDeclaration") 19 | public class DStreamJavaFunctions