├── project ├── build.properties ├── plugins.sbt └── Versions.scala ├── rootdoc.txt ├── spark-cassandra-connector └── src │ ├── it │ ├── resources │ │ ├── triggers │ │ │ └── README.txt │ │ └── log4j.properties │ └── scala │ │ └── com │ │ └── datastax │ │ └── spark │ │ └── connector │ │ ├── cql │ │ ├── CassandraAuthenticatedConnectorSpec.scala │ │ └── CassandraPartitionKeyWhereSpec.scala │ │ └── streaming │ │ ├── StreamingSpec.scala │ │ └── ActorStreamSpec.scala │ ├── main │ └── scala │ │ ├── org │ │ └── apache │ │ │ └── spark │ │ │ └── sql │ │ │ └── cassandra │ │ │ ├── package-info.java │ │ │ ├── package.scala │ │ │ ├── api │ │ │ └── java │ │ │ │ └── JavaCassandraSQLContext.scala │ │ │ ├── InsertIntoCassandraTable.scala │ │ │ ├── CassandraCatalog.scala │ │ │ └── CassandraSQLRow.scala │ │ └── com │ │ └── datastax │ │ └── spark │ │ └── connector │ │ ├── util │ │ ├── package.scala │ │ ├── CountingIterator.scala │ │ ├── MagicalTypeTricks.scala │ │ ├── ReflectionUtil.scala │ │ └── JavaApiHelper.scala │ │ ├── writer │ │ ├── package.scala │ │ ├── QueryExecutor.scala │ │ ├── RowWriter.scala │ │ ├── GenericRowWriter.scala │ │ ├── SqlRowWriter.scala │ │ ├── PropertyExtractor.scala │ │ ├── ConvertingPropertyExtractor.scala │ │ ├── AbstractRowWriter.scala │ │ ├── RowWriterFactory.scala │ │ ├── ObjectSizeEstimator.scala │ │ ├── WriteOption.scala │ │ ├── AsyncExecutor.scala │ │ └── WritableToCassandra.scala │ │ ├── rdd │ │ ├── package.scala │ │ ├── partitioner │ │ │ ├── package.scala │ │ │ ├── TokenRangeSplitter.scala │ │ │ ├── dht │ │ │ │ ├── Token.scala │ │ │ │ ├── TokenRange.scala │ │ │ │ └── TokenFactory.scala │ │ │ ├── CassandraRDDPartition.scala │ │ │ ├── Murmur3PartitionerTokenRangeSplitter.scala │ │ │ ├── RandomPartitionerTokenRangeSplitter.scala │ │ │ ├── ServerSideTokenRangeSplitter.scala │ │ │ └── TokenRangeClusterer.scala │ │ ├── reader │ │ │ ├── package.scala │ │ │ ├── PrefetchingResultSetIterator.scala │ │ │ ├── RowReader.scala │ │ │ ├── KeyValueRowReader.scala │ │ │ └── ValueRowReader.scala │ │ ├── ValidRDDType.scala │ │ ├── CqlWhereClause.scala │ │ └── ReadConf.scala │ │ ├── mapper │ │ ├── package.scala │ │ ├── ColumnMap.scala │ │ ├── TupleColumnMapper.scala │ │ ├── JavaBeanColumnMapper.scala │ │ ├── DefaultColumnMapper.scala │ │ └── ReflectionColumnMapper.scala │ │ ├── types │ │ ├── package.scala │ │ ├── TimestampFormatter.scala │ │ ├── TimestampParser.scala │ │ ├── CollectionColumnType.scala │ │ └── ColumnType.scala │ │ ├── cql │ │ ├── package.scala │ │ ├── MultipleRetryPolicy.scala │ │ ├── CassandraClientProxy.scala │ │ ├── PreparedStatementCache.scala │ │ ├── CassandraConnectorConf.scala │ │ ├── RefCountMap.scala │ │ ├── SessionProxy.scala │ │ └── AuthConf.scala │ │ ├── streaming │ │ ├── package.scala │ │ ├── CassandraStreamingRDD.scala │ │ ├── DStreamFunctions.scala │ │ └── StreamingContextFunctions.scala │ │ ├── ColumnSelector.scala │ │ ├── BatchSize.scala │ │ ├── RDDFunctions.scala │ │ ├── ColumnRef.scala │ │ ├── package.scala │ │ └── SparkContextFunctions.scala │ └── test │ ├── scala │ └── com │ │ └── datastax │ │ └── spark │ │ └── connector │ │ ├── testkit │ │ ├── package.scala │ │ └── SparkCassandraFixture.scala │ │ ├── writer │ │ ├── DefaultRowWriterTest.scala │ │ ├── PropertyExtractorTest.scala │ │ ├── ObjectSizeEstimatorTest.scala │ │ ├── ConvertingPropertyExtractorTest.scala │ │ ├── AsyncExecutorTest.scala │ │ ├── WriteConfTest.scala │ │ └── WriteOptionTest.scala │ │ ├── rdd │ │ ├── reader │ │ │ └── ClassBasedRowReaderTest.scala │ │ └── partitioner │ │ │ ├── 
RandomPartitionerTokenRangeSplitterTest.scala │ │ │ └── Murmur3PartitionerTokenRangeSplitterTest.scala │ │ ├── samples.scala │ │ ├── streaming │ │ └── TestProducer.scala │ │ ├── types │ │ ├── CanBuildFromTest.scala │ │ └── TypeSerializationTest.scala │ │ ├── mapper │ │ └── TupleColumnMapperTest.scala │ │ └── util │ │ └── ReflectionUtilSpec.scala │ └── java │ └── com │ └── datastax │ └── spark │ └── connector │ ├── SampleJavaBeanWithoutNoArgsCtor.java │ ├── SampleJavaBean.java │ ├── SampleJavaBeanWithMultipleCtors.java │ ├── SampleWithNestedJavaBean.java │ └── SampleWithDeeplyNestedJavaBean.java ├── .travis.yml ├── .gitignore ├── spark-cassandra-connector-embedded └── src │ └── main │ └── scala │ └── com │ └── datastax │ └── spark │ └── connector │ └── embedded │ ├── Event.scala │ ├── package.scala │ ├── SparkTemplate.scala │ ├── SparkRepl.scala │ ├── Assertions.scala │ ├── KafkaProducer.scala │ ├── KafkaConsumer.scala │ └── EmbeddedZookeeper.scala ├── spark-cassandra-connector-java └── src │ ├── main │ ├── scala │ │ └── com │ │ │ └── datastax │ │ │ └── spark │ │ │ └── connector │ │ │ └── japi │ │ │ └── types │ │ │ └── JavaTypeConverter.scala │ └── java │ │ └── com │ │ └── datastax │ │ └── spark │ │ └── connector │ │ └── japi │ │ ├── StreamingContextJavaFunctions.java │ │ ├── RDDJavaFunctions.java │ │ ├── DStreamJavaFunctions.java │ │ └── GenericJavaRowReaderFactory.java │ └── test │ └── java │ └── com │ └── datastax │ └── spark │ └── connector │ └── japi │ └── CustomTypeConverterTest.java ├── spark-cassandra-connector-demos ├── simple-demos │ └── src │ │ └── main │ │ ├── resources │ │ ├── application.conf │ │ ├── log4j.properties │ │ └── data │ │ │ └── words │ │ └── scala │ │ └── com │ │ └── datastax │ │ └── spark │ │ └── connector │ │ └── demo │ │ ├── DemoApp.scala │ │ ├── SparkCassandraSettings.scala │ │ ├── WordCountDemo.scala │ │ ├── TableCopyDemo.scala │ │ ├── BasicReadWriteDemo.scala │ │ └── SQLDemo.scala ├── twitter-streaming │ └── src │ │ └── main │ │ ├── resources │ │ ├── application.conf │ │ └── log4j.properties │ │ └── scala │ │ └── com │ │ └── datastax │ │ └── spark │ │ └── connector │ │ └── demo │ │ └── TwitterStreamingHashTagsByInterval.scala └── kafka-streaming │ └── src │ └── main │ └── resources │ ├── log4j.properties │ └── data │ └── words ├── scripts └── submit-demos ├── doc ├── 10_embedded.md ├── 3_selection.md └── 5_saving.md └── sbt └── sbt /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.6 2 | -------------------------------------------------------------------------------- /rootdoc.txt: -------------------------------------------------------------------------------- 1 | Cassandra connector for Apache Spark. 2 | See documentation of package [[com.datastax.spark.connector]]. -------------------------------------------------------------------------------- /spark-cassandra-connector/src/it/resources/triggers/README.txt: -------------------------------------------------------------------------------- 1 | Place triggers to be loaded in this directory, as jar files. 
2 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/org/apache/spark/sql/cassandra/package-info.java: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.cassandra; -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | jdk: oraclejdk7 3 | sudo: false 4 | scala: 5 | - 2.10.4 6 | script: 7 | - "sbt ++$TRAVIS_SCALA_VERSION test" 8 | 9 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/org/apache/spark/sql/cassandra/package.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql 2 | 3 | package object cassandra { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/util/package.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector 2 | 3 | /** Useful stuff that didn't fit elsewhere. */ 4 | package object util { 5 | 6 | } 7 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/package.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector 2 | 3 | /** Contains components for writing RDDs to Cassandra */ 4 | package object writer { 5 | 6 | } 7 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/package.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector 2 | 3 | 4 | /** Contains [[com.datastax.spark.connector.rdd.CassandraRDD]] class that is the main entry point for 5 | * analyzing Cassandra data from Spark. 
*/ 6 | package object rdd { 7 | 8 | } 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | .DS_Store 4 | # sbt specific 5 | .cache/ 6 | .history/ 7 | .lib/ 8 | dist/* 9 | target/ 10 | lib_managed/ 11 | src_managed/ 12 | project/boot/ 13 | project/plugins/project/ 14 | 15 | # Scala-IDE specific 16 | .scala_dependencies 17 | .worksheet 18 | .idea 19 | .idea_modules 20 | 21 | checkpoint 22 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/QueryExecutor.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.writer 2 | 3 | import com.datastax.driver.core.{Statement, Session} 4 | 5 | class QueryExecutor(session: Session, maxConcurrentQueries: Int) 6 | extends AsyncExecutor(session.executeAsync(_ : Statement), maxConcurrentQueries) 7 | 8 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/partitioner/package.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd 2 | 3 | /** Provides components for partitioning a Cassandra table into smaller parts of appropriate size. 4 | * Each partition can be processed locally on at least one cluster node. */ 5 | package object partitioner { 6 | 7 | } 8 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/mapper/package.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector 2 | 3 | /** Provides machinery for mapping Cassandra tables to user defined Scala classes or tuples. 4 | * The main class in this package is [[mapper.ColumnMapper]] responsible for matching Scala object's 5 | * properties with Cassandra column names.*/ 6 | package object mapper { 7 | 8 | } 9 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/types/package.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector 2 | 3 | /** Offers type conversion magic, so you can receive Cassandra column values in a form you like the most. 4 | * Simply specify the type you want to use on the Scala side, and the column value will be converted automatically. 5 | * Works also with complex objects like collections. */ 6 | package object types { 7 | 8 | } 9 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/reader/package.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd 2 | 3 | import com.datastax.spark.connector.CassandraRow 4 | 5 | /** Provides components for reading data rows from Cassandra and converting them to objects of desired type. 
6 | * Additionally provides a generic [[CassandraRow CassandraRow]] class which can represent any row.*/ 7 | package object reader { 8 | 9 | } 10 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.github.gseitz" % "sbt-release" % "0.7.1") 2 | 3 | addSbtPlugin("com.typesafe.sbt" % "sbt-scalariform" % "1.3.0") 4 | 5 | addSbtPlugin("com.typesafe.sbt" % "sbt-git" % "0.6.2") 6 | 7 | addSbtPlugin("com.typesafe" % "sbt-mima-plugin" % "0.1.6") 8 | 9 | addSbtPlugin("com.typesafe.sbt" % "sbt-pgp" % "0.8.3") 10 | 11 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 12 | 13 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.7.4") 14 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/cql/package.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector 2 | 3 | 4 | /** Contains a [[cql.CassandraConnector]] object which is used to connect 5 | * to a Cassandra cluster and to send CQL statements to it. `CassandraConnector` 6 | * provides a Scala-idiomatic way of working with `Cluster` and `Session` object 7 | * and takes care of connection pooling and proper resource disposal.*/ 8 | package object cql { 9 | 10 | } 11 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/testkit/package.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector 2 | 3 | import scala.collection.immutable 4 | import scala.concurrent.duration._ 5 | import akka.util.Timeout 6 | 7 | package object testkit { 8 | 9 | final val DefaultHost = "127.0.0.1" 10 | 11 | implicit val DefaultTimeout = Timeout(5.seconds) 12 | 13 | val data = immutable.Set("words ", "may ", "count ") 14 | 15 | val actorName = "my-actor" 16 | 17 | } 18 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/util/CountingIterator.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.util 2 | 3 | /** Counts elements fetched form the underlying iterator. 
*/ 4 | class CountingIterator[T](iterator: Iterator[T]) extends Iterator[T] { 5 | private var _count = 0 6 | 7 | /** Returns the number of successful invocations of `next` */ 8 | def count = _count 9 | 10 | def hasNext = iterator.hasNext 11 | 12 | def next() = { 13 | val item = iterator.next() 14 | _count += 1 15 | item 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/writer/DefaultRowWriterTest.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.writer 2 | 3 | import com.datastax.spark.connector.cql.TableDef 4 | import org.apache.commons.lang3.SerializationUtils 5 | import org.junit.Test 6 | 7 | class DefaultRowWriterTest { 8 | 9 | @Test 10 | def testSerializability() { 11 | val table = TableDef("test", "table", Nil, Nil, Nil) 12 | val rowWriter = new DefaultRowWriter[DefaultRowWriterTest](table, Nil) 13 | SerializationUtils.roundtrip(rowWriter) 14 | } 15 | 16 | } 17 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/partitioner/TokenRangeSplitter.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd.partitioner 2 | 3 | import com.datastax.spark.connector.rdd.partitioner.dht.{Token, TokenRange} 4 | 5 | /** Splits a token range into smaller sub-ranges, 6 | * each with the desired approximate number of rows. */ 7 | trait TokenRangeSplitter[V, T <: Token[V]] { 8 | 9 | /** Splits given token range into n equal sub-ranges. */ 10 | def split(range: TokenRange[V, T], splitSize: Long): Seq[TokenRange[V, T]] 11 | } 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/streaming/package.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector 2 | 3 | import org.apache.spark.streaming.StreamingContext 4 | import org.apache.spark.streaming.dstream.DStream 5 | 6 | import scala.reflect.ClassTag 7 | 8 | package object streaming { 9 | 10 | implicit def toStreamingContextFunctions(ssc: StreamingContext): SparkContextFunctions = 11 | new StreamingContextFunctions(ssc) 12 | 13 | implicit def toDStreamFunctions[T: ClassTag](ds: DStream[T]): DStreamFunctions[T] = 14 | new DStreamFunctions[T](ds) 15 | 16 | } 17 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/ColumnSelector.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector 2 | 3 | import scala.language.implicitConversions 4 | 5 | sealed trait ColumnSelector 6 | case object AllColumns extends ColumnSelector 7 | case class SomeColumns(columns: NamedColumnRef*) extends ColumnSelector 8 | 9 | object SomeColumns { 10 | @deprecated("Use com.datastax.spark.connector.rdd.SomeColumns instead of Seq", "1.0") 11 | implicit def seqToSomeColumns(columns: Seq[String]): SomeColumns = 12 | SomeColumns(columns.map(x => x: NamedColumnRef): _*) 13 | } 14 | 15 | 16 | -------------------------------------------------------------------------------- 
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/BatchSize.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector 2 | 3 | import com.datastax.spark.connector.writer.WriteConf 4 | 5 | sealed trait BatchSize 6 | 7 | case class RowsInBatch(batchSize: Int) extends BatchSize 8 | case class BytesInBatch(batchSize: Int) extends BatchSize 9 | 10 | object BatchSize { 11 | @deprecated("Use com.datastax.spark.connector.FixedBatchSize instead of a number", "1.1") 12 | implicit def intToFixedBatchSize(batchSize: Int): RowsInBatch = RowsInBatch(batchSize) 13 | 14 | val Automatic = BytesInBatch(WriteConf.DefaultBatchSizeInBytes) 15 | } 16 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/partitioner/dht/Token.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd.partitioner.dht 2 | 3 | trait Token[T] extends Ordered[Token[T]] { 4 | def value: T 5 | } 6 | 7 | case class LongToken(value: Long) extends Token[Long] { 8 | override def compare(that: Token[Long]) = value.compareTo(that.value) 9 | override def toString = value.toString 10 | } 11 | 12 | case class BigIntToken(value: BigInt) extends Token[BigInt] { 13 | override def compare(that: Token[BigInt]) = value.compare(that.value) 14 | override def toString = value.toString() 15 | } 16 | 17 | 18 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/types/TimestampFormatter.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.types 2 | 3 | import java.util.Date 4 | 5 | import org.apache.cassandra.serializers.TimestampSerializer 6 | import org.joda.time.DateTime 7 | import org.joda.time.format.DateTimeFormat 8 | 9 | /** Formats timestamps and dates using CQL timestamp format `yyyy-MM-dd HH:mm:ssZ` */ 10 | object TimestampFormatter { 11 | 12 | private val TimestampPattern = "yyyy-MM-dd HH:mm:ssZ" 13 | 14 | def format(date: Date): String = 15 | DateTimeFormat.forPattern(TimestampPattern).print(new DateTime(date.getTime)) 16 | } 17 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/rdd/reader/ClassBasedRowReaderTest.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd.reader 2 | 3 | import com.datastax.spark.connector.cql.TableDef 4 | import org.apache.commons.lang3.SerializationUtils 5 | import org.junit.Test 6 | 7 | case class TestClass(a: String, b: Int, c: Option[Long]) 8 | 9 | class ClassBasedRowReaderTest { 10 | 11 | private val tableDef = TableDef("test", "table", Nil, Nil, Nil) 12 | 13 | @Test 14 | def testSerialize() { 15 | val reader = new ClassBasedRowReader[TestClass](tableDef) 16 | SerializationUtils.roundtrip(reader) 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /spark-cassandra-connector-embedded/src/main/scala/com/datastax/spark/connector/embedded/Event.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.embedded 2 | 3 | import akka.actor.ActorRef 4 | 5 | object Event { 6 | 7 | 
sealed trait Status extends Serializable 8 | 9 | case class ReceiverStarted(ref: ActorRef) extends Status 10 | 11 | case class Pushed(data: AnyRef) extends Status 12 | 13 | case object Completed extends Status 14 | 15 | case object Report extends Status 16 | 17 | sealed trait Task extends Serializable 18 | case object QueryTask extends Task 19 | 20 | case class WordCount(word: String, count: Int) extends Serializable 21 | 22 | } 23 | -------------------------------------------------------------------------------- /spark-cassandra-connector-embedded/src/main/scala/com/datastax/spark/connector/embedded/package.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector 2 | 3 | import java.net.InetAddress 4 | 5 | import scala.concurrent.duration.FiniteDuration 6 | 7 | package object embedded { 8 | 9 | implicit val ZookeeperConnectionString = s"${InetAddress.getLocalHost.getHostAddress}:2181" 10 | 11 | /* Factor by which to scale timeouts during tests, e.g. to account for shared build system load. */ 12 | implicit class SparkTestDuration(val duration: FiniteDuration) extends AnyVal { 13 | def dilated: FiniteDuration = (duration * 1.0).asInstanceOf[FiniteDuration] 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /spark-cassandra-connector-java/src/main/scala/com/datastax/spark/connector/japi/types/JavaTypeConverter.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.japi.types 2 | 3 | import akka.japi.JavaPartialFunction 4 | import com.datastax.spark.connector.types.NullableTypeConverter 5 | 6 | import scala.reflect.runtime.universe._ 7 | 8 | class JavaTypeConverter[T <: AnyRef](typeTag: TypeTag[T], convertFunction: JavaPartialFunction[Any, T]) 9 | extends NullableTypeConverter[T] { 10 | 11 | override def targetTypeTag: TypeTag[T] = typeTag 12 | 13 | override def convertPF: PartialFunction[Any, T] = convertFunction 14 | 15 | def noMatch() = JavaPartialFunction.noMatch() 16 | } 17 | -------------------------------------------------------------------------------- /spark-cassandra-connector-demos/simple-demos/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | #################################### 2 | # Streaming Demo Reference Config File # 3 | #################################### 4 | 5 | streaming-demo { 6 | 7 | # spark://127.0.0.1@7077,127.0.0.2@7077,127.0.0.3@7077 8 | # or a local spark://host@7077 9 | # This defaults to local 10 | spark.master = "local[12]" 11 | # Would normally be `ms` in config but Spark just wants the Long 12 | spark.streaming.batch.duration = 300 13 | spark.cleaner.ttl = 3600 14 | spark.cassandra.connection.host = "127.0.0.1" 15 | 16 | spark.cassandra.keyspace = "streaming_demo" 17 | spark.cassandra.table = "words" 18 | data = ["words ", "may ", "count "] 19 | } -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/ValidRDDType.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd 2 | 3 | import java.io.{Serializable => JavaSerializable} 4 | 5 | import com.datastax.spark.connector.types.TypeConverter 6 | 7 | import scala.annotation.implicitNotFound 8 | 9 | @implicitNotFound("Not a valid RDD type. 
There should exists either a type converter for the type or the type should implement Serializable") 10 | trait ValidRDDType[T] 11 | 12 | object ValidRDDType { 13 | implicit def withTypeConverterAsValidRDDType[T](implicit tc: TypeConverter[T]): ValidRDDType[T] = null 14 | 15 | implicit def javaSerializableAsValidRDDType[T <: JavaSerializable]: ValidRDDType[T] = null 16 | } 17 | -------------------------------------------------------------------------------- /spark-cassandra-connector-embedded/src/main/scala/com/datastax/spark/connector/embedded/SparkTemplate.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.embedded 2 | 3 | import org.apache.spark.{SparkEnv, SparkConf, SparkContext} 4 | 5 | trait SparkTemplate { 6 | val conf = SparkTemplate.conf 7 | val sc = SparkTemplate.sc 8 | } 9 | 10 | object SparkTemplate { 11 | 12 | val conf = new SparkConf(true) 13 | .set("spark.cassandra.connection.host", EmbeddedCassandra.cassandraHost.getHostAddress) 14 | .set("spark.cleaner.ttl", "3600") 15 | .setMaster(sys.env.getOrElse("IT_TEST_SPARK_MASTER", "local[*]")) 16 | .setAppName(getClass.getSimpleName) 17 | 18 | 19 | val sc = new SparkContext(conf) 20 | 21 | lazy val actorSystem = SparkEnv.get.actorSystem 22 | 23 | } 24 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/mapper/ColumnMap.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.mapper 2 | 3 | import com.datastax.spark.connector.ColumnRef 4 | 5 | /** Associates constructor parameters and property accessors with table columns */ 6 | trait ColumnMap extends Serializable { 7 | def constructor: Seq[ColumnRef] 8 | 9 | def getters: Map[String, ColumnRef] 10 | 11 | def setters: Map[String, ColumnRef] 12 | 13 | def allowsNull: Boolean 14 | } 15 | 16 | case class SimpleColumnMap(constructor: Seq[ColumnRef], 17 | getters: Map[String, ColumnRef], 18 | setters: Map[String, ColumnRef], 19 | allowsNull: Boolean = false) extends ColumnMap 20 | 21 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/org/apache/spark/sql/cassandra/api/java/JavaCassandraSQLContext.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.cassandra.api.java 2 | 3 | import org.apache.spark.api.java.JavaSparkContext 4 | import org.apache.spark.sql.api.java.{JavaSQLContext, JavaSchemaRDD} 5 | import org.apache.spark.sql.cassandra.CassandraSQLContext 6 | 7 | class JavaCassandraSQLContext(sparkContext: JavaSparkContext) extends JavaSQLContext(sparkContext) { 8 | 9 | override val sqlContext = new CassandraSQLContext(sparkContext) 10 | 11 | /** 12 | * Executes a query expressed in SQL, returning the result as a JavaSchemaRDD. 
13 | */ 14 | def cql(cqlQuery: String): JavaSchemaRDD = 15 | new JavaSchemaRDD(sqlContext, sqlContext.parseSql(cqlQuery)) 16 | } 17 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/testkit/SparkCassandraFixture.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.testkit 2 | 3 | import org.scalatest.{BeforeAndAfter, Matchers, WordSpecLike} 4 | import com.datastax.spark.connector.cql.CassandraConnector 5 | import com.datastax.spark.connector.embedded.EmbeddedCassandra 6 | 7 | /** Basic unit test abstraction. */ 8 | trait AbstractSpec extends WordSpecLike with Matchers with BeforeAndAfter 9 | 10 | /** Used for IT tests. */ 11 | trait SharedEmbeddedCassandra extends EmbeddedCassandra { 12 | 13 | def clearCache(): Unit = CassandraConnector.evictCache() 14 | 15 | } 16 | 17 | private[connector] object TestEvent { 18 | 19 | case object Stop 20 | 21 | case object Completed 22 | 23 | case class WordCount(word: String, count: Int) 24 | 25 | } 26 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/CqlWhereClause.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd 2 | 3 | /** Represents a logical conjunction of CQL predicates. 4 | * Each predicate can have placeholders denoted by '?' which get substituted by values from the `values` array. 5 | * The number of placeholders must match the size of the `values` array. */ 6 | case class CqlWhereClause(predicates: Seq[String], values: Seq[Any]) { 7 | 8 | /** Returns a conjunction of this clause and the given predicate. */ 9 | def and(other: CqlWhereClause) = 10 | CqlWhereClause(predicates ++ other.predicates, values ++ other.values) 11 | 12 | } 13 | 14 | object CqlWhereClause { 15 | 16 | /** Empty CQL WHERE clause selects all rows */ 17 | val empty = new CqlWhereClause(Nil, Nil) 18 | } 19 | 20 | 21 | -------------------------------------------------------------------------------- /spark-cassandra-connector-java/src/main/java/com/datastax/spark/connector/japi/StreamingContextJavaFunctions.java: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.japi; 2 | 3 | import org.apache.spark.streaming.StreamingContext; 4 | 5 | /** 6 | * Java API wrapper over {@link org.apache.spark.streaming.StreamingContext} to provide Spark Cassandra Connector 7 | * functionality. 8 | * 9 | *
To obtain an instance of this wrapper, use one of the factory methods in {@link 10 | * com.datastax.spark.connector.japi.CassandraJavaUtil} class.
11 | */ 12 | @SuppressWarnings("UnusedDeclaration") 13 | public class StreamingContextJavaFunctions extends SparkContextJavaFunctions { 14 | public final StreamingContext ssc; 15 | 16 | StreamingContextJavaFunctions(StreamingContext ssc) { 17 | super(ssc.sparkContext()); 18 | this.ssc = ssc; 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /spark-cassandra-connector-demos/simple-demos/src/main/scala/com/datastax/spark/connector/demo/DemoApp.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.demo 2 | 3 | import com.datastax.spark.connector.util.Logging 4 | import org.apache.spark.{SparkContext, SparkConf} 5 | 6 | trait DemoApp extends App with Logging { 7 | 8 | val words = "./spark-cassandra-connector-demos/simple-demos/src/main/resources/data/words" 9 | 10 | val SparkMasterHost = "127.0.0.1" 11 | 12 | val CassandraHost = "127.0.0.1" 13 | 14 | // Tell Spark the address of one Cassandra node: 15 | val conf = new SparkConf(true) 16 | .set("spark.cassandra.connection.host", CassandraHost) 17 | .set("spark.cleaner.ttl", "3600") 18 | .setMaster("local[12]") 19 | .setAppName(getClass.getSimpleName) 20 | 21 | // Connect to the Spark cluster: 22 | lazy val sc = new SparkContext(conf) 23 | } 24 | 25 | object DemoApp { 26 | def apply(): DemoApp = new DemoApp {} 27 | } 28 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/java/com/datastax/spark/connector/SampleJavaBeanWithoutNoArgsCtor.java: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector; 2 | 3 | import java.io.Serializable; 4 | 5 | /** 6 | * This is a sample JavaBean style class. In order to test JavaAPI correctly, we cannot implement this in Scala because 7 | * Scala adds some additional accessors and mutators. 
8 | */ 9 | public class SampleJavaBeanWithoutNoArgsCtor implements Serializable { 10 | private Integer key; 11 | private String value; 12 | 13 | private SampleJavaBeanWithoutNoArgsCtor(Integer key, String value) { 14 | this.key = key; 15 | this.value = value; 16 | } 17 | 18 | public Integer getKey() { 19 | return key; 20 | } 21 | 22 | public void setKey(Integer key) { 23 | this.key = key; 24 | } 25 | 26 | public String getValue() { 27 | return value; 28 | } 29 | 30 | public void setValue(String value) { 31 | this.value = value; 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/samples.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector { 2 | 3 | case class SampleScalaCaseClass(key: Int, value: String) 4 | 5 | class SampleScalaClass(val key: Int, val value: String) extends Serializable 6 | 7 | class SampleScalaClassWithNoFields(key: Int, value: String) extends Serializable 8 | 9 | class SampleScalaClassWithMultipleCtors(var key: Int, var value: String) extends Serializable { 10 | def this(key: Int) = this(key, null) 11 | 12 | def this() = this(0, null) 13 | } 14 | 15 | class SampleWithNestedScalaCaseClass extends Serializable { 16 | 17 | case class InnerClass(key: Int, value: String) 18 | 19 | } 20 | 21 | class SampleWithDeeplyNestedScalaCaseClass extends Serializable { 22 | 23 | class IntermediateClass extends Serializable { 24 | 25 | case class InnerClass(key: Int, value: String) 26 | 27 | } 28 | 29 | } 30 | 31 | object SampleObject { 32 | 33 | case class ClassInObject(key: Int, value: String) 34 | 35 | } 36 | 37 | } -------------------------------------------------------------------------------- /spark-cassandra-connector-demos/simple-demos/src/main/scala/com/datastax/spark/connector/demo/SparkCassandraSettings.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.demo 2 | 3 | import com.typesafe.config.{Config, ConfigFactory} 4 | 5 | /* Initializes Akka, Cassandra and Spark settings. 
*/ 6 | final class SparkCassandraSettings(rootConfig: Config) { 7 | def this() = this(ConfigFactory.load) 8 | 9 | protected val config = rootConfig.getConfig("streaming-demo") 10 | 11 | val SparkMaster: String = config.getString("spark.master") 12 | 13 | val SparkCleanerTtl: Int = config.getInt("spark.cleaner.ttl") 14 | 15 | val SparkStreamingBatchDuration: Long = config.getLong("spark.streaming.batch.duration") 16 | 17 | val Data = akka.japi.Util.immutableSeq(config.getStringList("data")).toSet 18 | 19 | val CassandraSeed: String = config.getString("spark.cassandra.connection.host") 20 | 21 | val CassandraKeyspace = config.getString("spark.cassandra.keyspace") 22 | 23 | val CassandraTable = config.getString("spark.cassandra.table") 24 | } -------------------------------------------------------------------------------- /spark-cassandra-connector-demos/simple-demos/src/main/scala/com/datastax/spark/connector/demo/WordCountDemo.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.demo 2 | 3 | import org.apache.spark.SparkContext._ 4 | import com.datastax.spark.connector.cql.CassandraConnector 5 | import com.datastax.spark.connector._ 6 | 7 | object WordCountDemo extends DemoApp { 8 | 9 | CassandraConnector(conf).withSessionDo { session => 10 | session.execute(s"CREATE KEYSPACE IF NOT EXISTS demo WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 }") 11 | session.execute(s"CREATE TABLE IF NOT EXISTS demo.wordcount (word TEXT PRIMARY KEY, count COUNTER)") 12 | session.execute(s"TRUNCATE demo.wordcount") 13 | } 14 | 15 | sc.textFile(words) 16 | .flatMap(_.split("\\s+")) 17 | .map(word => (word.toLowerCase, 1)) 18 | .reduceByKey(_ + _) 19 | .saveToCassandra("demo", "wordcount") 20 | 21 | // print out the data saved from Spark to Cassandra 22 | sc.cassandraTable("demo", "wordcount").collect.foreach(println) 23 | sc.stop() 24 | } 25 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/java/com/datastax/spark/connector/SampleJavaBean.java: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector; 2 | 3 | import java.io.Serializable; 4 | 5 | /** 6 | * This is a sample JavaBean style class. In order to test JavaAPI correctly, we cannot implement this in Scala because 7 | * Scala adds some additional accessors and mutators. 
8 | */ 9 | public class SampleJavaBean implements Serializable { 10 | private Integer key; 11 | private String value; 12 | 13 | public static SampleJavaBean newInstance(Integer key, String value) { 14 | SampleJavaBean bean = new SampleJavaBean(); 15 | bean.setKey(key); 16 | bean.setValue(value); 17 | return bean; 18 | } 19 | 20 | public Integer getKey() { 21 | return key; 22 | } 23 | 24 | public void setKey(Integer key) { 25 | this.key = key; 26 | } 27 | 28 | public String getValue() { 29 | return value; 30 | } 31 | 32 | public void setValue(String value) { 33 | this.value = value; 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/RowWriter.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.writer 2 | 3 | import com.datastax.driver.core.{ProtocolVersion, BoundStatement, PreparedStatement} 4 | 5 | /** `RowWriter` knows how to write an object to Cassandra using the Java Cassandra driver. 6 | * */ 7 | trait RowWriter[T] extends Serializable { 8 | 9 | /** Extracts column values from `data` object and binds them to the given statement. 10 | * Variables of the prepared statement are named the same as column names to be saved. 11 | * This method must not rely on any particular order of variables.*/ 12 | def bind(data: T, stmt: PreparedStatement, protocolVersion: ProtocolVersion): BoundStatement 13 | 14 | /** Estimates serialized size in bytes of a data object. 15 | * Used for grouping statements into batches. */ 16 | def estimateSizeInBytes(data: T): Int 17 | 18 | /** List of columns this `RowWriter` is going to write. 19 | * Used to construct appropriate INSERT or UPDATE statement. */ 20 | def columnNames: Seq[String] 21 | 22 | } 23 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/reader/PrefetchingResultSetIterator.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd.reader 2 | 3 | import com.datastax.driver.core.{Row, ResultSet} 4 | 5 | /** Allows to efficiently iterate over a large, paged ResultSet, 6 | * asynchronously prefetching the next page. 
7 | * 8 | * @param resultSet result set obtained from the Java driver 9 | * @param prefetchWindowSize if there are less than this rows available without blocking, 10 | * initiates fetching the next page 11 | */ 12 | class PrefetchingResultSetIterator(resultSet: ResultSet, prefetchWindowSize: Int) extends Iterator[Row] { 13 | 14 | private[this] val iterator = resultSet.iterator() 15 | 16 | override def hasNext = iterator.hasNext 17 | 18 | private[this] def maybePrefetch(): Unit = { 19 | if (!resultSet.isFullyFetched && resultSet.getAvailableWithoutFetching < prefetchWindowSize) 20 | resultSet.fetchMoreResults() 21 | } 22 | 23 | override def next() = { 24 | maybePrefetch() 25 | iterator.next() 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/GenericRowWriter.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.writer 2 | 3 | import com.datastax.spark.connector.CassandraRow 4 | import com.datastax.spark.connector.cql.TableDef 5 | 6 | /** A [[RowWriter]] that can write [[CassandraRow]] objects.*/ 7 | class GenericRowWriter(table: TableDef, selectedColumns: Seq[String]) 8 | extends AbstractRowWriter[CassandraRow](table: TableDef, selectedColumns: Seq[String]) { 9 | 10 | override protected def getColumnValue(data: CassandraRow, columnName: String): AnyRef = { 11 | val index = data.indexOf(columnName) 12 | if (index >= 0) { 13 | val converter = table.columnByName(columnName).columnType.converterToCassandra 14 | val value = data.getRaw(index) 15 | converter.convert(value) 16 | } 17 | else 18 | null 19 | } 20 | } 21 | 22 | 23 | object GenericRowWriter { 24 | 25 | object Factory extends RowWriterFactory[CassandraRow] { 26 | override def rowWriter(table: TableDef, columnNames: Seq[String]) = 27 | new GenericRowWriter(table, columnNames) 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/mapper/TupleColumnMapper.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.mapper 2 | 3 | import com.datastax.spark.connector.{ColumnRef, ColumnIndex} 4 | import com.datastax.spark.connector.cql.TableDef 5 | 6 | import scala.reflect.ClassTag 7 | 8 | class TupleColumnMapper[T <: Product : ClassTag] extends ColumnMapper[T] { 9 | 10 | override def classTag: ClassTag[T] = implicitly[ClassTag[T]] 11 | 12 | private def indexedColumnRefs(n: Int) = 13 | (0 until n).map(ColumnIndex) 14 | 15 | override def columnMap(tableDef: TableDef): ColumnMap = { 16 | 17 | val GetterRegex = "_([0-9]+)".r 18 | val cls = implicitly[ClassTag[T]].runtimeClass 19 | 20 | val constructor = 21 | indexedColumnRefs(cls.getConstructors()(0).getParameterTypes.length) 22 | 23 | val getters = { 24 | for (name@GetterRegex(id) <- cls.getMethods.map(_.getName)) 25 | yield (name, ColumnIndex(id.toInt - 1)) 26 | }.toMap 27 | 28 | val setters = 29 | Map.empty[String, ColumnRef] 30 | 31 | SimpleColumnMap(constructor, getters, setters) 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/SqlRowWriter.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.writer 
2 | 3 | import com.datastax.spark.connector.cql.TableDef 4 | import org.apache.spark.sql.catalyst.expressions.Row 5 | 6 | /** A [[RowWriter]] that can write [[Row]] objects.*/ 7 | class SqlRowWriter(table: TableDef, selectedColumns: Seq[String]) extends AbstractRowWriter[Row](table: TableDef, selectedColumns: Seq[String]) { 8 | 9 | override protected def getColumnValue(data: Row, columnName: String): AnyRef = { 10 | val index = columnNames.indexOf(columnName) 11 | if (index >= 0 && index < data.size) { 12 | val converter = table.columnByName(columnName).columnType.converterToCassandra 13 | val value = data.apply(index) 14 | if (value == null) null else converter.convert(value).asInstanceOf[AnyRef] 15 | } 16 | else 17 | null 18 | } 19 | } 20 | 21 | 22 | object SqlRowWriter { 23 | 24 | object Factory extends RowWriterFactory[Row] { 25 | override def rowWriter(table: TableDef, columnNames: Seq[String]) = 26 | new SqlRowWriter(table, columnNames) 27 | } 28 | 29 | } -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/streaming/CassandraStreamingRDD.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.streaming 2 | 3 | import com.datastax.spark.connector.cql.CassandraConnector 4 | import com.datastax.spark.connector.{ColumnSelector, AllColumns} 5 | 6 | import scala.reflect.ClassTag 7 | import org.apache.spark.streaming.StreamingContext 8 | import com.datastax.spark.connector.rdd.{ReadConf, CassandraRDD, CqlWhereClause} 9 | import com.datastax.spark.connector.rdd.reader._ 10 | 11 | /** RDD representing a Cassandra table for Spark Streaming. 12 | * @see [[com.datastax.spark.connector.rdd.CassandraRDD]] */ 13 | class CassandraStreamingRDD[R] private[connector] ( 14 | sctx: StreamingContext, 15 | connector: CassandraConnector, 16 | keyspace: String, 17 | table: String, 18 | columns: ColumnSelector = AllColumns, 19 | where: CqlWhereClause = CqlWhereClause.empty, 20 | readConf: ReadConf = ReadConf())( 21 | implicit 22 | ct : ClassTag[R], 23 | @transient rrf: RowReaderFactory[R]) 24 | extends CassandraRDD[R](sctx.sparkContext, connector, keyspace, table, columns, where, readConf) 25 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/java/com/datastax/spark/connector/SampleJavaBeanWithMultipleCtors.java: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector; 2 | 3 | import java.io.Serializable; 4 | 5 | /** 6 | * This is a sample JavaBean style class. In order to test JavaAPI correctly, we cannot implement this in Scala because 7 | * Scala adds some additional accessors and mutators. 
8 | */ 9 | public class SampleJavaBeanWithMultipleCtors implements Serializable { 10 | private Integer key; 11 | private String value; 12 | 13 | public SampleJavaBeanWithMultipleCtors(Integer key) { 14 | this.key = key; 15 | } 16 | 17 | public SampleJavaBeanWithMultipleCtors() { 18 | } 19 | 20 | public SampleJavaBeanWithMultipleCtors(Integer key, String value) { 21 | this.key = key; 22 | this.value = value; 23 | } 24 | 25 | public Integer getKey() { 26 | return key; 27 | } 28 | 29 | public void setKey(Integer key) { 30 | this.key = key; 31 | } 32 | 33 | public String getValue() { 34 | return value; 35 | } 36 | 37 | public void setValue(String value) { 38 | this.value = value; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/streaming/TestProducer.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.streaming 2 | 3 | import scala.concurrent.duration._ 4 | import akka.actor.{PoisonPill, Actor, ActorRef} 5 | 6 | class TestProducer(data: Array[String], to: ActorRef) extends Counter { 7 | import scala.util.Random 8 | import context.dispatcher 9 | 10 | val rand = new Random() 11 | 12 | val task = context.system.scheduler.schedule(2.second, 1.millis) { 13 | if (count < scale) { // we need this test to avoid generating more than 'scale' messages 14 | to ! makeMessage() 15 | increment() 16 | } 17 | } 18 | 19 | def receive: Actor.Receive = { 20 | case _ => 21 | } 22 | 23 | def makeMessage(): String = { 24 | val x = rand.nextInt(3) 25 | data(x) + data(2 - x) 26 | } 27 | } 28 | 29 | trait CounterFixture { 30 | val scale = 30 31 | } 32 | 33 | // CountDownLatch is not Serializable, can't use in stream so we do this. 34 | trait Counter extends Actor with CounterFixture { 35 | 36 | var count = 0 37 | 38 | def increment(): Unit = { 39 | count += 1 40 | if (count == scale) self ! 
PoisonPill 41 | } 42 | } -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/types/CanBuildFromTest.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.types 2 | 3 | import org.apache.commons.lang3.SerializationUtils 4 | import org.junit.Assert._ 5 | import org.junit.Test 6 | 7 | class CanBuildFromTest { 8 | 9 | @Test 10 | def testBuild() { 11 | val bf = CanBuildFrom.setCanBuildFrom[Int] 12 | val builder = bf.apply() 13 | builder += 1 14 | builder += 2 15 | builder += 3 16 | assertEquals(Set(1,2,3), builder.result()) 17 | } 18 | 19 | @Test 20 | def testSerializeAndBuild() { 21 | val bf = CanBuildFrom.setCanBuildFrom[Int] 22 | val bf2 = SerializationUtils.roundtrip(bf) 23 | val builder = bf2.apply() 24 | builder += 1 25 | builder += 2 26 | builder += 3 27 | assertEquals(Set(1,2,3), builder.result()) 28 | } 29 | 30 | @Test 31 | def testSerializeAndBuildWithOrdering() { 32 | val bf = CanBuildFrom.treeSetCanBuildFrom[Int] 33 | val bf2 = SerializationUtils.roundtrip(bf) 34 | val builder = bf2.apply() 35 | builder += 1 36 | builder += 2 37 | builder += 3 38 | assertEquals(Set(1,2,3), builder.result()) 39 | } 40 | 41 | 42 | } 43 | -------------------------------------------------------------------------------- /spark-cassandra-connector-demos/twitter-streaming/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | #################################### 2 | # Spark Cassandra Connector (Twitter Demo App) Config File # 3 | #################################### 4 | 5 | # This is the reference config file that contains all the default settings. 6 | streaming-app { 7 | 8 | # These can be search terms to filter for, or hashtags 9 | # ["android", "iphone"] 10 | filters = ["#android", "#iphone"] 11 | 12 | spark { 13 | # The fallback Spark master, it auto-detection fails. 14 | # Can change to spark://127.0.0.1:7077 for example. 15 | master = "local[*]" 16 | 17 | # In seconds: Not using hcon 5s format until Spark 18 | # Upgrades their akka and thus config versions (to avoid a deprecation issue). 19 | streaming.batch.interval = 5 20 | 21 | # The default 22 | executor.memory = 2g 23 | cores.max = 2 24 | 25 | jars = [ 26 | "./spark-cassandra-connector-demos/twitter-streaming/target/scala-2.10/twitter-streaming-assembly-1.1.0-SNAPSHOT.jar" 27 | ] 28 | 29 | cassandra { 30 | connection.host = ["127.0.0.1"] 31 | keyspace = "twitter_stream" 32 | table = "hashtags_by_interval" 33 | } 34 | } 35 | } -------------------------------------------------------------------------------- /spark-cassandra-connector/src/it/scala/com/datastax/spark/connector/cql/CassandraAuthenticatedConnectorSpec.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.cql 2 | 3 | import com.datastax.spark.connector.testkit.SharedEmbeddedCassandra 4 | import org.scalatest.{Matchers, FlatSpec} 5 | 6 | class CassandraAuthenticatedConnectorSpec extends FlatSpec with Matchers with SharedEmbeddedCassandra { 7 | 8 | useCassandraConfig("cassandra-password-auth.yaml" + 9 | ".template") 10 | val conn = CassandraConnector(Set(cassandraHost), authConf = PasswordAuthConf("cassandra", "cassandra")) 11 | 12 | // Wait for the default user to be created in Cassandra. 
13 | Thread.sleep(1000) 14 | 15 | "A CassandraConnector" should "authenticate with username and password when using native protocol" in { 16 | conn.withSessionDo { session => 17 | assert(session !== null) 18 | assert(session.isClosed === false) 19 | assert(session.getCluster.getMetadata.getClusterName === "Test Cluster") 20 | } 21 | } 22 | 23 | it should "authenticate with username and password when using thrift" in { 24 | conn.withCassandraClientDo { client => 25 | assert(client.describe_cluster_name() === "Test Cluster") 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/partitioner/dht/TokenRange.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd.partitioner.dht 2 | 3 | import java.net.InetAddress 4 | 5 | 6 | case class CassandraNode(rpcAddress: InetAddress, localAddress: InetAddress) { 7 | require(rpcAddress != InetAddress.getByName("0.0.0.0"), "rpcAddress must not be 0.0.0.0") 8 | require(localAddress != InetAddress.getByName("0.0.0.0"), "localAddress must not be 0.0.0.0") 9 | def allAddresses = Set(rpcAddress, localAddress) 10 | } 11 | 12 | object CassandraNode { 13 | implicit def ordering: Ordering[CassandraNode] = Ordering.by(_.rpcAddress.toString) 14 | } 15 | 16 | case class TokenRange[V, T <: Token[V]] ( 17 | start: T, end: T, endpoints: Set[CassandraNode], rowCount: Option[Long]) { 18 | 19 | def isWrapAround: Boolean = 20 | start >= end 21 | 22 | def unwrap(implicit tokenFactory: TokenFactory[V, T]): Seq[TokenRange[V, T]] = { 23 | val minToken = tokenFactory.minToken 24 | if (isWrapAround) 25 | Seq( 26 | TokenRange(start, minToken, endpoints, rowCount.map(_ / 2)), 27 | TokenRange(minToken, end, endpoints, rowCount.map(_ / 2))) 28 | else 29 | Seq(this) 30 | } 31 | } -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/java/com/datastax/spark/connector/SampleWithNestedJavaBean.java: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector; 2 | 3 | import java.io.Serializable; 4 | 5 | /** 6 | * This is a sample JavaBean style class. In order to test JavaAPI correctly, we cannot implement this in Scala because 7 | * Scala adds some additional accessors and mutators. 
8 | */ 9 | public class SampleWithNestedJavaBean implements Serializable { 10 | public class InnerClass implements Serializable { 11 | private Integer key; 12 | private String value; 13 | 14 | public InnerClass(Integer key) { 15 | this.key = key; 16 | } 17 | 18 | public InnerClass() { 19 | } 20 | 21 | public InnerClass(Integer key, String value) { 22 | this.key = key; 23 | this.value = value; 24 | } 25 | 26 | public Integer getKey() { 27 | return key; 28 | } 29 | 30 | public void setKey(Integer key) { 31 | this.key = key; 32 | } 33 | 34 | public String getValue() { 35 | return value; 36 | } 37 | 38 | public void setValue(String value) { 39 | this.value = value; 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/writer/PropertyExtractorTest.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.writer 2 | 3 | import org.junit.Assert._ 4 | import org.junit.Test 5 | 6 | class PropertyExtractorTest { 7 | 8 | class TestClass(val field1: String, val field2: Int) 9 | 10 | @Test 11 | def testSimpleExtraction() { 12 | val testObject = new TestClass("a", 1) 13 | val propertyExtractor = new PropertyExtractor(classOf[TestClass], Seq("field1", "field2")) 14 | val result = propertyExtractor.extract(testObject) 15 | assertEquals(2, result.size) 16 | assertEquals("a", result(0)) 17 | assertEquals(1, result(1)) 18 | } 19 | 20 | @Test 21 | def testAvailableProperties() { 22 | val triedProperties = Seq("field1", "foo", "bar") 23 | val availableProperties = PropertyExtractor.availablePropertyNames(classOf[TestClass], triedProperties) 24 | assertEquals(Seq("field1"), availableProperties) 25 | } 26 | 27 | @Test(expected = classOf[NoSuchMethodException]) 28 | def testWrongPropertyName() { 29 | val testObject = new TestClass("a", 1) 30 | val propertyExtractor = new PropertyExtractor(classOf[TestClass], Seq("foo")) 31 | propertyExtractor.extract(testObject) 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/writer/ObjectSizeEstimatorTest.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.writer 2 | 3 | import java.nio.ByteBuffer 4 | import java.util.Date 5 | 6 | import org.junit.Assert._ 7 | import org.junit.Test 8 | 9 | class ObjectSizeEstimatorTest { 10 | 11 | @Test 12 | def testFunctionality() { 13 | val size0 = ObjectSizeEstimator.measureSerializedSize(Array(1)) 14 | val size1 = ObjectSizeEstimator.measureSerializedSize(Array(1, 2)) 15 | val size2 = ObjectSizeEstimator.measureSerializedSize(Array(1, 2, "abc", List("item1", "item2"), new Date())) 16 | assertTrue(size0 > 16) 17 | assertTrue(size1 > size0) 18 | assertTrue(size2 > size1) 19 | } 20 | 21 | @Test 22 | def testByteBuffers() { 23 | val buffer = ByteBuffer.allocate(100) 24 | val size0 = ObjectSizeEstimator.measureSerializedSize(Array(buffer)) 25 | val size1 = ObjectSizeEstimator.measureSerializedSize(Array(List(buffer))) 26 | val size2 = ObjectSizeEstimator.measureSerializedSize(Array(Set(buffer))) 27 | val size3 = ObjectSizeEstimator.measureSerializedSize(Array(Map(1 -> buffer))) 28 | assertTrue(size0 > 100) 29 | assertTrue(size1 > 100) 30 | assertTrue(size2 > 100) 31 | assertTrue(size3 > 100) 32 | } 33 | } 34 | 
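The next file, RDDFunctions.scala, backs the saveToCassandra call used in WordCountDemo.scala above. For orientation, a minimal Scala usage sketch (not a file from this tree) is shown here; it assumes the demo.wordcount table created by that demo and a SparkContext sc configured with spark.cassandra.connection.host as in DemoApp.scala:

import com.datastax.spark.connector._

// Write a pair RDD into demo.wordcount, mapping tuple elements to the named columns.
sc.parallelize(Seq(("cassandra", 3), ("spark", 5)))
  .saveToCassandra("demo", "wordcount", SomeColumns("word", "count"))

// Read the rows back as CassandraRow objects and print them.
sc.cassandraTable("demo", "wordcount").collect().foreach(println)

SomeColumns restricts the write to the listed columns; passing AllColumns (the default in RDDFunctions.saveToCassandra) writes every column of the table.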
-------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/RDDFunctions.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector 2 | 3 | import com.datastax.spark.connector.cql.CassandraConnector 4 | import com.datastax.spark.connector.writer._ 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.rdd.RDD 7 | 8 | /** Provides Cassandra-specific methods on `RDD` */ 9 | class RDDFunctions[T](rdd: RDD[T]) extends WritableToCassandra[T] with Serializable { 10 | 11 | override val sparkContext: SparkContext = rdd.sparkContext 12 | 13 | /** 14 | * Saves the data from `RDD` to a Cassandra table. Uses the specified column names. 15 | * @see [[com.datastax.spark.connector.writer.WritableToCassandra]] 16 | */ 17 | def saveToCassandra(keyspaceName: String, 18 | tableName: String, 19 | columns: ColumnSelector = AllColumns, 20 | writeConf: WriteConf = WriteConf.fromSparkConf(sparkContext.getConf)) 21 | (implicit connector: CassandraConnector = CassandraConnector(sparkContext.getConf), 22 | rwf: RowWriterFactory[T]): Unit = { 23 | val writer = TableWriter(connector, keyspaceName, tableName, columns, writeConf) 24 | rdd.sparkContext.runJob(rdd, writer.write _) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/types/TypeSerializationTest.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.types 2 | 3 | import org.apache.commons.lang3.SerializationUtils 4 | import org.junit.Assert._ 5 | import org.junit.Test 6 | 7 | class TypeSerializationTest { 8 | 9 | private def testSerialization(t: ColumnType[_]) { 10 | assertEquals(t, SerializationUtils.roundtrip(t)) 11 | } 12 | 13 | @Test 14 | def testSerializationOfPrimitiveTypes() { 15 | testSerialization(AsciiType) 16 | testSerialization(TextType) 17 | testSerialization(IntType) 18 | testSerialization(BigIntType) 19 | testSerialization(DoubleType) 20 | testSerialization(FloatType) 21 | testSerialization(BooleanType) 22 | testSerialization(UUIDType) 23 | testSerialization(TimeUUIDType) 24 | testSerialization(TimestampType) 25 | testSerialization(DecimalType) 26 | testSerialization(BigIntType) 27 | testSerialization(InetType) 28 | testSerialization(CounterType) 29 | } 30 | 31 | @Test 32 | def testSerializationOfCollectionTypes() { 33 | testSerialization(ListType(IntType)) 34 | testSerialization(ListType(ListType(IntType))) 35 | testSerialization(SetType(TextType)) 36 | testSerialization(MapType(BigIntType, TimestampType)) 37 | } 38 | 39 | 40 | } 41 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/PropertyExtractor.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.writer 2 | 3 | import java.lang.reflect.Method 4 | 5 | import scala.util.Try 6 | 7 | /** Extracts values from fields of an object. 
*/ 8 | class PropertyExtractor[T](val cls: Class[T], val propertyNames: Seq[String]) extends Serializable { 9 | 10 | private def getter(name: String) = 11 | cls.getMethod(name) 12 | 13 | @transient 14 | private lazy val methods: Array[Method] = 15 | propertyNames.map(getter).toArray 16 | 17 | @transient 18 | private lazy val methodByName = 19 | methods.map(m => (m.getName, m)).toMap 20 | 21 | def extract(obj: T): Array[AnyRef] = 22 | extract(obj, Array.ofDim(methods.length)) 23 | 24 | def extract(obj: T, target: Array[AnyRef]): Array[AnyRef] = { 25 | for (i <- 0 until methods.length) 26 | target(i) = methods(i).invoke(obj) 27 | target 28 | } 29 | 30 | def extractProperty(obj: T, propertyName: String): AnyRef = { 31 | val m = methodByName(propertyName) 32 | m.invoke(obj) 33 | } 34 | } 35 | 36 | object PropertyExtractor { 37 | 38 | def availablePropertyNames(cls: Class[_], requestedPropertyNames: Seq[String]): Seq[String] = 39 | requestedPropertyNames.filter(name => Try(cls.getMethod(name)).isSuccess) 40 | 41 | } 42 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/partitioner/CassandraRDDPartition.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd.partitioner 2 | 3 | import java.net.InetAddress 4 | 5 | import org.apache.spark.Partition 6 | 7 | /** Stores a CQL `WHERE` predicate matching a range of tokens. */ 8 | case class CqlTokenRange(cql: String, values: Any*) 9 | 10 | /** Metadata describing Cassandra table partition processed by a single Spark task. 11 | * Beware the term "partition" is overloaded. Here, in the context of Spark, 12 | * it means an arbitrary collection of rows that can be processed locally on a single Cassandra cluster node. 13 | * A `CassandraPartition` typically contains multiple CQL partitions, i.e. rows identified by different values of 14 | * the CQL partitioning key. 
15 | * 16 | * @param index identifier of the partition, used internally by Spark 17 | * @param endpoints which nodes the data partition is located on 18 | * @param tokenRanges token ranges determining the row set to be fetched 19 | * @param rowCount estimated total row count in a partition 20 | */ 21 | case class CassandraPartition(index: Int, 22 | endpoints: Iterable[InetAddress], 23 | tokenRanges: Iterable[CqlTokenRange], 24 | rowCount: Long) extends Partition 25 | 26 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/ConvertingPropertyExtractor.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.writer 2 | 3 | import com.datastax.spark.connector.types.TypeConverter 4 | 5 | /** Extracts property values from an object and additionally converts them to desired types */ 6 | class ConvertingPropertyExtractor[T](val cls: Class[T], properties: Seq[(String, TypeConverter[_])]) 7 | extends Serializable { 8 | 9 | val (propertyNames, propertyTypes) = properties.toArray.unzip 10 | val propertyTypeByName = properties.toMap 11 | 12 | private val simpleExtractor = 13 | new PropertyExtractor[T](cls, propertyNames) 14 | 15 | def extract(obj: T): Array[AnyRef] = 16 | convert(simpleExtractor.extract(obj)) 17 | 18 | 19 | def extract(obj: T, target: Array[AnyRef]): Array[AnyRef] = 20 | convert(simpleExtractor.extract(obj, target)) 21 | 22 | def extractProperty(obj: T, propertyName: String): AnyRef = { 23 | val propertyValue = simpleExtractor.extractProperty(obj, propertyName) 24 | val converter = propertyTypeByName(propertyName) 25 | converter.convert(propertyValue).asInstanceOf[AnyRef] 26 | } 27 | 28 | def convert(data: Array[AnyRef]): Array[AnyRef] = { 29 | for (i <- 0 until data.length) 30 | data(i) = propertyTypes(i).convert(data(i)).asInstanceOf[AnyRef] 31 | data 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/AbstractRowWriter.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.writer 2 | 3 | import com.datastax.driver.core.{ProtocolVersion, PreparedStatement} 4 | import com.datastax.spark.connector.cql.TableDef 5 | import org.apache.spark.sql.catalyst.expressions.Row 6 | 7 | /** A [[RowWriter]] that can write SparkSQL [[Row]] objects or [[com.datastax.spark.connector.CassandraRow]] objects .*/ 8 | abstract class AbstractRowWriter[T](table: TableDef, selectedColumns: Seq[String]) extends RowWriter[T] { 9 | 10 | override def columnNames = 11 | selectedColumns.toIndexedSeq 12 | 13 | protected def getColumnValue(data: T, columnName: String): AnyRef 14 | 15 | @transient 16 | protected lazy val buffer = new ThreadLocal[Array[AnyRef]] { 17 | override def initialValue() = Array.ofDim[AnyRef](columnNames.size) 18 | } 19 | 20 | protected def fillBuffer(data: T): Array[AnyRef] = { 21 | val buf = buffer.get 22 | for (i <- 0 until columnNames.size) 23 | buf(i) = getColumnValue(data, columnNames(i)) 24 | buf 25 | } 26 | 27 | override def bind(data: T, stmt: PreparedStatement, protocolVersion: ProtocolVersion) = { 28 | stmt.bind(fillBuffer(data): _*) 29 | } 30 | 31 | override def estimateSizeInBytes(data: T) = { 32 | ObjectSizeEstimator.measureSerializedSize(fillBuffer(data)) 33 | } 34 | } 35 | 
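`AbstractRowWriter` above leaves a single method, `getColumnValue`, for subclasses to supply and derives `bind` and `estimateSizeInBytes` from the shared per-thread column buffer. As a hedged illustration (assuming `RowWriter` declares no abstract members beyond those `AbstractRowWriter` already implements), a minimal subclass for rows represented as plain maps could look like this; the `MapRowWriter` name is hypothetical and such a writer is not part of the connector.

import com.datastax.spark.connector.cql.TableDef
import com.datastax.spark.connector.writer.AbstractRowWriter

// Hypothetical example only: writes rows supplied as Map(columnName -> value).
class MapRowWriter(table: TableDef, selectedColumns: Seq[String])
  extends AbstractRowWriter[Map[String, AnyRef]](table, selectedColumns) {

  // The single method a subclass must provide: the value of one column for one row,
  // or null when the map has no entry for that column.
  override protected def getColumnValue(data: Map[String, AnyRef], columnName: String): AnyRef =
    data.getOrElse(columnName, null)
}

A `RowWriterFactory` (shown later in this listing) would then be responsible for creating such a writer for the table and columns selected by the user.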
-------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/writer/ConvertingPropertyExtractorTest.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.writer 2 | 3 | import com.datastax.spark.connector.types.TypeConverter.{StringConverter, OptionToNullConverter, IntConverter} 4 | 5 | import org.junit.Assert._ 6 | import org.junit.Test 7 | 8 | class ConvertingPropertyExtractorTest { 9 | 10 | class TestClass(val field1: String, val field2: Option[Int]) 11 | 12 | private def createExtractor: ConvertingPropertyExtractor[TestClass] = { 13 | new ConvertingPropertyExtractor[TestClass]( 14 | classOf[TestClass], Seq( 15 | ("field1", IntConverter), 16 | ("field2", new OptionToNullConverter(StringConverter)))) 17 | } 18 | 19 | @Test 20 | def testExtraction() { 21 | val obj = new TestClass("123", Some(5)) 22 | val extractor = createExtractor 23 | val data = extractor.extract(obj) 24 | assertNotNull(data) 25 | assertEquals(2, data.length) 26 | assertEquals(123, data(0)) 27 | assertEquals("5", data(1)) 28 | } 29 | 30 | @Test 31 | def testExtractionNoAlloc() { 32 | val obj = new TestClass("123", Some(5)) 33 | val extractor = createExtractor 34 | val data = Array.ofDim[AnyRef](extractor.propertyNames.size) 35 | extractor.extract(obj, data) 36 | assertEquals(123, data(0)) 37 | assertEquals("5", data(1)) 38 | 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/cql/MultipleRetryPolicy.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.cql 2 | 3 | import com.datastax.driver.core.policies.RetryPolicy 4 | import com.datastax.driver.core.policies.RetryPolicy.RetryDecision 5 | import com.datastax.driver.core.{ConsistencyLevel, Statement, WriteType} 6 | 7 | /** Always retries with the same CL, constant number of times, regardless of circumstances */ 8 | class MultipleRetryPolicy(maxRetryCount: Int) extends RetryPolicy { 9 | 10 | private def retryOrThrow(cl: ConsistencyLevel, nbRetry: Int): RetryDecision = { 11 | if (nbRetry < maxRetryCount) 12 | RetryDecision.retry(cl) 13 | else 14 | RetryDecision.rethrow() 15 | } 16 | 17 | override def onReadTimeout(stmt: Statement, cl: ConsistencyLevel, 18 | requiredResponses: Int, receivedResponses: Int, 19 | dataRetrieved: Boolean, nbRetry: Int) = retryOrThrow(cl, nbRetry) 20 | 21 | override def onUnavailable(stmt: Statement, cl: ConsistencyLevel, 22 | requiredReplica: Int, aliveReplica: Int, nbRetry: Int) = retryOrThrow(cl, nbRetry) 23 | 24 | override def onWriteTimeout(stmt: Statement, cl: ConsistencyLevel, writeType: WriteType, 25 | requiredAcks: Int, receivedAcks: Int, nbRetry: Int) = retryOrThrow(cl, nbRetry) 26 | 27 | } 28 | -------------------------------------------------------------------------------- /scripts/submit-demos: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This very basic script that submits the demos jar to a local spark master. 3 | # TODO add input validation and error handling. 4 | 5 | # ## 1. Build the demos assembly jar: 6 | # sbt -Dspark.cassandra.connector.demos.assembly=true assembly 7 | 8 | # ## 2. Run this script ## 9 | # Pass in 3 parameters: 10 | # 1. Path to /bin/spark-submit 11 | # 2. Spark master 12 | # 3. 
The FQCN of the demo class to run, e.g: com.datastax.spark.connector.demo.BasicReadWriteDemo 13 | # For further customization options see https://spark.apache.org/docs/latest/submitting-applications.html 14 | # Example: 15 | # sudo ./scripts/submit-demos /path/to/spark/bin spark://master:7077 com.datastax.spark.connector.demo.BasicReadWriteDemo 16 | # ## 17 | 18 | 19 | PATH_TO_SPARK_BIN_SCRIPTS=$1 20 | SPARK_MASTER=$2 21 | APP_TO_RUN=$3 22 | 23 | # TODO read from Settings.scala scalaVersion and version in ThisBuild: 24 | VERSION="1.0.0-SNAPSHOT" 25 | SCALA_VERSION="scala-2.10" 26 | DEMOS_ASSEMBLY_JAR="spark-cassandra-connector-demos-assembly-$VERSION.jar" 27 | PATH_TO_JAR="spark-cassandra-connector-demos/target/$SCALA_VERSION/$DEMOS_ASSEMBLY_JAR" 28 | SPARK_SUBMIT="$PATH_TO_SPARK_BIN_SCRIPTS/spark-submit" 29 | 30 | # Run on a Spark standalone cluster 31 | echo "Attempting to submit demo $SPARK_SUBMIT on $SPARK_MASTER with $PATH_TO_JAR" 32 | $SPARK_SUBMIT --class $APP_TO_RUN --master $SPARK_MASTER $PATH_TO_JAR 100 33 | 34 | 35 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/ReadConf.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd 2 | 3 | import org.apache.spark.SparkConf 4 | 5 | import com.datastax.driver.core.ConsistencyLevel 6 | 7 | /** Read settings for RDD 8 | * 9 | * @param splitSize number of Cassandra partitions to be read in a single Spark task 10 | * @param fetchSize number of CQL rows to fetch in a single round-trip to Cassandra 11 | * @param consistencyLevel consistency level for reads, default LOCAL_ONE; 12 | * higher consistency level will disable data-locality */ 13 | case class ReadConf( 14 | splitSize: Int = ReadConf.DefaultSplitSize, 15 | fetchSize: Int = ReadConf.DefaultFetchSize, 16 | consistencyLevel: ConsistencyLevel = ReadConf.DefaultConsistencyLevel) 17 | 18 | 19 | object ReadConf { 20 | val DefaultSplitSize = 100000 21 | val DefaultFetchSize = 1000 22 | val DefaultConsistencyLevel = ConsistencyLevel.LOCAL_ONE 23 | 24 | def fromSparkConf(conf: SparkConf): ReadConf = { 25 | ReadConf( 26 | fetchSize = conf.getInt("spark.cassandra.input.page.row.size", DefaultFetchSize), 27 | splitSize = conf.getInt("spark.cassandra.input.split.size", DefaultSplitSize), 28 | consistencyLevel = ConsistencyLevel.valueOf( 29 | conf.get("spark.cassandra.input.consistency.level", DefaultConsistencyLevel.name())) 30 | ) 31 | } 32 | 33 | } 34 | 35 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/java/com/datastax/spark/connector/SampleWithDeeplyNestedJavaBean.java: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector; 2 | 3 | import java.io.Serializable; 4 | 5 | /** 6 | * This is a sample JavaBean style class. In order to test JavaAPI correctly, we cannot implement this in Scala because 7 | * Scala adds some additional accessors and mutators. 
8 | */ 9 | public class SampleWithDeeplyNestedJavaBean implements Serializable { 10 | public class IntermediateClass implements Serializable { 11 | public class InnerClass implements Serializable { 12 | private Integer key; 13 | private String value; 14 | 15 | public InnerClass(Integer key) { 16 | this.key = key; 17 | } 18 | 19 | public InnerClass() { 20 | } 21 | 22 | public InnerClass(Integer key, String value) { 23 | this.key = key; 24 | this.value = value; 25 | } 26 | 27 | public Integer getKey() { 28 | return key; 29 | } 30 | 31 | public void setKey(Integer key) { 32 | this.key = key; 33 | } 34 | 35 | public String getValue() { 36 | return value; 37 | } 38 | 39 | public void setValue(String value) { 40 | this.value = value; 41 | } 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/streaming/DStreamFunctions.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.streaming 2 | 3 | import com.datastax.spark.connector._ 4 | import com.datastax.spark.connector.cql.CassandraConnector 5 | import com.datastax.spark.connector.writer.{TableWriter, WriteConf, RowWriterFactory, WritableToCassandra} 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.streaming.dstream.DStream 8 | 9 | class DStreamFunctions[T](dstream: DStream[T]) extends WritableToCassandra[T] with Serializable { 10 | 11 | override def sparkContext: SparkContext = dstream.context.sparkContext 12 | 13 | def conf = sparkContext.getConf 14 | 15 | /** 16 | * Performs [[com.datastax.spark.connector.writer.WritableToCassandra]] for each produced RDD. 17 | * Uses specific column names with an additional batch size. 18 | */ 19 | def saveToCassandra(keyspaceName: String, 20 | tableName: String, 21 | columnNames: ColumnSelector = AllColumns, 22 | writeConf: WriteConf = WriteConf.fromSparkConf(conf)) 23 | (implicit connector: CassandraConnector = CassandraConnector(conf), 24 | rwf: RowWriterFactory[T]): Unit = { 25 | val writer = TableWriter(connector, keyspaceName, tableName, columnNames, writeConf) 26 | dstream.foreachRDD(rdd => rdd.sparkContext.runJob(rdd, writer.write _)) 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/partitioner/Murmur3PartitionerTokenRangeSplitter.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd.partitioner 2 | 3 | import com.datastax.spark.connector.rdd.partitioner.dht.{LongToken, TokenFactory, TokenRange} 4 | 5 | import scala.math.BigDecimal.RoundingMode 6 | 7 | /** Fast token range splitter assuming that data are spread out evenly in the whole range. 
*/ 8 | class Murmur3PartitionerTokenRangeSplitter(cassandraPartitionsPerToken: Double) extends TokenRangeSplitter[Long, LongToken] { 9 | 10 | private val tokenFactory = 11 | TokenFactory.Murmur3TokenFactory 12 | 13 | def split(range: TokenRange[Long, LongToken], splitSize: Long) = { 14 | val left = range.start.value 15 | val right = range.end.value 16 | val rangeSize = 17 | if (right > left) BigDecimal(right) - BigDecimal(left) 18 | else BigDecimal(right) - BigDecimal(left) + BigDecimal(tokenFactory.totalTokenCount) 19 | val estimatedRows = rangeSize * cassandraPartitionsPerToken 20 | val n = math.max(1, (estimatedRows / splitSize).setScale(0, RoundingMode.HALF_UP).toInt) 21 | val splitPoints = 22 | (for (i <- 0 until n) yield left + (rangeSize * i.toDouble / n).toLong) :+ right 23 | for (Seq(l, r) <- splitPoints.sliding(2).toSeq) yield 24 | new TokenRange[Long, LongToken]( 25 | new LongToken(l), 26 | new LongToken(r), 27 | range.endpoints, 28 | Some((estimatedRows / n).toInt)) 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /spark-cassandra-connector-demos/simple-demos/src/main/scala/com/datastax/spark/connector/demo/TableCopyDemo.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.demo 2 | 3 | import com.datastax.spark.connector.cql.CassandraConnector 4 | 5 | object TableCopyDemo extends DemoApp { 6 | 7 | CassandraConnector(conf).withSessionDo { session => 8 | session.execute("CREATE KEYSPACE IF NOT EXISTS test WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 }") 9 | session.execute("CREATE TABLE IF NOT EXISTS test.source (key INT PRIMARY KEY, data VARCHAR)") 10 | session.execute("CREATE TABLE IF NOT EXISTS test.destination (key INT PRIMARY KEY, data VARCHAR)") 11 | session.execute("TRUNCATE test.source") 12 | session.execute("TRUNCATE test.destination") 13 | session.execute("INSERT INTO test.source(key, data) VALUES (1, 'first row')") 14 | session.execute("INSERT INTO test.source(key, data) VALUES (2, 'second row')") 15 | session.execute("INSERT INTO test.source(key, data) VALUES (3, 'third row')") 16 | } 17 | 18 | import com.datastax.spark.connector._ 19 | 20 | val src = sc.cassandraTable("test", "source") 21 | src.saveToCassandra("test", "destination") 22 | 23 | val dest = sc.cassandraTable("test", "destination") 24 | dest.collect().foreach(row => log.info(s"$row")) 25 | 26 | // Assert the rows were copied from test.source to test.destination table: 27 | assert(dest.collect().length == 3) 28 | 29 | log.info(s"Work completed, stopping the Spark context.") 30 | sc.stop() 31 | } 32 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/org/apache/spark/sql/cassandra/InsertIntoCassandraTable.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.cassandra 2 | 3 | import com.datastax.spark.connector._ 4 | import com.datastax.spark.connector.cql.CassandraConnector 5 | import com.datastax.spark.connector.writer.SqlRowWriter 6 | import org.apache.spark.annotation.DeveloperApi 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.catalyst.expressions.{Attribute, Row} 9 | import org.apache.spark.sql.execution.{SparkPlan, UnaryNode} 10 | 11 | @DeveloperApi 12 | case class InsertIntoCassandraTable(cassandraRelation: CassandraRelation, 13 | childPlan: SparkPlan, 14 | overwrite: Boolean) 15 | 
(@transient cc: CassandraSQLContext) extends UnaryNode { 16 | self: Product => 17 | 18 | override def output: Seq[Attribute] = childPlan.output 19 | 20 | override def execute(): RDD[Row] = result 21 | 22 | override def child: SparkPlan = childPlan 23 | 24 | override def otherCopyArgs = cc :: Nil 25 | 26 | /** 27 | * Insert RDD[[Row]] into Cassandra 28 | */ 29 | private lazy val result: RDD[Row] = { 30 | val childRdd = child.execute() 31 | 32 | //TODO: cluster level CassandraConnector, write configuration settings 33 | childRdd.saveToCassandra(cassandraRelation.keyspaceName, cassandraRelation.tableName)(CassandraConnector(sparkContext.getConf), SqlRowWriter.Factory) 34 | 35 | cc.sparkContext.makeRDD(Nil, 1) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /doc/10_embedded.md: -------------------------------------------------------------------------------- 1 | # Documentation 2 | ## The `spark-cassandra-connector-embedded` Artifact 3 | The `spark-cassandra-connector-embedded` artifact can be used as a test or prototype dependency to spin up embedded servers for testing ideas, quick learning, integration, etc. 4 | Pulling this dependency in allows you to do the following: 5 | 6 | - Integration tests (IT) with an embedded Cassandra instance 7 | - if your sbt project is configured to [run IT configs](https://github.com/datastax/spark-cassandra-connector/blob/master/project/Settings.scala#L78-L94) 8 | - Easily write and run a Spark Streaming app using 9 | - Apache Kafka streams (including an embedded Zookeeper), all with no Ops work involved 10 | - Twitter streams (needs the 4 auth credentials required by Twitter) 11 | - And of course Cassandra, but you currently need to spin up a local instance: [Download Cassandra latest](http://cassandra.apache.org/download/), open the tar, and run `sudo ./apache-cassandra-2.1.0/bin/cassandra` 12 | 13 | ## The Code 14 | See: [https://github.com/datastax/spark-cassandra-connector/tree/master/spark-cassandra-connector-embedded/src/main/scala/com/datastax/spark/connector/embedded](https://github.com/datastax/spark-cassandra-connector/tree/master/spark-cassandra-connector-embedded/src/main/scala/com/datastax/spark/connector/embedded) 15 | 16 | ## How To Add The Dependency 17 | Simply add this to your SBT build, or in the appropriate format for a Maven build: 18 | 19 | "com.datastax.spark" %% "spark-cassandra-connector-embedded" % {latest.version} -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/reader/RowReader.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd.reader 2 | 3 | import com.datastax.driver.core.{ProtocolVersion, Row} 4 | 5 | /** Transforms a Cassandra Java driver `Row` into a high-level row representation, e.g. a tuple 6 | * or a user-defined case class object. The target type `T` must be serializable. */ 7 | trait RowReader[T] extends Serializable { 8 | 9 | /** Reads column values from the low-level `Row` and turns them into a higher-level representation. 10 | * @param row row fetched from Cassandra 11 | * @param columnNames column names available in the `row` 12 | * @param protocolVersion java driver protocol version to be used for deserialization */ 13 | def read(row: Row, columnNames: Array[String], protocolVersion: ProtocolVersion): T 14 | 15 | /** List of columns this `RowReader` is going to read. 
16 | * Useful to avoid fetching the columns that are not needed. */ 17 | def columnNames: Option[Seq[String]] 18 | 19 | /** The number of columns that need to be fetched from C*. */ 20 | def requiredColumns: Option[Int] 21 | 22 | /** This method should be implemented by those row readers which reads fields in the consecutive 23 | * positions from a CassandraRow. When a row reader implements it so that it returns a non-empty, 24 | * it denotes the number of columns this reader moves the column cursor forward for compound row 25 | * readers (such as [[KeyValueRowReader]]). */ 26 | def consumedColumns: Option[Int] = None 27 | 28 | } 29 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/RowWriterFactory.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.writer 2 | 3 | import com.datastax.spark.connector.cql.TableDef 4 | import com.datastax.spark.connector.mapper.ColumnMapper 5 | 6 | import scala.reflect.ClassTag 7 | 8 | /** Creates instances of [[RowWriter]] objects for the given row type `T`. 9 | * `RowWriterFactory` is the trait you need to implement if you want to support row representations 10 | * which cannot be simply mapped by a [[com.datastax.spark.connector.mapper.ColumnMapper ColumnMapper]].*/ 11 | trait RowWriterFactory[T] { 12 | 13 | /** Creates a new `RowWriter` instance. 14 | * @param table target table the user wants to write into 15 | * @param columnNames columns selected by the user; the user might wish to write only a subset of columns */ 16 | def rowWriter(table: TableDef, columnNames: Seq[String]): RowWriter[T] 17 | } 18 | 19 | /** Provides a low-priority implicit `RowWriterFactory` able to write objects of any class for which 20 | * a [[com.datastax.spark.connector.mapper.ColumnMapper ColumnMapper]] is defined.*/ 21 | trait LowPriorityRowWriterFactoryImplicits { 22 | implicit def defaultRowWriterFactory[T : ColumnMapper]: RowWriterFactory[T] = DefaultRowWriter.factory 23 | } 24 | 25 | /** Provides an implicit `RowWriterFactory` for saving [[com.datastax.spark.connector.CassandraRow CassandraRow]] objects.*/ 26 | object RowWriterFactory extends LowPriorityRowWriterFactoryImplicits { 27 | implicit val genericRowWriterFactory = GenericRowWriter.Factory 28 | } -------------------------------------------------------------------------------- /spark-cassandra-connector-embedded/src/main/scala/com/datastax/spark/connector/embedded/SparkRepl.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.embedded 2 | 3 | import java.io.{PrintWriter, StringWriter, StringReader, BufferedReader} 4 | import java.net.URLClassLoader 5 | 6 | import scala.collection.mutable.ArrayBuffer 7 | import org.apache.spark.repl.SparkILoop 8 | 9 | trait SparkRepl { 10 | def runInterpreter(master: String, input: String): String = { 11 | System.setProperty("spark.cassandra.connection.host", EmbeddedCassandra.cassandraHost.getHostAddress) 12 | val in = new BufferedReader(new StringReader(input + "\n")) 13 | val out = new StringWriter() 14 | val cl = getClass.getClassLoader 15 | var paths = new ArrayBuffer[String] 16 | cl match { 17 | case urlLoader: URLClassLoader => 18 | for (url <- urlLoader.getURLs) { 19 | if (url.getProtocol == "file") { 20 | paths += url.getFile 21 | } 22 | } 23 | case _ => 24 | } 25 | val interp = new SparkILoop(in, new 
PrintWriter(out), master) 26 | org.apache.spark.repl.Main.interp = interp 27 | val separator = System.getProperty("path.separator") 28 | interp.process(Array("-classpath", paths.mkString(separator))) 29 | org.apache.spark.repl.Main.interp = null 30 | if (interp.sparkContext != null) { 31 | interp.sparkContext.stop() 32 | } 33 | // To avoid Akka rebinding to the same port, since it doesn't unbind immediately on shutdown 34 | System.clearProperty("spark.driver.port") 35 | out.toString 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /spark-cassandra-connector-demos/simple-demos/src/main/scala/com/datastax/spark/connector/demo/BasicReadWriteDemo.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.demo 2 | 3 | import com.datastax.spark.connector.cql.CassandraConnector 4 | 5 | object BasicReadWriteDemo extends DemoApp { 6 | 7 | CassandraConnector(conf).withSessionDo { session => 8 | session.execute("CREATE KEYSPACE IF NOT EXISTS test WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 }") 9 | session.execute("CREATE TABLE IF NOT EXISTS test.key_value (key INT PRIMARY KEY, value VARCHAR)") 10 | session.execute("TRUNCATE test.key_value") 11 | session.execute("INSERT INTO test.key_value(key, value) VALUES (1, 'first row')") 12 | session.execute("INSERT INTO test.key_value(key, value) VALUES (2, 'second row')") 13 | session.execute("INSERT INTO test.key_value(key, value) VALUES (3, 'third row')") 14 | } 15 | 16 | import com.datastax.spark.connector._ 17 | 18 | // Read table test.kv and print its contents: 19 | val rdd = sc.cassandraTable("test", "key_value").select("key", "value") 20 | rdd.collect().foreach(row => log.info(s"Existing Data: $row")) 21 | 22 | // Write two new rows to the test.kv table: 23 | val col = sc.parallelize(Seq((4, "fourth row"), (5, "fifth row"))) 24 | col.saveToCassandra("test", "key_value", SomeColumns("key", "value")) 25 | 26 | // Assert the two new rows were stored in test.kv table: 27 | assert(col.collect().length == 2) 28 | 29 | col.collect().foreach(row => log.info(s"New Data: $row")) 30 | log.info(s"Work completed, stopping the Spark context.") 31 | sc.stop() 32 | } 33 | -------------------------------------------------------------------------------- /spark-cassandra-connector-demos/simple-demos/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # for production, you should probably set pattern to %c instead of %l. 18 | # (%l is slower.) 
19 | 20 | # output messages into a rolling log file as well as stdout 21 | log4j.rootLogger=WARN,stdout 22 | 23 | # stdout 24 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 25 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 26 | log4j.appender.stdout.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 27 | 28 | # Adding this to avoid thrift logging disconnect errors. 29 | log4j.logger.org.apache.thrift.server.TNonblockingServer=ERROR 30 | 31 | # Avoid "no host ID found" when starting a fresh node 32 | log4j.logger.org.apache.cassandra.db.SystemKeyspace=ERROR 33 | 34 | log4j.logger.com.datastax.spark.connector=INFO 35 | -------------------------------------------------------------------------------- /spark-cassandra-connector-demos/kafka-streaming/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # for production, you should probably set pattern to %c instead of %l. 18 | # (%l is slower.) 19 | 20 | # output messages into a rolling log file as well as stdout 21 | log4j.rootLogger=WARN,stdout 22 | 23 | # stdout 24 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 25 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 26 | log4j.appender.stdout.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 27 | 28 | # Adding this to avoid thrift logging disconnect errors. 
29 | log4j.logger.org.apache.thrift.server.TNonblockingServer=ERROR 30 | 31 | # Avoid "no host ID found" when starting a fresh node 32 | log4j.logger.org.apache.cassandra.db.SystemKeyspace=ERROR 33 | 34 | log4j.logger.com.datastax.spark.connector=INFO 35 | -------------------------------------------------------------------------------- /spark-cassandra-connector-demos/twitter-streaming/src/main/scala/com/datastax/spark/connector/demo/TwitterStreamingHashTagsByInterval.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.demo 2 | 3 | import scala.util.matching.Regex 4 | import org.apache.spark.storage.StorageLevel 5 | import org.apache.spark.streaming.{Time, Seconds, StreamingContext} 6 | import org.apache.spark.streaming.twitter.TwitterUtils 7 | import org.joda.time.{DateTimeZone, DateTime} 8 | import twitter4j.auth.Authorization 9 | import com.datastax.spark.connector.streaming._ 10 | import com.datastax.spark.connector.SomeColumns 11 | 12 | class TwitterStreamingHashTagsByInterval extends Serializable { 13 | 14 | def start(auth: Option[Authorization], ssc: StreamingContext, filters: Regex, keyspace: String, table: String): Unit = { 15 | 16 | val transform = (cruft: String) => filters.findAllIn(cruft).flatMap(_.stripPrefix("#")) 17 | 18 | val stream = TwitterUtils.createStream(ssc, auth, Nil, StorageLevel.MEMORY_ONLY_SER_2) 19 | 20 | /** Note that Cassandra is doing the sorting for you here. */ 21 | stream.flatMap(_.getText.toLowerCase.split("""\s+""")) 22 | .map(transform) 23 | .countByValueAndWindow(Seconds(5), Seconds(5)) 24 | .transform((rdd, time) => rdd.map { case (term, count) => (term, count, now(time))}) 25 | .saveToCassandra(keyspace, table, SomeColumns("hashtag", "mentions", "interval")) 26 | 27 | ssc.checkpoint("./checkpoint") 28 | ssc.start() 29 | ssc.awaitTermination() 30 | } 31 | 32 | private def now(time: Time): String = 33 | new DateTime(time.milliseconds, DateTimeZone.UTC).toString("yyyyMMddHH:mm:ss.SSS") 34 | } 35 | 36 | 37 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/it/scala/com/datastax/spark/connector/streaming/StreamingSpec.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.streaming 2 | 3 | import com.datastax.spark.connector.testkit._ 4 | import com.datastax.spark.connector.embedded._ 5 | 6 | /** 7 | * Usages: Create the [[org.apache.spark.streaming.StreamingContext]] then write async to the stream. 8 | * 9 | * val ssc = new StreamingContext(conf, Milliseconds(500)) 10 | * 11 | * Akka 12 | * {{{ 13 | * val stream = ssc.actorStream[String](Props[SimpleActor], actorName, StorageLevel.MEMORY_AND_DISK) 14 | * }}} 15 | * 16 | * On upgrade examples: 17 | * Kafka 18 | * {{{ 19 | * val stream: ReceiverInputDStream[(String, String)] = 20 | * KafkaUtils.createStream(ssc, kafkaParams, topics, StorageLevel.MEMORY_AND_DISK_SER_2) 21 | * }}} 22 | * 23 | * ZeroMQ 24 | * {{{ 25 | * val stream: ReceiverInputDStream[String] = ZeroMQUtils.createStream(ssc, publishUrl, subscribe, bytesToObjects) 26 | * }}} 27 | * 28 | * Twitter 29 | * {{{ 30 | * val stream: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None) 31 | * }}} 32 | * 33 | * etc. 
34 | */ 35 | trait StreamingSpec extends AbstractSpec with SharedEmbeddedCassandra with SparkTemplate { 36 | import org.apache.spark.streaming.StreamingContext 37 | import scala.concurrent.duration._ 38 | 39 | val duration = 10.seconds 40 | 41 | useCassandraConfig("cassandra-default.yaml.template") 42 | 43 | def ssc: StreamingContext 44 | 45 | after { 46 | // Spark Context is shared among all integration test so we don't want to stop it here 47 | ssc.stop(stopSparkContext = false, stopGracefully = true) 48 | } 49 | } -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/partitioner/RandomPartitionerTokenRangeSplitter.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd.partitioner 2 | 3 | import com.datastax.spark.connector.rdd.partitioner.dht.{BigIntToken, TokenFactory, TokenRange} 4 | 5 | import scala.math.BigDecimal.RoundingMode 6 | 7 | /** Fast token range splitter assuming that data are spread out evenly in the whole range. */ 8 | class RandomPartitionerTokenRangeSplitter(cassandraPartitionsPerToken: Double) extends TokenRangeSplitter[BigInt, BigIntToken] { 9 | 10 | private val tokenFactory = 11 | TokenFactory.RandomPartitionerTokenFactory 12 | 13 | private def wrap(token: BigInt): BigInt = { 14 | val max = tokenFactory.maxToken.value 15 | if (token <= max) token else token - max 16 | } 17 | 18 | def split(range: TokenRange[BigInt, BigIntToken], splitSize: Long) = { 19 | val left = range.start.value 20 | val right = range.end.value 21 | val rangeSize = 22 | if (right > left) BigDecimal(right - left) 23 | else BigDecimal(right - left + tokenFactory.totalTokenCount) 24 | val estimatedRows = rangeSize * cassandraPartitionsPerToken 25 | val n = math.max(1, (estimatedRows / splitSize).setScale(0, RoundingMode.HALF_UP).toInt) 26 | val splitPoints = 27 | (for (i <- 0 until n) yield wrap(left + (rangeSize * i.toDouble / n).toBigInt)) :+ right 28 | for (Seq(l, r) <- splitPoints.sliding(2).toSeq) yield 29 | new TokenRange[BigInt, BigIntToken]( 30 | new BigIntToken(l.bigInteger), 31 | new BigIntToken(r.bigInteger), 32 | range.endpoints, 33 | Some((estimatedRows / n).toInt)) 34 | } 35 | } -------------------------------------------------------------------------------- /project/Versions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | object Versions { 18 | val Akka = "2.2.3"//spark master on 2.3.4 https://github.com/apache/spark/blob/master/pom.xml#L113-L114 19 | val Cassandra = "2.1.2" 20 | val CassandraDriver = "2.1.3" 21 | val CommonsIO = "2.4" 22 | val CommonsLang3 = "3.3.2" 23 | val Config = "1.2.1" 24 | val Guava = "14.0.1" 25 | val JDK = "1.7" 26 | val JodaC = "1.2" 27 | val JodaT = "2.3" 28 | val JOpt = "3.2"//4.7 29 | val Kafka = "0.8.0"//https://github.com/apache/spark/pull/3631 30 | val Lzf = "0.8.4" 31 | val CodaHaleMetrics = "3.0.2" 32 | val Scala = "2.10.4" 33 | val ScalaTest = "2.2.2" 34 | val Scalactic = "2.2.2" 35 | val Slf4j = "1.7.7" 36 | val Spark = "1.1.1" 37 | 38 | } 39 | -------------------------------------------------------------------------------- /spark-cassandra-connector-demos/simple-demos/src/main/scala/com/datastax/spark/connector/demo/SQLDemo.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.demo 2 | 3 | import com.datastax.spark.connector.cql.CassandraConnector 4 | import org.apache.spark.sql.cassandra.CassandraSQLContext 5 | 6 | /** This demo creates a table in Cassandra, populates it with sample data, 7 | * then queries it using SparkSQL and finally displays the query results to the standard output. 8 | * You need to start Cassandra on local node prior to executing this demo. */ 9 | object SQLDemo extends DemoApp { 10 | 11 | val cc = new CassandraSQLContext(sc) 12 | 13 | CassandraConnector(conf).withSessionDo { session => 14 | session.execute("CREATE KEYSPACE IF NOT EXISTS test WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 }") 15 | session.execute("DROP TABLE IF EXISTS test.sql_demo") 16 | session.execute("CREATE TABLE test.sql_demo (key INT PRIMARY KEY, grp INT, value DOUBLE)") 17 | session.execute("INSERT INTO test.sql_demo(key, grp, value) VALUES (1, 1, 1.0)") 18 | session.execute("INSERT INTO test.sql_demo(key, grp, value) VALUES (2, 1, 2.5)") 19 | session.execute("INSERT INTO test.sql_demo(key, grp, value) VALUES (3, 1, 10.0)") 20 | session.execute("INSERT INTO test.sql_demo(key, grp, value) VALUES (4, 2, 4.0)") 21 | session.execute("INSERT INTO test.sql_demo(key, grp, value) VALUES (5, 2, 2.2)") 22 | session.execute("INSERT INTO test.sql_demo(key, grp, value) VALUES (6, 2, 2.8)") 23 | } 24 | 25 | val rdd = cc.cassandraSql("SELECT grp, max(value) AS mv FROM test.sql_demo GROUP BY grp ORDER BY mv") 26 | rdd.collect().foreach(println) // [2, 4.0] [1, 10.0] 27 | 28 | sc.stop() 29 | } 30 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/types/TimestampParser.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.types 2 | 3 | import java.util.Date 4 | 5 | import org.joda.time.format.DateTimeFormat 6 | 7 | import scala.util.{Success, Try} 8 | 9 | /** Parses CQL timestamps. 
10 | * 11 | * Supported formats: 12 | * - `yyyy-MM-dd HH:mm` 13 | * - `yyyy-MM-dd HH:mmZ` 14 | * - `yyyy-MM-dd HH:mm:ss` 15 | * - `yyyy-MM-dd HH:mm:ssZ` 16 | * - `yyyy-MM-dd HH:mm:ss.SSS` 17 | * - `yyyy-MM-dd HH:mm:ss.SSSZ` 18 | * - `yyyy-MM-dd'T'HH:mm` 19 | * - `yyyy-MM-dd'T'HH:mmZ` 20 | * - `yyyy-MM-dd'T'HH:mm:ss` 21 | * - `yyyy-MM-dd'T'HH:mm:ssZ` 22 | * - `yyyy-MM-dd'T'HH:mm:ss.SSS` 23 | * - `yyyy-MM-dd'T'HH:mm:ss.SSSZ` 24 | * - `yyyy-MM-dd` 25 | * - `yyyy-MM-ddZ` 26 | */ 27 | object TimestampParser { 28 | private val dateStringPatterns = Array[String]( 29 | "yyyy-MM-dd HH:mm", 30 | "yyyy-MM-dd HH:mmZ", 31 | "yyyy-MM-dd HH:mm:ss", 32 | "yyyy-MM-dd HH:mm:ssZ", 33 | "yyyy-MM-dd HH:mm:ss.SSS", 34 | "yyyy-MM-dd HH:mm:ss.SSSZ", 35 | "yyyy-MM-dd'T'HH:mm", 36 | "yyyy-MM-dd'T'HH:mmZ", 37 | "yyyy-MM-dd'T'HH:mm:ss", 38 | "yyyy-MM-dd'T'HH:mm:ssZ", 39 | "yyyy-MM-dd'T'HH:mm:ss.SSS", 40 | "yyyy-MM-dd'T'HH:mm:ss.SSSZ", 41 | "yyyy-MM-dd", 42 | "yyyy-MM-ddZ") 43 | 44 | private val parsers = 45 | dateStringPatterns.map(DateTimeFormat.forPattern) 46 | 47 | def parse(date: String): Date = { 48 | parsers.view.map(p => Try(p.parseDateTime(date))).find(_.isSuccess) match { 49 | case Some(Success(d)) => d.toDate 50 | case _ => throw new IllegalArgumentException(s"Invalid date: $date") 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/cql/CassandraClientProxy.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.cql 2 | 3 | import java.lang.reflect.{InvocationTargetException, Proxy, Method, InvocationHandler} 4 | 5 | import org.apache.cassandra.thrift.Cassandra 6 | import org.apache.thrift.transport.TTransport 7 | 8 | /** Extends `Cassandra.Iface` with `close` method to close the underlying thrift transport */ 9 | trait CassandraClientProxy extends Cassandra.Iface { 10 | def close() 11 | } 12 | 13 | private class ClientProxyHandler(client: Cassandra.Iface, transport: TTransport) extends InvocationHandler { 14 | 15 | override def invoke(proxy: scala.Any, method: Method, args: Array[AnyRef]): AnyRef = { 16 | if (method.getName == "close") { 17 | transport.close() 18 | null 19 | } 20 | else 21 | try { 22 | method.invoke(client, args: _*) 23 | } 24 | catch { 25 | case e: InvocationTargetException => 26 | throw e.getCause 27 | } 28 | } 29 | } 30 | 31 | object CassandraClientProxy { 32 | 33 | /** Returns a proxy to the thrift client that provides closing the underlying transport by calling `close` method. 34 | * Without this method we'd have to keep references to two objects: the client and the transport. */ 35 | def wrap(client: Cassandra.Iface, transport: TTransport): CassandraClientProxy = { 36 | val classLoader = getClass.getClassLoader 37 | val interfaces = Array[Class[_]](classOf[CassandraClientProxy]) 38 | val invocationHandler = new ClientProxyHandler(client, transport) 39 | Proxy.newProxyInstance(classLoader, interfaces, invocationHandler).asInstanceOf[CassandraClientProxy] 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/it/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. 
See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # for production, you should probably set pattern to %c instead of %l. 18 | # (%l is slower.) 19 | 20 | # output messages into a rolling log file as well as stdout 21 | log4j.rootLogger=WARN,stdout 22 | 23 | # stdout 24 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 25 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 26 | log4j.appender.stdout.layout.ConversionPattern=%5p %d{HH:mm:ss,SSS} %C (%F:%L) - %m%n 27 | 28 | # Adding this to avoid thrift logging disconnect errors. 29 | log4j.logger.org.apache.thrift.server.TNonblockingServer=ERROR 30 | 31 | # Avoid "no host ID found" when starting a fresh node 32 | log4j.logger.org.apache.cassandra.db.SystemKeyspace=ERROR 33 | 34 | # Avoid "address already in use" when starting multiple local Spark masters 35 | log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR 36 | -------------------------------------------------------------------------------- /spark-cassandra-connector-java/src/main/java/com/datastax/spark/connector/japi/RDDJavaFunctions.java: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.japi; 2 | 3 | import com.datastax.spark.connector.ColumnSelector; 4 | import com.datastax.spark.connector.RDDFunctions; 5 | import com.datastax.spark.connector.cql.CassandraConnector; 6 | import com.datastax.spark.connector.writer.RowWriterFactory; 7 | import com.datastax.spark.connector.writer.WriteConf; 8 | import org.apache.spark.SparkConf; 9 | import org.apache.spark.rdd.RDD; 10 | 11 | /** 12 | * A Java API wrapper over {@link org.apache.spark.rdd.RDD} to provide Spark Cassandra Connector functionality. 13 | * 14 | *
To obtain an instance of this wrapper, use one of the factory methods in {@link 15 | * com.datastax.spark.connector.japi.CassandraJavaUtil} class.
16 | */ 17 | @SuppressWarnings("UnusedDeclaration") 18 | public class RDDJavaFunctions extends RDDAndDStreamCommonJavaFunctions { 19 | public final RDD rdd; 20 | private final RDDFunctions rddf; 21 | 22 | RDDJavaFunctions(RDD rdd) { 23 | this.rdd = rdd; 24 | this.rddf = new RDDFunctions<>(rdd); 25 | } 26 | 27 | @Override 28 | public CassandraConnector defaultConnector() { 29 | return rddf.connector(); 30 | } 31 | 32 | @Override 33 | protected SparkConf getConf() { 34 | return rdd.conf(); 35 | } 36 | 37 | @Override 38 | protected void saveToCassandra(String keyspace, String table, RowWriterFactory rowWriterFactory, 39 | ColumnSelector columnNames, WriteConf conf, CassandraConnector connector) { 40 | rddf.saveToCassandra(keyspace, table, columnNames, conf, connector, rowWriterFactory); 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/writer/AsyncExecutorTest.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.writer 2 | 3 | import java.util.concurrent.atomic.AtomicInteger 4 | import java.util.concurrent.{Callable, Executors} 5 | 6 | import com.google.common.util.concurrent.MoreExecutors 7 | import org.junit.Assert._ 8 | import org.junit.Test 9 | 10 | class AsyncExecutorTest { 11 | 12 | @Test 13 | def test() { 14 | val taskCount = 20 15 | val maxParallel = 5 16 | 17 | val currentlyRunningCounter = new AtomicInteger(0) 18 | val maxParallelCounter = new AtomicInteger(0) 19 | val totalFinishedExecutionsCounter = new AtomicInteger(0) 20 | 21 | val task = new Callable[String] { 22 | override def call() = { 23 | val c = currentlyRunningCounter.incrementAndGet() 24 | var m = maxParallelCounter.get() 25 | while (m < c && !maxParallelCounter.compareAndSet(m, c)) 26 | m = maxParallelCounter.get() 27 | Thread.sleep(100) 28 | currentlyRunningCounter.decrementAndGet() 29 | totalFinishedExecutionsCounter.incrementAndGet() 30 | "ok" 31 | } 32 | } 33 | 34 | val underlyingExecutor = MoreExecutors.listeningDecorator(Executors.newCachedThreadPool()) 35 | val asyncExecutor = new AsyncExecutor(underlyingExecutor.submit(_: Callable[String]), maxParallel) 36 | 37 | for (i <- 1 to taskCount) 38 | asyncExecutor.executeAsync(task) 39 | 40 | asyncExecutor.waitForCurrentlyExecutingTasks() 41 | assertEquals(maxParallel, maxParallelCounter.get()) 42 | assertEquals(taskCount, totalFinishedExecutionsCounter.get()) 43 | assertEquals(taskCount, asyncExecutor.successCount) 44 | assertEquals(0, asyncExecutor.failureCount) 45 | } 46 | 47 | 48 | 49 | 50 | } 51 | -------------------------------------------------------------------------------- /spark-cassandra-connector-demos/simple-demos/src/main/resources/data/words: -------------------------------------------------------------------------------- 1 | Kitty lorem ipsum inspect anything brought into the house stick sit in box so loves cheeseburgers 2 | Run in circles shake treat bag hide when guests come over use lap as chair Sit in box throwup on your pillow purr while eating 3 | but present belly scratch hand when pet Chase dog then run away burrow under covers scratch hand when pet burrow under covers 4 | intrigued by the shower why must they do that Curl into a furry donut need to chase tail burrow under covers 5 | so swat at dog or sleep in the bathroom sink inspect anything brought into the house Hide when guests come over Under the bed 6 | I like big cats and i can not lie 
leave dead animals as gifts Curl into a furry donut shake treat bag hunt by meowing loudly at 5am 7 | next to human slave food dispenser need to chase tail and chew iPad power cord 8 | Hack up furballs hunt anything that moves but favor packaging over toy yet stand in front of the computer screen 9 | Favor packaging over toy throwup on your pillow who's the baby Give attitude Purr while eating chew iPad power cord hopped up on catnip 10 | so always hungry Stare at ceiling kick up litter or hunt by meowing loudly at 5am next to human slave food dispenser or stretch 11 | yet under the bed claw drapes Intently stare at the same spot make muffins but intently stare at the same spot 12 | or kick up litter and why must they do that Missing until dinner time hopped up on catnip who's the baby 13 | Sleep on keyboard favor packaging over toy why must they do that but bathe private parts with tongue then lick owner's face 14 | Purr for no reason Scamper sweet beast but mark territory and stand in front of the computer screen favor packaging over toy 15 | Sleep on desk infront of laptop sit on keyboard push mouse jump on bed and make bird sound at 3am -------------------------------------------------------------------------------- /spark-cassandra-connector-demos/kafka-streaming/src/main/resources/data/words: -------------------------------------------------------------------------------- 1 | Kitty lorem ipsum inspect anything brought into the house stick sit in box so loves cheeseburgers 2 | Run in circles shake treat bag hide when guests come over use lap as chair Sit in box throwup on your pillow purr while eating 3 | but present belly scratch hand when pet Chase dog then run away burrow under covers scratch hand when pet burrow under covers 4 | intrigued by the shower why must they do that Curl into a furry donut need to chase tail burrow under covers 5 | so swat at dog or sleep in the bathroom sink inspect anything brought into the house Hide when guests come over Under the bed 6 | I like big cats and i can not lie leave dead animals as gifts Curl into a furry donut shake treat bag hunt by meowing loudly at 5am 7 | next to human slave food dispenser need to chase tail and chew iPad power cord 8 | Hack up furballs hunt anything that moves but favor packaging over toy yet stand in front of the computer screen 9 | Favor packaging over toy throwup on your pillow who's the baby Give attitude Purr while eating chew iPad power cord hopped up on catnip 10 | so always hungry Stare at ceiling kick up litter or hunt by meowing loudly at 5am next to human slave food dispenser or stretch 11 | yet under the bed claw drapes Intently stare at the same spot make muffins but intently stare at the same spot 12 | or kick up litter and why must they do that Missing until dinner time hopped up on catnip who's the baby 13 | Sleep on keyboard favor packaging over toy why must they do that but bathe private parts with tongue then lick owner's face 14 | Purr for no reason Scamper sweet beast but mark territory and stand in front of the computer screen favor packaging over toy 15 | Sleep on desk infront of laptop sit on keyboard push mouse jump on bed and make bird sound at 3am -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/types/CollectionColumnType.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.types 2 | 3 | import 
com.datastax.spark.connector.types.TypeConverter.OptionToNullConverter 4 | 5 | import scala.language.existentials 6 | import scala.reflect.runtime.universe._ 7 | 8 | trait CollectionColumnType[T] extends ColumnType[T] { 9 | def isCollection = true 10 | } 11 | 12 | case class ListType[T](elemType: ColumnType[T]) extends CollectionColumnType[Vector[T]] { 13 | @transient 14 | lazy val converterToCassandra = 15 | TypeConverter.javaArrayListConverter(elemType.converterToCassandra) 16 | 17 | @transient 18 | lazy val scalaTypeTag = TypeTag.synchronized { 19 | implicit val elemTypeTag = elemType.scalaTypeTag 20 | implicitly[TypeTag[Vector[T]]] 21 | } 22 | } 23 | 24 | case class SetType[T](elemType: ColumnType[T]) extends CollectionColumnType[Set[T]] { 25 | @transient 26 | lazy val converterToCassandra = 27 | new OptionToNullConverter(TypeConverter.javaHashSetConverter(elemType.converterToCassandra)) 28 | 29 | @transient 30 | lazy val scalaTypeTag = TypeTag.synchronized { 31 | implicit val elemTypeTag = elemType.scalaTypeTag 32 | implicitly[TypeTag[Set[T]]] 33 | } 34 | } 35 | 36 | case class MapType[K, V](keyType: ColumnType[K], valueType: ColumnType[V]) extends CollectionColumnType[Map[K, V]] { 37 | @transient 38 | lazy val converterToCassandra = 39 | new OptionToNullConverter( 40 | TypeConverter.javaHashMapConverter(keyType.converterToCassandra, valueType.converterToCassandra)) 41 | 42 | @transient 43 | lazy val scalaTypeTag = TypeTag.synchronized { 44 | implicit val keyTypeTag = keyType.scalaTypeTag 45 | implicit val valueTypeTag = valueType.scalaTypeTag 46 | implicitly[TypeTag[Map[K, V]]] 47 | } 48 | } 49 | 50 | -------------------------------------------------------------------------------- /spark-cassandra-connector-java/src/main/java/com/datastax/spark/connector/japi/DStreamJavaFunctions.java: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.japi; 2 | 3 | import com.datastax.spark.connector.ColumnSelector; 4 | import com.datastax.spark.connector.cql.CassandraConnector; 5 | import com.datastax.spark.connector.streaming.DStreamFunctions; 6 | import com.datastax.spark.connector.writer.RowWriterFactory; 7 | import com.datastax.spark.connector.writer.WriteConf; 8 | import org.apache.spark.SparkConf; 9 | import org.apache.spark.streaming.dstream.DStream; 10 | 11 | /** 12 | * A Java API wrapper over {@link org.apache.spark.streaming.dstream.DStream} to provide Spark Cassandra Connector 13 | * functionality. 14 | * 15 | *
To obtain an instance of this wrapper, use one of the factory methods in {@link 16 | * com.datastax.spark.connector.japi.CassandraJavaUtil} class.
17 | */ 18 | @SuppressWarnings("UnusedDeclaration") 19 | public class DStreamJavaFunctions extends RDDAndDStreamCommonJavaFunctions { 20 | public final DStream dstream; 21 | private final DStreamFunctions dsf; 22 | 23 | DStreamJavaFunctions(DStream dStream) { 24 | this.dstream = dStream; 25 | this.dsf = new DStreamFunctions<>(dStream); 26 | } 27 | 28 | @Override 29 | public CassandraConnector defaultConnector() { 30 | return dsf.connector(); 31 | } 32 | 33 | @Override 34 | protected SparkConf getConf() { 35 | return dstream.ssc().conf(); 36 | } 37 | 38 | @Override 39 | protected void saveToCassandra(String keyspace, String table, RowWriterFactory rowWriterFactory, 40 | ColumnSelector columnNames, WriteConf conf, CassandraConnector connector) { 41 | dsf.saveToCassandra(keyspace, table, columnNames, conf, connector, rowWriterFactory); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /spark-cassandra-connector-demos/twitter-streaming/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # for production, you should probably set pattern to %c instead of %l. 18 | # (%l is slower.) 19 | 20 | # output messages into a rolling log file as well as stdout 21 | log4j.rootLogger=INFO,stdout 22 | 23 | # stdout 24 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 25 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 26 | log4j.appender.stdout.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 27 | 28 | # Adding this to avoid thrift logging disconnect errors. 29 | log4j.logger.org.apache.thrift.server.TNonblockingServer=ERROR 30 | 31 | # Avoid "no host ID found" when starting a fresh node 32 | log4j.logger.org.apache.cassandra.db.SystemKeyspace=ERROR 33 | 34 | # If running spark local, ignore block input exists warnings, which are expected. 35 | log4j.logger.org.apache.spark.storage.BlockManager=ERROR 36 | log4j.logger.com.datastax.spark.connector=INFO 37 | log4j.logger.org.apache.spark=WARN 38 | log4j.logger.com.datastax.driver.core=WARN -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/ColumnRef.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector 2 | 3 | import scala.language.implicitConversions 4 | 5 | /** Unambiguous reference to a column in the query result set row. */ 6 | sealed trait ColumnRef 7 | 8 | sealed trait NamedColumnRef extends ColumnRef { 9 | /** Returns the column name which this selection bases on. 
In case of a function, such as `ttl` or 10 | * `writetime`, it returns the column name passed to that function. */ 11 | def columnName: String 12 | 13 | /** Returns a CQL phrase which has to be passed to the `SELECT` clause with appropriate quotation 14 | * marks. */ 15 | def cql: String 16 | 17 | /** Returns a name of the selection as it is seen in the result set. Most likely this is going to be 18 | * used when providing custom column name to field name mapping. */ 19 | def selectedAs: String 20 | } 21 | 22 | object NamedColumnRef { 23 | def unapply(columnRef: NamedColumnRef) = Some((columnRef.columnName, columnRef.selectedAs)) 24 | } 25 | 26 | /** References a column by name. */ 27 | case class ColumnName(columnName: String) extends NamedColumnRef { 28 | val cql = s""""$columnName"""" 29 | val selectedAs = columnName 30 | 31 | override def toString: String = selectedAs 32 | } 33 | 34 | case class TTL(columnName: String) extends NamedColumnRef { 35 | val cql = s"""TTL("$columnName")""" 36 | val selectedAs = s"ttl($columnName)" 37 | 38 | override def toString: String = selectedAs 39 | } 40 | 41 | case class WriteTime(columnName: String) extends NamedColumnRef { 42 | val cql = s"""WRITETIME("$columnName")""" 43 | val selectedAs = s"writetime($columnName)" 44 | 45 | override def toString: String = selectedAs 46 | } 47 | 48 | /** References a column by its index in the row. Useful for tuples. */ 49 | case class ColumnIndex(columnIndex: Int) extends ColumnRef 50 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/reader/KeyValueRowReader.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd.reader 2 | 3 | import com.datastax.driver.core.{ProtocolVersion, Row} 4 | import com.datastax.spark.connector.cql.TableDef 5 | 6 | class KeyValueRowReaderFactory[K, V](keyRRF: RowReaderFactory[K], valueRRF: RowReaderFactory[V]) 7 | extends RowReaderFactory[(K, V)] { 8 | 9 | override def rowReader(table: TableDef, options: RowReaderOptions): RowReader[(K, V)] = { 10 | val keyReader = keyRRF.rowReader(table, options) 11 | val valueReaderOptions = options.copy(offset = options.offset + keyReader.consumedColumns.getOrElse(0)) 12 | val valueReader = valueRRF.rowReader(table, valueReaderOptions) 13 | new KeyValueRowReader(keyReader, valueReader) 14 | } 15 | 16 | override def targetClass: Class[(K, V)] = classOf[(K, V)] 17 | } 18 | 19 | class KeyValueRowReader[K, V](keyReader: RowReader[K], valueReader: RowReader[V]) extends RowReader[(K, V)] { 20 | 21 | override def requiredColumns: Option[Int] = 22 | (for (keyCnt <- keyReader.requiredColumns; valueCnt <- valueReader.requiredColumns) yield keyCnt max valueCnt) 23 | .orElse(keyReader.requiredColumns).orElse(valueReader.requiredColumns) 24 | 25 | override def columnNames: Option[Seq[String]] = 26 | (for (keyNames <- keyReader.columnNames; valueNames <- valueReader.columnNames) yield keyNames ++ valueNames) 27 | .orElse(keyReader.columnNames).orElse(valueReader.columnNames) 28 | 29 | override def read(row: Row, columnNames: Array[String], protocolVersion: ProtocolVersion): (K, V) = { 30 | (keyReader.read(row, columnNames, protocolVersion), valueReader.read(row, columnNames, protocolVersion)) 31 | } 32 | 33 | override def consumedColumns: Option[Int] = 34 | for (keySkip <- keyReader.consumedColumns; valueSkip <- valueReader.consumedColumns) 35 | yield keySkip + valueSkip 36 | } 37 | 
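The `KeyValueRowReaderFactory` above offsets the value reader past the columns already consumed by the key reader. Below is a minimal sketch of the resulting composition, assuming the implicit `TypeConverter` instances for `String` and `Int` from the `TypeConverter` companion object are in scope (the reader classes are the ones defined in this file and in `ValueRowReader.scala` further below):

```scala
import com.datastax.spark.connector.ColumnIndex
import com.datastax.spark.connector.rdd.reader.{KeyValueRowReader, ValueRowReader}

// The key is read from the first selected column, the value from the second one.
val keyReader   = new ValueRowReader[String](ColumnIndex(0))
val valueReader = new ValueRowReader[Int](ColumnIndex(1))
val pairReader  = new KeyValueRowReader(keyReader, valueReader)

// Each single-column reader reports consumedColumns == Some(1), so the composed
// reader reports Some(2) and every read(...) call yields a (String, Int) pair.
```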
-------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/mapper/TupleColumnMapperTest.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.mapper 2 | 3 | import com.datastax.spark.connector.ColumnIndex 4 | import com.datastax.spark.connector.cql.{TableDef, RegularColumn, ColumnDef} 5 | import com.datastax.spark.connector.types.IntType 6 | import org.apache.commons.lang3.SerializationUtils 7 | import org.junit.Assert._ 8 | import org.junit.Test 9 | 10 | class TupleColumnMapperTest { 11 | 12 | private val c1 = ColumnDef("test", "table", "column1", RegularColumn, IntType) 13 | private val c2 = ColumnDef("test", "table", "column2", RegularColumn, IntType) 14 | private val c3 = ColumnDef("test", "table", "column3", RegularColumn, IntType) 15 | private val tableDef = TableDef("test", "table", Seq(c1), Seq(c2), Seq(c3)) 16 | 17 | @Test 18 | def testGetters() { 19 | val columnMap = new TupleColumnMapper[(Int, String, Boolean)].columnMap(tableDef) 20 | val getters = columnMap.getters 21 | assertEquals(ColumnIndex(0), getters("_1")) 22 | assertEquals(ColumnIndex(1), getters("_2")) 23 | assertEquals(ColumnIndex(2), getters("_3")) 24 | } 25 | 26 | @Test 27 | def testConstructor() { 28 | val columnMap = new TupleColumnMapper[(Int, String, Boolean)].columnMap(tableDef) 29 | assertEquals(Seq(ColumnIndex(0), ColumnIndex(1), ColumnIndex(2)), columnMap.constructor) 30 | } 31 | 32 | @Test 33 | def testSerialize() { 34 | val columnMap = new TupleColumnMapper[(Int, String, Boolean)].columnMap(tableDef) 35 | SerializationUtils.roundtrip(columnMap) 36 | } 37 | 38 | @Test 39 | def testImplicit() { 40 | val columnMap = implicitly[ColumnMapper[(Int, String, Boolean)]].columnMap(tableDef) 41 | val getters = columnMap.getters 42 | assertEquals(ColumnIndex(0), getters("_1")) 43 | assertEquals(ColumnIndex(1), getters("_2")) 44 | assertEquals(ColumnIndex(2), getters("_3")) 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/util/ReflectionUtilSpec.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.util 2 | 3 | import com.datastax.spark.connector.cql.{CassandraConnectorConf, DefaultConnectionFactory, CassandraConnectionFactory} 4 | import org.scalatest.{FlatSpec, Matchers} 5 | 6 | class ReflectionUtilSpec extends FlatSpec with Matchers { 7 | 8 | "ReflectionUtil.findGlobalObject" should "be able to find DefaultConnectionFactory" in { 9 | val factory = ReflectionUtil.findGlobalObject[CassandraConnectionFactory]( 10 | "com.datastax.spark.connector.cql.DefaultConnectionFactory") 11 | factory should be(DefaultConnectionFactory) 12 | } 13 | 14 | it should "be able to instantiate a singleton object based on Java class name" in { 15 | val obj = ReflectionUtil.findGlobalObject[String]("java.lang.String") 16 | obj should be ("") 17 | } 18 | 19 | it should "cache Java class instances" in { 20 | val obj1 = ReflectionUtil.findGlobalObject[String]("java.lang.String") 21 | val obj2 = ReflectionUtil.findGlobalObject[String]("java.lang.String") 22 | obj1 shouldBe theSameInstanceAs (obj2) 23 | } 24 | 25 | it should "throw IllegalArgumentException when asked for a Scala object of wrong type" in { 26 | intercept[IllegalArgumentException] { 27 | 
ReflectionUtil.findGlobalObject[CassandraConnectorConf]( 28 | "com.datastax.spark.connector.cql.DefaultConnectionFactory") 29 | } 30 | } 31 | 32 | it should "throw IllegalArgumentException when asked for class instance of wrong type" in { 33 | intercept[IllegalArgumentException] { 34 | ReflectionUtil.findGlobalObject[Integer]("java.lang.String") 35 | } 36 | } 37 | 38 | it should "throw IllegalArgumentException when object does not exist" in { 39 | intercept[IllegalArgumentException] { 40 | ReflectionUtil.findGlobalObject[CassandraConnectorConf]("NoSuchObject") 41 | } 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/partitioner/dht/TokenFactory.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd.partitioner.dht 2 | 3 | import scala.language.existentials 4 | 5 | trait TokenFactory[V, T <: Token[V]] { 6 | def minToken: T 7 | def maxToken: T 8 | def totalTokenCount: BigInt 9 | def fromString(string: String): T 10 | def toString(token: T): String 11 | } 12 | 13 | object TokenFactory { 14 | 15 | type V = t forSome { type t } 16 | type T = t forSome { type t <: Token[V] } 17 | 18 | implicit object Murmur3TokenFactory extends TokenFactory[Long, LongToken] { 19 | override val minToken = LongToken(Long.MinValue) 20 | override val maxToken = LongToken(Long.MaxValue) 21 | override val totalTokenCount = BigInt(maxToken.value) - BigInt(minToken.value) 22 | override def fromString(string: String) = LongToken(string.toLong) 23 | override def toString(token: LongToken) = token.value.toString 24 | } 25 | 26 | implicit object RandomPartitionerTokenFactory extends TokenFactory[BigInt, BigIntToken] { 27 | override val minToken = BigIntToken(-1) 28 | override val maxToken = BigIntToken(BigInt(2).pow(127)) 29 | override val totalTokenCount = maxToken.value - minToken.value 30 | override def fromString(string: String) = BigIntToken(BigInt(string)) 31 | override def toString(token: BigIntToken) = token.value.toString() 32 | } 33 | 34 | def forCassandraPartitioner(partitionerClassName: String): TokenFactory[V, T] = { 35 | val partitioner = 36 | partitionerClassName match { 37 | case "org.apache.cassandra.dht.Murmur3Partitioner" => Murmur3TokenFactory 38 | case "org.apache.cassandra.dht.RandomPartitioner" => RandomPartitionerTokenFactory 39 | case _ => throw new IllegalArgumentException(s"Unsupported partitioner: $partitionerClassName") 40 | } 41 | partitioner.asInstanceOf[TokenFactory[V, T]] 42 | } 43 | } 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/util/MagicalTypeTricks.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.util 2 | 3 | object MagicalTypeTricks { 4 | 5 | trait DoesntHaveImplicit[A, B] 6 | implicit def doesntHaveImplicit[A, B]: A DoesntHaveImplicit B = null 7 | implicit def doesntHaveImplicitAmbiguity1[A, B](implicit ev: B): A DoesntHaveImplicit B = null 8 | implicit def doesntHaveImplicitAmbiguity2[A, B](implicit ev: B): A DoesntHaveImplicit B = null 9 | 10 | trait IsNotEqualTo[A, B] 11 | implicit def neq[A, B]: A IsNotEqualTo B = null 12 | implicit def neqAmbiguity1[A]: A IsNotEqualTo A = null 13 | implicit def neqAmbiguity2[A]: A IsNotEqualTo A = null 14 | 15 | trait 
IsNotSubclassOf[A, B] 16 | implicit def nsub[A, B]: A IsNotSubclassOf B = null 17 | implicit def nsubAmbiguity1[A, B >: A]: A IsNotSubclassOf B = null 18 | implicit def nsubAmbiguity2[A, B >: A]: A IsNotSubclassOf B = null 19 | 20 | type ¬[A] = A => Nothing 21 | type λ[A] = ¬[¬[A]] 22 | 23 | /** 24 | * Example of how disjunction can be used: 25 | * {{{ 26 | * scala> import com.datastax.spark.connector.util.MagicalTypeTricks._ 27 | * import com.datastax.spark.connector.util.MagicalTypeTricks._ 28 | * 29 | * scala> def function[T](t: T)(implicit ev: (λ[T] <:< (Int ∪ String))) = { println(s"t = $t") } 30 | * function: [T](t: T)(implicit ev: <:<[(T => Nothing) => Nothing,Int => Nothing with String => Nothing => Nothing])Unit 31 | * 32 | * scala> function(5) 33 | * t = 5 34 | * 35 | * scala> function("five") 36 | * t = five 37 | * 38 | * scala> function(5d) 39 | * :13: error: Cannot prove that (Double => Nothing) => Nothing <:< Int => Nothing with String => Nothing => Nothing. 40 | * function(5d) 41 | * ^ 42 | * }}} 43 | * 44 | * Based on [[http://www.chuusai.com/2011/06/09/scala-union-types-curry-howard/ this article]]. 45 | */ 46 | type ∪[T, U] = ¬[¬[T] with ¬[U]] 47 | 48 | } 49 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/ObjectSizeEstimator.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.writer 2 | 3 | import java.io.{OutputStream, ObjectOutputStream} 4 | import java.nio.ByteBuffer 5 | 6 | import scala.collection.JavaConversions._ 7 | 8 | import org.apache.cassandra.utils.ByteBufferUtil 9 | 10 | 11 | /** Estimates amount of memory required to serialize Java/Scala objects */ 12 | object ObjectSizeEstimator { 13 | 14 | private def makeSerializable(obj: Any): AnyRef = { 15 | obj match { 16 | case bb: ByteBuffer => ByteBufferUtil.getArray(bb) 17 | case list: java.util.List[_] => list.map(makeSerializable) 18 | case list: List[_] => list.map(makeSerializable) 19 | case set: java.util.Set[_] => set.map(makeSerializable) 20 | case set: Set[_] => set.map(makeSerializable) 21 | case map: java.util.Map[_, _] => map.map { case (k, v) => (makeSerializable(k), makeSerializable(v)) } 22 | case map: Map[_, _] => map.map { case (k, v) => (makeSerializable(k), makeSerializable(v)) } 23 | case other => other.asInstanceOf[AnyRef] 24 | } 25 | } 26 | 27 | /** Records only how many bytes were written but the actual data is discarded */ 28 | private class CountingOutputStream extends OutputStream { 29 | private var _length = 0 30 | override def write(b: Int) = _length += 1 31 | override def write(b: Array[Byte]) = _length += b.length 32 | override def write(b: Array[Byte], off: Int, len: Int) = _length += len 33 | def length = _length 34 | } 35 | 36 | /** Serializes passed objects and reports their total size */ 37 | def measureSerializedSize(objects: Seq[Any]): Int = { 38 | val countingStream = new CountingOutputStream 39 | val objectStream = new ObjectOutputStream(countingStream) 40 | for (obj <- objects) 41 | objectStream.writeObject(makeSerializable(obj)) 42 | objectStream.close() 43 | countingStream.length 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/cql/PreparedStatementCache.scala: -------------------------------------------------------------------------------- 1 | package 
com.datastax.spark.connector.cql 2 | 3 | import com.datastax.driver.core.{RegularStatement, Session, Cluster, PreparedStatement} 4 | import com.datastax.spark.connector.util.Logging 5 | 6 | import scala.collection.concurrent.TrieMap 7 | 8 | /** Caches prepared statements so they are not prepared 9 | * multiple times by different threads. */ 10 | object PreparedStatementCache extends Logging { 11 | 12 | private val clusterCache = 13 | TrieMap[Cluster, TrieMap[String, PreparedStatement]]() 14 | 15 | private def get(cluster: Cluster, query: String): Option[PreparedStatement] = 16 | for (statementCache <- clusterCache.get(cluster); 17 | statement <- statementCache.get(query)) yield statement 18 | 19 | private def put(cluster: Cluster, query: String, statement: PreparedStatement): PreparedStatement = { 20 | clusterCache.get(cluster) match { 21 | case Some(statementCache) => statementCache.put(query, statement) 22 | case None => clusterCache.put(cluster, TrieMap(query -> statement)) 23 | } 24 | statement 25 | } 26 | 27 | /** Removes all statements associated with the `Cluster` from the cache. */ 28 | def remove(cluster: Cluster) { 29 | synchronized { 30 | clusterCache.remove(cluster) 31 | } 32 | } 33 | 34 | /** Retrieves a `PreparedStatement` from cache or 35 | * creates a new one if not found and updates the cache. */ 36 | def prepareStatement(session: Session, query: RegularStatement): PreparedStatement = { 37 | val cluster = session.getCluster 38 | get(cluster, query.toString) match { 39 | case Some(stmt) => stmt 40 | case None => 41 | synchronized { 42 | get(cluster, query.toString) match { 43 | case Some(stmt) => stmt 44 | case None => 45 | val stmt = session.prepare(query) 46 | put(cluster, query.toString, stmt) 47 | } 48 | } 49 | } 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /doc/3_selection.md: -------------------------------------------------------------------------------- 1 | # Documentation 2 | ## Server-side data selection and filtering 3 | 4 | In this section, you'll learn how to reduce the amount of data transferred from Cassandra to Spark 5 | to speed up processing. 6 | 7 | ### Selecting a subset of columns 8 | 9 | For performance reasons, you should not fetch columns you don't need. You can achieve this with the `select` method. 10 | 11 | ```scala 12 | sc.cassandraTable("test", "users").select("username").toArray.foreach(println) 13 | // CassandraRow{username: noemail} 14 | // CassandraRow{username: someone} 15 | ``` 16 | 17 | The `select` method can be chained. Every next call can be used to select a subset of columns already selected. 18 | Selecting a non-existing column would result in throwing an exception. 19 | 20 | ### Filtering rows 21 | 22 | To filter rows, you can use the filter transformation provided by Spark. 23 | However, this approach causes all rows to be fetched from Cassandra and then filtered by Spark. 24 | Also, some CPU cycles are wasted serializing and deserializing objects that wouldn't be 25 | included in the result. To avoid this overhead, `CassandraRDD` offers the `where` method, which lets you pass 26 | arbitrary CQL condition(s) to filter the row set on the server. 
27 | 28 | ```scala 29 | sc.cassandraTable("test", "cars").select("id", "model").where("color = ?", "black").toArray.foreach(println) 30 | // CassandraRow[id: KF-334L, model: Ford Mondeo] 31 | // CassandraRow[id: MT-8787, model: Hyundai x35] 32 | 33 | sc.cassandraTable("test", "cars").select("id", "model").where("color = ?", "silver").toArray.foreach(println) 34 | // CassandraRow[id: WX-2234, model: Toyota Yaris] 35 | ``` 36 | 37 | Note: Although the `ALLOW FILTERING` clause is implicitly added to the generated CQL query, not all predicates 38 | are currently allowed by the Cassandra engine. This limitation is going to be addressed in the future 39 | Cassandra releases. Currently, `ALLOW FILTERING` works well 40 | with columns indexed by secondary indexes or clustering columns. 41 | 42 | 43 | [Next - Working with user-defined case classes and tuples](4_mapper.md) -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/reader/ValueRowReader.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd.reader 2 | 3 | import com.datastax.driver.core.{ProtocolVersion, Row} 4 | import com.datastax.spark.connector._ 5 | import com.datastax.spark.connector.cql.TableDef 6 | import com.datastax.spark.connector.types.TypeConverter 7 | import com.datastax.spark.connector.util.JavaApiHelper 8 | 9 | class ValueRowReader[T: TypeConverter](columnRef: ColumnRef) extends RowReader[T] { 10 | 11 | private val converter = implicitly[TypeConverter[T]] 12 | 13 | /** Reads column values from low-level `Row` and turns them into higher level representation. 14 | * @param row row fetched from Cassandra 15 | * @param columnNames column names available in the `row` */ 16 | override def read(row: Row, columnNames: Array[String], protocolVersion: ProtocolVersion): T = { 17 | columnRef match { 18 | case ColumnIndex(idx) => converter.convert(AbstractRow.get(row, idx, protocolVersion)) 19 | case NamedColumnRef(_, selectedAs) => converter.convert(AbstractRow.get(row, selectedAs, protocolVersion)) 20 | } 21 | } 22 | 23 | /** List of columns this `RowReader` is going to read. 24 | * Useful to avoid fetching the columns that are not needed. */ 25 | override def columnNames: Option[Seq[String]] = columnRef match { 26 | case NamedColumnRef(_, selectedAs) => Some(Seq(selectedAs)) 27 | case _ => None 28 | } 29 | 30 | /** The number of columns that need to be fetched from C*. 
*/ 31 | override def requiredColumns: Option[Int] = columnRef match { 32 | case ColumnIndex(idx) => Some(idx) 33 | case _ => None 34 | } 35 | 36 | override def consumedColumns: Option[Int] = Some(1) 37 | } 38 | 39 | class ValueRowReaderFactory[T: TypeConverter] 40 | extends RowReaderFactory[T] { 41 | 42 | override def rowReader(table: TableDef, options: RowReaderOptions): RowReader[T] = { 43 | new ValueRowReader[T](ColumnIndex(options.offset)) 44 | } 45 | 46 | override def targetClass: Class[T] = JavaApiHelper.getRuntimeClass(implicitly[TypeConverter[T]].targetTypeTag) 47 | } 48 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/WriteOption.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.writer 2 | 3 | import java.util.Date 4 | 5 | import org.apache.spark.streaming.{Duration => SparkDuration} 6 | import org.joda.time.{DateTime, Duration => JodaDuration} 7 | 8 | import scala.concurrent.duration.{Duration => ScalaDuration} 9 | 10 | sealed trait WriteOption[+T] 11 | 12 | sealed trait TTLOption extends WriteOption[Int] 13 | 14 | sealed trait TimestampOption extends WriteOption[Long] 15 | 16 | case class StaticWriteOption[T](value: T) extends WriteOption[T] 17 | 18 | case class PerRowWriteOption[T](placeholder: String) extends WriteOption[T] 19 | 20 | object TTLOption { 21 | 22 | case object auto extends TTLOption 23 | 24 | def forever: TTLOption = new StaticWriteOption[Int](0) with TTLOption 25 | 26 | def constant(ttl: Int): TTLOption = { 27 | require(ttl > 0, "Explicitly specified TTL must be greater than zero.") 28 | new StaticWriteOption[Int](ttl) with TTLOption 29 | } 30 | 31 | def constant(ttl: SparkDuration): TTLOption = constant((ttl.milliseconds / 1000L).toInt) 32 | 33 | def constant(ttl: JodaDuration): TTLOption = constant(ttl.getStandardSeconds.toInt) 34 | 35 | def constant(ttl: ScalaDuration): TTLOption = if (ttl.isFinite()) constant(ttl.toSeconds.toInt) else forever 36 | 37 | def perRow(placeholder: String): TTLOption = 38 | new PerRowWriteOption[Int](placeholder) with TTLOption 39 | 40 | } 41 | 42 | object TimestampOption { 43 | 44 | case object auto extends TimestampOption 45 | 46 | def constant(microseconds: Long): TimestampOption = { 47 | require(microseconds > 0, "Explicitly specified time must be greater than zero.") 48 | new StaticWriteOption[Long](microseconds) with TimestampOption 49 | } 50 | 51 | def constant(timestamp: Date): TimestampOption = constant(timestamp.getTime * 1000L) 52 | 53 | def constant(timestamp: DateTime): TimestampOption = constant(timestamp.getMillis * 1000L) 54 | 55 | def perRow(placeholder: String): TimestampOption = 56 | new PerRowWriteOption[Long](placeholder) with TimestampOption 57 | } -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/mapper/JavaBeanColumnMapper.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.mapper 2 | 3 | import java.lang.reflect.Method 4 | 5 | import com.datastax.spark.connector.cql.TableDef 6 | 7 | import scala.reflect.ClassTag 8 | 9 | class JavaBeanColumnMapper[T : ClassTag](columnNameOverride: Map[String, String] = Map.empty) extends ReflectionColumnMapper[T] { 10 | 11 | import com.datastax.spark.connector.mapper.JavaBeanColumnMapper._ 12 | 13 | override 
def classTag: ClassTag[T] = implicitly[ClassTag[T]] 14 | 15 | private def propertyName(accessorName: String) = { 16 | val AccessorRegex(_, strippedName) = accessorName 17 | strippedName(0).toLower + strippedName.substring(1) 18 | } 19 | 20 | override protected def isGetter(method: Method): Boolean = 21 | GetterRegex.findFirstMatchIn(method.getName).isDefined && 22 | method.getParameterTypes.size == 0 && 23 | method.getReturnType != Void.TYPE 24 | 25 | override protected def isSetter(method: Method): Boolean = 26 | SetterRegex.findFirstMatchIn(method.getName).isDefined && 27 | method.getParameterTypes.size == 1 && 28 | method.getReturnType == Void.TYPE 29 | 30 | override protected def getterToColumnName(getterName: String, tableDef: TableDef) = { 31 | val p = propertyName(getterName) 32 | columnNameOverride.getOrElse(p, columnNameForProperty(p, tableDef)) 33 | } 34 | 35 | override protected def setterToColumnName(setterName: String, tableDef: TableDef) = { 36 | val p = propertyName(setterName) 37 | columnNameOverride.getOrElse(p, columnNameForProperty(p, tableDef)) 38 | } 39 | 40 | override protected def constructorParamToColumnName(paramName: String, tableDef: TableDef) = { 41 | columnNameOverride.getOrElse(paramName, columnNameForProperty(paramName, tableDef)) 42 | } 43 | 44 | /** Java Beans allow nulls in property values */ 45 | override protected def allowsNull = true 46 | } 47 | 48 | object JavaBeanColumnMapper { 49 | val GetterRegex = "^(get|is)(.+)$".r 50 | val SetterRegex = "^(set)(.+)$".r 51 | val AccessorRegex = "^(get|is|set)(.+)$".r 52 | } -------------------------------------------------------------------------------- /spark-cassandra-connector-java/src/main/java/com/datastax/spark/connector/japi/GenericJavaRowReaderFactory.java: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.japi; 2 | 3 | import com.datastax.driver.core.ProtocolVersion; 4 | import com.datastax.driver.core.Row; 5 | import com.datastax.spark.connector.cql.TableDef; 6 | import com.datastax.spark.connector.rdd.reader.RowReader; 7 | import com.datastax.spark.connector.rdd.reader.RowReaderFactory; 8 | import com.datastax.spark.connector.rdd.reader.RowReaderOptions; 9 | import scala.Option; 10 | import scala.collection.Seq; 11 | 12 | public class GenericJavaRowReaderFactory { 13 | public final static RowReaderFactory instance = new RowReaderFactory() { 14 | @Override 15 | public RowReader rowReader(TableDef table, RowReaderOptions options) { 16 | return JavaRowReader.instance; 17 | } 18 | 19 | @Override 20 | public RowReaderOptions rowReader$default$2() { 21 | return new RowReaderOptions(RowReaderOptions.apply$default$1()); 22 | } 23 | 24 | @Override 25 | public Class targetClass() { 26 | return CassandraRow.class; 27 | } 28 | }; 29 | 30 | 31 | public static class JavaRowReader implements RowReader { 32 | public final static JavaRowReader instance = new JavaRowReader(); 33 | 34 | private JavaRowReader() { 35 | } 36 | 37 | @Override 38 | public CassandraRow read(Row row, String[] columnNames, ProtocolVersion protocolVersion) { 39 | assert row.getColumnDefinitions().size() == columnNames.length : 40 | "Number of columns in a row must match the number of columns in the table metadata"; 41 | return CassandraRow$.MODULE$.fromJavaDriverRow(row, columnNames, protocolVersion); 42 | } 43 | 44 | @Override 45 | public Option> columnNames() { 46 | return Option.empty(); 47 | } 48 | 49 | @Override 50 | public Option requiredColumns() { 51 | return 
Option.empty(); 52 | } 53 | 54 | @Override 55 | public Option consumedColumns() { 56 | return Option.empty(); 57 | } 58 | } 59 | 60 | } 61 | -------------------------------------------------------------------------------- /sbt/sbt: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # This script launches sbt for this project. If present it uses the system 21 | # version of sbt. If there is no system version of sbt it attempts to download 22 | # sbt locally. 23 | SBT_VERSION=0.13.1 24 | URL1=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar 25 | URL2=http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar 26 | JAR=sbt/sbt-launch-${SBT_VERSION}.jar 27 | 28 | # Download sbt launch jar if it hasn't been downloaded yet 29 | if [ ! -f ${JAR} ]; then 30 | # Download 31 | printf "Attempting to fetch sbt\n" 32 | JAR_DL=${JAR}.part 33 | if hash curl 2>/dev/null; then 34 | (curl --progress-bar ${URL1} > ${JAR_DL} || curl --progress-bar ${URL2} > ${JAR_DL}) && mv ${JAR_DL} ${JAR} 35 | elif hash wget 2>/dev/null; then 36 | (wget --progress=bar ${URL1} -O ${JAR_DL} || wget --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR} 37 | else 38 | printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n" 39 | exit -1 40 | fi 41 | fi 42 | if [ ! -f ${JAR} ]; then 43 | # We failed to download 44 | printf "Our attempt to download sbt locally to ${JAR} failed. Please install sbt manually from http://www.scala-sbt.org/\n" 45 | exit -1 46 | fi 47 | printf "Launching sbt from ${JAR}\n" 48 | java \ 49 | -Xmx1200m -XX:MaxPermSize=350m -XX:ReservedCodeCacheSize=256m \ 50 | -jar ${JAR} \ 51 | "$@" 52 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/AsyncExecutor.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.writer 2 | 3 | import java.util.concurrent.Semaphore 4 | import java.util.concurrent.atomic.AtomicInteger 5 | 6 | import com.datastax.spark.connector.util.Logging 7 | import com.google.common.util.concurrent.{FutureCallback, Futures, ListenableFuture, SettableFuture} 8 | 9 | import scala.collection.concurrent.TrieMap 10 | import scala.util.Try 11 | 12 | /** Asynchronously executes tasks but blocks if the limit of unfinished tasks is reached. 
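*
* A usage sketch (`session`, `stmt` and `statements` below are illustrative names, not part of this class):
* {{{
*   val executor = new AsyncExecutor[Statement, ResultSet]((stmt: Statement) => session.executeAsync(stmt), 5)
*   statements.foreach(executor.executeAsync)
*   executor.waitForCurrentlyExecutingTasks()
*   assert(executor.failureCount == 0)
* }}}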
*/ 13 | class AsyncExecutor[T, R](asyncAction: T => ListenableFuture[R], maxConcurrentTasks: Int) extends Logging { 14 | 15 | private val _successCount = new AtomicInteger(0) 16 | private val _failureCount = new AtomicInteger(0) 17 | 18 | private val semaphore = new Semaphore(maxConcurrentTasks) 19 | private val pendingFutures = new TrieMap[ListenableFuture[R], Boolean] 20 | 21 | /** Executes task asynchronously or blocks if more than `maxConcurrentTasks` limit is reached */ 22 | def executeAsync(task: T): ListenableFuture[R] = { 23 | semaphore.acquire() 24 | 25 | val settable = SettableFuture.create[R]() 26 | pendingFutures.put(settable, true) 27 | 28 | val future = asyncAction(task) 29 | 30 | Futures.addCallback(future, new FutureCallback[R] { 31 | def release() { 32 | semaphore.release() 33 | pendingFutures.remove(settable) 34 | } 35 | def onSuccess(result: R) { 36 | _successCount.incrementAndGet() 37 | release() 38 | settable.set(result) 39 | } 40 | def onFailure(throwable: Throwable) { 41 | logError("Failed to execute: " + task, throwable) 42 | _failureCount.incrementAndGet() 43 | release() 44 | settable.setException(throwable) 45 | } 46 | }) 47 | 48 | settable 49 | } 50 | 51 | /** Waits until the tasks being currently executed get completed. 52 | * It will not wait for tasks scheduled for execution during this method call, 53 | * nor tasks for which the [[executeAsync]] method did not complete. */ 54 | def waitForCurrentlyExecutingTasks() { 55 | for ((future, _) <- pendingFutures.snapshot()) 56 | Try(future.get()) 57 | } 58 | 59 | def successCount = _successCount.get() 60 | def failureCount = _failureCount.get() 61 | 62 | } 63 | -------------------------------------------------------------------------------- /spark-cassandra-connector-embedded/src/main/scala/com/datastax/spark/connector/embedded/Assertions.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.embedded 2 | 3 | import scala.annotation.tailrec 4 | import scala.concurrent.duration._ 5 | 6 | /** 7 | * Simple helper assertions. Some stolen from Akka akka.testkit.TestKit.scala for now. 8 | */ 9 | trait Assertions { 10 | 11 | /** Obtain current time (`System.nanoTime`) as Duration. */ 12 | def now: FiniteDuration = System.nanoTime.nanos 13 | 14 | private var end: Duration = Duration.Undefined 15 | 16 | /** 17 | * Obtain time remaining for execution of the innermost enclosing `within` 18 | * block or missing that it returns the properly dilated default for this 19 | * case from settings (key "akka.test.single-expect-default"). 20 | */ 21 | def remainingOrDefault = remainingOr(1.seconds.dilated) 22 | 23 | /** 24 | * Obtain time remaining for execution of the innermost enclosing `within` 25 | * block or missing that it returns the given duration. 26 | */ 27 | def remainingOr(duration: FiniteDuration): FiniteDuration = end match { 28 | case x if x eq Duration.Undefined => duration 29 | case x if !x.isFinite => throw new IllegalArgumentException("`end` cannot be infinite") 30 | case f: FiniteDuration => f - now 31 | } 32 | 33 | /** 34 | * Await until the given condition evaluates to `true` or the timeout 35 | * expires, whichever comes first. 36 | * If no timeout is given, take it from the innermost enclosing `within` 37 | * block. 
38 | */ 39 | def awaitCond(p: => Boolean, max: Duration = 3.seconds, interval: Duration = 100.millis, message: String = "") { 40 | val _max = remainingOrDilated(max) 41 | val stop = now + _max 42 | 43 | @tailrec 44 | def poll(t: Duration) { 45 | if (!p) { 46 | assert(now < stop, s"timeout ${_max} expired: $message") 47 | Thread.sleep(t.toMillis) 48 | poll((stop - now) min interval) 49 | } 50 | } 51 | 52 | poll(_max min interval) 53 | } 54 | 55 | private def remainingOrDilated(max: Duration): FiniteDuration = max match { 56 | case x if x eq Duration.Undefined => remainingOrDefault 57 | case x if !x.isFinite => throw new IllegalArgumentException("max duration cannot be infinite") 58 | case f: FiniteDuration => f.dilated 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/writer/WriteConfTest.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.writer 2 | 3 | import com.datastax.driver.core.ConsistencyLevel 4 | import com.datastax.spark.connector.{RowsInBatch, BytesInBatch} 5 | import org.apache.spark.SparkConf 6 | import org.scalatest.{FlatSpec, Matchers} 7 | 8 | class WriteConfTest extends FlatSpec with Matchers { 9 | 10 | "WriteConf" should "be configured with proper defaults" in { 11 | val conf = new SparkConf(false) 12 | val writeConf = WriteConf.fromSparkConf(conf) 13 | 14 | writeConf.batchSize should be(BytesInBatch(WriteConf.DefaultBatchSizeInBytes)) 15 | writeConf.consistencyLevel should be(WriteConf.DefaultConsistencyLevel) 16 | writeConf.parallelismLevel should be(WriteConf.DefaultParallelismLevel) 17 | } 18 | 19 | it should "allow to set consistency level" in { 20 | val conf = new SparkConf(false) 21 | .set("spark.cassandra.output.consistency.level", "THREE") 22 | val writeConf = WriteConf.fromSparkConf(conf) 23 | 24 | writeConf.consistencyLevel should be(ConsistencyLevel.THREE) 25 | } 26 | 27 | it should "allow to set parallelism level" in { 28 | val conf = new SparkConf(false) 29 | .set("spark.cassandra.output.concurrent.writes", "17") 30 | val writeConf = WriteConf.fromSparkConf(conf) 31 | 32 | writeConf.parallelismLevel should be(17) 33 | } 34 | 35 | it should "allow to set batch size in bytes" in { 36 | val conf = new SparkConf(false) 37 | .set("spark.cassandra.output.batch.size.bytes", "12345") 38 | val writeConf = WriteConf.fromSparkConf(conf) 39 | 40 | writeConf.batchSize should be(BytesInBatch(12345)) 41 | } 42 | 43 | it should "allow to set batch size in bytes when rows are set to auto" in { 44 | val conf = new SparkConf(false) 45 | .set("spark.cassandra.output.batch.size.bytes", "12345") 46 | .set("spark.cassandra.output.batch.size.rows", "auto") 47 | val writeConf = WriteConf.fromSparkConf(conf) 48 | 49 | writeConf.batchSize should be(BytesInBatch(12345)) 50 | } 51 | 52 | it should "allow to set batch size in rows" in { 53 | val conf = new SparkConf(false) 54 | .set("spark.cassandra.output.batch.size.rows", "12345") 55 | val writeConf = WriteConf.fromSparkConf(conf) 56 | 57 | writeConf.batchSize should be(RowsInBatch(12345)) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/org/apache/spark/sql/cassandra/CassandraCatalog.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.cassandra 2 | 3 | import java.io.IOException 4 | 
import java.util.concurrent.TimeUnit 5 | 6 | import com.datastax.spark.connector.cql.{CassandraConnector, Schema} 7 | import com.google.common.cache.{CacheBuilder, CacheLoader} 8 | import org.apache.spark.sql.catalyst.analysis.Catalog 9 | import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Subquery} 10 | 11 | private[cassandra] class CassandraCatalog(cc: CassandraSQLContext) extends Catalog { 12 | 13 | val caseSensitive: Boolean = true 14 | 15 | val schemas = CacheBuilder.newBuilder 16 | .maximumSize(100) 17 | .expireAfterWrite(cc.conf.getLong("schema.expire.in.minutes", 10), TimeUnit.MINUTES) 18 | .build( 19 | new CacheLoader[String, Schema] { 20 | def load(cluster: String) : Schema = { 21 | Schema.fromCassandra(CassandraConnector(cc.conf)) 22 | } 23 | }) 24 | 25 | override def lookupRelation( 26 | databaseName: Option[String], 27 | tableName: String, 28 | alias: Option[String] = None): LogicalPlan = { 29 | 30 | lazy val defaultDatabase = databaseName.getOrElse(cc.getKeyspace) 31 | val defaultCluster = "default" 32 | val (cluster, database, table) = tableName.split("\\.") match { 33 | case Array(t) => (defaultCluster, defaultDatabase, t) 34 | case Array(d, t) => (defaultCluster, d, t) 35 | case Array(c, d, t) => (c, d, t) 36 | case _ => throw new IOException(s"Wrong table name: $tableName") 37 | } 38 | 39 | val schema = schemas.get(cluster) 40 | val keyspaceDef = schema.keyspaceByName.getOrElse(database, throw new IOException(s"Keyspace not found: $database")) 41 | val tableDef = keyspaceDef.tableByName.getOrElse(table, throw new IOException(s"Table not found: $database.$table")) 42 | val tableWithQualifiers = Subquery(table, CassandraRelation(tableDef, alias)(cc)) 43 | alias.map(a => Subquery(a, tableWithQualifiers)).getOrElse(tableWithQualifiers) 44 | } 45 | 46 | override def registerTable(databaseName: Option[String], tableName: String, plan: LogicalPlan): Unit = ??? 47 | 48 | override def unregisterTable(databaseName: Option[String], tableName: String): Unit = ??? 49 | 50 | override def unregisterAllTables(): Unit = ??? 51 | } 52 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/WritableToCassandra.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.writer 2 | 3 | import com.datastax.spark.connector.ColumnSelector 4 | import com.datastax.spark.connector.cql.CassandraConnector 5 | import org.apache.spark.SparkContext 6 | 7 | abstract class WritableToCassandra[T] { 8 | 9 | def sparkContext: SparkContext 10 | 11 | private[connector] lazy val connector = CassandraConnector(sparkContext.getConf) 12 | 13 | /** 14 | * Saves the data from `RDD` to a Cassandra table. 15 | * By default, it saves all properties that have corresponding Cassandra columns. 
16 | * 17 | * Example: 18 | * {{{ 19 | * CREATE KEYSPACE test WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1 }; 20 | * CREATE TABLE test.words(word VARCHAR PRIMARY KEY, count INT, other VARCHAR); 21 | * }}} 22 | * 23 | * {{{ 24 | * case class WordCount(word: String, count: Int, other: String) 25 | * val rdd = sc.parallelize(Seq(WordCount("foo", 5, "bar"))) 26 | * }}} 27 | * 28 | * By default, the underlying RDD class must provide data for all columns: 29 | * {{{ 30 | * rdd.saveToCassandra("test", "words") 31 | * }}} 32 | * 33 | * By default, writes are performed at ConsistencyLevel.ONE in order to leverage data-locality and minimize network traffic. 34 | * This write consistency level is controlled by the following property: 35 | * - spark.cassandra.output.consistency.level: consistency level for RDD writes, string matching the ConsistencyLevel enum name. 36 | * 37 | * @param keyspaceName the name of the Keyspace to use 38 | * @param tableName the name of the Table to use 39 | * @param columnNames The list of column names to save data to. 40 | * Uses only the unique column names, and you must select at least all primary key 41 | * columns. All other fields are discarded. Non-selected property/column names are left unchanged. 42 | * @param writeConf additional configuration object allowing to set consistency level, batch size, etc. 43 | */ 44 | def saveToCassandra(keyspaceName: String, 45 | tableName: String, 46 | columnNames: ColumnSelector, 47 | writeConf: WriteConf) 48 | (implicit connector: CassandraConnector, rwf: RowWriterFactory[T]) 49 | 50 | } 51 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/org/apache/spark/sql/cassandra/CassandraSQLRow.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.cassandra 2 | 3 | import com.datastax.driver.core.{Row, ProtocolVersion} 4 | import com.datastax.spark.connector.AbstractRow 5 | import com.datastax.spark.connector.rdd.reader.{ThisRowReaderAsFactory, RowReader} 6 | import com.datastax.spark.connector.types.TypeConverter 7 | import org.apache.spark.sql.catalyst.expressions.{Row => SparkRow} 8 | 9 | final class CassandraSQLRow(data: IndexedSeq[AnyRef], columnNames: IndexedSeq[String]) 10 | extends AbstractRow(data, columnNames) with SparkRow with Serializable { 11 | 12 | private[spark] def this() = this(null, null) // required by Kryo for deserialization :( 13 | 14 | 15 | /** Generic getter for getting columns of any type. 16 | * Looks the column up by its index. First column starts at index 0. 
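* For example, the `getInt(0)` and `getString(1)` overrides below simply delegate to `get[Int](0)` and
* `get[String](1)`, letting the implicit `TypeConverter` in scope perform the conversion.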
*/ 17 | private def get[T](index: Int)(implicit c: TypeConverter[T]): T = 18 | c.convert(data(index)) 19 | 20 | override def apply(i: Int) = data(i) 21 | override def copy() = this // immutable 22 | override def size = super.size 23 | 24 | override def getDouble(i: Int) = get[Double](i) 25 | override def getFloat(i: Int) = get[Float](i) 26 | override def getLong(i: Int) = get[Long](i) 27 | override def getByte(i: Int) = get[Byte](i) 28 | override def getBoolean(i: Int) = get[Boolean](i) 29 | override def getShort(i: Int) = get[Short](i) 30 | override def getInt(i: Int) = get[Int](i) 31 | override def getString(i: Int) = get[String](i) 32 | override def iterator = data.iterator 33 | } 34 | 35 | 36 | object CassandraSQLRow { 37 | 38 | def fromJavaDriverRow(row: Row, columnNames: Array[String], protocolVersion: ProtocolVersion): CassandraSQLRow = { 39 | val data = new Array[Object](columnNames.length) 40 | for (i <- 0 until columnNames.length) 41 | data(i) = AbstractRow.get(row, i, protocolVersion) 42 | new CassandraSQLRow(data, columnNames) 43 | } 44 | 45 | implicit object CassandraSQLRowReader extends RowReader[CassandraSQLRow] with ThisRowReaderAsFactory[CassandraSQLRow] { 46 | 47 | override def read(row: Row, columnNames: Array[String], protocolVersion: ProtocolVersion): CassandraSQLRow = 48 | fromJavaDriverRow(row, columnNames, protocolVersion) 49 | 50 | override def requiredColumns = None 51 | override def columnNames = None 52 | override def targetClass = classOf[CassandraSQLRow] 53 | } 54 | } -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/util/ReflectionUtil.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.util 2 | 3 | import scala.collection.concurrent.TrieMap 4 | import scala.reflect.runtime.universe._ 5 | import scala.util.{Try, Success, Failure} 6 | 7 | object ReflectionUtil { 8 | private val rm = runtimeMirror(getClass.getClassLoader) 9 | private val singletonCache = TrieMap[String, Any]() 10 | 11 | private def findScalaObject[T : TypeTag](objectName: String): Try[T] = { 12 | Try { 13 | val targetType = implicitly[TypeTag[T]].tpe 14 | val module = rm.staticModule(objectName) 15 | if (!(module.typeSignature <:< targetType)) 16 | throw new IllegalArgumentException(s"Object $objectName is not instance of $targetType") 17 | 18 | val moduleMirror = rm.reflectModule(module) 19 | moduleMirror.instance.asInstanceOf[T] 20 | } 21 | } 22 | 23 | private def findSingletonClassInstance[T : TypeTag](className: String): Try[T] = { 24 | Try { 25 | val targetType = implicitly[TypeTag[T]].tpe 26 | val targetClass = rm.runtimeClass(targetType.typeSymbol.asClass) 27 | val instance = 28 | singletonCache.get(className) match { 29 | case Some(obj) => obj 30 | case None => 31 | val newInstance = Class.forName(className).getConstructor(Array.empty[Class[_]]: _*).newInstance() 32 | singletonCache.putIfAbsent(className, newInstance) match { 33 | case None => newInstance 34 | case Some(previousInstance) => previousInstance 35 | } 36 | } 37 | 38 | if (!targetClass.isInstance(instance)) 39 | throw new IllegalArgumentException(s"Class $className is not $targetType") 40 | instance.asInstanceOf[T] 41 | } 42 | } 43 | 44 | /** Returns either a global Scala object by its fully qualified name or a singleton 45 | * instance of a Java class identified by its fully qualified class name. 46 | * Java class instances are cached. 
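* For example, repeated calls to `findGlobalObject[String]("java.lang.String")` return the same cached (empty)
* `String` instance, as exercised by `ReflectionUtilSpec` above.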
The Java class must provide a default constructor. */ 47 | def findGlobalObject[T : TypeTag](objectName: String): T = { 48 | val scalaObject: Try[T] = findScalaObject[T](objectName) 49 | val classInstance: Try[T] = findSingletonClassInstance[T](objectName) 50 | scalaObject orElse classInstance match { 51 | case Success(obj) => obj 52 | case Failure(e) => throw new IllegalArgumentException(s"Singleton object not available: $objectName", e) 53 | } 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/cql/CassandraConnectorConf.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.cql 2 | 3 | import java.net.InetAddress 4 | 5 | import com.datastax.spark.connector.util.Logging 6 | import org.apache.spark.SparkConf 7 | import scala.util.control.NonFatal 8 | 9 | /** Stores configuration of a connection to Cassandra. 10 | * Provides information about cluster nodes, ports and optional credentials for authentication. */ 11 | case class CassandraConnectorConf( 12 | hosts: Set[InetAddress], 13 | nativePort: Int = CassandraConnectorConf.DefaultNativePort, 14 | rpcPort: Int = CassandraConnectorConf.DefaultRpcPort, 15 | authConf: AuthConf = NoAuthConf, 16 | connectionFactory: CassandraConnectionFactory = DefaultConnectionFactory) 17 | 18 | /** A factory for `CassandraConnectorConf` objects. 19 | * Allows for manually setting connection properties or reading them from `SparkConf` object. 20 | * By embedding connection information in `SparkConf`, `SparkContext` can offer Cassandra specific methods 21 | * which require establishing connections to a Cassandra cluster.*/ 22 | object CassandraConnectorConf extends Logging { 23 | 24 | val DefaultRpcPort = 9160 25 | val DefaultNativePort = 9042 26 | 27 | val CassandraConnectionHostProperty = "spark.cassandra.connection.host" 28 | val CassandraConnectionRpcPortProperty = "spark.cassandra.connection.rpc.port" 29 | val CassandraConnectionNativePortProperty = "spark.cassandra.connection.native.port" 30 | 31 | private def resolveHost(hostName: String): Option[InetAddress] = { 32 | try Some(InetAddress.getByName(hostName)) 33 | catch { 34 | case NonFatal(e) => 35 | logError(s"Unknown host '$hostName'", e) 36 | None 37 | } 38 | } 39 | 40 | def apply(conf: SparkConf): CassandraConnectorConf = { 41 | val hostsStr = conf.get(CassandraConnectionHostProperty, InetAddress.getLocalHost.getHostAddress) 42 | val hosts = for { 43 | hostName <- hostsStr.split(",").toSet[String] 44 | hostAddress <- resolveHost(hostName) 45 | } yield hostAddress 46 | 47 | val rpcPort = conf.getInt(CassandraConnectionRpcPortProperty, DefaultRpcPort) 48 | val nativePort = conf.getInt(CassandraConnectionNativePortProperty, DefaultNativePort) 49 | val authConf = AuthConf.fromSparkConf(conf) 50 | val connectionFactory = CassandraConnectionFactory.fromSparkConf(conf) 51 | CassandraConnectorConf(hosts, nativePort, rpcPort, authConf, connectionFactory) 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/cql/RefCountMap.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.cql 2 | 3 | import scala.collection.concurrent.TrieMap 4 | import scala.annotation.tailrec 5 | 6 | /** Atomically counts references to objects of any type */ 
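// A usage sketch; `openResource` and `closeResource` are hypothetical helpers:
//   val refs = new RefCountMap[String]
//   if (refs.acquire("cluster-1") == 1) openResource("cluster-1")    // returns 1: first reference, open
//   refs.acquire("cluster-1")                                        // returns 2: resource is shared
//   refs.release("cluster-1")                                        // returns 1: still in use
//   if (refs.release("cluster-1") == 0) closeResource("cluster-1")   // returns 0: last reference, close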
7 | class RefCountMap[T] { 8 | 9 | private val refCounts = new TrieMap[T, Int] 10 | 11 | /** Returns current reference count for the given key. 12 | * This value may be constantly changing, so do not use it for synchronization purposes. */ 13 | final def get(key: T): Int = 14 | refCounts.getOrElse(key, 0) 15 | 16 | /** Atomically increases reference count only if the reference counter is already greater than 0. 17 | * @return true if reference counter was greater than zero and has been increased */ 18 | @tailrec 19 | final def acquireIfNonZero(key: T): Int = { 20 | refCounts.get(key) match { 21 | case Some(count) if count > 0 => 22 | if (refCounts.replace(key, count, count + 1)) 23 | count + 1 24 | else 25 | acquireIfNonZero(key) 26 | case _ => 27 | 0 28 | } 29 | } 30 | 31 | /** Atomically increases reference count by one. 32 | * @return reference count after increase */ 33 | @tailrec 34 | final def acquire(key: T): Int = { 35 | refCounts.get(key) match { 36 | case Some(count) => 37 | if (refCounts.replace(key, count, count + 1)) 38 | count + 1 39 | else 40 | acquire(key) 41 | case None => 42 | if (!refCounts.putIfAbsent(key, 1).isDefined) 43 | 1 44 | else 45 | acquire(key) 46 | } 47 | } 48 | 49 | /** Atomically decreases reference count by `n`. 50 | * @return reference count after decrease 51 | * @throws IllegalStateException if the reference count before decrease is less than `n` */ 52 | @tailrec 53 | final def release(key: T, n: Int = 1): Int = { 54 | refCounts.get(key) match { 55 | case Some(count) if count > n => 56 | if (refCounts.replace(key, count, count - n)) 57 | count - n 58 | else 59 | release(key, n) 60 | case Some(count) if count == n => 61 | if (refCounts.remove(key, n)) 62 | 0 63 | else 64 | release(key, n) 65 | case _ => 66 | throw new IllegalStateException("Release without acquire for key: " + key) 67 | } 68 | } 69 | 70 | /** Resets state of all counters to 0 */ 71 | def clear(): Unit = refCounts.clear() 72 | 73 | } 74 | -------------------------------------------------------------------------------- /doc/5_saving.md: -------------------------------------------------------------------------------- 1 | # Documentation 2 | ## Saving datasets to Cassandra 3 | 4 | It is possible to save any `RDD` to Cassandra, not just `CassandraRDD`. 5 | The only requirement is that the object class of `RDD` is a tuple or has property names 6 | corresponding to Cassandra column names. 7 | 8 | To save an `RDD`, import `com.datastax.spark.connector._` and call the `saveToCassandra` method with the 9 | keyspace name, table name and a list of columns. Make sure to include at least all primary key columns. 10 | 11 | ## Saving a collection of tuples 12 | 13 | ```scala 14 | collection = sc.parallelize(Seq(("cat", 30), ("fox", 40))) 15 | collection.saveToCassandra("test", "words", SomeColumns("word", "count")) 16 | ``` 17 | 18 | cqlsh:test> select * from words; 19 | 20 | word | count 21 | ------+------- 22 | bar | 20 23 | foo | 10 24 | cat | 30 25 | fox | 40 26 | 27 | (4 rows) 28 | 29 | ## Saving a collection of objects 30 | When saving a collection of objects of a user-defined class, the items to be saved 31 | must provide appropriately named public property accessors for getting every column 32 | to be saved. This example provides more information on property-column naming conventions is described [here](4_mapper.md). 
33 | 34 | ```scala 35 | case class WordCount(word: String, count: Long) 36 | val collection = sc.parallelize(Seq(WordCount("dog", 50), WordCount("cow", 60))) 37 | collection.saveToCassandra("test", "words", SomeColumns("word", "count")) 38 | ``` 39 | 40 | cqlsh:test> select * from words; 41 | 42 | word | count 43 | ------+------- 44 | bar | 20 45 | foo | 10 46 | cat | 30 47 | fox | 40 48 | dog | 50 49 | cow | 60 50 | 51 | The driver will execute a CQL `INSERT` statement for every object in the `RDD`, 52 | grouped in unlogged batches. The consistency level for writes is `ONE`. 53 | 54 | ## Tuning 55 | The following properties set in `SparkConf` can be used to fine-tune the saving process: 56 | 57 | - `spark.cassandra.output.batch.size.rows`: number of rows per single batch; default is 'auto' which means the connector 58 | will adjust the number of rows based on the amount of data in each row 59 | - `spark.cassandra.output.batch.size.bytes`: maximum total size of the batch in bytes; defaults to 16 kB. 60 | - `spark.cassandra.output.concurrent.writes`: maximum number of batches executed in parallel by a single Spark task; defaults to 5 61 | 62 | [Next - Customizing the object mapping](6_advanced_mapper.md) 63 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/partitioner/ServerSideTokenRangeSplitter.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd.partitioner 2 | 3 | import java.io.IOException 4 | import java.net.InetAddress 5 | 6 | import scala.collection.JavaConversions._ 7 | import scala.util.{Failure, Success, Try} 8 | 9 | import org.apache.cassandra.thrift.CfSplit 10 | 11 | import com.datastax.spark.connector.cql.CassandraConnector 12 | import com.datastax.spark.connector.rdd.partitioner.dht.{CassandraNode, Token, TokenFactory, TokenRange} 13 | import com.datastax.spark.connector.util.Logging 14 | 15 | /** Delegates token range splitting to Cassandra server.
*/ 16 | class ServerSideTokenRangeSplitter[V, T <: Token[V]]( 17 | connector: CassandraConnector, 18 | keyspaceName: String, 19 | tableName: String, 20 | tokenFactory: TokenFactory[V, T]) 21 | extends TokenRangeSplitter[V, T] with Logging { 22 | 23 | private def unthriftify(cfSplit: CfSplit, endpoints: Set[CassandraNode]): TokenRange[V, T] = { 24 | val left = tokenFactory.fromString(cfSplit.start_token) 25 | val right = tokenFactory.fromString(cfSplit.end_token) 26 | TokenRange(left, right, endpoints, Some(cfSplit.row_count)) 27 | } 28 | 29 | private def fetchSplits(range: TokenRange[V, T], endpoint: InetAddress, splitSize: Long): Seq[TokenRange[V, T]] = { 30 | val startToken = tokenFactory.toString(range.start) 31 | val endToken = tokenFactory.toString(range.end) 32 | 33 | connector.withCassandraClientDo(endpoint) { 34 | client => 35 | client.set_keyspace(keyspaceName) 36 | client 37 | .describe_splits_ex(tableName, startToken, endToken, splitSize.toInt) 38 | .map(unthriftify(_, range.endpoints)) 39 | } 40 | } 41 | 42 | def split(range: TokenRange[V, T], splitSize: Long) = { 43 | val fetchResults = 44 | for (endpoint <- range.endpoints.toStream) 45 | yield Try(fetchSplits(range, endpoint.rpcAddress, splitSize)) 46 | 47 | fetchResults 48 | .collectFirst { case Success(splits) => splits } 49 | .getOrElse { 50 | for (Failure(e) <- fetchResults) 51 | logError("Failure while fetching splits from Cassandra", e) 52 | if (range.endpoints.isEmpty) 53 | throw new IOException(s"Failed to fetch splits of $range because there are no replicas for the keyspace in the current datacenter.") 54 | else 55 | throw new IOException(s"Failed to fetch splits of $range from all endpoints: ${range.endpoints.mkString(", ")}") 56 | } 57 | } 58 | } -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/package.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.rdd.RDD 5 | 6 | import scala.language.implicitConversions 7 | import scala.reflect.ClassTag 8 | 9 | /** 10 | * The root package of Cassandra connector for Apache Spark. 11 | * Offers handy implicit conversions that add Cassandra-specific methods to `SparkContext` and `RDD`. 12 | * 13 | * Call [[com.datastax.spark.connector.SparkContextFunctions#cassandraTable cassandraTable]] method on the `SparkContext` object 14 | * to create a [[com.datastax.spark.connector.rdd.CassandraRDD CassandraRDD]] exposing Cassandra tables as Spark RDDs. 15 | * 16 | * Call [[com.datastax.spark.connector.RDDFunctions]] `saveToCassandra` 17 | * function on any `RDD` to save distributed collection to a Cassandra table. 
18 | * 19 | * Example: 20 | * {{{ 21 | *   CREATE KEYSPACE test WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1 }; 22 | *   CREATE TABLE test.words (word text PRIMARY KEY, count int); 23 | *   INSERT INTO test.words (word, count) VALUES ('and', 50); 24 | * }}} 25 | * 26 | * {{{ 27 | *   import com.datastax.spark.connector._ 28 | * 29 | *   val sparkMasterHost = "127.0.0.1" 30 | *   val cassandraHost = "127.0.0.1" 31 | *   val keyspace = "test" 32 | *   val table = "words" 33 | * 34 | *   // Tell Spark the address of one Cassandra node: 35 | *   val conf = new SparkConf(true).set("spark.cassandra.connection.host", cassandraHost) 36 | * 37 | *   // Connect to the Spark cluster: 38 | *   val sc = new SparkContext("spark://" + sparkMasterHost + ":7077", "example", conf) 39 | * 40 | *   // Read the table and print its contents: 41 | *   val rdd = sc.cassandraTable(keyspace, table) 42 | *   rdd.toArray().foreach(println) 43 | * 44 | *   // Write two rows to the table: 45 | *   val col = sc.parallelize(Seq(("of", 1200), ("the", 863))) 46 | *   col.saveToCassandra(keyspace, table) 47 | * 48 | *   sc.stop() 49 | * }}} 50 | */ 51 | package object connector { 52 | 53 | implicit def toSparkContextFunctions(sc: SparkContext): SparkContextFunctions = 54 | new SparkContextFunctions(sc) 55 | 56 | implicit def toRDDFunctions[T : ClassTag](rdd: RDD[T]): RDDFunctions[T] = 57 | new RDDFunctions[T](rdd) 58 | 59 | implicit class ColumnNameFunctions(val columnName: String) extends AnyVal { 60 | def writeTime: WriteTime = WriteTime(columnName) 61 | def ttl: TTL = TTL(columnName) 62 | } 63 | 64 | implicit def toNamedColumnRef(columnName: String): NamedColumnRef = ColumnName(columnName) 65 | } 66 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/cql/SessionProxy.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.cql 2 | 3 | import java.lang.reflect.{InvocationHandler, InvocationTargetException, Method, Proxy} 4 | 5 | import com.datastax.driver.core.{RegularStatement, Session, SimpleStatement} 6 | 7 | /** Wraps a `Session` and intercepts: 8 | * - `close` method to invoke `afterClose` handler 9 | * - `prepare` methods to cache `PreparedStatement` objects.
*/ 10 | class SessionProxy(session: Session, afterClose: Session => Any) extends InvocationHandler { 11 | 12 | private var closed = false 13 | 14 | override def invoke(proxy: Any, method: Method, args: Array[AnyRef]) = { 15 | try { 16 | val StringClass = classOf[String] 17 | val RegularStatementClass = classOf[RegularStatement] 18 | 19 | (method.getName, method.getParameterTypes) match { 20 | case ("close", Array()) => 21 | null 22 | case ("closeUnderlying", Array()) => 23 | session.close() 24 | null 25 | case ("isClosed", Array()) => 26 | closed.asInstanceOf[AnyRef] 27 | case ("prepare", Array(StringClass)) => 28 | PreparedStatementCache.prepareStatement(session, new SimpleStatement(args(0).asInstanceOf[String])) 29 | case ("prepare", Array(RegularStatementClass)) => 30 | PreparedStatementCache.prepareStatement(session, args(0).asInstanceOf[RegularStatement]) 31 | case _ => 32 | try { 33 | method.invoke(session, args: _*) 34 | } 35 | catch { 36 | case e: InvocationTargetException => 37 | throw e.getCause 38 | } 39 | } 40 | } 41 | finally { 42 | if (method.getName == "close" && !closed) { 43 | closed = true 44 | afterClose(session) 45 | } 46 | } 47 | } 48 | } 49 | 50 | object SessionProxy { 51 | 52 | /** Creates a new `SessionProxy` delegating to the given `Session`. 53 | * The proxy adds prepared statement caching functionality. */ 54 | def wrap(session: Session): Session = 55 | wrapWithCloseAction(session)(_ => ()) 56 | 57 | /** Creates a new `SessionProxy` delegating to the given `Session`. 58 | * Additionally registers a callback on `Session#close` method. 59 | * @param afterClose code to be invoked after the session has been closed */ 60 | def wrapWithCloseAction(session: Session)(afterClose: Session => Any): Session = 61 | Proxy.newProxyInstance( 62 | session.getClass.getClassLoader, 63 | Array(classOf[Session]), 64 | new SessionProxy(session, afterClose)).asInstanceOf[Session] 65 | } -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/SparkContextFunctions.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector 2 | 3 | import com.datastax.spark.connector.cql.CassandraConnector 4 | import com.datastax.spark.connector.rdd.{ReadConf, ValidRDDType, CassandraRDD} 5 | import com.datastax.spark.connector.rdd.reader.RowReaderFactory 6 | import org.apache.spark.SparkContext 7 | 8 | import scala.reflect.ClassTag 9 | 10 | /** Provides Cassandra-specific methods on `SparkContext` */ 11 | class SparkContextFunctions(@transient val sc: SparkContext) extends Serializable { 12 | 13 | /** Returns a view of a Cassandra table as `CassandraRDD`.
14 | * This method is made available on `SparkContext` by importing `com.datastax.spark.connector._` 15 | * 16 | * Depending on the type parameter passed to `cassandraTable`, every row is converted to one of the following: 17 | * - an [[CassandraRow]] object (default, if no type given) 18 | * - a tuple containing column values in the same order as columns selected by [[com.datastax.spark.connector.rdd.CassandraRDD#select CassandraRDD#select]] 19 | * - object of a user defined class, populated by appropriate [[com.datastax.spark.connector.mapper.ColumnMapper ColumnMapper]] 20 | * 21 | * Example: 22 | * {{{ 23 | * CREATE KEYSPACE test WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1 }; 24 | * CREATE TABLE test.words (word text PRIMARY KEY, count int); 25 | * INSERT INTO test.words (word, count) VALUES ('foo', 20); 26 | * INSERT INTO test.words (word, count) VALUES ('bar', 20); 27 | * ... 28 | * }}} 29 | * {{{ 30 | * // Obtaining RDD of CassandraRow objects: 31 | * val rdd1 = sc.cassandraTable("test", "words") 32 | * rdd1.first.getString("word") // foo 33 | * rdd1.first.getInt("count") // 20 34 | * 35 | * // Obtaining RDD of tuples: 36 | * val rdd2 = sc.cassandraTable[(String, Int)]("test", "words").select("word", "count") 37 | * rdd2.first._1 // foo 38 | * rdd2.first._2 // 20 39 | * 40 | * // Obtaining RDD of user defined objects: 41 | * case class WordCount(word: String, count: Int) 42 | * val rdd3 = sc.cassandraTable[WordCount]("test", "words") 43 | * rdd3.first.word // foo 44 | * rdd3.first.count // 20 45 | * }}}*/ 46 | def cassandraTable[T](keyspace: String, table: String) 47 | (implicit connector: CassandraConnector = CassandraConnector(sc.getConf), 48 | ct: ClassTag[T], rrf: RowReaderFactory[T], 49 | ev: ValidRDDType[T]) = 50 | new CassandraRDD[T](sc, connector, keyspace, table, readConf = ReadConf.fromSparkConf(sc.getConf)) 51 | } 52 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/mapper/DefaultColumnMapper.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.mapper 2 | 3 | import java.lang.reflect.Method 4 | 5 | import com.datastax.spark.connector.cql.TableDef 6 | 7 | import scala.reflect.ClassTag 8 | 9 | /** A [[ColumnMapper]] that assumes camel case naming convention for property accessors and constructor names 10 | * and underscore naming convention for column names. 
11 | * 12 | * Example mapping: 13 | * {{{ 14 | * case class User( 15 | * login: String, // mapped to "login" column 16 | * emailAddress: String // mapped to "email_address" column 17 | * emailAddress2: String // mapped to "email_address_2" column 18 | * ) 19 | * }}} 20 | * 21 | * Additionally, it is possible to name columns exactly the same as property names (case-sensitive): 22 | * {{{ 23 | * case class TaxPayer( 24 | * TIN: String // mapped to "TIN" column 25 | * ) 26 | * }}} 27 | * 28 | * @param columnNameOverride maps property names to column names; use it to override default mapping for some properties 29 | */ 30 | class DefaultColumnMapper[T : ClassTag](columnNameOverride: Map[String, String] = Map.empty) extends ReflectionColumnMapper[T] { 31 | 32 | import com.datastax.spark.connector.mapper.DefaultColumnMapper._ 33 | 34 | override def classTag: ClassTag[T] = implicitly[ClassTag[T]] 35 | 36 | private def setterNameToPropertyName(str: String) = 37 | str.substring(0, str.length - SetterSuffix.length) 38 | 39 | override def isGetter(method: Method) = { 40 | method.getParameterTypes.size == 0 && 41 | method.getReturnType != Void.TYPE 42 | } 43 | 44 | override def isSetter(method: Method) = { 45 | method.getParameterTypes.size == 1 && 46 | method.getReturnType == Void.TYPE && 47 | method.getName.endsWith(SetterSuffix) 48 | } 49 | 50 | override def constructorParamToColumnName(paramName: String, tableDef: TableDef) = 51 | columnNameOverride.getOrElse(paramName, columnNameForProperty(paramName, tableDef)) 52 | 53 | override def getterToColumnName(getterName: String, tableDef: TableDef) = 54 | columnNameOverride.getOrElse(getterName, columnNameForProperty(getterName, tableDef)) 55 | 56 | override def setterToColumnName(setterName: String, tableDef: TableDef) = { 57 | val propertyName = setterNameToPropertyName(setterName) 58 | columnNameOverride.getOrElse(propertyName, columnNameForProperty(propertyName, tableDef)) 59 | } 60 | 61 | /** Don't allow nulls in Scala - fail fast with NPE if null is tried. */ 62 | override protected def allowsNull = false 63 | } 64 | 65 | object DefaultColumnMapper { 66 | private val SetterSuffix: String = "_$eq" 67 | } 68 | -------------------------------------------------------------------------------- /spark-cassandra-connector-embedded/src/main/scala/com/datastax/spark/connector/embedded/KafkaProducer.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.embedded 2 | 3 | import java.util.Properties 4 | 5 | import akka.actor.{Actor, ActorLogging} 6 | import kafka.producer.{KeyedMessage, Producer, ProducerConfig} 7 | import kafka.serializer.StringEncoder 8 | import kafka.server.KafkaConfig 9 | 10 | /** Simple producer for an Akka Actor using string encoder and default partitioner. */ 11 | abstract class KafkaProducerActor[K, V] extends Actor with ActorLogging { 12 | import KafkaEvent._ 13 | 14 | def producerConfig: ProducerConfig 15 | 16 | private val producer = new KafkaProducer[K, V](producerConfig) 17 | 18 | override def postStop(): Unit = { 19 | log.info("Shutting down producer.") 20 | producer.close() 21 | } 22 | 23 | def receive = { 24 | case e: KafkaMessageEnvelope[K,V] => producer.send(e) 25 | } 26 | } 27 | 28 | /** Simple producer using string encoder and default partitioner. 
*/ 29 | class KafkaProducer[K, V](producerConfig: ProducerConfig) { 30 | 31 | def this(brokers: Set[String], batchSize: Int, producerType: String, serializerFqcn: String) = 32 | this(KafkaProducer.createConfig(brokers, batchSize, producerType, serializerFqcn)) 33 | 34 | def this(config: KafkaConfig) = 35 | this(KafkaProducer.defaultConfig(config)) 36 | 37 | import KafkaEvent._ 38 | 39 | private val producer = new Producer[K, V](producerConfig) 40 | 41 | /** Sends the data, partitioned by key to the topic. */ 42 | def send(e: KafkaMessageEnvelope[K,V]): Unit = 43 | batchSend(e.topic, e.key, e.messages) 44 | 45 | /* Sends a single message. */ 46 | def send(topic : String, key : K, message : V): Unit = 47 | batchSend(topic, key, Seq(message)) 48 | 49 | def batchSend(topic: String, key: K, batch: Seq[V]): Unit = { 50 | val messages = batch map (msg => new KeyedMessage[K, V](topic, key, msg)) 51 | producer.send(messages.toArray: _*) 52 | } 53 | 54 | def close(): Unit = producer.close() 55 | 56 | } 57 | 58 | object KafkaEvent { 59 | case class KafkaMessageEnvelope[K,V](topic: String, key: K, messages: V*) 60 | } 61 | 62 | object KafkaProducer { 63 | 64 | def createConfig(brokers: Set[String], batchSize: Int, producerType: String, serializerFqcn: String): ProducerConfig = { 65 | val props = new Properties() 66 | props.put("metadata.broker.list", brokers.mkString(",")) 67 | props.put("serializer.class", serializerFqcn) 68 | props.put("partitioner.class", "kafka.producer.DefaultPartitioner") 69 | props.put("producer.type", producerType) 70 | props.put("request.required.acks", "1") 71 | props.put("batch.num.messages", batchSize.toString) 72 | new ProducerConfig(props) 73 | } 74 | 75 | def defaultConfig(config: KafkaConfig): ProducerConfig = 76 | createConfig(Set(s"${config.hostName}:${config.port}"), 100, "async", classOf[StringEncoder].getName) 77 | } -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/mapper/ReflectionColumnMapper.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.mapper 2 | 3 | import java.lang.reflect.{Constructor, Method} 4 | 5 | import com.datastax.spark.connector.{ColumnRef, ColumnName} 6 | import com.datastax.spark.connector.cql.TableDef 7 | import com.datastax.spark.connector.rdd.reader.AnyObjectFactory 8 | import org.apache.commons.lang.StringUtils 9 | 10 | import scala.reflect.ClassTag 11 | 12 | abstract class ReflectionColumnMapper[T : ClassTag] extends ColumnMapper[T] { 13 | 14 | import AnyObjectFactory._ 15 | 16 | protected def isSetter(method: Method): Boolean 17 | protected def isGetter(method: Method): Boolean 18 | protected def setterToColumnName(setterName: String, tableDef: TableDef): String 19 | protected def getterToColumnName(getterName: String, tableDef: TableDef): String 20 | protected def constructorParamToColumnName(paramName: String, tableDef: TableDef): String 21 | protected def allowsNull: Boolean 22 | 23 | protected final def camelCaseToUnderscore(str: String): String = 24 | StringUtils.splitByCharacterTypeCamelCase(str).mkString("_").replaceAll("_+", "_").toLowerCase 25 | 26 | protected final def columnNameForProperty(propertyName: String, tableDef: TableDef): String = { 27 | val underscoreName = camelCaseToUnderscore(propertyName) 28 | val candidateColumnNames = Seq(propertyName, underscoreName) 29 | val columnRef = 
candidateColumnNames.iterator.map(tableDef.columnByName.get).find(_.isDefined).flatten 30 | columnRef.fold(underscoreName)(_.columnName) 31 | } 32 | 33 | override def columnMap(tableDef: TableDef): ColumnMap = { 34 | 35 | val cls = implicitly[ClassTag[T]].runtimeClass 36 | 37 | def columnsOf(ctor: Constructor[_]): Seq[ColumnRef] = { 38 | if (isNoArgsConstructor(ctor)) 39 | Nil 40 | else { 41 | val paramNames = paranamer.lookupParameterNames(ctor) 42 | val columnNames = paramNames 43 | .map(constructorParamToColumnName(_, tableDef)) 44 | .filter(_ != "$_outer") 45 | columnNames.map(ColumnName) 46 | } 47 | } 48 | 49 | val constructor = columnsOf(resolveConstructor(cls)) 50 | 51 | val getters: Map[String, ColumnRef] = { 52 | for (method <- cls.getMethods if isGetter(method)) yield { 53 | val methodName = method.getName 54 | val columnName = getterToColumnName(methodName, tableDef) 55 | (methodName, ColumnName(columnName)) 56 | } 57 | }.toMap 58 | 59 | val setters: Map[String, ColumnRef] = { 60 | for (method <- cls.getMethods if isSetter(method)) yield { 61 | val methodName = method.getName 62 | val columnName = setterToColumnName(methodName, tableDef) 63 | (methodName, ColumnName(columnName)) 64 | } 65 | }.toMap 66 | 67 | new SimpleColumnMap(constructor, getters, setters, allowsNull) 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/streaming/StreamingContextFunctions.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.streaming 2 | 3 | import akka.actor.{ActorRef, Actor} 4 | import com.datastax.spark.connector.cql.CassandraConnector 5 | import com.datastax.spark.connector.rdd.{ReadConf, ValidRDDType} 6 | import org.apache.spark.streaming.StreamingContext 7 | import org.apache.spark.streaming.scheduler.StreamingListener 8 | import org.apache.spark.streaming.receiver.ActorHelper 9 | import com.datastax.spark.connector.SparkContextFunctions 10 | import com.datastax.spark.connector.rdd.reader.RowReaderFactory 11 | 12 | import scala.reflect.ClassTag 13 | 14 | /** Provides Cassandra-specific methods on `org.apache.spark.streaming.StreamingContext`. 15 | * @param ssc the Spark Streaming context 16 | */ 17 | class StreamingContextFunctions (ssc: StreamingContext) extends SparkContextFunctions(ssc.sparkContext) { 18 | import scala.reflect.ClassTag 19 | 20 | override def cassandraTable[T](keyspace: String, table: String)( 21 | implicit 22 | connector: CassandraConnector = CassandraConnector(ssc.sparkContext.getConf), 23 | ct: ClassTag[T], 24 | rrf: RowReaderFactory[T], 25 | ev: ValidRDDType[T]): CassandraStreamingRDD[T] = { 26 | 27 | val readConf = ReadConf.fromSparkConf(ssc.sparkContext.getConf) 28 | new CassandraStreamingRDD[T](ssc, connector, keyspace, table, readConf = readConf) 29 | } 30 | } 31 | 32 | /** Simple akka.actor.Actor mixin. */ 33 | trait SparkStreamingActor extends Actor with ActorHelper { 34 | 35 | override def preStart(): Unit = { 36 | context.system.eventStream.publish(StreamingEvent.ReceiverStarted(self)) 37 | } 38 | } 39 | 40 | abstract class TypedStreamingActor[T : ClassTag] extends SparkStreamingActor { 41 | 42 | def receive: Actor.Receive = { 43 | case e: T => push(e) 44 | } 45 | 46 | def push(event: T): Unit = 47 | store(event) 48 | } 49 | 50 | /** Simple StreamingListener. Currently just used to listen for initialization of a receiver. 
51 | * Implement further to access information about an ongoing streaming computation.*/ 52 | class SparkStreamingListener[T: ClassTag] extends StreamingListener { 53 | import org.apache.spark.streaming.scheduler.StreamingListenerReceiverStarted 54 | import java.util.concurrent.atomic.AtomicBoolean 55 | 56 | private val listenerInitialized = new AtomicBoolean() 57 | 58 | def initialized: Boolean = listenerInitialized.get 59 | 60 | /** Called when a receiver has been started */ 61 | override def onReceiverStarted(started: StreamingListenerReceiverStarted): Unit = 62 | listenerInitialized.set(true) 63 | 64 | } 65 | 66 | object StreamingEvent { 67 | /** Base marker for Receiver events */ 68 | sealed trait ReceiverEvent extends Serializable 69 | 70 | /** 71 | * @param actor the receiver actor 72 | */ 73 | case class ReceiverStarted(actor: ActorRef) extends ReceiverEvent 74 | } -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/types/ColumnType.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.types 2 | 3 | import com.datastax.driver.core.{UDTValue, DataType} 4 | import scala.collection.JavaConversions._ 5 | import scala.reflect.runtime.universe._ 6 | 7 | import com.datastax.spark.connector.types.TypeConverter.OptionToNullConverter 8 | 9 | /** Serializable representation of column data type. */ 10 | trait ColumnType[T] extends Serializable { 11 | 12 | /** Returns a converter that converts values to the type of this column expected by the 13 | * Cassandra Java driver when saving the row.*/ 14 | def converterToCassandra: TypeConverter[_ <: AnyRef] 15 | 16 | /** Returns a converter that converts values to the Scala type associated with this column. */ 17 | lazy val converterToScala: TypeConverter[T] = 18 | TypeConverter.forType(scalaTypeTag) 19 | 20 | /** Returns the TypeTag of the Scala type recommended to represent values of this column. */ 21 | def scalaTypeTag: TypeTag[T] 22 | 23 | /** Name of the Scala type. Useful for source generation.*/ 24 | def scalaTypeName: String 25 | = scalaTypeTag.tpe.toString 26 | 27 | def isCollection: Boolean 28 | } 29 | 30 | object ColumnType { 31 | 32 | private val primitiveTypeMap = Map[DataType, ColumnType[_]]( 33 | DataType.text() -> TextType, 34 | DataType.ascii() -> AsciiType, 35 | DataType.varchar() -> VarCharType, 36 | DataType.cint() -> IntType, 37 | DataType.bigint() -> BigIntType, 38 | DataType.cfloat() -> FloatType, 39 | DataType.cdouble() -> DoubleType, 40 | DataType.cboolean() -> BooleanType, 41 | DataType.varint() -> VarIntType, 42 | DataType.decimal() -> DecimalType, 43 | DataType.timestamp() -> TimestampType, 44 | DataType.inet() -> InetType, 45 | DataType.uuid() -> UUIDType, 46 | DataType.blob() -> BlobType, 47 | DataType.counter() -> CounterType, 48 | DataType.timeuuid() -> TimeUUIDType 49 | ) 50 | 51 | def fromDriverType(dataType: DataType): ColumnType[_] = { 52 | val typeArgs = dataType.getTypeArguments.map(fromDriverType) 53 | dataType.getName match { 54 | case DataType.Name.LIST => ListType(typeArgs(0)) 55 | case DataType.Name.SET => SetType(typeArgs(0)) 56 | case DataType.Name.MAP => MapType(typeArgs(0), typeArgs(1)) 57 | case DataType.Name.UDT => UserDefinedTypeStub 58 | case _ => primitiveTypeMap(dataType) 59 | } 60 | } 61 | } 62 | 63 | // TODO: This is a stub. 64 | // UDTValues are not Serializable. 
65 | // Properly, we should use a dedicated, 66 | // serializable class for UDTValues and also allow to map them to case classes. 67 | case object UserDefinedTypeStub extends ColumnType[UDTValue] { 68 | def converterToCassandra = new OptionToNullConverter(TypeConverter.forType[UDTValue]) 69 | override def isCollection = false 70 | override def scalaTypeTag = TypeTag.synchronized { implicitly[TypeTag[UDTValue]] } 71 | } 72 | -------------------------------------------------------------------------------- /spark-cassandra-connector-embedded/src/main/scala/com/datastax/spark/connector/embedded/KafkaConsumer.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.embedded 2 | 3 | import java.util.Properties 4 | import java.util.concurrent.Executors 5 | import java.util.concurrent.atomic.AtomicInteger 6 | 7 | import scala.concurrent.duration._ 8 | import akka.actor.{ActorLogging, Actor} 9 | import kafka.serializer.StringDecoder 10 | import kafka.consumer.{Consumer, ConsumerConfig} 11 | 12 | /** The KafkaConsumer is a very simple consumer of a single Kafka topic. 13 | * This is a helpful utility for IT tests to insure data is getting published to Kafka 14 | * for streaming ingestion upstream. 15 | */ 16 | class KafkaConsumer(zookeeper: String, topic: String, groupId: String, partitions: Int, numThreads: Int, count: AtomicInteger) { 17 | 18 | val connector = Consumer.create(createConsumerConfig) 19 | 20 | // create n partitions of the stream for topic “test”, to allow n threads to consume 21 | val streams = connector 22 | .createMessageStreams(Map(topic -> partitions), new StringDecoder(), new StringDecoder()) 23 | .get(topic) 24 | 25 | // launch all the threads 26 | val executor = Executors.newFixedThreadPool(numThreads) 27 | 28 | // consume the messages in the threads 29 | for(stream <- streams) { 30 | executor.submit(new Runnable() { 31 | def run() { 32 | for(s <- stream) { 33 | while(s.iterator.hasNext) { 34 | count.getAndIncrement 35 | } 36 | } 37 | } 38 | }) 39 | } 40 | 41 | private def createConsumerConfig: ConsumerConfig = { 42 | val props = new Properties() 43 | props.put("consumer.timeout.ms", "2000") 44 | props.put("zookeeper.connect", zookeeper) 45 | props.put("group.id", groupId) 46 | props.put("zookeeper.session.timeout.ms", "400") 47 | props.put("zookeeper.sync.time.ms", "10") 48 | props.put("auto.commit.interval.ms", "1000") 49 | 50 | new ConsumerConfig(props) 51 | } 52 | 53 | def shutdown() { 54 | println("Consumer shutting down.") 55 | Option(connector) map (_.shutdown()) 56 | Option(executor) map (_.shutdown()) 57 | } 58 | } 59 | 60 | /** Simple actor with a Kafka consumer to report the latest message count in a Kafka Topic. */ 61 | class KafkaTopicLogger(topic: String, group: String, taskInterval: FiniteDuration = 3.seconds) 62 | extends Actor with ActorLogging { 63 | import Event._ 64 | import context.dispatcher 65 | 66 | val atomic = new AtomicInteger(0) 67 | 68 | val consumer = new KafkaConsumer(ZookeeperConnectionString, topic, group, 1, 10, atomic) 69 | 70 | var task = context.system.scheduler.schedule(3.seconds, taskInterval) { 71 | self ! 
QueryTask 72 | } 73 | 74 | override def postStop(): Unit = { 75 | task.cancel 76 | consumer.shutdown() 77 | } 78 | 79 | def receive: Actor.Receive = { 80 | case QueryTask => 81 | log.info(s"Kafka message count [{}]", atomic.get) 82 | } 83 | } -------------------------------------------------------------------------------- /spark-cassandra-connector-embedded/src/main/scala/com/datastax/spark/connector/embedded/EmbeddedZookeeper.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.embedded 2 | 3 | import java.net.InetSocketAddress 4 | 5 | import scala.util.Try 6 | import org.apache.zookeeper.server.{NIOServerCnxnFactory, ZooKeeperServer} 7 | 8 | /** Implements a simple standalone ZooKeeperServer. 9 | * To create a ZooKeeper client object, the application needs to pass a 10 | * connection string containing a comma separated list of host:port pairs, 11 | * each corresponding to a ZooKeeper server. 12 | *

13 | * Session establishment is asynchronous. This constructor will initiate 14 | * connection to the server and return immediately - potentially (usually) 15 | * before the session is fully established. The watcher argument specifies 16 | * the watcher that will be notified of any changes in state. This 17 | * notification can come at any point before or after the constructor call 18 | * has returned. 19 | *

20 | * The instantiated ZooKeeper client object will pick an arbitrary server 21 | * from the connectString and attempt to connect to it. If establishment of 22 | * the connection fails, another server in the connect string will be tried 23 | * (the order is non-deterministic, as we random shuffle the list), until a 24 | * connection is established. The client will continue attempts until the 25 | * session is explicitly closed. 26 | * 27 | * @param connectString comma separated host:port pairs, each corresponding to a zk 28 | * server. e.g. "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002" If 29 | * the optional chroot suffix is used the example would look 30 | * like: "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002/app/a" 31 | * where the client would be rooted at "/app/a" and all paths 32 | * would be relative to this root - ie getting/setting/etc... 33 | * "/foo/bar" would result in operations being run on 34 | * "/app/a/foo/bar" (from the server perspective). 35 | * Default: the local IP and default port: 2180. 36 | */ 37 | class EmbeddedZookeeper(val connectString: String = ZookeeperConnectionString) extends Embedded { 38 | 39 | val snapshotDir = createTempDir 40 | 41 | val logDir = createTempDir 42 | 43 | val server = new ZooKeeperServer(snapshotDir, logDir, 500) 44 | 45 | val (ip, port) = { 46 | val splits = connectString.split(":") 47 | (splits(0), splits(1).toInt) 48 | } 49 | 50 | val factory = new NIOServerCnxnFactory() 51 | factory.configure(new InetSocketAddress(ip, port), 16) 52 | factory.startup(server) 53 | println(s"ZooKeeperServer isRunning: $isRunning") 54 | 55 | def isRunning: Boolean = Try(server.isRunning) getOrElse false 56 | 57 | def shutdown(): Unit = { 58 | println(s"Shutting down ZK NIOServerCnxnFactory.") 59 | factory.shutdown() 60 | deleteRecursively(snapshotDir) 61 | deleteRecursively(logDir) 62 | } 63 | } -------------------------------------------------------------------------------- /spark-cassandra-connector-java/src/test/java/com/datastax/spark/connector/japi/CustomTypeConverterTest.java: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.japi; 2 | 3 | import akka.japi.JavaPartialFunction; 4 | import com.datastax.spark.connector.japi.types.JavaTypeConverter; 5 | import com.datastax.spark.connector.util.JavaApiHelper$; 6 | import org.junit.Test; 7 | 8 | import static org.junit.Assert.assertEquals; 9 | 10 | public class CustomTypeConverterTest { 11 | 12 | public static enum SampleEnum { 13 | ONE, TWO, THREE 14 | } 15 | 16 | public final static JavaTypeConverter sampleEnumConverter = 17 | new JavaTypeConverter(JavaApiHelper$.MODULE$.getTypeTag(SampleEnum.class), 18 | new JavaPartialFunction() { 19 | @Override 20 | public SampleEnum apply(Object x, boolean isCheck) throws Exception { 21 | if (x == null) { 22 | return null; 23 | } else if (x instanceof String) { 24 | try { 25 | return SampleEnum.valueOf((String) x); 26 | } catch (IllegalArgumentException ex) { 27 | throw noMatch(); 28 | } 29 | } else if (x instanceof Number) { 30 | switch (((Number) x).intValue()) { 31 | case 1: 32 | return SampleEnum.ONE; 33 | case 2: 34 | return SampleEnum.TWO; 35 | case 3: 36 | return SampleEnum.THREE; 37 | } 38 | } 39 | throw noMatch(); 40 | } 41 | }); 42 | 43 | 44 | @Test 45 | public void test1() { 46 | assertEquals(SampleEnum.class.getName(), sampleEnumConverter.targetTypeName()); 47 | 48 | assertEquals(true, sampleEnumConverter.convertPF().isDefinedAt(1)); 49 | assertEquals(true, 
sampleEnumConverter.convertPF().isDefinedAt(2.5)); 50 | assertEquals(true, sampleEnumConverter.convertPF().isDefinedAt("THREE")); 51 | assertEquals(false, sampleEnumConverter.convertPF().isDefinedAt("asdf")); 52 | assertEquals(false, sampleEnumConverter.convertPF().isDefinedAt(4)); 53 | assertEquals(true, sampleEnumConverter.convertPF().isDefinedAt(null)); 54 | 55 | assertEquals(SampleEnum.ONE, sampleEnumConverter.convertPF().apply(1)); 56 | assertEquals(SampleEnum.TWO, sampleEnumConverter.convertPF().apply(2.5)); 57 | assertEquals(SampleEnum.THREE, sampleEnumConverter.convertPF().apply("THREE")); 58 | assertEquals(null, sampleEnumConverter.convertPF().apply(null)); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/partitioner/TokenRangeClusterer.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd.partitioner 2 | 3 | import java.net.InetAddress 4 | 5 | import Ordering.Implicits._ 6 | 7 | import com.datastax.spark.connector.rdd.partitioner.dht.{Token, TokenRange} 8 | 9 | import scala.annotation.tailrec 10 | 11 | /** Divides a set of token ranges into groups containing not more than `maxRowCountPerGroup` rows 12 | * and not more than `maxGroupSize` token ranges. Each group will form a single `CassandraRDDPartition`. 13 | * 14 | * The algorithm is as follows: 15 | * 1. Sort token ranges by endpoints lexicographically. 16 | * 2. Take the highest possible number of token ranges from the beginning of the list, 17 | * such that their sum of rowCounts does not exceed `maxRowCountPerGroup` and they all contain at 18 | * least one common endpoint. If it is not possible, take at least one item. 19 | * Those token ranges will make a group. 20 | * 3. Repeat the previous step until no more token ranges left.*/ 21 | class TokenRangeClusterer[V, T <: Token[V]](maxRowCountPerGroup: Long, maxGroupSize: Int = Int.MaxValue) { 22 | 23 | private implicit object InetAddressOrdering extends Ordering[InetAddress] { 24 | override def compare(x: InetAddress, y: InetAddress) = 25 | x.getHostAddress.compareTo(y.getHostAddress) 26 | } 27 | 28 | @tailrec 29 | private def group(tokenRanges: Stream[TokenRange[V, T]], 30 | result: Vector[Seq[TokenRange[V, T]]]): Iterable[Seq[TokenRange[V, T]]] = { 31 | tokenRanges match { 32 | case Stream.Empty => result 33 | case head #:: rest => 34 | val firstEndpoint = head.endpoints.min 35 | val rowCounts = tokenRanges.map(_.rowCount.get) 36 | val cumulativeRowCounts = rowCounts.scanLeft(0L)(_ + _).tail // drop first item always == 0 37 | val rowLimit = math.max(maxRowCountPerGroup, head.rowCount.get) // make sure first element will be always included 38 | val cluster = tokenRanges 39 | .take(math.max(1, maxGroupSize)) 40 | .zip(cumulativeRowCounts) 41 | .takeWhile { case (tr, count) => count <= rowLimit && tr.endpoints.min == firstEndpoint } 42 | .map(_._1) 43 | .toVector 44 | val remainingTokenRanges = tokenRanges.drop(cluster.length) 45 | group(remainingTokenRanges, result :+ cluster) 46 | } 47 | } 48 | 49 | /** Groups small token ranges on the same server(s) in order to reduce task scheduling overhead. 50 | * Useful mostly with virtual nodes, which may create lots of small token range splits. 51 | * Each group will make a single Spark task. 
*/ 52 | def group(tokenRanges: Seq[TokenRange[V, T]]): Iterable[Seq[TokenRange[V, T]]] = { 53 | // sort by endpoints lexicographically 54 | // this way ranges on the same host are grouped together 55 | val sortedRanges = tokenRanges.sortBy(_.endpoints.toSeq.sorted) 56 | group(sortedRanges.toStream, Vector.empty) 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/it/scala/com/datastax/spark/connector/streaming/ActorStreamSpec.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.streaming 2 | 3 | import akka.actor.{ActorSystem, Props, Terminated} 4 | import akka.testkit.{ImplicitSender, TestKit} 5 | import com.datastax.spark.connector.{RowsInBatch, SomeColumns} 6 | import com.datastax.spark.connector.cql.CassandraConnector 7 | import com.datastax.spark.connector.embedded._ 8 | import com.datastax.spark.connector.streaming.StreamingEvent.ReceiverStarted 9 | import com.datastax.spark.connector.testkit._ 10 | import com.datastax.spark.connector.writer.WriteConf 11 | import org.apache.spark.SparkEnv 12 | import org.apache.spark.storage.StorageLevel 13 | import org.apache.spark.streaming.StreamingContext.toPairDStreamFunctions 14 | import org.apache.spark.streaming.{Milliseconds, StreamingContext} 15 | 16 | class ActorStreamingSpec extends ActorSpec with CounterFixture with ImplicitSender { 17 | import com.datastax.spark.connector.testkit.TestEvent._ 18 | 19 | /* Initializations - does not work in the actor test context in a static before() */ 20 | CassandraConnector(SparkTemplate.conf).withSessionDo { session => 21 | session.execute("CREATE KEYSPACE IF NOT EXISTS demo WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 }") 22 | session.execute("CREATE TABLE IF NOT EXISTS demo.streaming_wordcount (word TEXT PRIMARY KEY, count COUNTER)") 23 | session.execute("TRUNCATE demo.streaming_wordcount") 24 | } 25 | 26 | "actorStream" must { 27 | "write from the actor stream to cassandra table: demo.streaming_wordcount" in { 28 | 29 | val stream = ssc.actorStream[String](Props[TestStreamingActor], actorName, StorageLevel.MEMORY_AND_DISK) 30 | 31 | val wc = stream.flatMap(_.split("\\s+")) 32 | .map(x => (x, 1)) 33 | .reduceByKey(_ + _) 34 | .saveToCassandra("demo", "streaming_wordcount") 35 | 36 | // start the streaming context so the data can be processed and actor started 37 | ssc.start() 38 | 39 | system.eventStream.subscribe(self, classOf[StreamingEvent.ReceiverStarted]) 40 | 41 | expectMsgPF(duration) { case ReceiverStarted(receiver) => 42 | watch(receiver) 43 | system.actorOf(Props(new TestProducer(data.toArray, receiver))) 44 | } 45 | 46 | expectMsgPF(duration) { case Terminated(ref) => 47 | val rdd = ssc.cassandraTable[WordCount]("demo", "streaming_wordcount") 48 | awaitCond(rdd.collect.nonEmpty && rdd.map(_.count).reduce(_ + _) == scale * 2) 49 | rdd.collect.size should be (data.size) 50 | } 51 | } 52 | } 53 | } 54 | 55 | /** A very basic Akka actor which streams `String` event data to spark. 
*/ 56 | class TestStreamingActor extends TypedStreamingActor[String] with Counter { 57 | 58 | override def push(e: String): Unit = { 59 | super.push(e) 60 | increment() 61 | } 62 | } 63 | 64 | abstract class ActorSpec(val ssc: StreamingContext, _system: ActorSystem) 65 | extends TestKit(_system) with StreamingSpec { 66 | 67 | def this() = this (new StreamingContext(SparkTemplate.sc, Milliseconds(300)), SparkEnv.get.actorSystem) 68 | 69 | } 70 | 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/rdd/partitioner/RandomPartitionerTokenRangeSplitterTest.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd.partitioner 2 | 3 | import java.net.InetAddress 4 | 5 | import com.datastax.spark.connector.rdd.partitioner.dht.{CassandraNode, BigIntToken, TokenFactory} 6 | import org.junit.Assert._ 7 | import org.junit.Test 8 | 9 | 10 | class RandomPartitionerTokenRangeSplitterTest { 11 | 12 | type TokenRange = com.datastax.spark.connector.rdd.partitioner.dht.TokenRange[BigInt, BigIntToken] 13 | 14 | private def assertNoHoles(tokenRanges: Seq[TokenRange]) { 15 | for (Seq(range1, range2) <- tokenRanges.sliding(2)) 16 | assertEquals(range1.end, range2.start) 17 | } 18 | 19 | @Test 20 | def testSplit() { 21 | val node = CassandraNode(InetAddress.getLocalHost, InetAddress.getLocalHost) 22 | val splitter = new RandomPartitionerTokenRangeSplitter(2.0) 23 | val rangeLeft = BigInt("0") 24 | val rangeRight = BigInt("100") 25 | val range = new TokenRange( 26 | new BigIntToken(rangeLeft), 27 | new BigIntToken(rangeRight), Set(node), None) 28 | val out = splitter.split(range, 20) 29 | 30 | // 2 rows per token on average; to so 10 tokens = 20 rows; therefore 10 splits 31 | assertEquals(10, out.size) 32 | assertEquals(rangeLeft, out.head.start.value) 33 | assertEquals(rangeRight, out.last.end.value) 34 | assertTrue(out.forall(_.endpoints == Set(node))) 35 | assertNoHoles(out) 36 | } 37 | 38 | @Test 39 | def testNoSplit() { 40 | val splitter = new RandomPartitionerTokenRangeSplitter(2.0) 41 | val rangeLeft = BigInt("0") 42 | val rangeRight = BigInt("100") 43 | val range = new TokenRange( 44 | new BigIntToken(rangeLeft), 45 | new BigIntToken(rangeRight), Set.empty, None) 46 | val out = splitter.split(range, 500) 47 | 48 | // range is too small to contain 500 rows 49 | assertEquals(1, out.size) 50 | assertEquals(rangeLeft, out.head.start.value) 51 | assertEquals(rangeRight, out.last.end.value) 52 | } 53 | 54 | @Test 55 | def testZeroRows() { 56 | val splitter = new RandomPartitionerTokenRangeSplitter(0.0) 57 | val rangeLeft = BigInt("0") 58 | val rangeRight = BigInt("100") 59 | val range = new TokenRange( 60 | new BigIntToken(rangeLeft), 61 | new BigIntToken(rangeRight), Set.empty, None) 62 | val out = splitter.split(range, 500) 63 | assertEquals(1, out.size) 64 | assertEquals(rangeLeft, out.head.start.value) 65 | assertEquals(rangeRight, out.last.end.value) 66 | } 67 | 68 | @Test 69 | def testWrapAround() { 70 | val splitter = new RandomPartitionerTokenRangeSplitter(2.0) 71 | val rangeLeft = TokenFactory.RandomPartitionerTokenFactory.maxToken.value - 100 72 | val rangeRight = BigInt("100") 73 | val range = new TokenRange( 74 | new BigIntToken(rangeLeft), 75 | new BigIntToken(rangeRight), Set.empty, None) 76 | val out = splitter.split(range, 20) 77 | assertEquals(20, out.size) 78 | assertEquals(rangeLeft, 
out.head.start.value) 79 | assertEquals(rangeRight, out.last.end.value) 80 | assertNoHoles(out) 81 | } 82 | 83 | } 84 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/rdd/partitioner/Murmur3PartitionerTokenRangeSplitterTest.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.rdd.partitioner 2 | 3 | import java.net.InetAddress 4 | 5 | import com.datastax.spark.connector.rdd.partitioner.dht.{CassandraNode, LongToken} 6 | import com.datastax.spark.connector.rdd.partitioner.dht.TokenFactory.Murmur3TokenFactory 7 | import org.junit.Assert._ 8 | import org.junit.Test 9 | 10 | class Murmur3PartitionerTokenRangeSplitterTest { 11 | 12 | type TokenRange = com.datastax.spark.connector.rdd.partitioner.dht.TokenRange[Long, LongToken] 13 | 14 | private def assertNoHoles(tokenRanges: Seq[TokenRange]) { 15 | for (Seq(range1, range2) <- tokenRanges.sliding(2)) 16 | assertEquals(range1.end, range2.start) 17 | } 18 | 19 | @Test 20 | def testSplit() { 21 | val node = CassandraNode(InetAddress.getLocalHost, InetAddress.getLocalHost) 22 | val splitter = new Murmur3PartitionerTokenRangeSplitter(2.0) 23 | val range = new TokenRange( 24 | new com.datastax.spark.connector.rdd.partitioner.dht.LongToken(0), 25 | new com.datastax.spark.connector.rdd.partitioner.dht.LongToken(100), 26 | Set(node), None) 27 | val out = splitter.split(range, 20) 28 | 29 | // 2 rows per token on average; to so 10 tokens = 20 rows; therefore 10 splits 30 | assertEquals(10, out.size) 31 | assertEquals(0L, out.head.start.value) 32 | assertEquals(100L, out.last.end.value) 33 | assertTrue(out.forall(s => s.end.value - s.start.value == 10)) 34 | assertTrue(out.forall(_.endpoints == Set(node))) 35 | assertNoHoles(out) 36 | } 37 | 38 | @Test 39 | def testNoSplit() { 40 | val splitter = new Murmur3PartitionerTokenRangeSplitter(2.0) 41 | val range = new TokenRange( 42 | new com.datastax.spark.connector.rdd.partitioner.dht.LongToken(0), new LongToken(100), Set.empty, None) 43 | val out = splitter.split(range, 500) 44 | 45 | // range is too small to contain 500 rows 46 | assertEquals(1, out.size) 47 | assertEquals(0L, out.head.start.value) 48 | assertEquals(100L, out.last.end.value) 49 | } 50 | 51 | @Test 52 | def testZeroRows() { 53 | val splitter = new Murmur3PartitionerTokenRangeSplitter(0.0) 54 | val range = new TokenRange( 55 | new com.datastax.spark.connector.rdd.partitioner.dht.LongToken(0), new LongToken(100), Set.empty, None) 56 | val out = splitter.split(range, 500) 57 | assertEquals(1, out.size) 58 | assertEquals(0L, out.head.start.value) 59 | assertEquals(100L, out.last.end.value) 60 | } 61 | 62 | @Test 63 | def testWrapAround() { 64 | val splitter = new Murmur3PartitionerTokenRangeSplitter(2.0) 65 | val maxValue = Murmur3TokenFactory.maxToken.value 66 | val minValue = Murmur3TokenFactory.minToken.value 67 | val range = new TokenRange( 68 | new com.datastax.spark.connector.rdd.partitioner.dht.LongToken(maxValue - 100), 69 | new com.datastax.spark.connector.rdd.partitioner.dht.LongToken(minValue + 100), Set.empty, None) 70 | val splits = splitter.split(range, 20) 71 | assertEquals(20, splits.size) 72 | assertEquals(maxValue - 100, splits.head.start.value) 73 | assertEquals(minValue + 100, splits.last.end.value) 74 | assertNoHoles(splits) 75 | } 76 | } 77 | -------------------------------------------------------------------------------- 
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/util/JavaApiHelper.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.util 2 | 3 | import java.util.{Map => JavaMap} 4 | 5 | import com.datastax.spark.connector.mapper.{ColumnMapper, JavaBeanColumnMapper} 6 | import com.datastax.spark.connector.rdd.reader.RowReaderFactory 7 | import com.datastax.spark.connector.writer.RowWriterFactory 8 | import com.datastax.spark.connector.CassandraRow 9 | 10 | import scala.collection.JavaConversions._ 11 | import scala.reflect._ 12 | import scala.reflect.api.{Mirror, TypeCreator, _} 13 | import scala.reflect.runtime.universe._ 14 | 15 | /** A helper class to make it possible to access components written in Scala from Java code. 16 | * INTERNAL API 17 | */ 18 | object JavaApiHelper { 19 | 20 | def mirror = runtimeMirror(Thread.currentThread().getContextClassLoader) 21 | 22 | /** Returns a `TypeTag` for the given class. */ 23 | def getTypeTag[T](clazz: Class[T]): TypeTag[T] = TypeTag.synchronized { 24 | TypeTag.apply(mirror, new TypeCreator { 25 | override def apply[U <: Universe with Singleton](m: Mirror[U]): U#Type = { 26 | m.staticClass(clazz.getName).toTypeConstructor 27 | } 28 | }) 29 | } 30 | 31 | /** Returns a `TypeTag` for the given class and type parameters. */ 32 | def getTypeTag[T](clazz: Class[_], typeParams: TypeTag[_]*): TypeTag[T] = TypeTag.synchronized { 33 | TypeTag.apply(mirror, new TypeCreator { 34 | override def apply[U <: Universe with Singleton](m: Mirror[U]) = { 35 | val ct = m.staticClass(clazz.getName).toTypeConstructor.asInstanceOf[m.universe.Type] 36 | val tpt = typeParams.map(_.in(m).tpe.asInstanceOf[m.universe.Type]).toList 37 | m.universe.appliedType(ct, tpt).asInstanceOf[U#Type] 38 | } 39 | }) 40 | } 41 | 42 | /** Returns a `ClassTag` of a given runtime class. */ 43 | def getClassTag[T](clazz: Class[T]): ClassTag[T] = ClassTag(clazz) 44 | 45 | /** Returns a runtime class of a given `TypeTag`. */ 46 | def getRuntimeClass[T](typeTag: TypeTag[T]): Class[T] = mirror.runtimeClass(typeTag.tpe).asInstanceOf[Class[T]] 47 | 48 | /** Returns a runtime class of a given `ClassTag`. */ 49 | def getRuntimeClass[T](classTag: ClassTag[T]): Class[T] = classTag.runtimeClass.asInstanceOf[Class[T]] 50 | 51 | /** Converts a Java `Map` to a Scala immutable `Map`. */ 52 | def toScalaMap[K, V](map: JavaMap[K, V]): Map[K, V] = Map(map.toSeq: _*) 53 | 54 | /** Converts an array to a Scala `Seq`. */ 55 | def toScalaSeq[T](array: Array[T]): Seq[T] = array 56 | 57 | /** Converts a Java `Iterable` to Scala `Seq`. */ 58 | def toScalaSeq[T](iterable: java.lang.Iterable[T]): Seq[T] = iterable.toSeq 59 | 60 | /** Returns the default `RowWriterFactory` initialized with the given `ColumnMapper`. */ 61 | def defaultRowWriterFactory[T](mapper: ColumnMapper[T]) = { 62 | RowWriterFactory.defaultRowWriterFactory(mapper) 63 | } 64 | 65 | /** Returns the `JavaBeanColumnMapper` instance for the given `ClassTag` and column mapping. */ 66 | def javaBeanColumnMapper[T](classTag: ClassTag[T], columnNameOverride: JavaMap[String, String]): ColumnMapper[T] = 67 | new JavaBeanColumnMapper[T](toScalaMap(columnNameOverride))(classTag) 68 | 69 | /** Returns the default `RowReaderFactory`. 
*/ 70 | def genericRowReaderFactory: RowReaderFactory[CassandraRow] = RowReaderFactory.GenericRowReader$ 71 | 72 | } 73 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/cql/AuthConf.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.cql 2 | 3 | import com.datastax.driver.core.{AuthProvider, PlainTextAuthProvider} 4 | import com.datastax.spark.connector.util.ReflectionUtil 5 | import org.apache.spark.SparkConf 6 | 7 | /** Stores credentials used to authenticate to a Cassandra cluster and uses them 8 | * to configure a Cassandra connection. 9 | * This driver provides implementations [[NoAuthConf]] for no authentication 10 | * and [[PasswordAuthConf]] for password authentication. Other authentication 11 | * configurators can be plugged in by setting `cassandra.authentication.conf.factory.class` 12 | * option. See [[AuthConfFactory]]. */ 13 | trait AuthConf extends Serializable { 14 | 15 | /** Returns auth provider to be passed to the `Cluster.Builder` object. */ 16 | def authProvider: AuthProvider 17 | 18 | /** Returns auth credentials to be set in the Thrift authentication request. */ 19 | def thriftCredentials: Map[String, String] 20 | } 21 | 22 | /** Performs no authentication. Use with `AllowAllAuthenticator` in Cassandra. */ 23 | case object NoAuthConf extends AuthConf { 24 | override def authProvider = AuthProvider.NONE 25 | 26 | override def thriftCredentials: Map[String, String] = Map.empty 27 | } 28 | 29 | /** Performs plain-text password authentication. Use with `PasswordAuthenticator` in Cassandra. */ 30 | case class PasswordAuthConf(user: String, password: String) extends AuthConf { 31 | override def authProvider = new PlainTextAuthProvider(user, password) 32 | 33 | override def thriftCredentials: Map[String, String] = Map("username" -> user, "password" -> password) 34 | } 35 | 36 | /** Obtains authentication configuration by reading `SparkConf` object. */ 37 | trait AuthConfFactory { 38 | def authConf(conf: SparkConf): AuthConf 39 | } 40 | 41 | /** Default `AuthConfFactory` that supports no authentication or password authentication. 42 | * Password authentication is enabled when both `spark.cassandra.auth.username` and `spark.cassandra.auth.password` 43 | * options are present in `SparkConf`.*/ 44 | object DefaultAuthConfFactory extends AuthConfFactory { 45 | 46 | val CassandraUserNameProperty = "spark.cassandra.auth.username" 47 | val CassandraPasswordProperty = "spark.cassandra.auth.password" 48 | 49 | def authConf(conf: SparkConf): AuthConf = { 50 | val credentials = 51 | for (username <- conf.getOption(CassandraUserNameProperty); 52 | password <- conf.getOption(CassandraPasswordProperty)) yield (username, password) 53 | 54 | credentials match { 55 | case Some((user, password)) => PasswordAuthConf(user, password) 56 | case None => NoAuthConf 57 | } 58 | } 59 | } 60 | 61 | /** Entry point for obtaining `AuthConf` object from `SparkConf`, used when establishing connections to Cassandra. 62 | * The actual `AuthConf` creation is delegated to the [[AuthConfFactory]] pointed by `spark.cassandra.auth.conf.factory` property. 
*/ 63 | object AuthConf { 64 | val AuthConfFactoryProperty = "spark.cassandra.auth.conf.factory" 65 | 66 | def fromSparkConf(conf: SparkConf) = { 67 | val authConfFactory = conf 68 | .getOption(AuthConfFactoryProperty) 69 | .map(ReflectionUtil.findGlobalObject[AuthConfFactory]) 70 | .getOrElse(DefaultAuthConfFactory) 71 | 72 | authConfFactory.authConf(conf) 73 | } 74 | } 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/it/scala/com/datastax/spark/connector/cql/CassandraPartitionKeyWhereSpec.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.cql 2 | 3 | import com.datastax.spark.connector._ 4 | import com.datastax.spark.connector.testkit.SharedEmbeddedCassandra 5 | import com.datastax.spark.connector.embedded._ 6 | import org.scalatest.{FlatSpec, Matchers} 7 | 8 | class CassandraPartitionKeyWhereSpec extends FlatSpec with Matchers with SharedEmbeddedCassandra with SparkTemplate { 9 | 10 | useCassandraConfig("cassandra-default.yaml.template") 11 | val conn = CassandraConnector(Set(cassandraHost)) 12 | 13 | conn.withSessionDo { session => 14 | session.execute("CREATE KEYSPACE IF NOT EXISTS where_test WITH REPLICATION = { 'class': 'SimpleStrategy', 'replication_factor': 1 }") 15 | 16 | session.execute("CREATE TABLE IF NOT EXISTS where_test.key_value (key INT, group BIGINT, value TEXT, PRIMARY KEY (key, group))") 17 | session.execute("INSERT INTO where_test.key_value (key, group, value) VALUES (1, 100, '0001')") 18 | session.execute("INSERT INTO where_test.key_value (key, group, value) VALUES (2, 200, '0002')") 19 | session.execute("INSERT INTO where_test.key_value (key, group, value) VALUES (3, 300, '0003')") 20 | session.execute("CREATE TABLE IF NOT EXISTS where_test.ckey_value (key1 INT, \"Key2\" BIGINT, group INT, value TEXT, PRIMARY KEY ((key1, \"Key2\"), group))") 21 | session.execute("INSERT INTO where_test.ckey_value (key1, \"Key2\", group, value) VALUES (1, 100, 1000, '0001')") 22 | session.execute("INSERT INTO where_test.ckey_value (key1, \"Key2\", group, value) VALUES (2, 200, 2000, '0002')") 23 | session.execute("INSERT INTO where_test.ckey_value (key1, \"Key2\", group, value) VALUES (3, 300, 3000, '0003')") 24 | 25 | } 26 | 27 | "A CassandraRDD" should "allow partition key eq in where" in { 28 | val rdd = sc.cassandraTable("where_test", "key_value").where("key = ?", 1) 29 | val result = rdd.collect() 30 | result should have length 1 31 | result.head.getInt("key") should be (1) 32 | } 33 | 34 | it should "allow partition key 'in' in where" in { 35 | val result = sc.cassandraTable("where_test", "key_value").where("key in (?, ?)", 2,3).collect() 36 | result should have length 2 37 | result.head.getInt("key") should (be (2) or be (3)) 38 | } 39 | 40 | it should "allow cluster key 'in' in where" in { 41 | val result = sc.cassandraTable("where_test", "key_value").where("group in (?, ?)", 200,300).collect() 42 | result should have length 2 43 | result.head.getInt("key") should (be (2) or be (3)) 44 | } 45 | 46 | it should "work with composite keys in" in { 47 | val result = sc.cassandraTable("where_test", "ckey_value").where("key1 = 1 and \"Key2\" in (?, ?)", 100,200).collect() 48 | result should have length 1 49 | result.head.getInt("key1") should be (1) 50 | } 51 | 52 | it should "work with composite keys eq" in { 53 | val result = sc.cassandraTable("where_test", "ckey_value").where("key1 = ? 
and \"Key2\" = ?", 1,100).collect() 54 | result should have length 1 55 | result.head.getInt("key1") should be (1) 56 | } 57 | 58 | it should "work with composite keys in2" in { 59 | val result = sc.cassandraTable("where_test", "ckey_value").where("\"Key2\" in (?, ?) and key1 = 1", 100,200).collect() 60 | result should have length 1 61 | result.head.getInt("key1") should be (1) 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/writer/WriteOptionTest.scala: -------------------------------------------------------------------------------- 1 | package com.datastax.spark.connector.writer 2 | 3 | import java.util.Date 4 | import java.util.concurrent.TimeUnit 5 | 6 | import org.apache.spark.streaming.{Duration => SparkDuration} 7 | import org.joda.time.{DateTime, Duration => JodaDuration} 8 | import org.scalatest.{FlatSpec, Matchers} 9 | 10 | import scala.concurrent.duration.{Duration => ScalaDuration} 11 | 12 | class WriteOptionTest extends FlatSpec with Matchers { 13 | 14 | "TTLOption" should "properly create constant write option with duration in seconds" in { 15 | val option = TTLOption.constant(5) 16 | option shouldBe a[StaticWriteOption[_]] 17 | option.asInstanceOf[StaticWriteOption[Int]].value should be(5) 18 | } 19 | 20 | it should "properly create constant write option with scala.concurrent.duration.Duration" in { 21 | val option = TTLOption.constant(ScalaDuration.apply(5, TimeUnit.SECONDS)) 22 | option shouldBe a[StaticWriteOption[_]] 23 | option.asInstanceOf[StaticWriteOption[Int]].value should be(5) 24 | } 25 | 26 | it should "properly create constant write option with scala.concurrent.duration.Duration.Infinite" in { 27 | val option = TTLOption.constant(ScalaDuration.Inf) 28 | option shouldBe a[StaticWriteOption[_]] 29 | option.asInstanceOf[StaticWriteOption[Int]].value should be(0) 30 | } 31 | 32 | it should "properly create constant write option with org.apache.spark.streaming.Duration" in { 33 | val option = TTLOption.constant(SparkDuration.apply(5123L)) 34 | option shouldBe a[StaticWriteOption[_]] 35 | option.asInstanceOf[StaticWriteOption[Int]].value should be(5) 36 | } 37 | 38 | it should "properly create constant write option with org.joda.time.Duration" in { 39 | val option = TTLOption.constant(JodaDuration.millis(5123L)) 40 | option shouldBe a[StaticWriteOption[_]] 41 | option.asInstanceOf[StaticWriteOption[Int]].value should be(5) 42 | } 43 | 44 | it should "properly create infinite duration" in { 45 | val option = TTLOption.forever 46 | option shouldBe a[StaticWriteOption[_]] 47 | option.asInstanceOf[StaticWriteOption[Int]].value should be(0) 48 | } 49 | 50 | it should "properly create per-row duration placeholder" in { 51 | val option = TTLOption.perRow("test") 52 | option shouldBe a[PerRowWriteOption[_]] 53 | option.asInstanceOf[PerRowWriteOption[Int]].placeholder should be("test") 54 | } 55 | 56 | "TimestampOption" should "properly create constant write option with timestamp in microseconds" in { 57 | val option = TimestampOption.constant(12345L) 58 | option shouldBe a[StaticWriteOption[_]] 59 | option.asInstanceOf[StaticWriteOption[Long]].value should be(12345L) 60 | } 61 | 62 | it should "properly create constant write option with DateTime" in { 63 | val option = TimestampOption.constant(new DateTime(2010, 5, 6, 7, 8, 8, 10)) 64 | option shouldBe a[StaticWriteOption[_]] 65 | option.asInstanceOf[StaticWriteOption[Long]].value should be(new 
DateTime(2010, 5, 6, 7, 8, 8, 10).getMillis * 1000L) 66 | } 67 | 68 | it should "properly create constant write option with Date" in { 69 | val t = new Date() 70 | val option = TimestampOption.constant(t) 71 | option shouldBe a[StaticWriteOption[_]] 72 | option.asInstanceOf[StaticWriteOption[Long]].value should be(t.getTime * 1000L) 73 | } 74 | 75 | } 76 | --------------------------------------------------------------------------------
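
The AuthConf scaladoc above states that custom authentication configurators can be plugged in through the `spark.cassandra.auth.conf.factory` property, which `AuthConf.fromSparkConf` resolves via `ReflectionUtil.findGlobalObject[AuthConfFactory]`. Below is a minimal sketch of such a factory; the package `com.example.auth`, the object name, and the `cassandra.username`/`cassandra.password` property names are hypothetical, and the factory simply falls back to the connector's own `PasswordAuthConf`/`NoAuthConf` implementations shown above.

package com.example.auth  // hypothetical package, not part of the connector

import com.datastax.spark.connector.cql.{AuthConf, AuthConfFactory, NoAuthConf, PasswordAuthConf}
import org.apache.spark.SparkConf

/** Reads credentials from legacy property names and maps them onto the
  * connector's AuthConf implementations. It must be a global `object` so that
  * ReflectionUtil.findGlobalObject can resolve the singleton by name. */
object LegacyPropertyAuthConfFactory extends AuthConfFactory {
  override def authConf(conf: SparkConf): AuthConf =
    (conf.getOption("cassandra.username"), conf.getOption("cassandra.password")) match {
      case (Some(user), Some(password)) => PasswordAuthConf(user, password)
      case _                            => NoAuthConf
    }
}

Wiring it in is then a matter of setting the factory property to the object's fully-qualified name, e.g. `sparkConf.set("spark.cassandra.auth.conf.factory", "com.example.auth.LegacyPropertyAuthConfFactory")`; when no factory is set, `DefaultAuthConfFactory` above takes over and only honours `spark.cassandra.auth.username`/`spark.cassandra.auth.password`.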
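
CassandraPartitionKeyWhereSpec above exercises the `where(...)` pushdown on partition and clustering keys from inside a test fixture. A short stand-alone sketch of the same calls follows, assuming a local master, a Cassandra node reachable at 127.0.0.1, and the `where_test.key_value` table created by the spec; the host, master, and app name are placeholders.

import com.datastax.spark.connector._          // brings in sc.cassandraTable and where()
import org.apache.spark.{SparkConf, SparkContext}

object WhereClauseDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[2]")                                  // placeholder master
      .setAppName("where-clause-demo")
      .set("spark.cassandra.connection.host", "127.0.0.1")    // placeholder host
    val sc = new SparkContext(conf)

    // Both predicates are pushed down to Cassandra rather than filtered in Spark.
    val byKey   = sc.cassandraTable("where_test", "key_value").where("key = ?", 1)
    val byKeyIn = sc.cassandraTable("where_test", "key_value").where("key in (?, ?)", 2, 3)

    println(byKey.collect().toList)     // expect the single row with key = 1
    println(byKeyIn.collect().toList)   // expect the rows with keys 2 and 3
    sc.stop()
  }
}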
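
WriteOptionTest above pins down the units used by the write options: `TTLOption.constant` stores whole seconds (with `forever` mapping to 0), while `TimestampOption.constant` stores microseconds. The sketch below shows how such options are typically attached to a save; it assumes a `WriteConf` case class exposing `ttl` and `timestamp` parameters and a `writeConf` argument on `saveToCassandra`, neither of which appears in this section, so treat those exact shapes as assumptions.

import scala.concurrent.duration._

import com.datastax.spark.connector._
import com.datastax.spark.connector.writer.{TTLOption, TimestampOption, WriteConf}
import org.apache.spark.{SparkConf, SparkContext}

object WriteOptionsDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf()
        .setMaster("local[2]")
        .setAppName("write-options-demo")
        .set("spark.cassandra.connection.host", "127.0.0.1"))   // placeholder host

    val rows = sc.parallelize(Seq((4, 400L, "0004"), (5, 500L, "0005")))

    // Assumed WriteConf(ttl = ..., timestamp = ...) shape -- adjust to the real signature.
    rows.saveToCassandra("where_test", "key_value",
      SomeColumns("key", "group", "value"),
      writeConf = WriteConf(
        ttl = TTLOption.constant(1.hour),                                           // rows expire after 3600 s
        timestamp = TimestampOption.constant(System.currentTimeMillis() * 1000L)))  // microseconds

    sc.stop()
  }
}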