├── project
    ├── build.properties
    ├── plugins.sbt
    └── Versions.scala
├── rootdoc.txt
├── spark-cassandra-connector
    └── src
    │   ├── it
    │       ├── resources
    │       │   ├── triggers
    │       │   │   └── README.txt
    │       │   └── log4j.properties
    │       └── scala
    │       │   └── com
    │       │       └── datastax
    │       │           └── spark
    │       │               └── connector
    │       │                   ├── cql
    │       │                       ├── CassandraAuthenticatedConnectorSpec.scala
    │       │                       └── CassandraPartitionKeyWhereSpec.scala
    │       │                   └── streaming
    │       │                       ├── StreamingSpec.scala
    │       │                       └── ActorStreamSpec.scala
    │   ├── main
    │       └── scala
    │       │   ├── org
    │       │       └── apache
    │       │       │   └── spark
    │       │       │       └── sql
    │       │       │           └── cassandra
    │       │       │               ├── package-info.java
    │       │       │               ├── package.scala
    │       │       │               ├── api
    │       │       │                   └── java
    │       │       │                   │   └── JavaCassandraSQLContext.scala
    │       │       │               ├── InsertIntoCassandraTable.scala
    │       │       │               ├── CassandraCatalog.scala
    │       │       │               └── CassandraSQLRow.scala
    │       │   └── com
    │       │       └── datastax
    │       │           └── spark
    │       │               └── connector
    │       │                   ├── util
    │       │                       ├── package.scala
    │       │                       ├── CountingIterator.scala
    │       │                       ├── MagicalTypeTricks.scala
    │       │                       ├── ReflectionUtil.scala
    │       │                       └── JavaApiHelper.scala
    │       │                   ├── writer
    │       │                       ├── package.scala
    │       │                       ├── QueryExecutor.scala
    │       │                       ├── RowWriter.scala
    │       │                       ├── GenericRowWriter.scala
    │       │                       ├── SqlRowWriter.scala
    │       │                       ├── PropertyExtractor.scala
    │       │                       ├── ConvertingPropertyExtractor.scala
    │       │                       ├── AbstractRowWriter.scala
    │       │                       ├── RowWriterFactory.scala
    │       │                       ├── ObjectSizeEstimator.scala
    │       │                       ├── WriteOption.scala
    │       │                       ├── AsyncExecutor.scala
    │       │                       └── WritableToCassandra.scala
    │       │                   ├── rdd
    │       │                       ├── package.scala
    │       │                       ├── partitioner
    │       │                       │   ├── package.scala
    │       │                       │   ├── TokenRangeSplitter.scala
    │       │                       │   ├── dht
    │       │                       │   │   ├── Token.scala
    │       │                       │   │   ├── TokenRange.scala
    │       │                       │   │   └── TokenFactory.scala
    │       │                       │   ├── CassandraRDDPartition.scala
    │       │                       │   ├── Murmur3PartitionerTokenRangeSplitter.scala
    │       │                       │   ├── RandomPartitionerTokenRangeSplitter.scala
    │       │                       │   ├── ServerSideTokenRangeSplitter.scala
    │       │                       │   └── TokenRangeClusterer.scala
    │       │                       ├── reader
    │       │                       │   ├── package.scala
    │       │                       │   ├── PrefetchingResultSetIterator.scala
    │       │                       │   ├── RowReader.scala
    │       │                       │   ├── KeyValueRowReader.scala
    │       │                       │   └── ValueRowReader.scala
    │       │                       ├── ValidRDDType.scala
    │       │                       ├── CqlWhereClause.scala
    │       │                       └── ReadConf.scala
    │       │                   ├── mapper
    │       │                       ├── package.scala
    │       │                       ├── ColumnMap.scala
    │       │                       ├── TupleColumnMapper.scala
    │       │                       ├── JavaBeanColumnMapper.scala
    │       │                       ├── DefaultColumnMapper.scala
    │       │                       └── ReflectionColumnMapper.scala
    │       │                   ├── types
    │       │                       ├── package.scala
    │       │                       ├── TimestampFormatter.scala
    │       │                       ├── TimestampParser.scala
    │       │                       ├── CollectionColumnType.scala
    │       │                       └── ColumnType.scala
    │       │                   ├── cql
    │       │                       ├── package.scala
    │       │                       ├── MultipleRetryPolicy.scala
    │       │                       ├── CassandraClientProxy.scala
    │       │                       ├── PreparedStatementCache.scala
    │       │                       ├── CassandraConnectorConf.scala
    │       │                       ├── RefCountMap.scala
    │       │                       ├── SessionProxy.scala
    │       │                       └── AuthConf.scala
    │       │                   ├── streaming
    │       │                       ├── package.scala
    │       │                       ├── CassandraStreamingRDD.scala
    │       │                       ├── DStreamFunctions.scala
    │       │                       └── StreamingContextFunctions.scala
    │       │                   ├── ColumnSelector.scala
    │       │                   ├── BatchSize.scala
    │       │                   ├── RDDFunctions.scala
    │       │                   ├── ColumnRef.scala
    │       │                   ├── package.scala
    │       │                   └── SparkContextFunctions.scala
    │   └── test
    │       ├── scala
    │           └── com
    │           │   └── datastax
    │           │       └── spark
    │           │           └── connector
    │           │               ├── testkit
    │           │                   ├── package.scala
    │           │                   └── SparkCassandraFixture.scala
    │           │               ├── writer
    │           │                   ├── DefaultRowWriterTest.scala
    │           │                   ├── PropertyExtractorTest.scala
    │           │                   ├── ObjectSizeEstimatorTest.scala
    │           │                   ├── ConvertingPropertyExtractorTest.scala
    │           │                   ├── AsyncExecutorTest.scala
    │           │                   ├── WriteConfTest.scala
    │           │                   └── WriteOptionTest.scala
    │           │               ├── rdd
    │           │                   ├── reader
    │           │                   │   └── ClassBasedRowReaderTest.scala
    │           │                   └── partitioner
    │           │                   │   ├── RandomPartitionerTokenRangeSplitterTest.scala
    │           │                   │   └── Murmur3PartitionerTokenRangeSplitterTest.scala
    │           │               ├── samples.scala
    │           │               ├── streaming
    │           │                   └── TestProducer.scala
    │           │               ├── types
    │           │                   ├── CanBuildFromTest.scala
    │           │                   └── TypeSerializationTest.scala
    │           │               ├── mapper
    │           │                   └── TupleColumnMapperTest.scala
    │           │               └── util
    │           │                   └── ReflectionUtilSpec.scala
    │       └── java
    │           └── com
    │               └── datastax
    │                   └── spark
    │                       └── connector
    │                           ├── SampleJavaBeanWithoutNoArgsCtor.java
    │                           ├── SampleJavaBean.java
    │                           ├── SampleJavaBeanWithMultipleCtors.java
    │                           ├── SampleWithNestedJavaBean.java
    │                           └── SampleWithDeeplyNestedJavaBean.java
├── .travis.yml
├── .gitignore
├── spark-cassandra-connector-embedded
    └── src
    │   └── main
    │       └── scala
    │           └── com
    │               └── datastax
    │                   └── spark
    │                       └── connector
    │                           └── embedded
    │                               ├── Event.scala
    │                               ├── package.scala
    │                               ├── SparkTemplate.scala
    │                               ├── SparkRepl.scala
    │                               ├── Assertions.scala
    │                               ├── KafkaProducer.scala
    │                               ├── KafkaConsumer.scala
    │                               └── EmbeddedZookeeper.scala
├── spark-cassandra-connector-java
    └── src
    │   ├── main
    │       ├── scala
    │       │   └── com
    │       │   │   └── datastax
    │       │   │       └── spark
    │       │   │           └── connector
    │       │   │               └── japi
    │       │   │                   └── types
    │       │   │                       └── JavaTypeConverter.scala
    │       └── java
    │       │   └── com
    │       │       └── datastax
    │       │           └── spark
    │       │               └── connector
    │       │                   └── japi
    │       │                       ├── StreamingContextJavaFunctions.java
    │       │                       ├── RDDJavaFunctions.java
    │       │                       ├── DStreamJavaFunctions.java
    │       │                       └── GenericJavaRowReaderFactory.java
    │   └── test
    │       └── java
    │           └── com
    │               └── datastax
    │                   └── spark
    │                       └── connector
    │                           └── japi
    │                               └── CustomTypeConverterTest.java
├── spark-cassandra-connector-demos
    ├── simple-demos
    │   └── src
    │   │   └── main
    │   │       ├── resources
    │   │           ├── application.conf
    │   │           ├── log4j.properties
    │   │           └── data
    │   │           │   └── words
    │   │       └── scala
    │   │           └── com
    │   │               └── datastax
    │   │                   └── spark
    │   │                       └── connector
    │   │                           └── demo
    │   │                               ├── DemoApp.scala
    │   │                               ├── SparkCassandraSettings.scala
    │   │                               ├── WordCountDemo.scala
    │   │                               ├── TableCopyDemo.scala
    │   │                               ├── BasicReadWriteDemo.scala
    │   │                               └── SQLDemo.scala
    ├── twitter-streaming
    │   └── src
    │   │   └── main
    │   │       ├── resources
    │   │           ├── application.conf
    │   │           └── log4j.properties
    │   │       └── scala
    │   │           └── com
    │   │               └── datastax
    │   │                   └── spark
    │   │                       └── connector
    │   │                           └── demo
    │   │                               └── TwitterStreamingHashTagsByInterval.scala
    └── kafka-streaming
    │   └── src
    │       └── main
    │           └── resources
    │               ├── log4j.properties
    │               └── data
    │                   └── words
├── scripts
    └── submit-demos
├── doc
    ├── 10_embedded.md
    ├── 3_selection.md
    └── 5_saving.md
└── sbt
    └── sbt


/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version=0.13.6
2 | 


--------------------------------------------------------------------------------
/rootdoc.txt:
--------------------------------------------------------------------------------
1 | Cassandra connector for Apache Spark.
2 | See documentation of package [[com.datastax.spark.connector]].


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/it/resources/triggers/README.txt:
--------------------------------------------------------------------------------
1 | Place triggers to be loaded in this directory, as jar files.
2 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/org/apache/spark/sql/cassandra/package-info.java:
--------------------------------------------------------------------------------
1 | package org.apache.spark.sql.cassandra;


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: scala
2 | jdk: oraclejdk7
3 | sudo: false
4 | scala:
5 |   - 2.10.4
6 | script:
7 |   - "sbt ++$TRAVIS_SCALA_VERSION test"
8 | 
9 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/org/apache/spark/sql/cassandra/package.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.sql
2 | 
3 | package object cassandra {
4 | 
5 | }
6 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/util/package.scala:
--------------------------------------------------------------------------------
1 | package com.datastax.spark.connector
2 | 
3 | /** Useful stuff that didn't fit elsewhere. */
4 | package object util {
5 | 
6 | }
7 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/package.scala:
--------------------------------------------------------------------------------
1 | package com.datastax.spark.connector
2 | 
3 | /** Contains components for writing RDDs to Cassandra */
4 | package object writer {
5 | 
6 | }
7 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/package.scala:
--------------------------------------------------------------------------------
1 | package com.datastax.spark.connector
2 | 
3 | 
4 | /** Contains [[com.datastax.spark.connector.rdd.CassandraRDD]] class that is the main entry point for
5 |   * analyzing Cassandra data from Spark. */
6 | package object rdd {
7 | 
8 | }
9 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.class
 2 | *.log
 3 | .DS_Store
 4 | # sbt specific
 5 | .cache/
 6 | .history/
 7 | .lib/
 8 | dist/*
 9 | target/
10 | lib_managed/
11 | src_managed/
12 | project/boot/
13 | project/plugins/project/
14 | 
15 | # Scala-IDE specific
16 | .scala_dependencies
17 | .worksheet
18 | .idea
19 | .idea_modules
20 | 
21 | checkpoint
22 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/QueryExecutor.scala:
--------------------------------------------------------------------------------
1 | package com.datastax.spark.connector.writer
2 | 
3 | import com.datastax.driver.core.{Statement, Session}
4 | 
5 | class QueryExecutor(session: Session, maxConcurrentQueries: Int)
6 |   extends AsyncExecutor(session.executeAsync(_ : Statement), maxConcurrentQueries)
7 | 
8 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/partitioner/package.scala:
--------------------------------------------------------------------------------
1 | package com.datastax.spark.connector.rdd
2 | 
3 | /** Provides components for partitioning a Cassandra table into smaller parts of appropriate size.
4 |   * Each partition can be processed locally on at least one cluster node. */
5 | package object partitioner {
6 | 
7 | }
8 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/mapper/package.scala:
--------------------------------------------------------------------------------
1 | package com.datastax.spark.connector
2 | 
3 | /** Provides machinery for mapping Cassandra tables to user defined Scala classes or tuples.
4 |   * The main class in this package is [[mapper.ColumnMapper]] responsible for matching Scala object's
5 |   * properties with Cassandra column names.*/
6 | package object mapper {
7 | 
8 | }
9 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/types/package.scala:
--------------------------------------------------------------------------------
1 | package com.datastax.spark.connector
2 | 
3 | /** Offers type conversion magic, so you can receive Cassandra column values in a form you like the most.
4 |   * Simply specify the type you want to use on the Scala side, and the column value will be converted automatically.
5 |   * Works also with complex objects like collections. */
6 | package object types {
7 | 
8 | }
9 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/reader/package.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.rdd
 2 | 
 3 | import com.datastax.spark.connector.CassandraRow
 4 | 
 5 | /** Provides components for reading data rows from Cassandra and converting them to objects of desired type.
 6 |   * Additionally provides a generic [[CassandraRow CassandraRow]] class which can represent any row.*/
 7 | package object reader {
 8 | 
 9 | }
10 | 


--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
 1 | addSbtPlugin("com.github.gseitz" % "sbt-release" % "0.7.1")
 2 | 
 3 | addSbtPlugin("com.typesafe.sbt" % "sbt-scalariform" % "1.3.0")
 4 | 
 5 | addSbtPlugin("com.typesafe.sbt" % "sbt-git" % "0.6.2")
 6 | 
 7 | addSbtPlugin("com.typesafe" % "sbt-mima-plugin" % "0.1.6")
 8 | 
 9 | addSbtPlugin("com.typesafe.sbt" % "sbt-pgp" % "0.8.3")
10 | 
11 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2")
12 | 
13 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.7.4")
14 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/cql/package.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector
 2 | 
 3 | 
 4 | /** Contains a [[cql.CassandraConnector]] object which is used to connect
 5 |   * to a Cassandra cluster and to send CQL statements to it. `CassandraConnector`
 6 |   * provides a Scala-idiomatic way of working with `Cluster` and `Session` object
 7 |   * and takes care of connection pooling and proper resource disposal.*/
 8 | package object cql {
 9 | 
10 | }
11 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/testkit/package.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector
 2 | 
 3 | import scala.collection.immutable
 4 | import scala.concurrent.duration._
 5 | import akka.util.Timeout
 6 | 
 7 | package object testkit {
 8 | 
 9 |   final val DefaultHost = "127.0.0.1"
10 | 
11 |   implicit val DefaultTimeout = Timeout(5.seconds)
12 | 
13 |   val data = immutable.Set("words ", "may ", "count ")
14 | 
15 |   val actorName = "my-actor"
16 | 
17 | }
18 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/util/CountingIterator.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.util
 2 | 
 3 | /** Counts elements fetched form the underlying iterator. */
 4 | class CountingIterator[T](iterator: Iterator[T]) extends Iterator[T] {
 5 |   private var _count = 0
 6 | 
 7 |   /** Returns the number of successful invocations of `next` */
 8 |   def count = _count
 9 | 
10 |   def hasNext = iterator.hasNext
11 | 
12 |   def next() = {
13 |     val item = iterator.next()
14 |     _count += 1
15 |     item
16 |   }
17 | }
18 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/writer/DefaultRowWriterTest.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.writer
 2 | 
 3 | import com.datastax.spark.connector.cql.TableDef
 4 | import org.apache.commons.lang3.SerializationUtils
 5 | import org.junit.Test
 6 | 
 7 | class DefaultRowWriterTest {
 8 | 
 9 |   @Test
10 |   def testSerializability() {
11 |     val table = TableDef("test", "table", Nil, Nil, Nil)
12 |     val rowWriter = new DefaultRowWriter[DefaultRowWriterTest](table, Nil)
13 |     SerializationUtils.roundtrip(rowWriter)
14 |   }
15 | 
16 | }
17 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/partitioner/TokenRangeSplitter.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.rdd.partitioner
 2 | 
 3 | import com.datastax.spark.connector.rdd.partitioner.dht.{Token, TokenRange}
 4 | 
 5 | /** Splits a token range into smaller sub-ranges,
 6 |   * each with the desired approximate number of rows. */
 7 | trait TokenRangeSplitter[V, T <: Token[V]] {
 8 | 
 9 |   /** Splits given token range into n equal sub-ranges. */
10 |   def split(range: TokenRange[V, T], splitSize: Long): Seq[TokenRange[V, T]]
11 | }
12 | 
13 | 
14 | 
15 | 
16 | 
17 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/streaming/package.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector
 2 | 
 3 | import org.apache.spark.streaming.StreamingContext
 4 | import org.apache.spark.streaming.dstream.DStream
 5 | 
 6 | import scala.reflect.ClassTag
 7 | 
 8 | package object streaming {
 9 | 
10 |   implicit def toStreamingContextFunctions(ssc: StreamingContext): SparkContextFunctions =
11 |     new StreamingContextFunctions(ssc)
12 | 
13 |   implicit def toDStreamFunctions[T: ClassTag](ds: DStream[T]): DStreamFunctions[T] =
14 |     new DStreamFunctions[T](ds)
15 | 
16 | }
17 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/ColumnSelector.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector
 2 | 
 3 | import scala.language.implicitConversions
 4 | 
 5 | sealed trait ColumnSelector
 6 | case object AllColumns extends ColumnSelector
 7 | case class SomeColumns(columns: NamedColumnRef*) extends ColumnSelector
 8 | 
 9 | object SomeColumns {
10 |   @deprecated("Use com.datastax.spark.connector.rdd.SomeColumns instead of Seq", "1.0")
11 |   implicit def seqToSomeColumns(columns: Seq[String]): SomeColumns =
12 |     SomeColumns(columns.map(x => x: NamedColumnRef): _*)
13 | }
14 | 
15 | 
16 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/BatchSize.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector
 2 | 
 3 | import com.datastax.spark.connector.writer.WriteConf
 4 | 
 5 | sealed trait BatchSize
 6 | 
 7 | case class RowsInBatch(batchSize: Int) extends BatchSize
 8 | case class BytesInBatch(batchSize: Int) extends BatchSize
 9 | 
10 | object BatchSize {
11 |   @deprecated("Use com.datastax.spark.connector.FixedBatchSize instead of a number", "1.1")
12 |   implicit def intToFixedBatchSize(batchSize: Int): RowsInBatch = RowsInBatch(batchSize)
13 | 
14 |   val Automatic = BytesInBatch(WriteConf.DefaultBatchSizeInBytes)
15 | }
16 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/partitioner/dht/Token.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.rdd.partitioner.dht
 2 | 
 3 | trait Token[T] extends Ordered[Token[T]] {
 4 |   def value: T
 5 | }
 6 | 
 7 | case class LongToken(value: Long) extends Token[Long] {
 8 |   override def compare(that: Token[Long]) = value.compareTo(that.value)
 9 |   override def toString = value.toString
10 | }
11 | 
12 | case class BigIntToken(value: BigInt) extends Token[BigInt] {
13 |   override def compare(that: Token[BigInt]) = value.compare(that.value)
14 |   override def toString = value.toString()
15 | }
16 | 
17 | 
18 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/types/TimestampFormatter.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.types
 2 | 
 3 | import java.util.Date
 4 | 
 5 | import org.apache.cassandra.serializers.TimestampSerializer
 6 | import org.joda.time.DateTime
 7 | import org.joda.time.format.DateTimeFormat
 8 | 
 9 | /** Formats timestamps and dates using CQL timestamp format `yyyy-MM-dd HH:mm:ssZ` */
10 | object TimestampFormatter {
11 | 
12 |   private val TimestampPattern = "yyyy-MM-dd HH:mm:ssZ"
13 | 
14 |   def format(date: Date): String =
15 |     DateTimeFormat.forPattern(TimestampPattern).print(new DateTime(date.getTime))
16 | }
17 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/rdd/reader/ClassBasedRowReaderTest.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.rdd.reader
 2 | 
 3 | import com.datastax.spark.connector.cql.TableDef
 4 | import org.apache.commons.lang3.SerializationUtils
 5 | import org.junit.Test
 6 | 
 7 | case class TestClass(a: String, b: Int, c: Option[Long])
 8 | 
 9 | class ClassBasedRowReaderTest {
10 | 
11 |   private val tableDef = TableDef("test", "table", Nil, Nil, Nil)
12 | 
13 |   @Test
14 |   def testSerialize() {
15 |     val reader = new ClassBasedRowReader[TestClass](tableDef)
16 |     SerializationUtils.roundtrip(reader)
17 |   }
18 | 
19 | }
20 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector-embedded/src/main/scala/com/datastax/spark/connector/embedded/Event.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.embedded
 2 | 
 3 | import akka.actor.ActorRef
 4 | 
 5 | object Event {
 6 | 
 7 |   sealed trait Status extends Serializable
 8 | 
 9 |   case class ReceiverStarted(ref: ActorRef) extends Status
10 | 
11 |   case class Pushed(data: AnyRef) extends Status
12 | 
13 |   case object Completed extends Status
14 | 
15 |   case object Report extends Status
16 | 
17 |   sealed trait Task extends Serializable
18 |   case object QueryTask extends Task
19 | 
20 |   case class WordCount(word: String, count: Int) extends Serializable
21 | 
22 | }
23 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector-embedded/src/main/scala/com/datastax/spark/connector/embedded/package.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector
 2 | 
 3 | import java.net.InetAddress
 4 | 
 5 | import scala.concurrent.duration.FiniteDuration
 6 | 
 7 | package object embedded {
 8 | 
 9 |   implicit val ZookeeperConnectionString = s"${InetAddress.getLocalHost.getHostAddress}:2181"
10 | 
11 |   /* Factor by which to scale timeouts during tests, e.g. to account for shared build system load. */
12 |   implicit class SparkTestDuration(val duration: FiniteDuration) extends AnyVal {
13 |     def dilated: FiniteDuration = (duration * 1.0).asInstanceOf[FiniteDuration]
14 |   }
15 | }
16 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector-java/src/main/scala/com/datastax/spark/connector/japi/types/JavaTypeConverter.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.japi.types
 2 | 
 3 | import akka.japi.JavaPartialFunction
 4 | import com.datastax.spark.connector.types.NullableTypeConverter
 5 | 
 6 | import scala.reflect.runtime.universe._
 7 | 
 8 | class JavaTypeConverter[T <: AnyRef](typeTag: TypeTag[T], convertFunction: JavaPartialFunction[Any, T])
 9 |   extends NullableTypeConverter[T] {
10 | 
11 |   override def targetTypeTag: TypeTag[T] = typeTag
12 | 
13 |   override def convertPF: PartialFunction[Any, T] = convertFunction
14 | 
15 |   def noMatch() = JavaPartialFunction.noMatch()
16 | }
17 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector-demos/simple-demos/src/main/resources/application.conf:
--------------------------------------------------------------------------------
 1 | ####################################
 2 | # Streaming Demo Reference Config File #
 3 | ####################################
 4 | 
 5 | streaming-demo {
 6 | 
 7 |   # spark://127.0.0.1@7077,127.0.0.2@7077,127.0.0.3@7077
 8 |   # or a local spark://host@7077
 9 |   # This defaults to local
10 |   spark.master = "local[12]"
11 |   # Would normally be `ms` in config but Spark just wants the Long
12 |   spark.streaming.batch.duration = 300
13 |   spark.cleaner.ttl = 3600
14 |   spark.cassandra.connection.host =  "127.0.0.1"
15 | 
16 |   spark.cassandra.keyspace = "streaming_demo"
17 |   spark.cassandra.table = "words"
18 |   data = ["words ", "may ", "count "]
19 | }


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/ValidRDDType.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.rdd
 2 | 
 3 | import java.io.{Serializable => JavaSerializable}
 4 | 
 5 | import com.datastax.spark.connector.types.TypeConverter
 6 | 
 7 | import scala.annotation.implicitNotFound
 8 | 
 9 | @implicitNotFound("Not a valid RDD type. There should exists either a type converter for the type or the type should implement Serializable")
10 | trait ValidRDDType[T]
11 | 
12 | object ValidRDDType {
13 |   implicit def withTypeConverterAsValidRDDType[T](implicit tc: TypeConverter[T]): ValidRDDType[T] = null
14 | 
15 |   implicit def javaSerializableAsValidRDDType[T <: JavaSerializable]: ValidRDDType[T] = null
16 | }
17 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector-embedded/src/main/scala/com/datastax/spark/connector/embedded/SparkTemplate.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.embedded
 2 | 
 3 | import org.apache.spark.{SparkEnv, SparkConf, SparkContext}
 4 | 
 5 | trait SparkTemplate {
 6 |   val conf = SparkTemplate.conf
 7 |   val sc = SparkTemplate.sc
 8 | }
 9 | 
10 | object SparkTemplate {
11 | 
12 |   val conf = new SparkConf(true)
13 |     .set("spark.cassandra.connection.host", EmbeddedCassandra.cassandraHost.getHostAddress)
14 |     .set("spark.cleaner.ttl", "3600")
15 |     .setMaster(sys.env.getOrElse("IT_TEST_SPARK_MASTER", "local[*]"))
16 |     .setAppName(getClass.getSimpleName)
17 | 
18 | 
19 |   val sc = new SparkContext(conf)
20 | 
21 |   lazy val actorSystem = SparkEnv.get.actorSystem
22 | 
23 | }
24 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/mapper/ColumnMap.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.mapper
 2 | 
 3 | import com.datastax.spark.connector.ColumnRef
 4 | 
 5 | /** Associates constructor parameters and property accessors with table columns */
 6 | trait ColumnMap extends Serializable {
 7 |   def constructor: Seq[ColumnRef]
 8 | 
 9 |   def getters: Map[String, ColumnRef]
10 | 
11 |   def setters: Map[String, ColumnRef]
12 | 
13 |   def allowsNull: Boolean
14 | }
15 | 
16 | case class SimpleColumnMap(constructor: Seq[ColumnRef],
17 |                            getters: Map[String, ColumnRef],
18 |                            setters: Map[String, ColumnRef],
19 |                            allowsNull: Boolean = false) extends ColumnMap
20 | 
21 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/org/apache/spark/sql/cassandra/api/java/JavaCassandraSQLContext.scala:
--------------------------------------------------------------------------------
 1 | package org.apache.spark.sql.cassandra.api.java
 2 | 
 3 | import org.apache.spark.api.java.JavaSparkContext
 4 | import org.apache.spark.sql.api.java.{JavaSQLContext, JavaSchemaRDD}
 5 | import org.apache.spark.sql.cassandra.CassandraSQLContext
 6 | 
 7 | class JavaCassandraSQLContext(sparkContext: JavaSparkContext) extends JavaSQLContext(sparkContext) {
 8 | 
 9 |     override val sqlContext = new CassandraSQLContext(sparkContext)
10 | 
11 |     /**
12 |       * Executes a query expressed in SQL, returning the result as a JavaSchemaRDD.
13 |       */
14 |     def cql(cqlQuery: String): JavaSchemaRDD =
15 |       new JavaSchemaRDD(sqlContext, sqlContext.parseSql(cqlQuery))
16 | }
17 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/testkit/SparkCassandraFixture.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.testkit
 2 | 
 3 | import org.scalatest.{BeforeAndAfter, Matchers, WordSpecLike}
 4 | import com.datastax.spark.connector.cql.CassandraConnector
 5 | import com.datastax.spark.connector.embedded.EmbeddedCassandra
 6 | 
 7 | /** Basic unit test abstraction. */
 8 | trait AbstractSpec extends WordSpecLike with Matchers with BeforeAndAfter
 9 | 
10 | /** Used for IT tests. */
11 | trait SharedEmbeddedCassandra extends EmbeddedCassandra {
12 | 
13 |   def clearCache(): Unit = CassandraConnector.evictCache()
14 | 
15 | }
16 | 
17 | private[connector] object TestEvent {
18 | 
19 |   case object Stop
20 | 
21 |   case object Completed
22 | 
23 |   case class WordCount(word: String, count: Int)
24 | 
25 | }
26 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/CqlWhereClause.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.rdd
 2 | 
 3 | /** Represents a logical conjunction of CQL predicates.
 4 |   * Each predicate can have placeholders denoted by '?' which get substituted by values from the `values` array.
 5 |   * The number of placeholders must match the size of the `values` array. */
 6 | case class CqlWhereClause(predicates: Seq[String], values: Seq[Any]) {
 7 | 
 8 |   /** Returns a conjunction of this clause and the given predicate. */
 9 |   def and(other: CqlWhereClause) =
10 |     CqlWhereClause(predicates ++ other.predicates, values ++ other.values)
11 | 
12 | }
13 | 
14 | object CqlWhereClause {
15 | 
16 |   /** Empty CQL WHERE clause selects all rows */
17 |   val empty = new CqlWhereClause(Nil, Nil)
18 | }
19 | 
20 | 
21 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector-java/src/main/java/com/datastax/spark/connector/japi/StreamingContextJavaFunctions.java:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.japi;
 2 | 
 3 | import org.apache.spark.streaming.StreamingContext;
 4 | 
 5 | /**
 6 |  * Java API wrapper over {@link org.apache.spark.streaming.StreamingContext} to provide Spark Cassandra Connector
 7 |  * functionality.
 8 |  *
 9 |  * <p>To obtain an instance of this wrapper, use one of the factory methods in {@link
10 |  * com.datastax.spark.connector.japi.CassandraJavaUtil} class.</p>
11 |  */
12 | @SuppressWarnings("UnusedDeclaration")
13 | public class StreamingContextJavaFunctions extends SparkContextJavaFunctions {
14 |     public final StreamingContext ssc;
15 | 
16 |     StreamingContextJavaFunctions(StreamingContext ssc) {
17 |         super(ssc.sparkContext());
18 |         this.ssc = ssc;
19 |     }
20 | }
21 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector-demos/simple-demos/src/main/scala/com/datastax/spark/connector/demo/DemoApp.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.demo
 2 | 
 3 | import com.datastax.spark.connector.util.Logging
 4 | import org.apache.spark.{SparkContext, SparkConf}
 5 | 
 6 | trait DemoApp extends App with Logging {
 7 | 
 8 |   val words = "./spark-cassandra-connector-demos/simple-demos/src/main/resources/data/words"
 9 | 
10 |   val SparkMasterHost = "127.0.0.1"
11 | 
12 |   val CassandraHost = "127.0.0.1"
13 | 
14 |   // Tell Spark the address of one Cassandra node:
15 |   val conf = new SparkConf(true)
16 |     .set("spark.cassandra.connection.host", CassandraHost)
17 |     .set("spark.cleaner.ttl", "3600")
18 |     .setMaster("local[12]")
19 |     .setAppName(getClass.getSimpleName)
20 | 
21 |   // Connect to the Spark cluster:
22 |   lazy val sc = new SparkContext(conf)
23 | }
24 | 
25 | object DemoApp {
26 |   def apply(): DemoApp = new DemoApp {}
27 | }
28 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/test/java/com/datastax/spark/connector/SampleJavaBeanWithoutNoArgsCtor.java:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector;
 2 | 
 3 | import java.io.Serializable;
 4 | 
 5 | /**
 6 |  * This is a sample JavaBean style class. In order to test JavaAPI correctly, we cannot implement this in Scala because
 7 |  * Scala adds some additional accessors and mutators.
 8 |  */
 9 | public class SampleJavaBeanWithoutNoArgsCtor implements Serializable {
10 |     private Integer key;
11 |     private String value;
12 | 
13 |     private SampleJavaBeanWithoutNoArgsCtor(Integer key, String value) {
14 |         this.key = key;
15 |         this.value = value;
16 |     }
17 | 
18 |     public Integer getKey() {
19 |         return key;
20 |     }
21 | 
22 |     public void setKey(Integer key) {
23 |         this.key = key;
24 |     }
25 | 
26 |     public String getValue() {
27 |         return value;
28 |     }
29 | 
30 |     public void setValue(String value) {
31 |         this.value = value;
32 |     }
33 | }
34 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/samples.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector {
 2 | 
 3 | case class SampleScalaCaseClass(key: Int, value: String)
 4 | 
 5 | class SampleScalaClass(val key: Int, val value: String) extends Serializable
 6 | 
 7 | class SampleScalaClassWithNoFields(key: Int, value: String) extends Serializable
 8 | 
 9 | class SampleScalaClassWithMultipleCtors(var key: Int, var value: String) extends Serializable {
10 |   def this(key: Int) = this(key, null)
11 | 
12 |   def this() = this(0, null)
13 | }
14 | 
15 | class SampleWithNestedScalaCaseClass extends Serializable {
16 | 
17 |   case class InnerClass(key: Int, value: String)
18 | 
19 | }
20 | 
21 | class SampleWithDeeplyNestedScalaCaseClass extends Serializable {
22 | 
23 |   class IntermediateClass extends Serializable {
24 | 
25 |     case class InnerClass(key: Int, value: String)
26 | 
27 |   }
28 | 
29 | }
30 | 
31 | object SampleObject {
32 | 
33 |   case class ClassInObject(key: Int, value: String)
34 | 
35 | }
36 | 
37 | }


--------------------------------------------------------------------------------
/spark-cassandra-connector-demos/simple-demos/src/main/scala/com/datastax/spark/connector/demo/SparkCassandraSettings.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.demo
 2 | 
 3 | import com.typesafe.config.{Config, ConfigFactory}
 4 | 
 5 | /* Initializes Akka, Cassandra and Spark settings. */
 6 | final class SparkCassandraSettings(rootConfig: Config) {
 7 |   def this() = this(ConfigFactory.load)
 8 | 
 9 |   protected val config = rootConfig.getConfig("streaming-demo")
10 | 
11 |   val SparkMaster: String = config.getString("spark.master")
12 | 
13 |   val SparkCleanerTtl: Int = config.getInt("spark.cleaner.ttl")
14 | 
15 |   val SparkStreamingBatchDuration: Long = config.getLong("spark.streaming.batch.duration")
16 | 
17 |   val Data = akka.japi.Util.immutableSeq(config.getStringList("data")).toSet
18 | 
19 |   val CassandraSeed: String = config.getString("spark.cassandra.connection.host")
20 | 
21 |   val CassandraKeyspace = config.getString("spark.cassandra.keyspace")
22 | 
23 |   val CassandraTable = config.getString("spark.cassandra.table")
24 | }


--------------------------------------------------------------------------------
/spark-cassandra-connector-demos/simple-demos/src/main/scala/com/datastax/spark/connector/demo/WordCountDemo.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.demo
 2 | 
 3 | import org.apache.spark.SparkContext._
 4 | import com.datastax.spark.connector.cql.CassandraConnector
 5 | import com.datastax.spark.connector._
 6 | 
 7 | object WordCountDemo extends DemoApp {
 8 | 
 9 |   CassandraConnector(conf).withSessionDo { session =>
10 |     session.execute(s"CREATE KEYSPACE IF NOT EXISTS demo WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 }")
11 |     session.execute(s"CREATE TABLE IF NOT EXISTS demo.wordcount (word TEXT PRIMARY KEY, count COUNTER)")
12 |     session.execute(s"TRUNCATE demo.wordcount")
13 |   }
14 | 
15 |   sc.textFile(words)
16 |     .flatMap(_.split("\\s+"))
17 |     .map(word => (word.toLowerCase, 1))
18 |     .reduceByKey(_ + _)
19 |     .saveToCassandra("demo", "wordcount")
20 | 
21 |   // print out the data saved from Spark to Cassandra
22 |   sc.cassandraTable("demo", "wordcount").collect.foreach(println)
23 |   sc.stop()
24 | }
25 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/test/java/com/datastax/spark/connector/SampleJavaBean.java:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector;
 2 | 
 3 | import java.io.Serializable;
 4 | 
 5 | /**
 6 |  * This is a sample JavaBean style class. In order to test JavaAPI correctly, we cannot implement this in Scala because
 7 |  * Scala adds some additional accessors and mutators.
 8 |  */
 9 | public class SampleJavaBean implements Serializable {
10 |     private Integer key;
11 |     private String value;
12 | 
13 |     public static SampleJavaBean newInstance(Integer key, String value) {
14 |         SampleJavaBean bean = new SampleJavaBean();
15 |         bean.setKey(key);
16 |         bean.setValue(value);
17 |         return bean;
18 |     }
19 | 
20 |     public Integer getKey() {
21 |         return key;
22 |     }
23 | 
24 |     public void setKey(Integer key) {
25 |         this.key = key;
26 |     }
27 | 
28 |     public String getValue() {
29 |         return value;
30 |     }
31 | 
32 |     public void setValue(String value) {
33 |         this.value = value;
34 |     }
35 | }
36 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/RowWriter.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.writer
 2 | 
 3 | import com.datastax.driver.core.{ProtocolVersion, BoundStatement, PreparedStatement}
 4 | 
 5 | /** `RowWriter` knows how to write an object to Cassandra using the Java Cassandra driver.
 6 |   *  */
 7 | trait RowWriter[T] extends Serializable {
 8 |   
 9 |   /** Extracts column values from `data` object and binds them to the given statement.
10 |     * Variables of the prepared statement are named the same as column names to be saved.
11 |     * This method must not rely on any particular order of variables.*/
12 |   def bind(data: T, stmt: PreparedStatement, protocolVersion: ProtocolVersion): BoundStatement
13 |   
14 |   /** Estimates serialized size in bytes of a data object.
15 |     * Used for grouping statements into batches. */
16 |   def estimateSizeInBytes(data: T): Int
17 | 
18 |   /** List of columns this `RowWriter` is going to write.
19 |     * Used to construct appropriate INSERT or UPDATE statement. */
20 |   def columnNames: Seq[String]
21 | 
22 | }
23 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/reader/PrefetchingResultSetIterator.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.rdd.reader
 2 | 
 3 | import com.datastax.driver.core.{Row, ResultSet}
 4 | 
 5 | /** Allows to efficiently iterate over a large, paged ResultSet,
 6 |   * asynchronously prefetching the next page.
 7 |   * 
 8 |   * @param resultSet result set obtained from the Java driver
 9 |   * @param prefetchWindowSize if there are less than this rows available without blocking,
10 |   *                           initiates fetching the next page
11 |   */
12 | class PrefetchingResultSetIterator(resultSet: ResultSet, prefetchWindowSize: Int) extends Iterator[Row] {
13 | 
14 |   private[this] val iterator = resultSet.iterator()
15 | 
16 |   override def hasNext = iterator.hasNext
17 | 
18 |   private[this] def maybePrefetch(): Unit = {
19 |     if (!resultSet.isFullyFetched && resultSet.getAvailableWithoutFetching < prefetchWindowSize)
20 |       resultSet.fetchMoreResults()
21 |   }
22 | 
23 |   override def next() = {
24 |     maybePrefetch()
25 |     iterator.next()
26 |   }
27 | }
28 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/GenericRowWriter.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.writer
 2 | 
 3 | import com.datastax.spark.connector.CassandraRow
 4 | import com.datastax.spark.connector.cql.TableDef
 5 | 
 6 | /** A [[RowWriter]] that can write [[CassandraRow]] objects.*/
 7 | class GenericRowWriter(table: TableDef, selectedColumns: Seq[String])
 8 |   extends AbstractRowWriter[CassandraRow](table: TableDef, selectedColumns: Seq[String]) {
 9 | 
10 |   override protected def getColumnValue(data: CassandraRow, columnName: String): AnyRef = {
11 |     val index = data.indexOf(columnName)
12 |     if (index >= 0) {
13 |       val converter = table.columnByName(columnName).columnType.converterToCassandra
14 |       val value = data.getRaw(index)
15 |       converter.convert(value)
16 |     }
17 |     else
18 |       null
19 |   }
20 | }
21 | 
22 | 
23 | object GenericRowWriter {
24 | 
25 |   object Factory extends RowWriterFactory[CassandraRow] {
26 |     override def rowWriter(table: TableDef, columnNames: Seq[String]) =
27 |       new GenericRowWriter(table, columnNames)
28 |   }
29 | 
30 | }
31 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/mapper/TupleColumnMapper.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.mapper
 2 | 
 3 | import com.datastax.spark.connector.{ColumnRef, ColumnIndex}
 4 | import com.datastax.spark.connector.cql.TableDef
 5 | 
 6 | import scala.reflect.ClassTag
 7 | 
 8 | class TupleColumnMapper[T <: Product : ClassTag] extends ColumnMapper[T] {
 9 | 
10 |   override def classTag: ClassTag[T] = implicitly[ClassTag[T]]
11 | 
12 |   private def indexedColumnRefs(n: Int) =
13 |     (0 until n).map(ColumnIndex)
14 | 
15 |   override def columnMap(tableDef: TableDef): ColumnMap = {
16 | 
17 |     val GetterRegex = "_([0-9]+)".r
18 |     val cls = implicitly[ClassTag[T]].runtimeClass
19 | 
20 |     val constructor =
21 |       indexedColumnRefs(cls.getConstructors()(0).getParameterTypes.length)
22 | 
23 |     val getters = {
24 |       for (name@GetterRegex(id) <- cls.getMethods.map(_.getName))
25 |       yield (name, ColumnIndex(id.toInt - 1))
26 |     }.toMap
27 | 
28 |     val setters =
29 |       Map.empty[String, ColumnRef]
30 | 
31 |     SimpleColumnMap(constructor, getters, setters)
32 |   }
33 | 
34 | }
35 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/SqlRowWriter.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.writer
 2 | 
 3 | import com.datastax.spark.connector.cql.TableDef
 4 | import org.apache.spark.sql.catalyst.expressions.Row
 5 | 
 6 | /** A [[RowWriter]] that can write [[Row]] objects.*/
 7 | class SqlRowWriter(table: TableDef, selectedColumns: Seq[String]) extends AbstractRowWriter[Row](table: TableDef, selectedColumns: Seq[String]) {
 8 | 
 9 |   override protected def getColumnValue(data: Row, columnName: String): AnyRef = {
10 |     val index = columnNames.indexOf(columnName)
11 |     if (index >= 0 && index < data.size) {
12 |       val converter = table.columnByName(columnName).columnType.converterToCassandra
13 |       val value = data.apply(index)
14 |       if (value == null) null else converter.convert(value).asInstanceOf[AnyRef]
15 |     }
16 |     else
17 |       null
18 |   }
19 | }
20 | 
21 | 
22 | object SqlRowWriter {
23 | 
24 |   object Factory extends RowWriterFactory[Row] {
25 |     override def rowWriter(table: TableDef, columnNames: Seq[String]) =
26 |       new SqlRowWriter(table, columnNames)
27 |   }
28 | 
29 | }


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/streaming/CassandraStreamingRDD.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.streaming
 2 | 
 3 | import com.datastax.spark.connector.cql.CassandraConnector
 4 | import com.datastax.spark.connector.{ColumnSelector, AllColumns}
 5 | 
 6 | import scala.reflect.ClassTag
 7 | import org.apache.spark.streaming.StreamingContext
 8 | import com.datastax.spark.connector.rdd.{ReadConf, CassandraRDD, CqlWhereClause}
 9 | import com.datastax.spark.connector.rdd.reader._
10 | 
11 | /** RDD representing a Cassandra table for Spark Streaming.
12 |   * @see [[com.datastax.spark.connector.rdd.CassandraRDD]] */
13 | class CassandraStreamingRDD[R] private[connector] (
14 |     sctx: StreamingContext,
15 |     connector: CassandraConnector,
16 |     keyspace: String,
17 |     table: String,
18 |     columns: ColumnSelector = AllColumns,
19 |     where: CqlWhereClause = CqlWhereClause.empty,
20 |     readConf: ReadConf = ReadConf())(
21 |   implicit
22 |     ct : ClassTag[R],
23 |     @transient rrf: RowReaderFactory[R])
24 |   extends CassandraRDD[R](sctx.sparkContext, connector, keyspace, table, columns, where, readConf)
25 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/test/java/com/datastax/spark/connector/SampleJavaBeanWithMultipleCtors.java:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector;
 2 | 
 3 | import java.io.Serializable;
 4 | 
 5 | /**
 6 |  * This is a sample JavaBean style class. In order to test JavaAPI correctly, we cannot implement this in Scala because
 7 |  * Scala adds some additional accessors and mutators.
 8 |  */
 9 | public class SampleJavaBeanWithMultipleCtors implements Serializable {
10 |     private Integer key;
11 |     private String value;
12 | 
13 |     public SampleJavaBeanWithMultipleCtors(Integer key) {
14 |         this.key = key;
15 |     }
16 | 
17 |     public SampleJavaBeanWithMultipleCtors() {
18 |     }
19 | 
20 |     public SampleJavaBeanWithMultipleCtors(Integer key, String value) {
21 |         this.key = key;
22 |         this.value = value;
23 |     }
24 | 
25 |     public Integer getKey() {
26 |         return key;
27 |     }
28 | 
29 |     public void setKey(Integer key) {
30 |         this.key = key;
31 |     }
32 | 
33 |     public String getValue() {
34 |         return value;
35 |     }
36 | 
37 |     public void setValue(String value) {
38 |         this.value = value;
39 |     }
40 | }
41 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/streaming/TestProducer.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.streaming
 2 | 
 3 | import scala.concurrent.duration._
 4 | import akka.actor.{PoisonPill, Actor, ActorRef}
 5 | 
 6 | class TestProducer(data: Array[String], to: ActorRef) extends Counter {
 7 |   import scala.util.Random
 8 |   import context.dispatcher
 9 | 
10 |   val rand = new Random()
11 | 
12 |   val task = context.system.scheduler.schedule(2.second, 1.millis) {
13 |     if (count < scale) {  // we need this test to avoid generating more than 'scale' messages
14 |       to ! makeMessage()
15 |       increment()
16 |     }
17 |   }
18 | 
19 |   def receive: Actor.Receive = {
20 |     case _ =>
21 |   }
22 | 
23 |   def makeMessage(): String = {
24 |     val x = rand.nextInt(3)
25 |     data(x) + data(2 - x)
26 |   }
27 | }
28 | 
29 | trait CounterFixture {
30 |   val scale = 30
31 | }
32 | 
33 | // CountDownLatch is not Serializable, can't use in stream so we do this.
34 | trait Counter extends Actor with CounterFixture {
35 | 
36 |   var count = 0
37 | 
38 |   def increment(): Unit = {
39 |     count += 1
40 |     if (count == scale) self ! PoisonPill
41 |   }
42 | }


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/types/CanBuildFromTest.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.types
 2 | 
 3 | import org.apache.commons.lang3.SerializationUtils
 4 | import org.junit.Assert._
 5 | import org.junit.Test
 6 | 
 7 | class CanBuildFromTest {
 8 | 
 9 |   @Test
10 |   def testBuild() {
11 |     val bf = CanBuildFrom.setCanBuildFrom[Int]
12 |     val builder = bf.apply()
13 |     builder += 1
14 |     builder += 2
15 |     builder += 3
16 |     assertEquals(Set(1,2,3), builder.result())
17 |   }
18 | 
19 |   @Test
20 |   def testSerializeAndBuild() {
21 |     val bf = CanBuildFrom.setCanBuildFrom[Int]
22 |     val bf2 = SerializationUtils.roundtrip(bf)
23 |     val builder = bf2.apply()
24 |     builder += 1
25 |     builder += 2
26 |     builder += 3
27 |     assertEquals(Set(1,2,3), builder.result())
28 |   }
29 | 
30 |   @Test
31 |   def testSerializeAndBuildWithOrdering() {
32 |     val bf = CanBuildFrom.treeSetCanBuildFrom[Int]
33 |     val bf2 = SerializationUtils.roundtrip(bf)
34 |     val builder = bf2.apply()
35 |     builder += 1
36 |     builder += 2
37 |     builder += 3
38 |     assertEquals(Set(1,2,3), builder.result())
39 |   }
40 | 
41 | 
42 | }
43 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector-demos/twitter-streaming/src/main/resources/application.conf:
--------------------------------------------------------------------------------
 1 | ####################################
 2 | # Spark Cassandra Connector (Twitter Demo App) Config File #
 3 | ####################################
 4 | 
 5 | # This is the reference config file that contains all the default settings.
 6 | streaming-app {
 7 | 
 8 |   # These can be search terms to filter for, or hashtags
 9 |   # ["android", "iphone"]
10 |   filters = ["#android", "#iphone"]
11 | 
12 |   spark {
13 |     # The fallback Spark master, it auto-detection fails.
14 |     # Can change to spark://127.0.0.1:7077 for example.
15 |     master = "local[*]"
16 | 
17 |     # In seconds: Not using hcon 5s format until Spark
18 |     # Upgrades their akka and thus config versions (to avoid a deprecation issue).
19 |     streaming.batch.interval = 5
20 | 
21 |     # The default
22 |     executor.memory = 2g
23 |     cores.max = 2
24 | 
25 |     jars = [
26 |       "./spark-cassandra-connector-demos/twitter-streaming/target/scala-2.10/twitter-streaming-assembly-1.1.0-SNAPSHOT.jar"
27 |     ]
28 | 
29 |     cassandra {
30 |       connection.host = ["127.0.0.1"]
31 |       keyspace = "twitter_stream"
32 |       table = "hashtags_by_interval"
33 |     }
34 |   }
35 | }


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/it/scala/com/datastax/spark/connector/cql/CassandraAuthenticatedConnectorSpec.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.cql
 2 | 
 3 | import com.datastax.spark.connector.testkit.SharedEmbeddedCassandra
 4 | import org.scalatest.{Matchers, FlatSpec}
 5 | 
 6 | class CassandraAuthenticatedConnectorSpec  extends FlatSpec with Matchers with SharedEmbeddedCassandra {
 7 | 
 8 |   useCassandraConfig("cassandra-password-auth.yaml" +
 9 |     ".template")
10 |   val conn = CassandraConnector(Set(cassandraHost), authConf = PasswordAuthConf("cassandra", "cassandra"))
11 | 
12 |   // Wait for the default user to be created in Cassandra.
13 |   Thread.sleep(1000)
14 | 
15 |   "A CassandraConnector" should "authenticate with username and password when using native protocol" in {
16 |     conn.withSessionDo { session =>
17 |       assert(session !== null)
18 |       assert(session.isClosed === false)
19 |       assert(session.getCluster.getMetadata.getClusterName === "Test Cluster")
20 |     }
21 |   }
22 | 
23 |   it should "authenticate with username and password when using thrift" in {
24 |     conn.withCassandraClientDo { client =>
25 |       assert(client.describe_cluster_name() === "Test Cluster")
26 |     }
27 |   }
28 | }
29 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/partitioner/dht/TokenRange.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.rdd.partitioner.dht
 2 | 
 3 | import java.net.InetAddress
 4 | 
 5 | 
 6 | case class CassandraNode(rpcAddress: InetAddress, localAddress: InetAddress) {
 7 |   require(rpcAddress != InetAddress.getByName("0.0.0.0"), "rpcAddress must not be 0.0.0.0")
 8 |   require(localAddress != InetAddress.getByName("0.0.0.0"), "localAddress must not be 0.0.0.0")
 9 |   def allAddresses = Set(rpcAddress, localAddress)
10 | }
11 | 
12 | object CassandraNode {
13 |   implicit def ordering: Ordering[CassandraNode] = Ordering.by(_.rpcAddress.toString)
14 | }
15 | 
16 | case class TokenRange[V, T <: Token[V]] (
17 |     start: T, end: T, endpoints: Set[CassandraNode], rowCount: Option[Long]) {
18 | 
19 |   def isWrapAround: Boolean =
20 |     start >= end
21 | 
22 |   def unwrap(implicit tokenFactory: TokenFactory[V, T]): Seq[TokenRange[V, T]] = {
23 |     val minToken = tokenFactory.minToken
24 |     if (isWrapAround)
25 |       Seq(
26 |         TokenRange(start, minToken, endpoints, rowCount.map(_ / 2)),
27 |         TokenRange(minToken, end, endpoints, rowCount.map(_ / 2)))
28 |     else
29 |       Seq(this)
30 |   }
31 | }


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/test/java/com/datastax/spark/connector/SampleWithNestedJavaBean.java:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector;
 2 | 
 3 | import java.io.Serializable;
 4 | 
 5 | /**
 6 |  * This is a sample JavaBean style class. In order to test JavaAPI correctly, we cannot implement this in Scala because
 7 |  * Scala adds some additional accessors and mutators.
 8 |  */
 9 | public class SampleWithNestedJavaBean implements Serializable {
10 |     public class InnerClass implements Serializable {
11 |         private Integer key;
12 |         private String value;
13 | 
14 |         public InnerClass(Integer key) {
15 |             this.key = key;
16 |         }
17 | 
18 |         public InnerClass() {
19 |         }
20 | 
21 |         public InnerClass(Integer key, String value) {
22 |             this.key = key;
23 |             this.value = value;
24 |         }
25 | 
26 |         public Integer getKey() {
27 |             return key;
28 |         }
29 | 
30 |         public void setKey(Integer key) {
31 |             this.key = key;
32 |         }
33 | 
34 |         public String getValue() {
35 |             return value;
36 |         }
37 | 
38 |         public void setValue(String value) {
39 |             this.value = value;
40 |         }
41 |     }
42 | }
43 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/writer/PropertyExtractorTest.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.writer
 2 | 
 3 | import org.junit.Assert._
 4 | import org.junit.Test
 5 | 
 6 | class PropertyExtractorTest {
 7 | 
 8 |   class TestClass(val field1: String, val field2: Int)
 9 | 
10 |   @Test
11 |   def testSimpleExtraction() {
12 |     val testObject = new TestClass("a", 1)
13 |     val propertyExtractor = new PropertyExtractor(classOf[TestClass], Seq("field1", "field2"))
14 |     val result = propertyExtractor.extract(testObject)
15 |     assertEquals(2, result.size)
16 |     assertEquals("a", result(0))
17 |     assertEquals(1, result(1))
18 |   }
19 | 
20 |   @Test
21 |   def testAvailableProperties() {
22 |     val triedProperties = Seq("field1", "foo", "bar")
23 |     val availableProperties = PropertyExtractor.availablePropertyNames(classOf[TestClass], triedProperties)
24 |     assertEquals(Seq("field1"), availableProperties)
25 |   }
26 | 
27 |   @Test(expected = classOf[NoSuchMethodException])
28 |   def testWrongPropertyName() {
29 |     val testObject = new TestClass("a", 1)
30 |     val propertyExtractor = new PropertyExtractor(classOf[TestClass], Seq("foo"))
31 |     propertyExtractor.extract(testObject)
32 |   }
33 | 
34 | }
35 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/writer/ObjectSizeEstimatorTest.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.writer
 2 | 
 3 | import java.nio.ByteBuffer
 4 | import java.util.Date
 5 | 
 6 | import org.junit.Assert._
 7 | import org.junit.Test
 8 | 
 9 | class ObjectSizeEstimatorTest {
10 | 
11 |   @Test
12 |   def testFunctionality() {
13 |     val size0 = ObjectSizeEstimator.measureSerializedSize(Array(1))
14 |     val size1 = ObjectSizeEstimator.measureSerializedSize(Array(1, 2))
15 |     val size2 = ObjectSizeEstimator.measureSerializedSize(Array(1, 2, "abc", List("item1", "item2"), new Date()))
16 |     assertTrue(size0 > 16)
17 |     assertTrue(size1 > size0)
18 |     assertTrue(size2 > size1)
19 |   }
20 | 
21 |   @Test
22 |   def testByteBuffers() {
23 |     val buffer = ByteBuffer.allocate(100)
24 |     val size0 = ObjectSizeEstimator.measureSerializedSize(Array(buffer))
25 |     val size1 = ObjectSizeEstimator.measureSerializedSize(Array(List(buffer)))
26 |     val size2 = ObjectSizeEstimator.measureSerializedSize(Array(Set(buffer)))
27 |     val size3 = ObjectSizeEstimator.measureSerializedSize(Array(Map(1 -> buffer)))
28 |     assertTrue(size0 > 100)
29 |     assertTrue(size1 > 100)
30 |     assertTrue(size2 > 100)
31 |     assertTrue(size3 > 100)
32 |   }
33 | }
34 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/RDDFunctions.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector
 2 | 
 3 | import com.datastax.spark.connector.cql.CassandraConnector
 4 | import com.datastax.spark.connector.writer._
 5 | import org.apache.spark.SparkContext
 6 | import org.apache.spark.rdd.RDD
 7 | 
 8 | /** Provides Cassandra-specific methods on `RDD` */
 9 | class RDDFunctions[T](rdd: RDD[T]) extends WritableToCassandra[T] with Serializable {
10 | 
11 |   override val sparkContext: SparkContext = rdd.sparkContext
12 | 
13 |   /**
14 |    * Saves the data from `RDD` to a Cassandra table. Uses the specified column names.
15 |    * @see [[com.datastax.spark.connector.writer.WritableToCassandra]]
16 |    */
17 |   def saveToCassandra(keyspaceName: String,
18 |                       tableName: String,
19 |                       columns: ColumnSelector = AllColumns,
20 |                       writeConf: WriteConf = WriteConf.fromSparkConf(sparkContext.getConf))
21 |                      (implicit connector: CassandraConnector = CassandraConnector(sparkContext.getConf),
22 |                       rwf: RowWriterFactory[T]): Unit = {
23 |     val writer = TableWriter(connector, keyspaceName, tableName, columns, writeConf)
24 |     rdd.sparkContext.runJob(rdd, writer.write _)
25 |   }
26 | }
27 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/types/TypeSerializationTest.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.types
 2 | 
 3 | import org.apache.commons.lang3.SerializationUtils
 4 | import org.junit.Assert._
 5 | import org.junit.Test
 6 | 
 7 | class TypeSerializationTest {
 8 | 
 9 |   private def testSerialization(t: ColumnType[_]) {
10 |     assertEquals(t, SerializationUtils.roundtrip(t))
11 |   }
12 | 
13 |   @Test
14 |   def testSerializationOfPrimitiveTypes() {
15 |     testSerialization(AsciiType)
16 |     testSerialization(TextType)
17 |     testSerialization(IntType)
18 |     testSerialization(BigIntType)
19 |     testSerialization(DoubleType)
20 |     testSerialization(FloatType)
21 |     testSerialization(BooleanType)
22 |     testSerialization(UUIDType)
23 |     testSerialization(TimeUUIDType)
24 |     testSerialization(TimestampType)
25 |     testSerialization(DecimalType)
26 |     testSerialization(BigIntType)
27 |     testSerialization(InetType)
28 |     testSerialization(CounterType)
29 |   }
30 | 
31 |   @Test
32 |   def testSerializationOfCollectionTypes() {
33 |     testSerialization(ListType(IntType))
34 |     testSerialization(ListType(ListType(IntType)))
35 |     testSerialization(SetType(TextType))
36 |     testSerialization(MapType(BigIntType, TimestampType))
37 |   }
38 | 
39 | 
40 | }
41 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/PropertyExtractor.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.writer
 2 | 
 3 | import java.lang.reflect.Method
 4 | 
 5 | import scala.util.Try
 6 | 
 7 | /** Extracts values from fields of an object. */
 8 | class PropertyExtractor[T](val cls: Class[T], val propertyNames: Seq[String]) extends Serializable {
 9 | 
10 |   private def getter(name: String) =
11 |     cls.getMethod(name)
12 | 
13 |   @transient
14 |   private lazy val methods: Array[Method] =
15 |     propertyNames.map(getter).toArray
16 | 
17 |   @transient
18 |   private lazy val methodByName =
19 |     methods.map(m => (m.getName, m)).toMap
20 | 
21 |   def extract(obj: T): Array[AnyRef] =
22 |     extract(obj, Array.ofDim(methods.length))
23 | 
24 |   def extract(obj: T, target: Array[AnyRef]): Array[AnyRef] = {
25 |     for (i <- 0 until methods.length)
26 |       target(i) = methods(i).invoke(obj)
27 |     target
28 |   }
29 | 
30 |   def extractProperty(obj: T, propertyName: String): AnyRef = {
31 |     val m = methodByName(propertyName)
32 |     m.invoke(obj)
33 |   }
34 | }
35 | 
36 | object PropertyExtractor {
37 | 
38 |   def availablePropertyNames(cls: Class[_], requestedPropertyNames: Seq[String]): Seq[String] =
39 |     requestedPropertyNames.filter(name => Try(cls.getMethod(name)).isSuccess)
40 | 
41 | }
42 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/partitioner/CassandraRDDPartition.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.rdd.partitioner
 2 | 
 3 | import java.net.InetAddress
 4 | 
 5 | import org.apache.spark.Partition
 6 | 
 7 | /** Stores a CQL `WHERE` predicate matching a range of tokens. */
 8 | case class CqlTokenRange(cql: String, values: Any*)
 9 | 
10 | /** Metadata describing Cassandra table partition processed by a single Spark task.
11 |   * Beware the term "partition" is overloaded. Here, in the context of Spark,
12 |   * it means an arbitrary collection of rows that can be processed locally on a single Cassandra cluster node.
13 |   * A `CassandraPartition` typically contains multiple CQL partitions, i.e. rows identified by different values of
14 |   * the CQL partitioning key.
15 |   *
16 |   * @param index identifier of the partition, used internally by Spark
17 |   * @param endpoints which nodes the data partition is located on
18 |   * @param tokenRanges token ranges determining the row set to be fetched
19 |   * @param rowCount estimated total row count in a partition
20 |   */
21 | case class CassandraPartition(index: Int,
22 |                               endpoints: Iterable[InetAddress],
23 |                               tokenRanges: Iterable[CqlTokenRange],
24 |                               rowCount: Long) extends Partition
25 | 
26 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/ConvertingPropertyExtractor.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.writer
 2 | 
 3 | import com.datastax.spark.connector.types.TypeConverter
 4 | 
 5 | /** Extracts property values from an object and additionally converts them to desired types */
 6 | class ConvertingPropertyExtractor[T](val cls: Class[T], properties: Seq[(String, TypeConverter[_])])
 7 |   extends Serializable {
 8 | 
 9 |   val (propertyNames, propertyTypes) = properties.toArray.unzip
10 |   val propertyTypeByName = properties.toMap
11 | 
12 |   private val simpleExtractor =
13 |     new PropertyExtractor[T](cls, propertyNames)
14 | 
15 |   def extract(obj: T): Array[AnyRef] =
16 |     convert(simpleExtractor.extract(obj))
17 | 
18 | 
19 |   def extract(obj: T, target: Array[AnyRef]): Array[AnyRef] =
20 |     convert(simpleExtractor.extract(obj, target))
21 | 
22 |   def extractProperty(obj: T, propertyName: String): AnyRef = {
23 |     val propertyValue = simpleExtractor.extractProperty(obj, propertyName)
24 |     val converter = propertyTypeByName(propertyName)
25 |     converter.convert(propertyValue).asInstanceOf[AnyRef]
26 |   }
27 | 
28 |   def convert(data: Array[AnyRef]): Array[AnyRef] = {
29 |     for (i <- 0 until data.length)
30 |       data(i) = propertyTypes(i).convert(data(i)).asInstanceOf[AnyRef]
31 |     data
32 |   }
33 | }
34 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/AbstractRowWriter.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.writer
 2 | 
 3 | import com.datastax.driver.core.{ProtocolVersion, PreparedStatement}
 4 | import com.datastax.spark.connector.cql.TableDef
 5 | import org.apache.spark.sql.catalyst.expressions.Row
 6 | 
 7 | /** A [[RowWriter]] that can write SparkSQL [[Row]] objects or [[com.datastax.spark.connector.CassandraRow]] objects .*/
 8 | abstract class AbstractRowWriter[T](table: TableDef, selectedColumns: Seq[String]) extends RowWriter[T] {
 9 | 
10 |   override def columnNames =
11 |     selectedColumns.toIndexedSeq
12 | 
13 |   protected def getColumnValue(data: T, columnName: String): AnyRef
14 | 
15 |   @transient
16 |   protected lazy val buffer = new ThreadLocal[Array[AnyRef]] {
17 |     override def initialValue() = Array.ofDim[AnyRef](columnNames.size)
18 |   }
19 | 
20 |   protected def fillBuffer(data: T): Array[AnyRef] = {
21 |     val buf = buffer.get
22 |     for (i <- 0 until columnNames.size)
23 |       buf(i) = getColumnValue(data, columnNames(i))
24 |     buf
25 |   }
26 | 
27 |   override def bind(data: T, stmt: PreparedStatement, protocolVersion: ProtocolVersion) = {
28 |     stmt.bind(fillBuffer(data): _*)
29 |   }
30 | 
31 |   override def estimateSizeInBytes(data: T) = {
32 |     ObjectSizeEstimator.measureSerializedSize(fillBuffer(data))
33 |   }
34 | }
35 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/writer/ConvertingPropertyExtractorTest.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.writer
 2 | 
 3 | import com.datastax.spark.connector.types.TypeConverter.{StringConverter, OptionToNullConverter, IntConverter}
 4 | 
 5 | import org.junit.Assert._
 6 | import org.junit.Test
 7 | 
 8 | class ConvertingPropertyExtractorTest {
 9 | 
10 |   class TestClass(val field1: String, val field2: Option[Int])
11 | 
12 |   private def createExtractor: ConvertingPropertyExtractor[TestClass] = {
13 |     new ConvertingPropertyExtractor[TestClass](
14 |       classOf[TestClass], Seq(
15 |         ("field1", IntConverter),
16 |         ("field2", new OptionToNullConverter(StringConverter))))
17 |   }
18 | 
19 |   @Test
20 |   def testExtraction() {
21 |     val obj = new TestClass("123", Some(5))
22 |     val extractor = createExtractor
23 |     val data = extractor.extract(obj)
24 |     assertNotNull(data)
25 |     assertEquals(2, data.length)
26 |     assertEquals(123, data(0))
27 |     assertEquals("5", data(1))
28 |   }
29 | 
30 |   @Test
31 |   def testExtractionNoAlloc() {
32 |     val obj = new TestClass("123", Some(5))
33 |     val extractor = createExtractor
34 |     val data = Array.ofDim[AnyRef](extractor.propertyNames.size)
35 |     extractor.extract(obj, data)
36 |     assertEquals(123, data(0))
37 |     assertEquals("5", data(1))
38 | 
39 |   }
40 | }
41 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/cql/MultipleRetryPolicy.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.cql
 2 | 
 3 | import com.datastax.driver.core.policies.RetryPolicy
 4 | import com.datastax.driver.core.policies.RetryPolicy.RetryDecision
 5 | import com.datastax.driver.core.{ConsistencyLevel, Statement, WriteType}
 6 | 
 7 | /** Always retries with the same CL, constant number of times, regardless of circumstances */
 8 | class MultipleRetryPolicy(maxRetryCount: Int) extends RetryPolicy {
 9 | 
10 |   private def retryOrThrow(cl: ConsistencyLevel, nbRetry: Int): RetryDecision = {
11 |     if (nbRetry < maxRetryCount)
12 |       RetryDecision.retry(cl)
13 |     else
14 |       RetryDecision.rethrow()
15 |   }
16 | 
17 |   override def onReadTimeout(stmt: Statement, cl: ConsistencyLevel,
18 |                              requiredResponses: Int, receivedResponses: Int,
19 |                              dataRetrieved: Boolean, nbRetry: Int) = retryOrThrow(cl, nbRetry)
20 | 
21 |   override def onUnavailable(stmt: Statement, cl: ConsistencyLevel,
22 |                              requiredReplica: Int, aliveReplica: Int, nbRetry: Int) = retryOrThrow(cl, nbRetry)
23 | 
24 |   override def onWriteTimeout(stmt: Statement, cl: ConsistencyLevel, writeType: WriteType,
25 |                               requiredAcks: Int, receivedAcks: Int, nbRetry: Int) = retryOrThrow(cl, nbRetry)
26 | 
27 | }
28 | 


--------------------------------------------------------------------------------
/scripts/submit-demos:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # This very basic script that submits the demos jar to a local spark master.
 3 | # TODO add input validation and error handling.
 4 | 
 5 | # ## 1. Build the demos assembly jar:
 6 | # sbt -Dspark.cassandra.connector.demos.assembly=true assembly
 7 | 
 8 | # ## 2. Run this script ##
 9 | # Pass in 3 parameters:
10 | # 1. Path to /bin/spark-submit
11 | # 2. Spark master
12 | # 3. The FQCN of the demo class to run, e.g: com.datastax.spark.connector.demo.BasicReadWriteDemo
13 | # For further customization options see https://spark.apache.org/docs/latest/submitting-applications.html
14 | # Example:
15 | # sudo ./scripts/submit-demos /path/to/spark/bin spark://master:7077 com.datastax.spark.connector.demo.BasicReadWriteDemo
16 | # ##
17 | 
18 | 
19 | PATH_TO_SPARK_BIN_SCRIPTS=$1
20 | SPARK_MASTER=$2
21 | APP_TO_RUN=$3
22 | 
23 | # TODO read from Settings.scala scalaVersion and version in ThisBuild:
24 | VERSION="1.0.0-SNAPSHOT"
25 | SCALA_VERSION="scala-2.10"
26 | DEMOS_ASSEMBLY_JAR="spark-cassandra-connector-demos-assembly-$VERSION.jar"
27 | PATH_TO_JAR="spark-cassandra-connector-demos/target/$SCALA_VERSION/$DEMOS_ASSEMBLY_JAR"
28 | SPARK_SUBMIT="$PATH_TO_SPARK_BIN_SCRIPTS/spark-submit"
29 | 
30 | # Run on a Spark standalone cluster
31 | echo "Attempting to submit demo $SPARK_SUBMIT on $SPARK_MASTER with $PATH_TO_JAR"
32 | $SPARK_SUBMIT --class $APP_TO_RUN --master $SPARK_MASTER $PATH_TO_JAR 100
33 | 
34 | 
35 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/ReadConf.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.rdd
 2 | 
 3 | import org.apache.spark.SparkConf
 4 | 
 5 | import com.datastax.driver.core.ConsistencyLevel
 6 | 
 7 | /** Read settings for RDD
 8 |   *
 9 |   * @param splitSize number of Cassandra partitions to be read in a single Spark task
10 |   * @param fetchSize number of CQL rows to fetch in a single round-trip to Cassandra
11 |   * @param consistencyLevel consistency level for reads, default LOCAL_ONE;
12 |   *                         higher consistency level will disable data-locality */
13 | case class ReadConf(
14 |   splitSize: Int = ReadConf.DefaultSplitSize,
15 |   fetchSize: Int = ReadConf.DefaultFetchSize,
16 |   consistencyLevel: ConsistencyLevel = ReadConf.DefaultConsistencyLevel)
17 | 
18 | 
19 | object ReadConf {
20 |   val DefaultSplitSize = 100000
21 |   val DefaultFetchSize = 1000
22 |   val DefaultConsistencyLevel = ConsistencyLevel.LOCAL_ONE
23 | 
24 |   def fromSparkConf(conf: SparkConf): ReadConf = {
25 |     ReadConf(
26 |       fetchSize = conf.getInt("spark.cassandra.input.page.row.size", DefaultFetchSize),
27 |       splitSize = conf.getInt("spark.cassandra.input.split.size", DefaultSplitSize),
28 |       consistencyLevel = ConsistencyLevel.valueOf(
29 |         conf.get("spark.cassandra.input.consistency.level", DefaultConsistencyLevel.name()))
30 |     )
31 |   }
32 | 
33 | }
34 | 
35 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/test/java/com/datastax/spark/connector/SampleWithDeeplyNestedJavaBean.java:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector;
 2 | 
 3 | import java.io.Serializable;
 4 | 
 5 | /**
 6 |  * This is a sample JavaBean style class. In order to test JavaAPI correctly, we cannot implement this in Scala because
 7 |  * Scala adds some additional accessors and mutators.
 8 |  */
 9 | public class SampleWithDeeplyNestedJavaBean implements Serializable {
10 |     public class IntermediateClass implements Serializable {
11 |         public class InnerClass implements Serializable {
12 |             private Integer key;
13 |             private String value;
14 | 
15 |             public InnerClass(Integer key) {
16 |                 this.key = key;
17 |             }
18 | 
19 |             public InnerClass() {
20 |             }
21 | 
22 |             public InnerClass(Integer key, String value) {
23 |                 this.key = key;
24 |                 this.value = value;
25 |             }
26 | 
27 |             public Integer getKey() {
28 |                 return key;
29 |             }
30 | 
31 |             public void setKey(Integer key) {
32 |                 this.key = key;
33 |             }
34 | 
35 |             public String getValue() {
36 |                 return value;
37 |             }
38 | 
39 |             public void setValue(String value) {
40 |                 this.value = value;
41 |             }
42 |         }
43 |     }
44 | }
45 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/streaming/DStreamFunctions.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.streaming
 2 | 
 3 | import com.datastax.spark.connector._
 4 | import com.datastax.spark.connector.cql.CassandraConnector
 5 | import com.datastax.spark.connector.writer.{TableWriter, WriteConf, RowWriterFactory, WritableToCassandra}
 6 | import org.apache.spark.SparkContext
 7 | import org.apache.spark.streaming.dstream.DStream
 8 | 
 9 | class DStreamFunctions[T](dstream: DStream[T]) extends WritableToCassandra[T] with Serializable {
10 | 
11 |   override def sparkContext: SparkContext = dstream.context.sparkContext
12 | 
13 |   def conf = sparkContext.getConf
14 | 
15 |   /**
16 |    * Performs [[com.datastax.spark.connector.writer.WritableToCassandra]] for each produced RDD.
17 |    * Uses specific column names with an additional batch size.
18 |    */
19 |   def saveToCassandra(keyspaceName: String,
20 |                       tableName: String,
21 |                       columnNames: ColumnSelector = AllColumns,
22 |                       writeConf: WriteConf = WriteConf.fromSparkConf(conf))
23 |                      (implicit connector: CassandraConnector = CassandraConnector(conf),
24 |                       rwf: RowWriterFactory[T]): Unit = {
25 |     val writer = TableWriter(connector, keyspaceName, tableName, columnNames, writeConf)
26 |     dstream.foreachRDD(rdd => rdd.sparkContext.runJob(rdd, writer.write _))
27 |   }
28 | }
29 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/partitioner/Murmur3PartitionerTokenRangeSplitter.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.rdd.partitioner
 2 | 
 3 | import com.datastax.spark.connector.rdd.partitioner.dht.{LongToken, TokenFactory, TokenRange}
 4 | 
 5 | import scala.math.BigDecimal.RoundingMode
 6 | 
 7 | /** Fast token range splitter assuming that data are spread out evenly in the whole range. */
 8 | class Murmur3PartitionerTokenRangeSplitter(cassandraPartitionsPerToken: Double) extends TokenRangeSplitter[Long, LongToken] {
 9 | 
10 |   private val tokenFactory =
11 |     TokenFactory.Murmur3TokenFactory
12 | 
13 |   def split(range: TokenRange[Long, LongToken], splitSize: Long) = {
14 |     val left = range.start.value
15 |     val right = range.end.value
16 |     val rangeSize =
17 |       if (right > left) BigDecimal(right) - BigDecimal(left)
18 |       else BigDecimal(right) - BigDecimal(left) + BigDecimal(tokenFactory.totalTokenCount)
19 |     val estimatedRows = rangeSize * cassandraPartitionsPerToken
20 |     val n = math.max(1, (estimatedRows / splitSize).setScale(0, RoundingMode.HALF_UP).toInt)
21 |     val splitPoints =
22 |       (for (i <- 0 until n) yield left + (rangeSize * i.toDouble / n).toLong) :+ right
23 |     for (Seq(l, r) <- splitPoints.sliding(2).toSeq) yield
24 |       new TokenRange[Long, LongToken](
25 |         new LongToken(l),
26 |         new LongToken(r),
27 |         range.endpoints,
28 |         Some((estimatedRows / n).toInt))
29 |   }
30 | }
31 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector-demos/simple-demos/src/main/scala/com/datastax/spark/connector/demo/TableCopyDemo.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.demo
 2 | 
 3 | import com.datastax.spark.connector.cql.CassandraConnector
 4 | 
 5 | object TableCopyDemo extends DemoApp {
 6 | 
 7 |   CassandraConnector(conf).withSessionDo { session =>
 8 |     session.execute("CREATE KEYSPACE IF NOT EXISTS test WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 }")
 9 |     session.execute("CREATE TABLE IF NOT EXISTS test.source (key INT PRIMARY KEY, data VARCHAR)")
10 |     session.execute("CREATE TABLE IF NOT EXISTS test.destination (key INT PRIMARY KEY, data VARCHAR)")
11 |     session.execute("TRUNCATE test.source")
12 |     session.execute("TRUNCATE test.destination")
13 |     session.execute("INSERT INTO test.source(key, data) VALUES (1, 'first row')")
14 |     session.execute("INSERT INTO test.source(key, data) VALUES (2, 'second row')")
15 |     session.execute("INSERT INTO test.source(key, data) VALUES (3, 'third row')")
16 |   }
17 | 
18 |   import com.datastax.spark.connector._
19 | 
20 |   val src = sc.cassandraTable("test", "source")
21 |   src.saveToCassandra("test", "destination")
22 | 
23 |   val dest = sc.cassandraTable("test", "destination")
24 |   dest.collect().foreach(row => log.info(s"$row"))
25 | 
26 |   // Assert the rows were copied from test.source to test.destination table:
27 |   assert(dest.collect().length == 3)
28 | 
29 |   log.info(s"Work completed, stopping the Spark context.")
30 |   sc.stop()
31 | }
32 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/org/apache/spark/sql/cassandra/InsertIntoCassandraTable.scala:
--------------------------------------------------------------------------------
 1 | package org.apache.spark.sql.cassandra
 2 | 
 3 | import com.datastax.spark.connector._
 4 | import com.datastax.spark.connector.cql.CassandraConnector
 5 | import com.datastax.spark.connector.writer.SqlRowWriter
 6 | import org.apache.spark.annotation.DeveloperApi
 7 | import org.apache.spark.rdd.RDD
 8 | import org.apache.spark.sql.catalyst.expressions.{Attribute, Row}
 9 | import org.apache.spark.sql.execution.{SparkPlan, UnaryNode}
10 | 
11 | @DeveloperApi
12 | case class InsertIntoCassandraTable(cassandraRelation: CassandraRelation,
13 |                                childPlan: SparkPlan,
14 |                                overwrite: Boolean)
15 |                               (@transient cc: CassandraSQLContext) extends UnaryNode {
16 |   self: Product =>
17 | 
18 |   override def output: Seq[Attribute] = childPlan.output
19 | 
20 |   override def execute(): RDD[Row] = result
21 | 
22 |   override def child: SparkPlan = childPlan
23 | 
24 |   override def otherCopyArgs = cc :: Nil
25 | 
26 |   /**
27 |    * Insert RDD[[Row]] to Cassandra
28 |    */
29 |   private lazy val result: RDD[Row] = {
30 |     val childRdd = child.execute()
31 | 
32 |     //TODO: cluster level CassandraConnector, write configuration settings
33 |     childRdd.saveToCassandra(cassandraRelation.keyspaceName, cassandraRelation.tableName)(CassandraConnector(sparkContext.getConf), SqlRowWriter.Factory)
34 | 
35 |     cc.sparkContext.makeRDD(Nil, 1)
36 |   }
37 | }
38 | 


--------------------------------------------------------------------------------
/doc/10_embedded.md:
--------------------------------------------------------------------------------
 1 | # Documentation
 2 | ## The `spark-cassandra-connector-embedded` Artifact
 3 | The `spark-cassandra-connector-embedded` artifact can be used as a test or prototype dependency to spin up embedded servers for testing ideas, quickly learning, integration, etc.
 4 | Pulling this dependency in allows you to do 
 5 | 
 6 | - Integration Tests (IT) tests with an embedded Cassandra instance 
 7 |   - if your sbt project is configured to [run IT configs](https://github.com/datastax/spark-cassandra-connector/blob/master/project/Settings.scala#L78-L94)
 8 | - Easily write and run a Spark Streaming app using 
 9 |   - Apache Kafka streams (including an embedded Zookeeper), all with no Ops work involved
10 |   - Twitter streams (needs the 4 auth credentials required by twitter)
11 |   - And of course Cassandra but you currently need to sping up a local instance: [Download Cassandra latest](http://cassandra.apache.org/download/), open the tar, and run `sudo ./apache-cassandra-2.1.0/bin/cassandra`
12 | 
13 | ## The Code
14 | See: [https://github.com/datastax/spark-cassandra-connector/tree/master/spark-cassandra-connector-embedded/src/main/scala/com/datastax/spark/connector/embedded](https://github.com/datastax/spark-cassandra-connector/tree/master/spark-cassandra-connector-embedded/src/main/scala/com/datastax/spark/connector/embedded)
15 | 
16 | ## How To Add The Dependency
17 | Simply add this to your SBT build, or in the appropriate format for a Maven build
18 | 
19 |     "com.datastax.spark"  %% "spark-cassandra-connector-embedded" % {latest.verson}


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/reader/RowReader.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.rdd.reader
 2 | 
 3 | import com.datastax.driver.core.{ProtocolVersion, Row}
 4 | 
 5 | /** Transforms a Cassandra Java driver `Row` into high-level row representation, e.g. a tuple
 6 |   * or a user-defined case class object. The target type `T` must be serializable. */
 7 | trait RowReader[T] extends Serializable {
 8 | 
 9 |   /** Reads column values from low-level `Row` and turns them into higher level representation.
10 |     * @param row row fetched from Cassandra
11 |     * @param columnNames column names available in the `row`
12 |     * @param protocolVersion java driver protocol version to be used for deserialization */
13 |   def read(row: Row, columnNames: Array[String], protocolVersion: ProtocolVersion): T
14 | 
15 |   /** List of columns this `RowReader` is going to read.
16 |     * Useful to avoid fetching the columns that are not needed. */
17 |   def columnNames: Option[Seq[String]]
18 | 
19 |   /** The number of columns that need to be fetched from C*. */
20 |   def requiredColumns: Option[Int]
21 | 
22 |   /** This method should be implemented by those row readers which reads fields in the consecutive
23 |     * positions from a CassandraRow. When a row reader implements it so that it returns a non-empty,
24 |     * it denotes the number of columns this reader moves the column cursor forward for compound row
25 |     * readers (such as [[KeyValueRowReader]]). */
26 |   def consumedColumns: Option[Int] = None
27 | 
28 | }
29 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/RowWriterFactory.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.writer
 2 | 
 3 | import com.datastax.spark.connector.cql.TableDef
 4 | import com.datastax.spark.connector.mapper.ColumnMapper
 5 | 
 6 | import scala.reflect.ClassTag
 7 | 
 8 | /** Creates instances of [[RowWriter]] objects for the given row type `T`.
 9 |   * `RowWriterFactory` is the trait you need to implement if you want to support row representations
10 |   * which cannot be simply mapped by a [[com.datastax.spark.connector.mapper.ColumnMapper ColumnMapper]].*/
11 | trait RowWriterFactory[T] {
12 | 
13 |   /** Creates a new `RowWriter` instance.
14 |     * @param table target table the user wants to write into
15 |     * @param columnNames columns selected by the user; the user might wish to write only a subset of columns */
16 |   def rowWriter(table: TableDef, columnNames: Seq[String]): RowWriter[T]
17 | }
18 | 
19 | /** Provides a low-priority implicit `RowWriterFactory` able to write objects of any class for which
20 |   * a [[com.datastax.spark.connector.mapper.ColumnMapper ColumnMapper]] is defined.*/
21 | trait LowPriorityRowWriterFactoryImplicits {
22 |   implicit def defaultRowWriterFactory[T : ColumnMapper]: RowWriterFactory[T] = DefaultRowWriter.factory
23 | }
24 | 
25 | /** Provides an implicit `RowWriterFactory` for saving [[com.datastax.spark.connector.CassandraRow CassandraRow]] objects.*/
26 | object RowWriterFactory extends LowPriorityRowWriterFactoryImplicits {
27 |   implicit val genericRowWriterFactory = GenericRowWriter.Factory
28 | }


--------------------------------------------------------------------------------
/spark-cassandra-connector-embedded/src/main/scala/com/datastax/spark/connector/embedded/SparkRepl.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.embedded
 2 | 
 3 | import java.io.{PrintWriter, StringWriter, StringReader, BufferedReader}
 4 | import java.net.URLClassLoader
 5 | 
 6 | import scala.collection.mutable.ArrayBuffer
 7 | import org.apache.spark.repl.SparkILoop
 8 | 
 9 | trait SparkRepl {
10 |   def runInterpreter(master: String, input: String): String = {
11 |     System.setProperty("spark.cassandra.connection.host", EmbeddedCassandra.cassandraHost.getHostAddress)
12 |     val in = new BufferedReader(new StringReader(input + "\n"))
13 |     val out = new StringWriter()
14 |     val cl = getClass.getClassLoader
15 |     var paths = new ArrayBuffer[String]
16 |     cl match {
17 |       case urlLoader: URLClassLoader =>
18 |         for (url <- urlLoader.getURLs) {
19 |           if (url.getProtocol == "file") {
20 |             paths += url.getFile
21 |           }
22 |         }
23 |       case _ =>
24 |     }
25 |     val interp = new SparkILoop(in, new PrintWriter(out), master)
26 |     org.apache.spark.repl.Main.interp = interp
27 |     val separator = System.getProperty("path.separator")
28 |     interp.process(Array("-classpath", paths.mkString(separator)))
29 |     org.apache.spark.repl.Main.interp = null
30 |     if (interp.sparkContext != null) {
31 |       interp.sparkContext.stop()
32 |     }
33 |     // To avoid Akka rebinding to the same port, since it doesn't unbind immediately on shutdown
34 |     System.clearProperty("spark.driver.port")
35 |     out.toString
36 |   }
37 | 
38 | }
39 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector-demos/simple-demos/src/main/scala/com/datastax/spark/connector/demo/BasicReadWriteDemo.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.demo
 2 | 
 3 | import com.datastax.spark.connector.cql.CassandraConnector
 4 | 
 5 | object BasicReadWriteDemo extends DemoApp {
 6 | 
 7 |   CassandraConnector(conf).withSessionDo { session =>
 8 |     session.execute("CREATE KEYSPACE IF NOT EXISTS test WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 }")
 9 |     session.execute("CREATE TABLE IF NOT EXISTS test.key_value (key INT PRIMARY KEY, value VARCHAR)")
10 |     session.execute("TRUNCATE test.key_value")
11 |     session.execute("INSERT INTO test.key_value(key, value) VALUES (1, 'first row')")
12 |     session.execute("INSERT INTO test.key_value(key, value) VALUES (2, 'second row')")
13 |     session.execute("INSERT INTO test.key_value(key, value) VALUES (3, 'third row')")
14 |   }
15 | 
16 |   import com.datastax.spark.connector._
17 | 
18 |   // Read table test.kv and print its contents:
19 |   val rdd = sc.cassandraTable("test", "key_value").select("key", "value")
20 |   rdd.collect().foreach(row => log.info(s"Existing Data: $row"))
21 | 
22 |   // Write two new rows to the test.kv table:
23 |   val col = sc.parallelize(Seq((4, "fourth row"), (5, "fifth row")))
24 |   col.saveToCassandra("test", "key_value", SomeColumns("key", "value"))
25 | 
26 |   // Assert the two new rows were stored in test.kv table:
27 |   assert(col.collect().length == 2)
28 | 
29 |   col.collect().foreach(row => log.info(s"New Data: $row"))
30 |   log.info(s"Work completed, stopping the Spark context.")
31 |   sc.stop()
32 | }
33 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector-demos/simple-demos/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
 1 | # Licensed to the Apache Software Foundation (ASF) under one
 2 | # or more contributor license agreements.  See the NOTICE file
 3 | # distributed with this work for additional information
 4 | # regarding copyright ownership.  The ASF licenses this file
 5 | # to you under the Apache License, Version 2.0 (the
 6 | # "License"); you may not use this file except in compliance
 7 | # with the License.  You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | # for production, you should probably set pattern to %c instead of %l.  
18 | # (%l is slower.)
19 | 
20 | # output messages into a rolling log file as well as stdout
21 | log4j.rootLogger=WARN,stdout
22 | 
23 | # stdout
24 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
25 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
26 | log4j.appender.stdout.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
27 | 
28 | # Adding this to avoid thrift logging disconnect errors.
29 | log4j.logger.org.apache.thrift.server.TNonblockingServer=ERROR
30 | 
31 | # Avoid "no host ID found" when starting a fresh node
32 | log4j.logger.org.apache.cassandra.db.SystemKeyspace=ERROR
33 | 
34 | log4j.logger.com.datastax.spark.connector=INFO
35 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector-demos/kafka-streaming/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
 1 | # Licensed to the Apache Software Foundation (ASF) under one
 2 | # or more contributor license agreements.  See the NOTICE file
 3 | # distributed with this work for additional information
 4 | # regarding copyright ownership.  The ASF licenses this file
 5 | # to you under the Apache License, Version 2.0 (the
 6 | # "License"); you may not use this file except in compliance
 7 | # with the License.  You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | # for production, you should probably set pattern to %c instead of %l.  
18 | # (%l is slower.)
19 | 
20 | # output messages into a rolling log file as well as stdout
21 | log4j.rootLogger=WARN,stdout
22 | 
23 | # stdout
24 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
25 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
26 | log4j.appender.stdout.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
27 | 
28 | # Adding this to avoid thrift logging disconnect errors.
29 | log4j.logger.org.apache.thrift.server.TNonblockingServer=ERROR
30 | 
31 | # Avoid "no host ID found" when starting a fresh node
32 | log4j.logger.org.apache.cassandra.db.SystemKeyspace=ERROR
33 | 
34 | log4j.logger.com.datastax.spark.connector=INFO
35 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector-demos/twitter-streaming/src/main/scala/com/datastax/spark/connector/demo/TwitterStreamingHashTagsByInterval.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.demo
 2 | 
 3 | import scala.util.matching.Regex
 4 | import org.apache.spark.storage.StorageLevel
 5 | import org.apache.spark.streaming.{Time, Seconds, StreamingContext}
 6 | import org.apache.spark.streaming.twitter.TwitterUtils
 7 | import org.joda.time.{DateTimeZone, DateTime}
 8 | import twitter4j.auth.Authorization
 9 | import com.datastax.spark.connector.streaming._
10 | import com.datastax.spark.connector.SomeColumns
11 | 
12 | class TwitterStreamingHashTagsByInterval extends Serializable {
13 | 
14 |   def start(auth: Option[Authorization], ssc: StreamingContext, filters: Regex, keyspace: String, table: String): Unit = {
15 | 
16 |     val transform = (cruft: String) => filters.findAllIn(cruft).flatMap(_.stripPrefix("#"))
17 | 
18 |     val stream = TwitterUtils.createStream(ssc, auth, Nil, StorageLevel.MEMORY_ONLY_SER_2)
19 | 
20 |     /** Note that Cassandra is doing the sorting for you here. */
21 |     stream.flatMap(_.getText.toLowerCase.split("""\s+"""))
22 |       .map(transform)
23 |       .countByValueAndWindow(Seconds(5), Seconds(5))
24 |       .transform((rdd, time) => rdd.map { case (term, count) => (term, count, now(time))})
25 |       .saveToCassandra(keyspace, table, SomeColumns("hashtag", "mentions", "interval"))
26 | 
27 |     ssc.checkpoint("./checkpoint")
28 |     ssc.start()
29 |     ssc.awaitTermination()
30 |   }
31 | 
32 |   private def now(time: Time): String =
33 |     new DateTime(time.milliseconds, DateTimeZone.UTC).toString("yyyyMMddHH:mm:ss.SSS")
34 | }
35 | 
36 | 
37 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/it/scala/com/datastax/spark/connector/streaming/StreamingSpec.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.streaming
 2 | 
 3 | import com.datastax.spark.connector.testkit._
 4 | import com.datastax.spark.connector.embedded._
 5 | 
 6 | /**
 7 |  * Usages: Create the [[org.apache.spark.streaming.StreamingContext]] then write async to the stream.
 8 |  *
 9 |  * val ssc = new StreamingContext(conf, Milliseconds(500))
10 |  *
11 |  * Akka
12 |  * {{{
13 |  *   val stream = ssc.actorStream[String](Props[SimpleActor], actorName, StorageLevel.MEMORY_AND_DISK)
14 |  * }}}
15 |  *
16 |  * On upgrade examples:
17 |  * Kafka
18 |  * {{{
19 |  *   val stream: ReceiverInputDStream[(String, String)] =
20 |  *     KafkaUtils.createStream(ssc, kafkaParams, topics, StorageLevel.MEMORY_AND_DISK_SER_2)
21 |  * }}}
22 |  *
23 |  * ZeroMQ
24 |  * {{{
25 |  *   val stream: ReceiverInputDStream[String] = ZeroMQUtils.createStream(ssc, publishUrl, subscribe, bytesToObjects)
26 |  * }}}
27 |  *
28 |  * Twitter
29 |  * {{{
30 |  *   val stream: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None)
31 |  * }}}
32 |  *
33 |  * etc.
34 |  */
35 | trait StreamingSpec extends AbstractSpec with SharedEmbeddedCassandra with SparkTemplate {
36 |   import org.apache.spark.streaming.StreamingContext
37 |   import scala.concurrent.duration._
38 | 
39 |   val duration = 10.seconds
40 | 
41 |   useCassandraConfig("cassandra-default.yaml.template")
42 | 
43 |   def ssc: StreamingContext
44 | 
45 |   after {
46 |     // Spark Context is shared among all integration test so we don't want to stop it here
47 |     ssc.stop(stopSparkContext = false, stopGracefully = true)
48 |   }
49 | }


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/partitioner/RandomPartitionerTokenRangeSplitter.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.rdd.partitioner
 2 | 
 3 | import com.datastax.spark.connector.rdd.partitioner.dht.{BigIntToken, TokenFactory, TokenRange}
 4 | 
 5 | import scala.math.BigDecimal.RoundingMode
 6 | 
 7 | /** Fast token range splitter assuming that data are spread out evenly in the whole range. */
 8 | class RandomPartitionerTokenRangeSplitter(cassandraPartitionsPerToken: Double) extends TokenRangeSplitter[BigInt, BigIntToken] {
 9 | 
10 |   private val tokenFactory =
11 |     TokenFactory.RandomPartitionerTokenFactory
12 | 
13 |   private def wrap(token: BigInt): BigInt = {
14 |     val max = tokenFactory.maxToken.value
15 |     if (token <= max) token else token - max
16 |   }
17 | 
18 |   def split(range: TokenRange[BigInt, BigIntToken], splitSize: Long) = {
19 |     val left = range.start.value
20 |     val right = range.end.value
21 |     val rangeSize =
22 |       if (right > left) BigDecimal(right - left)
23 |       else BigDecimal(right - left + tokenFactory.totalTokenCount)
24 |     val estimatedRows = rangeSize * cassandraPartitionsPerToken
25 |     val n = math.max(1, (estimatedRows / splitSize).setScale(0, RoundingMode.HALF_UP).toInt)
26 |     val splitPoints =
27 |       (for (i <- 0 until n) yield wrap(left + (rangeSize * i.toDouble / n).toBigInt)) :+ right
28 |     for (Seq(l, r) <- splitPoints.sliding(2).toSeq) yield
29 |       new TokenRange[BigInt, BigIntToken](
30 |         new BigIntToken(l.bigInteger),
31 |         new BigIntToken(r.bigInteger),
32 |         range.endpoints,
33 |         Some((estimatedRows / n).toInt))
34 |   }
35 | }


--------------------------------------------------------------------------------
/project/Versions.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *    http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | object Versions {
18 |   val Akka            = "2.2.3"//spark master on 2.3.4 https://github.com/apache/spark/blob/master/pom.xml#L113-L114
19 |   val Cassandra       = "2.1.2"
20 |   val CassandraDriver = "2.1.3"
21 |   val CommonsIO       = "2.4"
22 |   val CommonsLang3    = "3.3.2"
23 |   val Config          = "1.2.1"
24 |   val Guava           = "14.0.1"
25 |   val JDK             = "1.7"
26 |   val JodaC           = "1.2"
27 |   val JodaT           = "2.3"
28 |   val JOpt            = "3.2"//4.7
29 |   val Kafka           = "0.8.0"//https://github.com/apache/spark/pull/3631
30 |   val Lzf             = "0.8.4"
31 |   val CodaHaleMetrics = "3.0.2"
32 |   val Scala           = "2.10.4"
33 |   val ScalaTest       = "2.2.2"
34 |   val Scalactic       = "2.2.2"
35 |   val Slf4j           = "1.7.7"
36 |   val Spark           = "1.1.1"
37 | 
38 | }
39 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector-demos/simple-demos/src/main/scala/com/datastax/spark/connector/demo/SQLDemo.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.demo
 2 | 
 3 | import com.datastax.spark.connector.cql.CassandraConnector
 4 | import org.apache.spark.sql.cassandra.CassandraSQLContext
 5 | 
 6 | /** This demo creates a table in Cassandra, populates it with sample data,
 7 |   * then queries it using SparkSQL and finally displays the query results to the standard output.
 8 |   * You need to start Cassandra on local node prior to executing this demo. */
 9 | object SQLDemo extends DemoApp {
10 | 
11 |   val cc = new CassandraSQLContext(sc)
12 | 
13 |   CassandraConnector(conf).withSessionDo { session =>
14 |     session.execute("CREATE KEYSPACE IF NOT EXISTS test WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 }")
15 |     session.execute("DROP TABLE IF EXISTS test.sql_demo")
16 |     session.execute("CREATE TABLE test.sql_demo (key INT PRIMARY KEY, grp INT, value DOUBLE)")
17 |     session.execute("INSERT INTO test.sql_demo(key, grp, value) VALUES (1, 1, 1.0)")
18 |     session.execute("INSERT INTO test.sql_demo(key, grp, value) VALUES (2, 1, 2.5)")
19 |     session.execute("INSERT INTO test.sql_demo(key, grp, value) VALUES (3, 1, 10.0)")
20 |     session.execute("INSERT INTO test.sql_demo(key, grp, value) VALUES (4, 2, 4.0)")
21 |     session.execute("INSERT INTO test.sql_demo(key, grp, value) VALUES (5, 2, 2.2)")
22 |     session.execute("INSERT INTO test.sql_demo(key, grp, value) VALUES (6, 2, 2.8)")
23 |   }
24 | 
25 |   val rdd = cc.cassandraSql("SELECT grp, max(value) AS mv FROM test.sql_demo GROUP BY grp ORDER BY mv")
26 |   rdd.collect().foreach(println)  // [2, 4.0] [1, 10.0]
27 | 
28 |   sc.stop()
29 | }
30 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/types/TimestampParser.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.types
 2 | 
 3 | import java.util.Date
 4 | 
 5 | import org.joda.time.format.DateTimeFormat
 6 | 
 7 | import scala.util.{Success, Try}
 8 | 
 9 | /** Parses CQL timestamps.
10 |   *
11 |   * Supported formats:
12 |   *   - `yyyy-MM-dd HH:mm`
13 |   *   - `yyyy-MM-dd HH:mmZ`
14 |   *   - `yyyy-MM-dd HH:mm:ss`
15 |   *   - `yyyy-MM-dd HH:mm:ssZ`
16 |   *   - `yyyy-MM-dd HH:mm:ss.SSS`
17 |   *   - `yyyy-MM-dd HH:mm:ss.SSSZ`
18 |   *   - `yyyy-MM-dd'T'HH:mm`
19 |   *   - `yyyy-MM-dd'T'HH:mmZ`
20 |   *   - `yyyy-MM-dd'T'HH:mm:ss`
21 |   *   - `yyyy-MM-dd'T'HH:mm:ssZ`
22 |   *   - `yyyy-MM-dd'T'HH:mm:ss.SSS`
23 |   *   - `yyyy-MM-dd'T'HH:mm:ss.SSSZ`
24 |   *   - `yyyy-MM-dd`
25 |   *   - `yyyy-MM-ddZ`
26 |   */
27 | object TimestampParser {
28 |   private val dateStringPatterns = Array[String](
29 |     "yyyy-MM-dd HH:mm",
30 |     "yyyy-MM-dd HH:mmZ",
31 |     "yyyy-MM-dd HH:mm:ss",
32 |     "yyyy-MM-dd HH:mm:ssZ",
33 |     "yyyy-MM-dd HH:mm:ss.SSS",
34 |     "yyyy-MM-dd HH:mm:ss.SSSZ",
35 |     "yyyy-MM-dd'T'HH:mm",
36 |     "yyyy-MM-dd'T'HH:mmZ",
37 |     "yyyy-MM-dd'T'HH:mm:ss",
38 |     "yyyy-MM-dd'T'HH:mm:ssZ",
39 |     "yyyy-MM-dd'T'HH:mm:ss.SSS",
40 |     "yyyy-MM-dd'T'HH:mm:ss.SSSZ",
41 |     "yyyy-MM-dd",
42 |     "yyyy-MM-ddZ")
43 | 
44 |   private val parsers =
45 |     dateStringPatterns.map(DateTimeFormat.forPattern)
46 | 
47 |   def parse(date: String): Date = {
48 |     parsers.view.map(p => Try(p.parseDateTime(date))).find(_.isSuccess) match {
49 |       case Some(Success(d)) => d.toDate
50 |       case _ => throw new IllegalArgumentException(s"Invalid date: $date")
51 |     }
52 |   }
53 | }
54 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/cql/CassandraClientProxy.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.cql
 2 | 
 3 | import java.lang.reflect.{InvocationTargetException, Proxy, Method, InvocationHandler}
 4 | 
 5 | import org.apache.cassandra.thrift.Cassandra
 6 | import org.apache.thrift.transport.TTransport
 7 | 
 8 | /** Extends `Cassandra.Iface` with `close` method to close the underlying thrift transport */
 9 | trait CassandraClientProxy extends Cassandra.Iface {
10 |   def close()
11 | }
12 | 
13 | private class ClientProxyHandler(client: Cassandra.Iface, transport: TTransport) extends InvocationHandler {
14 |   
15 |   override def invoke(proxy: scala.Any, method: Method, args: Array[AnyRef]): AnyRef = {
16 |     if (method.getName == "close") {
17 |       transport.close()
18 |       null
19 |     }
20 |     else
21 |       try {
22 |         method.invoke(client, args: _*)
23 |       }
24 |       catch {
25 |         case e: InvocationTargetException =>
26 |           throw e.getCause
27 |       }
28 |   }
29 | }
30 | 
31 | object CassandraClientProxy {
32 | 
33 |   /** Returns a proxy to the thrift client that provides closing the underlying transport by calling `close` method.
34 |     * Without this method we'd have to keep references to two objects: the client and the transport. */
35 |   def wrap(client: Cassandra.Iface, transport: TTransport): CassandraClientProxy = {
36 |     val classLoader = getClass.getClassLoader
37 |     val interfaces = Array[Class[_]](classOf[CassandraClientProxy])
38 |     val invocationHandler = new ClientProxyHandler(client, transport)
39 |     Proxy.newProxyInstance(classLoader, interfaces, invocationHandler).asInstanceOf[CassandraClientProxy]
40 |   }
41 | }
42 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/it/resources/log4j.properties:
--------------------------------------------------------------------------------
 1 | # Licensed to the Apache Software Foundation (ASF) under one
 2 | # or more contributor license agreements.  See the NOTICE file
 3 | # distributed with this work for additional information
 4 | # regarding copyright ownership.  The ASF licenses this file
 5 | # to you under the Apache License, Version 2.0 (the
 6 | # "License"); you may not use this file except in compliance
 7 | # with the License.  You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | # for production, you should probably set pattern to %c instead of %l.  
18 | # (%l is slower.)
19 | 
20 | # output messages into a rolling log file as well as stdout
21 | log4j.rootLogger=WARN,stdout
22 | 
23 | # stdout
24 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
25 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
26 | log4j.appender.stdout.layout.ConversionPattern=%5p %d{HH:mm:ss,SSS} %C (%F:%L) - %m%n
27 | 
28 | # Adding this to avoid thrift logging disconnect errors.
29 | log4j.logger.org.apache.thrift.server.TNonblockingServer=ERROR
30 | 
31 | # Avoid "no host ID found" when starting a fresh node
32 | log4j.logger.org.apache.cassandra.db.SystemKeyspace=ERROR
33 | 
34 | # Avoid "address already in use" when starting multiple local Spark masters
35 | log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR
36 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector-java/src/main/java/com/datastax/spark/connector/japi/RDDJavaFunctions.java:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.japi;
 2 | 
 3 | import com.datastax.spark.connector.ColumnSelector;
 4 | import com.datastax.spark.connector.RDDFunctions;
 5 | import com.datastax.spark.connector.cql.CassandraConnector;
 6 | import com.datastax.spark.connector.writer.RowWriterFactory;
 7 | import com.datastax.spark.connector.writer.WriteConf;
 8 | import org.apache.spark.SparkConf;
 9 | import org.apache.spark.rdd.RDD;
10 | 
11 | /**
12 |  * A Java API wrapper over {@link org.apache.spark.rdd.RDD} to provide Spark Cassandra Connector functionality.
13 |  *
14 |  * <p>To obtain an instance of this wrapper, use one of the factory methods in {@link
15 |  * com.datastax.spark.connector.japi.CassandraJavaUtil} class.</p>
16 |  */
17 | @SuppressWarnings("UnusedDeclaration")
18 | public class RDDJavaFunctions<T> extends RDDAndDStreamCommonJavaFunctions<T> {
19 |     public final RDD<T> rdd;
20 |     private final RDDFunctions<T> rddf;
21 | 
22 |     RDDJavaFunctions(RDD<T> rdd) {
23 |         this.rdd = rdd;
24 |         this.rddf = new RDDFunctions<>(rdd);
25 |     }
26 | 
27 |     @Override
28 |     public CassandraConnector defaultConnector() {
29 |         return rddf.connector();
30 |     }
31 | 
32 |     @Override
33 |     protected SparkConf getConf() {
34 |         return rdd.conf();
35 |     }
36 | 
37 |     @Override
38 |     protected void saveToCassandra(String keyspace, String table, RowWriterFactory<T> rowWriterFactory,
39 |                                    ColumnSelector columnNames, WriteConf conf, CassandraConnector connector) {
40 |         rddf.saveToCassandra(keyspace, table, columnNames, conf, connector, rowWriterFactory);
41 |     }
42 | 
43 | }
44 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/writer/AsyncExecutorTest.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.writer
 2 | 
 3 | import java.util.concurrent.atomic.AtomicInteger
 4 | import java.util.concurrent.{Callable, Executors}
 5 | 
 6 | import com.google.common.util.concurrent.MoreExecutors
 7 | import org.junit.Assert._
 8 | import org.junit.Test
 9 | 
10 | class AsyncExecutorTest {
11 | 
12 |   @Test
13 |   def test() {
14 |     val taskCount = 20
15 |     val maxParallel = 5
16 | 
17 |     val currentlyRunningCounter = new AtomicInteger(0)
18 |     val maxParallelCounter = new AtomicInteger(0)
19 |     val totalFinishedExecutionsCounter = new AtomicInteger(0)
20 | 
21 |     val task = new Callable[String] {
22 |       override def call() = {
23 |         val c = currentlyRunningCounter.incrementAndGet()
24 |         var m = maxParallelCounter.get()
25 |         while (m < c && !maxParallelCounter.compareAndSet(m, c))
26 |           m = maxParallelCounter.get()
27 |         Thread.sleep(100)
28 |         currentlyRunningCounter.decrementAndGet()
29 |         totalFinishedExecutionsCounter.incrementAndGet()
30 |         "ok"
31 |       }
32 |     }
33 | 
34 |     val underlyingExecutor = MoreExecutors.listeningDecorator(Executors.newCachedThreadPool())
35 |     val asyncExecutor = new AsyncExecutor(underlyingExecutor.submit(_: Callable[String]), maxParallel)
36 | 
37 |     for (i <- 1 to taskCount)
38 |       asyncExecutor.executeAsync(task)
39 | 
40 |     asyncExecutor.waitForCurrentlyExecutingTasks()
41 |     assertEquals(maxParallel, maxParallelCounter.get())
42 |     assertEquals(taskCount, totalFinishedExecutionsCounter.get())
43 |     assertEquals(taskCount, asyncExecutor.successCount)
44 |     assertEquals(0, asyncExecutor.failureCount)
45 |   }
46 | 
47 | 
48 | 
49 | 
50 | }
51 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector-demos/simple-demos/src/main/resources/data/words:
--------------------------------------------------------------------------------
 1 | Kitty lorem ipsum inspect anything brought into the house stick sit in box so loves cheeseburgers
 2 | Run in circles shake treat bag hide when guests come over use lap as chair Sit in box throwup on your pillow purr while eating
 3 | but present belly scratch hand when pet Chase dog then run away burrow under covers scratch hand when pet burrow under covers
 4 | intrigued by the shower why must they do that Curl into a furry donut need to chase tail burrow under covers
 5 | so swat at dog or sleep in the bathroom sink inspect anything brought into the house Hide when guests come over Under the bed
 6 | I like big cats and i can not lie leave dead animals as gifts Curl into a furry donut shake treat bag hunt by meowing loudly at 5am
 7 | next to human slave food dispenser need to chase tail and chew iPad power cord
 8 | Hack up furballs hunt anything that moves but favor packaging over toy yet stand in front of the computer screen
 9 | Favor packaging over toy throwup on your pillow who's the baby Give attitude Purr while eating chew iPad power cord hopped up on catnip
10 | so always hungry Stare at ceiling kick up litter or hunt by meowing loudly at 5am next to human slave food dispenser or stretch
11 | yet under the bed claw drapes Intently stare at the same spot make muffins but intently stare at the same spot
12 | or kick up litter and why must they do that Missing until dinner time hopped up on catnip who's the baby
13 | Sleep on keyboard favor packaging over toy why must they do that but bathe private parts with tongue then lick owner's face
14 | Purr for no reason Scamper sweet beast but mark territory and stand in front of the computer screen favor packaging over toy
15 | Sleep on desk infront of laptop sit on keyboard push mouse jump on bed and make bird sound at 3am


--------------------------------------------------------------------------------
/spark-cassandra-connector-demos/kafka-streaming/src/main/resources/data/words:
--------------------------------------------------------------------------------
 1 | Kitty lorem ipsum inspect anything brought into the house stick sit in box so loves cheeseburgers
 2 | Run in circles shake treat bag hide when guests come over use lap as chair Sit in box throwup on your pillow purr while eating
 3 | but present belly scratch hand when pet Chase dog then run away burrow under covers scratch hand when pet burrow under covers
 4 | intrigued by the shower why must they do that Curl into a furry donut need to chase tail burrow under covers
 5 | so swat at dog or sleep in the bathroom sink inspect anything brought into the house Hide when guests come over Under the bed
 6 | I like big cats and i can not lie leave dead animals as gifts Curl into a furry donut shake treat bag hunt by meowing loudly at 5am
 7 | next to human slave food dispenser need to chase tail and chew iPad power cord
 8 | Hack up furballs hunt anything that moves but favor packaging over toy yet stand in front of the computer screen
 9 | Favor packaging over toy throwup on your pillow who's the baby Give attitude Purr while eating chew iPad power cord hopped up on catnip
10 | so always hungry Stare at ceiling kick up litter or hunt by meowing loudly at 5am next to human slave food dispenser or stretch
11 | yet under the bed claw drapes Intently stare at the same spot make muffins but intently stare at the same spot
12 | or kick up litter and why must they do that Missing until dinner time hopped up on catnip who's the baby
13 | Sleep on keyboard favor packaging over toy why must they do that but bathe private parts with tongue then lick owner's face
14 | Purr for no reason Scamper sweet beast but mark territory and stand in front of the computer screen favor packaging over toy
15 | Sleep on desk infront of laptop sit on keyboard push mouse jump on bed and make bird sound at 3am


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/types/CollectionColumnType.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.types
 2 | 
 3 | import com.datastax.spark.connector.types.TypeConverter.OptionToNullConverter
 4 | 
 5 | import scala.language.existentials
 6 | import scala.reflect.runtime.universe._
 7 | 
 8 | trait CollectionColumnType[T] extends ColumnType[T] {
 9 |   def isCollection = true
10 | }
11 | 
12 | case class ListType[T](elemType: ColumnType[T]) extends CollectionColumnType[Vector[T]] {
13 |   @transient
14 |   lazy val converterToCassandra =
15 |     TypeConverter.javaArrayListConverter(elemType.converterToCassandra)
16 | 
17 |   @transient
18 |   lazy val scalaTypeTag = TypeTag.synchronized {
19 |     implicit val elemTypeTag = elemType.scalaTypeTag
20 |     implicitly[TypeTag[Vector[T]]]
21 |   }
22 | }
23 | 
24 | case class SetType[T](elemType: ColumnType[T]) extends CollectionColumnType[Set[T]] {
25 |   @transient
26 |   lazy val converterToCassandra =
27 |     new OptionToNullConverter(TypeConverter.javaHashSetConverter(elemType.converterToCassandra))
28 | 
29 |   @transient
30 |   lazy val scalaTypeTag = TypeTag.synchronized {
31 |     implicit val elemTypeTag = elemType.scalaTypeTag
32 |     implicitly[TypeTag[Set[T]]]
33 |   }
34 | }
35 | 
36 | case class MapType[K, V](keyType: ColumnType[K], valueType: ColumnType[V]) extends CollectionColumnType[Map[K, V]] {
37 |   @transient
38 |   lazy val converterToCassandra =
39 |     new OptionToNullConverter(
40 |       TypeConverter.javaHashMapConverter(keyType.converterToCassandra, valueType.converterToCassandra))
41 | 
42 |   @transient
43 |   lazy val scalaTypeTag = TypeTag.synchronized {
44 |     implicit val keyTypeTag = keyType.scalaTypeTag
45 |     implicit val valueTypeTag = valueType.scalaTypeTag
46 |     implicitly[TypeTag[Map[K, V]]]
47 |   }
48 | }
49 | 
50 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector-java/src/main/java/com/datastax/spark/connector/japi/DStreamJavaFunctions.java:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.japi;
 2 | 
 3 | import com.datastax.spark.connector.ColumnSelector;
 4 | import com.datastax.spark.connector.cql.CassandraConnector;
 5 | import com.datastax.spark.connector.streaming.DStreamFunctions;
 6 | import com.datastax.spark.connector.writer.RowWriterFactory;
 7 | import com.datastax.spark.connector.writer.WriteConf;
 8 | import org.apache.spark.SparkConf;
 9 | import org.apache.spark.streaming.dstream.DStream;
10 | 
11 | /**
12 |  * A Java API wrapper over {@link org.apache.spark.streaming.dstream.DStream} to provide Spark Cassandra Connector
13 |  * functionality.
14 |  *
15 |  * <p>To obtain an instance of this wrapper, use one of the factory methods in {@link
16 |  * com.datastax.spark.connector.japi.CassandraJavaUtil} class.</p>
17 |  */
18 | @SuppressWarnings("UnusedDeclaration")
19 | public class DStreamJavaFunctions<T> extends RDDAndDStreamCommonJavaFunctions<T> {
20 |     public final DStream<T> dstream;
21 |     private final DStreamFunctions<T> dsf;
22 | 
23 |     DStreamJavaFunctions(DStream<T> dStream) {
24 |         this.dstream = dStream;
25 |         this.dsf = new DStreamFunctions<>(dStream);
26 |     }
27 | 
28 |     @Override
29 |     public CassandraConnector defaultConnector() {
30 |         return dsf.connector();
31 |     }
32 | 
33 |     @Override
34 |     protected SparkConf getConf() {
35 |         return dstream.ssc().conf();
36 |     }
37 | 
38 |     @Override
39 |     protected void saveToCassandra(String keyspace, String table, RowWriterFactory<T> rowWriterFactory,
40 |                                    ColumnSelector columnNames, WriteConf conf, CassandraConnector connector) {
41 |         dsf.saveToCassandra(keyspace, table, columnNames, conf, connector, rowWriterFactory);
42 |     }
43 | }
44 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector-demos/twitter-streaming/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
 1 | # Licensed to the Apache Software Foundation (ASF) under one
 2 | # or more contributor license agreements.  See the NOTICE file
 3 | # distributed with this work for additional information
 4 | # regarding copyright ownership.  The ASF licenses this file
 5 | # to you under the Apache License, Version 2.0 (the
 6 | # "License"); you may not use this file except in compliance
 7 | # with the License.  You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | # for production, you should probably set pattern to %c instead of %l.  
18 | # (%l is slower.)
19 | 
20 | # output messages into a rolling log file as well as stdout
21 | log4j.rootLogger=INFO,stdout
22 | 
23 | # stdout
24 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
25 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
26 | log4j.appender.stdout.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
27 | 
28 | # Adding this to avoid thrift logging disconnect errors.
29 | log4j.logger.org.apache.thrift.server.TNonblockingServer=ERROR
30 | 
31 | # Avoid "no host ID found" when starting a fresh node
32 | log4j.logger.org.apache.cassandra.db.SystemKeyspace=ERROR
33 | 
34 | #  If running spark local, ignore block input exists warnings, which are expected.
35 | log4j.logger.org.apache.spark.storage.BlockManager=ERROR
36 | log4j.logger.com.datastax.spark.connector=INFO
37 | log4j.logger.org.apache.spark=WARN
38 | log4j.logger.com.datastax.driver.core=WARN


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/ColumnRef.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector
 2 | 
 3 | import scala.language.implicitConversions
 4 | 
 5 | /** Unambiguous reference to a column in the query result set row. */
 6 | sealed trait ColumnRef
 7 | 
 8 | sealed trait NamedColumnRef extends ColumnRef {
 9 |   /** Returns the column name which this selection bases on. In case of a function, such as `ttl` or
10 |     * `writetime`, it returns the column name passed to that function. */
11 |   def columnName: String
12 | 
13 |   /** Returns a CQL phrase which has to be passed to the `SELECT` clause with appropriate quotation
14 |     * marks. */
15 |   def cql: String
16 | 
17 |   /** Returns a name of the selection as it is seen in the result set. Most likely this is going to be
18 |     * used when providing custom column name to field name mapping. */
19 |   def selectedAs: String
20 | }
21 | 
22 | object NamedColumnRef {
23 |   def unapply(columnRef: NamedColumnRef) = Some((columnRef.columnName, columnRef.selectedAs))
24 | }
25 | 
26 | /** References a column by name. */
27 | case class ColumnName(columnName: String) extends NamedColumnRef {
28 |   val cql = s""""$columnName""""
29 |   val selectedAs = columnName
30 | 
31 |   override def toString: String = selectedAs
32 | }
33 | 
34 | case class TTL(columnName: String) extends NamedColumnRef {
35 |   val cql = s"""TTL("$columnName")"""
36 |   val selectedAs = s"ttl($columnName)"
37 | 
38 |   override def toString: String = selectedAs
39 | }
40 | 
41 | case class WriteTime(columnName: String) extends NamedColumnRef {
42 |   val cql = s"""WRITETIME("$columnName")"""
43 |   val selectedAs = s"writetime($columnName)"
44 | 
45 |   override def toString: String = selectedAs
46 | }
47 | 
48 | /** References a column by its index in the row. Useful for tuples. */
49 | case class ColumnIndex(columnIndex: Int) extends ColumnRef
50 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/reader/KeyValueRowReader.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.rdd.reader
 2 | 
 3 | import com.datastax.driver.core.{ProtocolVersion, Row}
 4 | import com.datastax.spark.connector.cql.TableDef
 5 | 
 6 | class KeyValueRowReaderFactory[K, V](keyRRF: RowReaderFactory[K], valueRRF: RowReaderFactory[V])
 7 |   extends RowReaderFactory[(K, V)] {
 8 | 
 9 |   override def rowReader(table: TableDef, options: RowReaderOptions): RowReader[(K, V)] = {
10 |     val keyReader = keyRRF.rowReader(table, options)
11 |     val valueReaderOptions = options.copy(offset = options.offset + keyReader.consumedColumns.getOrElse(0))
12 |     val valueReader = valueRRF.rowReader(table, valueReaderOptions)
13 |     new KeyValueRowReader(keyReader, valueReader)
14 |   }
15 | 
16 |   override def targetClass: Class[(K, V)] = classOf[(K, V)]
17 | }
18 | 
19 | class KeyValueRowReader[K, V](keyReader: RowReader[K], valueReader: RowReader[V]) extends RowReader[(K, V)] {
20 | 
21 |   override def requiredColumns: Option[Int] =
22 |     (for (keyCnt <- keyReader.requiredColumns; valueCnt <- valueReader.requiredColumns) yield keyCnt max valueCnt)
23 |       .orElse(keyReader.requiredColumns).orElse(valueReader.requiredColumns)
24 | 
25 |   override def columnNames: Option[Seq[String]] =
26 |     (for (keyNames <- keyReader.columnNames; valueNames <- valueReader.columnNames) yield keyNames ++ valueNames)
27 |       .orElse(keyReader.columnNames).orElse(valueReader.columnNames)
28 | 
29 |   override def read(row: Row, columnNames: Array[String], protocolVersion: ProtocolVersion): (K, V) = {
30 |     (keyReader.read(row, columnNames, protocolVersion), valueReader.read(row, columnNames, protocolVersion))
31 |   }
32 | 
33 |   override def consumedColumns: Option[Int] =
34 |     for (keySkip <- keyReader.consumedColumns; valueSkip <- valueReader.consumedColumns)
35 |     yield keySkip + valueSkip
36 | }
37 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/mapper/TupleColumnMapperTest.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.mapper
 2 | 
 3 | import com.datastax.spark.connector.ColumnIndex
 4 | import com.datastax.spark.connector.cql.{TableDef, RegularColumn, ColumnDef}
 5 | import com.datastax.spark.connector.types.IntType
 6 | import org.apache.commons.lang3.SerializationUtils
 7 | import org.junit.Assert._
 8 | import org.junit.Test
 9 | 
10 | class TupleColumnMapperTest {
11 | 
12 |   private val c1 = ColumnDef("test", "table", "column1", RegularColumn, IntType)
13 |   private val c2 = ColumnDef("test", "table", "column2", RegularColumn, IntType)
14 |   private val c3 = ColumnDef("test", "table", "column3", RegularColumn, IntType)
15 |   private val tableDef = TableDef("test", "table", Seq(c1), Seq(c2), Seq(c3))
16 | 
17 |   @Test
18 |   def testGetters() {
19 |     val columnMap = new TupleColumnMapper[(Int, String, Boolean)].columnMap(tableDef)
20 |     val getters = columnMap.getters
21 |     assertEquals(ColumnIndex(0), getters("_1"))
22 |     assertEquals(ColumnIndex(1), getters("_2"))
23 |     assertEquals(ColumnIndex(2), getters("_3"))
24 |   }
25 | 
26 |   @Test
27 |   def testConstructor() {
28 |     val columnMap = new TupleColumnMapper[(Int, String, Boolean)].columnMap(tableDef)
29 |     assertEquals(Seq(ColumnIndex(0), ColumnIndex(1), ColumnIndex(2)), columnMap.constructor)
30 |   }
31 | 
32 |   @Test
33 |   def testSerialize() {
34 |     val columnMap = new TupleColumnMapper[(Int, String, Boolean)].columnMap(tableDef)
35 |     SerializationUtils.roundtrip(columnMap)
36 |   }
37 | 
38 |   @Test
39 |   def testImplicit() {
40 |     val columnMap = implicitly[ColumnMapper[(Int, String, Boolean)]].columnMap(tableDef)
41 |     val getters = columnMap.getters
42 |     assertEquals(ColumnIndex(0), getters("_1"))
43 |     assertEquals(ColumnIndex(1), getters("_2"))
44 |     assertEquals(ColumnIndex(2), getters("_3"))
45 |   }
46 | 
47 | }
48 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/util/ReflectionUtilSpec.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.util
 2 | 
 3 | import com.datastax.spark.connector.cql.{CassandraConnectorConf, DefaultConnectionFactory, CassandraConnectionFactory}
 4 | import org.scalatest.{FlatSpec, Matchers}
 5 | 
 6 | class ReflectionUtilSpec extends FlatSpec with Matchers {
 7 | 
 8 |   "ReflectionUtil.findGlobalObject" should "be able to find DefaultConnectionFactory" in {
 9 |     val factory = ReflectionUtil.findGlobalObject[CassandraConnectionFactory](
10 |       "com.datastax.spark.connector.cql.DefaultConnectionFactory")
11 |     factory should be(DefaultConnectionFactory)
12 |   }
13 | 
14 |   it should "be able to instantiate a singleton object based on Java class name" in {
15 |     val obj = ReflectionUtil.findGlobalObject[String]("java.lang.String")
16 |     obj should be ("")
17 |   }
18 | 
19 |   it should "cache Java class instances" in {
20 |     val obj1 = ReflectionUtil.findGlobalObject[String]("java.lang.String")
21 |     val obj2 = ReflectionUtil.findGlobalObject[String]("java.lang.String")
22 |     obj1 shouldBe theSameInstanceAs (obj2)
23 |   }
24 | 
25 |   it should "throw IllegalArgumentException when asked for a Scala object of wrong type" in {
26 |     intercept[IllegalArgumentException] {
27 |       ReflectionUtil.findGlobalObject[CassandraConnectorConf](
28 |         "com.datastax.spark.connector.cql.DefaultConnectionFactory")
29 |     }
30 |   }
31 | 
32 |   it should "throw IllegalArgumentException when asked for class instance of wrong type" in {
33 |     intercept[IllegalArgumentException] {
34 |       ReflectionUtil.findGlobalObject[Integer]("java.lang.String")
35 |     }
36 |   }
37 | 
38 |   it should "throw IllegalArgumentException when object does not exist" in {
39 |     intercept[IllegalArgumentException] {
40 |       ReflectionUtil.findGlobalObject[CassandraConnectorConf]("NoSuchObject")
41 |     }
42 |   }
43 | 
44 | }
45 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/partitioner/dht/TokenFactory.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.rdd.partitioner.dht
 2 | 
 3 | import scala.language.existentials
 4 | 
 5 | trait TokenFactory[V, T <: Token[V]] {
 6 |   def minToken: T
 7 |   def maxToken: T
 8 |   def totalTokenCount: BigInt
 9 |   def fromString(string: String): T
10 |   def toString(token: T): String
11 | }
12 | 
13 | object TokenFactory {
14 | 
15 |   type V = t forSome { type t }
16 |   type T = t forSome { type t <: Token[V] }
17 | 
18 |   implicit object Murmur3TokenFactory extends TokenFactory[Long, LongToken] {
19 |     override val minToken = LongToken(Long.MinValue)
20 |     override val maxToken = LongToken(Long.MaxValue)
21 |     override val totalTokenCount = BigInt(maxToken.value) - BigInt(minToken.value)
22 |     override def fromString(string: String) = LongToken(string.toLong)
23 |     override def toString(token: LongToken) = token.value.toString
24 |   }
25 | 
26 |   implicit object RandomPartitionerTokenFactory extends TokenFactory[BigInt, BigIntToken] {
27 |     override val minToken = BigIntToken(-1)
28 |     override val maxToken = BigIntToken(BigInt(2).pow(127))
29 |     override val totalTokenCount = maxToken.value - minToken.value
30 |     override def fromString(string: String) = BigIntToken(BigInt(string))
31 |     override def toString(token: BigIntToken) = token.value.toString()
32 |   }
33 | 
34 |   def forCassandraPartitioner(partitionerClassName: String): TokenFactory[V, T] = {
35 |     val partitioner =
36 |       partitionerClassName match {
37 |         case "org.apache.cassandra.dht.Murmur3Partitioner" => Murmur3TokenFactory
38 |         case "org.apache.cassandra.dht.RandomPartitioner" => RandomPartitionerTokenFactory
39 |         case _ => throw new IllegalArgumentException(s"Unsupported partitioner: $partitionerClassName")
40 |       }
41 |     partitioner.asInstanceOf[TokenFactory[V, T]]
42 |   }
43 | }
44 | 
45 | 
46 | 
47 | 
48 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/util/MagicalTypeTricks.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.util
 2 | 
 3 | object MagicalTypeTricks {
 4 | 
 5 |   trait DoesntHaveImplicit[A, B]
 6 |   implicit def doesntHaveImplicit[A, B]: A DoesntHaveImplicit B = null
 7 |   implicit def doesntHaveImplicitAmbiguity1[A, B](implicit ev: B): A DoesntHaveImplicit B = null
 8 |   implicit def doesntHaveImplicitAmbiguity2[A, B](implicit ev: B): A DoesntHaveImplicit B = null
 9 | 
10 |   trait IsNotEqualTo[A, B]
11 |   implicit def neq[A, B]: A IsNotEqualTo B = null
12 |   implicit def neqAmbiguity1[A]: A IsNotEqualTo A = null
13 |   implicit def neqAmbiguity2[A]: A IsNotEqualTo A = null
14 | 
15 |   trait IsNotSubclassOf[A, B]
16 |   implicit def nsub[A, B]: A IsNotSubclassOf B = null
17 |   implicit def nsubAmbiguity1[A, B >: A]: A IsNotSubclassOf B = null
18 |   implicit def nsubAmbiguity2[A, B >: A]: A IsNotSubclassOf B = null
19 | 
20 |   type ¬[A] = A => Nothing
21 |   type λ[A] = ¬[¬[A]]
22 | 
23 |   /**
24 |    * Example of how disjunction can be used:
25 |    * {{{
26 |    * scala> import com.datastax.spark.connector.util.MagicalTypeTricks._
27 |    * import com.datastax.spark.connector.util.MagicalTypeTricks._
28 |    *
29 |    * scala> def function[T](t: T)(implicit ev: (λ[T] <:< (Int ∪ String))) = { println(s"t = $t") }
30 |    * function: [T](t: T)(implicit ev: <:<[(T => Nothing) => Nothing,Int => Nothing with String => Nothing => Nothing])Unit
31 |    *
32 |    * scala> function(5)
33 |    * t = 5
34 |    *
35 |    * scala> function("five")
36 |    * t = five
37 |    *
38 |    * scala> function(5d)
39 |    * <console>:13: error: Cannot prove that (Double => Nothing) => Nothing <:< Int => Nothing with String => Nothing => Nothing.
40 |    * function(5d)
41 |    * ^
42 |    * }}}
43 |    *
44 |    * Based on [[http://www.chuusai.com/2011/06/09/scala-union-types-curry-howard/ this article]].
45 |    */
46 |   type ∪[T, U] = ¬[¬[T] with ¬[U]]
47 | 
48 | }
49 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/ObjectSizeEstimator.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.writer
 2 | 
 3 | import java.io.{OutputStream, ObjectOutputStream}
 4 | import java.nio.ByteBuffer
 5 | 
 6 | import scala.collection.JavaConversions._
 7 | 
 8 | import org.apache.cassandra.utils.ByteBufferUtil
 9 | 
10 | 
11 | /** Estimates amount of memory required to serialize Java/Scala objects */
12 | object ObjectSizeEstimator {
13 | 
14 |   private def makeSerializable(obj: Any): AnyRef = {
15 |     obj match {
16 |       case bb: ByteBuffer => ByteBufferUtil.getArray(bb)
17 |       case list: java.util.List[_] => list.map(makeSerializable)
18 |       case list: List[_] => list.map(makeSerializable)
19 |       case set: java.util.Set[_] => set.map(makeSerializable)
20 |       case set: Set[_] => set.map(makeSerializable)
21 |       case map: java.util.Map[_, _] => map.map { case (k, v) => (makeSerializable(k), makeSerializable(v)) }
22 |       case map: Map[_, _] => map.map { case (k, v) => (makeSerializable(k), makeSerializable(v)) }
23 |       case other => other.asInstanceOf[AnyRef]
24 |     }
25 |   }
26 | 
27 |   /** Records only how many bytes were written but the actual data is discarded */
28 |   private class CountingOutputStream extends OutputStream {
29 |     private var _length = 0
30 |     override def write(b: Int) = _length += 1
31 |     override def write(b: Array[Byte]) = _length += b.length
32 |     override def write(b: Array[Byte], off: Int, len: Int) = _length += len
33 |     def length = _length
34 |   }
35 | 
36 |   /** Serializes passed objects and reports their total size */
37 |   def measureSerializedSize(objects: Seq[Any]): Int = {
38 |     val countingStream = new CountingOutputStream
39 |     val objectStream = new ObjectOutputStream(countingStream)
40 |     for (obj <- objects)
41 |       objectStream.writeObject(makeSerializable(obj))
42 |     objectStream.close()
43 |     countingStream.length
44 |   }
45 | 
46 | }
47 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/cql/PreparedStatementCache.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.cql
 2 | 
 3 | import com.datastax.driver.core.{RegularStatement, Session, Cluster, PreparedStatement}
 4 | import com.datastax.spark.connector.util.Logging
 5 | 
 6 | import scala.collection.concurrent.TrieMap
 7 | 
 8 | /** Caches prepared statements so they are not prepared
 9 |   * multiple times by different threads. */
10 | object PreparedStatementCache extends Logging {
11 | 
12 |   private val clusterCache = 
13 |     TrieMap[Cluster, TrieMap[String, PreparedStatement]]()
14 | 
15 |   private def get(cluster: Cluster, query: String): Option[PreparedStatement] =
16 |     for (statementCache <- clusterCache.get(cluster);
17 |          statement <- statementCache.get(query)) yield statement
18 | 
19 |   private def put(cluster: Cluster, query: String, statement: PreparedStatement): PreparedStatement = {
20 |     clusterCache.get(cluster) match {
21 |       case Some(statementCache) => statementCache.put(query, statement)
22 |       case None => clusterCache.put(cluster, TrieMap(query -> statement))
23 |     }
24 |     statement
25 |   }
26 | 
27 |   /** Removes all statements associated with the `Cluster` from the cache. */
28 |   def remove(cluster: Cluster) {
29 |     synchronized {
30 |       clusterCache.remove(cluster)
31 |     }
32 |   }
33 | 
34 |   /** Retrieves a `PreparedStatement` from cache or
35 |     * creates a new one if not found and updates the cache. */
36 |   def prepareStatement(session: Session, query: RegularStatement): PreparedStatement = {
37 |     val cluster = session.getCluster
38 |     get(cluster, query.toString) match {
39 |       case Some(stmt) => stmt
40 |       case None =>
41 |         synchronized {
42 |           get(cluster, query.toString) match {
43 |             case Some(stmt) => stmt
44 |             case None =>
45 |               val stmt = session.prepare(query)
46 |               put(cluster, query.toString, stmt)
47 |           }
48 |         }
49 |     }
50 |   }
51 | 
52 | }
53 | 


--------------------------------------------------------------------------------
/doc/3_selection.md:
--------------------------------------------------------------------------------
 1 | # Documentation
 2 | ## Server-side data selection and filtering
 3 | 
 4 | In this section, you'll learn how to reduce the amount of data transferred from Cassandra to Spark
 5 | to speed up processing.
 6 | 
 7 | ### Selecting a subset of columns
 8 | 
 9 | For performance reasons, you should not fetch columns you don't need. You can achieve this with the `select` method.
10 | 
11 | ```scala
12 | sc.cassandraTable("test", "users").select("username").toArray.foreach(println)
13 | // CassandraRow{username: noemail} 
14 | // CassandraRow{username: someone}
15 | ```
16 | 
17 | The `select` method can be chained. Every next call can be used to select a subset of columns already selected.
18 | Selecting a non-existing column would result in throwing an exception.
19 | 
20 | ### Filtering rows
21 | 
22 | To filter rows, you can use the filter transformation provided by Spark. 
23 | However, this approach causes all rows to be fetched from Cassandra and then filtered by Spark. 
24 | Also, some CPU cycles are wasted serializing and deserializing objects that wouldn't be 
25 | included in the result. To avoid this overhead, `CassandraRDD` offers the `where` method, which lets you pass 
26 | arbitrary CQL condition(s) to filter the row set on the server.
27 | 
28 | ```scala
29 | sc.cassandraTable("test", "cars").select("id", "model").where("color = ?", "black").toArray.foreach(println)
30 | // CassandraRow[id: KF-334L, model: Ford Mondeo]
31 | // CassandraRow[id: MT-8787, model: Hyundai x35]
32 | 
33 | sc.cassandraTable("test", "cars").select("id", "model").where("color = ?", "silver").toArray.foreach(println)
34 | // CassandraRow[id: WX-2234, model: Toyota Yaris]
35 | ```
36 | 
37 | Note: Although the `ALLOW FILTERING` clause is implicitly added to the generated CQL query, not all predicates 
38 | are currently allowed by the Cassandra engine. This limitation is going to be addressed in the future 
39 | Cassandra releases. Currently, `ALLOW FILTERING` works well 
40 | with columns indexed by secondary indexes or clustering columns.  
41 | 
42 | 
43 | [Next - Working with user-defined case classes and tuples](4_mapper.md)


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/reader/ValueRowReader.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.rdd.reader
 2 | 
 3 | import com.datastax.driver.core.{ProtocolVersion, Row}
 4 | import com.datastax.spark.connector._
 5 | import com.datastax.spark.connector.cql.TableDef
 6 | import com.datastax.spark.connector.types.TypeConverter
 7 | import com.datastax.spark.connector.util.JavaApiHelper
 8 | 
 9 | class ValueRowReader[T: TypeConverter](columnRef: ColumnRef) extends RowReader[T] {
10 | 
11 |   private val converter = implicitly[TypeConverter[T]]
12 | 
13 |   /** Reads column values from low-level `Row` and turns them into higher level representation.
14 |     * @param row row fetched from Cassandra
15 |     * @param columnNames column names available in the `row` */
16 |   override def read(row: Row, columnNames: Array[String], protocolVersion: ProtocolVersion): T = {
17 |     columnRef match {
18 |       case ColumnIndex(idx) => converter.convert(AbstractRow.get(row, idx, protocolVersion))
19 |       case NamedColumnRef(_, selectedAs) => converter.convert(AbstractRow.get(row, selectedAs, protocolVersion))
20 |     }
21 |   }
22 | 
23 |   /** List of columns this `RowReader` is going to read.
24 |     * Useful to avoid fetching the columns that are not needed. */
25 |   override def columnNames: Option[Seq[String]] = columnRef match {
26 |     case NamedColumnRef(_, selectedAs) => Some(Seq(selectedAs))
27 |     case _ => None
28 |   }
29 | 
30 |   /** The number of columns that need to be fetched from C*. */
31 |   override def requiredColumns: Option[Int] = columnRef match {
32 |     case ColumnIndex(idx) => Some(idx)
33 |     case _ => None
34 |   }
35 | 
36 |   override def consumedColumns: Option[Int] = Some(1)
37 | }
38 | 
39 | class ValueRowReaderFactory[T: TypeConverter]
40 |   extends RowReaderFactory[T] {
41 | 
42 |   override def rowReader(table: TableDef, options: RowReaderOptions): RowReader[T] = {
43 |     new ValueRowReader[T](ColumnIndex(options.offset))
44 |   }
45 | 
46 |   override def targetClass: Class[T] = JavaApiHelper.getRuntimeClass(implicitly[TypeConverter[T]].targetTypeTag)
47 | }
48 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/WriteOption.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.writer
 2 | 
 3 | import java.util.Date
 4 | 
 5 | import org.apache.spark.streaming.{Duration => SparkDuration}
 6 | import org.joda.time.{DateTime, Duration => JodaDuration}
 7 | 
 8 | import scala.concurrent.duration.{Duration => ScalaDuration}
 9 | 
10 | sealed trait WriteOption[+T]
11 | 
12 | sealed trait TTLOption extends WriteOption[Int]
13 | 
14 | sealed trait TimestampOption extends WriteOption[Long]
15 | 
16 | case class StaticWriteOption[T](value: T) extends WriteOption[T]
17 | 
18 | case class PerRowWriteOption[T](placeholder: String) extends WriteOption[T]
19 | 
20 | object TTLOption {
21 | 
22 |   case object auto extends TTLOption
23 | 
24 |   def forever: TTLOption = new StaticWriteOption[Int](0) with TTLOption
25 | 
26 |   def constant(ttl: Int): TTLOption = {
27 |     require(ttl > 0, "Explicitly specified TTL must be greater than zero.")
28 |     new StaticWriteOption[Int](ttl) with TTLOption
29 |   }
30 | 
31 |   def constant(ttl: SparkDuration): TTLOption = constant((ttl.milliseconds / 1000L).toInt)
32 | 
33 |   def constant(ttl: JodaDuration): TTLOption = constant(ttl.getStandardSeconds.toInt)
34 | 
35 |   def constant(ttl: ScalaDuration): TTLOption = if (ttl.isFinite()) constant(ttl.toSeconds.toInt) else forever
36 | 
37 |   def perRow(placeholder: String): TTLOption =
38 |     new PerRowWriteOption[Int](placeholder) with TTLOption
39 | 
40 | }
41 | 
42 | object TimestampOption {
43 | 
44 |   case object auto extends TimestampOption
45 | 
46 |   def constant(microseconds: Long): TimestampOption = {
47 |     require(microseconds > 0, "Explicitly specified time must be greater than zero.")
48 |     new StaticWriteOption[Long](microseconds) with TimestampOption
49 |   }
50 | 
51 |   def constant(timestamp: Date): TimestampOption = constant(timestamp.getTime * 1000L)
52 | 
53 |   def constant(timestamp: DateTime): TimestampOption = constant(timestamp.getMillis * 1000L)
54 | 
55 |   def perRow(placeholder: String): TimestampOption =
56 |     new PerRowWriteOption[Long](placeholder) with TimestampOption
57 | }


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/mapper/JavaBeanColumnMapper.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.mapper
 2 | 
 3 | import java.lang.reflect.Method
 4 | 
 5 | import com.datastax.spark.connector.cql.TableDef
 6 | 
 7 | import scala.reflect.ClassTag
 8 | 
 9 | class JavaBeanColumnMapper[T : ClassTag](columnNameOverride: Map[String, String] = Map.empty) extends ReflectionColumnMapper[T] {
10 | 
11 |   import com.datastax.spark.connector.mapper.JavaBeanColumnMapper._
12 | 
13 |   override def classTag: ClassTag[T] = implicitly[ClassTag[T]]
14 | 
15 |   private def propertyName(accessorName: String) = {
16 |     val AccessorRegex(_, strippedName) = accessorName
17 |     strippedName(0).toLower + strippedName.substring(1)
18 |   }
19 | 
20 |   override protected def isGetter(method: Method): Boolean =
21 |     GetterRegex.findFirstMatchIn(method.getName).isDefined &&
22 |     method.getParameterTypes.size == 0 &&
23 |     method.getReturnType != Void.TYPE
24 | 
25 |   override protected def isSetter(method: Method): Boolean =
26 |     SetterRegex.findFirstMatchIn(method.getName).isDefined &&
27 |     method.getParameterTypes.size == 1 &&
28 |     method.getReturnType == Void.TYPE
29 | 
30 |   override protected def getterToColumnName(getterName: String, tableDef: TableDef) = {
31 |     val p = propertyName(getterName)
32 |     columnNameOverride.getOrElse(p, columnNameForProperty(p, tableDef))
33 |   }
34 | 
35 |   override protected def setterToColumnName(setterName: String, tableDef: TableDef) = {
36 |     val p = propertyName(setterName)
37 |     columnNameOverride.getOrElse(p, columnNameForProperty(p, tableDef))
38 |   }
39 | 
40 |   override protected def constructorParamToColumnName(paramName: String, tableDef: TableDef) = {
41 |     columnNameOverride.getOrElse(paramName, columnNameForProperty(paramName, tableDef))
42 |   }
43 | 
44 |   /** Java Beans allow nulls in property values */
45 |   override protected def allowsNull = true
46 | }
47 | 
48 | object JavaBeanColumnMapper {
49 |   val GetterRegex = "^(get|is)(.+)$".r
50 |   val SetterRegex = "^(set)(.+)$".r
51 |   val AccessorRegex = "^(get|is|set)(.+)$".r
52 | }


--------------------------------------------------------------------------------
/spark-cassandra-connector-java/src/main/java/com/datastax/spark/connector/japi/GenericJavaRowReaderFactory.java:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.japi;
 2 | 
 3 | import com.datastax.driver.core.ProtocolVersion;
 4 | import com.datastax.driver.core.Row;
 5 | import com.datastax.spark.connector.cql.TableDef;
 6 | import com.datastax.spark.connector.rdd.reader.RowReader;
 7 | import com.datastax.spark.connector.rdd.reader.RowReaderFactory;
 8 | import com.datastax.spark.connector.rdd.reader.RowReaderOptions;
 9 | import scala.Option;
10 | import scala.collection.Seq;
11 | 
12 | public class GenericJavaRowReaderFactory {
13 |     public final static RowReaderFactory<CassandraRow> instance = new RowReaderFactory<CassandraRow>() {
14 |         @Override
15 |         public RowReader<CassandraRow> rowReader(TableDef table, RowReaderOptions options) {
16 |             return JavaRowReader.instance;
17 |         }
18 | 
19 |         @Override
20 |         public RowReaderOptions rowReader$default$2() {
21 |             return new RowReaderOptions(RowReaderOptions.apply$default$1());
22 |         }
23 | 
24 |         @Override
25 |         public Class<CassandraRow> targetClass() {
26 |             return CassandraRow.class;
27 |         }
28 |     };
29 | 
30 | 
31 |     public static class JavaRowReader implements RowReader<CassandraRow> {
32 |         public final static JavaRowReader instance = new JavaRowReader();
33 | 
34 |         private JavaRowReader() {
35 |         }
36 | 
37 |         @Override
38 |         public CassandraRow read(Row row, String[] columnNames, ProtocolVersion protocolVersion) {
39 |             assert row.getColumnDefinitions().size() == columnNames.length :
40 |                     "Number of columns in a row must match the number of columns in the table metadata";
41 |             return CassandraRow$.MODULE$.fromJavaDriverRow(row, columnNames, protocolVersion);
42 |         }
43 | 
44 |         @Override
45 |         public Option<Seq<String>> columnNames() {
46 |             return Option.empty();
47 |         }
48 | 
49 |         @Override
50 |         public Option<Object> requiredColumns() {
51 |             return Option.empty();
52 |         }
53 | 
54 |         @Override
55 |         public Option<Object> consumedColumns() {
56 |             return Option.empty();
57 |         }
58 |     }
59 | 
60 | }
61 | 


--------------------------------------------------------------------------------
/sbt/sbt:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #
 4 | # Licensed to the Apache Software Foundation (ASF) under one or more
 5 | # contributor license agreements.  See the NOTICE file distributed with
 6 | # this work for additional information regarding copyright ownership.
 7 | # The ASF licenses this file to You under the Apache License, Version 2.0
 8 | # (the "License"); you may not use this file except in compliance with
 9 | # the License.  You may obtain a copy of the License at
10 | #
11 | #    http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | 
20 | # This script launches sbt for this project. If present it uses the system 
21 | # version of sbt. If there is no system version of sbt it attempts to download
22 | # sbt locally.
23 | SBT_VERSION=0.13.1
24 | URL1=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
25 | URL2=http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
26 | JAR=sbt/sbt-launch-${SBT_VERSION}.jar
27 | 
28 | # Download sbt launch jar if it hasn't been downloaded yet
29 | if [ ! -f ${JAR} ]; then
30 |   # Download
31 |   printf "Attempting to fetch sbt\n"
32 |   JAR_DL=${JAR}.part
33 |   if hash curl 2>/dev/null; then
34 |     (curl --progress-bar ${URL1} > ${JAR_DL} || curl --progress-bar ${URL2} > ${JAR_DL}) && mv ${JAR_DL} ${JAR}
35 |   elif hash wget 2>/dev/null; then
36 |     (wget --progress=bar ${URL1} -O ${JAR_DL} || wget --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR}
37 |   else
38 |     printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n"
39 |     exit -1
40 |   fi
41 | fi
42 | if [ ! -f ${JAR} ]; then
43 |   # We failed to download
44 |   printf "Our attempt to download sbt locally to ${JAR} failed. Please install sbt manually from http://www.scala-sbt.org/\n"
45 |   exit -1
46 | fi
47 | printf "Launching sbt from ${JAR}\n"
48 | java \
49 |   -Xmx1200m -XX:MaxPermSize=350m -XX:ReservedCodeCacheSize=256m \
50 |   -jar ${JAR} \
51 |   "$@"
52 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/AsyncExecutor.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.writer
 2 | 
 3 | import java.util.concurrent.Semaphore
 4 | import java.util.concurrent.atomic.AtomicInteger
 5 | 
 6 | import com.datastax.spark.connector.util.Logging
 7 | import com.google.common.util.concurrent.{FutureCallback, Futures, ListenableFuture, SettableFuture}
 8 | 
 9 | import scala.collection.concurrent.TrieMap
10 | import scala.util.Try
11 | 
12 | /** Asynchronously executes tasks but blocks if the limit of unfinished tasks is reached. */
13 | class AsyncExecutor[T, R](asyncAction: T => ListenableFuture[R], maxConcurrentTasks: Int) extends Logging {
14 | 
15 |   private val _successCount = new AtomicInteger(0)
16 |   private val _failureCount = new AtomicInteger(0)
17 | 
18 |   private val semaphore = new Semaphore(maxConcurrentTasks)
19 |   private val pendingFutures = new TrieMap[ListenableFuture[R], Boolean]
20 | 
21 |   /** Executes task asynchronously or blocks if more than `maxConcurrentTasks` limit is reached */
22 |   def executeAsync(task: T): ListenableFuture[R] = {
23 |     semaphore.acquire()
24 | 
25 |     val settable = SettableFuture.create[R]()
26 |     pendingFutures.put(settable, true)
27 | 
28 |     val future = asyncAction(task)
29 | 
30 |     Futures.addCallback(future, new FutureCallback[R] {
31 |       def release() {
32 |         semaphore.release()
33 |         pendingFutures.remove(settable)
34 |       }
35 |       def onSuccess(result: R) {
36 |         _successCount.incrementAndGet()
37 |         release()
38 |         settable.set(result)
39 |       }
40 |       def onFailure(throwable: Throwable) {
41 |         logError("Failed to execute: " + task, throwable)
42 |         _failureCount.incrementAndGet()
43 |         release()
44 |         settable.setException(throwable)
45 |       }
46 |     })
47 | 
48 |     settable
49 |   }
50 | 
51 |   /** Waits until the tasks being currently executed get completed.     
52 |     * It will not wait for tasks scheduled for execution during this method call,
53 |     * nor tasks for which the [[executeAsync]] method did not complete. */
54 |   def waitForCurrentlyExecutingTasks() {
55 |     for ((future, _) <- pendingFutures.snapshot())
56 |       Try(future.get())
57 |   }
58 | 
59 |   def successCount = _successCount.get()
60 |   def failureCount = _failureCount.get()
61 | 
62 | }
63 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector-embedded/src/main/scala/com/datastax/spark/connector/embedded/Assertions.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.embedded
 2 | 
 3 | import scala.annotation.tailrec
 4 | import scala.concurrent.duration._
 5 | 
 6 | /**
 7 |  * Simple helper assertions. Some stolen from Akka akka.testkit.TestKit.scala for now.
 8 |  */
 9 | trait Assertions {
10 | 
11 |   /** Obtain current time (`System.nanoTime`) as Duration. */
12 |   def now: FiniteDuration = System.nanoTime.nanos
13 | 
14 |   private var end: Duration = Duration.Undefined
15 | 
16 |   /**
17 |    * Obtain time remaining for execution of the innermost enclosing `within`
18 |    * block or missing that it returns the properly dilated default for this
19 |    * case from settings (key "akka.test.single-expect-default").
20 |    */
21 |   def remainingOrDefault = remainingOr(1.seconds.dilated)
22 | 
23 |   /**
24 |    * Obtain time remaining for execution of the innermost enclosing `within`
25 |    * block or missing that it returns the given duration.
26 |    */
27 |   def remainingOr(duration: FiniteDuration): FiniteDuration = end match {
28 |     case x if x eq Duration.Undefined => duration
29 |     case x if !x.isFinite             => throw new IllegalArgumentException("`end` cannot be infinite")
30 |     case f: FiniteDuration            => f - now
31 |   }
32 | 
33 |   /**
34 |    * Await until the given condition evaluates to `true` or the timeout
35 |    * expires, whichever comes first.
36 |    * If no timeout is given, take it from the innermost enclosing `within`
37 |    * block.
38 |    */
39 |   def awaitCond(p: => Boolean, max: Duration = 3.seconds, interval: Duration = 100.millis, message: String = "") {
40 |     val _max = remainingOrDilated(max)
41 |     val stop = now + _max
42 | 
43 |     @tailrec
44 |     def poll(t: Duration) {
45 |       if (!p) {
46 |         assert(now < stop, s"timeout ${_max} expired: $message")
47 |         Thread.sleep(t.toMillis)
48 |         poll((stop - now) min interval)
49 |       }
50 |     }
51 | 
52 |     poll(_max min interval)
53 |   }
54 | 
55 |   private def remainingOrDilated(max: Duration): FiniteDuration = max match {
56 |     case x if x eq Duration.Undefined => remainingOrDefault
57 |     case x if !x.isFinite             => throw new IllegalArgumentException("max duration cannot be infinite")
58 |     case f: FiniteDuration            => f.dilated
59 |   }
60 | }
61 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/writer/WriteConfTest.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.writer
 2 | 
 3 | import com.datastax.driver.core.ConsistencyLevel
 4 | import com.datastax.spark.connector.{RowsInBatch, BytesInBatch}
 5 | import org.apache.spark.SparkConf
 6 | import org.scalatest.{FlatSpec, Matchers}
 7 | 
 8 | class WriteConfTest extends FlatSpec with Matchers {
 9 | 
10 |   "WriteConf" should "be configured with proper defaults" in {
11 |     val conf = new SparkConf(false)
12 |     val writeConf = WriteConf.fromSparkConf(conf)
13 | 
14 |     writeConf.batchSize should be(BytesInBatch(WriteConf.DefaultBatchSizeInBytes))
15 |     writeConf.consistencyLevel should be(WriteConf.DefaultConsistencyLevel)
16 |     writeConf.parallelismLevel should be(WriteConf.DefaultParallelismLevel)
17 |   }
18 | 
19 |   it should "allow to set consistency level" in {
20 |     val conf = new SparkConf(false)
21 |       .set("spark.cassandra.output.consistency.level", "THREE")
22 |     val writeConf = WriteConf.fromSparkConf(conf)
23 | 
24 |     writeConf.consistencyLevel should be(ConsistencyLevel.THREE)
25 |   }
26 | 
27 |   it should "allow to set parallelism level" in {
28 |     val conf = new SparkConf(false)
29 |       .set("spark.cassandra.output.concurrent.writes", "17")
30 |     val writeConf = WriteConf.fromSparkConf(conf)
31 | 
32 |     writeConf.parallelismLevel should be(17)
33 |   }
34 | 
35 |   it should "allow to set batch size in bytes" in {
36 |     val conf = new SparkConf(false)
37 |       .set("spark.cassandra.output.batch.size.bytes", "12345")
38 |     val writeConf = WriteConf.fromSparkConf(conf)
39 | 
40 |     writeConf.batchSize should be(BytesInBatch(12345))
41 |   }
42 | 
43 |   it should "allow to set batch size in bytes when rows are set to auto" in {
44 |     val conf = new SparkConf(false)
45 |       .set("spark.cassandra.output.batch.size.bytes", "12345")
46 |       .set("spark.cassandra.output.batch.size.rows", "auto")
47 |     val writeConf = WriteConf.fromSparkConf(conf)
48 | 
49 |     writeConf.batchSize should be(BytesInBatch(12345))
50 |   }
51 | 
52 |   it should "allow to set batch size in rows" in {
53 |     val conf = new SparkConf(false)
54 |       .set("spark.cassandra.output.batch.size.rows", "12345")
55 |     val writeConf = WriteConf.fromSparkConf(conf)
56 | 
57 |     writeConf.batchSize should be(RowsInBatch(12345))
58 |   }
59 | }
60 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/org/apache/spark/sql/cassandra/CassandraCatalog.scala:
--------------------------------------------------------------------------------
 1 | package org.apache.spark.sql.cassandra
 2 | 
 3 | import java.io.IOException
 4 | import java.util.concurrent.TimeUnit
 5 | 
 6 | import com.datastax.spark.connector.cql.{CassandraConnector, Schema}
 7 | import com.google.common.cache.{CacheBuilder, CacheLoader}
 8 | import org.apache.spark.sql.catalyst.analysis.Catalog
 9 | import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Subquery}
10 | 
11 | private[cassandra] class CassandraCatalog(cc: CassandraSQLContext) extends Catalog {
12 | 
13 |   val caseSensitive: Boolean = true
14 | 
15 |   val schemas = CacheBuilder.newBuilder
16 |        .maximumSize(100)
17 |        .expireAfterWrite(cc.conf.getLong("schema.expire.in.minutes", 10), TimeUnit.MINUTES)
18 |        .build(
19 |           new CacheLoader[String, Schema] {
20 |             def load(cluster: String) : Schema = {
21 |               Schema.fromCassandra(CassandraConnector(cc.conf))
22 |             }
23 |           })
24 | 
25 |   override def lookupRelation(
26 |     databaseName: Option[String],
27 |     tableName: String,
28 |     alias: Option[String] = None): LogicalPlan = {
29 | 
30 |     lazy val defaultDatabase = databaseName.getOrElse(cc.getKeyspace)
31 |     val defaultCluster = "default"
32 |     val (cluster, database, table) = tableName.split("\\.") match {
33 |       case Array(t)       => (defaultCluster, defaultDatabase, t)
34 |       case Array(d, t)    => (defaultCluster, d, t)
35 |       case Array(c, d, t) => (c, d, t)
36 |       case _              => throw new IOException(s"Wrong table name: $tableName")
37 |     }
38 | 
39 |     val schema = schemas.get(cluster)
40 |     val keyspaceDef = schema.keyspaceByName.getOrElse(database, throw new IOException(s"Keyspace not found: $database"))
41 |     val tableDef = keyspaceDef.tableByName.getOrElse(table, throw new IOException(s"Table not found: $database.$table"))
42 |     val tableWithQualifiers = Subquery(table, CassandraRelation(tableDef, alias)(cc))
43 |     alias.map(a => Subquery(a, tableWithQualifiers)).getOrElse(tableWithQualifiers)
44 |   }
45 | 
46 |   override def registerTable(databaseName: Option[String], tableName: String, plan: LogicalPlan): Unit = ???
47 | 
48 |   override def unregisterTable(databaseName: Option[String], tableName: String): Unit = ???
49 | 
50 |   override def unregisterAllTables(): Unit = ???
51 | }
52 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/writer/WritableToCassandra.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.writer
 2 | 
 3 | import com.datastax.spark.connector.ColumnSelector
 4 | import com.datastax.spark.connector.cql.CassandraConnector
 5 | import org.apache.spark.SparkContext
 6 | 
 7 | abstract class WritableToCassandra[T] {
 8 | 
 9 |   def sparkContext: SparkContext
10 | 
11 |   private[connector] lazy val connector = CassandraConnector(sparkContext.getConf)
12 | 
13 |   /**
14 |    * Saves the data from `RDD` to a Cassandra table.
15 |    * By default, it saves all properties that have corresponding Cassandra columns.
16 |    *
17 |    * Example:
18 |    * {{{
19 |    *   CREATE KEYSPACE test WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1 };
20 |    *   CREATE TABLE test.words(word VARCHAR PRIMARY KEY, count INT, other VARCHAR);
21 |    * }}}
22 |    *
23 |    * {{{
24 |    *   case class WordCount(word: String, count: Int, other: String)
25 |    *   val rdd = sc.parallelize(Seq(WordCount("foo", 5, "bar")))
26 |    * }}}
27 |    *
28 |    * By default, the underlying RDD class must provide data for all columns:
29 |    * {{{
30 |    *   rdd.saveToCassandra("test", "words")
31 |    * }}}
32 |    *
33 |    * By default, writes are performed at ConsistencyLevel.ONE in order to leverage data-locality and minimize network traffic.
34 |    * This write consistency level is controlled by the following property:
35 |    *   - spark.cassandra.output.consistency.level: consistency level for RDD writes, string matching the ConsistencyLevel enum name.
36 |    *
37 |    * @param keyspaceName the name of the Keyspace to use
38 |    * @param tableName the name of the Table to use
39 |    * @param columnNames The list of column names to save data to.
40 |    *                Uses only the unique column names, and you must select at least all primary key
41 |    *                columns. All other fields are discarded. Non-selected property/column names are left unchanged.
42 |    * @param writeConf additional configuration object allowing to set consistency level, batch size, etc.
43 |    */
44 |   def saveToCassandra(keyspaceName: String,
45 |                       tableName: String,
46 |                       columnNames: ColumnSelector,
47 |                       writeConf: WriteConf)
48 |                      (implicit connector: CassandraConnector, rwf: RowWriterFactory[T])
49 | 
50 | }
51 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/org/apache/spark/sql/cassandra/CassandraSQLRow.scala:
--------------------------------------------------------------------------------
 1 | package org.apache.spark.sql.cassandra
 2 | 
 3 | import com.datastax.driver.core.{Row, ProtocolVersion}
 4 | import com.datastax.spark.connector.AbstractRow
 5 | import com.datastax.spark.connector.rdd.reader.{ThisRowReaderAsFactory, RowReader}
 6 | import com.datastax.spark.connector.types.TypeConverter
 7 | import org.apache.spark.sql.catalyst.expressions.{Row => SparkRow}
 8 | 
 9 | final class CassandraSQLRow(data: IndexedSeq[AnyRef], columnNames: IndexedSeq[String])
10 |   extends AbstractRow(data, columnNames) with SparkRow with Serializable {
11 | 
12 |   private[spark] def this() = this(null, null) // required by Kryo for deserialization :(
13 | 
14 | 
15 |   /** Generic getter for getting columns of any type.
16 |     * Looks the column up by its index. First column starts at index 0. */
17 |   private def get[T](index: Int)(implicit c: TypeConverter[T]): T =
18 |     c.convert(data(index))
19 | 
20 |   override def apply(i: Int) = data(i)
21 |   override def copy() = this // immutable
22 |   override def size = super.size
23 | 
24 |   override def getDouble(i: Int) = get[Double](i)
25 |   override def getFloat(i: Int) = get[Float](i)
26 |   override def getLong(i: Int) = get[Long](i)
27 |   override def getByte(i: Int) = get[Byte](i)
28 |   override def getBoolean(i: Int) = get[Boolean](i)
29 |   override def getShort(i: Int) = get[Short](i)
30 |   override def getInt(i: Int) = get[Int](i)
31 |   override def getString(i: Int) = get[String](i)
32 |   override def iterator = data.iterator
33 | }
34 | 
35 | 
36 | object CassandraSQLRow {
37 | 
38 |   def fromJavaDriverRow(row: Row, columnNames: Array[String], protocolVersion: ProtocolVersion): CassandraSQLRow = {
39 |     val data = new Array[Object](columnNames.length)
40 |     for (i <- 0 until columnNames.length)
41 |       data(i) = AbstractRow.get(row, i, protocolVersion)
42 |     new CassandraSQLRow(data, columnNames)
43 |   }
44 | 
45 |   implicit object CassandraSQLRowReader extends RowReader[CassandraSQLRow] with ThisRowReaderAsFactory[CassandraSQLRow] {
46 | 
47 |     override def read(row: Row, columnNames: Array[String], protocolVersion: ProtocolVersion): CassandraSQLRow =
48 |       fromJavaDriverRow(row, columnNames, protocolVersion)
49 | 
50 |     override def requiredColumns = None
51 |     override def columnNames = None
52 |     override def targetClass = classOf[CassandraSQLRow]
53 |   }
54 | }


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/util/ReflectionUtil.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.util
 2 | 
 3 | import scala.collection.concurrent.TrieMap
 4 | import scala.reflect.runtime.universe._
 5 | import scala.util.{Try, Success, Failure}
 6 | 
 7 | object ReflectionUtil {
 8 |   private val rm = runtimeMirror(getClass.getClassLoader)
 9 |   private val singletonCache = TrieMap[String, Any]()
10 | 
11 |   private def findScalaObject[T : TypeTag](objectName: String): Try[T] = {
12 |     Try {
13 |       val targetType = implicitly[TypeTag[T]].tpe
14 |       val module = rm.staticModule(objectName)
15 |       if (!(module.typeSignature <:< targetType))
16 |         throw new IllegalArgumentException(s"Object $objectName is not instance of $targetType")
17 | 
18 |       val moduleMirror = rm.reflectModule(module)
19 |       moduleMirror.instance.asInstanceOf[T]
20 |     }
21 |   }
22 | 
23 |   private def findSingletonClassInstance[T : TypeTag](className: String): Try[T] = {
24 |     Try {
25 |       val targetType = implicitly[TypeTag[T]].tpe
26 |       val targetClass = rm.runtimeClass(targetType.typeSymbol.asClass)
27 |       val instance =
28 |         singletonCache.get(className) match {
29 |           case Some(obj) => obj
30 |           case None =>
31 |             val newInstance = Class.forName(className).getConstructor(Array.empty[Class[_]]: _*).newInstance()
32 |             singletonCache.putIfAbsent(className, newInstance) match {
33 |               case None => newInstance
34 |               case Some(previousInstance) => previousInstance
35 |             }
36 |         }
37 | 
38 |       if (!targetClass.isInstance(instance))
39 |         throw new IllegalArgumentException(s"Class $className is not $targetType")
40 |       instance.asInstanceOf[T]
41 |     }
42 |   }
43 | 
44 |   /** Returns either a global Scala object by its fully qualified name or a singleton
45 |     * instance of a Java class identified by its fully qualified class name.
46 |     * Java class instances are cached. The Java class must provide a default constructor. */
47 |   def findGlobalObject[T : TypeTag](objectName: String): T = {
48 |     val scalaObject: Try[T] = findScalaObject[T](objectName)
49 |     val classInstance: Try[T] = findSingletonClassInstance[T](objectName)
50 |     scalaObject orElse classInstance match {
51 |       case Success(obj) => obj
52 |       case Failure(e) => throw new IllegalArgumentException(s"Singleton object not available: $objectName", e)
53 |     }
54 |   }
55 | }
56 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/cql/CassandraConnectorConf.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.cql
 2 | 
 3 | import java.net.InetAddress
 4 | 
 5 | import com.datastax.spark.connector.util.Logging
 6 | import org.apache.spark.SparkConf
 7 | import scala.util.control.NonFatal
 8 | 
 9 | /** Stores configuration of a connection to Cassandra.
10 |   * Provides information about cluster nodes, ports and optional credentials for authentication. */
11 | case class CassandraConnectorConf(
12 |   hosts: Set[InetAddress],
13 |   nativePort: Int = CassandraConnectorConf.DefaultNativePort,
14 |   rpcPort: Int = CassandraConnectorConf.DefaultRpcPort,
15 |   authConf: AuthConf = NoAuthConf,
16 |   connectionFactory: CassandraConnectionFactory = DefaultConnectionFactory)
17 | 
18 | /** A factory for `CassandraConnectorConf` objects.
19 |   * Allows for manually setting connection properties or reading them from `SparkConf` object.
20 |   * By embedding connection information in `SparkConf`, `SparkContext` can offer Cassandra specific methods
21 |   * which require establishing connections to a Cassandra cluster.*/
22 | object CassandraConnectorConf extends Logging {
23 | 
24 |   val DefaultRpcPort = 9160
25 |   val DefaultNativePort = 9042
26 |   
27 |   val CassandraConnectionHostProperty = "spark.cassandra.connection.host"
28 |   val CassandraConnectionRpcPortProperty = "spark.cassandra.connection.rpc.port"
29 |   val CassandraConnectionNativePortProperty = "spark.cassandra.connection.native.port"
30 | 
31 |   private def resolveHost(hostName: String): Option[InetAddress] = {
32 |     try Some(InetAddress.getByName(hostName))
33 |     catch {
34 |       case NonFatal(e) =>
35 |         logError(s"Unknown host '$hostName'", e)
36 |         None
37 |     }
38 |   }
39 | 
40 |   def apply(conf: SparkConf): CassandraConnectorConf = {
41 |     val hostsStr = conf.get(CassandraConnectionHostProperty, InetAddress.getLocalHost.getHostAddress)
42 |     val hosts = for {
43 |       hostName <- hostsStr.split(",").toSet[String]
44 |       hostAddress <- resolveHost(hostName)
45 |     } yield hostAddress
46 | 
47 |     val rpcPort = conf.getInt(CassandraConnectionRpcPortProperty, DefaultRpcPort)
48 |     val nativePort = conf.getInt(CassandraConnectionNativePortProperty, DefaultNativePort)
49 |     val authConf = AuthConf.fromSparkConf(conf)
50 |     val connectionFactory = CassandraConnectionFactory.fromSparkConf(conf)
51 |     CassandraConnectorConf(hosts, nativePort, rpcPort, authConf, connectionFactory)
52 |   }
53 | }
54 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/cql/RefCountMap.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.cql
 2 | 
 3 | import scala.collection.concurrent.TrieMap
 4 | import scala.annotation.tailrec
 5 | 
 6 | /** Atomically counts references to objects of any type */
 7 | class RefCountMap[T] {
 8 | 
 9 |   private val refCounts = new TrieMap[T, Int]
10 | 
11 |   /** Returns current reference count for the given key.
12 |     * This value may be constantly changing, so do not use it for synchronization purposes. */
13 |   final def get(key: T): Int =
14 |     refCounts.getOrElse(key, 0)
15 | 
16 |   /** Atomically increases reference count only if the reference counter is already greater than 0.
17 |     * @return true if reference counter was greater than zero and has been increased */
18 |   @tailrec
19 |   final def acquireIfNonZero(key: T): Int = {
20 |     refCounts.get(key) match {
21 |       case Some(count) if count > 0 =>
22 |         if (refCounts.replace(key, count, count + 1))
23 |           count + 1
24 |         else
25 |           acquireIfNonZero(key)
26 |       case _ =>
27 |         0
28 |     }
29 |   }
30 | 
31 |   /** Atomically increases reference count by one.
32 |     * @return reference count after increase */
33 |   @tailrec
34 |   final def acquire(key: T): Int = {
35 |     refCounts.get(key) match {
36 |       case Some(count) =>
37 |         if (refCounts.replace(key, count, count + 1))
38 |           count + 1
39 |         else
40 |           acquire(key)
41 |       case None =>
42 |         if (!refCounts.putIfAbsent(key, 1).isDefined)
43 |           1
44 |         else
45 |           acquire(key)
46 |     }
47 |   }
48 | 
49 |   /** Atomically decreases reference count by `n`.
50 |     * @return reference count after decrease
51 |     * @throws IllegalStateException if the reference count before decrease is less than `n` */
52 |   @tailrec
53 |   final def release(key: T, n: Int = 1): Int = {
54 |     refCounts.get(key) match {
55 |       case Some(count) if count > n =>
56 |         if (refCounts.replace(key, count, count - n))
57 |           count - n
58 |         else
59 |           release(key, n)
60 |       case Some(count) if count == n =>
61 |         if (refCounts.remove(key, n))
62 |           0
63 |         else
64 |           release(key, n)
65 |       case _ =>
66 |         throw new IllegalStateException("Release without acquire for key: " + key)
67 |     }
68 |   }
69 | 
70 |   /** Resets state of all counters to 0 */
71 |   def clear(): Unit = refCounts.clear()
72 | 
73 | }
74 | 


--------------------------------------------------------------------------------
/doc/5_saving.md:
--------------------------------------------------------------------------------
 1 | # Documentation
 2 | ## Saving datasets to Cassandra
 3 | 
 4 | It is possible to save any `RDD` to Cassandra, not just `CassandraRDD`. 
 5 | The only requirement is that the object class of `RDD` is a tuple or has property names 
 6 | corresponding to Cassandra column names. 
 7 | 
 8 | To save an `RDD`, import `com.datastax.spark.connector._` and call the `saveToCassandra` method with the
 9 | keyspace name, table name and a list of columns. Make sure to include at least all primary key columns.
10 |  
11 | ## Saving a collection of tuples
12 | 
13 | ```scala
14 | collection = sc.parallelize(Seq(("cat", 30), ("fox", 40)))
15 | collection.saveToCassandra("test", "words", SomeColumns("word", "count"))
16 | ```
17 |     
18 |     cqlsh:test> select * from words;
19 | 
20 |      word | count
21 |     ------+-------
22 |       bar |    20
23 |       foo |    10
24 |       cat |    30
25 |       fox |    40
26 | 
27 |     (4 rows)
28 |    
29 | ## Saving a collection of objects
30 | When saving a collection of objects of a user-defined class, the items to be saved
31 | must provide appropriately named public property accessors for getting every column
32 | to be saved. This example provides more information on property-column naming conventions is described [here](4_mapper.md).
33 | 
34 | ```scala
35 | case class WordCount(word: String, count: Long)
36 | collection = sc.parallelize(Seq(WordCount("dog", 50), WordCount("cow", 60)))    
37 | collection.saveToCassandra("test", "words", SomeColumns("word", "count"))
38 | ```
39 | 
40 |     cqlsh:test> select * from words;
41 | 
42 |      word | count
43 |     ------+-------
44 |       bar |    20
45 |       foo |    10
46 |       cat |    30
47 |       fox |    40
48 |       dog |    50
49 |       cow |    60
50 |       
51 | The driver will execute a CQL `INSERT` statement for every object in the `RDD`, 
52 | grouped in unlogged batches. The consistency level for writes is `ONE`. 
53 | 
54 | ## Tuning
55 | The following properties set in `SparkConf` can be used to fine-tune the saving process:
56 | 
57 |   - `spark.cassandra.output.batch.size.rows`: number of rows per single batch; default is 'auto' which means the connector 
58 |      will adjust the number of rows based on the amount of data in each row  
59 |   - `spark.cassandra.output.batch.size.bytes`: maximum total size of the batch in bytes; defaults to 16 kB.
60 |   - `spark.cassandra.output.concurrent.writes`: maximum number of batches executed in parallel by a single Spark task; defaults to 5
61 | 
62 | [Next - Customizing the object mapping](6_advanced_mapper.md)
63 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/partitioner/ServerSideTokenRangeSplitter.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.rdd.partitioner
 2 | 
 3 | import java.io.IOException
 4 | import java.net.InetAddress
 5 | 
 6 | import scala.collection.JavaConversions._
 7 | import scala.util.{Failure, Success, Try}
 8 | 
 9 | import org.apache.cassandra.thrift.CfSplit
10 | 
11 | import com.datastax.spark.connector.cql.CassandraConnector
12 | import com.datastax.spark.connector.rdd.partitioner.dht.{CassandraNode, Token, TokenFactory, TokenRange}
13 | import com.datastax.spark.connector.util.Logging
14 | 
15 | /** Delegates token range splitting to Cassandra server. */
16 | class ServerSideTokenRangeSplitter[V, T <: Token[V]](
17 |   connector: CassandraConnector,
18 |   keyspaceName: String,
19 |   tableName: String,
20 |   tokenFactory: TokenFactory[V, T])
21 |   extends TokenRangeSplitter[V, T] with Logging {
22 | 
23 |   private def unthriftify(cfSplit: CfSplit, endpoints: Set[CassandraNode]): TokenRange[V, T] = {
24 |     val left = tokenFactory.fromString(cfSplit.start_token)
25 |     val right = tokenFactory.fromString(cfSplit.end_token)
26 |     TokenRange(left, right, endpoints, Some(cfSplit.row_count))
27 |   }
28 | 
29 |   private def fetchSplits(range: TokenRange[V, T], endpoint: InetAddress, splitSize: Long): Seq[TokenRange[V, T]] = {
30 |     val startToken = tokenFactory.toString(range.start)
31 |     val endToken = tokenFactory.toString(range.end)
32 | 
33 |     connector.withCassandraClientDo(endpoint) {
34 |       client =>
35 |         client.set_keyspace(keyspaceName)
36 |         client
37 |           .describe_splits_ex(tableName, startToken, endToken, splitSize.toInt)
38 |           .map(unthriftify(_, range.endpoints))
39 |     }
40 |   }
41 | 
42 |   def split(range: TokenRange[V, T], splitSize: Long) = {
43 |     val fetchResults =
44 |       for (endpoint <- range.endpoints.toStream)
45 |       yield Try(fetchSplits(range, endpoint.rpcAddress, splitSize))
46 | 
47 |     fetchResults
48 |       .collectFirst { case Success(splits) => splits }
49 |       .getOrElse {
50 |         for (Failure(e) <- fetchResults)
51 |           logError("Failure while fetching splits from Cassandra", e)
52 |         if (range.endpoints.isEmpty)
53 |           throw new IOException(s"Failed to fetch splits of $range because there are no replicas for the keyspace in the current datacenter.")
54 |         else
55 |           throw new IOException(s"Failed to fetch splits of $range from all endpoints: ${range.endpoints.mkString(", ")}")
56 |       }
57 |   }
58 | }


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/package.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark
 2 | 
 3 | import org.apache.spark.SparkContext
 4 | import org.apache.spark.rdd.RDD
 5 | 
 6 | import scala.language.implicitConversions
 7 | import scala.reflect.ClassTag
 8 | 
 9 | /**
10 |  * The root package of Cassandra connector for Apache Spark.
11 |  * Offers handy implicit conversions that add Cassandra-specific methods to `SparkContext` and `RDD`.
12 |  *
13 |  * Call [[com.datastax.spark.connector.SparkContextFunctions#cassandraTable cassandraTable]] method on the `SparkContext` object
14 |  * to create a [[com.datastax.spark.connector.rdd.CassandraRDD CassandraRDD]] exposing Cassandra tables as Spark RDDs.
15 |  *
16 |  * Call [[com.datastax.spark.connector.RDDFunctions]] `saveToCassandra`
17 |  * function on any `RDD` to save distributed collection to a Cassandra table.
18 |  *
19 |  * Example:
20 |  * {{{
21 |  *   CREATE KEYSPACE test WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1 };
22 |  *   CREATE TABLE test.words (word text PRIMARY KEY, count int);
23 |  *   INSERT INTO test.words(word, count) VALUES ("and", 50);
24 |  * }}}
25 |  *
26 |  * {{{
27 |  *   import com.datastax.spark.connector._
28 |  *
29 |  *   val sparkMasterHost = "127.0.0.1"
30 |  *   val cassandraHost = "127.0.0.1"
31 |  *   val keyspace = "test"
32 |  *   val table = "words"
33 |  *
34 |  *   // Tell Spark the address of one Cassandra node:
35 |  *   val conf = new SparkConf(true).set("spark.cassandra.connection.host", cassandraHost)
36 |  *
37 |  *   // Connect to the Spark cluster:
38 |  *   val sc = new SparkContext("spark://" + sparkMasterHost + ":7077", "example", conf)
39 |  *
40 |  *   // Read the table and print its contents:
41 |  *   val rdd = sc.cassandraTable(keyspace, table)
42 |  *   rdd.toArray().foreach(println)
43 |  *
44 |  *   // Write two rows to the table:
45 |  *   val col = sc.parallelize(Seq(("of", 1200), ("the", "863")))
46 |  *   col.saveToCassandra(keyspace, table)
47 |  *
48 |  *   sc.stop()
49 |  * }}}
50 |  */
51 | package object connector {
52 | 
53 |   implicit def toSparkContextFunctions(sc: SparkContext): SparkContextFunctions =
54 |     new SparkContextFunctions(sc)
55 | 
56 |   implicit def toRDDFunctions[T : ClassTag](rdd: RDD[T]): RDDFunctions[T] =
57 |     new RDDFunctions[T](rdd)
58 | 
59 |   implicit class ColumnNameFunctions(val columnName: String) extends AnyVal {
60 |     def writeTime: WriteTime = WriteTime(columnName)
61 |     def ttl: TTL = TTL(columnName)
62 |   }
63 | 
64 |   implicit def toNamedColumnRef(columnName: String): NamedColumnRef = ColumnName(columnName)
65 | }
66 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/cql/SessionProxy.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.cql
 2 | 
 3 | import java.lang.reflect.{InvocationHandler, InvocationTargetException, Method, Proxy}
 4 | 
 5 | import com.datastax.driver.core.{RegularStatement, Session, SimpleStatement}
 6 | 
 7 | /** Wraps a `Session` and intercepts:
 8 |   *  - `close` method to invoke `afterClose` handler
 9 |   *  - `prepare` methods to cache `PreparedStatement` objects. */
10 | class SessionProxy(session: Session, afterClose: Session => Any) extends InvocationHandler {
11 | 
12 |   private var closed = false
13 | 
14 |   override def invoke(proxy: Any, method: Method, args: Array[AnyRef]) = {
15 |     try {
16 |       val StringClass = classOf[String]
17 |       val RegularStatementClass = classOf[String]
18 | 
19 |       (method.getName, method.getParameterTypes) match {
20 |         case ("close", Array()) =>
21 |           null
22 |         case ("closeUnderlying", Array()) =>
23 |           session.close()
24 |           null
25 |         case ("isClosed", Array()) =>
26 |           closed.asInstanceOf[AnyRef]
27 |         case ("prepare", Array(StringClass)) =>
28 |           PreparedStatementCache.prepareStatement(session, new SimpleStatement(args(0).asInstanceOf[String]))
29 |         case ("prepare", Array(RegularStatementClass)) =>
30 |           PreparedStatementCache.prepareStatement(session, args(0).asInstanceOf[RegularStatement])
31 |         case _ =>
32 |           try {
33 |             method.invoke(session, args: _*)
34 |           }
35 |           catch {
36 |             case e: InvocationTargetException =>
37 |               throw e.getCause
38 |           }
39 |       }
40 |     }
41 |     finally {
42 |       if (method.getName == "close" && !closed) {
43 |         closed = true
44 |         afterClose(session)
45 |       }
46 |     }
47 |   }
48 | }
49 | 
50 | object SessionProxy {
51 | 
52 |   /** Creates a new `SessionProxy` delegating to the given `Session`.
53 |     * The proxy adds prepared statement caching functionality. */
54 |   def wrap(session: Session): Session =
55 |     wrapWithCloseAction(session)(_ => ())
56 | 
57 |   /** Creates a new `SessionProxy` delegating to the given `Session`.
58 |     * Additionally registers a callback on `Session#close` method.
59 |     * @param afterClose code to be invoked after the session has been closed */
60 |   def wrapWithCloseAction(session: Session)(afterClose: Session => Any): Session =
61 |     Proxy.newProxyInstance(
62 |       session.getClass.getClassLoader,
63 |       Array(classOf[Session]),
64 |       new SessionProxy(session, afterClose)).asInstanceOf[Session]
65 | }


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/SparkContextFunctions.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector
 2 | 
 3 | import com.datastax.spark.connector.cql.CassandraConnector
 4 | import com.datastax.spark.connector.rdd.{ReadConf, ValidRDDType, CassandraRDD}
 5 | import com.datastax.spark.connector.rdd.reader.RowReaderFactory
 6 | import org.apache.spark.SparkContext
 7 | 
 8 | import scala.reflect.ClassTag
 9 | 
10 | /** Provides Cassandra-specific methods on `SparkContext` */
11 | class SparkContextFunctions(@transient val sc: SparkContext) extends Serializable {
12 | 
13 |   /** Returns a view of a Cassandra table as `CassandraRDD`.
14 |     * This method is made available on `SparkContext` by importing `com.datastax.spark.connector._`
15 |     *
16 |     * Depending on the type parameter passed to `cassandraTable`, every row is converted to one of the following:
17 |     *   - an [[CassandraRow]] object (default, if no type given)
18 |     *   - a tuple containing column values in the same order as columns selected by [[com.datastax.spark.connector.rdd.CassandraRDD#select CassandraRDD#select]]
19 |     *   - object of a user defined class, populated by appropriate [[com.datastax.spark.connector.mapper.ColumnMapper ColumnMapper]]
20 |     *
21 |     * Example:
22 |     * {{{
23 |     *   CREATE KEYSPACE test WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1 };
24 |     *   CREATE TABLE test.words (word text PRIMARY KEY, count int);
25 |     *   INSERT INTO test.words (word, count) VALUES ('foo', 20);
26 |     *   INSERT INTO test.words (word, count) VALUES ('bar', 20);
27 |     *   ...
28 |     * }}}
29 |     * {{{
30 |     *   // Obtaining RDD of CassandraRow objects:
31 |     *   val rdd1 = sc.cassandraTable("test", "words")
32 |     *   rdd1.first.getString("word")  // foo
33 |     *   rdd1.first.getInt("count")    // 20
34 |     *
35 |     *   // Obtaining RDD of tuples:
36 |     *   val rdd2 = sc.cassandraTable[(String, Int)]("test", "words").select("word", "count")
37 |     *   rdd2.first._1  // foo
38 |     *   rdd2.first._2  // 20
39 |     *
40 |     *   // Obtaining RDD of user defined objects:
41 |     *   case class WordCount(word: String, count: Int)
42 |     *   val rdd3 = sc.cassandraTable[WordCount]("test", "words")
43 |     *   rdd3.first.word  // foo
44 |     *   rdd3.first.count // 20
45 |     * }}}*/
46 |   def cassandraTable[T](keyspace: String, table: String)
47 |                        (implicit connector: CassandraConnector = CassandraConnector(sc.getConf),
48 |                         ct: ClassTag[T], rrf: RowReaderFactory[T],
49 |                         ev: ValidRDDType[T]) =
50 |     new CassandraRDD[T](sc, connector, keyspace, table, readConf = ReadConf.fromSparkConf(sc.getConf))
51 | }
52 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/mapper/DefaultColumnMapper.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.mapper
 2 | 
 3 | import java.lang.reflect.Method
 4 | 
 5 | import com.datastax.spark.connector.cql.TableDef
 6 | 
 7 | import scala.reflect.ClassTag
 8 | 
 9 | /** A [[ColumnMapper]] that assumes camel case naming convention for property accessors and constructor names
10 |   * and underscore naming convention for column names.
11 |   *
12 |   * Example mapping:
13 |   * {{{
14 |   *   case class User(
15 |   *     login: String,         // mapped to "login" column
16 |   *     emailAddress: String   // mapped to "email_address" column
17 |   *     emailAddress2: String  // mapped to "email_address_2" column
18 |   *   )
19 |   * }}}
20 |   *
21 |   * Additionally, it is possible to name columns exactly the same as property names (case-sensitive):
22 |   * {{{
23 |   *   case class TaxPayer(
24 |   *     TIN: String            // mapped to "TIN" column
25 |   *   )
26 |   * }}}
27 |   *
28 |   * @param columnNameOverride maps property names to column names; use it to override default mapping for some properties
29 |   */
30 | class DefaultColumnMapper[T : ClassTag](columnNameOverride: Map[String, String] = Map.empty) extends ReflectionColumnMapper[T] {
31 | 
32 |   import com.datastax.spark.connector.mapper.DefaultColumnMapper._
33 | 
34 |   override def classTag: ClassTag[T] = implicitly[ClassTag[T]]
35 | 
36 |   private def setterNameToPropertyName(str: String) =
37 |     str.substring(0, str.length - SetterSuffix.length)
38 | 
39 |   override def isGetter(method: Method) = {
40 |     method.getParameterTypes.size == 0 &&
41 |     method.getReturnType != Void.TYPE
42 |   }
43 | 
44 |   override def isSetter(method: Method) = {
45 |     method.getParameterTypes.size == 1 &&
46 |     method.getReturnType == Void.TYPE &&
47 |     method.getName.endsWith(SetterSuffix)
48 |   }
49 | 
50 |   override def constructorParamToColumnName(paramName: String, tableDef: TableDef) =
51 |     columnNameOverride.getOrElse(paramName, columnNameForProperty(paramName, tableDef))
52 | 
53 |   override def getterToColumnName(getterName: String, tableDef: TableDef) =
54 |     columnNameOverride.getOrElse(getterName, columnNameForProperty(getterName, tableDef))
55 | 
56 |   override def setterToColumnName(setterName: String, tableDef: TableDef) = {
57 |     val propertyName = setterNameToPropertyName(setterName)
58 |     columnNameOverride.getOrElse(propertyName, columnNameForProperty(propertyName, tableDef))
59 |   }
60 | 
61 |   /** Don't allow nulls in Scala - fail fast with NPE if null is tried. */
62 |   override protected def allowsNull = false
63 | }
64 | 
65 | object DefaultColumnMapper {
66 |   private val SetterSuffix: String = "_$eq"
67 | }
68 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector-embedded/src/main/scala/com/datastax/spark/connector/embedded/KafkaProducer.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.embedded
 2 | 
 3 | import java.util.Properties
 4 | 
 5 | import akka.actor.{Actor, ActorLogging}
 6 | import kafka.producer.{KeyedMessage, Producer, ProducerConfig}
 7 | import kafka.serializer.StringEncoder
 8 | import kafka.server.KafkaConfig
 9 | 
10 | /** Simple producer for an Akka Actor using string encoder and default partitioner. */
11 | abstract class KafkaProducerActor[K, V] extends Actor with ActorLogging {
12 |   import KafkaEvent._
13 | 
14 |   def producerConfig: ProducerConfig
15 | 
16 |   private val producer = new KafkaProducer[K, V](producerConfig)
17 | 
18 |   override def postStop(): Unit = {
19 |     log.info("Shutting down producer.")
20 |     producer.close()
21 |   }
22 | 
23 |   def receive = {
24 |     case e: KafkaMessageEnvelope[K,V] => producer.send(e)
25 |   }
26 | }
27 | 
28 | /** Simple producer using string encoder and default partitioner. */
29 | class KafkaProducer[K, V](producerConfig: ProducerConfig) {
30 | 
31 |   def this(brokers: Set[String], batchSize: Int, producerType: String, serializerFqcn: String) =
32 |     this(KafkaProducer.createConfig(brokers, batchSize, producerType, serializerFqcn))
33 | 
34 |   def this(config: KafkaConfig) =
35 |     this(KafkaProducer.defaultConfig(config))
36 | 
37 |   import KafkaEvent._
38 | 
39 |   private val producer = new Producer[K, V](producerConfig)
40 | 
41 |   /** Sends the data, partitioned by key to the topic. */
42 |   def send(e: KafkaMessageEnvelope[K,V]): Unit =
43 |     batchSend(e.topic, e.key, e.messages)
44 | 
45 |   /* Sends a single message. */
46 |   def send(topic : String, key : K, message : V): Unit =
47 |     batchSend(topic, key, Seq(message))
48 | 
49 |   def batchSend(topic: String, key: K, batch: Seq[V]): Unit = {
50 |     val messages = batch map (msg => new KeyedMessage[K, V](topic, key, msg))
51 |     producer.send(messages.toArray: _*)
52 |   }
53 | 
54 |   def close(): Unit = producer.close()
55 | 
56 | }
57 | 
58 | object KafkaEvent {
59 |   case class KafkaMessageEnvelope[K,V](topic: String, key: K, messages: V*)
60 | }
61 | 
62 | object KafkaProducer {
63 | 
64 |   def createConfig(brokers: Set[String], batchSize: Int, producerType: String, serializerFqcn: String): ProducerConfig = {
65 |     val props = new Properties()
66 |     props.put("metadata.broker.list", brokers.mkString(","))
67 |     props.put("serializer.class", serializerFqcn)
68 |     props.put("partitioner.class", "kafka.producer.DefaultPartitioner")
69 |     props.put("producer.type", producerType)
70 |     props.put("request.required.acks", "1")
71 |     props.put("batch.num.messages", batchSize.toString)
72 |     new ProducerConfig(props)
73 |   }
74 | 
75 |   def defaultConfig(config: KafkaConfig): ProducerConfig =
76 |     createConfig(Set(s"${config.hostName}:${config.port}"), 100, "async", classOf[StringEncoder].getName)
77 | }


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/mapper/ReflectionColumnMapper.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.mapper
 2 | 
 3 | import java.lang.reflect.{Constructor, Method}
 4 | 
 5 | import com.datastax.spark.connector.{ColumnRef, ColumnName}
 6 | import com.datastax.spark.connector.cql.TableDef
 7 | import com.datastax.spark.connector.rdd.reader.AnyObjectFactory
 8 | import org.apache.commons.lang.StringUtils
 9 | 
10 | import scala.reflect.ClassTag
11 | 
12 | abstract class ReflectionColumnMapper[T : ClassTag] extends ColumnMapper[T] {
13 | 
14 |   import AnyObjectFactory._
15 | 
16 |   protected def isSetter(method: Method): Boolean
17 |   protected def isGetter(method: Method): Boolean
18 |   protected def setterToColumnName(setterName: String, tableDef: TableDef): String
19 |   protected def getterToColumnName(getterName: String, tableDef: TableDef): String
20 |   protected def constructorParamToColumnName(paramName: String, tableDef: TableDef): String
21 |   protected def allowsNull: Boolean
22 | 
23 |   protected final def camelCaseToUnderscore(str: String): String =
24 |     StringUtils.splitByCharacterTypeCamelCase(str).mkString("_").replaceAll("_+", "_").toLowerCase
25 | 
26 |   protected final def columnNameForProperty(propertyName: String, tableDef: TableDef): String = {
27 |     val underscoreName = camelCaseToUnderscore(propertyName)
28 |     val candidateColumnNames = Seq(propertyName, underscoreName)
29 |     val columnRef = candidateColumnNames.iterator.map(tableDef.columnByName.get).find(_.isDefined).flatten
30 |     columnRef.fold(underscoreName)(_.columnName)
31 |   }
32 | 
33 |   override def columnMap(tableDef: TableDef): ColumnMap = {
34 | 
35 |     val cls = implicitly[ClassTag[T]].runtimeClass
36 | 
37 |     def columnsOf(ctor: Constructor[_]): Seq[ColumnRef] = {
38 |       if (isNoArgsConstructor(ctor))
39 |         Nil
40 |       else {
41 |         val paramNames = paranamer.lookupParameterNames(ctor)
42 |         val columnNames = paramNames
43 |           .map(constructorParamToColumnName(_, tableDef))
44 |           .filter(_ != "$_outer")
45 |         columnNames.map(ColumnName)
46 |       }
47 |     }
48 | 
49 |     val constructor = columnsOf(resolveConstructor(cls))
50 | 
51 |     val getters: Map[String, ColumnRef] = {
52 |       for (method <- cls.getMethods if isGetter(method)) yield {
53 |         val methodName = method.getName
54 |         val columnName = getterToColumnName(methodName, tableDef)
55 |         (methodName, ColumnName(columnName))
56 |       }
57 |     }.toMap
58 | 
59 |     val setters: Map[String, ColumnRef] = {
60 |       for (method <- cls.getMethods if isSetter(method)) yield {
61 |         val methodName = method.getName
62 |         val columnName = setterToColumnName(methodName, tableDef)
63 |         (methodName, ColumnName(columnName))
64 |       }
65 |     }.toMap
66 | 
67 |     new SimpleColumnMap(constructor, getters, setters, allowsNull)
68 |   }
69 | }
70 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/streaming/StreamingContextFunctions.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.streaming
 2 | 
 3 | import akka.actor.{ActorRef, Actor}
 4 | import com.datastax.spark.connector.cql.CassandraConnector
 5 | import com.datastax.spark.connector.rdd.{ReadConf, ValidRDDType}
 6 | import org.apache.spark.streaming.StreamingContext
 7 | import org.apache.spark.streaming.scheduler.StreamingListener
 8 | import org.apache.spark.streaming.receiver.ActorHelper
 9 | import com.datastax.spark.connector.SparkContextFunctions
10 | import com.datastax.spark.connector.rdd.reader.RowReaderFactory
11 | 
12 | import scala.reflect.ClassTag
13 | 
14 | /** Provides Cassandra-specific methods on `org.apache.spark.streaming.StreamingContext`.
15 |   * @param ssc the Spark Streaming context
16 |   */
17 | class StreamingContextFunctions (ssc: StreamingContext) extends SparkContextFunctions(ssc.sparkContext) {
18 |   import scala.reflect.ClassTag
19 | 
20 |   override def cassandraTable[T](keyspace: String, table: String)(
21 |     implicit
22 |       connector: CassandraConnector = CassandraConnector(ssc.sparkContext.getConf),
23 |       ct: ClassTag[T],
24 |       rrf: RowReaderFactory[T],
25 |       ev: ValidRDDType[T]): CassandraStreamingRDD[T] = {
26 | 
27 |     val readConf = ReadConf.fromSparkConf(ssc.sparkContext.getConf)
28 |     new CassandraStreamingRDD[T](ssc, connector, keyspace, table, readConf = readConf)
29 |   }
30 | }
31 | 
32 | /** Simple akka.actor.Actor mixin. */
33 | trait SparkStreamingActor extends Actor with ActorHelper {
34 | 
35 |   override def preStart(): Unit = {
36 |     context.system.eventStream.publish(StreamingEvent.ReceiverStarted(self))
37 |   }
38 | }
39 | 
40 | abstract class TypedStreamingActor[T : ClassTag] extends SparkStreamingActor {
41 | 
42 |   def receive: Actor.Receive = {
43 |     case e: T => push(e)
44 |   }
45 | 
46 |   def push(event: T): Unit =
47 |     store(event)
48 | }
49 | 
50 | /** Simple StreamingListener. Currently just used to listen for initialization of a receiver.
51 |   * Implement further to access information about an ongoing streaming computation.*/
52 | class SparkStreamingListener[T: ClassTag] extends StreamingListener {
53 |   import org.apache.spark.streaming.scheduler.StreamingListenerReceiverStarted
54 |   import java.util.concurrent.atomic.AtomicBoolean
55 | 
56 |   private val listenerInitialized = new AtomicBoolean()
57 | 
58 |   def initialized: Boolean = listenerInitialized.get
59 | 
60 |   /** Called when a receiver has been started */
61 |   override def onReceiverStarted(started: StreamingListenerReceiverStarted): Unit =
62 |     listenerInitialized.set(true)
63 | 
64 | }
65 | 
66 | object StreamingEvent {
67 |   /** Base marker for Receiver events */
68 |   sealed trait ReceiverEvent extends Serializable
69 | 
70 |   /**
71 |    * @param actor the receiver actor
72 |    */
73 |   case class ReceiverStarted(actor: ActorRef) extends ReceiverEvent
74 | }


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/types/ColumnType.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.types
 2 | 
 3 | import com.datastax.driver.core.{UDTValue, DataType}
 4 | import scala.collection.JavaConversions._
 5 | import scala.reflect.runtime.universe._
 6 | 
 7 | import com.datastax.spark.connector.types.TypeConverter.OptionToNullConverter
 8 | 
 9 | /** Serializable representation of column data type. */
10 | trait ColumnType[T] extends Serializable {
11 | 
12 |   /** Returns a converter that converts values to the type of this column expected by the
13 |     * Cassandra Java driver when saving the row.*/
14 |   def converterToCassandra: TypeConverter[_ <: AnyRef]
15 | 
16 |   /** Returns a converter that converts values to the Scala type associated with this column. */
17 |   lazy val converterToScala: TypeConverter[T] =
18 |     TypeConverter.forType(scalaTypeTag)
19 | 
20 |   /** Returns the TypeTag of the Scala type recommended to represent values of this column. */
21 |   def scalaTypeTag: TypeTag[T]
22 | 
23 |   /** Name of the Scala type. Useful for source generation.*/
24 |   def scalaTypeName: String
25 |     = scalaTypeTag.tpe.toString
26 | 
27 |   def isCollection: Boolean
28 | }
29 | 
30 | object ColumnType {
31 | 
32 |   private val primitiveTypeMap = Map[DataType, ColumnType[_]](
33 |     DataType.text() -> TextType,
34 |     DataType.ascii() -> AsciiType,
35 |     DataType.varchar() -> VarCharType,
36 |     DataType.cint() -> IntType,
37 |     DataType.bigint() -> BigIntType,
38 |     DataType.cfloat() -> FloatType,
39 |     DataType.cdouble() -> DoubleType,
40 |     DataType.cboolean() -> BooleanType,
41 |     DataType.varint() -> VarIntType,
42 |     DataType.decimal() -> DecimalType,
43 |     DataType.timestamp() -> TimestampType,
44 |     DataType.inet() -> InetType,
45 |     DataType.uuid() -> UUIDType,
46 |     DataType.blob() -> BlobType,
47 |     DataType.counter() -> CounterType,
48 |     DataType.timeuuid() -> TimeUUIDType
49 |   )
50 | 
51 |   def fromDriverType(dataType: DataType): ColumnType[_] = {
52 |     val typeArgs = dataType.getTypeArguments.map(fromDriverType)
53 |     dataType.getName match {
54 |       case DataType.Name.LIST => ListType(typeArgs(0))
55 |       case DataType.Name.SET => SetType(typeArgs(0))
56 |       case DataType.Name.MAP => MapType(typeArgs(0), typeArgs(1))
57 |       case DataType.Name.UDT => UserDefinedTypeStub
58 |       case _ => primitiveTypeMap(dataType)
59 |     }
60 |   }
61 | }
62 | 
63 | // TODO: This is a stub.
64 | // UDTValues are not Serializable.
65 | // Properly, we should use a dedicated,
66 | // serializable class for UDTValues and also allow to map them to case classes.
67 | case object UserDefinedTypeStub extends ColumnType[UDTValue] {
68 |   def converterToCassandra = new OptionToNullConverter(TypeConverter.forType[UDTValue])
69 |   override def isCollection = false
70 |   override def scalaTypeTag = TypeTag.synchronized { implicitly[TypeTag[UDTValue]] }
71 | }
72 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector-embedded/src/main/scala/com/datastax/spark/connector/embedded/KafkaConsumer.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.embedded
 2 | 
 3 | import java.util.Properties
 4 | import java.util.concurrent.Executors
 5 | import java.util.concurrent.atomic.AtomicInteger
 6 | 
 7 | import scala.concurrent.duration._
 8 | import akka.actor.{ActorLogging, Actor}
 9 | import kafka.serializer.StringDecoder
10 | import kafka.consumer.{Consumer, ConsumerConfig}
11 | 
12 | /** The KafkaConsumer is a very simple consumer of a single Kafka topic.
13 |   * This is a helpful utility for IT tests to insure data is getting published to Kafka
14 |   * for streaming ingestion upstream.
15 |   */
16 | class KafkaConsumer(zookeeper: String, topic: String, groupId: String, partitions: Int, numThreads: Int, count: AtomicInteger) {
17 | 
18 |   val connector = Consumer.create(createConsumerConfig)
19 | 
20 |   // create n partitions of the stream for topic “test”, to allow n threads to consume
21 |   val streams = connector
22 |     .createMessageStreams(Map(topic -> partitions), new StringDecoder(), new StringDecoder())
23 |     .get(topic)
24 | 
25 |   // launch all the threads
26 |   val executor = Executors.newFixedThreadPool(numThreads)
27 | 
28 |   // consume the messages in the threads
29 |   for(stream <- streams) {
30 |     executor.submit(new Runnable() {
31 |       def run() {
32 |         for(s <- stream) {
33 |           while(s.iterator.hasNext) {
34 |             count.getAndIncrement
35 |           }
36 |         }
37 |       }
38 |     })
39 |   }
40 | 
41 |   private def createConsumerConfig: ConsumerConfig = {
42 |     val props = new Properties()
43 |     props.put("consumer.timeout.ms", "2000")
44 |     props.put("zookeeper.connect", zookeeper)
45 |     props.put("group.id", groupId)
46 |     props.put("zookeeper.session.timeout.ms", "400")
47 |     props.put("zookeeper.sync.time.ms", "10")
48 |     props.put("auto.commit.interval.ms", "1000")
49 | 
50 |     new ConsumerConfig(props)
51 |   }
52 | 
53 |   def shutdown() {
54 |     println("Consumer shutting down.")
55 |     Option(connector) map (_.shutdown())
56 |     Option(executor) map (_.shutdown())
57 |   }
58 | }
59 | 
60 | /** Simple actor with a Kafka consumer to report the latest message count in a Kafka Topic. */
61 | class KafkaTopicLogger(topic: String, group: String, taskInterval: FiniteDuration = 3.seconds)
62 |   extends Actor with ActorLogging {
63 |   import Event._
64 |   import context.dispatcher
65 | 
66 |   val atomic = new AtomicInteger(0)
67 | 
68 |   val consumer = new KafkaConsumer(ZookeeperConnectionString, topic, group, 1, 10, atomic)
69 | 
70 |   var task = context.system.scheduler.schedule(3.seconds, taskInterval) {
71 |     self ! QueryTask
72 |   }
73 |  
74 |   override def postStop(): Unit = {
75 |     task.cancel
76 |     consumer.shutdown()
77 |   }
78 | 
79 |   def receive: Actor.Receive = {
80 |     case QueryTask =>
81 |       log.info(s"Kafka message count [{}]", atomic.get)
82 |   }
83 | }


--------------------------------------------------------------------------------
/spark-cassandra-connector-embedded/src/main/scala/com/datastax/spark/connector/embedded/EmbeddedZookeeper.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.embedded
 2 | 
 3 | import java.net.InetSocketAddress
 4 | 
 5 | import scala.util.Try
 6 | import org.apache.zookeeper.server.{NIOServerCnxnFactory, ZooKeeperServer}
 7 | 
 8 | /** Implements a simple standalone ZooKeeperServer.
 9 |   * To create a ZooKeeper client object, the application needs to pass a
10 |   * connection string containing a comma separated list of host:port pairs,
11 |   * each corresponding to a ZooKeeper server.
12 |   * <p>
13 |   * Session establishment is asynchronous. This constructor will initiate
14 |   * connection to the server and return immediately - potentially (usually)
15 |   * before the session is fully established. The watcher argument specifies
16 |   * the watcher that will be notified of any changes in state. This
17 |   * notification can come at any point before or after the constructor call
18 |   * has returned.
19 |   * <p>
20 |   * The instantiated ZooKeeper client object will pick an arbitrary server
21 |   * from the connectString and attempt to connect to it. If establishment of
22 |   * the connection fails, another server in the connect string will be tried
23 |   * (the order is non-deterministic, as we random shuffle the list), until a
24 |   * connection is established. The client will continue attempts until the
25 |   * session is explicitly closed.
26 |   *
27 |   * @param connectString comma separated host:port pairs, each corresponding to a zk
28 |   *            server. e.g. "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002" If
29 |   *            the optional chroot suffix is used the example would look
30 |   *            like: "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002/app/a"
31 |   *            where the client would be rooted at "/app/a" and all paths
32 |   *            would be relative to this root - ie getting/setting/etc...
33 |   *            "/foo/bar" would result in operations being run on
34 |   *            "/app/a/foo/bar" (from the server perspective).
35 |   *            Default: the local IP and default port: 2180.
36 |   */
37 | class EmbeddedZookeeper(val connectString: String = ZookeeperConnectionString) extends Embedded {
38 | 
39 |   val snapshotDir = createTempDir
40 | 
41 |   val logDir = createTempDir
42 | 
43 |   val server = new ZooKeeperServer(snapshotDir, logDir, 500)
44 | 
45 |   val (ip, port) = {
46 |     val splits = connectString.split(":")
47 |     (splits(0), splits(1).toInt)
48 |   }
49 | 
50 |   val factory = new NIOServerCnxnFactory()
51 |   factory.configure(new InetSocketAddress(ip, port), 16)
52 |   factory.startup(server)
53 |   println(s"ZooKeeperServer isRunning: $isRunning")
54 | 
55 |   def isRunning: Boolean = Try(server.isRunning) getOrElse false
56 | 
57 |   def shutdown(): Unit = {
58 |     println(s"Shutting down ZK NIOServerCnxnFactory.")
59 |     factory.shutdown()
60 |     deleteRecursively(snapshotDir)
61 |     deleteRecursively(logDir)
62 |   }
63 | }


--------------------------------------------------------------------------------
/spark-cassandra-connector-java/src/test/java/com/datastax/spark/connector/japi/CustomTypeConverterTest.java:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.japi;
 2 | 
 3 | import akka.japi.JavaPartialFunction;
 4 | import com.datastax.spark.connector.japi.types.JavaTypeConverter;
 5 | import com.datastax.spark.connector.util.JavaApiHelper$;
 6 | import org.junit.Test;
 7 | 
 8 | import static org.junit.Assert.assertEquals;
 9 | 
10 | public class CustomTypeConverterTest {
11 | 
12 |     public static enum SampleEnum {
13 |         ONE, TWO, THREE
14 |     }
15 | 
16 |     public final static JavaTypeConverter<SampleEnum> sampleEnumConverter =
17 |             new JavaTypeConverter<SampleEnum>(JavaApiHelper$.MODULE$.getTypeTag(SampleEnum.class),
18 |                     new JavaPartialFunction<Object, SampleEnum>() {
19 |                         @Override
20 |                         public SampleEnum apply(Object x, boolean isCheck) throws Exception {
21 |                             if (x == null) {
22 |                                 return null;
23 |                             } else if (x instanceof String) {
24 |                                 try {
25 |                                     return SampleEnum.valueOf((String) x);
26 |                                 } catch (IllegalArgumentException ex) {
27 |                                     throw noMatch();
28 |                                 }
29 |                             } else if (x instanceof Number) {
30 |                                 switch (((Number) x).intValue()) {
31 |                                     case 1:
32 |                                         return SampleEnum.ONE;
33 |                                     case 2:
34 |                                         return SampleEnum.TWO;
35 |                                     case 3:
36 |                                         return SampleEnum.THREE;
37 |                                 }
38 |                             }
39 |                             throw noMatch();
40 |                         }
41 |                     });
42 | 
43 | 
44 |     @Test
45 |     public void test1() {
46 |         assertEquals(SampleEnum.class.getName(), sampleEnumConverter.targetTypeName());
47 | 
48 |         assertEquals(true, sampleEnumConverter.convertPF().isDefinedAt(1));
49 |         assertEquals(true, sampleEnumConverter.convertPF().isDefinedAt(2.5));
50 |         assertEquals(true, sampleEnumConverter.convertPF().isDefinedAt("THREE"));
51 |         assertEquals(false, sampleEnumConverter.convertPF().isDefinedAt("asdf"));
52 |         assertEquals(false, sampleEnumConverter.convertPF().isDefinedAt(4));
53 |         assertEquals(true, sampleEnumConverter.convertPF().isDefinedAt(null));
54 | 
55 |         assertEquals(SampleEnum.ONE, sampleEnumConverter.convertPF().apply(1));
56 |         assertEquals(SampleEnum.TWO, sampleEnumConverter.convertPF().apply(2.5));
57 |         assertEquals(SampleEnum.THREE, sampleEnumConverter.convertPF().apply("THREE"));
58 |         assertEquals(null, sampleEnumConverter.convertPF().apply(null));
59 |     }
60 | }
61 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/rdd/partitioner/TokenRangeClusterer.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.rdd.partitioner
 2 | 
 3 | import java.net.InetAddress
 4 | 
 5 | import Ordering.Implicits._
 6 | 
 7 | import com.datastax.spark.connector.rdd.partitioner.dht.{Token, TokenRange}
 8 | 
 9 | import scala.annotation.tailrec
10 | 
11 | /** Divides a set of token ranges into groups containing not more than `maxRowCountPerGroup` rows
12 |   * and not more than `maxGroupSize` token ranges. Each group will form a single `CassandraRDDPartition`.
13 |   *
14 |   * The algorithm is as follows:
15 |   * 1. Sort token ranges by endpoints lexicographically.
16 |   * 2. Take the highest possible number of token ranges from the beginning of the list,
17 |   *    such that their sum of rowCounts does not exceed `maxRowCountPerGroup` and they all contain at
18 |   *    least one common endpoint. If it is not possible, take at least one item.
19 |   *    Those token ranges will make a group.
20 |   * 3. Repeat the previous step until no more token ranges left.*/
21 | class TokenRangeClusterer[V, T <: Token[V]](maxRowCountPerGroup: Long, maxGroupSize: Int = Int.MaxValue) {
22 | 
23 |   private implicit object InetAddressOrdering extends Ordering[InetAddress] {
24 |     override def compare(x: InetAddress, y: InetAddress) =
25 |       x.getHostAddress.compareTo(y.getHostAddress)
26 |   }
27 | 
28 |   @tailrec
29 |   private def group(tokenRanges: Stream[TokenRange[V, T]],
30 |                     result: Vector[Seq[TokenRange[V, T]]]): Iterable[Seq[TokenRange[V, T]]] = {
31 |     tokenRanges match {
32 |       case Stream.Empty => result
33 |       case head #:: rest =>
34 |         val firstEndpoint = head.endpoints.min
35 |         val rowCounts = tokenRanges.map(_.rowCount.get)
36 |         val cumulativeRowCounts = rowCounts.scanLeft(0L)(_ + _).tail // drop first item always == 0
37 |         val rowLimit = math.max(maxRowCountPerGroup, head.rowCount.get) // make sure first element will be always included
38 |         val cluster = tokenRanges
39 |           .take(math.max(1, maxGroupSize))
40 |           .zip(cumulativeRowCounts)
41 |           .takeWhile { case (tr, count) => count <= rowLimit && tr.endpoints.min == firstEndpoint }
42 |           .map(_._1)
43 |           .toVector
44 |         val remainingTokenRanges = tokenRanges.drop(cluster.length)
45 |         group(remainingTokenRanges, result :+ cluster)
46 |     }
47 |   }
48 | 
49 |   /** Groups small token ranges on the same server(s) in order to reduce task scheduling overhead.
50 |     * Useful mostly with virtual nodes, which may create lots of small token range splits.
51 |     * Each group will make a single Spark task. */
52 |   def group(tokenRanges: Seq[TokenRange[V, T]]): Iterable[Seq[TokenRange[V, T]]] = {
53 |     // sort by endpoints lexicographically
54 |     // this way ranges on the same host are grouped together
55 |     val sortedRanges = tokenRanges.sortBy(_.endpoints.toSeq.sorted)
56 |     group(sortedRanges.toStream, Vector.empty)
57 |   }
58 | 
59 | }
60 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/it/scala/com/datastax/spark/connector/streaming/ActorStreamSpec.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.streaming
 2 | 
 3 | import akka.actor.{ActorSystem, Props, Terminated}
 4 | import akka.testkit.{ImplicitSender, TestKit}
 5 | import com.datastax.spark.connector.{RowsInBatch, SomeColumns}
 6 | import com.datastax.spark.connector.cql.CassandraConnector
 7 | import com.datastax.spark.connector.embedded._
 8 | import com.datastax.spark.connector.streaming.StreamingEvent.ReceiverStarted
 9 | import com.datastax.spark.connector.testkit._
10 | import com.datastax.spark.connector.writer.WriteConf
11 | import org.apache.spark.SparkEnv
12 | import org.apache.spark.storage.StorageLevel
13 | import org.apache.spark.streaming.StreamingContext.toPairDStreamFunctions
14 | import org.apache.spark.streaming.{Milliseconds, StreamingContext}
15 | 
16 | class ActorStreamingSpec extends ActorSpec with CounterFixture with ImplicitSender {
17 |   import com.datastax.spark.connector.testkit.TestEvent._
18 | 
19 |   /* Initializations - does not work in the actor test context in a static before() */
20 |   CassandraConnector(SparkTemplate.conf).withSessionDo { session =>
21 |     session.execute("CREATE KEYSPACE IF NOT EXISTS demo WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1 }")
22 |     session.execute("CREATE TABLE IF NOT EXISTS demo.streaming_wordcount (word TEXT PRIMARY KEY, count COUNTER)")
23 |     session.execute("TRUNCATE demo.streaming_wordcount")
24 |   }
25 | 
26 |   "actorStream" must {
27 |     "write from the actor stream to cassandra table: demo.streaming_wordcount" in {
28 | 
29 |       val stream = ssc.actorStream[String](Props[TestStreamingActor], actorName, StorageLevel.MEMORY_AND_DISK)
30 | 
31 |       val wc = stream.flatMap(_.split("\\s+"))
32 |         .map(x => (x, 1))
33 |         .reduceByKey(_ + _)
34 |         .saveToCassandra("demo", "streaming_wordcount")
35 | 
36 |       // start the streaming context so the data can be processed and actor started
37 |       ssc.start()
38 | 
39 |       system.eventStream.subscribe(self, classOf[StreamingEvent.ReceiverStarted])
40 | 
41 |       expectMsgPF(duration) { case ReceiverStarted(receiver) =>
42 |         watch(receiver)
43 |         system.actorOf(Props(new TestProducer(data.toArray, receiver)))
44 |       }
45 | 
46 |       expectMsgPF(duration) { case Terminated(ref) =>
47 |         val rdd = ssc.cassandraTable[WordCount]("demo", "streaming_wordcount")
48 |         awaitCond(rdd.collect.nonEmpty && rdd.map(_.count).reduce(_ + _) == scale * 2)
49 |         rdd.collect.size should be (data.size)
50 |       }
51 |     }
52 |   }
53 | }
54 | 
55 | /** A very basic Akka actor which streams `String` event data to spark. */
56 | class TestStreamingActor extends TypedStreamingActor[String] with Counter {
57 | 
58 |   override def push(e: String): Unit = {
59 |     super.push(e)
60 |     increment()
61 |   }
62 | }
63 | 
64 | abstract class ActorSpec(val ssc: StreamingContext, _system: ActorSystem)
65 |   extends TestKit(_system) with StreamingSpec {
66 | 
67 |   def this() = this (new StreamingContext(SparkTemplate.sc, Milliseconds(300)), SparkEnv.get.actorSystem)
68 | 
69 | }
70 | 
71 | 
72 | 
73 | 
74 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/rdd/partitioner/RandomPartitionerTokenRangeSplitterTest.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.rdd.partitioner
 2 | 
 3 | import java.net.InetAddress
 4 | 
 5 | import com.datastax.spark.connector.rdd.partitioner.dht.{CassandraNode, BigIntToken, TokenFactory}
 6 | import org.junit.Assert._
 7 | import org.junit.Test
 8 | 
 9 | 
10 | class RandomPartitionerTokenRangeSplitterTest {
11 | 
12 |   type TokenRange = com.datastax.spark.connector.rdd.partitioner.dht.TokenRange[BigInt, BigIntToken]
13 | 
14 |   private def assertNoHoles(tokenRanges: Seq[TokenRange]) {
15 |     for (Seq(range1, range2) <- tokenRanges.sliding(2))
16 |       assertEquals(range1.end, range2.start)
17 |   }
18 | 
19 |   @Test
20 |   def testSplit() {
21 |     val node = CassandraNode(InetAddress.getLocalHost, InetAddress.getLocalHost)
22 |     val splitter = new RandomPartitionerTokenRangeSplitter(2.0)
23 |     val rangeLeft = BigInt("0")
24 |     val rangeRight = BigInt("100")
25 |     val range = new TokenRange(
26 |       new BigIntToken(rangeLeft),
27 |       new BigIntToken(rangeRight), Set(node), None)
28 |     val out = splitter.split(range, 20)
29 | 
30 |     // 2 rows per token on average; to so 10 tokens = 20 rows; therefore 10 splits
31 |     assertEquals(10, out.size)
32 |     assertEquals(rangeLeft, out.head.start.value)
33 |     assertEquals(rangeRight, out.last.end.value)
34 |     assertTrue(out.forall(_.endpoints == Set(node)))
35 |     assertNoHoles(out)
36 |   }
37 | 
38 |   @Test
39 |   def testNoSplit() {
40 |     val splitter = new RandomPartitionerTokenRangeSplitter(2.0)
41 |     val rangeLeft = BigInt("0")
42 |     val rangeRight = BigInt("100")
43 |     val range = new TokenRange(
44 |       new BigIntToken(rangeLeft),
45 |       new BigIntToken(rangeRight), Set.empty, None)
46 |     val out = splitter.split(range, 500)
47 | 
48 |     // range is too small to contain 500 rows
49 |     assertEquals(1, out.size)
50 |     assertEquals(rangeLeft, out.head.start.value)
51 |     assertEquals(rangeRight, out.last.end.value)
52 |   }
53 | 
54 |   @Test
55 |   def testZeroRows() {
56 |     val splitter = new RandomPartitionerTokenRangeSplitter(0.0)
57 |     val rangeLeft = BigInt("0")
58 |     val rangeRight = BigInt("100")
59 |     val range = new TokenRange(
60 |       new BigIntToken(rangeLeft),
61 |       new BigIntToken(rangeRight), Set.empty, None)
62 |     val out = splitter.split(range, 500)
63 |     assertEquals(1, out.size)
64 |     assertEquals(rangeLeft, out.head.start.value)
65 |     assertEquals(rangeRight, out.last.end.value)
66 |   }
67 | 
68 |   @Test
69 |   def testWrapAround() {
70 |     val splitter = new RandomPartitionerTokenRangeSplitter(2.0)
71 |     val rangeLeft = TokenFactory.RandomPartitionerTokenFactory.maxToken.value - 100
72 |     val rangeRight = BigInt("100")
73 |     val range = new TokenRange(
74 |       new BigIntToken(rangeLeft),
75 |       new BigIntToken(rangeRight), Set.empty, None)
76 |     val out = splitter.split(range, 20)
77 |     assertEquals(20, out.size)
78 |     assertEquals(rangeLeft, out.head.start.value)
79 |     assertEquals(rangeRight, out.last.end.value)
80 |     assertNoHoles(out)
81 |   }
82 | 
83 | }
84 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/rdd/partitioner/Murmur3PartitionerTokenRangeSplitterTest.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.rdd.partitioner
 2 | 
 3 | import java.net.InetAddress
 4 | 
 5 | import com.datastax.spark.connector.rdd.partitioner.dht.{CassandraNode, LongToken}
 6 | import com.datastax.spark.connector.rdd.partitioner.dht.TokenFactory.Murmur3TokenFactory
 7 | import org.junit.Assert._
 8 | import org.junit.Test
 9 | 
10 | class Murmur3PartitionerTokenRangeSplitterTest {
11 | 
12 |   type TokenRange = com.datastax.spark.connector.rdd.partitioner.dht.TokenRange[Long, LongToken]
13 | 
14 |   private def assertNoHoles(tokenRanges: Seq[TokenRange]) {
15 |     for (Seq(range1, range2) <- tokenRanges.sliding(2))
16 |       assertEquals(range1.end, range2.start)
17 |   }
18 | 
19 |   @Test
20 |   def testSplit() {
21 |     val node = CassandraNode(InetAddress.getLocalHost, InetAddress.getLocalHost)
22 |     val splitter = new Murmur3PartitionerTokenRangeSplitter(2.0)
23 |     val range = new TokenRange(
24 |       new com.datastax.spark.connector.rdd.partitioner.dht.LongToken(0),
25 |       new com.datastax.spark.connector.rdd.partitioner.dht.LongToken(100),
26 |       Set(node), None)
27 |     val out = splitter.split(range, 20)
28 | 
29 |     // 2 rows per token on average; to so 10 tokens = 20 rows; therefore 10 splits
30 |     assertEquals(10, out.size)
31 |     assertEquals(0L, out.head.start.value)
32 |     assertEquals(100L, out.last.end.value)
33 |     assertTrue(out.forall(s => s.end.value - s.start.value == 10))
34 |     assertTrue(out.forall(_.endpoints == Set(node)))
35 |     assertNoHoles(out)
36 |   }
37 | 
38 |   @Test
39 |   def testNoSplit() {
40 |     val splitter = new Murmur3PartitionerTokenRangeSplitter(2.0)
41 |     val range = new TokenRange(
42 |       new com.datastax.spark.connector.rdd.partitioner.dht.LongToken(0), new LongToken(100), Set.empty, None)
43 |     val out = splitter.split(range, 500)
44 | 
45 |     // range is too small to contain 500 rows
46 |     assertEquals(1, out.size)
47 |     assertEquals(0L, out.head.start.value)
48 |     assertEquals(100L, out.last.end.value)
49 |   }
50 | 
51 |   @Test
52 |   def testZeroRows() {
53 |     val splitter = new Murmur3PartitionerTokenRangeSplitter(0.0)
54 |     val range = new TokenRange(
55 |       new com.datastax.spark.connector.rdd.partitioner.dht.LongToken(0), new LongToken(100), Set.empty, None)
56 |     val out = splitter.split(range, 500)
57 |     assertEquals(1, out.size)
58 |     assertEquals(0L, out.head.start.value)
59 |     assertEquals(100L, out.last.end.value)
60 |   }
61 | 
62 |   @Test
63 |   def testWrapAround() {
64 |     val splitter = new Murmur3PartitionerTokenRangeSplitter(2.0)
65 |     val maxValue = Murmur3TokenFactory.maxToken.value
66 |     val minValue = Murmur3TokenFactory.minToken.value
67 |     val range = new TokenRange(
68 |       new com.datastax.spark.connector.rdd.partitioner.dht.LongToken(maxValue - 100),
69 |       new com.datastax.spark.connector.rdd.partitioner.dht.LongToken(minValue + 100), Set.empty, None)
70 |     val splits = splitter.split(range, 20)
71 |     assertEquals(20, splits.size)
72 |     assertEquals(maxValue - 100, splits.head.start.value)
73 |     assertEquals(minValue + 100, splits.last.end.value)
74 |     assertNoHoles(splits)
75 |   }
76 | }
77 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/util/JavaApiHelper.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.util
 2 | 
 3 | import java.util.{Map => JavaMap}
 4 | 
 5 | import com.datastax.spark.connector.mapper.{ColumnMapper, JavaBeanColumnMapper}
 6 | import com.datastax.spark.connector.rdd.reader.RowReaderFactory
 7 | import com.datastax.spark.connector.writer.RowWriterFactory
 8 | import com.datastax.spark.connector.CassandraRow
 9 | 
10 | import scala.collection.JavaConversions._
11 | import scala.reflect._
12 | import scala.reflect.api.{Mirror, TypeCreator, _}
13 | import scala.reflect.runtime.universe._
14 | 
15 | /** A helper class to make it possible to access components written in Scala from Java code.
16 |   * INTERNAL API
17 |   */
18 | object JavaApiHelper {
19 | 
20 |   def mirror = runtimeMirror(Thread.currentThread().getContextClassLoader)
21 | 
22 |   /** Returns a `TypeTag` for the given class. */
23 |   def getTypeTag[T](clazz: Class[T]): TypeTag[T] = TypeTag.synchronized {
24 |     TypeTag.apply(mirror, new TypeCreator {
25 |       override def apply[U <: Universe with Singleton](m: Mirror[U]): U#Type = {
26 |         m.staticClass(clazz.getName).toTypeConstructor
27 |       }
28 |     })
29 |   }
30 | 
31 |   /** Returns a `TypeTag` for the given class and type parameters. */
32 |   def getTypeTag[T](clazz: Class[_], typeParams: TypeTag[_]*): TypeTag[T] = TypeTag.synchronized {
33 |     TypeTag.apply(mirror, new TypeCreator {
34 |       override def apply[U <: Universe with Singleton](m: Mirror[U]) = {
35 |         val ct = m.staticClass(clazz.getName).toTypeConstructor.asInstanceOf[m.universe.Type]
36 |         val tpt = typeParams.map(_.in(m).tpe.asInstanceOf[m.universe.Type]).toList
37 |         m.universe.appliedType(ct, tpt).asInstanceOf[U#Type]
38 |       }
39 |     })
40 |   }
41 | 
42 |   /** Returns a `ClassTag` of a given runtime class. */
43 |   def getClassTag[T](clazz: Class[T]): ClassTag[T] = ClassTag(clazz)
44 | 
45 |   /** Returns a runtime class of a given `TypeTag`. */
46 |   def getRuntimeClass[T](typeTag: TypeTag[T]): Class[T] = mirror.runtimeClass(typeTag.tpe).asInstanceOf[Class[T]]
47 | 
48 |   /** Returns a runtime class of a given `ClassTag`. */
49 |   def getRuntimeClass[T](classTag: ClassTag[T]): Class[T] = classTag.runtimeClass.asInstanceOf[Class[T]]
50 | 
51 |   /** Converts a Java `Map` to a Scala immutable `Map`. */
52 |   def toScalaMap[K, V](map: JavaMap[K, V]): Map[K, V] = Map(map.toSeq: _*)
53 | 
54 |   /** Converts an array to a Scala `Seq`. */
55 |   def toScalaSeq[T](array: Array[T]): Seq[T] = array
56 | 
57 |   /** Converts a Java `Iterable` to Scala `Seq`. */
58 |   def toScalaSeq[T](iterable: java.lang.Iterable[T]): Seq[T] = iterable.toSeq
59 | 
60 |   /** Returns the default `RowWriterFactory` initialized with the given `ColumnMapper`. */
61 |   def defaultRowWriterFactory[T](mapper: ColumnMapper[T]) = {
62 |     RowWriterFactory.defaultRowWriterFactory(mapper)
63 |   }
64 | 
65 |   /** Returns the `JavaBeanColumnMapper` instance for the given `ClassTag` and column mapping. */
66 |   def javaBeanColumnMapper[T](classTag: ClassTag[T], columnNameOverride: JavaMap[String, String]): ColumnMapper[T] =
67 |     new JavaBeanColumnMapper[T](toScalaMap(columnNameOverride))(classTag)
68 | 
69 |   /** Returns the default `RowReaderFactory`. */
70 |   def genericRowReaderFactory: RowReaderFactory[CassandraRow] = RowReaderFactory.GenericRowReader$
71 | 
72 | }
73 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/main/scala/com/datastax/spark/connector/cql/AuthConf.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.cql
 2 | 
 3 | import com.datastax.driver.core.{AuthProvider, PlainTextAuthProvider}
 4 | import com.datastax.spark.connector.util.ReflectionUtil
 5 | import org.apache.spark.SparkConf
 6 | 
 7 | /** Stores credentials used to authenticate to a Cassandra cluster and uses them
 8 |   * to configure a Cassandra connection.
 9 |   * This driver provides implementations [[NoAuthConf]] for no authentication
10 |   * and [[PasswordAuthConf]] for password authentication. Other authentication
11 |   * configurators can be plugged in by setting `cassandra.authentication.conf.factory.class`
12 |   * option. See [[AuthConfFactory]]. */
13 | trait AuthConf extends Serializable {
14 | 
15 |   /** Returns auth provider to be passed to the `Cluster.Builder` object. */
16 |   def authProvider: AuthProvider
17 | 
18 |   /** Returns auth credentials to be set in the Thrift authentication request. */
19 |   def thriftCredentials: Map[String, String]
20 | }
21 | 
22 | /** Performs no authentication. Use with `AllowAllAuthenticator` in Cassandra. */
23 | case object NoAuthConf extends AuthConf {
24 |   override def authProvider = AuthProvider.NONE
25 | 
26 |   override def thriftCredentials: Map[String, String] = Map.empty
27 | }
28 | 
29 | /** Performs plain-text password authentication. Use with `PasswordAuthenticator` in Cassandra. */
30 | case class PasswordAuthConf(user: String, password: String) extends AuthConf {
31 |   override def authProvider = new PlainTextAuthProvider(user, password)
32 | 
33 |   override def thriftCredentials: Map[String, String] = Map("username" -> user, "password" -> password)
34 | }
35 | 
36 | /** Obtains authentication configuration by reading  `SparkConf` object. */
37 | trait AuthConfFactory {
38 |   def authConf(conf: SparkConf): AuthConf
39 | }
40 | 
41 | /** Default `AuthConfFactory` that supports no authentication or password authentication.
42 |   * Password authentication is enabled when both `spark.cassandra.auth.username` and `spark.cassandra.auth.password`
43 |   * options are present in `SparkConf`.*/
44 | object DefaultAuthConfFactory extends AuthConfFactory {
45 | 
46 |   val CassandraUserNameProperty = "spark.cassandra.auth.username"
47 |   val CassandraPasswordProperty = "spark.cassandra.auth.password"
48 | 
49 |   def authConf(conf: SparkConf): AuthConf = {
50 |     val credentials =
51 |       for (username <- conf.getOption(CassandraUserNameProperty);
52 |            password <- conf.getOption(CassandraPasswordProperty)) yield (username, password)
53 | 
54 |     credentials match {
55 |       case Some((user, password)) => PasswordAuthConf(user, password)
56 |       case None => NoAuthConf
57 |     }
58 |   }
59 | }
60 | 
61 | /** Entry point for obtaining `AuthConf` object from `SparkConf`, used when establishing connections to Cassandra.
62 |   * The actual `AuthConf` creation is delegated to the [[AuthConfFactory]] pointed by `spark.cassandra.auth.conf.factory` property. */
63 | object AuthConf {
64 |   val AuthConfFactoryProperty = "spark.cassandra.auth.conf.factory"
65 | 
66 |   def fromSparkConf(conf: SparkConf) = {
67 |     val authConfFactory = conf
68 |       .getOption(AuthConfFactoryProperty)
69 |       .map(ReflectionUtil.findGlobalObject[AuthConfFactory])
70 |       .getOrElse(DefaultAuthConfFactory)
71 | 
72 |     authConfFactory.authConf(conf)
73 |   }
74 | }
75 | 
76 | 
77 | 
78 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/it/scala/com/datastax/spark/connector/cql/CassandraPartitionKeyWhereSpec.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.cql
 2 | 
 3 | import com.datastax.spark.connector._
 4 | import com.datastax.spark.connector.testkit.SharedEmbeddedCassandra
 5 | import com.datastax.spark.connector.embedded._
 6 | import org.scalatest.{FlatSpec, Matchers}
 7 | 
 8 | class CassandraPartitionKeyWhereSpec extends FlatSpec with Matchers with SharedEmbeddedCassandra with SparkTemplate {
 9 | 
10 |   useCassandraConfig("cassandra-default.yaml.template")
11 |   val conn = CassandraConnector(Set(cassandraHost))
12 | 
13 |   conn.withSessionDo { session =>
14 |     session.execute("CREATE KEYSPACE IF NOT EXISTS where_test WITH REPLICATION = { 'class': 'SimpleStrategy', 'replication_factor': 1 }")
15 | 
16 |     session.execute("CREATE TABLE IF NOT EXISTS where_test.key_value (key INT, group BIGINT, value TEXT, PRIMARY KEY (key, group))")
17 |     session.execute("INSERT INTO where_test.key_value (key, group, value) VALUES (1, 100, '0001')")
18 |     session.execute("INSERT INTO where_test.key_value (key, group, value) VALUES (2, 200, '0002')")
19 |     session.execute("INSERT INTO where_test.key_value (key, group, value) VALUES (3, 300, '0003')")
20 |     session.execute("CREATE TABLE IF NOT EXISTS where_test.ckey_value (key1 INT, \"Key2\" BIGINT, group INT, value TEXT, PRIMARY KEY ((key1, \"Key2\"), group))")
21 |     session.execute("INSERT INTO where_test.ckey_value (key1, \"Key2\", group, value) VALUES (1, 100, 1000, '0001')")
22 |     session.execute("INSERT INTO where_test.ckey_value (key1, \"Key2\", group, value) VALUES (2, 200, 2000, '0002')")
23 |     session.execute("INSERT INTO where_test.ckey_value (key1, \"Key2\", group, value) VALUES (3, 300, 3000, '0003')")
24 | 
25 |   }
26 | 
27 |   "A CassandraRDD" should "allow partition key eq in where" in {
28 |     val rdd = sc.cassandraTable("where_test", "key_value").where("key = ?", 1)
29 |     val result =  rdd.collect()
30 |     result should have length 1
31 |     result.head.getInt("key") should be (1)
32 |   }
33 | 
34 |   it should "allow partition key 'in' in where" in {
35 |     val result = sc.cassandraTable("where_test", "key_value").where("key in (?, ?)", 2,3).collect()
36 |     result should have length 2
37 |     result.head.getInt("key") should (be (2) or be (3))
38 |   }
39 | 
40 |   it should "allow cluster key 'in' in where" in {
41 |     val result = sc.cassandraTable("where_test", "key_value").where("group in (?, ?)", 200,300).collect()
42 |     result should have length 2
43 |     result.head.getInt("key") should (be (2) or be (3))
44 |   }
45 | 
46 |   it should "work with composite keys in" in {
47 |     val result = sc.cassandraTable("where_test", "ckey_value").where("key1 = 1 and \"Key2\" in (?, ?)", 100,200).collect()
48 |     result should have length 1
49 |     result.head.getInt("key1") should be (1)
50 |   }
51 | 
52 |   it should "work with composite keys eq" in {
53 |     val result = sc.cassandraTable("where_test", "ckey_value").where("key1 = ? and \"Key2\" = ?", 1,100).collect()
54 |     result should have length 1
55 |     result.head.getInt("key1") should be (1)
56 |   }
57 | 
58 |   it should "work with composite keys in2" in {
59 |     val result = sc.cassandraTable("where_test", "ckey_value").where("\"Key2\" in (?, ?) and key1 = 1", 100,200).collect()
60 |     result should have length 1
61 |     result.head.getInt("key1") should be (1)
62 |   }
63 | }
64 | 


--------------------------------------------------------------------------------
/spark-cassandra-connector/src/test/scala/com/datastax/spark/connector/writer/WriteOptionTest.scala:
--------------------------------------------------------------------------------
 1 | package com.datastax.spark.connector.writer
 2 | 
 3 | import java.util.Date
 4 | import java.util.concurrent.TimeUnit
 5 | 
 6 | import org.apache.spark.streaming.{Duration => SparkDuration}
 7 | import org.joda.time.{DateTime, Duration => JodaDuration}
 8 | import org.scalatest.{FlatSpec, Matchers}
 9 | 
10 | import scala.concurrent.duration.{Duration => ScalaDuration}
11 | 
12 | class WriteOptionTest extends FlatSpec with Matchers {
13 | 
14 |   "TTLOption" should "properly create constant write option with duration in seconds" in {
15 |     val option = TTLOption.constant(5)
16 |     option shouldBe a[StaticWriteOption[_]]
17 |     option.asInstanceOf[StaticWriteOption[Int]].value should be(5)
18 |   }
19 | 
20 |   it should "properly create constant write option with scala.concurrent.duration.Duration" in {
21 |     val option = TTLOption.constant(ScalaDuration.apply(5, TimeUnit.SECONDS))
22 |     option shouldBe a[StaticWriteOption[_]]
23 |     option.asInstanceOf[StaticWriteOption[Int]].value should be(5)
24 |   }
25 | 
26 |   it should "properly create constant write option with scala.concurrent.duration.Duration.Infinite" in {
27 |     val option = TTLOption.constant(ScalaDuration.Inf)
28 |     option shouldBe a[StaticWriteOption[_]]
29 |     option.asInstanceOf[StaticWriteOption[Int]].value should be(0)
30 |   }
31 | 
32 |   it should "properly create constant write option with org.apache.spark.streaming.Duration" in {
33 |     val option = TTLOption.constant(SparkDuration.apply(5123L))
34 |     option shouldBe a[StaticWriteOption[_]]
35 |     option.asInstanceOf[StaticWriteOption[Int]].value should be(5)
36 |   }
37 | 
38 |   it should "properly create constant write option with org.joda.time.Duration" in {
39 |     val option = TTLOption.constant(JodaDuration.millis(5123L))
40 |     option shouldBe a[StaticWriteOption[_]]
41 |     option.asInstanceOf[StaticWriteOption[Int]].value should be(5)
42 |   }
43 | 
44 |   it should "properly create infinite duration" in {
45 |     val option = TTLOption.forever
46 |     option shouldBe a[StaticWriteOption[_]]
47 |     option.asInstanceOf[StaticWriteOption[Int]].value should be(0)
48 |   }
49 | 
50 |   it should "properly create per-row duration placeholder" in {
51 |     val option = TTLOption.perRow("test")
52 |     option shouldBe a[PerRowWriteOption[_]]
53 |     option.asInstanceOf[PerRowWriteOption[Int]].placeholder should be("test")
54 |   }
55 | 
56 |   "TimestampOption" should "properly create constant write option with timestamp in microseconds" in {
57 |     val option = TimestampOption.constant(12345L)
58 |     option shouldBe a[StaticWriteOption[_]]
59 |     option.asInstanceOf[StaticWriteOption[Long]].value should be(12345L)
60 |   }
61 | 
62 |   it should "properly create constant write option with DateTime" in {
63 |     val option = TimestampOption.constant(new DateTime(2010, 5, 6, 7, 8, 8, 10))
64 |     option shouldBe a[StaticWriteOption[_]]
65 |     option.asInstanceOf[StaticWriteOption[Long]].value should be(new DateTime(2010, 5, 6, 7, 8, 8, 10).getMillis * 1000L)
66 |   }
67 | 
68 |   it should "properly create constant write option with Date" in {
69 |     val t = new Date()
70 |     val option = TimestampOption.constant(t)
71 |     option shouldBe a[StaticWriteOption[_]]
72 |     option.asInstanceOf[StaticWriteOption[Long]].value should be(t.getTime * 1000L)
73 |   }
74 | 
75 | }
76 | 


--------------------------------------------------------------------------------