├── .gitignore
├── README.md
├── build.sbt
├── exampleCsv
├── exampleMapping
├── project
│   └── plugins.sbt
├── run.sh
├── sbt
│   ├── sbt
│   └── sbt-launch.jar
└── src
    └── main
        └── scala
            └── com
                └── datastax
                    └── sparkcsv
                        └── ExampleLoad.scala

/.gitignore:
--------------------------------------------------------------------------------
.idea
*.class
*.log

# sbt specific
.cache/
.history/
.lib/
dist/*
target/
lib_managed/
src_managed/
project/boot/
project/plugins/project/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Spark CSV Loader for Cassandra
==============================
An example tool for using Spark to load a CSV file into Cassandra.
Pull Requests and Issues Welcome!


Spark CSV Loader 1.0
Usage: sparkcsvexample [options] filename keyspace table mapping [master] [cassandraIp]

    filename
        Filename of the CSV to read, e.g. file:///temp/file.csv. If no locator URI is provided, the file is looked up in the Hadoop DefaultFS (CFS on DSE)
    keyspace
        Keyspace to save to
    table
        Table to save to
    mapping
        A file containing the names of the Cassandra columns that the csv columns should map to, comma-delimited
    master
        Spark address of the master node. By default `dsetool sparkmaster` is run to find the master
    cassandraIp
        IP address of the Cassandra server. Defaults to the Spark master IP address
    -m | --maxcores
        Maximum number of cores for this application to use
    -x | --executormemory
        Amount of memory for each executor (JVM style strings)
    -v | --verify
        Run verification checks after inserting data
    --help
        CLI Help


This tool is designed to work with both standalone Apache Spark and Cassandra clusters as well as DataStax
Cassandra/Spark clusters.

Requirements
-------------
(DSE > 4.5.2 or Apache C* > 2.0.5) and Spark > 0.9.1


Building the project
---------------------
To build, go to the home directory of the project and run

    ./sbt/sbt assembly

This will produce a fat jar at `target/scala-2.10/spark-csv-assembly-1.0.jar`, which needs to be included in any running
Spark job. It contains the anonymous functions (closures) which Spark will need when running the job.

Creating the Example Keyspace and Table
--------------------------------
This application assumes that the target keyspace and table already exist. To create
the table used in the example below, run the following commands in cqlsh.

    CREATE KEYSPACE ks WITH replication = {
      'class': 'SimpleStrategy',
      'replication_factor': '1'
    };

    USE ks;

    CREATE TABLE tab (
      key int,
      data1 int,
      data2 int,
      data3 int,
      PRIMARY KEY ((key))
    );


Running with DataStax Enterprise
--------------------------------

When running on a DataStax Enterprise cluster with Spark enabled, the app can be run with the included
run.sh script. This puts the fat jar referenced above on the classpath for the `dse spark-class` call
and runs the application. Running with this method will pick up your spark-env.sh file and correctly place the logs
in your predefined locations.

#### Example

    ./run.sh -m 4 file://`pwd`/exampleCsv ks tab exampleMapping
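
The same call also accepts the optional flags documented in the usage section above. For example, to run with post-insert verification enabled (a sketch of the same example command with the `-v`/`--verify` flag added):

    ./run.sh -m 4 -v file://`pwd`/exampleCsv ks tab exampleMapping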

Running with Apache Cassandra
-------------------------------

We can run directly from sbt using

    # Note that here we need to specify the Spark master URI and Cassandra IP; otherwise
    # the program will try to use DataStax Enterprise's `dsetool` to pick up these values
    ./sbt/sbt "run -m 4 file://`pwd`/exampleCsv ks tab exampleMapping spark://127.0.0.1:7077 127.0.0.1"

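Independent of the built-in `--verify` option, a quick manual sanity check after a load is to count rows in cqlsh (a minimal example, assuming the example keyspace and table created above; loading the bundled exampleCsv should yield 12 rows):

    SELECT COUNT(*) FROM ks.tab;
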
--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
import AssemblyKeys._

name := "spark-csv"

version := "1.0"

organization := "com.datastax"

scalaVersion := "2.10.4"

libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.1.6" % "test"

libraryDependencies += "com.github.scopt" %% "scopt" % "3.2.0"

libraryDependencies += "org.apache.spark" %% "spark-core" % "0.9.1" % "provided"

libraryDependencies += "com.datastax.spark" %% "spark-cassandra-connector" % "1.0.0-beta2"

resolvers += Resolver.sonatypeRepo("public")

// We do this so that Spark dependencies will not be bundled with our fat jar but will still be
// included on the classpath when we do an sbt run
run in Compile <<= Defaults.runTask(fullClasspath in Compile, mainClass in (Compile, run), runner in (Compile, run))

assemblySettings

--------------------------------------------------------------------------------
/exampleCsv:
--------------------------------------------------------------------------------
1,1,1,1
2,2,2,2
3,3,3,3
4,4,4,4
5,5,5,5
6,6,6,6
7,7,7,7
8,8,8,8
9,9,9,9
10,10,10,10
11,11,11,11
12,12,12,12
--------------------------------------------------------------------------------
/exampleMapping:
--------------------------------------------------------------------------------
key,data1,data2,data3
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
resolvers += "Sonatype snapshots" at "http://oss.sonatype.org/content/repositories/snapshots/"

resolvers += Resolver.url("artifactory", url("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns)

addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.9.2")
--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
#!/bin/bash
export SPARK_CLIENT_CLASSPATH=`pwd`/target/scala-2.10/spark-csv-assembly-1.0.jar
if [ ! -f "$SPARK_CLIENT_CLASSPATH" ]; then
    echo "Couldn't find $SPARK_CLIENT_CLASSPATH"
    exit 1
fi

exec dse spark-class com.datastax.sparkcsv.ExampleLoad "$@"
--------------------------------------------------------------------------------
/sbt/sbt:
--------------------------------------------------------------------------------
#!/bin/bash
java -Xmx1200m -XX:MaxPermSize=500m -XX:ReservedCodeCacheSize=128m $EXTRA_ARGS -jar ./sbt/sbt-launch.jar "$@"
--------------------------------------------------------------------------------
/sbt/sbt-launch.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RussellSpitzer/spark-cassandra-csv/cfa5a9a9582f3a423325445e06c4deca691d56c0/sbt/sbt-launch.jar
--------------------------------------------------------------------------------
/src/main/scala/com/datastax/sparkcsv/ExampleLoad.scala:
--------------------------------------------------------------------------------
package com.datastax.sparkcsv

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._
import scala.sys.process._
import com.datastax.spark.connector._

case class Config(master: String = "",
                  filename: String = "exampleCsv",
                  keyspace: String = "ks",
                  table: String = "tab",
                  mapping: String = "exampleMapping",
                  maxCores: Int = 1,
                  executorMemory: String = "2g",
                  verify: Boolean = false,
                  cassandraIp: String = ""
                  )


object ExampleLoad {


  def main(args: Array[String]) {
    if (System.getenv("SPARK_HOME") == null) {
      println("SPARK_HOME is not set\nExiting")
      sys.exit(1)
    }

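    // Matches the first IPv4-looking address in the Spark master string so that the
    // Cassandra connection host can default to the master's IP when no cassandraIp
    // argument is supplied on the command line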
    val ipReg = """\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}""".r

    val parser = new scopt.OptionParser[Config]("sparkcsvexample") {
      head("Spark CSV Loader", "1.0")
      arg[String]("filename") action { (arg, config) => config.copy(filename = arg)} text {
        "Filename of the CSV to read, e.g. file:///temp/file.csv. If no locator URI is provided, the file is looked up in the Hadoop DefaultFS (CFS on DSE)"
      }
      arg[String]("keyspace") action { (arg, config) => config.copy(keyspace = arg)} text {
        "Keyspace to save to"
      }
      arg[String]("table") action { (arg, config) => config.copy(table = arg)} text {
        "Table to save to"
      }
      arg[String]("mapping") action { (arg, config) => config.copy(mapping = arg)} text {
        "A file containing the names of the Cassandra columns that the csv columns should map to, comma-delimited"
      }
      arg[String]("master") optional() action { (arg, config) => config.copy(master = arg)} text {
        "Spark address of the master node. By default `dsetool sparkmaster` is run to find the master; localhost is used if dsetool is not available"
      }
      arg[String]("cassandraIp") optional() action { (arg, config) => config.copy(cassandraIp = arg)} text {
        "IP address of the Cassandra server. Defaults to the Spark master IP address"
      }
      opt[Int]('m', "maxcores") optional() action { (arg, config) => config.copy(maxCores = arg)} text {
        "Maximum number of cores for this application to use"
      }
      opt[String]('x', "executormemory") optional() action { (arg, config) => config.copy(executorMemory = arg)} text {
        "Amount of memory for each executor (JVM style strings)"
      }
      opt[Unit]('v', "verify") optional() action { (_, config) => config.copy(verify = true)} text {
        "Run verification checks after inserting data"
      }

      help("help") text {
        "CLI Help"
      }
    }

    // Calls a subprocess (`dsetool sparkmaster`) to find the Spark master; falls back to localhost
    val master = try { "dsetool sparkmaster".!!.trim } catch { case x: Exception => "localhost" }
    val cassandraIp = ipReg findFirstIn master match {
      case Some(ip) => ip
      case None => "127.0.0.1"
    }

    parser.parse(args, Config(master = master, cassandraIp = cassandraIp)) map { config =>
      println("SparkMaster: " + config.master)
      println("CassandraIP: " + config.cassandraIp)
      loadCSV(config)
    } getOrElse {
      System.exit(1)
    }
  }

  def loadCSV(config: Config) {

    // Read in the mapping file
    val mappingString = scala.io.Source.fromFile(config.mapping).getLines.mkString
    val mappingArray = mappingString.split(",")

    val sparkconf = new SparkConf()
      .setMaster(config.master)
      .setAppName("SparkExample: Load CSV")
      .setSparkHome(System.getenv("SPARK_HOME"))
      .setJars(Array(System.getProperty("user.dir") + "/target/scala-2.10/spark-csv-assembly-1.0.jar"))
      .set("spark.cores.max", config.maxCores.toString)
      .set("spark.executor.memory", config.executorMemory)
      .set("spark.cassandra.connection.host", config.cassandraIp)

    // Make a Spark context
    val sc = new SparkContext(sparkconf)

    // Make a CassandraRDD for our target table
    val cassRDD = sc.cassandraTable(config.keyspace, config.table)

    // Make an RDD from a text file and split it on ','
    println(config.filename)
    val textFileRDD = sc.textFile(config.filename)
    val lineRDD = textFileRDD.map { line => line.split(",", mappingArray.length) }

    // Print a quick diagnostic about what we are about to do
    println("About to do the following inserts")
    println(mappingArray.mkString("\t"))
    println(lineRDD.take(5).map {
      _.mkString("\t")
    }.mkString("\n"))
    println("...")

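    // Zip the column names from the mapping file with the split values of each CSV line and
    // build a CassandraRow keyed by column name. With the bundled example files, the first
    // line becomes ("key","data1","data2","data3") zip ("1","1","1","1"), i.e. the row
    // {key: "1", data1: "1", data2: "1", data3: "1"}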
    val insertRDD = lineRDD.map { elementArray => CassandraRow.fromMap((mappingArray zip elementArray).toMap) }

    // Count the lines to check whether or not we have inserted as many lines as we had
    var csvLineCount = 0L
    var cassRDDCount = 0L
    if (config.verify) {
      csvLineCount = lineRDD.count
      cassRDDCount = cassRDD.count
    }

    // Save the text file to Cassandra
    insertRDD.saveToCassandra(config.keyspace, config.table)

    if (config.verify) {
      val rddNewCount = cassRDD.count()
      println(s"Lines in CSV File: $csvLineCount")
      println(s"Lines in Table Before Insert: $cassRDDCount")
      println(s"Lines in Table After Insert : $rddNewCount")
      if (rddNewCount - cassRDDCount != csvLineCount) {
        println("Some lines were either not added or were overwritten, checking inserted data")
        val insertRDDKV = insertRDD.map(row => (mappingArray.map { col => row.get[Any](col) }.mkString(","), 1))
        val cassRDDKV = cassRDD.map(row => (mappingArray.map { col => row.get[Any](col) }.mkString(","), 1))
        val missingRows = insertRDDKV.leftOuterJoin(cassRDDKV).filter(kv => kv._2._2.isEmpty)
        missingRows.collect.foreach(row => println("Not Found in C*: " + row._1))
        val missingRowCount = missingRows.count
        println(s"Found $missingRowCount Missing Rows")
        missingRows.foreach(row => println(row._1.toString()))
      }
    }

    println("Finished")

  }
}
--------------------------------------------------------------------------------