├── README.md ├── lib ├── README.md └── mongo-hadoop-core_2.2.0-1.2.0.jar ├── pom.xml └── src └── main └── scala └── de └── kp └── spark └── connect ├── ConnectConfig.scala ├── GaRDD.scala ├── GaReader.scala ├── GaSource.scala ├── SQLSource.scala ├── aerospike ├── AerospikeReader.scala └── AerospikeSource.scala ├── cassandra ├── CassandraReader.scala └── CassandraSource.scala ├── dmp └── CxenseClient.scala ├── elasticsearch ├── ElasticReader.scala └── ElasticSource.scala ├── hbase ├── HBaseReader.scala └── HBaseSource.scala ├── jdbc ├── JdbcReader.scala └── JdbcSource.scala ├── log ├── ApacheLogAnalyzer.scala └── ApacheLogParser.scala ├── mongodb ├── MongoReader.scala └── MongoSource.scala ├── parquet ├── ParquetReader.scala └── ParquetSource.scala └── shop ├── BigClient.scala ├── BigDataset.scala ├── ShopifyClient.scala └── ShopifyRDD.scala /README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Common Access Layer for Apache Spark 3 | 4 | [Predictiveworks](http://predictiveworks.eu) supports raw data retrieval from multiple NoSQL and JDBC data sources. 5 | 6 | Read requests are supported for the following big data sources: 7 | 8 | * Cassandra 9 | * Elasticsearch 10 | * HBase 11 | * MongoDB 12 | * Parquet 13 | 14 | In addition, this project also provides an increasing number of connector to data sources relevant for analytics: 15 | 16 | * Google Analytics v3 17 | * Shopify 18 | -------------------------------------------------------------------------------- /lib/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## MongoDB Hadoop Connector 3 | 4 | The Maven repositories contain the mongo-hadoop connector for several different Hadoop versions, 5 | but not for 2.2.0. Therefore the mongo-hadoop connector is included as an unmanaged library. -------------------------------------------------------------------------------- /lib/mongo-hadoop-core_2.2.0-1.2.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skrusche63/spark-connect/aa8112941863526c7a6397da92a86a82146602da/lib/mongo-hadoop-core_2.2.0-1.2.0.jar -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | spark-connect 4 | spark-connect 5 | 0.0.1 6 | Spark-Connect 7 | Common Access Layer for Predictiveworks 8 | 2010 9 | 10 | 11 | My License 12 | http://.... 
13 | repo 14 | 15 | 16 | 17 | 18 | 1.6 19 | 1.6 20 | UTF-8 21 | 3.0.10.Final 22 | 2.10 23 | 2.10.2 24 | 1.2.0 25 | 26 | 27 | 28 | 29 | org.scala-lang 30 | scala-library 31 | ${scala.version} 32 | 33 | 34 | 35 | 36 | junit 37 | junit 38 | 4.11 39 | test 40 | 41 | 42 | org.specs2 43 | specs2_${scala.tools.version} 44 | 1.13 45 | test 46 | 47 | 48 | org.scalatest 49 | scalatest_${scala.tools.version} 50 | 2.0.M6-SNAP8 51 | test 52 | 53 | 54 | 55 | 56 | org.apache.spark 57 | spark-core_2.10 58 | ${spark.version} 59 | 60 | 61 | 62 | 63 | org.apache.spark 64 | spark-sql_2.10 65 | ${spark.version} 66 | 67 | 68 | 69 | 70 | cascading 71 | cascading-core 72 | 2.5.4 73 | 74 | 75 | 76 | cascading 77 | cascading-hadoop 78 | 2.5.4 79 | 80 | 81 | 82 | 83 | org.elasticsearch 84 | elasticsearch-hadoop 85 | 2.0.0 86 | 87 | 88 | 89 | 90 | org.elasticsearch 91 | elasticsearch 92 | 1.3.2 93 | 94 | 95 | 99 | 100 | org.mongodb 101 | mongo-java-driver 102 | 2.11.4 103 | 104 | 105 | 109 | 110 | com.datastax.spark 111 | spark-cassandra-connector_2.10 112 | 1.2.0-alpha1 113 | 114 | 115 | 116 | 117 | org.apache.hbase 118 | hbase-common 119 | 0.98.8-hadoop2 120 | 121 | 122 | 123 | org.apache.hbase 124 | hbase-client 125 | 0.98.8-hadoop2 126 | 127 | 128 | 129 | org.apache.hbase 130 | hbase-server 131 | 0.98.8-hadoop2 132 | 133 | 134 | 135 | mysql 136 | mysql-connector-java 137 | 5.1.31 138 | 139 | 140 | 141 | 142 | com.google.gdata 143 | gdata-core-1.0 144 | 1.41.5 145 | 146 | 147 | 148 | com.google.gdata 149 | gdata-analytics-2.1 150 | 1.41.5 151 | 152 | 153 | org.jboss.resteasy 154 | resteasy-jaxb-provider 155 | ${resteasy.version} 156 | 157 | 158 | 159 | 160 | org.jboss.resteasy 161 | resteasy-jackson-provider 162 | ${resteasy.version} 163 | 164 | 165 | 166 | org.jboss.resteasy 167 | resteasy-client 168 | ${resteasy.version} 169 | 170 | 171 | 176 | 177 | com.fasterxml.jackson.module 178 | jackson-module-scala_2.10 179 | 2.3.1 180 | 181 | 182 | 183 | 184 | org.scribe 185 | scribe 186 | 1.3.7 187 | 188 | 189 | 190 | 191 | com.aerospike 192 | aerospike-client 193 | 3.0.34 194 | 195 | 196 | 197 | 198 | 199 | 200 | conjars.org 201 | http://conjars.org/repo 202 | 203 | 204 | Mandubian Repository 205 | http://mandubian-mvn.googlecode.com/svn/trunk/mandubian-mvn/repository/ 206 | 207 | 208 | 209 | 210 | src/main/scala 211 | src/test/scala 212 | 213 | 214 | 215 | net.alchim31.maven 216 | scala-maven-plugin 217 | 3.1.3 218 | 219 | 220 | 221 | compile 222 | testCompile 223 | 224 | 225 | 226 | -make:transitive 227 | -dependencyfile 228 | ${project.build.directory}/.scala_dependencies 229 | 230 | 231 | 232 | 233 | 234 | 235 | org.apache.maven.plugins 236 | maven-surefire-plugin 237 | 2.13 238 | 239 | false 240 | true 241 | 242 | 243 | 244 | **/*Test.* 245 | **/*Suite.* 246 | 247 | 248 | 249 | 250 | 251 | 252 | Dr. Krusche & Partner PartG 253 | http://dr-kruscheundpartner.com 254 | 255 | 256 | -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/ConnectConfig.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 
6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | import org.apache.hadoop.conf.{Configuration => HConf} 21 | 22 | trait ConnectConfig { 23 | 24 | /** 25 | * This method retrieves the settings to access 26 | * an Aerospike Cluster 27 | */ 28 | def aerospike:Map[String,String] 29 | /** 30 | * This method retrieves the settings to access 31 | * a Cassandra Cluster 32 | */ 33 | def cassandra:Map[String,String] 34 | /** 35 | * This method retrieves a Hadoop configuration 36 | * to access Elasticsearch 37 | */ 38 | def elastic:HConf 39 | /** 40 | * This method retrieves the settings to access 41 | * Google Analytics 42 | */ 43 | def ga:Map[String,String] 44 | /** 45 | * This method retrieves the settings to access 46 | * HBase 47 | */ 48 | def hbase:Map[String,String] 49 | /** 50 | * This method retrieves a Hadoop configuration 51 | * to access MongoDB 52 | */ 53 | def mongo:HConf 54 | /** 55 | * This method retrieves the access parameter for a MySQL 56 | * data source, comprising url, db, user, password 57 | */ 58 | def mysql:Map[String,String] 59 | /** 60 | * This method retrieves Apache Spark configuration 61 | */ 62 | def spark:Map[String,String] 63 | 64 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/GaRDD.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 
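// Editor's sketch (hypothetical, not part of the original source): a minimal
// ConnectConfig implementation backed by plain Maps, added to illustrate the
// trait defined in ConnectConfig.scala above. Section names and keys are
// placeholders; a real deployment would load them from its application config.
import org.apache.hadoop.conf.{Configuration => HConf}
import de.kp.spark.connect.ConnectConfig

class MapConnectConfig(settings: Map[String, Map[String, String]]) extends ConnectConfig {

  /* Look up one configuration section, e.g. "cassandra" or "mysql" */
  private def section(name: String): Map[String, String] =
    settings.getOrElse(name, Map.empty[String, String])

  /* Copy a section into a Hadoop Configuration (used for es-hadoop and mongo-hadoop) */
  private def toHConf(name: String): HConf = {
    val conf = new HConf()
    section(name).foreach { case (k, v) => conf.set(k, v) }
    conf
  }

  def aerospike: Map[String, String] = section("aerospike")
  def cassandra: Map[String, String] = section("cassandra")
  def elastic: HConf = toHConf("elastic")
  def ga: Map[String, String] = section("ga")
  def hbase: Map[String, String] = section("hbase")
  def mongo: HConf = toHConf("mongo")
  def mysql: Map[String, String] = section("mysql")
  def spark: Map[String, String] = section("spark")
}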
19 | */ 20 | 21 | import java.net.URL 22 | 23 | import org.apache.spark.{Partition,SparkContext,TaskContext} 24 | import org.apache.spark.TaskKilledException 25 | 26 | import org.apache.spark.rdd.RDD 27 | 28 | import org.apache.spark.util.NextIterator 29 | 30 | import com.google.gdata.client.analytics.{AnalyticsService,DataQuery} 31 | import com.google.gdata.data.analytics.{DataEntry,DataFeed} 32 | 33 | import scala.collection.JavaConversions._ 34 | import scala.collection.mutable.Buffer 35 | 36 | case class GaRow(columns:Seq[GaColumn]) 37 | case class GaColumn(name:String,category:String,datatype:String,value:String) 38 | 39 | class GaPartition(idx:Int,val startIndex:Int,val maxResult:Int) extends Partition { 40 | override def index = idx 41 | } 42 | 43 | 44 | class GaRDD( 45 | /* Reference to SparkContext */ 46 | @transient sc:SparkContext, 47 | /* Request parameters */ 48 | params:Map[String,String], 49 | /* Total number of results */ 50 | numResults:Int, 51 | /* Total number of partitions */ 52 | numPartitions:Int) extends RDD[GaRow](sc,Nil) { 53 | 54 | override def getPartitions:Array[Partition] = { 55 | 56 | /* 57 | * The maximum number of results returned with a request; 58 | * note, that the Analytics Core Reporting API returns a 59 | * maximum of 10,000 rows per request, no matter how many 60 | * one asks for 61 | */ 62 | val maxResult = Math.round(numResults.toDouble / numPartitions).toInt 63 | 64 | (0 until numPartitions).map(i => { 65 | 66 | val startIx = 1 + i * maxResult 67 | new GaPartition(i,startIx,maxResult) 68 | 69 | }).toArray 70 | 71 | } 72 | 73 | override def compute(thePart:Partition,context:TaskContext) = new Iterator[GaRow] { 74 | 75 | private var closed = false 76 | private var finished = false 77 | 78 | context.addTaskCompletionListener{ context => closeIfNeeded() } 79 | 80 | private val partition = thePart.asInstanceOf[GaPartition] 81 | private val query = buildQuery(partition) 82 | 83 | val service = buildService 84 | val datafeed = service.getFeed(query.getUrl,classOf[DataFeed]) 85 | 86 | val dataset = datafeed.getEntries.map(mapEntry(_)).toIterator 87 | 88 | /* 89 | * Build query and determine maximum number of results 90 | * from the request parameters (or default = 10.000) 91 | */ 92 | def hasNext:Boolean = { 93 | 94 | if (context.isInterrupted()) 95 | throw new TaskKilledException 96 | 97 | !finished && dataset.hasNext 98 | 99 | } 100 | 101 | def next:GaRow = { 102 | 103 | if (hasNext) { 104 | dataset.next 105 | 106 | } else { 107 | 108 | finished = true 109 | null.asInstanceOf[GaRow] 110 | 111 | } 112 | 113 | } 114 | 115 | def closeIfNeeded() { 116 | if (!closed) { 117 | close() 118 | closed = true 119 | } 120 | } 121 | 122 | def close() { 123 | /* 124 | * The connection to a GData service is properly closed 125 | * after the request has been performed; this implies 126 | * that we do nothing here 127 | */ 128 | } 129 | 130 | private def mapEntry(entry:DataEntry):GaRow = { 131 | 132 | val columns = Buffer.empty[GaColumn] 133 | 134 | /* DIMENSIONS */ 135 | val dimensions = entry.getDimensions 136 | if (!dimensions.isEmpty) { 137 | dimensions.map(dimension => GaColumn(dimension.getName,"dimension","string",dimension.getValue)) 138 | } 139 | 140 | /* METRICS */ 141 | val metrics = entry.getMetrics 142 | metrics.map(metric => GaColumn(metric.getName,"metric",metric.getType,metric.getValue)) 143 | 144 | GaRow(columns.toSeq) 145 | 146 | } 147 | 148 | private def buildQuery(partition:GaPartition):DataQuery = { 149 | 150 | /* REQURED */ 151 | val query = new 
DataQuery(new URL(params("url"))) 152 | 153 | /* REQUIRED */ 154 | val start_date = params("start_date") 155 | query.setStartDate(start_date) 156 | 157 | val end_date = params("end_date") 158 | query.setEndDate(end_date) 159 | 160 | /* 161 | * REQUIRED 162 | * 163 | * The aggregated statistics for user activity in a view (profile), 164 | * such as clicks or pageviews. When queried by alone, metrics provide 165 | * the total values for the requested date range, such as overall pageviews 166 | * or total bounces. 167 | * 168 | * However, when requested with dimensions, values are segmented by the dimension. 169 | * For example, ga:pageviews requested with ga:country returns the total pageviews 170 | * per country. 171 | * 172 | * When requesting metrics, keep in mind: All requests require at least one metric. 173 | * 174 | * You can supply a maximum of 10 metrics for any query.Not all dimensions and metrics 175 | * can be used together. Consult the Valid Combinations tool to see which combinations 176 | * work together. 177 | * 178 | */ 179 | val metrics = params("metrics") 180 | query.setMetrics(metrics) 181 | /* 182 | * REQUIRED 183 | * 184 | * The unique table ID used to retrieve the Analytics Report data. 185 | */ 186 | val table_id = params("table_id") 187 | query.setIds(table_id) 188 | 189 | /* OPTIONAL */ 190 | if (params.contains("dimensions")) { 191 | query.setDimensions(params("dimensions")) 192 | } 193 | 194 | /* OPTIONAL */ 195 | if (params.contains("filters")) { 196 | query.setFilters(params("filters")) 197 | } 198 | 199 | /* OPTIONAL */ 200 | if (params.contains("sort")) { 201 | query.setSort(params("sort")) 202 | } 203 | 204 | query.setStartIndex(partition.startIndex) 205 | query.setMaxResults(partition.maxResult) 206 | 207 | query 208 | 209 | } 210 | 211 | private def buildService:AnalyticsService = { 212 | 213 | val app_name = params("app_name") 214 | val analytics = new AnalyticsService(app_name) 215 | 216 | val user_name = params("user_name") 217 | val password = params("password") 218 | 219 | analytics.setUserCredentials(user_name,password) 220 | analytics 221 | 222 | } 223 | 224 | } 225 | 226 | } 227 | -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/GaReader.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 
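// Editor's sketch (hypothetical, not part of the original source): a minimal
// driver showing how the GaRDD above could be instantiated. The parameter keys
// mirror the lookups in buildQuery/buildService; all concrete values (feed URL,
// table id, credentials, metrics) are placeholders, and the URL is assumed to
// be the classic Analytics data feed endpoint used by the gdata client.
import org.apache.spark.{SparkConf, SparkContext}
import de.kp.spark.connect.GaRDD

object GaRDDExample {

  def main(args: Array[String]): Unit = {

    val sc = new SparkContext(new SparkConf().setAppName("ga-example").setMaster("local[2]"))

    val params = Map(
      /* Feed endpoint, report range, metrics and table id (all placeholders) */
      "url"        -> "https://www.google.com/analytics/feeds/data",
      "start_date" -> "2014-01-01",
      "end_date"   -> "2014-01-31",
      "metrics"    -> "ga:pageviews",
      "dimensions" -> "ga:country",
      "table_id"   -> "ga:12345678",
      /* Credentials consumed by buildService (placeholders) */
      "app_name"   -> "spark-connect-example",
      "user_name"  -> "user@example.com",
      "password"   -> "secret"
    )

    /* 1,000 rows split across 4 partitions, i.e. 250 rows per request */
    val rows = new GaRDD(sc, params, numResults = 1000, numPartitions = 4)
    rows.map(r => r.columns.map(c => c.name + "=" + c.value).mkString(",")).take(10).foreach(println)

    sc.stop()
  }
}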
19 | */ 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.rdd.RDD 23 | 24 | class GaReader(@transient sc:SparkContext) extends Serializable { 25 | 26 | def read(config:ConnectConfig,params:Map[String,String]):RDD[Map[String,Any]] = { 27 | 28 | val settings = config.ga 29 | 30 | val req_params = params ++ Map( 31 | 32 | "app_name" -> params("app_name"), 33 | 34 | "user_name" -> params("user_name"), 35 | "password" -> params("password") 36 | ) 37 | 38 | val numResults = params("num_results").toInt 39 | val numPartitions = params("num_partitions").toInt 40 | 41 | val source = new GaRDD(sc,req_params,numResults,numPartitions) 42 | source.map(toMap(_)) 43 | 44 | } 45 | 46 | private def toMap(row:GaRow):Map[String,Any] = { 47 | 48 | val columns = row.columns 49 | columns.map(column => { 50 | 51 | val k = column.name 52 | val v = if (column.category == "dimension") { 53 | column.value 54 | 55 | } else { 56 | 57 | column.datatype match { 58 | /* 59 | * The datatype 'integer' describes a Long (see Metric 60 | * implementation); all other values describe Doubles 61 | */ 62 | case "integer" => column.value.toLong 63 | /* 64 | * currency, us_currency, float, percent, time 65 | */ 66 | case _ => column.value.toDouble 67 | } 68 | } 69 | 70 | (k,v) 71 | 72 | }).toMap 73 | 74 | } 75 | 76 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/GaSource.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.rdd.RDD 23 | 24 | class GaSource(@transient sc:SparkContext) extends Serializable { 25 | 26 | def connect(config:ConnectConfig,requestParams:Map[String,String]):RDD[Map[String,Any]] = { 27 | new GaReader(sc).read(config,requestParams) 28 | 29 | } 30 | 31 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/SQLSource.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 
11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import org.apache.spark.rdd.RDD 22 | import org.apache.spark.sql._ 23 | 24 | import de.kp.spark.connect.aerospike.AerospikeSource 25 | import de.kp.spark.connect.cassandra.CassandraSource 26 | 27 | import de.kp.spark.connect.elasticsearch.ElasticSource 28 | import de.kp.spark.connect.hbase.HBaseSource 29 | 30 | import de.kp.spark.connect.jdbc.JdbcSource 31 | import de.kp.spark.connect.mongodb.MongoSource 32 | 33 | import de.kp.spark.connect.parquet.ParquetSource 34 | 35 | object Sources { 36 | 37 | val AEROSPIKE:String = "aerospike" 38 | val CASSANDRA:String = "cassandra" 39 | 40 | val ELASTICSEARCH:String = "elasticsearch" 41 | 42 | val HBASE:String = "hbase" 43 | val JDBC:String = "jdbc" 44 | 45 | val MONGODB:String = "mongodb" 46 | val PARQUET:String = "parquet" 47 | 48 | } 49 | 50 | class SQLSource( 51 | @transient sqlContext:SQLContext, 52 | config:ConnectConfig, 53 | source:String, 54 | table:String, 55 | schema:StructType, 56 | params:Map[String,String]) extends Serializable { 57 | 58 | /* 59 | * Retrieve dataset from source and convert 60 | * result into Row 61 | */ 62 | private val names = sqlContext.sparkContext.broadcast(schema.fieldNames) 63 | 64 | private val rowRDD = getRDD.map(rec => { 65 | val values = names.value.map(name => rec(name)) 66 | Row.fromSeq(values) 67 | }) 68 | 69 | /* 70 | * Apply schema to rows and register as table 71 | */ 72 | private val tableRDD = sqlContext.applySchema(rowRDD, schema) 73 | tableRDD.registerTempTable(table) 74 | 75 | def executeQuery(query:String):SchemaRDD = sqlContext.sql(query) 76 | 77 | private def getRDD:RDD[Map[String,Any]] = { 78 | 79 | val sc = sqlContext.sparkContext 80 | val columns = schema.fieldNames 81 | 82 | source match { 83 | 84 | case Sources.AEROSPIKE => { 85 | /* 86 | * Column names are called bin names in the 87 | * terminology of Aerospike 88 | */ 89 | val req_params = params ++ Map("binnames" -> columns.mkString(",")) 90 | new AerospikeSource(sc).read(config,req_params) 91 | 92 | } 93 | case Sources.CASSANDRA => { 94 | 95 | val req_params = params ++ Map("columns" -> columns.mkString(",")) 96 | new CassandraSource(sc).read(config,req_params) 97 | 98 | } 99 | case Sources.ELASTICSEARCH => new ElasticSource(sc).read(config,params) 100 | case Sources.HBASE => { 101 | 102 | val types = schema.fields.map(field => { 103 | 104 | field.dataType match { 105 | 106 | case DoubleType => "double" 107 | case IntegerType => "integer" 108 | 109 | case LongType => "long" 110 | case StringType => "string" 111 | 112 | case _ => throw new Exception("Data type is not supported.") 113 | } 114 | 115 | }) 116 | 117 | val req_params = params ++ Map("names" -> columns.mkString(","), "types" -> types.mkString(",")) 118 | new HBaseSource(sc).read(config,req_params) 119 | 120 | } 121 | case Sources.JDBC => { 122 | 123 | val req_params = params ++ Map("fields" -> columns.mkString(",")) 124 | new JdbcSource(sc).read(config,req_params) 125 | 126 | } 127 | case Sources.MONGODB => new MongoSource(sc).read(config,params) 128 | 129 | case Sources.PARQUET => { 130 | 131 | val req_params = params ++ Map("fields" -> columns.mkString(",")) 132 | new 
ParquetSource(sc).read(config,params) 133 | 134 | } 135 | 136 | case _ => throw new Exception(String.format("""Data source %s is not supported.""",source)) 137 | 138 | } 139 | 140 | } 141 | 142 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/aerospike/AerospikeReader.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.aerospike 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.rdd.RDD 23 | 24 | import org.apache.hadoop.conf.{Configuration => HConfig} 25 | 26 | import com.aerospike.hadoop._ 27 | import de.kp.spark.connect.ConnectConfig 28 | 29 | import scala.collection.JavaConversions._ 30 | 31 | class AerospikeReader(@transient sc:SparkContext) extends Serializable { 32 | 33 | /* 34 | * Background to Aerospike: 35 | * 36 | * At the highest level, data is collected in containers called namespaces; 37 | * namespaces are similar to databases. Within a namespace, data are divided 38 | * into sets (equivalent to tables), and finally records (rows). 
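// Editor's sketch (hypothetical, not part of the original source): request
// parameter maps as expected by read(...) below. The key names mirror the
// lookups in that method (note that the set name is taken from the key
// "setnames"); namespace, set and bin values are placeholders.
private val exampleScanParams: Map[String, String] = Map(
  "namespace" -> "test",
  "setnames"  -> "demo",
  "binnames"  -> "bin1,bin2",
  "operation" -> "scan"
)

// A secondary-index range query additionally names the bin and its bounds
private val exampleRangeParams: Map[String, String] = exampleScanParams ++ Map(
  "operation"      -> "numrange",
  "numrange_bin"   -> "bin1",
  "numrange_begin" -> "100",
  "numrange_end"   -> "200"
)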
39 | */ 40 | def read(config:ConnectConfig,params:Map[String,String]):RDD[Map[String,Any]] = { 41 | 42 | val settings = config.aerospike 43 | 44 | val conf = new HConfig() 45 | /* Add host & port to configuration */ 46 | val host = if (settings.contains("aerospike.input.host")) 47 | settings("aerospike.input.host") else "localhost" 48 | 49 | conf.set("aerospike.input.host", host) 50 | 51 | val port = if (settings.contains("aerospike.input.port")) 52 | settings("aerospike.input.port") else "3000" 53 | 54 | conf.set("aerospike.input.port", port) 55 | 56 | /* Add namespace and set name to configuration */ 57 | conf.set("aerospike.input.namespace",params("namespace")) 58 | conf.set("aerospike.input.setname",params("setnames")) 59 | 60 | /* Add bin names & operation */ 61 | val binnames = if (params.contains("binnames")) 62 | params("binnames") else "" 63 | 64 | conf.set("aerospike.input.binnames",binnames) 65 | 66 | val operation = if (params.contains("operation")) 67 | params("operation") else "scan" 68 | 69 | conf.set("aerospike.input.operation",operation) 70 | 71 | if (operation == "numrange") { 72 | 73 | conf.set("aerospike.input.numrange.bin",params("numrange_bin")) 74 | 75 | conf.set("aerospike.input.numrange.begin",params("numrange_begin")) 76 | conf.set("aerospike.input.numrange.end",params("numrange_end")) 77 | 78 | } 79 | 80 | read(conf) 81 | 82 | } 83 | 84 | def read(config:HConfig):RDD[Map[String,Any]] = { 85 | 86 | val source = sc.newAPIHadoopRDD(config, classOf[AerospikeInputFormat], classOf[AerospikeKey], classOf[AerospikeRecord]) 87 | source.map{case(key,record) => toMap(key,record)} 88 | 89 | } 90 | 91 | private def toMap(key:AerospikeKey,record:AerospikeRecord):Map[String,Any] = { 92 | 93 | val bins = record.bins 94 | bins.toMap 95 | 96 | } 97 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/aerospike/AerospikeSource.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.aerospike 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 
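// Editor's sketch (hypothetical, not part of the original source): wiring an
// Aerospike set into Spark SQL through the SQLSource class defined in
// SQLSource.scala above. The schema, table name and parameters are
// placeholders, and `config` is assumed to be some concrete ConnectConfig
// implementation (e.g. the MapConnectConfig sketched earlier).
import org.apache.spark.SparkContext
import org.apache.spark.sql._

import de.kp.spark.connect.{ConnectConfig, SQLSource}

object SQLSourceExample {

  def run(sc: SparkContext, config: ConnectConfig): Unit = {

    val sqlContext = new SQLContext(sc)

    /* The field names double as the Aerospike bin names requested by SQLSource */
    val schema = StructType(Seq(
      StructField("user", StringType, false),
      StructField("amount", LongType, false)
    ))

    val params = Map(
      "namespace" -> "test",
      "setnames"  -> "orders"
    )

    /* Registers the temp table "orders" and runs plain Spark SQL against it */
    val source = new SQLSource(sqlContext, config, "aerospike", "orders", schema, params)
    val result = source.executeQuery("SELECT user, SUM(amount) FROM orders GROUP BY user")

    result.collect().foreach(println)
  }
}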
19 | */ 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.rdd.RDD 23 | 24 | import de.kp.spark.connect.ConnectConfig 25 | 26 | class AerospikeSource(@transient sc:SparkContext) extends Serializable { 27 | 28 | def read(config:ConnectConfig,params:Map[String,String]):RDD[Map[String,Any]] = { 29 | new AerospikeReader(sc).read(config,params) 30 | } 31 | 32 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/cassandra/CassandraReader.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.cassandra 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.rdd.RDD 23 | 24 | import com.datastax.spark.connector._ 25 | import de.kp.spark.connect.ConnectConfig 26 | 27 | class CassandraReader(@transient sc:SparkContext) extends Serializable { 28 | /** 29 | * This method reads the content of a Cassandra table of a specific 30 | * keyspace. Actually, all data records are retrieved from the table 31 | */ 32 | def read(config:ConnectConfig,keyspace:String,table:String,columns:List[String] = List.empty[String]):RDD[Map[String,Any]] = { 33 | 34 | val settings = config.cassandra 35 | val host = settings("spark.cassandra.connection.host") 36 | 37 | /* 38 | * We add the configuration parameters 39 | * to connect to a Cassandra cluster here 40 | */ 41 | sc.getConf.set("spark.cassandra.connection.host",host) 42 | /* 43 | * Read from specified keyspace and table; note, that the number 44 | * of entries to be returned must be specified 45 | */ 46 | val source = if (columns.isEmpty) 47 | sc.cassandraTable(keyspace, table) else sc.cassandraTable(keyspace, table).select(columns.map(ColumnName(_)):_*) 48 | 49 | source.map(toMap(_)) 50 | 51 | } 52 | 53 | /** 54 | * For the primitive data types required by the different 55 | * engines of Predictiveworks, the conversion of the column 56 | * names and values using the toMap method is sufficient. 57 | * 58 | * In case of more complex data types, this method must be 59 | * adapted to these additional requirements 60 | */ 61 | private def toMap(row:CassandraRow):Map[String,Any] = { 62 | row.toMap 63 | } 64 | 65 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/cassandra/CassandraSource.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.cassandra 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 
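// Editor's sketch (hypothetical, not part of the original source): reading a
// Cassandra table through the CassandraSource defined below. Keyspace, table
// and column names are placeholders; config.cassandra is expected to provide
// "spark.cassandra.connection.host".
import org.apache.spark.SparkContext
import de.kp.spark.connect.ConnectConfig

object CassandraSourceExample {

  def run(sc: SparkContext, config: ConnectConfig): Unit = {

    val params = Map(
      "keyspace" -> "shop",
      "table"    -> "orders",
      /* Optional projection; omit "columns" to fetch all columns */
      "columns"  -> "user,amount"
    )

    val rows = new CassandraSource(sc).read(config, params)
    rows.take(5).foreach(println)
  }
}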
6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.rdd.RDD 23 | 24 | import de.kp.spark.connect.ConnectConfig 25 | 26 | class CassandraSource(@transient sc:SparkContext) extends Serializable { 27 | 28 | def read(config:ConnectConfig,params:Map[String,String]):RDD[Map[String,Any]] = { 29 | 30 | val keyspace = params("keyspace") 31 | val table = params("table") 32 | 33 | val columns = if (params.contains("columns")) params("columns").split(",").toList else List.empty[String] 34 | 35 | new CassandraReader(sc).read(config,keyspace,table,columns) 36 | 37 | } 38 | 39 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/dmp/CxenseClient.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.dmp 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 
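// Editor's sketch (hypothetical, not part of the original source): calling the
// CxenseClient defined below. Username, secret and the whole request payload
// are placeholders; the key names only follow the general shape of the Cxense
// /traffic endpoint as understood by the editor and are not verified against
// the API documentation.
object CxenseExample {

  def main(args: Array[String]): Unit = {

    val client = new CxenseClient("api@example.com", "api-secret")

    val request: Map[String, Any] = Map(
      "siteId" -> "1234567890",
      "start"  -> "-7d",
      "stop"   -> "now",
      "fields" -> List("events", "uniqueUsers")
    )

    /* Signed with the HMAC header built in getAuthenticationHeader */
    val response = client.getTraffic(request)
    println(response)
  }
}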
19 | */ 20 | 21 | import com.fasterxml.jackson.databind.{Module, ObjectMapper} 22 | import com.fasterxml.jackson.module.scala.DefaultScalaModule 23 | 24 | import javax.ws.rs.HttpMethod 25 | 26 | import javax.ws.rs.client.{ClientBuilder,Entity} 27 | import javax.ws.rs.core.MediaType 28 | 29 | import javax.crypto.Mac 30 | import javax.crypto.spec.SecretKeySpec 31 | 32 | import org.joda.time.DateTime 33 | import org.joda.time.DateTimeZone 34 | import org.joda.time.format.ISODateTimeFormat 35 | 36 | import org.apache.commons.codec.binary.Base64 37 | 38 | class CxenseClient(username:String,secret:String) { 39 | 40 | private val CXENSE_URI = "https://api.cxense.com" 41 | 42 | private val JSON_MAPPER = new ObjectMapper() 43 | JSON_MAPPER.registerModule(DefaultScalaModule) 44 | 45 | def getProfileContentFetch(params:Map[String,Any]):Map[String,Any] = { 46 | 47 | val endpoint = "profile/content/fetch" 48 | getResponse(endpoint,params) 49 | 50 | } 51 | 52 | def getProfileUserExternalRead(params:Map[String,Any]):Map[String,Any] = { 53 | 54 | val endpoint = "profile/user/external/read" 55 | getResponse(endpoint,params) 56 | 57 | } 58 | /* 59 | * Collect interest profile for a certain user; the relevant 60 | * part of the response (profile) is equivalent to the 'content 61 | * fetch' request 62 | */ 63 | def getProfileUser(params:Map[String,Any]):Map[String,Any] = { 64 | 65 | val endpoint = "profile/user" 66 | getResponse(endpoint,params) 67 | 68 | } 69 | 70 | def getProfileUserSegment(params:Map[String,Any]):Map[String,Any] = { 71 | 72 | val endpoint = "profile/user/segment" 73 | getResponse(endpoint,params) 74 | 75 | } 76 | 77 | def getSegmentRead(params:Map[String,Any]):Map[String,Any] = { 78 | 79 | val endpoint = "segment/read" 80 | getResponse(endpoint,params) 81 | 82 | } 83 | 84 | def getSite(params:Map[String,Any]):Map[String,Any] = { 85 | 86 | val endpoint = "site" 87 | getResponse(endpoint,params) 88 | 89 | } 90 | 91 | def getSiteGroup(params:Map[String,Any]):Map[String,Any] = { 92 | 93 | val endpoint = "site/group" 94 | getResponse(endpoint,params) 95 | 96 | } 97 | 98 | def getTraffic(params:Map[String,Any]):Map[String,Any] = { 99 | 100 | val endpoint = "traffic" 101 | getResponse(endpoint,params) 102 | 103 | } 104 | 105 | def getTrafficCompare(params:Map[String,Any]):Map[String,Any] = { 106 | 107 | val endpoint = "traffic/compare" 108 | getResponse(endpoint,params) 109 | 110 | } 111 | 112 | def getTrafficCustom(params:Map[String,Any]):Map[String,Any] = { 113 | 114 | val endpoint = "traffic/custom" 115 | getResponse(endpoint,params) 116 | 117 | } 118 | 119 | def getTrafficCustomDescribe(params:Map[String,Any]):Map[String,Any] = { 120 | 121 | val endpoint = "traffic/custom/describe" 122 | getResponse(endpoint,params) 123 | 124 | } 125 | 126 | def getTrafficEvent(params:Map[String,Any]):Map[String,Any] = { 127 | 128 | val endpoint = "traffic/event" 129 | getResponse(endpoint,params) 130 | 131 | } 132 | 133 | def getTrafficEventDescribe(params:Map[String,Any]):Map[String,Any] = { 134 | 135 | val endpoint = "traffic/event/describe" 136 | getResponse(endpoint,params) 137 | 138 | } 139 | 140 | def getTrafficIntent(params:Map[String,Any]):Map[String,Any] = { 141 | 142 | val endpoint = "traffic/intent" 143 | getResponse(endpoint,params) 144 | 145 | } 146 | 147 | def getTrafficKeyword(params:Map[String,Any]):Map[String,Any] = { 148 | 149 | val endpoint = "traffic/keyword" 150 | getResponse(endpoint,params) 151 | 152 | } 153 | 154 | def getTrafficKeywordDescribe(params:Map[String,Any]):Map[String,Any] 
= { 155 | 156 | val endpoint = "traffic/keyword/describe" 157 | getResponse(endpoint,params) 158 | 159 | } 160 | 161 | def getTrafficRelated(params:Map[String,Any]):Map[String,Any] = { 162 | 163 | val endpoint = "traffic/related" 164 | getResponse(endpoint,params) 165 | 166 | } 167 | 168 | def getTrafficUser(params:Map[String,Any]):Map[String,Any] = { 169 | 170 | val endpoint = "traffic/user" 171 | getResponse(endpoint,params) 172 | 173 | } 174 | 175 | def getTrafficUserExternal(params:Map[String,Any]):Map[String,Any] = { 176 | 177 | val endpoint = "traffic/user/external" 178 | getResponse(endpoint,params) 179 | 180 | } 181 | 182 | def getTrafficUserHistogram(params:Map[String,Any]):Map[String,Any] = { 183 | 184 | val endpoint = "traffic/user/histogram" 185 | getResponse(endpoint,params) 186 | 187 | } 188 | 189 | def getTrafficUserHistogramEvent(params:Map[String,Any]):Map[String,Any] = { 190 | 191 | val endpoint = "traffic/user/histogram/event" 192 | getResponse(endpoint,params) 193 | 194 | } 195 | 196 | def getTrafficUserInterest(params:Map[String,Any]):Map[String,Any] = { 197 | 198 | val endpoint = "traffic/user/interest" 199 | getResponse(endpoint,params) 200 | 201 | } 202 | 203 | def getTrafficUserKeyword(params:Map[String,Any]):Map[String,Any] = { 204 | 205 | val endpoint = "traffic/user/keyword" 206 | getResponse(endpoint,params) 207 | 208 | } 209 | 210 | private def getAuthenticationHeader:String = { 211 | 212 | val mac = Mac.getInstance("HmacSHA256") 213 | mac.init(new SecretKeySpec(secret.getBytes("UTF-8"), "HmacSHA256")) 214 | 215 | val date = ISODateTimeFormat.dateTime().print(new DateTime(DateTimeZone.UTC)) 216 | val signature = new String(Base64.encodeBase64(mac.doFinal(date.getBytes("UTF-8")))) 217 | 218 | "username=" + username + " date=" + date + " hmac-sha256-base64=" + signature 219 | 220 | } 221 | 222 | private def getResponse(endpoint:String,req_params:Map[String,Any]):Map[String,Any] = { 223 | 224 | val body = JSON_MAPPER.writeValueAsString(req_params) 225 | 226 | val client = ClientBuilder.newClient() 227 | val request = client.target(CXENSE_URI).path("/").path(endpoint).request(MediaType.APPLICATION_JSON_TYPE) 228 | 229 | val response = request 230 | .header("X-cXense-Authentication", getAuthenticationHeader) 231 | .method(HttpMethod.POST, if (body == null) null else Entity.json(body), classOf[String]) 232 | 233 | client.close() 234 | 235 | JSON_MAPPER.readValue(response, classOf[Map[String,Any]]) 236 | 237 | } 238 | 239 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/elasticsearch/ElasticReader.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.elasticsearch 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
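// Editor's sketch (hypothetical, not part of the original source): reading an
// Elasticsearch index through the ElasticSource defined in ElasticSource.scala
// below. Index, mapping and query are placeholders; config.elastic is expected
// to carry the es-hadoop connection settings (e.g. "es.nodes"/"es.port").
import org.apache.spark.SparkContext
import de.kp.spark.connect.ConnectConfig
import de.kp.spark.connect.elasticsearch.ElasticSource

object ElasticSourceExample {

  def run(sc: SparkContext, config: ConnectConfig): Unit = {

    val params = Map(
      "index"   -> "orders",
      "mapping" -> "order",
      /* Any query accepted by es-hadoop's es.query setting */
      "query"   -> """{ "query" : { "match_all" : {} } }"""
    )

    val rows = new ElasticSource(sc).read(config, params)
    println(rows.count())
  }
}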
15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.rdd.RDD 23 | 24 | import org.apache.hadoop.io.{ArrayWritable,DoubleWritable,IntWritable,LongWritable,MapWritable,NullWritable,Text,Writable} 25 | 26 | import org.apache.hadoop.conf.{Configuration => HConfig} 27 | 28 | import org.elasticsearch.hadoop.mr.EsInputFormat 29 | import de.kp.spark.connect.ConnectConfig 30 | 31 | import scala.collection.JavaConversions._ 32 | 33 | class ElasticReader(@transient sc:SparkContext) extends Serializable { 34 | 35 | val ES_QUERY:String = "es.query" 36 | val ES_RESOURCE:String = "es.resource" 37 | 38 | def read(config:HConfig):RDD[Map[String,Any]] = { 39 | 40 | val source = sc.newAPIHadoopRDD(config, classOf[EsInputFormat[Text, MapWritable]], classOf[Text], classOf[MapWritable]) 41 | source.map(hit => toMap(hit._2)) 42 | 43 | } 44 | 45 | def read(config:ConnectConfig,index:String,mapping:String,query:String):RDD[Map[String,Any]] = { 46 | 47 | val conf = config.elastic 48 | 49 | /* 50 | * Append dynamic request specific data to Elasticsearch configuration; 51 | * this comprises the search query to be used and the index (and mapping) 52 | * to be accessed 53 | */ 54 | conf.set(ES_QUERY,query) 55 | conf.set(ES_RESOURCE,(index + "/" + mapping)) 56 | 57 | read(conf) 58 | 59 | } 60 | 61 | private def toMap(mw:MapWritable):Map[String,Any] = { 62 | 63 | mw.entrySet().map(kv => { 64 | 65 | val k = kv.getKey().asInstanceOf[Text].toString 66 | val v = kv.getValue() match { 67 | 68 | case valu:ArrayWritable => { 69 | 70 | val array = valu.get 71 | array.map(record => { 72 | 73 | record.asInstanceOf[MapWritable].entrySet().map(entry => { 74 | 75 | val sub_k = entry.getKey().asInstanceOf[Text].toString() 76 | val sub_v = entry.getValue() match { 77 | 78 | case sub_valu:IntWritable => valu.get() 79 | case sub_valu:DoubleWritable => valu.get() 80 | 81 | case sub_valu:LongWritable => valu.get() 82 | case sub_valu:Text => valu.toString 83 | 84 | case _ => throw new Exception("Data type is not supported.") 85 | 86 | } 87 | 88 | (sub_k,sub_v) 89 | 90 | }).toMap 91 | 92 | }).toList 93 | 94 | } 95 | 96 | case valu:IntWritable => valu.get() 97 | case valu:DoubleWritable => valu.get() 98 | 99 | case valu:LongWritable => valu.get() 100 | case valu:Text => valu.toString 101 | 102 | case _ => throw new Exception("Data type is not supported.") 103 | 104 | } 105 | 106 | (k,v) 107 | 108 | }).toMap 109 | 110 | } 111 | 112 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/elasticsearch/ElasticSource.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.elasticsearch 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.rdd.RDD 23 | 24 | import de.kp.spark.connect.ConnectConfig 25 | 26 | class ElasticSource(@transient sc:SparkContext) extends Serializable { 27 | 28 | def read(config:ConnectConfig,params:Map[String,String]):RDD[Map[String,Any]] = { 29 | 30 | val index = params("index") 31 | val mapping = params("mapping") 32 | 33 | val query = params("query").asInstanceOf[String] 34 | new ElasticReader(sc).read(config,index,mapping,query) 35 | 36 | } 37 | 38 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/hbase/HBaseReader.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.hbase 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.rdd.RDD 23 | 24 | import org.apache.hadoop.hbase.util.Bytes 25 | import org.apache.hadoop.hbase.HBaseConfiguration 26 | 27 | import org.apache.hadoop.hbase.CellUtil 28 | import org.apache.hadoop.hbase.client.Result 29 | 30 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable 31 | import org.apache.hadoop.hbase.mapreduce.TableInputFormat 32 | 33 | import de.kp.spark.connect.ConnectConfig 34 | 35 | class HBaseReader(@transient sc:SparkContext) extends Serializable { 36 | 37 | private val HBASE_ROOTDIR = "/hbase" 38 | 39 | /** 40 | * This method reads the content of an HBase table of a specific 41 | * keyspace. 
Actually, all data records are retrieved from the table 42 | */ 43 | def read(config:ConnectConfig,columnfamily:String,table:String,names:List[String],types:List[String]):RDD[Map[String,Any]] = { 44 | 45 | val settings = config.hbase 46 | val host = settings("spark.hbase.host") 47 | 48 | val conf = HBaseConfiguration.create 49 | conf.setBoolean("hbase.cluster.distributed", true) 50 | conf.setInt("hbase.client.scanner.caching", 10000) 51 | 52 | conf.set("hbase.rootdir", HBASE_ROOTDIR) 53 | 54 | conf.set("hbase.zookeeper.quorum", host) 55 | conf.set("hbase.zookeeper.property.clientPort","2181") 56 | 57 | val columns = names.map(name => columnfamily + ":" + name) 58 | conf.set(TableInputFormat.SCAN_COLUMNS, columns.mkString(" ")) 59 | 60 | val typedNames = names.zip(types) 61 | 62 | def toMap(key:ImmutableBytesWritable,row:Result):Map[String,Any] = { 63 | 64 | typedNames.map{case(colname,coltype) => { 65 | /* 66 | * Convert column family and respective columns 67 | * into HBase readable Byte array 68 | */ 69 | val cf = Bytes.toBytes(columnfamily) 70 | val cn = Bytes.toBytes(colname) 71 | 72 | if (row.containsColumn(cf,cn) == false) throw new Exception( 73 | String.format("""Combination of cf:%s and cn:%s does not exist""",columnfamily,colname)) 74 | 75 | val byteValue = CellUtil.cloneValue(row.getColumnLatestCell(cf,cn)).array 76 | /* 77 | * We actually support the following data types: 78 | * 79 | * double, integer, long, string 80 | * 81 | * as these are needed by Predictiveworks 82 | */ 83 | val colvalu = coltype match { 84 | 85 | case "double" => Bytes.toDouble(byteValue) 86 | 87 | case "integer" => Bytes.toInt(byteValue) 88 | 89 | case "long" => Bytes.toLong(byteValue) 90 | 91 | case "string" => Bytes.toString(byteValue) 92 | 93 | case _ => throw new Exception(String.format("""The data type '%s' is not supported.""",coltype)) 94 | 95 | } 96 | 97 | (colname,colvalu) 98 | 99 | }}.toMap 100 | 101 | } 102 | 103 | val source = sc.newAPIHadoopRDD(conf,classOf[TableInputFormat],classOf[ImmutableBytesWritable],classOf[Result]) 104 | source.map{case(key,row) => toMap(key,row)} 105 | 106 | } 107 | 108 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/hbase/HBaseSource.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.hbase 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 
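// Editor's sketch (hypothetical, not part of the original source): reading an
// HBase table through the HBaseSource defined below. Column family, table and
// column definitions are placeholders; config.hbase is expected to provide
// "spark.hbase.host", as read by HBaseReader above.
import org.apache.spark.SparkContext
import de.kp.spark.connect.ConnectConfig

object HBaseSourceExample {

  def run(sc: SparkContext, config: ConnectConfig): Unit = {

    val params = Map(
      "columnfamily" -> "cf",
      "table"        -> "events",
      /* names and types are positionally aligned, comma separated */
      "names"        -> "user,amount,timestamp",
      "types"        -> "string,double,long"
    )

    val rows = new HBaseSource(sc).read(config, params)
    rows.take(5).foreach(println)
  }
}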
19 | */ 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.rdd.RDD 23 | 24 | import de.kp.spark.connect.ConnectConfig 25 | 26 | class HBaseSource(@transient sc:SparkContext) extends Serializable { 27 | 28 | def read(config:ConnectConfig,params:Map[String,String]):RDD[Map[String,Any]] = { 29 | 30 | val columnfamily = params("columnfamily") 31 | val table = params("table") 32 | 33 | val names = params("names").split(",").toList 34 | val types = params("types").split(",").toList 35 | 36 | new HBaseReader(sc).read(config,columnfamily,table,names,types) 37 | 38 | } 39 | 40 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/jdbc/JdbcReader.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.jdbc 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import java.sql.{Connection,DriverManager,ResultSet} 22 | 23 | import org.apache.spark.SparkContext 24 | import org.apache.spark.rdd.{JdbcRDD,RDD} 25 | 26 | import scala.collection.mutable.HashMap 27 | import de.kp.spark.connect.ConnectConfig 28 | 29 | class JdbcReader(@transient sc:SparkContext) extends Serializable { 30 | 31 | protected val MYSQL_DRIVER = "com.mysql.jdbc.Driver" 32 | protected val NUM_PARTITIONS = 1 33 | 34 | def read(config:ConnectConfig,site:Int,query:String,fields:List[String] = List.empty[String]):RDD[Map[String,Any]] = { 35 | 36 | val conf = config.mysql 37 | 38 | val url = conf("url") 39 | val database = conf("database") 40 | 41 | val user = conf("user") 42 | val password = conf("password") 43 | /* 44 | * The value of 'site' is used as upper and lower bound for 45 | * the range (key) variable of the database table 46 | */ 47 | val result = new JdbcRDD(sc,() => getConnection(url,database,user,password), 48 | query,site,site,NUM_PARTITIONS, 49 | (rs:ResultSet) => getRow(rs,fields) 50 | ).cache() 51 | 52 | result 53 | 54 | } 55 | 56 | /** 57 | * Convert database row into Map[String,Any] and restrict 58 | * to column names that are defined by the field spec 59 | */ 60 | protected def getRow(rs:ResultSet,fields:List[String]):Map[String,Any] = { 61 | val metadata = rs.getMetaData() 62 | val numCols = metadata.getColumnCount() 63 | 64 | val row = HashMap.empty[String,Any] 65 | (1 to numCols).foreach(i => { 66 | 67 | val k = metadata.getColumnName(i) 68 | val v = rs.getObject(i) 69 | 70 | if (fields.isEmpty) { 71 | row += k -> v 72 | 73 | } else { 74 | if (fields.contains(k)) row += k -> v 75 | 76 | } 77 | 78 | }) 79 | 80 | row.toMap 81 | 82 | } 83 | 84 | protected def getConnection(url:String,database:String,user:String,password:String):Connection = { 85 | 86 | /* Create MySQL connection */ 
87 | Class.forName(MYSQL_DRIVER).newInstance() 88 | val endpoint = getEndpoint(url,database) 89 | 90 | /* Generate database connection */ 91 | val connection = DriverManager.getConnection(endpoint,user,password) 92 | connection 93 | 94 | } 95 | 96 | protected def getEndpoint(url:String,database:String):String = { 97 | 98 | val endpoint = "jdbc:mysql://" + url + "/" + database 99 | endpoint 100 | 101 | } 102 | 103 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/jdbc/JdbcSource.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.jdbc 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.rdd.RDD 23 | 24 | import de.kp.spark.connect.ConnectConfig 25 | 26 | class JdbcSource(@transient sc:SparkContext) extends Serializable { 27 | 28 | def read(config:ConnectConfig,params:Map[String,String]):RDD[Map[String,Any]] = { 29 | 30 | val site = params("site").toInt 31 | val query = params("query") 32 | 33 | val fields = params("fields").split(",").toList 34 | 35 | new JdbcReader(sc).read(config,site,query,fields) 36 | 37 | } 38 | 39 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/log/ApacheLogAnalyzer.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.log 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see .
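// Editor's sketch (hypothetical, not part of the original source): querying
// MySQL through the JdbcReader defined above. Table and column names are
// placeholders; config.mysql is expected to provide url, database, user and
// password, as read in JdbcReader.read.
import org.apache.spark.SparkContext
import de.kp.spark.connect.ConnectConfig
import de.kp.spark.connect.jdbc.JdbcReader

object JdbcReaderExample {

  def run(sc: SparkContext, config: ConnectConfig): Unit = {

    /*
     * Spark's JdbcRDD binds a lower and an upper partition bound into the
     * statement, so the query is expected to contain two '?' placeholders;
     * JdbcReader fills both with the 'site' value.
     */
    val query = "SELECT site, item, price FROM orders WHERE site >= ? AND site <= ?"

    val rows = new JdbcReader(sc).read(config, 42, query, List("item", "price"))
    rows.take(5).foreach(println)
  }
}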
19 | */ 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.sql.SQLContext 23 | 24 | case class ApacheLogStats( 25 | contentSizeStats:(Long,Long,Long,Long), 26 | responseCodeCount:Seq[(Int,Long)], 27 | ipAddresses:Seq[String], 28 | topEndpoints:Seq[(String,Long)] 29 | ) 30 | 31 | class ApacheLogAnalyzer(@transient sc:SparkContext) extends Serializable { 32 | 33 | private val sqlContext = new SQLContext(sc) 34 | import sqlContext.createSchemaRDD 35 | 36 | def stats(store:String):ApacheLogStats = { 37 | 38 | /* 39 | * Data structure 40 | * 41 | * ip_address 42 | * client_identd 43 | * user_id 44 | * datetime 45 | * method 46 | * endpoint 47 | * protocol 48 | * response_code 49 | * content_size 50 | * 51 | */ 52 | val logs = sc.textFile(store).map(ApacheLogParser.parse(_)) 53 | logs.registerTempTable("logs") 54 | 55 | /* Calculate statistics based on the content size */ 56 | val CONTENT_SIZE_SQL = "SELECT SUM(content_size), COUNT(*), MIN(content_size), MAX(content_size) FROM logs" 57 | val contentSizeStats = sqlContext.sql(CONTENT_SIZE_SQL).map(row => 58 | (row.getLong(0), row.getLong(1), row.getLong(2), row.getLong(3)) 59 | 60 | ).first 61 | 62 | /* Compute Response Code to Count */ 63 | val RESPONSE_CODE_SQL = "SELECT response_code, COUNT(*) FROM logs GROUP BY response_code" 64 | val responseCodeCount = sqlContext.sql(RESPONSE_CODE_SQL).map(row => 65 | (row.getInt(0), row.getLong(1)) 66 | 67 | ).take(1000).toList 68 | 69 | /* Any IPAddress that has accessed the server more than 10 times */ 70 | val IP_ADDRESS_SQL = "SELECT ip_address, COUNT(*) AS total FROM logs GROUP BY ip_address HAVING total > 10" 71 | val ipAddresses = sqlContext.sql(IP_ADDRESS_SQL).map(row => 72 | row.getString(0) 73 | ).take(100) // Take only 100 in case this is a super large data set. 74 | 75 | /* Top Endpoints */ 76 | val ENDPOINT_SQL = "SELECT endpoint, COUNT(*) AS total FROM logs GROUP BY endpoint ORDER BY total DESC LIMIT 10" 77 | val topEndpoints = sqlContext.sql(ENDPOINT_SQL).map(row => 78 | (row.getString(0), row.getLong(1)) 79 | ).collect() 80 | 81 | ApacheLogStats( 82 | contentSizeStats,responseCodeCount,ipAddresses,topEndpoints 83 | ) 84 | 85 | } 86 | 87 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/log/ApacheLogParser.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.log 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 
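// Editor's sketch (hypothetical, not part of the original source): parsing the
// sample log line quoted in the parser's comment below, then computing the
// aggregate statistics with the ApacheLogAnalyzer defined above. The HDFS path
// is a placeholder.
import org.apache.spark.SparkContext
import de.kp.spark.connect.log.{ApacheLogAnalyzer, ApacheLogParser}

object ApacheLogExample {

  def run(sc: SparkContext): Unit = {

    /* Single-line parse */
    val line = "127.0.0.1 - - [21/Jul/2014:9:55:27 -0800] \"GET /home.html HTTP/1.1\" 200 2048"
    val info = ApacheLogParser.parse(line)
    println(info.endpoint + " -> " + info.response_code)

    /* Aggregate statistics over a whole log file */
    val stats = new ApacheLogAnalyzer(sc).stats("hdfs://localhost:9000/logs/access.log")
    println(stats.topEndpoints.mkString(", "))
  }
}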
19 | */ 20 | 21 | import java.util.regex.Matcher 22 | import java.util.regex.Pattern 23 | 24 | case class ApacheLogInfo( 25 | ip_address:String, 26 | client_identd:String, 27 | user_id:String, 28 | datetime:String, 29 | method:String, 30 | endpoint:String, 31 | protocol:String, 32 | response_code:Int, 33 | content_size:Long 34 | ) 35 | 36 | object ApacheLogParser extends Serializable{ 37 | /* 38 | * Example Apache log line: 39 | * 40 | * 127.0.0.1 - - [21/Jul/2014:9:55:27 -0800] "GET /home.html HTTP/1.1" 200 2048 41 | * 42 | */ 43 | private val LOG_ENTRY_PATTERN = 44 | // 1:IP 2:client 3:user 4:date time 5:method 6:req 7:proto 8:respcode 9:size 45 | "^(\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(\\S+) (\\S+) (\\S+)\" (\\d{3}) (\\d+)" 46 | 47 | private val PATTERN = Pattern.compile(LOG_ENTRY_PATTERN) 48 | 49 | def parse(logline:String):ApacheLogInfo = { 50 | 51 | val m = PATTERN.matcher(logline) 52 | if (!m.find()) { 53 | throw new RuntimeException("Error parsing logline"); 54 | } 55 | 56 | ApacheLogInfo( 57 | m.group(1), 58 | m.group(2), 59 | m.group(3), 60 | m.group(4), 61 | m.group(5), 62 | m.group(6), 63 | m.group(7), 64 | m.group(8).toInt, 65 | m.group(9).toLong) 66 | 67 | } 68 | 69 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/mongodb/MongoReader.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.mongodb 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 
19 | */ 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.rdd.RDD 23 | 24 | import com.mongodb.hadoop.MongoInputFormat 25 | import org.bson.BSONObject 26 | 27 | import scala.collection.mutable.HashMap 28 | import scala.collection.JavaConversions._ 29 | 30 | import de.kp.spark.connect.ConnectConfig 31 | 32 | class MongoReader(@transient sc:SparkContext) extends Serializable { 33 | 34 | def read(config:ConnectConfig,query:String):RDD[Map[String,Any]] = { 35 | 36 | val conf = config.mongo 37 | conf.set("mongo.input.query",query) 38 | 39 | val source = sc.newAPIHadoopRDD(conf, classOf[MongoInputFormat], classOf[Object], classOf[BSONObject]) 40 | source.map(x => toMap(x._2)) 41 | 42 | } 43 | 44 | private def toMap(obj:BSONObject):Map[String,Any] = { 45 | 46 | val data = HashMap.empty[String,Any] 47 | 48 | val keys = obj.keySet() 49 | for (k <- keys) { 50 | 51 | val v = obj.get(k) 52 | data += k -> v 53 | 54 | } 55 | 56 | data.toMap 57 | 58 | } 59 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/mongodb/MongoSource.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.mongodb 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.rdd.RDD 23 | import de.kp.spark.connect.ConnectConfig 24 | 25 | class MongoSource(@transient sc:SparkContext) extends Serializable { 26 | 27 | def read(config:ConnectConfig,params:Map[String,String]):RDD[Map[String,Any]] = { 28 | 29 | val query = params("query") 30 | new MongoReader(sc).read(config,query) 31 | 32 | } 33 | 34 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/parquet/ParquetReader.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.parquet 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.SparkContext._ 23 | 24 | import org.apache.spark.sql._ 25 | import org.apache.spark.rdd.RDD 26 | 27 | import scala.collection.mutable.HashMap 28 | 29 | class ParquetReader(@transient sc:SparkContext) extends Serializable { 30 | 31 | def read(store:String,fields:List[String] = List.empty[String]):RDD[Map[String,Any]] = { 32 | 33 | val sqlCtx = new SQLContext(sc) 34 | import sqlCtx.createSchemaRDD 35 | 36 | /* 37 | * Read in the parquet file created above. Parquet files are self-describing 38 | * so the schema is preserved. The result of loading a Parquet file is also a 39 | * SchemaRDD. 40 | */ 41 | val parquetFile = sqlCtx.parquetFile(store) 42 | val metadata = parquetFile.schema.fields.zipWithIndex 43 | 44 | parquetFile.map(row => toMap(row,metadata,fields)) 45 | 46 | } 47 | 48 | private def toMap(row:Row,metadata:Seq[(StructField,Int)],fields:List[String]):Map[String,Any] = { 49 | 50 | val data = HashMap.empty[String,Any] 51 | val values = row.iterator.zipWithIndex.map(x => (x._2,x._1)).toMap 52 | 53 | metadata.foreach(entry => { 54 | 55 | val field = entry._1 56 | val col = entry._2 57 | 58 | val colname = field.name 59 | val colvalu = values(col) 60 | 61 | if (fields.isEmpty) { 62 | data += colname -> colvalu 63 | 64 | } else { 65 | if (fields.contains(colname)) data += colname -> colvalu 66 | 67 | } 68 | 69 | }) 70 | 71 | data.toMap 72 | 73 | } 74 | 75 | } 76 | -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/parquet/ParquetSource.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.parquet 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.rdd.RDD 23 | import de.kp.spark.connect.ConnectConfig 24 | 25 | class ParquetSource(@transient sc:SparkContext) extends Serializable { 26 | 27 | def read(config:ConnectConfig,params:Map[String,String]):RDD[Map[String,Any]] = { 28 | 29 | val store = params("store") 30 | val fields = params("fields").split(",").toList 31 | 32 | new ParquetReader(sc).read(store,fields) 33 | 34 | } 35 | 36 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/shop/BigClient.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.shop 2 | /* Copyright (c) 2014 Dr. 
Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import org.scribe.model._ 22 | import org.slf4j.LoggerFactory 23 | 24 | import com.fasterxml.jackson.databind.ObjectMapper 25 | import com.fasterxml.jackson.module.scala.DefaultScalaModule 26 | 27 | import scala.collection.mutable.Buffer 28 | 29 | class BigClient(val key:String,val token:String,val context:String) { 30 | 31 | private val LOG = LoggerFactory.getLogger(classOf[BigClient]) 32 | 33 | private val JSON_MAPPER = new ObjectMapper() 34 | JSON_MAPPER.registerModule(DefaultScalaModule) 35 | 36 | val ENDPOINT = String.format("""https://api.bigcommerce.com/%s/v2/""",context) 37 | 38 | def getResources(resource:String,id:Int,params:Map[String,String]):List[Map[String,Any]] = { 39 | 40 | resource match { 41 | 42 | case "brand" => getBrands(params) 43 | case "customer" => getCustomers(params) 44 | 45 | case "image" => if (id == -1) List.empty[Map[String,Any]] else getImages(id,params) 46 | case "lineitem" => if (id == -1) List.empty[Map[String,Any]] else getLineItems(id,params) 47 | 48 | case "order" => getOrders(params) 49 | case "product" => getProducts(params) 50 | 51 | case _ => List.empty[Map[String,Any]] 52 | 53 | } 54 | 55 | } 56 | 57 | def getBrands(requestParams:Map[String,String]):List[Map[String,Any]] = { 58 | 59 | val endpoint = ENDPOINT + "brands" + getSimpleUrlParams(requestParams) 60 | getResponseAsList(endpoint) 61 | 62 | } 63 | 64 | def getCustomers(requestParams:Map[String,String]):List[Map[String,Any]] = { 65 | 66 | val endpoint = ENDPOINT + "customers" + getSimpleUrlParams(requestParams) 67 | getResponseAsList(endpoint) 68 | 69 | } 70 | 71 | def getOrders(requestParams:Map[String,String]):List[Map[String,Any]] = { 72 | 73 | val endpoint = ENDPOINT + "orders" 74 | getResponseAsList(endpoint) 75 | 76 | } 77 | 78 | def getBrand(brand:Int):Map[String,Any] = { 79 | 80 | val endpoint = ENDPOINT + "brands/" + brand 81 | getResponseAsObject(endpoint) 82 | 83 | } 84 | def getLineItems(order:Int,requestParams:Map[String,String]):List[Map[String,Any]] = { 85 | 86 | val endpoint = ENDPOINT + "orders/" + order + "/products" + getSimpleUrlParams(requestParams) 87 | getResponseAsList(endpoint) 88 | 89 | } 90 | 91 | def getImages(product:Int,requestParams:Map[String,String]):List[Map[String,Any]] = { 92 | 93 | val endpoint = ENDPOINT + "products/" + product + "/images" + getSimpleUrlParams(requestParams) 94 | getResponseAsList(endpoint) 95 | 96 | } 97 | 98 | def getProducts(requestParams:Map[String,String]):List[Map[String,Any]] = { 99 | 100 | val endpoint = ENDPOINT + "products" + getSimpleUrlParams(requestParams) 101 | getResponseAsList(endpoint) 102 | 103 | } 104 | 105 | private def getOrderUrlParams(params:Map[String,String]):String = { 106 | 107 | 108 | val accepted = 
List("page","limit","min_date_created","status_id","max_date_created") 109 | 110 | val sb = Buffer.empty[String] 111 | for (kv <- params) { 112 | 113 | if (accepted.contains(kv._1)) { 114 | 115 | val value = String.format("""?%s=%s""",kv._1,kv._2) 116 | sb += value 117 | 118 | } 119 | 120 | } 121 | 122 | val s = "?" + sb.mkString("&") 123 | java.net.URLEncoder.encode(s, "UTF-8") 124 | 125 | } 126 | 127 | private def getSimpleUrlParams(params:Map[String,String]):String = { 128 | 129 | val accepted = List("page","limit") 130 | 131 | val sb = Buffer.empty[String] 132 | for (kv <- params) { 133 | 134 | if (accepted.contains(kv._1)) { 135 | 136 | val value = String.format("""?%s=%s""",kv._1,kv._2) 137 | sb += value 138 | 139 | } 140 | 141 | } 142 | 143 | val s = "?" + sb.mkString("&") 144 | java.net.URLEncoder.encode(s, "UTF-8") 145 | 146 | } 147 | 148 | def getResponseAsList(endpoint:String):List[Map[String,Any]] = { 149 | 150 | val request = new OAuthRequest(Verb.GET, endpoint) 151 | request.addHeader("accept", "application/json") 152 | 153 | request.addHeader("X-Auth-Client", key) 154 | request.addHeader("X-Auth-Token", token) 155 | 156 | val response = request.send() 157 | if (response.getCode == 200) { 158 | 159 | val body = response.getBody 160 | JSON_MAPPER.readValue(body, classOf[List[Map[String,Any]]]) 161 | 162 | } else { 163 | throw new Exception("Bad request: " + response.getCode) 164 | } 165 | 166 | } 167 | 168 | def getResponseAsObject(endpoint:String):Map[String,Any] = { 169 | 170 | val request = new OAuthRequest(Verb.GET, endpoint) 171 | request.addHeader("accept", "application/json") 172 | 173 | request.addHeader("X-Auth-Client", key) 174 | request.addHeader("X-Auth-Token", token) 175 | 176 | val response = request.send() 177 | if (response.getCode == 200) { 178 | 179 | val body = response.getBody 180 | JSON_MAPPER.readValue(body, classOf[Map[String,Any]]) 181 | 182 | } else { 183 | throw new Exception("Bad request: " + response.getCode) 184 | } 185 | 186 | } 187 | 188 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/shop/BigDataset.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.shop 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | import org.apache.spark.SparkContext 21 | import org.apache.spark.SparkContext._ 22 | import org.apache.spark.rdd.RDD 23 | 24 | import scala.collection.mutable.Buffer 25 | /** 26 | * The Bigcommerce REST API does not support counting for supported resources; 27 | * this implies, that we have not enough data to do partitioning and retrieve 28 | * resource data per partition. 
29 | * 30 | * In other words, we cannot use Spark's RDD mechanism directly, but have to 31 | * collect all the data first and partition it afterwards. 32 | */ 33 | class BigDataset( 34 | /* Reference to SparkContext */ 35 | @transient sc:SparkContext, 36 | /* resource */ 37 | resource:String, 38 | /* Request parameters */ 39 | params:Map[String,String], 40 | /* Total number of partitions */ 41 | numPartitions:Int) { 42 | 43 | /* 44 | * Prepare the request parameters, i.e. if an identifier is provided, 45 | * its value is used to retrieve the list of a dependent resource: 46 | * 47 | * E.g. articles have to be retrieved by providing the identifier of 48 | * the associated blog 49 | */ 50 | private val excludes = List("id") 51 | private val req_params = params.filter(kv => excludes.contains(kv._1) == false) 52 | 53 | private val rid = if (params.contains("id")) params("id").toInt else -1 54 | private val client = createClient 55 | 56 | private val dataset = getDataset 57 | 58 | def toRDD = sc.parallelize(dataset, numPartitions) 59 | 60 | private def createClient:BigClient = { 61 | 62 | val key = params("key") 63 | val token = params("token") 64 | 65 | val context = params("context") 66 | new BigClient(key,token,context) 67 | 68 | } 69 | 70 | private def getDataset:Seq[Map[String,Any]] = { 71 | 72 | val dataset = Buffer.empty[Map[String,Any]] 73 | 74 | var page = 1 75 | var finished = false 76 | 77 | while (finished == false) { 78 | 79 | val records = client.getResources(resource,rid,req_params ++ Map("limit" -> "250","page" -> page.toString)) 80 | dataset ++= records 81 | 82 | page += 1 83 | /* 84 | * Check whether this request was the last one; 85 | * this is the case if fewer than 250 86 | * records are returned 87 | */ 88 | if (records.size < 250) finished = true 89 | 90 | } 91 | 92 | dataset 93 | 94 | } 95 | 96 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/shop/ShopifyClient.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.shop 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see .
19 | */ 20 | 21 | import java.io.IOException 22 | 23 | import javax.ws.rs.HttpMethod 24 | import javax.ws.rs.client.{Client,ClientBuilder,Entity,WebTarget} 25 | import javax.ws.rs.core.MediaType 26 | 27 | import com.fasterxml.jackson.databind.{Module, ObjectMapper} 28 | import com.fasterxml.jackson.module.scala.DefaultScalaModule 29 | 30 | import org.slf4j.{Logger,LoggerFactory} 31 | 32 | import scala.collection.mutable.HashMap 33 | import scala.collection.JavaConversions._ 34 | 35 | class ShopifyClient(key:String,secret:String,url:String) extends Serializable { 36 | 37 | private val JSON_MAPPER = new ObjectMapper() 38 | JSON_MAPPER.registerModule(DefaultScalaModule) 39 | 40 | private val client = ClientBuilder.newClient() 41 | private val endpoint = url.replaceFirst("://", "://" + key + ":" + secret + "@") 42 | 43 | private val webTarget = client.target(endpoint).path("admin") 44 | 45 | def close = client.close 46 | 47 | def getResourceCount(name:String,id:Long,params:Map[String,String]):Long = { 48 | 49 | name match { 50 | 51 | case "article" => if (id == -1) -1 else getArticlesCount(id,params) 52 | case "blog" => getBlogsCount(params) 53 | 54 | case "customer" => getCustomersCount(params) 55 | 56 | case "order" => getOrdersCount(params) 57 | case "product" => getProductsCount(params) 58 | 59 | case _ => -1 60 | 61 | } 62 | 63 | } 64 | def getResources(name:String,id:Long,params:Map[String,String]):List[Map[String,Any]] = { 65 | 66 | name match { 67 | 68 | case "article" => if (id == -1) List.empty[Map[String,Any]] else getArticles(id,params) 69 | case "blog" => getBlogs(params) 70 | 71 | case "customer" => getCustomers(params) 72 | 73 | case "order" => getOrders(params) 74 | case "product" => getProducts(params) 75 | 76 | case _ => List.empty[Map[String,Any]] 77 | 78 | } 79 | 80 | } 81 | 82 | /************************************************************************** 83 | * 84 | * ARTICLE SUPPORT 85 | * 86 | *************************************************************************/ 87 | 88 | def getArticles(bid:Long,params:Map[String,String]):List[Map[String,Any]] = { 89 | 90 | val result = getResponse("blogs/" + bid + "/articles.json", params, HttpMethod.GET) 91 | /* 92 | * { "articles": [ ... ] } 93 | */ 94 | val response = JSON_MAPPER.readValue(result, classOf[Map[String,Any]]) 95 | if (response.contains("articles")) 96 | response("articles").asInstanceOf[List[Map[String,Any]]] 97 | 98 | else List.empty[Map[String,Any]] 99 | 100 | } 101 | 102 | def getArticlesCount(bid:Long,params:Map[String,String]):Long = { 103 | 104 | val result = getResponse("blogs/" + bid + "/articles/count.json", params, HttpMethod.GET) 105 | /* 106 | * { "count": 1 } 107 | */ 108 | val response = JSON_MAPPER.readValue(result, classOf[Map[String,Any]]) 109 | if (response.contains("count")) response("count").asInstanceOf[Long] else -1 110 | 111 | } 112 | 113 | /************************************************************************** 114 | * 115 | * BLOG SUPPORT 116 | * 117 | *************************************************************************/ 118 | 119 | def getBlogs(params:Map[String,String]):List[Map[String,Any]] = { 120 | 121 | val result = getResponse("blogs.json", params, HttpMethod.GET) 122 | /* 123 | * { "blogs": [ ... 
] } 124 | */ 125 | val response = JSON_MAPPER.readValue(result, classOf[Map[String,Any]]) 126 | if (response.contains("blogs")) 127 | response("blogs").asInstanceOf[List[Map[String,Any]]] 128 | 129 | else List.empty[Map[String,Any]] 130 | 131 | } 132 | 133 | def getBlogsCount(params:Map[String,String]):Long = { 134 | 135 | val result = getResponse("blogs/count.json", params, HttpMethod.GET) 136 | /* 137 | * { "count": 1 } 138 | */ 139 | val response = JSON_MAPPER.readValue(result, classOf[Map[String,Any]]) 140 | if (response.contains("count")) response("count").asInstanceOf[Long] else -1 141 | 142 | } 143 | 144 | /************************************************************************** 145 | * 146 | * CUSTOMER SUPPORT 147 | * 148 | *************************************************************************/ 149 | 150 | def getCustomers(params:Map[String,String]):List[Map[String,Any]] = { 151 | 152 | val result = getResponse("customers.json", params, HttpMethod.GET) 153 | /* 154 | * { "customers": [ ... ] } 155 | */ 156 | val response = JSON_MAPPER.readValue(result, classOf[Map[String,Any]]) 157 | if (response.contains("customers")) 158 | response("customers").asInstanceOf[List[Map[String,Any]]] 159 | 160 | else List.empty[Map[String,Any]] 161 | 162 | } 163 | 164 | def getCustomersCount(params:Map[String,String]):Long = { 165 | 166 | val result = getResponse("customers/count.json", params, HttpMethod.GET) 167 | /* 168 | * { "count": 1 } 169 | */ 170 | val response = JSON_MAPPER.readValue(result, classOf[Map[String,Any]]) 171 | if (response.contains("count")) response("count").asInstanceOf[Long] else -1 172 | 173 | } 174 | 175 | /************************************************************************** 176 | * 177 | * PRODUCT SUPPORT 178 | * 179 | *************************************************************************/ 180 | 181 | def getProducts(params:Map[String,String]):List[Map[String,Any]] = { 182 | 183 | val result = getResponse("products.json", params, HttpMethod.GET) 184 | /* 185 | * { "products": [ ... ] } 186 | */ 187 | val response = JSON_MAPPER.readValue(result, classOf[Map[String,Any]]) 188 | if (response.contains("products")) 189 | response("products").asInstanceOf[List[Map[String,Any]]] 190 | 191 | else List.empty[Map[String,Any]] 192 | 193 | } 194 | 195 | def getProductsCount(params:Map[String,String]):Long = { 196 | 197 | val result = getResponse("products/count.json", params, HttpMethod.GET) 198 | /* 199 | * { "count": 1 } 200 | */ 201 | val response = JSON_MAPPER.readValue(result, classOf[Map[String,Any]]) 202 | if (response.contains("count")) response("count").asInstanceOf[Long] else -1 203 | 204 | } 205 | 206 | /************************************************************************** 207 | * 208 | * ORDER SUPPORT 209 | * 210 | *************************************************************************/ 211 | 212 | def getOrders(params:Map[String,String]):List[Map[String,Any]] = { 213 | 214 | val result = getResponse("orders.json", params, HttpMethod.GET) 215 | /* 216 | * { "orders": [ ...
] } 217 | */ 218 | val response = JSON_MAPPER.readValue(result, classOf[Map[String,Any]]) 219 | if (response.contains("orders")) 220 | response("orders").asInstanceOf[List[Map[String,Any]]] 221 | 222 | else List.empty[Map[String,Any]] 223 | 224 | } 225 | 226 | def getOrdersCount(params:Map[String,String]):Long = { 227 | 228 | val result = getResponse("orders/count.json", params, HttpMethod.GET) 229 | /* 230 | * { "count": 1 } 231 | */ 232 | val response = JSON_MAPPER.readValue(result, classOf[Map[String,Any]]) 233 | if (response.contains("count")) response("count").asInstanceOf[Long] else -1 234 | 235 | } 236 | 237 | private def getResponse(resource:String,params:Map[String,String],method:String):String = { 238 | 239 | try { 240 | 241 | var qt = webTarget.path(resource) 242 | for (entry <- params) { 243 | val (k,v) = entry 244 | qt = qt.queryParam(k,v) 245 | } 246 | 247 | qt.request(MediaType.APPLICATION_JSON_TYPE).method(method, null, classOf[String]) 248 | 249 | } catch { 250 | case e:Exception => throw new Exception("Could not process query",e) 251 | } 252 | 253 | } 254 | 255 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/shop/ShopifyRDD.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.shop 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import org.apache.spark.{Partition,SparkContext,TaskContext} 22 | import org.apache.spark.TaskKilledException 23 | 24 | import org.apache.spark.rdd.RDD 25 | import org.apache.spark.util.NextIterator 26 | 27 | import scala.collection.mutable.Buffer 28 | 29 | class ShopifyPartition(idx:Int,val start:Int,val end:Int) extends Partition { 30 | override def index = idx 31 | } 32 | 33 | class ShopifyRDD( 34 | /* Reference to SparkContext */ 35 | @transient sc:SparkContext, 36 | /* resource */ 37 | resource:String, 38 | /* Request parameters */ 39 | params:Map[String,String], 40 | /* Total number of partitions */ 41 | numPartitions:Int) extends RDD[Map[String,Any]](sc,Nil) { 42 | 43 | /* 44 | * Prepare request parameters, i.e. in case of an identifier provided, 45 | * this value is used to determine the list of a dependent resource: 46 | * 47 | * E.g. 
articles have to be retrieved by provided the identifier of 48 | * the associated blog 49 | */ 50 | private val excludes = List("id") 51 | private val req_params = params.filter(kv => excludes.contains(kv._1) == false) 52 | 53 | private val rid = if (params.contains("id")) params("id").toLong else -1 54 | 55 | private def createClient:ShopifyClient = { 56 | 57 | val key = params("key") 58 | val secret = params("secret") 59 | 60 | val url = params("url") 61 | new ShopifyClient(key,secret,url) 62 | 63 | } 64 | 65 | override def getPartitions:Array[Partition] = { 66 | 67 | val client = createClient 68 | 69 | /* 70 | * The ShopifyRDD collects all items of a certain resource from the 71 | * shop platform; in order to calculate the respective partitions, 72 | * we have to determine the total number of items first 73 | */ 74 | val count = client.getResourceCount(resource,rid,req_params) 75 | client.close 76 | 77 | val pages = Math.ceil(count / 250.0).toInt 78 | 79 | val pagesPerPartition = Math.floor(pages.toDouble / numPartitions).toInt 80 | val diff = pages - numPartitions * pagesPerPartition 81 | 82 | 83 | (0 until numPartitions).map(i => { 84 | 85 | val start = 1 + i * pagesPerPartition 86 | val end = (i+1) * pagesPerPartition 87 | 88 | if (i == numPartitions - 1) 89 | new ShopifyPartition(i,start,end + diff) 90 | 91 | else 92 | new ShopifyPartition(i,start,end) 93 | 94 | }).toArray 95 | 96 | } 97 | 98 | override def compute(thePart:Partition,context:TaskContext) = new Iterator[Map[String,Any]] { 99 | 100 | private var closed = false 101 | private var finished = false 102 | 103 | context.addTaskCompletionListener{ context => closeIfNeeded() } 104 | 105 | /* 106 | * A partition is characterized by a begin & end page 107 | */ 108 | private val partition = thePart.asInstanceOf[ShopifyPartition] 109 | 110 | val start = partition.start 111 | val end = partition.end 112 | 113 | val client = createClient 114 | 115 | val resources = Buffer.empty[Map[String,Any]] 116 | (start to end).foreach(page => { 117 | resources ++= client.getResources(resource,rid,req_params ++ Map("page" -> page.toString,"limit" -> "250")) 118 | }) 119 | 120 | val dataset = resources.toIterator 121 | 122 | def hasNext:Boolean = { 123 | 124 | if (context.isInterrupted()) 125 | throw new TaskKilledException 126 | 127 | !finished && dataset.hasNext 128 | 129 | } 130 | 131 | def next:Map[String,Any] = { 132 | 133 | if (hasNext) { 134 | dataset.next 135 | 136 | } else { 137 | 138 | finished = true 139 | null.asInstanceOf[Map[String,Any]] 140 | 141 | } 142 | 143 | } 144 | 145 | def closeIfNeeded() { 146 | if (!closed) { 147 | close() 148 | closed = true 149 | } 150 | } 151 | 152 | def close() { 153 | client.close 154 | } 155 | 156 | } 157 | 158 | } --------------------------------------------------------------------------------
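The following is a minimal usage sketch for the Shopify connector above. The object name, the shop URL, the API credentials and the partition count are placeholders; only the parameter keys ("key", "secret", "url") and the resource name ("order") are taken from ShopifyRDD and ShopifyClient, so treat this as an illustration rather than a reference setup.

import org.apache.spark.{SparkConf, SparkContext}
import de.kp.spark.connect.shop.ShopifyRDD

object ShopifyReadJob {

  def main(args: Array[String]) {

    val sc = new SparkContext(new SparkConf().setAppName("ShopifyRead"))

    /* Placeholder credentials and shop URL; the keys below are the
     * parameter names evaluated by ShopifyRDD.createClient */
    val params = Map(
      "key"    -> "<api-key>",
      "secret" -> "<api-secret>",
      "url"    -> "https://<shop>.myshopify.com"
    )

    /* Retrieve all orders page-wise (250 records per page),
     * distributed over 4 partitions */
    val orders = new ShopifyRDD(sc, "order", params, 4)
    println("orders: " + orders.count)

    sc.stop()
  }

}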
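The *Source classes share the same read(config, params) contract and return an RDD[Map[String,Any]]. The sketch below illustrates the Parquet and MongoDB variants under the assumption that ConnectConfig is implemented by the application; for MongoDB it has to expose the mongo-hadoop configuration via its mongo member, as used by MongoReader. The object name, store path, field list and query are hypothetical.

import org.apache.spark.SparkContext

import de.kp.spark.connect.ConnectConfig
import de.kp.spark.connect.mongodb.MongoSource
import de.kp.spark.connect.parquet.ParquetSource

object SourceUsage {

  def read(sc: SparkContext, config: ConnectConfig) = {

    /* Parquet: 'store' is a placeholder path to the Parquet file,
     * 'fields' restricts the columns copied into the result maps */
    val parquetRecords = new ParquetSource(sc).read(config,
      Map("store" -> "/path/to/events.parquet", "fields" -> "site,user,item"))

    /* MongoDB: 'query' is a JSON query string that MongoReader passes
     * to the mongo-hadoop input format as 'mongo.input.query' */
    val mongoRecords = new MongoSource(sc).read(config,
      Map("query" -> """{ "site": 1 }"""))

    (parquetRecords, mongoRecords)
  }

}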