├── README.md
├── lib
│   ├── README.md
│   └── mongo-hadoop-core_2.2.0-1.2.0.jar
├── pom.xml
└── src
    └── main
        └── scala
            └── de
                └── kp
                    └── spark
                        └── connect
                            ├── ConnectConfig.scala
                            ├── GaRDD.scala
                            ├── GaReader.scala
                            ├── GaSource.scala
                            ├── SQLSource.scala
                            ├── aerospike
                            │   ├── AerospikeReader.scala
                            │   └── AerospikeSource.scala
                            ├── cassandra
                            │   ├── CassandraReader.scala
                            │   └── CassandraSource.scala
                            ├── dmp
                            │   └── CxenseClient.scala
                            ├── elasticsearch
                            │   ├── ElasticReader.scala
                            │   └── ElasticSource.scala
                            ├── hbase
                            │   ├── HBaseReader.scala
                            │   └── HBaseSource.scala
                            ├── jdbc
                            │   ├── JdbcReader.scala
                            │   └── JdbcSource.scala
                            ├── log
                            │   ├── ApacheLogAnalyzer.scala
                            │   └── ApacheLogParser.scala
                            ├── mongodb
                            │   ├── MongoReader.scala
                            │   └── MongoSource.scala
                            ├── parquet
                            │   ├── ParquetReader.scala
                            │   └── ParquetSource.scala
                            └── shop
                                ├── BigClient.scala
                                ├── BigDataset.scala
                                ├── ShopifyClient.scala
                                └── ShopifyRDD.scala
/README.md:
--------------------------------------------------------------------------------
1 |
2 | ## Common Access Layer for Apache Spark
3 |
4 | [Predictiveworks](http://predictiveworks.eu) supports raw data retrieval from multiple NoSQL and JDBC data sources.
5 |
6 | Read requests are supported for the following big data sources:
7 |
8 | * Cassandra
9 | * Elasticsearch
10 | * HBase
11 | * MongoDB
12 | * Parquet
13 |
14 | In addition, this project provides a growing number of connectors to data sources relevant for analytics:
15 |
16 | * Google Analytics v3
17 | * Shopify
18 |
--------------------------------------------------------------------------------
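As a quick orientation, a read request against any of these sources goes through a `*Source` class in `de.kp.spark.connect` and returns an `RDD[Map[String,Any]]`. Below is a minimal sketch using the Cassandra connector; `MyConnectConfig` is a hypothetical implementation of the project's `ConnectConfig` trait, and keyspace, table and column names are placeholders:

```scala
import org.apache.spark.{SparkConf, SparkContext}
import de.kp.spark.connect.cassandra.CassandraSource

object ReadExample {
  def main(args: Array[String]): Unit = {

    val sc = new SparkContext(new SparkConf().setAppName("connect-example"))

    /* Project-specific implementation of de.kp.spark.connect.ConnectConfig */
    val config = new MyConnectConfig()

    /* Read selected columns of a Cassandra table into an RDD[Map[String,Any]] */
    val params = Map(
      "keyspace" -> "analytics",
      "table"    -> "events",
      "columns"  -> "user,item,timestamp"
    )

    val rdd = new CassandraSource(sc).read(config, params)
    rdd.take(5).foreach(println)

    sc.stop()
  }
}
```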
/lib/README.md:
--------------------------------------------------------------------------------
1 |
2 | ## MongoDB Hadoop Connector
3 |
4 | The Maven repositories contain the mongo-hadoop connector for several different Hadoop versions,
5 | but not for 2.2.0. Therefore the mongo-hadoop connector is included as an unmanaged library.
--------------------------------------------------------------------------------
/lib/mongo-hadoop-core_2.2.0-1.2.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/skrusche63/spark-connect/aa8112941863526c7a6397da92a86a82146602da/lib/mongo-hadoop-core_2.2.0-1.2.0.jar
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
  1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  2 |   <modelVersion>4.0.0</modelVersion>
  3 |   <groupId>spark-connect</groupId>
  4 |   <artifactId>spark-connect</artifactId>
  5 |   <version>0.0.1</version>
  6 |   <name>Spark-Connect</name>
  7 |   <description>Common Access Layer for Predictiveworks</description>
  8 |   <inceptionYear>2010</inceptionYear>
  9 |   <licenses>
 10 |     <license>
 11 |       <name>My License</name>
 12 |       <url>http://....</url>
 13 |       <distribution>repo</distribution>
 14 |     </license>
 15 |   </licenses>
 16 | 
 17 |   <properties>
 18 |     <maven.compiler.source>1.6</maven.compiler.source>
 19 |     <maven.compiler.target>1.6</maven.compiler.target>
 20 |     <encoding>UTF-8</encoding>
 21 |     <resteasy.version>3.0.10.Final</resteasy.version>
 22 |     <scala.tools.version>2.10</scala.tools.version>
 23 |     <scala.version>2.10.2</scala.version>
 24 |     <spark.version>1.2.0</spark.version>
 25 |   </properties>
 26 | 
 27 |   <dependencies>
 28 |     <dependency>
 29 |       <groupId>org.scala-lang</groupId>
 30 |       <artifactId>scala-library</artifactId>
 31 |       <version>${scala.version}</version>
 32 |     </dependency>
 33 | 
 34 | 
 35 |     <dependency>
 36 |       <groupId>junit</groupId>
 37 |       <artifactId>junit</artifactId>
 38 |       <version>4.11</version>
 39 |       <scope>test</scope>
 40 |     </dependency>
 41 |     <dependency>
 42 |       <groupId>org.specs2</groupId>
 43 |       <artifactId>specs2_${scala.tools.version}</artifactId>
 44 |       <version>1.13</version>
 45 |       <scope>test</scope>
 46 |     </dependency>
 47 |     <dependency>
 48 |       <groupId>org.scalatest</groupId>
 49 |       <artifactId>scalatest_${scala.tools.version}</artifactId>
 50 |       <version>2.0.M6-SNAP8</version>
 51 |       <scope>test</scope>
 52 |     </dependency>
 53 | 
 54 | 
 55 |     <dependency>
 56 |       <groupId>org.apache.spark</groupId>
 57 |       <artifactId>spark-core_2.10</artifactId>
 58 |       <version>${spark.version}</version>
 59 |     </dependency>
 60 | 
 61 | 
 62 |     <dependency>
 63 |       <groupId>org.apache.spark</groupId>
 64 |       <artifactId>spark-sql_2.10</artifactId>
 65 |       <version>${spark.version}</version>
 66 |     </dependency>
 67 | 
 68 | 
 69 |     <dependency>
 70 |       <groupId>cascading</groupId>
 71 |       <artifactId>cascading-core</artifactId>
 72 |       <version>2.5.4</version>
 73 |     </dependency>
 74 | 
 75 |     <dependency>
 76 |       <groupId>cascading</groupId>
 77 |       <artifactId>cascading-hadoop</artifactId>
 78 |       <version>2.5.4</version>
 79 |     </dependency>
 80 | 
 81 | 
 82 |     <dependency>
 83 |       <groupId>org.elasticsearch</groupId>
 84 |       <artifactId>elasticsearch-hadoop</artifactId>
 85 |       <version>2.0.0</version>
 86 |     </dependency>
 87 | 
 88 | 
 89 |     <dependency>
 90 |       <groupId>org.elasticsearch</groupId>
 91 |       <artifactId>elasticsearch</artifactId>
 92 |       <version>1.3.2</version>
 93 |     </dependency>
 94 | 
 95 | 
 99 |     <dependency>
100 |       <groupId>org.mongodb</groupId>
101 |       <artifactId>mongo-java-driver</artifactId>
102 |       <version>2.11.4</version>
103 |     </dependency>
104 | 
105 | 
109 |     <dependency>
110 |       <groupId>com.datastax.spark</groupId>
111 |       <artifactId>spark-cassandra-connector_2.10</artifactId>
112 |       <version>1.2.0-alpha1</version>
113 |     </dependency>
114 | 
115 | 
116 |     <dependency>
117 |       <groupId>org.apache.hbase</groupId>
118 |       <artifactId>hbase-common</artifactId>
119 |       <version>0.98.8-hadoop2</version>
120 |     </dependency>
121 | 
122 |     <dependency>
123 |       <groupId>org.apache.hbase</groupId>
124 |       <artifactId>hbase-client</artifactId>
125 |       <version>0.98.8-hadoop2</version>
126 |     </dependency>
127 | 
128 |     <dependency>
129 |       <groupId>org.apache.hbase</groupId>
130 |       <artifactId>hbase-server</artifactId>
131 |       <version>0.98.8-hadoop2</version>
132 |     </dependency>
133 | 
134 |     <dependency>
135 |       <groupId>mysql</groupId>
136 |       <artifactId>mysql-connector-java</artifactId>
137 |       <version>5.1.31</version>
138 |     </dependency>
139 | 
140 | 
141 |     <dependency>
142 |       <groupId>com.google.gdata</groupId>
143 |       <artifactId>gdata-core-1.0</artifactId>
144 |       <version>1.41.5</version>
145 |     </dependency>
146 | 
147 |     <dependency>
148 |       <groupId>com.google.gdata</groupId>
149 |       <artifactId>gdata-analytics-2.1</artifactId>
150 |       <version>1.41.5</version>
151 |     </dependency>
152 |     <dependency>
153 |       <groupId>org.jboss.resteasy</groupId>
154 |       <artifactId>resteasy-jaxb-provider</artifactId>
155 |       <version>${resteasy.version}</version>
156 |     </dependency>
157 | 
158 | 
159 |     <dependency>
160 |       <groupId>org.jboss.resteasy</groupId>
161 |       <artifactId>resteasy-jackson-provider</artifactId>
162 |       <version>${resteasy.version}</version>
163 |     </dependency>
164 | 
165 |     <dependency>
166 |       <groupId>org.jboss.resteasy</groupId>
167 |       <artifactId>resteasy-client</artifactId>
168 |       <version>${resteasy.version}</version>
169 |     </dependency>
170 | 
171 | 
176 |     <dependency>
177 |       <groupId>com.fasterxml.jackson.module</groupId>
178 |       <artifactId>jackson-module-scala_2.10</artifactId>
179 |       <version>2.3.1</version>
180 |     </dependency>
181 | 
182 | 
183 |     <dependency>
184 |       <groupId>org.scribe</groupId>
185 |       <artifactId>scribe</artifactId>
186 |       <version>1.3.7</version>
187 |     </dependency>
188 | 
189 | 
190 |     <dependency>
191 |       <groupId>com.aerospike</groupId>
192 |       <artifactId>aerospike-client</artifactId>
193 |       <version>3.0.34</version>
194 |     </dependency>
195 | 
196 |   </dependencies>
197 | 
198 |   <repositories>
199 |     <repository>
200 |       <id>conjars.org</id>
201 |       <url>http://conjars.org/repo</url>
202 |     </repository>
203 |     <repository>
204 |       <id>Mandubian Repository</id>
205 |       <url>http://mandubian-mvn.googlecode.com/svn/trunk/mandubian-mvn/repository/</url>
206 |     </repository>
207 |   </repositories>
208 | 
209 |   <build>
210 |     <sourceDirectory>src/main/scala</sourceDirectory>
211 |     <testSourceDirectory>src/test/scala</testSourceDirectory>
212 | 
213 |     <plugins>
214 |       <plugin>
215 |         <groupId>net.alchim31.maven</groupId>
216 |         <artifactId>scala-maven-plugin</artifactId>
217 |         <version>3.1.3</version>
218 |         <executions>
219 |           <execution>
220 |             <goals>
221 |               <goal>compile</goal>
222 |               <goal>testCompile</goal>
223 |             </goals>
224 |             <configuration>
225 |               <args>
226 |                 <arg>-make:transitive</arg>
227 |                 <arg>-dependencyfile</arg>
228 |                 <arg>${project.build.directory}/.scala_dependencies</arg>
229 |               </args>
230 |             </configuration>
231 |           </execution>
232 |         </executions>
233 |       </plugin>
234 |       <plugin>
235 |         <groupId>org.apache.maven.plugins</groupId>
236 |         <artifactId>maven-surefire-plugin</artifactId>
237 |         <version>2.13</version>
238 |         <configuration>
239 |           <useFile>false</useFile>
240 |           <disableXmlReport>true</disableXmlReport>
241 | 
242 | 
243 |           <includes>
244 |             <include>**/*Test.*</include>
245 |             <include>**/*Suite.*</include>
246 |           </includes>
247 |         </configuration>
248 |       </plugin>
249 |     </plugins>
250 |   </build>
251 |   <organization>
252 |     <name>Dr. Krusche &amp; Partner PartG</name>
253 |     <url>http://dr-kruscheundpartner.com</url>
254 |   </organization>
255 | 
256 | </project>
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/connect/ConnectConfig.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.connect
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Connect project
5 | * (https://github.com/skrusche63/spark-connect).
6 | *
7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Connect.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 | import org.apache.hadoop.conf.{Configuration => HConf}
21 |
22 | trait ConnectConfig {
23 |
24 | /**
25 | * This method retrieves the settings to access
26 | * an Aerospike Cluster
27 | */
28 | def aerospike:Map[String,String]
29 | /**
30 | * This method retrieves the settings to access
31 | * a Cassandra Cluster
32 | */
33 | def cassandra:Map[String,String]
34 | /**
35 | * This method retrieves a Hadoop configuration
36 | * to access Elasticsearch
37 | */
38 | def elastic:HConf
39 | /**
40 | * This method retrieves the settings to access
41 | * Google Analytics
42 | */
43 | def ga:Map[String,String]
44 | /**
45 | * This method retrieves the settings to access
46 | * HBase
47 | */
48 | def hbase:Map[String,String]
49 | /**
50 | * This method retrieves a Hadoop configuration
51 | * to access MongoDB
52 | */
53 | def mongo:HConf
54 | /**
55 | * This method retrieves the access parameter for a MySQL
56 | * data source, comprising url, db, user, password
57 | */
58 | def mysql:Map[String,String]
59 | /**
60 | * This method retrieves Apache Spark configuration
61 | */
62 | def spark:Map[String,String]
63 |
64 | }
--------------------------------------------------------------------------------
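A minimal sketch of what an implementation of this trait could look like, with hard-coded placeholder settings (a real implementation would typically load them from a configuration file). The map keys follow the readers in this project; the Elasticsearch and MongoDB Hadoop keys are the usual connector settings and should be checked against the connector versions in use:

```scala
import org.apache.hadoop.conf.{Configuration => HConf}
import de.kp.spark.connect.ConnectConfig

class StaticConnectConfig extends ConnectConfig {

  def aerospike: Map[String, String] = Map(
    "aerospike.input.host" -> "localhost",
    "aerospike.input.port" -> "3000"
  )

  def cassandra: Map[String, String] = Map(
    /* evaluated by CassandraReader */
    "spark.cassandra.connection.host" -> "127.0.0.1"
  )

  def elastic: HConf = {
    val conf = new HConf()
    /* standard elasticsearch-hadoop settings */
    conf.set("es.nodes", "localhost")
    conf.set("es.port", "9200")
    conf
  }

  def ga: Map[String, String] = Map(
    /* placeholder Google Analytics credentials */
    "app_name"  -> "my-analytics-app",
    "user_name" -> "user@example.com",
    "password"  -> "secret"
  )

  def hbase: Map[String, String] = Map(
    /* evaluated by HBaseReader as the ZooKeeper quorum */
    "spark.hbase.host" -> "localhost"
  )

  def mongo: HConf = {
    val conf = new HConf()
    /* standard mongo-hadoop input setting (placeholder URI) */
    conf.set("mongo.input.uri", "mongodb://localhost:27017/analytics.events")
    conf
  }

  def mysql: Map[String, String] = Map(
    "url"      -> "localhost:3306",
    "database" -> "analytics",
    "user"     -> "root",
    "password" -> "secret"
  )

  def spark: Map[String, String] = Map(
    "spark.executor.memory" -> "1g"
  )
}
```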
/src/main/scala/de/kp/spark/connect/GaRDD.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.connect
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Connect project
5 | * (https://github.com/skrusche63/spark-connect).
6 | *
7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Connect.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import java.net.URL
22 |
23 | import org.apache.spark.{Partition,SparkContext,TaskContext}
24 | import org.apache.spark.TaskKilledException
25 |
26 | import org.apache.spark.rdd.RDD
27 |
28 | import org.apache.spark.util.NextIterator
29 |
30 | import com.google.gdata.client.analytics.{AnalyticsService,DataQuery}
31 | import com.google.gdata.data.analytics.{DataEntry,DataFeed}
32 |
33 | import scala.collection.JavaConversions._
34 | import scala.collection.mutable.Buffer
35 |
36 | case class GaRow(columns:Seq[GaColumn])
37 | case class GaColumn(name:String,category:String,datatype:String,value:String)
38 |
39 | class GaPartition(idx:Int,val startIndex:Int,val maxResult:Int) extends Partition {
40 | override def index = idx
41 | }
42 |
43 |
44 | class GaRDD(
45 | /* Reference to SparkContext */
46 | @transient sc:SparkContext,
47 | /* Request parameters */
48 | params:Map[String,String],
49 | /* Total number of results */
50 | numResults:Int,
51 | /* Total number of partitions */
52 | numPartitions:Int) extends RDD[GaRow](sc,Nil) {
53 |
54 | override def getPartitions:Array[Partition] = {
55 |
56 | /*
57 | * The maximum number of results returned with a request;
58 | * note, that the Analytics Core Reporting API returns a
59 | * maximum of 10,000 rows per request, no matter how many
60 | * one asks for
61 | */
62 | val maxResult = Math.round(numResults.toDouble / numPartitions).toInt
63 |
64 | (0 until numPartitions).map(i => {
65 |
66 | val startIx = 1 + i * maxResult
67 | new GaPartition(i,startIx,maxResult)
68 |
69 | }).toArray
70 |
71 | }
72 |
73 | override def compute(thePart:Partition,context:TaskContext) = new Iterator[GaRow] {
74 |
75 | private var closed = false
76 | private var finished = false
77 |
78 | context.addTaskCompletionListener{ context => closeIfNeeded() }
79 |
80 | private val partition = thePart.asInstanceOf[GaPartition]
81 | private val query = buildQuery(partition)
82 |
83 | val service = buildService
84 | val datafeed = service.getFeed(query.getUrl,classOf[DataFeed])
85 |
86 | val dataset = datafeed.getEntries.map(mapEntry(_)).toIterator
87 |
 88 |     /*
 89 |      * Check for task interruption and determine whether
 90 |      * the data feed provides more results
 91 |      */
92 | def hasNext:Boolean = {
93 |
94 | if (context.isInterrupted())
95 | throw new TaskKilledException
96 |
97 | !finished && dataset.hasNext
98 |
99 | }
100 |
101 | def next:GaRow = {
102 |
103 | if (hasNext) {
104 | dataset.next
105 |
106 | } else {
107 |
108 | finished = true
109 | null.asInstanceOf[GaRow]
110 |
111 | }
112 |
113 | }
114 |
115 | def closeIfNeeded() {
116 | if (!closed) {
117 | close()
118 | closed = true
119 | }
120 | }
121 |
122 | def close() {
123 | /*
124 | * The connection to a GData service is properly closed
125 | * after the request has been performed; this implies
126 | * that we do nothing here
127 | */
128 | }
129 |
130 | private def mapEntry(entry:DataEntry):GaRow = {
131 |
132 | val columns = Buffer.empty[GaColumn]
133 |
134 | /* DIMENSIONS */
135 |     val dimensions = entry.getDimensions
136 |     if (!dimensions.isEmpty) {
137 |       dimensions.foreach(dimension => columns += GaColumn(dimension.getName,"dimension","string",dimension.getValue))
138 |     }
139 | 
140 |     /* METRICS */
141 |     val metrics = entry.getMetrics
142 |     metrics.foreach(metric => columns += GaColumn(metric.getName,"metric",metric.getType,metric.getValue))
143 |
144 | GaRow(columns.toSeq)
145 |
146 | }
147 |
148 | private def buildQuery(partition:GaPartition):DataQuery = {
149 |
150 |     /* REQUIRED */
151 | val query = new DataQuery(new URL(params("url")))
152 |
153 | /* REQUIRED */
154 | val start_date = params("start_date")
155 | query.setStartDate(start_date)
156 |
157 | val end_date = params("end_date")
158 | query.setEndDate(end_date)
159 |
160 | /*
161 | * REQUIRED
162 | *
163 | * The aggregated statistics for user activity in a view (profile),
164 |    * such as clicks or pageviews. When queried alone, metrics provide
165 | * the total values for the requested date range, such as overall pageviews
166 | * or total bounces.
167 | *
168 | * However, when requested with dimensions, values are segmented by the dimension.
169 | * For example, ga:pageviews requested with ga:country returns the total pageviews
170 | * per country.
171 | *
172 | * When requesting metrics, keep in mind: All requests require at least one metric.
173 | *
174 |    * You can supply a maximum of 10 metrics for any query. Not all dimensions and metrics
175 | * can be used together. Consult the Valid Combinations tool to see which combinations
176 | * work together.
177 | *
178 | */
179 | val metrics = params("metrics")
180 | query.setMetrics(metrics)
181 | /*
182 | * REQUIRED
183 | *
184 | * The unique table ID used to retrieve the Analytics Report data.
185 | */
186 | val table_id = params("table_id")
187 | query.setIds(table_id)
188 |
189 | /* OPTIONAL */
190 | if (params.contains("dimensions")) {
191 | query.setDimensions(params("dimensions"))
192 | }
193 |
194 | /* OPTIONAL */
195 | if (params.contains("filters")) {
196 | query.setFilters(params("filters"))
197 | }
198 |
199 | /* OPTIONAL */
200 | if (params.contains("sort")) {
201 | query.setSort(params("sort"))
202 | }
203 |
204 | query.setStartIndex(partition.startIndex)
205 | query.setMaxResults(partition.maxResult)
206 |
207 | query
208 |
209 | }
210 |
211 | private def buildService:AnalyticsService = {
212 |
213 | val app_name = params("app_name")
214 | val analytics = new AnalyticsService(app_name)
215 |
216 | val user_name = params("user_name")
217 | val password = params("password")
218 |
219 | analytics.setUserCredentials(user_name,password)
220 | analytics
221 |
222 | }
223 |
224 | }
225 |
226 | }
227 |
--------------------------------------------------------------------------------
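A usage sketch for the RDD defined above; the feed URL, table id, date range, metrics, dimensions and credentials are placeholders and have to match the Google Analytics (GData) account that is queried:

```scala
import org.apache.spark.{SparkConf, SparkContext}
import de.kp.spark.connect.GaRDD

object GaExample {
  def main(args: Array[String]): Unit = {

    val sc = new SparkContext(new SparkConf().setAppName("ga-example"))

    val params = Map(
      /* Service & credentials, as expected by buildService (placeholders) */
      "app_name"  -> "my-analytics-app",
      "user_name" -> "user@example.com",
      "password"  -> "secret",
      /* Query, as expected by buildQuery; the feed URL may need to be adapted */
      "url"        -> "https://www.googleapis.com/analytics/v3/data/ga",
      "table_id"   -> "ga:12345678",
      "start_date" -> "2014-01-01",
      "end_date"   -> "2014-12-31",
      "metrics"    -> "ga:pageviews,ga:sessions",
      "dimensions" -> "ga:country"
    )

    /* 20,000 rows split into 2 partitions of at most 10,000 results each */
    val rows = new GaRDD(sc, params, 20000, 2)
    rows.take(10).foreach(println)

    sc.stop()
  }
}
```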
/src/main/scala/de/kp/spark/connect/GaReader.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.connect
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Connect project
5 | * (https://github.com/skrusche63/spark-connect).
6 | *
7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Connect.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import org.apache.spark.SparkContext
22 | import org.apache.spark.rdd.RDD
23 |
24 | class GaReader(@transient sc:SparkContext) extends Serializable {
25 |
26 | def read(config:ConnectConfig,params:Map[String,String]):RDD[Map[String,Any]] = {
27 |
28 | val settings = config.ga
29 |
 30 |     val req_params = params ++ Map(
 31 | 
 32 |       "app_name" -> settings("app_name"),
 33 | 
 34 |       "user_name" -> settings("user_name"),
 35 |       "password" -> settings("password")
 36 |     )
37 |
38 | val numResults = params("num_results").toInt
39 | val numPartitions = params("num_partitions").toInt
40 |
41 | val source = new GaRDD(sc,req_params,numResults,numPartitions)
42 | source.map(toMap(_))
43 |
44 | }
45 |
46 | private def toMap(row:GaRow):Map[String,Any] = {
47 |
48 | val columns = row.columns
49 | columns.map(column => {
50 |
51 | val k = column.name
52 | val v = if (column.category == "dimension") {
53 | column.value
54 |
55 | } else {
56 |
57 | column.datatype match {
58 | /*
59 | * The datatype 'integer' describes a Long (see Metric
60 | * implementation); all other values describe Doubles
61 | */
62 | case "integer" => column.value.toLong
63 | /*
64 | * currency, us_currency, float, percent, time
65 | */
66 | case _ => column.value.toDouble
67 | }
68 | }
69 |
70 | (k,v)
71 |
72 | }).toMap
73 |
74 | }
75 |
76 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/connect/GaSource.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.connect
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Connect project
5 | * (https://github.com/skrusche63/spark-connect).
6 | *
7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Connect.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import org.apache.spark.SparkContext
22 | import org.apache.spark.rdd.RDD
23 |
24 | class GaSource(@transient sc:SparkContext) extends Serializable {
25 |
26 | def connect(config:ConnectConfig,requestParams:Map[String,String]):RDD[Map[String,Any]] = {
27 | new GaReader(sc).read(config,requestParams)
28 |
29 | }
30 |
31 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/connect/SQLSource.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.connect
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Connect project
5 | * (https://github.com/skrusche63/spark-connect).
6 | *
7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Connect.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import org.apache.spark.rdd.RDD
22 | import org.apache.spark.sql._
23 |
24 | import de.kp.spark.connect.aerospike.AerospikeSource
25 | import de.kp.spark.connect.cassandra.CassandraSource
26 |
27 | import de.kp.spark.connect.elasticsearch.ElasticSource
28 | import de.kp.spark.connect.hbase.HBaseSource
29 |
30 | import de.kp.spark.connect.jdbc.JdbcSource
31 | import de.kp.spark.connect.mongodb.MongoSource
32 |
33 | import de.kp.spark.connect.parquet.ParquetSource
34 |
35 | object Sources {
36 |
37 | val AEROSPIKE:String = "aerospike"
38 | val CASSANDRA:String = "cassandra"
39 |
40 | val ELASTICSEARCH:String = "elasticsearch"
41 |
42 | val HBASE:String = "hbase"
43 | val JDBC:String = "jdbc"
44 |
45 | val MONGODB:String = "mongodb"
46 | val PARQUET:String = "parquet"
47 |
48 | }
49 |
50 | class SQLSource(
51 | @transient sqlContext:SQLContext,
52 | config:ConnectConfig,
53 | source:String,
54 | table:String,
55 | schema:StructType,
56 | params:Map[String,String]) extends Serializable {
57 |
58 | /*
59 | * Retrieve dataset from source and convert
60 | * result into Row
61 | */
62 | private val names = sqlContext.sparkContext.broadcast(schema.fieldNames)
63 |
64 | private val rowRDD = getRDD.map(rec => {
65 | val values = names.value.map(name => rec(name))
66 | Row.fromSeq(values)
67 | })
68 |
69 | /*
70 | * Apply schema to rows and register as table
71 | */
72 | private val tableRDD = sqlContext.applySchema(rowRDD, schema)
73 | tableRDD.registerTempTable(table)
74 |
75 | def executeQuery(query:String):SchemaRDD = sqlContext.sql(query)
76 |
77 | private def getRDD:RDD[Map[String,Any]] = {
78 |
79 | val sc = sqlContext.sparkContext
80 | val columns = schema.fieldNames
81 |
82 | source match {
83 |
84 | case Sources.AEROSPIKE => {
85 | /*
86 | * Column names are called bin names in the
87 | * terminology of Aerospike
88 | */
89 | val req_params = params ++ Map("binnames" -> columns.mkString(","))
90 | new AerospikeSource(sc).read(config,req_params)
91 |
92 | }
93 | case Sources.CASSANDRA => {
94 |
95 | val req_params = params ++ Map("columns" -> columns.mkString(","))
96 | new CassandraSource(sc).read(config,req_params)
97 |
98 | }
99 | case Sources.ELASTICSEARCH => new ElasticSource(sc).read(config,params)
100 | case Sources.HBASE => {
101 |
102 | val types = schema.fields.map(field => {
103 |
104 | field.dataType match {
105 |
106 | case DoubleType => "double"
107 | case IntegerType => "integer"
108 |
109 | case LongType => "long"
110 | case StringType => "string"
111 |
112 | case _ => throw new Exception("Data type is not supported.")
113 | }
114 |
115 | })
116 |
117 | val req_params = params ++ Map("names" -> columns.mkString(","), "types" -> types.mkString(","))
118 | new HBaseSource(sc).read(config,req_params)
119 |
120 | }
121 | case Sources.JDBC => {
122 |
123 | val req_params = params ++ Map("fields" -> columns.mkString(","))
124 | new JdbcSource(sc).read(config,req_params)
125 |
126 | }
127 | case Sources.MONGODB => new MongoSource(sc).read(config,params)
128 |
129 | case Sources.PARQUET => {
130 |
131 | val req_params = params ++ Map("fields" -> columns.mkString(","))
132 |         new ParquetSource(sc).read(config,req_params)
133 |
134 | }
135 |
136 | case _ => throw new Exception(String.format("""Data source %s is not supported.""",source))
137 |
138 | }
139 |
140 | }
141 |
142 | }
--------------------------------------------------------------------------------
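A usage sketch for the SQL layer, assuming Spark SQL 1.2 (`applySchema`, `registerTempTable`) and the `StaticConnectConfig` sketched after `ConnectConfig.scala`; the schema, keyspace, table and query below are placeholders:

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql._
import de.kp.spark.connect.SQLSource

object SQLExample {
  def main(args: Array[String]): Unit = {

    val sc = new SparkContext(new SparkConf().setAppName("sql-example"))
    val sqlContext = new SQLContext(sc)

    /* The field names must match the columns of the underlying source */
    val schema = StructType(Seq(
      StructField("user",  StringType, true),
      StructField("item",  StringType, true),
      StructField("score", DoubleType, true)
    ))

    /* Source-specific parameters (here: Cassandra keyspace and table) */
    val params = Map("keyspace" -> "analytics", "table" -> "ratings")
    val config = new StaticConnectConfig() // see the ConnectConfig sketch above

    /* "ratings" is also used as the name of the registered temp table */
    val source = new SQLSource(sqlContext, config, "cassandra", "ratings", schema, params)
    val result = source.executeQuery("SELECT user, item, score FROM ratings WHERE score > 3.0")

    result.collect().foreach(println)
    sc.stop()
  }
}
```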
/src/main/scala/de/kp/spark/connect/aerospike/AerospikeReader.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.connect.aerospike
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Connect project
5 | * (https://github.com/skrusche63/spark-connect).
6 | *
7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Connect.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import org.apache.spark.SparkContext
22 | import org.apache.spark.rdd.RDD
23 |
24 | import org.apache.hadoop.conf.{Configuration => HConfig}
25 |
26 | import com.aerospike.hadoop._
27 | import de.kp.spark.connect.ConnectConfig
28 |
29 | import scala.collection.JavaConversions._
30 |
31 | class AerospikeReader(@transient sc:SparkContext) extends Serializable {
32 |
33 | /*
34 | * Background to Aerospike:
35 | *
36 | * At the highest level, data is collected in containers called namespaces;
37 | * namespaces are similar to databases. Within a namespace, data are divided
38 | * into sets (equivalent to tables), and finally records (rows).
39 | */
40 | def read(config:ConnectConfig,params:Map[String,String]):RDD[Map[String,Any]] = {
41 |
42 | val settings = config.aerospike
43 |
44 | val conf = new HConfig()
45 | /* Add host & port to configuration */
46 | val host = if (settings.contains("aerospike.input.host"))
47 | settings("aerospike.input.host") else "localhost"
48 |
49 | conf.set("aerospike.input.host", host)
50 |
51 | val port = if (settings.contains("aerospike.input.port"))
52 | settings("aerospike.input.port") else "3000"
53 |
54 | conf.set("aerospike.input.port", port)
55 |
56 | /* Add namespace and set name to configuration */
57 | conf.set("aerospike.input.namespace",params("namespace"))
58 | conf.set("aerospike.input.setname",params("setnames"))
59 |
60 | /* Add bin names & operation */
61 | val binnames = if (params.contains("binnames"))
62 | params("binnames") else ""
63 |
64 | conf.set("aerospike.input.binnames",binnames)
65 |
66 | val operation = if (params.contains("operation"))
67 | params("operation") else "scan"
68 |
69 | conf.set("aerospike.input.operation",operation)
70 |
71 | if (operation == "numrange") {
72 |
73 | conf.set("aerospike.input.numrange.bin",params("numrange_bin"))
74 |
75 | conf.set("aerospike.input.numrange.begin",params("numrange_begin"))
76 | conf.set("aerospike.input.numrange.end",params("numrange_end"))
77 |
78 | }
79 |
80 | read(conf)
81 |
82 | }
83 |
84 | def read(config:HConfig):RDD[Map[String,Any]] = {
85 |
86 | val source = sc.newAPIHadoopRDD(config, classOf[AerospikeInputFormat], classOf[AerospikeKey], classOf[AerospikeRecord])
87 | source.map{case(key,record) => toMap(key,record)}
88 |
89 | }
90 |
91 | private def toMap(key:AerospikeKey,record:AerospikeRecord):Map[String,Any] = {
92 |
93 | val bins = record.bins
94 | bins.toMap
95 |
96 | }
97 | }
--------------------------------------------------------------------------------
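The parameters evaluated above translate into a request map like the following sketch; namespace, set and bin names are placeholders, and the three `numrange_*` keys are only required for the `numrange` operation. `StaticConnectConfig` refers to the `ConnectConfig` sketch above:

```scala
import org.apache.spark.{SparkConf, SparkContext}
import de.kp.spark.connect.aerospike.AerospikeSource

object AerospikeExample {
  def main(args: Array[String]): Unit = {

    val sc = new SparkContext(new SparkConf().setAppName("aerospike-example"))
    val config = new StaticConnectConfig() // see the ConnectConfig sketch above

    val params = Map(
      "namespace" -> "test",
      "setnames"  -> "events",                 // evaluated as the set name
      "binnames"  -> "user,item,timestamp",
      "operation" -> "numrange",
      /* restrict the scan to a numeric range on a single bin */
      "numrange_bin"   -> "timestamp",
      "numrange_begin" -> "1400000000",
      "numrange_end"   -> "1500000000"
    )

    val rdd = new AerospikeSource(sc).read(config, params)
    println(rdd.count())

    sc.stop()
  }
}
```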
/src/main/scala/de/kp/spark/connect/aerospike/AerospikeSource.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.connect.aerospike
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Connect project
5 | * (https://github.com/skrusche63/spark-connect).
6 | *
7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Connect.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import org.apache.spark.SparkContext
22 | import org.apache.spark.rdd.RDD
23 |
24 | import de.kp.spark.connect.ConnectConfig
25 |
26 | class AerospikeSource(@transient sc:SparkContext) extends Serializable {
27 |
28 | def read(config:ConnectConfig,params:Map[String,String]):RDD[Map[String,Any]] = {
29 | new AerospikeReader(sc).read(config,params)
30 | }
31 |
32 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/connect/cassandra/CassandraReader.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.connect.cassandra
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Connect project
5 | * (https://github.com/skrusche63/spark-connect).
6 | *
7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Connect.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import org.apache.spark.SparkContext
22 | import org.apache.spark.rdd.RDD
23 |
24 | import com.datastax.spark.connector._
25 | import de.kp.spark.connect.ConnectConfig
26 |
27 | class CassandraReader(@transient sc:SparkContext) extends Serializable {
28 | /**
29 | * This method reads the content of a Cassandra table of a specific
30 | * keyspace. Actually, all data records are retrieved from the table
31 | */
32 | def read(config:ConnectConfig,keyspace:String,table:String,columns:List[String] = List.empty[String]):RDD[Map[String,Any]] = {
33 |
34 | val settings = config.cassandra
35 | val host = settings("spark.cassandra.connection.host")
36 |
37 | /*
38 | * We add the configuration parameters
39 | * to connect to a Cassandra cluster here
40 | */
41 | sc.getConf.set("spark.cassandra.connection.host",host)
 42 |     /*
 43 |      * Read from the specified keyspace and table; if no column
 44 |      * selection is provided, all columns are retrieved
 45 |      */
46 | val source = if (columns.isEmpty)
47 | sc.cassandraTable(keyspace, table) else sc.cassandraTable(keyspace, table).select(columns.map(ColumnName(_)):_*)
48 |
49 | source.map(toMap(_))
50 |
51 | }
52 |
53 | /**
54 | * For the primitive data types required by the different
55 | * engines of Predictiveworks, the conversion of the column
56 | * names and values using the toMap method is sufficient.
57 | *
58 | * In case of more complex data types, this method must be
59 | * adapted to these additional requirements
60 | */
61 | private def toMap(row:CassandraRow):Map[String,Any] = {
62 | row.toMap
63 | }
64 |
65 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/connect/cassandra/CassandraSource.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.connect.cassandra
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Connect project
5 | * (https://github.com/skrusche63/spark-connect).
6 | *
7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Connect.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import org.apache.spark.SparkContext
22 | import org.apache.spark.rdd.RDD
23 |
24 | import de.kp.spark.connect.ConnectConfig
25 |
26 | class CassandraSource(@transient sc:SparkContext) extends Serializable {
27 |
28 | def read(config:ConnectConfig,params:Map[String,String]):RDD[Map[String,Any]] = {
29 |
30 | val keyspace = params("keyspace")
31 | val table = params("table")
32 |
33 | val columns = if (params.contains("columns")) params("columns").split(",").toList else List.empty[String]
34 |
35 | new CassandraReader(sc).read(config,keyspace,table,columns)
36 |
37 | }
38 |
39 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/connect/dmp/CxenseClient.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.connect.dmp
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Connect project
5 | * (https://github.com/skrusche63/spark-connect).
6 | *
7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Connect.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import com.fasterxml.jackson.databind.{Module, ObjectMapper}
22 | import com.fasterxml.jackson.module.scala.DefaultScalaModule
23 |
24 | import javax.ws.rs.HttpMethod
25 |
26 | import javax.ws.rs.client.{ClientBuilder,Entity}
27 | import javax.ws.rs.core.MediaType
28 |
29 | import javax.crypto.Mac
30 | import javax.crypto.spec.SecretKeySpec
31 |
32 | import org.joda.time.DateTime
33 | import org.joda.time.DateTimeZone
34 | import org.joda.time.format.ISODateTimeFormat
35 |
36 | import org.apache.commons.codec.binary.Base64
37 |
38 | class CxenseClient(username:String,secret:String) {
39 |
40 | private val CXENSE_URI = "https://api.cxense.com"
41 |
42 | private val JSON_MAPPER = new ObjectMapper()
43 | JSON_MAPPER.registerModule(DefaultScalaModule)
44 |
45 | def getProfileContentFetch(params:Map[String,Any]):Map[String,Any] = {
46 |
47 | val endpoint = "profile/content/fetch"
48 | getResponse(endpoint,params)
49 |
50 | }
51 |
52 | def getProfileUserExternalRead(params:Map[String,Any]):Map[String,Any] = {
53 |
54 | val endpoint = "profile/user/external/read"
55 | getResponse(endpoint,params)
56 |
57 | }
58 | /*
59 | * Collect interest profile for a certain user; the relevant
60 | * part of the response (profile) is equivalent to the 'content
61 | * fetch' request
62 | */
63 | def getProfileUser(params:Map[String,Any]):Map[String,Any] = {
64 |
65 | val endpoint = "profile/user"
66 | getResponse(endpoint,params)
67 |
68 | }
69 |
70 | def getProfileUserSegment(params:Map[String,Any]):Map[String,Any] = {
71 |
72 | val endpoint = "profile/user/segment"
73 | getResponse(endpoint,params)
74 |
75 | }
76 |
77 | def getSegmentRead(params:Map[String,Any]):Map[String,Any] = {
78 |
79 | val endpoint = "segment/read"
80 | getResponse(endpoint,params)
81 |
82 | }
83 |
84 | def getSite(params:Map[String,Any]):Map[String,Any] = {
85 |
86 | val endpoint = "site"
87 | getResponse(endpoint,params)
88 |
89 | }
90 |
91 | def getSiteGroup(params:Map[String,Any]):Map[String,Any] = {
92 |
93 | val endpoint = "site/group"
94 | getResponse(endpoint,params)
95 |
96 | }
97 |
98 | def getTraffic(params:Map[String,Any]):Map[String,Any] = {
99 |
100 | val endpoint = "traffic"
101 | getResponse(endpoint,params)
102 |
103 | }
104 |
105 | def getTrafficCompare(params:Map[String,Any]):Map[String,Any] = {
106 |
107 | val endpoint = "traffic/compare"
108 | getResponse(endpoint,params)
109 |
110 | }
111 |
112 | def getTrafficCustom(params:Map[String,Any]):Map[String,Any] = {
113 |
114 | val endpoint = "traffic/custom"
115 | getResponse(endpoint,params)
116 |
117 | }
118 |
119 | def getTrafficCustomDescribe(params:Map[String,Any]):Map[String,Any] = {
120 |
121 | val endpoint = "traffic/custom/describe"
122 | getResponse(endpoint,params)
123 |
124 | }
125 |
126 | def getTrafficEvent(params:Map[String,Any]):Map[String,Any] = {
127 |
128 | val endpoint = "traffic/event"
129 | getResponse(endpoint,params)
130 |
131 | }
132 |
133 | def getTrafficEventDescribe(params:Map[String,Any]):Map[String,Any] = {
134 |
135 | val endpoint = "traffic/event/describe"
136 | getResponse(endpoint,params)
137 |
138 | }
139 |
140 | def getTrafficIntent(params:Map[String,Any]):Map[String,Any] = {
141 |
142 | val endpoint = "traffic/intent"
143 | getResponse(endpoint,params)
144 |
145 | }
146 |
147 | def getTrafficKeyword(params:Map[String,Any]):Map[String,Any] = {
148 |
149 | val endpoint = "traffic/keyword"
150 | getResponse(endpoint,params)
151 |
152 | }
153 |
154 | def getTrafficKeywordDescribe(params:Map[String,Any]):Map[String,Any] = {
155 |
156 | val endpoint = "traffic/keyword/describe"
157 | getResponse(endpoint,params)
158 |
159 | }
160 |
161 | def getTrafficRelated(params:Map[String,Any]):Map[String,Any] = {
162 |
163 | val endpoint = "traffic/related"
164 | getResponse(endpoint,params)
165 |
166 | }
167 |
168 | def getTrafficUser(params:Map[String,Any]):Map[String,Any] = {
169 |
170 | val endpoint = "traffic/user"
171 | getResponse(endpoint,params)
172 |
173 | }
174 |
175 | def getTrafficUserExternal(params:Map[String,Any]):Map[String,Any] = {
176 |
177 | val endpoint = "traffic/user/external"
178 | getResponse(endpoint,params)
179 |
180 | }
181 |
182 | def getTrafficUserHistogram(params:Map[String,Any]):Map[String,Any] = {
183 |
184 | val endpoint = "traffic/user/histogram"
185 | getResponse(endpoint,params)
186 |
187 | }
188 |
189 | def getTrafficUserHistogramEvent(params:Map[String,Any]):Map[String,Any] = {
190 |
191 | val endpoint = "traffic/user/histogram/event"
192 | getResponse(endpoint,params)
193 |
194 | }
195 |
196 | def getTrafficUserInterest(params:Map[String,Any]):Map[String,Any] = {
197 |
198 | val endpoint = "traffic/user/interest"
199 | getResponse(endpoint,params)
200 |
201 | }
202 |
203 | def getTrafficUserKeyword(params:Map[String,Any]):Map[String,Any] = {
204 |
205 | val endpoint = "traffic/user/keyword"
206 | getResponse(endpoint,params)
207 |
208 | }
209 |
210 | private def getAuthenticationHeader:String = {
211 |
212 | val mac = Mac.getInstance("HmacSHA256")
213 | mac.init(new SecretKeySpec(secret.getBytes("UTF-8"), "HmacSHA256"))
214 |
215 | val date = ISODateTimeFormat.dateTime().print(new DateTime(DateTimeZone.UTC))
216 | val signature = new String(Base64.encodeBase64(mac.doFinal(date.getBytes("UTF-8"))))
217 |
218 | "username=" + username + " date=" + date + " hmac-sha256-base64=" + signature
219 |
220 | }
221 |
222 | private def getResponse(endpoint:String,req_params:Map[String,Any]):Map[String,Any] = {
223 |
224 | val body = JSON_MAPPER.writeValueAsString(req_params)
225 |
226 | val client = ClientBuilder.newClient()
227 | val request = client.target(CXENSE_URI).path("/").path(endpoint).request(MediaType.APPLICATION_JSON_TYPE)
228 |
229 | val response = request
230 | .header("X-cXense-Authentication", getAuthenticationHeader)
231 | .method(HttpMethod.POST, if (body == null) null else Entity.json(body), classOf[String])
232 |
233 | client.close()
234 |
235 | JSON_MAPPER.readValue(response, classOf[Map[String,Any]])
236 |
237 | }
238 |
239 | }
--------------------------------------------------------------------------------
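A usage sketch for the client above; the username and API secret are placeholders, and the request fields (`siteId`, `start`, `stop`, `fields`) are passed through verbatim, so they have to follow the Cxense `/traffic` API documentation rather than anything enforced by this client:

```scala
import de.kp.spark.connect.dmp.CxenseClient

object CxenseExample {
  def main(args: Array[String]): Unit = {

    /* username and API secret as issued by Cxense (placeholders) */
    val client = new CxenseClient("user@example.com", "api-secret")

    /* request body fields as defined by the /traffic endpoint */
    val params = Map(
      "siteId" -> "1234567890",
      "start"  -> "-7d",
      "stop"   -> "now",
      "fields" -> List("events", "sessionStarts")
    )

    val response = client.getTraffic(params)
    response.foreach { case (k, v) => println(s"$k -> $v") }
  }
}
```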
/src/main/scala/de/kp/spark/connect/elasticsearch/ElasticReader.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.connect.elasticsearch
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Connect project
5 | * (https://github.com/skrusche63/spark-connect).
6 | *
7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Connect.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import org.apache.spark.SparkContext
22 | import org.apache.spark.rdd.RDD
23 |
24 | import org.apache.hadoop.io.{ArrayWritable,DoubleWritable,IntWritable,LongWritable,MapWritable,NullWritable,Text,Writable}
25 |
26 | import org.apache.hadoop.conf.{Configuration => HConfig}
27 |
28 | import org.elasticsearch.hadoop.mr.EsInputFormat
29 | import de.kp.spark.connect.ConnectConfig
30 |
31 | import scala.collection.JavaConversions._
32 |
33 | class ElasticReader(@transient sc:SparkContext) extends Serializable {
34 |
35 | val ES_QUERY:String = "es.query"
36 | val ES_RESOURCE:String = "es.resource"
37 |
38 | def read(config:HConfig):RDD[Map[String,Any]] = {
39 |
40 | val source = sc.newAPIHadoopRDD(config, classOf[EsInputFormat[Text, MapWritable]], classOf[Text], classOf[MapWritable])
41 | source.map(hit => toMap(hit._2))
42 |
43 | }
44 |
45 | def read(config:ConnectConfig,index:String,mapping:String,query:String):RDD[Map[String,Any]] = {
46 |
47 | val conf = config.elastic
48 |
49 | /*
50 | * Append dynamic request specific data to Elasticsearch configuration;
51 | * this comprises the search query to be used and the index (and mapping)
52 | * to be accessed
53 | */
54 | conf.set(ES_QUERY,query)
55 | conf.set(ES_RESOURCE,(index + "/" + mapping))
56 |
57 | read(conf)
58 |
59 | }
60 |
61 | private def toMap(mw:MapWritable):Map[String,Any] = {
62 |
63 | mw.entrySet().map(kv => {
64 |
65 | val k = kv.getKey().asInstanceOf[Text].toString
66 | val v = kv.getValue() match {
67 |
68 | case valu:ArrayWritable => {
69 |
70 | val array = valu.get
71 | array.map(record => {
72 |
73 | record.asInstanceOf[MapWritable].entrySet().map(entry => {
74 |
75 | val sub_k = entry.getKey().asInstanceOf[Text].toString()
76 | val sub_v = entry.getValue() match {
77 |
 78 |             case sub_valu:IntWritable => sub_valu.get()
 79 |             case sub_valu:DoubleWritable => sub_valu.get()
 80 | 
 81 |             case sub_valu:LongWritable => sub_valu.get()
 82 |             case sub_valu:Text => sub_valu.toString
83 |
84 | case _ => throw new Exception("Data type is not supported.")
85 |
86 | }
87 |
88 | (sub_k,sub_v)
89 |
90 | }).toMap
91 |
92 | }).toList
93 |
94 | }
95 |
96 | case valu:IntWritable => valu.get()
97 | case valu:DoubleWritable => valu.get()
98 |
99 | case valu:LongWritable => valu.get()
100 | case valu:Text => valu.toString
101 |
102 | case _ => throw new Exception("Data type is not supported.")
103 |
104 | }
105 |
106 | (k,v)
107 |
108 | }).toMap
109 |
110 | }
111 |
112 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/connect/elasticsearch/ElasticSource.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.connect.elasticsearch
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Connect project
5 | * (https://github.com/skrusche63/spark-connect).
6 | *
7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Connect.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import org.apache.spark.SparkContext
22 | import org.apache.spark.rdd.RDD
23 |
24 | import de.kp.spark.connect.ConnectConfig
25 |
26 | class ElasticSource(@transient sc:SparkContext) extends Serializable {
27 |
28 | def read(config:ConnectConfig,params:Map[String,String]):RDD[Map[String,Any]] = {
29 |
30 | val index = params("index")
31 | val mapping = params("mapping")
32 |
 33 |     val query = params("query")
34 | new ElasticReader(sc).read(config,index,mapping,query)
35 |
36 | }
37 |
38 | }
--------------------------------------------------------------------------------
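A usage sketch; index, mapping and query are placeholders, the query string is passed verbatim as `es.query`, and the Hadoop configuration returned by `config.elastic` (here the `StaticConnectConfig` sketch above) is expected to point at the cluster:

```scala
import org.apache.spark.{SparkConf, SparkContext}
import de.kp.spark.connect.elasticsearch.ElasticSource

object ElasticExample {
  def main(args: Array[String]): Unit = {

    val sc = new SparkContext(new SparkConf().setAppName("es-example"))
    val config = new StaticConnectConfig() // see the ConnectConfig sketch above

    val params = Map(
      "index"   -> "orders",
      "mapping" -> "items",
      /* passed verbatim as 'es.query' */
      "query"   -> """{"query":{"match_all":{}}}"""
    )

    val rdd = new ElasticSource(sc).read(config, params)
    rdd.take(5).foreach(println)

    sc.stop()
  }
}
```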
/src/main/scala/de/kp/spark/connect/hbase/HBaseReader.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.connect.hbase
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Connect project
5 | * (https://github.com/skrusche63/spark-connect).
6 | *
7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Connect.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import org.apache.spark.SparkContext
22 | import org.apache.spark.rdd.RDD
23 |
24 | import org.apache.hadoop.hbase.util.Bytes
25 | import org.apache.hadoop.hbase.HBaseConfiguration
26 |
27 | import org.apache.hadoop.hbase.CellUtil
28 | import org.apache.hadoop.hbase.client.Result
29 |
30 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable
31 | import org.apache.hadoop.hbase.mapreduce.TableInputFormat
32 |
33 | import de.kp.spark.connect.ConnectConfig
34 |
35 | class HBaseReader(@transient sc:SparkContext) extends Serializable {
36 |
37 | private val HBASE_ROOTDIR = "/hbase"
38 |
39 | /**
 40 |    * This method reads the content of an HBase table for a specific
 41 |    * column family. Actually, all data records are retrieved from the table
42 | */
43 | def read(config:ConnectConfig,columnfamily:String,table:String,names:List[String],types:List[String]):RDD[Map[String,Any]] = {
44 |
45 | val settings = config.hbase
46 | val host = settings("spark.hbase.host")
47 |
48 | val conf = HBaseConfiguration.create
49 | conf.setBoolean("hbase.cluster.distributed", true)
50 | conf.setInt("hbase.client.scanner.caching", 10000)
51 |
52 | conf.set("hbase.rootdir", HBASE_ROOTDIR)
53 |
54 | conf.set("hbase.zookeeper.quorum", host)
55 | conf.set("hbase.zookeeper.property.clientPort","2181")
56 |
57 | val columns = names.map(name => columnfamily + ":" + name)
 58 |     conf.set(TableInputFormat.SCAN_COLUMNS, columns.mkString(" "))
 59 |     conf.set(TableInputFormat.INPUT_TABLE, table)
60 | val typedNames = names.zip(types)
61 |
62 | def toMap(key:ImmutableBytesWritable,row:Result):Map[String,Any] = {
63 |
64 | typedNames.map{case(colname,coltype) => {
65 | /*
66 | * Convert column family and respective columns
67 | * into HBase readable Byte array
68 | */
69 | val cf = Bytes.toBytes(columnfamily)
70 | val cn = Bytes.toBytes(colname)
71 |
72 | if (row.containsColumn(cf,cn) == false) throw new Exception(
73 | String.format("""Combination of cf:%s and cn:%s does not exist""",columnfamily,colname))
74 |
75 | val byteValue = CellUtil.cloneValue(row.getColumnLatestCell(cf,cn)).array
76 | /*
77 | * We actually support the following data types:
78 | *
79 | * double, integer, long, string
80 | *
81 | * as these are needed by Predictiveworks
82 | */
83 | val colvalu = coltype match {
84 |
85 | case "double" => Bytes.toDouble(byteValue)
86 |
87 | case "integer" => Bytes.toInt(byteValue)
88 |
89 | case "long" => Bytes.toLong(byteValue)
90 |
91 | case "string" => Bytes.toString(byteValue)
92 |
93 | case _ => throw new Exception(String.format("""The data type '%s' is not supported.""",coltype))
94 |
95 | }
96 |
97 | (colname,colvalu)
98 |
99 | }}.toMap
100 |
101 | }
102 |
103 | val source = sc.newAPIHadoopRDD(conf,classOf[TableInputFormat],classOf[ImmutableBytesWritable],classOf[Result])
104 | source.map{case(key,row) => toMap(key,row)}
105 |
106 | }
107 |
108 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/connect/hbase/HBaseSource.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.connect.hbase
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Connect project
5 | * (https://github.com/skrusche63/spark-connect).
6 | *
7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Connect.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import org.apache.spark.SparkContext
22 | import org.apache.spark.rdd.RDD
23 |
24 | import de.kp.spark.connect.ConnectConfig
25 |
26 | class HBaseSource(@transient sc:SparkContext) extends Serializable {
27 |
28 | def read(config:ConnectConfig,params:Map[String,String]):RDD[Map[String,Any]] = {
29 |
30 | val columnfamily = params("columnfamily")
31 | val table = params("table")
32 |
33 | val names = params("names").split(",").toList
34 | val types = params("types").split(",").toList
35 |
36 | new HBaseReader(sc).read(config,columnfamily,table,names,types)
37 |
38 | }
39 |
40 | }
--------------------------------------------------------------------------------
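A usage sketch; column family, table and column names are placeholders, and the `types` list must line up position by position with `names`, as expected by `HBaseReader`. `StaticConnectConfig` refers to the `ConnectConfig` sketch above:

```scala
import org.apache.spark.{SparkConf, SparkContext}
import de.kp.spark.connect.hbase.HBaseSource

object HBaseExample {
  def main(args: Array[String]): Unit = {

    val sc = new SparkContext(new SparkConf().setAppName("hbase-example"))
    val config = new StaticConnectConfig() // see the ConnectConfig sketch above

    val params = Map(
      "columnfamily" -> "cf",
      "table"        -> "events",
      "names"        -> "user,item,score",
      "types"        -> "string,string,double"
    )

    val rdd = new HBaseSource(sc).read(config, params)
    rdd.take(5).foreach(println)

    sc.stop()
  }
}
```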
/src/main/scala/de/kp/spark/connect/jdbc/JdbcReader.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.connect.jdbc
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Connect project
5 | * (https://github.com/skrusche63/spark-connect).
6 | *
7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Connect.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import java.sql.{Connection,DriverManager,ResultSet}
22 |
23 | import org.apache.spark.SparkContext
24 | import org.apache.spark.rdd.{JdbcRDD,RDD}
25 |
26 | import scala.collection.mutable.HashMap
27 | import de.kp.spark.connect.ConnectConfig
28 |
29 | class JdbcReader(@transient sc:SparkContext) extends Serializable {
30 |
31 | protected val MYSQL_DRIVER = "com.mysql.jdbc.Driver"
32 | protected val NUM_PARTITIONS = 1
33 |
34 | def read(config:ConnectConfig,site:Int,query:String,fields:List[String] = List.empty[String]):RDD[Map[String,Any]] = {
35 |
36 | val conf = config.mysql
37 |
38 | val url = conf("url")
39 | val database = conf("database")
40 |
41 | val user = conf("user")
42 | val password = conf("password")
43 | /*
44 | * The value of 'site' is used as upper and lower bound for
45 | * the range (key) variable of the database table
46 | */
47 | val result = new JdbcRDD(sc,() => getConnection(url,database,user,password),
48 | query,site,site,NUM_PARTITIONS,
49 | (rs:ResultSet) => getRow(rs,fields)
50 | ).cache()
51 |
52 | result
53 |
54 | }
55 |
56 | /**
57 | * Convert database row into Map[String,Any] and restrict
58 | * to column names that are defined by the field spec
59 | */
60 | protected def getRow(rs:ResultSet,fields:List[String]):Map[String,Any] = {
61 | val metadata = rs.getMetaData()
62 | val numCols = metadata.getColumnCount()
63 |
64 | val row = HashMap.empty[String,Any]
65 | (1 to numCols).foreach(i => {
66 |
67 | val k = metadata.getColumnName(i)
68 | val v = rs.getObject(i)
69 |
70 | if (fields.isEmpty) {
71 | row += k -> v
72 |
73 | } else {
74 | if (fields.contains(k)) row += k -> v
75 |
76 | }
77 |
78 | })
79 |
80 | row.toMap
81 |
82 | }
83 |
84 | protected def getConnection(url:String,database:String,user:String,password:String):Connection = {
85 |
86 | /* Create MySQL connection */
87 | Class.forName(MYSQL_DRIVER).newInstance()
88 | val endpoint = getEndpoint(url,database)
89 |
90 | /* Generate database connection */
91 | val connection = DriverManager.getConnection(endpoint,user,password)
92 | connection
93 |
94 | }
95 |
96 | protected def getEndpoint(url:String,database:String):String = {
97 |
98 | val endpoint = "jdbc:mysql://" + url + "/" + database
99 | endpoint
100 |
101 | }
102 |
103 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/connect/jdbc/JdbcSource.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.connect.jdbc
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Connect project
5 | * (https://github.com/skrusche63/spark-connect).
6 | *
7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Connect.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import org.apache.spark.SparkContext
22 | import org.apache.spark.rdd.RDD
23 |
24 | import de.kp.spark.connect.ConnectConfig
25 |
26 | class JdbcSource(@transient sc:SparkContext) extends Serializable {
27 |
28 | def read(config:ConnectConfig,params:Map[String,String]):RDD[Map[String,Any]] = {
29 |
 30 |     val site = params("site").toInt
31 | val query = params("query")
32 |
33 | val fields = params("fields").split(",").toList
34 |
35 | new JdbcReader(sc).read(config,site,query,fields)
36 |
37 | }
38 |
39 | }
--------------------------------------------------------------------------------
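A usage sketch; note that Spark's `JdbcRDD` expects the query to contain two `?` placeholders, both of which are bound to the `site` value by `JdbcReader`. Table and column names are placeholders, and `StaticConnectConfig` refers to the `ConnectConfig` sketch above:

```scala
import org.apache.spark.{SparkConf, SparkContext}
import de.kp.spark.connect.jdbc.JdbcSource

object JdbcExample {
  def main(args: Array[String]): Unit = {

    val sc = new SparkContext(new SparkConf().setAppName("jdbc-example"))
    val config = new StaticConnectConfig() // see the ConnectConfig sketch above

    val params = Map(
      "site"   -> "42",
      /* both '?' placeholders are bound to the site value */
      "query"  -> "SELECT * FROM orders WHERE site >= ? AND site <= ?",
      "fields" -> "user,item,amount"
    )

    val rdd = new JdbcSource(sc).read(config, params)
    rdd.take(5).foreach(println)

    sc.stop()
  }
}
```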
/src/main/scala/de/kp/spark/connect/log/ApacheLogAnalyzer.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.connect.log
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Connect project
5 | * (https://github.com/skrusche63/spark-connect).
6 | *
7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Connect.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import org.apache.spark.SparkContext
22 | import org.apache.spark.sql.SQLContext
23 |
24 | case class ApacheLogStats(
25 | contentSizeStats:(Long,Long,Long,Long),
26 | responseCodeCount:Seq[(Int,Long)],
27 | ipAddresses:Seq[String],
28 | topEndpoints:Seq[(String,Long)]
29 | )
30 |
31 | class ApacheLogAnalyzer(@transient sc:SparkContext) extends Serializable {
32 |
33 | private val sqlContext = new SQLContext(sc)
34 | import sqlContext.createSchemaRDD
35 |
36 | def stats(store:String):ApacheLogStats = {
37 |
38 | /*
39 | * Data structure
40 | *
41 | * ip_address
42 | * client_identd
43 | * user_id
44 | * datetime
45 | * method
46 | * endpoint
47 | * protocol
48 | * response_code
49 | * content_size
50 | *
51 | */
52 | val logs = sc.textFile(store).map(ApacheLogParser.parse(_))
53 | logs.registerTempTable("logs")
54 |
55 | /* Calculate statistics based on the content size */
56 | val CONTENT_SIZE_SQL = "SELECT SUM(content_size), COUNT(*), MIN(content_size), MAX(content_size) FROM logs"
57 | val contentSizeStats = sqlContext.sql(CONTENT_SIZE_SQL).map(row =>
58 | (row.getLong(0), row.getLong(1), row.getLong(2), row.getLong(3))
59 |
60 | ).first
61 |
62 |     /* Compute the request count per response code */
63 | val RESPONSE_CODE_SQL = "SELECT response_code, COUNT(*) FROM logs GROUP BY response_code"
64 | val responseCodeCount = sqlContext.sql(RESPONSE_CODE_SQL).map(row =>
65 | (row.getInt(0), row.getLong(1))
66 |
67 | ).take(1000).toList
68 |
69 |     /* Any IP address that has accessed the server more than 10 times */
70 | val IP_ADDRESS_SQL = "SELECT ip_address, COUNT(*) AS total FROM logs GROUP BY ip_address HAVING total > 10"
71 | val ipAddresses = sqlContext.sql(IP_ADDRESS_SQL).map(row =>
72 | row.getString(0)
73 | ).take(100) // Take only 100 in case this is a super large data set.
74 |
75 | /* Top Endpoints */
76 | val ENDPOINT_SQL = "SELECT endpoint, COUNT(*) AS total FROM logs GROUP BY endpoint ORDER BY total DESC LIMIT 10"
77 | val topEndpoints = sqlContext.sql(ENDPOINT_SQL).map(row =>
78 | (row.getString(0), row.getLong(1))
79 | ).collect()
80 |
81 | ApacheLogStats(
82 | contentSizeStats,responseCodeCount,ipAddresses,topEndpoints
83 | )
84 |
85 | }
86 |
87 | }
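
A minimal driver sketch for the analyzer, assuming a local Spark context; the log location is a placeholder:

    import org.apache.spark.{SparkConf, SparkContext}
    import de.kp.spark.connect.log.ApacheLogAnalyzer

    val sc = new SparkContext(new SparkConf().setAppName("log-stats").setMaster("local[2]"))
    val stats = new ApacheLogAnalyzer(sc).stats("/tmp/access.log")

    val (sum, count, min, max) = stats.contentSizeStats
    println(s"content size: sum=$sum count=$count min=$min max=$max")

    stats.topEndpoints.foreach { case (endpoint, total) => println(s"$endpoint -> $total") }
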
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/connect/log/ApacheLogParser.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.connect.log
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Connect project
5 | * (https://github.com/skrusche63/spark-connect).
6 | *
7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Connect.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import java.util.regex.Matcher
22 | import java.util.regex.Pattern
23 |
24 | case class ApacheLogInfo(
25 | ip_address:String,
26 | client_identd:String,
27 | user_id:String,
28 | datetime:String,
29 | method:String,
30 | endpoint:String,
31 | protocol:String,
32 | response_code:Int,
33 | content_size:Long
34 | )
35 |
36 | object ApacheLogParser extends Serializable{
37 | /*
38 | * Example Apache log line:
39 | *
40 | * 127.0.0.1 - - [21/Jul/2014:9:55:27 -0800] "GET /home.html HTTP/1.1" 200 2048
41 | *
42 | */
43 | private val LOG_ENTRY_PATTERN =
44 | // 1:IP 2:client 3:user 4:date time 5:method 6:req 7:proto 8:respcode 9:size
45 | "^(\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(\\S+) (\\S+) (\\S+)\" (\\d{3}) (\\d+)"
46 |
47 | private val PATTERN = Pattern.compile(LOG_ENTRY_PATTERN)
48 |
49 | def parse(logline:String):ApacheLogInfo = {
50 |
51 | val m = PATTERN.matcher(logline)
52 | if (!m.find()) {
53 |       throw new RuntimeException("Error parsing log line: " + logline)
54 | }
55 |
56 | ApacheLogInfo(
57 | m.group(1),
58 | m.group(2),
59 | m.group(3),
60 | m.group(4),
61 | m.group(5),
62 | m.group(6),
63 | m.group(7),
64 | m.group(8).toInt,
65 | m.group(9).toLong)
66 |
67 | }
68 |
69 | }
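
Applying the parser to the example line from the comment above, a short sketch:

    import de.kp.spark.connect.log.ApacheLogParser

    val line = """127.0.0.1 - - [21/Jul/2014:9:55:27 -0800] "GET /home.html HTTP/1.1" 200 2048"""
    val info = ApacheLogParser.parse(line)

    // info.ip_address    == "127.0.0.1"
    // info.endpoint      == "/home.html"
    // info.response_code == 200
    // info.content_size  == 2048L
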
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/connect/mongodb/MongoReader.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.connect.mongodb
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Connect project
5 | * (https://github.com/skrusche63/spark-connect).
6 | *
7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Connect.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import org.apache.spark.SparkContext
22 | import org.apache.spark.rdd.RDD
23 |
24 | import com.mongodb.hadoop.MongoInputFormat
25 | import org.bson.BSONObject
26 |
27 | import scala.collection.mutable.HashMap
28 | import scala.collection.JavaConversions._
29 |
30 | import de.kp.spark.connect.ConnectConfig
31 |
32 | class MongoReader(@transient sc:SparkContext) extends Serializable {
33 |
34 | def read(config:ConnectConfig,query:String):RDD[Map[String,Any]] = {
35 |
36 | val conf = config.mongo
37 | conf.set("mongo.input.query",query)
38 |
39 | val source = sc.newAPIHadoopRDD(conf, classOf[MongoInputFormat], classOf[Object], classOf[BSONObject])
40 | source.map(x => toMap(x._2))
41 |
42 | }
43 |
44 | private def toMap(obj:BSONObject):Map[String,Any] = {
45 |
46 | val data = HashMap.empty[String,Any]
47 |
48 | val keys = obj.keySet()
49 | for (k <- keys) {
50 |
51 | val v = obj.get(k)
52 | data += k -> v
53 |
54 | }
55 |
56 | data.toMap
57 |
58 | }
59 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/connect/mongodb/MongoSource.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.connect.mongodb
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Connect project
5 | * (https://github.com/skrusche63/spark-connect).
6 | *
7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Connect.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import org.apache.spark.SparkContext
22 | import org.apache.spark.rdd.RDD
23 | import de.kp.spark.connect.ConnectConfig
24 |
25 | class MongoSource(@transient sc:SparkContext) extends Serializable {
26 |
27 | def read(config:ConnectConfig,params:Map[String,String]):RDD[Map[String,Any]] = {
28 |
29 | val query = params("query")
30 | new MongoReader(sc).read(config,query)
31 |
32 | }
33 |
34 | }
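
A minimal usage sketch for MongoSource; createConnectConfig() is a hypothetical factory whose `mongo` member is expected to be a Hadoop Configuration carrying the mongo-hadoop settings (e.g. mongo.input.uri), and the query is a placeholder:

    import org.apache.spark.{SparkConf, SparkContext}
    import de.kp.spark.connect.ConnectConfig
    import de.kp.spark.connect.mongodb.MongoSource

    val sc = new SparkContext(new SparkConf().setAppName("mongo-sample").setMaster("local[2]"))
    val config: ConnectConfig = createConnectConfig()   // hypothetical

    // passed verbatim to mongo.input.query by MongoReader
    val params = Map("query" -> """{ "status": "active" }""")

    val docs = new MongoSource(sc).read(config, params)
    println(docs.count())
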
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/connect/parquet/ParquetReader.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.connect.parquet
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Connect project
5 | * (https://github.com/skrusche63/spark-connect).
6 | *
7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Connect.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import org.apache.spark.SparkContext
22 | import org.apache.spark.SparkContext._
23 |
24 | import org.apache.spark.sql._
25 | import org.apache.spark.rdd.RDD
26 |
27 | import scala.collection.mutable.HashMap
28 |
29 | class ParquetReader(@transient sc:SparkContext) extends Serializable {
30 |
31 | def read(store:String,fields:List[String] = List.empty[String]):RDD[Map[String,Any]] = {
32 |
33 | val sqlCtx = new SQLContext(sc)
34 | import sqlCtx.createSchemaRDD
35 |
36 | /*
37 |      * Read in the Parquet file at the provided location. Parquet files are self-describing,
38 |      * so the schema is preserved. The result of loading a Parquet file is also a
39 | * SchemaRDD.
40 | */
41 | val parquetFile = sqlCtx.parquetFile(store)
42 | val metadata = parquetFile.schema.fields.zipWithIndex
43 |
44 | parquetFile.map(row => toMap(row,metadata,fields))
45 |
46 | }
47 |
48 | private def toMap(row:Row,metadata:Seq[(StructField,Int)],fields:List[String]):Map[String,Any] = {
49 |
50 | val data = HashMap.empty[String,Any]
51 | val values = row.iterator.zipWithIndex.map(x => (x._2,x._1)).toMap
52 |
53 | metadata.foreach(entry => {
54 |
55 | val field = entry._1
56 | val col = entry._2
57 |
58 | val colname = field.name
59 | val colvalu = values(col)
60 |
61 | if (fields.isEmpty) {
62 | data += colname -> colvalu
63 |
64 | } else {
65 | if (fields.contains(colname)) data += colname -> colvalu
66 |
67 | }
68 |
69 | })
70 |
71 | data.toMap
72 |
73 | }
74 |
75 | }
76 |
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/connect/parquet/ParquetSource.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.connect.parquet
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Connect project
5 | * (https://github.com/skrusche63/spark-connect).
6 | *
7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Connect.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import org.apache.spark.SparkContext
22 | import org.apache.spark.rdd.RDD
23 | import de.kp.spark.connect.ConnectConfig
24 |
25 | class ParquetSource(@transient sc:SparkContext) extends Serializable {
26 |
27 | def read(config:ConnectConfig,params:Map[String,String]):RDD[Map[String,Any]] = {
28 |
29 | val store = params("store")
30 | val fields = params("fields").split(",").toList
31 |
32 | new ParquetReader(sc).read(store,fields)
33 |
34 | }
35 |
36 | }
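
ParquetSource merely forwards to ParquetReader, so a sketch can drive the reader directly; the store path and field list are placeholders:

    import org.apache.spark.{SparkConf, SparkContext}
    import de.kp.spark.connect.parquet.ParquetReader

    val sc = new SparkContext(new SparkConf().setAppName("parquet-sample").setMaster("local[2]"))

    // restrict the resulting maps to two columns; an empty list keeps all columns
    val records = new ParquetReader(sc).read("/tmp/events.parquet", List("user", "timestamp"))
    records.take(3).foreach(println)
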
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/connect/shop/BigClient.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.connect.shop
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Connect project
5 | * (https://github.com/skrusche63/spark-connect).
6 | *
7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Connect.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import org.scribe.model._
22 | import org.slf4j.LoggerFactory
23 |
24 | import com.fasterxml.jackson.databind.ObjectMapper
25 | import com.fasterxml.jackson.module.scala.DefaultScalaModule
26 |
27 | import scala.collection.mutable.Buffer
28 |
29 | class BigClient(val key:String,val token:String,val context:String) {
30 |
31 | private val LOG = LoggerFactory.getLogger(classOf[BigClient])
32 |
33 | private val JSON_MAPPER = new ObjectMapper()
34 | JSON_MAPPER.registerModule(DefaultScalaModule)
35 |
36 | val ENDPOINT = String.format("""https://api.bigcommerce.com/%s/v2/""",context)
37 |
38 | def getResources(resource:String,id:Int,params:Map[String,String]):List[Map[String,Any]] = {
39 |
40 | resource match {
41 |
42 | case "brand" => getBrands(params)
43 | case "customer" => getCustomers(params)
44 |
45 | case "image" => if (id == -1) List.empty[Map[String,Any]] else getImages(id,params)
46 | case "lineitem" => if (id == -1) List.empty[Map[String,Any]] else getLineItems(id,params)
47 |
48 | case "order" => getOrders(params)
49 | case "product" => getProducts(params)
50 |
51 | case _ => List.empty[Map[String,Any]]
52 |
53 | }
54 |
55 | }
56 |
57 | def getBrands(requestParams:Map[String,String]):List[Map[String,Any]] = {
58 |
59 | val endpoint = ENDPOINT + "brands" + getSimpleUrlParams(requestParams)
60 | getResponseAsList(endpoint)
61 |
62 | }
63 |
64 | def getCustomers(requestParams:Map[String,String]):List[Map[String,Any]] = {
65 |
66 | val endpoint = ENDPOINT + "customers" + getSimpleUrlParams(requestParams)
67 | getResponseAsList(endpoint)
68 |
69 | }
70 |
71 | def getOrders(requestParams:Map[String,String]):List[Map[String,Any]] = {
72 |
73 |     val endpoint = ENDPOINT + "orders" + getOrderUrlParams(requestParams)
74 | getResponseAsList(endpoint)
75 |
76 | }
77 |
78 | def getBrand(brand:Int):Map[String,Any] = {
79 |
80 | val endpoint = ENDPOINT + "brands/" + brand
81 | getResponseAsObject(endpoint)
82 |
83 | }
84 | def getLineItems(order:Int,requestParams:Map[String,String]):List[Map[String,Any]] = {
85 |
86 | val endpoint = ENDPOINT + "orders/" + order + "/products" + getSimpleUrlParams(requestParams)
87 | getResponseAsList(endpoint)
88 |
89 | }
90 |
91 | def getImages(product:Int,requestParams:Map[String,String]):List[Map[String,Any]] = {
92 |
93 | val endpoint = ENDPOINT + "products/" + product + "/images" + getSimpleUrlParams(requestParams)
94 | getResponseAsList(endpoint)
95 |
96 | }
97 |
98 | def getProducts(requestParams:Map[String,String]):List[Map[String,Any]] = {
99 |
100 | val endpoint = ENDPOINT + "products" + getSimpleUrlParams(requestParams)
101 | getResponseAsList(endpoint)
102 |
103 | }
104 |
105 | private def getOrderUrlParams(params:Map[String,String]):String = {
106 |
107 |
108 | val accepted = List("page","limit","min_date_created","status_id","max_date_created")
109 |
110 | val sb = Buffer.empty[String]
111 | for (kv <- params) {
112 |
113 | if (accepted.contains(kv._1)) {
114 |
115 |         val value = kv._1 + "=" + java.net.URLEncoder.encode(kv._2, "UTF-8")
116 |         sb += value
117 |
118 | }
119 |
120 | }
121 |
122 |     /* Only the values are encoded; the query separators must stay intact */
123 |     if (sb.isEmpty) "" else "?" + sb.mkString("&")
124 |
125 | }
126 |
127 | private def getSimpleUrlParams(params:Map[String,String]):String = {
128 |
129 | val accepted = List("page","limit")
130 |
131 | val sb = Buffer.empty[String]
132 | for (kv <- params) {
133 |
134 | if (accepted.contains(kv._1)) {
135 |
136 |         val value = kv._1 + "=" + java.net.URLEncoder.encode(kv._2, "UTF-8")
137 |         sb += value
138 |
139 | }
140 |
141 | }
142 |
143 |     /* Only the values are encoded; the query separators must stay intact */
144 |     if (sb.isEmpty) "" else "?" + sb.mkString("&")
145 |
146 | }
147 |
148 | def getResponseAsList(endpoint:String):List[Map[String,Any]] = {
149 |
150 | val request = new OAuthRequest(Verb.GET, endpoint)
151 | request.addHeader("accept", "application/json")
152 |
153 | request.addHeader("X-Auth-Client", key)
154 | request.addHeader("X-Auth-Token", token)
155 |
156 | val response = request.send()
157 | if (response.getCode == 200) {
158 |
159 | val body = response.getBody
160 | JSON_MAPPER.readValue(body, classOf[List[Map[String,Any]]])
161 |
162 | } else {
163 | throw new Exception("Bad request: " + response.getCode)
164 | }
165 |
166 | }
167 |
168 | def getResponseAsObject(endpoint:String):Map[String,Any] = {
169 |
170 | val request = new OAuthRequest(Verb.GET, endpoint)
171 | request.addHeader("accept", "application/json")
172 |
173 | request.addHeader("X-Auth-Client", key)
174 | request.addHeader("X-Auth-Token", token)
175 |
176 | val response = request.send()
177 | if (response.getCode == 200) {
178 |
179 | val body = response.getBody
180 | JSON_MAPPER.readValue(body, classOf[Map[String,Any]])
181 |
182 | } else {
183 | throw new Exception("Bad request: " + response.getCode)
184 | }
185 |
186 | }
187 |
188 | }
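
A minimal usage sketch for BigClient; the client id, access token and store context are placeholders, and the exact context value depends on the Bigcommerce account:

    import de.kp.spark.connect.shop.BigClient

    val client = new BigClient("my-client-id", "my-access-token", "stores/abc123")

    // first page of products, 250 records per page
    val products = client.getResources("product", -1, Map("page" -> "1", "limit" -> "250"))

    // line items require the identifier of the associated order (100 is a placeholder id)
    val lineItems = client.getResources("lineitem", 100, Map("page" -> "1"))
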
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/connect/shop/BigDataset.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.connect.shop
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Connect project
5 | * (https://github.com/skrusche63/spark-connect).
6 | *
7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Connect.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 | import org.apache.spark.SparkContext
21 | import org.apache.spark.SparkContext._
22 | import org.apache.spark.rdd.RDD
23 |
24 | import scala.collection.mutable.Buffer
25 | /**
26 |  * The Bigcommerce REST API does not support counting the records of a resource;
27 |  * this implies that we do not have enough information to compute partitions in
28 |  * advance and retrieve the resource data per partition.
29 |  *
30 |  * In other words, we cannot use Spark's RDD mechanism directly, but have to
31 |  * collect all the data first and partition afterwards.
32 | */
33 | class BigDataset(
34 | /* Reference to SparkContext */
35 | @transient sc:SparkContext,
36 | /* resource */
37 | resource:String,
38 | /* Request parameters */
39 | params:Map[String,String],
40 | /* Total number of partitions */
41 | numPartitions:Int) {
42 |
43 | /*
44 |    * Prepare the request parameters: if an identifier is provided, its value
45 |    * is used to retrieve the list of a dependent resource.
46 |    * 
47 |    * E.g. line items have to be retrieved by providing the identifier of
48 |    * the associated order.
49 | */
50 | private val excludes = List("id")
51 | private val req_params = params.filter(kv => excludes.contains(kv._1) == false)
52 |
53 | private val rid = if (params.contains("id")) params("id").toInt else -1
54 | private val client = createClient
55 |
56 | private val dataset = getDataset
57 |
58 | def toRDD = sc.parallelize(dataset, numPartitions)
59 |
60 | private def createClient:BigClient = {
61 |
62 | val key = params("key")
63 | val token = params("token")
64 |
65 | val context = params("context")
66 | new BigClient(key,token,context)
67 |
68 | }
69 |
70 | private def getDataset:Seq[Map[String,Any]] = {
71 |
72 | val dataset = Buffer.empty[Map[String,Any]]
73 |
74 | var page = 1
75 | var finished = false
76 |
77 | while (finished == false) {
78 |
79 | val records = client.getResources(resource,rid,req_params ++ Map("limit" -> "250","page" -> page.toString))
80 | dataset ++= records
81 |
82 | page += 1
83 | /*
84 | * Check whether this request has been the last request;
85 | * the respective condition is given, if less than 250
86 | * records are retrieved
87 | */
88 | if (records.size < 250) finished = true
89 |
90 | }
91 |
92 | dataset
93 |
94 | }
95 |
96 | }
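
A minimal usage sketch for BigDataset, assuming a local Spark context; credentials and store context are placeholders:

    import org.apache.spark.{SparkConf, SparkContext}
    import de.kp.spark.connect.shop.BigDataset

    val sc = new SparkContext(new SparkConf().setAppName("big-sample").setMaster("local[2]"))

    val params = Map(
      "key"     -> "my-client-id",
      "token"   -> "my-access-token",
      "context" -> "stores/abc123"
    )

    // collect all customers page by page, then expose them as an RDD with 4 partitions
    val customers = new BigDataset(sc, "customer", params, 4).toRDD
    println(customers.count())
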
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/connect/shop/ShopifyClient.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.connect.shop
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Connect project
5 | * (https://github.com/skrusche63/spark-connect).
6 | *
7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Connect.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import java.io.IOException
22 |
23 | import javax.ws.rs.HttpMethod
24 | import javax.ws.rs.client.{Client,ClientBuilder,Entity,WebTarget}
25 | import javax.ws.rs.core.MediaType
26 |
27 | import com.fasterxml.jackson.databind.{Module, ObjectMapper}
28 | import com.fasterxml.jackson.module.scala.DefaultScalaModule
29 |
30 | import org.slf4j.{Logger,LoggerFactory}
31 |
32 | import scala.collection.mutable.HashMap
33 | import scala.collection.JavaConversions._
34 |
35 | class ShopifyClient(key:String,secret:String,url:String) extends Serializable {
36 |
37 | private val JSON_MAPPER = new ObjectMapper()
38 | JSON_MAPPER.registerModule(DefaultScalaModule)
39 |
40 | private val client = ClientBuilder.newClient()
41 | private val endpoint = url.replaceFirst("://", "://" + key + ":" + secret + "@")
42 |
43 | private val webTarget = client.target(endpoint).path("admin")
44 |
45 | def close = client.close
46 |
47 | def getResourceCount(name:String,id:Long,params:Map[String,String]):Long = {
48 |
49 | name match {
50 |
51 | case "article" => if (id == -1) -1 else getArticlesCount(id,params)
52 | case "blog" => getBlogsCount(params)
53 |
54 | case "customer" => getCustomersCount(params)
55 |
56 | case "order" => getOrdersCount(params)
57 | case "product" => getProductsCount(params)
58 |
59 | case _ => -1
60 |
61 | }
62 |
63 | }
64 | def getResources(name:String,id:Long,params:Map[String,String]):List[Map[String,Any]] = {
65 |
66 | name match {
67 |
68 | case "article" => if (id == -1) List.empty[Map[String,Any]] else getArticles(id,params)
69 | case "blog" => getBlogs(params)
70 |
71 | case "customer" => getCustomers(params)
72 |
73 | case "order" => getOrders(params)
74 | case "product" => getProducts(params)
75 |
76 | case _ => List.empty[Map[String,Any]]
77 |
78 | }
79 |
80 | }
81 |
82 | /**************************************************************************
83 | *
84 | * ARTICLE SUPPORT
85 | *
86 | *************************************************************************/
87 |
88 | def getArticles(bid:Long,params:Map[String,String]):List[Map[String,Any]] = {
89 |
90 | val result = getResponse("blogs/" + bid + "/articles.json", params, HttpMethod.GET)
91 | /*
92 | * { "articles": [ ... ] }
93 | */
94 | val response = JSON_MAPPER.readValue(result, classOf[Map[String,Any]])
95 | if (response.contains("articles"))
96 | response("articles").asInstanceOf[List[Map[String,Any]]]
97 |
98 | else List.empty[Map[String,Any]]
99 |
100 | }
101 |
102 | def getArticlesCount(bid:Long,params:Map[String,String]):Long = {
103 |
104 | val result = getResponse("blogs/" + bid + "/articles/count.json", params, HttpMethod.GET)
105 | /*
106 | * { "count": 1 }
107 | */
108 | val response = JSON_MAPPER.readValue(result, classOf[Map[String,Any]])
109 |     if (response.contains("count")) response("count").asInstanceOf[Number].longValue else -1L
110 |
111 | }
112 |
113 | /**************************************************************************
114 | *
115 | * BLOG SUPPORT
116 | *
117 | *************************************************************************/
118 |
119 | def getBlogs(params:Map[String,String]):List[Map[String,Any]] = {
120 |
121 | val result = getResponse("blogs.json", params, HttpMethod.GET)
122 | /*
123 | * { "blogs": [ ... ] }
124 | */
125 | val response = JSON_MAPPER.readValue(result, classOf[Map[String,Any]])
126 | if (response.contains("blogs"))
127 | response("blogs").asInstanceOf[List[Map[String,Any]]]
128 |
129 | else List.empty[Map[String,Any]]
130 |
131 | }
132 |
133 | def getBlogsCount(params:Map[String,String]):Long = {
134 |
135 | val result = getResponse("blogs/count.json", params, HttpMethod.GET)
136 | /*
137 | * { "count": 1 }
138 | */
139 | val response = JSON_MAPPER.readValue(result, classOf[Map[String,Any]])
140 |     if (response.contains("count")) response("count").asInstanceOf[Number].longValue else -1L
141 |
142 | }
143 |
144 | /**************************************************************************
145 | *
146 | * CUSTOMER SUPPORT
147 | *
148 | *************************************************************************/
149 |
150 | def getCustomers(params:Map[String,String]):List[Map[String,Any]] = {
151 |
152 | val result = getResponse("customers.json", params, HttpMethod.GET)
153 | /*
154 | * { "customers": [ ... ] }
155 | */
156 | val response = JSON_MAPPER.readValue(result, classOf[Map[String,Any]])
157 | if (response.contains("customers"))
158 | response("customers").asInstanceOf[List[Map[String,Any]]]
159 |
160 | else List.empty[Map[String,Any]]
161 |
162 | }
163 |
164 | def getCustomersCount(params:Map[String,String]):Long = {
165 |
166 | val result = getResponse("customers/count.json", params, HttpMethod.GET)
167 | /*
168 | * { "count": 1 }
169 | */
170 | val response = JSON_MAPPER.readValue(result, classOf[Map[String,Any]])
171 |     if (response.contains("count")) response("count").asInstanceOf[Number].longValue else -1L
172 |
173 | }
174 |
175 | /**************************************************************************
176 | *
177 | * PRODUCT SUPPORT
178 | *
179 | *************************************************************************/
180 |
181 | def getProducts(params:Map[String,String]):List[Map[String,Any]] = {
182 |
183 | val result = getResponse("products.json", params, HttpMethod.GET)
184 | /*
185 | * { "products": [ ... ] }
186 | */
187 | val response = JSON_MAPPER.readValue(result, classOf[Map[String,Any]])
188 | if (response.contains("products"))
189 |       response("products").asInstanceOf[List[Map[String,Any]]]
190 |
191 | else List.empty[Map[String,Any]]
192 |
193 | }
194 |
195 | def getProductsCount(params:Map[String,String]):Long = {
196 |
197 | val result = getResponse("products/count.json", params, HttpMethod.GET)
198 | /*
199 | * { "count": 1 }
200 | */
201 | val response = JSON_MAPPER.readValue(result, classOf[Map[String,Any]])
202 |     if (response.contains("count")) response("count").asInstanceOf[Number].longValue else -1L
203 |
204 | }
205 |
206 | /**************************************************************************
207 | *
208 | * ORDER SUPPORT
209 | *
210 | *************************************************************************/
211 |
212 | def getOrders(params:Map[String,String]):List[Map[String,Any]] = {
213 |
214 | val result = getResponse("orders.json", params, HttpMethod.GET)
215 | /*
216 | * { "orders": [ ... ] }
217 | */
218 | val response = JSON_MAPPER.readValue(result, classOf[Map[String,Any]])
219 | if (response.contains("orders"))
220 | response("orders").asInstanceOf[List[Map[String,Any]]]
221 |
222 | else List.empty[Map[String,Any]]
223 |
224 | }
225 |
226 | def getOrdersCount(params:Map[String,String]):Long = {
227 |
228 | val result = getResponse("orders/count.json", params, HttpMethod.GET)
229 | /*
230 | * { "count": 1 }
231 | */
232 | val response = JSON_MAPPER.readValue(result, classOf[Map[String,Any]])
233 |     if (response.contains("count")) response("count").asInstanceOf[Number].longValue else -1L
234 |
235 | }
236 |
237 | private def getResponse(resource:String,params:Map[String,String],method:String):String = {
238 |
239 | try {
240 |
241 | var qt = webTarget.path(resource)
242 | for (entry <- params) {
243 | val (k,v) = entry
244 | qt = qt.queryParam(k,v)
245 | }
246 |
247 | qt.request(MediaType.APPLICATION_JSON_TYPE).method(method, null, classOf[String])
248 |
249 | } catch {
250 | case e:Exception => throw new Exception("Could not process query",e)
251 | }
252 |
253 | }
254 |
255 | }
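
A minimal usage sketch for ShopifyClient; the API key, password and shop URL are placeholders:

    import de.kp.spark.connect.shop.ShopifyClient

    val client = new ShopifyClient("api-key", "api-password", "https://demo-shop.myshopify.com")

    val orderCount = client.getResourceCount("order", -1, Map.empty[String,String])
    val firstPage  = client.getResources("order", -1, Map("page" -> "1", "limit" -> "250"))

    client.close
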
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/connect/shop/ShopifyRDD.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.connect.shop
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Connect project
5 | * (https://github.com/skrusche63/spark-connect).
6 | *
7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Connect.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import org.apache.spark.{Partition,SparkContext,TaskContext}
22 | import org.apache.spark.TaskKilledException
23 |
24 | import org.apache.spark.rdd.RDD
25 | import org.apache.spark.util.NextIterator
26 |
27 | import scala.collection.mutable.Buffer
28 |
29 | class ShopifyPartition(idx:Int,val start:Int,val end:Int) extends Partition {
30 | override def index = idx
31 | }
32 |
33 | class ShopifyRDD(
34 | /* Reference to SparkContext */
35 | @transient sc:SparkContext,
36 | /* resource */
37 | resource:String,
38 | /* Request parameters */
39 | params:Map[String,String],
40 | /* Total number of partitions */
41 | numPartitions:Int) extends RDD[Map[String,Any]](sc,Nil) {
42 |
43 | /*
44 |    * Prepare the request parameters: if an identifier is provided, its value
45 |    * is used to retrieve the list of a dependent resource.
46 |    *
47 |    * E.g. articles have to be retrieved by providing the identifier of
48 |    * the associated blog.
49 | */
50 | private val excludes = List("id")
51 | private val req_params = params.filter(kv => excludes.contains(kv._1) == false)
52 |
53 | private val rid = if (params.contains("id")) params("id").toLong else -1
54 |
55 | private def createClient:ShopifyClient = {
56 |
57 | val key = params("key")
58 | val secret = params("secret")
59 |
60 | val url = params("url")
61 | new ShopifyClient(key,secret,url)
62 |
63 | }
64 |
65 | override def getPartitions:Array[Partition] = {
66 |
67 | val client = createClient
68 |
69 | /*
70 | * The ShopifyRDD collects all items of a certain resource from the
71 | * shop platform; in order to calculate the respective partitions,
72 | * we have to determine the total number of items first
73 | */
74 | val count = client.getResourceCount(resource,rid,req_params)
75 | client.close
76 |
77 | val pages = Math.ceil(count / 250.0).toInt
78 |
79 | val pagesPerPartition = Math.floor(pages.toDouble / numPartitions).toInt
80 | val diff = pages - numPartitions * pagesPerPartition
81 |
82 |
83 | (0 until numPartitions).map(i => {
84 |
85 | val start = 1 + i * pagesPerPartition
86 | val end = (i+1) * pagesPerPartition
87 |
88 | if (i == numPartitions - 1)
89 | new ShopifyPartition(i,start,end + diff)
90 |
91 | else
92 | new ShopifyPartition(i,start,end)
93 |
94 | }).toArray
95 |
96 | }
97 |
98 | override def compute(thePart:Partition,context:TaskContext) = new Iterator[Map[String,Any]] {
99 |
100 | private var closed = false
101 | private var finished = false
102 |
103 | context.addTaskCompletionListener{ context => closeIfNeeded() }
104 |
105 | /*
106 | * A partition is characterized by a begin & end page
107 | */
108 | private val partition = thePart.asInstanceOf[ShopifyPartition]
109 |
110 | val start = partition.start
111 | val end = partition.end
112 |
113 | val client = createClient
114 |
115 | val resources = Buffer.empty[Map[String,Any]]
116 | (start to end).foreach(page => {
117 | resources ++= client.getResources(resource,rid,req_params ++ Map("page" -> page.toString,"limit" -> "250"))
118 | })
119 |
120 | val dataset = resources.toIterator
121 |
122 | def hasNext:Boolean = {
123 |
124 | if (context.isInterrupted())
125 | throw new TaskKilledException
126 |
127 | !finished && dataset.hasNext
128 |
129 | }
130 |
131 | def next:Map[String,Any] = {
132 |
133 | if (hasNext) {
134 | dataset.next
135 |
136 | } else {
137 |
138 | finished = true
139 | null.asInstanceOf[Map[String,Any]]
140 |
141 | }
142 |
143 | }
144 |
145 | def closeIfNeeded() {
146 | if (!closed) {
147 | close()
148 | closed = true
149 | }
150 | }
151 |
152 | def close() {
153 | client.close
154 | }
155 |
156 | }
157 |
158 | }
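
A minimal usage sketch for ShopifyRDD, assuming a local Spark context; credentials and shop URL are placeholders:

    import org.apache.spark.{SparkConf, SparkContext}
    import de.kp.spark.connect.shop.ShopifyRDD

    val sc = new SparkContext(new SparkConf().setAppName("shopify-sample").setMaster("local[2]"))

    val params = Map(
      "key"    -> "api-key",
      "secret" -> "api-secret",
      "url"    -> "https://demo-shop.myshopify.com"
    )

    // pages of 250 orders each, distributed over 4 partitions
    val orders = new ShopifyRDD(sc, "order", params, 4)
    println(orders.count())
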
--------------------------------------------------------------------------------