├── README.md ├── lib ├── README.md └── mongo-hadoop-core_2.2.0-1.2.0.jar ├── pom.xml └── src └── main └── scala └── de └── kp └── spark └── connect ├── ConnectConfig.scala ├── GaRDD.scala ├── GaReader.scala ├── GaSource.scala ├── SQLSource.scala ├── aerospike ├── AerospikeReader.scala └── AerospikeSource.scala ├── cassandra ├── CassandraReader.scala └── CassandraSource.scala ├── dmp └── CxenseClient.scala ├── elasticsearch ├── ElasticReader.scala └── ElasticSource.scala ├── hbase ├── HBaseReader.scala └── HBaseSource.scala ├── jdbc ├── JdbcReader.scala └── JdbcSource.scala ├── log ├── ApacheLogAnalyzer.scala └── ApacheLogParser.scala ├── mongodb ├── MongoReader.scala └── MongoSource.scala ├── parquet ├── ParquetReader.scala └── ParquetSource.scala └── shop ├── BigClient.scala ├── BigDataset.scala ├── ShopifyClient.scala └── ShopifyRDD.scala /README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Common Access Layer for Apache Spark 3 | 4 | [Predictiveworks](http://predictiveworks.eu) supports raw data retrieval from multiple NoSQL and JDBC data sources. 5 | 6 | Read requests are supported for the following big data sources: 7 | 8 | * Cassandra 9 | * Elasticsearch 10 | * HBase 11 | * MongoDB 12 | * Parquet 13 | 14 | In addition, this project also provides an increasing number of connector to data sources relevant for analytics: 15 | 16 | * Google Analytics v3 17 | * Shopify 18 | -------------------------------------------------------------------------------- /lib/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## MongoDB Hadoop Connector 3 | 4 | The Maven repositories contain the mongo-hadoop connector for several different Hadoop versions, 5 | but not for 2.2.0. Therefore the mongo-hadoop connector is included as an unmanaged library. -------------------------------------------------------------------------------- /lib/mongo-hadoop-core_2.2.0-1.2.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skrusche63/spark-connect/aa8112941863526c7a6397da92a86a82146602da/lib/mongo-hadoop-core_2.2.0-1.2.0.jar -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | spark-connect 4 | spark-connect 5 | 0.0.1 6 | Spark-Connect 7 | Common Access Layer for Predictiveworks 8 | 2010 9 | 10 | 11 | My License 12 | http://.... 
13 | repo 14 | 15 | 16 | 17 | 18 | 1.6 19 | 1.6 20 | UTF-8 21 | 3.0.10.Final 22 | 2.10 23 | 2.10.2 24 | 1.2.0 25 | 26 | 27 | 28 | 29 | org.scala-lang 30 | scala-library 31 | ${scala.version} 32 | 33 | 34 | 35 | 36 | junit 37 | junit 38 | 4.11 39 | test 40 | 41 | 42 | org.specs2 43 | specs2_${scala.tools.version} 44 | 1.13 45 | test 46 | 47 | 48 | org.scalatest 49 | scalatest_${scala.tools.version} 50 | 2.0.M6-SNAP8 51 | test 52 | 53 | 54 | 55 | 56 | org.apache.spark 57 | spark-core_2.10 58 | ${spark.version} 59 | 60 | 61 | 62 | 63 | org.apache.spark 64 | spark-sql_2.10 65 | ${spark.version} 66 | 67 | 68 | 69 | 70 | cascading 71 | cascading-core 72 | 2.5.4 73 | 74 | 75 | 76 | cascading 77 | cascading-hadoop 78 | 2.5.4 79 | 80 | 81 | 82 | 83 | org.elasticsearch 84 | elasticsearch-hadoop 85 | 2.0.0 86 | 87 | 88 | 89 | 90 | org.elasticsearch 91 | elasticsearch 92 | 1.3.2 93 | 94 | 95 | 99 | 100 | org.mongodb 101 | mongo-java-driver 102 | 2.11.4 103 | 104 | 105 | 109 | 110 | com.datastax.spark 111 | spark-cassandra-connector_2.10 112 | 1.2.0-alpha1 113 | 114 | 115 | 116 | 117 | org.apache.hbase 118 | hbase-common 119 | 0.98.8-hadoop2 120 | 121 | 122 | 123 | org.apache.hbase 124 | hbase-client 125 | 0.98.8-hadoop2 126 | 127 | 128 | 129 | org.apache.hbase 130 | hbase-server 131 | 0.98.8-hadoop2 132 | 133 | 134 | 135 | mysql 136 | mysql-connector-java 137 | 5.1.31 138 | 139 | 140 | 141 | 142 | com.google.gdata 143 | gdata-core-1.0 144 | 1.41.5 145 | 146 | 147 | 148 | com.google.gdata 149 | gdata-analytics-2.1 150 | 1.41.5 151 | 152 | 153 | org.jboss.resteasy 154 | resteasy-jaxb-provider 155 | ${resteasy.version} 156 | 157 | 158 | 159 | 160 | org.jboss.resteasy 161 | resteasy-jackson-provider 162 | ${resteasy.version} 163 | 164 | 165 | 166 | org.jboss.resteasy 167 | resteasy-client 168 | ${resteasy.version} 169 | 170 | 171 | 176 | 177 | com.fasterxml.jackson.module 178 | jackson-module-scala_2.10 179 | 2.3.1 180 | 181 | 182 | 183 | 184 | org.scribe 185 | scribe 186 | 1.3.7 187 | 188 | 189 | 190 | 191 | com.aerospike 192 | aerospike-client 193 | 3.0.34 194 | 195 | 196 | 197 | 198 | 199 | 200 | conjars.org 201 | http://conjars.org/repo 202 | 203 | 204 | Mandubian Repository 205 | http://mandubian-mvn.googlecode.com/svn/trunk/mandubian-mvn/repository/ 206 | 207 | 208 | 209 | 210 | src/main/scala 211 | src/test/scala 212 | 213 | 214 | 215 | net.alchim31.maven 216 | scala-maven-plugin 217 | 3.1.3 218 | 219 | 220 | 221 | compile 222 | testCompile 223 | 224 | 225 | 226 | -make:transitive 227 | -dependencyfile 228 | ${project.build.directory}/.scala_dependencies 229 | 230 | 231 | 232 | 233 | 234 | 235 | org.apache.maven.plugins 236 | maven-surefire-plugin 237 | 2.13 238 | 239 | false 240 | true 241 | 242 | 243 | 244 | **/*Test.* 245 | **/*Suite.* 246 | 247 | 248 | 249 | 250 | 251 | 252 | Dr. Krusche & Partner PartG 253 | http://dr-kruscheundpartner.com 254 | 255 | 256 | -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/ConnectConfig.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 
6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | import org.apache.hadoop.conf.{Configuration => HConf} 21 | 22 | trait ConnectConfig { 23 | 24 | /** 25 | * This method retrieves the settings to access 26 | * an Aerospike Cluster 27 | */ 28 | def aerospike:Map[String,String] 29 | /** 30 | * This method retrieves the settings to access 31 | * a Cassandra Cluster 32 | */ 33 | def cassandra:Map[String,String] 34 | /** 35 | * This method retrieves a Hadoop configuration 36 | * to access Elasticsearch 37 | */ 38 | def elastic:HConf 39 | /** 40 | * This method retrieves the settings to access 41 | * Google Analytics 42 | */ 43 | def ga:Map[String,String] 44 | /** 45 | * This method retrieves the settings to access 46 | * HBase 47 | */ 48 | def hbase:Map[String,String] 49 | /** 50 | * This method retrieves a Hadoop configuration 51 | * to access MongoDB 52 | */ 53 | def mongo:HConf 54 | /** 55 | * This method retrieves the access parameter for a MySQL 56 | * data source, comprising url, db, user, password 57 | */ 58 | def mysql:Map[String,String] 59 | /** 60 | * This method retrieves Apache Spark configuration 61 | */ 62 | def spark:Map[String,String] 63 | 64 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/GaRDD.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 
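// Editor's sketch (hypothetical, not part of the original source): a minimal
// ConnectConfig implementation backed by plain Maps, added to illustrate the
// trait defined in ConnectConfig.scala above. Section names and keys are
// placeholders; a real deployment would load them from its application config.
import org.apache.hadoop.conf.{Configuration => HConf}
import de.kp.spark.connect.ConnectConfig

class MapConnectConfig(settings: Map[String, Map[String, String]]) extends ConnectConfig {

  /* Look up one configuration section, e.g. "cassandra" or "mysql" */
  private def section(name: String): Map[String, String] =
    settings.getOrElse(name, Map.empty[String, String])

  /* Copy a section into a Hadoop Configuration (used for es-hadoop and mongo-hadoop) */
  private def toHConf(name: String): HConf = {
    val conf = new HConf()
    section(name).foreach { case (k, v) => conf.set(k, v) }
    conf
  }

  def aerospike: Map[String, String] = section("aerospike")
  def cassandra: Map[String, String] = section("cassandra")
  def elastic: HConf = toHConf("elastic")
  def ga: Map[String, String] = section("ga")
  def hbase: Map[String, String] = section("hbase")
  def mongo: HConf = toHConf("mongo")
  def mysql: Map[String, String] = section("mysql")
  def spark: Map[String, String] = section("spark")
}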
19 | */ 20 | 21 | import java.net.URL 22 | 23 | import org.apache.spark.{Partition,SparkContext,TaskContext} 24 | import org.apache.spark.TaskKilledException 25 | 26 | import org.apache.spark.rdd.RDD 27 | 28 | import org.apache.spark.util.NextIterator 29 | 30 | import com.google.gdata.client.analytics.{AnalyticsService,DataQuery} 31 | import com.google.gdata.data.analytics.{DataEntry,DataFeed} 32 | 33 | import scala.collection.JavaConversions._ 34 | import scala.collection.mutable.Buffer 35 | 36 | case class GaRow(columns:Seq[GaColumn]) 37 | case class GaColumn(name:String,category:String,datatype:String,value:String) 38 | 39 | class GaPartition(idx:Int,val startIndex:Int,val maxResult:Int) extends Partition { 40 | override def index = idx 41 | } 42 | 43 | 44 | class GaRDD( 45 | /* Reference to SparkContext */ 46 | @transient sc:SparkContext, 47 | /* Request parameters */ 48 | params:Map[String,String], 49 | /* Total number of results */ 50 | numResults:Int, 51 | /* Total number of partitions */ 52 | numPartitions:Int) extends RDD[GaRow](sc,Nil) { 53 | 54 | override def getPartitions:Array[Partition] = { 55 | 56 | /* 57 | * The maximum number of results returned with a request; 58 | * note, that the Analytics Core Reporting API returns a 59 | * maximum of 10,000 rows per request, no matter how many 60 | * one asks for 61 | */ 62 | val maxResult = Math.round(numResults.toDouble / numPartitions).toInt 63 | 64 | (0 until numPartitions).map(i => { 65 | 66 | val startIx = 1 + i * maxResult 67 | new GaPartition(i,startIx,maxResult) 68 | 69 | }).toArray 70 | 71 | } 72 | 73 | override def compute(thePart:Partition,context:TaskContext) = new Iterator[GaRow] { 74 | 75 | private var closed = false 76 | private var finished = false 77 | 78 | context.addTaskCompletionListener{ context => closeIfNeeded() } 79 | 80 | private val partition = thePart.asInstanceOf[GaPartition] 81 | private val query = buildQuery(partition) 82 | 83 | val service = buildService 84 | val datafeed = service.getFeed(query.getUrl,classOf[DataFeed]) 85 | 86 | val dataset = datafeed.getEntries.map(mapEntry(_)).toIterator 87 | 88 | /* 89 | * Build query and determine maximum number of results 90 | * from the request parameters (or default = 10.000) 91 | */ 92 | def hasNext:Boolean = { 93 | 94 | if (context.isInterrupted()) 95 | throw new TaskKilledException 96 | 97 | !finished && dataset.hasNext 98 | 99 | } 100 | 101 | def next:GaRow = { 102 | 103 | if (hasNext) { 104 | dataset.next 105 | 106 | } else { 107 | 108 | finished = true 109 | null.asInstanceOf[GaRow] 110 | 111 | } 112 | 113 | } 114 | 115 | def closeIfNeeded() { 116 | if (!closed) { 117 | close() 118 | closed = true 119 | } 120 | } 121 | 122 | def close() { 123 | /* 124 | * The connection to a GData service is properly closed 125 | * after the request has been performed; this implies 126 | * that we do nothing here 127 | */ 128 | } 129 | 130 | private def mapEntry(entry:DataEntry):GaRow = { 131 | 132 | val columns = Buffer.empty[GaColumn] 133 | 134 | /* DIMENSIONS */ 135 | val dimensions = entry.getDimensions 136 | if (!dimensions.isEmpty) { 137 | dimensions.map(dimension => GaColumn(dimension.getName,"dimension","string",dimension.getValue)) 138 | } 139 | 140 | /* METRICS */ 141 | val metrics = entry.getMetrics 142 | metrics.map(metric => GaColumn(metric.getName,"metric",metric.getType,metric.getValue)) 143 | 144 | GaRow(columns.toSeq) 145 | 146 | } 147 | 148 | private def buildQuery(partition:GaPartition):DataQuery = { 149 | 150 | /* REQURED */ 151 | val query = new 
DataQuery(new URL(params("url"))) 152 | 153 | /* REQUIRED */ 154 | val start_date = params("start_date") 155 | query.setStartDate(start_date) 156 | 157 | val end_date = params("end_date") 158 | query.setEndDate(end_date) 159 | 160 | /* 161 | * REQUIRED 162 | * 163 | * The aggregated statistics for user activity in a view (profile), 164 | * such as clicks or pageviews. When queried by alone, metrics provide 165 | * the total values for the requested date range, such as overall pageviews 166 | * or total bounces. 167 | * 168 | * However, when requested with dimensions, values are segmented by the dimension. 169 | * For example, ga:pageviews requested with ga:country returns the total pageviews 170 | * per country. 171 | * 172 | * When requesting metrics, keep in mind: All requests require at least one metric. 173 | * 174 | * You can supply a maximum of 10 metrics for any query.Not all dimensions and metrics 175 | * can be used together. Consult the Valid Combinations tool to see which combinations 176 | * work together. 177 | * 178 | */ 179 | val metrics = params("metrics") 180 | query.setMetrics(metrics) 181 | /* 182 | * REQUIRED 183 | * 184 | * The unique table ID used to retrieve the Analytics Report data. 185 | */ 186 | val table_id = params("table_id") 187 | query.setIds(table_id) 188 | 189 | /* OPTIONAL */ 190 | if (params.contains("dimensions")) { 191 | query.setDimensions(params("dimensions")) 192 | } 193 | 194 | /* OPTIONAL */ 195 | if (params.contains("filters")) { 196 | query.setFilters(params("filters")) 197 | } 198 | 199 | /* OPTIONAL */ 200 | if (params.contains("sort")) { 201 | query.setSort(params("sort")) 202 | } 203 | 204 | query.setStartIndex(partition.startIndex) 205 | query.setMaxResults(partition.maxResult) 206 | 207 | query 208 | 209 | } 210 | 211 | private def buildService:AnalyticsService = { 212 | 213 | val app_name = params("app_name") 214 | val analytics = new AnalyticsService(app_name) 215 | 216 | val user_name = params("user_name") 217 | val password = params("password") 218 | 219 | analytics.setUserCredentials(user_name,password) 220 | analytics 221 | 222 | } 223 | 224 | } 225 | 226 | } 227 | -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/GaReader.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 
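// Editor's sketch (hypothetical, not part of the original source): a minimal
// driver showing how the GaRDD above could be instantiated. The parameter keys
// mirror the lookups in buildQuery/buildService; all concrete values (feed URL,
// table id, credentials, metrics) are placeholders, and the URL is assumed to
// be the classic Analytics data feed endpoint used by the gdata client.
import org.apache.spark.{SparkConf, SparkContext}
import de.kp.spark.connect.GaRDD

object GaRDDExample {

  def main(args: Array[String]): Unit = {

    val sc = new SparkContext(new SparkConf().setAppName("ga-example").setMaster("local[2]"))

    val params = Map(
      /* Feed endpoint, report range, metrics and table id (all placeholders) */
      "url"        -> "https://www.google.com/analytics/feeds/data",
      "start_date" -> "2014-01-01",
      "end_date"   -> "2014-01-31",
      "metrics"    -> "ga:pageviews",
      "dimensions" -> "ga:country",
      "table_id"   -> "ga:12345678",
      /* Credentials consumed by buildService (placeholders) */
      "app_name"   -> "spark-connect-example",
      "user_name"  -> "user@example.com",
      "password"   -> "secret"
    )

    /* 1,000 rows split across 4 partitions, i.e. 250 rows per request */
    val rows = new GaRDD(sc, params, numResults = 1000, numPartitions = 4)
    rows.map(r => r.columns.map(c => c.name + "=" + c.value).mkString(",")).take(10).foreach(println)

    sc.stop()
  }
}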
19 | */ 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.rdd.RDD 23 | 24 | class GaReader(@transient sc:SparkContext) extends Serializable { 25 | 26 | def read(config:ConnectConfig,params:Map[String,String]):RDD[Map[String,Any]] = { 27 | 28 | val settings = config.ga 29 | 30 | val req_params = params ++ Map( 31 | 32 | "app_name" -> params("app_name"), 33 | 34 | "user_name" -> params("user_name"), 35 | "password" -> params("password") 36 | ) 37 | 38 | val numResults = params("num_results").toInt 39 | val numPartitions = params("num_partitions").toInt 40 | 41 | val source = new GaRDD(sc,req_params,numResults,numPartitions) 42 | source.map(toMap(_)) 43 | 44 | } 45 | 46 | private def toMap(row:GaRow):Map[String,Any] = { 47 | 48 | val columns = row.columns 49 | columns.map(column => { 50 | 51 | val k = column.name 52 | val v = if (column.category == "dimension") { 53 | column.value 54 | 55 | } else { 56 | 57 | column.datatype match { 58 | /* 59 | * The datatype 'integer' describes a Long (see Metric 60 | * implementation); all other values describe Doubles 61 | */ 62 | case "integer" => column.value.toLong 63 | /* 64 | * currency, us_currency, float, percent, time 65 | */ 66 | case _ => column.value.toDouble 67 | } 68 | } 69 | 70 | (k,v) 71 | 72 | }).toMap 73 | 74 | } 75 | 76 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/GaSource.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.rdd.RDD 23 | 24 | class GaSource(@transient sc:SparkContext) extends Serializable { 25 | 26 | def connect(config:ConnectConfig,requestParams:Map[String,String]):RDD[Map[String,Any]] = { 27 | new GaReader(sc).read(config,requestParams) 28 | 29 | } 30 | 31 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/SQLSource.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 
11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import org.apache.spark.rdd.RDD 22 | import org.apache.spark.sql._ 23 | 24 | import de.kp.spark.connect.aerospike.AerospikeSource 25 | import de.kp.spark.connect.cassandra.CassandraSource 26 | 27 | import de.kp.spark.connect.elasticsearch.ElasticSource 28 | import de.kp.spark.connect.hbase.HBaseSource 29 | 30 | import de.kp.spark.connect.jdbc.JdbcSource 31 | import de.kp.spark.connect.mongodb.MongoSource 32 | 33 | import de.kp.spark.connect.parquet.ParquetSource 34 | 35 | object Sources { 36 | 37 | val AEROSPIKE:String = "aerospike" 38 | val CASSANDRA:String = "cassandra" 39 | 40 | val ELASTICSEARCH:String = "elasticsearch" 41 | 42 | val HBASE:String = "hbase" 43 | val JDBC:String = "jdbc" 44 | 45 | val MONGODB:String = "mongodb" 46 | val PARQUET:String = "parquet" 47 | 48 | } 49 | 50 | class SQLSource( 51 | @transient sqlContext:SQLContext, 52 | config:ConnectConfig, 53 | source:String, 54 | table:String, 55 | schema:StructType, 56 | params:Map[String,String]) extends Serializable { 57 | 58 | /* 59 | * Retrieve dataset from source and convert 60 | * result into Row 61 | */ 62 | private val names = sqlContext.sparkContext.broadcast(schema.fieldNames) 63 | 64 | private val rowRDD = getRDD.map(rec => { 65 | val values = names.value.map(name => rec(name)) 66 | Row.fromSeq(values) 67 | }) 68 | 69 | /* 70 | * Apply schema to rows and register as table 71 | */ 72 | private val tableRDD = sqlContext.applySchema(rowRDD, schema) 73 | tableRDD.registerTempTable(table) 74 | 75 | def executeQuery(query:String):SchemaRDD = sqlContext.sql(query) 76 | 77 | private def getRDD:RDD[Map[String,Any]] = { 78 | 79 | val sc = sqlContext.sparkContext 80 | val columns = schema.fieldNames 81 | 82 | source match { 83 | 84 | case Sources.AEROSPIKE => { 85 | /* 86 | * Column names are called bin names in the 87 | * terminology of Aerospike 88 | */ 89 | val req_params = params ++ Map("binnames" -> columns.mkString(",")) 90 | new AerospikeSource(sc).read(config,req_params) 91 | 92 | } 93 | case Sources.CASSANDRA => { 94 | 95 | val req_params = params ++ Map("columns" -> columns.mkString(",")) 96 | new CassandraSource(sc).read(config,req_params) 97 | 98 | } 99 | case Sources.ELASTICSEARCH => new ElasticSource(sc).read(config,params) 100 | case Sources.HBASE => { 101 | 102 | val types = schema.fields.map(field => { 103 | 104 | field.dataType match { 105 | 106 | case DoubleType => "double" 107 | case IntegerType => "integer" 108 | 109 | case LongType => "long" 110 | case StringType => "string" 111 | 112 | case _ => throw new Exception("Data type is not supported.") 113 | } 114 | 115 | }) 116 | 117 | val req_params = params ++ Map("names" -> columns.mkString(","), "types" -> types.mkString(",")) 118 | new HBaseSource(sc).read(config,req_params) 119 | 120 | } 121 | case Sources.JDBC => { 122 | 123 | val req_params = params ++ Map("fields" -> columns.mkString(",")) 124 | new JdbcSource(sc).read(config,req_params) 125 | 126 | } 127 | case Sources.MONGODB => new MongoSource(sc).read(config,params) 128 | 129 | case Sources.PARQUET => { 130 | 131 | val req_params = params ++ Map("fields" -> columns.mkString(",")) 132 | new 
ParquetSource(sc).read(config,params) 133 | 134 | } 135 | 136 | case _ => throw new Exception(String.format("""Data source %s is not supported.""",source)) 137 | 138 | } 139 | 140 | } 141 | 142 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/aerospike/AerospikeReader.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.aerospike 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.rdd.RDD 23 | 24 | import org.apache.hadoop.conf.{Configuration => HConfig} 25 | 26 | import com.aerospike.hadoop._ 27 | import de.kp.spark.connect.ConnectConfig 28 | 29 | import scala.collection.JavaConversions._ 30 | 31 | class AerospikeReader(@transient sc:SparkContext) extends Serializable { 32 | 33 | /* 34 | * Background to Aerospike: 35 | * 36 | * At the highest level, data is collected in containers called namespaces; 37 | * namespaces are similar to databases. Within a namespace, data are divided 38 | * into sets (equivalent to tables), and finally records (rows). 
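// Editor's sketch (hypothetical, not part of the original source): request
// parameter maps as expected by read(...) below. The key names mirror the
// lookups in that method (note that the set name is taken from the key
// "setnames"); namespace, set and bin values are placeholders.
private val exampleScanParams: Map[String, String] = Map(
  "namespace" -> "test",
  "setnames"  -> "demo",
  "binnames"  -> "bin1,bin2",
  "operation" -> "scan"
)

// A secondary-index range query additionally names the bin and its bounds
private val exampleRangeParams: Map[String, String] = exampleScanParams ++ Map(
  "operation"      -> "numrange",
  "numrange_bin"   -> "bin1",
  "numrange_begin" -> "100",
  "numrange_end"   -> "200"
)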
39 | */ 40 | def read(config:ConnectConfig,params:Map[String,String]):RDD[Map[String,Any]] = { 41 | 42 | val settings = config.aerospike 43 | 44 | val conf = new HConfig() 45 | /* Add host & port to configuration */ 46 | val host = if (settings.contains("aerospike.input.host")) 47 | settings("aerospike.input.host") else "localhost" 48 | 49 | conf.set("aerospike.input.host", host) 50 | 51 | val port = if (settings.contains("aerospike.input.port")) 52 | settings("aerospike.input.port") else "3000" 53 | 54 | conf.set("aerospike.input.port", port) 55 | 56 | /* Add namespace and set name to configuration */ 57 | conf.set("aerospike.input.namespace",params("namespace")) 58 | conf.set("aerospike.input.setname",params("setnames")) 59 | 60 | /* Add bin names & operation */ 61 | val binnames = if (params.contains("binnames")) 62 | params("binnames") else "" 63 | 64 | conf.set("aerospike.input.binnames",binnames) 65 | 66 | val operation = if (params.contains("operation")) 67 | params("operation") else "scan" 68 | 69 | conf.set("aerospike.input.operation",operation) 70 | 71 | if (operation == "numrange") { 72 | 73 | conf.set("aerospike.input.numrange.bin",params("numrange_bin")) 74 | 75 | conf.set("aerospike.input.numrange.begin",params("numrange_begin")) 76 | conf.set("aerospike.input.numrange.end",params("numrange_end")) 77 | 78 | } 79 | 80 | read(conf) 81 | 82 | } 83 | 84 | def read(config:HConfig):RDD[Map[String,Any]] = { 85 | 86 | val source = sc.newAPIHadoopRDD(config, classOf[AerospikeInputFormat], classOf[AerospikeKey], classOf[AerospikeRecord]) 87 | source.map{case(key,record) => toMap(key,record)} 88 | 89 | } 90 | 91 | private def toMap(key:AerospikeKey,record:AerospikeRecord):Map[String,Any] = { 92 | 93 | val bins = record.bins 94 | bins.toMap 95 | 96 | } 97 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/aerospike/AerospikeSource.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.aerospike 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 
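// Editor's sketch (hypothetical, not part of the original source): wiring an
// Aerospike set into Spark SQL through the SQLSource class defined in
// SQLSource.scala above. The schema, table name and parameters are
// placeholders, and `config` is assumed to be some concrete ConnectConfig
// implementation (e.g. the MapConnectConfig sketched earlier).
import org.apache.spark.SparkContext
import org.apache.spark.sql._

import de.kp.spark.connect.{ConnectConfig, SQLSource}

object SQLSourceExample {

  def run(sc: SparkContext, config: ConnectConfig): Unit = {

    val sqlContext = new SQLContext(sc)

    /* The field names double as the Aerospike bin names requested by SQLSource */
    val schema = StructType(Seq(
      StructField("user", StringType, false),
      StructField("amount", LongType, false)
    ))

    val params = Map(
      "namespace" -> "test",
      "setnames"  -> "orders"
    )

    /* Registers the temp table "orders" and runs plain Spark SQL against it */
    val source = new SQLSource(sqlContext, config, "aerospike", "orders", schema, params)
    val result = source.executeQuery("SELECT user, SUM(amount) FROM orders GROUP BY user")

    result.collect().foreach(println)
  }
}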
19 | */ 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.rdd.RDD 23 | 24 | import de.kp.spark.connect.ConnectConfig 25 | 26 | class AerospikeSource(@transient sc:SparkContext) extends Serializable { 27 | 28 | def read(config:ConnectConfig,params:Map[String,String]):RDD[Map[String,Any]] = { 29 | new AerospikeReader(sc).read(config,params) 30 | } 31 | 32 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/cassandra/CassandraReader.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.cassandra 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.rdd.RDD 23 | 24 | import com.datastax.spark.connector._ 25 | import de.kp.spark.connect.ConnectConfig 26 | 27 | class CassandraReader(@transient sc:SparkContext) extends Serializable { 28 | /** 29 | * This method reads the content of a Cassandra table of a specific 30 | * keyspace. Actually, all data records are retrieved from the table 31 | */ 32 | def read(config:ConnectConfig,keyspace:String,table:String,columns:List[String] = List.empty[String]):RDD[Map[String,Any]] = { 33 | 34 | val settings = config.cassandra 35 | val host = settings("spark.cassandra.connection.host") 36 | 37 | /* 38 | * We add the configuration parameters 39 | * to connect to a Cassandra cluster here 40 | */ 41 | sc.getConf.set("spark.cassandra.connection.host",host) 42 | /* 43 | * Read from specified keyspace and table; note, that the number 44 | * of entries to be returned must be specified 45 | */ 46 | val source = if (columns.isEmpty) 47 | sc.cassandraTable(keyspace, table) else sc.cassandraTable(keyspace, table).select(columns.map(ColumnName(_)):_*) 48 | 49 | source.map(toMap(_)) 50 | 51 | } 52 | 53 | /** 54 | * For the primitive data types required by the different 55 | * engines of Predictiveworks, the conversion of the column 56 | * names and values using the toMap method is sufficient. 57 | * 58 | * In case of more complex data types, this method must be 59 | * adapted to these additional requirements 60 | */ 61 | private def toMap(row:CassandraRow):Map[String,Any] = { 62 | row.toMap 63 | } 64 | 65 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/cassandra/CassandraSource.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.cassandra 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 
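// Editor's sketch (hypothetical, not part of the original source): reading a
// Cassandra table through the CassandraSource defined below. Keyspace, table
// and column names are placeholders; config.cassandra is expected to provide
// "spark.cassandra.connection.host".
import org.apache.spark.SparkContext
import de.kp.spark.connect.ConnectConfig

object CassandraSourceExample {

  def run(sc: SparkContext, config: ConnectConfig): Unit = {

    val params = Map(
      "keyspace" -> "shop",
      "table"    -> "orders",
      /* Optional projection; omit "columns" to fetch all columns */
      "columns"  -> "user,amount"
    )

    val rows = new CassandraSource(sc).read(config, params)
    rows.take(5).foreach(println)
  }
}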
6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.rdd.RDD 23 | 24 | import de.kp.spark.connect.ConnectConfig 25 | 26 | class CassandraSource(@transient sc:SparkContext) extends Serializable { 27 | 28 | def read(config:ConnectConfig,params:Map[String,String]):RDD[Map[String,Any]] = { 29 | 30 | val keyspace = params("keyspace") 31 | val table = params("table") 32 | 33 | val columns = if (params.contains("columns")) params("columns").split(",").toList else List.empty[String] 34 | 35 | new CassandraReader(sc).read(config,keyspace,table,columns) 36 | 37 | } 38 | 39 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/dmp/CxenseClient.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.dmp 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 
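// Editor's sketch (hypothetical, not part of the original source): calling the
// CxenseClient defined below. Username, secret and the whole request payload
// are placeholders; the key names only follow the general shape of the Cxense
// /traffic endpoint as understood by the editor and are not verified against
// the API documentation.
object CxenseExample {

  def main(args: Array[String]): Unit = {

    val client = new CxenseClient("api@example.com", "api-secret")

    val request: Map[String, Any] = Map(
      "siteId" -> "1234567890",
      "start"  -> "-7d",
      "stop"   -> "now",
      "fields" -> List("events", "uniqueUsers")
    )

    /* Signed with the HMAC header built in getAuthenticationHeader */
    val response = client.getTraffic(request)
    println(response)
  }
}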
19 | */ 20 | 21 | import com.fasterxml.jackson.databind.{Module, ObjectMapper} 22 | import com.fasterxml.jackson.module.scala.DefaultScalaModule 23 | 24 | import javax.ws.rs.HttpMethod 25 | 26 | import javax.ws.rs.client.{ClientBuilder,Entity} 27 | import javax.ws.rs.core.MediaType 28 | 29 | import javax.crypto.Mac 30 | import javax.crypto.spec.SecretKeySpec 31 | 32 | import org.joda.time.DateTime 33 | import org.joda.time.DateTimeZone 34 | import org.joda.time.format.ISODateTimeFormat 35 | 36 | import org.apache.commons.codec.binary.Base64 37 | 38 | class CxenseClient(username:String,secret:String) { 39 | 40 | private val CXENSE_URI = "https://api.cxense.com" 41 | 42 | private val JSON_MAPPER = new ObjectMapper() 43 | JSON_MAPPER.registerModule(DefaultScalaModule) 44 | 45 | def getProfileContentFetch(params:Map[String,Any]):Map[String,Any] = { 46 | 47 | val endpoint = "profile/content/fetch" 48 | getResponse(endpoint,params) 49 | 50 | } 51 | 52 | def getProfileUserExternalRead(params:Map[String,Any]):Map[String,Any] = { 53 | 54 | val endpoint = "profile/user/external/read" 55 | getResponse(endpoint,params) 56 | 57 | } 58 | /* 59 | * Collect interest profile for a certain user; the relevant 60 | * part of the response (profile) is equivalent to the 'content 61 | * fetch' request 62 | */ 63 | def getProfileUser(params:Map[String,Any]):Map[String,Any] = { 64 | 65 | val endpoint = "profile/user" 66 | getResponse(endpoint,params) 67 | 68 | } 69 | 70 | def getProfileUserSegment(params:Map[String,Any]):Map[String,Any] = { 71 | 72 | val endpoint = "profile/user/segment" 73 | getResponse(endpoint,params) 74 | 75 | } 76 | 77 | def getSegmentRead(params:Map[String,Any]):Map[String,Any] = { 78 | 79 | val endpoint = "segment/read" 80 | getResponse(endpoint,params) 81 | 82 | } 83 | 84 | def getSite(params:Map[String,Any]):Map[String,Any] = { 85 | 86 | val endpoint = "site" 87 | getResponse(endpoint,params) 88 | 89 | } 90 | 91 | def getSiteGroup(params:Map[String,Any]):Map[String,Any] = { 92 | 93 | val endpoint = "site/group" 94 | getResponse(endpoint,params) 95 | 96 | } 97 | 98 | def getTraffic(params:Map[String,Any]):Map[String,Any] = { 99 | 100 | val endpoint = "traffic" 101 | getResponse(endpoint,params) 102 | 103 | } 104 | 105 | def getTrafficCompare(params:Map[String,Any]):Map[String,Any] = { 106 | 107 | val endpoint = "traffic/compare" 108 | getResponse(endpoint,params) 109 | 110 | } 111 | 112 | def getTrafficCustom(params:Map[String,Any]):Map[String,Any] = { 113 | 114 | val endpoint = "traffic/custom" 115 | getResponse(endpoint,params) 116 | 117 | } 118 | 119 | def getTrafficCustomDescribe(params:Map[String,Any]):Map[String,Any] = { 120 | 121 | val endpoint = "traffic/custom/describe" 122 | getResponse(endpoint,params) 123 | 124 | } 125 | 126 | def getTrafficEvent(params:Map[String,Any]):Map[String,Any] = { 127 | 128 | val endpoint = "traffic/event" 129 | getResponse(endpoint,params) 130 | 131 | } 132 | 133 | def getTrafficEventDescribe(params:Map[String,Any]):Map[String,Any] = { 134 | 135 | val endpoint = "traffic/event/describe" 136 | getResponse(endpoint,params) 137 | 138 | } 139 | 140 | def getTrafficIntent(params:Map[String,Any]):Map[String,Any] = { 141 | 142 | val endpoint = "traffic/intent" 143 | getResponse(endpoint,params) 144 | 145 | } 146 | 147 | def getTrafficKeyword(params:Map[String,Any]):Map[String,Any] = { 148 | 149 | val endpoint = "traffic/keyword" 150 | getResponse(endpoint,params) 151 | 152 | } 153 | 154 | def getTrafficKeywordDescribe(params:Map[String,Any]):Map[String,Any] 
= { 155 | 156 | val endpoint = "traffic/keyword/describe" 157 | getResponse(endpoint,params) 158 | 159 | } 160 | 161 | def getTrafficRelated(params:Map[String,Any]):Map[String,Any] = { 162 | 163 | val endpoint = "traffic/related" 164 | getResponse(endpoint,params) 165 | 166 | } 167 | 168 | def getTrafficUser(params:Map[String,Any]):Map[String,Any] = { 169 | 170 | val endpoint = "traffic/user" 171 | getResponse(endpoint,params) 172 | 173 | } 174 | 175 | def getTrafficUserExternal(params:Map[String,Any]):Map[String,Any] = { 176 | 177 | val endpoint = "traffic/user/external" 178 | getResponse(endpoint,params) 179 | 180 | } 181 | 182 | def getTrafficUserHistogram(params:Map[String,Any]):Map[String,Any] = { 183 | 184 | val endpoint = "traffic/user/histogram" 185 | getResponse(endpoint,params) 186 | 187 | } 188 | 189 | def getTrafficUserHistogramEvent(params:Map[String,Any]):Map[String,Any] = { 190 | 191 | val endpoint = "traffic/user/histogram/event" 192 | getResponse(endpoint,params) 193 | 194 | } 195 | 196 | def getTrafficUserInterest(params:Map[String,Any]):Map[String,Any] = { 197 | 198 | val endpoint = "traffic/user/interest" 199 | getResponse(endpoint,params) 200 | 201 | } 202 | 203 | def getTrafficUserKeyword(params:Map[String,Any]):Map[String,Any] = { 204 | 205 | val endpoint = "traffic/user/keyword" 206 | getResponse(endpoint,params) 207 | 208 | } 209 | 210 | private def getAuthenticationHeader:String = { 211 | 212 | val mac = Mac.getInstance("HmacSHA256") 213 | mac.init(new SecretKeySpec(secret.getBytes("UTF-8"), "HmacSHA256")) 214 | 215 | val date = ISODateTimeFormat.dateTime().print(new DateTime(DateTimeZone.UTC)) 216 | val signature = new String(Base64.encodeBase64(mac.doFinal(date.getBytes("UTF-8")))) 217 | 218 | "username=" + username + " date=" + date + " hmac-sha256-base64=" + signature 219 | 220 | } 221 | 222 | private def getResponse(endpoint:String,req_params:Map[String,Any]):Map[String,Any] = { 223 | 224 | val body = JSON_MAPPER.writeValueAsString(req_params) 225 | 226 | val client = ClientBuilder.newClient() 227 | val request = client.target(CXENSE_URI).path("/").path(endpoint).request(MediaType.APPLICATION_JSON_TYPE) 228 | 229 | val response = request 230 | .header("X-cXense-Authentication", getAuthenticationHeader) 231 | .method(HttpMethod.POST, if (body == null) null else Entity.json(body), classOf[String]) 232 | 233 | client.close() 234 | 235 | JSON_MAPPER.readValue(response, classOf[Map[String,Any]]) 236 | 237 | } 238 | 239 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/elasticsearch/ElasticReader.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.elasticsearch 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
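// Editor's sketch (hypothetical, not part of the original source): reading an
// Elasticsearch index through the ElasticSource defined in ElasticSource.scala
// below. Index, mapping and query are placeholders; config.elastic is expected
// to carry the es-hadoop connection settings (e.g. "es.nodes"/"es.port").
import org.apache.spark.SparkContext
import de.kp.spark.connect.ConnectConfig
import de.kp.spark.connect.elasticsearch.ElasticSource

object ElasticSourceExample {

  def run(sc: SparkContext, config: ConnectConfig): Unit = {

    val params = Map(
      "index"   -> "orders",
      "mapping" -> "order",
      /* Any query accepted by es-hadoop's es.query setting */
      "query"   -> """{ "query" : { "match_all" : {} } }"""
    )

    val rows = new ElasticSource(sc).read(config, params)
    println(rows.count())
  }
}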
15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.rdd.RDD 23 | 24 | import org.apache.hadoop.io.{ArrayWritable,DoubleWritable,IntWritable,LongWritable,MapWritable,NullWritable,Text,Writable} 25 | 26 | import org.apache.hadoop.conf.{Configuration => HConfig} 27 | 28 | import org.elasticsearch.hadoop.mr.EsInputFormat 29 | import de.kp.spark.connect.ConnectConfig 30 | 31 | import scala.collection.JavaConversions._ 32 | 33 | class ElasticReader(@transient sc:SparkContext) extends Serializable { 34 | 35 | val ES_QUERY:String = "es.query" 36 | val ES_RESOURCE:String = "es.resource" 37 | 38 | def read(config:HConfig):RDD[Map[String,Any]] = { 39 | 40 | val source = sc.newAPIHadoopRDD(config, classOf[EsInputFormat[Text, MapWritable]], classOf[Text], classOf[MapWritable]) 41 | source.map(hit => toMap(hit._2)) 42 | 43 | } 44 | 45 | def read(config:ConnectConfig,index:String,mapping:String,query:String):RDD[Map[String,Any]] = { 46 | 47 | val conf = config.elastic 48 | 49 | /* 50 | * Append dynamic request specific data to Elasticsearch configuration; 51 | * this comprises the search query to be used and the index (and mapping) 52 | * to be accessed 53 | */ 54 | conf.set(ES_QUERY,query) 55 | conf.set(ES_RESOURCE,(index + "/" + mapping)) 56 | 57 | read(conf) 58 | 59 | } 60 | 61 | private def toMap(mw:MapWritable):Map[String,Any] = { 62 | 63 | mw.entrySet().map(kv => { 64 | 65 | val k = kv.getKey().asInstanceOf[Text].toString 66 | val v = kv.getValue() match { 67 | 68 | case valu:ArrayWritable => { 69 | 70 | val array = valu.get 71 | array.map(record => { 72 | 73 | record.asInstanceOf[MapWritable].entrySet().map(entry => { 74 | 75 | val sub_k = entry.getKey().asInstanceOf[Text].toString() 76 | val sub_v = entry.getValue() match { 77 | 78 | case sub_valu:IntWritable => valu.get() 79 | case sub_valu:DoubleWritable => valu.get() 80 | 81 | case sub_valu:LongWritable => valu.get() 82 | case sub_valu:Text => valu.toString 83 | 84 | case _ => throw new Exception("Data type is not supported.") 85 | 86 | } 87 | 88 | (sub_k,sub_v) 89 | 90 | }).toMap 91 | 92 | }).toList 93 | 94 | } 95 | 96 | case valu:IntWritable => valu.get() 97 | case valu:DoubleWritable => valu.get() 98 | 99 | case valu:LongWritable => valu.get() 100 | case valu:Text => valu.toString 101 | 102 | case _ => throw new Exception("Data type is not supported.") 103 | 104 | } 105 | 106 | (k,v) 107 | 108 | }).toMap 109 | 110 | } 111 | 112 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/elasticsearch/ElasticSource.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.elasticsearch 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.rdd.RDD 23 | 24 | import de.kp.spark.connect.ConnectConfig 25 | 26 | class ElasticSource(@transient sc:SparkContext) extends Serializable { 27 | 28 | def read(config:ConnectConfig,params:Map[String,String]):RDD[Map[String,Any]] = { 29 | 30 | val index = params("index") 31 | val mapping = params("mapping") 32 | 33 | val query = params("query").asInstanceOf[String] 34 | new ElasticReader(sc).read(config,index,mapping,query) 35 | 36 | } 37 | 38 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/hbase/HBaseReader.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.hbase 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.rdd.RDD 23 | 24 | import org.apache.hadoop.hbase.util.Bytes 25 | import org.apache.hadoop.hbase.HBaseConfiguration 26 | 27 | import org.apache.hadoop.hbase.CellUtil 28 | import org.apache.hadoop.hbase.client.Result 29 | 30 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable 31 | import org.apache.hadoop.hbase.mapreduce.TableInputFormat 32 | 33 | import de.kp.spark.connect.ConnectConfig 34 | 35 | class HBaseReader(@transient sc:SparkContext) extends Serializable { 36 | 37 | private val HBASE_ROOTDIR = "/hbase" 38 | 39 | /** 40 | * This method reads the content of an HBase table of a specific 41 | * keyspace. 
Actually, all data records are retrieved from the table 42 | */ 43 | def read(config:ConnectConfig,columnfamily:String,table:String,names:List[String],types:List[String]):RDD[Map[String,Any]] = { 44 | 45 | val settings = config.hbase 46 | val host = settings("spark.hbase.host") 47 | 48 | val conf = HBaseConfiguration.create 49 | conf.setBoolean("hbase.cluster.distributed", true) 50 | conf.setInt("hbase.client.scanner.caching", 10000) 51 | 52 | conf.set("hbase.rootdir", HBASE_ROOTDIR) 53 | 54 | conf.set("hbase.zookeeper.quorum", host) 55 | conf.set("hbase.zookeeper.property.clientPort","2181") 56 | 57 | val columns = names.map(name => columnfamily + ":" + name) 58 | conf.set(TableInputFormat.SCAN_COLUMNS, columns.mkString(" ")) 59 | 60 | val typedNames = names.zip(types) 61 | 62 | def toMap(key:ImmutableBytesWritable,row:Result):Map[String,Any] = { 63 | 64 | typedNames.map{case(colname,coltype) => { 65 | /* 66 | * Convert column family and respective columns 67 | * into HBase readable Byte array 68 | */ 69 | val cf = Bytes.toBytes(columnfamily) 70 | val cn = Bytes.toBytes(colname) 71 | 72 | if (row.containsColumn(cf,cn) == false) throw new Exception( 73 | String.format("""Combination of cf:%s and cn:%s does not exist""",columnfamily,colname)) 74 | 75 | val byteValue = CellUtil.cloneValue(row.getColumnLatestCell(cf,cn)).array 76 | /* 77 | * We actually support the following data types: 78 | * 79 | * double, integer, long, string 80 | * 81 | * as these are needed by Predictiveworks 82 | */ 83 | val colvalu = coltype match { 84 | 85 | case "double" => Bytes.toDouble(byteValue) 86 | 87 | case "integer" => Bytes.toInt(byteValue) 88 | 89 | case "long" => Bytes.toLong(byteValue) 90 | 91 | case "string" => Bytes.toString(byteValue) 92 | 93 | case _ => throw new Exception(String.format("""The data type '%s' is not supported.""",coltype)) 94 | 95 | } 96 | 97 | (colname,colvalu) 98 | 99 | }}.toMap 100 | 101 | } 102 | 103 | val source = sc.newAPIHadoopRDD(conf,classOf[TableInputFormat],classOf[ImmutableBytesWritable],classOf[Result]) 104 | source.map{case(key,row) => toMap(key,row)} 105 | 106 | } 107 | 108 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/hbase/HBaseSource.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.hbase 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 
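// Editor's sketch (hypothetical, not part of the original source): reading an
// HBase table through the HBaseSource defined below. Column family, table and
// column definitions are placeholders; config.hbase is expected to provide
// "spark.hbase.host", as read by HBaseReader above.
import org.apache.spark.SparkContext
import de.kp.spark.connect.ConnectConfig

object HBaseSourceExample {

  def run(sc: SparkContext, config: ConnectConfig): Unit = {

    val params = Map(
      "columnfamily" -> "cf",
      "table"        -> "events",
      /* names and types are positionally aligned, comma separated */
      "names"        -> "user,amount,timestamp",
      "types"        -> "string,double,long"
    )

    val rows = new HBaseSource(sc).read(config, params)
    rows.take(5).foreach(println)
  }
}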
19 | */ 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.rdd.RDD 23 | 24 | import de.kp.spark.connect.ConnectConfig 25 | 26 | class HBaseSource(@transient sc:SparkContext) extends Serializable { 27 | 28 | def read(config:ConnectConfig,params:Map[String,String]):RDD[Map[String,Any]] = { 29 | 30 | val columnfamily = params("columnfamily") 31 | val table = params("table") 32 | 33 | val names = params("names").split(",").toList 34 | val types = params("types").split(",").toList 35 | 36 | new HBaseReader(sc).read(config,columnfamily,table,names,types) 37 | 38 | } 39 | 40 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/jdbc/JdbcReader.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.jdbc 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import java.sql.{Connection,DriverManager,ResultSet} 22 | 23 | import org.apache.spark.SparkContext 24 | import org.apache.spark.rdd.{JdbcRDD,RDD} 25 | 26 | import scala.collection.mutable.HashMap 27 | import de.kp.spark.connect.ConnectConfig 28 | 29 | class JdbcReader(@transient sc:SparkContext) extends Serializable { 30 | 31 | protected val MYSQL_DRIVER = "com.mysql.jdbc.Driver" 32 | protected val NUM_PARTITIONS = 1 33 | 34 | def read(config:ConnectConfig,site:Int,query:String,fields:List[String] = List.empty[String]):RDD[Map[String,Any]] = { 35 | 36 | val conf = config.mysql 37 | 38 | val url = conf("url") 39 | val database = conf("database") 40 | 41 | val user = conf("user") 42 | val password = conf("password") 43 | /* 44 | * The value of 'site' is used as upper and lower bound for 45 | * the range (key) variable of the database table 46 | */ 47 | val result = new JdbcRDD(sc,() => getConnection(url,database,user,password), 48 | query,site,site,NUM_PARTITIONS, 49 | (rs:ResultSet) => getRow(rs,fields) 50 | ).cache() 51 | 52 | result 53 | 54 | } 55 | 56 | /** 57 | * Convert database row into Map[String,Any] and restrict 58 | * to column names that are defined by the field spec 59 | */ 60 | protected def getRow(rs:ResultSet,fields:List[String]):Map[String,Any] = { 61 | val metadata = rs.getMetaData() 62 | val numCols = metadata.getColumnCount() 63 | 64 | val row = HashMap.empty[String,Any] 65 | (1 to numCols).foreach(i => { 66 | 67 | val k = metadata.getColumnName(i) 68 | val v = rs.getObject(i) 69 | 70 | if (fields.isEmpty) { 71 | row += k -> v 72 | 73 | } else { 74 | if (fields.contains(k)) row += k -> v 75 | 76 | } 77 | 78 | }) 79 | 80 | row.toMap 81 | 82 | } 83 | 84 | protected def getConnection(url:String,database:String,user:String,password:String):Connection = { 85 | 86 | /* Create MySQL connection */ 
87 | Class.forName(MYSQL_DRIVER).newInstance() 88 | val endpoint = getEndpoint(url,database) 89 | 90 | /* Generate database connection */ 91 | val connection = DriverManager.getConnection(endpoint,user,password) 92 | connection 93 | 94 | } 95 | 96 | protected def getEndpoint(url:String,database:String):String = { 97 | 98 | val endpoint = "jdbc:mysql://" + url + "/" + database 99 | endpoint 100 | 101 | } 102 | 103 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/jdbc/JdbcSource.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.jdbc 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.rdd.RDD 23 | 24 | import de.kp.spark.connect.ConnectConfig 25 | 26 | class JdbcSource(@transient sc:SparkContext) extends Serializable { 27 | 28 | def read(config:ConnectConfig,params:Map[String,String]):RDD[Map[String,Any]] = { 29 | 30 | val site = params("site").toInt 31 | val query = params("query") 32 | 33 | val fields = params("fields").split(",").toList 34 | 35 | new JdbcReader(sc).read(config,site,query,fields) 36 | 37 | } 38 | 39 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/log/ApacheLogAnalyzer.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.log 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see .
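// Editor's sketch (hypothetical, not part of the original source): querying
// MySQL through the JdbcReader defined above. Table and column names are
// placeholders; config.mysql is expected to provide url, database, user and
// password, as read in JdbcReader.read.
import org.apache.spark.SparkContext
import de.kp.spark.connect.ConnectConfig
import de.kp.spark.connect.jdbc.JdbcReader

object JdbcReaderExample {

  def run(sc: SparkContext, config: ConnectConfig): Unit = {

    /*
     * Spark's JdbcRDD binds a lower and an upper partition bound into the
     * statement, so the query is expected to contain two '?' placeholders;
     * JdbcReader fills both with the 'site' value.
     */
    val query = "SELECT site, item, price FROM orders WHERE site >= ? AND site <= ?"

    val rows = new JdbcReader(sc).read(config, 42, query, List("item", "price"))
    rows.take(5).foreach(println)
  }
}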
19 | */ 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.sql.SQLContext 23 | 24 | case class ApacheLogStats( 25 | contentSizeStats:(Long,Long,Long,Long), 26 | responseCodeCount:Seq[(Int,Long)], 27 | ipAddresses:Seq[String], 28 | topEndpoints:Seq[(String,Long)] 29 | ) 30 | 31 | class ApacheLogAnalyzer(@transient sc:SparkContext) extends Serializable { 32 | 33 | private val sqlContext = new SQLContext(sc) 34 | import sqlContext.createSchemaRDD 35 | 36 | def stats(store:String):ApacheLogStats = { 37 | 38 | /* 39 | * Data structure 40 | * 41 | * ip_address 42 | * client_identd 43 | * user_id 44 | * datetime 45 | * method 46 | * endpoint 47 | * protocol 48 | * response_code 49 | * content_size 50 | * 51 | */ 52 | val logs = sc.textFile(store).map(ApacheLogParser.parse(_)) 53 | logs.registerTempTable("logs") 54 | 55 | /* Calculate statistics based on the content size */ 56 | val CONTENT_SIZE_SQL = "SELECT SUM(content_size), COUNT(*), MIN(content_size), MAX(content_size) FROM logs" 57 | val contentSizeStats = sqlContext.sql(CONTENT_SIZE_SQL).map(row => 58 | (row.getLong(0), row.getLong(1), row.getLong(2), row.getLong(3)) 59 | 60 | ).first 61 | 62 | /* Compute Response Code to Count */ 63 | val RESPONSE_CODE_SQL = "SELECT response_code, COUNT(*) FROM logs GROUP BY response_code" 64 | val responseCodeCount = sqlContext.sql(RESPONSE_CODE_SQL).map(row => 65 | (row.getInt(0), row.getLong(1)) 66 | 67 | ).take(1000).toList 68 | 69 | /* Any IPAddress that has accessed the server more than 10 times */ 70 | val IP_ADDRESS_SQL = "SELECT ip_address, COUNT(*) AS total FROM logs GROUP BY ip_address HAVING total > 10" 71 | val ipAddresses = sqlContext.sql(IP_ADDRESS_SQL).map(row => 72 | row.getString(0) 73 | ).take(100) // Take only 100 in case this is a super large data set. 74 | 75 | /* Top Endpoints */ 76 | val ENDPOINT_SQL = "SELECT endpoint, COUNT(*) AS total FROM logs GROUP BY endpoint ORDER BY total DESC LIMIT 10" 77 | val topEndpoints = sqlContext.sql(ENDPOINT_SQL).map(row => 78 | (row.getString(0), row.getLong(1)) 79 | ).collect() 80 | 81 | ApacheLogStats( 82 | contentSizeStats,responseCodeCount,ipAddresses,topEndpoints 83 | ) 84 | 85 | } 86 | 87 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/log/ApacheLogParser.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.log 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 
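// Editor's sketch (hypothetical, not part of the original source): parsing the
// sample log line quoted in the parser's comment below, then computing the
// aggregate statistics with the ApacheLogAnalyzer defined above. The HDFS path
// is a placeholder.
import org.apache.spark.SparkContext
import de.kp.spark.connect.log.{ApacheLogAnalyzer, ApacheLogParser}

object ApacheLogExample {

  def run(sc: SparkContext): Unit = {

    /* Single-line parse */
    val line = "127.0.0.1 - - [21/Jul/2014:9:55:27 -0800] \"GET /home.html HTTP/1.1\" 200 2048"
    val info = ApacheLogParser.parse(line)
    println(info.endpoint + " -> " + info.response_code)

    /* Aggregate statistics over a whole log file */
    val stats = new ApacheLogAnalyzer(sc).stats("hdfs://localhost:9000/logs/access.log")
    println(stats.topEndpoints.mkString(", "))
  }
}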
19 | */ 20 | 21 | import java.util.regex.Matcher 22 | import java.util.regex.Pattern 23 | 24 | case class ApacheLogInfo( 25 | ip_address:String, 26 | client_identd:String, 27 | user_id:String, 28 | datetime:String, 29 | method:String, 30 | endpoint:String, 31 | protocol:String, 32 | response_code:Int, 33 | content_size:Long 34 | ) 35 | 36 | object ApacheLogParser extends Serializable{ 37 | /* 38 | * Example Apache log line: 39 | * 40 | * 127.0.0.1 - - [21/Jul/2014:9:55:27 -0800] "GET /home.html HTTP/1.1" 200 2048 41 | * 42 | */ 43 | private val LOG_ENTRY_PATTERN = 44 | // 1:IP 2:client 3:user 4:date time 5:method 6:req 7:proto 8:respcode 9:size 45 | "^(\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(\\S+) (\\S+) (\\S+)\" (\\d{3}) (\\d+)" 46 | 47 | private val PATTERN = Pattern.compile(LOG_ENTRY_PATTERN) 48 | 49 | def parse(logline:String):ApacheLogInfo = { 50 | 51 | val m = PATTERN.matcher(logline) 52 | if (!m.find()) { 53 | throw new RuntimeException("Error parsing logline"); 54 | } 55 | 56 | ApacheLogInfo( 57 | m.group(1), 58 | m.group(2), 59 | m.group(3), 60 | m.group(4), 61 | m.group(5), 62 | m.group(6), 63 | m.group(7), 64 | m.group(8).toInt, 65 | m.group(9).toLong) 66 | 67 | } 68 | 69 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/mongodb/MongoReader.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.mongodb 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 
19 | */ 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.rdd.RDD 23 | 24 | import com.mongodb.hadoop.MongoInputFormat 25 | import org.bson.BSONObject 26 | 27 | import scala.collection.mutable.HashMap 28 | import scala.collection.JavaConversions._ 29 | 30 | import de.kp.spark.connect.ConnectConfig 31 | 32 | class MongoReader(@transient sc:SparkContext) extends Serializable { 33 | 34 | def read(config:ConnectConfig,query:String):RDD[Map[String,Any]] = { 35 | 36 | val conf = config.mongo 37 | conf.set("mongo.input.query",query) 38 | 39 | val source = sc.newAPIHadoopRDD(conf, classOf[MongoInputFormat], classOf[Object], classOf[BSONObject]) 40 | source.map(x => toMap(x._2)) 41 | 42 | } 43 | 44 | private def toMap(obj:BSONObject):Map[String,Any] = { 45 | 46 | val data = HashMap.empty[String,Any] 47 | 48 | val keys = obj.keySet() 49 | for (k <- keys) { 50 | 51 | val v = obj.get(k) 52 | data += k -> v 53 | 54 | } 55 | 56 | data.toMap 57 | 58 | } 59 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/mongodb/MongoSource.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.mongodb 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.rdd.RDD 23 | import de.kp.spark.connect.ConnectConfig 24 | 25 | class MongoSource(@transient sc:SparkContext) extends Serializable { 26 | 27 | def read(config:ConnectConfig,params:Map[String,String]):RDD[Map[String,Any]] = { 28 | 29 | val query = params("query") 30 | new MongoReader(sc).read(config,query) 31 | 32 | } 33 | 34 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/parquet/ParquetReader.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.parquet 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.SparkContext._ 23 | 24 | import org.apache.spark.sql._ 25 | import org.apache.spark.rdd.RDD 26 | 27 | import scala.collection.mutable.HashMap 28 | 29 | class ParquetReader(@transient sc:SparkContext) extends Serializable { 30 | 31 | def read(store:String,fields:List[String] = List.empty[String]):RDD[Map[String,Any]] = { 32 | 33 | val sqlCtx = new SQLContext(sc) 34 | import sqlCtx.createSchemaRDD 35 | 36 | /* 37 | * Read in the parquet file created above. Parquet files are self-describing 38 | * so the schema is preserved. The result of loading a Parquet file is also a 39 | * SchemaRDD. 40 | */ 41 | val parquetFile = sqlCtx.parquetFile(store) 42 | val metadata = parquetFile.schema.fields.zipWithIndex 43 | 44 | parquetFile.map(row => toMap(row,metadata,fields)) 45 | 46 | } 47 | 48 | private def toMap(row:Row,metadata:Seq[(StructField,Int)],fields:List[String]):Map[String,Any] = { 49 | 50 | val data = HashMap.empty[String,Any] 51 | val values = row.iterator.zipWithIndex.map(x => (x._2,x._1)).toMap 52 | 53 | metadata.foreach(entry => { 54 | 55 | val field = entry._1 56 | val col = entry._2 57 | 58 | val colname = field.name 59 | val colvalu = values(col) 60 | 61 | if (fields.isEmpty) { 62 | data += colname -> colvalu 63 | 64 | } else { 65 | if (fields.contains(colname)) data += colname -> colvalu 66 | 67 | } 68 | 69 | }) 70 | 71 | data.toMap 72 | 73 | } 74 | 75 | } 76 | -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/parquet/ParquetSource.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.parquet 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.rdd.RDD 23 | import de.kp.spark.connect.ConnectConfig 24 | 25 | class ParquetSource(@transient sc:SparkContext) extends Serializable { 26 | 27 | def read(config:ConnectConfig,params:Map[String,String]):RDD[Map[String,Any]] = { 28 | 29 | val store = params("store") 30 | val fields = params("fields").split(",").toList 31 | 32 | new ParquetReader(sc).read(store,fields) 33 | 34 | } 35 | 36 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/shop/BigClient.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.shop 2 | /* Copyright (c) 2014 Dr. 
Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import org.scribe.model._ 22 | import org.slf4j.LoggerFactory 23 | 24 | import com.fasterxml.jackson.databind.ObjectMapper 25 | import com.fasterxml.jackson.module.scala.DefaultScalaModule 26 | 27 | import scala.collection.mutable.Buffer 28 | 29 | class BigClient(val key:String,val token:String,val context:String) { 30 | 31 | private val LOG = LoggerFactory.getLogger(classOf[BigClient]) 32 | 33 | private val JSON_MAPPER = new ObjectMapper() 34 | JSON_MAPPER.registerModule(DefaultScalaModule) 35 | 36 | val ENDPOINT = String.format("""https://api.bigcommerce.com/%s/v2/""",context) 37 | 38 | def getResources(resource:String,id:Int,params:Map[String,String]):List[Map[String,Any]] = { 39 | 40 | resource match { 41 | 42 | case "brand" => getBrands(params) 43 | case "customer" => getCustomers(params) 44 | 45 | case "image" => if (id == -1) List.empty[Map[String,Any]] else getImages(id,params) 46 | case "lineitem" => if (id == -1) List.empty[Map[String,Any]] else getLineItems(id,params) 47 | 48 | case "order" => getOrders(params) 49 | case "product" => getProducts(params) 50 | 51 | case _ => List.empty[Map[String,Any]] 52 | 53 | } 54 | 55 | } 56 | 57 | def getBrands(requestParams:Map[String,String]):List[Map[String,Any]] = { 58 | 59 | val endpoint = ENDPOINT + "brands" + getSimpleUrlParams(requestParams) 60 | getResponseAsList(endpoint) 61 | 62 | } 63 | 64 | def getCustomers(requestParams:Map[String,String]):List[Map[String,Any]] = { 65 | 66 | val endpoint = ENDPOINT + "customers" + getSimpleUrlParams(requestParams) 67 | getResponseAsList(endpoint) 68 | 69 | } 70 | 71 | def getOrders(requestParams:Map[String,String]):List[Map[String,Any]] = { 72 | 73 | val endpoint = ENDPOINT + "orders" 74 | getResponseAsList(endpoint) 75 | 76 | } 77 | 78 | def getBrand(brand:Int):Map[String,Any] = { 79 | 80 | val endpoint = ENDPOINT + "brands/" + brand 81 | getResponseAsObject(endpoint) 82 | 83 | } 84 | def getLineItems(order:Int,requestParams:Map[String,String]):List[Map[String,Any]] = { 85 | 86 | val endpoint = ENDPOINT + "orders/" + order + "/products" + getSimpleUrlParams(requestParams) 87 | getResponseAsList(endpoint) 88 | 89 | } 90 | 91 | def getImages(product:Int,requestParams:Map[String,String]):List[Map[String,Any]] = { 92 | 93 | val endpoint = ENDPOINT + "products/" + product + "/images" + getSimpleUrlParams(requestParams) 94 | getResponseAsList(endpoint) 95 | 96 | } 97 | 98 | def getProducts(requestParams:Map[String,String]):List[Map[String,Any]] = { 99 | 100 | val endpoint = ENDPOINT + "products" + getSimpleUrlParams(requestParams) 101 | getResponseAsList(endpoint) 102 | 103 | } 104 | 105 | private def getOrderUrlParams(params:Map[String,String]):String = { 106 | 107 | 108 | val accepted = 
List("page","limit","min_date_created","status_id","max_date_created") 109 | 110 | val sb = Buffer.empty[String] 111 | for (kv <- params) { 112 | 113 | if (accepted.contains(kv._1)) { 114 | 115 | val value = String.format("""?%s=%s""",kv._1,kv._2) 116 | sb += value 117 | 118 | } 119 | 120 | } 121 | 122 | val s = "?" + sb.mkString("&") 123 | java.net.URLEncoder.encode(s, "UTF-8") 124 | 125 | } 126 | 127 | private def getSimpleUrlParams(params:Map[String,String]):String = { 128 | 129 | val accepted = List("page","limit") 130 | 131 | val sb = Buffer.empty[String] 132 | for (kv <- params) { 133 | 134 | if (accepted.contains(kv._1)) { 135 | 136 | val value = String.format("""?%s=%s""",kv._1,kv._2) 137 | sb += value 138 | 139 | } 140 | 141 | } 142 | 143 | val s = "?" + sb.mkString("&") 144 | java.net.URLEncoder.encode(s, "UTF-8") 145 | 146 | } 147 | 148 | def getResponseAsList(endpoint:String):List[Map[String,Any]] = { 149 | 150 | val request = new OAuthRequest(Verb.GET, endpoint) 151 | request.addHeader("accept", "application/json") 152 | 153 | request.addHeader("X-Auth-Client", key) 154 | request.addHeader("X-Auth-Token", token) 155 | 156 | val response = request.send() 157 | if (response.getCode == 200) { 158 | 159 | val body = response.getBody 160 | JSON_MAPPER.readValue(body, classOf[List[Map[String,Any]]]) 161 | 162 | } else { 163 | throw new Exception("Bad request: " + response.getCode) 164 | } 165 | 166 | } 167 | 168 | def getResponseAsObject(endpoint:String):Map[String,Any] = { 169 | 170 | val request = new OAuthRequest(Verb.GET, endpoint) 171 | request.addHeader("accept", "application/json") 172 | 173 | request.addHeader("X-Auth-Client", key) 174 | request.addHeader("X-Auth-Token", token) 175 | 176 | val response = request.send() 177 | if (response.getCode == 200) { 178 | 179 | val body = response.getBody 180 | JSON_MAPPER.readValue(body, classOf[Map[String,Any]]) 181 | 182 | } else { 183 | throw new Exception("Bad request: " + response.getCode) 184 | } 185 | 186 | } 187 | 188 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/shop/BigDataset.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.shop 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | import org.apache.spark.SparkContext 21 | import org.apache.spark.SparkContext._ 22 | import org.apache.spark.rdd.RDD 23 | 24 | import scala.collection.mutable.Buffer 25 | /** 26 | * The Bigcommerce REST API does not support counting for supported resources; 27 | * this implies, that we have not enough data to do partitioning and retrieve 28 | * resource data per partition. 
29 | * 30 | * In other words, we cannot use Spark's RDD mechanism directly, but have to 31 | * collect all the data first and partition it afterwards. 32 | */ 33 | class BigDataset( 34 | /* Reference to SparkContext */ 35 | @transient sc:SparkContext, 36 | /* resource */ 37 | resource:String, 38 | /* Request parameters */ 39 | params:Map[String,String], 40 | /* Total number of partitions */ 41 | numPartitions:Int) { 42 | 43 | /* 44 | * Prepare the request parameters, i.e. if an identifier is provided, 45 | * its value is used to retrieve the list of a dependent resource: 46 | * 47 | * E.g. articles have to be retrieved by providing the identifier of 48 | * the associated blog 49 | */ 50 | private val excludes = List("id") 51 | private val req_params = params.filter(kv => excludes.contains(kv._1) == false) 52 | 53 | private val rid = if (params.contains("id")) params("id").toInt else -1 54 | private val client = createClient 55 | 56 | private val dataset = getDataset 57 | 58 | def toRDD = sc.parallelize(dataset, numPartitions) 59 | 60 | private def createClient:BigClient = { 61 | 62 | val key = params("key") 63 | val token = params("token") 64 | 65 | val context = params("context") 66 | new BigClient(key,token,context) 67 | 68 | } 69 | 70 | private def getDataset:Seq[Map[String,Any]] = { 71 | 72 | val dataset = Buffer.empty[Map[String,Any]] 73 | 74 | var page = 1 75 | var finished = false 76 | 77 | while (finished == false) { 78 | 79 | val records = client.getResources(resource,rid,req_params ++ Map("limit" -> "250","page" -> page.toString)) 80 | dataset ++= records 81 | 82 | page += 1 83 | /* 84 | * Check whether this request was the last one; 85 | * this is the case if fewer than 250 86 | * records are returned 87 | */ 88 | if (records.size < 250) finished = true 89 | 90 | } 91 | 92 | dataset 93 | 94 | } 95 | 96 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/shop/ShopifyClient.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.shop 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see .
19 | */ 20 | 21 | import java.io.IOException 22 | 23 | import javax.ws.rs.HttpMethod 24 | import javax.ws.rs.client.{Client,ClientBuilder,Entity,WebTarget} 25 | import javax.ws.rs.core.MediaType 26 | 27 | import com.fasterxml.jackson.databind.{Module, ObjectMapper} 28 | import com.fasterxml.jackson.module.scala.DefaultScalaModule 29 | 30 | import org.slf4j.{Logger,LoggerFactory} 31 | 32 | import scala.collection.mutable.HashMap 33 | import scala.collection.JavaConversions._ 34 | 35 | class ShopifyClient(key:String,secret:String,url:String) extends Serializable { 36 | 37 | private val JSON_MAPPER = new ObjectMapper() 38 | JSON_MAPPER.registerModule(DefaultScalaModule) 39 | 40 | private val client = ClientBuilder.newClient() 41 | private val endpoint = url.replaceFirst("://", "://" + key + ":" + secret + "@") 42 | 43 | private val webTarget = client.target(endpoint).path("admin") 44 | 45 | def close = client.close 46 | 47 | def getResourceCount(name:String,id:Long,params:Map[String,String]):Long = { 48 | 49 | name match { 50 | 51 | case "article" => if (id == -1) -1 else getArticlesCount(id,params) 52 | case "blog" => getBlogsCount(params) 53 | 54 | case "customer" => getCustomersCount(params) 55 | 56 | case "order" => getOrdersCount(params) 57 | case "product" => getProductsCount(params) 58 | 59 | case _ => -1 60 | 61 | } 62 | 63 | } 64 | def getResources(name:String,id:Long,params:Map[String,String]):List[Map[String,Any]] = { 65 | 66 | name match { 67 | 68 | case "article" => if (id == -1) List.empty[Map[String,Any]] else getArticles(id,params) 69 | case "blog" => getBlogs(params) 70 | 71 | case "customer" => getCustomers(params) 72 | 73 | case "order" => getOrders(params) 74 | case "product" => getProducts(params) 75 | 76 | case _ => List.empty[Map[String,Any]] 77 | 78 | } 79 | 80 | } 81 | 82 | /************************************************************************** 83 | * 84 | * ARTICLE SUPPORT 85 | * 86 | *************************************************************************/ 87 | 88 | def getArticles(bid:Long,params:Map[String,String]):List[Map[String,Any]] = { 89 | 90 | val result = getResponse("blogs/" + bid + "/articles.json", params, HttpMethod.GET) 91 | /* 92 | * { "articles": [ ... ] } 93 | */ 94 | val response = JSON_MAPPER.readValue(result, classOf[Map[String,Any]]) 95 | if (response.contains("articles")) 96 | response("articles").asInstanceOf[List[Map[String,Any]]] 97 | 98 | else List.empty[Map[String,Any]] 99 | 100 | } 101 | 102 | def getArticlesCount(bid:Long,params:Map[String,String]):Long = { 103 | 104 | val result = getResponse("blogs/" + bid + "/articles/count.json", params, HttpMethod.GET) 105 | /* 106 | * { "count": 1 } 107 | */ 108 | val response = JSON_MAPPER.readValue(result, classOf[Map[String,Any]]) 109 | if (response.contains("count")) response("count").asInstanceOf[Long] else -1 110 | 111 | } 112 | 113 | /************************************************************************** 114 | * 115 | * BLOG SUPPORT 116 | * 117 | *************************************************************************/ 118 | 119 | def getBlogs(params:Map[String,String]):List[Map[String,Any]] = { 120 | 121 | val result = getResponse("blogs.json", params, HttpMethod.GET) 122 | /* 123 | * { "blogs": [ ... 
] } 124 | */ 125 | val response = JSON_MAPPER.readValue(result, classOf[Map[String,Any]]) 126 | if (response.contains("blogs")) 127 | response("blogs").asInstanceOf[List[Map[String,Any]]] 128 | 129 | else List.empty[Map[String,Any]] 130 | 131 | } 132 | 133 | def getBlogsCount(params:Map[String,String]):Long = { 134 | 135 | val result = getResponse("blogs/count.json", params, HttpMethod.GET) 136 | /* 137 | * { "count": 1 } 138 | */ 139 | val response = JSON_MAPPER.readValue(result, classOf[Map[String,Any]]) 140 | if (response.contains("count")) response("count").asInstanceOf[Long] else -1 141 | 142 | } 143 | 144 | /************************************************************************** 145 | * 146 | * CUSTOMER SUPPORT 147 | * 148 | *************************************************************************/ 149 | 150 | def getCustomers(params:Map[String,String]):List[Map[String,Any]] = { 151 | 152 | val result = getResponse("customers.json", params, HttpMethod.GET) 153 | /* 154 | * { "customers": [ ... ] } 155 | */ 156 | val response = JSON_MAPPER.readValue(result, classOf[Map[String,Any]]) 157 | if (response.contains("customers")) 158 | response("customers").asInstanceOf[List[Map[String,Any]]] 159 | 160 | else List.empty[Map[String,Any]] 161 | 162 | } 163 | 164 | def getCustomersCount(params:Map[String,String]):Long = { 165 | 166 | val result = getResponse("customers/count.json", params, HttpMethod.GET) 167 | /* 168 | * { "count": 1 } 169 | */ 170 | val response = JSON_MAPPER.readValue(result, classOf[Map[String,Any]]) 171 | if (response.contains("count")) response("count").asInstanceOf[Long] else -1 172 | 173 | } 174 | 175 | /************************************************************************** 176 | * 177 | * PRODUCT SUPPORT 178 | * 179 | *************************************************************************/ 180 | 181 | def getProducts(params:Map[String,String]):List[Map[String,Any]] = { 182 | 183 | val result = getResponse("products.json", params, HttpMethod.GET) 184 | /* 185 | * { "products": [ ... ] } 186 | */ 187 | val response = JSON_MAPPER.readValue(result, classOf[Map[String,Any]]) 188 | if (response.contains("products")) 189 | response("products").asInstanceOf[List[Map[String,Any]]] 190 | 191 | else List.empty[Map[String,Any]] 192 | 193 | } 194 | 195 | def getProductsCount(params:Map[String,String]):Long = { 196 | 197 | val result = getResponse("products/count.json", params, HttpMethod.GET) 198 | /* 199 | * { "count": 1 } 200 | */ 201 | val response = JSON_MAPPER.readValue(result, classOf[Map[String,Any]]) 202 | if (response.contains("count")) response("count").asInstanceOf[Long] else -1 203 | 204 | } 205 | 206 | /************************************************************************** 207 | * 208 | * ORDER SUPPORT 209 | * 210 | *************************************************************************/ 211 | 212 | def getOrders(params:Map[String,String]):List[Map[String,Any]] = { 213 | 214 | val result = getResponse("orders.json", params, HttpMethod.GET) 215 | /* 216 | * { "orders": [ ...
] } 217 | */ 218 | val response = JSON_MAPPER.readValue(result, classOf[Map[String,Any]]) 219 | if (response.contains("orders")) 220 | response("orders").asInstanceOf[List[Map[String,Any]]] 221 | 222 | else List.empty[Map[String,Any]] 223 | 224 | } 225 | 226 | def getOrdersCount(params:Map[String,String]):Long = { 227 | 228 | val result = getResponse("orders/count.json", params, HttpMethod.GET) 229 | /* 230 | * { "count": 1 } 231 | */ 232 | val response = JSON_MAPPER.readValue(result, classOf[Map[String,Any]]) 233 | if (response.contains("count")) response("count").asInstanceOf[Long] else -1 234 | 235 | } 236 | 237 | private def getResponse(resource:String,params:Map[String,String],method:String):String = { 238 | 239 | try { 240 | 241 | var qt = webTarget.path(resource) 242 | for (entry <- params) { 243 | val (k,v) = entry 244 | qt = qt.queryParam(k,v) 245 | } 246 | 247 | qt.request(MediaType.APPLICATION_JSON_TYPE).method(method, null, classOf[String]) 248 | 249 | } catch { 250 | case e:Exception => throw new Exception("Could not process query",e) 251 | } 252 | 253 | } 254 | 255 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/connect/shop/ShopifyRDD.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.connect.shop 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Connect project 5 | * (https://github.com/skrusche63/spark-connect). 6 | * 7 | * Spark-Connect is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Connect is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Connect. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import org.apache.spark.{Partition,SparkContext,TaskContext} 22 | import org.apache.spark.TaskKilledException 23 | 24 | import org.apache.spark.rdd.RDD 25 | import org.apache.spark.util.NextIterator 26 | 27 | import scala.collection.mutable.Buffer 28 | 29 | class ShopifyPartition(idx:Int,val start:Int,val end:Int) extends Partition { 30 | override def index = idx 31 | } 32 | 33 | class ShopifyRDD( 34 | /* Reference to SparkContext */ 35 | @transient sc:SparkContext, 36 | /* resource */ 37 | resource:String, 38 | /* Request parameters */ 39 | params:Map[String,String], 40 | /* Total number of partitions */ 41 | numPartitions:Int) extends RDD[Map[String,Any]](sc,Nil) { 42 | 43 | /* 44 | * Prepare request parameters, i.e. in case of an identifier provided, 45 | * this value is used to determine the list of a dependent resource: 46 | * 47 | * E.g. 
articles have to be retrieved by provided the identifier of 48 | * the associated blog 49 | */ 50 | private val excludes = List("id") 51 | private val req_params = params.filter(kv => excludes.contains(kv._1) == false) 52 | 53 | private val rid = if (params.contains("id")) params("id").toLong else -1 54 | 55 | private def createClient:ShopifyClient = { 56 | 57 | val key = params("key") 58 | val secret = params("secret") 59 | 60 | val url = params("url") 61 | new ShopifyClient(key,secret,url) 62 | 63 | } 64 | 65 | override def getPartitions:Array[Partition] = { 66 | 67 | val client = createClient 68 | 69 | /* 70 | * The ShopifyRDD collects all items of a certain resource from the 71 | * shop platform; in order to calculate the respective partitions, 72 | * we have to determine the total number of items first 73 | */ 74 | val count = client.getResourceCount(resource,rid,req_params) 75 | client.close 76 | 77 | val pages = Math.ceil(count / 250.0).toInt 78 | 79 | val pagesPerPartition = Math.floor(pages.toDouble / numPartitions).toInt 80 | val diff = pages - numPartitions * pagesPerPartition 81 | 82 | 83 | (0 until numPartitions).map(i => { 84 | 85 | val start = 1 + i * pagesPerPartition 86 | val end = (i+1) * pagesPerPartition 87 | 88 | if (i == numPartitions - 1) 89 | new ShopifyPartition(i,start,end + diff) 90 | 91 | else 92 | new ShopifyPartition(i,start,end) 93 | 94 | }).toArray 95 | 96 | } 97 | 98 | override def compute(thePart:Partition,context:TaskContext) = new Iterator[Map[String,Any]] { 99 | 100 | private var closed = false 101 | private var finished = false 102 | 103 | context.addTaskCompletionListener{ context => closeIfNeeded() } 104 | 105 | /* 106 | * A partition is characterized by a begin & end page 107 | */ 108 | private val partition = thePart.asInstanceOf[ShopifyPartition] 109 | 110 | val start = partition.start 111 | val end = partition.end 112 | 113 | val client = createClient 114 | 115 | val resources = Buffer.empty[Map[String,Any]] 116 | (start to end).foreach(page => { 117 | resources ++= client.getResources(resource,rid,req_params ++ Map("page" -> page.toString,"limit" -> "250")) 118 | }) 119 | 120 | val dataset = resources.toIterator 121 | 122 | def hasNext:Boolean = { 123 | 124 | if (context.isInterrupted()) 125 | throw new TaskKilledException 126 | 127 | !finished && dataset.hasNext 128 | 129 | } 130 | 131 | def next:Map[String,Any] = { 132 | 133 | if (hasNext) { 134 | dataset.next 135 | 136 | } else { 137 | 138 | finished = true 139 | null.asInstanceOf[Map[String,Any]] 140 | 141 | } 142 | 143 | } 144 | 145 | def closeIfNeeded() { 146 | if (!closed) { 147 | close() 148 | closed = true 149 | } 150 | } 151 | 152 | def close() { 153 | client.close 154 | } 155 | 156 | } 157 | 158 | } --------------------------------------------------------------------------------
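The following is a minimal usage sketch for the Shopify connector above. The object name, the shop URL, the API credentials and the partition count are placeholders; only the parameter keys ("key", "secret", "url") and the resource name ("order") are taken from ShopifyRDD and ShopifyClient, so treat this as an illustration rather than a reference setup.

import org.apache.spark.{SparkConf, SparkContext}
import de.kp.spark.connect.shop.ShopifyRDD

object ShopifyReadJob {

  def main(args: Array[String]) {

    val sc = new SparkContext(new SparkConf().setAppName("ShopifyRead"))

    /* Placeholder credentials and shop URL; the keys below are the
     * parameter names evaluated by ShopifyRDD.createClient */
    val params = Map(
      "key"    -> "<api-key>",
      "secret" -> "<api-secret>",
      "url"    -> "https://<shop>.myshopify.com"
    )

    /* Retrieve all orders page-wise (250 records per page),
     * distributed over 4 partitions */
    val orders = new ShopifyRDD(sc, "order", params, 4)
    println("orders: " + orders.count)

    sc.stop()
  }

}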
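The *Source classes share the same read(config, params) contract and return an RDD[Map[String,Any]]. The sketch below illustrates the Parquet and MongoDB variants under the assumption that ConnectConfig is implemented by the application; for MongoDB it has to expose the mongo-hadoop configuration via its mongo member, as used by MongoReader. The object name, store path, field list and query are hypothetical.

import org.apache.spark.SparkContext

import de.kp.spark.connect.ConnectConfig
import de.kp.spark.connect.mongodb.MongoSource
import de.kp.spark.connect.parquet.ParquetSource

object SourceUsage {

  def read(sc: SparkContext, config: ConnectConfig) = {

    /* Parquet: 'store' is a placeholder path to the Parquet file,
     * 'fields' restricts the columns copied into the result maps */
    val parquetRecords = new ParquetSource(sc).read(config,
      Map("store" -> "/path/to/events.parquet", "fields" -> "site,user,item"))

    /* MongoDB: 'query' is a JSON query string that MongoReader passes
     * to the mongo-hadoop input format as 'mongo.input.query' */
    val mongoRecords = new MongoSource(sc).read(config,
      Map("query" -> """{ "site": 1 }"""))

    (parquetRecords, mongoRecords)
  }

}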