├── .gitignore
├── README.md
├── build.sbt
├── data
│   └── person_hbase
├── project
│   ├── build.properties
│   └── plugins.sbt
└── src
    ├── main
    │   ├── main.iml
    │   └── scala
    │       └── com
    │           └── shengli
    │               └── spark
    │                   └── hbase
    │                       ├── DefaultSource.scala
    │                       ├── HBaseRelation.scala
    │                       └── package.scala
    └── test
        ├── resources
        │   └── log4j.properties
        └── scala
            └── org
                └── apache
                    └── spark
                        └── sql
                            ├── hbase
                            │   └── HBaseSuite.scala
                            └── sql.iml

/.gitignore:
--------------------------------------------------------------------------------
1 | sbt/sbt-launch*.jar
2 | target/
3 | .idea/
4 | .idea_modules/
5 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | #Spark SQL HBase Connector
2 |
3 | ##----------------Note: This Project is Deprecated---------------
4 | ##--------------And This Project is Not Maintained---------------
5 |
6 | _Spark SQL HBase Connector_ aims to query HBase tables using Spark SQL.
7 |
8 | It leverages the [Spark SQL](http://spark.apache.org/sql/) 1.2+ external data source API.
9 |
10 | > This project is for reference and learning only; it has not been formally tested and is out of date compared with the latest code.
11 | After Spark 1.2 was released, Spark SQL gained the External Datasource API, which makes it easy to write extensions so that Spark SQL can support more external data sources.
12 |
13 |
14 |
15 | ##Using SQL to Register an HBase Table
16 |
17 | ###1.Query by Spark SQL
18 |
19 | #### Single and Multiple Column Scans
20 |
21 | The recommended way is to always put the row key as the first column in the schema.
22 | We use `:key` to represent the row key in HBase.
23 |
24 | `sparksql_table_schema`: the schema of the table that will be registered in Spark SQL.
25 | `hbase_table_name`: the name of the actual table in HBase.
26 | `hbase_table_schema`: the columns you want to query in the HBase table __hbase_table_name__ you provided.
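For example, the schemas used throughout this README line up column by column as follows (this is the same mapping used in the DDL below):

```
sparksql_table_schema        hbase_table_schema
row_key string          <->  :key
name string             <->  profile:name
age int                 <->  profile:age
job string              <->  career:job
```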
27 | 28 | __Note__: 29 | `sparksql_table_schema` and `hbase_table_schema` should be a mapping relation, should have same column number and index. 30 | 31 | ```scala 32 | import org.apache.spark.sql.SQLContext 33 | val sqlContext = new SQLContext(sc) 34 | import sqlContext._ 35 | 36 | val hbaseDDL = s""" 37 | |CREATE TEMPORARY TABLE hbase_people 38 | |USING com.shengli.spark.hbase 39 | |OPTIONS ( 40 | | sparksql_table_schema '(row_key string, name string, age int, job string)', 41 | | hbase_table_name 'people', 42 | | hbase_table_schema '(:key , profile:name , profile:age , career:job )' 43 | |)""".stripMargin 44 | 45 | 46 | sqlContext.sql(hbaseDDL) 47 | sql("select row_key,name,age,job from hbase_people").collect() 48 | ``` 49 | 50 | Let's see the result: 51 | 52 | __select__: 53 | 54 | ``` 55 | scala> sql("select row_key,name,age,job from hbase_people").collect() 56 | 14/12/27 02:24:22 INFO scheduler.DAGScheduler: Job 0 finished: collect at SparkPlan.scala:81, took 1.576415 s 57 | res1: Array[org.apache.spark.sql.Row] = Array([rowkey001,Sheng,Li,25,software engineer], [rowkey002,Li,Lei,26,teacher], [rowkey003,Jim Green,24,english teacher], [rowkey004,Lucy,23,doctor], [rowkey005,HanMeiMei,18,student]) 58 | ``` 59 | 60 | __functions__: 61 | 62 | __avg__: 63 | 64 | ```scala 65 | scala> sql("select avg(age) from hbase_people").collect() 66 | 14/12/27 02:26:55 INFO scheduler.TaskSchedulerImpl: Removed TaskSet 2.0, whose tasks have all completed, from pool 67 | 14/12/27 02:26:55 INFO scheduler.DAGScheduler: Job 1 finished: collect at SparkPlan.scala:81, took 0.459760 s 68 | res2: Array[org.apache.spark.sql.Row] = Array([23.2]) 69 | ``` 70 | __count:__ 71 | 72 | ```scala 73 | scala> sql("select count(1) from hbase_people").collect() 74 | res3: Array[org.apache.spark.sql.Row] = Array([5]) 75 | ``` 76 | 77 | ### Support RowKey Range Scan 78 | 79 | If you need a range data from a hbase table, you can specify `row_range` in __OPTIONS__. 80 | We only need start rowkey is `rowkey003` and end rowkey is `rowkey005` 81 | 82 | ``` 83 | val hbaseDDL = s""" 84 | |CREATE TEMPORARY TABLE hbase_people 85 | |USING com.shengli.spark.hbase 86 | |OPTIONS ( 87 | | sparksql_table_schema '(row_key string, name string, age int, job string)', 88 | | hbase_table_name 'people', 89 | | hbase_table_schema '(:key , profile:name , profile:age , career:job )', 90 | | row_range 'rowkey003->rowkey005' 91 | |)""".stripMargin 92 | ``` 93 | 94 | By using RowKey Range Scan, the result of the query only return: 95 | ``` 96 | res2: Array[org.apache.spark.sql.Row] = Array([rowkey003,Jim Green,24,english teacher], [rowkey004,Lucy,23,doctor]) 97 | ``` 98 | 99 | And the count is: 100 | ``` 101 | scala> sql("select count(1) from hbase_people").collect() 102 | res3: Array[org.apache.spark.sql.Row] = Array([2]) 103 | ``` 104 | 105 | 106 | ###2. Query by SQLContext API 107 | 108 | Firstly, import `import com.shengli.spark.hbase._` 109 | Secondly, use `sqlContext.hbaseTable` _API_ to generate a `SchemaRDD` 110 | The `sqlContext.hbaseTable` _API_ need serveral parameters. 
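For reference, its definition in `package.scala` (included in this repo) is:

```scala
def hbaseTable(sparksqlTableSchema: String, hbaseTableName: String,
               hbaseTableSchema: String, rowRange: String = "->")
```

It returns a `SchemaRDD`. The `rowRange` parameter defaults to `"->"`, which the connector parses into an empty start and end row key, effectively a full table scan.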
111 | 112 | __Common Way__: 113 | 114 | If you do common Scan, you just pass three parameters below: 115 | 116 | ```scala 117 | sqlContext.hbaseTable(sparksqlTableSchema: String, hbaseTableName: String, hbaseTableSchema: String) 118 | ``` 119 | 120 | ```scala 121 | scala> import com.shengli.spark.hbase._ 122 | import com.shengli.spark.hbase._ 123 | 124 | scala> val hbaseSchema = sqlContext.hbaseTable("(row_key string, name string, age int, job string)","people","(:key , profile:name , profile:age , career:job )") 125 | ...... 126 | 14/12/27 02:30:55 INFO spark.SparkContext: Created broadcast 4 from newAPIHadoopRDD at HBaseRelation.scala:158 127 | hbaseSchema: org.apache.spark.sql.SchemaRDD = 128 | SchemaRDD[16] at RDD at SchemaRDD.scala:108 129 | == Query Plan == 130 | == Physical Plan == 131 | PhysicalRDD [row_key#15,name#16,age#17,job#18], MapPartitionsRDD[19] at map at HBaseRelation.scala:166 132 | ``` 133 | 134 | We've got a hbaseSchema so that we can query it with DSL or register it as a temp table query with sql, do whatever you like: 135 | ``` 136 | scala> hbaseSchema.select('row_key).collect() 137 | res1: Array[org.apache.spark.sql.Row] = Array([rowkey001], [rowkey002], [rowkey003], [rowkey004], [rowkey005]) 138 | ``` 139 | 140 | __RowKey Range Scan__: 141 | 142 | RowKey Range Scan need pass a `row_range` which format is `starRow->endRow` to let the connector know: 143 | 144 | ```scala 145 | sqlContext.hbaseTable(sparksqlTableSchema: String, hbaseTableName: String, hbaseTableSchema: String, rowRange: String) 146 | ``` 147 | 148 | 149 | ```scala 150 | scala> import com.shengli.spark.hbase._ 151 | import com.shengli.spark.hbase._ 152 | 153 | scala> val hbaseSchema = sqlContext.hbaseTable("(row_key string, name string, age int, job string)","people","(:key , profile:name , profile:age , career:job )","rowkey002->rowkey004") 154 | hbaseSchema: org.apache.spark.sql.SchemaRDD = 155 | SchemaRDD[9] at RDD at SchemaRDD.scala:108 156 | == Query Plan == 157 | == Physical Plan == 158 | PhysicalRDD [row_key#8,name#9,age#10,job#11], MapPartitionsRDD[12] at map at HBaseRelation.scala:174 159 | 160 | scala> hbaseSchema.select('row_key).collect() 161 | ...... 
162 | res0: Array[org.apache.spark.sql.Row] = Array([rowkey002], [rowkey003]) 163 | ``` 164 | 165 | 166 | 167 | ##HBase Data 168 | 169 | Let's take look at the `HBase Table` named `person` 170 | 171 | The `schema` of the table `person`: 172 | 173 | __column family__: `profile`, `career` 174 | 175 | __coloumns__:`profile:name`, `profile:age`,`carrer:job` 176 | 177 | 178 | ```java 179 | 1.8.7-p357 :024 > scan 'people' 180 | ROW COLUMN+CELL 181 | rowkey001 column=career:job, timestamp=1419517844784, value=software engineer 182 | rowkey001 column=profile:age, timestamp=1419517844665, value=25 183 | rowkey001 column=profile:name, timestamp=1419517844501, value=Sheng,Li 184 | rowkey002 column=career:job, timestamp=1419517844813, value=teacher 185 | rowkey002 column=profile:age, timestamp=1419517844687, value=26 186 | rowkey002 column=profile:name, timestamp=1419517844544, value=Li,Lei 187 | rowkey003 column=career:job, timestamp=1419517844832, value=english teacher 188 | rowkey003 column=profile:age, timestamp=1419517844704, value=24 189 | rowkey003 column=profile:name, timestamp=1419517844568, value=Jim Green 190 | rowkey004 column=career:job, timestamp=1419517844853, value=doctor 191 | rowkey004 column=profile:age, timestamp=1419517844724, value=23 192 | rowkey004 column=profile:name, timestamp=1419517844589, value=Lucy 193 | rowkey005 column=career:job, timestamp=1419517845664, value=student 194 | rowkey005 column=profile:age, timestamp=1419517844744, value=18 195 | rowkey005 column=profile:name, timestamp=1419517844606, value=HanMeiMei 196 | 5 row(s) in 0.0260 seconds 197 | ``` 198 | 199 | ###Note: 200 | 201 | ####Package 202 | 203 | In the root directory, use `sbt package` to package the lib. 204 | 205 | ####Dependency 206 | 207 | __1. hbase-site.xml__ 208 | 209 | You need place `hbase-site.xml` under the spark classpath. Also need to configure it correctly first. 210 | Below is my hbase-site.xml: 211 | 212 | ```scala 213 | 214 | 215 | hbase.rootdir 216 | file:///Users/shengli/software/data/hbase 217 | 218 | 219 | hbase.cluster.distributed 220 | true 221 | 222 | 223 | hbase.zookeeper.property.clientPort 224 | 2181 225 | 226 | 227 | 228 | hbase.zookeeper.quorum 229 | localhost 230 | 231 | 232 | hbase.defaults.for.version.skip 233 | true 234 | 235 | 236 | ``` 237 | 238 | You can simply do it with `ln -s ~/software/hbase/conf/hbase-site.xml ~/git_repos/spark` 239 | 240 | __2. Add hbase related libs into spark classpath__ 241 | 242 | Below is how I start the spark shell:
243 | First, add the HBase-related jars to the Spark classpath so that Spark can access HBase through its API. 244 |
245 | Then: 246 | 247 | ```scala 248 | bin/spark-shell --master spark://192.168.2.100:7077 --jars /Users/shengli/software/hbase/lib/hbase-client-0.98.8-hadoop2.jar,/Users/shengli/software/hbase/lib/hbase-server-0.98.8-hadoop2.jar,/Users/shengli/software/hbase/lib/hbase-common-0.98.8-hadoop2.jar,/Users/shengli/software/hbase/lib/hbase-protocol-0.98.8-hadoop2.jar,/Users/shengli/software/hbase/lib/protobuf-java-2.5.0.jar,/Users/shengli/software/hbase/lib/htrace-core-2.04.jar,/Users/shengli/git_repos/spark-sql-hbase/target/scala-2.10/spark-sql-hbase_2.10-0.1.jar --driver-java-options "-Dsun.io.serialization.extendedDebugInfo=true" 249 | ``` 250 | 251 | __3. class not found issues__ 252 | 253 | The below provides the mapping of the classes and their respective jars 254 | 255 | ```scala 256 | | Class Name | Jar Name | 257 | |------------|-----------------| 258 | | TableSplit | hbase-server.jar | 259 | | HTable | hbase-client.jar | 260 | | MasterProtos | hbase-protocol.jar | 261 | | org.cloudera.htrace.Trace | htrace-core-2.01.jar | 262 | ``` 263 | 264 | - https://support.pivotal.io/hc/en-us/articles/203025186-Hive-Query-from-Tableau-failed-with-error-Execution-Error-return-code-2-from-org-apache-hadoop-hive-ql-exec-mr-MapRedTask 265 | 266 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | name := "spark-sql-hbase" 2 | 3 | version := "0.1" 4 | 5 | organization := "com.shengli" 6 | 7 | scalaVersion := "2.10.4" 8 | 9 | libraryDependencies += "org.apache.spark" %% "spark-sql" % "1.2.0" % "provided" 10 | 11 | libraryDependencies += "org.apache.hbase" % "hbase" % "0.94.14" 12 | 13 | libraryDependencies += "org.apache.zookeeper" % "zookeeper" % "3.4.5" 14 | 15 | libraryDependencies += "org.slf4j" % "slf4j-api" % "1.6.1" 16 | 17 | 18 | publishMavenStyle := true 19 | 20 | pomExtra := ( 21 | https://github.com/OopsOutOfMemory/spark-sql-hbase 22 | 23 | git@github.com:OopsOutOfMemory/spark-sql-hbase.git 24 | scm:git:git@github.com:OopsOutOfMemory/spark-hbase.git 25 | 26 | 27 | 28 | OopsOutOfMemory 29 | Sheng,Li(盛利) 30 | https://github.com/OopsOutOfMemory 31 | 32 | ) 33 | 34 | // Enable Junit testing. 
35 | // libraryDependencies += "com.novocode" % "junit-interface" % "0.9" % "test" 36 | 37 | libraryDependencies += "org.scalatest" %% "scalatest" % "2.2.1" % "test" 38 | -------------------------------------------------------------------------------- /data/person_hbase: -------------------------------------------------------------------------------- 1 | create 'people',{NAME => 'profile', VERSIONS => 1},{NAME => 'career', VERSIONS => 1} 2 | 3 | put 'people','rowkey001','profile:name','Sheng,Li' 4 | put 'people','rowkey002','profile:name','Li,Lei' 5 | put 'people','rowkey003','profile:name','Jim Green' 6 | put 'people','rowkey004','profile:name','Lucy' 7 | put 'people','rowkey005','profile:name','HanMeiMei' 8 | 9 | 10 | put 'people','rowkey001','profile:age','25' 11 | put 'people','rowkey002','profile:age','26' 12 | put 'people','rowkey003','profile:age','24' 13 | put 'people','rowkey004','profile:age','23' 14 | put 'people','rowkey005','profile:age','18' 15 | 16 | 17 | put 'people','rowkey001','career:job','software engineer' 18 | put 'people','rowkey002','career:job','teacher' 19 | put 'people','rowkey003','career:job','english teacher' 20 | put 'people','rowkey004','career:job','doctor' 21 | put 'people','rowkey005','career:job','student' 22 | 23 | get 'people','rowkey001' -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | sbt.version=0.13.6 18 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | scalaVersion := "2.10.4" 2 | 3 | resolvers += Resolver.url("artifactory", url("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns) 4 | 5 | resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/" 6 | 7 | resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/" 8 | 9 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 10 | 11 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.2.0") 12 | 13 | addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0") -------------------------------------------------------------------------------- /src/main/main.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /src/main/scala/com/shengli/spark/hbase/DefaultSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2014 Sheng,Li 3 | */ 4 | package com.shengli.spark.hbase 5 | 6 | import org.apache.spark.sql.SQLContext 7 | import org.apache.spark.sql.sources.RelationProvider 8 | import com.shengli.spark.hbase 9 | 10 | 11 | class DefaultSource extends RelationProvider { 12 | def createRelation(sqlContext: SQLContext, parameters: Map[String, String]) = { 13 | HBaseRelation(parameters)(sqlContext) 14 | } 15 | } -------------------------------------------------------------------------------- /src/main/scala/com/shengli/spark/hbase/HBaseRelation.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2014 Sheng, Li 3 | */ 4 | package com.shengli.spark.hbase 5 | 6 | import java.io.Serializable 7 | 8 | import org.apache.hadoop.fs.Path 9 | import org.apache.spark.sql._ 10 | import org.apache.spark.sql.sources.TableScan 11 | import scala.collection.immutable.{HashMap, Map} 12 | import org.apache.hadoop.hbase.client.{Result, Scan, HTable, HBaseAdmin} 13 | 14 | import org.apache.spark.sql._ 15 | import org.apache.spark.rdd.NewHadoopRDD 16 | import org.apache.hadoop.hbase.HBaseConfiguration; 17 | import org.apache.hadoop.hbase.mapreduce.TableInputFormat 18 | import scala.collection.JavaConversions._ 19 | import scala.collection.JavaConverters._ 20 | import scala.collection.mutable.ArrayBuffer 21 | 22 | 23 | object Resolver extends Serializable { 24 | 25 | def resolve (hbaseField: HBaseSchemaField, result: Result ): Any = { 26 | val cfColArray = hbaseField.fieldName.split(":",-1) 27 | val cfName = cfColArray(0) 28 | val colName = cfColArray(1) 29 | var fieldRs: Any = null 30 | //resolve row key otherwise resolve column 31 | if(cfName=="" && colName=="key") { 32 | fieldRs = resolveRowKey(result, hbaseField.fieldType) 33 | } else { 34 | fieldRs = resolveColumn(result, cfName, colName,hbaseField.fieldType) 35 | } 36 | fieldRs 37 | } 38 | 39 | def resolveRowKey (result: Result, resultType: String): Any = { 40 | val rowkey = resultType match { 41 | case "string" => 42 | result.getRow.map(_.toChar).mkString 43 | case "int" => 44 | result .getRow.map(_.toChar).mkString.toInt 45 | case "long" => 46 | result.getRow.map(_.toChar).mkString.toLong 47 | } 48 | rowkey 49 | } 50 | 51 | def resolveColumn 
(result: Result, columnFamily: String, columnName: String, resultType: String): Any = { 52 | val column = resultType match { 53 | case "string" => 54 | result.getValue(columnFamily.getBytes,columnName.getBytes).map(_.toChar).mkString 55 | case "int" => 56 | result.getValue(columnFamily.getBytes,columnName.getBytes).map(_.toChar).mkString.toInt 57 | case "long" => 58 | result.getValue(columnFamily.getBytes,columnName.getBytes).map(_.toChar).mkString.toLong 59 | } 60 | column 61 | } 62 | } 63 | 64 | /** 65 | val hbaseDDL = s""" 66 | |CREATE TEMPORARY TABLE hbase_people 67 | |USING com.shengli.spark.hbase 68 | |OPTIONS ( 69 | | sparksql_table_schema '(row_key string, name string, age int, job string)', 70 | | hbase_table_name 'people', 71 | | hbase_table_schema '(:key , profile:name , profile:age , career:job )' 72 | |)""".stripMargin 73 | */ 74 | case class HBaseRelation(@transient val hbaseProps: Map[String,String])(@transient val sqlContext: SQLContext) extends TableScan with Serializable { 75 | 76 | val hbaseTableName = hbaseProps.getOrElse("hbase_table_name", sys.error("not valid schema")) 77 | val hbaseTableSchema = hbaseProps.getOrElse("hbase_table_schema", sys.error("not valid schema")) 78 | val registerTableSchema = hbaseProps.getOrElse("sparksql_table_schema", sys.error("not valid schema")) 79 | val rowRange = hbaseProps.getOrElse("row_range", "->") 80 | //get star row and end row 81 | val range = rowRange.split("->",-1) 82 | val startRowKey = range(0).trim 83 | val endRowKey = range(1).trim 84 | 85 | val tempHBaseFields = extractHBaseSchema(hbaseTableSchema) //do not use this, a temp field 86 | val registerTableFields = extractRegisterSchema(registerTableSchema) 87 | val tempFieldRelation = tableSchemaFieldMapping(tempHBaseFields,registerTableFields) 88 | 89 | val hbaseTableFields = feedTypes(tempFieldRelation) 90 | val fieldsRelations = tableSchemaFieldMapping(hbaseTableFields,registerTableFields) 91 | val queryColumns = getQueryTargetCloumns(hbaseTableFields) 92 | 93 | def feedTypes( mapping: Map[HBaseSchemaField, RegisteredSchemaField]) : Array[HBaseSchemaField] = { 94 | val hbaseFields = mapping.map{ 95 | case (k,v) => 96 | val field = k.copy(fieldType=v.fieldType) 97 | field 98 | } 99 | hbaseFields.toArray 100 | } 101 | 102 | def isRowKey(field: HBaseSchemaField) : Boolean = { 103 | val cfColArray = field.fieldName.split(":",-1) 104 | val cfName = cfColArray(0) 105 | val colName = cfColArray(1) 106 | if(cfName=="" && colName=="key") true else false 107 | } 108 | 109 | //eg: f1:col1 f1:col2 f1:col3 f2:col1 110 | def getQueryTargetCloumns(hbaseTableFields: Array[HBaseSchemaField]): String = { 111 | var str = ArrayBuffer[String]() 112 | hbaseTableFields.foreach{ field=> 113 | if(!isRowKey(field)) { 114 | str += field.fieldName 115 | } 116 | } 117 | str.mkString(" ") 118 | } 119 | lazy val schema = { 120 | val fields = hbaseTableFields.map{ field=> 121 | val name = fieldsRelations.getOrElse(field, sys.error("table schema is not match the definition.")).fieldName 122 | val relatedType = field.fieldType match { 123 | case "string" => 124 | SchemaType(StringType,nullable = false) 125 | case "int" => 126 | SchemaType(IntegerType,nullable = false) 127 | case "long" => 128 | SchemaType(LongType,nullable = false) 129 | } 130 | StructField(name,relatedType.dataType,relatedType.nullable) 131 | } 132 | StructType(fields) 133 | } 134 | 135 | def tableSchemaFieldMapping( externalHBaseTable: Array[HBaseSchemaField], registerTable : Array[RegisteredSchemaField]): Map[HBaseSchemaField, 
RegisteredSchemaField] = { 136 | if(externalHBaseTable.length != registerTable.length) sys.error("columns size not match in definition!") 137 | val rs = externalHBaseTable.zip(registerTable) 138 | rs.toMap 139 | } 140 | 141 | /** 142 | * spark sql schema will be register 143 | * registerTableSchema '(rowkey string, value string, column_a string)' 144 | */ 145 | def extractRegisterSchema(registerTableSchema: String) : Array[RegisteredSchemaField] = { 146 | val fieldsStr = registerTableSchema.trim.drop(1).dropRight(1) 147 | val fieldsArray = fieldsStr.split(",").map(_.trim) 148 | fieldsArray.map{ fildString => 149 | val splitedField = fildString.split("\\s+", -1) 150 | RegisteredSchemaField(splitedField(0), splitedField(1)) 151 | } 152 | } 153 | 154 | //externalTableSchema '(:key , f1:col1 )' 155 | def extractHBaseSchema(externalTableSchema: String) : Array[HBaseSchemaField] = { 156 | val fieldsStr = externalTableSchema.trim.drop(1).dropRight(1) 157 | val fieldsArray = fieldsStr.split(",").map(_.trim) 158 | fieldsArray.map(fildString => HBaseSchemaField(fildString,"")) 159 | } 160 | 161 | 162 | 163 | // By making this a lazy val we keep the RDD around, amortizing the cost of locating splits. 164 | lazy val buildScan = { 165 | 166 | val hbaseConf = HBaseConfiguration.create() 167 | hbaseConf.set(TableInputFormat.INPUT_TABLE, hbaseTableName) 168 | hbaseConf.set(TableInputFormat.SCAN_COLUMNS, queryColumns); 169 | hbaseConf.set(TableInputFormat.SCAN_ROW_START, startRowKey); 170 | hbaseConf.set(TableInputFormat.SCAN_ROW_STOP, endRowKey); 171 | 172 | val hbaseRdd = sqlContext.sparkContext.newAPIHadoopRDD( 173 | hbaseConf, 174 | classOf[org.apache.hadoop.hbase.mapreduce.TableInputFormat], 175 | classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable], 176 | classOf[org.apache.hadoop.hbase.client.Result] 177 | ) 178 | 179 | 180 | val rs = hbaseRdd.map(tuple => tuple._2).map(result => { 181 | var values = new ArrayBuffer[Any]() 182 | hbaseTableFields.foreach{field=> 183 | values += Resolver.resolve(field,result) 184 | } 185 | Row.fromSeq(values.toSeq) 186 | }) 187 | rs 188 | } 189 | 190 | private case class SchemaType(dataType: DataType, nullable: Boolean) 191 | // 192 | // private def toSqlType(hbaseSchema: Schema): SchemaType = { 193 | // SchemaType(StringType,true) 194 | // } 195 | } -------------------------------------------------------------------------------- /src/main/scala/com/shengli/spark/hbase/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2014 Sheng,Li 3 | */ 4 | package com.shengli.spark 5 | 6 | import org.apache.spark.sql.{SQLContext, SchemaRDD} 7 | import scala.collection.immutable.HashMap 8 | 9 | 10 | 11 | package object hbase { 12 | 13 | abstract class SchemaField extends Serializable 14 | 15 | case class RegisteredSchemaField(fieldName: String, fieldType: String) extends SchemaField with Serializable 16 | 17 | case class HBaseSchemaField(fieldName: String, fieldType: String) extends SchemaField with Serializable 18 | 19 | case class Parameter(name: String) 20 | 21 | 22 | protected val SPARK_SQL_TABLE_SCHEMA = Parameter("sparksql_table_schema") 23 | protected val HBASE_TABLE_NAME = Parameter("hbase_table_name") 24 | protected val HBASE_TABLE_SCHEMA = Parameter("hbase_table_schema") 25 | protected val ROW_RANGE = Parameter("row_range") 26 | /** 27 | * Adds a method, `hbaseTable`, to SQLContext that allows reading data stored in hbase table. 
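 * A usage sketch (it mirrors the README example and assumes the `people` table exists in HBase):
 * {{{
 *   import com.shengli.spark.hbase._
 *   val hbaseSchema = sqlContext.hbaseTable(
 *     "(row_key string, name string, age int, job string)",
 *     "people",
 *     "(:key , profile:name , profile:age , career:job )")
 *   hbaseSchema.registerTempTable("hbase_people")
 *   sqlContext.sql("select row_key, name, age, job from hbase_people").collect()
 * }}}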
28 | */ 29 | implicit class HBaseContext(sqlContext: SQLContext) { 30 | def hbaseTable(sparksqlTableSchema: String, hbaseTableName: String, hbaseTableSchema: String, rowRange: String = "->") = { 31 | var params = new HashMap[String, String] 32 | params += ( SPARK_SQL_TABLE_SCHEMA.name -> sparksqlTableSchema) 33 | params += ( HBASE_TABLE_NAME.name -> hbaseTableName) 34 | params += ( HBASE_TABLE_SCHEMA.name -> hbaseTableSchema) 35 | //get star row and end row 36 | params += ( ROW_RANGE.name -> rowRange) 37 | 38 | sqlContext.baseRelationToSchemaRDD(HBaseRelation(params)(sqlContext)) 39 | } 40 | } 41 | 42 | // implicit class HBaseSchemaRDD(schemaRDD: SchemaRDD) { 43 | // def saveIntoTable(tableName: String): Unit = ??? 44 | // } 45 | } 46 | -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=DEBUG, CA, FA 2 | 3 | #Console Appender 4 | log4j.appender.CA=org.apache.log4j.ConsoleAppender 5 | log4j.appender.CA.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.CA.layout.ConversionPattern=%d{HH:mm:ss.SSS} %p %c: %m%n 7 | log4j.appender.CA.Threshold = WARN 8 | 9 | 10 | #File Appender 11 | log4j.appender.FA=org.apache.log4j.FileAppender 12 | log4j.appender.FA.append=false 13 | log4j.appender.FA.file=target/unit-tests.log 14 | log4j.appender.FA.layout=org.apache.log4j.PatternLayout 15 | log4j.appender.FA.layout.ConversionPattern=%d{HH:mm:ss.SSS} %t %p %c{1}: %m%n 16 | 17 | # Set the logger level of File Appender to WARN 18 | log4j.appender.FA.Threshold = INFO 19 | 20 | # Some packages are noisy for no good reason. 21 | log4j.additivity.parquet.hadoop.ParquetRecordReader=false 22 | log4j.logger.parquet.hadoop.ParquetRecordReader=OFF 23 | 24 | log4j.additivity.org.apache.hadoop.hive.serde2.lazy.LazyStruct=false 25 | log4j.logger.org.apache.hadoop.hive.serde2.lazy.LazyStruct=OFF 26 | 27 | log4j.additivity.org.apache.hadoop.hive.metastore.RetryingHMSHandler=false 28 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=OFF 29 | 30 | log4j.additivity.hive.ql.metadata.Hive=false 31 | log4j.logger.hive.ql.metadata.Hive=OFF 32 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/hbase/HBaseSuite.scala: -------------------------------------------------------------------------------- 1 | package com.shengli.spark.hbase.test 2 | 3 | import org.apache.spark.sql.test._ 4 | import org.scalatest.FunSuite 5 | 6 | /* Implicits */ 7 | import TestSQLContext._ 8 | 9 | class HBaseSuite extends FunSuite { 10 | 11 | test("dsl test") { 12 | val results = TestSQLContext.hbaseTable("(row_key string, name string, age int, job string)","people","(:key string, profile:name string, profile:age int, career:job string)").select('name).count.collect() 13 | assert(results.size === 5) 14 | } 15 | 16 | test("sql test") { 17 | sql( 18 | s""" 19 | |CREATE TEMPORARY TABLE hbase_people 20 | |USING com.shengli.spark.hbase 21 | |OPTIONS ( 22 | | sparksql_table_schema '(row_key string, name string, age int, job string)', 23 | | hbase_table_name 'people', 24 | | hbase_table_schema '(:key string, profile:name string, profile:age int, career:job string)' 25 | |)""".stripMargin 26 | 27 | assert(sql("SELECT * FROM hbase_people").collect().size === 5) ) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/sql.iml: 
--------------------------------------------------------------------------------
(IntelliJ IDEA module file; its XML content was not preserved in this dump.)
--------------------------------------------------------------------------------