├── .gitignore
├── README.md
├── build.sbt
├── data
│   └── person_hbase
├── project
│   ├── build.properties
│   └── plugins.sbt
└── src
    ├── main
    │   ├── main.iml
    │   └── scala
    │       └── com
    │           └── shengli
    │               └── spark
    │                   └── hbase
    │                       ├── DefaultSource.scala
    │                       ├── HBaseRelation.scala
    │                       └── package.scala
    └── test
        ├── resources
        │   └── log4j.properties
        └── scala
            └── org
                └── apache
                    └── spark
                        └── sql
                            ├── hbase
                            │   └── HBaseSuite.scala
                            └── sql.iml

/.gitignore:
--------------------------------------------------------------------------------
1 | sbt/sbt-launch*.jar
2 | target/
3 | .idea/
4 | .idea_modules/
5 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | #Spark SQL HBase Connector
2 |
3 | ##----------------Note: This Project is Deprecated---------------
4 | ##--------------And This Project is Not Maintained---------------
5 |
6 | _Spark SQL HBase Connector_ aims to query HBase tables using Spark SQL.
7 |
8 | It leverages the [Spark SQL](http://spark.apache.org/sql/) 1.2+ external data source API.
9 |
10 | > This project is for reference and learning only; it has not been formally tested and is out of date compared with the latest code.
11 | After Spark 1.2 was released, Spark SQL gained the External Datasource API, which makes it easy to write extensions so that Spark SQL can support more external data sources.
12 |
13 |
14 |
15 | ##Using SQL to Register an HBase Table
16 |
17 | ###1.Query by Spark SQL
18 |
19 | #### Single and Multiple Column Scans
20 |
21 | The recommended way is to always put the row key as the first column in the schema.
22 | We use `:key` to represent the row key in HBase.
23 |
24 | `sparksql_table_schema`: the schema of the table that will be registered in Spark SQL.
25 | `hbase_table_name`: the name of the actual table in HBase.
26 | `hbase_table_schema`: the columns you want to query in the HBase table __hbase_table_name__ you provided.
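For example, the schemas used throughout this README line up column by column as follows (this is the same mapping used in the DDL below):

```
sparksql_table_schema        hbase_table_schema
row_key string          <->  :key
name string             <->  profile:name
age int                 <->  profile:age
job string              <->  career:job
```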
27 | 28 | __Note__: 29 | `sparksql_table_schema` and `hbase_table_schema` should be a mapping relation, should have same column number and index. 30 | 31 | ```scala 32 | import org.apache.spark.sql.SQLContext 33 | val sqlContext = new SQLContext(sc) 34 | import sqlContext._ 35 | 36 | val hbaseDDL = s""" 37 | |CREATE TEMPORARY TABLE hbase_people 38 | |USING com.shengli.spark.hbase 39 | |OPTIONS ( 40 | | sparksql_table_schema '(row_key string, name string, age int, job string)', 41 | | hbase_table_name 'people', 42 | | hbase_table_schema '(:key , profile:name , profile:age , career:job )' 43 | |)""".stripMargin 44 | 45 | 46 | sqlContext.sql(hbaseDDL) 47 | sql("select row_key,name,age,job from hbase_people").collect() 48 | ``` 49 | 50 | Let's see the result: 51 | 52 | __select__: 53 | 54 | ``` 55 | scala> sql("select row_key,name,age,job from hbase_people").collect() 56 | 14/12/27 02:24:22 INFO scheduler.DAGScheduler: Job 0 finished: collect at SparkPlan.scala:81, took 1.576415 s 57 | res1: Array[org.apache.spark.sql.Row] = Array([rowkey001,Sheng,Li,25,software engineer], [rowkey002,Li,Lei,26,teacher], [rowkey003,Jim Green,24,english teacher], [rowkey004,Lucy,23,doctor], [rowkey005,HanMeiMei,18,student]) 58 | ``` 59 | 60 | __functions__: 61 | 62 | __avg__: 63 | 64 | ```scala 65 | scala> sql("select avg(age) from hbase_people").collect() 66 | 14/12/27 02:26:55 INFO scheduler.TaskSchedulerImpl: Removed TaskSet 2.0, whose tasks have all completed, from pool 67 | 14/12/27 02:26:55 INFO scheduler.DAGScheduler: Job 1 finished: collect at SparkPlan.scala:81, took 0.459760 s 68 | res2: Array[org.apache.spark.sql.Row] = Array([23.2]) 69 | ``` 70 | __count:__ 71 | 72 | ```scala 73 | scala> sql("select count(1) from hbase_people").collect() 74 | res3: Array[org.apache.spark.sql.Row] = Array([5]) 75 | ``` 76 | 77 | ### Support RowKey Range Scan 78 | 79 | If you need a range data from a hbase table, you can specify `row_range` in __OPTIONS__. 80 | We only need start rowkey is `rowkey003` and end rowkey is `rowkey005` 81 | 82 | ``` 83 | val hbaseDDL = s""" 84 | |CREATE TEMPORARY TABLE hbase_people 85 | |USING com.shengli.spark.hbase 86 | |OPTIONS ( 87 | | sparksql_table_schema '(row_key string, name string, age int, job string)', 88 | | hbase_table_name 'people', 89 | | hbase_table_schema '(:key , profile:name , profile:age , career:job )', 90 | | row_range 'rowkey003->rowkey005' 91 | |)""".stripMargin 92 | ``` 93 | 94 | By using RowKey Range Scan, the result of the query only return: 95 | ``` 96 | res2: Array[org.apache.spark.sql.Row] = Array([rowkey003,Jim Green,24,english teacher], [rowkey004,Lucy,23,doctor]) 97 | ``` 98 | 99 | And the count is: 100 | ``` 101 | scala> sql("select count(1) from hbase_people").collect() 102 | res3: Array[org.apache.spark.sql.Row] = Array([2]) 103 | ``` 104 | 105 | 106 | ###2. Query by SQLContext API 107 | 108 | Firstly, import `import com.shengli.spark.hbase._` 109 | Secondly, use `sqlContext.hbaseTable` _API_ to generate a `SchemaRDD` 110 | The `sqlContext.hbaseTable` _API_ need serveral parameters. 
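For reference, its definition in `package.scala` (included in this repo) is:

```scala
def hbaseTable(sparksqlTableSchema: String, hbaseTableName: String,
               hbaseTableSchema: String, rowRange: String = "->")
```

It returns a `SchemaRDD`. The `rowRange` parameter defaults to `"->"`, which the connector parses into an empty start and end row key, effectively a full table scan.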
111 | 112 | __Common Way__: 113 | 114 | If you do common Scan, you just pass three parameters below: 115 | 116 | ```scala 117 | sqlContext.hbaseTable(sparksqlTableSchema: String, hbaseTableName: String, hbaseTableSchema: String) 118 | ``` 119 | 120 | ```scala 121 | scala> import com.shengli.spark.hbase._ 122 | import com.shengli.spark.hbase._ 123 | 124 | scala> val hbaseSchema = sqlContext.hbaseTable("(row_key string, name string, age int, job string)","people","(:key , profile:name , profile:age , career:job )") 125 | ...... 126 | 14/12/27 02:30:55 INFO spark.SparkContext: Created broadcast 4 from newAPIHadoopRDD at HBaseRelation.scala:158 127 | hbaseSchema: org.apache.spark.sql.SchemaRDD = 128 | SchemaRDD[16] at RDD at SchemaRDD.scala:108 129 | == Query Plan == 130 | == Physical Plan == 131 | PhysicalRDD [row_key#15,name#16,age#17,job#18], MapPartitionsRDD[19] at map at HBaseRelation.scala:166 132 | ``` 133 | 134 | We've got a hbaseSchema so that we can query it with DSL or register it as a temp table query with sql, do whatever you like: 135 | ``` 136 | scala> hbaseSchema.select('row_key).collect() 137 | res1: Array[org.apache.spark.sql.Row] = Array([rowkey001], [rowkey002], [rowkey003], [rowkey004], [rowkey005]) 138 | ``` 139 | 140 | __RowKey Range Scan__: 141 | 142 | RowKey Range Scan need pass a `row_range` which format is `starRow->endRow` to let the connector know: 143 | 144 | ```scala 145 | sqlContext.hbaseTable(sparksqlTableSchema: String, hbaseTableName: String, hbaseTableSchema: String, rowRange: String) 146 | ``` 147 | 148 | 149 | ```scala 150 | scala> import com.shengli.spark.hbase._ 151 | import com.shengli.spark.hbase._ 152 | 153 | scala> val hbaseSchema = sqlContext.hbaseTable("(row_key string, name string, age int, job string)","people","(:key , profile:name , profile:age , career:job )","rowkey002->rowkey004") 154 | hbaseSchema: org.apache.spark.sql.SchemaRDD = 155 | SchemaRDD[9] at RDD at SchemaRDD.scala:108 156 | == Query Plan == 157 | == Physical Plan == 158 | PhysicalRDD [row_key#8,name#9,age#10,job#11], MapPartitionsRDD[12] at map at HBaseRelation.scala:174 159 | 160 | scala> hbaseSchema.select('row_key).collect() 161 | ...... 
162 | res0: Array[org.apache.spark.sql.Row] = Array([rowkey002], [rowkey003]) 163 | ``` 164 | 165 | 166 | 167 | ##HBase Data 168 | 169 | Let's take look at the `HBase Table` named `person` 170 | 171 | The `schema` of the table `person`: 172 | 173 | __column family__: `profile`, `career` 174 | 175 | __coloumns__:`profile:name`, `profile:age`,`carrer:job` 176 | 177 | 178 | ```java 179 | 1.8.7-p357 :024 > scan 'people' 180 | ROW COLUMN+CELL 181 | rowkey001 column=career:job, timestamp=1419517844784, value=software engineer 182 | rowkey001 column=profile:age, timestamp=1419517844665, value=25 183 | rowkey001 column=profile:name, timestamp=1419517844501, value=Sheng,Li 184 | rowkey002 column=career:job, timestamp=1419517844813, value=teacher 185 | rowkey002 column=profile:age, timestamp=1419517844687, value=26 186 | rowkey002 column=profile:name, timestamp=1419517844544, value=Li,Lei 187 | rowkey003 column=career:job, timestamp=1419517844832, value=english teacher 188 | rowkey003 column=profile:age, timestamp=1419517844704, value=24 189 | rowkey003 column=profile:name, timestamp=1419517844568, value=Jim Green 190 | rowkey004 column=career:job, timestamp=1419517844853, value=doctor 191 | rowkey004 column=profile:age, timestamp=1419517844724, value=23 192 | rowkey004 column=profile:name, timestamp=1419517844589, value=Lucy 193 | rowkey005 column=career:job, timestamp=1419517845664, value=student 194 | rowkey005 column=profile:age, timestamp=1419517844744, value=18 195 | rowkey005 column=profile:name, timestamp=1419517844606, value=HanMeiMei 196 | 5 row(s) in 0.0260 seconds 197 | ``` 198 | 199 | ###Note: 200 | 201 | ####Package 202 | 203 | In the root directory, use `sbt package` to package the lib. 204 | 205 | ####Dependency 206 | 207 | __1. hbase-site.xml__ 208 | 209 | You need place `hbase-site.xml` under the spark classpath. Also need to configure it correctly first. 210 | Below is my hbase-site.xml: 211 | 212 | ```scala 213 | 214 | 215 | hbase.rootdir 216 | file:///Users/shengli/software/data/hbase 217 | 218 | 219 | hbase.cluster.distributed 220 | true 221 | 222 | 223 | hbase.zookeeper.property.clientPort 224 | 2181 225 | 226 | 227 | 228 | hbase.zookeeper.quorum 229 | localhost 230 | 231 | 232 | hbase.defaults.for.version.skip 233 | true 234 | 235 | 236 | ``` 237 | 238 | You can simply do it with `ln -s ~/software/hbase/conf/hbase-site.xml ~/git_repos/spark` 239 | 240 | __2. Add hbase related libs into spark classpath__ 241 | 242 | Below is how I start the spark shell:
243 | First, add the HBase-related jars to the Spark classpath so that Spark can access HBase through its API. 244 |
245 | Then: 246 | 247 | ```scala 248 | bin/spark-shell --master spark://192.168.2.100:7077 --jars /Users/shengli/software/hbase/lib/hbase-client-0.98.8-hadoop2.jar,/Users/shengli/software/hbase/lib/hbase-server-0.98.8-hadoop2.jar,/Users/shengli/software/hbase/lib/hbase-common-0.98.8-hadoop2.jar,/Users/shengli/software/hbase/lib/hbase-protocol-0.98.8-hadoop2.jar,/Users/shengli/software/hbase/lib/protobuf-java-2.5.0.jar,/Users/shengli/software/hbase/lib/htrace-core-2.04.jar,/Users/shengli/git_repos/spark-sql-hbase/target/scala-2.10/spark-sql-hbase_2.10-0.1.jar --driver-java-options "-Dsun.io.serialization.extendedDebugInfo=true" 249 | ``` 250 | 251 | __3. class not found issues__ 252 | 253 | The below provides the mapping of the classes and their respective jars 254 | 255 | ```scala 256 | | Class Name | Jar Name | 257 | |------------|-----------------| 258 | | TableSplit | hbase-server.jar | 259 | | HTable | hbase-client.jar | 260 | | MasterProtos | hbase-protocol.jar | 261 | | org.cloudera.htrace.Trace | htrace-core-2.01.jar | 262 | ``` 263 | 264 | - https://support.pivotal.io/hc/en-us/articles/203025186-Hive-Query-from-Tableau-failed-with-error-Execution-Error-return-code-2-from-org-apache-hadoop-hive-ql-exec-mr-MapRedTask 265 | 266 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | name := "spark-sql-hbase" 2 | 3 | version := "0.1" 4 | 5 | organization := "com.shengli" 6 | 7 | scalaVersion := "2.10.4" 8 | 9 | libraryDependencies += "org.apache.spark" %% "spark-sql" % "1.2.0" % "provided" 10 | 11 | libraryDependencies += "org.apache.hbase" % "hbase" % "0.94.14" 12 | 13 | libraryDependencies += "org.apache.zookeeper" % "zookeeper" % "3.4.5" 14 | 15 | libraryDependencies += "org.slf4j" % "slf4j-api" % "1.6.1" 16 | 17 | 18 | publishMavenStyle := true 19 | 20 | pomExtra := ( 21 | https://github.com/OopsOutOfMemory/spark-sql-hbase 22 | 23 | git@github.com:OopsOutOfMemory/spark-sql-hbase.git 24 | scm:git:git@github.com:OopsOutOfMemory/spark-hbase.git 25 | 26 | 27 | 28 | OopsOutOfMemory 29 | Sheng,Li(盛利) 30 | https://github.com/OopsOutOfMemory 31 | 32 | ) 33 | 34 | // Enable Junit testing. 
35 | // libraryDependencies += "com.novocode" % "junit-interface" % "0.9" % "test" 36 | 37 | libraryDependencies += "org.scalatest" %% "scalatest" % "2.2.1" % "test" 38 | -------------------------------------------------------------------------------- /data/person_hbase: -------------------------------------------------------------------------------- 1 | create 'people',{NAME => 'profile', VERSIONS => 1},{NAME => 'career', VERSIONS => 1} 2 | 3 | put 'people','rowkey001','profile:name','Sheng,Li' 4 | put 'people','rowkey002','profile:name','Li,Lei' 5 | put 'people','rowkey003','profile:name','Jim Green' 6 | put 'people','rowkey004','profile:name','Lucy' 7 | put 'people','rowkey005','profile:name','HanMeiMei' 8 | 9 | 10 | put 'people','rowkey001','profile:age','25' 11 | put 'people','rowkey002','profile:age','26' 12 | put 'people','rowkey003','profile:age','24' 13 | put 'people','rowkey004','profile:age','23' 14 | put 'people','rowkey005','profile:age','18' 15 | 16 | 17 | put 'people','rowkey001','career:job','software engineer' 18 | put 'people','rowkey002','career:job','teacher' 19 | put 'people','rowkey003','career:job','english teacher' 20 | put 'people','rowkey004','career:job','doctor' 21 | put 'people','rowkey005','career:job','student' 22 | 23 | get 'people','rowkey001' -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | sbt.version=0.13.6 18 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | scalaVersion := "2.10.4" 2 | 3 | resolvers += Resolver.url("artifactory", url("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns) 4 | 5 | resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/" 6 | 7 | resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/" 8 | 9 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 10 | 11 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.2.0") 12 | 13 | addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0") -------------------------------------------------------------------------------- /src/main/main.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /src/main/scala/com/shengli/spark/hbase/DefaultSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2014 Sheng,Li 3 | */ 4 | package com.shengli.spark.hbase 5 | 6 | import org.apache.spark.sql.SQLContext 7 | import org.apache.spark.sql.sources.RelationProvider 8 | import com.shengli.spark.hbase 9 | 10 | 11 | class DefaultSource extends RelationProvider { 12 | def createRelation(sqlContext: SQLContext, parameters: Map[String, String]) = { 13 | HBaseRelation(parameters)(sqlContext) 14 | } 15 | } -------------------------------------------------------------------------------- /src/main/scala/com/shengli/spark/hbase/HBaseRelation.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2014 Sheng, Li 3 | */ 4 | package com.shengli.spark.hbase 5 | 6 | import java.io.Serializable 7 | 8 | import org.apache.hadoop.fs.Path 9 | import org.apache.spark.sql._ 10 | import org.apache.spark.sql.sources.TableScan 11 | import scala.collection.immutable.{HashMap, Map} 12 | import org.apache.hadoop.hbase.client.{Result, Scan, HTable, HBaseAdmin} 13 | 14 | import org.apache.spark.sql._ 15 | import org.apache.spark.rdd.NewHadoopRDD 16 | import org.apache.hadoop.hbase.HBaseConfiguration; 17 | import org.apache.hadoop.hbase.mapreduce.TableInputFormat 18 | import scala.collection.JavaConversions._ 19 | import scala.collection.JavaConverters._ 20 | import scala.collection.mutable.ArrayBuffer 21 | 22 | 23 | object Resolver extends Serializable { 24 | 25 | def resolve (hbaseField: HBaseSchemaField, result: Result ): Any = { 26 | val cfColArray = hbaseField.fieldName.split(":",-1) 27 | val cfName = cfColArray(0) 28 | val colName = cfColArray(1) 29 | var fieldRs: Any = null 30 | //resolve row key otherwise resolve column 31 | if(cfName=="" && colName=="key") { 32 | fieldRs = resolveRowKey(result, hbaseField.fieldType) 33 | } else { 34 | fieldRs = resolveColumn(result, cfName, colName,hbaseField.fieldType) 35 | } 36 | fieldRs 37 | } 38 | 39 | def resolveRowKey (result: Result, resultType: String): Any = { 40 | val rowkey = resultType match { 41 | case "string" => 42 | result.getRow.map(_.toChar).mkString 43 | case "int" => 44 | result .getRow.map(_.toChar).mkString.toInt 45 | case "long" => 46 | result.getRow.map(_.toChar).mkString.toLong 47 | } 48 | rowkey 49 | } 50 | 51 | def resolveColumn 
(result: Result, columnFamily: String, columnName: String, resultType: String): Any = { 52 | val column = resultType match { 53 | case "string" => 54 | result.getValue(columnFamily.getBytes,columnName.getBytes).map(_.toChar).mkString 55 | case "int" => 56 | result.getValue(columnFamily.getBytes,columnName.getBytes).map(_.toChar).mkString.toInt 57 | case "long" => 58 | result.getValue(columnFamily.getBytes,columnName.getBytes).map(_.toChar).mkString.toLong 59 | } 60 | column 61 | } 62 | } 63 | 64 | /** 65 | val hbaseDDL = s""" 66 | |CREATE TEMPORARY TABLE hbase_people 67 | |USING com.shengli.spark.hbase 68 | |OPTIONS ( 69 | | sparksql_table_schema '(row_key string, name string, age int, job string)', 70 | | hbase_table_name 'people', 71 | | hbase_table_schema '(:key , profile:name , profile:age , career:job )' 72 | |)""".stripMargin 73 | */ 74 | case class HBaseRelation(@transient val hbaseProps: Map[String,String])(@transient val sqlContext: SQLContext) extends TableScan with Serializable { 75 | 76 | val hbaseTableName = hbaseProps.getOrElse("hbase_table_name", sys.error("not valid schema")) 77 | val hbaseTableSchema = hbaseProps.getOrElse("hbase_table_schema", sys.error("not valid schema")) 78 | val registerTableSchema = hbaseProps.getOrElse("sparksql_table_schema", sys.error("not valid schema")) 79 | val rowRange = hbaseProps.getOrElse("row_range", "->") 80 | //get star row and end row 81 | val range = rowRange.split("->",-1) 82 | val startRowKey = range(0).trim 83 | val endRowKey = range(1).trim 84 | 85 | val tempHBaseFields = extractHBaseSchema(hbaseTableSchema) //do not use this, a temp field 86 | val registerTableFields = extractRegisterSchema(registerTableSchema) 87 | val tempFieldRelation = tableSchemaFieldMapping(tempHBaseFields,registerTableFields) 88 | 89 | val hbaseTableFields = feedTypes(tempFieldRelation) 90 | val fieldsRelations = tableSchemaFieldMapping(hbaseTableFields,registerTableFields) 91 | val queryColumns = getQueryTargetCloumns(hbaseTableFields) 92 | 93 | def feedTypes( mapping: Map[HBaseSchemaField, RegisteredSchemaField]) : Array[HBaseSchemaField] = { 94 | val hbaseFields = mapping.map{ 95 | case (k,v) => 96 | val field = k.copy(fieldType=v.fieldType) 97 | field 98 | } 99 | hbaseFields.toArray 100 | } 101 | 102 | def isRowKey(field: HBaseSchemaField) : Boolean = { 103 | val cfColArray = field.fieldName.split(":",-1) 104 | val cfName = cfColArray(0) 105 | val colName = cfColArray(1) 106 | if(cfName=="" && colName=="key") true else false 107 | } 108 | 109 | //eg: f1:col1 f1:col2 f1:col3 f2:col1 110 | def getQueryTargetCloumns(hbaseTableFields: Array[HBaseSchemaField]): String = { 111 | var str = ArrayBuffer[String]() 112 | hbaseTableFields.foreach{ field=> 113 | if(!isRowKey(field)) { 114 | str += field.fieldName 115 | } 116 | } 117 | str.mkString(" ") 118 | } 119 | lazy val schema = { 120 | val fields = hbaseTableFields.map{ field=> 121 | val name = fieldsRelations.getOrElse(field, sys.error("table schema is not match the definition.")).fieldName 122 | val relatedType = field.fieldType match { 123 | case "string" => 124 | SchemaType(StringType,nullable = false) 125 | case "int" => 126 | SchemaType(IntegerType,nullable = false) 127 | case "long" => 128 | SchemaType(LongType,nullable = false) 129 | } 130 | StructField(name,relatedType.dataType,relatedType.nullable) 131 | } 132 | StructType(fields) 133 | } 134 | 135 | def tableSchemaFieldMapping( externalHBaseTable: Array[HBaseSchemaField], registerTable : Array[RegisteredSchemaField]): Map[HBaseSchemaField, 
RegisteredSchemaField] = { 136 | if(externalHBaseTable.length != registerTable.length) sys.error("columns size not match in definition!") 137 | val rs = externalHBaseTable.zip(registerTable) 138 | rs.toMap 139 | } 140 | 141 | /** 142 | * spark sql schema will be register 143 | * registerTableSchema '(rowkey string, value string, column_a string)' 144 | */ 145 | def extractRegisterSchema(registerTableSchema: String) : Array[RegisteredSchemaField] = { 146 | val fieldsStr = registerTableSchema.trim.drop(1).dropRight(1) 147 | val fieldsArray = fieldsStr.split(",").map(_.trim) 148 | fieldsArray.map{ fildString => 149 | val splitedField = fildString.split("\\s+", -1) 150 | RegisteredSchemaField(splitedField(0), splitedField(1)) 151 | } 152 | } 153 | 154 | //externalTableSchema '(:key , f1:col1 )' 155 | def extractHBaseSchema(externalTableSchema: String) : Array[HBaseSchemaField] = { 156 | val fieldsStr = externalTableSchema.trim.drop(1).dropRight(1) 157 | val fieldsArray = fieldsStr.split(",").map(_.trim) 158 | fieldsArray.map(fildString => HBaseSchemaField(fildString,"")) 159 | } 160 | 161 | 162 | 163 | // By making this a lazy val we keep the RDD around, amortizing the cost of locating splits. 164 | lazy val buildScan = { 165 | 166 | val hbaseConf = HBaseConfiguration.create() 167 | hbaseConf.set(TableInputFormat.INPUT_TABLE, hbaseTableName) 168 | hbaseConf.set(TableInputFormat.SCAN_COLUMNS, queryColumns); 169 | hbaseConf.set(TableInputFormat.SCAN_ROW_START, startRowKey); 170 | hbaseConf.set(TableInputFormat.SCAN_ROW_STOP, endRowKey); 171 | 172 | val hbaseRdd = sqlContext.sparkContext.newAPIHadoopRDD( 173 | hbaseConf, 174 | classOf[org.apache.hadoop.hbase.mapreduce.TableInputFormat], 175 | classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable], 176 | classOf[org.apache.hadoop.hbase.client.Result] 177 | ) 178 | 179 | 180 | val rs = hbaseRdd.map(tuple => tuple._2).map(result => { 181 | var values = new ArrayBuffer[Any]() 182 | hbaseTableFields.foreach{field=> 183 | values += Resolver.resolve(field,result) 184 | } 185 | Row.fromSeq(values.toSeq) 186 | }) 187 | rs 188 | } 189 | 190 | private case class SchemaType(dataType: DataType, nullable: Boolean) 191 | // 192 | // private def toSqlType(hbaseSchema: Schema): SchemaType = { 193 | // SchemaType(StringType,true) 194 | // } 195 | } -------------------------------------------------------------------------------- /src/main/scala/com/shengli/spark/hbase/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2014 Sheng,Li 3 | */ 4 | package com.shengli.spark 5 | 6 | import org.apache.spark.sql.{SQLContext, SchemaRDD} 7 | import scala.collection.immutable.HashMap 8 | 9 | 10 | 11 | package object hbase { 12 | 13 | abstract class SchemaField extends Serializable 14 | 15 | case class RegisteredSchemaField(fieldName: String, fieldType: String) extends SchemaField with Serializable 16 | 17 | case class HBaseSchemaField(fieldName: String, fieldType: String) extends SchemaField with Serializable 18 | 19 | case class Parameter(name: String) 20 | 21 | 22 | protected val SPARK_SQL_TABLE_SCHEMA = Parameter("sparksql_table_schema") 23 | protected val HBASE_TABLE_NAME = Parameter("hbase_table_name") 24 | protected val HBASE_TABLE_SCHEMA = Parameter("hbase_table_schema") 25 | protected val ROW_RANGE = Parameter("row_range") 26 | /** 27 | * Adds a method, `hbaseTable`, to SQLContext that allows reading data stored in hbase table. 
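 * A usage sketch (it mirrors the README example and assumes the `people` table exists in HBase):
 * {{{
 *   import com.shengli.spark.hbase._
 *   val hbaseSchema = sqlContext.hbaseTable(
 *     "(row_key string, name string, age int, job string)",
 *     "people",
 *     "(:key , profile:name , profile:age , career:job )")
 *   hbaseSchema.registerTempTable("hbase_people")
 *   sqlContext.sql("select row_key, name, age, job from hbase_people").collect()
 * }}}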
28 | */ 29 | implicit class HBaseContext(sqlContext: SQLContext) { 30 | def hbaseTable(sparksqlTableSchema: String, hbaseTableName: String, hbaseTableSchema: String, rowRange: String = "->") = { 31 | var params = new HashMap[String, String] 32 | params += ( SPARK_SQL_TABLE_SCHEMA.name -> sparksqlTableSchema) 33 | params += ( HBASE_TABLE_NAME.name -> hbaseTableName) 34 | params += ( HBASE_TABLE_SCHEMA.name -> hbaseTableSchema) 35 | //get star row and end row 36 | params += ( ROW_RANGE.name -> rowRange) 37 | 38 | sqlContext.baseRelationToSchemaRDD(HBaseRelation(params)(sqlContext)) 39 | } 40 | } 41 | 42 | // implicit class HBaseSchemaRDD(schemaRDD: SchemaRDD) { 43 | // def saveIntoTable(tableName: String): Unit = ??? 44 | // } 45 | } 46 | -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=DEBUG, CA, FA 2 | 3 | #Console Appender 4 | log4j.appender.CA=org.apache.log4j.ConsoleAppender 5 | log4j.appender.CA.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.CA.layout.ConversionPattern=%d{HH:mm:ss.SSS} %p %c: %m%n 7 | log4j.appender.CA.Threshold = WARN 8 | 9 | 10 | #File Appender 11 | log4j.appender.FA=org.apache.log4j.FileAppender 12 | log4j.appender.FA.append=false 13 | log4j.appender.FA.file=target/unit-tests.log 14 | log4j.appender.FA.layout=org.apache.log4j.PatternLayout 15 | log4j.appender.FA.layout.ConversionPattern=%d{HH:mm:ss.SSS} %t %p %c{1}: %m%n 16 | 17 | # Set the logger level of File Appender to WARN 18 | log4j.appender.FA.Threshold = INFO 19 | 20 | # Some packages are noisy for no good reason. 21 | log4j.additivity.parquet.hadoop.ParquetRecordReader=false 22 | log4j.logger.parquet.hadoop.ParquetRecordReader=OFF 23 | 24 | log4j.additivity.org.apache.hadoop.hive.serde2.lazy.LazyStruct=false 25 | log4j.logger.org.apache.hadoop.hive.serde2.lazy.LazyStruct=OFF 26 | 27 | log4j.additivity.org.apache.hadoop.hive.metastore.RetryingHMSHandler=false 28 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=OFF 29 | 30 | log4j.additivity.hive.ql.metadata.Hive=false 31 | log4j.logger.hive.ql.metadata.Hive=OFF 32 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/hbase/HBaseSuite.scala: -------------------------------------------------------------------------------- 1 | package com.shengli.spark.hbase.test 2 | 3 | import org.apache.spark.sql.test._ 4 | import org.scalatest.FunSuite 5 | 6 | /* Implicits */ 7 | import TestSQLContext._ 8 | 9 | class HBaseSuite extends FunSuite { 10 | 11 | test("dsl test") { 12 | val results = TestSQLContext.hbaseTable("(row_key string, name string, age int, job string)","people","(:key string, profile:name string, profile:age int, career:job string)").select('name).count.collect() 13 | assert(results.size === 5) 14 | } 15 | 16 | test("sql test") { 17 | sql( 18 | s""" 19 | |CREATE TEMPORARY TABLE hbase_people 20 | |USING com.shengli.spark.hbase 21 | |OPTIONS ( 22 | | sparksql_table_schema '(row_key string, name string, age int, job string)', 23 | | hbase_table_name 'people', 24 | | hbase_table_schema '(:key string, profile:name string, profile:age int, career:job string)' 25 | |)""".stripMargin 26 | 27 | assert(sql("SELECT * FROM hbase_people").collect().size === 5) ) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/sql.iml: 
--------------------------------------------------------------------------------
(IntelliJ IDEA module file; its XML content was not preserved in this dump.)
--------------------------------------------------------------------------------