├── .gitignore
├── README.md
├── build.sbt
├── data
│   └── person_hbase
├── project
│   ├── build.properties
│   └── plugins.sbt
└── src
    ├── main
    │   ├── main.iml
    │   └── scala
    │       └── com
    │           └── shengli
    │               └── spark
    │                   └── hbase
    │                       ├── DefaultSource.scala
    │                       ├── HBaseRelation.scala
    │                       └── package.scala
    └── test
        ├── resources
        │   └── log4j.properties
        └── scala
            └── org
                └── apache
                    └── spark
                        └── sql
                            ├── hbase
                            │   └── HBaseSuite.scala
                            └── sql.iml
/.gitignore:
--------------------------------------------------------------------------------
1 | sbt/sbt-launch*.jar
2 | target/
3 | .idea/
4 | .idea_modules/
5 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | #Spark SQL HBase Connector
2 |
3 | ##----------------Note: This Project is Deprecated---------------
4 | ##--------------And This Project is Not Maintained---------------
5 |
6 | _Spark SQL HBase Connector_ aims to let you query HBase tables with Spark SQL.
7 | 
8 | It is built on the [Spark SQL](http://spark.apache.org/sql/) 1.2+ external data source API.
9 |
10 | > This project is for reference and learning only. It has not been formally tested and is out of date compared with the latest Spark code.
11 | > Spark 1.2 introduced the External Datasource API, which finally made it convenient to write extensions that let Spark SQL support more external data sources.
12 |
13 |
14 |
15 | ##Using SQL to Register an HBase Table
16 |
17 | ###1. Query by Spark SQL
18 | 
19 | #### Single-Column and Multi-Column Scans
20 |
21 | The recommended practice is to always put the row key as the first column in the schema.
22 | The token `:key` stands for the HBase row key.
23 | 
24 | - `sparksql_table_schema`: the schema of the table that will be registered in Spark SQL.
25 | - `hbase_table_name`: the name of the underlying HBase table.
26 | - `hbase_table_schema`: the columns to query in the HBase table __hbase_table_name__ you provided.
27 | 
28 | __Note__:
29 | `sparksql_table_schema` and `hbase_table_schema` form a one-to-one mapping: they must have the same number of columns, in the same order.
30 |
31 | ```scala
32 | import org.apache.spark.sql.SQLContext
33 | val sqlContext = new SQLContext(sc)
34 | import sqlContext._
35 |
36 | val hbaseDDL = s"""
37 | |CREATE TEMPORARY TABLE hbase_people
38 | |USING com.shengli.spark.hbase
39 | |OPTIONS (
40 | | sparksql_table_schema '(row_key string, name string, age int, job string)',
41 | | hbase_table_name 'people',
42 | | hbase_table_schema '(:key , profile:name , profile:age , career:job )'
43 | |)""".stripMargin
44 |
45 |
46 | sqlContext.sql(hbaseDDL)
47 | sql("select row_key,name,age,job from hbase_people").collect()
48 | ```
49 |
50 | Let's see the result:
51 |
52 | __select__:
53 |
54 | ```
55 | scala> sql("select row_key,name,age,job from hbase_people").collect()
56 | 14/12/27 02:24:22 INFO scheduler.DAGScheduler: Job 0 finished: collect at SparkPlan.scala:81, took 1.576415 s
57 | res1: Array[org.apache.spark.sql.Row] = Array([rowkey001,Sheng,Li,25,software engineer], [rowkey002,Li,Lei,26,teacher], [rowkey003,Jim Green,24,english teacher], [rowkey004,Lucy,23,doctor], [rowkey005,HanMeiMei,18,student])
58 | ```
59 |
60 | __functions__:
61 |
62 | __avg__:
63 |
64 | ```scala
65 | scala> sql("select avg(age) from hbase_people").collect()
66 | 14/12/27 02:26:55 INFO scheduler.TaskSchedulerImpl: Removed TaskSet 2.0, whose tasks have all completed, from pool
67 | 14/12/27 02:26:55 INFO scheduler.DAGScheduler: Job 1 finished: collect at SparkPlan.scala:81, took 0.459760 s
68 | res2: Array[org.apache.spark.sql.Row] = Array([23.2])
69 | ```
70 | __count:__
71 |
72 | ```scala
73 | scala> sql("select count(1) from hbase_people").collect()
74 | res3: Array[org.apache.spark.sql.Row] = Array([5])
75 | ```
76 |
77 | ### Support RowKey Range Scan
78 |
79 | If you only need a range of rows from an HBase table, specify `row_range` in __OPTIONS__.
80 | Here we scan from start row key `rowkey003` to end row key `rowkey005` (the end key is exclusive, as the results below show).
81 |
82 | ```
83 | val hbaseDDL = s"""
84 | |CREATE TEMPORARY TABLE hbase_people
85 | |USING com.shengli.spark.hbase
86 | |OPTIONS (
87 | | sparksql_table_schema '(row_key string, name string, age int, job string)',
88 | | hbase_table_name 'people',
89 | | hbase_table_schema '(:key , profile:name , profile:age , career:job )',
90 | | row_range 'rowkey003->rowkey005'
91 | |)""".stripMargin
92 | ```
93 |
94 | With the RowKey Range Scan in place, the query returns only:
95 | ```
96 | res2: Array[org.apache.spark.sql.Row] = Array([rowkey003,Jim Green,24,english teacher], [rowkey004,Lucy,23,doctor])
97 | ```
98 |
99 | And the count is:
100 | ```
101 | scala> sql("select count(1) from hbase_people").collect()
102 | res3: Array[org.apache.spark.sql.Row] = Array([2])
103 | ```
104 |
105 |
106 | ###2. Query by SQLContext API
107 |
108 | First, `import com.shengli.spark.hbase._`.
109 | Then use the `sqlContext.hbaseTable` _API_ to create a `SchemaRDD`.
110 | The `sqlContext.hbaseTable` _API_ takes several parameters.
111 |
112 | __Common Way__:
113 |
114 | For a plain full-table scan, just pass the three parameters below:
115 |
116 | ```scala
117 | sqlContext.hbaseTable(sparksqlTableSchema: String, hbaseTableName: String, hbaseTableSchema: String)
118 | ```
119 |
120 | ```scala
121 | scala> import com.shengli.spark.hbase._
122 | import com.shengli.spark.hbase._
123 |
124 | scala> val hbaseSchema = sqlContext.hbaseTable("(row_key string, name string, age int, job string)","people","(:key , profile:name , profile:age , career:job )")
125 | ......
126 | 14/12/27 02:30:55 INFO spark.SparkContext: Created broadcast 4 from newAPIHadoopRDD at HBaseRelation.scala:158
127 | hbaseSchema: org.apache.spark.sql.SchemaRDD =
128 | SchemaRDD[16] at RDD at SchemaRDD.scala:108
129 | == Query Plan ==
130 | == Physical Plan ==
131 | PhysicalRDD [row_key#15,name#16,age#17,job#18], MapPartitionsRDD[19] at map at HBaseRelation.scala:166
132 | ```
133 |
134 | Now that we have `hbaseSchema`, we can query it with the DSL, or register it as a temporary table and query it with SQL (see the sketch after the example below), whatever you like:
135 | ```
136 | scala> hbaseSchema.select('row_key).collect()
137 | res1: Array[org.apache.spark.sql.Row] = Array([rowkey001], [rowkey002], [rowkey003], [rowkey004], [rowkey005])
138 | ```
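
The `SchemaRDD` can also be registered as a temporary table and queried with SQL, as mentioned above. A minimal sketch continuing the session (the temp table name `hbase_people_api` is just an illustrative choice; `registerTempTable` is the standard Spark 1.2 `SchemaRDD` method):

```scala
// Register the SchemaRDD obtained from sqlContext.hbaseTable as a temporary table
hbaseSchema.registerTempTable("hbase_people_api")

// Query it with plain SQL. Filtering happens in Spark after the HBase scan,
// because this connector implements a simple TableScan relation.
sqlContext.sql("SELECT name, age FROM hbase_people_api WHERE age > 20").collect()
```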
139 |
140 | __RowKey Range Scan__:
141 |
142 | A RowKey Range Scan needs a `row_range` parameter in the format `startRow->endRow`:
143 |
144 | ```scala
145 | sqlContext.hbaseTable(sparksqlTableSchema: String, hbaseTableName: String, hbaseTableSchema: String, rowRange: String)
146 | ```
147 |
148 |
149 | ```scala
150 | scala> import com.shengli.spark.hbase._
151 | import com.shengli.spark.hbase._
152 |
153 | scala> val hbaseSchema = sqlContext.hbaseTable("(row_key string, name string, age int, job string)","people","(:key , profile:name , profile:age , career:job )","rowkey002->rowkey004")
154 | hbaseSchema: org.apache.spark.sql.SchemaRDD =
155 | SchemaRDD[9] at RDD at SchemaRDD.scala:108
156 | == Query Plan ==
157 | == Physical Plan ==
158 | PhysicalRDD [row_key#8,name#9,age#10,job#11], MapPartitionsRDD[12] at map at HBaseRelation.scala:174
159 |
160 | scala> hbaseSchema.select('row_key).collect()
161 | ......
162 | res0: Array[org.apache.spark.sql.Row] = Array([rowkey002], [rowkey003])
163 | ```
164 |
165 |
166 |
167 | ##HBase Data
168 |
169 | Let's take a look at the `HBase Table` named `people`.
170 | 
171 | The `schema` of the table `people`:
172 | 
173 | __column families__: `profile`, `career`
174 | 
175 | __columns__: `profile:name`, `profile:age`, `career:job`
176 |
177 |
178 | ```
179 | 1.8.7-p357 :024 > scan 'people'
180 | ROW COLUMN+CELL
181 | rowkey001 column=career:job, timestamp=1419517844784, value=software engineer
182 | rowkey001 column=profile:age, timestamp=1419517844665, value=25
183 | rowkey001 column=profile:name, timestamp=1419517844501, value=Sheng,Li
184 | rowkey002 column=career:job, timestamp=1419517844813, value=teacher
185 | rowkey002 column=profile:age, timestamp=1419517844687, value=26
186 | rowkey002 column=profile:name, timestamp=1419517844544, value=Li,Lei
187 | rowkey003 column=career:job, timestamp=1419517844832, value=english teacher
188 | rowkey003 column=profile:age, timestamp=1419517844704, value=24
189 | rowkey003 column=profile:name, timestamp=1419517844568, value=Jim Green
190 | rowkey004 column=career:job, timestamp=1419517844853, value=doctor
191 | rowkey004 column=profile:age, timestamp=1419517844724, value=23
192 | rowkey004 column=profile:name, timestamp=1419517844589, value=Lucy
193 | rowkey005 column=career:job, timestamp=1419517845664, value=student
194 | rowkey005 column=profile:age, timestamp=1419517844744, value=18
195 | rowkey005 column=profile:name, timestamp=1419517844606, value=HanMeiMei
196 | 5 row(s) in 0.0260 seconds
197 | ```
198 |
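Note that every cell value above is stored as a string in HBase. The connector's `Resolver` (in `HBaseRelation.scala`) decodes the raw cell bytes as characters and then parses them into the declared Spark SQL type. A simplified, self-contained sketch of that logic (the byte array here stands in for what `result.getValue` returns):

```scala
// Simplified from Resolver.resolveColumn: decode the cell bytes as chars, then parse.
val cellBytes: Array[Byte] = "25".getBytes   // e.g. the stored value of profile:age
val age: Int = cellBytes.map(_.toChar).mkString.toInt  // "25" -> 25

// The same pattern handles "string" and "long" field types.
```
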
199 | ###Note:
200 |
201 | ####Package
202 |
203 | From the project root, run `sbt package` to build the library jar.
204 |
205 | ####Dependency
206 |
207 | __1. hbase-site.xml__
208 |
209 | You need to place `hbase-site.xml` on the Spark classpath, and it must be configured correctly first.
210 | Below is my `hbase-site.xml`:
211 |
212 | ```xml
213 | <?xml version="1.0"?>
214 | <configuration>
215 |   <property>
216 |     <name>hbase.rootdir</name>
217 |     <value>file:///Users/shengli/software/data/hbase</value>
218 |   </property>
219 |   <property>
220 |     <name>hbase.cluster.distributed</name>
221 |     <value>true</value>
222 |   </property>
223 |   <property>
224 |     <name>hbase.zookeeper.property.clientPort</name>
225 |     <value>2181</value>
226 |   </property>
227 |   <property>
228 |     <name>hbase.zookeeper.quorum</name>
229 |     <value>localhost</value>
230 |   </property>
231 |   <property>
232 |     <name>hbase.defaults.for.version.skip</name>
233 |     <value>true</value>
234 |   </property>
235 | </configuration>
236 | ```
237 |
238 | You can simply do it with `ln -s ~/software/hbase/conf/hbase-site.xml ~/git_repos/spark`
239 |
240 | __2. Add hbase related libs into spark classpath__
241 |
242 | Below is how I start the Spark shell.
243 | Add the HBase-related jars to the Spark classpath first, so that Spark can access HBase through its API.
244 | 
245 | Then:
246 |
247 | ```
248 | bin/spark-shell --master spark://192.168.2.100:7077 --jars /Users/shengli/software/hbase/lib/hbase-client-0.98.8-hadoop2.jar,/Users/shengli/software/hbase/lib/hbase-server-0.98.8-hadoop2.jar,/Users/shengli/software/hbase/lib/hbase-common-0.98.8-hadoop2.jar,/Users/shengli/software/hbase/lib/hbase-protocol-0.98.8-hadoop2.jar,/Users/shengli/software/hbase/lib/protobuf-java-2.5.0.jar,/Users/shengli/software/hbase/lib/htrace-core-2.04.jar,/Users/shengli/git_repos/spark-sql-hbase/target/scala-2.10/spark-sql-hbase_2.10-0.1.jar --driver-java-options "-Dsun.io.serialization.extendedDebugInfo=true"
249 | ```
250 |
251 | __3. Class-not-found issues__
252 | 
253 | The table below maps commonly missing classes to the jars that provide them:
254 |
255 | 
256 | | Class Name | Jar Name |
257 | |------------|-----------------|
258 | | TableSplit | hbase-server.jar |
259 | | HTable | hbase-client.jar |
260 | | MasterProtos | hbase-protocol.jar |
261 | | org.cloudera.htrace.Trace | htrace-core-2.01.jar |
262 | 
263 |
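A quick way to check from the Spark shell whether a given class is actually on the classpath is a plain `Class.forName` lookup (a generic JVM technique, not something specific to this project):

```scala
// Each call throws ClassNotFoundException if the jar providing the class is missing
Class.forName("org.apache.hadoop.hbase.client.HTable")          // hbase-client.jar
Class.forName("org.apache.hadoop.hbase.mapreduce.TableSplit")   // hbase-server.jar
```
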
264 | - https://support.pivotal.io/hc/en-us/articles/203025186-Hive-Query-from-Tableau-failed-with-error-Execution-Error-return-code-2-from-org-apache-hadoop-hive-ql-exec-mr-MapRedTask
265 |
266 |
--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
1 | name := "spark-sql-hbase"
2 |
3 | version := "0.1"
4 |
5 | organization := "com.shengli"
6 |
7 | scalaVersion := "2.10.4"
8 |
9 | libraryDependencies += "org.apache.spark" %% "spark-sql" % "1.2.0" % "provided"
10 |
11 | libraryDependencies += "org.apache.hbase" % "hbase" % "0.94.14"
12 |
13 | libraryDependencies += "org.apache.zookeeper" % "zookeeper" % "3.4.5"
14 |
15 | libraryDependencies += "org.slf4j" % "slf4j-api" % "1.6.1"
16 |
17 |
18 | publishMavenStyle := true
19 |
20 | pomExtra := (
21 |   <url>https://github.com/OopsOutOfMemory/spark-sql-hbase</url>
22 |   <scm>
23 |     <url>git@github.com:OopsOutOfMemory/spark-sql-hbase.git</url>
24 |     <connection>scm:git:git@github.com:OopsOutOfMemory/spark-hbase.git</connection>
25 |   </scm>
26 |   <developers>
27 |     <developer>
28 |       <id>OopsOutOfMemory</id>
29 |       <name>Sheng,Li(盛利)</name>
30 |       <url>https://github.com/OopsOutOfMemory</url>
31 |     </developer>
32 |   </developers>)
33 |
34 | // Enable Junit testing.
35 | // libraryDependencies += "com.novocode" % "junit-interface" % "0.9" % "test"
36 |
37 | libraryDependencies += "org.scalatest" %% "scalatest" % "2.2.1" % "test"
38 |
--------------------------------------------------------------------------------
/data/person_hbase:
--------------------------------------------------------------------------------
1 | create 'people',{NAME => 'profile', VERSIONS => 1},{NAME => 'career', VERSIONS => 1}
2 |
3 | put 'people','rowkey001','profile:name','Sheng,Li'
4 | put 'people','rowkey002','profile:name','Li,Lei'
5 | put 'people','rowkey003','profile:name','Jim Green'
6 | put 'people','rowkey004','profile:name','Lucy'
7 | put 'people','rowkey005','profile:name','HanMeiMei'
8 |
9 |
10 | put 'people','rowkey001','profile:age','25'
11 | put 'people','rowkey002','profile:age','26'
12 | put 'people','rowkey003','profile:age','24'
13 | put 'people','rowkey004','profile:age','23'
14 | put 'people','rowkey005','profile:age','18'
15 |
16 |
17 | put 'people','rowkey001','career:job','software engineer'
18 | put 'people','rowkey002','career:job','teacher'
19 | put 'people','rowkey003','career:job','english teacher'
20 | put 'people','rowkey004','career:job','doctor'
21 | put 'people','rowkey005','career:job','student'
22 |
23 | get 'people','rowkey001'
--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | sbt.version=0.13.6
18 |
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | scalaVersion := "2.10.4"
2 |
3 | resolvers += Resolver.url("artifactory", url("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns)
4 |
5 | resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/"
6 |
7 | resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/"
8 |
9 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2")
10 |
11 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.2.0")
12 |
13 | addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0")
--------------------------------------------------------------------------------
/src/main/main.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/src/main/scala/com/shengli/spark/hbase/DefaultSource.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2014 Sheng,Li
3 | */
4 | package com.shengli.spark.hbase
5 |
6 | import org.apache.spark.sql.SQLContext
7 | import org.apache.spark.sql.sources.RelationProvider
8 | import com.shengli.spark.hbase
9 |
10 |
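// Spark SQL's external data source mechanism resolves the name given in a DDL's
// USING clause (here `com.shengli.spark.hbase`) to this package's `DefaultSource`
// class and calls createRelation with the key/value pairs from the OPTIONS clause.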
11 | class DefaultSource extends RelationProvider {
12 | def createRelation(sqlContext: SQLContext, parameters: Map[String, String]) = {
13 | HBaseRelation(parameters)(sqlContext)
14 | }
15 | }
--------------------------------------------------------------------------------
/src/main/scala/com/shengli/spark/hbase/HBaseRelation.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2014 Sheng, Li
3 | */
4 | package com.shengli.spark.hbase
5 |
6 | import java.io.Serializable
7 |
8 | import org.apache.hadoop.fs.Path
9 | import org.apache.spark.sql._
10 | import org.apache.spark.sql.sources.TableScan
11 | import scala.collection.immutable.{HashMap, Map}
12 | import org.apache.hadoop.hbase.client.{Result, Scan, HTable, HBaseAdmin}
13 |
14 | import org.apache.spark.sql._
15 | import org.apache.spark.rdd.NewHadoopRDD
16 | import org.apache.hadoop.hbase.HBaseConfiguration;
17 | import org.apache.hadoop.hbase.mapreduce.TableInputFormat
18 | import scala.collection.JavaConversions._
19 | import scala.collection.JavaConverters._
20 | import scala.collection.mutable.ArrayBuffer
21 |
22 |
23 | object Resolver extends Serializable {
24 |
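// Maps one field of a scan Result to a Scala value of the declared field type.
// A field named ":key" refers to the row key; anything else is interpreted as
// "columnFamily:qualifier". Note that resolveColumn assumes the cell is present
// in the Result (getValue returning null is not handled).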
25 | def resolve (hbaseField: HBaseSchemaField, result: Result ): Any = {
26 | val cfColArray = hbaseField.fieldName.split(":",-1)
27 | val cfName = cfColArray(0)
28 | val colName = cfColArray(1)
29 | var fieldRs: Any = null
30 | //resolve row key otherwise resolve column
31 | if(cfName=="" && colName=="key") {
32 | fieldRs = resolveRowKey(result, hbaseField.fieldType)
33 | } else {
34 | fieldRs = resolveColumn(result, cfName, colName,hbaseField.fieldType)
35 | }
36 | fieldRs
37 | }
38 |
39 | def resolveRowKey (result: Result, resultType: String): Any = {
40 | val rowkey = resultType match {
41 | case "string" =>
42 | result.getRow.map(_.toChar).mkString
43 | case "int" =>
44 | result .getRow.map(_.toChar).mkString.toInt
45 | case "long" =>
46 | result.getRow.map(_.toChar).mkString.toLong
47 | }
48 | rowkey
49 | }
50 |
51 | def resolveColumn (result: Result, columnFamily: String, columnName: String, resultType: String): Any = {
52 | val column = resultType match {
53 | case "string" =>
54 | result.getValue(columnFamily.getBytes,columnName.getBytes).map(_.toChar).mkString
55 | case "int" =>
56 | result.getValue(columnFamily.getBytes,columnName.getBytes).map(_.toChar).mkString.toInt
57 | case "long" =>
58 | result.getValue(columnFamily.getBytes,columnName.getBytes).map(_.toChar).mkString.toLong
59 | }
60 | column
61 | }
62 | }
63 |
64 | /**
65 | val hbaseDDL = s"""
66 | |CREATE TEMPORARY TABLE hbase_people
67 | |USING com.shengli.spark.hbase
68 | |OPTIONS (
69 | | sparksql_table_schema '(row_key string, name string, age int, job string)',
70 | | hbase_table_name 'people',
71 | | hbase_table_schema '(:key , profile:name , profile:age , career:job )'
72 | |)""".stripMargin
73 | */
74 | case class HBaseRelation(@transient val hbaseProps: Map[String,String])(@transient val sqlContext: SQLContext) extends TableScan with Serializable {
75 |
76 | val hbaseTableName = hbaseProps.getOrElse("hbase_table_name", sys.error("not valid schema"))
77 | val hbaseTableSchema = hbaseProps.getOrElse("hbase_table_schema", sys.error("not valid schema"))
78 | val registerTableSchema = hbaseProps.getOrElse("sparksql_table_schema", sys.error("not valid schema"))
79 | val rowRange = hbaseProps.getOrElse("row_range", "->")
80 | //get start row and end row
81 | val range = rowRange.split("->",-1)
82 | val startRowKey = range(0).trim
83 | val endRowKey = range(1).trim
84 |
85 | val tempHBaseFields = extractHBaseSchema(hbaseTableSchema) //temporary fields: names only, types not yet filled in
86 | val registerTableFields = extractRegisterSchema(registerTableSchema)
87 | val tempFieldRelation = tableSchemaFieldMapping(tempHBaseFields,registerTableFields)
88 |
89 | val hbaseTableFields = feedTypes(tempFieldRelation)
90 | val fieldsRelations = tableSchemaFieldMapping(hbaseTableFields,registerTableFields)
91 | val queryColumns = getQueryTargetCloumns(hbaseTableFields)
92 |
93 | def feedTypes( mapping: Map[HBaseSchemaField, RegisteredSchemaField]) : Array[HBaseSchemaField] = {
94 | val hbaseFields = mapping.map{
95 | case (k,v) =>
96 | val field = k.copy(fieldType=v.fieldType)
97 | field
98 | }
99 | hbaseFields.toArray
100 | }
101 |
102 | def isRowKey(field: HBaseSchemaField) : Boolean = {
103 | val cfColArray = field.fieldName.split(":",-1)
104 | val cfName = cfColArray(0)
105 | val colName = cfColArray(1)
106 | if(cfName=="" && colName=="key") true else false
107 | }
108 |
109 | //eg: f1:col1 f1:col2 f1:col3 f2:col1
110 | def getQueryTargetCloumns(hbaseTableFields: Array[HBaseSchemaField]): String = {
111 | var str = ArrayBuffer[String]()
112 | hbaseTableFields.foreach{ field=>
113 | if(!isRowKey(field)) {
114 | str += field.fieldName
115 | }
116 | }
117 | str.mkString(" ")
118 | }
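// Builds the Spark SQL schema (StructType): field names come from the registered
// sparksql_table_schema, and the declared types string/int/long are mapped to
// StringType/IntegerType/LongType.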
119 | lazy val schema = {
120 | val fields = hbaseTableFields.map{ field=>
121 | val name = fieldsRelations.getOrElse(field, sys.error("table schema does not match the definition.")).fieldName
122 | val relatedType = field.fieldType match {
123 | case "string" =>
124 | SchemaType(StringType,nullable = false)
125 | case "int" =>
126 | SchemaType(IntegerType,nullable = false)
127 | case "long" =>
128 | SchemaType(LongType,nullable = false)
129 | }
130 | StructField(name,relatedType.dataType,relatedType.nullable)
131 | }
132 | StructType(fields)
133 | }
134 |
135 | def tableSchemaFieldMapping( externalHBaseTable: Array[HBaseSchemaField], registerTable : Array[RegisteredSchemaField]): Map[HBaseSchemaField, RegisteredSchemaField] = {
136 | if(externalHBaseTable.length != registerTable.length) sys.error("column counts of the two schemas do not match!")
137 | val rs = externalHBaseTable.zip(registerTable)
138 | rs.toMap
139 | }
140 |
141 | /**
142 | * The Spark SQL schema that will be registered, e.g.
143 | * registerTableSchema '(rowkey string, value string, column_a string)'
144 | */
145 | def extractRegisterSchema(registerTableSchema: String) : Array[RegisteredSchemaField] = {
146 | val fieldsStr = registerTableSchema.trim.drop(1).dropRight(1)
147 | val fieldsArray = fieldsStr.split(",").map(_.trim)
148 | fieldsArray.map{ fildString =>
149 | val splitedField = fildString.split("\\s+", -1)
150 | RegisteredSchemaField(splitedField(0), splitedField(1))
151 | }
152 | }
153 |
154 | //externalTableSchema '(:key , f1:col1 )'
155 | def extractHBaseSchema(externalTableSchema: String) : Array[HBaseSchemaField] = {
156 | val fieldsStr = externalTableSchema.trim.drop(1).dropRight(1)
157 | val fieldsArray = fieldsStr.split(",").map(_.trim)
158 | fieldsArray.map(fildString => HBaseSchemaField(fildString,""))
159 | }
160 |
161 |
162 |
163 | // By making this a lazy val we keep the RDD around, amortizing the cost of locating splits.
164 | lazy val buildScan = {
165 |
166 | val hbaseConf = HBaseConfiguration.create()
167 | hbaseConf.set(TableInputFormat.INPUT_TABLE, hbaseTableName)
168 | hbaseConf.set(TableInputFormat.SCAN_COLUMNS, queryColumns);
169 | hbaseConf.set(TableInputFormat.SCAN_ROW_START, startRowKey);
170 | hbaseConf.set(TableInputFormat.SCAN_ROW_STOP, endRowKey);
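// With the default row_range of "->", both start and stop keys are empty strings,
// so TableInputFormat scans the whole table. A configured stop row is exclusive,
// e.g. 'rowkey003->rowkey005' returns rowkey003 and rowkey004 only (see README).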
171 |
172 | val hbaseRdd = sqlContext.sparkContext.newAPIHadoopRDD(
173 | hbaseConf,
174 | classOf[org.apache.hadoop.hbase.mapreduce.TableInputFormat],
175 | classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
176 | classOf[org.apache.hadoop.hbase.client.Result]
177 | )
178 |
179 |
180 | val rs = hbaseRdd.map(tuple => tuple._2).map(result => {
181 | var values = new ArrayBuffer[Any]()
182 | hbaseTableFields.foreach{field=>
183 | values += Resolver.resolve(field,result)
184 | }
185 | Row.fromSeq(values.toSeq)
186 | })
187 | rs
188 | }
189 |
190 | private case class SchemaType(dataType: DataType, nullable: Boolean)
191 | //
192 | // private def toSqlType(hbaseSchema: Schema): SchemaType = {
193 | // SchemaType(StringType,true)
194 | // }
195 | }
--------------------------------------------------------------------------------
/src/main/scala/com/shengli/spark/hbase/package.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2014 Sheng,Li
3 | */
4 | package com.shengli.spark
5 |
6 | import org.apache.spark.sql.{SQLContext, SchemaRDD}
7 | import scala.collection.immutable.HashMap
8 |
9 |
10 |
11 | package object hbase {
12 |
13 | abstract class SchemaField extends Serializable
14 |
15 | case class RegisteredSchemaField(fieldName: String, fieldType: String) extends SchemaField with Serializable
16 |
17 | case class HBaseSchemaField(fieldName: String, fieldType: String) extends SchemaField with Serializable
18 |
19 | case class Parameter(name: String)
20 |
21 |
22 | protected val SPARK_SQL_TABLE_SCHEMA = Parameter("sparksql_table_schema")
23 | protected val HBASE_TABLE_NAME = Parameter("hbase_table_name")
24 | protected val HBASE_TABLE_SCHEMA = Parameter("hbase_table_schema")
25 | protected val ROW_RANGE = Parameter("row_range")
26 | /**
27 | * Adds a method, `hbaseTable`, to SQLContext that allows reading data stored in hbase table.
28 | */
29 | implicit class HBaseContext(sqlContext: SQLContext) {
30 | def hbaseTable(sparksqlTableSchema: String, hbaseTableName: String, hbaseTableSchema: String, rowRange: String = "->") = {
31 | var params = new HashMap[String, String]
32 | params += ( SPARK_SQL_TABLE_SCHEMA.name -> sparksqlTableSchema)
33 | params += ( HBASE_TABLE_NAME.name -> hbaseTableName)
34 | params += ( HBASE_TABLE_SCHEMA.name -> hbaseTableSchema)
35 | //start row and end row, format "startRow->endRow"
36 | params += ( ROW_RANGE.name -> rowRange)
37 |
38 | sqlContext.baseRelationToSchemaRDD(HBaseRelation(params)(sqlContext))
39 | }
40 | }
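// Example (from the README):
//   sqlContext.hbaseTable(
//     "(row_key string, name string, age int, job string)",
//     "people",
//     "(:key , profile:name , profile:age , career:job )",
//     "rowkey002->rowkey004")   // rowRange is optional and defaults to "->" (full scan)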
41 |
42 | // implicit class HBaseSchemaRDD(schemaRDD: SchemaRDD) {
43 | // def saveIntoTable(tableName: String): Unit = ???
44 | // }
45 | }
46 |
--------------------------------------------------------------------------------
/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | log4j.rootLogger=DEBUG, CA, FA
2 |
3 | #Console Appender
4 | log4j.appender.CA=org.apache.log4j.ConsoleAppender
5 | log4j.appender.CA.layout=org.apache.log4j.PatternLayout
6 | log4j.appender.CA.layout.ConversionPattern=%d{HH:mm:ss.SSS} %p %c: %m%n
7 | log4j.appender.CA.Threshold = WARN
8 |
9 |
10 | #File Appender
11 | log4j.appender.FA=org.apache.log4j.FileAppender
12 | log4j.appender.FA.append=false
13 | log4j.appender.FA.file=target/unit-tests.log
14 | log4j.appender.FA.layout=org.apache.log4j.PatternLayout
15 | log4j.appender.FA.layout.ConversionPattern=%d{HH:mm:ss.SSS} %t %p %c{1}: %m%n
16 |
17 | # Set the threshold of the File Appender to INFO
18 | log4j.appender.FA.Threshold = INFO
19 |
20 | # Some packages are noisy for no good reason.
21 | log4j.additivity.parquet.hadoop.ParquetRecordReader=false
22 | log4j.logger.parquet.hadoop.ParquetRecordReader=OFF
23 |
24 | log4j.additivity.org.apache.hadoop.hive.serde2.lazy.LazyStruct=false
25 | log4j.logger.org.apache.hadoop.hive.serde2.lazy.LazyStruct=OFF
26 |
27 | log4j.additivity.org.apache.hadoop.hive.metastore.RetryingHMSHandler=false
28 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=OFF
29 |
30 | log4j.additivity.hive.ql.metadata.Hive=false
31 | log4j.logger.hive.ql.metadata.Hive=OFF
32 |
--------------------------------------------------------------------------------
/src/test/scala/org/apache/spark/sql/hbase/HBaseSuite.scala:
--------------------------------------------------------------------------------
1 | package com.shengli.spark.hbase.test
2 |
3 | import org.apache.spark.sql.test._
4 | import org.scalatest.FunSuite
5 |
6 | /* Implicits */
7 | import TestSQLContext._
8 | import com.shengli.spark.hbase._  // needed for the hbaseTable implicit
9 | class HBaseSuite extends FunSuite {
10 |
11 | test("dsl test") {
12 | val results = TestSQLContext.hbaseTable("(row_key string, name string, age int, job string)","people","(:key , profile:name , profile:age , career:job )").select('name).collect()
13 | assert(results.size === 5)
14 | }
15 |
16 | test("sql test") {
17 | sql(
18 | s"""
19 | |CREATE TEMPORARY TABLE hbase_people
20 | |USING com.shengli.spark.hbase
21 | |OPTIONS (
22 | | sparksql_table_schema '(row_key string, name string, age int, job string)',
23 | | hbase_table_name 'people',
24 | | hbase_table_schema '(:key , profile:name , profile:age , career:job )'
25 | |)""".stripMargin)
26 | 
27 | assert(sql("SELECT * FROM hbase_people").collect().size === 5)
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/src/test/scala/org/apache/spark/sql/sql.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------