├── .gitignore ├── LICENSE ├── README.md ├── bin ├── compute-classpath.cmd ├── compute-classpath.sh └── hbase-sql ├── doc └── SparkSQLOnHBase_v2.1.docx ├── pom.xml ├── python └── pyspark │ ├── __init__.py │ ├── java_gateway.py │ └── sql.py └── src ├── main └── scala │ └── org │ └── apache │ └── spark │ └── sql │ └── hbase │ ├── HBaseCatalog.scala │ ├── HBaseCriticalPoint.scala │ ├── HBasePartition.scala │ ├── HBasePartitioner.scala │ ├── HBaseRelation.scala │ ├── HBaseSQLCliDriver.scala │ ├── HBaseSQLConf.scala │ ├── HBaseSQLContext.scala │ ├── HBaseSQLParser.scala │ ├── HBaseSQLReaderRDD.scala │ ├── HBaseShuffledRDD.scala │ ├── HadoopReader.scala │ ├── IndexMappable.scala │ ├── ScanPredClassifier.scala │ ├── catalyst │ ├── NotPusher.scala │ └── expressions │ │ └── PartialPredicateOperations.scala │ ├── execution │ ├── HBaseSQLTableScan.scala │ ├── HBaseStrategies.scala │ └── hbaseCommands.scala │ ├── package.scala │ ├── types │ ├── HBaseBytesType.scala │ ├── PartialOrderingDataType.scala │ └── RangeType.scala │ └── util │ ├── BytesUtils.scala │ ├── DataTypeUtils.scala │ ├── HBaseKVHelper.scala │ └── Util.scala └── test ├── java └── org │ └── apache │ └── spark │ └── sql │ └── hbase │ └── api │ └── java │ └── JavaAPISuite.java ├── resources ├── joinTable1.txt ├── joinTable2.txt ├── joinTable3.txt ├── joinTable4.txt ├── loadData.txt ├── loadNullableData.txt ├── log4j.properties ├── onecoljoin1.txt ├── onecoljoin2.txt ├── splitLoadData.txt ├── splitLoadData1.txt ├── store_sales.txt └── testTable.txt └── scala └── org └── apache └── spark └── sql └── hbase ├── AggregateQueriesSuite.scala ├── BasicQueriesSuite.scala ├── BulkLoadIntoTableSuite.scala ├── BytesUtilsSuite.scala ├── CatalogTestSuite.scala ├── CriticalPointsTestSuite.scala ├── HBaseAdvancedSQLQuerySuite.scala ├── HBaseBasicOperationSuite.scala ├── HBaseInsertTableSuite.scala ├── HBaseIntegrationTestBase.scala ├── HBasePartitionerSuite.scala ├── HBaseSQLQuerySuite.scala ├── HBaseSplitTestData.scala ├── HBaseTestData.scala ├── TestData.scala ├── TestHbase.scala └── TpcMiniTestSuite.scala /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.#* 3 | *#*# 4 | *.swp 5 | *.ipr 6 | *.iml 7 | *.iws 8 | *.pyc 9 | .idea/ 10 | .idea_modules/ 11 | build/*.jar 12 | .settings 13 | .cache 14 | cache 15 | .generated-mima* 16 | work/ 17 | out/ 18 | .DS_Store 19 | third_party/libmesos.so 20 | third_party/libmesos.dylib 21 | build/apache-maven* 22 | build/zinc* 23 | build/scala* 24 | conf/java-opts 25 | conf/*.sh 26 | conf/*.cmd 27 | conf/*.properties 28 | conf/*.conf 29 | conf/*.xml 30 | conf/slaves 31 | docs/_site 32 | docs/api 33 | target/ 34 | reports/ 35 | .project 36 | .classpath 37 | .scala_dependencies 38 | lib_managed/ 39 | src_managed/ 40 | project/boot/ 41 | project/plugins/project/build.properties 42 | project/build/target/ 43 | project/plugins/target/ 44 | project/plugins/lib_managed/ 45 | project/plugins/src_managed/ 46 | logs/ 47 | log/ 48 | spark-tests.log 49 | streaming-tests.log 50 | dependency-reduced-pom.xml 51 | .ensime 52 | .ensime_lucene 53 | checkpoint 54 | derby.log 55 | dist/ 56 | dev/create-release/*txt 57 | dev/create-release/*final 58 | spark-*-bin-*.tgz 59 | unit-tests.log 60 | /lib/ 61 | ec2/lib/ 62 | rat-results.txt 63 | scalastyle.txt 64 | scalastyle-output.xml 65 | 66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spark HBase 2 | 3 | 
Apache HBase is a distributed key-value store on HDFS. It is modeled after Google's Bigtable and provides APIs to query the data. The data is organized, partitioned and distributed by its "row keys". Within each partition, the data is further physically partitioned by "column families" that specify collections of "columns" of data. The data model is designed for wide, sparse tables whose columns are dynamic. 4 | 5 | Although HBase is a very useful big data store, its access mechanisms are primitive: client-side APIs, Map/Reduce interfaces and interactive shells. SQL access to HBase data is available through Map/Reduce or interface mechanisms such as Apache Hive and Impala, or through some "native" SQL technologies like Apache Phoenix. While the former are usually cheaper to implement and use, their latency and efficiency rarely compare favorably with the latter, and they are often suitable only for offline analysis. The latter category, in contrast, generally performs better and qualifies as online engines; these typically sit on top of purpose-built execution engines. 6 | 7 | Currently Spark supports queries against HBase data through HBase's Map/Reduce interface (i.e., TableInputFormat). Spark SQL supports Hive data and therefore, in theory, can access HBase data out of the box through HBase's Map/Reduce interface; this approach falls into the first category of "SQL on HBase" technologies. 8 | 9 | We believe that, as a unified big data processing engine, Spark is in a good position to provide better HBase support.
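To give a quick feel for what Spark HBase provides, here is a minimal Scala sketch of creating and querying an HBase-backed table through `HBaseSQLContext`. The SQL syntax follows the CLI HELP text in this repository; the table, column and column-family names are purely illustrative.
```
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hbase.HBaseSQLContext

val sc = new SparkContext(new SparkConf().setAppName("SparkHBaseExample"))
val hbaseCtx = new HBaseSQLContext(sc)

// Map a logical SQL table onto an HBase table: the PRIMARY KEY columns are
// encoded into the HBase row key, the remaining columns map to
// column-family qualifiers.
hbaseCtx.sql(
  """CREATE TABLE teacher (grade INT, class INT, subject STRING, name STRING, age INT,
    |PRIMARY KEY(grade, class, subject))
    |MAPPED BY (hbase_teacher, COLS=[name=cf.name, age=cf.age])""".stripMargin)

// Query it with ordinary SQL; results come back as Spark SQL rows.
hbaseCtx.sql("SELECT name, age FROM teacher WHERE grade = 1 AND class = 2")
  .collect()
  .foreach(println)
```
The same statements can also be entered interactively through `./bin/hbase-sql` (see the Interactive Scala Shell section below).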
10 | 11 | ## Online Documentation 12 | 13 | Online documentation can be found on the [Spark JIRA page](https://issues.apache.org/jira/browse/SPARK-3880). 14 | 15 | ## Building Spark HBase 16 | 17 | Spark HBase is built using [Apache Maven](http://maven.apache.org/). 18 | 19 | The work of separating the spark-hbase sub-project from the Spark project is ongoing. 20 | Some manual steps are required to build the new stand-alone spark-hbase project until this task 21 | is complete. 22 | 23 | In an effort to avoid confusion over the terms spark, spark-hbase, and hbase, these two projects 24 | are referred to here as 25 | 26 | "Spark-Huawei/spark": https://github.com/Huawei-Spark/spark.git (spark + all sub-modules) 27 | "Spark-Huawei/hbase": https://github.com/Huawei-Spark/hbase.git (standalone spark-hbase project) 28 | 29 | In short, you will need to manually delete the spark/sql/hbase module from the Spark-Huawei/spark 30 | source tree along with all references to it in the Spark build infrastructure, build and install Spark, and then build 31 | the standalone Spark-Huawei/hbase project. 32 | 33 | 34 | Here is the step-by-step process: 35 | 36 | I. Clone, edit, build Spark-Huawei/spark 37 | 38 | Define a SPARK_HOME environment variable on your development machine and clone the project to that location. 39 | ``` 40 | $ git clone https://github.com/Huawei-Spark/spark.git 41 | ``` 42 | Change your current working directory to SPARK_HOME and make sure you have checked out the 'hbase' branch. 43 | ``` 44 | $ git branch 45 | output: * hbase 46 | ``` 47 | Manually remove the sql/hbase module from the Spark-Huawei/spark project. 48 | ``` 49 | $ rm -rf $SPARK_HOME/sql/hbase 50 | ``` 51 | Edit the Spark project's parent pom.xml and delete the line 'sql/hbase' (it appears in two locations). 52 | 53 | Build and install Spark-Huawei/spark; it must be installed in your local Maven repository. 54 | ``` 55 | $ mvn -e -T1C -Pyarn,hadoop-2.4,hive -Dhadoop.version=2.4.0 -DskipTests clean package install 56 | ``` 57 | II. Clone and build Spark-Huawei/hbase (the new standalone spark-hbase project) 58 | 59 | Change your current working directory to the parent directory of SPARK_HOME and clone the standalone spark-hbase project. 60 | ``` 61 | $ git clone https://github.com/Huawei-Spark/hbase.git 62 | ``` 63 | Make sure you have checked out the 'master' branch. 64 | ``` 65 | $ git branch 66 | output: * master 67 | ``` 68 | With Spark installed in your local Maven repository, you can now build Spark-Huawei/hbase against it. 69 | ``` 70 | $ mvn -e -T1C -Phbase,hadoop-2.4 -Dhadoop.version=2.4.0 -DskipTests clean package install 71 | ``` 72 | III. Run the Spark-Huawei/hbase test suites against an HBase minicluster, from Maven. 73 | ``` 74 | $ mvn -e -T1C -Phbase,hadoop-2.4 -Dhadoop.version=2.4.0 test 75 | ``` 76 | 77 | ## Interactive Scala Shell 78 | 79 | The easiest way to start using Spark HBase is through the Scala shell: 80 | 81 | ./bin/hbase-sql 82 | 83 | 84 | ## Running Tests 85 | 86 | Testing first requires [building Spark HBase](#building-spark-hbase). Once Spark HBase is built, tests 87 | can be run using: 88 | 89 | ./dev/run-tests 90 | 91 | Run all test suites from Maven: 92 | 93 | mvn -Phbase,hadoop-2.4 test 94 | 95 | Run a single test suite from Maven, for example: 96 | 97 | mvn -Phbase,hadoop-2.4 test -DwildcardSuites=org.apache.spark.sql.hbase.BasicQueriesSuite 98 | 99 | ## IDE Setup 100 | 101 | We use IntelliJ IDEA for Spark HBase development. You can get the community edition for free and install the JetBrains Scala plugin from Preferences > Plugins. 102 | 103 | To import the current Spark HBase project into IntelliJ: 104 | 105 | 1. Download IntelliJ and install the Scala plug-in for IntelliJ. You may also need to install the Maven plug-in for IntelliJ. 106 | 2. Go to "File -> Import Project", locate the Spark HBase source directory, and select "Maven Project". 107 | 3. In the Import Wizard, select "Import Maven projects automatically" and leave other settings at their defaults. 108 | 4. Make sure the required profiles are enabled: select the corresponding Hadoop version, "maven3" and also "hbase" in order to get the right dependencies. 109 | 5. Leave other settings at their defaults and you should be able to start your development. 110 | 6. When you run the Scala tests, you may occasionally hit an out-of-memory error. You can increase the VM memory with settings such as: 111 | 112 | ``` 113 | -XX:MaxPermSize=512m -Xmx3072m 114 | ``` 115 | 116 | You can also make these settings the default under "Defaults -> ScalaTest". 117 | 118 | ## Configuration 119 | 120 | Please refer to the [Configuration guide](http://spark.apache.org/docs/latest/configuration.html) 121 | in the online documentation for an overview of how to configure Spark. 122 | -------------------------------------------------------------------------------- /bin/compute-classpath.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem 4 | rem Licensed to the Apache Software Foundation (ASF) under one or more 5 | rem contributor license agreements. See the NOTICE file distributed with 6 | rem this work for additional information regarding copyright ownership. 7 | rem The ASF licenses this file to You under the Apache License, Version 2.0 8 | rem (the "License"); you may not use this file except in compliance with 9 | rem the License.
You may obtain a copy of the License at 10 | rem 11 | rem http://www.apache.org/licenses/LICENSE-2.0 12 | rem 13 | rem Unless required by applicable law or agreed to in writing, software 14 | rem distributed under the License is distributed on an "AS IS" BASIS, 15 | rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | rem See the License for the specific language governing permissions and 17 | rem limitations under the License. 18 | rem 19 | 20 | rem This script computes Spark's classpath and prints it to stdout; it's used by both the "run" 21 | rem script and the ExecutorRunner in standalone cluster mode. 22 | 23 | rem If we're called from spark-class2.cmd, it already set enabledelayedexpansion and setting 24 | rem it here would stop us from affecting its copy of the CLASSPATH variable; otherwise we 25 | rem need to set it here because we use !datanucleus_jars! below. 26 | if "%DONT_PRINT_CLASSPATH%"=="1" goto skip_delayed_expansion 27 | setlocal enabledelayedexpansion 28 | :skip_delayed_expansion 29 | 30 | set SCALA_VERSION=2.10 31 | 32 | rem Figure out where the Spark framework is installed 33 | set FWDIR=%~dp0..\ 34 | 35 | rem Load environment variables from conf\spark-env.cmd, if it exists 36 | if exist "%FWDIR%conf\spark-env.cmd" call "%FWDIR%conf\spark-env.cmd" 37 | 38 | rem Build up classpath 39 | set CLASSPATH=%SPARK_CLASSPATH%;%SPARK_SUBMIT_CLASSPATH% 40 | 41 | if not "x%SPARK_CONF_DIR%"=="x" ( 42 | set CLASSPATH=%CLASSPATH%;%SPARK_CONF_DIR% 43 | ) else ( 44 | set CLASSPATH=%CLASSPATH%;%FWDIR%conf 45 | ) 46 | 47 | if exist "%FWDIR%RELEASE" ( 48 | for %%d in ("%FWDIR%lib\spark-assembly*.jar") do ( 49 | set ASSEMBLY_JAR=%%d 50 | ) 51 | ) else ( 52 | for %%d in ("%FWDIR%assembly\target\scala-%SCALA_VERSION%\spark-assembly*hadoop*.jar") do ( 53 | set ASSEMBLY_JAR=%%d 54 | ) 55 | ) 56 | 57 | set CLASSPATH=%CLASSPATH%;%ASSEMBLY_JAR% 58 | 59 | rem When Hive support is needed, Datanucleus jars must be included on the classpath. 60 | rem Datanucleus jars do not work if only included in the uber jar as plugin.xml metadata is lost. 61 | rem Both sbt and maven will populate "lib_managed/jars/" with the datanucleus jars when Spark is 62 | rem built with Hive, so look for them there. 
63 | if exist "%FWDIR%RELEASE" ( 64 | set datanucleus_dir=%FWDIR%lib 65 | ) else ( 66 | set datanucleus_dir=%FWDIR%lib_managed\jars 67 | ) 68 | set "datanucleus_jars=" 69 | for %%d in ("%datanucleus_dir%\datanucleus-*.jar") do ( 70 | set datanucleus_jars=!datanucleus_jars!;%%d 71 | ) 72 | set CLASSPATH=%CLASSPATH%;%datanucleus_jars% 73 | 74 | set SPARK_CLASSES=%FWDIR%core\target\scala-%SCALA_VERSION%\classes 75 | set SPARK_CLASSES=%SPARK_CLASSES%;%FWDIR%repl\target\scala-%SCALA_VERSION%\classes 76 | set SPARK_CLASSES=%SPARK_CLASSES%;%FWDIR%mllib\target\scala-%SCALA_VERSION%\classes 77 | set SPARK_CLASSES=%SPARK_CLASSES%;%FWDIR%bagel\target\scala-%SCALA_VERSION%\classes 78 | set SPARK_CLASSES=%SPARK_CLASSES%;%FWDIR%graphx\target\scala-%SCALA_VERSION%\classes 79 | set SPARK_CLASSES=%SPARK_CLASSES%;%FWDIR%streaming\target\scala-%SCALA_VERSION%\classes 80 | set SPARK_CLASSES=%SPARK_CLASSES%;%FWDIR%tools\target\scala-%SCALA_VERSION%\classes 81 | set SPARK_CLASSES=%SPARK_CLASSES%;%FWDIR%sql\catalyst\target\scala-%SCALA_VERSION%\classes 82 | set SPARK_CLASSES=%SPARK_CLASSES%;%FWDIR%sql\core\target\scala-%SCALA_VERSION%\classes 83 | set SPARK_CLASSES=%SPARK_CLASSES%;%FWDIR%sql\hive\target\scala-%SCALA_VERSION%\classes 84 | set SPARK_CLASSES=%SPARK_CLASSES%;%FWDIR%sql\hbase\target\scala-%SCALA_VERSION%\classes 85 | 86 | set SPARK_TEST_CLASSES=%FWDIR%core\target\scala-%SCALA_VERSION%\test-classes 87 | set SPARK_TEST_CLASSES=%SPARK_TEST_CLASSES%;%FWDIR%repl\target\scala-%SCALA_VERSION%\test-classes 88 | set SPARK_TEST_CLASSES=%SPARK_TEST_CLASSES%;%FWDIR%mllib\target\scala-%SCALA_VERSION%\test-classes 89 | set SPARK_TEST_CLASSES=%SPARK_TEST_CLASSES%;%FWDIR%bagel\target\scala-%SCALA_VERSION%\test-classes 90 | set SPARK_TEST_CLASSES=%SPARK_TEST_CLASSES%;%FWDIR%graphx\target\scala-%SCALA_VERSION%\test-classes 91 | set SPARK_TEST_CLASSES=%SPARK_TEST_CLASSES%;%FWDIR%streaming\target\scala-%SCALA_VERSION%\test-classes 92 | set SPARK_TEST_CLASSES=%SPARK_TEST_CLASSES%;%FWDIR%sql\catalyst\target\scala-%SCALA_VERSION%\test-classes 93 | set SPARK_TEST_CLASSES=%SPARK_TEST_CLASSES%;%FWDIR%sql\core\target\scala-%SCALA_VERSION%\test-classes 94 | set SPARK_TEST_CLASSES=%SPARK_TEST_CLASSES%;%FWDIR%sql\hive\target\scala-%SCALA_VERSION%\test-classes 95 | set SPARK_TEST_CLASSES=%SPARK_TEST_CLASSES%;%FWDIR%sql\hbase\target\scala-%SCALA_VERSION%\test-classes 96 | 97 | if "x%SPARK_TESTING%"=="x1" ( 98 | rem Add test clases to path - note, add SPARK_CLASSES and SPARK_TEST_CLASSES before CLASSPATH 99 | rem so that local compilation takes precedence over assembled jar 100 | set CLASSPATH=%SPARK_CLASSES%;%SPARK_TEST_CLASSES%;%CLASSPATH% 101 | ) 102 | 103 | rem Add hadoop conf dir - else FileSystem.*, etc fail 104 | rem Note, this assumes that there is either a HADOOP_CONF_DIR or YARN_CONF_DIR which hosts 105 | rem the configurtion files. 106 | if "x%HADOOP_CONF_DIR%"=="x" goto no_hadoop_conf_dir 107 | set CLASSPATH=%CLASSPATH%;%HADOOP_CONF_DIR% 108 | :no_hadoop_conf_dir 109 | 110 | if "x%YARN_CONF_DIR%"=="x" goto no_yarn_conf_dir 111 | set CLASSPATH=%CLASSPATH%;%YARN_CONF_DIR% 112 | :no_yarn_conf_dir 113 | 114 | rem To allow for distributions to append needed libraries to the classpath (e.g. when 115 | rem using the "hadoop-provided" profile to build Spark), check SPARK_DIST_CLASSPATH and 116 | rem append it to tbe final classpath. 
117 | if not "x%$SPARK_DIST_CLASSPATH%"=="x" ( 118 | set CLASSPATH=%CLASSPATH%;%SPARK_DIST_CLASSPATH% 119 | ) 120 | 121 | rem A bit of a hack to allow calling this script within run2.cmd without seeing output 122 | if "%DONT_PRINT_CLASSPATH%"=="1" goto exit 123 | 124 | echo %CLASSPATH% 125 | 126 | :exit 127 | -------------------------------------------------------------------------------- /bin/compute-classpath.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # This script computes Spark's classpath and prints it to stdout; it's used by both the "run" 21 | # script and the ExecutorRunner in standalone cluster mode. 22 | 23 | # Figure out where Spark is installed 24 | FWDIR="$(cd "`dirname "$0"`"/..; pwd)" 25 | 26 | . "$FWDIR"/bin/load-spark-env.sh 27 | 28 | if [ -n "$SPARK_CLASSPATH" ]; then 29 | CLASSPATH="$SPARK_CLASSPATH:$SPARK_SUBMIT_CLASSPATH" 30 | else 31 | CLASSPATH="$SPARK_SUBMIT_CLASSPATH" 32 | fi 33 | 34 | # Build up classpath 35 | if [ -n "$SPARK_CONF_DIR" ]; then 36 | CLASSPATH="$CLASSPATH:$SPARK_CONF_DIR" 37 | else 38 | CLASSPATH="$CLASSPATH:$FWDIR/conf" 39 | fi 40 | 41 | ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SPARK_SCALA_VERSION" 42 | 43 | if [ -n "$JAVA_HOME" ]; then 44 | JAR_CMD="$JAVA_HOME/bin/jar" 45 | else 46 | JAR_CMD="jar" 47 | fi 48 | 49 | # A developer option to prepend more recently compiled Spark classes 50 | if [ -n "$SPARK_PREPEND_CLASSES" ]; then 51 | echo "NOTE: SPARK_PREPEND_CLASSES is set, placing locally compiled Spark"\ 52 | "classes ahead of assembly." 
>&2 53 | CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SPARK_SCALA_VERSION/classes" 54 | CLASSPATH="$CLASSPATH:$FWDIR/core/target/jars/*" 55 | CLASSPATH="$CLASSPATH:$FWDIR/repl/target/scala-$SPARK_SCALA_VERSION/classes" 56 | CLASSPATH="$CLASSPATH:$FWDIR/mllib/target/scala-$SPARK_SCALA_VERSION/classes" 57 | CLASSPATH="$CLASSPATH:$FWDIR/bagel/target/scala-$SPARK_SCALA_VERSION/classes" 58 | CLASSPATH="$CLASSPATH:$FWDIR/graphx/target/scala-$SPARK_SCALA_VERSION/classes" 59 | CLASSPATH="$CLASSPATH:$FWDIR/streaming/target/scala-$SPARK_SCALA_VERSION/classes" 60 | CLASSPATH="$CLASSPATH:$FWDIR/tools/target/scala-$SPARK_SCALA_VERSION/classes" 61 | CLASSPATH="$CLASSPATH:$FWDIR/sql/catalyst/target/scala-$SPARK_SCALA_VERSION/classes" 62 | CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SPARK_SCALA_VERSION/classes" 63 | CLASSPATH="$CLASSPATH:$FWDIR/sql/hbase/target/scala-$SPARK_SCALA_VERSION/classes" 64 | CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SPARK_SCALA_VERSION/classes" 65 | CLASSPATH="$CLASSPATH:$FWDIR/sql/hive-thriftserver/target/scala-$SPARK_SCALA_VERSION/classes" 66 | CLASSPATH="$CLASSPATH:$FWDIR/yarn/stable/target/scala-$SPARK_SCALA_VERSION/classes" 67 | fi 68 | 69 | # Use spark-assembly jar from either RELEASE or assembly directory 70 | if [ -f "$FWDIR/RELEASE" ]; then 71 | assembly_folder="$FWDIR"/lib 72 | else 73 | assembly_folder="$ASSEMBLY_DIR" 74 | fi 75 | 76 | num_jars=0 77 | 78 | for f in ${assembly_folder}/spark-assembly*hadoop*.jar; do 79 | if [[ ! -e "$f" ]]; then 80 | echo "Failed to find Spark assembly in $assembly_folder" 1>&2 81 | echo "You need to build Spark before running this program." 1>&2 82 | exit 1 83 | fi 84 | ASSEMBLY_JAR="$f" 85 | num_jars=$((num_jars+1)) 86 | done 87 | 88 | if [ "$num_jars" -gt "1" ]; then 89 | echo "Found multiple Spark assembly jars in $assembly_folder:" 1>&2 90 | ls ${assembly_folder}/spark-assembly*hadoop*.jar 1>&2 91 | echo "Please remove all but one jar." 1>&2 92 | exit 1 93 | fi 94 | 95 | # Verify that versions of java used to build the jars and run Spark are compatible 96 | jar_error_check=$("$JAR_CMD" -tf "$ASSEMBLY_JAR" nonexistent/class/path 2>&1) 97 | if [[ "$jar_error_check" =~ "invalid CEN header" ]]; then 98 | echo "Loading Spark jar with '$JAR_CMD' failed. " 1>&2 99 | echo "This is likely because Spark was compiled with Java 7 and run " 1>&2 100 | echo "with Java 6. (see SPARK-1703). Please use Java 7 to run Spark " 1>&2 101 | echo "or build Spark with Java 6." 1>&2 102 | exit 1 103 | fi 104 | 105 | CLASSPATH="$CLASSPATH:$ASSEMBLY_JAR" 106 | 107 | # When Hive support is needed, Datanucleus jars must be included on the classpath. 108 | # Datanucleus jars do not work if only included in the uber jar as plugin.xml metadata is lost. 109 | # Both sbt and maven will populate "lib_managed/jars/" with the datanucleus jars when Spark is 110 | # built with Hive, so first check if the datanucleus jars exist, and then ensure the current Spark 111 | # assembly is built for Hive, before actually populating the CLASSPATH with the jars. 112 | # Note that this check order is faster (by up to half a second) in the case where Hive is not used. 
113 | if [ -f "$FWDIR/RELEASE" ]; then 114 | datanucleus_dir="$FWDIR"/lib 115 | else 116 | datanucleus_dir="$FWDIR"/lib_managed/jars 117 | fi 118 | 119 | datanucleus_jars="$(find "$datanucleus_dir" 2>/dev/null | grep "datanucleus-.*\\.jar$")" 120 | datanucleus_jars="$(echo "$datanucleus_jars" | tr "\n" : | sed s/:$//g)" 121 | 122 | if [ -n "$datanucleus_jars" ]; then 123 | hive_files=$("$JAR_CMD" -tf "$ASSEMBLY_JAR" org/apache/hadoop/hive/ql/exec 2>/dev/null) 124 | if [ -n "$hive_files" ]; then 125 | echo "Spark assembly has been built with Hive, including Datanucleus jars on classpath" 1>&2 126 | CLASSPATH="$CLASSPATH:$datanucleus_jars" 127 | fi 128 | fi 129 | 130 | # Add test classes if we're running from SBT or Maven with SPARK_TESTING set to 1 131 | if [[ $SPARK_TESTING == 1 ]]; then 132 | CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SPARK_SCALA_VERSION/test-classes" 133 | CLASSPATH="$CLASSPATH:$FWDIR/repl/target/scala-$SPARK_SCALA_VERSION/test-classes" 134 | CLASSPATH="$CLASSPATH:$FWDIR/mllib/target/scala-$SPARK_SCALA_VERSION/test-classes" 135 | CLASSPATH="$CLASSPATH:$FWDIR/bagel/target/scala-$SPARK_SCALA_VERSION/test-classes" 136 | CLASSPATH="$CLASSPATH:$FWDIR/graphx/target/scala-$SPARK_SCALA_VERSION/test-classes" 137 | CLASSPATH="$CLASSPATH:$FWDIR/streaming/target/scala-$SPARK_SCALA_VERSION/test-classes" 138 | CLASSPATH="$CLASSPATH:$FWDIR/sql/catalyst/target/scala-$SPARK_SCALA_VERSION/test-classes" 139 | CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SPARK_SCALA_VERSION/test-classes" 140 | CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SPARK_SCALA_VERSION/test-classes" 141 | CLASSPATH="$CLASSPATH:$FWDIR/sql/hbase/target/scala-$SPARK_SCALA_VERSION/test-classes" 142 | fi 143 | 144 | # Add hadoop conf dir if given -- otherwise FileSystem.*, etc fail ! 145 | # Note, this assumes that there is either a HADOOP_CONF_DIR or YARN_CONF_DIR which hosts 146 | # the configurtion files. 147 | if [ -n "$HADOOP_CONF_DIR" ]; then 148 | CLASSPATH="$CLASSPATH:$HADOOP_CONF_DIR" 149 | fi 150 | if [ -n "$YARN_CONF_DIR" ]; then 151 | CLASSPATH="$CLASSPATH:$YARN_CONF_DIR" 152 | fi 153 | 154 | # To allow for distributions to append needed libraries to the classpath (e.g. when 155 | # using the "hadoop-provided" profile to build Spark), check SPARK_DIST_CLASSPATH and 156 | # append it to tbe final classpath. 157 | if [ -n "$SPARK_DIST_CLASSPATH" ]; then 158 | CLASSPATH="$CLASSPATH:$SPARK_DIST_CLASSPATH" 159 | fi 160 | 161 | echo "$CLASSPATH" 162 | -------------------------------------------------------------------------------- /bin/hbase-sql: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | # 19 | 20 | # 21 | # Shell script for starting the Spark SQL for HBase CLI 22 | 23 | # Enter posix mode for bash 24 | set -o posix 25 | 26 | CLASS="org.apache.spark.sql.hbase.HBaseSQLCliDriver" 27 | 28 | # Figure out where Spark is installed 29 | FWDIR=$SPARK_HOME 30 | if [ -z "$FWDIR" ] 31 | then 32 | echo "\$SPARK_HOME is not set" 33 | fi 34 | 35 | function usage { 36 | echo "Usage: ./bin/hbase-sql [options] [cli option]" 37 | pattern="usage" 38 | pattern+="\|Spark assembly has been built with hbase" 39 | pattern+="\|NOTE: SPARK_PREPEND_CLASSES is set" 40 | pattern+="\|Spark Command: " 41 | pattern+="\|--help" 42 | pattern+="\|=======" 43 | 44 | "$FWDIR"/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2 45 | echo 46 | echo "CLI options:" 47 | "$FWDIR"/bin/spark-class $CLASS --help 2>&1 | grep -v "$pattern" 1>&2 48 | } 49 | 50 | if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then 51 | usage 52 | exit 0 53 | fi 54 | 55 | source "$FWDIR"/bin/utils.sh 56 | SUBMIT_USAGE_FUNCTION=usage 57 | gatherSparkSubmitOpts "$@" 58 | 59 | exec "$FWDIR"/bin/spark-submit --class $CLASS "${SUBMISSION_OPTS[@]}" spark-internal "${APPLICATION_OPTS[@]}" 60 | -------------------------------------------------------------------------------- /doc/SparkSQLOnHBase_v2.1.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/hbase/10f96374963c63f201bc8916fec5ec18ce1372a8/doc/SparkSQLOnHBase_v2.1.docx -------------------------------------------------------------------------------- /python/pyspark/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | """ 19 | PySpark is the Python API for Spark. 20 | 21 | Public classes: 22 | 23 | - :class:`SparkContext`: 24 | Main entry point for Spark functionality. 25 | - L{RDD} 26 | A Resilient Distributed Dataset (RDD), the basic abstraction in Spark. 27 | - L{Broadcast} 28 | A broadcast variable that gets reused across tasks. 29 | - L{Accumulator} 30 | An "add-only" shared variable that tasks can only add values to. 31 | - L{SparkConf} 32 | For configuring Spark. 33 | - L{SparkFiles} 34 | Access files shipped with jobs. 35 | - L{StorageLevel} 36 | Finer-grained cache persistence levels. 
37 | 38 | """ 39 | 40 | from pyspark.conf import SparkConf 41 | from pyspark.context import SparkContext 42 | from pyspark.rdd import RDD 43 | from pyspark.files import SparkFiles 44 | from pyspark.storagelevel import StorageLevel 45 | from pyspark.accumulators import Accumulator, AccumulatorParam 46 | from pyspark.broadcast import Broadcast 47 | from pyspark.serializers import MarshalSerializer, PickleSerializer 48 | 49 | # for back compatibility 50 | from pyspark.sql import SQLContext, HiveContext, HBaseSQLContext, SchemaRDD, Row 51 | 52 | __all__ = [ 53 | "SparkConf", "SparkContext", "SparkFiles", "RDD", "StorageLevel", "Broadcast", 54 | "Accumulator", "AccumulatorParam", "MarshalSerializer", "PickleSerializer", 55 | ] 56 | -------------------------------------------------------------------------------- /python/pyspark/java_gateway.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | import atexit 19 | import os 20 | import sys 21 | import signal 22 | import shlex 23 | import platform 24 | from subprocess import Popen, PIPE 25 | from threading import Thread 26 | from py4j.java_gateway import java_import, JavaGateway, GatewayClient 27 | 28 | 29 | def launch_gateway(): 30 | SPARK_HOME = os.environ["SPARK_HOME"] 31 | 32 | gateway_port = -1 33 | if "PYSPARK_GATEWAY_PORT" in os.environ: 34 | gateway_port = int(os.environ["PYSPARK_GATEWAY_PORT"]) 35 | else: 36 | # Launch the Py4j gateway using Spark's run command so that we pick up the 37 | # proper classpath and settings from spark-env.sh 38 | on_windows = platform.system() == "Windows" 39 | script = "./bin/spark-submit.cmd" if on_windows else "./bin/spark-submit" 40 | submit_args = os.environ.get("PYSPARK_SUBMIT_ARGS") 41 | submit_args = submit_args if submit_args is not None else "" 42 | submit_args = shlex.split(submit_args) 43 | command = [os.path.join(SPARK_HOME, script)] + submit_args + ["pyspark-shell"] 44 | if not on_windows: 45 | # Don't send ctrl-c / SIGINT to the Java gateway: 46 | def preexec_func(): 47 | signal.signal(signal.SIGINT, signal.SIG_IGN) 48 | env = dict(os.environ) 49 | env["IS_SUBPROCESS"] = "1" # tell JVM to exit after python exits 50 | proc = Popen(command, stdout=PIPE, stdin=PIPE, preexec_fn=preexec_func, env=env) 51 | else: 52 | # preexec_fn not supported on Windows 53 | proc = Popen(command, stdout=PIPE, stdin=PIPE) 54 | 55 | try: 56 | # Determine which ephemeral port the server started on: 57 | gateway_port = proc.stdout.readline() 58 | gateway_port = int(gateway_port) 59 | except ValueError: 60 | # Grab the remaining lines of stdout 61 | (stdout, _) = proc.communicate() 62 | exit_code = proc.poll() 63 | error_msg = "Launching 
GatewayServer failed" 64 | error_msg += " with exit code %d!\n" % exit_code if exit_code else "!\n" 65 | error_msg += "Warning: Expected GatewayServer to output a port, but found " 66 | if gateway_port == "" and stdout == "": 67 | error_msg += "no output.\n" 68 | else: 69 | error_msg += "the following:\n\n" 70 | error_msg += "--------------------------------------------------------------\n" 71 | error_msg += gateway_port + stdout 72 | error_msg += "--------------------------------------------------------------\n" 73 | raise Exception(error_msg) 74 | 75 | # In Windows, ensure the Java child processes do not linger after Python has exited. 76 | # In UNIX-based systems, the child process can kill itself on broken pipe (i.e. when 77 | # the parent process' stdin sends an EOF). In Windows, however, this is not possible 78 | # because java.lang.Process reads directly from the parent process' stdin, contending 79 | # with any opportunity to read an EOF from the parent. Note that this is only best 80 | # effort and will not take effect if the python process is violently terminated. 81 | if on_windows: 82 | # In Windows, the child process here is "spark-submit.cmd", not the JVM itself 83 | # (because the UNIX "exec" command is not available). This means we cannot simply 84 | # call proc.kill(), which kills only the "spark-submit.cmd" process but not the 85 | # JVMs. Instead, we use "taskkill" with the tree-kill option "/t" to terminate all 86 | # child processes in the tree (http://technet.microsoft.com/en-us/library/bb491009.aspx) 87 | def killChild(): 88 | Popen(["cmd", "/c", "taskkill", "/f", "/t", "/pid", str(proc.pid)]) 89 | atexit.register(killChild) 90 | 91 | # Create a thread to echo output from the GatewayServer, which is required 92 | # for Java log output to show up: 93 | class EchoOutputThread(Thread): 94 | 95 | def __init__(self, stream): 96 | Thread.__init__(self) 97 | self.daemon = True 98 | self.stream = stream 99 | 100 | def run(self): 101 | while True: 102 | line = self.stream.readline() 103 | sys.stderr.write(line) 104 | EchoOutputThread(proc.stdout).start() 105 | 106 | # Connect to the gateway 107 | gateway = JavaGateway(GatewayClient(port=gateway_port), auto_convert=False) 108 | 109 | # Import the classes used by PySpark 110 | java_import(gateway.jvm, "org.apache.spark.SparkConf") 111 | java_import(gateway.jvm, "org.apache.spark.api.java.*") 112 | java_import(gateway.jvm, "org.apache.spark.api.python.*") 113 | java_import(gateway.jvm, "org.apache.spark.mllib.api.python.*") 114 | java_import(gateway.jvm, "org.apache.spark.sql.SQLContext") 115 | java_import(gateway.jvm, "org.apache.spark.sql.hive.HiveContext") 116 | java_import(gateway.jvm, "org.apache.spark.sql.hive.LocalHiveContext") 117 | java_import(gateway.jvm, "org.apache.spark.sql.hive.TestHiveContext") 118 | java_import(gateway.jvm, "scala.Tuple2") 119 | 120 | java_import(gateway.jvm, "org.apache.spark.sql.hbase.*") 121 | 122 | return gateway 123 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/HBasePartition.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.spark.sql.hbase 18 | 19 | import org.apache.spark.{Logging, Partition} 20 | import org.apache.spark.sql.catalyst.expressions._ 21 | import org.apache.spark.sql.hbase.catalyst.expressions.PartialPredicateOperations._ 22 | import org.apache.spark.sql.hbase.types.{HBaseBytesType, PartitionRange, Range} 23 | 24 | 25 | private[hbase] class HBasePartition( 26 | val idx: Int, val mappedIndex: Int, 27 | start: Option[HBaseRawType] = None, 28 | end: Option[HBaseRawType] = None, 29 | val server: Option[String] = None, 30 | val filterPredicates: Option[Expression] = None, 31 | @transient relation: HBaseRelation = null) 32 | extends Range[HBaseRawType](start, true, end, false, HBaseBytesType) 33 | with Partition with IndexMappable with Logging { 34 | 35 | override def index: Int = idx 36 | 37 | override def hashCode(): Int = idx 38 | 39 | @transient lazy val startNative: Seq[Any] = relation.nativeKeyConvert(start) 40 | 41 | @transient lazy val endNative: Seq[Any] = relation.nativeKeyConvert(end) 42 | 43 | def computePredicate(relation: HBaseRelation): Option[Expression] = { 44 | val predicate = if (filterPredicates.isDefined && 45 | filterPredicates.get.references.exists(_.exprId == relation.partitionKeys(0).exprId)) { 46 | val oriPredicate = filterPredicates.get 47 | val predicateReferences = oriPredicate.references.toSeq 48 | val boundReference = BindReferences.bindReference(oriPredicate, predicateReferences) 49 | val row = new GenericMutableRow(predicateReferences.size) 50 | var rowIndex = 0 51 | var i = 0 52 | var range: PartitionRange[_] = null 53 | while (i < relation.keyColumns.size) { 54 | range = relation.generateRange(this, oriPredicate, i) 55 | if (range != null) { 56 | rowIndex = relation.rowIndex(predicateReferences, i) 57 | if (rowIndex >= 0) row.update(rowIndex, range) 58 | // if the non-last dimension range is not point, do not proceed to the next dims 59 | if (i < relation.keyColumns.size - 1 && !range.isPoint) i = relation.keyColumns.size 60 | else i = i + 1 61 | } else i = relation.keyColumns.size 62 | } 63 | val pr = boundReference.partialReduce(row, predicateReferences) 64 | pr match { 65 | case (null, e: Expression) => Some(e) 66 | case (true, _) => None 67 | case (false, _) => Some(Literal(false)) 68 | } 69 | } else filterPredicates 70 | logInfo(predicate.toString) 71 | predicate 72 | } 73 | 74 | override def toString = { 75 | s"HBasePartition: $idx, $mappedIndex, [$start, $end), $filterPredicates" 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/HBasePartitioner.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | import java.io.{IOException, ObjectInputStream, ObjectOutputStream} 21 | 22 | import org.apache.hadoop.hbase.util.Bytes 23 | import org.apache.spark.serializer.JavaSerializer 24 | import org.apache.spark.util.{CollectionsUtils, Utils} 25 | import org.apache.spark.{Partitioner, SparkEnv} 26 | 27 | object HBasePartitioner { 28 | implicit object HBaseRawOrdering extends Ordering[HBaseRawType] { 29 | def compare(a: HBaseRawType, b: HBaseRawType) = Bytes.compareTo(a, b) 30 | } 31 | } 32 | 33 | class HBasePartitioner (var splitKeys: Array[HBaseRawType]) extends Partitioner { 34 | import HBasePartitioner.HBaseRawOrdering 35 | 36 | type t = HBaseRawType 37 | 38 | lazy private val len = splitKeys.length 39 | 40 | // For pre-split table splitKeys(0) = bytes[0], to remove it, 41 | // otherwise partition 0 always be empty and 42 | // we will miss the last region's date when bulk load 43 | lazy private val realSplitKeys = if (splitKeys.isEmpty) splitKeys else splitKeys.tail 44 | 45 | def numPartitions = if (len == 0) 1 else len 46 | 47 | @transient private val binarySearch: ((Array[t], t) => Int) = CollectionsUtils.makeBinarySearch[t] 48 | 49 | def getPartition(key: Any): Int = { 50 | val k = key.asInstanceOf[t] 51 | var partition = 0 52 | if (len <= 128 && len > 0) { 53 | // If we have less than 128 partitions naive search 54 | val ordering = implicitly[Ordering[t]] 55 | while (partition < realSplitKeys.length && ordering.gt(k, realSplitKeys(partition))) { 56 | partition += 1 57 | } 58 | } else { 59 | // Determine which binary search method to use only once. 60 | partition = binarySearch(realSplitKeys, k) 61 | // binarySearch either returns the match location or -[insertion point]-1 62 | if (partition < 0) { 63 | partition = -partition - 1 64 | } 65 | if (partition > realSplitKeys.length) { 66 | partition = realSplitKeys.length 67 | } 68 | } 69 | partition 70 | } 71 | 72 | override def equals(other: Any): Boolean = other match { 73 | case r: HBasePartitioner => 74 | r.splitKeys.sameElements(splitKeys) 75 | case _ => 76 | false 77 | } 78 | 79 | override def hashCode(): Int = { 80 | val prime = 31 81 | var result = 1 82 | var i = 0 83 | while (i < splitKeys.length) { 84 | result = prime * result + splitKeys(i).hashCode 85 | i += 1 86 | } 87 | result = prime * result 88 | result 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/HBaseSQLCliDriver.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | import java.io.File 21 | 22 | import jline._ 23 | import org.apache.spark.{Logging, SparkConf, SparkContext} 24 | 25 | /** 26 | * HBaseSQLCliDriver 27 | * 28 | */ 29 | object HBaseSQLCliDriver extends Logging { 30 | private val prompt = "spark-hbaseql" 31 | private val continuedPrompt = "".padTo(prompt.length, ' ') 32 | private val conf = new SparkConf() 33 | private val sc = new SparkContext(conf) 34 | private val hbaseCtx = new HBaseSQLContext(sc) 35 | 36 | private val QUIT = "QUIT" 37 | private val EXIT = "EXIT" 38 | private val HELP = "HELP" 39 | 40 | def getCompletors: Seq[Completor] = { 41 | val sc: SimpleCompletor = new SimpleCompletor(new Array[String](0)) 42 | 43 | // add keywords, including lower-cased versions 44 | HBaseSQLParser.getKeywords.foreach { kw => 45 | sc.addCandidateString(kw) 46 | sc.addCandidateString(kw.toLowerCase) 47 | } 48 | 49 | 50 | Seq(sc) 51 | } 52 | 53 | def main(args: Array[String]) { 54 | 55 | val reader = new ConsoleReader() 56 | reader.setBellEnabled(false) 57 | getCompletors.foreach(reader.addCompletor) 58 | 59 | val historyDirectory = System.getProperty("user.home") 60 | 61 | try { 62 | if (new File(historyDirectory).exists()) { 63 | val historyFile = historyDirectory + File.separator + ".hbaseqlhistory" 64 | reader.setHistory(new History(new File(historyFile))) 65 | } else { 66 | System.err.println("WARNING: Directory for hbaseql history file: " + historyDirectory + 67 | " does not exist. History will not be available during this session.") 68 | } 69 | } catch { 70 | case e: Exception => 71 | System.err.println("WARNING: Encountered an error while trying to initialize hbaseql's " + 72 | "history file. 
History will not be available during this session.") 73 | System.err.println(e.getMessage) 74 | } 75 | 76 | println("Welcome to hbaseql CLI") 77 | var prefix = "" 78 | 79 | def promptPrefix = s"$prompt" 80 | var currentPrompt = promptPrefix 81 | var line = reader.readLine(currentPrompt + "> ") 82 | var ret = 0 83 | 84 | while (line != null) { 85 | if (prefix.nonEmpty) { 86 | prefix += '\n' 87 | } 88 | 89 | if (line.trim.endsWith(";") && !line.trim.endsWith("\\;")) { 90 | line = prefix + line 91 | processLine(line, allowInterrupting = true) 92 | prefix = "" 93 | currentPrompt = promptPrefix 94 | } else { 95 | prefix = prefix + line 96 | currentPrompt = continuedPrompt 97 | } 98 | 99 | line = reader.readLine(currentPrompt + "> ") 100 | } 101 | 102 | System.exit(0) 103 | } 104 | 105 | private def processLine(line: String, allowInterrupting: Boolean) = { 106 | 107 | // TODO: handle multiple command separated by ; 108 | 109 | // Since we are using SqlParser and it does not handle ';', just work around to omit the ';' 110 | val input = line.trim.substring(0, line.length - 1) 111 | 112 | try { 113 | process(input) 114 | } catch { 115 | case e: Exception => 116 | e.printStackTrace() 117 | } 118 | } 119 | 120 | private def process(input: String) = { 121 | val token = input.split("\\s") 122 | token(0).toUpperCase match { 123 | case QUIT => System.exit(0) 124 | case EXIT => System.exit(0) 125 | case HELP => printHelp(token) 126 | case "!" => // TODO: add support for bash command start with ! 127 | case _ => 128 | logInfo(s"Processing $input") 129 | val start = System.currentTimeMillis() 130 | val res = hbaseCtx.sql(input).collect() 131 | val end = System.currentTimeMillis() 132 | res.foreach(println) 133 | val timeTaken: Double = (end - start) / 1000.0 134 | println(s"Time taken: $timeTaken seconds") 135 | } 136 | } 137 | 138 | private def printHelp(token: Array[String]) = { 139 | if (token.length > 1) { 140 | token(1).toUpperCase match { 141 | case "CREATE" => 142 | println( """CREATE TABLE table_name (col_name data_type, ..., PRIMARY KEY(col_name, ...)) 143 | MAPPED BY (htable_name, COLS=[col_name=family_name.qualifier])""".stripMargin) 144 | case "DROP" => 145 | println("DROP TABLE table_name") 146 | case "ALTER" => 147 | println("ALTER TABLE table_name ADD (col_name data_type, ...) MAPPED BY (expression)") 148 | println("ALTER TABLE table_name DROP col_name") 149 | case "LOAD" => 150 | println( """LOAD DATA [LOCAL] INPATH file_path [OVERWRITE] INTO TABLE 151 | table_name [FIELDS TERMINATED BY char]""".stripMargin) 152 | case "SELECT" => 153 | println( """SELECT [ALL | DISTINCT] select_expr, select_expr, ... 
154 | |FROM table_reference 155 | |[WHERE where_condition] 156 | |[GROUP BY col_list] 157 | |[CLUSTER BY col_list 158 | | | [DISTRIBUTE BY col_list] [SORT BY col_list] 159 | |] 160 | |[LIMIT number]""") 161 | case "INSERT" => 162 | println("INSERT INTO table_name SELECT clause") 163 | println("INSERT INTO table_name VALUES (value, ...)") 164 | case "DESCRIBE" => 165 | println("DESCRIBE table_name") 166 | case "SHOW" => 167 | println("SHOW TABLES") 168 | case _ => 169 | printHelpUsage() 170 | } 171 | } else { 172 | printHelpUsage() 173 | } 174 | } 175 | 176 | private def printHelpUsage() = { 177 | println("""Usage: HELP Statement 178 | Statement: 179 | CREATE | DROP | ALTER | LOAD | SELECT | INSERT | DESCRIBE | SHOW""") 180 | } 181 | } 182 | 183 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/HBaseSQLConf.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | import org.apache.spark.sql.SQLConf 21 | 22 | private[hbase] object HBaseSQLConf { 23 | val PARTITION_EXPIRATION = "spark.sql.hbase.partition.expiration" 24 | val SCANNER_FETCH_SIZE = "spark.sql.hbase.scanner.fetchsize" 25 | } 26 | 27 | /** 28 | * A trait that enables the setting and getting of mutable config parameters/hints. 29 | * 30 | */ 31 | private[hbase] class HBaseSQLConf extends SQLConf { 32 | import org.apache.spark.sql.hbase.HBaseSQLConf._ 33 | 34 | /** The expiration of cached partition (i.e., region) info; defaults to 10 minutes . */ 35 | private[spark] def partitionExpiration: Long = getConf(PARTITION_EXPIRATION, "600").toLong 36 | private[spark] def scannerFetchSize: Int = getConf(SCANNER_FETCH_SIZE, "1000").toInt 37 | } 38 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/HBaseSQLContext.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | import org.apache.hadoop.hbase.HBaseConfiguration 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.api.java.JavaSparkContext 23 | import org.apache.spark.sql._ 24 | import org.apache.spark.sql.SparkSQLParser 25 | import org.apache.spark.sql.catalyst.analysis.OverrideCatalog 26 | import org.apache.spark.sql.hbase.execution.HBaseStrategies 27 | 28 | class HBaseSQLContext(sc: SparkContext) extends SQLContext(sc) { 29 | def this(sparkContext: JavaSparkContext) = this(sparkContext.sc) 30 | 31 | protected[sql] override lazy val conf: SQLConf = new HBaseSQLConf 32 | 33 | @transient 34 | override protected[sql] val sqlParser = { 35 | val fallback = new HBaseSQLParser 36 | new SparkSQLParser(fallback.parse(_)) 37 | } 38 | 39 | HBaseConfiguration.merge( 40 | sc.hadoopConfiguration, HBaseConfiguration.create(sc.hadoopConfiguration)) 41 | 42 | @transient 43 | override protected[sql] lazy val catalog: HBaseCatalog = 44 | new HBaseCatalog(this, sc.hadoopConfiguration) with OverrideCatalog 45 | 46 | experimental.extraStrategies = Seq((new SparkPlanner with HBaseStrategies).HBaseDataSource) 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/HBaseSQLReaderRDD.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.spark.sql.hbase 18 | 19 | 20 | import org.apache.hadoop.hbase.client.{ResultScanner, Result, Get} 21 | import org.apache.hadoop.hbase.util.Bytes 22 | import org.apache.spark.rdd.RDD 23 | import org.apache.spark.sql.SQLContext 24 | import org.apache.spark.sql.catalyst.expressions._ 25 | import org.apache.spark.sql.catalyst.expressions.codegen.GeneratePredicate 26 | import org.apache.spark.sql.execution.SparkPlan 27 | import org.apache.spark.sql.hbase.util.{BytesUtils, HBaseKVHelper, DataTypeUtils} 28 | import org.apache.spark.sql.types.AtomicType 29 | 30 | import org.apache.spark.{InterruptibleIterator, Logging, Partition, TaskContext} 31 | 32 | import scala.collection.mutable.{ArrayBuffer, ListBuffer} 33 | 34 | 35 | /** 36 | * HBaseSQLReaderRDD 37 | */ 38 | class HBaseSQLReaderRDD( 39 | relation: HBaseRelation, 40 | codegenEnabled: Boolean, 41 | output: Seq[Attribute], 42 | filterPred: Option[Expression], 43 | coprocSubPlan: Option[SparkPlan], 44 | @transient sqlContext: SQLContext) 45 | extends RDD[Row](sqlContext.sparkContext, Nil) with Logging { 46 | 47 | override def getPartitions: Array[Partition] = { 48 | RangeCriticalPoint.generatePrunedPartitions(relation, filterPred).toArray 49 | } 50 | 51 | override def getPreferredLocations(split: Partition): Seq[String] = { 52 | split.asInstanceOf[HBasePartition].server.map { 53 | identity 54 | }.toSeq 55 | } 56 | 57 | private def createIterator(context: TaskContext, 58 | scanner: ResultScanner, 59 | otherFilters: Option[Expression]): Iterator[Row] = { 60 | var finalOutput = output.distinct 61 | if (otherFilters.isDefined) { 62 | finalOutput = finalOutput.union(otherFilters.get.references.toSeq) 63 | } 64 | val row = new GenericMutableRow(finalOutput.size) 65 | val projections = finalOutput.zipWithIndex 66 | 67 | var finished: Boolean = false 68 | var gotNext: Boolean = false 69 | var result: Result = null 70 | 71 | val otherFilter: (Row) => Boolean = if (otherFilters.isDefined) { 72 | if (codegenEnabled) { 73 | GeneratePredicate.generate(otherFilters.get, finalOutput) 74 | } else { 75 | InterpretedPredicate.create(otherFilters.get, finalOutput) 76 | } 77 | } else null 78 | 79 | val iterator = new Iterator[Row] { 80 | override def hasNext: Boolean = { 81 | if (!finished) { 82 | if (!gotNext) { 83 | result = scanner.next 84 | finished = result == null 85 | gotNext = true 86 | } 87 | } 88 | if (finished) { 89 | close() 90 | } 91 | !finished 92 | } 93 | 94 | override def next(): Row = { 95 | if (hasNext) { 96 | gotNext = false 97 | relation.buildRow(projections, result, row) 98 | } else { 99 | null 100 | } 101 | } 102 | 103 | def close() = { 104 | try { 105 | scanner.close() 106 | relation.closeHTable() 107 | } catch { 108 | case e: Exception => logWarning("Exception in scanner.close", e) 109 | } 110 | } 111 | } 112 | if (otherFilter == null) { 113 | new InterruptibleIterator(context, iterator) 114 | } else { 115 | new InterruptibleIterator(context, iterator.filter(otherFilter)) 116 | } 117 | } 118 | 119 | /** 120 | * construct row key based on the critical point range information 121 | * @param cpr the critical point range 122 | * @param isStart the switch between start and end value 123 | * @return the encoded row key, or null if the value is None 124 | */ 125 | private def constructRowKey(cpr: MDCriticalPointRange[_], isStart: Boolean): HBaseRawType = { 126 | val prefix = cpr.prefix 127 | val head: Seq[(HBaseRawType, AtomicType)] = prefix.map { 128 | case (itemValue, itemType) => 129 | 
(DataTypeUtils.dataToBytes(itemValue, itemType), itemType) 130 | } 131 | 132 | val key = if (isStart) cpr.lastRange.start else cpr.lastRange.end 133 | val keyType = cpr.lastRange.dt 134 | val list = if (key.isDefined) { 135 | val tail: (HBaseRawType, AtomicType) = { 136 | (DataTypeUtils.dataToBytes(key.get, keyType), keyType) 137 | } 138 | head :+ tail 139 | } else { 140 | head 141 | } 142 | if (list.size == 0) { 143 | null 144 | } else { 145 | HBaseKVHelper.encodingRawKeyColumns(list) 146 | } 147 | } 148 | 149 | // For critical-point-based predicate pushdown 150 | // partial reduction for those partitions mapped to multiple critical point ranges, 151 | // as indicated by the keyPartialEvalIndex in the partition, where the original 152 | // filter predicate will be used 153 | override def compute(split: Partition, context: TaskContext): Iterator[Row] = { 154 | val partition = split.asInstanceOf[HBasePartition] 155 | val predicates = partition.computePredicate(relation) 156 | val expandedCPRs: Seq[MDCriticalPointRange[_]] = 157 | RangeCriticalPoint.generateCriticalPointRanges(relation, predicates). 158 | flatMap(_.flatten(new ArrayBuffer[(Any, AtomicType)](relation.dimSize))) 159 | 160 | if (expandedCPRs.isEmpty) { 161 | val (filters, otherFilters, pushdownPreds) = relation.buildPushdownFilterList(predicates) 162 | val pushablePreds = if (pushdownPreds.isDefined) { 163 | ListBuffer[Expression](pushdownPreds.get) 164 | } else { 165 | ListBuffer[Expression]() 166 | } 167 | val scan = relation.buildScan(partition.start, partition.end, filters, otherFilters, 168 | pushablePreds, output) 169 | val scanner = relation.htable.getScanner(scan) 170 | createIterator(context, scanner, otherFilters) 171 | } else { 172 | // expandedCPRs is not empty 173 | val isPointRanges = expandedCPRs.forall( 174 | p => p.lastRange.isPoint && p.prefix.size == relation.keyColumns.size - 1) 175 | if (isPointRanges) { 176 | // all of the last ranges are point range, build a list of get 177 | val gets: java.util.List[Get] = new java.util.ArrayList[Get]() 178 | 179 | val distinctProjectionList = output.distinct 180 | val nonKeyColumns = relation.nonKeyColumns.filter { 181 | case nkc => distinctProjectionList.exists(nkc.sqlName == _.name) 182 | } 183 | 184 | def generateGet(range: MDCriticalPointRange[_]): Get = { 185 | val rowKey = constructRowKey(range, isStart = true) 186 | val get = new Get(rowKey) 187 | for (nonKeyColumn <- nonKeyColumns) { 188 | get.addColumn(Bytes.toBytes(nonKeyColumn.family), Bytes.toBytes(nonKeyColumn.qualifier)) 189 | } 190 | get 191 | } 192 | val predForEachRange: Seq[Expression] = expandedCPRs.map(range => { 193 | gets.add(generateGet(range)) 194 | range.lastRange.pred 195 | }) 196 | val resultsWithPred = relation.htable.get(gets).zip(predForEachRange).filter(!_._1.isEmpty) 197 | 198 | def evalResultForBoundPredicate(input: Row, predicate: Expression): Boolean = { 199 | val boundPredicate = BindReferences.bindReference(predicate, output) 200 | boundPredicate.eval(input).asInstanceOf[Boolean] 201 | } 202 | val projections = output.zipWithIndex 203 | val resultRows: Seq[Row] = for { 204 | (result, predicate) <- resultsWithPred 205 | row = new GenericMutableRow(output.size) 206 | resultRow = relation.buildRow(projections, result, row) 207 | if predicate == null || evalResultForBoundPredicate(resultRow, predicate) 208 | } yield resultRow 209 | 210 | resultRows.toIterator 211 | } 212 | else { 213 | // isPointRanges is false 214 | // calculate the range start 215 | val startRowKey = 
constructRowKey(expandedCPRs(0), isStart = true) 216 | val start = if (startRowKey != null) { 217 | if (partition.start.isDefined && Bytes.compareTo(partition.start.get, startRowKey) > 0) { 218 | Some(partition.start.get) 219 | } else { 220 | Some(startRowKey) 221 | } 222 | } else { 223 | partition.start 224 | } 225 | 226 | // calculate the range end 227 | val size = expandedCPRs.size - 1 228 | val endKey: Option[Any] = expandedCPRs(size).lastRange.end 229 | val endInclusive: Boolean = expandedCPRs(size).lastRange.endInclusive 230 | val endRowKey = constructRowKey(expandedCPRs(size), isStart = false) 231 | val end = if (endRowKey != null) { 232 | val finalKey: HBaseRawType = { 233 | if (endInclusive || endKey.isEmpty) { 234 | BytesUtils.addOne(endRowKey) 235 | } else { 236 | endRowKey 237 | } 238 | } 239 | 240 | if (finalKey != null) { 241 | if (partition.end.isDefined && Bytes.compareTo(finalKey, partition.end.get) > 0) { 242 | Some(partition.end.get) 243 | } else { 244 | Some(finalKey) 245 | } 246 | } else { 247 | partition.end 248 | } 249 | } else { 250 | partition.end 251 | } 252 | 253 | 254 | val (filters, otherFilters, preds) = 255 | relation.buildCPRFilterList(output, filterPred, expandedCPRs) 256 | val scan = relation.buildScan(start, end, filters, otherFilters, preds, output) 257 | val scanner = relation.htable.getScanner(scan) 258 | createIterator(context, scanner, otherFilters) 259 | } 260 | } 261 | } 262 | } 263 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/HBaseShuffledRDD.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | import org.apache.spark._ 21 | import org.apache.spark.rdd.{RDD, ShuffledRDD, ShuffledRDDPartition} 22 | 23 | class HBaseShuffledRDD ( 24 | prevRdd: RDD[(HBaseRawType, Array[HBaseRawType])], 25 | part: Partitioner, 26 | @transient hbPartitions: Seq[HBasePartition] = Nil) extends ShuffledRDD(prevRdd, part){ 27 | 28 | override def getPartitions: Array[Partition] = { 29 | if (hbPartitions==null || hbPartitions.isEmpty) { 30 | Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i)) 31 | } else { 32 | // only to be invoked by clients 33 | hbPartitions.toArray 34 | } 35 | } 36 | 37 | override def getPreferredLocations(split: Partition): Seq[String] = { 38 | if (hbPartitions==null || hbPartitions.isEmpty) { 39 | Seq.empty 40 | } else { 41 | split.asInstanceOf[HBasePartition].server.map { 42 | identity[String] 43 | }.toSeq 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/HadoopReader.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | import org.apache.spark.SparkContext 21 | import org.apache.spark.sql.hbase.util.HBaseKVHelper 22 | import org.apache.spark.sql.types._ 23 | 24 | /** 25 | * Helper class for scanning files stored in Hadoop - e.g., to read text file when bulk loading. 26 | */ 27 | private[hbase] class HadoopReader( 28 | @transient sc: SparkContext, 29 | path: String, 30 | delimiter: Option[String])(baseRelation: HBaseRelation) { 31 | /** make RDD[(SparkImmutableBytesWritable, SparkKeyValue)] from text file. */ 32 | private[hbase] def makeBulkLoadRDDFromTextFile = { 33 | val rdd = sc.textFile(path) 34 | val splitRegex = delimiter.getOrElse(",") 35 | val relation = baseRelation 36 | 37 | rdd.mapPartitions { iter => 38 | val lineBuffer = HBaseKVHelper.createLineBuffer(relation.output) 39 | val keyBytes = new Array[(HBaseRawType, DataType)](relation.keyColumns.size) 40 | iter.flatMap { line => 41 | if (line == "") { 42 | None 43 | } else { 44 | // If the last column in the text file is null, the java parser will 45 | // return a String[] containing only the non-null text values. 46 | // In this case we need to append another element (null) to 47 | // the array returned by line.split(splitRegex). 
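/* Hedged illustration (hypothetical input, not in the original source): with the default ","
   delimiter and a table whose last column is null, a line such as "apple,1," splits into
   Array("apple", "1") because String.split drops trailing empty strings; the while loop below
   pads the array with "" until it has one entry per output column, so string2KV receives a
   value for every column. */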
48 | val valueBytes = new Array[HBaseRawType](relation.nonKeyColumns.size) 49 | var textValueArray = line.split(splitRegex) 50 | while (textValueArray.length < relation.output.length) { 51 | textValueArray = textValueArray :+ "" 52 | } 53 | HBaseKVHelper.string2KV(textValueArray, relation, lineBuffer, keyBytes, valueBytes) 54 | val rowKeyData = HBaseKVHelper.encodingRawKeyColumns(keyBytes) 55 | Seq((rowKeyData, valueBytes)) 56 | } 57 | } 58 | } 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/IndexMappable.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.spark.sql.hbase 18 | 19 | private[hbase] trait IndexMappable { 20 | def mappedIndex: Int 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/ScanPredClassifier.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | import org.apache.spark.sql.catalyst.expressions._ 21 | import org.apache.spark.sql.hbase.util.{BytesUtils, DataTypeUtils} 22 | 23 | /** 24 | * Classifies a predicate into a pair of (pushdownable, non-pushdownable) predicates 25 | * for a Scan; the logic relationship between the two components of the pair is AND 26 | */ 27 | class ScanPredClassifier(relation: HBaseRelation) { 28 | def apply(pred: Expression): (Option[Expression], Option[Expression]) = { 29 | // post-order bottom-up traversal 30 | pred match { 31 | case And(left, right) => 32 | val (ll, lr) = apply(left) 33 | val (rl, rr) = apply(right) 34 | (ll, lr, rl, rr) match { 35 | // All Nones 36 | case (None, None, None, None) => (None, None) 37 | // Three Nones 38 | case (None, None, None, _) => (None, rr) 39 | case (None, None, _, None) => (rl, None) 40 | case (None, _, None, None) => (None, lr) 41 | case (_, None, None, None) => (ll, None) 42 | // two Nones 43 | case (None, None, _, _) => (rl, rr) 44 | case (None, _, None, _) => (None, Some(And(lr.get, rr.get))) 45 | case (None, _, _, None) => (rl, lr) 46 | case (_, None, None, _) => (ll, rr) 47 | case (_, None, _, None) => (Some(And(ll.get, rl.get)), None) 48 | case (_, _, None, None) => (ll, lr) 49 | // One None 50 | case (None, _, _, _) => (rl, Some(And(lr.get, rr.get))) 51 | case (_, None, _, _) => (Some(And(ll.get, rl.get)), rr) 52 | case (_, _, None, _) => (ll, Some(And(lr.get, rr.get))) 53 | case (_, _, _, None) => (Some(And(ll.get, rl.get)), lr) 54 | // No nones 55 | case _ => (Some(And(ll.get, rl.get)), Some(And(lr.get, rr.get))) 56 | } 57 | case Or(left, right) => 58 | val (ll, lr) = apply(left) 59 | val (rl, rr) = apply(right) 60 | (ll, lr, rl, rr) match { 61 | // All Nones 62 | case (None, None, None, None) => (None, None) 63 | // Three Nones 64 | case (None, None, None, _) => (None, rr) 65 | case (None, None, _, None) => (rl, None) 66 | case (None, _, None, None) => (None, lr) 67 | case (_, None, None, None) => (ll, None) 68 | // two Nones 69 | case (None, None, _, _) => (rl, rr) 70 | case (None, _, None, _) => (None, Some(Or(lr.get, rr.get))) 71 | case (None, _, _, None) => (None, Some(Or(lr.get, rl.get))) 72 | case (_, None, None, _) => (None, Some(Or(ll.get, rr.get))) 73 | case (_, None, _, None) => (Some(Or(ll.get, rl.get)), None) 74 | case (_, _, None, None) => (ll, lr) 75 | // One None 76 | case (None, _, _, _) => (None, Some(pred)) 77 | // Accept increased evaluation complexity for improved pushed down 78 | case (_, None, _, _) => (Some(Or(ll.get, rl.get)), Some(Or(ll.get, rr.get))) 79 | case (_, _, None, _) => (None, Some(pred)) 80 | // Accept increased evaluation complexity for improved pushed down 81 | case (_, _, _, None) => (Some(Or(ll.get, rl.get)), Some(Or(lr.get, rl.get))) 82 | // No nones 83 | // Accept increased evaluation complexity for improved pushed down 84 | case _ => (Some(Or(ll.get, rl.get)), Some(And(Or(ll.get, rr.get), 85 | And(Or(lr.get, rl.get), Or(lr.get, rr.get))))) 86 | } 87 | case EqualTo(left, right) => classifyBinary(left, right, pred) 88 | case LessThan(left, right) => classifyBinary(left, right, pred) 89 | case LessThanOrEqual(left, right) => classifyBinary(left, right, pred) 90 | case GreaterThan(left, right) => classifyBinary(left, right, pred) 91 | case GreaterThanOrEqual(left, right) => classifyBinary(left, right, pred) 92 | case In(value@AttributeReference(_, _, _, _), list) => 93 | if (relation.isNonKey(value) && list.filter(!_.isInstanceOf[Literal]).isEmpty) { 94 
| (Some(pred), None) 95 | } else { 96 | (None, Some(pred)) 97 | } 98 | case InSet(value@AttributeReference(name, dataType, _, _), hset) 99 | if relation.nonKeyColumns.exists(_.sqlName == name) => 100 | var errorOccurred = false 101 | for (item <- hset if !errorOccurred) { 102 | try { 103 | /** 104 | * Use try-catch to make sure data type conversion is proper, for example, 105 | * Java throws casting exception while doing col2 in (1, 2, 3), if col2 data type 106 | * if ByteType and 1, 2, 3 is Integer. 107 | */ 108 | DataTypeUtils.getBinaryComparator(BytesUtils.create(dataType), Literal.create(item, dataType)) 109 | } catch { 110 | case e: Exception => errorOccurred = true 111 | } 112 | } 113 | if (errorOccurred) { 114 | (None, Some(pred)) 115 | } else { 116 | (Some(pred), None) 117 | } 118 | // everything else are treated as non pushdownable 119 | case _ => (None, Some(pred)) 120 | } 121 | } 122 | 123 | // returns true if the binary operator of the two args can be pushed down 124 | private def classifyBinary(left: Expression, right: Expression, pred: Expression) 125 | : (Option[Expression], Option[Expression]) = { 126 | (left, right) match { 127 | case (Literal(_, _), AttributeReference(_, _, _, _)) => 128 | if (relation.isNonKey(right.asInstanceOf[AttributeReference])) { 129 | (Some(pred), None) 130 | } else { 131 | (None, Some(pred)) 132 | } 133 | case (AttributeReference(_, _, _, _), Literal(_, _)) => 134 | if (relation.isNonKey(left.asInstanceOf[AttributeReference])) { 135 | (Some(pred), None) 136 | } else { 137 | (None, Some(pred)) 138 | } 139 | case _ => (None, Some(pred)) 140 | } 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/catalyst/NotPusher.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.hbase.catalyst 19 | 20 | import org.apache.spark.sql.catalyst.expressions._ 21 | import org.apache.spark.sql.catalyst.rules._ 22 | 23 | /** 24 | * Pushes NOT through And/Or 25 | */ 26 | object NotPusher extends Rule[Expression] { 27 | def apply(pred: Expression): Expression = pred transformDown { 28 | case Not(And(left, right)) => Or(Not(left), Not(right)) 29 | case Not(Or(left, right)) => And(Not(left), Not(right)) 30 | case not @ Not(exp) => 31 | // This pattern has been caught by optimizer but after NOT pushdown 32 | // more opportunities may present 33 | exp match { 34 | case GreaterThan(l, r) => LessThanOrEqual(l, r) 35 | case GreaterThanOrEqual(l, r) => LessThan(l, r) 36 | case LessThan(l, r) => GreaterThanOrEqual(l, r) 37 | case LessThanOrEqual(l, r) => GreaterThan(l, r) 38 | case Not(e) => e 39 | case _ => not 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/execution/HBaseSQLTableScan.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hbase.execution 19 | 20 | import org.apache.spark.annotation.DeveloperApi 21 | import org.apache.spark.rdd.RDD 22 | import org.apache.spark.sql.catalyst.expressions._ 23 | import org.apache.spark.sql.catalyst.plans.physical.RangePartitioning 24 | import org.apache.spark.sql.execution.LeafNode 25 | import org.apache.spark.sql.hbase._ 26 | 27 | /** 28 | * :: DeveloperApi :: 29 | * The HBase table scan operator. 30 | */ 31 | @DeveloperApi 32 | case class HBaseSQLTableScan( 33 | relation: HBaseRelation, 34 | output: Seq[Attribute], 35 | result: RDD[Row]) extends LeafNode { 36 | override def outputPartitioning = { 37 | var ordering = List[SortOrder]() 38 | for (key <- relation.partitionKeys) { 39 | ordering = ordering :+ SortOrder(key, Ascending) 40 | } 41 | RangePartitioning(ordering.toSeq, relation.partitions.size) 42 | } 43 | 44 | override protected def doExecute(): RDD[Row] = result 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/execution/HBaseStrategies.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hbase.execution 19 | 20 | import org.apache.hadoop.hbase.util.Bytes 21 | import org.apache.spark.rdd.RDD 22 | import org.apache.spark.sql.catalyst.expressions._ 23 | import org.apache.spark.sql.catalyst.planning.PhysicalOperation 24 | import org.apache.spark.sql.catalyst.plans.logical 25 | import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan 26 | import org.apache.spark.sql.execution.{Project, SparkPlan} 27 | import org.apache.spark.sql.hbase.{HBasePartition, HBaseRawType, HBaseRelation, KeyColumn} 28 | import org.apache.spark.sql.sources.LogicalRelation 29 | import org.apache.spark.sql.types._ 30 | import org.apache.spark.sql.{SQLContext, Strategy, execution} 31 | 32 | /** 33 | * Retrieves data using a HBaseTableScan. Partition pruning predicates are also detected and 34 | * applied. 35 | */ 36 | private[hbase] trait HBaseStrategies { 37 | self: SQLContext#SparkPlanner => 38 | 39 | private[hbase] object HBaseDataSource extends Strategy { 40 | 41 | def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { 42 | case logical.Aggregate(groupingExpressions, aggregateExpressions, child) 43 | if groupingExpressions.nonEmpty && 44 | canBeAggregatedForAll(groupingExpressions, aggregateExpressions, child) => 45 | val withCodeGen = canBeCodeGened(allAggregates(aggregateExpressions)) && codegenEnabled 46 | if (withCodeGen) execution.GeneratedAggregate( 47 | // In this case, 'partial = true' doesn't mean it is partial, actually, it is not. 48 | // We made it to true to avoid adding Exchange operation. 49 | partial = true, 50 | groupingExpressions, 51 | aggregateExpressions, 52 | true, 53 | planLater(child)) :: Nil 54 | else execution.Aggregate( 55 | // In this case, 'partial = true' doesn't mean it is partial, actually, it is not. 56 | // We made it to true to avoid adding Exchange operation. 57 | partial = true, 58 | groupingExpressions, 59 | aggregateExpressions, 60 | planLater(child)) :: Nil 61 | 62 | case PhysicalOperation(projectList, inPredicates, 63 | l@LogicalRelation(relation: HBaseRelation)) => 64 | pruneFilterProjectHBase( 65 | l, 66 | projectList, 67 | inPredicates, 68 | (a, f) => relation.buildScan(a, f)) :: Nil 69 | 70 | case _ => Nil 71 | } 72 | 73 | def canBeCodeGened(aggs: Seq[AggregateExpression]) = !aggs.exists { 74 | case _: Sum | _: Count | _: Max | _: CombineSetsAndCount => false 75 | // The generated set implementation is pretty limited ATM. 
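/* Hedged note (not in the original source): as written, canBeCodeGened returns true only when
   every aggregate is a Sum, Count, Max, CombineSetsAndCount, or a single-expression
   CollectHashSet over IntegerType/LongType; any other aggregate (e.g. Min or Average) hits the
   default case below and disables the GeneratedAggregate path above. */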
76 | case CollectHashSet(exprs) if exprs.size == 1 && 77 | Seq(IntegerType, LongType).contains(exprs.head.dataType) => false 78 | case _ => true 79 | } 80 | 81 | def allAggregates(exprs: Seq[Expression]) = 82 | exprs.flatMap(_.collect { case a: AggregateExpression => a}) 83 | 84 | /** 85 | * Determines whether the aggregation can be done for all rows directly or has to be done with partial aggregation 86 | */ 87 | protected def canBeAggregatedForAll(groupingExpressions: Seq[Expression], 88 | aggregateExpressions: Seq[NamedExpression], 89 | child: LogicalPlan): Boolean = { 90 | def findScanNode(physicalChild: SparkPlan): Option[HBaseSQLTableScan] = physicalChild match { 91 | case chd: HBaseSQLTableScan => Some(chd) 92 | case chd if chd.children.size != 1 => None 93 | case chd => findScanNode(chd.children(0)) 94 | } 95 | 96 | /** 97 | * @param headEnd the HBaseRawType for the end of the head partition 98 | * @param tailStart the HBaseRawType for the start of the tail partition 99 | * @param keysForGroup the remaining key dimensions for grouping 100 | * @return whether these two partitions are distinguishable in the given dimension 101 | */ 102 | def distinguishedForGroupKeys(headEnd: HBaseRawType, 103 | tailStart: HBaseRawType, 104 | keysForGroup: Seq[KeyColumn]): Boolean = { 105 | //Divide the raw key into two parts: one is the raw bytes for the current key dimension, 106 | //the other is the raw bytes for the remaining key dimensions 107 | def divideRawType(rawType: HBaseRawType, key: KeyColumn) 108 | : (HBaseRawType, HBaseRawType) = key.dataType match { 109 | case dt: StringType => rawType.splitAt(rawType.indexWhere(_ == 0x00) + 1) 110 | case dt if dt.defaultSize >= rawType.size => (rawType, Array()) 111 | case dt => rawType.splitAt(dt.defaultSize) 112 | } 113 | 114 | if (keysForGroup.isEmpty) true 115 | else { 116 | val (curKey, keysLeft) = (keysForGroup.head, keysForGroup.tail) 117 | val (headEndCurKey, headEndKeysLeft) = divideRawType(headEnd, curKey) 118 | val (tailStartCurKey, tailStartKeysLeft) = divideRawType(tailStart, curKey) 119 | 120 | if (headEndKeysLeft.isEmpty || tailStartKeysLeft.isEmpty) true 121 | else if (Bytes.compareTo(tailStartCurKey, headEndCurKey) != 0) true 122 | else if (keysLeft.nonEmpty) distinguishedForGroupKeys( 123 | headEndKeysLeft, tailStartKeysLeft, keysLeft) 124 | else if (headEndKeysLeft.forall(_ == 0x00) || tailStartCurKey.forall(_ == 0x00)) true 125 | else false 126 | } 127 | } 128 | 129 | val physicalChild = planLater(child) 130 | def aggrWithPartial = false 131 | def aggrForAll = true 132 | 133 | findScanNode(physicalChild) match { 134 | case None => aggrWithPartial 135 | case Some(scanNode: HBaseSQLTableScan) => 136 | val hbaseRelation = scanNode.relation 137 | 138 | //If there is only one partition in HBase, 139 | //we don't need to do the partial aggregation 140 | if (hbaseRelation.partitions.size == 1) aggrForAll 141 | else { 142 | val keysForGroup = hbaseRelation.keyColumns.takeWhile(key => 143 | groupingExpressions.exists { 144 | case expr: AttributeReference => expr.name == key.sqlName 145 | case _ => false 146 | }) 147 | 148 | //If some expressions in groupingExpressions are not key columns, 149 | //or some middle dimensions of the row key are missing, 150 | //we have to do it with partial aggregation. 151 | // 152 | //If the groupingExpressions are composed of all the key columns, 153 | //the data is grouped by the row key in all dimensions, 154 | //so we can do the aggregation for all rows directly.
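/* Hedged illustration (hypothetical column names, not in the original source): for a row key
   composed of (k1, k2, k3), GROUP BY k2 yields an empty keysForGroup prefix and falls back to
   partial aggregation; GROUP BY k1, k2, k3 covers the whole row key, so aggregation is done
   for all rows directly; GROUP BY k1, k2 matches a strict prefix, so the partition-boundary
   check below decides. */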
155 | if (keysForGroup.size != groupingExpressions.size) aggrWithPartial 156 | else if (keysForGroup.size == hbaseRelation.keyColumns.size) aggrForAll 157 | else { 158 | val partitionsAfterFilter = scanNode.result.partitions 159 | val eachPartitionApart = (0 to partitionsAfterFilter.size - 2).forall { case i => 160 | val headEnd = partitionsAfterFilter(i).asInstanceOf[HBasePartition] 161 | .end.get.asInstanceOf[HBaseRawType] 162 | val tailStart = partitionsAfterFilter(i + 1).asInstanceOf[HBasePartition] 163 | .start.get.asInstanceOf[HBaseRawType] 164 | //If any two adjacent partitions are not distinguishable from each other 165 | // for the given row key dimensions, we cannot do the aggregation for all rows. 166 | distinguishedForGroupKeys(headEnd, tailStart, keysForGroup) 167 | } 168 | if (eachPartitionApart) aggrForAll 169 | else aggrWithPartial 170 | } 171 | } 172 | } 173 | } 174 | 175 | // Based on Catalyst expressions. 176 | // Almost identical to pruneFilterProjectRaw 177 | protected def pruneFilterProjectHBase(relation: LogicalRelation, 178 | projectList: Seq[NamedExpression], 179 | filterPredicates: Seq[Expression], 180 | scanBuilder: 181 | (Seq[Attribute], Seq[Expression]) => RDD[Row]) = { 182 | 183 | val projectSet = AttributeSet(projectList.flatMap(_.references)) 184 | val filterSet = AttributeSet(filterPredicates.flatMap(_.references)) 185 | 186 | val pushedFilters = if (filterPredicates.nonEmpty) { 187 | Seq(filterPredicates.map { 188 | _ transform { 189 | // Match original case of attributes. 190 | case a: AttributeReference => relation.attributeMap(a) 191 | // We will do HBase-specific predicate pushdown so just use the original predicate here 192 | } 193 | }.reduceLeft(And)) 194 | } else { 195 | filterPredicates 196 | } 197 | 198 | val hbaseRelation = relation.relation.asInstanceOf[HBaseRelation] 199 | if (projectList.map(_.toAttribute) == projectList && 200 | projectSet.size == projectList.size && 201 | filterSet.subsetOf(projectSet)) { 202 | // When it is possible to just use column pruning to get the right projection and 203 | // when the columns of this projection are enough to evaluate all filter conditions, 204 | // just do a scan followed by a filter, with no extra project. 205 | val requestedColumns = 206 | projectList.asInstanceOf[Seq[Attribute]] // Safe due to if above. 207 | .map(relation.attributeMap) // Match original case of attributes. 208 | 209 | // We have to use an HBase-specific scanner here while maintaining as much compatibility 210 | // with the data source API as possible, primarily because 211 | // 1) We need to set up the outputPartitioning field to HBase-specific partitions 212 | // 2) Future use of HBase co-processor 213 | // 3) We will do partition-specific predicate pushdown 214 | // The above capabilities are currently absent from the PhysicalRDD class.
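/* Hedged illustration (hypothetical query, not in the original source): a query such as
   SELECT col1, col2 FROM t WHERE col1 > 0 takes this branch, since the projection is a list of
   distinct bare attributes that already covers every attribute referenced by the filter, so the
   HBaseSQLTableScan built below is returned without a wrapping Project node. */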
215 | 216 | HBaseSQLTableScan(hbaseRelation, projectList.map(_.toAttribute), 217 | scanBuilder(requestedColumns, pushedFilters)) 218 | } else { 219 | val requestedColumns = projectSet.map(relation.attributeMap).toSeq 220 | val scan = HBaseSQLTableScan(hbaseRelation, requestedColumns, 221 | scanBuilder(requestedColumns, pushedFilters)) 222 | Project(projectList, scan) 223 | } 224 | } 225 | } 226 | 227 | } 228 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.spark.sql 18 | 19 | package object hbase { 20 | type HBaseRawType = Array[Byte] 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/types/HBaseBytesType.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hbase.types 19 | 20 | import org.apache.spark.sql.types._ 21 | 22 | import scala.reflect.runtime.universe.typeTag 23 | 24 | /** 25 | * Almost identical to BinaryType except for a different ordering to be consistent 26 | * with that of HBase's internal ordering 27 | * This is a data type for Low-Level HBase entities. 
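 * (Hedged note, not in the original source: the ordering below compares bytes as unsigned
 * values, so e.g. Array(0x01) sorts before Array(0x80), matching HBase's Bytes.compareTo.)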
28 | * It should not be used in High-Level processing 29 | */ 30 | private[hbase] case object HBaseBytesType extends AtomicType /*with PrimitiveType*/ { 31 | override def defaultSize: Int = 4096 32 | private[sql] type InternalType = Array[Byte] 33 | // TODO: can not use ScalaReflectionLock now for its accessibility 34 | // @transient private[sql] lazy val tag = ScalaReflectionLock.synchronized { typeTag[JvmType] } 35 | @transient private[sql] lazy val tag = synchronized(typeTag[InternalType]) 36 | private[sql] val ordering = new Ordering[InternalType] { 37 | def compare(x: Array[Byte], y: Array[Byte]): Int = { 38 | for (i <- 0 until x.length; if i < y.length) { 39 | val a: Int = x(i) & 0xff 40 | val b: Int = y(i) & 0xff 41 | val res = a - b 42 | if (res != 0) return res 43 | } 44 | x.length - y.length 45 | } 46 | } 47 | 48 | private[spark] override def asNullable = this 49 | } 50 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/types/PartialOrderingDataType.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.spark.sql.hbase.types 18 | 19 | import org.apache.spark.sql.types._ 20 | 21 | import scala.reflect.runtime.universe.TypeTag 22 | 23 | abstract class PartialOrderingDataType extends DataType { 24 | private[sql] type JvmType 25 | def toPartiallyOrderingDataType(s: Any, dt: AtomicType): Any 26 | @transient private[sql] val tag: TypeTag[JvmType] 27 | private[sql] val partialOrdering: PartialOrdering[JvmType] 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/types/RangeType.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.spark.sql.hbase.types 18 | 19 | import java.sql.Timestamp 20 | 21 | import org.apache.spark.sql.catalyst.expressions.Expression 22 | import org.apache.spark.sql.types._ 23 | 24 | import scala.collection.immutable.HashMap 25 | import scala.language.implicitConversions 26 | import scala.math.PartialOrdering 27 | import scala.reflect.runtime.universe.typeTag 28 | 29 | class Range[T](val start: Option[T], // None for open ends 30 | val startInclusive: Boolean, 31 | val end: Option[T], // None for open ends 32 | val endInclusive: Boolean, 33 | val dt: AtomicType) extends Serializable { 34 | require(dt != null && !(start.isDefined && end.isDefined && 35 | ((dt.ordering.eq(start.get, end.get) && 36 | (!startInclusive || !endInclusive)) || 37 | dt.ordering.gt(start.get.asInstanceOf[dt.InternalType], end.get.asInstanceOf[dt.InternalType]))), 38 | "Inappropriate range parameters") 39 | @transient lazy val isPoint: Boolean = start.isDefined && end.isDefined && 40 | startInclusive && endInclusive && start.get.equals(end.get) 41 | } 42 | 43 | /** 44 | * HBase partition range 45 | * @param start start position 46 | * @param startInclusive whether the start position is inclusive or not 47 | * @param end end position 48 | * @param endInclusive whether the end position is inclusive or not 49 | * @param id the partition id 50 | * @param dt the data type 51 | * @param pred the associated predicate 52 | * @tparam T template of the type 53 | */ 54 | class PartitionRange[T](start: Option[T], startInclusive: Boolean, 55 | end: Option[T], endInclusive: Boolean, 56 | val id: Int, dt: AtomicType, var pred: Expression) 57 | extends Range[T](start, startInclusive, end, endInclusive, dt) 58 | 59 | private[hbase] class RangeType[T] extends PartialOrderingDataType { 60 | override def defaultSize: Int = 4096 61 | private[sql] type JvmType = Range[T] 62 | // TODO: can not use ScalaReflectionLock now for its accessibility 63 | // @transient private[sql] lazy val tag = ScalaReflectionLock.synchronized { typeTag[JvmType] } 64 | @transient private[sql] lazy val tag = synchronized(typeTag[JvmType]) 65 | 66 | private[spark] override def asNullable: RangeType[T] = this 67 | 68 | def toPartiallyOrderingDataType(s: Any, dt: AtomicType): Any = s match { 69 | case b: Boolean => new Range[Boolean](Some(b), true, Some(b), true, BooleanType) 70 | case b: Byte => new Range[Byte](Some(b), true, Some(b), true, ByteType) 71 | case d: Double => new Range[Double](Some(d), true, Some(d), true, DoubleType) 72 | case f: Float => new Range[Float](Some(f), true, Some(f), true, FloatType) 73 | case i: Int => new Range[Int](Some(i), true, Some(i), true, IntegerType) 74 | case l: Long => new Range[Long](Some(l), true, Some(l), true, LongType) 75 | case s: Short => new Range[Short](Some(s), true, Some(s), true, ShortType) 76 | case s: String => new Range[String](Some(s), true, Some(s), true, StringType) 77 | case t: Timestamp => new Range[Timestamp](Some(t), true, Some(t), true, TimestampType) 78 | case _ => s 79 | } 80 | 81 | val partialOrdering = new PartialOrdering[JvmType] { 82 | // Right now we just support comparisons between a range and a point 83 | // In the future when more generic range comparisons, these two methods 84 | // must be functional as expected 85 | // return -2 if a < b; -1 if a <= b; 0 if a = b; 1 if a >= b; 2 if a > b 86 | def tryCompare(a: JvmType, b: JvmType): Option[Int] = { 87 | val aRange = a.asInstanceOf[Range[T]] 88 | val aStartInclusive = aRange.startInclusive 89 | val aStart = 
aRange.start.getOrElse(null).asInstanceOf[aRange.dt.InternalType] 90 | val aEnd = aRange.end.getOrElse(null).asInstanceOf[aRange.dt.InternalType] 91 | val aEndInclusive = aRange.endInclusive 92 | val bRange = b.asInstanceOf[Range[T]] 93 | val bStart = bRange.start.getOrElse(null).asInstanceOf[aRange.dt.InternalType] 94 | val bEnd = bRange.end.getOrElse(null).asInstanceOf[aRange.dt.InternalType] 95 | val bStartInclusive = bRange.startInclusive 96 | val bEndInclusive = bRange.endInclusive 97 | 98 | // return 1 iff aStart > bEnd 99 | // return 1 iff aStart = bEnd, aStartInclusive & bEndInclusive are not true at same position 100 | if ((aStart != null && bEnd != null) 101 | && (aRange.dt.ordering.gt(aStart, bEnd) 102 | || (aRange.dt.ordering.equiv(aStart, bEnd) && !(aStartInclusive && bEndInclusive)))) { 103 | Some(2) 104 | } // Vice versa 105 | else if ((bStart != null && aEnd != null) 106 | && (aRange.dt.ordering.gt(bStart, aEnd) 107 | || (aRange.dt.ordering.equiv(bStart, aEnd) && !(bStartInclusive && aEndInclusive)))) { 108 | Some(-2) 109 | } else if (aStart != null && aEnd != null && bStart != null && bEnd != null && 110 | aRange.dt.ordering.equiv(bStart, aEnd) 111 | && aRange.dt.ordering.equiv(aStart, aEnd) 112 | && aRange.dt.ordering.equiv(bStart, bEnd) 113 | && (aStartInclusive && aEndInclusive && bStartInclusive && bEndInclusive)) { 114 | Some(0) 115 | } else if (aEnd != null && bStart != null && aRange.dt.ordering.equiv(aEnd, bStart) 116 | && aEndInclusive && bStartInclusive) { 117 | Some(-1) 118 | } else if (aStart != null && bEnd != null && aRange.dt.ordering.equiv(aStart, bEnd) 119 | && aStartInclusive && bEndInclusive) { 120 | Some(1) 121 | } else { 122 | None 123 | } 124 | } 125 | 126 | def lteq(a: JvmType, b: JvmType): Boolean = { 127 | // [(aStart, aEnd)] and [(bStart, bEnd)] 128 | // [( and )] mean the possibilities of the inclusive and exclusive condition 129 | val aRange = a.asInstanceOf[Range[T]] 130 | val aStartInclusive = aRange.startInclusive 131 | val aEnd = if (aRange.end.isEmpty) null else aRange.end.get 132 | val aEndInclusive = aRange.endInclusive 133 | val bRange = b.asInstanceOf[Range[T]] 134 | val bStart = if (bRange.start.isEmpty) null else bRange.start.get 135 | val bStartInclusive = bRange.startInclusive 136 | val bEndInclusive = bRange.endInclusive 137 | 138 | // Compare two ranges, return true iff the upper bound of the lower range is lteq to 139 | // the lower bound of the upper range. Because the exclusive boundary could be null, which 140 | // means the boundary could be infinity, we need to further check this conditions. 
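/* Hedged illustration (not in the original source): for inclusive integer ranges a = [1, 3] and
   b = [5, 7], aEnd = 3 <= bStart = 5, so lteq(a, b) is true; if the relevant bound is open,
   e.g. a has no end (aEnd is null with an exclusive end), the null checks in the match below
   return false because the ordering cannot be established. */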
141 | val result = 142 | (aStartInclusive, aEndInclusive, bStartInclusive, bEndInclusive) match { 143 | // [(aStart, aEnd] compare to [bStart, bEnd)] 144 | case (_, true, true, _) => 145 | if (aRange.dt.ordering.lteq(aEnd.asInstanceOf[aRange.dt.InternalType], 146 | bStart.asInstanceOf[aRange.dt.InternalType])) { 147 | true 148 | } else { 149 | false 150 | } 151 | // [(aStart, aEnd] compare to (bStart, bEnd)] 152 | case (_, true, false, _) => 153 | if (bStart != null && aRange.dt.ordering.lteq(aEnd.asInstanceOf[aRange.dt.InternalType], 154 | bStart.asInstanceOf[aRange.dt.InternalType])) { 155 | true 156 | } else { 157 | false 158 | } 159 | // [(aStart, aEnd) compare to [bStart, bEnd)] 160 | case (_, false, true, _) => 161 | if (aEnd != null && aRange.dt.ordering.lteq(aEnd.asInstanceOf[aRange.dt.InternalType], 162 | bStart.asInstanceOf[aRange.dt.InternalType])) { 163 | true 164 | } else { 165 | false 166 | } 167 | // [(aStart, aEnd) compare to (bStart, bEnd)] 168 | case (_, false, false, _) => 169 | if (aEnd != null && bStart != null && 170 | aRange.dt.ordering.lteq(aEnd.asInstanceOf[aRange.dt.InternalType], 171 | bStart.asInstanceOf[aRange.dt.InternalType])) { 172 | true 173 | } else { 174 | false 175 | } 176 | } 177 | 178 | result 179 | } 180 | } 181 | } 182 | 183 | object RangeType { 184 | 185 | object BooleanRangeType extends RangeType[Boolean] 186 | 187 | object ByteRangeType extends RangeType[Byte] 188 | 189 | object DecimalRangeType extends RangeType[BigDecimal] 190 | 191 | object DoubleRangeType extends RangeType[Double] 192 | 193 | object FloatRangeType extends RangeType[Float] 194 | 195 | object IntegerRangeType extends RangeType[Int] 196 | 197 | object LongRangeType extends RangeType[Long] 198 | 199 | object ShortRangeType extends RangeType[Short] 200 | 201 | object StringRangeType extends RangeType[String] 202 | 203 | object TimestampRangeType extends RangeType[Timestamp] 204 | 205 | val primitiveToPODataTypeMap: HashMap[AtomicType, PartialOrderingDataType] = 206 | HashMap( 207 | BooleanType -> BooleanRangeType, 208 | ByteType -> ByteRangeType, 209 | DoubleType -> DoubleRangeType, 210 | FloatType -> FloatRangeType, 211 | IntegerType -> IntegerRangeType, 212 | LongType -> LongRangeType, 213 | ShortType -> ShortRangeType, 214 | StringType -> StringRangeType, 215 | TimestampType -> TimestampRangeType 216 | ) 217 | } 218 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/util/BytesUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.spark.sql.hbase.util 18 | 19 | import org.apache.hadoop.hbase.util.Bytes 20 | import org.apache.spark.sql.types._ 21 | import org.apache.spark.sql.hbase._ 22 | 23 | object BytesUtils { 24 | def create(dataType: DataType): BytesUtils = { 25 | dataType match { 26 | case BooleanType => new BytesUtils(new HBaseRawType(Bytes.SIZEOF_BOOLEAN), BooleanType) 27 | case ByteType => new BytesUtils(new HBaseRawType(Bytes.SIZEOF_BYTE), ByteType) 28 | case DoubleType => new BytesUtils(new HBaseRawType(Bytes.SIZEOF_DOUBLE), DoubleType) 29 | case FloatType => new BytesUtils(new HBaseRawType(Bytes.SIZEOF_FLOAT), FloatType) 30 | case IntegerType => new BytesUtils(new HBaseRawType(Bytes.SIZEOF_INT), IntegerType) 31 | case LongType => new BytesUtils(new HBaseRawType(Bytes.SIZEOF_LONG), LongType) 32 | case ShortType => new BytesUtils(new HBaseRawType(Bytes.SIZEOF_SHORT), ShortType) 33 | case StringType => new BytesUtils(null, StringType) 34 | } 35 | } 36 | 37 | def toString(input: HBaseRawType, offset: Int, length: Int): String = { 38 | Bytes.toString(input, offset, length) 39 | } 40 | 41 | def toByte(input: HBaseRawType, offset: Int): Byte = { 42 | // Flip sign bit back 43 | val v: Int = input(offset) ^ 0x80 44 | v.asInstanceOf[Byte] 45 | } 46 | 47 | def toBoolean(input: HBaseRawType, offset: Int): Boolean = { 48 | input(offset) != 0 49 | } 50 | 51 | def toDouble(input: HBaseRawType, offset: Int): Double = { 52 | var l: Long = Bytes.toLong(input, offset, Bytes.SIZEOF_DOUBLE) 53 | l = l - 1 54 | l ^= (~l >> java.lang.Long.SIZE - 1) | java.lang.Long.MIN_VALUE 55 | java.lang.Double.longBitsToDouble(l) 56 | } 57 | 58 | def toShort(input: HBaseRawType, offset: Int): Short = { 59 | // flip sign bit back 60 | var v: Int = input(offset) ^ 0x80 61 | v = (v << 8) + (input(1 + offset) & 0xff) 62 | v.asInstanceOf[Short] 63 | } 64 | 65 | def toFloat(input: HBaseRawType, offset: Int): Float = { 66 | var i = Bytes.toInt(input, offset) 67 | i = i - 1 68 | i ^= (~i >> Integer.SIZE - 1) | Integer.MIN_VALUE 69 | java.lang.Float.intBitsToFloat(i) 70 | } 71 | 72 | def toInt(input: HBaseRawType, offset: Int): Int = { 73 | // Flip sign bit back 74 | var v: Int = input(offset) ^ 0x80 75 | for (i <- 1 to Bytes.SIZEOF_INT - 1) { 76 | v = (v << 8) + (input(i + offset) & 0xff) 77 | } 78 | v 79 | } 80 | 81 | def toLong(input: HBaseRawType, offset: Int): Long = { 82 | // Flip sign bit back 83 | var v: Long = input(offset) ^ 0x80 84 | for (i <- 1 to Bytes.SIZEOF_LONG - 1) { 85 | v = (v << 8) + (input(i + offset) & 0xff) 86 | } 87 | v 88 | } 89 | 90 | /** 91 | * add one to the unsigned byte array 92 | * @param input the unsigned byte array 93 | * @return null if the byte array is all 0xff, otherwise increase by 1 94 | */ 95 | def addOne(input: HBaseRawType): HBaseRawType = { 96 | val len = input.length 97 | val result = new HBaseRawType(len) 98 | Array.copy(input, 0, result, 0, len) 99 | var setValue = false 100 | for (index <- len - 1 to 0 by -1 if !setValue) { 101 | val item: Byte = input(index) 102 | if (item != 0xff.toByte) { 103 | setValue = true 104 | if ((item & 0x01.toByte) == 0.toByte) { 105 | result(index) = (item ^ 0x01.toByte).toByte 106 | } else if ((item & 0x02.toByte) == 0.toByte) { 107 | result(index) = (item ^ 0x03.toByte).toByte 108 | } else if ((item & 0x04.toByte) == 0.toByte) { 109 | result(index) = (item ^ 0x07.toByte).toByte 110 | } else if ((item & 0x08.toByte) == 0.toByte) { 111 | result(index) = (item ^ 0x0f.toByte).toByte 112 | } else if ((item & 0x10.toByte) == 0.toByte) { 113 | 
result(index) = (item ^ 0x1f.toByte).toByte 114 | } else if ((item & 0x20.toByte) == 0.toByte) { 115 | result(index) = (item ^ 0x3f.toByte).toByte 116 | } else if ((item & 0x40.toByte) == 0.toByte) { 117 | result(index) = (item ^ 0x7f.toByte).toByte 118 | } else { 119 | result(index) = (item ^ 0xff.toByte).toByte 120 | } 121 | // after increment, set remaining bytes to zero 122 | for (rest <- index + 1 until len) { 123 | result(rest) = 0x00.toByte 124 | } 125 | } 126 | } 127 | if (!setValue) { 128 | null 129 | } else { 130 | result 131 | } 132 | } 133 | } 134 | 135 | class BytesUtils(var buffer: HBaseRawType, dt: DataType) { 136 | val dataType = dt 137 | 138 | def toBytes(input: String): HBaseRawType = { 139 | buffer = Bytes.toBytes(input) 140 | buffer 141 | } 142 | 143 | def toBytes(input: Byte): HBaseRawType = { 144 | // Flip sign bit so that Byte is binary comparable 145 | buffer(0) = (input ^ 0x80).asInstanceOf[Byte] 146 | buffer 147 | } 148 | 149 | def toBytes(input: Boolean): HBaseRawType = { 150 | if (input) { 151 | buffer(0) = (-1).asInstanceOf[Byte] 152 | } else { 153 | buffer(0) = 0.asInstanceOf[Byte] 154 | } 155 | buffer 156 | } 157 | 158 | def toBytes(input: Double): HBaseRawType = { 159 | var l: Long = java.lang.Double.doubleToLongBits(input) 160 | l = (l ^ ((l >> java.lang.Long.SIZE - 1) | java.lang.Long.MIN_VALUE)) + 1 161 | Bytes.putLong(buffer, 0, l) 162 | buffer 163 | } 164 | 165 | def toBytes(input: Short): HBaseRawType = { 166 | buffer(0) = ((input >> 8) ^ 0x80).asInstanceOf[Byte] 167 | buffer(1) = input.asInstanceOf[Byte] 168 | buffer 169 | } 170 | 171 | def toBytes(input: Float): HBaseRawType = { 172 | var i: Int = java.lang.Float.floatToIntBits(input) 173 | i = (i ^ ((i >> Integer.SIZE - 1) | Integer.MIN_VALUE)) + 1 174 | Bytes.putInt(buffer, 0, i) 175 | buffer 176 | } 177 | 178 | def toBytes(input: Int): HBaseRawType = { 179 | // Flip sign bit so that INTEGER is binary comparable 180 | buffer(0) = ((input >> 24) ^ 0x80).asInstanceOf[Byte] 181 | buffer(1) = (input >> 16).asInstanceOf[Byte] 182 | buffer(2) = (input >> 8).asInstanceOf[Byte] 183 | buffer(3) = input.asInstanceOf[Byte] 184 | buffer 185 | } 186 | 187 | def toBytes(input: Long): HBaseRawType = { 188 | buffer(0) = ((input >> 56) ^ 0x80).asInstanceOf[Byte] 189 | buffer(1) = (input >> 48).asInstanceOf[Byte] 190 | buffer(2) = (input >> 40).asInstanceOf[Byte] 191 | buffer(3) = (input >> 32).asInstanceOf[Byte] 192 | buffer(4) = (input >> 24).asInstanceOf[Byte] 193 | buffer(5) = (input >> 16).asInstanceOf[Byte] 194 | buffer(6) = (input >> 8).asInstanceOf[Byte] 195 | buffer(7) = input.asInstanceOf[Byte] 196 | buffer 197 | } 198 | } 199 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/util/DataTypeUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.spark.sql.hbase.util 18 | 19 | import org.apache.hadoop.hbase.filter.BinaryComparator 20 | import org.apache.spark.sql.catalyst.expressions.{Literal, MutableRow, Row} 21 | import org.apache.spark.sql.types._ 22 | import org.apache.spark.sql.hbase._ 23 | 24 | /** 25 | * Data Type conversion utilities 26 | */ 27 | object DataTypeUtils { 28 | /** 29 | * convert the byte array to data 30 | * @param src the input byte array 31 | * @param offset the offset in the byte array 32 | * @param length the length of the data, only used by StringType 33 | * @param dt the data type 34 | * @return the actual data converted from byte array 35 | */ 36 | def bytesToData(src: HBaseRawType, offset: Int, length: Int, dt: DataType): Any = { 37 | dt match { 38 | case BooleanType => BytesUtils.toBoolean(src, offset) 39 | case ByteType => src(offset) 40 | case DoubleType => BytesUtils.toDouble(src, offset) 41 | case FloatType => BytesUtils.toFloat(src, offset) 42 | case IntegerType => BytesUtils.toInt(src, offset) 43 | case LongType => BytesUtils.toLong(src, offset) 44 | case ShortType => BytesUtils.toShort(src, offset) 45 | case StringType => BytesUtils.toString(src, offset, length) 46 | case _ => throw new Exception("Unsupported HBase SQL Data Type") 47 | } 48 | } 49 | 50 | /** 51 | * convert data to byte array 52 | * @param src the input data 53 | * @param dt the data type 54 | * @return the output byte array 55 | */ 56 | def dataToBytes(src: Any, 57 | dt: DataType): HBaseRawType = { 58 | // TODO: avoid new instance per invocation 59 | val bu = BytesUtils.create(dt) 60 | dt match { 61 | case BooleanType => bu.toBytes(src.asInstanceOf[Boolean]) 62 | case ByteType => bu.toBytes(src.asInstanceOf[Byte]) 63 | case DoubleType => bu.toBytes(src.asInstanceOf[Double]) 64 | case FloatType => bu.toBytes(src.asInstanceOf[Float]) 65 | case IntegerType => bu.toBytes(src.asInstanceOf[Int]) 66 | case LongType => bu.toBytes(src.asInstanceOf[Long]) 67 | case ShortType => bu.toBytes(src.asInstanceOf[Short]) 68 | case StringType => bu.toBytes(src.asInstanceOf[String]) 69 | case _ => throw new Exception("Unsupported HBase SQL Data Type") 70 | } 71 | } 72 | 73 | /** 74 | * set the row data from byte array 75 | * @param row the row to be set 76 | * @param index the index in the row 77 | * @param src the input byte array 78 | * @param offset the offset in the byte array 79 | * @param length the length of the data, only used by StringType 80 | * @param dt the data type 81 | */ 82 | def setRowColumnFromHBaseRawType(row: MutableRow, 83 | index: Int, 84 | src: HBaseRawType, 85 | offset: Int, 86 | length: => Int, 87 | dt: DataType): Unit = { 88 | if (src == null || src.isEmpty) { 89 | row.setNullAt(index) 90 | return 91 | } 92 | dt match { 93 | case BooleanType => row.setBoolean(index, BytesUtils.toBoolean(src, offset)) 94 | case ByteType => row.setByte(index, BytesUtils.toByte(src, offset)) 95 | case DoubleType => row.setDouble(index, BytesUtils.toDouble(src, offset)) 96 | case FloatType => row.setFloat(index, BytesUtils.toFloat(src, offset)) 97 | case IntegerType => 
row.setInt(index, BytesUtils.toInt(src, offset)) 98 | case LongType => row.setLong(index, BytesUtils.toLong(src, offset)) 99 | case ShortType => row.setShort(index, BytesUtils.toShort(src, offset)) 100 | case StringType => row.setString(index, BytesUtils.toString(src, offset, length)) 101 | case _ => throw new Exception("Unsupported HBase SQL Data Type") 102 | } 103 | } 104 | 105 | def string2TypeData(v: String, dt: DataType): Any = { 106 | v match { 107 | case null => null 108 | case _ => 109 | dt match { 110 | // TODO: handle some complex types 111 | case BooleanType => v.toBoolean 112 | case ByteType => v.getBytes()(0) 113 | case DoubleType => v.toDouble 114 | case FloatType => v.toFloat 115 | case IntegerType => v.toInt 116 | case LongType => v.toLong 117 | case ShortType => v.toShort 118 | case StringType => v 119 | } 120 | } 121 | } 122 | 123 | /** 124 | * get the data from row based on index 125 | * @param row the input row 126 | * @param index the index of the data 127 | * @param dt the data type 128 | * @return the data from the row based on index 129 | */ 130 | def getRowColumnInHBaseRawType(row: Row, index: Int, dt: DataType): HBaseRawType = { 131 | if (row.isNullAt(index)) return new Array[Byte](0) 132 | 133 | val bu = BytesUtils.create(dt) 134 | dt match { 135 | case BooleanType => bu.toBytes(row.getBoolean(index)) 136 | case ByteType => bu.toBytes(row.getByte(index)) 137 | case DoubleType => bu.toBytes(row.getDouble(index)) 138 | case FloatType => bu.toBytes(row.getFloat(index)) 139 | case IntegerType => bu.toBytes(row.getInt(index)) 140 | case LongType => bu.toBytes(row.getLong(index)) 141 | case ShortType => bu.toBytes(row.getShort(index)) 142 | case StringType => bu.toBytes(row.getString(index)) 143 | case _ => throw new Exception("Unsupported HBase SQL Data Type") 144 | } 145 | } 146 | 147 | /** 148 | * create binary comparator for the input expression 149 | * @param bu the byte utility 150 | * @param expression the input expression 151 | * @return the constructed binary comparator 152 | */ 153 | def getBinaryComparator(bu: BytesUtils, expression: Literal): BinaryComparator = { 154 | expression.dataType match { 155 | case BooleanType => new BinaryComparator(bu.toBytes(expression.value.asInstanceOf[Boolean])) 156 | case ByteType => new BinaryComparator(bu.toBytes(expression.value.asInstanceOf[Byte])) 157 | case DoubleType => new BinaryComparator(bu.toBytes(expression.value.asInstanceOf[Double])) 158 | case FloatType => new BinaryComparator(bu.toBytes(expression.value.asInstanceOf[Float])) 159 | case IntegerType => new BinaryComparator(bu.toBytes(expression.value.asInstanceOf[Int])) 160 | case LongType => new BinaryComparator(bu.toBytes(expression.value.asInstanceOf[Long])) 161 | case ShortType => new BinaryComparator(bu.toBytes(expression.value.asInstanceOf[Short])) 162 | case StringType => new BinaryComparator(bu.toBytes(expression.value.asInstanceOf[String])) 163 | case _ => throw new Exception("Cannot convert the data type using BinaryComparator") 164 | } 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/util/HBaseKVHelper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hbase.util 19 | 20 | import org.apache.spark.sql.catalyst.expressions.{Attribute, Row} 21 | import org.apache.spark.sql.hbase._ 22 | import org.apache.spark.sql.types._ 23 | 24 | import scala.collection.mutable.ArrayBuffer 25 | 26 | object HBaseKVHelper { 27 | private val delimiter: Byte = 0 28 | 29 | /** 30 | * create row key based on key columns information 31 | * for strings, it will add '0x00' as its delimiter 32 | * @param rawKeyColumns sequence of byte array and data type representing the key columns 33 | * @return array of bytes 34 | */ 35 | def encodingRawKeyColumns(rawKeyColumns: Seq[(HBaseRawType, DataType)]): HBaseRawType = { 36 | val length = rawKeyColumns.foldLeft(0)((b, a) => { 37 | val len = b + a._1.length 38 | if (a._2 == StringType) len + 1 else len 39 | }) 40 | val result = new HBaseRawType(length) 41 | var index = 0 42 | for (rawKeyColumn <- rawKeyColumns) { 43 | Array.copy(rawKeyColumn._1, 0, result, index, rawKeyColumn._1.length) 44 | index += rawKeyColumn._1.length 45 | if (rawKeyColumn._2 == StringType) { 46 | result(index) = delimiter 47 | index += 1 48 | } 49 | } 50 | result 51 | } 52 | 53 | /** 54 | * generate the sequence information of key columns from the byte array 55 | * @param rowKey array of bytes 56 | * @param keyColumns the sequence of key columns 57 | * @return sequence of information in (offset, length) tuple 58 | */ 59 | def decodingRawKeyColumns(rowKey: HBaseRawType, keyColumns: Seq[KeyColumn]): Seq[(Int, Int)] = { 60 | var index = 0 61 | keyColumns.map { 62 | case c => 63 | if (index >= rowKey.length) (-1, -1) 64 | else { 65 | val offset = index 66 | if (c.dataType == StringType) { 67 | val pos = rowKey.indexOf(delimiter, index) 68 | index = pos + 1 69 | (offset, pos - offset) 70 | } else { 71 | val length = c.dataType.asInstanceOf[AtomicType].defaultSize 72 | index += length 73 | (offset, length) 74 | } 75 | } 76 | } 77 | } 78 | 79 | /** 80 | * Takes a record, translate it into HBase row key column and value by matching with metadata 81 | * @param values record that as a sequence of string 82 | * @param relation HBaseRelation 83 | * @param keyBytes output parameter, array of (key column and its type); 84 | * @param valueBytes array of (column family, column qualifier, value) 85 | */ 86 | def string2KV(values: Seq[String], 87 | relation: HBaseRelation, 88 | lineBuffer: Array[BytesUtils], 89 | keyBytes: Array[(Array[Byte], DataType)], 90 | valueBytes: Array[HBaseRawType]) = { 91 | assert(values.length == relation.output.length, 92 | s"values length ${values.length} not equals columns length ${relation.output.length}") 93 | 94 | relation.keyColumns.foreach(kc => { 95 | val ordinal = kc.ordinal 96 | keyBytes(kc.order) = (string2Bytes(values(ordinal), lineBuffer(ordinal)), 97 | relation.output(ordinal).dataType) 98 | }) 99 | for (i <- 0 until relation.nonKeyColumns.size) { 100 | val nkc = 
relation.nonKeyColumns(i) 101 | val bytes = { 102 | // we should not use the same buffer in bulk-loading otherwise it will lead to corrupted 103 | lineBuffer(nkc.ordinal) = BytesUtils.create(lineBuffer(nkc.ordinal).dataType) 104 | string2Bytes(values(nkc.ordinal), lineBuffer(nkc.ordinal)) 105 | } 106 | valueBytes(i) = bytes 107 | } 108 | } 109 | 110 | private def string2Bytes(v: String, bu: BytesUtils): Array[Byte] = { 111 | v match { 112 | case "" => new Array[Byte](0) 113 | case null => new Array[Byte](0) 114 | case _ => 115 | bu.dataType match { 116 | // todo: handle some complex types 117 | case BooleanType => bu.toBytes(v.toBoolean) 118 | case ByteType => bu.toBytes(v) 119 | case DoubleType => bu.toBytes(v.toDouble) 120 | case FloatType => bu.toBytes(v.toFloat) 121 | case IntegerType => bu.toBytes(v.toInt) 122 | case LongType => bu.toBytes(v.toLong) 123 | case ShortType => bu.toBytes(v.toShort) 124 | case StringType => bu.toBytes(v) 125 | } 126 | } 127 | } 128 | 129 | /** 130 | * create a array of buffer that to be used for creating HBase Put object 131 | * @param schema the schema of the line buffer 132 | * @return 133 | */ 134 | private[hbase] def createLineBuffer(schema: Seq[Attribute]): Array[BytesUtils] = { 135 | val buffer = ArrayBuffer[BytesUtils]() 136 | schema.foreach { x => 137 | buffer.append(BytesUtils.create(x.dataType)) 138 | } 139 | buffer.toArray 140 | } 141 | 142 | /** 143 | * create a row key 144 | * @param row the generic row 145 | * @param dataTypeOfKeys sequence of data type 146 | * @return the row key 147 | */ 148 | def makeRowKey(row: Row, dataTypeOfKeys: Seq[DataType]): HBaseRawType = { 149 | val rawKeyCol = dataTypeOfKeys.zipWithIndex.map { 150 | case (dataType, index) => 151 | (DataTypeUtils.getRowColumnInHBaseRawType(row, index, dataType), dataType) 152 | } 153 | 154 | encodingRawKeyColumns(rawKeyCol) 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/util/Util.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
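// Illustrative sketch, not part of the original sources: the composite row-key round trip
// implemented by HBaseKVHelper above. KeyColumn(name, dataType, order) is used here with the
// same shape as in CatalogTestSuite; the column names are hypothetical.
import org.apache.spark.sql.types.{IntegerType, StringType}
import org.apache.spark.sql.hbase.KeyColumn
import org.apache.spark.sql.hbase.util.{DataTypeUtils, HBaseKVHelper}

object HBaseKVHelperSketch extends App {
  // build a composite row key; the string component is terminated by the 0x00 delimiter
  val rowKey = HBaseKVHelper.encodingRawKeyColumns(Seq(
    (DataTypeUtils.dataToBytes("Row1", StringType), StringType),
    (DataTypeUtils.dataToBytes(12345, IntegerType), IntegerType)))

  // recover the (offset, length) slice of each key column from the composite key
  val slices = HBaseKVHelper.decodingRawKeyColumns(rowKey,
    Seq(KeyColumn("strcol", StringType, 0), KeyColumn("intcol", IntegerType, 1)))
  println(slices)   // expected to be Seq((0, 4), (5, 4)) for "Row1" followed by an Int
}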
16 | */ 17 | 18 | package org.apache.spark.sql.hbase.util 19 | 20 | import java.io._ 21 | import java.util.concurrent.atomic.AtomicInteger 22 | import java.util.zip.{DeflaterOutputStream, InflaterInputStream} 23 | 24 | import org.apache.hadoop.conf.Configuration 25 | import org.apache.hadoop.fs.{FileSystem, Path} 26 | import org.apache.hadoop.hbase.HBaseConfiguration 27 | 28 | object Util { 29 | val iteration = new AtomicInteger(0) 30 | 31 | def getTempFilePath(conf: Configuration, prefix: String): String = { 32 | val fileSystem = FileSystem.get(conf) 33 | val path = new Path(s"$prefix-${System.currentTimeMillis()}-${iteration.getAndIncrement}") 34 | if (fileSystem.exists(path)) { 35 | fileSystem.delete(path, true) 36 | } 37 | path.getName 38 | } 39 | 40 | def serializeHBaseConfiguration(configuration: Configuration): Array[Byte] = { 41 | val bos = new ByteArrayOutputStream 42 | val deflaterOutputStream = new DeflaterOutputStream(bos) 43 | val dos = new DataOutputStream(deflaterOutputStream) 44 | configuration.write(dos) 45 | dos.close() 46 | bos.toByteArray 47 | } 48 | 49 | def deserializeHBaseConfiguration(arr: Array[Byte]) = { 50 | val conf = HBaseConfiguration.create 51 | conf.readFields(new DataInputStream(new InflaterInputStream(new ByteArrayInputStream(arr)))) 52 | conf 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/test/java/org/apache/spark/sql/hbase/api/java/JavaAPISuite.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
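// Illustrative sketch, not part of the original sources: the intended use of
// Util.serializeHBaseConfiguration / deserializeHBaseConfiguration defined in Util.scala above.
// A Configuration is written in Writable form, Deflater-compressed into a byte array so it can
// be shipped to executors, and restored there.
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.spark.sql.hbase.util.Util

object UtilSketch extends App {
  val conf = HBaseConfiguration.create()
  val bytes = Util.serializeHBaseConfiguration(conf)
  val restored = Util.deserializeHBaseConfiguration(bytes)
  // spot-check that a property survives the round trip
  assert(restored.get("hbase.zookeeper.quorum") == conf.get("hbase.zookeeper.quorum"))
}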
16 | */ 17 | 18 | package org.apache.spark.sql.hbase.api.java; 19 | 20 | import java.io.Serializable; 21 | 22 | import org.apache.hadoop.hbase.HBaseTestingUtility; 23 | import org.apache.hadoop.hbase.MiniHBaseCluster; 24 | import org.apache.hadoop.hbase.client.HBaseAdmin; 25 | import org.apache.spark.SparkConf; 26 | import org.apache.spark.sql.SQLContext; 27 | import org.apache.spark.sql.hbase.*; 28 | import org.junit.After; 29 | import org.junit.Before; 30 | import org.junit.Test; 31 | 32 | import org.apache.spark.api.java.JavaSparkContext; 33 | import org.apache.spark.sql.Row; 34 | 35 | public class JavaAPISuite extends HBaseIntegrationTestBase implements Serializable { 36 | private transient JavaSparkContext sc; 37 | private transient SQLContext hsc; 38 | private transient MiniHBaseCluster cluster; 39 | private transient HBaseAdmin hbaseAdmin; 40 | 41 | private final String hb_staging_table = "HbStagingTable"; 42 | private final String staging_table = "StagingTable"; 43 | private final String create_sql = "CREATE TABLE " + staging_table + "(strcol STRING, bytecol String, shortcol String, intcol String, " + 44 | "longcol string, floatcol string, doublecol string, PRIMARY KEY(doublecol, strcol, intcol))" + 45 | " MAPPED BY (" + hb_staging_table + ", COLS=[bytecol=cf1.hbytecol, " + 46 | "shortcol=cf1.hshortcol, longcol=cf2.hlongcol, floatcol=cf2.hfloatcol])"; 47 | private final String insert_sql = "INSERT INTO " + staging_table + " VALUES (\"strcol\" , \"bytecol\" , \"shortcol\" , \"intcol\" ," + 48 | " \"longcol\" , \"floatcol\" , \"doublecol\")"; 49 | private final String retrieve_sql = "SELECT * FROM " + staging_table; 50 | 51 | @Before 52 | public void setUp() { 53 | System.setProperty("spark.hadoop.hbase.zookeeper.quorum", "localhost"); 54 | 55 | sc = new JavaSparkContext("local[2]", "JavaAPISuite", new SparkConf(true)); 56 | hsc = new HBaseSQLContext(sc); 57 | 58 | HBaseTestingUtility testUtil = new HBaseTestingUtility(hsc.sparkContext(). 
59 | hadoopConfiguration()); 60 | 61 | int nRegionServers = 1; 62 | int nDataNodes = 1; 63 | int nMasters = 1; 64 | 65 | try { 66 | cluster = testUtil.startMiniCluster(nMasters, nRegionServers, nDataNodes); 67 | hbaseAdmin = new HBaseAdmin(hsc.sparkContext().hadoopConfiguration()); 68 | } catch (Exception e) { 69 | e.printStackTrace(); 70 | } 71 | } 72 | 73 | @Test 74 | public void testCreateInsertRetrieveTable() { 75 | hsc.sql(create_sql).collect(); 76 | hsc.sql(insert_sql).collect(); 77 | Row[] row = hsc.sql(retrieve_sql).collect(); 78 | 79 | assert (row[0].toString().equals("[strcol,bytecol,shortcol,intcol,longcol,floatcol,doublecol]")); 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/test/resources/joinTable1.txt: -------------------------------------------------------------------------------- 1 | RowA1,a,12345,23456789,3456789012345,45657.89, 5678912.345678 2 | RowA2,a,12346,23456790,3456789012346,45657.90, 5678912.345679 3 | Row2,b,12342,23456782,3456789012342,45657.82, 5678912.345682 4 | Row3,c,12343,23456783,3456789012343,45657.83, 5678912.345683 5 | Row4,d,12344,23456784,3456789012344,45657.84, 5678912.345684 6 | Row5,e,12345,23456785,3456789012345,45657.85, 5678912.345685 7 | Row6,f,12346,23456786,3456789012346,45657.86, 5678912.345686 8 | Row7,g,12347,23456787,3456789012347,45657.87, 5678912.345687 9 | Row8,h,12348,23456788,3456789012348,45657.88, 5678912.345688 10 | Row9,i,12349,23456789,3456789012349,45657.89, 5678912.345689 11 | RowA10a,j,12340,23456780,3456789012340,45657.80, 5678912.345690 12 | RowA10b,j,12341,23456781,3456789012341,45657.81, 5678912.345691 13 | RowA10c,j,12342,23456782,3456789012342,45657.82, 5678912.345692 14 | -------------------------------------------------------------------------------- /src/test/resources/joinTable2.txt: -------------------------------------------------------------------------------- 1 | RowB1,a,12345,23456789,3456789012345,45657.89, 5678912.345678 2 | Row2,b1,12342,23456782,3456789012342,45657.82, 5678912.345682 3 | Row2,b2,12342,23456782,3456789012342,45657.82, 5678912.345683 4 | Row2,b3,12342,23456782,3456789012342,45657.82, 5678912.345684 5 | Row2,b4,12342,23456782,3456789012342,45657.82, 5678912.345685 6 | Row3,c,12343,23456783,3456789012343,45657.83, 5678912.345683 7 | Row4,d,12344,23456784,3456789012344,45657.84, 5678912.345684 8 | Row5,e,12345,23456785,3456789012345,45657.85, 5678912.345685 9 | Row6,f,12346,23456786,3456789012346,45657.86, 5678912.345686 10 | Row7,g,12347,23456787,3456789012347,45657.87, 5678912.345687 11 | Row8,h,12348,23456788,3456789012348,45657.88, 5678912.345688 12 | Row9,i,12349,23456789,3456789012349,45657.89, 5678912.345689 13 | RowB10a,j,12340,23456780,3456789012340,45657.80, 5678912.345690 14 | RowB10b,k,12341,23456781,3456789012341,45657.81, 5678912.345691 -------------------------------------------------------------------------------- /src/test/resources/joinTable3.txt: -------------------------------------------------------------------------------- 1 | RowC1,a,12345,23456789,3456789012345,45657.89, 5678912.345678 2 | RowC2,a,12346,23456790,3456789012346,45657.90, 5678912.345679 3 | Row2,b,12342,23456782,3456789012342,45657.82, 5678912.345682 4 | Row3,c,12343,23456783,3456789012343,45657.83, 5678912.345683 5 | Row4,d,12344,23456784,3456789012344,45657.84, 5678912.345684 6 | Row5,e,12345,23456785,3456789012345,45657.85, 5678912.345685 7 | Row6,f,12346,23456786,3456789012346,45657.86, 5678912.345686 8 | 
Row7,g,12347,23456787,3456789012347,45657.87, 5678912.345687 9 | Row8,h,12348,23456788,3456789012348,45657.88, 5678912.345688 10 | Row9,i,12349,23456789,3456789012349,45657.89, 5678912.345689 11 | RowC10a,j,12340,23456780,3456789012340,45657.80, 5678912.345690 12 | RowC10b,j,12341,23456781,3456789012341,45657.81, 5678912.345691 13 | RowC10c,j,12342,23456782,3456789012342,45657.82, 5678912.345692 14 | -------------------------------------------------------------------------------- /src/test/resources/joinTable4.txt: -------------------------------------------------------------------------------- 1 | RowD1,a,12345,23456789,3456789012345,45657.89, 5678912.345678 2 | RowD2,a,12346,23456790,3456789012346,45657.90, 5678912.345679 3 | Row2,b,12342,23456782,3456789012342,45657.82, 5678912.345682 4 | Row3,c,12343,23456783,3456789012343,45657.83, 5678912.345683 5 | Row4,d,12344,23456784,3456789012344,45657.84, 5678912.345684 6 | Row5,e,12345,23456785,3456789012345,45657.85, 5678912.345685 7 | Row6,f,12346,23456786,3456789012346,45657.86, 5678912.345686 8 | Row7,g,12347,23456787,3456789012347,45657.87, 5678912.345687 9 | Row8,h,12348,23456788,3456789012348,45657.88, 5678912.345688 10 | Row9,i,12349,23456789,3456789012349,45657.89, 5678912.345689 11 | RowD10a,j,12340,23456780,3456789012340,45657.80, 5678912.345690 12 | RowD10b,j,12341,23456781,3456789012341,45657.81, 5678912.345691 13 | RowD10c,j,12342,23456782,3456789012342,45657.82, 5678912.345692 14 | -------------------------------------------------------------------------------- /src/test/resources/loadData.txt: -------------------------------------------------------------------------------- 1 | row5,5,10 2 | row4,4,8 3 | row5,5,10 4 | row6,6,12 -------------------------------------------------------------------------------- /src/test/resources/loadNullableData.txt: -------------------------------------------------------------------------------- 1 | row1,,8,101 2 | row2,2,,102 3 | row3,3,10, 4 | row4,,, -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | # Set everything to be logged to the file core/target/unit-tests.log 19 | log4j.rootLogger=WARN,CA,FA 20 | 21 | #Console Appender 22 | log4j.appender.CA=org.apache.log4j.ConsoleAppender 23 | log4j.appender.CA.layout=org.apache.log4j.PatternLayout 24 | log4j.appender.CA.layout.ConversionPattern=%d{HH:mm:ss.SSS} %p %c: %m%n 25 | log4j.appender.CA.Threshold = INFO 26 | 27 | 28 | #File Appender 29 | log4j.appender.FA=org.apache.log4j.FileAppender 30 | log4j.appender.FA.append=false 31 | log4j.appender.FA.file=target/unit-tests.log 32 | log4j.appender.FA.layout=org.apache.log4j.PatternLayout 33 | log4j.appender.FA.layout.ConversionPattern=%d{HH:mm:ss.SSS} %p %c{1}: %m%n 34 | log4j.appender.FA.Threshold = INFO 35 | 36 | log4j.logger.org.mortbay=WARN 37 | 38 | log4j.logger.BlockStateChange=WARN 39 | log4j.logger.org.eclipse.jetty=WARN 40 | log4j.logger.org.apache.hadoop.hbase.ZNodeClearer=ERROR 41 | log4j.logger.org.apache.hadoop.hbase=WARN 42 | log4j.logger.org.apache.hadoop=WARN 43 | log4j.logger.org.apache.zookeeper=WARN 44 | 45 | log4j.logger.org.apache.spark.sql.hbase=DEBUG 46 | log4j.logger.org.apache.spark=WARN 47 | log4j.logger.org.scalatest=WARN 48 | -------------------------------------------------------------------------------- /src/test/resources/onecoljoin1.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 2 -------------------------------------------------------------------------------- /src/test/resources/onecoljoin2.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 2 -------------------------------------------------------------------------------- /src/test/resources/splitLoadData.txt: -------------------------------------------------------------------------------- 1 | 1,6,val6 2 | 2,12,val12 3 | 3,18,val18 4 | 4,24,val24 5 | 5,30,val30 6 | 6,36,val36 7 | 7,42,val42 8 | 8,48,val48 9 | 9,54,val54 10 | 10,60,val60 11 | 11,66,val66 12 | 12,72,val72 13 | 13,78,val78 14 | 14,84,val84 15 | 15,90,val90 16 | 16,96,val96 -------------------------------------------------------------------------------- /src/test/resources/splitLoadData1.txt: -------------------------------------------------------------------------------- 1 | 1,0a,1024,v1 2 | 1024,0b,0,v2 3 | 2048,cc,1024,v3 4 | 4096,0a,0,v4 5 | 4096,0b,1024,v5 6 | 4096,cc,0,v6 7 | 4096,cc,1024,v7 -------------------------------------------------------------------------------- /src/test/resources/testTable.txt: -------------------------------------------------------------------------------- 1 | Row2,b,12342,23456782,3456789012342,45657.82, 5678912.345682 2 | Row4,d,12344,23456784,3456789012344,45657.84, 5678912.345684 3 | Row5,e,12345,23456785,3456789012345,45657.85, 5678912.345685 4 | Row7,g,12347,23456787,3456789012347,45657.87, 5678912.345687 5 | Row9,i,12349,23456789,3456789012349,45657.89, 5678912.345689 6 | Row0,j,12340,23456780,3456789012340,45657.80, 5678912.345690 7 | Row6,f,12346,23456786,3456789012346,45657.86, 5678912.345686 8 | Row3,c,12343,23456783,3456789012343,45657.83, 5678912.345683 9 | Row1,a,12345,23456789,3456789012345,45657.89, 5678912.345678 10 | Row8,h,12348,23456788,3456789012348,45657.88, 5678912.345688 11 | Row9,i,12349,23456789,3456789012349,45657.89, 5678912.345689 12 | 13 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/hbase/AggregateQueriesSuite.scala: -------------------------------------------------------------------------------- 1 | 
/* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | class AggregateQueriesSuite extends HBaseTestData { 21 | var testnm = "Group by with cols in select list and with order by" 22 | test("Group by with cols in select list and with order by") { 23 | val query = 24 | s"""SELECT count(1) as cnt, intcol, floatcol, strcol, max(bytecol) bytecol, max(shortcol) shortcol, 25 | max(floatcol) floatcolmax, max(doublecol) doublecol, max(longcol) from $DefaultTableName 26 | WHERE strcol LIKE '%Row%' AND shortcol < 12345 AND doublecol > 5678912.345681 27 | AND doublecol < 5678912.345684 28 | GROUP BY intcol, floatcol, strcol ORDER BY strcol DESC""" 29 | 30 | testGroupBy(testnm, query) 31 | } 32 | 33 | testnm = "Group by with cols in select list and with having and order by" 34 | test("Group by with cols in select list and with having and order by") { 35 | val query = s"""SELECT count(1) as cnt, intcol, floatcol, strcol, max(bytecol) bytecolmax, 36 | max(shortcol) shortcolmax, max(floatcol) floatcolmax, max(doublecol) doublecolmax, 37 | max(longcol) longcolmax 38 | FROM $DefaultTableName 39 | WHERE strcol like '%Row%' AND shortcol < 12345 AND doublecol > 5678912.345681 40 | AND doublecol < 5678912.345685 41 | GROUP BY intcol, floatcol, strcol 42 | HAVING max(doublecol) < 5678912.345684 43 | ORDER BY strcol DESC""".stripMargin 44 | testGroupBy(testnm, query) 45 | } 46 | 47 | def testGroupBy(testName: String, query: String) = { 48 | val result1 = runSql(query) 49 | assert(result1.size == 2, s"$testName failed on size") 50 | val exparr = Array( 51 | Array(1, 23456783, 45657.83F, "Row3", 'c', 12343, 45657.83F, 5678912.345683, 3456789012343L), 52 | Array(1, 23456782, 45657.82F, "Row2", 'b', 12342, 45657.82F, 5678912.345682, 3456789012342L)) 53 | 54 | val res = { 55 | for (rx <- 0 until exparr.size) 56 | yield compareWithTol(result1(rx).toSeq, exparr(rx), s"Row$rx failed") 57 | }.foldLeft(true) { case (res1, newres) => res1 && newres} 58 | assert(res, "One or more rows did not match expected") 59 | 60 | logInfo(s"$query came back with ${result1.size} results") 61 | logInfo(result1.mkString) 62 | 63 | logInfo(s"Test $testName completed successfully") 64 | } 65 | 66 | testnm = "Another Group by with cols in select list and with having and order by" 67 | test("Another Group by with cols in select list and with having and order by") { 68 | val query1 = 69 | s"""SELECT count(1) as cnt, intcol, floatcol, strcol, max(bytecol) bytecolmax, max(shortcol) shortcolmax, 70 | max(floatcol) floatcolmax, max(doublecol) doublecolmax, max(longcol) longcolmax FROM $DefaultTableName 71 | WHERE strcol LIKE '%Row%' AND shortcol < 12345 AND doublecol > 5678912.345681 72 | AND doublecol < 
5678912.345685 73 | GROUP BY intcol, floatcol, strcol HAVING max(doublecol) < 5678912.345684 ORDER BY strcol DESC""" 74 | .stripMargin 75 | 76 | val result1 = runSql(query1) 77 | assert(result1.size == 2, s"$testnm failed on size") 78 | val exparr = Array( 79 | Array(1, 23456783, 45657.83F, "Row3", 'c', 12343, 45657.83F, 5678912.345683, 3456789012343L), 80 | Array(1, 23456782, 45657.82F, "Row2", 'b', 12342, 45657.82F, 5678912.345682, 3456789012342L)) 81 | 82 | val res = { 83 | for (rx <- 0 until exparr.size) 84 | yield compareWithTol(result1(rx).toSeq, exparr(rx), s"Row$rx failed") 85 | }.foldLeft(true) { case (res1, newres) => res1 && newres} 86 | assert(res, "One or more rows did not match expected") 87 | 88 | logInfo(s"$query1 came back with ${result1.size} results") 89 | logInfo(result1.mkString) 90 | 91 | logInfo(s"Test $testnm completed successfully") 92 | } 93 | } 94 | 95 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/hbase/BasicQueriesSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | class BasicQueriesSuite extends HBaseTestData { 21 | var testnm = "StarOperator * with limit" 22 | test("StarOperator * with limit") { 23 | val query1 = 24 | s"""SELECT * FROM $DefaultTableName LIMIT 3""" 25 | .stripMargin 26 | 27 | val result1 = runSql(query1) 28 | assert(result1.size == 3, s"$testnm failed on size") 29 | val exparr = Array(Array("Row1", 'a', 12345, 23456789, 3456789012345L, 45657.89F, 5678912.345678), 30 | Array("Row2", 'b', 12342, 23456782, 3456789012342L, 45657.82F, 5678912.345682), 31 | Array("Row3", 'c', 12343, 23456783, 3456789012343L, 45657.83F, 5678912.345683)) 32 | 33 | var res = { 34 | for (rx <- 0 until 3) 35 | yield compareWithTol(result1(rx).toSeq, exparr(rx), s"Row$rx failed") 36 | }.foldLeft(true) { case (res1, newres) => res1 && newres} 37 | assert(res, "One or more rows did not match expected") 38 | 39 | logInfo(s"$query1 came back with ${result1.size} results") 40 | logInfo(result1.mkString) 41 | 42 | val sql2 = 43 | s"""SELECT * FROM $DefaultTableName LIMIT 2""" 44 | .stripMargin 45 | 46 | val results = runSql(sql2) 47 | logInfo(s"$sql2 came back with ${results.size} results") 48 | assert(results.size == 2, s"$testnm failed assertion on size") 49 | res = { 50 | for (rx <- 0 until 2) 51 | yield compareWithTol(result1(rx).toSeq, exparr(rx), s"Row$rx failed") 52 | }.foldLeft(true) { case (res1, newres) => res1 && newres} 53 | logInfo(results.mkString) 54 | assert(res, "One or more rows did not match expected") 55 | 56 | logInfo(s"Test $testnm completed successfully") 57 | } 58 | 59 | testnm = "Select all cols with filter" 60 | test("Select all cols with filter") { 61 | val query1 = 62 | s"""SELECT * FROM $DefaultTableName WHERE shortcol < 12345 LIMIT 2""" 63 | .stripMargin 64 | 65 | val result1 = runSql(query1) 66 | logInfo(s"$query1 came back with ${result1.size} results") 67 | assert(result1.size == 2, s"$testnm failed on size") 68 | val exparr = Array( 69 | Array("Row2", 'b', 12342, 23456782, 3456789012342L, 45657.82F, 5678912.345682), 70 | Array("Row3", 'c', 12343, 23456783, 3456789012343L, 45657.83F, 5678912.345683)) 71 | 72 | val res = { 73 | for (rx <- 0 until 2) 74 | yield compareWithTol(result1(rx).toSeq, exparr(rx), s"Row$rx failed") 75 | }.foldLeft(true) { case (res1, newres) => res1 && newres} 76 | logInfo(result1.mkString) 77 | assert(res, "One or more rows did not match expected") 78 | 79 | logInfo(s"Test $testnm completed successfully") 80 | } 81 | 82 | testnm = "Select all cols with order by" 83 | test("Select all cols with order by") { 84 | val query1 = 85 | s"""SELECT * FROM $DefaultTableName WHERE shortcol < 12344 ORDER BY strcol DESC LIMIT 2""" 86 | .stripMargin 87 | 88 | val result1 = runSql(query1) 89 | assert(result1.size == 2, s"$testnm failed on size") 90 | val exparr = Array( 91 | Array("Row3", 'c', 12343, 23456783, 3456789012343L, 45657.83F, 5678912.345683), 92 | Array("Row2", 'b', 12342, 23456782, 3456789012342L, 45657.82F, 5678912.345682)) 93 | 94 | val res = { 95 | for (rx <- 0 until 2) 96 | yield compareWithTol(result1(rx).toSeq, exparr(rx), s"Row$rx failed") 97 | }.foldLeft(true) { case (res1, newres) => res1 && newres} 98 | assert(res, "One or more rows did not match expected") 99 | 100 | logInfo(s"Test $testnm completed successfully") 101 | } 102 | 103 | testnm = "Select same column twice" 104 | test("Select same column twice") { 105 | val query1 = 106 | s"""SELECT doublecol AS double1, doublecol AS doublecol 107 | | FROM $DefaultTableName 108 | | WHERE 
doublecol > 5678912.345681 AND doublecol < 5678912.345683""" 109 | .stripMargin 110 | 111 | val result1 = runSql(query1) 112 | logInfo(s"$query1 came back with ${result1.size} results") 113 | assert(result1.size == 1, s"$testnm failed on size") 114 | val exparr = Array( 115 | Array(5678912.345682, 5678912.345682)) 116 | 117 | assert(result1.size == 1, s"$testnm failed assertion on size") 118 | val res = { 119 | for (rx <- 0 until 1) 120 | yield compareWithTol(result1(rx).toSeq, exparr(rx), s"Row$rx failed") 121 | }.foldLeft(true) { case (res1, newres) => res1 && newres} 122 | logInfo(result1.mkString) 123 | assert(res, "One or more rows did not match expected") 124 | 125 | logInfo(s"Test $testnm completed successfully") 126 | } 127 | 128 | testnm = "Select specific cols with filter" 129 | test("Select specific cols with filter") { 130 | val query1 = 131 | s"""SELECT doublecol AS double1, -1 * doublecol AS minusdouble, 132 | | substr(strcol, 2) as substrcol, doublecol, strcol, 133 | | bytecol, shortcol, intcol, longcol, floatcol FROM $DefaultTableName WHERE strcol LIKE 134 | | '%Row%' AND shortcol < 12345 135 | | AND doublecol > 5678912.345681 AND doublecol < 5678912.345683 LIMIT 2""" 136 | .stripMargin 137 | 138 | val result1 = runSql(query1) 139 | logInfo(s"$query1 came back with ${result1.size} results") 140 | assert(result1.size == 1, s"$testnm failed on size") 141 | val exparr = Array( 142 | Array(5678912.345682, -5678912.345682, "ow2", 5678912.345682, 143 | "Row2", 'b', 12342, 23456782, 3456789012342L, 45657.82F)) 144 | 145 | assert(result1.size == 1, s"$testnm failed assertion on size") 146 | val res = { 147 | for (rx <- 0 until 1) 148 | yield compareWithTol(result1(rx).toSeq, exparr(rx), s"Row$rx failed") 149 | }.foldLeft(true) { case (res1, newres) => res1 && newres} 150 | logInfo(result1.mkString) 151 | assert(res, "One or more rows did not match expected") 152 | 153 | logInfo(s"Test $testnm completed successfully") 154 | } 155 | 156 | testnm = "Mixed And/or predicates" 157 | test("Mixed And/or predicates") { 158 | val query1 = s"""SELECT doublecol AS double1, -1 * doublecol AS minusdouble, 159 | substr(strcol, 2) AS substrcol, doublecol, strcol, 160 | bytecol, shortcol, intcol, longcol, floatcol FROM $DefaultTableName 161 | WHERE strcol LIKE '%Row%' 162 | AND shortcol < 12345 163 | AND doublecol > 5678912.345681 AND doublecol < 5678912.345683 164 | OR (doublecol = 5678912.345683 AND strcol IS NOT NULL) 165 | OR (doublecol = 5678912.345683 AND strcol IS NOT NULL or intcol > 12345 AND intcol < 0) 166 | OR (doublecol <> 5678912.345683 AND (strcol IS NULL or intcol > 12345 AND intcol < 0)) 167 | AND floatcol IS NOT NULL 168 | AND (intcol IS NOT NULL and intcol > 0) 169 | AND (intcol < 0 OR intcol IS NOT NULL)""".stripMargin 170 | 171 | val result1 = runSql(query1) 172 | logInfo(s"$query1 came back with ${result1.size} results") 173 | assert(result1.size == 2, s"$testnm failed on size") 174 | val exparr = Array( 175 | Array(5678912.345682, -5678912.345682, "ow2", 5678912.345682, 176 | "Row2", 'b', 12342, 23456782, 3456789012342L, 45657.82F), 177 | Array(5678912.345683, -5678912.345683, "ow3", 5678912.345683, 178 | "Row3", -29, 12343, 23456783, 3456789012343L, 45657.83)) 179 | 180 | val res = { 181 | for (rx <- 0 until 1) 182 | yield compareWithTol(result1(rx).toSeq, exparr(rx), s"Row$rx failed") 183 | }.foldLeft(true) { case (res1, newres) => res1 && newres} 184 | logInfo(result1.mkString) 185 | assert(res, "One or more rows did not match expected") 186 | 187 | logInfo(s"Test $testnm 
completed successfully") 188 | } 189 | 190 | testnm = "In predicates" 191 | test("In predicates") { 192 | val query1 = s"""SELECT doublecol AS double1, -1 * doublecol AS minusdouble, 193 | substr(strcol, 2) AS substrcol, doublecol, strcol, 194 | bytecol, shortcol, intcol, longcol, floatcol FROM $DefaultTableName 195 | WHERE doublecol IN (doublecol + 5678912.345682 - doublecol, doublecol + 5678912.345683 - doublecol)""".stripMargin 196 | 197 | val result1 = runSql(query1) 198 | logInfo(s"$query1 came back with ${result1.size} results") 199 | assert(result1.size == 2, s"$testnm failed on size") 200 | val exparr = Array( 201 | Array(5678912.345682, -5678912.345682, "ow2", 5678912.345682, 202 | "Row2", 'b', 12342, 23456782, 3456789012342L, 45657.82F), 203 | Array(5678912.345683, -5678912.345683, "ow3", 5678912.345683, 204 | "Row3", -29, 12343, 23456783, 3456789012343L, 45657.83)) 205 | 206 | val res = { 207 | for (rx <- 0 until 1) 208 | yield compareWithTol(result1(rx).toSeq, exparr(rx), s"Row$rx failed") 209 | }.foldLeft(true) { case (res1, newres) => res1 && newres} 210 | logInfo(result1.mkString) 211 | assert(res, "One or more rows did not match expected") 212 | 213 | logInfo(s"Test $testnm completed successfully") 214 | } 215 | 216 | testnm = "InSet predicates" 217 | test("InSet predicates") { 218 | val query1 = s"""SELECT doublecol AS double1, -1 * doublecol AS minusdouble, 219 | substr(strcol, 2) AS substrcol, doublecol, strcol, 220 | bytecol, shortcol, intcol, longcol, floatcol FROM $DefaultTableName 221 | WHERE doublecol IN (5678912.345682, 5678912.345683)""".stripMargin 222 | 223 | val result1 = runSql(query1) 224 | logInfo(s"$query1 came back with ${result1.size} results") 225 | assert(result1.size == 2, s"$testnm failed on size") 226 | val exparr = Array( 227 | Array(5678912.345682, -5678912.345682, "ow2", 5678912.345682, 228 | "Row2", 'b', 12342, 23456782, 3456789012342L, 45657.82F), 229 | Array(5678912.345683, -5678912.345683, "ow3", 5678912.345683, 230 | "Row3", -29, 12343, 23456783, 3456789012343L, 45657.83)) 231 | 232 | val res = { 233 | for (rx <- 0 until 1) 234 | yield compareWithTol(result1(rx).toSeq, exparr(rx), s"Row$rx failed") 235 | }.foldLeft(true) { case (res1, newres) => res1 && newres} 236 | logInfo(result1.mkString) 237 | assert(res, "One or more rows did not match expected") 238 | 239 | logInfo(s"Test $testnm completed successfully") 240 | } 241 | } 242 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/hbase/BytesUtilsSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | import org.apache.spark.Logging 21 | import org.apache.hadoop.hbase.util.Bytes 22 | import org.apache.spark.sql.types._ 23 | import org.apache.spark.sql.hbase.types.HBaseBytesType 24 | import org.apache.spark.sql.hbase.util.BytesUtils 25 | import org.scalatest.{BeforeAndAfterAll, FunSuite} 26 | 27 | class BytesUtilsSuite extends FunSuite with BeforeAndAfterAll with Logging { 28 | test("Bytes Ordering Test") { 29 | val s = Seq(-257, -256, -255, -129, -128, -127, -64, -16, -4, -1, 30 | 0, 1, 4, 16, 64, 127, 128, 129, 255, 256, 257) 31 | val result = s.map(i => (i, BytesUtils.create(IntegerType).toBytes(i))) 32 | .sortWith((f, s) => 33 | HBaseBytesType.ordering.gt( 34 | f._2.asInstanceOf[HBaseBytesType.InternalType], s._2.asInstanceOf[HBaseBytesType.InternalType])) 35 | assert(result.map(a => a._1).toSeq == s.sorted.reverse) 36 | } 37 | 38 | def compare(a: Array[Byte], b: Array[Byte]): Int = { 39 | val length = Math.min(a.length, b.length) 40 | var result: Int = 0 41 | for (i <- 0 to length - 1) { 42 | val diff: Int = (a(i) & 0xff).asInstanceOf[Byte] - (b(i) & 0xff).asInstanceOf[Byte] 43 | if (diff != 0) { 44 | result = diff 45 | } 46 | } 47 | result 48 | } 49 | 50 | test("Bytes Utility Test") { 51 | assert(BytesUtils.toBoolean(BytesUtils.create(BooleanType) 52 | .toBytes(input = true), 0) === true) 53 | assert(BytesUtils.toBoolean(BytesUtils.create(BooleanType) 54 | .toBytes(input = false), 0) === false) 55 | 56 | assert(BytesUtils.toDouble(BytesUtils.create(DoubleType).toBytes(12.34d), 0) 57 | === 12.34d) 58 | assert(BytesUtils.toDouble(BytesUtils.create(DoubleType).toBytes(-12.34d), 0) 59 | === -12.34d) 60 | 61 | assert(BytesUtils.toFloat(BytesUtils.create(FloatType).toBytes(12.34f), 0) 62 | === 12.34f) 63 | assert(BytesUtils.toFloat(BytesUtils.create(FloatType).toBytes(-12.34f), 0) 64 | === -12.34f) 65 | 66 | assert(BytesUtils.toInt(BytesUtils.create(IntegerType).toBytes(12), 0) 67 | === 12) 68 | assert(BytesUtils.toInt(BytesUtils.create(IntegerType).toBytes(-12), 0) 69 | === -12) 70 | 71 | assert(BytesUtils.toLong(BytesUtils.create(LongType).toBytes(1234l), 0) 72 | === 1234l) 73 | assert(BytesUtils.toLong(BytesUtils.create(LongType).toBytes(-1234l), 0) 74 | === -1234l) 75 | 76 | assert(BytesUtils.toShort(BytesUtils.create(ShortType) 77 | .toBytes(12.asInstanceOf[Short]), 0) === 12) 78 | assert(BytesUtils.toShort(BytesUtils.create(ShortType) 79 | .toBytes(-12.asInstanceOf[Short]), 0) === -12) 80 | 81 | assert(BytesUtils.toString(BytesUtils.create(StringType).toBytes("abc"), 0, 3) 82 | === "abc") 83 | assert(BytesUtils.toString(BytesUtils.create(StringType).toBytes(""), 0, 0) === "") 84 | 85 | assert(BytesUtils.toByte(BytesUtils.create(ByteType) 86 | .toBytes(5.asInstanceOf[Byte]), 0) === 5) 87 | assert(BytesUtils.toByte(BytesUtils.create(ByteType) 88 | .toBytes(-5.asInstanceOf[Byte]), 0) === -5) 89 | 90 | assert(compare(BytesUtils.create(IntegerType).toBytes(128), 91 | BytesUtils.create(IntegerType).toBytes(-128)) > 0) 92 | } 93 | 94 | test("byte array plus one") { 95 | var byteArray = Array[Byte](0x01.toByte, 127.toByte) 96 | assert(Bytes.compareTo(BytesUtils.addOne(byteArray), Array[Byte](0x01.toByte, 0x80.toByte)) == 0) 97 | 98 | byteArray = Array[Byte](0xff.toByte, 0xff.toByte) 99 | assert(BytesUtils.addOne(byteArray) == null) 100 | 101 | byteArray = Array[Byte](0x02.toByte, 0xff.toByte) 102 | assert(Bytes.compareTo(BytesUtils.addOne(byteArray), Array[Byte](0x03.toByte, 0x00.toByte)) == 0) 103 | } 104 | 105 | test("float 
comparison") { 106 | val f1 = BytesUtils.create(FloatType).toBytes(-1.23f) 107 | val f2 = BytesUtils.create(FloatType).toBytes(100f) 108 | assert(Bytes.compareTo(f1, f2) < 0) 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/hbase/CatalogTestSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.spark.sql.hbase 18 | 19 | import org.apache.hadoop.hbase._ 20 | import org.apache.hadoop.hbase.client.HBaseAdmin 21 | import org.apache.spark.sql.catalyst.expressions.GenericRow 22 | import org.apache.spark.sql.catalyst.plans.logical.Subquery 23 | import org.apache.spark.sql.types._ 24 | import org.apache.spark.sql.hbase.util.HBaseKVHelper 25 | import org.apache.spark.sql.sources.LogicalRelation 26 | 27 | class CatalogTestSuite extends HBaseIntegrationTestBase { 28 | val (catalog, configuration) = (TestHbase.catalog, TestHbase.sparkContext.hadoopConfiguration) 29 | 30 | test("Create Table") { 31 | // prepare the test data 32 | val namespace = "testNamespace" 33 | val tableName = "testTable" 34 | val hbaseTableName = "hbaseTable" 35 | val family1 = "family1" 36 | val family2 = "family2" 37 | 38 | if (!catalog.checkHBaseTableExists(hbaseTableName)) { 39 | val admin = new HBaseAdmin(configuration) 40 | val desc = new HTableDescriptor(TableName.valueOf(hbaseTableName)) 41 | desc.addFamily(new HColumnDescriptor(family1)) 42 | desc.addFamily(new HColumnDescriptor(family2)) 43 | admin.createTable(desc) 44 | } 45 | 46 | var allColumns = List[AbstractColumn]() 47 | allColumns = allColumns :+ KeyColumn("column2", IntegerType, 1) 48 | allColumns = allColumns :+ KeyColumn("column1", StringType, 0) 49 | allColumns = allColumns :+ NonKeyColumn("column4", FloatType, family2, "qualifier2") 50 | allColumns = allColumns :+ NonKeyColumn("column3", BooleanType, family1, "qualifier1") 51 | 52 | val splitKeys: Array[Array[Byte]] = Array( 53 | new GenericRow(Array(1024.0, "Upen", 128: Short)), 54 | new GenericRow(Array(1024.0, "Upen", 256: Short)), 55 | new GenericRow(Array(4096.0, "SF", 512: Short)) 56 | ).map(HBaseKVHelper.makeRowKey(_, Seq(DoubleType, StringType, ShortType))) 57 | 58 | catalog.createTable(tableName, namespace, hbaseTableName, allColumns, splitKeys) 59 | 60 | assert(catalog.checkLogicalTableExist(tableName) === true) 61 | } 62 | 63 | test("Get Table") { 64 | // prepare the test data 65 | val hbaseNamespace = "testNamespace" 66 | val tableName = "testTable" 67 | val hbaseTableName = "hbaseTable" 68 | 69 | val oresult = catalog.getTable(tableName) 70 | assert(oresult.isDefined) 71 | val result = oresult.get 72 | 
assert(result.tableName === tableName) 73 | assert(result.hbaseNamespace === hbaseNamespace) 74 | assert(result.hbaseTableName === hbaseTableName) 75 | assert(result.keyColumns.size === 2) 76 | assert(result.nonKeyColumns.size === 2) 77 | assert(result.allColumns.size === 4) 78 | 79 | // check the data type 80 | assert(result.keyColumns(0).dataType === StringType) 81 | assert(result.keyColumns(1).dataType === IntegerType) 82 | assert(result.nonKeyColumns(1).dataType === FloatType) 83 | assert(result.nonKeyColumns(0).dataType === BooleanType) 84 | 85 | val relation = catalog.lookupRelation(Seq(tableName)) 86 | val subquery = relation.asInstanceOf[Subquery] 87 | val hbRelation = subquery.child.asInstanceOf[LogicalRelation].relation.asInstanceOf[HBaseRelation] 88 | assert(hbRelation.nonKeyColumns.map(_.family) == List("family1", "family2")) 89 | val keyColumns = Seq(KeyColumn("column1", StringType, 0), KeyColumn("column2", IntegerType, 1)) 90 | assert(hbRelation.keyColumns.equals(keyColumns)) 91 | assert(relation.childrenResolved) 92 | } 93 | 94 | test("Alter Table") { 95 | val tableName = "testTable" 96 | 97 | val family1 = "family1" 98 | val column = NonKeyColumn("column5", BooleanType, family1, "qualifier3") 99 | 100 | catalog.alterTableAddNonKey(tableName, column) 101 | 102 | var result = catalog.getTable(tableName) 103 | var table = result.get 104 | assert(table.allColumns.size === 5) 105 | 106 | catalog.alterTableDropNonKey(tableName, column.sqlName) 107 | result = catalog.getTable(tableName) 108 | table = result.get 109 | assert(table.allColumns.size === 4) 110 | } 111 | 112 | test("Delete Table") { 113 | // prepare the test data 114 | val tableName = "testTable" 115 | 116 | catalog.deleteTable(tableName) 117 | 118 | assert(catalog.checkLogicalTableExist(tableName) === false) 119 | } 120 | 121 | test("Check Logical Table Exist") { 122 | val tableName = "non-exist" 123 | 124 | assert(catalog.checkLogicalTableExist(tableName) === false) 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/hbase/HBaseAdvancedSQLQuerySuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | import org.apache.spark.sql.types._ 21 | import org.apache.spark.sql.{SQLConf, _} 22 | 23 | class HBaseAdvancedSQLQuerySuite extends HBaseSplitTestData { 24 | import org.apache.spark.sql.hbase.TestHbase._ 25 | import org.apache.spark.sql.hbase.TestHbase.implicits._ 26 | 27 | test("aggregation with codegen") { 28 | val originalValue = TestHbase.conf.codegenEnabled 29 | setConf(SQLConf.CODEGEN_ENABLED, "true") 30 | val result = sql("SELECT col1 FROM ta GROUP BY col1").collect() 31 | assert(result.size == 14, s"aggregation with codegen test failed on size") 32 | setConf(SQLConf.CODEGEN_ENABLED, originalValue.toString) 33 | } 34 | 35 | test("dsl simple select 0") { 36 | val tableA = sql("SELECT * FROM ta") 37 | checkAnswer( 38 | tableA.where('col7 === 1).orderBy('col2.asc).select('col4), 39 | Row(1) :: Nil) 40 | checkAnswer( 41 | tableA.where('col2 === 6).orderBy('col2.asc).select('col7), 42 | Row(-31) :: Nil) 43 | } 44 | 45 | test("metadata is propagated correctly") { 46 | val tableA = sql("SELECT col7, col1, col3 FROM ta") 47 | val schema = tableA.schema 48 | val docKey = "doc" 49 | val docValue = "first name" 50 | val metadata = new MetadataBuilder() 51 | .putString(docKey, docValue) 52 | .build() 53 | val schemaWithMeta = new StructType(Array( 54 | schema("col7"), schema("col1").copy(metadata = metadata), schema("col3"))) 55 | val personWithMeta = createDataFrame(tableA.rdd, schemaWithMeta) 56 | def validateMetadata(rdd: DataFrame): Unit = { 57 | assert(rdd.schema("col1").metadata.getString(docKey) == docValue) 58 | } 59 | personWithMeta.registerTempTable("personWithMeta") 60 | validateMetadata(personWithMeta.select($"col1")) 61 | validateMetadata(personWithMeta.select($"col1")) 62 | validateMetadata(personWithMeta.select($"col7", $"col1")) 63 | validateMetadata(sql("SELECT * FROM personWithMeta")) 64 | validateMetadata(sql("SELECT col7, col1 FROM personWithMeta")) 65 | validateMetadata(sql("SELECT * FROM personWithMeta JOIN salary ON col7 = personId")) 66 | validateMetadata(sql("SELECT col1, salary FROM personWithMeta JOIN salary ON col7 = personId")) 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/hbase/HBaseBasicOperationSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
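// Illustrative sketch, not part of the original sources: the operation and insert suites that
// follow all create logical tables with the MAPPED BY clause. This shows that DDL shape only;
// the table and column names here are hypothetical, and TestHbase.sql is invoked the same way
// the suites below invoke it (it requires the mini HBase cluster those suites bring up).
import org.apache.spark.sql.hbase.TestHbase

object MappedByDdlSketch extends App {
  // logical columns plus PRIMARY KEY, mapped onto an HBase table and its
  // columnFamily.qualifier pairs for the non-key columns
  TestHbase.sql(
    """CREATE TABLE demo (rowkey STRING, v INTEGER, PRIMARY KEY(rowkey))
      |MAPPED BY (hdemo, COLS=[v=cf.q])""".stripMargin)
  TestHbase.sql("""INSERT INTO demo VALUES ("r1", 1024)""")
  TestHbase.sql("""SELECT * FROM demo""").collect().foreach(println)
  TestHbase.sql("""DROP TABLE demo""")
}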
16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | /** 21 | * Test insert / query against the table 22 | */ 23 | class HBaseBasicOperationSuite extends HBaseSplitTestData { 24 | import org.apache.spark.sql.hbase.TestHbase._ 25 | 26 | override def afterAll() = { 27 | if (TestHbase.hbaseAdmin.tableExists("ht0")) { 28 | TestHbase.hbaseAdmin.disableTable("ht0") 29 | TestHbase.hbaseAdmin.deleteTable("ht0") 30 | } 31 | if (TestHbase.hbaseAdmin.tableExists("ht1")) { 32 | TestHbase.hbaseAdmin.disableTable("ht1") 33 | TestHbase.hbaseAdmin.deleteTable("ht1") 34 | } 35 | super.afterAll() 36 | } 37 | 38 | test("Insert Into table0") { 39 | sql( """CREATE TABLE tb0 (column2 INTEGER, column1 INTEGER, column4 FLOAT, 40 | column3 SHORT, PRIMARY KEY(column1, column2)) 41 | MAPPED BY (testNamespace.ht0, COLS=[column3=family1.qualifier1, 42 | column4=family2.qualifier2])""" 43 | ) 44 | 45 | assert(sql( """SELECT * FROM tb0""").collect().size == 0) 46 | sql( """INSERT INTO tb0 SELECT col4,col4,col6,col3 FROM ta""") 47 | assert(sql( """SELECT * FROM tb0""").collect().size == 14) 48 | 49 | sql( """DROP TABLE tb0""") 50 | } 51 | 52 | test("Insert and Query Single Row") { 53 | sql( """CREATE TABLE tb1 (column1 INTEGER, column2 STRING, 54 | PRIMARY KEY(column1)) 55 | MAPPED BY (ht1, COLS=[column2=cf.cq])""" 56 | ) 57 | 58 | assert(sql( """SELECT * FROM tb1""").collect().size == 0) 59 | sql( """INSERT INTO tb1 VALUES (1024, "abc")""") 60 | sql( """INSERT INTO tb1 VALUES (1028, "abd")""") 61 | assert(sql( """SELECT * FROM tb1""").collect().size == 2) 62 | assert( 63 | sql( """SELECT * FROM tb1 WHERE (column1 = 1023 AND column2 ="abc")""").collect().size == 0) 64 | assert(sql( 65 | """SELECT * FROM tb1 WHERE (column1 = 1024) 66 | |OR (column1 = 1028 AND column2 ="abd")""".stripMargin).collect().size == 2) 67 | 68 | sql( """DROP TABLE tb1""") 69 | } 70 | 71 | test("Select test 0") { 72 | assert(sql( """SELECT * FROM ta""").count() == 14) 73 | } 74 | 75 | test("Count(*/1) and Non-Key Column Query") { 76 | assert(sql( """SELECT count(*) FROM ta""").collect()(0).get(0) == 14) 77 | assert(sql( """SELECT count(*) FROM ta where col2 < 8""").collect()(0).get(0) == 7) 78 | assert(sql( """SELECT count(*) FROM ta where col4 < 0""").collect()(0).get(0) == 7) 79 | assert(sql( """SELECT count(1) FROM ta where col2 < 8""").collect()(0).get(0) == 7) 80 | assert(sql( """SELECT count(1) FROM ta where col4 < 0""").collect()(0).get(0) == 7) 81 | } 82 | 83 | test("InSet Query") { 84 | assert(sql( """SELECT count(*) FROM ta where col2 IN (1, 2, 3)""").collect()(0).get(0) == 3) 85 | assert(sql( """SELECT count(*) FROM ta where col4 IN (1, 2, 3)""").collect()(0).get(0) == 1) 86 | } 87 | 88 | test("Select test 1 (AND, OR)") { 89 | assert(sql( """SELECT * FROM ta WHERE col7 = 255 OR col7 = 127""").collect().size == 2) 90 | assert(sql( """SELECT * FROM ta WHERE col7 < 0 AND col4 < -255""").collect().size == 4) 91 | } 92 | 93 | test("Select test 2 (WHERE)") { 94 | assert(sql( """SELECT * FROM ta WHERE col7 > 128""").count() == 3) 95 | assert(sql( """SELECT * FROM ta WHERE (col7 - 10 > 128) AND col1 = ' p255 '""").collect().size == 1) 96 | } 97 | 98 | test("Select test 3 (ORDER BY)") { 99 | val result = sql( """SELECT col1, col7 FROM ta ORDER BY col7 DESC""").collect() 100 | val sortedResult = result.sortWith( 101 | (r1, r2) => r1(1).asInstanceOf[Int] > r2(1).asInstanceOf[Int]) 102 | for ((r1, r2) <- result zip sortedResult) { 103 | assert(r1.equals(r2)) 104 | } 105 | } 106 | 107 | test("Select test 4 (join)") { 108 | assert(sql( 
"""SELECT ta.col2 FROM ta join tb on ta.col4=tb.col7""").collect().size == 2) 109 | assert(sql( """SELECT * FROM ta FULL OUTER JOIN tb WHERE tb.col7 = 1""").collect().size == 14) 110 | assert(sql( """SELECT * FROM ta LEFT JOIN tb WHERE tb.col7 = 1""").collect().size == 14) 111 | assert(sql( """SELECT * FROM ta RIGHT JOIN tb WHERE tb.col7 = 1""").collect().size == 14) 112 | } 113 | 114 | test("Alter Add column and Alter Drop column") { 115 | assert(sql( """SELECT * FROM ta""").collect()(0).size == 7) 116 | sql( """ALTER TABLE ta ADD col8 STRING MAPPED BY (col8 = cf1.cf13)""") 117 | assert(sql( """SELECT * FROM ta""").collect()(0).size == 8) 118 | sql( """ALTER TABLE ta DROP col8""") 119 | assert(sql( """SELECT * FROM ta""").collect()(0).size == 7) 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/hbase/HBaseInsertTableSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | import org.apache.spark.sql.Row 21 | 22 | class HBaseInsertTableSuite extends HBaseTestData { 23 | 24 | var testnm = "Insert all rows to the table from other table" 25 | test("Insert all rows to the table from other table") { 26 | val createQuery = s"""CREATE TABLE insertTestTable(strcol STRING, bytecol BYTE, shortcol SHORT, 27 | intcol INTEGER, longcol LONG, floatcol FLOAT, doublecol DOUBLE, 28 | PRIMARY KEY(doublecol, strcol, intcol)) 29 | MAPPED BY (hinsertTestTable, COLS=[bytecol=cf1.hbytecol, 30 | shortcol=cf1.hshortcol, longcol=cf2.hlongcol, floatcol=cf2.hfloatcol])""" 31 | .stripMargin 32 | runSql(createQuery) 33 | 34 | val insertQuery = 35 | s"""INSERT INTO insertTestTable SELECT * FROM $DefaultTableName""" 36 | .stripMargin 37 | runSql(insertQuery) 38 | 39 | val testQuery = "SELECT * FROM insertTestTable" 40 | val testResult = runSql(testQuery) 41 | val targetResult = runSql(s"SELECT * FROM $DefaultTableName") 42 | assert(testResult.size == targetResult.size, s"$testnm failed on size") 43 | 44 | compareResults(testResult, targetResult) 45 | 46 | runSql("DROP TABLE insertTestTable") 47 | } 48 | 49 | testnm = "Insert few rows to the table from other table after applying filter" 50 | test("Insert few rows to the table from other table after applying filter") { 51 | val createQuery = s"""CREATE TABLE insertTestTableFilter(strcol STRING, bytecol BYTE, 52 | shortcol SHORT, intcol INTEGER, longcol LONG, floatcol FLOAT, doublecol DOUBLE, 53 | PRIMARY KEY(doublecol, strcol, intcol)) 54 | MAPPED BY (hinsertTestTableFilter, COLS=[bytecol=cf1.hbytecol, 55 | shortcol=cf1.hshortcol, longcol=cf2.hlongcol, floatcol=cf2.hfloatcol])""" 56 | .stripMargin 57 | runSql(createQuery) 58 | 59 | val insertQuery = 60 | s"""insert into insertTestTableFilter select * from $DefaultTableName 61 | where doublecol > 5678912.345681""" 62 | .stripMargin 63 | runSql(insertQuery) 64 | 65 | val testQuery = "select * from insertTestTableFilter" 66 | val testResult = runSql(testQuery) 67 | val targetResult = runSql(s"select * from $DefaultTableName where doublecol > 5678912.345681") 68 | assert(testResult.size == targetResult.size, s"$testnm failed on size") 69 | 70 | compareResults(testResult, targetResult) 71 | 72 | runSql("Drop Table insertTestTableFilter") 73 | } 74 | 75 | def compareResults(fetchResult: Array[Row], targetResult: Array[Row]) = { 76 | val res = { 77 | for (rx <- 0 until targetResult.size) 78 | yield compareWithTol(fetchResult(rx).toSeq, targetResult(rx).toSeq, s"Row$rx failed") 79 | }.foldLeft(true) { case (res1, newres) => res1 && newres} 80 | assert(res, "One or more rows did not match expected") 81 | } 82 | 83 | testnm = "Insert few columns to the table from other table" 84 | test("Insert few columns to the table from other table") { 85 | val createQuery = s"""CREATE TABLE insertTestTableFewCols(strcol STRING, bytecol BYTE, 86 | shortcol SHORT, intcol INTEGER, PRIMARY KEY(strcol, intcol)) 87 | MAPPED BY (hinsertTestTableFewCols, COLS=[bytecol=cf1.hbytecol, 88 | shortcol=cf1.hshortcol])""" 89 | .stripMargin 90 | runSql(createQuery) 91 | 92 | val insertQuery = 93 | s"""INSERT INTO insertTestTableFewCols SELECT strcol, bytecol, 94 | shortcol, intcol FROM $DefaultTableName ORDER BY strcol""" 95 | .stripMargin 96 | runSql(insertQuery) 97 | 98 | val testQuery = 99 | "SELECT strcol, bytecol, shortcol, intcol FROM insertTestTableFewCols ORDER BY strcol" 100 | val testResult = runSql(testQuery) 101 | val targetResult = 102 | runSql(s"SELECT 
strcol, bytecol, shortcol, intcol FROM $DefaultTableName ORDER BY strcol") 103 | assert(testResult.size == targetResult.size, s"$testnm failed on size") 104 | 105 | compareResults(testResult, targetResult) 106 | 107 | runSql("DROP TABLE insertTestTableFewCols") 108 | } 109 | 110 | testnm = "Insert into values test" 111 | test("Insert into values test") { 112 | val createQuery = s"""CREATE TABLE insertValuesTest(strcol STRING, bytecol BYTE, 113 | shortcol SHORT, intcol INTEGER, PRIMARY KEY(strcol, intcol)) 114 | MAPPED BY (hinsertValuesTest, COLS=[bytecol=cf1.hbytecol, 115 | shortcol=cf1.hshortcol])""" 116 | .stripMargin 117 | runSql(createQuery) 118 | 119 | val insertQuery1 = s"INSERT INTO insertValuesTest VALUES('Row0','a',12340,23456780)" 120 | val insertQuery2 = s"INSERT INTO insertValuesTest VALUES('Row1','b',12345,23456789)" 121 | val insertQuery3 = s"INSERT INTO insertValuesTest VALUES('Row2','c',12342,23456782)" 122 | runSql(insertQuery1) 123 | runSql(insertQuery2) 124 | runSql(insertQuery3) 125 | 126 | val testQuery = "SELECT * FROM insertValuesTest ORDER BY strcol" 127 | val testResult = runSql(testQuery) 128 | assert(testResult.size == 3, s"$testnm failed on size") 129 | 130 | val exparr = Array(Array("Row0", 'a', 12340, 23456780), 131 | Array("Row1", 'b', 12345, 23456789), 132 | Array("Row2", 'c', 12342, 23456782)) 133 | 134 | val res = { 135 | for (rx <- 0 until 3) 136 | yield compareWithTol(testResult(rx).toSeq, exparr(rx), s"Row$rx failed") 137 | }.foldLeft(true) { case (res1, newres) => res1 && newres} 138 | assert(res, "One or more rows did not match expected") 139 | 140 | runSql("DROP TABLE insertValuesTest") 141 | } 142 | 143 | testnm = "Insert nullable values test" 144 | test("Insert nullable values test") { 145 | val createQuery = s"""CREATE TABLE insertNullValuesTest(strcol STRING, bytecol BYTE, 146 | shortcol SHORT, intcol INTEGER, PRIMARY KEY(strcol)) 147 | MAPPED BY (hinsertNullValuesTest, COLS=[bytecol=cf1.hbytecol, 148 | shortcol=cf1.hshortcol, intcol=cf1.hintcol])""" 149 | .stripMargin 150 | runSql(createQuery) 151 | 152 | val insertQuery1 = s"INSERT INTO insertNullValuesTest VALUES('Row0', null, 12340, 23456780)" 153 | val insertQuery2 = s"INSERT INTO insertNullValuesTest VALUES('Row1', 'b', null, 23456789)" 154 | val insertQuery3 = s"INSERT INTO insertNullValuesTest VALUES('Row2', 'c', 12342, null)" 155 | runSql(insertQuery1) 156 | runSql(insertQuery2) 157 | runSql(insertQuery3) 158 | 159 | val selectAllQuery = "SELECT * FROM insertNullValuesTest ORDER BY strcol" 160 | val selectAllResult = runSql(selectAllQuery) 161 | 162 | assert(selectAllResult.size == 3, s"$testnm failed on size") 163 | 164 | var currentResultRow: Int = 0 165 | 166 | // check 1st result row 167 | assert(selectAllResult(currentResultRow).length == 4, s"$testnm failed on row size (# of cols)") 168 | assert(selectAllResult(currentResultRow)(0) === s"Row0", s"$testnm failed on returned Row0, key value") 169 | assert(selectAllResult(currentResultRow)(1) == null, s"$testnm failed on returned Row0, null col1 value") 170 | assert(selectAllResult(currentResultRow)(2) == 12340, s"$testnm failed on returned Row0, col2 value") 171 | assert(selectAllResult(currentResultRow)(3) == 23456780, s"$testnm failed on returned Row0, col3 value") 172 | 173 | currentResultRow += 1 174 | 175 | // check 2nd result row 176 | assert(selectAllResult(currentResultRow)(0) === s"Row1", s"$testnm failed on returned Row1, key value") 177 | // skip comparison of actual and expected bytecol value 178 | 
assert(selectAllResult(currentResultRow)(2) == null, s"$testnm failed on returned Row1, null col2 value") 179 | assert(selectAllResult(currentResultRow)(3) == 23456789, s"$testnm failed on returned Row1, col3 value") 180 | 181 | currentResultRow += 1 182 | 183 | // check 3rd result row 184 | assert(selectAllResult(currentResultRow)(0) === s"Row2", s"$testnm failed on returned Row2, key value") 185 | // skip comparison of actual and expected bytecol value 186 | assert(selectAllResult(currentResultRow)(2) == 12342, s"$testnm failed on returned Row2, col2 value") 187 | assert(selectAllResult(currentResultRow)(3) == null, s"$testnm failed on returned Row2, null col3 value") 188 | 189 | // test 'where col is not null' 190 | 191 | val selectWhereIsNotNullQuery = "SELECT * FROM insertNullValuesTest WHERE intcol IS NOT NULL ORDER BY strcol" 192 | val selectWhereIsNotNullResult = runSql(selectWhereIsNotNullQuery) 193 | assert(selectWhereIsNotNullResult.size == 2, s"$testnm failed on size") 194 | 195 | currentResultRow = 0 196 | // check 1st result row 197 | assert(selectWhereIsNotNullResult(currentResultRow)(0) === s"Row0", s"$testnm failed on returned Row0, key value") 198 | assert(selectWhereIsNotNullResult(currentResultRow)(1) == null, s"$testnm failed on returned Row0, null col1 value") 199 | assert(selectWhereIsNotNullResult(currentResultRow)(2) == 12340, s"$testnm failed on returned Row0, col2 value") 200 | assert(selectWhereIsNotNullResult(currentResultRow)(3) == 23456780, s"$testnm failed on returned Row0, col3 value") 201 | 202 | currentResultRow += 1 203 | // check 2nd result row 204 | assert(selectWhereIsNotNullResult(currentResultRow)(0) === s"Row1", s"$testnm failed on returned Row1, key value") 205 | // skip comparison of actual and expected bytecol value 206 | assert(selectWhereIsNotNullResult(currentResultRow)(2) == null, s"$testnm failed on returned Row1, null col2 value") 207 | assert(selectWhereIsNotNullResult(currentResultRow)(3) == 23456789, s"$testnm failed on returned Row1, col3 value") 208 | 209 | 210 | runSql(" Drop Table insertNullValuesTest") 211 | } 212 | 213 | 214 | } 215 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/hbase/HBaseIntegrationTestBase.scala: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | 19 | package org.apache.spark.sql.hbase 20 | 21 | import java.util.Date 22 | 23 | import org.apache.spark.Logging 24 | import org.apache.spark.sql.catalyst.plans.logical 25 | import org.apache.spark.sql.catalyst.util._ 26 | import org.apache.spark.sql.{DataFrame, Row} 27 | import org.scalatest.{BeforeAndAfterAll, FunSuite, Suite} 28 | 29 | abstract class HBaseIntegrationTestBase 30 | extends FunSuite with BeforeAndAfterAll with Logging { 31 | self: Suite => 32 | 33 | val startTime = (new Date).getTime 34 | 35 | /** 36 | * Runs the plan and makes sure the answer matches the expected result. 37 | * @param rdd the [[DataFrame]] to be executed 38 | * @param expectedAnswer the expected result, can either be an Any, Seq[Product], or Seq[ Seq[Any] ]. 39 | */ 40 | protected def checkAnswer(rdd: DataFrame, expectedAnswer: Seq[Row]): Unit = { 41 | val isSorted = rdd.logicalPlan.collect { case s: logical.Sort => s}.nonEmpty 42 | def prepareAnswer(answer: Seq[Row]): Seq[Row] = { 43 | // Converts data to types that we can do equality comparison using Scala collections. 44 | // For BigDecimal type, the Scala type has a better definition of equality test (similar to 45 | // Java's java.math.BigDecimal.compareTo). 46 | val converted: Seq[Row] = answer.map { s => 47 | Row.fromSeq(s.toSeq.map { 48 | case d: java.math.BigDecimal => BigDecimal(d) 49 | case o => o 50 | }) 51 | } 52 | if (!isSorted) converted.sortBy(_.toString()) else converted 53 | } 54 | val sparkAnswer = try rdd.collect().toSeq catch { 55 | case e: Exception => 56 | fail( 57 | s""" 58 | |Exception thrown while executing query: 59 | |${rdd.queryExecution} 60 | |== Exception == 61 | |$e 62 | |${org.apache.spark.sql.catalyst.util.stackTraceToString(e)} 63 | """.stripMargin) 64 | } 65 | 66 | if (prepareAnswer(expectedAnswer) != prepareAnswer(sparkAnswer)) { 67 | fail( s""" 68 | |Results do not match for query: 69 | |${rdd.logicalPlan} 70 | |== Analyzed Plan == 71 | |${rdd.queryExecution.analyzed} 72 | |== Physical Plan == 73 | |${rdd.queryExecution.executedPlan} 74 | |== Results == 75 | |${ 76 | sideBySide( 77 | s"== Correct Answer - ${expectedAnswer.size} ==" +: 78 | prepareAnswer(expectedAnswer).map(_.toString()), 79 | s"== Spark Answer - ${sparkAnswer.size} ==" +: 80 | prepareAnswer(sparkAnswer).map(_.toString())).mkString("\n") 81 | } 82 | """.stripMargin) 83 | } 84 | } 85 | 86 | protected def checkAnswer(rdd: DataFrame, expectedAnswer: Row): Unit = { 87 | checkAnswer(rdd, Seq(expectedAnswer)) 88 | } 89 | 90 | def runSql(sql: String):Array[Row] = { 91 | logInfo(sql) 92 | TestHbase.sql(sql).collect() 93 | } 94 | 95 | override protected def afterAll(): Unit = { 96 | val msg = s"Test ${getClass.getName} completed at ${(new java.util.Date).toString} duration=${((new java.util.Date).getTime - startTime) / 1000}" 97 | logInfo(msg) 98 | } 99 | 100 | val CompareTol = 1e-6 101 | 102 | def compareWithTol(actarr: Seq[Any], exparr: Seq[Any], emsg: String): Boolean = { 103 | actarr.zip(exparr).forall { case (aa, ee) => 104 | val eq = (aa, ee) match { 105 | case (a: Double, e: Double) => 106 | Math.abs(a - e) <= CompareTol 107 | case (a: Float, e: Float) => 108 | Math.abs(a - e) <= CompareTol 109 | case (a: Byte, e) => true //For now, we assume it is ok 110 | case (a, e) => 111 | if(a == null && e == null) { 112 | logDebug(s"a=null e=null") 113 | } else { 114 | logDebug(s"atype=${a.getClass.getName} etype=${e.getClass.getName}") 115 | } 116 | a == e 117 | case _ => throw new IllegalArgumentException("Expected tuple") 118 | } 119 | if (!eq) { 
120 | logError(s"$emsg: Mismatch- act=$aa exp=$ee") 121 | } 122 | eq 123 | } 124 | } 125 | 126 | def verify(testName: String, sql: String, result1: Seq[Seq[Any]], exparr: Seq[Seq[Any]]) = { 127 | val res = { 128 | for (rx <- 0 until exparr.size) 129 | yield compareWithTol(result1(rx).toSeq, exparr(rx), s"Row$rx failed") 130 | }.foldLeft(true) { case (res1, newres) => res1 && newres} 131 | 132 | logInfo(s"$sql came back with ${result1.size} results") 133 | logInfo(result1.mkString) 134 | assert(res, "One or more rows did not match expected") 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/hbase/HBaseSplitTestData.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | import java.io.{ByteArrayOutputStream, DataOutputStream} 21 | 22 | import org.apache.hadoop.hbase._ 23 | import org.apache.hadoop.hbase.client._ 24 | import org.apache.hadoop.hbase.util.Bytes 25 | import org.apache.spark.sql.catalyst.expressions.{GenericRow, Row} 26 | import org.apache.spark.sql.types._ 27 | import org.apache.spark.sql.hbase.util.{DataTypeUtils, HBaseKVHelper, BytesUtils} 28 | 29 | /** 30 | * HBaseMainTest 31 | * create HbTestTable and metadata table, and insert some data 32 | */ 33 | class HBaseSplitTestData extends HBaseIntegrationTestBase 34 | { 35 | val TableName_a: String = "ta" 36 | val TableName_b: String = "tb" 37 | val HbaseTableName: String = "ht" 38 | val Metadata_Table = "metadata" 39 | var alreadyInserted = false 40 | 41 | override protected def beforeAll() = { 42 | super.beforeAll() 43 | setupData(useMultiplePartitions = true, needInsertData = true) 44 | TestData 45 | } 46 | 47 | override protected def afterAll() = { 48 | TestHbase.sql("DROP TABLE " + TableName_a) 49 | TestHbase.sql("DROP TABLE " + TableName_b) 50 | } 51 | 52 | def createTable(useMultiplePartitions: Boolean) = { 53 | try { 54 | // delete the existing hbase table 55 | if (TestHbase.hbaseAdmin.tableExists(HbaseTableName)) { 56 | TestHbase.hbaseAdmin.disableTable(HbaseTableName) 57 | TestHbase.hbaseAdmin.deleteTable(HbaseTableName) 58 | } 59 | 60 | if (TestHbase.hbaseAdmin.tableExists(Metadata_Table)) { 61 | TestHbase.hbaseAdmin.disableTable(Metadata_Table) 62 | TestHbase.hbaseAdmin.deleteTable(Metadata_Table) 63 | } 64 | 65 | var allColumns = List[AbstractColumn]() 66 | allColumns = allColumns :+ KeyColumn("col1", StringType, 1) 67 | allColumns = allColumns :+ NonKeyColumn("col2", ByteType, "cf1", "cq11") 68 | allColumns = allColumns :+ KeyColumn("col3", ShortType, 2) 69 | allColumns = allColumns :+ NonKeyColumn("col4", 
IntegerType, "cf1", "cq12") 70 | allColumns = allColumns :+ NonKeyColumn("col5", LongType, "cf2", "cq21") 71 | allColumns = allColumns :+ NonKeyColumn("col6", FloatType, "cf2", "cq22") 72 | allColumns = allColumns :+ KeyColumn("col7", IntegerType, 0) 73 | 74 | val splitKeys: Array[Array[Byte]] = if (useMultiplePartitions) { 75 | Array( 76 | new GenericRow(Array(256, " p256 ", 128: Short)), 77 | new GenericRow(Array(32, " p32 ", 256: Short)), 78 | new GenericRow(Array(-32, " n32 ", 128: Short)), 79 | new GenericRow(Array(-256, " n256 ", 256: Short)), 80 | new GenericRow(Array(-128, " n128 ", 128: Short)), 81 | new GenericRow(Array(0, " zero ", 256: Short)), 82 | new GenericRow(Array(128, " p128 ", 512: Short)) 83 | ).map(HBaseKVHelper.makeRowKey(_, Seq(IntegerType, StringType, ShortType))) 84 | } else { 85 | null 86 | } 87 | 88 | TestHbase.catalog.createTable(TableName_a, null, HbaseTableName, allColumns, splitKeys) 89 | 90 | TestHbase.sql( s"""CREATE TABLE $TableName_b(col1 STRING, col2 BYTE, col3 SHORT, col4 INTEGER, 91 | col5 LONG, col6 FLOAT, col7 INTEGER, PRIMARY KEY(col7, col1, col3)) 92 | MAPPED BY ($HbaseTableName, COLS=[col2=cf1.cq11, col4=cf1.cq12, col5=cf2.cq21, 93 | col6=cf2.cq22])""".stripMargin) 94 | 95 | if (!TestHbase.hbaseAdmin.tableExists(HbaseTableName)) { 96 | throw new IllegalArgumentException("where is our table?") 97 | } 98 | } 99 | } 100 | 101 | def checkHBaseTableExists(hbaseTable: String): Boolean = { 102 | val tableName = TableName.valueOf(hbaseTable) 103 | TestHbase.hbaseAdmin.tableExists(tableName) 104 | } 105 | 106 | def insertTestData() = { 107 | if (!checkHBaseTableExists(HbaseTableName)) { 108 | throw new IllegalStateException(s"Unable to find table $HbaseTableName") 109 | } 110 | 111 | val htable = new HTable(TestHbase.sparkContext.hadoopConfiguration, HbaseTableName) 112 | 113 | def putNewTableIntoHBase(keys: Seq[Any], keysType: Seq[DataType], 114 | vals: Seq[Any], valsType: Seq[DataType]): Unit = { 115 | val row = new GenericRow(keys.toArray) 116 | val key = makeRowKey(row, keysType) 117 | val put = new Put(key) 118 | Seq((vals(0), valsType(0), "cf1", "cq11"), 119 | (vals(1), valsType(1), "cf1", "cq12"), 120 | (vals(2), valsType(2), "cf2", "cq21"), 121 | (vals(3), valsType(3), "cf2", "cq22")).foreach { 122 | case (rowValue, rowType, colFamily, colQualifier) => 123 | addRowVals(put, rowValue, rowType, colFamily, colQualifier) 124 | } 125 | htable.put(put) 126 | } 127 | 128 | putNewTableIntoHBase(Seq(-257, " n257 ", 128: Short), 129 | Seq(IntegerType, StringType, ShortType), 130 | Seq[Any](1.toByte, -2048, 12345678901234L, 1234.5678F), 131 | Seq(ByteType, IntegerType, LongType, FloatType)) 132 | 133 | putNewTableIntoHBase(Seq(-255, " n255 ", 128: Short), 134 | Seq(IntegerType, StringType, ShortType), 135 | Seq[Any](2.toByte, -1024, 12345678901234L, 1234.5678F), 136 | Seq(ByteType, IntegerType, LongType, FloatType)) 137 | 138 | putNewTableIntoHBase(Seq(-129, " n129 ", 128: Short), 139 | Seq(IntegerType, StringType, ShortType), 140 | Seq[Any](3.toByte, -512, 12345678901234L, 1234.5678F), 141 | Seq(ByteType, IntegerType, LongType, FloatType)) 142 | 143 | putNewTableIntoHBase(Seq(-127, " n127 ", 128: Short), 144 | Seq(IntegerType, StringType, ShortType), 145 | Seq[Any](4.toByte, -256, 12345678901234L, 1234.5678F), 146 | Seq(ByteType, IntegerType, LongType, FloatType)) 147 | 148 | putNewTableIntoHBase(Seq(-33, " n33 ", 128: Short), 149 | Seq(IntegerType, StringType, ShortType), 150 | Seq[Any](5.toByte, -128, 12345678901234L, 1234.5678F), 151 | Seq(ByteType, 
IntegerType, LongType, FloatType)) 152 | 153 | putNewTableIntoHBase(Seq(-31, " n31 ", 128: Short), 154 | Seq(IntegerType, StringType, ShortType), 155 | Seq[Any](6.toByte, -64, 12345678901234L, 1234.5678F), 156 | Seq(ByteType, IntegerType, LongType, FloatType)) 157 | 158 | putNewTableIntoHBase(Seq(-1, " n1 ", 128: Short), 159 | Seq(IntegerType, StringType, ShortType), 160 | Seq[Any](7.toByte, -1, 12345678901234L, 1234.5678F), 161 | Seq(ByteType, IntegerType, LongType, FloatType)) 162 | 163 | putNewTableIntoHBase(Seq(1, " p1 ", 128: Short), 164 | Seq(IntegerType, StringType, ShortType), 165 | Seq[Any](8.toByte, 1, 12345678901234L, 1234.5678F), 166 | Seq(ByteType, IntegerType, LongType, FloatType)) 167 | 168 | putNewTableIntoHBase(Seq(31, " p31 ", 128: Short), 169 | Seq(IntegerType, StringType, ShortType), 170 | Seq[Any](9.toByte, 4, 12345678901234L, 1234.5678F), 171 | Seq(ByteType, IntegerType, LongType, FloatType)) 172 | 173 | putNewTableIntoHBase(Seq(33, " p33 ", 128: Short), 174 | Seq(IntegerType, StringType, ShortType), 175 | Seq[Any](10.toByte, 64, 12345678901234L, 1234.5678F), 176 | Seq(ByteType, IntegerType, LongType, FloatType)) 177 | 178 | putNewTableIntoHBase(Seq(127, " p127 ", 128: Short), 179 | Seq(IntegerType, StringType, ShortType), 180 | Seq[Any](11.toByte, 128, 12345678901234L, 1234.5678F), 181 | Seq(ByteType, IntegerType, LongType, FloatType)) 182 | 183 | putNewTableIntoHBase(Seq(129, " p129 ", 128: Short), 184 | Seq(IntegerType, StringType, ShortType), 185 | Seq[Any](12.toByte, 256, 12345678901234L, 1234.5678F), 186 | Seq(ByteType, IntegerType, LongType, FloatType)) 187 | 188 | putNewTableIntoHBase(Seq(255, " p255 ", 128: Short), 189 | Seq(IntegerType, StringType, ShortType), 190 | Seq[Any](13.toByte, 512, 12345678901234L, 1234.5678F), 191 | Seq(ByteType, IntegerType, LongType, FloatType)) 192 | 193 | putNewTableIntoHBase(Seq(257, " p257 ", 128: Short), 194 | Seq(IntegerType, StringType, ShortType), 195 | Seq[Any](14.toByte, 1024, 12345678901234L, 1234.5678F), 196 | Seq(ByteType, IntegerType, LongType, FloatType)) 197 | 198 | htable.close() 199 | } 200 | 201 | def makeRowKey(row: Row, dataTypeOfKeys: Seq[DataType]) = { 202 | val rawKeyCol = dataTypeOfKeys.zipWithIndex.map { 203 | case (dataType, index) => 204 | (DataTypeUtils.getRowColumnInHBaseRawType(row, index, dataType), 205 | dataType) 206 | } 207 | 208 | HBaseKVHelper.encodingRawKeyColumns(rawKeyCol) 209 | } 210 | 211 | def addRowVals(put: Put, rowValue: Any, rowType: DataType, 212 | colFamily: String, colQualifier: String) = { 213 | val bos = new ByteArrayOutputStream() 214 | val dos = new DataOutputStream(bos) 215 | val bu = BytesUtils.create(rowType) 216 | rowType match { 217 | case StringType => dos.write(bu.toBytes(rowValue.asInstanceOf[String])) 218 | case IntegerType => dos.write(bu.toBytes(rowValue.asInstanceOf[Int])) 219 | case BooleanType => dos.write(bu.toBytes(rowValue.asInstanceOf[Boolean])) 220 | case ByteType => dos.write(bu.toBytes(rowValue.asInstanceOf[Byte])) 221 | case DoubleType => dos.write(bu.toBytes(rowValue.asInstanceOf[Double])) 222 | case FloatType => dos.write(bu.toBytes(rowValue.asInstanceOf[Float])) 223 | case LongType => dos.write(bu.toBytes(rowValue.asInstanceOf[Long])) 224 | case ShortType => dos.write(bu.toBytes(rowValue.asInstanceOf[Short])) 225 | case _ => throw new Exception("Unsupported HBase SQL Data Type") 226 | } 227 | put.add(Bytes.toBytes(colFamily), Bytes.toBytes(colQualifier), bos.toByteArray) 228 | } 229 | 230 | def testHBaseScanner() = { 231 | val scan = new Scan 232 | val 
htable = new HTable(TestHbase.sparkContext.hadoopConfiguration, HbaseTableName) 233 | val scanner = htable.getScanner(scan) 234 | var res: Result = null 235 | do { 236 | res = scanner.next 237 | if (res != null) logInfo(s"Row ${res.getRow} has map=${res.getNoVersionMap.toString}") 238 | } while (res != null) 239 | } 240 | 241 | def setupData(useMultiplePartitions: Boolean, needInsertData: Boolean = false) { 242 | if (needInsertData && !alreadyInserted) { 243 | createTable(useMultiplePartitions) 244 | insertTestData() 245 | alreadyInserted = true 246 | } 247 | } 248 | } 249 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/hbase/HBaseTestData.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.hbase 2 | 3 | import org.apache.hadoop.fs.{Path, FileSystem} 4 | import org.apache.hadoop.hbase.util.Bytes 5 | import org.apache.hadoop.hbase.{TableExistsException, HColumnDescriptor, HTableDescriptor, TableName} 6 | import org.apache.spark.Logging 7 | import org.apache.spark.sql.SQLContext 8 | 9 | /* 10 | * Licensed to the Apache Software Foundation (ASF) under one or more 11 | * contributor license agreements. See the NOTICE file distributed with 12 | * this work for additional information regarding copyright ownership. 13 | * The ASF licenses this file to You under the Apache License, Version 2.0 14 | * (the "License"); you may not use this file except in compliance with 15 | * the License. You may obtain a copy of the License at 16 | * 17 | * http://www.apache.org/licenses/LICENSE-2.0 18 | * 19 | * Unless required by applicable law or agreed to in writing, software 20 | * distributed under the License is distributed on an "AS IS" BASIS, 21 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 22 | * See the License for the specific language governing permissions and 23 | * limitations under the License. 
24 | */ 25 | 26 | /** 27 | * CreateTableAndLoadData 28 | * 29 | */ 30 | class HBaseTestData extends HBaseIntegrationTestBase { 31 | val DefaultStagingTableName = "StageTable" 32 | val DefaultTableName = "TestTable" 33 | val DefaultHbaseStagingTableName = s"Hb$DefaultStagingTableName" 34 | val DefaultHbaseTabName = s"Hb$DefaultTableName" 35 | val DefaultHbaseColFamilies = Seq("cf1", "cf2") 36 | 37 | val CsvPaths = Array("src/test/resources", "sql/hbase/src/test/resources") 38 | val DefaultLoadFile = "testTable.txt" 39 | 40 | private val tpath = for (csvPath <- CsvPaths 41 | if new java.io.File(csvPath).exists() 42 | ) yield { 43 | logInfo(s"Following path exists $csvPath") 44 | csvPath 45 | } 46 | private[hbase] val CsvPath = tpath(0) 47 | 48 | override protected def beforeAll() = { 49 | super.beforeAll() 50 | createTables(DefaultStagingTableName, DefaultTableName, 51 | DefaultHbaseStagingTableName, DefaultHbaseTabName) 52 | loadData(DefaultStagingTableName, DefaultTableName, s"$CsvPath/$DefaultLoadFile") 53 | } 54 | 55 | override protected def afterAll() = { 56 | super.afterAll() 57 | TestHbase.sql("DROP TABLE " + DefaultStagingTableName) 58 | TestHbase.sql("DROP TABLE " + DefaultTableName) 59 | } 60 | 61 | def createNativeHbaseTable(tableName: String, families: Seq[String]) = { 62 | val hbaseAdmin = TestHbase.hbaseAdmin 63 | val hdesc = new HTableDescriptor(TableName.valueOf(tableName)) 64 | families.foreach { f => hdesc.addFamily(new HColumnDescriptor(f))} 65 | try { 66 | hbaseAdmin.createTable(hdesc) 67 | } catch { 68 | case e: TableExistsException => 69 | logError(s"Table already exists $tableName", e) 70 | } 71 | } 72 | 73 | def createNativeHbaseTable(tableName: String, families: Seq[String], 74 | splitKeys: Array[HBaseRawType]) = { 75 | val hbaseAdmin = TestHbase.hbaseAdmin 76 | val hdesc = new HTableDescriptor(TableName.valueOf(tableName)) 77 | families.foreach { f => hdesc.addFamily(new HColumnDescriptor(f))} 78 | try { 79 | hbaseAdmin.createTable(hdesc, splitKeys) 80 | } catch { 81 | case e: TableExistsException => 82 | logError(s"Table already exists $tableName", e) 83 | } 84 | } 85 | 86 | def dropNativeHbaseTable(tableName: String) = { 87 | try { 88 | val hbaseAdmin = TestHbase.hbaseAdmin 89 | hbaseAdmin.disableTable(tableName) 90 | hbaseAdmin.deleteTable(tableName) 91 | } catch { 92 | case e: TableExistsException => 93 | logError(s"Table already exists $tableName", e) 94 | } 95 | } 96 | 97 | def createTables( 98 | stagingTableName: String, 99 | tableName: String, 100 | hbaseStagingTable: String, 101 | hbaseTable: String) = { 102 | val hbaseAdmin = TestHbase.hbaseAdmin 103 | if (!hbaseAdmin.tableExists(TableName.valueOf(hbaseStagingTable))) { 104 | createNativeHbaseTable(hbaseStagingTable, DefaultHbaseColFamilies) 105 | } 106 | if (!hbaseAdmin.tableExists(TableName.valueOf(hbaseTable))) { 107 | createNativeHbaseTable(hbaseTable, DefaultHbaseColFamilies) 108 | } 109 | 110 | if (TestHbase.catalog.checkLogicalTableExist(stagingTableName)) { 111 | val dropSql = s"DROP TABLE $stagingTableName" 112 | runSql(dropSql) 113 | } 114 | 115 | if (TestHbase.catalog.checkLogicalTableExist(tableName)) { 116 | val dropSql = s"DROP TABLE $tableName" 117 | runSql(dropSql) 118 | } 119 | 120 | val (stagingSql, tabSql) = 121 | ( s"""CREATE TABLE $stagingTableName(strcol STRING, bytecol STRING, shortcol STRING, intcol STRING, 122 | longcol STRING, floatcol STRING, doublecol STRING, PRIMARY KEY(doublecol, strcol, intcol)) 123 | MAPPED BY ($hbaseStagingTable, COLS=[bytecol=cf1.hbytecol, 124 | 
shortcol=cf1.hshortcol, longcol=cf2.hlongcol, floatcol=cf2.hfloatcol])""" 125 | .stripMargin 126 | , 127 | s"""CREATE TABLE $tableName(strcol STRING, bytecol BYTE, shortcol SHORT, intcol INTEGER, 128 | longcol LONG, floatcol FLOAT, doublecol DOUBLE, PRIMARY KEY(doublecol, strcol, intcol)) 129 | MAPPED BY ($hbaseTable, COLS=[bytecol=cf1.hbytecol, 130 | shortcol=cf1.hshortcol, longcol=cf2.hlongcol, floatcol=cf2.hfloatcol])""" 131 | .stripMargin 132 | ) 133 | try { 134 | logInfo(s"invoking $stagingSql ..") 135 | runSql(stagingSql) 136 | } catch { 137 | case e: TableExistsException => 138 | logInfo("IF NOT EXISTS still not implemented so we get the following exception", e) 139 | } 140 | 141 | logDebug(s"Created table $tableName: " + 142 | s"isTableAvailable= ${hbaseAdmin.isTableAvailable(s2b(hbaseStagingTable))}" + 143 | s" tableDescriptor= ${hbaseAdmin.getTableDescriptor(s2b(hbaseStagingTable))}") 144 | 145 | try { 146 | logInfo(s"invoking $tabSql ..") 147 | runSql(tabSql) 148 | } catch { 149 | case e: TableExistsException => 150 | logInfo("IF NOT EXISTS still not implemented so we get the following exception", e) 151 | } 152 | } 153 | 154 | def loadData(stagingTableName: String, tableName: String, loadFile: String) = { 155 | // then load data into table 156 | val loadSql = s"LOAD PARALL DATA LOCAL INPATH '$loadFile' INTO TABLE $tableName" 157 | runSql(loadSql) 158 | } 159 | 160 | def s2b(s: String) = Bytes.toBytes(s) 161 | 162 | def run(sqlCtx: SQLContext, testName: String, sql: String, exparr: Seq[Seq[Any]]) = { 163 | val execQuery1 = sqlCtx.executeSql(sql) 164 | val result1 = runSql(sql) 165 | assert(result1.size == exparr.length, s"$testName failed on size") 166 | verify(testName, 167 | sql, 168 | for (rx <- 0 until exparr.size) 169 | yield result1(rx).toSeq, exparr 170 | ) 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/hbase/TestData.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | import java.sql.Timestamp 21 | 22 | import org.apache.spark.sql.catalyst.plans.logical 23 | 24 | case class TestData(k: Int, v: String) 25 | 26 | object TestData { 27 | import TestHbase.implicits._ 28 | 29 | val testData = TestHbase.sparkContext.parallelize( 30 | (1 to 100).map(i => TestData(i, i.toString))).toDF() 31 | testData.registerTempTable("testData") 32 | 33 | val negativeData = TestHbase.sparkContext.parallelize( 34 | (1 to 100).map(i => TestData(-i, (-i).toString))).toDF() 35 | negativeData.registerTempTable("negativeData") 36 | 37 | case class LargeAndSmallInts(a: Int, b: Int) 38 | 39 | val largeAndSmallInts = 40 | TestHbase.sparkContext.parallelize( 41 | LargeAndSmallInts(2147483644, 1) :: 42 | LargeAndSmallInts(1, 2) :: 43 | LargeAndSmallInts(2147483645, 1) :: 44 | LargeAndSmallInts(2, 2) :: 45 | LargeAndSmallInts(2147483646, 1) :: 46 | LargeAndSmallInts(3, 2) :: Nil).toDF() 47 | largeAndSmallInts.registerTempTable("largeAndSmallInts") 48 | 49 | case class TestData2(a: Int, b: Int) 50 | 51 | val testData2 = 52 | TestHbase.sparkContext.parallelize( 53 | TestData2(1, 1) :: 54 | TestData2(1, 2) :: 55 | TestData2(2, 1) :: 56 | TestData2(2, 2) :: 57 | TestData2(3, 1) :: 58 | TestData2(3, 2) :: Nil).toDF() 59 | testData2.registerTempTable("testData2") 60 | 61 | case class DecimalData(a: BigDecimal, b: BigDecimal) 62 | 63 | val decimalData = 64 | TestHbase.sparkContext.parallelize( 65 | DecimalData(1, 1) :: 66 | DecimalData(1, 2) :: 67 | DecimalData(2, 1) :: 68 | DecimalData(2, 2) :: 69 | DecimalData(3, 1) :: 70 | DecimalData(3, 2) :: Nil).toDF() 71 | decimalData.registerTempTable("decimalData") 72 | 73 | case class BinaryData(a: Array[Byte], b: Int) 74 | 75 | val binaryData = 76 | TestHbase.sparkContext.parallelize( 77 | BinaryData("12".getBytes, 1) :: 78 | BinaryData("22".getBytes, 5) :: 79 | BinaryData("122".getBytes, 3) :: 80 | BinaryData("121".getBytes, 2) :: 81 | BinaryData("123".getBytes, 4) :: Nil).toDF() 82 | binaryData.registerTempTable("binaryData") 83 | 84 | case class TestData3(a: Int, b: Option[Int]) 85 | 86 | val testData3 = 87 | TestHbase.sparkContext.parallelize( 88 | TestData3(1, None) :: 89 | TestData3(2, Some(2)) :: Nil).toDF() 90 | testData3.registerTempTable("testData3") 91 | 92 | val emptyTableData = logical.LocalRelation('a.int, 'b.int) 93 | 94 | case class UpperCaseData(N: Int, L: String) 95 | 96 | val upperCaseData = 97 | TestHbase.sparkContext.parallelize( 98 | UpperCaseData(1, "A") :: 99 | UpperCaseData(2, "B") :: 100 | UpperCaseData(3, "C") :: 101 | UpperCaseData(4, "D") :: 102 | UpperCaseData(5, "E") :: 103 | UpperCaseData(6, "F") :: Nil).toDF() 104 | upperCaseData.registerTempTable("upperCaseData") 105 | 106 | case class LowerCaseData(n: Int, l: String) 107 | 108 | val lowerCaseData = 109 | TestHbase.sparkContext.parallelize( 110 | LowerCaseData(1, "a") :: 111 | LowerCaseData(2, "b") :: 112 | LowerCaseData(3, "c") :: 113 | LowerCaseData(4, "d") :: Nil).toDF() 114 | lowerCaseData.registerTempTable("lowerCaseData") 115 | 116 | case class ArrayData(dt: Seq[Int], nestedData: Seq[Seq[Int]]) 117 | 118 | val arrayData = 119 | TestHbase.sparkContext.parallelize( 120 | ArrayData(Seq(1, 2, 3), Seq(Seq(1, 2, 3))) :: 121 | ArrayData(Seq(2, 3, 4), Seq(Seq(2, 3, 4))) :: Nil) 122 | arrayData.toDF().registerTempTable("arrayData") 123 | 124 | case class MapData(data: scala.collection.Map[Int, String]) 125 | 126 | val mapData = 127 | TestHbase.sparkContext.parallelize( 128 | MapData(Map(1 -> "a1", 2 -> "b1", 
3 -> "c1", 4 -> "d1", 5 -> "e1")) :: 129 | MapData(Map(1 -> "a2", 2 -> "b2", 3 -> "c2", 4 -> "d2")) :: 130 | MapData(Map(1 -> "a3", 2 -> "b3", 3 -> "c3")) :: 131 | MapData(Map(1 -> "a4", 2 -> "b4")) :: 132 | MapData(Map(1 -> "a5")) :: Nil) 133 | mapData.toDF().registerTempTable("mapData") 134 | 135 | case class StringData(s: String) 136 | 137 | val repeatedData = 138 | TestHbase.sparkContext.parallelize(List.fill(2)(StringData("test"))).toDF() 139 | repeatedData.registerTempTable("repeatedData") 140 | 141 | val nullableRepeatedData = 142 | TestHbase.sparkContext.parallelize( 143 | List.fill(2)(StringData(null)) ++ 144 | List.fill(2)(StringData("test"))).toDF() 145 | nullableRepeatedData.registerTempTable("nullableRepeatedData") 146 | 147 | case class NullInts(a: Integer) 148 | 149 | val nullInts = 150 | TestHbase.sparkContext.parallelize( 151 | NullInts(1) :: 152 | NullInts(2) :: 153 | NullInts(3) :: 154 | NullInts(null) :: Nil 155 | ).toDF() 156 | nullInts.registerTempTable("nullInts") 157 | 158 | val allNulls = 159 | TestHbase.sparkContext.parallelize( 160 | NullInts(null) :: 161 | NullInts(null) :: 162 | NullInts(null) :: 163 | NullInts(null) :: Nil).toDF() 164 | allNulls.registerTempTable("allNulls") 165 | 166 | case class NullStrings(n: Int, s: String) 167 | 168 | val nullStrings = 169 | TestHbase.sparkContext.parallelize( 170 | NullStrings(1, "abc") :: 171 | NullStrings(2, "ABC") :: 172 | NullStrings(3, null) :: Nil).toDF() 173 | nullStrings.registerTempTable("nullStrings") 174 | 175 | case class TableName(tableName: String) 176 | 177 | TestHbase.sparkContext.parallelize(TableName("test") :: Nil).toDF().registerTempTable("tableName") 178 | 179 | val unparsedStrings = 180 | TestHbase.sparkContext.parallelize( 181 | "1, A1, true, null" :: 182 | "2, B2, false, null" :: 183 | "3, C3, true, null" :: 184 | "4, D4, true, 2147483644" :: Nil) 185 | 186 | case class TimestampField(time: Timestamp) 187 | 188 | val timestamps = TestHbase.sparkContext.parallelize((1 to 3).map { i => 189 | TimestampField(new Timestamp(i)) 190 | }).toDF() 191 | timestamps.registerTempTable("timestamps") 192 | 193 | case class IntField(i: Int) 194 | 195 | // An RDD with 4 elements and 8 partitions 196 | val withEmptyParts = TestHbase.sparkContext.parallelize((1 to 4).map(IntField), 8).toDF() 197 | withEmptyParts.registerTempTable("withEmptyParts") 198 | 199 | case class Person(id: Int, name: String, age: Int) 200 | 201 | case class Salary(personId: Int, salary: Double) 202 | 203 | val person = TestHbase.sparkContext.parallelize( 204 | Person(0, "mike", 30) :: 205 | Person(1, "jim", 20) :: Nil) 206 | person.toDF().registerTempTable("person") 207 | val salary = TestHbase.sparkContext.parallelize( 208 | Salary(0, 2000.0) :: 209 | Salary(1, 1000.0) :: Nil).toDF() 210 | salary.registerTempTable("salary") 211 | 212 | case class ComplexData(m: Map[Int, String], s: TestData, a: Seq[Int], b: Boolean) 213 | 214 | val complexData = 215 | TestHbase.sparkContext.parallelize( 216 | ComplexData(Map(1 -> "1"), TestData(1, "1"), Seq(1), b = true) 217 | :: ComplexData(Map(2 -> "2"), TestData(2, "2"), Seq(2), b = false) 218 | :: Nil).toDF() 219 | complexData.registerTempTable("complexData") 220 | } 221 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/hbase/TestHbase.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license 
agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | import org.apache.hadoop.hbase.{HBaseTestingUtility, MiniHBaseCluster} 21 | import org.apache.hadoop.hbase.client.HBaseAdmin 22 | 23 | import org.apache.spark.{SparkConf, SparkContext} 24 | 25 | 26 | object TestHbase 27 | extends HBaseSQLContext( 28 | new SparkContext("local[2]", "TestSQLContext", new SparkConf(true) 29 | .set("spark.hadoop.hbase.zookeeper.quorum", "localhost"))) { 30 | 31 | @transient val testUtil: HBaseTestingUtility = 32 | new HBaseTestingUtility(sparkContext.hadoopConfiguration) 33 | 34 | val nRegionServers: Int = 1 35 | val nDataNodes: Int = 1 36 | val nMasters: Int = 1 37 | 38 | logDebug(s"Spin up hbase minicluster w/ $nMasters master, $nRegionServers RS, $nDataNodes dataNodes") 39 | 40 | @transient val cluster: MiniHBaseCluster = testUtil.startMiniCluster(nMasters, nRegionServers, nDataNodes) 41 | logInfo(s"Started HBaseMiniCluster with regions = ${cluster.countServedRegions}") 42 | 43 | logInfo(s"Configuration zkPort=" 44 | + s"${sparkContext.hadoopConfiguration.get("hbase.zookeeper.property.clientPort")}") 45 | 46 | @transient lazy val hbaseAdmin: HBaseAdmin = new HBaseAdmin(sparkContext.hadoopConfiguration) 47 | } 48 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/hbase/TpcMiniTestSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | import org.apache.hadoop.hbase._ 21 | 22 | /** 23 | * This is a mini TPC test suite running against the mini-cluster 24 | */ 25 | class TpcMiniTestSuite extends HBaseIntegrationTestBase { 26 | private val tableName = "store_sales" 27 | private val hbaseTableName = "store_sales_htable" 28 | private val hbaseFamilies = Seq("f") 29 | 30 | private val csvPaths = Array("src/test/resources", "sql/hbase/src/test/resources") 31 | private val csvFile = "store_sales.txt" 32 | private val tpath = for (csvPath <- csvPaths if new java.io.File(csvPath).exists()) yield { 33 | logInfo(s"Following path exists $csvPath\n") 34 | csvPath 35 | } 36 | private[hbase] val csvPath = tpath(0) 37 | 38 | override protected def beforeAll() = { 39 | val hbaseAdmin = TestHbase.hbaseAdmin 40 | 41 | /** 42 | * create the hbase table if it does not exist 43 | */ 44 | if (!hbaseAdmin.tableExists(TableName.valueOf(hbaseTableName))) { 45 | val descriptor = new HTableDescriptor(TableName.valueOf(hbaseTableName)) 46 | hbaseFamilies.foreach { f => descriptor.addFamily(new HColumnDescriptor(f))} 47 | try { 48 | hbaseAdmin.createTable(descriptor) 49 | } catch { 50 | case e: TableExistsException => 51 | logError(s"Table already exists $hbaseTableName", e) 52 | } 53 | } 54 | 55 | /** 56 | * drop the existing logical table if it exists 57 | */ 58 | if (TestHbase.catalog.checkLogicalTableExist(tableName)) { 59 | val dropSql = "DROP TABLE " + tableName 60 | try { 61 | runSql(dropSql) 62 | } catch { 63 | case e: IllegalStateException => 64 | logError(s"Error occurs while dropping the table $tableName", e) 65 | } 66 | } 67 | 68 | /** 69 | * create table 70 | */ 71 | val createSql = 72 | s"""CREATE TABLE store_sales( 73 | ss_sold_date_sk INTEGER, 74 | ss_sold_time_sk INTEGER, 75 | ss_item_sk INTEGER, 76 | ss_customer_sk INTEGER, 77 | ss_cdemo_sk INTEGER, 78 | ss_hdemo_sk INTEGER, 79 | ss_addr_sk INTEGER, 80 | ss_store_sk INTEGER, 81 | ss_promo_sk INTEGER, 82 | ss_ticket_number INTEGER, 83 | ss_quantity INTEGER, 84 | ss_wholesale_cost FLOAT, 85 | ss_list_price FLOAT, 86 | ss_sales_price FLOAT, 87 | ss_ext_discount_amt FLOAT, 88 | ss_ext_sales_price FLOAT, 89 | ss_ext_wholesale_cost FLOAT, 90 | ss_ext_list_price FLOAT, 91 | ss_ext_tax FLOAT, 92 | ss_coupon_amt FLOAT, 93 | ss_net_paid FLOAT, 94 | ss_net_paid_inc_tax FLOAT, 95 | ss_net_profit FLOAT, 96 | PRIMARY KEY(ss_item_sk, ss_ticket_number)) 97 | MAPPED BY 98 | (store_sales_htable, COLS=[ 99 | ss_sold_date_sk=f.ss_sold_date_sk, 100 | ss_sold_time_sk=f.ss_sold_time_sk, 101 | ss_customer_sk=f.ss_customer_sk, 102 | ss_cdemo_sk=f.ss_cdemo_sk, 103 | ss_hdemo_sk=f.ss_hdemo_sk, 104 | ss_addr_sk=f.ss_addr_sk, 105 | ss_store_sk=f.ss_store_sk, 106 | ss_promo_sk=f.ss_promo_sk, 107 | ss_quantity=f.ss_quantity, 108 | ss_wholesale_cost=f.ss_wholesale_cost, 109 | ss_list_price=f.ss_list_price, 110 | ss_sales_price=f.ss_sales_price, 111 | ss_ext_discount_amt=f.ss_ext_discount_amt, 112 | ss_ext_sales_price=f.ss_ext_sales_price, 113 | ss_ext_wholesale_cost=f.ss_ext_wholesale_cost, 114 | ss_ext_list_price=f.ss_ext_list_price, 115 | ss_ext_tax=f.ss_ext_tax, 116 | ss_coupon_amt=f.ss_coupon_amt, 117 | ss_net_paid=f.ss_net_paid, 118 | ss_net_paid_inc_tax=f.ss_net_paid_inc_tax, 119 | ss_net_profit=f.ss_net_profit 120 | ])""".stripMargin 121 | 122 | try { 123 | runSql(createSql) 124 | } catch { 125 | case e: IllegalStateException => 126 | logError(s"Error occurs while creating the table $tableName", e) 127 | } 128 | 129 | /** 130 | * load the data 131 | */ 132 | val
loadSql = "LOAD DATA LOCAL INPATH '" + s"$csvPath/$csvFile" + 133 | "' INTO TABLE store_sales" 134 | try { 135 | runSql(loadSql) 136 | } catch { 137 | case e: IllegalStateException => 138 | logError(s"Error occurs while loading the data $tableName", e) 139 | } 140 | } 141 | 142 | override protected def afterAll() = { 143 | runSql("DROP TABLE " + tableName) 144 | } 145 | 146 | test("Query 0") { 147 | val sql = "SELECT count(1) FROM store_sales" 148 | val rows = runSql(sql) 149 | assert(rows(0).get(0) == 100) 150 | } 151 | 152 | test("Query 1") { 153 | val sql = "SELECT ss_quantity, ss_wholesale_cost, ss_list_price FROM store_sales WHERE ss_item_sk = 2744 AND ss_ticket_number = 1" 154 | val rows = runSql(sql) 155 | assert(rows(0).get(0) == 37) 156 | assert(rows(0).get(1) == 63.63f) 157 | assert(rows(0).get(2) == 101.17f) 158 | } 159 | 160 | test("Query 2") { 161 | val sql = "SELECT ss_sold_date_sk, ss_sold_time_sk, ss_store_sk FROM store_sales WHERE ss_item_sk = 2744 AND ss_ticket_number = 1" 162 | val rows = runSql(sql) 163 | assert(rows(0).get(0) == 2451813) 164 | assert(rows(0).get(1) == 65495) 165 | assert(rows(0).get(2) == 25) 166 | } 167 | 168 | test("Query 3") { 169 | val sql = "SELECT ss_customer_sk, ss_promo_sk, ss_coupon_amt FROM store_sales WHERE ss_item_sk = 2744 AND ss_ticket_number = 1" 170 | val rows = runSql(sql) 171 | assert(rows(0).get(0) == 225006) 172 | assert(rows(0).get(1) == 354) 173 | assert(rows(0).get(2) == 46.03f) 174 | } 175 | 176 | test("Query 4") { 177 | val sql = "SELECT ss_item_sk, count(1) FROM store_sales GROUP BY ss_item_sk" 178 | val rows = runSql(sql) 179 | assert(rows.size == 100) 180 | } 181 | 182 | test("Query 5") { 183 | val sql = "SELECT ss_item_sk, ss_ticket_number, count(1) FROM store_sales WHERE ss_item_sk > 4000 AND ss_item_sk < 5000 GROUP BY ss_item_sk, ss_ticket_number" 184 | val rows = runSql(sql) 185 | assert(rows.size == 5) 186 | } 187 | 188 | test("Query 6") { 189 | val sql = "SELECT ss_item_sk, avg(ss_quantity) as avg_qty, count(ss_quantity) as cnt_qty FROM store_sales WHERE ss_item_sk = 2744 GROUP BY ss_item_sk" 190 | val rows = runSql(sql) 191 | assert(rows.size == 1) 192 | } 193 | 194 | test("Query 7") { 195 | val sql = "SELECT ss_item_sk, ss_ticket_number, sum(ss_wholesale_cost) as sum_wholesale_cost FROM store_sales WHERE ss_item_sk > 4000 AND ss_item_sk <= 5000 GROUP BY ss_item_sk, ss_ticket_number" 196 | val rows = runSql(sql) 197 | assert(rows.size == 5) 198 | } 199 | 200 | test("Query 8") { 201 | val sql = "SELECT ss_item_sk, ss_ticket_number, min(ss_wholesale_cost) as min_wholesale_cost, max(ss_wholesale_cost) as max_wholesale_cost, avg(ss_wholesale_cost) as avg_wholesale_cost FROM store_sales WHERE ss_item_sk > 4000 AND ss_item_sk <= 5000 GROUP BY ss_item_sk, ss_ticket_number" 202 | val rows = runSql(sql) 203 | assert(rows.size == 5) 204 | } 205 | 206 | test("Query 9") { 207 | val sql = "SELECT ss_item_sk, count(ss_customer_sk) as count_ss_customer_sk FROM store_sales WHERE ss_item_sk > 4000 AND ss_item_sk <= 5000 GROUP BY ss_item_sk" 208 | val rows = runSql(sql) 209 | assert(rows.size == 5) 210 | } 211 | 212 | test("Query 10") { 213 | val sql = "SELECT count(*) FROM store_sales WHERE ss_net_profit < 100" 214 | val rows = runSql(sql) 215 | assert(rows(0).get(0) == 74) 216 | } 217 | 218 | test("Query 11") { 219 | val sql = "SELECT count(*) FROM store_sales WHERE ss_coupon_amt < 50 AND ss_ext_discount_amt < 50 AND ss_net_paid < 50 AND ss_net_paid_inc_tax < 50" 220 | val rows = runSql(sql) 221 | assert(rows(0).get(0) == 6) 222 | } 
223 | 224 | test("Query 12") { 225 | val sql = "SELECT count(distinct ss_customer_sk) as count_distinct_customer FROM store_sales" 226 | val rows = runSql(sql) 227 | assert(rows(0).get(0) == 8) 228 | } 229 | 230 | test("Query 13") { 231 | val sql = "SELECT * FROM store_sales limit 100" 232 | val rows = runSql(sql) 233 | assert(rows.size == 100) 234 | } 235 | 236 | test("Query 14") { 237 | val sql = "SELECT ss_customer_sk, count(*) FROM store_sales WHERE ss_item_sk >= 4000 AND ss_item_sk <= 5000 GROUP BY ss_customer_sk" 238 | val rows = runSql(sql) 239 | assert(rows.size == 5) 240 | } 241 | 242 | test("Query 15") { 243 | val sql = "SELECT count(ss_customer_sk) as count_customer FROM store_sales WHERE ss_customer_sk IN (1,25,50,75,100)" 244 | val rows = runSql(sql) 245 | assert(rows(0).get(0) == 0) 246 | } 247 | 248 | test("Query 16") { 249 | val sql = "SELECT count(ss_customer_sk) as count_customer FROM store_sales WHERE ss_customer_sk < 100 AND ss_quantity < 5" 250 | val rows = runSql(sql) 251 | assert(rows(0).get(0) == 2) 252 | } 253 | 254 | test("Query 17") { 255 | val sql = "SELECT count(ss_customer_sk) as count_customer FROM store_sales WHERE ss_customer_sk > 100" 256 | val rows = runSql(sql) 257 | assert(rows(0).get(0) == 83) 258 | } 259 | 260 | test("Query 18") { 261 | val sql = "SELECT ss_item_sk, ss_ticket_number FROM store_sales WHERE (ss_item_sk = 186 AND ss_ticket_number > 0)" 262 | val rows = runSql(sql) 263 | assert(rows.size == 1) 264 | } 265 | } 266 | --------------------------------------------------------------------------------