├── .gitignore ├── LICENSE ├── README.md ├── bin ├── compute-classpath.cmd ├── compute-classpath.sh └── hbase-sql ├── doc └── SparkSQLOnHBase_v2.1.docx ├── pom.xml ├── python └── pyspark │ ├── __init__.py │ ├── java_gateway.py │ └── sql.py └── src ├── main └── scala │ └── org │ └── apache │ └── spark │ └── sql │ └── hbase │ ├── HBaseCatalog.scala │ ├── HBaseCriticalPoint.scala │ ├── HBasePartition.scala │ ├── HBasePartitioner.scala │ ├── HBaseRelation.scala │ ├── HBaseSQLCliDriver.scala │ ├── HBaseSQLConf.scala │ ├── HBaseSQLContext.scala │ ├── HBaseSQLParser.scala │ ├── HBaseSQLReaderRDD.scala │ ├── HBaseShuffledRDD.scala │ ├── HadoopReader.scala │ ├── IndexMappable.scala │ ├── ScanPredClassifier.scala │ ├── catalyst │ ├── NotPusher.scala │ └── expressions │ │ └── PartialPredicateOperations.scala │ ├── execution │ ├── HBaseSQLTableScan.scala │ ├── HBaseStrategies.scala │ └── hbaseCommands.scala │ ├── package.scala │ ├── types │ ├── HBaseBytesType.scala │ ├── PartialOrderingDataType.scala │ └── RangeType.scala │ └── util │ ├── BytesUtils.scala │ ├── DataTypeUtils.scala │ ├── HBaseKVHelper.scala │ └── Util.scala └── test ├── java └── org │ └── apache │ └── spark │ └── sql │ └── hbase │ └── api │ └── java │ └── JavaAPISuite.java ├── resources ├── joinTable1.txt ├── joinTable2.txt ├── joinTable3.txt ├── joinTable4.txt ├── loadData.txt ├── loadNullableData.txt ├── log4j.properties ├── onecoljoin1.txt ├── onecoljoin2.txt ├── splitLoadData.txt ├── splitLoadData1.txt ├── store_sales.txt └── testTable.txt └── scala └── org └── apache └── spark └── sql └── hbase ├── AggregateQueriesSuite.scala ├── BasicQueriesSuite.scala ├── BulkLoadIntoTableSuite.scala ├── BytesUtilsSuite.scala ├── CatalogTestSuite.scala ├── CriticalPointsTestSuite.scala ├── HBaseAdvancedSQLQuerySuite.scala ├── HBaseBasicOperationSuite.scala ├── HBaseInsertTableSuite.scala ├── HBaseIntegrationTestBase.scala ├── HBasePartitionerSuite.scala ├── HBaseSQLQuerySuite.scala ├── HBaseSplitTestData.scala ├── HBaseTestData.scala ├── TestData.scala ├── TestHbase.scala └── TpcMiniTestSuite.scala /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.#* 3 | *#*# 4 | *.swp 5 | *.ipr 6 | *.iml 7 | *.iws 8 | *.pyc 9 | .idea/ 10 | .idea_modules/ 11 | build/*.jar 12 | .settings 13 | .cache 14 | cache 15 | .generated-mima* 16 | work/ 17 | out/ 18 | .DS_Store 19 | third_party/libmesos.so 20 | third_party/libmesos.dylib 21 | build/apache-maven* 22 | build/zinc* 23 | build/scala* 24 | conf/java-opts 25 | conf/*.sh 26 | conf/*.cmd 27 | conf/*.properties 28 | conf/*.conf 29 | conf/*.xml 30 | conf/slaves 31 | docs/_site 32 | docs/api 33 | target/ 34 | reports/ 35 | .project 36 | .classpath 37 | .scala_dependencies 38 | lib_managed/ 39 | src_managed/ 40 | project/boot/ 41 | project/plugins/project/build.properties 42 | project/build/target/ 43 | project/plugins/target/ 44 | project/plugins/lib_managed/ 45 | project/plugins/src_managed/ 46 | logs/ 47 | log/ 48 | spark-tests.log 49 | streaming-tests.log 50 | dependency-reduced-pom.xml 51 | .ensime 52 | .ensime_lucene 53 | checkpoint 54 | derby.log 55 | dist/ 56 | dev/create-release/*txt 57 | dev/create-release/*final 58 | spark-*-bin-*.tgz 59 | unit-tests.log 60 | /lib/ 61 | ec2/lib/ 62 | rat-results.txt 63 | scalastyle.txt 64 | scalastyle-output.xml 65 | 66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spark HBase 2 | 3 | 
Apache HBase is a distributed key-value store on HDFS. It is modeled after Google's Bigtable and provides APIs to query the data. The data is organized, partitioned and distributed by its "row keys". Within each partition, the data is further physically partitioned by "column families" that specify collections of "columns" of data. The data model is designed for wide, sparse tables whose columns are dynamic. 4 | 5 | Although HBase is a very useful big data store, its access mechanisms are primitive: client-side APIs, Map/Reduce interfaces and interactive shells. SQL access to HBase data is available through Map/Reduce or interface mechanisms such as Apache Hive and Impala, or through some "native" SQL technologies like Apache Phoenix. While the former are usually cheaper to implement and use, their latency and efficiency rarely compare favorably with the latter, and they are often suitable only for offline analysis. The latter category, in contrast, generally performs better and qualifies as online engines; these typically sit on top of purpose-built execution engines. 6 | 7 | Currently Spark supports queries against HBase data through HBase's Map/Reduce interface (i.e., TableInputFormat). Spark SQL supports Hive data and therefore, in theory, can access HBase data out of the box through HBase's Map/Reduce interface; this approach falls into the first category of "SQL on HBase" technologies. 8 | 9 | We believe that, as a unified big data processing engine, Spark is in a good position to provide better HBase support.
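To give a quick feel for what Spark HBase provides, here is a minimal Scala sketch of creating and querying an HBase-backed table through `HBaseSQLContext`. The SQL syntax follows the CLI HELP text in this repository; the table, column and column-family names are purely illustrative.
```
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hbase.HBaseSQLContext

val sc = new SparkContext(new SparkConf().setAppName("SparkHBaseExample"))
val hbaseCtx = new HBaseSQLContext(sc)

// Map a logical SQL table onto an HBase table: the PRIMARY KEY columns are
// encoded into the HBase row key, the remaining columns map to
// column-family qualifiers.
hbaseCtx.sql(
  """CREATE TABLE teacher (grade INT, class INT, subject STRING, name STRING, age INT,
    |PRIMARY KEY(grade, class, subject))
    |MAPPED BY (hbase_teacher, COLS=[name=cf.name, age=cf.age])""".stripMargin)

// Query it with ordinary SQL; results come back as Spark SQL rows.
hbaseCtx.sql("SELECT name, age FROM teacher WHERE grade = 1 AND class = 2")
  .collect()
  .foreach(println)
```
The same statements can also be entered interactively through `./bin/hbase-sql` (see the Interactive Scala Shell section below).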
10 | 11 | ## Online Documentation 12 | 13 | Online documentation can be found on the [Spark JIRA page](https://issues.apache.org/jira/browse/SPARK-3880). 14 | 15 | ## Building Spark HBase 16 | 17 | Spark HBase is built using [Apache Maven](http://maven.apache.org/). 18 | 19 | The work of separating the spark-hbase sub-project from the Spark project is ongoing. 20 | Some manual steps are required to build the new stand-alone spark-hbase project until this task 21 | is complete. 22 | 23 | In an effort to avoid confusion over the terms spark, spark-hbase, and hbase, these two projects 24 | are referred to here as 25 | 26 | "Spark-Huawei/spark": https://github.com/Huawei-Spark/spark.git (spark + all sub-modules) 27 | "Spark-Huawei/hbase": https://github.com/Huawei-Spark/hbase.git (standalone spark-hbase project) 28 | 29 | In short, you will need to manually delete the spark/sql/hbase module from the Spark-Huawei/spark 30 | source tree along with all references to it in the Spark build infrastructure, build and install Spark, and then build 31 | the standalone Spark-Huawei/hbase project. 32 | 33 | 34 | Here is the step-by-step process: 35 | 36 | I. Clone, edit, build Spark-Huawei/spark 37 | 38 | Define a SPARK_HOME environment variable on your development machine and clone the project to that location. 39 | ``` 40 | $ git clone https://github.com/Huawei-Spark/spark.git 41 | ``` 42 | Change your current working directory to SPARK_HOME and make sure you have checked out the 'hbase' branch. 43 | ``` 44 | $ git branch 45 | output: * hbase 46 | ``` 47 | Manually remove the sql/hbase module from the Spark-Huawei/spark project. 48 | ``` 49 | $ rm -rf $SPARK_HOME/sql/hbase 50 | ``` 51 | Edit the Spark project's parent pom.xml and delete the line 'sql/hbase' (it appears in two locations). 52 | 53 | Build and install Spark-Huawei/spark; it must be installed in your local Maven repository. 54 | ``` 55 | $ mvn -e -T1C -Pyarn,hadoop-2.4,hive -Dhadoop.version=2.4.0 -DskipTests clean package install 56 | ``` 57 | II. Clone and build Spark-Huawei/hbase (the new standalone spark-hbase project) 58 | 59 | Change your current working directory to the parent directory of SPARK_HOME and clone the standalone spark-hbase project. 60 | ``` 61 | $ git clone https://github.com/Huawei-Spark/hbase.git 62 | ``` 63 | Make sure you have checked out the 'master' branch. 64 | ``` 65 | $ git branch 66 | output: * master 67 | ``` 68 | With Spark installed in your local Maven repository, you can now build Spark-Huawei/hbase against it. 69 | ``` 70 | $ mvn -e -T1C -Phbase,hadoop-2.4 -Dhadoop.version=2.4.0 -DskipTests clean package install 71 | ``` 72 | III. Run the Spark-Huawei/hbase test suites against an HBase minicluster, from Maven. 73 | ``` 74 | $ mvn -e -T1C -Phbase,hadoop-2.4 -Dhadoop.version=2.4.0 test 75 | ``` 76 | 77 | ## Interactive Scala Shell 78 | 79 | The easiest way to start using Spark HBase is through the Scala shell: 80 | 81 | ./bin/hbase-sql 82 | 83 | 84 | ## Running Tests 85 | 86 | Testing first requires [building Spark HBase](#building-spark-hbase). Once Spark HBase is built, tests 87 | can be run using: 88 | 89 | ./dev/run-tests 90 | 91 | Run all test suites from Maven: 92 | 93 | mvn -Phbase,hadoop-2.4 test 94 | 95 | Run a single test suite from Maven, for example: 96 | 97 | mvn -Phbase,hadoop-2.4 test -DwildcardSuites=org.apache.spark.sql.hbase.BasicQueriesSuite 98 | 99 | ## IDE Setup 100 | 101 | We use IntelliJ IDEA for Spark HBase development. You can get the community edition for free and install the JetBrains Scala plugin from Preferences > Plugins. 102 | 103 | To import the current Spark HBase project into IntelliJ: 104 | 105 | 1. Download IntelliJ and install the Scala plug-in for IntelliJ. You may also need to install the Maven plug-in for IntelliJ. 106 | 2. Go to "File -> Import Project", locate the Spark HBase source directory, and select "Maven Project". 107 | 3. In the Import Wizard, select "Import Maven projects automatically" and leave other settings at their defaults. 108 | 4. Make sure the required profiles are enabled: select the corresponding Hadoop version, "maven3" and also "hbase" in order to get the right dependencies. 109 | 5. Leave other settings at their defaults and you should be able to start your development. 110 | 6. When you run the Scala tests, you may occasionally hit an out-of-memory error. You can increase the VM memory with settings such as: 111 | 112 | ``` 113 | -XX:MaxPermSize=512m -Xmx3072m 114 | ``` 115 | 116 | You can also make these settings the default under "Defaults -> ScalaTest". 117 | 118 | ## Configuration 119 | 120 | Please refer to the [Configuration guide](http://spark.apache.org/docs/latest/configuration.html) 121 | in the online documentation for an overview of how to configure Spark. 122 | -------------------------------------------------------------------------------- /bin/compute-classpath.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem 4 | rem Licensed to the Apache Software Foundation (ASF) under one or more 5 | rem contributor license agreements. See the NOTICE file distributed with 6 | rem this work for additional information regarding copyright ownership. 7 | rem The ASF licenses this file to You under the Apache License, Version 2.0 8 | rem (the "License"); you may not use this file except in compliance with 9 | rem the License.
You may obtain a copy of the License at 10 | rem 11 | rem http://www.apache.org/licenses/LICENSE-2.0 12 | rem 13 | rem Unless required by applicable law or agreed to in writing, software 14 | rem distributed under the License is distributed on an "AS IS" BASIS, 15 | rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | rem See the License for the specific language governing permissions and 17 | rem limitations under the License. 18 | rem 19 | 20 | rem This script computes Spark's classpath and prints it to stdout; it's used by both the "run" 21 | rem script and the ExecutorRunner in standalone cluster mode. 22 | 23 | rem If we're called from spark-class2.cmd, it already set enabledelayedexpansion and setting 24 | rem it here would stop us from affecting its copy of the CLASSPATH variable; otherwise we 25 | rem need to set it here because we use !datanucleus_jars! below. 26 | if "%DONT_PRINT_CLASSPATH%"=="1" goto skip_delayed_expansion 27 | setlocal enabledelayedexpansion 28 | :skip_delayed_expansion 29 | 30 | set SCALA_VERSION=2.10 31 | 32 | rem Figure out where the Spark framework is installed 33 | set FWDIR=%~dp0..\ 34 | 35 | rem Load environment variables from conf\spark-env.cmd, if it exists 36 | if exist "%FWDIR%conf\spark-env.cmd" call "%FWDIR%conf\spark-env.cmd" 37 | 38 | rem Build up classpath 39 | set CLASSPATH=%SPARK_CLASSPATH%;%SPARK_SUBMIT_CLASSPATH% 40 | 41 | if not "x%SPARK_CONF_DIR%"=="x" ( 42 | set CLASSPATH=%CLASSPATH%;%SPARK_CONF_DIR% 43 | ) else ( 44 | set CLASSPATH=%CLASSPATH%;%FWDIR%conf 45 | ) 46 | 47 | if exist "%FWDIR%RELEASE" ( 48 | for %%d in ("%FWDIR%lib\spark-assembly*.jar") do ( 49 | set ASSEMBLY_JAR=%%d 50 | ) 51 | ) else ( 52 | for %%d in ("%FWDIR%assembly\target\scala-%SCALA_VERSION%\spark-assembly*hadoop*.jar") do ( 53 | set ASSEMBLY_JAR=%%d 54 | ) 55 | ) 56 | 57 | set CLASSPATH=%CLASSPATH%;%ASSEMBLY_JAR% 58 | 59 | rem When Hive support is needed, Datanucleus jars must be included on the classpath. 60 | rem Datanucleus jars do not work if only included in the uber jar as plugin.xml metadata is lost. 61 | rem Both sbt and maven will populate "lib_managed/jars/" with the datanucleus jars when Spark is 62 | rem built with Hive, so look for them there. 
63 | if exist "%FWDIR%RELEASE" ( 64 | set datanucleus_dir=%FWDIR%lib 65 | ) else ( 66 | set datanucleus_dir=%FWDIR%lib_managed\jars 67 | ) 68 | set "datanucleus_jars=" 69 | for %%d in ("%datanucleus_dir%\datanucleus-*.jar") do ( 70 | set datanucleus_jars=!datanucleus_jars!;%%d 71 | ) 72 | set CLASSPATH=%CLASSPATH%;%datanucleus_jars% 73 | 74 | set SPARK_CLASSES=%FWDIR%core\target\scala-%SCALA_VERSION%\classes 75 | set SPARK_CLASSES=%SPARK_CLASSES%;%FWDIR%repl\target\scala-%SCALA_VERSION%\classes 76 | set SPARK_CLASSES=%SPARK_CLASSES%;%FWDIR%mllib\target\scala-%SCALA_VERSION%\classes 77 | set SPARK_CLASSES=%SPARK_CLASSES%;%FWDIR%bagel\target\scala-%SCALA_VERSION%\classes 78 | set SPARK_CLASSES=%SPARK_CLASSES%;%FWDIR%graphx\target\scala-%SCALA_VERSION%\classes 79 | set SPARK_CLASSES=%SPARK_CLASSES%;%FWDIR%streaming\target\scala-%SCALA_VERSION%\classes 80 | set SPARK_CLASSES=%SPARK_CLASSES%;%FWDIR%tools\target\scala-%SCALA_VERSION%\classes 81 | set SPARK_CLASSES=%SPARK_CLASSES%;%FWDIR%sql\catalyst\target\scala-%SCALA_VERSION%\classes 82 | set SPARK_CLASSES=%SPARK_CLASSES%;%FWDIR%sql\core\target\scala-%SCALA_VERSION%\classes 83 | set SPARK_CLASSES=%SPARK_CLASSES%;%FWDIR%sql\hive\target\scala-%SCALA_VERSION%\classes 84 | set SPARK_CLASSES=%SPARK_CLASSES%;%FWDIR%sql\hbase\target\scala-%SCALA_VERSION%\classes 85 | 86 | set SPARK_TEST_CLASSES=%FWDIR%core\target\scala-%SCALA_VERSION%\test-classes 87 | set SPARK_TEST_CLASSES=%SPARK_TEST_CLASSES%;%FWDIR%repl\target\scala-%SCALA_VERSION%\test-classes 88 | set SPARK_TEST_CLASSES=%SPARK_TEST_CLASSES%;%FWDIR%mllib\target\scala-%SCALA_VERSION%\test-classes 89 | set SPARK_TEST_CLASSES=%SPARK_TEST_CLASSES%;%FWDIR%bagel\target\scala-%SCALA_VERSION%\test-classes 90 | set SPARK_TEST_CLASSES=%SPARK_TEST_CLASSES%;%FWDIR%graphx\target\scala-%SCALA_VERSION%\test-classes 91 | set SPARK_TEST_CLASSES=%SPARK_TEST_CLASSES%;%FWDIR%streaming\target\scala-%SCALA_VERSION%\test-classes 92 | set SPARK_TEST_CLASSES=%SPARK_TEST_CLASSES%;%FWDIR%sql\catalyst\target\scala-%SCALA_VERSION%\test-classes 93 | set SPARK_TEST_CLASSES=%SPARK_TEST_CLASSES%;%FWDIR%sql\core\target\scala-%SCALA_VERSION%\test-classes 94 | set SPARK_TEST_CLASSES=%SPARK_TEST_CLASSES%;%FWDIR%sql\hive\target\scala-%SCALA_VERSION%\test-classes 95 | set SPARK_TEST_CLASSES=%SPARK_TEST_CLASSES%;%FWDIR%sql\hbase\target\scala-%SCALA_VERSION%\test-classes 96 | 97 | if "x%SPARK_TESTING%"=="x1" ( 98 | rem Add test clases to path - note, add SPARK_CLASSES and SPARK_TEST_CLASSES before CLASSPATH 99 | rem so that local compilation takes precedence over assembled jar 100 | set CLASSPATH=%SPARK_CLASSES%;%SPARK_TEST_CLASSES%;%CLASSPATH% 101 | ) 102 | 103 | rem Add hadoop conf dir - else FileSystem.*, etc fail 104 | rem Note, this assumes that there is either a HADOOP_CONF_DIR or YARN_CONF_DIR which hosts 105 | rem the configurtion files. 106 | if "x%HADOOP_CONF_DIR%"=="x" goto no_hadoop_conf_dir 107 | set CLASSPATH=%CLASSPATH%;%HADOOP_CONF_DIR% 108 | :no_hadoop_conf_dir 109 | 110 | if "x%YARN_CONF_DIR%"=="x" goto no_yarn_conf_dir 111 | set CLASSPATH=%CLASSPATH%;%YARN_CONF_DIR% 112 | :no_yarn_conf_dir 113 | 114 | rem To allow for distributions to append needed libraries to the classpath (e.g. when 115 | rem using the "hadoop-provided" profile to build Spark), check SPARK_DIST_CLASSPATH and 116 | rem append it to tbe final classpath. 
117 | if not "x%$SPARK_DIST_CLASSPATH%"=="x" ( 118 | set CLASSPATH=%CLASSPATH%;%SPARK_DIST_CLASSPATH% 119 | ) 120 | 121 | rem A bit of a hack to allow calling this script within run2.cmd without seeing output 122 | if "%DONT_PRINT_CLASSPATH%"=="1" goto exit 123 | 124 | echo %CLASSPATH% 125 | 126 | :exit 127 | -------------------------------------------------------------------------------- /bin/compute-classpath.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # This script computes Spark's classpath and prints it to stdout; it's used by both the "run" 21 | # script and the ExecutorRunner in standalone cluster mode. 22 | 23 | # Figure out where Spark is installed 24 | FWDIR="$(cd "`dirname "$0"`"/..; pwd)" 25 | 26 | . "$FWDIR"/bin/load-spark-env.sh 27 | 28 | if [ -n "$SPARK_CLASSPATH" ]; then 29 | CLASSPATH="$SPARK_CLASSPATH:$SPARK_SUBMIT_CLASSPATH" 30 | else 31 | CLASSPATH="$SPARK_SUBMIT_CLASSPATH" 32 | fi 33 | 34 | # Build up classpath 35 | if [ -n "$SPARK_CONF_DIR" ]; then 36 | CLASSPATH="$CLASSPATH:$SPARK_CONF_DIR" 37 | else 38 | CLASSPATH="$CLASSPATH:$FWDIR/conf" 39 | fi 40 | 41 | ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SPARK_SCALA_VERSION" 42 | 43 | if [ -n "$JAVA_HOME" ]; then 44 | JAR_CMD="$JAVA_HOME/bin/jar" 45 | else 46 | JAR_CMD="jar" 47 | fi 48 | 49 | # A developer option to prepend more recently compiled Spark classes 50 | if [ -n "$SPARK_PREPEND_CLASSES" ]; then 51 | echo "NOTE: SPARK_PREPEND_CLASSES is set, placing locally compiled Spark"\ 52 | "classes ahead of assembly." 
>&2 53 | CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SPARK_SCALA_VERSION/classes" 54 | CLASSPATH="$CLASSPATH:$FWDIR/core/target/jars/*" 55 | CLASSPATH="$CLASSPATH:$FWDIR/repl/target/scala-$SPARK_SCALA_VERSION/classes" 56 | CLASSPATH="$CLASSPATH:$FWDIR/mllib/target/scala-$SPARK_SCALA_VERSION/classes" 57 | CLASSPATH="$CLASSPATH:$FWDIR/bagel/target/scala-$SPARK_SCALA_VERSION/classes" 58 | CLASSPATH="$CLASSPATH:$FWDIR/graphx/target/scala-$SPARK_SCALA_VERSION/classes" 59 | CLASSPATH="$CLASSPATH:$FWDIR/streaming/target/scala-$SPARK_SCALA_VERSION/classes" 60 | CLASSPATH="$CLASSPATH:$FWDIR/tools/target/scala-$SPARK_SCALA_VERSION/classes" 61 | CLASSPATH="$CLASSPATH:$FWDIR/sql/catalyst/target/scala-$SPARK_SCALA_VERSION/classes" 62 | CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SPARK_SCALA_VERSION/classes" 63 | CLASSPATH="$CLASSPATH:$FWDIR/sql/hbase/target/scala-$SPARK_SCALA_VERSION/classes" 64 | CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SPARK_SCALA_VERSION/classes" 65 | CLASSPATH="$CLASSPATH:$FWDIR/sql/hive-thriftserver/target/scala-$SPARK_SCALA_VERSION/classes" 66 | CLASSPATH="$CLASSPATH:$FWDIR/yarn/stable/target/scala-$SPARK_SCALA_VERSION/classes" 67 | fi 68 | 69 | # Use spark-assembly jar from either RELEASE or assembly directory 70 | if [ -f "$FWDIR/RELEASE" ]; then 71 | assembly_folder="$FWDIR"/lib 72 | else 73 | assembly_folder="$ASSEMBLY_DIR" 74 | fi 75 | 76 | num_jars=0 77 | 78 | for f in ${assembly_folder}/spark-assembly*hadoop*.jar; do 79 | if [[ ! -e "$f" ]]; then 80 | echo "Failed to find Spark assembly in $assembly_folder" 1>&2 81 | echo "You need to build Spark before running this program." 1>&2 82 | exit 1 83 | fi 84 | ASSEMBLY_JAR="$f" 85 | num_jars=$((num_jars+1)) 86 | done 87 | 88 | if [ "$num_jars" -gt "1" ]; then 89 | echo "Found multiple Spark assembly jars in $assembly_folder:" 1>&2 90 | ls ${assembly_folder}/spark-assembly*hadoop*.jar 1>&2 91 | echo "Please remove all but one jar." 1>&2 92 | exit 1 93 | fi 94 | 95 | # Verify that versions of java used to build the jars and run Spark are compatible 96 | jar_error_check=$("$JAR_CMD" -tf "$ASSEMBLY_JAR" nonexistent/class/path 2>&1) 97 | if [[ "$jar_error_check" =~ "invalid CEN header" ]]; then 98 | echo "Loading Spark jar with '$JAR_CMD' failed. " 1>&2 99 | echo "This is likely because Spark was compiled with Java 7 and run " 1>&2 100 | echo "with Java 6. (see SPARK-1703). Please use Java 7 to run Spark " 1>&2 101 | echo "or build Spark with Java 6." 1>&2 102 | exit 1 103 | fi 104 | 105 | CLASSPATH="$CLASSPATH:$ASSEMBLY_JAR" 106 | 107 | # When Hive support is needed, Datanucleus jars must be included on the classpath. 108 | # Datanucleus jars do not work if only included in the uber jar as plugin.xml metadata is lost. 109 | # Both sbt and maven will populate "lib_managed/jars/" with the datanucleus jars when Spark is 110 | # built with Hive, so first check if the datanucleus jars exist, and then ensure the current Spark 111 | # assembly is built for Hive, before actually populating the CLASSPATH with the jars. 112 | # Note that this check order is faster (by up to half a second) in the case where Hive is not used. 
113 | if [ -f "$FWDIR/RELEASE" ]; then 114 | datanucleus_dir="$FWDIR"/lib 115 | else 116 | datanucleus_dir="$FWDIR"/lib_managed/jars 117 | fi 118 | 119 | datanucleus_jars="$(find "$datanucleus_dir" 2>/dev/null | grep "datanucleus-.*\\.jar$")" 120 | datanucleus_jars="$(echo "$datanucleus_jars" | tr "\n" : | sed s/:$//g)" 121 | 122 | if [ -n "$datanucleus_jars" ]; then 123 | hive_files=$("$JAR_CMD" -tf "$ASSEMBLY_JAR" org/apache/hadoop/hive/ql/exec 2>/dev/null) 124 | if [ -n "$hive_files" ]; then 125 | echo "Spark assembly has been built with Hive, including Datanucleus jars on classpath" 1>&2 126 | CLASSPATH="$CLASSPATH:$datanucleus_jars" 127 | fi 128 | fi 129 | 130 | # Add test classes if we're running from SBT or Maven with SPARK_TESTING set to 1 131 | if [[ $SPARK_TESTING == 1 ]]; then 132 | CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SPARK_SCALA_VERSION/test-classes" 133 | CLASSPATH="$CLASSPATH:$FWDIR/repl/target/scala-$SPARK_SCALA_VERSION/test-classes" 134 | CLASSPATH="$CLASSPATH:$FWDIR/mllib/target/scala-$SPARK_SCALA_VERSION/test-classes" 135 | CLASSPATH="$CLASSPATH:$FWDIR/bagel/target/scala-$SPARK_SCALA_VERSION/test-classes" 136 | CLASSPATH="$CLASSPATH:$FWDIR/graphx/target/scala-$SPARK_SCALA_VERSION/test-classes" 137 | CLASSPATH="$CLASSPATH:$FWDIR/streaming/target/scala-$SPARK_SCALA_VERSION/test-classes" 138 | CLASSPATH="$CLASSPATH:$FWDIR/sql/catalyst/target/scala-$SPARK_SCALA_VERSION/test-classes" 139 | CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SPARK_SCALA_VERSION/test-classes" 140 | CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SPARK_SCALA_VERSION/test-classes" 141 | CLASSPATH="$CLASSPATH:$FWDIR/sql/hbase/target/scala-$SPARK_SCALA_VERSION/test-classes" 142 | fi 143 | 144 | # Add hadoop conf dir if given -- otherwise FileSystem.*, etc fail ! 145 | # Note, this assumes that there is either a HADOOP_CONF_DIR or YARN_CONF_DIR which hosts 146 | # the configurtion files. 147 | if [ -n "$HADOOP_CONF_DIR" ]; then 148 | CLASSPATH="$CLASSPATH:$HADOOP_CONF_DIR" 149 | fi 150 | if [ -n "$YARN_CONF_DIR" ]; then 151 | CLASSPATH="$CLASSPATH:$YARN_CONF_DIR" 152 | fi 153 | 154 | # To allow for distributions to append needed libraries to the classpath (e.g. when 155 | # using the "hadoop-provided" profile to build Spark), check SPARK_DIST_CLASSPATH and 156 | # append it to tbe final classpath. 157 | if [ -n "$SPARK_DIST_CLASSPATH" ]; then 158 | CLASSPATH="$CLASSPATH:$SPARK_DIST_CLASSPATH" 159 | fi 160 | 161 | echo "$CLASSPATH" 162 | -------------------------------------------------------------------------------- /bin/hbase-sql: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | # 19 | 20 | # 21 | # Shell script for starting the Spark SQL for HBase CLI 22 | 23 | # Enter posix mode for bash 24 | set -o posix 25 | 26 | CLASS="org.apache.spark.sql.hbase.HBaseSQLCliDriver" 27 | 28 | # Figure out where Spark is installed 29 | FWDIR=$SPARK_HOME 30 | if [ -z "$FWDIR" ] 31 | then 32 | echo "\$SPARK_HOME is not set" 33 | fi 34 | 35 | function usage { 36 | echo "Usage: ./bin/hbase-sql [options] [cli option]" 37 | pattern="usage" 38 | pattern+="\|Spark assembly has been built with hbase" 39 | pattern+="\|NOTE: SPARK_PREPEND_CLASSES is set" 40 | pattern+="\|Spark Command: " 41 | pattern+="\|--help" 42 | pattern+="\|=======" 43 | 44 | "$FWDIR"/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2 45 | echo 46 | echo "CLI options:" 47 | "$FWDIR"/bin/spark-class $CLASS --help 2>&1 | grep -v "$pattern" 1>&2 48 | } 49 | 50 | if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then 51 | usage 52 | exit 0 53 | fi 54 | 55 | source "$FWDIR"/bin/utils.sh 56 | SUBMIT_USAGE_FUNCTION=usage 57 | gatherSparkSubmitOpts "$@" 58 | 59 | exec "$FWDIR"/bin/spark-submit --class $CLASS "${SUBMISSION_OPTS[@]}" spark-internal "${APPLICATION_OPTS[@]}" 60 | -------------------------------------------------------------------------------- /doc/SparkSQLOnHBase_v2.1.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stratio/hbase/10f96374963c63f201bc8916fec5ec18ce1372a8/doc/SparkSQLOnHBase_v2.1.docx -------------------------------------------------------------------------------- /python/pyspark/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | """ 19 | PySpark is the Python API for Spark. 20 | 21 | Public classes: 22 | 23 | - :class:`SparkContext`: 24 | Main entry point for Spark functionality. 25 | - L{RDD} 26 | A Resilient Distributed Dataset (RDD), the basic abstraction in Spark. 27 | - L{Broadcast} 28 | A broadcast variable that gets reused across tasks. 29 | - L{Accumulator} 30 | An "add-only" shared variable that tasks can only add values to. 31 | - L{SparkConf} 32 | For configuring Spark. 33 | - L{SparkFiles} 34 | Access files shipped with jobs. 35 | - L{StorageLevel} 36 | Finer-grained cache persistence levels. 
37 | 38 | """ 39 | 40 | from pyspark.conf import SparkConf 41 | from pyspark.context import SparkContext 42 | from pyspark.rdd import RDD 43 | from pyspark.files import SparkFiles 44 | from pyspark.storagelevel import StorageLevel 45 | from pyspark.accumulators import Accumulator, AccumulatorParam 46 | from pyspark.broadcast import Broadcast 47 | from pyspark.serializers import MarshalSerializer, PickleSerializer 48 | 49 | # for back compatibility 50 | from pyspark.sql import SQLContext, HiveContext, HBaseSQLContext, SchemaRDD, Row 51 | 52 | __all__ = [ 53 | "SparkConf", "SparkContext", "SparkFiles", "RDD", "StorageLevel", "Broadcast", 54 | "Accumulator", "AccumulatorParam", "MarshalSerializer", "PickleSerializer", 55 | ] 56 | -------------------------------------------------------------------------------- /python/pyspark/java_gateway.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | import atexit 19 | import os 20 | import sys 21 | import signal 22 | import shlex 23 | import platform 24 | from subprocess import Popen, PIPE 25 | from threading import Thread 26 | from py4j.java_gateway import java_import, JavaGateway, GatewayClient 27 | 28 | 29 | def launch_gateway(): 30 | SPARK_HOME = os.environ["SPARK_HOME"] 31 | 32 | gateway_port = -1 33 | if "PYSPARK_GATEWAY_PORT" in os.environ: 34 | gateway_port = int(os.environ["PYSPARK_GATEWAY_PORT"]) 35 | else: 36 | # Launch the Py4j gateway using Spark's run command so that we pick up the 37 | # proper classpath and settings from spark-env.sh 38 | on_windows = platform.system() == "Windows" 39 | script = "./bin/spark-submit.cmd" if on_windows else "./bin/spark-submit" 40 | submit_args = os.environ.get("PYSPARK_SUBMIT_ARGS") 41 | submit_args = submit_args if submit_args is not None else "" 42 | submit_args = shlex.split(submit_args) 43 | command = [os.path.join(SPARK_HOME, script)] + submit_args + ["pyspark-shell"] 44 | if not on_windows: 45 | # Don't send ctrl-c / SIGINT to the Java gateway: 46 | def preexec_func(): 47 | signal.signal(signal.SIGINT, signal.SIG_IGN) 48 | env = dict(os.environ) 49 | env["IS_SUBPROCESS"] = "1" # tell JVM to exit after python exits 50 | proc = Popen(command, stdout=PIPE, stdin=PIPE, preexec_fn=preexec_func, env=env) 51 | else: 52 | # preexec_fn not supported on Windows 53 | proc = Popen(command, stdout=PIPE, stdin=PIPE) 54 | 55 | try: 56 | # Determine which ephemeral port the server started on: 57 | gateway_port = proc.stdout.readline() 58 | gateway_port = int(gateway_port) 59 | except ValueError: 60 | # Grab the remaining lines of stdout 61 | (stdout, _) = proc.communicate() 62 | exit_code = proc.poll() 63 | error_msg = "Launching 
GatewayServer failed" 64 | error_msg += " with exit code %d!\n" % exit_code if exit_code else "!\n" 65 | error_msg += "Warning: Expected GatewayServer to output a port, but found " 66 | if gateway_port == "" and stdout == "": 67 | error_msg += "no output.\n" 68 | else: 69 | error_msg += "the following:\n\n" 70 | error_msg += "--------------------------------------------------------------\n" 71 | error_msg += gateway_port + stdout 72 | error_msg += "--------------------------------------------------------------\n" 73 | raise Exception(error_msg) 74 | 75 | # In Windows, ensure the Java child processes do not linger after Python has exited. 76 | # In UNIX-based systems, the child process can kill itself on broken pipe (i.e. when 77 | # the parent process' stdin sends an EOF). In Windows, however, this is not possible 78 | # because java.lang.Process reads directly from the parent process' stdin, contending 79 | # with any opportunity to read an EOF from the parent. Note that this is only best 80 | # effort and will not take effect if the python process is violently terminated. 81 | if on_windows: 82 | # In Windows, the child process here is "spark-submit.cmd", not the JVM itself 83 | # (because the UNIX "exec" command is not available). This means we cannot simply 84 | # call proc.kill(), which kills only the "spark-submit.cmd" process but not the 85 | # JVMs. Instead, we use "taskkill" with the tree-kill option "/t" to terminate all 86 | # child processes in the tree (http://technet.microsoft.com/en-us/library/bb491009.aspx) 87 | def killChild(): 88 | Popen(["cmd", "/c", "taskkill", "/f", "/t", "/pid", str(proc.pid)]) 89 | atexit.register(killChild) 90 | 91 | # Create a thread to echo output from the GatewayServer, which is required 92 | # for Java log output to show up: 93 | class EchoOutputThread(Thread): 94 | 95 | def __init__(self, stream): 96 | Thread.__init__(self) 97 | self.daemon = True 98 | self.stream = stream 99 | 100 | def run(self): 101 | while True: 102 | line = self.stream.readline() 103 | sys.stderr.write(line) 104 | EchoOutputThread(proc.stdout).start() 105 | 106 | # Connect to the gateway 107 | gateway = JavaGateway(GatewayClient(port=gateway_port), auto_convert=False) 108 | 109 | # Import the classes used by PySpark 110 | java_import(gateway.jvm, "org.apache.spark.SparkConf") 111 | java_import(gateway.jvm, "org.apache.spark.api.java.*") 112 | java_import(gateway.jvm, "org.apache.spark.api.python.*") 113 | java_import(gateway.jvm, "org.apache.spark.mllib.api.python.*") 114 | java_import(gateway.jvm, "org.apache.spark.sql.SQLContext") 115 | java_import(gateway.jvm, "org.apache.spark.sql.hive.HiveContext") 116 | java_import(gateway.jvm, "org.apache.spark.sql.hive.LocalHiveContext") 117 | java_import(gateway.jvm, "org.apache.spark.sql.hive.TestHiveContext") 118 | java_import(gateway.jvm, "scala.Tuple2") 119 | 120 | java_import(gateway.jvm, "org.apache.spark.sql.hbase.*") 121 | 122 | return gateway 123 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/HBasePartition.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.spark.sql.hbase 18 | 19 | import org.apache.spark.{Logging, Partition} 20 | import org.apache.spark.sql.catalyst.expressions._ 21 | import org.apache.spark.sql.hbase.catalyst.expressions.PartialPredicateOperations._ 22 | import org.apache.spark.sql.hbase.types.{HBaseBytesType, PartitionRange, Range} 23 | 24 | 25 | private[hbase] class HBasePartition( 26 | val idx: Int, val mappedIndex: Int, 27 | start: Option[HBaseRawType] = None, 28 | end: Option[HBaseRawType] = None, 29 | val server: Option[String] = None, 30 | val filterPredicates: Option[Expression] = None, 31 | @transient relation: HBaseRelation = null) 32 | extends Range[HBaseRawType](start, true, end, false, HBaseBytesType) 33 | with Partition with IndexMappable with Logging { 34 | 35 | override def index: Int = idx 36 | 37 | override def hashCode(): Int = idx 38 | 39 | @transient lazy val startNative: Seq[Any] = relation.nativeKeyConvert(start) 40 | 41 | @transient lazy val endNative: Seq[Any] = relation.nativeKeyConvert(end) 42 | 43 | def computePredicate(relation: HBaseRelation): Option[Expression] = { 44 | val predicate = if (filterPredicates.isDefined && 45 | filterPredicates.get.references.exists(_.exprId == relation.partitionKeys(0).exprId)) { 46 | val oriPredicate = filterPredicates.get 47 | val predicateReferences = oriPredicate.references.toSeq 48 | val boundReference = BindReferences.bindReference(oriPredicate, predicateReferences) 49 | val row = new GenericMutableRow(predicateReferences.size) 50 | var rowIndex = 0 51 | var i = 0 52 | var range: PartitionRange[_] = null 53 | while (i < relation.keyColumns.size) { 54 | range = relation.generateRange(this, oriPredicate, i) 55 | if (range != null) { 56 | rowIndex = relation.rowIndex(predicateReferences, i) 57 | if (rowIndex >= 0) row.update(rowIndex, range) 58 | // if the non-last dimension range is not point, do not proceed to the next dims 59 | if (i < relation.keyColumns.size - 1 && !range.isPoint) i = relation.keyColumns.size 60 | else i = i + 1 61 | } else i = relation.keyColumns.size 62 | } 63 | val pr = boundReference.partialReduce(row, predicateReferences) 64 | pr match { 65 | case (null, e: Expression) => Some(e) 66 | case (true, _) => None 67 | case (false, _) => Some(Literal(false)) 68 | } 69 | } else filterPredicates 70 | logInfo(predicate.toString) 71 | predicate 72 | } 73 | 74 | override def toString = { 75 | s"HBasePartition: $idx, $mappedIndex, [$start, $end), $filterPredicates" 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/HBasePartitioner.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | import java.io.{IOException, ObjectInputStream, ObjectOutputStream} 21 | 22 | import org.apache.hadoop.hbase.util.Bytes 23 | import org.apache.spark.serializer.JavaSerializer 24 | import org.apache.spark.util.{CollectionsUtils, Utils} 25 | import org.apache.spark.{Partitioner, SparkEnv} 26 | 27 | object HBasePartitioner { 28 | implicit object HBaseRawOrdering extends Ordering[HBaseRawType] { 29 | def compare(a: HBaseRawType, b: HBaseRawType) = Bytes.compareTo(a, b) 30 | } 31 | } 32 | 33 | class HBasePartitioner (var splitKeys: Array[HBaseRawType]) extends Partitioner { 34 | import HBasePartitioner.HBaseRawOrdering 35 | 36 | type t = HBaseRawType 37 | 38 | lazy private val len = splitKeys.length 39 | 40 | // For pre-split table splitKeys(0) = bytes[0], to remove it, 41 | // otherwise partition 0 always be empty and 42 | // we will miss the last region's date when bulk load 43 | lazy private val realSplitKeys = if (splitKeys.isEmpty) splitKeys else splitKeys.tail 44 | 45 | def numPartitions = if (len == 0) 1 else len 46 | 47 | @transient private val binarySearch: ((Array[t], t) => Int) = CollectionsUtils.makeBinarySearch[t] 48 | 49 | def getPartition(key: Any): Int = { 50 | val k = key.asInstanceOf[t] 51 | var partition = 0 52 | if (len <= 128 && len > 0) { 53 | // If we have less than 128 partitions naive search 54 | val ordering = implicitly[Ordering[t]] 55 | while (partition < realSplitKeys.length && ordering.gt(k, realSplitKeys(partition))) { 56 | partition += 1 57 | } 58 | } else { 59 | // Determine which binary search method to use only once. 60 | partition = binarySearch(realSplitKeys, k) 61 | // binarySearch either returns the match location or -[insertion point]-1 62 | if (partition < 0) { 63 | partition = -partition - 1 64 | } 65 | if (partition > realSplitKeys.length) { 66 | partition = realSplitKeys.length 67 | } 68 | } 69 | partition 70 | } 71 | 72 | override def equals(other: Any): Boolean = other match { 73 | case r: HBasePartitioner => 74 | r.splitKeys.sameElements(splitKeys) 75 | case _ => 76 | false 77 | } 78 | 79 | override def hashCode(): Int = { 80 | val prime = 31 81 | var result = 1 82 | var i = 0 83 | while (i < splitKeys.length) { 84 | result = prime * result + splitKeys(i).hashCode 85 | i += 1 86 | } 87 | result = prime * result 88 | result 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/HBaseSQLCliDriver.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | import java.io.File 21 | 22 | import jline._ 23 | import org.apache.spark.{Logging, SparkConf, SparkContext} 24 | 25 | /** 26 | * HBaseSQLCliDriver 27 | * 28 | */ 29 | object HBaseSQLCliDriver extends Logging { 30 | private val prompt = "spark-hbaseql" 31 | private val continuedPrompt = "".padTo(prompt.length, ' ') 32 | private val conf = new SparkConf() 33 | private val sc = new SparkContext(conf) 34 | private val hbaseCtx = new HBaseSQLContext(sc) 35 | 36 | private val QUIT = "QUIT" 37 | private val EXIT = "EXIT" 38 | private val HELP = "HELP" 39 | 40 | def getCompletors: Seq[Completor] = { 41 | val sc: SimpleCompletor = new SimpleCompletor(new Array[String](0)) 42 | 43 | // add keywords, including lower-cased versions 44 | HBaseSQLParser.getKeywords.foreach { kw => 45 | sc.addCandidateString(kw) 46 | sc.addCandidateString(kw.toLowerCase) 47 | } 48 | 49 | 50 | Seq(sc) 51 | } 52 | 53 | def main(args: Array[String]) { 54 | 55 | val reader = new ConsoleReader() 56 | reader.setBellEnabled(false) 57 | getCompletors.foreach(reader.addCompletor) 58 | 59 | val historyDirectory = System.getProperty("user.home") 60 | 61 | try { 62 | if (new File(historyDirectory).exists()) { 63 | val historyFile = historyDirectory + File.separator + ".hbaseqlhistory" 64 | reader.setHistory(new History(new File(historyFile))) 65 | } else { 66 | System.err.println("WARNING: Directory for hbaseql history file: " + historyDirectory + 67 | " does not exist. History will not be available during this session.") 68 | } 69 | } catch { 70 | case e: Exception => 71 | System.err.println("WARNING: Encountered an error while trying to initialize hbaseql's " + 72 | "history file. 
History will not be available during this session.") 73 | System.err.println(e.getMessage) 74 | } 75 | 76 | println("Welcome to hbaseql CLI") 77 | var prefix = "" 78 | 79 | def promptPrefix = s"$prompt" 80 | var currentPrompt = promptPrefix 81 | var line = reader.readLine(currentPrompt + "> ") 82 | var ret = 0 83 | 84 | while (line != null) { 85 | if (prefix.nonEmpty) { 86 | prefix += '\n' 87 | } 88 | 89 | if (line.trim.endsWith(";") && !line.trim.endsWith("\\;")) { 90 | line = prefix + line 91 | processLine(line, allowInterrupting = true) 92 | prefix = "" 93 | currentPrompt = promptPrefix 94 | } else { 95 | prefix = prefix + line 96 | currentPrompt = continuedPrompt 97 | } 98 | 99 | line = reader.readLine(currentPrompt + "> ") 100 | } 101 | 102 | System.exit(0) 103 | } 104 | 105 | private def processLine(line: String, allowInterrupting: Boolean) = { 106 | 107 | // TODO: handle multiple command separated by ; 108 | 109 | // Since we are using SqlParser and it does not handle ';', just work around to omit the ';' 110 | val input = line.trim.substring(0, line.length - 1) 111 | 112 | try { 113 | process(input) 114 | } catch { 115 | case e: Exception => 116 | e.printStackTrace() 117 | } 118 | } 119 | 120 | private def process(input: String) = { 121 | val token = input.split("\\s") 122 | token(0).toUpperCase match { 123 | case QUIT => System.exit(0) 124 | case EXIT => System.exit(0) 125 | case HELP => printHelp(token) 126 | case "!" => // TODO: add support for bash command start with ! 127 | case _ => 128 | logInfo(s"Processing $input") 129 | val start = System.currentTimeMillis() 130 | val res = hbaseCtx.sql(input).collect() 131 | val end = System.currentTimeMillis() 132 | res.foreach(println) 133 | val timeTaken: Double = (end - start) / 1000.0 134 | println(s"Time taken: $timeTaken seconds") 135 | } 136 | } 137 | 138 | private def printHelp(token: Array[String]) = { 139 | if (token.length > 1) { 140 | token(1).toUpperCase match { 141 | case "CREATE" => 142 | println( """CREATE TABLE table_name (col_name data_type, ..., PRIMARY KEY(col_name, ...)) 143 | MAPPED BY (htable_name, COLS=[col_name=family_name.qualifier])""".stripMargin) 144 | case "DROP" => 145 | println("DROP TABLE table_name") 146 | case "ALTER" => 147 | println("ALTER TABLE table_name ADD (col_name data_type, ...) MAPPED BY (expression)") 148 | println("ALTER TABLE table_name DROP col_name") 149 | case "LOAD" => 150 | println( """LOAD DATA [LOCAL] INPATH file_path [OVERWRITE] INTO TABLE 151 | table_name [FIELDS TERMINATED BY char]""".stripMargin) 152 | case "SELECT" => 153 | println( """SELECT [ALL | DISTINCT] select_expr, select_expr, ... 
154 | |FROM table_reference 155 | |[WHERE where_condition] 156 | |[GROUP BY col_list] 157 | |[CLUSTER BY col_list 158 | | | [DISTRIBUTE BY col_list] [SORT BY col_list] 159 | |] 160 | |[LIMIT number]""") 161 | case "INSERT" => 162 | println("INSERT INTO table_name SELECT clause") 163 | println("INSERT INTO table_name VALUES (value, ...)") 164 | case "DESCRIBE" => 165 | println("DESCRIBE table_name") 166 | case "SHOW" => 167 | println("SHOW TABLES") 168 | case _ => 169 | printHelpUsage() 170 | } 171 | } else { 172 | printHelpUsage() 173 | } 174 | } 175 | 176 | private def printHelpUsage() = { 177 | println("""Usage: HELP Statement 178 | Statement: 179 | CREATE | DROP | ALTER | LOAD | SELECT | INSERT | DESCRIBE | SHOW""") 180 | } 181 | } 182 | 183 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/HBaseSQLConf.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | import org.apache.spark.sql.SQLConf 21 | 22 | private[hbase] object HBaseSQLConf { 23 | val PARTITION_EXPIRATION = "spark.sql.hbase.partition.expiration" 24 | val SCANNER_FETCH_SIZE = "spark.sql.hbase.scanner.fetchsize" 25 | } 26 | 27 | /** 28 | * A trait that enables the setting and getting of mutable config parameters/hints. 29 | * 30 | */ 31 | private[hbase] class HBaseSQLConf extends SQLConf { 32 | import org.apache.spark.sql.hbase.HBaseSQLConf._ 33 | 34 | /** The expiration of cached partition (i.e., region) info; defaults to 10 minutes . */ 35 | private[spark] def partitionExpiration: Long = getConf(PARTITION_EXPIRATION, "600").toLong 36 | private[spark] def scannerFetchSize: Int = getConf(SCANNER_FETCH_SIZE, "1000").toInt 37 | } 38 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/HBaseSQLContext.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | import org.apache.hadoop.hbase.HBaseConfiguration 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.api.java.JavaSparkContext 23 | import org.apache.spark.sql._ 24 | import org.apache.spark.sql.SparkSQLParser 25 | import org.apache.spark.sql.catalyst.analysis.OverrideCatalog 26 | import org.apache.spark.sql.hbase.execution.HBaseStrategies 27 | 28 | class HBaseSQLContext(sc: SparkContext) extends SQLContext(sc) { 29 | def this(sparkContext: JavaSparkContext) = this(sparkContext.sc) 30 | 31 | protected[sql] override lazy val conf: SQLConf = new HBaseSQLConf 32 | 33 | @transient 34 | override protected[sql] val sqlParser = { 35 | val fallback = new HBaseSQLParser 36 | new SparkSQLParser(fallback.parse(_)) 37 | } 38 | 39 | HBaseConfiguration.merge( 40 | sc.hadoopConfiguration, HBaseConfiguration.create(sc.hadoopConfiguration)) 41 | 42 | @transient 43 | override protected[sql] lazy val catalog: HBaseCatalog = 44 | new HBaseCatalog(this, sc.hadoopConfiguration) with OverrideCatalog 45 | 46 | experimental.extraStrategies = Seq((new SparkPlanner with HBaseStrategies).HBaseDataSource) 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/HBaseSQLReaderRDD.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.spark.sql.hbase 18 | 19 | 20 | import org.apache.hadoop.hbase.client.{ResultScanner, Result, Get} 21 | import org.apache.hadoop.hbase.util.Bytes 22 | import org.apache.spark.rdd.RDD 23 | import org.apache.spark.sql.SQLContext 24 | import org.apache.spark.sql.catalyst.expressions._ 25 | import org.apache.spark.sql.catalyst.expressions.codegen.GeneratePredicate 26 | import org.apache.spark.sql.execution.SparkPlan 27 | import org.apache.spark.sql.hbase.util.{BytesUtils, HBaseKVHelper, DataTypeUtils} 28 | import org.apache.spark.sql.types.AtomicType 29 | 30 | import org.apache.spark.{InterruptibleIterator, Logging, Partition, TaskContext} 31 | 32 | import scala.collection.mutable.{ArrayBuffer, ListBuffer} 33 | 34 | 35 | /** 36 | * HBaseSQLReaderRDD 37 | */ 38 | class HBaseSQLReaderRDD( 39 | relation: HBaseRelation, 40 | codegenEnabled: Boolean, 41 | output: Seq[Attribute], 42 | filterPred: Option[Expression], 43 | coprocSubPlan: Option[SparkPlan], 44 | @transient sqlContext: SQLContext) 45 | extends RDD[Row](sqlContext.sparkContext, Nil) with Logging { 46 | 47 | override def getPartitions: Array[Partition] = { 48 | RangeCriticalPoint.generatePrunedPartitions(relation, filterPred).toArray 49 | } 50 | 51 | override def getPreferredLocations(split: Partition): Seq[String] = { 52 | split.asInstanceOf[HBasePartition].server.map { 53 | identity 54 | }.toSeq 55 | } 56 | 57 | private def createIterator(context: TaskContext, 58 | scanner: ResultScanner, 59 | otherFilters: Option[Expression]): Iterator[Row] = { 60 | var finalOutput = output.distinct 61 | if (otherFilters.isDefined) { 62 | finalOutput = finalOutput.union(otherFilters.get.references.toSeq) 63 | } 64 | val row = new GenericMutableRow(finalOutput.size) 65 | val projections = finalOutput.zipWithIndex 66 | 67 | var finished: Boolean = false 68 | var gotNext: Boolean = false 69 | var result: Result = null 70 | 71 | val otherFilter: (Row) => Boolean = if (otherFilters.isDefined) { 72 | if (codegenEnabled) { 73 | GeneratePredicate.generate(otherFilters.get, finalOutput) 74 | } else { 75 | InterpretedPredicate.create(otherFilters.get, finalOutput) 76 | } 77 | } else null 78 | 79 | val iterator = new Iterator[Row] { 80 | override def hasNext: Boolean = { 81 | if (!finished) { 82 | if (!gotNext) { 83 | result = scanner.next 84 | finished = result == null 85 | gotNext = true 86 | } 87 | } 88 | if (finished) { 89 | close() 90 | } 91 | !finished 92 | } 93 | 94 | override def next(): Row = { 95 | if (hasNext) { 96 | gotNext = false 97 | relation.buildRow(projections, result, row) 98 | } else { 99 | null 100 | } 101 | } 102 | 103 | def close() = { 104 | try { 105 | scanner.close() 106 | relation.closeHTable() 107 | } catch { 108 | case e: Exception => logWarning("Exception in scanner.close", e) 109 | } 110 | } 111 | } 112 | if (otherFilter == null) { 113 | new InterruptibleIterator(context, iterator) 114 | } else { 115 | new InterruptibleIterator(context, iterator.filter(otherFilter)) 116 | } 117 | } 118 | 119 | /** 120 | * construct row key based on the critical point range information 121 | * @param cpr the critical point range 122 | * @param isStart the switch between start and end value 123 | * @return the encoded row key, or null if the value is None 124 | */ 125 | private def constructRowKey(cpr: MDCriticalPointRange[_], isStart: Boolean): HBaseRawType = { 126 | val prefix = cpr.prefix 127 | val head: Seq[(HBaseRawType, AtomicType)] = prefix.map { 128 | case (itemValue, itemType) => 129 | 
(DataTypeUtils.dataToBytes(itemValue, itemType), itemType) 130 | } 131 | 132 | val key = if (isStart) cpr.lastRange.start else cpr.lastRange.end 133 | val keyType = cpr.lastRange.dt 134 | val list = if (key.isDefined) { 135 | val tail: (HBaseRawType, AtomicType) = { 136 | (DataTypeUtils.dataToBytes(key.get, keyType), keyType) 137 | } 138 | head :+ tail 139 | } else { 140 | head 141 | } 142 | if (list.size == 0) { 143 | null 144 | } else { 145 | HBaseKVHelper.encodingRawKeyColumns(list) 146 | } 147 | } 148 | 149 | // For critical-point-based predicate pushdown 150 | // partial reduction for those partitions mapped to multiple critical point ranges, 151 | // as indicated by the keyPartialEvalIndex in the partition, where the original 152 | // filter predicate will be used 153 | override def compute(split: Partition, context: TaskContext): Iterator[Row] = { 154 | val partition = split.asInstanceOf[HBasePartition] 155 | val predicates = partition.computePredicate(relation) 156 | val expandedCPRs: Seq[MDCriticalPointRange[_]] = 157 | RangeCriticalPoint.generateCriticalPointRanges(relation, predicates). 158 | flatMap(_.flatten(new ArrayBuffer[(Any, AtomicType)](relation.dimSize))) 159 | 160 | if (expandedCPRs.isEmpty) { 161 | val (filters, otherFilters, pushdownPreds) = relation.buildPushdownFilterList(predicates) 162 | val pushablePreds = if (pushdownPreds.isDefined) { 163 | ListBuffer[Expression](pushdownPreds.get) 164 | } else { 165 | ListBuffer[Expression]() 166 | } 167 | val scan = relation.buildScan(partition.start, partition.end, filters, otherFilters, 168 | pushablePreds, output) 169 | val scanner = relation.htable.getScanner(scan) 170 | createIterator(context, scanner, otherFilters) 171 | } else { 172 | // expandedCPRs is not empty 173 | val isPointRanges = expandedCPRs.forall( 174 | p => p.lastRange.isPoint && p.prefix.size == relation.keyColumns.size - 1) 175 | if (isPointRanges) { 176 | // all of the last ranges are point range, build a list of get 177 | val gets: java.util.List[Get] = new java.util.ArrayList[Get]() 178 | 179 | val distinctProjectionList = output.distinct 180 | val nonKeyColumns = relation.nonKeyColumns.filter { 181 | case nkc => distinctProjectionList.exists(nkc.sqlName == _.name) 182 | } 183 | 184 | def generateGet(range: MDCriticalPointRange[_]): Get = { 185 | val rowKey = constructRowKey(range, isStart = true) 186 | val get = new Get(rowKey) 187 | for (nonKeyColumn <- nonKeyColumns) { 188 | get.addColumn(Bytes.toBytes(nonKeyColumn.family), Bytes.toBytes(nonKeyColumn.qualifier)) 189 | } 190 | get 191 | } 192 | val predForEachRange: Seq[Expression] = expandedCPRs.map(range => { 193 | gets.add(generateGet(range)) 194 | range.lastRange.pred 195 | }) 196 | val resultsWithPred = relation.htable.get(gets).zip(predForEachRange).filter(!_._1.isEmpty) 197 | 198 | def evalResultForBoundPredicate(input: Row, predicate: Expression): Boolean = { 199 | val boundPredicate = BindReferences.bindReference(predicate, output) 200 | boundPredicate.eval(input).asInstanceOf[Boolean] 201 | } 202 | val projections = output.zipWithIndex 203 | val resultRows: Seq[Row] = for { 204 | (result, predicate) <- resultsWithPred 205 | row = new GenericMutableRow(output.size) 206 | resultRow = relation.buildRow(projections, result, row) 207 | if predicate == null || evalResultForBoundPredicate(resultRow, predicate) 208 | } yield resultRow 209 | 210 | resultRows.toIterator 211 | } 212 | else { 213 | // isPointRanges is false 214 | // calculate the range start 215 | val startRowKey = 
constructRowKey(expandedCPRs(0), isStart = true) 216 | val start = if (startRowKey != null) { 217 | if (partition.start.isDefined && Bytes.compareTo(partition.start.get, startRowKey) > 0) { 218 | Some(partition.start.get) 219 | } else { 220 | Some(startRowKey) 221 | } 222 | } else { 223 | partition.start 224 | } 225 | 226 | // calculate the range end 227 | val size = expandedCPRs.size - 1 228 | val endKey: Option[Any] = expandedCPRs(size).lastRange.end 229 | val endInclusive: Boolean = expandedCPRs(size).lastRange.endInclusive 230 | val endRowKey = constructRowKey(expandedCPRs(size), isStart = false) 231 | val end = if (endRowKey != null) { 232 | val finalKey: HBaseRawType = { 233 | if (endInclusive || endKey.isEmpty) { 234 | BytesUtils.addOne(endRowKey) 235 | } else { 236 | endRowKey 237 | } 238 | } 239 | 240 | if (finalKey != null) { 241 | if (partition.end.isDefined && Bytes.compareTo(finalKey, partition.end.get) > 0) { 242 | Some(partition.end.get) 243 | } else { 244 | Some(finalKey) 245 | } 246 | } else { 247 | partition.end 248 | } 249 | } else { 250 | partition.end 251 | } 252 | 253 | 254 | val (filters, otherFilters, preds) = 255 | relation.buildCPRFilterList(output, filterPred, expandedCPRs) 256 | val scan = relation.buildScan(start, end, filters, otherFilters, preds, output) 257 | val scanner = relation.htable.getScanner(scan) 258 | createIterator(context, scanner, otherFilters) 259 | } 260 | } 261 | } 262 | } 263 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/HBaseShuffledRDD.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | import org.apache.spark._ 21 | import org.apache.spark.rdd.{RDD, ShuffledRDD, ShuffledRDDPartition} 22 | 23 | class HBaseShuffledRDD ( 24 | prevRdd: RDD[(HBaseRawType, Array[HBaseRawType])], 25 | part: Partitioner, 26 | @transient hbPartitions: Seq[HBasePartition] = Nil) extends ShuffledRDD(prevRdd, part){ 27 | 28 | override def getPartitions: Array[Partition] = { 29 | if (hbPartitions==null || hbPartitions.isEmpty) { 30 | Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i)) 31 | } else { 32 | // only to be invoked by clients 33 | hbPartitions.toArray 34 | } 35 | } 36 | 37 | override def getPreferredLocations(split: Partition): Seq[String] = { 38 | if (hbPartitions==null || hbPartitions.isEmpty) { 39 | Seq.empty 40 | } else { 41 | split.asInstanceOf[HBasePartition].server.map { 42 | identity[String] 43 | }.toSeq 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/HadoopReader.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | import org.apache.spark.SparkContext 21 | import org.apache.spark.sql.hbase.util.HBaseKVHelper 22 | import org.apache.spark.sql.types._ 23 | 24 | /** 25 | * Helper class for scanning files stored in Hadoop - e.g., to read text file when bulk loading. 26 | */ 27 | private[hbase] class HadoopReader( 28 | @transient sc: SparkContext, 29 | path: String, 30 | delimiter: Option[String])(baseRelation: HBaseRelation) { 31 | /** make RDD[(SparkImmutableBytesWritable, SparkKeyValue)] from text file. */ 32 | private[hbase] def makeBulkLoadRDDFromTextFile = { 33 | val rdd = sc.textFile(path) 34 | val splitRegex = delimiter.getOrElse(",") 35 | val relation = baseRelation 36 | 37 | rdd.mapPartitions { iter => 38 | val lineBuffer = HBaseKVHelper.createLineBuffer(relation.output) 39 | val keyBytes = new Array[(HBaseRawType, DataType)](relation.keyColumns.size) 40 | iter.flatMap { line => 41 | if (line == "") { 42 | None 43 | } else { 44 | // If the last column in the text file is null, the java parser will 45 | // return a String[] containing only the non-null text values. 46 | // In this case we need to append another element (null) to 47 | // the array returned by line.split(splitRegex). 
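/* Hedged illustration (hypothetical input, not in the original source): with the default ","
   delimiter and a table whose last column is null, a line such as "apple,1," splits into
   Array("apple", "1") because String.split drops trailing empty strings; the while loop below
   pads the array with "" until it has one entry per output column, so string2KV receives a
   value for every column. */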
48 | val valueBytes = new Array[HBaseRawType](relation.nonKeyColumns.size) 49 | var textValueArray = line.split(splitRegex) 50 | while (textValueArray.length < relation.output.length) { 51 | textValueArray = textValueArray :+ "" 52 | } 53 | HBaseKVHelper.string2KV(textValueArray, relation, lineBuffer, keyBytes, valueBytes) 54 | val rowKeyData = HBaseKVHelper.encodingRawKeyColumns(keyBytes) 55 | Seq((rowKeyData, valueBytes)) 56 | } 57 | } 58 | } 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/IndexMappable.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.spark.sql.hbase 18 | 19 | private[hbase] trait IndexMappable { 20 | def mappedIndex: Int 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/ScanPredClassifier.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | import org.apache.spark.sql.catalyst.expressions._ 21 | import org.apache.spark.sql.hbase.util.{BytesUtils, DataTypeUtils} 22 | 23 | /** 24 | * Classifies a predicate into a pair of (pushdownable, non-pushdownable) predicates 25 | * for a Scan; the logic relationship between the two components of the pair is AND 26 | */ 27 | class ScanPredClassifier(relation: HBaseRelation) { 28 | def apply(pred: Expression): (Option[Expression], Option[Expression]) = { 29 | // post-order bottom-up traversal 30 | pred match { 31 | case And(left, right) => 32 | val (ll, lr) = apply(left) 33 | val (rl, rr) = apply(right) 34 | (ll, lr, rl, rr) match { 35 | // All Nones 36 | case (None, None, None, None) => (None, None) 37 | // Three Nones 38 | case (None, None, None, _) => (None, rr) 39 | case (None, None, _, None) => (rl, None) 40 | case (None, _, None, None) => (None, lr) 41 | case (_, None, None, None) => (ll, None) 42 | // two Nones 43 | case (None, None, _, _) => (rl, rr) 44 | case (None, _, None, _) => (None, Some(And(lr.get, rr.get))) 45 | case (None, _, _, None) => (rl, lr) 46 | case (_, None, None, _) => (ll, rr) 47 | case (_, None, _, None) => (Some(And(ll.get, rl.get)), None) 48 | case (_, _, None, None) => (ll, lr) 49 | // One None 50 | case (None, _, _, _) => (rl, Some(And(lr.get, rr.get))) 51 | case (_, None, _, _) => (Some(And(ll.get, rl.get)), rr) 52 | case (_, _, None, _) => (ll, Some(And(lr.get, rr.get))) 53 | case (_, _, _, None) => (Some(And(ll.get, rl.get)), lr) 54 | // No nones 55 | case _ => (Some(And(ll.get, rl.get)), Some(And(lr.get, rr.get))) 56 | } 57 | case Or(left, right) => 58 | val (ll, lr) = apply(left) 59 | val (rl, rr) = apply(right) 60 | (ll, lr, rl, rr) match { 61 | // All Nones 62 | case (None, None, None, None) => (None, None) 63 | // Three Nones 64 | case (None, None, None, _) => (None, rr) 65 | case (None, None, _, None) => (rl, None) 66 | case (None, _, None, None) => (None, lr) 67 | case (_, None, None, None) => (ll, None) 68 | // two Nones 69 | case (None, None, _, _) => (rl, rr) 70 | case (None, _, None, _) => (None, Some(Or(lr.get, rr.get))) 71 | case (None, _, _, None) => (None, Some(Or(lr.get, rl.get))) 72 | case (_, None, None, _) => (None, Some(Or(ll.get, rr.get))) 73 | case (_, None, _, None) => (Some(Or(ll.get, rl.get)), None) 74 | case (_, _, None, None) => (ll, lr) 75 | // One None 76 | case (None, _, _, _) => (None, Some(pred)) 77 | // Accept increased evaluation complexity for improved pushed down 78 | case (_, None, _, _) => (Some(Or(ll.get, rl.get)), Some(Or(ll.get, rr.get))) 79 | case (_, _, None, _) => (None, Some(pred)) 80 | // Accept increased evaluation complexity for improved pushed down 81 | case (_, _, _, None) => (Some(Or(ll.get, rl.get)), Some(Or(lr.get, rl.get))) 82 | // No nones 83 | // Accept increased evaluation complexity for improved pushed down 84 | case _ => (Some(Or(ll.get, rl.get)), Some(And(Or(ll.get, rr.get), 85 | And(Or(lr.get, rl.get), Or(lr.get, rr.get))))) 86 | } 87 | case EqualTo(left, right) => classifyBinary(left, right, pred) 88 | case LessThan(left, right) => classifyBinary(left, right, pred) 89 | case LessThanOrEqual(left, right) => classifyBinary(left, right, pred) 90 | case GreaterThan(left, right) => classifyBinary(left, right, pred) 91 | case GreaterThanOrEqual(left, right) => classifyBinary(left, right, pred) 92 | case In(value@AttributeReference(_, _, _, _), list) => 93 | if (relation.isNonKey(value) && list.filter(!_.isInstanceOf[Literal]).isEmpty) { 94 
| (Some(pred), None) 95 | } else { 96 | (None, Some(pred)) 97 | } 98 | case InSet(value@AttributeReference(name, dataType, _, _), hset) 99 | if relation.nonKeyColumns.exists(_.sqlName == name) => 100 | var errorOccurred = false 101 | for (item <- hset if !errorOccurred) { 102 | try { 103 | /** 104 | * Use try-catch to make sure data type conversion is proper, for example, 105 | * Java throws casting exception while doing col2 in (1, 2, 3), if col2 data type 106 | * if ByteType and 1, 2, 3 is Integer. 107 | */ 108 | DataTypeUtils.getBinaryComparator(BytesUtils.create(dataType), Literal.create(item, dataType)) 109 | } catch { 110 | case e: Exception => errorOccurred = true 111 | } 112 | } 113 | if (errorOccurred) { 114 | (None, Some(pred)) 115 | } else { 116 | (Some(pred), None) 117 | } 118 | // everything else are treated as non pushdownable 119 | case _ => (None, Some(pred)) 120 | } 121 | } 122 | 123 | // returns true if the binary operator of the two args can be pushed down 124 | private def classifyBinary(left: Expression, right: Expression, pred: Expression) 125 | : (Option[Expression], Option[Expression]) = { 126 | (left, right) match { 127 | case (Literal(_, _), AttributeReference(_, _, _, _)) => 128 | if (relation.isNonKey(right.asInstanceOf[AttributeReference])) { 129 | (Some(pred), None) 130 | } else { 131 | (None, Some(pred)) 132 | } 133 | case (AttributeReference(_, _, _, _), Literal(_, _)) => 134 | if (relation.isNonKey(left.asInstanceOf[AttributeReference])) { 135 | (Some(pred), None) 136 | } else { 137 | (None, Some(pred)) 138 | } 139 | case _ => (None, Some(pred)) 140 | } 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/catalyst/NotPusher.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.hbase.catalyst 19 | 20 | import org.apache.spark.sql.catalyst.expressions._ 21 | import org.apache.spark.sql.catalyst.rules._ 22 | 23 | /** 24 | * Pushes NOT through And/Or 25 | */ 26 | object NotPusher extends Rule[Expression] { 27 | def apply(pred: Expression): Expression = pred transformDown { 28 | case Not(And(left, right)) => Or(Not(left), Not(right)) 29 | case Not(Or(left, right)) => And(Not(left), Not(right)) 30 | case not @ Not(exp) => 31 | // This pattern has been caught by optimizer but after NOT pushdown 32 | // more opportunities may present 33 | exp match { 34 | case GreaterThan(l, r) => LessThanOrEqual(l, r) 35 | case GreaterThanOrEqual(l, r) => LessThan(l, r) 36 | case LessThan(l, r) => GreaterThanOrEqual(l, r) 37 | case LessThanOrEqual(l, r) => GreaterThan(l, r) 38 | case Not(e) => e 39 | case _ => not 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/execution/HBaseSQLTableScan.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hbase.execution 19 | 20 | import org.apache.spark.annotation.DeveloperApi 21 | import org.apache.spark.rdd.RDD 22 | import org.apache.spark.sql.catalyst.expressions._ 23 | import org.apache.spark.sql.catalyst.plans.physical.RangePartitioning 24 | import org.apache.spark.sql.execution.LeafNode 25 | import org.apache.spark.sql.hbase._ 26 | 27 | /** 28 | * :: DeveloperApi :: 29 | * The HBase table scan operator. 30 | */ 31 | @DeveloperApi 32 | case class HBaseSQLTableScan( 33 | relation: HBaseRelation, 34 | output: Seq[Attribute], 35 | result: RDD[Row]) extends LeafNode { 36 | override def outputPartitioning = { 37 | var ordering = List[SortOrder]() 38 | for (key <- relation.partitionKeys) { 39 | ordering = ordering :+ SortOrder(key, Ascending) 40 | } 41 | RangePartitioning(ordering.toSeq, relation.partitions.size) 42 | } 43 | 44 | override protected def doExecute(): RDD[Row] = result 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/execution/HBaseStrategies.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hbase.execution 19 | 20 | import org.apache.hadoop.hbase.util.Bytes 21 | import org.apache.spark.rdd.RDD 22 | import org.apache.spark.sql.catalyst.expressions._ 23 | import org.apache.spark.sql.catalyst.planning.PhysicalOperation 24 | import org.apache.spark.sql.catalyst.plans.logical 25 | import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan 26 | import org.apache.spark.sql.execution.{Project, SparkPlan} 27 | import org.apache.spark.sql.hbase.{HBasePartition, HBaseRawType, HBaseRelation, KeyColumn} 28 | import org.apache.spark.sql.sources.LogicalRelation 29 | import org.apache.spark.sql.types._ 30 | import org.apache.spark.sql.{SQLContext, Strategy, execution} 31 | 32 | /** 33 | * Retrieves data using a HBaseTableScan. Partition pruning predicates are also detected and 34 | * applied. 35 | */ 36 | private[hbase] trait HBaseStrategies { 37 | self: SQLContext#SparkPlanner => 38 | 39 | private[hbase] object HBaseDataSource extends Strategy { 40 | 41 | def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { 42 | case logical.Aggregate(groupingExpressions, aggregateExpressions, child) 43 | if groupingExpressions.nonEmpty && 44 | canBeAggregatedForAll(groupingExpressions, aggregateExpressions, child) => 45 | val withCodeGen = canBeCodeGened(allAggregates(aggregateExpressions)) && codegenEnabled 46 | if (withCodeGen) execution.GeneratedAggregate( 47 | // In this case, 'partial = true' doesn't mean it is partial, actually, it is not. 48 | // We made it to true to avoid adding Exchange operation. 49 | partial = true, 50 | groupingExpressions, 51 | aggregateExpressions, 52 | true, 53 | planLater(child)) :: Nil 54 | else execution.Aggregate( 55 | // In this case, 'partial = true' doesn't mean it is partial, actually, it is not. 56 | // We made it to true to avoid adding Exchange operation. 57 | partial = true, 58 | groupingExpressions, 59 | aggregateExpressions, 60 | planLater(child)) :: Nil 61 | 62 | case PhysicalOperation(projectList, inPredicates, 63 | l@LogicalRelation(relation: HBaseRelation)) => 64 | pruneFilterProjectHBase( 65 | l, 66 | projectList, 67 | inPredicates, 68 | (a, f) => relation.buildScan(a, f)) :: Nil 69 | 70 | case _ => Nil 71 | } 72 | 73 | def canBeCodeGened(aggs: Seq[AggregateExpression]) = !aggs.exists { 74 | case _: Sum | _: Count | _: Max | _: CombineSetsAndCount => false 75 | // The generated set implementation is pretty limited ATM. 
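/* Hedged note (not in the original source): as written, canBeCodeGened returns true only when
   every aggregate is a Sum, Count, Max, CombineSetsAndCount, or a single-expression
   CollectHashSet over IntegerType/LongType; any other aggregate (e.g. Min or Average) hits the
   default case below and disables the GeneratedAggregate path above. */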
76 | case CollectHashSet(exprs) if exprs.size == 1 && 77 | Seq(IntegerType, LongType).contains(exprs.head.dataType) => false 78 | case _ => true 79 | } 80 | 81 | def allAggregates(exprs: Seq[Expression]) = 82 | exprs.flatMap(_.collect { case a: AggregateExpression => a}) 83 | 84 | /** 85 | * Determines whether the aggregation can be done for all rows directly or has to be done with partial aggregation 86 | */ 87 | protected def canBeAggregatedForAll(groupingExpressions: Seq[Expression], 88 | aggregateExpressions: Seq[NamedExpression], 89 | child: LogicalPlan): Boolean = { 90 | def findScanNode(physicalChild: SparkPlan): Option[HBaseSQLTableScan] = physicalChild match { 91 | case chd: HBaseSQLTableScan => Some(chd) 92 | case chd if chd.children.size != 1 => None 93 | case chd => findScanNode(chd.children(0)) 94 | } 95 | 96 | /** 97 | * @param headEnd the HBaseRawType for the end of the head partition 98 | * @param tailStart the HBaseRawType for the start of the tail partition 99 | * @param keysForGroup the remaining key dimensions for grouping 100 | * @return whether these two partitions are distinguishable in the given dimension 101 | */ 102 | def distinguishedForGroupKeys(headEnd: HBaseRawType, 103 | tailStart: HBaseRawType, 104 | keysForGroup: Seq[KeyColumn]): Boolean = { 105 | //Divide the raw key into two parts: one is the raw bytes for the current key dimension, 106 | //the other is the raw bytes for the remaining key dimensions 107 | def divideRawType(rawType: HBaseRawType, key: KeyColumn) 108 | : (HBaseRawType, HBaseRawType) = key.dataType match { 109 | case dt: StringType => rawType.splitAt(rawType.indexWhere(_ == 0x00) + 1) 110 | case dt if dt.defaultSize >= rawType.size => (rawType, Array()) 111 | case dt => rawType.splitAt(dt.defaultSize) 112 | } 113 | 114 | if (keysForGroup.isEmpty) true 115 | else { 116 | val (curKey, keysLeft) = (keysForGroup.head, keysForGroup.tail) 117 | val (headEndCurKey, headEndKeysLeft) = divideRawType(headEnd, curKey) 118 | val (tailStartCurKey, tailStartKeysLeft) = divideRawType(tailStart, curKey) 119 | 120 | if (headEndKeysLeft.isEmpty || tailStartKeysLeft.isEmpty) true 121 | else if (Bytes.compareTo(tailStartCurKey, headEndCurKey) != 0) true 122 | else if (keysLeft.nonEmpty) distinguishedForGroupKeys( 123 | headEndKeysLeft, tailStartKeysLeft, keysLeft) 124 | else if (headEndKeysLeft.forall(_ == 0x00) || tailStartCurKey.forall(_ == 0x00)) true 125 | else false 126 | } 127 | } 128 | 129 | val physicalChild = planLater(child) 130 | def aggrWithPartial = false 131 | def aggrForAll = true 132 | 133 | findScanNode(physicalChild) match { 134 | case None => aggrWithPartial 135 | case Some(scanNode: HBaseSQLTableScan) => 136 | val hbaseRelation = scanNode.relation 137 | 138 | //If there is only one partition in HBase, 139 | //we don't need to do the partial aggregation 140 | if (hbaseRelation.partitions.size == 1) aggrForAll 141 | else { 142 | val keysForGroup = hbaseRelation.keyColumns.takeWhile(key => 143 | groupingExpressions.exists { 144 | case expr: AttributeReference => expr.name == key.sqlName 145 | case _ => false 146 | }) 147 | 148 | //If some expressions in groupingExpressions are not key columns, 149 | //or some middle dimensions of the row key are missing, 150 | //we have to do it with partial aggregation. 151 | // 152 | //If the groupingExpressions are composed of all the key columns, 153 | //the data is grouped by the row key in all dimensions, 154 | //so we can do the aggregation for all rows directly.
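/* Hedged illustration (hypothetical column names, not in the original source): for a row key
   composed of (k1, k2, k3), GROUP BY k2 yields an empty keysForGroup prefix and falls back to
   partial aggregation; GROUP BY k1, k2, k3 covers the whole row key, so aggregation is done
   for all rows directly; GROUP BY k1, k2 matches a strict prefix, so the partition-boundary
   check below decides. */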
155 | if (keysForGroup.size != groupingExpressions.size) aggrWithPartial 156 | else if (keysForGroup.size == hbaseRelation.keyColumns.size) aggrForAll 157 | else { 158 | val partitionsAfterFilter = scanNode.result.partitions 159 | val eachPartitionApart = (0 to partitionsAfterFilter.size - 2).forall { case i => 160 | val headEnd = partitionsAfterFilter(i).asInstanceOf[HBasePartition] 161 | .end.get.asInstanceOf[HBaseRawType] 162 | val tailStart = partitionsAfterFilter(i + 1).asInstanceOf[HBasePartition] 163 | .start.get.asInstanceOf[HBaseRawType] 164 | //If any two adjacent partitions are not distinguishable from each other 165 | // for the given row key dimensions, we cannot do the aggregation for all rows. 166 | distinguishedForGroupKeys(headEnd, tailStart, keysForGroup) 167 | } 168 | if (eachPartitionApart) aggrForAll 169 | else aggrWithPartial 170 | } 171 | } 172 | } 173 | } 174 | 175 | // Based on Catalyst expressions. 176 | // Almost identical to pruneFilterProjectRaw 177 | protected def pruneFilterProjectHBase(relation: LogicalRelation, 178 | projectList: Seq[NamedExpression], 179 | filterPredicates: Seq[Expression], 180 | scanBuilder: 181 | (Seq[Attribute], Seq[Expression]) => RDD[Row]) = { 182 | 183 | val projectSet = AttributeSet(projectList.flatMap(_.references)) 184 | val filterSet = AttributeSet(filterPredicates.flatMap(_.references)) 185 | 186 | val pushedFilters = if (filterPredicates.nonEmpty) { 187 | Seq(filterPredicates.map { 188 | _ transform { 189 | // Match original case of attributes. 190 | case a: AttributeReference => relation.attributeMap(a) 191 | // We will do HBase-specific predicate pushdown so just use the original predicate here 192 | } 193 | }.reduceLeft(And)) 194 | } else { 195 | filterPredicates 196 | } 197 | 198 | val hbaseRelation = relation.relation.asInstanceOf[HBaseRelation] 199 | if (projectList.map(_.toAttribute) == projectList && 200 | projectSet.size == projectList.size && 201 | filterSet.subsetOf(projectSet)) { 202 | // When it is possible to just use column pruning to get the right projection and 203 | // when the columns of this projection are enough to evaluate all filter conditions, 204 | // just do a scan followed by a filter, with no extra project. 205 | val requestedColumns = 206 | projectList.asInstanceOf[Seq[Attribute]] // Safe due to if above. 207 | .map(relation.attributeMap) // Match original case of attributes. 208 | 209 | // We have to use an HBase-specific scanner here while maintaining as much compatibility 210 | // with the data source API as possible, primarily because 211 | // 1) We need to set up the outputPartitioning field to HBase-specific partitions 212 | // 2) Future use of HBase co-processor 213 | // 3) We will do partition-specific predicate pushdown 214 | // The above capabilities are currently absent from the PhysicalRDD class.
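/* Hedged illustration (hypothetical query, not in the original source): a query such as
   SELECT col1, col2 FROM t WHERE col1 > 0 takes this branch, since the projection is a list of
   distinct bare attributes that already covers every attribute referenced by the filter, so the
   HBaseSQLTableScan built below is returned without a wrapping Project node. */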
215 | 216 | HBaseSQLTableScan(hbaseRelation, projectList.map(_.toAttribute), 217 | scanBuilder(requestedColumns, pushedFilters)) 218 | } else { 219 | val requestedColumns = projectSet.map(relation.attributeMap).toSeq 220 | val scan = HBaseSQLTableScan(hbaseRelation, requestedColumns, 221 | scanBuilder(requestedColumns, pushedFilters)) 222 | Project(projectList, scan) 223 | } 224 | } 225 | } 226 | 227 | } 228 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.spark.sql 18 | 19 | package object hbase { 20 | type HBaseRawType = Array[Byte] 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/types/HBaseBytesType.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hbase.types 19 | 20 | import org.apache.spark.sql.types._ 21 | 22 | import scala.reflect.runtime.universe.typeTag 23 | 24 | /** 25 | * Almost identical to BinaryType except for a different ordering to be consistent 26 | * with that of HBase's internal ordering 27 | * This is a data type for Low-Level HBase entities. 
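 * (Hedged note, not in the original source: the ordering below compares bytes as unsigned
 * values, so e.g. Array(0x01) sorts before Array(0x80), matching HBase's Bytes.compareTo.)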
28 | * It should not be used in High-Level processing 29 | */ 30 | private[hbase] case object HBaseBytesType extends AtomicType /*with PrimitiveType*/ { 31 | override def defaultSize: Int = 4096 32 | private[sql] type InternalType = Array[Byte] 33 | // TODO: can not use ScalaReflectionLock now for its accessibility 34 | // @transient private[sql] lazy val tag = ScalaReflectionLock.synchronized { typeTag[JvmType] } 35 | @transient private[sql] lazy val tag = synchronized(typeTag[InternalType]) 36 | private[sql] val ordering = new Ordering[InternalType] { 37 | def compare(x: Array[Byte], y: Array[Byte]): Int = { 38 | for (i <- 0 until x.length; if i < y.length) { 39 | val a: Int = x(i) & 0xff 40 | val b: Int = y(i) & 0xff 41 | val res = a - b 42 | if (res != 0) return res 43 | } 44 | x.length - y.length 45 | } 46 | } 47 | 48 | private[spark] override def asNullable = this 49 | } 50 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/types/PartialOrderingDataType.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.spark.sql.hbase.types 18 | 19 | import org.apache.spark.sql.types._ 20 | 21 | import scala.reflect.runtime.universe.TypeTag 22 | 23 | abstract class PartialOrderingDataType extends DataType { 24 | private[sql] type JvmType 25 | def toPartiallyOrderingDataType(s: Any, dt: AtomicType): Any 26 | @transient private[sql] val tag: TypeTag[JvmType] 27 | private[sql] val partialOrdering: PartialOrdering[JvmType] 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/types/RangeType.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.spark.sql.hbase.types 18 | 19 | import java.sql.Timestamp 20 | 21 | import org.apache.spark.sql.catalyst.expressions.Expression 22 | import org.apache.spark.sql.types._ 23 | 24 | import scala.collection.immutable.HashMap 25 | import scala.language.implicitConversions 26 | import scala.math.PartialOrdering 27 | import scala.reflect.runtime.universe.typeTag 28 | 29 | class Range[T](val start: Option[T], // None for open ends 30 | val startInclusive: Boolean, 31 | val end: Option[T], // None for open ends 32 | val endInclusive: Boolean, 33 | val dt: AtomicType) extends Serializable { 34 | require(dt != null && !(start.isDefined && end.isDefined && 35 | ((dt.ordering.eq(start.get, end.get) && 36 | (!startInclusive || !endInclusive)) || 37 | dt.ordering.gt(start.get.asInstanceOf[dt.InternalType], end.get.asInstanceOf[dt.InternalType]))), 38 | "Inappropriate range parameters") 39 | @transient lazy val isPoint: Boolean = start.isDefined && end.isDefined && 40 | startInclusive && endInclusive && start.get.equals(end.get) 41 | } 42 | 43 | /** 44 | * HBase partition range 45 | * @param start start position 46 | * @param startInclusive whether the start position is inclusive or not 47 | * @param end end position 48 | * @param endInclusive whether the end position is inclusive or not 49 | * @param id the partition id 50 | * @param dt the data type 51 | * @param pred the associated predicate 52 | * @tparam T template of the type 53 | */ 54 | class PartitionRange[T](start: Option[T], startInclusive: Boolean, 55 | end: Option[T], endInclusive: Boolean, 56 | val id: Int, dt: AtomicType, var pred: Expression) 57 | extends Range[T](start, startInclusive, end, endInclusive, dt) 58 | 59 | private[hbase] class RangeType[T] extends PartialOrderingDataType { 60 | override def defaultSize: Int = 4096 61 | private[sql] type JvmType = Range[T] 62 | // TODO: can not use ScalaReflectionLock now for its accessibility 63 | // @transient private[sql] lazy val tag = ScalaReflectionLock.synchronized { typeTag[JvmType] } 64 | @transient private[sql] lazy val tag = synchronized(typeTag[JvmType]) 65 | 66 | private[spark] override def asNullable: RangeType[T] = this 67 | 68 | def toPartiallyOrderingDataType(s: Any, dt: AtomicType): Any = s match { 69 | case b: Boolean => new Range[Boolean](Some(b), true, Some(b), true, BooleanType) 70 | case b: Byte => new Range[Byte](Some(b), true, Some(b), true, ByteType) 71 | case d: Double => new Range[Double](Some(d), true, Some(d), true, DoubleType) 72 | case f: Float => new Range[Float](Some(f), true, Some(f), true, FloatType) 73 | case i: Int => new Range[Int](Some(i), true, Some(i), true, IntegerType) 74 | case l: Long => new Range[Long](Some(l), true, Some(l), true, LongType) 75 | case s: Short => new Range[Short](Some(s), true, Some(s), true, ShortType) 76 | case s: String => new Range[String](Some(s), true, Some(s), true, StringType) 77 | case t: Timestamp => new Range[Timestamp](Some(t), true, Some(t), true, TimestampType) 78 | case _ => s 79 | } 80 | 81 | val partialOrdering = new PartialOrdering[JvmType] { 82 | // Right now we just support comparisons between a range and a point 83 | // In the future when more generic range comparisons, these two methods 84 | // must be functional as expected 85 | // return -2 if a < b; -1 if a <= b; 0 if a = b; 1 if a >= b; 2 if a > b 86 | def tryCompare(a: JvmType, b: JvmType): Option[Int] = { 87 | val aRange = a.asInstanceOf[Range[T]] 88 | val aStartInclusive = aRange.startInclusive 89 | val aStart = 
aRange.start.getOrElse(null).asInstanceOf[aRange.dt.InternalType] 90 | val aEnd = aRange.end.getOrElse(null).asInstanceOf[aRange.dt.InternalType] 91 | val aEndInclusive = aRange.endInclusive 92 | val bRange = b.asInstanceOf[Range[T]] 93 | val bStart = bRange.start.getOrElse(null).asInstanceOf[aRange.dt.InternalType] 94 | val bEnd = bRange.end.getOrElse(null).asInstanceOf[aRange.dt.InternalType] 95 | val bStartInclusive = bRange.startInclusive 96 | val bEndInclusive = bRange.endInclusive 97 | 98 | // return 1 iff aStart > bEnd 99 | // return 1 iff aStart = bEnd, aStartInclusive & bEndInclusive are not true at same position 100 | if ((aStart != null && bEnd != null) 101 | && (aRange.dt.ordering.gt(aStart, bEnd) 102 | || (aRange.dt.ordering.equiv(aStart, bEnd) && !(aStartInclusive && bEndInclusive)))) { 103 | Some(2) 104 | } // Vice versa 105 | else if ((bStart != null && aEnd != null) 106 | && (aRange.dt.ordering.gt(bStart, aEnd) 107 | || (aRange.dt.ordering.equiv(bStart, aEnd) && !(bStartInclusive && aEndInclusive)))) { 108 | Some(-2) 109 | } else if (aStart != null && aEnd != null && bStart != null && bEnd != null && 110 | aRange.dt.ordering.equiv(bStart, aEnd) 111 | && aRange.dt.ordering.equiv(aStart, aEnd) 112 | && aRange.dt.ordering.equiv(bStart, bEnd) 113 | && (aStartInclusive && aEndInclusive && bStartInclusive && bEndInclusive)) { 114 | Some(0) 115 | } else if (aEnd != null && bStart != null && aRange.dt.ordering.equiv(aEnd, bStart) 116 | && aEndInclusive && bStartInclusive) { 117 | Some(-1) 118 | } else if (aStart != null && bEnd != null && aRange.dt.ordering.equiv(aStart, bEnd) 119 | && aStartInclusive && bEndInclusive) { 120 | Some(1) 121 | } else { 122 | None 123 | } 124 | } 125 | 126 | def lteq(a: JvmType, b: JvmType): Boolean = { 127 | // [(aStart, aEnd)] and [(bStart, bEnd)] 128 | // [( and )] mean the possibilities of the inclusive and exclusive condition 129 | val aRange = a.asInstanceOf[Range[T]] 130 | val aStartInclusive = aRange.startInclusive 131 | val aEnd = if (aRange.end.isEmpty) null else aRange.end.get 132 | val aEndInclusive = aRange.endInclusive 133 | val bRange = b.asInstanceOf[Range[T]] 134 | val bStart = if (bRange.start.isEmpty) null else bRange.start.get 135 | val bStartInclusive = bRange.startInclusive 136 | val bEndInclusive = bRange.endInclusive 137 | 138 | // Compare two ranges, return true iff the upper bound of the lower range is lteq to 139 | // the lower bound of the upper range. Because the exclusive boundary could be null, which 140 | // means the boundary could be infinity, we need to further check this conditions. 
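/* Hedged illustration (not in the original source): for inclusive integer ranges a = [1, 3] and
   b = [5, 7], aEnd = 3 <= bStart = 5, so lteq(a, b) is true; if the relevant bound is open,
   e.g. a has no end (aEnd is null with an exclusive end), the null checks in the match below
   return false because the ordering cannot be established. */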
141 | val result = 142 | (aStartInclusive, aEndInclusive, bStartInclusive, bEndInclusive) match { 143 | // [(aStart, aEnd] compare to [bStart, bEnd)] 144 | case (_, true, true, _) => 145 | if (aRange.dt.ordering.lteq(aEnd.asInstanceOf[aRange.dt.InternalType], 146 | bStart.asInstanceOf[aRange.dt.InternalType])) { 147 | true 148 | } else { 149 | false 150 | } 151 | // [(aStart, aEnd] compare to (bStart, bEnd)] 152 | case (_, true, false, _) => 153 | if (bStart != null && aRange.dt.ordering.lteq(aEnd.asInstanceOf[aRange.dt.InternalType], 154 | bStart.asInstanceOf[aRange.dt.InternalType])) { 155 | true 156 | } else { 157 | false 158 | } 159 | // [(aStart, aEnd) compare to [bStart, bEnd)] 160 | case (_, false, true, _) => 161 | if (aEnd != null && aRange.dt.ordering.lteq(aEnd.asInstanceOf[aRange.dt.InternalType], 162 | bStart.asInstanceOf[aRange.dt.InternalType])) { 163 | true 164 | } else { 165 | false 166 | } 167 | // [(aStart, aEnd) compare to (bStart, bEnd)] 168 | case (_, false, false, _) => 169 | if (aEnd != null && bStart != null && 170 | aRange.dt.ordering.lteq(aEnd.asInstanceOf[aRange.dt.InternalType], 171 | bStart.asInstanceOf[aRange.dt.InternalType])) { 172 | true 173 | } else { 174 | false 175 | } 176 | } 177 | 178 | result 179 | } 180 | } 181 | } 182 | 183 | object RangeType { 184 | 185 | object BooleanRangeType extends RangeType[Boolean] 186 | 187 | object ByteRangeType extends RangeType[Byte] 188 | 189 | object DecimalRangeType extends RangeType[BigDecimal] 190 | 191 | object DoubleRangeType extends RangeType[Double] 192 | 193 | object FloatRangeType extends RangeType[Float] 194 | 195 | object IntegerRangeType extends RangeType[Int] 196 | 197 | object LongRangeType extends RangeType[Long] 198 | 199 | object ShortRangeType extends RangeType[Short] 200 | 201 | object StringRangeType extends RangeType[String] 202 | 203 | object TimestampRangeType extends RangeType[Timestamp] 204 | 205 | val primitiveToPODataTypeMap: HashMap[AtomicType, PartialOrderingDataType] = 206 | HashMap( 207 | BooleanType -> BooleanRangeType, 208 | ByteType -> ByteRangeType, 209 | DoubleType -> DoubleRangeType, 210 | FloatType -> FloatRangeType, 211 | IntegerType -> IntegerRangeType, 212 | LongType -> LongRangeType, 213 | ShortType -> ShortRangeType, 214 | StringType -> StringRangeType, 215 | TimestampType -> TimestampRangeType 216 | ) 217 | } 218 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/util/BytesUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.spark.sql.hbase.util 18 | 19 | import org.apache.hadoop.hbase.util.Bytes 20 | import org.apache.spark.sql.types._ 21 | import org.apache.spark.sql.hbase._ 22 | 23 | object BytesUtils { 24 | def create(dataType: DataType): BytesUtils = { 25 | dataType match { 26 | case BooleanType => new BytesUtils(new HBaseRawType(Bytes.SIZEOF_BOOLEAN), BooleanType) 27 | case ByteType => new BytesUtils(new HBaseRawType(Bytes.SIZEOF_BYTE), ByteType) 28 | case DoubleType => new BytesUtils(new HBaseRawType(Bytes.SIZEOF_DOUBLE), DoubleType) 29 | case FloatType => new BytesUtils(new HBaseRawType(Bytes.SIZEOF_FLOAT), FloatType) 30 | case IntegerType => new BytesUtils(new HBaseRawType(Bytes.SIZEOF_INT), IntegerType) 31 | case LongType => new BytesUtils(new HBaseRawType(Bytes.SIZEOF_LONG), LongType) 32 | case ShortType => new BytesUtils(new HBaseRawType(Bytes.SIZEOF_SHORT), ShortType) 33 | case StringType => new BytesUtils(null, StringType) 34 | } 35 | } 36 | 37 | def toString(input: HBaseRawType, offset: Int, length: Int): String = { 38 | Bytes.toString(input, offset, length) 39 | } 40 | 41 | def toByte(input: HBaseRawType, offset: Int): Byte = { 42 | // Flip sign bit back 43 | val v: Int = input(offset) ^ 0x80 44 | v.asInstanceOf[Byte] 45 | } 46 | 47 | def toBoolean(input: HBaseRawType, offset: Int): Boolean = { 48 | input(offset) != 0 49 | } 50 | 51 | def toDouble(input: HBaseRawType, offset: Int): Double = { 52 | var l: Long = Bytes.toLong(input, offset, Bytes.SIZEOF_DOUBLE) 53 | l = l - 1 54 | l ^= (~l >> java.lang.Long.SIZE - 1) | java.lang.Long.MIN_VALUE 55 | java.lang.Double.longBitsToDouble(l) 56 | } 57 | 58 | def toShort(input: HBaseRawType, offset: Int): Short = { 59 | // flip sign bit back 60 | var v: Int = input(offset) ^ 0x80 61 | v = (v << 8) + (input(1 + offset) & 0xff) 62 | v.asInstanceOf[Short] 63 | } 64 | 65 | def toFloat(input: HBaseRawType, offset: Int): Float = { 66 | var i = Bytes.toInt(input, offset) 67 | i = i - 1 68 | i ^= (~i >> Integer.SIZE - 1) | Integer.MIN_VALUE 69 | java.lang.Float.intBitsToFloat(i) 70 | } 71 | 72 | def toInt(input: HBaseRawType, offset: Int): Int = { 73 | // Flip sign bit back 74 | var v: Int = input(offset) ^ 0x80 75 | for (i <- 1 to Bytes.SIZEOF_INT - 1) { 76 | v = (v << 8) + (input(i + offset) & 0xff) 77 | } 78 | v 79 | } 80 | 81 | def toLong(input: HBaseRawType, offset: Int): Long = { 82 | // Flip sign bit back 83 | var v: Long = input(offset) ^ 0x80 84 | for (i <- 1 to Bytes.SIZEOF_LONG - 1) { 85 | v = (v << 8) + (input(i + offset) & 0xff) 86 | } 87 | v 88 | } 89 | 90 | /** 91 | * add one to the unsigned byte array 92 | * @param input the unsigned byte array 93 | * @return null if the byte array is all 0xff, otherwise increase by 1 94 | */ 95 | def addOne(input: HBaseRawType): HBaseRawType = { 96 | val len = input.length 97 | val result = new HBaseRawType(len) 98 | Array.copy(input, 0, result, 0, len) 99 | var setValue = false 100 | for (index <- len - 1 to 0 by -1 if !setValue) { 101 | val item: Byte = input(index) 102 | if (item != 0xff.toByte) { 103 | setValue = true 104 | if ((item & 0x01.toByte) == 0.toByte) { 105 | result(index) = (item ^ 0x01.toByte).toByte 106 | } else if ((item & 0x02.toByte) == 0.toByte) { 107 | result(index) = (item ^ 0x03.toByte).toByte 108 | } else if ((item & 0x04.toByte) == 0.toByte) { 109 | result(index) = (item ^ 0x07.toByte).toByte 110 | } else if ((item & 0x08.toByte) == 0.toByte) { 111 | result(index) = (item ^ 0x0f.toByte).toByte 112 | } else if ((item & 0x10.toByte) == 0.toByte) { 113 | 
result(index) = (item ^ 0x1f.toByte).toByte 114 | } else if ((item & 0x20.toByte) == 0.toByte) { 115 | result(index) = (item ^ 0x3f.toByte).toByte 116 | } else if ((item & 0x40.toByte) == 0.toByte) { 117 | result(index) = (item ^ 0x7f.toByte).toByte 118 | } else { 119 | result(index) = (item ^ 0xff.toByte).toByte 120 | } 121 | // after increment, set remaining bytes to zero 122 | for (rest <- index + 1 until len) { 123 | result(rest) = 0x00.toByte 124 | } 125 | } 126 | } 127 | if (!setValue) { 128 | null 129 | } else { 130 | result 131 | } 132 | } 133 | } 134 | 135 | class BytesUtils(var buffer: HBaseRawType, dt: DataType) { 136 | val dataType = dt 137 | 138 | def toBytes(input: String): HBaseRawType = { 139 | buffer = Bytes.toBytes(input) 140 | buffer 141 | } 142 | 143 | def toBytes(input: Byte): HBaseRawType = { 144 | // Flip sign bit so that Byte is binary comparable 145 | buffer(0) = (input ^ 0x80).asInstanceOf[Byte] 146 | buffer 147 | } 148 | 149 | def toBytes(input: Boolean): HBaseRawType = { 150 | if (input) { 151 | buffer(0) = (-1).asInstanceOf[Byte] 152 | } else { 153 | buffer(0) = 0.asInstanceOf[Byte] 154 | } 155 | buffer 156 | } 157 | 158 | def toBytes(input: Double): HBaseRawType = { 159 | var l: Long = java.lang.Double.doubleToLongBits(input) 160 | l = (l ^ ((l >> java.lang.Long.SIZE - 1) | java.lang.Long.MIN_VALUE)) + 1 161 | Bytes.putLong(buffer, 0, l) 162 | buffer 163 | } 164 | 165 | def toBytes(input: Short): HBaseRawType = { 166 | buffer(0) = ((input >> 8) ^ 0x80).asInstanceOf[Byte] 167 | buffer(1) = input.asInstanceOf[Byte] 168 | buffer 169 | } 170 | 171 | def toBytes(input: Float): HBaseRawType = { 172 | var i: Int = java.lang.Float.floatToIntBits(input) 173 | i = (i ^ ((i >> Integer.SIZE - 1) | Integer.MIN_VALUE)) + 1 174 | Bytes.putInt(buffer, 0, i) 175 | buffer 176 | } 177 | 178 | def toBytes(input: Int): HBaseRawType = { 179 | // Flip sign bit so that INTEGER is binary comparable 180 | buffer(0) = ((input >> 24) ^ 0x80).asInstanceOf[Byte] 181 | buffer(1) = (input >> 16).asInstanceOf[Byte] 182 | buffer(2) = (input >> 8).asInstanceOf[Byte] 183 | buffer(3) = input.asInstanceOf[Byte] 184 | buffer 185 | } 186 | 187 | def toBytes(input: Long): HBaseRawType = { 188 | buffer(0) = ((input >> 56) ^ 0x80).asInstanceOf[Byte] 189 | buffer(1) = (input >> 48).asInstanceOf[Byte] 190 | buffer(2) = (input >> 40).asInstanceOf[Byte] 191 | buffer(3) = (input >> 32).asInstanceOf[Byte] 192 | buffer(4) = (input >> 24).asInstanceOf[Byte] 193 | buffer(5) = (input >> 16).asInstanceOf[Byte] 194 | buffer(6) = (input >> 8).asInstanceOf[Byte] 195 | buffer(7) = input.asInstanceOf[Byte] 196 | buffer 197 | } 198 | } 199 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/util/DataTypeUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.spark.sql.hbase.util 18 | 19 | import org.apache.hadoop.hbase.filter.BinaryComparator 20 | import org.apache.spark.sql.catalyst.expressions.{Literal, MutableRow, Row} 21 | import org.apache.spark.sql.types._ 22 | import org.apache.spark.sql.hbase._ 23 | 24 | /** 25 | * Data Type conversion utilities 26 | */ 27 | object DataTypeUtils { 28 | /** 29 | * convert the byte array to data 30 | * @param src the input byte array 31 | * @param offset the offset in the byte array 32 | * @param length the length of the data, only used by StringType 33 | * @param dt the data type 34 | * @return the actual data converted from byte array 35 | */ 36 | def bytesToData(src: HBaseRawType, offset: Int, length: Int, dt: DataType): Any = { 37 | dt match { 38 | case BooleanType => BytesUtils.toBoolean(src, offset) 39 | case ByteType => src(offset) 40 | case DoubleType => BytesUtils.toDouble(src, offset) 41 | case FloatType => BytesUtils.toFloat(src, offset) 42 | case IntegerType => BytesUtils.toInt(src, offset) 43 | case LongType => BytesUtils.toLong(src, offset) 44 | case ShortType => BytesUtils.toShort(src, offset) 45 | case StringType => BytesUtils.toString(src, offset, length) 46 | case _ => throw new Exception("Unsupported HBase SQL Data Type") 47 | } 48 | } 49 | 50 | /** 51 | * convert data to byte array 52 | * @param src the input data 53 | * @param dt the data type 54 | * @return the output byte array 55 | */ 56 | def dataToBytes(src: Any, 57 | dt: DataType): HBaseRawType = { 58 | // TODO: avoid new instance per invocation 59 | val bu = BytesUtils.create(dt) 60 | dt match { 61 | case BooleanType => bu.toBytes(src.asInstanceOf[Boolean]) 62 | case ByteType => bu.toBytes(src.asInstanceOf[Byte]) 63 | case DoubleType => bu.toBytes(src.asInstanceOf[Double]) 64 | case FloatType => bu.toBytes(src.asInstanceOf[Float]) 65 | case IntegerType => bu.toBytes(src.asInstanceOf[Int]) 66 | case LongType => bu.toBytes(src.asInstanceOf[Long]) 67 | case ShortType => bu.toBytes(src.asInstanceOf[Short]) 68 | case StringType => bu.toBytes(src.asInstanceOf[String]) 69 | case _ => throw new Exception("Unsupported HBase SQL Data Type") 70 | } 71 | } 72 | 73 | /** 74 | * set the row data from byte array 75 | * @param row the row to be set 76 | * @param index the index in the row 77 | * @param src the input byte array 78 | * @param offset the offset in the byte array 79 | * @param length the length of the data, only used by StringType 80 | * @param dt the data type 81 | */ 82 | def setRowColumnFromHBaseRawType(row: MutableRow, 83 | index: Int, 84 | src: HBaseRawType, 85 | offset: Int, 86 | length: => Int, 87 | dt: DataType): Unit = { 88 | if (src == null || src.isEmpty) { 89 | row.setNullAt(index) 90 | return 91 | } 92 | dt match { 93 | case BooleanType => row.setBoolean(index, BytesUtils.toBoolean(src, offset)) 94 | case ByteType => row.setByte(index, BytesUtils.toByte(src, offset)) 95 | case DoubleType => row.setDouble(index, BytesUtils.toDouble(src, offset)) 96 | case FloatType => row.setFloat(index, BytesUtils.toFloat(src, offset)) 97 | case IntegerType => 
row.setInt(index, BytesUtils.toInt(src, offset)) 98 | case LongType => row.setLong(index, BytesUtils.toLong(src, offset)) 99 | case ShortType => row.setShort(index, BytesUtils.toShort(src, offset)) 100 | case StringType => row.setString(index, BytesUtils.toString(src, offset, length)) 101 | case _ => throw new Exception("Unsupported HBase SQL Data Type") 102 | } 103 | } 104 | 105 | def string2TypeData(v: String, dt: DataType): Any = { 106 | v match { 107 | case null => null 108 | case _ => 109 | dt match { 110 | // TODO: handle some complex types 111 | case BooleanType => v.toBoolean 112 | case ByteType => v.getBytes()(0) 113 | case DoubleType => v.toDouble 114 | case FloatType => v.toFloat 115 | case IntegerType => v.toInt 116 | case LongType => v.toLong 117 | case ShortType => v.toShort 118 | case StringType => v 119 | } 120 | } 121 | } 122 | 123 | /** 124 | * get the data from row based on index 125 | * @param row the input row 126 | * @param index the index of the data 127 | * @param dt the data type 128 | * @return the data from the row based on index 129 | */ 130 | def getRowColumnInHBaseRawType(row: Row, index: Int, dt: DataType): HBaseRawType = { 131 | if (row.isNullAt(index)) return new Array[Byte](0) 132 | 133 | val bu = BytesUtils.create(dt) 134 | dt match { 135 | case BooleanType => bu.toBytes(row.getBoolean(index)) 136 | case ByteType => bu.toBytes(row.getByte(index)) 137 | case DoubleType => bu.toBytes(row.getDouble(index)) 138 | case FloatType => bu.toBytes(row.getFloat(index)) 139 | case IntegerType => bu.toBytes(row.getInt(index)) 140 | case LongType => bu.toBytes(row.getLong(index)) 141 | case ShortType => bu.toBytes(row.getShort(index)) 142 | case StringType => bu.toBytes(row.getString(index)) 143 | case _ => throw new Exception("Unsupported HBase SQL Data Type") 144 | } 145 | } 146 | 147 | /** 148 | * create binary comparator for the input expression 149 | * @param bu the byte utility 150 | * @param expression the input expression 151 | * @return the constructed binary comparator 152 | */ 153 | def getBinaryComparator(bu: BytesUtils, expression: Literal): BinaryComparator = { 154 | expression.dataType match { 155 | case BooleanType => new BinaryComparator(bu.toBytes(expression.value.asInstanceOf[Boolean])) 156 | case ByteType => new BinaryComparator(bu.toBytes(expression.value.asInstanceOf[Byte])) 157 | case DoubleType => new BinaryComparator(bu.toBytes(expression.value.asInstanceOf[Double])) 158 | case FloatType => new BinaryComparator(bu.toBytes(expression.value.asInstanceOf[Float])) 159 | case IntegerType => new BinaryComparator(bu.toBytes(expression.value.asInstanceOf[Int])) 160 | case LongType => new BinaryComparator(bu.toBytes(expression.value.asInstanceOf[Long])) 161 | case ShortType => new BinaryComparator(bu.toBytes(expression.value.asInstanceOf[Short])) 162 | case StringType => new BinaryComparator(bu.toBytes(expression.value.asInstanceOf[String])) 163 | case _ => throw new Exception("Cannot convert the data type using BinaryComparator") 164 | } 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/util/HBaseKVHelper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hbase.util 19 | 20 | import org.apache.spark.sql.catalyst.expressions.{Attribute, Row} 21 | import org.apache.spark.sql.hbase._ 22 | import org.apache.spark.sql.types._ 23 | 24 | import scala.collection.mutable.ArrayBuffer 25 | 26 | object HBaseKVHelper { 27 | private val delimiter: Byte = 0 28 | 29 | /** 30 | * create row key based on key columns information 31 | * for strings, it will add '0x00' as its delimiter 32 | * @param rawKeyColumns sequence of byte array and data type representing the key columns 33 | * @return array of bytes 34 | */ 35 | def encodingRawKeyColumns(rawKeyColumns: Seq[(HBaseRawType, DataType)]): HBaseRawType = { 36 | val length = rawKeyColumns.foldLeft(0)((b, a) => { 37 | val len = b + a._1.length 38 | if (a._2 == StringType) len + 1 else len 39 | }) 40 | val result = new HBaseRawType(length) 41 | var index = 0 42 | for (rawKeyColumn <- rawKeyColumns) { 43 | Array.copy(rawKeyColumn._1, 0, result, index, rawKeyColumn._1.length) 44 | index += rawKeyColumn._1.length 45 | if (rawKeyColumn._2 == StringType) { 46 | result(index) = delimiter 47 | index += 1 48 | } 49 | } 50 | result 51 | } 52 | 53 | /** 54 | * generate the sequence information of key columns from the byte array 55 | * @param rowKey array of bytes 56 | * @param keyColumns the sequence of key columns 57 | * @return sequence of information in (offset, length) tuple 58 | */ 59 | def decodingRawKeyColumns(rowKey: HBaseRawType, keyColumns: Seq[KeyColumn]): Seq[(Int, Int)] = { 60 | var index = 0 61 | keyColumns.map { 62 | case c => 63 | if (index >= rowKey.length) (-1, -1) 64 | else { 65 | val offset = index 66 | if (c.dataType == StringType) { 67 | val pos = rowKey.indexOf(delimiter, index) 68 | index = pos + 1 69 | (offset, pos - offset) 70 | } else { 71 | val length = c.dataType.asInstanceOf[AtomicType].defaultSize 72 | index += length 73 | (offset, length) 74 | } 75 | } 76 | } 77 | } 78 | 79 | /** 80 | * Takes a record, translate it into HBase row key column and value by matching with metadata 81 | * @param values record that as a sequence of string 82 | * @param relation HBaseRelation 83 | * @param keyBytes output parameter, array of (key column and its type); 84 | * @param valueBytes array of (column family, column qualifier, value) 85 | */ 86 | def string2KV(values: Seq[String], 87 | relation: HBaseRelation, 88 | lineBuffer: Array[BytesUtils], 89 | keyBytes: Array[(Array[Byte], DataType)], 90 | valueBytes: Array[HBaseRawType]) = { 91 | assert(values.length == relation.output.length, 92 | s"values length ${values.length} not equals columns length ${relation.output.length}") 93 | 94 | relation.keyColumns.foreach(kc => { 95 | val ordinal = kc.ordinal 96 | keyBytes(kc.order) = (string2Bytes(values(ordinal), lineBuffer(ordinal)), 97 | relation.output(ordinal).dataType) 98 | }) 99 | for (i <- 0 until relation.nonKeyColumns.size) { 100 | val nkc = 
relation.nonKeyColumns(i) 101 | val bytes = { 102 | // we should not use the same buffer in bulk-loading otherwise it will lead to corrupted 103 | lineBuffer(nkc.ordinal) = BytesUtils.create(lineBuffer(nkc.ordinal).dataType) 104 | string2Bytes(values(nkc.ordinal), lineBuffer(nkc.ordinal)) 105 | } 106 | valueBytes(i) = bytes 107 | } 108 | } 109 | 110 | private def string2Bytes(v: String, bu: BytesUtils): Array[Byte] = { 111 | v match { 112 | case "" => new Array[Byte](0) 113 | case null => new Array[Byte](0) 114 | case _ => 115 | bu.dataType match { 116 | // todo: handle some complex types 117 | case BooleanType => bu.toBytes(v.toBoolean) 118 | case ByteType => bu.toBytes(v) 119 | case DoubleType => bu.toBytes(v.toDouble) 120 | case FloatType => bu.toBytes(v.toFloat) 121 | case IntegerType => bu.toBytes(v.toInt) 122 | case LongType => bu.toBytes(v.toLong) 123 | case ShortType => bu.toBytes(v.toShort) 124 | case StringType => bu.toBytes(v) 125 | } 126 | } 127 | } 128 | 129 | /** 130 | * create a array of buffer that to be used for creating HBase Put object 131 | * @param schema the schema of the line buffer 132 | * @return 133 | */ 134 | private[hbase] def createLineBuffer(schema: Seq[Attribute]): Array[BytesUtils] = { 135 | val buffer = ArrayBuffer[BytesUtils]() 136 | schema.foreach { x => 137 | buffer.append(BytesUtils.create(x.dataType)) 138 | } 139 | buffer.toArray 140 | } 141 | 142 | /** 143 | * create a row key 144 | * @param row the generic row 145 | * @param dataTypeOfKeys sequence of data type 146 | * @return the row key 147 | */ 148 | def makeRowKey(row: Row, dataTypeOfKeys: Seq[DataType]): HBaseRawType = { 149 | val rawKeyCol = dataTypeOfKeys.zipWithIndex.map { 150 | case (dataType, index) => 151 | (DataTypeUtils.getRowColumnInHBaseRawType(row, index, dataType), dataType) 152 | } 153 | 154 | encodingRawKeyColumns(rawKeyCol) 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hbase/util/Util.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
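// Illustrative sketch, not part of the original sources: the composite row-key round trip
// implemented by HBaseKVHelper above. KeyColumn(name, dataType, order) is used here with the
// same shape as in CatalogTestSuite; the column names are hypothetical.
import org.apache.spark.sql.types.{IntegerType, StringType}
import org.apache.spark.sql.hbase.KeyColumn
import org.apache.spark.sql.hbase.util.{DataTypeUtils, HBaseKVHelper}

object HBaseKVHelperSketch extends App {
  // build a composite row key; the string component is terminated by the 0x00 delimiter
  val rowKey = HBaseKVHelper.encodingRawKeyColumns(Seq(
    (DataTypeUtils.dataToBytes("Row1", StringType), StringType),
    (DataTypeUtils.dataToBytes(12345, IntegerType), IntegerType)))

  // recover the (offset, length) slice of each key column from the composite key
  val slices = HBaseKVHelper.decodingRawKeyColumns(rowKey,
    Seq(KeyColumn("strcol", StringType, 0), KeyColumn("intcol", IntegerType, 1)))
  println(slices)   // expected to be Seq((0, 4), (5, 4)) for "Row1" followed by an Int
}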
16 | */ 17 | 18 | package org.apache.spark.sql.hbase.util 19 | 20 | import java.io._ 21 | import java.util.concurrent.atomic.AtomicInteger 22 | import java.util.zip.{DeflaterOutputStream, InflaterInputStream} 23 | 24 | import org.apache.hadoop.conf.Configuration 25 | import org.apache.hadoop.fs.{FileSystem, Path} 26 | import org.apache.hadoop.hbase.HBaseConfiguration 27 | 28 | object Util { 29 | val iteration = new AtomicInteger(0) 30 | 31 | def getTempFilePath(conf: Configuration, prefix: String): String = { 32 | val fileSystem = FileSystem.get(conf) 33 | val path = new Path(s"$prefix-${System.currentTimeMillis()}-${iteration.getAndIncrement}") 34 | if (fileSystem.exists(path)) { 35 | fileSystem.delete(path, true) 36 | } 37 | path.getName 38 | } 39 | 40 | def serializeHBaseConfiguration(configuration: Configuration): Array[Byte] = { 41 | val bos = new ByteArrayOutputStream 42 | val deflaterOutputStream = new DeflaterOutputStream(bos) 43 | val dos = new DataOutputStream(deflaterOutputStream) 44 | configuration.write(dos) 45 | dos.close() 46 | bos.toByteArray 47 | } 48 | 49 | def deserializeHBaseConfiguration(arr: Array[Byte]) = { 50 | val conf = HBaseConfiguration.create 51 | conf.readFields(new DataInputStream(new InflaterInputStream(new ByteArrayInputStream(arr)))) 52 | conf 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/test/java/org/apache/spark/sql/hbase/api/java/JavaAPISuite.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
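// Illustrative sketch, not part of the original sources: the intended use of
// Util.serializeHBaseConfiguration / deserializeHBaseConfiguration defined in Util.scala above.
// A Configuration is written in Writable form, Deflater-compressed into a byte array so it can
// be shipped to executors, and restored there.
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.spark.sql.hbase.util.Util

object UtilSketch extends App {
  val conf = HBaseConfiguration.create()
  val bytes = Util.serializeHBaseConfiguration(conf)
  val restored = Util.deserializeHBaseConfiguration(bytes)
  // spot-check that a property survives the round trip
  assert(restored.get("hbase.zookeeper.quorum") == conf.get("hbase.zookeeper.quorum"))
}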
16 | */ 17 | 18 | package org.apache.spark.sql.hbase.api.java; 19 | 20 | import java.io.Serializable; 21 | 22 | import org.apache.hadoop.hbase.HBaseTestingUtility; 23 | import org.apache.hadoop.hbase.MiniHBaseCluster; 24 | import org.apache.hadoop.hbase.client.HBaseAdmin; 25 | import org.apache.spark.SparkConf; 26 | import org.apache.spark.sql.SQLContext; 27 | import org.apache.spark.sql.hbase.*; 28 | import org.junit.After; 29 | import org.junit.Before; 30 | import org.junit.Test; 31 | 32 | import org.apache.spark.api.java.JavaSparkContext; 33 | import org.apache.spark.sql.Row; 34 | 35 | public class JavaAPISuite extends HBaseIntegrationTestBase implements Serializable { 36 | private transient JavaSparkContext sc; 37 | private transient SQLContext hsc; 38 | private transient MiniHBaseCluster cluster; 39 | private transient HBaseAdmin hbaseAdmin; 40 | 41 | private final String hb_staging_table = "HbStagingTable"; 42 | private final String staging_table = "StagingTable"; 43 | private final String create_sql = "CREATE TABLE " + staging_table + "(strcol STRING, bytecol String, shortcol String, intcol String, " + 44 | "longcol string, floatcol string, doublecol string, PRIMARY KEY(doublecol, strcol, intcol))" + 45 | " MAPPED BY (" + hb_staging_table + ", COLS=[bytecol=cf1.hbytecol, " + 46 | "shortcol=cf1.hshortcol, longcol=cf2.hlongcol, floatcol=cf2.hfloatcol])"; 47 | private final String insert_sql = "INSERT INTO " + staging_table + " VALUES (\"strcol\" , \"bytecol\" , \"shortcol\" , \"intcol\" ," + 48 | " \"longcol\" , \"floatcol\" , \"doublecol\")"; 49 | private final String retrieve_sql = "SELECT * FROM " + staging_table; 50 | 51 | @Before 52 | public void setUp() { 53 | System.setProperty("spark.hadoop.hbase.zookeeper.quorum", "localhost"); 54 | 55 | sc = new JavaSparkContext("local[2]", "JavaAPISuite", new SparkConf(true)); 56 | hsc = new HBaseSQLContext(sc); 57 | 58 | HBaseTestingUtility testUtil = new HBaseTestingUtility(hsc.sparkContext(). 
59 | hadoopConfiguration()); 60 | 61 | int nRegionServers = 1; 62 | int nDataNodes = 1; 63 | int nMasters = 1; 64 | 65 | try { 66 | cluster = testUtil.startMiniCluster(nMasters, nRegionServers, nDataNodes); 67 | hbaseAdmin = new HBaseAdmin(hsc.sparkContext().hadoopConfiguration()); 68 | } catch (Exception e) { 69 | e.printStackTrace(); 70 | } 71 | } 72 | 73 | @Test 74 | public void testCreateInsertRetrieveTable() { 75 | hsc.sql(create_sql).collect(); 76 | hsc.sql(insert_sql).collect(); 77 | Row[] row = hsc.sql(retrieve_sql).collect(); 78 | 79 | assert (row[0].toString().equals("[strcol,bytecol,shortcol,intcol,longcol,floatcol,doublecol]")); 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/test/resources/joinTable1.txt: -------------------------------------------------------------------------------- 1 | RowA1,a,12345,23456789,3456789012345,45657.89, 5678912.345678 2 | RowA2,a,12346,23456790,3456789012346,45657.90, 5678912.345679 3 | Row2,b,12342,23456782,3456789012342,45657.82, 5678912.345682 4 | Row3,c,12343,23456783,3456789012343,45657.83, 5678912.345683 5 | Row4,d,12344,23456784,3456789012344,45657.84, 5678912.345684 6 | Row5,e,12345,23456785,3456789012345,45657.85, 5678912.345685 7 | Row6,f,12346,23456786,3456789012346,45657.86, 5678912.345686 8 | Row7,g,12347,23456787,3456789012347,45657.87, 5678912.345687 9 | Row8,h,12348,23456788,3456789012348,45657.88, 5678912.345688 10 | Row9,i,12349,23456789,3456789012349,45657.89, 5678912.345689 11 | RowA10a,j,12340,23456780,3456789012340,45657.80, 5678912.345690 12 | RowA10b,j,12341,23456781,3456789012341,45657.81, 5678912.345691 13 | RowA10c,j,12342,23456782,3456789012342,45657.82, 5678912.345692 14 | -------------------------------------------------------------------------------- /src/test/resources/joinTable2.txt: -------------------------------------------------------------------------------- 1 | RowB1,a,12345,23456789,3456789012345,45657.89, 5678912.345678 2 | Row2,b1,12342,23456782,3456789012342,45657.82, 5678912.345682 3 | Row2,b2,12342,23456782,3456789012342,45657.82, 5678912.345683 4 | Row2,b3,12342,23456782,3456789012342,45657.82, 5678912.345684 5 | Row2,b4,12342,23456782,3456789012342,45657.82, 5678912.345685 6 | Row3,c,12343,23456783,3456789012343,45657.83, 5678912.345683 7 | Row4,d,12344,23456784,3456789012344,45657.84, 5678912.345684 8 | Row5,e,12345,23456785,3456789012345,45657.85, 5678912.345685 9 | Row6,f,12346,23456786,3456789012346,45657.86, 5678912.345686 10 | Row7,g,12347,23456787,3456789012347,45657.87, 5678912.345687 11 | Row8,h,12348,23456788,3456789012348,45657.88, 5678912.345688 12 | Row9,i,12349,23456789,3456789012349,45657.89, 5678912.345689 13 | RowB10a,j,12340,23456780,3456789012340,45657.80, 5678912.345690 14 | RowB10b,k,12341,23456781,3456789012341,45657.81, 5678912.345691 -------------------------------------------------------------------------------- /src/test/resources/joinTable3.txt: -------------------------------------------------------------------------------- 1 | RowC1,a,12345,23456789,3456789012345,45657.89, 5678912.345678 2 | RowC2,a,12346,23456790,3456789012346,45657.90, 5678912.345679 3 | Row2,b,12342,23456782,3456789012342,45657.82, 5678912.345682 4 | Row3,c,12343,23456783,3456789012343,45657.83, 5678912.345683 5 | Row4,d,12344,23456784,3456789012344,45657.84, 5678912.345684 6 | Row5,e,12345,23456785,3456789012345,45657.85, 5678912.345685 7 | Row6,f,12346,23456786,3456789012346,45657.86, 5678912.345686 8 | 
Row7,g,12347,23456787,3456789012347,45657.87, 5678912.345687 9 | Row8,h,12348,23456788,3456789012348,45657.88, 5678912.345688 10 | Row9,i,12349,23456789,3456789012349,45657.89, 5678912.345689 11 | RowC10a,j,12340,23456780,3456789012340,45657.80, 5678912.345690 12 | RowC10b,j,12341,23456781,3456789012341,45657.81, 5678912.345691 13 | RowC10c,j,12342,23456782,3456789012342,45657.82, 5678912.345692 14 | -------------------------------------------------------------------------------- /src/test/resources/joinTable4.txt: -------------------------------------------------------------------------------- 1 | RowD1,a,12345,23456789,3456789012345,45657.89, 5678912.345678 2 | RowD2,a,12346,23456790,3456789012346,45657.90, 5678912.345679 3 | Row2,b,12342,23456782,3456789012342,45657.82, 5678912.345682 4 | Row3,c,12343,23456783,3456789012343,45657.83, 5678912.345683 5 | Row4,d,12344,23456784,3456789012344,45657.84, 5678912.345684 6 | Row5,e,12345,23456785,3456789012345,45657.85, 5678912.345685 7 | Row6,f,12346,23456786,3456789012346,45657.86, 5678912.345686 8 | Row7,g,12347,23456787,3456789012347,45657.87, 5678912.345687 9 | Row8,h,12348,23456788,3456789012348,45657.88, 5678912.345688 10 | Row9,i,12349,23456789,3456789012349,45657.89, 5678912.345689 11 | RowD10a,j,12340,23456780,3456789012340,45657.80, 5678912.345690 12 | RowD10b,j,12341,23456781,3456789012341,45657.81, 5678912.345691 13 | RowD10c,j,12342,23456782,3456789012342,45657.82, 5678912.345692 14 | -------------------------------------------------------------------------------- /src/test/resources/loadData.txt: -------------------------------------------------------------------------------- 1 | row5,5,10 2 | row4,4,8 3 | row5,5,10 4 | row6,6,12 -------------------------------------------------------------------------------- /src/test/resources/loadNullableData.txt: -------------------------------------------------------------------------------- 1 | row1,,8,101 2 | row2,2,,102 3 | row3,3,10, 4 | row4,,, -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | # Set everything to be logged to the file core/target/unit-tests.log 19 | log4j.rootLogger=WARN,CA,FA 20 | 21 | #Console Appender 22 | log4j.appender.CA=org.apache.log4j.ConsoleAppender 23 | log4j.appender.CA.layout=org.apache.log4j.PatternLayout 24 | log4j.appender.CA.layout.ConversionPattern=%d{HH:mm:ss.SSS} %p %c: %m%n 25 | log4j.appender.CA.Threshold = INFO 26 | 27 | 28 | #File Appender 29 | log4j.appender.FA=org.apache.log4j.FileAppender 30 | log4j.appender.FA.append=false 31 | log4j.appender.FA.file=target/unit-tests.log 32 | log4j.appender.FA.layout=org.apache.log4j.PatternLayout 33 | log4j.appender.FA.layout.ConversionPattern=%d{HH:mm:ss.SSS} %p %c{1}: %m%n 34 | log4j.appender.FA.Threshold = INFO 35 | 36 | log4j.logger.org.mortbay=WARN 37 | 38 | log4j.logger.BlockStateChange=WARN 39 | log4j.logger.org.eclipse.jetty=WARN 40 | log4j.logger.org.apache.hadoop.hbase.ZNodeClearer=ERROR 41 | log4j.logger.org.apache.hadoop.hbase=WARN 42 | log4j.logger.org.apache.hadoop=WARN 43 | log4j.logger.org.apache.zookeeper=WARN 44 | 45 | log4j.logger.org.apache.spark.sql.hbase=DEBUG 46 | log4j.logger.org.apache.spark=WARN 47 | log4j.logger.org.scalatest=WARN 48 | -------------------------------------------------------------------------------- /src/test/resources/onecoljoin1.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 2 -------------------------------------------------------------------------------- /src/test/resources/onecoljoin2.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 2 -------------------------------------------------------------------------------- /src/test/resources/splitLoadData.txt: -------------------------------------------------------------------------------- 1 | 1,6,val6 2 | 2,12,val12 3 | 3,18,val18 4 | 4,24,val24 5 | 5,30,val30 6 | 6,36,val36 7 | 7,42,val42 8 | 8,48,val48 9 | 9,54,val54 10 | 10,60,val60 11 | 11,66,val66 12 | 12,72,val72 13 | 13,78,val78 14 | 14,84,val84 15 | 15,90,val90 16 | 16,96,val96 -------------------------------------------------------------------------------- /src/test/resources/splitLoadData1.txt: -------------------------------------------------------------------------------- 1 | 1,0a,1024,v1 2 | 1024,0b,0,v2 3 | 2048,cc,1024,v3 4 | 4096,0a,0,v4 5 | 4096,0b,1024,v5 6 | 4096,cc,0,v6 7 | 4096,cc,1024,v7 -------------------------------------------------------------------------------- /src/test/resources/testTable.txt: -------------------------------------------------------------------------------- 1 | Row2,b,12342,23456782,3456789012342,45657.82, 5678912.345682 2 | Row4,d,12344,23456784,3456789012344,45657.84, 5678912.345684 3 | Row5,e,12345,23456785,3456789012345,45657.85, 5678912.345685 4 | Row7,g,12347,23456787,3456789012347,45657.87, 5678912.345687 5 | Row9,i,12349,23456789,3456789012349,45657.89, 5678912.345689 6 | Row0,j,12340,23456780,3456789012340,45657.80, 5678912.345690 7 | Row6,f,12346,23456786,3456789012346,45657.86, 5678912.345686 8 | Row3,c,12343,23456783,3456789012343,45657.83, 5678912.345683 9 | Row1,a,12345,23456789,3456789012345,45657.89, 5678912.345678 10 | Row8,h,12348,23456788,3456789012348,45657.88, 5678912.345688 11 | Row9,i,12349,23456789,3456789012349,45657.89, 5678912.345689 12 | 13 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/hbase/AggregateQueriesSuite.scala: -------------------------------------------------------------------------------- 1 | 
/* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | class AggregateQueriesSuite extends HBaseTestData { 21 | var testnm = "Group by with cols in select list and with order by" 22 | test("Group by with cols in select list and with order by") { 23 | val query = 24 | s"""SELECT count(1) as cnt, intcol, floatcol, strcol, max(bytecol) bytecol, max(shortcol) shortcol, 25 | max(floatcol) floatcolmax, max(doublecol) doublecol, max(longcol) from $DefaultTableName 26 | WHERE strcol LIKE '%Row%' AND shortcol < 12345 AND doublecol > 5678912.345681 27 | AND doublecol < 5678912.345684 28 | GROUP BY intcol, floatcol, strcol ORDER BY strcol DESC""" 29 | 30 | testGroupBy(testnm, query) 31 | } 32 | 33 | testnm = "Group by with cols in select list and with having and order by" 34 | test("Group by with cols in select list and with having and order by") { 35 | val query = s"""SELECT count(1) as cnt, intcol, floatcol, strcol, max(bytecol) bytecolmax, 36 | max(shortcol) shortcolmax, max(floatcol) floatcolmax, max(doublecol) doublecolmax, 37 | max(longcol) longcolmax 38 | FROM $DefaultTableName 39 | WHERE strcol like '%Row%' AND shortcol < 12345 AND doublecol > 5678912.345681 40 | AND doublecol < 5678912.345685 41 | GROUP BY intcol, floatcol, strcol 42 | HAVING max(doublecol) < 5678912.345684 43 | ORDER BY strcol DESC""".stripMargin 44 | testGroupBy(testnm, query) 45 | } 46 | 47 | def testGroupBy(testName: String, query: String) = { 48 | val result1 = runSql(query) 49 | assert(result1.size == 2, s"$testName failed on size") 50 | val exparr = Array( 51 | Array(1, 23456783, 45657.83F, "Row3", 'c', 12343, 45657.83F, 5678912.345683, 3456789012343L), 52 | Array(1, 23456782, 45657.82F, "Row2", 'b', 12342, 45657.82F, 5678912.345682, 3456789012342L)) 53 | 54 | val res = { 55 | for (rx <- 0 until exparr.size) 56 | yield compareWithTol(result1(rx).toSeq, exparr(rx), s"Row$rx failed") 57 | }.foldLeft(true) { case (res1, newres) => res1 && newres} 58 | assert(res, "One or more rows did not match expected") 59 | 60 | logInfo(s"$query came back with ${result1.size} results") 61 | logInfo(result1.mkString) 62 | 63 | logInfo(s"Test $testName completed successfully") 64 | } 65 | 66 | testnm = "Another Group by with cols in select list and with having and order by" 67 | test("Another Group by with cols in select list and with having and order by") { 68 | val query1 = 69 | s"""SELECT count(1) as cnt, intcol, floatcol, strcol, max(bytecol) bytecolmax, max(shortcol) shortcolmax, 70 | max(floatcol) floatcolmax, max(doublecol) doublecolmax, max(longcol) longcolmax FROM $DefaultTableName 71 | WHERE strcol LIKE '%Row%' AND shortcol < 12345 AND doublecol > 5678912.345681 72 | AND doublecol < 
5678912.345685 73 | GROUP BY intcol, floatcol, strcol HAVING max(doublecol) < 5678912.345684 ORDER BY strcol DESC""" 74 | .stripMargin 75 | 76 | val result1 = runSql(query1) 77 | assert(result1.size == 2, s"$testnm failed on size") 78 | val exparr = Array( 79 | Array(1, 23456783, 45657.83F, "Row3", 'c', 12343, 45657.83F, 5678912.345683, 3456789012343L), 80 | Array(1, 23456782, 45657.82F, "Row2", 'b', 12342, 45657.82F, 5678912.345682, 3456789012342L)) 81 | 82 | val res = { 83 | for (rx <- 0 until exparr.size) 84 | yield compareWithTol(result1(rx).toSeq, exparr(rx), s"Row$rx failed") 85 | }.foldLeft(true) { case (res1, newres) => res1 && newres} 86 | assert(res, "One or more rows did not match expected") 87 | 88 | logInfo(s"$query1 came back with ${result1.size} results") 89 | logInfo(result1.mkString) 90 | 91 | logInfo(s"Test $testnm completed successfully") 92 | } 93 | } 94 | 95 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/hbase/BasicQueriesSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | class BasicQueriesSuite extends HBaseTestData { 21 | var testnm = "StarOperator * with limit" 22 | test("StarOperator * with limit") { 23 | val query1 = 24 | s"""SELECT * FROM $DefaultTableName LIMIT 3""" 25 | .stripMargin 26 | 27 | val result1 = runSql(query1) 28 | assert(result1.size == 3, s"$testnm failed on size") 29 | val exparr = Array(Array("Row1", 'a', 12345, 23456789, 3456789012345L, 45657.89F, 5678912.345678), 30 | Array("Row2", 'b', 12342, 23456782, 3456789012342L, 45657.82F, 5678912.345682), 31 | Array("Row3", 'c', 12343, 23456783, 3456789012343L, 45657.83F, 5678912.345683)) 32 | 33 | var res = { 34 | for (rx <- 0 until 3) 35 | yield compareWithTol(result1(rx).toSeq, exparr(rx), s"Row$rx failed") 36 | }.foldLeft(true) { case (res1, newres) => res1 && newres} 37 | assert(res, "One or more rows did not match expected") 38 | 39 | logInfo(s"$query1 came back with ${result1.size} results") 40 | logInfo(result1.mkString) 41 | 42 | val sql2 = 43 | s"""SELECT * FROM $DefaultTableName LIMIT 2""" 44 | .stripMargin 45 | 46 | val results = runSql(sql2) 47 | logInfo(s"$sql2 came back with ${results.size} results") 48 | assert(results.size == 2, s"$testnm failed assertion on size") 49 | res = { 50 | for (rx <- 0 until 2) 51 | yield compareWithTol(result1(rx).toSeq, exparr(rx), s"Row$rx failed") 52 | }.foldLeft(true) { case (res1, newres) => res1 && newres} 53 | logInfo(results.mkString) 54 | assert(res, "One or more rows did not match expected") 55 | 56 | logInfo(s"Test $testnm completed successfully") 57 | } 58 | 59 | testnm = "Select all cols with filter" 60 | test("Select all cols with filter") { 61 | val query1 = 62 | s"""SELECT * FROM $DefaultTableName WHERE shortcol < 12345 LIMIT 2""" 63 | .stripMargin 64 | 65 | val result1 = runSql(query1) 66 | logInfo(s"$query1 came back with ${result1.size} results") 67 | assert(result1.size == 2, s"$testnm failed on size") 68 | val exparr = Array( 69 | Array("Row2", 'b', 12342, 23456782, 3456789012342L, 45657.82F, 5678912.345682), 70 | Array("Row3", 'c', 12343, 23456783, 3456789012343L, 45657.83F, 5678912.345683)) 71 | 72 | val res = { 73 | for (rx <- 0 until 2) 74 | yield compareWithTol(result1(rx).toSeq, exparr(rx), s"Row$rx failed") 75 | }.foldLeft(true) { case (res1, newres) => res1 && newres} 76 | logInfo(result1.mkString) 77 | assert(res, "One or more rows did not match expected") 78 | 79 | logInfo(s"Test $testnm completed successfully") 80 | } 81 | 82 | testnm = "Select all cols with order by" 83 | test("Select all cols with order by") { 84 | val query1 = 85 | s"""SELECT * FROM $DefaultTableName WHERE shortcol < 12344 ORDER BY strcol DESC LIMIT 2""" 86 | .stripMargin 87 | 88 | val result1 = runSql(query1) 89 | assert(result1.size == 2, s"$testnm failed on size") 90 | val exparr = Array( 91 | Array("Row3", 'c', 12343, 23456783, 3456789012343L, 45657.83F, 5678912.345683), 92 | Array("Row2", 'b', 12342, 23456782, 3456789012342L, 45657.82F, 5678912.345682)) 93 | 94 | val res = { 95 | for (rx <- 0 until 2) 96 | yield compareWithTol(result1(rx).toSeq, exparr(rx), s"Row$rx failed") 97 | }.foldLeft(true) { case (res1, newres) => res1 && newres} 98 | assert(res, "One or more rows did not match expected") 99 | 100 | logInfo(s"Test $testnm completed successfully") 101 | } 102 | 103 | testnm = "Select same column twice" 104 | test("Select same column twice") { 105 | val query1 = 106 | s"""SELECT doublecol AS double1, doublecol AS doublecol 107 | | FROM $DefaultTableName 108 | | WHERE 
doublecol > 5678912.345681 AND doublecol < 5678912.345683""" 109 | .stripMargin 110 | 111 | val result1 = runSql(query1) 112 | logInfo(s"$query1 came back with ${result1.size} results") 113 | assert(result1.size == 1, s"$testnm failed on size") 114 | val exparr = Array( 115 | Array(5678912.345682, 5678912.345682)) 116 | 117 | assert(result1.size == 1, s"$testnm failed assertion on size") 118 | val res = { 119 | for (rx <- 0 until 1) 120 | yield compareWithTol(result1(rx).toSeq, exparr(rx), s"Row$rx failed") 121 | }.foldLeft(true) { case (res1, newres) => res1 && newres} 122 | logInfo(result1.mkString) 123 | assert(res, "One or more rows did not match expected") 124 | 125 | logInfo(s"Test $testnm completed successfully") 126 | } 127 | 128 | testnm = "Select specific cols with filter" 129 | test("Select specific cols with filter") { 130 | val query1 = 131 | s"""SELECT doublecol AS double1, -1 * doublecol AS minusdouble, 132 | | substr(strcol, 2) as substrcol, doublecol, strcol, 133 | | bytecol, shortcol, intcol, longcol, floatcol FROM $DefaultTableName WHERE strcol LIKE 134 | | '%Row%' AND shortcol < 12345 135 | | AND doublecol > 5678912.345681 AND doublecol < 5678912.345683 LIMIT 2""" 136 | .stripMargin 137 | 138 | val result1 = runSql(query1) 139 | logInfo(s"$query1 came back with ${result1.size} results") 140 | assert(result1.size == 1, s"$testnm failed on size") 141 | val exparr = Array( 142 | Array(5678912.345682, -5678912.345682, "ow2", 5678912.345682, 143 | "Row2", 'b', 12342, 23456782, 3456789012342L, 45657.82F)) 144 | 145 | assert(result1.size == 1, s"$testnm failed assertion on size") 146 | val res = { 147 | for (rx <- 0 until 1) 148 | yield compareWithTol(result1(rx).toSeq, exparr(rx), s"Row$rx failed") 149 | }.foldLeft(true) { case (res1, newres) => res1 && newres} 150 | logInfo(result1.mkString) 151 | assert(res, "One or more rows did not match expected") 152 | 153 | logInfo(s"Test $testnm completed successfully") 154 | } 155 | 156 | testnm = "Mixed And/or predicates" 157 | test("Mixed And/or predicates") { 158 | val query1 = s"""SELECT doublecol AS double1, -1 * doublecol AS minusdouble, 159 | substr(strcol, 2) AS substrcol, doublecol, strcol, 160 | bytecol, shortcol, intcol, longcol, floatcol FROM $DefaultTableName 161 | WHERE strcol LIKE '%Row%' 162 | AND shortcol < 12345 163 | AND doublecol > 5678912.345681 AND doublecol < 5678912.345683 164 | OR (doublecol = 5678912.345683 AND strcol IS NOT NULL) 165 | OR (doublecol = 5678912.345683 AND strcol IS NOT NULL or intcol > 12345 AND intcol < 0) 166 | OR (doublecol <> 5678912.345683 AND (strcol IS NULL or intcol > 12345 AND intcol < 0)) 167 | AND floatcol IS NOT NULL 168 | AND (intcol IS NOT NULL and intcol > 0) 169 | AND (intcol < 0 OR intcol IS NOT NULL)""".stripMargin 170 | 171 | val result1 = runSql(query1) 172 | logInfo(s"$query1 came back with ${result1.size} results") 173 | assert(result1.size == 2, s"$testnm failed on size") 174 | val exparr = Array( 175 | Array(5678912.345682, -5678912.345682, "ow2", 5678912.345682, 176 | "Row2", 'b', 12342, 23456782, 3456789012342L, 45657.82F), 177 | Array(5678912.345683, -5678912.345683, "ow3", 5678912.345683, 178 | "Row3", -29, 12343, 23456783, 3456789012343L, 45657.83)) 179 | 180 | val res = { 181 | for (rx <- 0 until 1) 182 | yield compareWithTol(result1(rx).toSeq, exparr(rx), s"Row$rx failed") 183 | }.foldLeft(true) { case (res1, newres) => res1 && newres} 184 | logInfo(result1.mkString) 185 | assert(res, "One or more rows did not match expected") 186 | 187 | logInfo(s"Test $testnm 
completed successfully") 188 | } 189 | 190 | testnm = "In predicates" 191 | test("In predicates") { 192 | val query1 = s"""SELECT doublecol AS double1, -1 * doublecol AS minusdouble, 193 | substr(strcol, 2) AS substrcol, doublecol, strcol, 194 | bytecol, shortcol, intcol, longcol, floatcol FROM $DefaultTableName 195 | WHERE doublecol IN (doublecol + 5678912.345682 - doublecol, doublecol + 5678912.345683 - doublecol)""".stripMargin 196 | 197 | val result1 = runSql(query1) 198 | logInfo(s"$query1 came back with ${result1.size} results") 199 | assert(result1.size == 2, s"$testnm failed on size") 200 | val exparr = Array( 201 | Array(5678912.345682, -5678912.345682, "ow2", 5678912.345682, 202 | "Row2", 'b', 12342, 23456782, 3456789012342L, 45657.82F), 203 | Array(5678912.345683, -5678912.345683, "ow3", 5678912.345683, 204 | "Row3", -29, 12343, 23456783, 3456789012343L, 45657.83)) 205 | 206 | val res = { 207 | for (rx <- 0 until 1) 208 | yield compareWithTol(result1(rx).toSeq, exparr(rx), s"Row$rx failed") 209 | }.foldLeft(true) { case (res1, newres) => res1 && newres} 210 | logInfo(result1.mkString) 211 | assert(res, "One or more rows did not match expected") 212 | 213 | logInfo(s"Test $testnm completed successfully") 214 | } 215 | 216 | testnm = "InSet predicates" 217 | test("InSet predicates") { 218 | val query1 = s"""SELECT doublecol AS double1, -1 * doublecol AS minusdouble, 219 | substr(strcol, 2) AS substrcol, doublecol, strcol, 220 | bytecol, shortcol, intcol, longcol, floatcol FROM $DefaultTableName 221 | WHERE doublecol IN (5678912.345682, 5678912.345683)""".stripMargin 222 | 223 | val result1 = runSql(query1) 224 | logInfo(s"$query1 came back with ${result1.size} results") 225 | assert(result1.size == 2, s"$testnm failed on size") 226 | val exparr = Array( 227 | Array(5678912.345682, -5678912.345682, "ow2", 5678912.345682, 228 | "Row2", 'b', 12342, 23456782, 3456789012342L, 45657.82F), 229 | Array(5678912.345683, -5678912.345683, "ow3", 5678912.345683, 230 | "Row3", -29, 12343, 23456783, 3456789012343L, 45657.83)) 231 | 232 | val res = { 233 | for (rx <- 0 until 1) 234 | yield compareWithTol(result1(rx).toSeq, exparr(rx), s"Row$rx failed") 235 | }.foldLeft(true) { case (res1, newres) => res1 && newres} 236 | logInfo(result1.mkString) 237 | assert(res, "One or more rows did not match expected") 238 | 239 | logInfo(s"Test $testnm completed successfully") 240 | } 241 | } 242 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/hbase/BytesUtilsSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | import org.apache.spark.Logging 21 | import org.apache.hadoop.hbase.util.Bytes 22 | import org.apache.spark.sql.types._ 23 | import org.apache.spark.sql.hbase.types.HBaseBytesType 24 | import org.apache.spark.sql.hbase.util.BytesUtils 25 | import org.scalatest.{BeforeAndAfterAll, FunSuite} 26 | 27 | class BytesUtilsSuite extends FunSuite with BeforeAndAfterAll with Logging { 28 | test("Bytes Ordering Test") { 29 | val s = Seq(-257, -256, -255, -129, -128, -127, -64, -16, -4, -1, 30 | 0, 1, 4, 16, 64, 127, 128, 129, 255, 256, 257) 31 | val result = s.map(i => (i, BytesUtils.create(IntegerType).toBytes(i))) 32 | .sortWith((f, s) => 33 | HBaseBytesType.ordering.gt( 34 | f._2.asInstanceOf[HBaseBytesType.InternalType], s._2.asInstanceOf[HBaseBytesType.InternalType])) 35 | assert(result.map(a => a._1).toSeq == s.sorted.reverse) 36 | } 37 | 38 | def compare(a: Array[Byte], b: Array[Byte]): Int = { 39 | val length = Math.min(a.length, b.length) 40 | var result: Int = 0 41 | for (i <- 0 to length - 1) { 42 | val diff: Int = (a(i) & 0xff).asInstanceOf[Byte] - (b(i) & 0xff).asInstanceOf[Byte] 43 | if (diff != 0) { 44 | result = diff 45 | } 46 | } 47 | result 48 | } 49 | 50 | test("Bytes Utility Test") { 51 | assert(BytesUtils.toBoolean(BytesUtils.create(BooleanType) 52 | .toBytes(input = true), 0) === true) 53 | assert(BytesUtils.toBoolean(BytesUtils.create(BooleanType) 54 | .toBytes(input = false), 0) === false) 55 | 56 | assert(BytesUtils.toDouble(BytesUtils.create(DoubleType).toBytes(12.34d), 0) 57 | === 12.34d) 58 | assert(BytesUtils.toDouble(BytesUtils.create(DoubleType).toBytes(-12.34d), 0) 59 | === -12.34d) 60 | 61 | assert(BytesUtils.toFloat(BytesUtils.create(FloatType).toBytes(12.34f), 0) 62 | === 12.34f) 63 | assert(BytesUtils.toFloat(BytesUtils.create(FloatType).toBytes(-12.34f), 0) 64 | === -12.34f) 65 | 66 | assert(BytesUtils.toInt(BytesUtils.create(IntegerType).toBytes(12), 0) 67 | === 12) 68 | assert(BytesUtils.toInt(BytesUtils.create(IntegerType).toBytes(-12), 0) 69 | === -12) 70 | 71 | assert(BytesUtils.toLong(BytesUtils.create(LongType).toBytes(1234l), 0) 72 | === 1234l) 73 | assert(BytesUtils.toLong(BytesUtils.create(LongType).toBytes(-1234l), 0) 74 | === -1234l) 75 | 76 | assert(BytesUtils.toShort(BytesUtils.create(ShortType) 77 | .toBytes(12.asInstanceOf[Short]), 0) === 12) 78 | assert(BytesUtils.toShort(BytesUtils.create(ShortType) 79 | .toBytes(-12.asInstanceOf[Short]), 0) === -12) 80 | 81 | assert(BytesUtils.toString(BytesUtils.create(StringType).toBytes("abc"), 0, 3) 82 | === "abc") 83 | assert(BytesUtils.toString(BytesUtils.create(StringType).toBytes(""), 0, 0) === "") 84 | 85 | assert(BytesUtils.toByte(BytesUtils.create(ByteType) 86 | .toBytes(5.asInstanceOf[Byte]), 0) === 5) 87 | assert(BytesUtils.toByte(BytesUtils.create(ByteType) 88 | .toBytes(-5.asInstanceOf[Byte]), 0) === -5) 89 | 90 | assert(compare(BytesUtils.create(IntegerType).toBytes(128), 91 | BytesUtils.create(IntegerType).toBytes(-128)) > 0) 92 | } 93 | 94 | test("byte array plus one") { 95 | var byteArray = Array[Byte](0x01.toByte, 127.toByte) 96 | assert(Bytes.compareTo(BytesUtils.addOne(byteArray), Array[Byte](0x01.toByte, 0x80.toByte)) == 0) 97 | 98 | byteArray = Array[Byte](0xff.toByte, 0xff.toByte) 99 | assert(BytesUtils.addOne(byteArray) == null) 100 | 101 | byteArray = Array[Byte](0x02.toByte, 0xff.toByte) 102 | assert(Bytes.compareTo(BytesUtils.addOne(byteArray), Array[Byte](0x03.toByte, 0x00.toByte)) == 0) 103 | } 104 | 105 | test("float 
comparison") { 106 | val f1 = BytesUtils.create(FloatType).toBytes(-1.23f) 107 | val f2 = BytesUtils.create(FloatType).toBytes(100f) 108 | assert(Bytes.compareTo(f1, f2) < 0) 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/hbase/CatalogTestSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.spark.sql.hbase 18 | 19 | import org.apache.hadoop.hbase._ 20 | import org.apache.hadoop.hbase.client.HBaseAdmin 21 | import org.apache.spark.sql.catalyst.expressions.GenericRow 22 | import org.apache.spark.sql.catalyst.plans.logical.Subquery 23 | import org.apache.spark.sql.types._ 24 | import org.apache.spark.sql.hbase.util.HBaseKVHelper 25 | import org.apache.spark.sql.sources.LogicalRelation 26 | 27 | class CatalogTestSuite extends HBaseIntegrationTestBase { 28 | val (catalog, configuration) = (TestHbase.catalog, TestHbase.sparkContext.hadoopConfiguration) 29 | 30 | test("Create Table") { 31 | // prepare the test data 32 | val namespace = "testNamespace" 33 | val tableName = "testTable" 34 | val hbaseTableName = "hbaseTable" 35 | val family1 = "family1" 36 | val family2 = "family2" 37 | 38 | if (!catalog.checkHBaseTableExists(hbaseTableName)) { 39 | val admin = new HBaseAdmin(configuration) 40 | val desc = new HTableDescriptor(TableName.valueOf(hbaseTableName)) 41 | desc.addFamily(new HColumnDescriptor(family1)) 42 | desc.addFamily(new HColumnDescriptor(family2)) 43 | admin.createTable(desc) 44 | } 45 | 46 | var allColumns = List[AbstractColumn]() 47 | allColumns = allColumns :+ KeyColumn("column2", IntegerType, 1) 48 | allColumns = allColumns :+ KeyColumn("column1", StringType, 0) 49 | allColumns = allColumns :+ NonKeyColumn("column4", FloatType, family2, "qualifier2") 50 | allColumns = allColumns :+ NonKeyColumn("column3", BooleanType, family1, "qualifier1") 51 | 52 | val splitKeys: Array[Array[Byte]] = Array( 53 | new GenericRow(Array(1024.0, "Upen", 128: Short)), 54 | new GenericRow(Array(1024.0, "Upen", 256: Short)), 55 | new GenericRow(Array(4096.0, "SF", 512: Short)) 56 | ).map(HBaseKVHelper.makeRowKey(_, Seq(DoubleType, StringType, ShortType))) 57 | 58 | catalog.createTable(tableName, namespace, hbaseTableName, allColumns, splitKeys) 59 | 60 | assert(catalog.checkLogicalTableExist(tableName) === true) 61 | } 62 | 63 | test("Get Table") { 64 | // prepare the test data 65 | val hbaseNamespace = "testNamespace" 66 | val tableName = "testTable" 67 | val hbaseTableName = "hbaseTable" 68 | 69 | val oresult = catalog.getTable(tableName) 70 | assert(oresult.isDefined) 71 | val result = oresult.get 72 | 
assert(result.tableName === tableName) 73 | assert(result.hbaseNamespace === hbaseNamespace) 74 | assert(result.hbaseTableName === hbaseTableName) 75 | assert(result.keyColumns.size === 2) 76 | assert(result.nonKeyColumns.size === 2) 77 | assert(result.allColumns.size === 4) 78 | 79 | // check the data type 80 | assert(result.keyColumns(0).dataType === StringType) 81 | assert(result.keyColumns(1).dataType === IntegerType) 82 | assert(result.nonKeyColumns(1).dataType === FloatType) 83 | assert(result.nonKeyColumns(0).dataType === BooleanType) 84 | 85 | val relation = catalog.lookupRelation(Seq(tableName)) 86 | val subquery = relation.asInstanceOf[Subquery] 87 | val hbRelation = subquery.child.asInstanceOf[LogicalRelation].relation.asInstanceOf[HBaseRelation] 88 | assert(hbRelation.nonKeyColumns.map(_.family) == List("family1", "family2")) 89 | val keyColumns = Seq(KeyColumn("column1", StringType, 0), KeyColumn("column2", IntegerType, 1)) 90 | assert(hbRelation.keyColumns.equals(keyColumns)) 91 | assert(relation.childrenResolved) 92 | } 93 | 94 | test("Alter Table") { 95 | val tableName = "testTable" 96 | 97 | val family1 = "family1" 98 | val column = NonKeyColumn("column5", BooleanType, family1, "qualifier3") 99 | 100 | catalog.alterTableAddNonKey(tableName, column) 101 | 102 | var result = catalog.getTable(tableName) 103 | var table = result.get 104 | assert(table.allColumns.size === 5) 105 | 106 | catalog.alterTableDropNonKey(tableName, column.sqlName) 107 | result = catalog.getTable(tableName) 108 | table = result.get 109 | assert(table.allColumns.size === 4) 110 | } 111 | 112 | test("Delete Table") { 113 | // prepare the test data 114 | val tableName = "testTable" 115 | 116 | catalog.deleteTable(tableName) 117 | 118 | assert(catalog.checkLogicalTableExist(tableName) === false) 119 | } 120 | 121 | test("Check Logical Table Exist") { 122 | val tableName = "non-exist" 123 | 124 | assert(catalog.checkLogicalTableExist(tableName) === false) 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/hbase/HBaseAdvancedSQLQuerySuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | import org.apache.spark.sql.types._ 21 | import org.apache.spark.sql.{SQLConf, _} 22 | 23 | class HBaseAdvancedSQLQuerySuite extends HBaseSplitTestData { 24 | import org.apache.spark.sql.hbase.TestHbase._ 25 | import org.apache.spark.sql.hbase.TestHbase.implicits._ 26 | 27 | test("aggregation with codegen") { 28 | val originalValue = TestHbase.conf.codegenEnabled 29 | setConf(SQLConf.CODEGEN_ENABLED, "true") 30 | val result = sql("SELECT col1 FROM ta GROUP BY col1").collect() 31 | assert(result.size == 14, s"aggregation with codegen test failed on size") 32 | setConf(SQLConf.CODEGEN_ENABLED, originalValue.toString) 33 | } 34 | 35 | test("dsl simple select 0") { 36 | val tableA = sql("SELECT * FROM ta") 37 | checkAnswer( 38 | tableA.where('col7 === 1).orderBy('col2.asc).select('col4), 39 | Row(1) :: Nil) 40 | checkAnswer( 41 | tableA.where('col2 === 6).orderBy('col2.asc).select('col7), 42 | Row(-31) :: Nil) 43 | } 44 | 45 | test("metadata is propagated correctly") { 46 | val tableA = sql("SELECT col7, col1, col3 FROM ta") 47 | val schema = tableA.schema 48 | val docKey = "doc" 49 | val docValue = "first name" 50 | val metadata = new MetadataBuilder() 51 | .putString(docKey, docValue) 52 | .build() 53 | val schemaWithMeta = new StructType(Array( 54 | schema("col7"), schema("col1").copy(metadata = metadata), schema("col3"))) 55 | val personWithMeta = createDataFrame(tableA.rdd, schemaWithMeta) 56 | def validateMetadata(rdd: DataFrame): Unit = { 57 | assert(rdd.schema("col1").metadata.getString(docKey) == docValue) 58 | } 59 | personWithMeta.registerTempTable("personWithMeta") 60 | validateMetadata(personWithMeta.select($"col1")) 61 | validateMetadata(personWithMeta.select($"col1")) 62 | validateMetadata(personWithMeta.select($"col7", $"col1")) 63 | validateMetadata(sql("SELECT * FROM personWithMeta")) 64 | validateMetadata(sql("SELECT col7, col1 FROM personWithMeta")) 65 | validateMetadata(sql("SELECT * FROM personWithMeta JOIN salary ON col7 = personId")) 66 | validateMetadata(sql("SELECT col1, salary FROM personWithMeta JOIN salary ON col7 = personId")) 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/hbase/HBaseBasicOperationSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
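// Illustrative sketch, not part of the original sources: the operation and insert suites that
// follow all create logical tables with the MAPPED BY clause. This shows that DDL shape only;
// the table and column names here are hypothetical, and TestHbase.sql is invoked the same way
// the suites below invoke it (it requires the mini HBase cluster those suites bring up).
import org.apache.spark.sql.hbase.TestHbase

object MappedByDdlSketch extends App {
  // logical columns plus PRIMARY KEY, mapped onto an HBase table and its
  // columnFamily.qualifier pairs for the non-key columns
  TestHbase.sql(
    """CREATE TABLE demo (rowkey STRING, v INTEGER, PRIMARY KEY(rowkey))
      |MAPPED BY (hdemo, COLS=[v=cf.q])""".stripMargin)
  TestHbase.sql("""INSERT INTO demo VALUES ("r1", 1024)""")
  TestHbase.sql("""SELECT * FROM demo""").collect().foreach(println)
  TestHbase.sql("""DROP TABLE demo""")
}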
16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | /** 21 | * Test insert / query against the table 22 | */ 23 | class HBaseBasicOperationSuite extends HBaseSplitTestData { 24 | import org.apache.spark.sql.hbase.TestHbase._ 25 | 26 | override def afterAll() = { 27 | if (TestHbase.hbaseAdmin.tableExists("ht0")) { 28 | TestHbase.hbaseAdmin.disableTable("ht0") 29 | TestHbase.hbaseAdmin.deleteTable("ht0") 30 | } 31 | if (TestHbase.hbaseAdmin.tableExists("ht1")) { 32 | TestHbase.hbaseAdmin.disableTable("ht1") 33 | TestHbase.hbaseAdmin.deleteTable("ht1") 34 | } 35 | super.afterAll() 36 | } 37 | 38 | test("Insert Into table0") { 39 | sql( """CREATE TABLE tb0 (column2 INTEGER, column1 INTEGER, column4 FLOAT, 40 | column3 SHORT, PRIMARY KEY(column1, column2)) 41 | MAPPED BY (testNamespace.ht0, COLS=[column3=family1.qualifier1, 42 | column4=family2.qualifier2])""" 43 | ) 44 | 45 | assert(sql( """SELECT * FROM tb0""").collect().size == 0) 46 | sql( """INSERT INTO tb0 SELECT col4,col4,col6,col3 FROM ta""") 47 | assert(sql( """SELECT * FROM tb0""").collect().size == 14) 48 | 49 | sql( """DROP TABLE tb0""") 50 | } 51 | 52 | test("Insert and Query Single Row") { 53 | sql( """CREATE TABLE tb1 (column1 INTEGER, column2 STRING, 54 | PRIMARY KEY(column1)) 55 | MAPPED BY (ht1, COLS=[column2=cf.cq])""" 56 | ) 57 | 58 | assert(sql( """SELECT * FROM tb1""").collect().size == 0) 59 | sql( """INSERT INTO tb1 VALUES (1024, "abc")""") 60 | sql( """INSERT INTO tb1 VALUES (1028, "abd")""") 61 | assert(sql( """SELECT * FROM tb1""").collect().size == 2) 62 | assert( 63 | sql( """SELECT * FROM tb1 WHERE (column1 = 1023 AND column2 ="abc")""").collect().size == 0) 64 | assert(sql( 65 | """SELECT * FROM tb1 WHERE (column1 = 1024) 66 | |OR (column1 = 1028 AND column2 ="abd")""".stripMargin).collect().size == 2) 67 | 68 | sql( """DROP TABLE tb1""") 69 | } 70 | 71 | test("Select test 0") { 72 | assert(sql( """SELECT * FROM ta""").count() == 14) 73 | } 74 | 75 | test("Count(*/1) and Non-Key Column Query") { 76 | assert(sql( """SELECT count(*) FROM ta""").collect()(0).get(0) == 14) 77 | assert(sql( """SELECT count(*) FROM ta where col2 < 8""").collect()(0).get(0) == 7) 78 | assert(sql( """SELECT count(*) FROM ta where col4 < 0""").collect()(0).get(0) == 7) 79 | assert(sql( """SELECT count(1) FROM ta where col2 < 8""").collect()(0).get(0) == 7) 80 | assert(sql( """SELECT count(1) FROM ta where col4 < 0""").collect()(0).get(0) == 7) 81 | } 82 | 83 | test("InSet Query") { 84 | assert(sql( """SELECT count(*) FROM ta where col2 IN (1, 2, 3)""").collect()(0).get(0) == 3) 85 | assert(sql( """SELECT count(*) FROM ta where col4 IN (1, 2, 3)""").collect()(0).get(0) == 1) 86 | } 87 | 88 | test("Select test 1 (AND, OR)") { 89 | assert(sql( """SELECT * FROM ta WHERE col7 = 255 OR col7 = 127""").collect().size == 2) 90 | assert(sql( """SELECT * FROM ta WHERE col7 < 0 AND col4 < -255""").collect().size == 4) 91 | } 92 | 93 | test("Select test 2 (WHERE)") { 94 | assert(sql( """SELECT * FROM ta WHERE col7 > 128""").count() == 3) 95 | assert(sql( """SELECT * FROM ta WHERE (col7 - 10 > 128) AND col1 = ' p255 '""").collect().size == 1) 96 | } 97 | 98 | test("Select test 3 (ORDER BY)") { 99 | val result = sql( """SELECT col1, col7 FROM ta ORDER BY col7 DESC""").collect() 100 | val sortedResult = result.sortWith( 101 | (r1, r2) => r1(1).asInstanceOf[Int] > r2(1).asInstanceOf[Int]) 102 | for ((r1, r2) <- result zip sortedResult) { 103 | assert(r1.equals(r2)) 104 | } 105 | } 106 | 107 | test("Select test 4 (join)") { 108 | assert(sql( 
"""SELECT ta.col2 FROM ta join tb on ta.col4=tb.col7""").collect().size == 2) 109 | assert(sql( """SELECT * FROM ta FULL OUTER JOIN tb WHERE tb.col7 = 1""").collect().size == 14) 110 | assert(sql( """SELECT * FROM ta LEFT JOIN tb WHERE tb.col7 = 1""").collect().size == 14) 111 | assert(sql( """SELECT * FROM ta RIGHT JOIN tb WHERE tb.col7 = 1""").collect().size == 14) 112 | } 113 | 114 | test("Alter Add column and Alter Drop column") { 115 | assert(sql( """SELECT * FROM ta""").collect()(0).size == 7) 116 | sql( """ALTER TABLE ta ADD col8 STRING MAPPED BY (col8 = cf1.cf13)""") 117 | assert(sql( """SELECT * FROM ta""").collect()(0).size == 8) 118 | sql( """ALTER TABLE ta DROP col8""") 119 | assert(sql( """SELECT * FROM ta""").collect()(0).size == 7) 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/hbase/HBaseInsertTableSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | import org.apache.spark.sql.Row 21 | 22 | class HBaseInsertTableSuite extends HBaseTestData { 23 | 24 | var testnm = "Insert all rows to the table from other table" 25 | test("Insert all rows to the table from other table") { 26 | val createQuery = s"""CREATE TABLE insertTestTable(strcol STRING, bytecol BYTE, shortcol SHORT, 27 | intcol INTEGER, longcol LONG, floatcol FLOAT, doublecol DOUBLE, 28 | PRIMARY KEY(doublecol, strcol, intcol)) 29 | MAPPED BY (hinsertTestTable, COLS=[bytecol=cf1.hbytecol, 30 | shortcol=cf1.hshortcol, longcol=cf2.hlongcol, floatcol=cf2.hfloatcol])""" 31 | .stripMargin 32 | runSql(createQuery) 33 | 34 | val insertQuery = 35 | s"""INSERT INTO insertTestTable SELECT * FROM $DefaultTableName""" 36 | .stripMargin 37 | runSql(insertQuery) 38 | 39 | val testQuery = "SELECT * FROM insertTestTable" 40 | val testResult = runSql(testQuery) 41 | val targetResult = runSql(s"SELECT * FROM $DefaultTableName") 42 | assert(testResult.size == targetResult.size, s"$testnm failed on size") 43 | 44 | compareResults(testResult, targetResult) 45 | 46 | runSql("DROP TABLE insertTestTable") 47 | } 48 | 49 | testnm = "Insert few rows to the table from other table after applying filter" 50 | test("Insert few rows to the table from other table after applying filter") { 51 | val createQuery = s"""CREATE TABLE insertTestTableFilter(strcol STRING, bytecol BYTE, 52 | shortcol SHORT, intcol INTEGER, longcol LONG, floatcol FLOAT, doublecol DOUBLE, 53 | PRIMARY KEY(doublecol, strcol, intcol)) 54 | MAPPED BY (hinsertTestTableFilter, COLS=[bytecol=cf1.hbytecol, 55 | shortcol=cf1.hshortcol, longcol=cf2.hlongcol, floatcol=cf2.hfloatcol])""" 56 | .stripMargin 57 | runSql(createQuery) 58 | 59 | val insertQuery = 60 | s"""insert into insertTestTableFilter select * from $DefaultTableName 61 | where doublecol > 5678912.345681""" 62 | .stripMargin 63 | runSql(insertQuery) 64 | 65 | val testQuery = "select * from insertTestTableFilter" 66 | val testResult = runSql(testQuery) 67 | val targetResult = runSql(s"select * from $DefaultTableName where doublecol > 5678912.345681") 68 | assert(testResult.size == targetResult.size, s"$testnm failed on size") 69 | 70 | compareResults(testResult, targetResult) 71 | 72 | runSql("Drop Table insertTestTableFilter") 73 | } 74 | 75 | def compareResults(fetchResult: Array[Row], targetResult: Array[Row]) = { 76 | val res = { 77 | for (rx <- 0 until targetResult.size) 78 | yield compareWithTol(fetchResult(rx).toSeq, targetResult(rx).toSeq, s"Row$rx failed") 79 | }.foldLeft(true) { case (res1, newres) => res1 && newres} 80 | assert(res, "One or more rows did not match expected") 81 | } 82 | 83 | testnm = "Insert few columns to the table from other table" 84 | test("Insert few columns to the table from other table") { 85 | val createQuery = s"""CREATE TABLE insertTestTableFewCols(strcol STRING, bytecol BYTE, 86 | shortcol SHORT, intcol INTEGER, PRIMARY KEY(strcol, intcol)) 87 | MAPPED BY (hinsertTestTableFewCols, COLS=[bytecol=cf1.hbytecol, 88 | shortcol=cf1.hshortcol])""" 89 | .stripMargin 90 | runSql(createQuery) 91 | 92 | val insertQuery = 93 | s"""INSERT INTO insertTestTableFewCols SELECT strcol, bytecol, 94 | shortcol, intcol FROM $DefaultTableName ORDER BY strcol""" 95 | .stripMargin 96 | runSql(insertQuery) 97 | 98 | val testQuery = 99 | "SELECT strcol, bytecol, shortcol, intcol FROM insertTestTableFewCols ORDER BY strcol" 100 | val testResult = runSql(testQuery) 101 | val targetResult = 102 | runSql(s"SELECT 
strcol, bytecol, shortcol, intcol FROM $DefaultTableName ORDER BY strcol") 103 | assert(testResult.size == targetResult.size, s"$testnm failed on size") 104 | 105 | compareResults(testResult, targetResult) 106 | 107 | runSql("DROP TABLE insertTestTableFewCols") 108 | } 109 | 110 | testnm = "Insert into values test" 111 | test("Insert into values test") { 112 | val createQuery = s"""CREATE TABLE insertValuesTest(strcol STRING, bytecol BYTE, 113 | shortcol SHORT, intcol INTEGER, PRIMARY KEY(strcol, intcol)) 114 | MAPPED BY (hinsertValuesTest, COLS=[bytecol=cf1.hbytecol, 115 | shortcol=cf1.hshortcol])""" 116 | .stripMargin 117 | runSql(createQuery) 118 | 119 | val insertQuery1 = s"INSERT INTO insertValuesTest VALUES('Row0','a',12340,23456780)" 120 | val insertQuery2 = s"INSERT INTO insertValuesTest VALUES('Row1','b',12345,23456789)" 121 | val insertQuery3 = s"INSERT INTO insertValuesTest VALUES('Row2','c',12342,23456782)" 122 | runSql(insertQuery1) 123 | runSql(insertQuery2) 124 | runSql(insertQuery3) 125 | 126 | val testQuery = "SELECT * FROM insertValuesTest ORDER BY strcol" 127 | val testResult = runSql(testQuery) 128 | assert(testResult.size == 3, s"$testnm failed on size") 129 | 130 | val exparr = Array(Array("Row0", 'a', 12340, 23456780), 131 | Array("Row1", 'b', 12345, 23456789), 132 | Array("Row2", 'c', 12342, 23456782)) 133 | 134 | val res = { 135 | for (rx <- 0 until 3) 136 | yield compareWithTol(testResult(rx).toSeq, exparr(rx), s"Row$rx failed") 137 | }.foldLeft(true) { case (res1, newres) => res1 && newres} 138 | assert(res, "One or more rows did not match expected") 139 | 140 | runSql("DROP TABLE insertValuesTest") 141 | } 142 | 143 | testnm = "Insert nullable values test" 144 | test("Insert nullable values test") { 145 | val createQuery = s"""CREATE TABLE insertNullValuesTest(strcol STRING, bytecol BYTE, 146 | shortcol SHORT, intcol INTEGER, PRIMARY KEY(strcol)) 147 | MAPPED BY (hinsertNullValuesTest, COLS=[bytecol=cf1.hbytecol, 148 | shortcol=cf1.hshortcol, intcol=cf1.hintcol])""" 149 | .stripMargin 150 | runSql(createQuery) 151 | 152 | val insertQuery1 = s"INSERT INTO insertNullValuesTest VALUES('Row0', null, 12340, 23456780)" 153 | val insertQuery2 = s"INSERT INTO insertNullValuesTest VALUES('Row1', 'b', null, 23456789)" 154 | val insertQuery3 = s"INSERT INTO insertNullValuesTest VALUES('Row2', 'c', 12342, null)" 155 | runSql(insertQuery1) 156 | runSql(insertQuery2) 157 | runSql(insertQuery3) 158 | 159 | val selectAllQuery = "SELECT * FROM insertNullValuesTest ORDER BY strcol" 160 | val selectAllResult = runSql(selectAllQuery) 161 | 162 | assert(selectAllResult.size == 3, s"$testnm failed on size") 163 | 164 | var currentResultRow: Int = 0 165 | 166 | // check 1st result row 167 | assert(selectAllResult(currentResultRow).length == 4, s"$testnm failed on row size (# of cols)") 168 | assert(selectAllResult(currentResultRow)(0) === s"Row0", s"$testnm failed on returned Row0, key value") 169 | assert(selectAllResult(currentResultRow)(1) == null, s"$testnm failed on returned Row0, null col1 value") 170 | assert(selectAllResult(currentResultRow)(2) == 12340, s"$testnm failed on returned Row0, col2 value") 171 | assert(selectAllResult(currentResultRow)(3) == 23456780, s"$testnm failed on returned Row0, col3 value") 172 | 173 | currentResultRow += 1 174 | 175 | // check 2nd result row 176 | assert(selectAllResult(currentResultRow)(0) === s"Row1", s"$testnm failed on returned Row1, key value") 177 | // skip comparison of actual and expected bytecol value 178 | 
assert(selectAllResult(currentResultRow)(2) == null, s"$testnm failed on returned Row1, null col2 value") 179 | assert(selectAllResult(currentResultRow)(3) == 23456789, s"$testnm failed on returned Row1, col3 value") 180 | 181 | currentResultRow += 1 182 | 183 | // check 3rd result row 184 | assert(selectAllResult(currentResultRow)(0) === s"Row2", s"$testnm failed on returned Row2, key value") 185 | // skip comparison of actual and expected bytecol value 186 | assert(selectAllResult(currentResultRow)(2) == 12342, s"$testnm failed on returned Row2, col2 value") 187 | assert(selectAllResult(currentResultRow)(3) == null, s"$testnm failed on returned Row2, null col3 value") 188 | 189 | // test 'where col is not null' 190 | 191 | val selectWhereIsNotNullQuery = "SELECT * FROM insertNullValuesTest WHERE intcol IS NOT NULL ORDER BY strcol" 192 | val selectWhereIsNotNullResult = runSql(selectWhereIsNotNullQuery) 193 | assert(selectWhereIsNotNullResult.size == 2, s"$testnm failed on size") 194 | 195 | currentResultRow = 0 196 | // check 1st result row 197 | assert(selectWhereIsNotNullResult(currentResultRow)(0) === s"Row0", s"$testnm failed on returned Row0, key value") 198 | assert(selectWhereIsNotNullResult(currentResultRow)(1) == null, s"$testnm failed on returned Row0, null col1 value") 199 | assert(selectWhereIsNotNullResult(currentResultRow)(2) == 12340, s"$testnm failed on returned Row0, col2 value") 200 | assert(selectWhereIsNotNullResult(currentResultRow)(3) == 23456780, s"$testnm failed on returned Row0, col3 value") 201 | 202 | currentResultRow += 1 203 | // check 2nd result row 204 | assert(selectWhereIsNotNullResult(currentResultRow)(0) === s"Row1", s"$testnm failed on returned Row1, key value") 205 | // skip comparison of actual and expected bytecol value 206 | assert(selectWhereIsNotNullResult(currentResultRow)(2) == null, s"$testnm failed on returned Row1, null col2 value") 207 | assert(selectWhereIsNotNullResult(currentResultRow)(3) == 23456789, s"$testnm failed on returned Row1, col3 value") 208 | 209 | 210 | runSql(" Drop Table insertNullValuesTest") 211 | } 212 | 213 | 214 | } 215 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/hbase/HBaseIntegrationTestBase.scala: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | 19 | package org.apache.spark.sql.hbase 20 | 21 | import java.util.Date 22 | 23 | import org.apache.spark.Logging 24 | import org.apache.spark.sql.catalyst.plans.logical 25 | import org.apache.spark.sql.catalyst.util._ 26 | import org.apache.spark.sql.{DataFrame, Row} 27 | import org.scalatest.{BeforeAndAfterAll, FunSuite, Suite} 28 | 29 | abstract class HBaseIntegrationTestBase 30 | extends FunSuite with BeforeAndAfterAll with Logging { 31 | self: Suite => 32 | 33 | val startTime = (new Date).getTime 34 | 35 | /** 36 | * Runs the plan and makes sure the answer matches the expected result. 37 | * @param rdd the [[DataFrame]] to be executed 38 | * @param expectedAnswer the expected result, can either be an Any, Seq[Product], or Seq[ Seq[Any] ]. 39 | */ 40 | protected def checkAnswer(rdd: DataFrame, expectedAnswer: Seq[Row]): Unit = { 41 | val isSorted = rdd.logicalPlan.collect { case s: logical.Sort => s}.nonEmpty 42 | def prepareAnswer(answer: Seq[Row]): Seq[Row] = { 43 | // Converts data to types that we can do equality comparison using Scala collections. 44 | // For BigDecimal type, the Scala type has a better definition of equality test (similar to 45 | // Java's java.math.BigDecimal.compareTo). 46 | val converted: Seq[Row] = answer.map { s => 47 | Row.fromSeq(s.toSeq.map { 48 | case d: java.math.BigDecimal => BigDecimal(d) 49 | case o => o 50 | }) 51 | } 52 | if (!isSorted) converted.sortBy(_.toString()) else converted 53 | } 54 | val sparkAnswer = try rdd.collect().toSeq catch { 55 | case e: Exception => 56 | fail( 57 | s""" 58 | |Exception thrown while executing query: 59 | |${rdd.queryExecution} 60 | |== Exception == 61 | |$e 62 | |${org.apache.spark.sql.catalyst.util.stackTraceToString(e)} 63 | """.stripMargin) 64 | } 65 | 66 | if (prepareAnswer(expectedAnswer) != prepareAnswer(sparkAnswer)) { 67 | fail( s""" 68 | |Results do not match for query: 69 | |${rdd.logicalPlan} 70 | |== Analyzed Plan == 71 | |${rdd.queryExecution.analyzed} 72 | |== Physical Plan == 73 | |${rdd.queryExecution.executedPlan} 74 | |== Results == 75 | |${ 76 | sideBySide( 77 | s"== Correct Answer - ${expectedAnswer.size} ==" +: 78 | prepareAnswer(expectedAnswer).map(_.toString()), 79 | s"== Spark Answer - ${sparkAnswer.size} ==" +: 80 | prepareAnswer(sparkAnswer).map(_.toString())).mkString("\n") 81 | } 82 | """.stripMargin) 83 | } 84 | } 85 | 86 | protected def checkAnswer(rdd: DataFrame, expectedAnswer: Row): Unit = { 87 | checkAnswer(rdd, Seq(expectedAnswer)) 88 | } 89 | 90 | def runSql(sql: String):Array[Row] = { 91 | logInfo(sql) 92 | TestHbase.sql(sql).collect() 93 | } 94 | 95 | override protected def afterAll(): Unit = { 96 | val msg = s"Test ${getClass.getName} completed at ${(new java.util.Date).toString} duration=${((new java.util.Date).getTime - startTime) / 1000}" 97 | logInfo(msg) 98 | } 99 | 100 | val CompareTol = 1e-6 101 | 102 | def compareWithTol(actarr: Seq[Any], exparr: Seq[Any], emsg: String): Boolean = { 103 | actarr.zip(exparr).forall { case (aa, ee) => 104 | val eq = (aa, ee) match { 105 | case (a: Double, e: Double) => 106 | Math.abs(a - e) <= CompareTol 107 | case (a: Float, e: Float) => 108 | Math.abs(a - e) <= CompareTol 109 | case (a: Byte, e) => true //For now, we assume it is ok 110 | case (a, e) => 111 | if(a == null && e == null) { 112 | logDebug(s"a=null e=null") 113 | } else { 114 | logDebug(s"atype=${a.getClass.getName} etype=${e.getClass.getName}") 115 | } 116 | a == e 117 | case _ => throw new IllegalArgumentException("Expected tuple") 118 | } 119 | if (!eq) { 
120 | logError(s"$emsg: Mismatch- act=$aa exp=$ee") 121 | } 122 | eq 123 | } 124 | } 125 | 126 | def verify(testName: String, sql: String, result1: Seq[Seq[Any]], exparr: Seq[Seq[Any]]) = { 127 | val res = { 128 | for (rx <- 0 until exparr.size) 129 | yield compareWithTol(result1(rx).toSeq, exparr(rx), s"Row$rx failed") 130 | }.foldLeft(true) { case (res1, newres) => res1 && newres} 131 | 132 | logInfo(s"$sql came back with ${result1.size} results") 133 | logInfo(result1.mkString) 134 | assert(res, "One or more rows did not match expected") 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/hbase/HBaseSplitTestData.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | import java.io.{ByteArrayOutputStream, DataOutputStream} 21 | 22 | import org.apache.hadoop.hbase._ 23 | import org.apache.hadoop.hbase.client._ 24 | import org.apache.hadoop.hbase.util.Bytes 25 | import org.apache.spark.sql.catalyst.expressions.{GenericRow, Row} 26 | import org.apache.spark.sql.types._ 27 | import org.apache.spark.sql.hbase.util.{DataTypeUtils, HBaseKVHelper, BytesUtils} 28 | 29 | /** 30 | * HBaseMainTest 31 | * create HbTestTable and metadata table, and insert some data 32 | */ 33 | class HBaseSplitTestData extends HBaseIntegrationTestBase 34 | { 35 | val TableName_a: String = "ta" 36 | val TableName_b: String = "tb" 37 | val HbaseTableName: String = "ht" 38 | val Metadata_Table = "metadata" 39 | var alreadyInserted = false 40 | 41 | override protected def beforeAll() = { 42 | super.beforeAll() 43 | setupData(useMultiplePartitions = true, needInsertData = true) 44 | TestData 45 | } 46 | 47 | override protected def afterAll() = { 48 | TestHbase.sql("DROP TABLE " + TableName_a) 49 | TestHbase.sql("DROP TABLE " + TableName_b) 50 | } 51 | 52 | def createTable(useMultiplePartitions: Boolean) = { 53 | try { 54 | // delete the existing hbase table 55 | if (TestHbase.hbaseAdmin.tableExists(HbaseTableName)) { 56 | TestHbase.hbaseAdmin.disableTable(HbaseTableName) 57 | TestHbase.hbaseAdmin.deleteTable(HbaseTableName) 58 | } 59 | 60 | if (TestHbase.hbaseAdmin.tableExists(Metadata_Table)) { 61 | TestHbase.hbaseAdmin.disableTable(Metadata_Table) 62 | TestHbase.hbaseAdmin.deleteTable(Metadata_Table) 63 | } 64 | 65 | var allColumns = List[AbstractColumn]() 66 | allColumns = allColumns :+ KeyColumn("col1", StringType, 1) 67 | allColumns = allColumns :+ NonKeyColumn("col2", ByteType, "cf1", "cq11") 68 | allColumns = allColumns :+ KeyColumn("col3", ShortType, 2) 69 | allColumns = allColumns :+ NonKeyColumn("col4", 
IntegerType, "cf1", "cq12") 70 | allColumns = allColumns :+ NonKeyColumn("col5", LongType, "cf2", "cq21") 71 | allColumns = allColumns :+ NonKeyColumn("col6", FloatType, "cf2", "cq22") 72 | allColumns = allColumns :+ KeyColumn("col7", IntegerType, 0) 73 | 74 | val splitKeys: Array[Array[Byte]] = if (useMultiplePartitions) { 75 | Array( 76 | new GenericRow(Array(256, " p256 ", 128: Short)), 77 | new GenericRow(Array(32, " p32 ", 256: Short)), 78 | new GenericRow(Array(-32, " n32 ", 128: Short)), 79 | new GenericRow(Array(-256, " n256 ", 256: Short)), 80 | new GenericRow(Array(-128, " n128 ", 128: Short)), 81 | new GenericRow(Array(0, " zero ", 256: Short)), 82 | new GenericRow(Array(128, " p128 ", 512: Short)) 83 | ).map(HBaseKVHelper.makeRowKey(_, Seq(IntegerType, StringType, ShortType))) 84 | } else { 85 | null 86 | } 87 | 88 | TestHbase.catalog.createTable(TableName_a, null, HbaseTableName, allColumns, splitKeys) 89 | 90 | TestHbase.sql( s"""CREATE TABLE $TableName_b(col1 STRING, col2 BYTE, col3 SHORT, col4 INTEGER, 91 | col5 LONG, col6 FLOAT, col7 INTEGER, PRIMARY KEY(col7, col1, col3)) 92 | MAPPED BY ($HbaseTableName, COLS=[col2=cf1.cq11, col4=cf1.cq12, col5=cf2.cq21, 93 | col6=cf2.cq22])""".stripMargin) 94 | 95 | if (!TestHbase.hbaseAdmin.tableExists(HbaseTableName)) { 96 | throw new IllegalArgumentException("where is our table?") 97 | } 98 | } 99 | } 100 | 101 | def checkHBaseTableExists(hbaseTable: String): Boolean = { 102 | val tableName = TableName.valueOf(hbaseTable) 103 | TestHbase.hbaseAdmin.tableExists(tableName) 104 | } 105 | 106 | def insertTestData() = { 107 | if (!checkHBaseTableExists(HbaseTableName)) { 108 | throw new IllegalStateException(s"Unable to find table $HbaseTableName") 109 | } 110 | 111 | val htable = new HTable(TestHbase.sparkContext.hadoopConfiguration, HbaseTableName) 112 | 113 | def putNewTableIntoHBase(keys: Seq[Any], keysType: Seq[DataType], 114 | vals: Seq[Any], valsType: Seq[DataType]): Unit = { 115 | val row = new GenericRow(keys.toArray) 116 | val key = makeRowKey(row, keysType) 117 | val put = new Put(key) 118 | Seq((vals(0), valsType(0), "cf1", "cq11"), 119 | (vals(1), valsType(1), "cf1", "cq12"), 120 | (vals(2), valsType(2), "cf2", "cq21"), 121 | (vals(3), valsType(3), "cf2", "cq22")).foreach { 122 | case (rowValue, rowType, colFamily, colQualifier) => 123 | addRowVals(put, rowValue, rowType, colFamily, colQualifier) 124 | } 125 | htable.put(put) 126 | } 127 | 128 | putNewTableIntoHBase(Seq(-257, " n257 ", 128: Short), 129 | Seq(IntegerType, StringType, ShortType), 130 | Seq[Any](1.toByte, -2048, 12345678901234L, 1234.5678F), 131 | Seq(ByteType, IntegerType, LongType, FloatType)) 132 | 133 | putNewTableIntoHBase(Seq(-255, " n255 ", 128: Short), 134 | Seq(IntegerType, StringType, ShortType), 135 | Seq[Any](2.toByte, -1024, 12345678901234L, 1234.5678F), 136 | Seq(ByteType, IntegerType, LongType, FloatType)) 137 | 138 | putNewTableIntoHBase(Seq(-129, " n129 ", 128: Short), 139 | Seq(IntegerType, StringType, ShortType), 140 | Seq[Any](3.toByte, -512, 12345678901234L, 1234.5678F), 141 | Seq(ByteType, IntegerType, LongType, FloatType)) 142 | 143 | putNewTableIntoHBase(Seq(-127, " n127 ", 128: Short), 144 | Seq(IntegerType, StringType, ShortType), 145 | Seq[Any](4.toByte, -256, 12345678901234L, 1234.5678F), 146 | Seq(ByteType, IntegerType, LongType, FloatType)) 147 | 148 | putNewTableIntoHBase(Seq(-33, " n33 ", 128: Short), 149 | Seq(IntegerType, StringType, ShortType), 150 | Seq[Any](5.toByte, -128, 12345678901234L, 1234.5678F), 151 | Seq(ByteType, 
IntegerType, LongType, FloatType)) 152 | 153 | putNewTableIntoHBase(Seq(-31, " n31 ", 128: Short), 154 | Seq(IntegerType, StringType, ShortType), 155 | Seq[Any](6.toByte, -64, 12345678901234L, 1234.5678F), 156 | Seq(ByteType, IntegerType, LongType, FloatType)) 157 | 158 | putNewTableIntoHBase(Seq(-1, " n1 ", 128: Short), 159 | Seq(IntegerType, StringType, ShortType), 160 | Seq[Any](7.toByte, -1, 12345678901234L, 1234.5678F), 161 | Seq(ByteType, IntegerType, LongType, FloatType)) 162 | 163 | putNewTableIntoHBase(Seq(1, " p1 ", 128: Short), 164 | Seq(IntegerType, StringType, ShortType), 165 | Seq[Any](8.toByte, 1, 12345678901234L, 1234.5678F), 166 | Seq(ByteType, IntegerType, LongType, FloatType)) 167 | 168 | putNewTableIntoHBase(Seq(31, " p31 ", 128: Short), 169 | Seq(IntegerType, StringType, ShortType), 170 | Seq[Any](9.toByte, 4, 12345678901234L, 1234.5678F), 171 | Seq(ByteType, IntegerType, LongType, FloatType)) 172 | 173 | putNewTableIntoHBase(Seq(33, " p33 ", 128: Short), 174 | Seq(IntegerType, StringType, ShortType), 175 | Seq[Any](10.toByte, 64, 12345678901234L, 1234.5678F), 176 | Seq(ByteType, IntegerType, LongType, FloatType)) 177 | 178 | putNewTableIntoHBase(Seq(127, " p127 ", 128: Short), 179 | Seq(IntegerType, StringType, ShortType), 180 | Seq[Any](11.toByte, 128, 12345678901234L, 1234.5678F), 181 | Seq(ByteType, IntegerType, LongType, FloatType)) 182 | 183 | putNewTableIntoHBase(Seq(129, " p129 ", 128: Short), 184 | Seq(IntegerType, StringType, ShortType), 185 | Seq[Any](12.toByte, 256, 12345678901234L, 1234.5678F), 186 | Seq(ByteType, IntegerType, LongType, FloatType)) 187 | 188 | putNewTableIntoHBase(Seq(255, " p255 ", 128: Short), 189 | Seq(IntegerType, StringType, ShortType), 190 | Seq[Any](13.toByte, 512, 12345678901234L, 1234.5678F), 191 | Seq(ByteType, IntegerType, LongType, FloatType)) 192 | 193 | putNewTableIntoHBase(Seq(257, " p257 ", 128: Short), 194 | Seq(IntegerType, StringType, ShortType), 195 | Seq[Any](14.toByte, 1024, 12345678901234L, 1234.5678F), 196 | Seq(ByteType, IntegerType, LongType, FloatType)) 197 | 198 | htable.close() 199 | } 200 | 201 | def makeRowKey(row: Row, dataTypeOfKeys: Seq[DataType]) = { 202 | val rawKeyCol = dataTypeOfKeys.zipWithIndex.map { 203 | case (dataType, index) => 204 | (DataTypeUtils.getRowColumnInHBaseRawType(row, index, dataType), 205 | dataType) 206 | } 207 | 208 | HBaseKVHelper.encodingRawKeyColumns(rawKeyCol) 209 | } 210 | 211 | def addRowVals(put: Put, rowValue: Any, rowType: DataType, 212 | colFamily: String, colQualifier: String) = { 213 | val bos = new ByteArrayOutputStream() 214 | val dos = new DataOutputStream(bos) 215 | val bu = BytesUtils.create(rowType) 216 | rowType match { 217 | case StringType => dos.write(bu.toBytes(rowValue.asInstanceOf[String])) 218 | case IntegerType => dos.write(bu.toBytes(rowValue.asInstanceOf[Int])) 219 | case BooleanType => dos.write(bu.toBytes(rowValue.asInstanceOf[Boolean])) 220 | case ByteType => dos.write(bu.toBytes(rowValue.asInstanceOf[Byte])) 221 | case DoubleType => dos.write(bu.toBytes(rowValue.asInstanceOf[Double])) 222 | case FloatType => dos.write(bu.toBytes(rowValue.asInstanceOf[Float])) 223 | case LongType => dos.write(bu.toBytes(rowValue.asInstanceOf[Long])) 224 | case ShortType => dos.write(bu.toBytes(rowValue.asInstanceOf[Short])) 225 | case _ => throw new Exception("Unsupported HBase SQL Data Type") 226 | } 227 | put.add(Bytes.toBytes(colFamily), Bytes.toBytes(colQualifier), bos.toByteArray) 228 | } 229 | 230 | def testHBaseScanner() = { 231 | val scan = new Scan 232 | val 
htable = new HTable(TestHbase.sparkContext.hadoopConfiguration, HbaseTableName) 233 | val scanner = htable.getScanner(scan) 234 | var res: Result = null 235 | do { 236 | res = scanner.next 237 | if (res != null) logInfo(s"Row ${res.getRow} has map=${res.getNoVersionMap.toString}") 238 | } while (res != null) 239 | } 240 | 241 | def setupData(useMultiplePartitions: Boolean, needInsertData: Boolean = false) { 242 | if (needInsertData && !alreadyInserted) { 243 | createTable(useMultiplePartitions) 244 | insertTestData() 245 | alreadyInserted = true 246 | } 247 | } 248 | } 249 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/hbase/HBaseTestData.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.hbase 2 | 3 | import org.apache.hadoop.fs.{Path, FileSystem} 4 | import org.apache.hadoop.hbase.util.Bytes 5 | import org.apache.hadoop.hbase.{TableExistsException, HColumnDescriptor, HTableDescriptor, TableName} 6 | import org.apache.spark.Logging 7 | import org.apache.spark.sql.SQLContext 8 | 9 | /* 10 | * Licensed to the Apache Software Foundation (ASF) under one or more 11 | * contributor license agreements. See the NOTICE file distributed with 12 | * this work for additional information regarding copyright ownership. 13 | * The ASF licenses this file to You under the Apache License, Version 2.0 14 | * (the "License"); you may not use this file except in compliance with 15 | * the License. You may obtain a copy of the License at 16 | * 17 | * http://www.apache.org/licenses/LICENSE-2.0 18 | * 19 | * Unless required by applicable law or agreed to in writing, software 20 | * distributed under the License is distributed on an "AS IS" BASIS, 21 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 22 | * See the License for the specific language governing permissions and 23 | * limitations under the License. 
24 | */ 25 | 26 | /** 27 | * CreateTableAndLoadData 28 | * 29 | */ 30 | class HBaseTestData extends HBaseIntegrationTestBase { 31 | val DefaultStagingTableName = "StageTable" 32 | val DefaultTableName = "TestTable" 33 | val DefaultHbaseStagingTableName = s"Hb$DefaultStagingTableName" 34 | val DefaultHbaseTabName = s"Hb$DefaultTableName" 35 | val DefaultHbaseColFamilies = Seq("cf1", "cf2") 36 | 37 | val CsvPaths = Array("src/test/resources", "sql/hbase/src/test/resources") 38 | val DefaultLoadFile = "testTable.txt" 39 | 40 | private val tpath = for (csvPath <- CsvPaths 41 | if new java.io.File(csvPath).exists() 42 | ) yield { 43 | logInfo(s"Following path exists $csvPath") 44 | csvPath 45 | } 46 | private[hbase] val CsvPath = tpath(0) 47 | 48 | override protected def beforeAll() = { 49 | super.beforeAll() 50 | createTables(DefaultStagingTableName, DefaultTableName, 51 | DefaultHbaseStagingTableName, DefaultHbaseTabName) 52 | loadData(DefaultStagingTableName, DefaultTableName, s"$CsvPath/$DefaultLoadFile") 53 | } 54 | 55 | override protected def afterAll() = { 56 | super.afterAll() 57 | TestHbase.sql("DROP TABLE " + DefaultStagingTableName) 58 | TestHbase.sql("DROP TABLE " + DefaultTableName) 59 | } 60 | 61 | def createNativeHbaseTable(tableName: String, families: Seq[String]) = { 62 | val hbaseAdmin = TestHbase.hbaseAdmin 63 | val hdesc = new HTableDescriptor(TableName.valueOf(tableName)) 64 | families.foreach { f => hdesc.addFamily(new HColumnDescriptor(f))} 65 | try { 66 | hbaseAdmin.createTable(hdesc) 67 | } catch { 68 | case e: TableExistsException => 69 | logError(s"Table already exists $tableName", e) 70 | } 71 | } 72 | 73 | def createNativeHbaseTable(tableName: String, families: Seq[String], 74 | splitKeys: Array[HBaseRawType]) = { 75 | val hbaseAdmin = TestHbase.hbaseAdmin 76 | val hdesc = new HTableDescriptor(TableName.valueOf(tableName)) 77 | families.foreach { f => hdesc.addFamily(new HColumnDescriptor(f))} 78 | try { 79 | hbaseAdmin.createTable(hdesc, splitKeys) 80 | } catch { 81 | case e: TableExistsException => 82 | logError(s"Table already exists $tableName", e) 83 | } 84 | } 85 | 86 | def dropNativeHbaseTable(tableName: String) = { 87 | try { 88 | val hbaseAdmin = TestHbase.hbaseAdmin 89 | hbaseAdmin.disableTable(tableName) 90 | hbaseAdmin.deleteTable(tableName) 91 | } catch { 92 | case e: TableExistsException => 93 | logError(s"Table already exists $tableName", e) 94 | } 95 | } 96 | 97 | def createTables( 98 | stagingTableName: String, 99 | tableName: String, 100 | hbaseStagingTable: String, 101 | hbaseTable: String) = { 102 | val hbaseAdmin = TestHbase.hbaseAdmin 103 | if (!hbaseAdmin.tableExists(TableName.valueOf(hbaseStagingTable))) { 104 | createNativeHbaseTable(hbaseStagingTable, DefaultHbaseColFamilies) 105 | } 106 | if (!hbaseAdmin.tableExists(TableName.valueOf(hbaseTable))) { 107 | createNativeHbaseTable(hbaseTable, DefaultHbaseColFamilies) 108 | } 109 | 110 | if (TestHbase.catalog.checkLogicalTableExist(stagingTableName)) { 111 | val dropSql = s"DROP TABLE $stagingTableName" 112 | runSql(dropSql) 113 | } 114 | 115 | if (TestHbase.catalog.checkLogicalTableExist(tableName)) { 116 | val dropSql = s"DROP TABLE $tableName" 117 | runSql(dropSql) 118 | } 119 | 120 | val (stagingSql, tabSql) = 121 | ( s"""CREATE TABLE $stagingTableName(strcol STRING, bytecol STRING, shortcol STRING, intcol STRING, 122 | longcol STRING, floatcol STRING, doublecol STRING, PRIMARY KEY(doublecol, strcol, intcol)) 123 | MAPPED BY ($hbaseStagingTable, COLS=[bytecol=cf1.hbytecol, 124 | 
shortcol=cf1.hshortcol, longcol=cf2.hlongcol, floatcol=cf2.hfloatcol])""" 125 | .stripMargin 126 | , 127 | s"""CREATE TABLE $tableName(strcol STRING, bytecol BYTE, shortcol SHORT, intcol INTEGER, 128 | longcol LONG, floatcol FLOAT, doublecol DOUBLE, PRIMARY KEY(doublecol, strcol, intcol)) 129 | MAPPED BY ($hbaseTable, COLS=[bytecol=cf1.hbytecol, 130 | shortcol=cf1.hshortcol, longcol=cf2.hlongcol, floatcol=cf2.hfloatcol])""" 131 | .stripMargin 132 | ) 133 | try { 134 | logInfo(s"invoking $stagingSql ..") 135 | runSql(stagingSql) 136 | } catch { 137 | case e: TableExistsException => 138 | logInfo("IF NOT EXISTS still not implemented so we get the following exception", e) 139 | } 140 | 141 | logDebug(s"Created table $tableName: " + 142 | s"isTableAvailable= ${hbaseAdmin.isTableAvailable(s2b(hbaseStagingTable))}" + 143 | s" tableDescriptor= ${hbaseAdmin.getTableDescriptor(s2b(hbaseStagingTable))}") 144 | 145 | try { 146 | logInfo(s"invoking $tabSql ..") 147 | runSql(tabSql) 148 | } catch { 149 | case e: TableExistsException => 150 | logInfo("IF NOT EXISTS still not implemented so we get the following exception", e) 151 | } 152 | } 153 | 154 | def loadData(stagingTableName: String, tableName: String, loadFile: String) = { 155 | // then load data into table 156 | val loadSql = s"LOAD PARALL DATA LOCAL INPATH '$loadFile' INTO TABLE $tableName" 157 | runSql(loadSql) 158 | } 159 | 160 | def s2b(s: String) = Bytes.toBytes(s) 161 | 162 | def run(sqlCtx: SQLContext, testName: String, sql: String, exparr: Seq[Seq[Any]]) = { 163 | val execQuery1 = sqlCtx.executeSql(sql) 164 | val result1 = runSql(sql) 165 | assert(result1.size == exparr.length, s"$testName failed on size") 166 | verify(testName, 167 | sql, 168 | for (rx <- 0 until exparr.size) 169 | yield result1(rx).toSeq, exparr 170 | ) 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/hbase/TestData.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | import java.sql.Timestamp 21 | 22 | import org.apache.spark.sql.catalyst.plans.logical 23 | 24 | case class TestData(k: Int, v: String) 25 | 26 | object TestData { 27 | import TestHbase.implicits._ 28 | 29 | val testData = TestHbase.sparkContext.parallelize( 30 | (1 to 100).map(i => TestData(i, i.toString))).toDF() 31 | testData.registerTempTable("testData") 32 | 33 | val negativeData = TestHbase.sparkContext.parallelize( 34 | (1 to 100).map(i => TestData(-i, (-i).toString))).toDF() 35 | negativeData.registerTempTable("negativeData") 36 | 37 | case class LargeAndSmallInts(a: Int, b: Int) 38 | 39 | val largeAndSmallInts = 40 | TestHbase.sparkContext.parallelize( 41 | LargeAndSmallInts(2147483644, 1) :: 42 | LargeAndSmallInts(1, 2) :: 43 | LargeAndSmallInts(2147483645, 1) :: 44 | LargeAndSmallInts(2, 2) :: 45 | LargeAndSmallInts(2147483646, 1) :: 46 | LargeAndSmallInts(3, 2) :: Nil).toDF() 47 | largeAndSmallInts.registerTempTable("largeAndSmallInts") 48 | 49 | case class TestData2(a: Int, b: Int) 50 | 51 | val testData2 = 52 | TestHbase.sparkContext.parallelize( 53 | TestData2(1, 1) :: 54 | TestData2(1, 2) :: 55 | TestData2(2, 1) :: 56 | TestData2(2, 2) :: 57 | TestData2(3, 1) :: 58 | TestData2(3, 2) :: Nil).toDF() 59 | testData2.registerTempTable("testData2") 60 | 61 | case class DecimalData(a: BigDecimal, b: BigDecimal) 62 | 63 | val decimalData = 64 | TestHbase.sparkContext.parallelize( 65 | DecimalData(1, 1) :: 66 | DecimalData(1, 2) :: 67 | DecimalData(2, 1) :: 68 | DecimalData(2, 2) :: 69 | DecimalData(3, 1) :: 70 | DecimalData(3, 2) :: Nil).toDF() 71 | decimalData.registerTempTable("decimalData") 72 | 73 | case class BinaryData(a: Array[Byte], b: Int) 74 | 75 | val binaryData = 76 | TestHbase.sparkContext.parallelize( 77 | BinaryData("12".getBytes, 1) :: 78 | BinaryData("22".getBytes, 5) :: 79 | BinaryData("122".getBytes, 3) :: 80 | BinaryData("121".getBytes, 2) :: 81 | BinaryData("123".getBytes, 4) :: Nil).toDF() 82 | binaryData.registerTempTable("binaryData") 83 | 84 | case class TestData3(a: Int, b: Option[Int]) 85 | 86 | val testData3 = 87 | TestHbase.sparkContext.parallelize( 88 | TestData3(1, None) :: 89 | TestData3(2, Some(2)) :: Nil).toDF() 90 | testData3.registerTempTable("testData3") 91 | 92 | val emptyTableData = logical.LocalRelation('a.int, 'b.int) 93 | 94 | case class UpperCaseData(N: Int, L: String) 95 | 96 | val upperCaseData = 97 | TestHbase.sparkContext.parallelize( 98 | UpperCaseData(1, "A") :: 99 | UpperCaseData(2, "B") :: 100 | UpperCaseData(3, "C") :: 101 | UpperCaseData(4, "D") :: 102 | UpperCaseData(5, "E") :: 103 | UpperCaseData(6, "F") :: Nil).toDF() 104 | upperCaseData.registerTempTable("upperCaseData") 105 | 106 | case class LowerCaseData(n: Int, l: String) 107 | 108 | val lowerCaseData = 109 | TestHbase.sparkContext.parallelize( 110 | LowerCaseData(1, "a") :: 111 | LowerCaseData(2, "b") :: 112 | LowerCaseData(3, "c") :: 113 | LowerCaseData(4, "d") :: Nil).toDF() 114 | lowerCaseData.registerTempTable("lowerCaseData") 115 | 116 | case class ArrayData(dt: Seq[Int], nestedData: Seq[Seq[Int]]) 117 | 118 | val arrayData = 119 | TestHbase.sparkContext.parallelize( 120 | ArrayData(Seq(1, 2, 3), Seq(Seq(1, 2, 3))) :: 121 | ArrayData(Seq(2, 3, 4), Seq(Seq(2, 3, 4))) :: Nil) 122 | arrayData.toDF().registerTempTable("arrayData") 123 | 124 | case class MapData(data: scala.collection.Map[Int, String]) 125 | 126 | val mapData = 127 | TestHbase.sparkContext.parallelize( 128 | MapData(Map(1 -> "a1", 2 -> "b1", 
3 -> "c1", 4 -> "d1", 5 -> "e1")) :: 129 | MapData(Map(1 -> "a2", 2 -> "b2", 3 -> "c2", 4 -> "d2")) :: 130 | MapData(Map(1 -> "a3", 2 -> "b3", 3 -> "c3")) :: 131 | MapData(Map(1 -> "a4", 2 -> "b4")) :: 132 | MapData(Map(1 -> "a5")) :: Nil) 133 | mapData.toDF().registerTempTable("mapData") 134 | 135 | case class StringData(s: String) 136 | 137 | val repeatedData = 138 | TestHbase.sparkContext.parallelize(List.fill(2)(StringData("test"))).toDF() 139 | repeatedData.registerTempTable("repeatedData") 140 | 141 | val nullableRepeatedData = 142 | TestHbase.sparkContext.parallelize( 143 | List.fill(2)(StringData(null)) ++ 144 | List.fill(2)(StringData("test"))).toDF() 145 | nullableRepeatedData.registerTempTable("nullableRepeatedData") 146 | 147 | case class NullInts(a: Integer) 148 | 149 | val nullInts = 150 | TestHbase.sparkContext.parallelize( 151 | NullInts(1) :: 152 | NullInts(2) :: 153 | NullInts(3) :: 154 | NullInts(null) :: Nil 155 | ).toDF() 156 | nullInts.registerTempTable("nullInts") 157 | 158 | val allNulls = 159 | TestHbase.sparkContext.parallelize( 160 | NullInts(null) :: 161 | NullInts(null) :: 162 | NullInts(null) :: 163 | NullInts(null) :: Nil).toDF() 164 | allNulls.registerTempTable("allNulls") 165 | 166 | case class NullStrings(n: Int, s: String) 167 | 168 | val nullStrings = 169 | TestHbase.sparkContext.parallelize( 170 | NullStrings(1, "abc") :: 171 | NullStrings(2, "ABC") :: 172 | NullStrings(3, null) :: Nil).toDF() 173 | nullStrings.registerTempTable("nullStrings") 174 | 175 | case class TableName(tableName: String) 176 | 177 | TestHbase.sparkContext.parallelize(TableName("test") :: Nil).toDF().registerTempTable("tableName") 178 | 179 | val unparsedStrings = 180 | TestHbase.sparkContext.parallelize( 181 | "1, A1, true, null" :: 182 | "2, B2, false, null" :: 183 | "3, C3, true, null" :: 184 | "4, D4, true, 2147483644" :: Nil) 185 | 186 | case class TimestampField(time: Timestamp) 187 | 188 | val timestamps = TestHbase.sparkContext.parallelize((1 to 3).map { i => 189 | TimestampField(new Timestamp(i)) 190 | }).toDF() 191 | timestamps.registerTempTable("timestamps") 192 | 193 | case class IntField(i: Int) 194 | 195 | // An RDD with 4 elements and 8 partitions 196 | val withEmptyParts = TestHbase.sparkContext.parallelize((1 to 4).map(IntField), 8).toDF() 197 | withEmptyParts.registerTempTable("withEmptyParts") 198 | 199 | case class Person(id: Int, name: String, age: Int) 200 | 201 | case class Salary(personId: Int, salary: Double) 202 | 203 | val person = TestHbase.sparkContext.parallelize( 204 | Person(0, "mike", 30) :: 205 | Person(1, "jim", 20) :: Nil) 206 | person.toDF().registerTempTable("person") 207 | val salary = TestHbase.sparkContext.parallelize( 208 | Salary(0, 2000.0) :: 209 | Salary(1, 1000.0) :: Nil).toDF() 210 | salary.registerTempTable("salary") 211 | 212 | case class ComplexData(m: Map[Int, String], s: TestData, a: Seq[Int], b: Boolean) 213 | 214 | val complexData = 215 | TestHbase.sparkContext.parallelize( 216 | ComplexData(Map(1 -> "1"), TestData(1, "1"), Seq(1), b = true) 217 | :: ComplexData(Map(2 -> "2"), TestData(2, "2"), Seq(2), b = false) 218 | :: Nil).toDF() 219 | complexData.registerTempTable("complexData") 220 | } 221 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/hbase/TestHbase.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license 
agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | import org.apache.hadoop.hbase.{HBaseTestingUtility, MiniHBaseCluster} 21 | import org.apache.hadoop.hbase.client.HBaseAdmin 22 | 23 | import org.apache.spark.{SparkConf, SparkContext} 24 | 25 | 26 | object TestHbase 27 | extends HBaseSQLContext( 28 | new SparkContext("local[2]", "TestSQLContext", new SparkConf(true) 29 | .set("spark.hadoop.hbase.zookeeper.quorum", "localhost"))) { 30 | 31 | @transient val testUtil: HBaseTestingUtility = 32 | new HBaseTestingUtility(sparkContext.hadoopConfiguration) 33 | 34 | val nRegionServers: Int = 1 35 | val nDataNodes: Int = 1 36 | val nMasters: Int = 1 37 | 38 | logDebug(s"Spin up hbase minicluster w/ $nMasters master, $nRegionServers RS, $nDataNodes dataNodes") 39 | 40 | @transient val cluster: MiniHBaseCluster = testUtil.startMiniCluster(nMasters, nRegionServers, nDataNodes) 41 | logInfo(s"Started HBaseMiniCluster with regions = ${cluster.countServedRegions}") 42 | 43 | logInfo(s"Configuration zkPort=" 44 | + s"${sparkContext.hadoopConfiguration.get("hbase.zookeeper.property.clientPort")}") 45 | 46 | @transient lazy val hbaseAdmin: HBaseAdmin = new HBaseAdmin(sparkContext.hadoopConfiguration) 47 | } 48 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/hbase/TpcMiniTestSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.sql.hbase 19 | 20 | import org.apache.hadoop.hbase._ 21 | 22 | /** 23 | * This is a mini TPC test suite running against the mini-cluster 24 | */ 25 | class TpcMiniTestSuite extends HBaseIntegrationTestBase { 26 | private val tableName = "store_sales" 27 | private val hbaseTableName = "store_sales_htable" 28 | private val hbaseFamilies = Seq("f") 29 | 30 | private val csvPaths = Array("src/test/resources", "sql/hbase/src/test/resources") 31 | private val csvFile = "store_sales.txt" 32 | private val tpath = for (csvPath <- csvPaths if new java.io.File(csvPath).exists()) yield { 33 | logInfo(s"Following path exists $csvPath\n") 34 | csvPath 35 | } 36 | private[hbase] val csvPath = tpath(0) 37 | 38 | override protected def beforeAll() = { 39 | val hbaseAdmin = TestHbase.hbaseAdmin 40 | 41 | /** 42 | * create the hbase table if it does not exist 43 | */ 44 | if (!hbaseAdmin.tableExists(TableName.valueOf(hbaseTableName))) { 45 | val descriptor = new HTableDescriptor(TableName.valueOf(hbaseTableName)) 46 | hbaseFamilies.foreach { f => descriptor.addFamily(new HColumnDescriptor(f))} 47 | try { 48 | hbaseAdmin.createTable(descriptor) 49 | } catch { 50 | case e: TableExistsException => 51 | logError(s"Table already exists $hbaseTableName", e) 52 | } 53 | } 54 | 55 | /** 56 | * drop the existing logical table if it exists 57 | */ 58 | if (TestHbase.catalog.checkLogicalTableExist(tableName)) { 59 | val dropSql = "DROP TABLE " + tableName 60 | try { 61 | runSql(dropSql) 62 | } catch { 63 | case e: IllegalStateException => 64 | logError(s"Error occurs while dropping the table $tableName", e) 65 | } 66 | } 67 | 68 | /** 69 | * create table 70 | */ 71 | val createSql = 72 | s"""CREATE TABLE store_sales( 73 | ss_sold_date_sk INTEGER, 74 | ss_sold_time_sk INTEGER, 75 | ss_item_sk INTEGER, 76 | ss_customer_sk INTEGER, 77 | ss_cdemo_sk INTEGER, 78 | ss_hdemo_sk INTEGER, 79 | ss_addr_sk INTEGER, 80 | ss_store_sk INTEGER, 81 | ss_promo_sk INTEGER, 82 | ss_ticket_number INTEGER, 83 | ss_quantity INTEGER, 84 | ss_wholesale_cost FLOAT, 85 | ss_list_price FLOAT, 86 | ss_sales_price FLOAT, 87 | ss_ext_discount_amt FLOAT, 88 | ss_ext_sales_price FLOAT, 89 | ss_ext_wholesale_cost FLOAT, 90 | ss_ext_list_price FLOAT, 91 | ss_ext_tax FLOAT, 92 | ss_coupon_amt FLOAT, 93 | ss_net_paid FLOAT, 94 | ss_net_paid_inc_tax FLOAT, 95 | ss_net_profit FLOAT, 96 | PRIMARY KEY(ss_item_sk, ss_ticket_number)) 97 | MAPPED BY 98 | (store_sales_htable, COLS=[ 99 | ss_sold_date_sk=f.ss_sold_date_sk, 100 | ss_sold_time_sk=f.ss_sold_time_sk, 101 | ss_customer_sk=f.ss_customer_sk, 102 | ss_cdemo_sk=f.ss_cdemo_sk, 103 | ss_hdemo_sk=f.ss_hdemo_sk, 104 | ss_addr_sk=f.ss_addr_sk, 105 | ss_store_sk=f.ss_store_sk, 106 | ss_promo_sk=f.ss_promo_sk, 107 | ss_quantity=f.ss_quantity, 108 | ss_wholesale_cost=f.ss_wholesale_cost, 109 | ss_list_price=f.ss_list_price, 110 | ss_sales_price=f.ss_sales_price, 111 | ss_ext_discount_amt=f.ss_ext_discount_amt, 112 | ss_ext_sales_price=f.ss_ext_sales_price, 113 | ss_ext_wholesale_cost=f.ss_ext_wholesale_cost, 114 | ss_ext_list_price=f.ss_ext_list_price, 115 | ss_ext_tax=f.ss_ext_tax, 116 | ss_coupon_amt=f.ss_coupon_amt, 117 | ss_net_paid=f.ss_net_paid, 118 | ss_net_paid_inc_tax=f.ss_net_paid_inc_tax, 119 | ss_net_profit=f.ss_net_profit 120 | ])""".stripMargin 121 | 122 | try { 123 | runSql(createSql) 124 | } catch { 125 | case e: IllegalStateException => 126 | logError(s"Error occurs while creating the table $tableName", e) 127 | } 128 | 129 | /** 130 | * load the data 131 | */ 132 | val
loadSql = "LOAD DATA LOCAL INPATH '" + s"$csvPath/$csvFile" + 133 | "' INTO TABLE store_sales" 134 | try { 135 | runSql(loadSql) 136 | } catch { 137 | case e: IllegalStateException => 138 | logError(s"Error occurs while loading the data $tableName", e) 139 | } 140 | } 141 | 142 | override protected def afterAll() = { 143 | runSql("DROP TABLE " + tableName) 144 | } 145 | 146 | test("Query 0") { 147 | val sql = "SELECT count(1) FROM store_sales" 148 | val rows = runSql(sql) 149 | assert(rows(0).get(0) == 100) 150 | } 151 | 152 | test("Query 1") { 153 | val sql = "SELECT ss_quantity, ss_wholesale_cost, ss_list_price FROM store_sales WHERE ss_item_sk = 2744 AND ss_ticket_number = 1" 154 | val rows = runSql(sql) 155 | assert(rows(0).get(0) == 37) 156 | assert(rows(0).get(1) == 63.63f) 157 | assert(rows(0).get(2) == 101.17f) 158 | } 159 | 160 | test("Query 2") { 161 | val sql = "SELECT ss_sold_date_sk, ss_sold_time_sk, ss_store_sk FROM store_sales WHERE ss_item_sk = 2744 AND ss_ticket_number = 1" 162 | val rows = runSql(sql) 163 | assert(rows(0).get(0) == 2451813) 164 | assert(rows(0).get(1) == 65495) 165 | assert(rows(0).get(2) == 25) 166 | } 167 | 168 | test("Query 3") { 169 | val sql = "SELECT ss_customer_sk, ss_promo_sk, ss_coupon_amt FROM store_sales WHERE ss_item_sk = 2744 AND ss_ticket_number = 1" 170 | val rows = runSql(sql) 171 | assert(rows(0).get(0) == 225006) 172 | assert(rows(0).get(1) == 354) 173 | assert(rows(0).get(2) == 46.03f) 174 | } 175 | 176 | test("Query 4") { 177 | val sql = "SELECT ss_item_sk, count(1) FROM store_sales GROUP BY ss_item_sk" 178 | val rows = runSql(sql) 179 | assert(rows.size == 100) 180 | } 181 | 182 | test("Query 5") { 183 | val sql = "SELECT ss_item_sk, ss_ticket_number, count(1) FROM store_sales WHERE ss_item_sk > 4000 AND ss_item_sk < 5000 GROUP BY ss_item_sk, ss_ticket_number" 184 | val rows = runSql(sql) 185 | assert(rows.size == 5) 186 | } 187 | 188 | test("Query 6") { 189 | val sql = "SELECT ss_item_sk, avg(ss_quantity) as avg_qty, count(ss_quantity) as cnt_qty FROM store_sales WHERE ss_item_sk = 2744 GROUP BY ss_item_sk" 190 | val rows = runSql(sql) 191 | assert(rows.size == 1) 192 | } 193 | 194 | test("Query 7") { 195 | val sql = "SELECT ss_item_sk, ss_ticket_number, sum(ss_wholesale_cost) as sum_wholesale_cost FROM store_sales WHERE ss_item_sk > 4000 AND ss_item_sk <= 5000 GROUP BY ss_item_sk, ss_ticket_number" 196 | val rows = runSql(sql) 197 | assert(rows.size == 5) 198 | } 199 | 200 | test("Query 8") { 201 | val sql = "SELECT ss_item_sk, ss_ticket_number, min(ss_wholesale_cost) as min_wholesale_cost, max(ss_wholesale_cost) as max_wholesale_cost, avg(ss_wholesale_cost) as avg_wholesale_cost FROM store_sales WHERE ss_item_sk > 4000 AND ss_item_sk <= 5000 GROUP BY ss_item_sk, ss_ticket_number" 202 | val rows = runSql(sql) 203 | assert(rows.size == 5) 204 | } 205 | 206 | test("Query 9") { 207 | val sql = "SELECT ss_item_sk, count(ss_customer_sk) as count_ss_customer_sk FROM store_sales WHERE ss_item_sk > 4000 AND ss_item_sk <= 5000 GROUP BY ss_item_sk" 208 | val rows = runSql(sql) 209 | assert(rows.size == 5) 210 | } 211 | 212 | test("Query 10") { 213 | val sql = "SELECT count(*) FROM store_sales WHERE ss_net_profit < 100" 214 | val rows = runSql(sql) 215 | assert(rows(0).get(0) == 74) 216 | } 217 | 218 | test("Query 11") { 219 | val sql = "SELECT count(*) FROM store_sales WHERE ss_coupon_amt < 50 AND ss_ext_discount_amt < 50 AND ss_net_paid < 50 AND ss_net_paid_inc_tax < 50" 220 | val rows = runSql(sql) 221 | assert(rows(0).get(0) == 6) 222 | } 
223 | 224 | test("Query 12") { 225 | val sql = "SELECT count(distinct ss_customer_sk) as count_distinct_customer FROM store_sales" 226 | val rows = runSql(sql) 227 | assert(rows(0).get(0) == 8) 228 | } 229 | 230 | test("Query 13") { 231 | val sql = "SELECT * FROM store_sales limit 100" 232 | val rows = runSql(sql) 233 | assert(rows.size == 100) 234 | } 235 | 236 | test("Query 14") { 237 | val sql = "SELECT ss_customer_sk, count(*) FROM store_sales WHERE ss_item_sk >= 4000 AND ss_item_sk <= 5000 GROUP BY ss_customer_sk" 238 | val rows = runSql(sql) 239 | assert(rows.size == 5) 240 | } 241 | 242 | test("Query 15") { 243 | val sql = "SELECT count(ss_customer_sk) as count_customer FROM store_sales WHERE ss_customer_sk IN (1,25,50,75,100)" 244 | val rows = runSql(sql) 245 | assert(rows(0).get(0) == 0) 246 | } 247 | 248 | test("Query 16") { 249 | val sql = "SELECT count(ss_customer_sk) as count_customer FROM store_sales WHERE ss_customer_sk < 100 AND ss_quantity < 5" 250 | val rows = runSql(sql) 251 | assert(rows(0).get(0) == 2) 252 | } 253 | 254 | test("Query 17") { 255 | val sql = "SELECT count(ss_customer_sk) as count_customer FROM store_sales WHERE ss_customer_sk > 100" 256 | val rows = runSql(sql) 257 | assert(rows(0).get(0) == 83) 258 | } 259 | 260 | test("Query 18") { 261 | val sql = "SELECT ss_item_sk, ss_ticket_number FROM store_sales WHERE (ss_item_sk = 186 AND ss_ticket_number > 0)" 262 | val rows = runSql(sql) 263 | assert(rows.size == 1) 264 | } 265 | } 266 | --------------------------------------------------------------------------------