├── .editorconfig ├── .gitignore ├── CHANGELOG.md ├── LICENSE.md ├── README.md ├── bin └── activator ├── build.sbt ├── libexec └── activator-launch-1.3.9.jar ├── project ├── build.properties └── plugins.sbt └── src ├── main ├── java │ └── com │ │ └── ardentex │ │ └── spark │ │ └── hiveudf │ │ └── FormatCurrency.java └── scala │ └── com │ └── ardentex │ └── spark │ └── hiveudf │ ├── FormatTimestamp.scala │ └── ToHex.scala └── test └── scala └── com └── ardentex └── spark └── hiveudf ├── FormatCurrencySpec.scala ├── FormatTimestampSpec.scala └── ToHexSpec.scala /.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig helps developers define and maintain consistent 2 | # coding styles between different editors and IDEs 3 | # editorconfig.org 4 | 5 | root = true 6 | 7 | 8 | [*] 9 | end_of_line = lf 10 | charset = utf-8 11 | trim_trailing_whitespace = true 12 | insert_final_newline = true 13 | indent_style = space 14 | indent_size = 2 15 | 16 | [*.{diff,md}] 17 | trim_trailing_whitespace = false 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /RUNNING_PID 2 | /logs/ 3 | /project/*-shim.sbt 4 | /project/project/ 5 | /project/target/ 6 | /target/ 7 | /.idea* 8 | /*.iml 9 | /metastore_db 10 | /derby.log 11 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log for spark-hive-udf 2 | 3 | Version 0.1.0 4 | 5 | - Updated to compile against both Scala 2.10 and 2.11. 6 | - Verified against Spark 2.1. 7 | - Made Scala UDFs more idiomatic. 8 | 9 | Version 0.0.1 10 | 11 | - Initial release 12 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | License 2 | ======= 3 | 4 | This software is released under BSD license, adapted from 5 | 6 | 7 | --- 8 | 9 | Copyright © 2016-2017, Brian M. Clapper. 10 | All rights reserved. 11 | 12 | Redistribution and use in source and binary forms, with or without 13 | modification, are permitted provided that the following conditions are met: 14 | 15 | * Redistributions of source code must retain the above copyright notice, 16 | this list of conditions and the following disclaimer. 17 | 18 | * Redistributions in binary form must reproduce the above copyright notice, 19 | this list of conditions and the following disclaimer in the documentation 20 | and/or other materials provided with the distribution. 21 | 22 | * Neither the names "clapper.org" nor the names of any contributors may 23 | be used to endorse or promote products derived from this software 24 | without specific prior written permission. 25 | 26 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 27 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 30 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 31 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 32 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 33 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 34 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 35 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 36 | POSSIBILITY OF SUCH DAMAGE. 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sample Hive UDF project 2 | 3 | ## Introduction 4 | 5 | This project is just an example, containing several 6 | [Hive User Defined Functions][] (UDFs), for use in Apache Spark. It's 7 | intended to demonstrate how to build a Hive UDF in Scala or Java and use it 8 | within [Apache Spark][]. 9 | 10 | ## Why use a Hive UDF? 11 | 12 | One especially good use of Hive UDFs is with Python and DataFrames. 13 | Native Spark UDFs written in Python are slow, because they have to be 14 | executed in a Python process, rather than a JVM-based Spark Executor. 15 | For a Spark Executor to run a Python UDF, it must: 16 | 17 | * send data from the partition over to a Python process associated with 18 | the Executor, and 19 | * wait for the Python process to deserialize the data, run the UDF on it, 20 | reserialize the data, and send it back. 21 | 22 | By contrast, a Hive UDF, whether written in Scala or Java, can be executed 23 | in the Executor JVM, _even if the DataFrame logic is in Python_. 24 | 25 | There's really only one drawback: a Hive UDF _must_ be invoked via SQL. 26 | You can't call it as a function from the DataFrame API. 27 | 28 | **NOTE** It is also possible to do something similar, using native Spark 29 | UDFs. See and for some useful discussions. 30 | 31 | ## Building 32 | 33 | This project builds with [SBT][], but you don't have to download SBT. Just use 34 | the `activator` script in the `bin` subdirectory. To build the jar file, use 35 | this command: 36 | 37 | ``` 38 | $ bin/activator +jar 39 | ``` 40 | 41 | That command will download the dependencies (if they haven't already been 42 | downloaded), compile the code, run the unit tests, and create jar files for 43 | both Scala 2.10 and Scala 2.11. Those jars will be: 44 | 45 | * Scala 2.10: `target/scala-2.10/spark-hive-udf_2.10-0.1.0.jar` 46 | * Scala 2.11: `target/scala-2.11/spark-hive-udf_2.11-0.1.0.jar` 47 | 48 | ### Building with Maven 49 | 50 | Honestly, I'm not a big fan of Maven. I had a Maven `pom.xml` file here, but 51 | I got tired of maintaining two build files. Just use `activator`, as described 52 | above. 53 | 54 | ## Running in PySpark 55 | 56 | The following Python code demonstrates the UDFs in this package and assumes 57 | that you've packaged the code into `target/scala-2.11/spark-hive-udf_2.11-0.1.0.jar` 58 | and copied that jar to `/tmp`. 59 | 60 | These commands assume Spark local mode, but they should also work fine within 61 | a cluster manager like Spark Standalone or YARN. 62 | 63 | (You can also use Hive UDFs from Scala, by the way.) 64 | 65 | First, fire up PySpark: 66 | 67 | ``` 68 | $ pyspark --jars /tmp/spark-hive-udf_2.11-0.1.0.jar 69 | ``` 70 | 71 | At the PySpark prompt, enter the following. (If you're using IPython, 72 | `%paste` works best.) 
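One note before you run it: the `CREATE TEMPORARY FUNCTION ... AS 'class'` statements below load the UDF classes through Hive's function registry, so they need a `SparkSession` with Hive support. The stock `pyspark` shell normally gives you one. If you instead run the example as a standalone script, build the session yourself; here is a minimal sketch under that assumption (the app name is arbitrary, and the jar path is the `/tmp` location mentioned above).

```python
# Minimal sketch for a standalone script (not needed in the pyspark shell).
# Assumes a Spark 2.x build that includes the Hive libraries.
from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .appName("hive-udf-demo")  # arbitrary application name
         .config("spark.jars", "/tmp/spark-hive-udf_2.11-0.1.0.jar")  # the UDF jar
         .enableHiveSupport()       # needed for CREATE TEMPORARY FUNCTION ... AS '<class>'
         .getOrCreate())
```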
73 | 74 | **NOTE**: The following code assumes Spark 2.x. 75 | 76 | ```python 77 | from datetime import datetime 78 | from collections import namedtuple 79 | from decimal import Decimal 80 | 81 | Person = namedtuple('Person', ('first_name', 'last_name', 'birth_date', 'salary', 'children')) 82 | 83 | fmt = "%Y-%m-%d" 84 | 85 | people = [ 86 | Person('Joe', 'Smith', datetime.strptime("1993-10-20", fmt), 70000.0, 2), 87 | Person('Jenny', 'Harmon', datetime.strptime("1987-08-02", fmt), 94000.0, 1) 88 | ] 89 | 90 | # Replace spark.sparkContext with sc if you're using Spark 1.x. 91 | df = spark.sparkContext.parallelize(people).toDF() 92 | 93 | # Replace spark with sqlContext if you're using Spark 1.x. 94 | spark.sql("CREATE TEMPORARY FUNCTION to_hex AS 'com.ardentex.spark.hiveudf.ToHex'") 95 | spark.sql("CREATE TEMPORARY FUNCTION datestring AS 'com.ardentex.spark.hiveudf.FormatTimestamp'") 96 | spark.sql("CREATE TEMPORARY FUNCTION currency AS 'com.ardentex.spark.hiveudf.FormatCurrency'") 97 | 98 | # Replace createOrReplaceTempView with registerTempTable if you're using 99 | # Spark 1.x 100 | df.createOrReplaceTempView("people") 101 | df2 = spark.sql("SELECT first_name, last_name, datestring(birth_date, 'MMMM dd, yyyy') as birth_date, currency(salary, 'en_US') as salary, to_hex(children) as hex_children FROM people") 102 | ``` 103 | 104 | Then, take a look at the second DataFrame: 105 | 106 | ``` 107 | df2.show() 108 | 109 | +----------+---------+----------------+----------+------------+ 110 | |first_name|last_name| birth_date| salary|hex_children| 111 | +----------+---------+----------------+----------+------------+ 112 | | Joe| Smith|October 20, 1993|$70,000.00| 0x2| 113 | | Jenny| Harmon| August 02, 1987|$94,000.00| 0x1| 114 | +----------+---------+----------------+----------+------------+ 115 | ``` 116 | 117 | ## Running in spark-shell (Scala) 118 | 119 | First, fire up the Spark shell: 120 | 121 | ``` 122 | $ spark-shell --jars /tmp/spark-hive-udf_2.11-0.1.0.jar 123 | ``` 124 | 125 | At the Scala REPL prompt, type `:paste`, then copy and paste the following 126 | code followed by a Ctrl-D. 127 | 128 | **NOTE**: The following code assumes Spark 2.x. 129 | 130 | ```scala 131 | import java.sql.Timestamp 132 | import java.text.SimpleDateFormat 133 | import java.util.Date 134 | 135 | case class Person(firstName: String, lastName: String, birthDate: Timestamp, salary: Double, children: Int) 136 | 137 | val fmt = new SimpleDateFormat("yyyy-MM-dd") 138 | 139 | val people = Array( 140 | Person("Joe", "Smith", new Timestamp(fmt.parse("1993-10-20").getTime), 70000.0, 2), 141 | Person("Jenny", "Harmon", new Timestamp(fmt.parse("1987-08-02").getTime), 94000.0, 1) 142 | ) 143 | 144 | // Replace spark.sparkContext with sc if you're using Spark 1.x. 145 | val df = spark.createDataFrame(spark.sparkContext.parallelize(people)) 146 | 147 | // Replace spark with sqlContext if you're using Spark 1.x. 
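// The next three statements register the UDF classes (from the jar passed via
// --jars) under SQL-callable names. The function names (toHex, datestring,
// currency) are arbitrary; the quoted class names must match the classes in the
// jar, and the session must have Hive support. If registration fails, check
// which catalog is active: spark.conf.get("spark.sql.catalogImplementation")
// should report "hive" rather than "in-memory".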
148 | spark.sql("CREATE TEMPORARY FUNCTION toHex AS 'com.ardentex.spark.hiveudf.ToHex'") 149 | spark.sql("CREATE TEMPORARY FUNCTION datestring AS 'com.ardentex.spark.hiveudf.FormatTimestamp'") 150 | spark.sql("CREATE TEMPORARY FUNCTION currency AS 'com.ardentex.spark.hiveudf.FormatCurrency'") 151 | 152 | // Replace createOrReplaceTempView with registerTempTable if you're using 153 | // Spark 1.x 154 | df.createOrReplaceTempView("people") 155 | val df2 = spark.sql("SELECT firstName, lastName, datestring(birthDate, 'MMMM dd, yyyy') as birthDate, currency(salary, 'en_US') as salary, toHex(children) as hexChildren FROM people") 156 | ``` 157 | 158 | Then, take a look at the second DataFrame: 159 | 160 | ``` 161 | df2.show() 162 | 163 | +---------+--------+----------------+----------+-----------+ 164 | |firstName|lastName| birthDate| salary|hexChildren| 165 | +---------+--------+----------------+----------+-----------+ 166 | | Joe| Smith|October 20, 1993|$70,000.00| 0x2| 167 | | Jenny| Harmon| August 02, 1987|$94,000.00| 0x1| 168 | +---------+--------+----------------+----------+-----------+ 169 | ``` 170 | 171 | ## "Why did you write these things in Scala?" 172 | 173 | Because, after writing Scala for the last 7 years, I find Java annoying. But, 174 | I did include a Java UDF in this repo; take a look at the `FormatCurrency` UDF. 175 | The others are in Scala and, really, they're not hard to translate 176 | to Java. 177 | 178 | [Hive User Defined Functions]: https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF 179 | [Apache Spark]: http://spark.apache.org 180 | [SBT]: http://scala-sbt.org 181 | -------------------------------------------------------------------------------- /bin/activator: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ### ------------------------------- ### 4 | ### Helper methods for BASH scripts ### 5 | ### ------------------------------- ### 6 | 7 | realpath () { 8 | ( 9 | TARGET_FILE="$1" 10 | 11 | cd "$(dirname "$TARGET_FILE")" 12 | TARGET_FILE=$(basename "$TARGET_FILE") 13 | 14 | COUNT=0 15 | while [ -L "$TARGET_FILE" -a $COUNT -lt 100 ] 16 | do 17 | TARGET_FILE=$(readlink "$TARGET_FILE") 18 | cd "$(dirname "$TARGET_FILE")" 19 | TARGET_FILE=$(basename "$TARGET_FILE") 20 | COUNT=$(($COUNT + 1)) 21 | done 22 | 23 | if [ "$TARGET_FILE" == "." -o "$TARGET_FILE" == ".." ]; then 24 | cd "$TARGET_FILE" 25 | TARGET_FILEPATH= 26 | else 27 | TARGET_FILEPATH=/$TARGET_FILE 28 | fi 29 | 30 | # make sure we grab the actual windows zipPath, instead of cygwin's zipPath. 31 | if ! is_cygwin; then 32 | echo "$(pwd -P)/$TARGET_FILE" 33 | else 34 | echo $(cygwinpath "$(pwd -P)/$TARGET_FILE") 35 | fi 36 | ) 37 | } 38 | 39 | # TODO - Do we need to detect msys? 40 | 41 | # Uses uname to detect if we're in the odd cygwin environment. 42 | is_cygwin() { 43 | local os=$(uname -s) 44 | case "$os" in 45 | CYGWIN*) return 0 ;; 46 | *) return 1 ;; 47 | esac 48 | } 49 | 50 | # This can fix cygwin style /cygdrive paths so we get the 51 | # windows style paths. 52 | cygwinpath() { 53 | local file="$1" 54 | if is_cygwin; then 55 | echo $(cygpath -w $file) 56 | else 57 | echo $file 58 | fi 59 | } 60 | 61 | # Make something URI friendly 62 | make_url() { 63 | url="$1" 64 | local nospaces=${url// /%20} 65 | if is_cygwin; then 66 | echo "/${nospaces//\\//}" 67 | else 68 | echo "$nospaces" 69 | fi 70 | } 71 | 72 | # Detect if we should use JAVA_HOME or just try PATH. 
73 | get_java_cmd() { 74 | if [[ -n "$JAVA_HOME" ]] && [[ -x "$JAVA_HOME/bin/java" ]]; then 75 | echo "$JAVA_HOME/bin/java" 76 | else 77 | echo "java" 78 | fi 79 | } 80 | 81 | echoerr () { 82 | echo 1>&2 "$@" 83 | } 84 | vlog () { 85 | [[ $verbose || $debug ]] && echoerr "$@" 86 | } 87 | dlog () { 88 | [[ $debug ]] && echoerr "$@" 89 | } 90 | execRunner () { 91 | # print the arguments one to a line, quoting any containing spaces 92 | [[ $verbose || $debug ]] && echo "# Executing command line:" && { 93 | for arg; do 94 | if printf "%s\n" "$arg" | grep -q ' '; then 95 | printf "\"%s\"\n" "$arg" 96 | else 97 | printf "%s\n" "$arg" 98 | fi 99 | done 100 | echo "" 101 | } 102 | 103 | exec "$@" 104 | } 105 | addJava () { 106 | dlog "[addJava] arg = '$1'" 107 | java_args=( "${java_args[@]}" "$1" ) 108 | } 109 | addApp () { 110 | dlog "[addApp] arg = '$1'" 111 | sbt_commands=( "${app_commands[@]}" "$1" ) 112 | } 113 | addResidual () { 114 | dlog "[residual] arg = '$1'" 115 | residual_args=( "${residual_args[@]}" "$1" ) 116 | } 117 | addDebugger () { 118 | addJava "-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=$1" 119 | } 120 | addConfigOpts () { 121 | dlog "[addConfigOpts] arg = '$*'" 122 | for item in $* 123 | do 124 | addJava "$item" 125 | done 126 | } 127 | # a ham-fisted attempt to move some memory settings in concert 128 | # so they need not be messed around with individually. 129 | get_mem_opts () { 130 | local mem=${1:-1024} 131 | local meta=$(( $mem / 4 )) 132 | (( $meta > 256 )) || meta=256 133 | (( $meta < 1024 )) || meta=1024 134 | 135 | # default is to set memory options but this can be overridden by code section below 136 | memopts="-Xms${mem}m -Xmx${mem}m" 137 | if [[ "${java_version}" > "1.8" ]]; then 138 | extmemopts="-XX:MetaspaceSize=64m -XX:MaxMetaspaceSize=${meta}m" 139 | else 140 | extmemopts="-XX:PermSize=64m -XX:MaxPermSize=${meta}m" 141 | fi 142 | 143 | if [[ "${java_opts}" == *-Xmx* ]] || [[ "${java_opts}" == *-Xms* ]] || [[ "${java_opts}" == *-XX:MaxPermSize* ]] || [[ "${java_opts}" == *-XX:ReservedCodeCacheSize* ]] || [[ "${java_opts}" == *-XX:MaxMetaspaceSize* ]]; then 144 | # if we detect any of these settings in ${java_opts} we need to NOT output our settings. 145 | # The reason is the Xms/Xmx, if they don't line up, cause errors. 146 | memopts="" 147 | extmemopts="" 148 | fi 149 | 150 | echo "${memopts} ${extmemopts}" 151 | } 152 | require_arg () { 153 | local type="$1" 154 | local opt="$2" 155 | local arg="$3" 156 | if [[ -z "$arg" ]] || [[ "${arg:0:1}" == "-" ]]; then 157 | die "$opt requires <$type> argument" 158 | fi 159 | } 160 | is_function_defined() { 161 | declare -f "$1" > /dev/null 162 | } 163 | 164 | # If we're *not* running in a terminal, and we don't have any arguments, then we need to add the 'ui' parameter 165 | detect_terminal_for_ui() { 166 | [[ ! -t 0 ]] && [[ "${#residual_args}" == "0" ]] && { 167 | addResidual "ui" 168 | } 169 | # SPECIAL TEST FOR MAC 170 | [[ "$(uname)" == "Darwin" ]] && [[ "$HOME" == "$PWD" ]] && [[ "${#residual_args}" == "0" ]] && { 171 | echo "Detected MAC OSX launched script...." 172 | echo "Swapping to UI" 173 | addResidual "ui" 174 | } 175 | } 176 | 177 | # Processes incoming arguments and places them in appropriate global variables. called by the run method. 
178 | process_args () { 179 | while [[ $# -gt 0 ]]; do 180 | case "$1" in 181 | -h|-help) usage; exit 1 ;; 182 | -v|-verbose) verbose=1 && shift ;; 183 | -d|-debug) debug=1 && shift ;; 184 | -mem) require_arg integer "$1" "$2" && app_mem="$2" && shift 2 ;; 185 | -jvm-debug) 186 | if echo "$2" | grep -E ^[0-9]+$ > /dev/null; then 187 | addDebugger "$2" && shift 188 | else 189 | addDebugger 9999 190 | fi 191 | shift ;; 192 | -java-home) require_arg zipPath "$1" "$2" && java_cmd="$2/bin/java" && shift 2 ;; 193 | -D*) addJava "$1" && shift ;; 194 | -J*) addJava "${1:2}" && shift ;; 195 | *) addResidual "$1" && shift ;; 196 | esac 197 | done 198 | 199 | is_function_defined process_my_args && { 200 | myargs=("${residual_args[@]}") 201 | residual_args=() 202 | process_my_args "${myargs[@]}" 203 | } 204 | } 205 | 206 | # Actually runs the script. 207 | run() { 208 | # TODO - check for sane environment 209 | 210 | # process the combined args, then reset "$@" to the residuals 211 | process_args "$@" 212 | detect_terminal_for_ui 213 | set -- "${residual_args[@]}" 214 | argumentCount=$# 215 | 216 | #check for jline terminal fixes on cygwin 217 | if is_cygwin; then 218 | stty -icanon min 1 -echo > /dev/null 2>&1 219 | addJava "-Djline.terminal=jline.UnixTerminal" 220 | addJava "-Dsbt.cygwin=true" 221 | fi 222 | 223 | # run sbt 224 | execRunner "$java_cmd" \ 225 | "-Dactivator.home=$(make_url "$activator_home")" \ 226 | $(get_mem_opts $app_mem) \ 227 | ${java_opts[@]} \ 228 | ${java_args[@]} \ 229 | -jar "$app_launcher" \ 230 | "${app_commands[@]}" \ 231 | "${residual_args[@]}" 232 | 233 | local exit_code=$? 234 | if is_cygwin; then 235 | stty icanon echo > /dev/null 2>&1 236 | fi 237 | exit $exit_code 238 | } 239 | 240 | # Loads a configuration file full of default command line options for this script. 241 | loadConfigFile() { 242 | cat "$1" | sed '/^\#/d' 243 | } 244 | 245 | ### ------------------------------- ### 246 | ### Start of customized settings ### 247 | ### ------------------------------- ### 248 | usage() { 249 | cat < [options] 251 | 252 | Command: 253 | ui Start the Activator UI 254 | new [name] [template-id] Create a new project with [name] using template [template-id] 255 | list-templates Print all available template names 256 | -h | -help Print this message 257 | 258 | Options: 259 | -v | -verbose Make this runner chattier 260 | -d | -debug Set sbt log level to debug 261 | -mem Set memory options (default: $sbt_mem, which is $(get_mem_opts $sbt_mem)) 262 | -jvm-debug Turn on JVM debugging, open at the given port. 263 | 264 | # java version (default: java from PATH, currently $(java -version 2>&1 | grep version)) 265 | -java-home Alternate JAVA_HOME 266 | 267 | # jvm options and output control 268 | -Dkey=val Pass -Dkey=val directly to the java runtime 269 | -J-X Pass option -X directly to the java runtime 270 | (-J is stripped) 271 | 272 | # environment variables (read from context) 273 | JAVA_OPTS Environment variable, if unset uses "" 274 | SBT_OPTS Environment variable, if unset uses "" 275 | ACTIVATOR_OPTS Environment variable, if unset uses "" 276 | 277 | In the case of duplicated or conflicting options, the order above 278 | shows precedence: environment variables lowest, command line options highest. 
279 | EOM 280 | } 281 | 282 | ### ------------------------------- ### 283 | ### Main script ### 284 | ### ------------------------------- ### 285 | 286 | declare -a residual_args 287 | declare -a java_args 288 | declare -a app_commands 289 | declare -r real_script_path="$(realpath "$0")" 290 | declare -r activator_home="$(realpath "$(dirname "$(dirname "$real_script_path")")")" 291 | declare -r app_version="1.3.9" 292 | 293 | declare -r app_launcher="${activator_home}/libexec/activator-launch-${app_version}.jar" 294 | declare -r script_name=activator 295 | java_cmd=$(get_java_cmd) 296 | declare -r java_opts=( "${ACTIVATOR_OPTS[@]}" "${SBT_OPTS[@]}" "${JAVA_OPTS[@]}" "${java_opts[@]}" ) 297 | userhome="$HOME" 298 | if is_cygwin; then 299 | # cygwin sets home to something f-d up, set to real windows homedir 300 | userhome="$USERPROFILE" 301 | fi 302 | declare -r activator_user_home_dir="${userhome}/.activator" 303 | declare -r java_opts_config_home="${activator_user_home_dir}/activatorconfig.txt" 304 | declare -r java_opts_config_version="${activator_user_home_dir}/${app_version}/activatorconfig.txt" 305 | 306 | # Now check to see if it's a good enough version 307 | declare -r java_version=$("$java_cmd" -version 2>&1 | awk -F '"' '/version/ {print $2}') 308 | if [[ "$java_version" == "" ]]; then 309 | echo 310 | echo No java installations was detected. 311 | echo Please go to http://www.java.com/getjava/ and download 312 | echo 313 | exit 1 314 | elif [[ ! "$java_version" > "1.6" ]]; then 315 | echo 316 | echo The java installation you have is not up to date 317 | echo Activator requires at least version 1.6+, you have 318 | echo version $java_version 319 | echo 320 | echo Please go to http://www.java.com/getjava/ and download 321 | echo a valid Java Runtime and install before running Activator. 
322 | echo 323 | exit 1 324 | fi 325 | 326 | # if configuration files exist, prepend their contents to the java args so it can be processed by this runner 327 | # a "versioned" config trumps one on the top level 328 | if [[ -f "$java_opts_config_version" ]]; then 329 | addConfigOpts $(loadConfigFile "$java_opts_config_version") 330 | elif [[ -f "$java_opts_config_home" ]]; then 331 | addConfigOpts $(loadConfigFile "$java_opts_config_home") 332 | fi 333 | 334 | run "$@" 335 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | name := """spark-hive-udf""" 2 | version := "0.1.0" 3 | organization := "com.ardentex" 4 | 5 | scalaVersion := "2.11.11" 6 | scalacOptions ++= Seq("-unchecked", "-feature", "-deprecation") 7 | crossScalaVersions := Seq(scalaVersion.value, "2.10.6") 8 | 9 | libraryDependencies ++= Seq( 10 | "org.apache.hive" % "hive-exec" % "2.1.1" % Provided, 11 | "org.apache.hadoop" % "hadoop-core" % "1.2.1" % Provided, 12 | "org.scalatest" %% "scalatest" % "3.0.1" % Test 13 | ) 14 | 15 | // Without this repo, you might get a failure trying to resolve transitive 16 | // dependency org.pentaho:pentaho-aggdesigner-algorithm:5.1.5-jhyde 17 | resolvers += "conjars" at "http://conjars.org/repo" 18 | 19 | addCommandAlias("jar", ";test;package") 20 | -------------------------------------------------------------------------------- /libexec/activator-launch-1.3.9.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bmc/spark-hive-udf/168296b5220e82f233aab892c3b20f1aca37aa12/libexec/activator-launch-1.3.9.jar -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | #Activator-generated Properties 2 | #Sat Feb 27 09:54:57 EST 2016 3 | template.uuid=e17acfbb-1ff5-41f5-b8cf-2c40be6a8340 4 | sbt.version=0.13.7 5 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.1") 2 | addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0") 3 | -------------------------------------------------------------------------------- /src/main/java/com/ardentex/spark/hiveudf/FormatCurrency.java: -------------------------------------------------------------------------------- 1 | package com.ardentex.spark.hiveudf; 2 | 3 | import org.apache.hadoop.hive.ql.exec.UDF; 4 | import org.apache.hadoop.io.DoubleWritable; 5 | 6 | import java.text.NumberFormat; 7 | import java.util.Locale; 8 | 9 | /** 10 | * This UDF takes a double and converts it to a currency string. A decimal 11 | * type is more suited to money than a double, but Hadoop's IO formats don't 12 | * seem to support decimal. 13 | */ 14 | public class FormatCurrency extends UDF { 15 | 16 | /** The actual conversion routine. 17 | * 18 | * @param n the double 19 | * @param localeName the locale name string (e.g., "en_US") to use to format 20 | * the string, or null to use the default locale.
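 * (For example, evaluate(1234.56, "en_US") produces "$1,234.56", while a
 * null locale name falls back to the JVM's default locale.)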
21 | * 22 | * @return the formatted string 23 | */ 24 | public String evaluate(Double n, String localeName) { 25 | Locale locale; 26 | if (localeName == null) 27 | locale = Locale.getDefault(); 28 | else { 29 | String[] pieces = localeName.split("_"); 30 | if (pieces.length != 2) 31 | locale = Locale.getDefault(); 32 | else 33 | locale = new Locale(pieces[0], pieces[1]); 34 | } 35 | 36 | NumberFormat fmt = NumberFormat.getCurrencyInstance(locale); 37 | 38 | if (n == null) { 39 | return ""; 40 | } 41 | else { 42 | return fmt.format(n.doubleValue()); 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/com/ardentex/spark/hiveudf/FormatTimestamp.scala: -------------------------------------------------------------------------------- 1 | package com.ardentex.spark.hiveudf 2 | 3 | import java.sql.Timestamp 4 | import java.text.SimpleDateFormat 5 | import java.util.Date 6 | 7 | import org.apache.hadoop.hive.ql.exec.UDF 8 | 9 | /** This UDF takes a SQL Timestamp and converts it to a string, using a 10 | * Java `SimpleDateFormat` string to dictate the format. 11 | */ 12 | class FormatTimestamp extends UDF { 13 | 14 | def evaluate(t: Timestamp, fmt: String): String = { 15 | val optRes = 16 | for { ts <- Option(t) // null check 17 | f <- Option(fmt) } // null check 18 | yield try { 19 | val formatter = new SimpleDateFormat(fmt) 20 | formatter.format(new Date(t.getTime)) 21 | } 22 | catch { 23 | // Bad format. Return Timestamp.toString. (We could return 24 | // an error message, as well, but this is fine for now.) 25 | case _: IllegalArgumentException => 26 | t.toString 27 | } 28 | 29 | optRes.getOrElse("") 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/scala/com/ardentex/spark/hiveudf/ToHex.scala: -------------------------------------------------------------------------------- 1 | package com.ardentex.spark.hiveudf 2 | 3 | import org.apache.hadoop.hive.ql.exec.UDF 4 | import org.apache.hadoop.io.LongWritable 5 | 6 | /** This UDF takes a long integer and converts it to a hexadecimal string. 7 | */ 8 | class ToHex extends UDF { 9 | 10 | def evaluate(n: LongWritable): String = { 11 | Option(n) 12 | .map { num => 13 | // Use Scala string interpolation. It's the easiest way, and it's 14 | // type-safe, unlike String.format().
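        // The %x conversion prints the long's two's-complement bits as
        // lowercase hex, so 255L becomes "0xff" and -10L becomes
        // "0xfffffffffffffff6" (see ToHexSpec).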
15 | f"0x${num.get}%x" 16 | } 17 | .getOrElse("") 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/test/scala/com/ardentex/spark/hiveudf/FormatCurrencySpec.scala: -------------------------------------------------------------------------------- 1 | package com.ardentex.spark.hiveudf 2 | 3 | import java.util.Locale 4 | 5 | import org.apache.hadoop.io.{DoubleWritable, FloatWritable, LongWritable} 6 | import org.scalatest.{FlatSpec, Matchers} 7 | 8 | class FormatCurrencySpec extends FlatSpec with Matchers { 9 | val udf = new FormatCurrency 10 | 11 | def doTest(data: Array[(Double, String)], lang: String): Unit = { 12 | for ((input, expected) <- data) 13 | //udf.evaluate(new DoubleWritable(input),lang) should be (expected) 14 | udf.evaluate(input,lang) should be (expected) 15 | } 16 | 17 | "FormatCurrency" should "return a valid currency string for the US locale" in { 18 | val data = Array( 19 | (2999100.01, "$2,999,100.01"), 20 | (.11, "$0.11"), 21 | (999.0, "$999.00"), 22 | (1122.0, "$1,122.00") 23 | ) 24 | 25 | doTest(data, "en_US") 26 | } 27 | 28 | it should "return a valid currency string for the en_GB locale" in { 29 | val data = Array( 30 | (2999100.01, "£2,999,100.01"), 31 | (.11, "£0.11"), 32 | (999.0, "£999.00"), 33 | (1122.0, "£1,122.00") 34 | ) 35 | 36 | doTest(data, "en_GB") 37 | } 38 | 39 | it should "return a currency string in the default locale for a bad locale string" in { 40 | withDefaultLocale("fr", "FR") { 41 | val data = Array( 42 | (2999100.01, "2 999 100,01 €"), 43 | (.11, "0,11 €"), 44 | (999.0, "999,00 €"), 45 | (1122.0, "1 122,00 €") 46 | ) 47 | 48 | doTest(data, "nnyy") 49 | } 50 | } 51 | 52 | it should "return a currency string in the default locale if no locale is specified" in { 53 | withDefaultLocale("se", "SE") { 54 | val data = Array( 55 | (2999100.01, "SEK 2,999,100.01"), 56 | (.11, "SEK 0.11"), 57 | (999.0, "SEK 999.00"), 58 | (1122.0, "SEK 1,122.00") 59 | ) 60 | 61 | doTest(data, null) 62 | } 63 | } 64 | 65 | it should "return a valid currency string for the jp_JP locale" in { 66 | val data = Array( 67 | (2999100.01, "JPY 2,999,100"), 68 | (.11, "JPY 0"), 69 | (999.0, "JPY 999"), 70 | (1122.0, "JPY 1,122") 71 | ) 72 | 73 | doTest(data, "jp_JP") 74 | } 75 | 76 | 77 | it should "return an empty string for a null input" in { 78 | udf.evaluate(null, null) should be ("") 79 | } 80 | 81 | private def withDefaultLocale(lang: String, variant: String) 82 | (code: => Unit): Unit = { 83 | val locale = Locale.getDefault 84 | val newLocale = new Locale(lang, variant) 85 | Locale.setDefault(newLocale) 86 | 87 | try { 88 | code 89 | } 90 | 91 | finally { 92 | Locale.setDefault(locale) 93 | } 94 | 95 | } 96 | } 97 | 98 | 99 | -------------------------------------------------------------------------------- /src/test/scala/com/ardentex/spark/hiveudf/FormatTimestampSpec.scala: -------------------------------------------------------------------------------- 1 | package com.ardentex.spark.hiveudf 2 | 3 | import java.sql.Timestamp 4 | import java.text.SimpleDateFormat 5 | import java.util.Date 6 | 7 | import org.scalatest.{FlatSpec, Matchers} 8 | 9 | class FormatTimestampSpec extends FlatSpec with Matchers { 10 | val udf = new FormatTimestamp 11 | val timestampParser = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss") 12 | 13 | "FormatTimestamp.evaluate" should "properly format a timestamp" in { 14 | val format = "yyyy/MMM/dd hh:mm a" 15 | val data = Array( 16 | ("2013-01-29 23:49:03", "2013/Jan/29 11:49 PM"), 17 | ("1941-09-03 
10:01:53", "1941/Sep/03 10:01 AM"), 18 | ("1888-07-01 01:01:59", "1888/Jul/01 01:01 AM") 19 | ) 20 | 21 | for ((input, expected) <- data) { 22 | val ts = new Timestamp(timestampParser.parse(input).getTime) 23 | udf.evaluate(ts, format) should be (expected) 24 | } 25 | } 26 | 27 | it should "return an empty string when the timestamp is null" in { 28 | udf.evaluate(null, "yyyy-MM-dd") should be ("") 29 | } 30 | 31 | it should "return an empty string when the format is null" in { 32 | udf.evaluate(new Timestamp((new Date).getTime), null) should be ("") 33 | } 34 | 35 | it should "return Timestamp.toString when the format is bad" in { 36 | val ts = new Timestamp((new Date).getTime) 37 | udf.evaluate(ts, "bad format") should be (ts.toString) 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/test/scala/com/ardentex/spark/hiveudf/ToHexSpec.scala: -------------------------------------------------------------------------------- 1 | package com.ardentex.spark.hiveudf 2 | 3 | import org.apache.hadoop.io.LongWritable 4 | import org.scalatest.{FlatSpec, Matchers} 5 | 6 | class ToHexSpec extends FlatSpec with Matchers { 7 | val udf = new ToHex 8 | 9 | "ToHex.evaluate" should "return a valid hex string" in { 10 | val data = Array( 11 | (234908234222L, "0x36b19f31ee"), 12 | (0L, "0x0"), 13 | (Long.MaxValue, "0x7fffffffffffffff"), 14 | (-10L, "0xfffffffffffffff6") 15 | ) 16 | 17 | for ((input, expected) <- data) { 18 | udf.evaluate(new LongWritable(input)) should be (expected) 19 | } 20 | } 21 | 22 | it should "return an empty string for a null input" in { 23 | udf.evaluate(null) should be ("") 24 | } 25 | } 26 | --------------------------------------------------------------------------------