├── .editorconfig ├── .gitignore ├── CHANGELOG.md ├── LICENSE.md ├── README.md ├── bin └── activator ├── build.sbt ├── libexec └── activator-launch-1.3.9.jar ├── project ├── build.properties └── plugins.sbt └── src ├── main ├── java │ └── com │ │ └── ardentex │ │ └── spark │ │ └── hiveudf │ │ └── FormatCurrency.java └── scala │ └── com │ └── ardentex │ └── spark │ └── hiveudf │ ├── FormatTimestamp.scala │ └── ToHex.scala └── test └── scala └── com └── ardentex └── spark └── hiveudf ├── FormatCurrencySpec.scala ├── FormatTimestampSpec.scala └── ToHexSpec.scala /.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig helps developers define and maintain consistent 2 | # coding styles between different editors and IDEs 3 | # editorconfig.org 4 | 5 | root = true 6 | 7 | 8 | [*] 9 | end_of_line = lf 10 | charset = utf-8 11 | trim_trailing_whitespace = true 12 | insert_final_newline = true 13 | indent_style = space 14 | indent_size = 2 15 | 16 | [*.{diff,md}] 17 | trim_trailing_whitespace = false 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /RUNNING_PID 2 | /logs/ 3 | /project/*-shim.sbt 4 | /project/project/ 5 | /project/target/ 6 | /target/ 7 | /.idea* 8 | /*.iml 9 | /metastore_db 10 | /derby.log 11 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log for spark-hive-udf 2 | 3 | Version 0.1.0 4 | 5 | - Updated to compile against both Scala 2.10 and 2.11. 6 | - Verified against Spark 2.1. 7 | - Made Scala UDFs more idiomatic. 8 | 9 | Version 0.0.1 10 | 11 | - Initial release 12 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | License 2 | ======= 3 | 4 | This software is released under BSD license, adapted from 5 | 6 | 7 | --- 8 | 9 | Copyright © 2016-2017, Brian M. Clapper. 10 | All rights reserved. 11 | 12 | Redistribution and use in source and binary forms, with or without 13 | modification, are permitted provided that the following conditions are met: 14 | 15 | * Redistributions of source code must retain the above copyright notice, 16 | this list of conditions and the following disclaimer. 17 | 18 | * Redistributions in binary form must reproduce the above copyright notice, 19 | this list of conditions and the following disclaimer in the documentation 20 | and/or other materials provided with the distribution. 21 | 22 | * Neither the names "clapper.org" nor the names of any contributors may 23 | be used to endorse or promote products derived from this software 24 | without specific prior written permission. 25 | 26 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 27 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 30 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 31 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 32 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 33 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 34 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 35 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 36 | POSSIBILITY OF SUCH DAMAGE. 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sample Hive UDF project 2 | 3 | ## Introduction 4 | 5 | This project is just an example, containing several 6 | [Hive User Defined Functions][] (UDFs), for use in Apache Spark. It's 7 | intended to demonstrate how to build a Hive UDF in Scala or Java and use it 8 | within [Apache Spark][]. 9 | 10 | ## Why use a Hive UDF? 11 | 12 | One especially good use of Hive UDFs is with Python and DataFrames. 13 | Native Spark UDFs written in Python are slow, because they have to be 14 | executed in a Python process, rather than a JVM-based Spark Executor. 15 | For a Spark Executor to run a Python UDF, it must: 16 | 17 | * send data from the partition over to a Python process associated with 18 | the Executor, and 19 | * wait for the Python process to deserialize the data, run the UDF on it, 20 | reserialize the data, and send it back. 21 | 22 | By contrast, a Hive UDF, whether written in Scala or Java, can be executed 23 | in the Executor JVM, _even if the DataFrame logic is in Python_. 24 | 25 | There's really only one drawback: a Hive UDF _must_ be invoked via SQL. 26 | You can't call it as a function from the DataFrame API. 27 | 28 | **NOTE** It is also possible to do something similar, using native Spark 29 | UDFs. See and for some useful discussions. 30 | 31 | ## Building 32 | 33 | This project builds with [SBT][], but you don't have to download SBT. Just use 34 | the `activator` script in the `bin` subdirectory. To build the jar file, use 35 | this command: 36 | 37 | ``` 38 | $ bin/activator +jar 39 | ``` 40 | 41 | That command will download the dependencies (if they haven't already been 42 | downloaded), compile the code, run the unit tests, and create jar files for 43 | both Scala 2.10 and Scala 2.11. Those jars will be: 44 | 45 | * Scala 2.10: `target/scala-2.10/spark-hive-udf_2.10-0.1.0.jar` 46 | * Scala 2.11: `target/scala-2.11/spark-hive-udf_2.11-0.1.0.jar` 47 | 48 | ### Building with Maven 49 | 50 | Honestly, I'm not a big fan of Maven. I had a Maven `pom.xml` file here, but 51 | I got tired of maintaining two build files. Just use `activator`, as described 52 | above. 53 | 54 | ## Running in PySpark 55 | 56 | The following Python code demonstrates the UDFs in this package and assumes 57 | that you've packaged the code into `target/scala-2.11/spark-hive-udf_2.11-0.1.0.jar` 58 | and copied that jar to `/tmp`. 59 | 60 | These commands assume Spark local mode, but they should also work fine within 61 | a cluster manager like Spark Standalone or YARN. 62 | 63 | (You can also use Hive UDFs from Scala, by the way.) 64 | 65 | First, fire up PySpark: 66 | 67 | ``` 68 | $ pyspark --jars /tmp/spark-hive-udf_2.11-0.1.0.jar 69 | ``` 70 | 71 | At the PySpark prompt, enter the following. (If you're using IPython, 72 | `%paste` works best.) 
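One note before you run it: the `CREATE TEMPORARY FUNCTION ... AS 'class'` statements below load the UDF classes through Hive's function registry, so they need a `SparkSession` with Hive support. The stock `pyspark` shell normally gives you one. If you instead run the example as a standalone script, build the session yourself; here is a minimal sketch under that assumption (the app name is arbitrary, and the jar path is the `/tmp` location mentioned above).

```python
# Minimal sketch for a standalone script (not needed in the pyspark shell).
# Assumes a Spark 2.x build that includes the Hive libraries.
from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .appName("hive-udf-demo")  # arbitrary application name
         .config("spark.jars", "/tmp/spark-hive-udf_2.11-0.1.0.jar")  # the UDF jar
         .enableHiveSupport()       # needed for CREATE TEMPORARY FUNCTION ... AS '<class>'
         .getOrCreate())
```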
73 | 74 | **NOTE**: The following code assumes Spark 2.x. 75 | 76 | ```python 77 | from datetime import datetime 78 | from collections import namedtuple 79 | from decimal import Decimal 80 | 81 | Person = namedtuple('Person', ('first_name', 'last_name', 'birth_date', 'salary', 'children')) 82 | 83 | fmt = "%Y-%m-%d" 84 | 85 | people = [ 86 | Person('Joe', 'Smith', datetime.strptime("1993-10-20", fmt), 70000.0, 2), 87 | Person('Jenny', 'Harmon', datetime.strptime("1987-08-02", fmt), 94000.0, 1) 88 | ] 89 | 90 | # Replace spark.sparkContext with sc if you're using Spark 1.x. 91 | df = spark.sparkContext.parallelize(people).toDF() 92 | 93 | # Replace spark with sqlContext if you're using Spark 1.x. 94 | spark.sql("CREATE TEMPORARY FUNCTION to_hex AS 'com.ardentex.spark.hiveudf.ToHex'") 95 | spark.sql("CREATE TEMPORARY FUNCTION datestring AS 'com.ardentex.spark.hiveudf.FormatTimestamp'") 96 | spark.sql("CREATE TEMPORARY FUNCTION currency AS 'com.ardentex.spark.hiveudf.FormatCurrency'") 97 | 98 | # Replace createOrReplaceTempView with registerTempTable if you're using 99 | # Spark 1.x 100 | df.createOrReplaceTempView("people") 101 | df2 = spark.sql("SELECT first_name, last_name, datestring(birth_date, 'MMMM dd, yyyy') as birth_date, currency(salary, 'en_US') as salary, to_hex(children) as hex_children FROM people") 102 | ``` 103 | 104 | Then, take a look at the second DataFrame: 105 | 106 | ``` 107 | df2.show() 108 | 109 | +----------+---------+----------------+----------+------------+ 110 | |first_name|last_name| birth_date| salary|hex_children| 111 | +----------+---------+----------------+----------+------------+ 112 | | Joe| Smith|October 20, 1993|$70,000.00| 0x2| 113 | | Jenny| Harmon| August 02, 1987|$94,000.00| 0x1| 114 | +----------+---------+----------------+----------+------------+ 115 | ``` 116 | 117 | ## Running in spark-shell (Scala) 118 | 119 | First, fire up the Spark shell: 120 | 121 | ``` 122 | $ spark-shell --jars /tmp/spark-hive-udf_2.11-0.1.0.jar 123 | ``` 124 | 125 | At the Scala REPL prompt, type `:paste`, then copy and paste the following 126 | code followed by a Ctrl-D. 127 | 128 | **NOTE**: The following code assumes Spark 2.x. 129 | 130 | ```scala 131 | import java.sql.Timestamp 132 | import java.text.SimpleDateFormat 133 | import java.util.Date 134 | 135 | case class Person(firstName: String, lastName: String, birthDate: Timestamp, salary: Double, children: Int) 136 | 137 | val fmt = new SimpleDateFormat("yyyy-MM-dd") 138 | 139 | val people = Array( 140 | Person("Joe", "Smith", new Timestamp(fmt.parse("1993-10-20").getTime), 70000.0, 2), 141 | Person("Jenny", "Harmon", new Timestamp(fmt.parse("1987-08-02").getTime), 94000.0, 1) 142 | ) 143 | 144 | // Replace spark.sparkContext with sc if you're using Spark 1.x. 145 | val df = spark.createDataFrame(spark.sparkContext.parallelize(people)) 146 | 147 | // Replace spark with sqlContext if you're using Spark 1.x. 
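// The next three statements register the UDF classes (from the jar passed via
// --jars) under SQL-callable names. The function names (toHex, datestring,
// currency) are arbitrary; the quoted class names must match the classes in the
// jar, and the session must have Hive support. If registration fails, check
// which catalog is active: spark.conf.get("spark.sql.catalogImplementation")
// should report "hive" rather than "in-memory".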
148 | spark.sql("CREATE TEMPORARY FUNCTION toHex AS 'com.ardentex.spark.hiveudf.ToHex'") 149 | spark.sql("CREATE TEMPORARY FUNCTION datestring AS 'com.ardentex.spark.hiveudf.FormatTimestamp'") 150 | spark.sql("CREATE TEMPORARY FUNCTION currency AS 'com.ardentex.spark.hiveudf.FormatCurrency'") 151 | 152 | // Replace createOrReplaceTempView with registerTempTable if you're using 153 | // Spark 1.x 154 | df.createOrReplaceTempView("people") 155 | val df2 = spark.sql("SELECT firstName, lastName, datestring(birthDate, 'MMMM dd, yyyy') as birthDate, currency(salary, 'en_US') as salary, toHex(children) as hexChildren FROM people") 156 | ``` 157 | 158 | Then, take a look at the second DataFrame: 159 | 160 | ``` 161 | df2.show() 162 | 163 | +---------+--------+----------------+----------+-----------+ 164 | |firstName|lastName| birthDate| salary|hexChildren| 165 | +---------+--------+----------------+----------+-----------+ 166 | | Joe| Smith|October 20, 1993|$70,000.00| 0x2| 167 | | Jenny| Harmon| August 02, 1987|$94,000.00| 0x1| 168 | +---------+--------+----------------+----------+-----------+ 169 | ``` 170 | 171 | ## "Why did you write these things in Scala?" 172 | 173 | Because, after writing Scala for the last 7 years, I find Java annoying. But, 174 | I did include a Java UDF in this repo; take a look at the `FormatCurrency` UDF. 175 | The others are in Scala and, really, they're not hard to translate 176 | to Java. 177 | 178 | [Hive User Defined Functions]: https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF 179 | [Apache Spark]: http://spark.apache.org 180 | [SBT]: http://scala-sbt.org 181 | -------------------------------------------------------------------------------- /bin/activator: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ### ------------------------------- ### 4 | ### Helper methods for BASH scripts ### 5 | ### ------------------------------- ### 6 | 7 | realpath () { 8 | ( 9 | TARGET_FILE="$1" 10 | 11 | cd "$(dirname "$TARGET_FILE")" 12 | TARGET_FILE=$(basename "$TARGET_FILE") 13 | 14 | COUNT=0 15 | while [ -L "$TARGET_FILE" -a $COUNT -lt 100 ] 16 | do 17 | TARGET_FILE=$(readlink "$TARGET_FILE") 18 | cd "$(dirname "$TARGET_FILE")" 19 | TARGET_FILE=$(basename "$TARGET_FILE") 20 | COUNT=$(($COUNT + 1)) 21 | done 22 | 23 | if [ "$TARGET_FILE" == "." -o "$TARGET_FILE" == ".." ]; then 24 | cd "$TARGET_FILE" 25 | TARGET_FILEPATH= 26 | else 27 | TARGET_FILEPATH=/$TARGET_FILE 28 | fi 29 | 30 | # make sure we grab the actual windows zipPath, instead of cygwin's zipPath. 31 | if ! is_cygwin; then 32 | echo "$(pwd -P)/$TARGET_FILE" 33 | else 34 | echo $(cygwinpath "$(pwd -P)/$TARGET_FILE") 35 | fi 36 | ) 37 | } 38 | 39 | # TODO - Do we need to detect msys? 40 | 41 | # Uses uname to detect if we're in the odd cygwin environment. 42 | is_cygwin() { 43 | local os=$(uname -s) 44 | case "$os" in 45 | CYGWIN*) return 0 ;; 46 | *) return 1 ;; 47 | esac 48 | } 49 | 50 | # This can fix cygwin style /cygdrive paths so we get the 51 | # windows style paths. 52 | cygwinpath() { 53 | local file="$1" 54 | if is_cygwin; then 55 | echo $(cygpath -w $file) 56 | else 57 | echo $file 58 | fi 59 | } 60 | 61 | # Make something URI friendly 62 | make_url() { 63 | url="$1" 64 | local nospaces=${url// /%20} 65 | if is_cygwin; then 66 | echo "/${nospaces//\\//}" 67 | else 68 | echo "$nospaces" 69 | fi 70 | } 71 | 72 | # Detect if we should use JAVA_HOME or just try PATH. 
73 | get_java_cmd() { 74 | if [[ -n "$JAVA_HOME" ]] && [[ -x "$JAVA_HOME/bin/java" ]]; then 75 | echo "$JAVA_HOME/bin/java" 76 | else 77 | echo "java" 78 | fi 79 | } 80 | 81 | echoerr () { 82 | echo 1>&2 "$@" 83 | } 84 | vlog () { 85 | [[ $verbose || $debug ]] && echoerr "$@" 86 | } 87 | dlog () { 88 | [[ $debug ]] && echoerr "$@" 89 | } 90 | execRunner () { 91 | # print the arguments one to a line, quoting any containing spaces 92 | [[ $verbose || $debug ]] && echo "# Executing command line:" && { 93 | for arg; do 94 | if printf "%s\n" "$arg" | grep -q ' '; then 95 | printf "\"%s\"\n" "$arg" 96 | else 97 | printf "%s\n" "$arg" 98 | fi 99 | done 100 | echo "" 101 | } 102 | 103 | exec "$@" 104 | } 105 | addJava () { 106 | dlog "[addJava] arg = '$1'" 107 | java_args=( "${java_args[@]}" "$1" ) 108 | } 109 | addApp () { 110 | dlog "[addApp] arg = '$1'" 111 | sbt_commands=( "${app_commands[@]}" "$1" ) 112 | } 113 | addResidual () { 114 | dlog "[residual] arg = '$1'" 115 | residual_args=( "${residual_args[@]}" "$1" ) 116 | } 117 | addDebugger () { 118 | addJava "-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=$1" 119 | } 120 | addConfigOpts () { 121 | dlog "[addConfigOpts] arg = '$*'" 122 | for item in $* 123 | do 124 | addJava "$item" 125 | done 126 | } 127 | # a ham-fisted attempt to move some memory settings in concert 128 | # so they need not be messed around with individually. 129 | get_mem_opts () { 130 | local mem=${1:-1024} 131 | local meta=$(( $mem / 4 )) 132 | (( $meta > 256 )) || meta=256 133 | (( $meta < 1024 )) || meta=1024 134 | 135 | # default is to set memory options but this can be overridden by code section below 136 | memopts="-Xms${mem}m -Xmx${mem}m" 137 | if [[ "${java_version}" > "1.8" ]]; then 138 | extmemopts="-XX:MetaspaceSize=64m -XX:MaxMetaspaceSize=${meta}m" 139 | else 140 | extmemopts="-XX:PermSize=64m -XX:MaxPermSize=${meta}m" 141 | fi 142 | 143 | if [[ "${java_opts}" == *-Xmx* ]] || [[ "${java_opts}" == *-Xms* ]] || [[ "${java_opts}" == *-XX:MaxPermSize* ]] || [[ "${java_opts}" == *-XX:ReservedCodeCacheSize* ]] || [[ "${java_opts}" == *-XX:MaxMetaspaceSize* ]]; then 144 | # if we detect any of these settings in ${java_opts} we need to NOT output our settings. 145 | # The reason is the Xms/Xmx, if they don't line up, cause errors. 146 | memopts="" 147 | extmemopts="" 148 | fi 149 | 150 | echo "${memopts} ${extmemopts}" 151 | } 152 | require_arg () { 153 | local type="$1" 154 | local opt="$2" 155 | local arg="$3" 156 | if [[ -z "$arg" ]] || [[ "${arg:0:1}" == "-" ]]; then 157 | die "$opt requires <$type> argument" 158 | fi 159 | } 160 | is_function_defined() { 161 | declare -f "$1" > /dev/null 162 | } 163 | 164 | # If we're *not* running in a terminal, and we don't have any arguments, then we need to add the 'ui' parameter 165 | detect_terminal_for_ui() { 166 | [[ ! -t 0 ]] && [[ "${#residual_args}" == "0" ]] && { 167 | addResidual "ui" 168 | } 169 | # SPECIAL TEST FOR MAC 170 | [[ "$(uname)" == "Darwin" ]] && [[ "$HOME" == "$PWD" ]] && [[ "${#residual_args}" == "0" ]] && { 171 | echo "Detected MAC OSX launched script...." 172 | echo "Swapping to UI" 173 | addResidual "ui" 174 | } 175 | } 176 | 177 | # Processes incoming arguments and places them in appropriate global variables. called by the run method. 
178 | process_args () { 179 | while [[ $# -gt 0 ]]; do 180 | case "$1" in 181 | -h|-help) usage; exit 1 ;; 182 | -v|-verbose) verbose=1 && shift ;; 183 | -d|-debug) debug=1 && shift ;; 184 | -mem) require_arg integer "$1" "$2" && app_mem="$2" && shift 2 ;; 185 | -jvm-debug) 186 | if echo "$2" | grep -E ^[0-9]+$ > /dev/null; then 187 | addDebugger "$2" && shift 188 | else 189 | addDebugger 9999 190 | fi 191 | shift ;; 192 | -java-home) require_arg zipPath "$1" "$2" && java_cmd="$2/bin/java" && shift 2 ;; 193 | -D*) addJava "$1" && shift ;; 194 | -J*) addJava "${1:2}" && shift ;; 195 | *) addResidual "$1" && shift ;; 196 | esac 197 | done 198 | 199 | is_function_defined process_my_args && { 200 | myargs=("${residual_args[@]}") 201 | residual_args=() 202 | process_my_args "${myargs[@]}" 203 | } 204 | } 205 | 206 | # Actually runs the script. 207 | run() { 208 | # TODO - check for sane environment 209 | 210 | # process the combined args, then reset "$@" to the residuals 211 | process_args "$@" 212 | detect_terminal_for_ui 213 | set -- "${residual_args[@]}" 214 | argumentCount=$# 215 | 216 | #check for jline terminal fixes on cygwin 217 | if is_cygwin; then 218 | stty -icanon min 1 -echo > /dev/null 2>&1 219 | addJava "-Djline.terminal=jline.UnixTerminal" 220 | addJava "-Dsbt.cygwin=true" 221 | fi 222 | 223 | # run sbt 224 | execRunner "$java_cmd" \ 225 | "-Dactivator.home=$(make_url "$activator_home")" \ 226 | $(get_mem_opts $app_mem) \ 227 | ${java_opts[@]} \ 228 | ${java_args[@]} \ 229 | -jar "$app_launcher" \ 230 | "${app_commands[@]}" \ 231 | "${residual_args[@]}" 232 | 233 | local exit_code=$? 234 | if is_cygwin; then 235 | stty icanon echo > /dev/null 2>&1 236 | fi 237 | exit $exit_code 238 | } 239 | 240 | # Loads a configuration file full of default command line options for this script. 241 | loadConfigFile() { 242 | cat "$1" | sed '/^\#/d' 243 | } 244 | 245 | ### ------------------------------- ### 246 | ### Start of customized settings ### 247 | ### ------------------------------- ### 248 | usage() { 249 | cat < [options] 251 | 252 | Command: 253 | ui Start the Activator UI 254 | new [name] [template-id] Create a new project with [name] using template [template-id] 255 | list-templates Print all available template names 256 | -h | -help Print this message 257 | 258 | Options: 259 | -v | -verbose Make this runner chattier 260 | -d | -debug Set sbt log level to debug 261 | -mem Set memory options (default: $sbt_mem, which is $(get_mem_opts $sbt_mem)) 262 | -jvm-debug Turn on JVM debugging, open at the given port. 263 | 264 | # java version (default: java from PATH, currently $(java -version 2>&1 | grep version)) 265 | -java-home Alternate JAVA_HOME 266 | 267 | # jvm options and output control 268 | -Dkey=val Pass -Dkey=val directly to the java runtime 269 | -J-X Pass option -X directly to the java runtime 270 | (-J is stripped) 271 | 272 | # environment variables (read from context) 273 | JAVA_OPTS Environment variable, if unset uses "" 274 | SBT_OPTS Environment variable, if unset uses "" 275 | ACTIVATOR_OPTS Environment variable, if unset uses "" 276 | 277 | In the case of duplicated or conflicting options, the order above 278 | shows precedence: environment variables lowest, command line options highest. 
279 | EOM 280 | } 281 | 282 | ### ------------------------------- ### 283 | ### Main script ### 284 | ### ------------------------------- ### 285 | 286 | declare -a residual_args 287 | declare -a java_args 288 | declare -a app_commands 289 | declare -r real_script_path="$(realpath "$0")" 290 | declare -r activator_home="$(realpath "$(dirname "$(dirname "$real_script_path")")")" 291 | declare -r app_version="1.3.9" 292 | 293 | declare -r app_launcher="${activator_home}/libexec/activator-launch-${app_version}.jar" 294 | declare -r script_name=activator 295 | java_cmd=$(get_java_cmd) 296 | declare -r java_opts=( "${ACTIVATOR_OPTS[@]}" "${SBT_OPTS[@]}" "${JAVA_OPTS[@]}" "${java_opts[@]}" ) 297 | userhome="$HOME" 298 | if is_cygwin; then 299 | # cygwin sets home to something f-d up, set to real windows homedir 300 | userhome="$USERPROFILE" 301 | fi 302 | declare -r activator_user_home_dir="${userhome}/.activator" 303 | declare -r java_opts_config_home="${activator_user_home_dir}/activatorconfig.txt" 304 | declare -r java_opts_config_version="${activator_user_home_dir}/${app_version}/activatorconfig.txt" 305 | 306 | # Now check to see if it's a good enough version 307 | declare -r java_version=$("$java_cmd" -version 2>&1 | awk -F '"' '/version/ {print $2}') 308 | if [[ "$java_version" == "" ]]; then 309 | echo 310 | echo No java installations was detected. 311 | echo Please go to http://www.java.com/getjava/ and download 312 | echo 313 | exit 1 314 | elif [[ ! "$java_version" > "1.6" ]]; then 315 | echo 316 | echo The java installation you have is not up to date 317 | echo Activator requires at least version 1.6+, you have 318 | echo version $java_version 319 | echo 320 | echo Please go to http://www.java.com/getjava/ and download 321 | echo a valid Java Runtime and install before running Activator. 
322 | echo 323 | exit 1 324 | fi 325 | 326 | # if configuration files exist, prepend their contents to the java args so it can be processed by this runner 327 | # a "versioned" config trumps one on the top level 328 | if [[ -f "$java_opts_config_version" ]]; then 329 | addConfigOpts $(loadConfigFile "$java_opts_config_version") 330 | elif [[ -f "$java_opts_config_home" ]]; then 331 | addConfigOpts $(loadConfigFile "$java_opts_config_home") 332 | fi 333 | 334 | run "$@" 335 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | name := """spark-hive-udf""" 2 | version := "0.1.0" 3 | organization := "com.ardentex" 4 | 5 | scalaVersion := "2.11.11" 6 | scalacOptions ++= Seq("-unchecked", "-feature", "-deprecation") 7 | crossScalaVersions := Seq(scalaVersion.value, "2.10.6") 8 | 9 | libraryDependencies ++= Seq( 10 | "org.apache.hive" % "hive-exec" % "2.1.1" % Provided, 11 | "org.apache.hadoop" % "hadoop-core" % "1.2.1" % Provided, 12 | "org.scalatest" %% "scalatest" % "3.0.1" % Test 13 | ) 14 | 15 | // Without this repo, you might get a failure trying to resolve transitive 16 | // dependency org.pentaho:pentaho-aggdesigner-algorithm:5.1.5-jhyde 17 | resolvers += "conjars" at "http://conjars.org/repo" 18 | 19 | addCommandAlias("jar", ";test;package") 20 | -------------------------------------------------------------------------------- /libexec/activator-launch-1.3.9.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bmc/spark-hive-udf/168296b5220e82f233aab892c3b20f1aca37aa12/libexec/activator-launch-1.3.9.jar -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | #Activator-generated Properties 2 | #Sat Feb 27 09:54:57 EST 2016 3 | template.uuid=e17acfbb-1ff5-41f5-b8cf-2c40be6a8340 4 | sbt.version=0.13.7 5 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.1") 2 | addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0") 3 | -------------------------------------------------------------------------------- /src/main/java/com/ardentex/spark/hiveudf/FormatCurrency.java: -------------------------------------------------------------------------------- 1 | package com.ardentex.spark.hiveudf; 2 | 3 | import org.apache.hadoop.hive.ql.exec.UDF; 4 | import org.apache.hadoop.io.DoubleWritable; 5 | 6 | import java.text.NumberFormat; 7 | import java.util.Locale; 8 | 9 | /** 10 | * This UDF takes a double and converts it to a currency string. A decimal 11 | * type is more suited to money than a double, but Hadoop's IO formats don't 12 | * seem to support decimal. 13 | */ 14 | public class FormatCurrency extends UDF { 15 | 16 | /** The actual conversion routine. 17 | * 18 | * @param n the double 19 | * @param localeName the locale name string (e.g., "en_US") to use to format 20 | * the string, or null to use the default locale.
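 * (For example, evaluate(1234.56, "en_US") produces "$1,234.56", while a
 * null locale name falls back to the JVM's default locale.)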
21 | * 22 | * @return the formatted string 23 | */ 24 | public String evaluate(Double n, String localeName) { 25 | Locale locale; 26 | if (localeName == null) 27 | locale = Locale.getDefault(); 28 | else { 29 | String[] pieces = localeName.split("_"); 30 | if (pieces.length != 2) 31 | locale = Locale.getDefault(); 32 | else 33 | locale = new Locale(pieces[0], pieces[1]); 34 | } 35 | 36 | NumberFormat fmt = NumberFormat.getCurrencyInstance(locale); 37 | 38 | if (n == null) { 39 | return ""; 40 | } 41 | else { 42 | return fmt.format(n.doubleValue()); 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/com/ardentex/spark/hiveudf/FormatTimestamp.scala: -------------------------------------------------------------------------------- 1 | package com.ardentex.spark.hiveudf 2 | 3 | import java.sql.Timestamp 4 | import java.text.SimpleDateFormat 5 | import java.util.Date 6 | 7 | import org.apache.hadoop.hive.ql.exec.UDF 8 | 9 | /** This UDF takes a SQL Timestamp and converts it to a string, using a 10 | * Java `SimpleDateFormat` string to dictate the format. 11 | */ 12 | class FormatTimestamp extends UDF { 13 | 14 | def evaluate(t: Timestamp, fmt: String): String = { 15 | val optRes = 16 | for { ts <- Option(t) // null check 17 | f <- Option(fmt) } // null check 18 | yield try { 19 | val formatter = new SimpleDateFormat(fmt) 20 | formatter.format(new Date(t.getTime)) 21 | } 22 | catch { 23 | // Bad format. Return Timestamp.toString. (We could return 24 | // an error message, as well, but this is fine for now.) 25 | case _: IllegalArgumentException => 26 | t.toString 27 | } 28 | 29 | optRes.getOrElse("") 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/scala/com/ardentex/spark/hiveudf/ToHex.scala: -------------------------------------------------------------------------------- 1 | package com.ardentex.spark.hiveudf 2 | 3 | import org.apache.hadoop.hive.ql.exec.UDF 4 | import org.apache.hadoop.io.LongWritable 5 | 6 | /** This UDF takes a long integer and converts it to a hexadecimal string. 7 | */ 8 | class ToHex extends UDF { 9 | 10 | def evaluate(n: LongWritable): String = { 11 | Option(n) 12 | .map { num => 13 | // Use Scala string interpolation. It's the easiest way, and it's 14 | // type-safe, unlike String.format().
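        // The %x conversion prints the long's two's-complement bits as
        // lowercase hex, so 255L becomes "0xff" and -10L becomes
        // "0xfffffffffffffff6" (see ToHexSpec).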
15 | f"0x${num.get}%x" 16 | } 17 | .getOrElse("") 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/test/scala/com/ardentex/spark/hiveudf/FormatCurrencySpec.scala: -------------------------------------------------------------------------------- 1 | package com.ardentex.spark.hiveudf 2 | 3 | import java.util.Locale 4 | 5 | import org.apache.hadoop.io.{DoubleWritable, FloatWritable, LongWritable} 6 | import org.scalatest.{FlatSpec, Matchers} 7 | 8 | class FormatCurrencySpec extends FlatSpec with Matchers { 9 | val udf = new FormatCurrency 10 | 11 | def doTest(data: Array[(Double, String)], lang: String): Unit = { 12 | for ((input, expected) <- data) 13 | //udf.evaluate(new DoubleWritable(input),lang) should be (expected) 14 | udf.evaluate(input,lang) should be (expected) 15 | } 16 | 17 | "FormatCurrency" should "return a valid currency string for the US locale" in { 18 | val data = Array( 19 | (2999100.01, "$2,999,100.01"), 20 | (.11, "$0.11"), 21 | (999.0, "$999.00"), 22 | (1122.0, "$1,122.00") 23 | ) 24 | 25 | doTest(data, "en_US") 26 | } 27 | 28 | it should "return a valid currency string for the en_GB locale" in { 29 | val data = Array( 30 | (2999100.01, "£2,999,100.01"), 31 | (.11, "£0.11"), 32 | (999.0, "£999.00"), 33 | (1122.0, "£1,122.00") 34 | ) 35 | 36 | doTest(data, "en_GB") 37 | } 38 | 39 | it should "return a currency string in the default locale for a bad locale string" in { 40 | withDefaultLocale("fr", "FR") { 41 | val data = Array( 42 | (2999100.01, "2 999 100,01 €"), 43 | (.11, "0,11 €"), 44 | (999.0, "999,00 €"), 45 | (1122.0, "1 122,00 €") 46 | ) 47 | 48 | doTest(data, "nnyy") 49 | } 50 | } 51 | 52 | it should "return a currency string in the default locale if no locale is specified" in { 53 | withDefaultLocale("se", "SE") { 54 | val data = Array( 55 | (2999100.01, "SEK 2,999,100.01"), 56 | (.11, "SEK 0.11"), 57 | (999.0, "SEK 999.00"), 58 | (1122.0, "SEK 1,122.00") 59 | ) 60 | 61 | doTest(data, null) 62 | } 63 | } 64 | 65 | it should "return a valid currency string for the jp_JP locale" in { 66 | val data = Array( 67 | (2999100.01, "JPY 2,999,100"), 68 | (.11, "JPY 0"), 69 | (999.0, "JPY 999"), 70 | (1122.0, "JPY 1,122") 71 | ) 72 | 73 | doTest(data, "jp_JP") 74 | } 75 | 76 | 77 | it should "return an empty string for a null input" in { 78 | udf.evaluate(null, null) should be ("") 79 | } 80 | 81 | private def withDefaultLocale(lang: String, variant: String) 82 | (code: => Unit): Unit = { 83 | val locale = Locale.getDefault 84 | val newLocale = new Locale(lang, variant) 85 | Locale.setDefault(newLocale) 86 | 87 | try { 88 | code 89 | } 90 | 91 | finally { 92 | Locale.setDefault(locale) 93 | } 94 | 95 | } 96 | } 97 | 98 | 99 | -------------------------------------------------------------------------------- /src/test/scala/com/ardentex/spark/hiveudf/FormatTimestampSpec.scala: -------------------------------------------------------------------------------- 1 | package com.ardentex.spark.hiveudf 2 | 3 | import java.sql.Timestamp 4 | import java.text.SimpleDateFormat 5 | import java.util.Date 6 | 7 | import org.scalatest.{FlatSpec, Matchers} 8 | 9 | class FormatTimestampSpec extends FlatSpec with Matchers { 10 | val udf = new FormatTimestamp 11 | val timestampParser = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss") 12 | 13 | "FormatTimestamp.evaluate" should "properly format a timestamp" in { 14 | val format = "yyyy/MMM/dd hh:mm a" 15 | val data = Array( 16 | ("2013-01-29 23:49:03", "2013/Jan/29 11:49 PM"), 17 | ("1941-09-03 
10:01:53", "1941/Sep/03 10:01 AM"), 18 | ("1888-07-01 01:01:59", "1888/Jul/01 01:01 AM") 19 | ) 20 | 21 | for ((input, expected) <- data) { 22 | val ts = new Timestamp(timestampParser.parse(input).getTime) 23 | udf.evaluate(ts, format) should be (expected) 24 | } 25 | } 26 | 27 | it should "return an empty string when the timestamp is null" in { 28 | udf.evaluate(null, "yyyy-MM-dd") should be ("") 29 | } 30 | 31 | it should "return an empty string when the format is null" in { 32 | udf.evaluate(new Timestamp((new Date).getTime), null) should be ("") 33 | } 34 | 35 | it should "return Timestamp.toString when the format is bad" in { 36 | val ts = new Timestamp((new Date).getTime) 37 | udf.evaluate(ts, "bad format") should be (ts.toString) 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/test/scala/com/ardentex/spark/hiveudf/ToHexSpec.scala: -------------------------------------------------------------------------------- 1 | package com.ardentex.spark.hiveudf 2 | 3 | import org.apache.hadoop.io.LongWritable 4 | import org.scalatest.{FlatSpec, Matchers} 5 | 6 | class ToHexSpec extends FlatSpec with Matchers { 7 | val udf = new ToHex 8 | 9 | "ToHex.evaluate" should "return a valid hex string" in { 10 | val data = Array( 11 | (234908234222L, "0x36b19f31ee"), 12 | (0L, "0x0"), 13 | (Long.MaxValue, "0x7fffffffffffffff"), 14 | (-10L, "0xfffffffffffffff6") 15 | ) 16 | 17 | for ((input, expected) <- data) { 18 | udf.evaluate(new LongWritable(input)) should be (expected) 19 | } 20 | } 21 | 22 | it should "return an empty string for a null input" in { 23 | udf.evaluate(null) should be ("") 24 | } 25 | } 26 | --------------------------------------------------------------------------------