├── hive-udf ├── src │ └── main │ │ └── java │ │ └── com │ │ └── cloudera │ │ └── fce │ │ └── curtis │ │ └── sparkudfexamples │ │ └── hiveudf │ │ └── CTOF.java ├── hive-udf-example.py └── pom.xml ├── data ├── inventory.json └── temperatures.json ├── python-udf └── python-udf-example.py ├── scala-udaf-from-python ├── scala-udaf-from-python.py ├── src │ └── main │ │ └── scala │ │ └── com │ │ └── cloudera │ │ └── fce │ │ └── curtis │ │ └── sparkudfexamples │ │ └── scalaudaffrompython │ │ └── ScalaUDAFFromPythonExample.scala └── pom.xml ├── scala-udf ├── src │ └── main │ │ └── scala │ │ └── com │ │ └── cloudera │ │ └── fce │ │ └── curtis │ │ └── sparkudfexamples │ │ └── scalaudf │ │ └── ScalaUDFExample.scala └── pom.xml ├── java-udf ├── src │ └── main │ │ └── java │ │ └── com │ │ └── cloudera │ │ └── fce │ │ └── curtis │ │ └── sparkudfexamples │ │ └── javaudf │ │ └── JavaUDFExample.java └── pom.xml ├── README.md └── scala-udaf ├── src └── main │ └── scala │ └── com │ └── cloudera │ └── fce │ └── curtis │ └── sparkudfexamples │ └── scalaudaf │ └── ScalaUDAFExample.scala └── pom.xml /hive-udf/src/main/java/com/cloudera/fce/curtis/sparkudfexamples/hiveudf/CTOF.java: -------------------------------------------------------------------------------- 1 | package com.cloudera.fce.curtis.sparkudfexamples.hiveudf; 2 | 3 | import org.apache.hadoop.hive.ql.exec.UDF; 4 | 5 | /** Hive UDF that converts a temperature from degrees Celsius to degrees Fahrenheit. */ 6 | public class CTOF extends UDF { 7 | /** 8 | * @param degreesCelsius temperature in degrees Celsius; may be null (SQL NULL) 9 | * @return the temperature in degrees Fahrenheit, or null when the input is null 10 | */ 11 | public Double evaluate(Double degreesCelsius) { 12 | if (degreesCelsius == null) { 13 | return null; // propagate SQL NULL instead of throwing on auto-unboxing 14 | } 15 | return ((degreesCelsius * 9.0 / 5.0) + 32.0); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /data/inventory.json: -------------------------------------------------------------------------------- 1 | {"Make":"Honda","Model":"Pilot","RetailValue":32145.0,"Stock":4} 2 | {"Make":"Honda","Model":"Civic","RetailValue":19575.0,"Stock":11} 3 | {"Make":"Honda","Model":"Ridgeline","RetailValue":42870.0,"Stock":2} 4 | {"Make":"Jeep","Model":"Cherokee","RetailValue":23595.0,"Stock":13} 
5 | {"Make":"Jeep","Model":"Wrangler","RetailValue":27895.0,"Stock":4} 6 | {"Make":"Volkswagen","Model":"Passat","RetailValue":22440.0,"Stock":2} 7 | -------------------------------------------------------------------------------- /python-udf/python-udf-example.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | spark = SparkSession.builder.appName("Python UDF example").getOrCreate() 4 | 5 | df = spark.read.json("temperatures.json") 6 | df.createOrReplaceTempView("citytemps") 7 | 8 | # Register the UDF with our SparkSession 9 | spark.udf.register("CTOF", lambda degreesCelsius: ((degreesCelsius * 9.0 / 5.0) + 32.0)) 10 | 11 | spark.sql("SELECT city, CTOF(avgLow) AS avgLowF, CTOF(avgHigh) AS avgHighF FROM citytemps").show() 12 | -------------------------------------------------------------------------------- /scala-udaf-from-python/scala-udaf-from-python.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | spark = SparkSession.builder.appName("Scala UDAF from Python example").getOrCreate() 4 | 5 | df = spark.read.json("inventory.json") 6 | df.createOrReplaceTempView("inventory") 7 | 8 | spark.sparkContext._jvm.com.cloudera.fce.curtis.sparkudfexamples.scalaudaffrompython.ScalaUDAFFromPythonExample.registerUdf() 9 | 10 | spark.sql("SELECT Make, SUMPRODUCT(RetailValue,Stock) as InventoryValuePerMake FROM inventory GROUP BY Make").show() 11 | -------------------------------------------------------------------------------- /hive-udf/hive-udf-example.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkConf, SparkContext 2 | from pyspark.sql import HiveContext 3 | 4 | conf = SparkConf().setAppName("Hive UDF example") 5 | sc = SparkContext(conf=conf) 6 | sqlContext = HiveContext(sc) 7 | 8 | df = sqlContext.read.json("temperatures.json") 9 | 
df.registerTempTable("citytemps") 10 | 11 | # Register our Hive UDF 12 | sqlContext.sql("CREATE TEMPORARY FUNCTION CTOF AS 'com.cloudera.fce.curtis.sparkudfexamples.hiveudf.CTOF'") 13 | 14 | sqlContext.sql("SELECT city, CTOF(avgLow) AS avgLowF, CTOF(avgHigh) AS avgHighF FROM citytemps").show() 15 | -------------------------------------------------------------------------------- /data/temperatures.json: -------------------------------------------------------------------------------- 1 | {"city":"St. John's","avgHigh":8.7,"avgLow":0.6} 2 | {"city":"Charlottetown","avgHigh":9.7,"avgLow":0.9} 3 | {"city":"Halifax","avgHigh":11.0,"avgLow":1.6} 4 | {"city":"Fredericton","avgHigh":11.2,"avgLow":-0.5} 5 | {"city":"Quebec","avgHigh":9.0,"avgLow":-1.0} 6 | {"city":"Montreal","avgHigh":11.1,"avgLow":1.4} 7 | {"city":"Ottawa","avgHigh":10.9,"avgLow":1.1} 8 | {"city":"Toronto","avgHigh":12.5,"avgLow":2.5} 9 | {"city":"Winnipeg","avgHigh":8.3,"avgLow":-3.1} 10 | {"city":"Regina","avgHigh":9.1,"avgLow":-3.4} 11 | {"city":"Edmonton","avgHigh":8.5,"avgLow":-3.8} 12 | {"city":"Calgary","avgHigh":10.5,"avgLow":-2.4} 13 | {"city":"Vancouver","avgHigh":13.7,"avgLow":6.5} 14 | {"city":"Victoria","avgHigh":14.1,"avgLow":5.3} 15 | {"city":"Whitehorse","avgHigh":4.5,"avgLow":-5.9} 16 | {"city":"Yellowknife","avgHigh":-0.2,"avgLow":-9.0} 17 | -------------------------------------------------------------------------------- /scala-udf/src/main/scala/com/cloudera/fce/curtis/sparkudfexamples/scalaudf/ScalaUDFExample.scala: -------------------------------------------------------------------------------- 1 | package com.cloudera.fce.curtis.sparkudfexamples.scalaudf 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.SparkConf 5 | 6 | object ScalaUDFExample { 7 | def main(args: Array[String]) { 8 | val conf = new SparkConf().setAppName("Scala UDF Example") 9 | val spark = SparkSession.builder().enableHiveSupport().config(conf).getOrCreate() 10 | 11 | val ds = 
spark.read.json("temperatures.json") 12 | ds.createOrReplaceTempView("citytemps") 13 | 14 | // Register the UDF with our SparkSession 15 | spark.udf.register("CTOF", (degreesCelsius: Double) => ((degreesCelsius * 9.0 / 5.0) + 32.0)) 16 | 17 | spark.sql("SELECT city, CTOF(avgLow) AS avgLowF, CTOF(avgHigh) AS avgHighF FROM citytemps").show() 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /java-udf/src/main/java/com/cloudera/fce/curtis/sparkudfexamples/javaudf/JavaUDFExample.java: -------------------------------------------------------------------------------- 1 | package com.cloudera.fce.curtis.sparkudfexamples.javaudf; 2 | 3 | import org.apache.spark.api.java.*; 4 | import org.apache.spark.SparkConf; 5 | import org.apache.spark.sql.*; 6 | import org.apache.spark.sql.api.java.UDF1; 7 | import org.apache.spark.sql.types.DataTypes; 8 | 9 | /** Registers a Java Celsius-to-Fahrenheit UDF and applies it to temperatures.json through Spark SQL. */ 10 | public class JavaUDFExample { 11 | public static void main(String[] args) { 12 | SparkConf conf = new SparkConf().setAppName("Java UDF Example"); 13 | SparkSession spark = SparkSession.builder().enableHiveSupport().config(conf).getOrCreate(); 14 | 15 | Dataset<Row> ds = spark.read().json("temperatures.json"); 16 | ds.createOrReplaceTempView("citytemps"); 17 | 18 | // Register the UDF with our SparkSession. The type arguments on UDF1 are required: 19 | // with a raw UDF1, call(Double) would not override the interface method. 20 | spark.udf().register("CTOF", new UDF1<Double, Double>() { 21 | @Override 22 | public Double call(Double degreesCelsius) { 23 | if (degreesCelsius == null) { 24 | return null; // SQL NULL in, SQL NULL out (avoids an unboxing NullPointerException) 25 | } 26 | return ((degreesCelsius * 9.0 / 5.0) + 32.0); 27 | } 28 | }, DataTypes.DoubleType); 29 | 30 | spark.sql("SELECT city, CTOF(avgLow) AS avgLowF, CTOF(avgHigh) AS avgHighF FROM citytemps").show(); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spark UDF Examples 2 | Simple examples of Spark SQL user-defined functions. 
Tested with CDH 5.13.1, Spark 2.1.0 (see the *spark1.6* branch for Spark 1.x examples) 3 | 4 | ### Load the sample data 5 | ``` 6 | hdfs dfs -put data/temperatures.json temperatures.json 7 | hdfs dfs -put data/inventory.json inventory.json 8 | ``` 9 | 10 | ### Build the Java and Scala examples 11 | Under each example root (java-udf/, scala-udf/, ...): 12 |
13 | ``` 14 | mvn package 15 | ``` 16 | 17 | ### Run them (each command from its example's root directory) 18 | Python UDF: 19 |
20 | ``` 21 | spark2-submit --master local python-udf-example.py 22 | ``` 23 | 24 | Scala UDF: 25 |
26 | ``` 27 | spark2-submit --class com.cloudera.fce.curtis.sparkudfexamples.scalaudf.ScalaUDFExample --master local target/scalaudf-0.0.1-jar-with-dependencies.jar 28 | ``` 29 | 30 | Java UDF: 31 |
32 | ``` 33 | spark2-submit --class com.cloudera.fce.curtis.sparkudfexamples.javaudf.JavaUDFExample --master local target/javaudf-0.0.1-jar-with-dependencies.jar 34 | ``` 35 | 36 | Scala UDAF: 37 |
38 | ``` 39 | spark2-submit --class com.cloudera.fce.curtis.sparkudfexamples.scalaudaf.ScalaUDAFExample --master local target/scalaudaf-0.0.1-jar-with-dependencies.jar 40 | ``` 41 | 42 | Hive UDF: 43 |
44 | ``` 45 | spark2-submit --jars target/hiveudf-0.0.1-jar-with-dependencies.jar hive-udf-example.py 46 | ``` 47 | 48 | Scala UDAF From PySpark: 49 |
50 | ``` 51 | spark2-submit --jars target/scalaudaffrompython-0.0.1.jar --driver-class-path target/scalaudaffrompython-0.0.1.jar scala-udaf-from-python.py 52 | ``` 53 | -------------------------------------------------------------------------------- /scala-udaf-from-python/src/main/scala/com/cloudera/fce/curtis/sparkudfexamples/scalaudaffrompython/ScalaUDAFFromPythonExample.scala: -------------------------------------------------------------------------------- 1 | package com.cloudera.fce.curtis.sparkudfexamples.scalaudaffrompython 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} 6 | import org.apache.spark.sql.types._ 7 | import org.apache.spark.sql.SparkSession 8 | 9 | object ScalaUDAFFromPythonExample { 10 | 11 | private class SumProductAggregateFunction extends UserDefinedAggregateFunction { // UDAF that sums price * quantity over the aggregated rows 12 | def inputSchema: StructType = // Input = (Double price, Long quantity) 13 | new StructType().add("price", DoubleType).add("quantity", LongType) 14 | def bufferSchema: StructType = // Aggregation buffer = (Double total) 15 | new StructType().add("total", DoubleType) 16 | def dataType: DataType = DoubleType // Type of the value returned by evaluate() 17 | def deterministic: Boolean = true 18 | 19 | def initialize(buffer: MutableAggregationBuffer): Unit = { 20 | buffer.update(0, 0.0) // Start the running total at 0.0 21 | } 22 | 23 | def update(buffer: MutableAggregationBuffer, input: Row): Unit = { 24 | val sum = buffer.getDouble(0) // Running total so far 25 | val price = input.getDouble(0) // First input column 26 | val qty = input.getLong(1) // Second input column 27 | buffer.update(0, sum + (price * qty)) // Add this row's product to the running total 28 | } 29 | 30 | def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { 31 | buffer1.update(0, buffer1.getDouble(0) + buffer2.getDouble(0)) // Combine two partial totals by adding them 32 | } 33 | 34 | def evaluate(buffer: Row): Any = { 35 | buffer.getDouble(0) // The final result is the accumulated total 36 | } 37 | } 38 | 39 | // This function is called from PySpark to register our UDAF 40 | def registerUdf() { 41 | import org.apache.spark.sql.SparkSession 42 | val spark = SparkSession.builder().getOrCreate() // getOrCreate(): reuses an existing session if there is one 43 | 
spark.udf.register("SUMPRODUCT", new SumProductAggregateFunction) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /scala-udaf/src/main/scala/com/cloudera/fce/curtis/sparkudfexamples/scalaudaf/ScalaUDAFExample.scala: -------------------------------------------------------------------------------- 1 | package com.cloudera.fce.curtis.sparkudfexamples.scalaudaf 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} 6 | import org.apache.spark.sql.types._ 7 | import org.apache.spark.sql.SparkSession 8 | 9 | object ScalaUDAFExample { 10 | 11 | // Define the SparkSQL UDAF logic 12 | private class SumProductAggregateFunction extends UserDefinedAggregateFunction { 13 | // Define the UDAF input, buffer and result schemas 14 | def inputSchema: StructType = // Input = (Double price, Long quantity) 15 | new StructType().add("price", DoubleType).add("quantity", LongType) 16 | def bufferSchema: StructType = // Buffer = (Double total) 17 | new StructType().add("total", DoubleType) 18 | def dataType: DataType = DoubleType // Result type of the UDAF 19 | def deterministic: Boolean = true // true: our UDAF's output given an input is deterministic 20 | 21 | def initialize(buffer: MutableAggregationBuffer): Unit = { 22 | buffer.update(0, 0.0) // Initialize the result to 0.0 23 | } 24 | 25 | def update(buffer: MutableAggregationBuffer, input: Row): Unit = { 26 | val sum = buffer.getDouble(0) // Intermediate result to be updated 27 | val price = input.getDouble(0) // First input parameter 28 | val qty = input.getLong(1) // Second input parameter 29 | buffer.update(0, sum + (price * qty)) // Update the intermediate result 30 | } 31 | // Merge intermediate result sums by adding them 32 | def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { 33 | buffer1.update(0, buffer1.getDouble(0) + buffer2.getDouble(0)) 34 | } 35 | // The final 
result will be contained in 'buffer' 36 | def evaluate(buffer: Row): Any = { 37 | buffer.getDouble(0) 38 | } 39 | } 40 | 41 | def main (args: Array[String]) { 42 | val conf = new SparkConf().setAppName("Scala UDAF Example") 43 | val spark = SparkSession.builder().enableHiveSupport().config(conf).getOrCreate() 44 | 45 | val testDF = spark.read.json("inventory.json") 46 | testDF.createOrReplaceTempView("inventory") 47 | // Register the UDAF with our SQLContext 48 | spark.udf.register("SUMPRODUCT", new SumProductAggregateFunction) 49 | 50 | spark.sql("SELECT Make, SUMPRODUCT(RetailValue,Stock) as InventoryValuePerMake FROM inventory GROUP BY Make").show() 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /hive-udf/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 16 | 17 | 4.0.0 18 | com.cloudera.fce.curtis.sparkudfexamples.hiveudf 19 | hiveudf 20 | 0.0.1 21 | jar 22 | "Hive UDF Example" 23 | 24 | 25 | 26 | cloudera-repos 27 | Cloudera Repos 28 | https://repository.cloudera.com/artifactory/cloudera-repos/ 29 | 30 | 31 | 32 | 33 | 34 | 35 | org.apache.maven.plugins 36 | maven-compiler-plugin 37 | 2.3.1 38 | 39 | 1.7 40 | 1.7 41 | 42 | 43 | 44 | maven-assembly-plugin 45 | 46 | 47 | jar-with-dependencies 48 | 49 | 50 | 51 | 52 | make-assembly 53 | package 54 | 55 | single 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | org.apache.hive 66 | hive-exec 67 | 1.2.1 68 | 69 | 70 | org.apache.hadoop 71 | hadoop-core 72 | 1.2.1 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /java-udf/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 16 | 17 | 4.0.0 18 | com.cloudera.fce.curtis.sparkudfexamples.javaudf 19 | javaudf 20 | 0.0.1 21 | jar 22 | "Java UDF Example" 23 | 24 | 25 | 26 | cloudera-repos 27 | Cloudera Repos 28 | https://repository.cloudera.com/artifactory/cloudera-repos/ 29 | 
30 | 31 | 32 | 33 | 34 | 35 | org.apache.maven.plugins 36 | maven-compiler-plugin 37 | 2.3.1 38 | 39 | 1.7 40 | 1.7 41 | 42 | 43 | 44 | maven-assembly-plugin 45 | 46 | 47 | jar-with-dependencies 48 | 49 | 50 | 51 | 52 | make-assembly 53 | package 54 | 55 | single 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | org.apache.spark 66 | spark-core_2.11 67 | 2.1.0.cloudera1 68 | 69 | 70 | org.apache.spark 71 | spark-sql_2.11 72 | 2.1.0.cloudera1 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /scala-udaf-from-python/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 16 | 17 | 4.0.0 18 | com.cloudera.fce.curtis.sparkudfexamples.scalaudaffrompython 19 | scalaudaffrompython 20 | 0.0.1 21 | jar 22 | "Scala UDAF from Python Example" 23 | 24 | 25 | 26 | scala-tools.org 27 | Scala-tools Maven2 Repository 28 | http://scala-tools.org/repo-releases 29 | 30 | 31 | maven-hadoop 32 | Hadoop Releases 33 | https://repository.cloudera.com/content/repositories/releases/ 34 | 35 | 36 | cloudera-repos 37 | Cloudera Repos 38 | https://repository.cloudera.com/artifactory/cloudera-repos/ 39 | 40 | 41 | 42 | 43 | 44 | scala-tools.org 45 | Scala-tools Maven2 Repository 46 | http://scala-tools.org/repo-releases 47 | 48 | 49 | 50 | 51 | UTF-8 52 | UTF-8 53 | 54 | 55 | 56 | 57 | 58 | org.scala-tools 59 | maven-scala-plugin 60 | 2.15.2 61 | 62 | 63 | 64 | compile 65 | 66 | 67 | 68 | 69 | 70 | maven-compiler-plugin 71 | 3.1 72 | 73 | 1.6 74 | 1.6 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | org.scala-lang 83 | scala-library 84 | 2.11.12 85 | 86 | 87 | org.apache.spark 88 | spark-core_2.11 89 | 2.1.0.cloudera1 90 | 91 | 92 | org.apache.spark 93 | spark-sql_2.11 94 | 2.1.0.cloudera1 95 | 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /scala-udf/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 16 | 17 | 
4.0.0 18 | com.cloudera.fce.curtis.sparkudfexamples.scalaudf 19 | scalaudf 20 | 0.0.1 21 | jar 22 | "Scala UDF Example" 23 | 24 | 25 | 26 | scala-tools.org 27 | Scala-tools Maven2 Repository 28 | http://scala-tools.org/repo-releases 29 | 30 | 31 | maven-hadoop 32 | Hadoop Releases 33 | https://repository.cloudera.com/content/repositories/releases/ 34 | 35 | 36 | cloudera-repos 37 | Cloudera Repos 38 | https://repository.cloudera.com/artifactory/cloudera-repos/ 39 | 40 | 41 | 42 | 43 | 44 | scala-tools.org 45 | Scala-tools Maven2 Repository 46 | http://scala-tools.org/repo-releases 47 | 48 | 49 | 50 | 51 | 52 | 53 | org.scala-tools 54 | maven-scala-plugin 55 | 2.15.2 56 | 57 | 58 | 59 | compile 60 | 61 | 62 | 63 | 64 | 65 | maven-compiler-plugin 66 | 2.3.1 67 | 68 | 1.7 69 | 1.7 70 | 71 | 72 | 73 | maven-assembly-plugin 74 | 75 | 76 | jar-with-dependencies 77 | 78 | 79 | 80 | 81 | make-assembly 82 | package 83 | 84 | single 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | org.scala-lang 95 | scala-library 96 | 2.11.12 97 | 98 | 99 | org.apache.spark 100 | spark-core_2.11 101 | 2.1.0.cloudera1 102 | 103 | 104 | org.apache.spark 105 | spark-sql_2.11 106 | 2.1.0.cloudera1 107 | 108 | 109 | 110 | 111 | -------------------------------------------------------------------------------- /scala-udaf/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 16 | 17 | 4.0.0 18 | com.cloudera.fce.curtis.sparkudfexamples.scalaudaf 19 | scalaudaf 20 | 0.0.1 21 | jar 22 | "Scala UDAF Example" 23 | 24 | 25 | 26 | scala-tools.org 27 | Scala-tools Maven2 Repository 28 | http://scala-tools.org/repo-releases 29 | 30 | 31 | maven-hadoop 32 | Hadoop Releases 33 | https://repository.cloudera.com/content/repositories/releases/ 34 | 35 | 36 | cloudera-repos 37 | Cloudera Repos 38 | https://repository.cloudera.com/artifactory/cloudera-repos/ 39 | 40 | 41 | 42 | 43 | 44 | scala-tools.org 45 | Scala-tools Maven2 Repository 46 | 
http://scala-tools.org/repo-releases 47 | 48 | 49 | 50 | 51 | UTF-8 52 | UTF-8 53 | 54 | 55 | 56 | 57 | 58 | org.scala-tools 59 | maven-scala-plugin 60 | 2.15.2 61 | 62 | 63 | 64 | compile 65 | 66 | 67 | 68 | 69 | 70 | maven-compiler-plugin 71 | 2.3.1 72 | 73 | 1.7 74 | 1.7 75 | 76 | 77 | 78 | maven-assembly-plugin 79 | 80 | 81 | jar-with-dependencies 82 | 83 | 84 | 85 | 86 | make-assembly 87 | package 88 | 89 | single 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | org.scala-lang 100 | scala-library 101 | 2.11.12 102 | 103 | 104 | org.apache.spark 105 | spark-core_2.11 106 | 2.1.0.cloudera1 107 | 108 | 109 | org.apache.spark 110 | spark-sql_2.11 111 | 2.1.0.cloudera1 112 | 113 | 114 | 115 | 116 | --------------------------------------------------------------------------------