├── .gitignore ├── README.md ├── people.txt ├── pom.xml └── src ├── main └── java │ └── com │ └── matthewrathbone │ └── example │ ├── ComplexUDFExample.java │ ├── NameParserGenericUDTF.java │ ├── SimpleUDFExample.java │ └── TotalNumOfLettersGenericUDAF.java └── test └── java └── com └── matthewrathbone └── example ├── ComplexUDFExampleTest.java ├── NameParserGenericUDTFTest.java └── SimpleUDFExampleTest.java /.gitignore: -------------------------------------------------------------------------------- 1 | target/* -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hive UDF Examples 2 | 3 | This code accompanies [this article which walks through creating UDFs in Apache Hive][blog-post]. 4 | 5 | ## Compile 6 | 7 | ``` 8 | mvn compile 9 | ``` 10 | 11 | ## Test 12 | 13 | ``` 14 | mvn test 15 | ``` 16 | 17 | ## Build 18 | ``` 19 | mvn assembly:single 20 | ``` 21 | 22 | ## Run 23 | 24 | ``` 25 | %> hive 26 | hive> ADD JAR /path/to/assembled.jar; 27 | hive> create temporary function hello as 'com.matthewrathbone.example.SimpleUDFExample'; 28 | hive> select hello(firstname) from people limit 10; 29 | 30 | ``` 31 | 32 | [blog-post]:http://blog.matthewrathbone.com/2013/08/10/guide-to-writing-hive-udfs.html -------------------------------------------------------------------------------- /people.txt: -------------------------------------------------------------------------------- 1 | John Smith 2 | John and Ann White 3 | Ted Green 4 | Dorothy 5 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | org.apache.maven.plugins 9 | maven-surefire-plugin 10 | 2.8 11 | 12 | 13 | maven-assembly-plugin 14 | 15 | 16 | 17 | com.matthewrathbone.example.RawMapreduce 18 | 19 | 20 | 21 | jar-with-dependencies 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 4.0.0 30 | com.matthewrathbone.example 31 | hive-extensions 32 | jar 33 | 1.0-SNAPSHOT 34 | hive-extensions 35 | http://maven.apache.org 36 | 37 | 38 | org.apache.hadoop 39 | hadoop-client 40 | 2.0.0-mr1-cdh4.3.1 41 | provided 42 | 43 | 44 | org.apache.hive 45 | hive-exec 46 | 0.10.0-cdh4.3.1 47 | provided 48 | 49 | 50 | 51 | org.apache.commons 52 | commons-io 53 | 1.3.2 54 | test 55 | 56 | 57 | commons-httpclient 58 | commons-httpclient 59 | 3.1 60 | test 61 | 62 | 63 | org.apache.hadoop 64 | hadoop-test 65 | 2.0.0-mr1-cdh4.1.2 66 | test 67 | 68 | 69 | junit 70 | junit 71 | 4.8.2 72 | test 73 | 74 | 75 | 76 | 77 | cloudera 78 | https://repository.cloudera.com/artifactory/cloudera-repos/ 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /src/main/java/com/matthewrathbone/example/ComplexUDFExample.java: -------------------------------------------------------------------------------- 1 | package com.matthewrathbone.example; 2 | 3 | import java.util.List; 4 | 5 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException; 6 | import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; 7 | import org.apache.hadoop.hive.ql.metadata.HiveException; 8 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; 9 | import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; 10 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 11 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; 12 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; 13 | 14 | class ComplexUDFExample extends GenericUDF { 15 | 16 | ListObjectInspector listOI; 17 | StringObjectInspector elementOI; 18 | 19 | @Override 20 | public String getDisplayString(String[] arg0) { 21 | return "arrayContainsExample()"; // this should probably be better 22 | } 23 | 24 | @Override 25 | public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { 26 | if (arguments.length != 2) { 27 | throw new UDFArgumentLengthException("arrayContainsExample only takes 2 arguments: List, T"); 28 | } 29 | // 1. Check we received the right object types. 30 | ObjectInspector a = arguments[0]; 31 | ObjectInspector b = arguments[1]; 32 | if (!(a instanceof ListObjectInspector) || !(b instanceof StringObjectInspector)) { 33 | throw new UDFArgumentException("first argument must be a list / array, second argument must be a string"); 34 | } 35 | this.listOI = (ListObjectInspector) a; 36 | this.elementOI = (StringObjectInspector) b; 37 | 38 | // 2. Check that the list contains strings 39 | if(!(listOI.getListElementObjectInspector() instanceof StringObjectInspector)) { 40 | throw new UDFArgumentException("first argument must be a list of strings"); 41 | } 42 | 43 | // the return type of our function is a boolean, so we provide the correct object inspector 44 | return PrimitiveObjectInspectorFactory.javaBooleanObjectInspector; 45 | } 46 | 47 | @Override 48 | public Object evaluate(DeferredObject[] arguments) throws HiveException { 49 | 50 | // get the list and string from the deferred objects using the object inspectors 51 | List list = (List) this.listOI.getList(arguments[0].get()); 52 | String arg = elementOI.getPrimitiveJavaObject(arguments[1].get()); 53 | 54 | // check for nulls 55 | if (list == null || arg == null) { 56 | return null; 57 | } 58 | 59 | // see if our list contains the value we need 60 | for(String s: list) { 61 | if (arg.equals(s)) return new Boolean(true); 62 | } 63 | return new Boolean(false); 64 | } 65 | 66 | } -------------------------------------------------------------------------------- /src/main/java/com/matthewrathbone/example/NameParserGenericUDTF.java: -------------------------------------------------------------------------------- 1 | package com.matthewrathbone.example; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Iterator; 5 | import java.util.List; 6 | 7 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException; 8 | import org.apache.hadoop.hive.ql.metadata.HiveException; 9 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF; 10 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 11 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; 12 | import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; 13 | import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; 14 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; 15 | 16 | public class NameParserGenericUDTF extends GenericUDTF { 17 | private PrimitiveObjectInspector stringOI = null; 18 | 19 | @Override 20 | public StructObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException { 21 | if (args.length != 1) { 22 | throw new UDFArgumentException("NameParserGenericUDTF() takes exactly one argument"); 23 | } 24 | 25 | if (args[0].getCategory() != ObjectInspector.Category.PRIMITIVE 26 | && ((PrimitiveObjectInspector) args[0]).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.STRING) { 27 | throw new UDFArgumentException("NameParserGenericUDTF() takes a string as a parameter"); 28 | } 29 | 30 | // input 31 | stringOI = (PrimitiveObjectInspector) args[0]; 32 | 33 | // output 34 | List fieldNames = new ArrayList(2); 35 | List fieldOIs = new ArrayList(2); 36 | fieldNames.add("name"); 37 | fieldNames.add("surname"); 38 | fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector); 39 | fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector); 40 | return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs); 41 | } 42 | 43 | public ArrayList processInputRecord(String name){ 44 | ArrayList result = new ArrayList(); 45 | 46 | // ignoring null or empty input 47 | if (name == null || name.isEmpty()) { 48 | return result; 49 | } 50 | 51 | String[] tokens = name.split("\\s+"); 52 | 53 | if (tokens.length == 2){ 54 | result.add(new Object[] { tokens[0], tokens[1] }); 55 | }else if (tokens.length == 4 && tokens[1].equals("and")){ 56 | result.add(new Object[] { tokens[0], tokens[3] }); 57 | result.add(new Object[] { tokens[2], tokens[3] }); 58 | } 59 | 60 | return result; 61 | } 62 | 63 | @Override 64 | public void process(Object[] record) throws HiveException { 65 | final String name = stringOI.getPrimitiveJavaObject(record[0]).toString(); 66 | ArrayList results = processInputRecord(name); 67 | 68 | Iterator it = results.iterator(); 69 | 70 | while (it.hasNext()){ 71 | Object[] r = it.next(); 72 | forward(r); 73 | } 74 | } 75 | 76 | @Override 77 | public void close() throws HiveException { 78 | // do nothing 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/main/java/com/matthewrathbone/example/SimpleUDFExample.java: -------------------------------------------------------------------------------- 1 | package com.matthewrathbone.example; 2 | 3 | import org.apache.hadoop.hive.ql.exec.Description; 4 | import org.apache.hadoop.hive.ql.exec.UDF; 5 | import org.apache.hadoop.io.Text; 6 | 7 | 8 | @Description( 9 | name="SimpleUDFExample", 10 | value="returns 'hello x', where x is whatever you give it (STRING)", 11 | extended="SELECT simpleudfexample('world') from foo limit 1;" 12 | ) 13 | class SimpleUDFExample extends UDF { 14 | 15 | public Text evaluate(Text input) { 16 | if(input == null) return null; 17 | return new Text("Hello " + input.toString()); 18 | } 19 | } -------------------------------------------------------------------------------- /src/main/java/com/matthewrathbone/example/TotalNumOfLettersGenericUDAF.java: -------------------------------------------------------------------------------- 1 | package com.matthewrathbone.example; 2 | 3 | import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; 4 | import org.apache.hadoop.hive.ql.metadata.HiveException; 5 | import org.apache.hadoop.hive.ql.parse.SemanticException; 6 | import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver; 7 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; 8 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 9 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; 10 | import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; 11 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.ObjectInspectorOptions; 12 | import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; 13 | import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; 14 | import org.apache.hadoop.hive.ql.exec.Description; 15 | 16 | @Description(name = "letters", value = "_FUNC_(expr) - Returns total number of letters in all the strings of a column.") 17 | public class TotalNumOfLettersGenericUDAF extends AbstractGenericUDAFResolver { 18 | 19 | @Override 20 | public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) 21 | throws SemanticException { 22 | if (parameters.length != 1) { 23 | throw new UDFArgumentTypeException(parameters.length - 1, 24 | "Exactly one argument is expected."); 25 | } 26 | 27 | ObjectInspector oi = TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(parameters[0]); 28 | 29 | if (oi.getCategory() != ObjectInspector.Category.PRIMITIVE){ 30 | throw new UDFArgumentTypeException(0, 31 | "Argument must be PRIMITIVE, but " 32 | + oi.getCategory().name() 33 | + " was passed."); 34 | } 35 | 36 | PrimitiveObjectInspector inputOI = (PrimitiveObjectInspector) oi; 37 | 38 | if (inputOI.getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.STRING){ 39 | throw new UDFArgumentTypeException(0, 40 | "Argument must be String, but " 41 | + inputOI.getPrimitiveCategory().name() 42 | + " was passed."); 43 | } 44 | 45 | return new TotalNumOfLettersEvaluator(); 46 | } 47 | 48 | public static class TotalNumOfLettersEvaluator extends GenericUDAFEvaluator { 49 | 50 | PrimitiveObjectInspector inputOI; 51 | ObjectInspector outputOI; 52 | PrimitiveObjectInspector integerOI; 53 | 54 | int total = 0; 55 | 56 | @Override 57 | public ObjectInspector init(Mode m, ObjectInspector[] parameters) 58 | throws HiveException { 59 | 60 | assert (parameters.length == 1); 61 | super.init(m, parameters); 62 | 63 | // init input object inspectors 64 | if (m == Mode.PARTIAL1 || m == Mode.COMPLETE) { 65 | inputOI = (PrimitiveObjectInspector) parameters[0]; 66 | } else { 67 | integerOI = (PrimitiveObjectInspector) parameters[0]; 68 | } 69 | 70 | // init output object inspectors 71 | // For partial function - array of integers 72 | outputOI = ObjectInspectorFactory.getReflectionObjectInspector(Integer.class, 73 | ObjectInspectorOptions.JAVA); 74 | return outputOI; 75 | 76 | } 77 | 78 | /** 79 | * class for storing the current sum of letters 80 | */ 81 | static class LetterSumAgg implements AggregationBuffer { 82 | int sum = 0; 83 | void add(int num){ 84 | sum += num; 85 | } 86 | } 87 | 88 | @Override 89 | public AggregationBuffer getNewAggregationBuffer() throws HiveException { 90 | LetterSumAgg result = new LetterSumAgg(); 91 | return result; 92 | } 93 | 94 | @Override 95 | public void reset(AggregationBuffer agg) throws HiveException { 96 | LetterSumAgg myagg = new LetterSumAgg(); 97 | } 98 | 99 | private boolean warned = false; 100 | 101 | @Override 102 | public void iterate(AggregationBuffer agg, Object[] parameters) 103 | throws HiveException { 104 | assert (parameters.length == 1); 105 | if (parameters[0] != null) { 106 | LetterSumAgg myagg = (LetterSumAgg) agg; 107 | Object p1 = ((PrimitiveObjectInspector) inputOI).getPrimitiveJavaObject(parameters[0]); 108 | myagg.add(String.valueOf(p1).length()); 109 | } 110 | } 111 | 112 | @Override 113 | public Object terminatePartial(AggregationBuffer agg) throws HiveException { 114 | LetterSumAgg myagg = (LetterSumAgg) agg; 115 | total += myagg.sum; 116 | return total; 117 | } 118 | 119 | @Override 120 | public void merge(AggregationBuffer agg, Object partial) 121 | throws HiveException { 122 | if (partial != null) { 123 | 124 | LetterSumAgg myagg1 = (LetterSumAgg) agg; 125 | 126 | Integer partialSum = (Integer) integerOI.getPrimitiveJavaObject(partial); 127 | 128 | LetterSumAgg myagg2 = new LetterSumAgg(); 129 | 130 | myagg2.add(partialSum); 131 | myagg1.add(myagg2.sum); 132 | } 133 | } 134 | 135 | @Override 136 | public Object terminate(AggregationBuffer agg) throws HiveException { 137 | LetterSumAgg myagg = (LetterSumAgg) agg; 138 | total = myagg.sum; 139 | return myagg.sum; 140 | } 141 | 142 | } 143 | } 144 | -------------------------------------------------------------------------------- /src/test/java/com/matthewrathbone/example/ComplexUDFExampleTest.java: -------------------------------------------------------------------------------- 1 | package com.matthewrathbone.example; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import junit.framework.Assert; 7 | 8 | import org.apache.hadoop.hive.ql.metadata.HiveException; 9 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredJavaObject; 10 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject; 11 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 12 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; 13 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaBooleanObjectInspector; 14 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; 15 | import org.junit.Test; 16 | 17 | public class ComplexUDFExampleTest { 18 | 19 | 20 | @Test 21 | public void testComplexUDFReturnsCorrectValues() throws HiveException { 22 | 23 | // set up the models we need 24 | ComplexUDFExample example = new ComplexUDFExample(); 25 | ObjectInspector stringOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; 26 | ObjectInspector listOI = ObjectInspectorFactory.getStandardListObjectInspector(stringOI); 27 | JavaBooleanObjectInspector resultInspector = (JavaBooleanObjectInspector) example.initialize(new ObjectInspector[]{listOI, stringOI}); 28 | 29 | // create the actual UDF arguments 30 | List list = new ArrayList(); 31 | list.add("a"); 32 | list.add("b"); 33 | list.add("c"); 34 | 35 | // test our results 36 | 37 | // the value exists 38 | Object result = example.evaluate(new DeferredObject[]{new DeferredJavaObject(list), new DeferredJavaObject("a")}); 39 | Assert.assertEquals(true, resultInspector.get(result)); 40 | 41 | // the value doesn't exist 42 | Object result2 = example.evaluate(new DeferredObject[]{new DeferredJavaObject(list), new DeferredJavaObject("d")}); 43 | Assert.assertEquals(false, resultInspector.get(result2)); 44 | 45 | // arguments are null 46 | Object result3 = example.evaluate(new DeferredObject[]{new DeferredJavaObject(null), new DeferredJavaObject(null)}); 47 | Assert.assertNull(result3); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/test/java/com/matthewrathbone/example/NameParserGenericUDTFTest.java: -------------------------------------------------------------------------------- 1 | package com.matthewrathbone.example; 2 | 3 | import java.util.ArrayList; 4 | 5 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 6 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; 7 | 8 | import org.junit.Assert; 9 | import org.junit.Test; 10 | 11 | public class NameParserGenericUDTFTest { 12 | @Test 13 | public void testUDTFNoSpaceAtAll() { 14 | // set up the models we need 15 | NameParserGenericUDTF example = new NameParserGenericUDTF(); 16 | ObjectInspector[] inputOI = {PrimitiveObjectInspectorFactory.javaStringObjectInspector}; 17 | 18 | // create the actual UDF arguments 19 | String name = "Smith"; 20 | 21 | // the value exists 22 | try{ 23 | example.initialize(inputOI); 24 | }catch(Exception ex){ 25 | ; 26 | } 27 | 28 | ArrayList results = example.processInputRecord(name); 29 | Assert.assertEquals(0, results.size()); 30 | } 31 | 32 | @Test 33 | public void testUDTFOneSpace() { 34 | // set up the models we need 35 | NameParserGenericUDTF example = new NameParserGenericUDTF(); 36 | ObjectInspector[] inputOI = {PrimitiveObjectInspectorFactory.javaStringObjectInspector}; 37 | 38 | // create the actual UDF arguments 39 | String name = "John Smith"; 40 | 41 | // the value exists 42 | try{ 43 | example.initialize(inputOI); 44 | }catch(Exception ex){ 45 | ; 46 | } 47 | 48 | ArrayList results = example.processInputRecord(name); 49 | Assert.assertEquals(1, results.size()); 50 | Assert.assertEquals("John", results.get(0)[0]); 51 | Assert.assertEquals("Smith", results.get(0)[1]); 52 | } 53 | 54 | @Test 55 | public void testUDTFSpaceAndConstruction() { 56 | // set up the models we need 57 | NameParserGenericUDTF example = new NameParserGenericUDTF(); 58 | ObjectInspector[] inputOI = {PrimitiveObjectInspectorFactory.javaStringObjectInspector}; 59 | 60 | // create the actual UDF arguments 61 | String name = "John and Ann White"; 62 | 63 | // the value exists 64 | try{ 65 | example.initialize(inputOI); 66 | }catch(Exception ex){ 67 | ; 68 | } 69 | 70 | ArrayList results = example.processInputRecord(name); 71 | Assert.assertEquals(2, results.size()); 72 | Assert.assertEquals("John", results.get(0)[0]); 73 | Assert.assertEquals("White", results.get(0)[1]); 74 | Assert.assertEquals("Ann", results.get(1)[0]); 75 | Assert.assertEquals("White", results.get(1)[1]); 76 | } 77 | 78 | @Test 79 | public void testUDTFTooManySpaces() { 80 | // set up the models we need 81 | NameParserGenericUDTF example = new NameParserGenericUDTF(); 82 | ObjectInspector[] inputOI = {PrimitiveObjectInspectorFactory.javaStringObjectInspector}; 83 | 84 | // create the actual UDF arguments 85 | String name = "Blah Blah Blah Blah"; 86 | 87 | // the value exists 88 | try{ 89 | example.initialize(inputOI); 90 | }catch(Exception ex){ 91 | ; 92 | } 93 | 94 | ArrayList results = example.processInputRecord(name); 95 | Assert.assertEquals(0, results.size()); 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/test/java/com/matthewrathbone/example/SimpleUDFExampleTest.java: -------------------------------------------------------------------------------- 1 | package com.matthewrathbone.example; 2 | 3 | import junit.framework.Assert; 4 | 5 | import org.apache.hadoop.io.Text; 6 | import org.junit.Test; 7 | 8 | public class SimpleUDFExampleTest { 9 | 10 | @Test 11 | public void testUDF() { 12 | SimpleUDFExample example = new SimpleUDFExample(); 13 | Assert.assertEquals("Hello world", example.evaluate(new Text("world")).toString()); 14 | } 15 | 16 | @Test 17 | public void testUDFNullCheck() { 18 | SimpleUDFExample example = new SimpleUDFExample(); 19 | Assert.assertNull(example.evaluate(null)); 20 | } 21 | } --------------------------------------------------------------------------------