├── .gitignore
├── hive-init.hql
├── ivysettings.xml
├── src
└── com
│ └── dataiku
│ └── hive
│ ├── udf
│ ├── maps
│ │ ├── UDFMapMaxKey.java
│ │ ├── UDFCountToMap.java
│ │ ├── UDFMapValueFilterLowerThan.java
│ │ ├── UDFMapValueFilterTopN.java
│ │ ├── UDAFMapGroupSum.java
│ │ └── UDAFCountDistinctToMap.java
│ ├── arrays
│ │ ├── UDFArrayCountDistinct.java
│ │ ├── UDFArrayGet.java
│ │ ├── UDFArrayIntSum.java
│ │ ├── UDFArrayCountEquals.java
│ │ ├── UDFArrayJoin.java
│ │ ├── UDFArraySubSequences.java
│ │ └── UDAFCollectToArray.java
│ ├── window
│ │ ├── Rank.java
│ │ ├── UDAFLastOfGroupAccordingTo.java
│ │ ├── UDAFFirstOfGroupAccordingTo.java
│ │ └── UDAFFirstOrLastOfGroupAccordingTo.java
│ ├── strings
│ │ └── UDFStringSubSequences.java
│ └── maths
│ │ ├── PrefixSumMovingAverage.java
│ │ └── UDFExponentialSmoothingMovingAverage.java
│ └── storage
│ ├── XMLHiveStorageHandler.java
│ ├── XMLSerde.java
│ └── XMLHiveInputFormat.java
├── ivy.xml
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | .classpath
2 | .project
3 | lib
4 | dist
5 | build
6 |
--------------------------------------------------------------------------------
/hive-init.hql:
--------------------------------------------------------------------------------
-- Registers the Dataiku Hive UDF jar and exposes a few of its functions
-- under short names. Run with: hive -i hive-init.hql
ADD JAR dist/dataiku-hive-udf.jar;

-- map<string,int> helpers: distinct-count aggregation, array->count map, top-N filter
CREATE TEMPORARY FUNCTION count_distinct_map as 'com.dataiku.hive.udf.maps.UDAFCountDistinctToMap';
CREATE TEMPORARY FUNCTION array_count_to_map as 'com.dataiku.hive.udf.maps.UDFCountToMap';
CREATE TEMPORARY FUNCTION map_filter_top as 'com.dataiku.hive.udf.maps.UDFMapValueFilterTopN';
-- array helper: aggregate a group's values into a single array
CREATE TEMPORARY FUNCTION collect_all as 'com.dataiku.hive.udf.arrays.UDAFCollectToArray';
--------------------------------------------------------------------------------
/ivysettings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/src/com/dataiku/hive/udf/maps/UDFMapMaxKey.java:
--------------------------------------------------------------------------------
1 | package com.dataiku.hive.udf.maps;
2 |
3 | import org.apache.hadoop.hive.ql.exec.UDF;
4 |
5 | import java.util.*;
6 |
7 | /**
8 | * Retrieve the key with the maximal value for a map
9 | */
10 | public class UDFMapMaxKey extends UDF {
11 | public String evaluate(Map map) {
12 | String maxKey = null;
13 | Integer maxValue = null;
14 | for(Map.Entry entry : map.entrySet()) {
15 | if (maxValue == null || entry.getValue() > maxValue) {
16 | maxKey = entry.getKey();
17 | maxValue = entry.getValue();
18 | }
19 | }
20 | return maxKey;
21 | }
22 | }
23 |
24 |
--------------------------------------------------------------------------------
/src/com/dataiku/hive/udf/maps/UDFCountToMap.java:
--------------------------------------------------------------------------------
1 | package com.dataiku.hive.udf.maps;
2 |
3 | import org.apache.hadoop.hive.ql.exec.UDF;
4 |
5 | import java.util.*;
6 |
7 | public class UDFCountToMap extends UDF {
8 |
9 |
10 | public Map evaluate(List a) {
11 | HashMap map= new HashMap();
12 | if (a == null) {
13 | return null;
14 | }
15 | for(String s : a) {
16 | if (s == null) {
17 | continue;
18 | }
19 | if (map.containsKey(s)) {
20 | map.put(s, map.get(s) + 1);
21 | } else {
22 | map.put(s, 1);
23 | }
24 | }
25 | return map;
26 | }
27 |
28 | }
29 |
--------------------------------------------------------------------------------
/src/com/dataiku/hive/udf/maps/UDFMapValueFilterLowerThan.java:
--------------------------------------------------------------------------------
1 | package com.dataiku.hive.udf.maps;
2 |
3 | import org.apache.hadoop.hive.ql.exec.UDF;
4 |
5 | import java.util.HashSet;
6 | import java.util.Map;
7 | import java.util.Set;
8 |
9 | /**
10 | */
11 | public class UDFMapValueFilterLowerThan extends UDF {
12 |
13 | Set toRemove = new HashSet();
14 |
15 | public Map evaluate(Map map, Integer minValue) {
16 |
17 | toRemove.clear();
18 | for(String s : map.keySet()) {
19 | if (map.get(s) < minValue) {
20 |
21 | toRemove.add(s);
22 | }
23 | }
24 |
25 | for(String s : toRemove) {
26 | map.remove(s);
27 | }
28 | return map;
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/src/com/dataiku/hive/udf/arrays/UDFArrayCountDistinct.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2013 Dataiku
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the
5 | * "License"); you may not use this file except in compliance
6 | * with the License. You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package com.dataiku.hive.udf.arrays;
17 | import java.util.HashSet;
18 | import java.util.List;
19 | import java.util.Set;
20 |
21 | import org.apache.hadoop.hive.ql.exec.UDF;
22 |
23 | public class UDFArrayCountDistinct extends UDF {
24 | Set stringSet = new HashSet();
25 |
26 | public int evaluate(List a) {
27 | if (a == null) {
28 | return 0;
29 | }
30 | stringSet.clear();
31 | stringSet.addAll(a);
32 | return stringSet.size();
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/src/com/dataiku/hive/udf/maps/UDFMapValueFilterTopN.java:
--------------------------------------------------------------------------------
1 | package com.dataiku.hive.udf.maps;
2 |
3 | import org.apache.hadoop.hive.ql.exec.UDF;
4 |
5 | import java.util.*;
6 |
7 | /**
8 | * Filter topN Elements from a map
9 | */
10 | public class UDFMapValueFilterTopN extends UDF {
11 | public Map evaluate(Map map, Integer n) {
12 |
13 | if (map.size() < n) {
14 | return map;
15 | }
16 | List list = new ArrayList(map.values());
17 | Collections.sort(list);
18 | int limit = list.get(list.size() - n);
19 | int count = 0;
20 | HashMap nm = new HashMap();
21 |
22 | for(Map.Entry entry : map.entrySet()) {
23 | if (entry.getValue() > limit) {
24 | nm.put(entry.getKey(), entry.getValue());
25 | }
26 | }
27 | for(Map.Entry entry : map.entrySet()) {
28 | if (nm.size() == n) {
29 | break;
30 | }
31 | if (entry.getValue() == limit) {
32 | nm.put(entry.getKey(), entry.getValue());
33 | }
34 | }
35 | return nm;
36 | }
37 | }
38 |
39 |
--------------------------------------------------------------------------------
/src/com/dataiku/hive/udf/arrays/UDFArrayGet.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2013 Dataiku
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the
5 | * "License"); you may not use this file except in compliance
6 | * with the License. You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package com.dataiku.hive.udf.arrays;
17 | import java.util.List;
18 |
19 | import org.apache.hadoop.hive.ql.exec.Description;
20 | import org.apache.hadoop.hive.ql.exec.UDF;
21 | import org.apache.hadoop.io.Text;
22 |
23 | @Description(name="array_get", value="_FUNC_(array, int) - returns the nth object in the array")
24 | public class UDFArrayGet extends UDF {
25 | private Text out = new Text();
26 |
27 | public Text evaluate(List a, int offset) {
28 | if (a == null) return null;
29 | out.set(a.get(offset));
30 | return out;
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/src/com/dataiku/hive/udf/window/Rank.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2013 Dataiku
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the
5 | * "License"); you may not use this file except in compliance
6 | * with the License. You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package com.dataiku.hive.udf.window;
17 | import org.apache.hadoop.hive.ql.exec.Description;
18 | import org.apache.hadoop.hive.ql.exec.UDF;
19 |
20 | @Description(name="rank", value="_FUNC_(string) - Returns the number of times the column had the same value in the previous records")
21 | public final class Rank extends UDF {
22 | private int counter;
23 | private String currentKey;
24 |
25 | public int evaluate(final String key) {
26 | if (!key.equalsIgnoreCase(currentKey)) {
27 | counter = 0;
28 | currentKey = key;
29 | }
30 | return counter++;
31 | }
32 | }
--------------------------------------------------------------------------------
/src/com/dataiku/hive/udf/arrays/UDFArrayIntSum.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2013 Dataiku
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the
5 | * "License"); you may not use this file except in compliance
6 | * with the License. You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package com.dataiku.hive.udf.arrays;
17 | import java.util.List;
18 |
19 | import org.apache.hadoop.hive.ql.exec.Description;
20 | import org.apache.hadoop.hive.ql.exec.UDF;
21 |
22 | @Description(name="array_int_sum", value="_FUNC_(array) - returns the sum of elements in the array")
23 | public class UDFArrayIntSum extends UDF {
24 | public int evaluate(List a) {
25 | if (a == null) return 0;
26 | int sum = 0;
27 | for (int i = 0; i < a.size(); i++) {
28 | Integer elt = a.get(i);
29 | if (elt != null) sum += elt;
30 | }
31 | return sum;
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/src/com/dataiku/hive/udf/window/UDAFLastOfGroupAccordingTo.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2013 Dataiku
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the
5 | * "License"); you may not use this file except in compliance
6 | * with the License. You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package com.dataiku.hive.udf.window;
17 | import org.apache.hadoop.hive.ql.exec.Description;
18 | import org.apache.hadoop.hive.ql.parse.SemanticException;
19 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
20 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFParameterInfo;
21 |
// UDAF "last_of_group": returns outputColumn's value for the row with the
// greatest sortColumn in the group. All real work (buffer handling,
// checkParameters, BaseEvaluator) lives in the shared base class
// UDAFFirstOrLastOfGroupAccordingTo; this subclass only picks the direction.
@Description(name="last_of_group", value="_FUNC_(outputColumn, sortColumn)")
public final class UDAFLastOfGroupAccordingTo extends UDAFFirstOrLastOfGroupAccordingTo {
    @Override
    public GenericUDAFEvaluator getEvaluator(GenericUDAFParameterInfo info) throws SemanticException {
        checkParameters(info); // validates the (outputColumn, sortColumn) argument pair
        return new FirstEvaluator();
    }

    // NOTE(review): despite its name (copy-pasted from the First variant)
    // this evaluator implements "last": cmp > 0 keeps the row whose sort key
    // is larger. Renaming the public nested class could break external
    // references, so it is only flagged here.
    public static class FirstEvaluator extends BaseEvaluator {
        @Override
        protected boolean needUpdate(int cmp) {
            return cmp > 0;
        }
    }

}
38 |
--------------------------------------------------------------------------------
/src/com/dataiku/hive/udf/window/UDAFFirstOfGroupAccordingTo.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2013 Dataiku
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the
5 | * "License"); you may not use this file except in compliance
6 | * with the License. You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package com.dataiku.hive.udf.window;
17 | import org.apache.hadoop.hive.ql.exec.Description;
18 | import org.apache.hadoop.hive.ql.parse.SemanticException;
19 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
20 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFParameterInfo;
21 |
// UDAF "first_of_group": returns outputColumn's value for the row with the
// smallest sortColumn in the group. All real work (buffer handling,
// checkParameters, BaseEvaluator) lives in the shared base class
// UDAFFirstOrLastOfGroupAccordingTo; this subclass only picks the direction.
@Description(name="first_of_group", value="_FUNC_(outputColumn, sortColumn)")
public final class UDAFFirstOfGroupAccordingTo extends UDAFFirstOrLastOfGroupAccordingTo {
    @Override
    public GenericUDAFEvaluator getEvaluator(GenericUDAFParameterInfo info) throws SemanticException {
        checkParameters(info); // validates the (outputColumn, sortColumn) argument pair
        return new FirstEvaluator();
    }

    // cmp < 0 keeps the row whose sort key is smaller, i.e. the "first" row.
    public static class FirstEvaluator extends BaseEvaluator {
        @Override
        protected boolean needUpdate(int cmp) {
            return cmp < 0;
        }
    }

}
38 |
--------------------------------------------------------------------------------
/src/com/dataiku/hive/storage/XMLHiveStorageHandler.java:
--------------------------------------------------------------------------------
1 | package com.dataiku.hive.storage;
2 |
3 | import org.apache.hadoop.conf.Configuration;
4 | import org.apache.hadoop.hive.metastore.HiveMetaHook;
5 | import org.apache.hadoop.hive.ql.metadata.DefaultStorageHandler;
6 | import org.apache.hadoop.hive.ql.metadata.HiveException;
7 | import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
8 | import org.apache.hadoop.hive.ql.plan.TableDesc;
9 | import org.apache.hadoop.hive.ql.security.authorization.HiveAuthorizationProvider;
10 | import org.apache.hadoop.hive.serde2.SerDe;
11 | import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
12 | import org.apache.hadoop.mapred.InputFormat;
13 | import org.apache.hadoop.mapred.OutputFormat;
14 |
15 | import java.util.Map;
16 | import java.util.Properties;
17 |
18 | /**
19 | */
20 | public class XMLHiveStorageHandler extends DefaultStorageHandler {
21 | @Override
22 | public Class extends InputFormat> getInputFormatClass() {
23 | return XMLHiveInputFormat.class;
24 | }
25 |
26 | @Override
27 | public Class extends SerDe> getSerDeClass() {
28 | return XMLSerde.class;
29 | }
30 |
31 | @Override
32 | public Class extends OutputFormat> getOutputFormatClass() {
33 | return org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat.class;
34 | }
35 |
36 |
37 |
38 | @Override
39 | public void configureInputJobProperties(TableDesc tableDesc, Map jobProperties) {
40 | super.configureInputJobProperties(tableDesc, jobProperties); //To change body of overridden methods use File | Settings | File Templates.
41 | Properties props = tableDesc.getProperties();
42 | jobProperties.put(XMLHiveInputFormat.TAG_KEY, props.getProperty(XMLHiveInputFormat.TAG_KEY));
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
/src/com/dataiku/hive/udf/arrays/UDFArrayCountEquals.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2013 Dataiku
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the
5 | * "License"); you may not use this file except in compliance
6 | * with the License. You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package com.dataiku.hive.udf.arrays;
17 | import java.util.List;
18 |
19 | import org.apache.hadoop.hive.ql.exec.Description;
20 | import org.apache.hadoop.hive.ql.exec.UDF;
21 |
22 | @Description(name="array_count_equals", value="_FUNC_(array, type needle) - Counts the number of times the needle appears in the array")
23 | public class UDFArrayCountEquals extends UDF {
24 | public int evaluate(List a, String needle) {
25 | if (a == null) return 0;
26 | if (needle == null) return a.size();
27 |
28 | int ret = 0;
29 | for (int i = 0; i < a.size(); i++) {
30 | if (needle.equals(a.get(i))) ret++;
31 | }
32 | return ret;
33 | }
34 |
35 | public int evaluate(List a, int needle) {
36 | if (a == null) return 0;
37 |
38 | int ret = 0;
39 | for (int i = 0; i < a.size(); i++) {
40 | if (needle == a.get(i)) ret++;
41 | }
42 | return ret;
43 | }
44 |
45 | public double evaluate(List a, double needle) {
46 | if (a == null) return 0;
47 |
48 | int ret = 0;
49 | for (int i = 0; i < a.size(); i++) {
50 | if (needle == a.get(i)) ret++;
51 | }
52 | return ret;
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
/ivy.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
--------------------------------------------------------------------------------
/src/com/dataiku/hive/storage/XMLSerde.java:
--------------------------------------------------------------------------------
1 | package com.dataiku.hive.storage;
2 |
3 | import org.apache.commons.logging.Log;
4 | import org.apache.commons.logging.LogFactory;
5 | import org.apache.hadoop.conf.Configuration;
6 | import org.apache.hadoop.hive.serde2.SerDe;
7 | import org.apache.hadoop.hive.serde2.SerDeException;
8 | import org.apache.hadoop.hive.serde2.SerDeStats;
9 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
10 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
11 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
12 | import org.apache.hadoop.io.Text;
13 | import org.apache.hadoop.io.Writable;
14 |
15 | import java.util.ArrayList;
16 | import java.util.List;
17 | import java.util.Properties;
18 |
19 | /**
20 | *
21 | * Serde for
22 | */
23 | public class XMLSerde implements SerDe {
24 |
25 |
26 | ObjectInspector oi;
27 | List row;
28 | public static final Log LOG = LogFactory.getLog(XMLSerde.class.getName());
29 |
30 | @Override
31 | public void initialize(Configuration entries, Properties properties) throws SerDeException {
32 | List columnNames = new ArrayList();
33 | columnNames.add("text");
34 |
35 | ArrayList columnOIs = new ArrayList(columnNames.size());
36 |
37 | columnOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
38 |
39 | oi = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, columnOIs);
40 | row = new ArrayList();
41 | row.add(null);
42 | }
43 |
44 | @Override
45 | public Object deserialize(Writable blob) throws SerDeException {
46 | Text rowText = (Text) blob;
47 | row.set(0, rowText.toString());
48 | return row;
49 | }
50 |
51 | @Override
52 | public ObjectInspector getObjectInspector() throws SerDeException {
53 | return oi;
54 | }
55 |
56 | @Override
57 | public SerDeStats getSerDeStats() {
58 | return null;
59 | }
60 |
61 | @Override
62 | public Class extends Writable> getSerializedClass() {
63 | return Text.class;
64 | }
65 |
66 | @Override
67 | public Writable serialize(Object o, ObjectInspector objectInspector) throws SerDeException {
68 | throw new SerDeException("Not implemented");
69 |
70 | }
71 | }
72 |
--------------------------------------------------------------------------------
/src/com/dataiku/hive/udf/arrays/UDFArrayJoin.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2013 Dataiku
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the
5 | * "License"); you may not use this file except in compliance
6 | * with the License. You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package com.dataiku.hive.udf.arrays;
17 |
18 | import java.util.ArrayList;
19 | import java.util.List;
20 |
21 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
22 | import org.apache.hadoop.hive.ql.metadata.HiveException;
23 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
24 | import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
25 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
26 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
27 |
28 | /**
29 | * Joins an array of arrays into a single array containing all elements.
30 | * No deduplication is performed
31 | */
32 | public class UDFArrayJoin extends GenericUDF {
33 | ListObjectInspector arrayInspector;
34 | ListObjectInspector elementsInspector;
35 |
36 | List