├── .gitignore ├── hive-init.hql ├── ivysettings.xml ├── src └── com │ └── dataiku │ └── hive │ ├── udf │ ├── maps │ │ ├── UDFMapMaxKey.java │ │ ├── UDFCountToMap.java │ │ ├── UDFMapValueFilterLowerThan.java │ │ ├── UDFMapValueFilterTopN.java │ │ ├── UDAFMapGroupSum.java │ │ └── UDAFCountDistinctToMap.java │ ├── arrays │ │ ├── UDFArrayCountDistinct.java │ │ ├── UDFArrayGet.java │ │ ├── UDFArrayIntSum.java │ │ ├── UDFArrayCountEquals.java │ │ ├── UDFArrayJoin.java │ │ ├── UDFArraySubSequences.java │ │ └── UDAFCollectToArray.java │ ├── window │ │ ├── Rank.java │ │ ├── UDAFLastOfGroupAccordingTo.java │ │ ├── UDAFFirstOfGroupAccordingTo.java │ │ └── UDAFFirstOrLastOfGroupAccordingTo.java │ ├── strings │ │ └── UDFStringSubSequences.java │ └── maths │ │ ├── PrefixSumMovingAverage.java │ │ └── UDFExponentialSmoothingMovingAverage.java │ └── storage │ ├── XMLHiveStorageHandler.java │ ├── XMLSerde.java │ └── XMLHiveInputFormat.java ├── ivy.xml └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .classpath 2 | .project 3 | lib 4 | dist 5 | build 6 | -------------------------------------------------------------------------------- /hive-init.hql: -------------------------------------------------------------------------------- 1 | ADD JAR dist/dataiku-hive-udf.jar; 2 | 3 | CREATE TEMPORARY FUNCTION count_distinct_map as 'com.dataiku.hive.udf.maps.UDAFCountDistinctToMap'; 4 | CREATE TEMPORARY FUNCTION array_count_to_map as 'com.dataiku.hive.udf.maps.UDFCountToMap'; 5 | CREATE TEMPORARY FUNCTION map_filter_top as 'com.dataiku.hive.udf.maps.UDFMapValueFilterTopN'; 6 | CREATE TEMPORARY FUNCTION collect_all as 'com.dataiku.hive.udf.arrays.UDAFCollectToArray'; -------------------------------------------------------------------------------- /ivysettings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /src/com/dataiku/hive/udf/maps/UDFMapMaxKey.java: -------------------------------------------------------------------------------- 1 | package com.dataiku.hive.udf.maps; 2 | 3 | import org.apache.hadoop.hive.ql.exec.UDF; 4 | 5 | import java.util.*; 6 | 7 | /** 8 | * Retrieve the key with the maximal value for a map 9 | */ 10 | public class UDFMapMaxKey extends UDF { 11 | public String evaluate(Map map) { 12 | String maxKey = null; 13 | Integer maxValue = null; 14 | for(Map.Entry entry : map.entrySet()) { 15 | if (maxValue == null || entry.getValue() > maxValue) { 16 | maxKey = entry.getKey(); 17 | maxValue = entry.getValue(); 18 | } 19 | } 20 | return maxKey; 21 | } 22 | } 23 | 24 | -------------------------------------------------------------------------------- /src/com/dataiku/hive/udf/maps/UDFCountToMap.java: -------------------------------------------------------------------------------- 1 | package com.dataiku.hive.udf.maps; 2 | 3 | import org.apache.hadoop.hive.ql.exec.UDF; 4 | 5 | import java.util.*; 6 | 7 | public class UDFCountToMap extends UDF { 8 | 9 | 10 | public Map evaluate(List a) { 11 | HashMap map= new HashMap(); 12 | if (a == null) { 13 | return null; 14 | } 15 | for(String s : a) { 16 | if (s == null) { 17 | continue; 18 | } 19 | if (map.containsKey(s)) { 20 | map.put(s, map.get(s) + 1); 21 | } else { 22 | map.put(s, 1); 23 | } 24 | } 25 | return map; 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- 
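Note that hive-init.hql above registers only four of the functions shipped in this repository. A possible extension covering the remaining UDF classes could look like the sketch below (the class names come from the source tree, but the function names here are illustrative choices, not names mandated by the project; in particular `rank` may clash with the built-in rank of newer Hive versions and can be renamed freely):

CREATE TEMPORARY FUNCTION map_max_key as 'com.dataiku.hive.udf.maps.UDFMapMaxKey';
CREATE TEMPORARY FUNCTION map_filter_lower_than as 'com.dataiku.hive.udf.maps.UDFMapValueFilterLowerThan';
CREATE TEMPORARY FUNCTION map_group_sum as 'com.dataiku.hive.udf.maps.UDAFMapGroupSum';
CREATE TEMPORARY FUNCTION array_count_distinct as 'com.dataiku.hive.udf.arrays.UDFArrayCountDistinct';
CREATE TEMPORARY FUNCTION array_get as 'com.dataiku.hive.udf.arrays.UDFArrayGet';
CREATE TEMPORARY FUNCTION array_int_sum as 'com.dataiku.hive.udf.arrays.UDFArrayIntSum';
CREATE TEMPORARY FUNCTION array_count_equals as 'com.dataiku.hive.udf.arrays.UDFArrayCountEquals';
CREATE TEMPORARY FUNCTION array_join as 'com.dataiku.hive.udf.arrays.UDFArrayJoin';
CREATE TEMPORARY FUNCTION array_subsequences as 'com.dataiku.hive.udf.arrays.UDFArraySubSequences';
CREATE TEMPORARY FUNCTION string_subsequence as 'com.dataiku.hive.udf.strings.UDFStringSubSequences';
CREATE TEMPORARY FUNCTION rank as 'com.dataiku.hive.udf.window.Rank';
CREATE TEMPORARY FUNCTION first_of_group as 'com.dataiku.hive.udf.window.UDAFFirstOfGroupAccordingTo';
CREATE TEMPORARY FUNCTION last_of_group as 'com.dataiku.hive.udf.window.UDAFLastOfGroupAccordingTo';
CREATE TEMPORARY FUNCTION moving_avg as 'com.dataiku.hive.udf.maths.UDFExponentialSmoothingMovingAverage';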
/src/com/dataiku/hive/udf/maps/UDFMapValueFilterLowerThan.java: -------------------------------------------------------------------------------- 1 | package com.dataiku.hive.udf.maps; 2 | 3 | import org.apache.hadoop.hive.ql.exec.UDF; 4 | 5 | import java.util.HashSet; 6 | import java.util.Map; 7 | import java.util.Set; 8 | 9 | /** 10 | */ 11 | public class UDFMapValueFilterLowerThan extends UDF { 12 | 13 | Set toRemove = new HashSet(); 14 | 15 | public Map evaluate(Map map, Integer minValue) { 16 | 17 | toRemove.clear(); 18 | for(String s : map.keySet()) { 19 | if (map.get(s) < minValue) { 20 | 21 | toRemove.add(s); 22 | } 23 | } 24 | 25 | for(String s : toRemove) { 26 | map.remove(s); 27 | } 28 | return map; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/com/dataiku/hive/udf/arrays/UDFArrayCountDistinct.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2013 Dataiku 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the 5 | * "License"); you may not use this file except in compliance 6 | * with the License. You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.dataiku.hive.udf.arrays; 17 | import java.util.HashSet; 18 | import java.util.List; 19 | import java.util.Set; 20 | 21 | import org.apache.hadoop.hive.ql.exec.UDF; 22 | 23 | public class UDFArrayCountDistinct extends UDF { 24 | Set stringSet = new HashSet(); 25 | 26 | public int evaluate(List a) { 27 | if (a == null) { 28 | return 0; 29 | } 30 | stringSet.clear(); 31 | stringSet.addAll(a); 32 | return stringSet.size(); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/com/dataiku/hive/udf/maps/UDFMapValueFilterTopN.java: -------------------------------------------------------------------------------- 1 | package com.dataiku.hive.udf.maps; 2 | 3 | import org.apache.hadoop.hive.ql.exec.UDF; 4 | 5 | import java.util.*; 6 | 7 | /** 8 | * Filter topN Elements from a map 9 | */ 10 | public class UDFMapValueFilterTopN extends UDF { 11 | public Map evaluate(Map map, Integer n) { 12 | 13 | if (map.size() < n) { 14 | return map; 15 | } 16 | List list = new ArrayList(map.values()); 17 | Collections.sort(list); 18 | int limit = list.get(list.size() - n); 19 | int count = 0; 20 | HashMap nm = new HashMap(); 21 | 22 | for(Map.Entry entry : map.entrySet()) { 23 | if (entry.getValue() > limit) { 24 | nm.put(entry.getKey(), entry.getValue()); 25 | } 26 | } 27 | for(Map.Entry entry : map.entrySet()) { 28 | if (nm.size() == n) { 29 | break; 30 | } 31 | if (entry.getValue() == limit) { 32 | nm.put(entry.getKey(), entry.getValue()); 33 | } 34 | } 35 | return nm; 36 | } 37 | } 38 | 39 | -------------------------------------------------------------------------------- /src/com/dataiku/hive/udf/arrays/UDFArrayGet.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2013 Dataiku 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the 5 | * "License"); you may not use this file except in compliance 6 | * with 
the License. You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.dataiku.hive.udf.arrays; 17 | import java.util.List; 18 | 19 | import org.apache.hadoop.hive.ql.exec.Description; 20 | import org.apache.hadoop.hive.ql.exec.UDF; 21 | import org.apache.hadoop.io.Text; 22 | 23 | @Description(name="array_get", value="_FUNC_(array, int) - returns the nth object in the array") 24 | public class UDFArrayGet extends UDF { 25 | private Text out = new Text(); 26 | 27 | public Text evaluate(List a, int offset) { 28 | if (a == null) return null; 29 | out.set(a.get(offset)); 30 | return out; 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/com/dataiku/hive/udf/window/Rank.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2013 Dataiku 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the 5 | * "License"); you may not use this file except in compliance 6 | * with the License. You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.dataiku.hive.udf.window; 17 | import org.apache.hadoop.hive.ql.exec.Description; 18 | import org.apache.hadoop.hive.ql.exec.UDF; 19 | 20 | @Description(name="rank", value="_FUNC_(string) - Returns the number of times the column had the same value in the previous records") 21 | public final class Rank extends UDF { 22 | private int counter; 23 | private String currentKey; 24 | 25 | public int evaluate(final String key) { 26 | if (!key.equalsIgnoreCase(currentKey)) { 27 | counter = 0; 28 | currentKey = key; 29 | } 30 | return counter++; 31 | } 32 | } -------------------------------------------------------------------------------- /src/com/dataiku/hive/udf/arrays/UDFArrayIntSum.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2013 Dataiku 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the 5 | * "License"); you may not use this file except in compliance 6 | * with the License. You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package com.dataiku.hive.udf.arrays; 17 | import java.util.List; 18 | 19 | import org.apache.hadoop.hive.ql.exec.Description; 20 | import org.apache.hadoop.hive.ql.exec.UDF; 21 | 22 | @Description(name="array_int_sum", value="_FUNC_(array) - returns the sum of elements in the array") 23 | public class UDFArrayIntSum extends UDF { 24 | public int evaluate(List a) { 25 | if (a == null) return 0; 26 | int sum = 0; 27 | for (int i = 0; i < a.size(); i++) { 28 | Integer elt = a.get(i); 29 | if (elt != null) sum += elt; 30 | } 31 | return sum; 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/com/dataiku/hive/udf/window/UDAFLastOfGroupAccordingTo.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2013 Dataiku 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the 5 | * "License"); you may not use this file except in compliance 6 | * with the License. You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.dataiku.hive.udf.window; 17 | import org.apache.hadoop.hive.ql.exec.Description; 18 | import org.apache.hadoop.hive.ql.parse.SemanticException; 19 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; 20 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFParameterInfo; 21 | 22 | @Description(name="last_of_group", value="_FUNC_(outputColumn, sortColumn)") 23 | public final class UDAFLastOfGroupAccordingTo extends UDAFFirstOrLastOfGroupAccordingTo { 24 | @Override 25 | public GenericUDAFEvaluator getEvaluator(GenericUDAFParameterInfo info) throws SemanticException { 26 | checkParameters(info); 27 | return new FirstEvaluator(); 28 | } 29 | 30 | public static class FirstEvaluator extends BaseEvaluator { 31 | @Override 32 | protected boolean needUpdate(int cmp) { 33 | return cmp > 0; 34 | } 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /src/com/dataiku/hive/udf/window/UDAFFirstOfGroupAccordingTo.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2013 Dataiku 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the 5 | * "License"); you may not use this file except in compliance 6 | * with the License. You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package com.dataiku.hive.udf.window; 17 | import org.apache.hadoop.hive.ql.exec.Description; 18 | import org.apache.hadoop.hive.ql.parse.SemanticException; 19 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; 20 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFParameterInfo; 21 | 22 | @Description(name="first_of_group", value="_FUNC_(outputColumn, sortColumn)") 23 | public final class UDAFFirstOfGroupAccordingTo extends UDAFFirstOrLastOfGroupAccordingTo { 24 | @Override 25 | public GenericUDAFEvaluator getEvaluator(GenericUDAFParameterInfo info) throws SemanticException { 26 | checkParameters(info); 27 | return new FirstEvaluator(); 28 | } 29 | 30 | public static class FirstEvaluator extends BaseEvaluator { 31 | @Override 32 | protected boolean needUpdate(int cmp) { 33 | return cmp < 0; 34 | } 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /src/com/dataiku/hive/storage/XMLHiveStorageHandler.java: -------------------------------------------------------------------------------- 1 | package com.dataiku.hive.storage; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.hive.metastore.HiveMetaHook; 5 | import org.apache.hadoop.hive.ql.metadata.DefaultStorageHandler; 6 | import org.apache.hadoop.hive.ql.metadata.HiveException; 7 | import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler; 8 | import org.apache.hadoop.hive.ql.plan.TableDesc; 9 | import org.apache.hadoop.hive.ql.security.authorization.HiveAuthorizationProvider; 10 | import org.apache.hadoop.hive.serde2.SerDe; 11 | import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; 12 | import org.apache.hadoop.mapred.InputFormat; 13 | import org.apache.hadoop.mapred.OutputFormat; 14 | 15 | import java.util.Map; 16 | import java.util.Properties; 17 | 18 | /** 19 | */ 20 | public class XMLHiveStorageHandler extends DefaultStorageHandler { 21 | @Override 22 | public Class getInputFormatClass() { 23 | return XMLHiveInputFormat.class; 24 | } 25 | 26 | @Override 27 | public Class getSerDeClass() { 28 | return XMLSerde.class; 29 | } 30 | 31 | @Override 32 | public Class getOutputFormatClass() { 33 | return org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat.class; 34 | } 35 | 36 | 37 | 38 | @Override 39 | public void configureInputJobProperties(TableDesc tableDesc, Map jobProperties) { 40 | super.configureInputJobProperties(tableDesc, jobProperties); //To change body of overridden methods use File | Settings | File Templates. 41 | Properties props = tableDesc.getProperties(); 42 | jobProperties.put(XMLHiveInputFormat.TAG_KEY, props.getProperty(XMLHiveInputFormat.TAG_KEY)); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/com/dataiku/hive/udf/arrays/UDFArrayCountEquals.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2013 Dataiku 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the 5 | * "License"); you may not use this file except in compliance 6 | * with the License. You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.dataiku.hive.udf.arrays; 17 | import java.util.List; 18 | 19 | import org.apache.hadoop.hive.ql.exec.Description; 20 | import org.apache.hadoop.hive.ql.exec.UDF; 21 | 22 | @Description(name="array_count_equals", value="_FUNC_(array, type needle) - Counts the number of times the needle appears in the array") 23 | public class UDFArrayCountEquals extends UDF { 24 | public int evaluate(List a, String needle) { 25 | if (a == null) return 0; 26 | if (needle == null) return a.size(); 27 | 28 | int ret = 0; 29 | for (int i = 0; i < a.size(); i++) { 30 | if (needle.equals(a.get(i))) ret++; 31 | } 32 | return ret; 33 | } 34 | 35 | public int evaluate(List a, int needle) { 36 | if (a == null) return 0; 37 | 38 | int ret = 0; 39 | for (int i = 0; i < a.size(); i++) { 40 | if (needle == a.get(i)) ret++; 41 | } 42 | return ret; 43 | } 44 | 45 | public double evaluate(List a, double needle) { 46 | if (a == null) return 0; 47 | 48 | int ret = 0; 49 | for (int i = 0; i < a.size(); i++) { 50 | if (needle == a.get(i)) ret++; 51 | } 52 | return ret; 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /ivy.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /src/com/dataiku/hive/storage/XMLSerde.java: -------------------------------------------------------------------------------- 1 | package com.dataiku.hive.storage; 2 | 3 | import org.apache.commons.logging.Log; 4 | import org.apache.commons.logging.LogFactory; 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.hive.serde2.SerDe; 7 | import org.apache.hadoop.hive.serde2.SerDeException; 8 | import org.apache.hadoop.hive.serde2.SerDeStats; 9 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 10 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; 11 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; 12 | import org.apache.hadoop.io.Text; 13 | import org.apache.hadoop.io.Writable; 14 | 15 | import java.util.ArrayList; 16 | import java.util.List; 17 | import java.util.Properties; 18 | 19 | /** 20 | * 21 | * Serde for 22 | */ 23 | public class XMLSerde implements SerDe { 24 | 25 | 26 | ObjectInspector oi; 27 | List row; 28 | public static final Log LOG = LogFactory.getLog(XMLSerde.class.getName()); 29 | 30 | @Override 31 | public void initialize(Configuration entries, Properties properties) throws SerDeException { 32 | List columnNames = new ArrayList(); 33 | columnNames.add("text"); 34 | 35 | ArrayList columnOIs = new ArrayList(columnNames.size()); 36 | 37 | columnOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector); 38 | 39 | oi = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, columnOIs); 40 | row = new ArrayList(); 41 | row.add(null); 42 | } 43 | 44 | @Override 45 | public Object deserialize(Writable blob) throws SerDeException { 46 | Text rowText = (Text) blob; 47 | row.set(0, rowText.toString()); 48 | return row; 49 | } 50 | 51 | @Override 52 | public ObjectInspector getObjectInspector() throws SerDeException { 53 | return oi; 54 | } 55 | 56 | @Override 57 
| public SerDeStats getSerDeStats() { 58 | return null; 59 | } 60 | 61 | @Override 62 | public Class getSerializedClass() { 63 | return Text.class; 64 | } 65 | 66 | @Override 67 | public Writable serialize(Object o, ObjectInspector objectInspector) throws SerDeException { 68 | throw new SerDeException("Not implemented"); 69 | 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/com/dataiku/hive/udf/arrays/UDFArrayJoin.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2013 Dataiku 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the 5 | * "License"); you may not use this file except in compliance 6 | * with the License. You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.dataiku.hive.udf.arrays; 17 | 18 | import java.util.ArrayList; 19 | import java.util.List; 20 | 21 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException; 22 | import org.apache.hadoop.hive.ql.metadata.HiveException; 23 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; 24 | import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; 25 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 26 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; 27 | 28 | /** 29 | * Joins an array of arrays into a single array containing all elements. 
30 | * No deduplication is performed 31 | */ 32 | public class UDFArrayJoin extends GenericUDF { 33 | ListObjectInspector arrayInspector; 34 | ListObjectInspector elementsInspector; 35 | 36 | List ret = new ArrayList(); 37 | 38 | @Override 39 | public Object evaluate(DeferredObject[] args) throws HiveException { 40 | if (args.length != 1) return null; 41 | Object oin = args[0].get(); 42 | if (oin == null) return null; 43 | 44 | int nbArrays = arrayInspector.getListLength(oin); 45 | 46 | ret.clear(); 47 | for (int i = 0; i < nbArrays; i++) { 48 | Object oarr = arrayInspector.getListElement(oin, i); 49 | int nbElts = elementsInspector.getListLength(oarr); 50 | for (int j = 0; j < nbElts; j++) { 51 | Object oelt = elementsInspector.getListElement(oarr, j); 52 | ret.add(oelt); 53 | } 54 | } 55 | return ret; 56 | } 57 | 58 | @Override 59 | public String getDisplayString(String[] args) { 60 | return "array_join(" + args[0] + ")"; 61 | } 62 | 63 | @Override 64 | public ObjectInspector initialize(ObjectInspector[] args) 65 | throws UDFArgumentException { 66 | if (args.length != 1) { 67 | throw new UDFArgumentException("array_join expects 1 argument"); 68 | } 69 | if (args[0].getCategory() != ObjectInspector.Category.LIST) { 70 | throw new UDFArgumentException("array_join expects an array as argument, got " + args[0].getTypeName()); 71 | } 72 | arrayInspector = (ListObjectInspector) args[0]; 73 | 74 | ObjectInspector tmpElementsInspector = arrayInspector.getListElementObjectInspector(); 75 | if (tmpElementsInspector.getCategory() != ObjectInspector.Category.LIST) { 76 | throw new UDFArgumentException("array_join expects array, got array<" + tmpElementsInspector.getTypeName() + ">"); 77 | } 78 | elementsInspector = (ListObjectInspector)tmpElementsInspector; 79 | 80 | ObjectInspector elementElementInspector = elementsInspector.getListElementObjectInspector(); 81 | 82 | return ObjectInspectorFactory.getStandardListObjectInspector(elementElementInspector); 83 | } 84 | } -------------------------------------------------------------------------------- /src/com/dataiku/hive/udf/strings/UDFStringSubSequences.java: -------------------------------------------------------------------------------- 1 | package com.dataiku.hive.udf.strings; 2 | 3 | import com.dataiku.hive.udf.arrays.UDFArraySubSequences; 4 | import com.google.common.base.Joiner; 5 | import org.apache.hadoop.hive.ql.exec.Description; 6 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException; 7 | import org.apache.hadoop.hive.ql.metadata.HiveException; 8 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF; 9 | import org.apache.hadoop.hive.serde2.objectinspector.*; 10 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; 11 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; 12 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableConstantIntObjectInspector; 13 | import org.apache.hadoop.io.IntWritable; 14 | import org.apache.hadoop.io.Text; 15 | 16 | import java.util.ArrayList; 17 | import java.util.Collections; 18 | import java.util.List; 19 | import java.util.ListIterator; 20 | 21 | 22 | @Description(name="string_subsequence", value="array _FUNC_(string, sep, N, MAX) - split a string according to sep, generate all subsequences of size <= N up to MAX elements, and generate all corresponing strings joined by sep") 23 | public class UDFStringSubSequences extends GenericUDTF { 24 | 25 | private ObjectInspectorConverters.Converter[] 
converters; 26 | 27 | StringObjectInspector stringOI; 28 | WritableConstantIntObjectInspector intOI; 29 | 30 | @Override 31 | public StructObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { 32 | 33 | if (arguments.length < 4) { 34 | throw new IllegalArgumentException("Missing parameters, 4 needed"); 35 | } 36 | 37 | converters = new ObjectInspectorConverters.Converter[arguments.length]; 38 | 39 | converters[0] = ObjectInspectorConverters.getConverter(arguments[0],PrimitiveObjectInspectorFactory.writableStringObjectInspector); 40 | converters[1] = ObjectInspectorConverters.getConverter(arguments[1],PrimitiveObjectInspectorFactory.writableStringObjectInspector) ; 41 | converters[2] = ObjectInspectorConverters.getConverter(arguments[2],PrimitiveObjectInspectorFactory.writableIntObjectInspector); 42 | converters[3] = ObjectInspectorConverters.getConverter(arguments[3],PrimitiveObjectInspectorFactory.writableIntObjectInspector); 43 | 44 | ArrayList fieldNames = new ArrayList(); 45 | ArrayList fieldOIs = new ArrayList(); 46 | fieldNames.add("col1"); 47 | fieldOIs.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector); 48 | return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs); 49 | } 50 | 51 | private final Object[] forwardObj = new Object[1]; 52 | 53 | 54 | protected ArrayList buffer = new ArrayList(); 55 | protected Joiner joiner = Joiner.on(" "); 56 | protected int count = 0; 57 | protected int max; 58 | 59 | 60 | protected void forwardBuffer() throws HiveException { 61 | Text t = new Text(joiner.join(buffer)); 62 | forwardObj[0] = t; 63 | forward(forwardObj); 64 | count ++; 65 | } 66 | 67 | protected void enumerateSubSequenceStartAt(int start, int n, String[] elts) throws HiveException { 68 | if (count >= max) { 69 | return; 70 | } 71 | buffer.add(elts[start]); 72 | forwardBuffer(); 73 | if (buffer.size() < n) { 74 | for(int j = start+1; j < n && j < elts.length; j++) { 75 | enumerateSubSequenceStartAt(j, n, elts); 76 | } 77 | } 78 | buffer.remove(buffer.size()-1); 79 | } 80 | 81 | 82 | protected void enumerateSubSequence(int n, String[] elts) throws HiveException { 83 | for(int i = 0; i < elts.length; i++) { 84 | enumerateSubSequenceStartAt(i, n, elts); 85 | } 86 | } 87 | 88 | @Override 89 | public void process(Object[] objects) throws HiveException { 90 | if (objects.length < 4) { 91 | return; 92 | } 93 | count = 0; 94 | buffer.clear(); 95 | Text text = (Text) converters[0].convert(objects[0]); 96 | Text sep = (Text) converters[1].convert(objects[1]); 97 | IntWritable n = (IntWritable) converters[2].convert(objects[2]); 98 | IntWritable max = (IntWritable) converters[3].convert(objects[3]); 99 | this.max = max.get(); 100 | enumerateSubSequence(n.get(), text.toString().split(sep.toString())); 101 | } 102 | 103 | @Override 104 | public void close() throws HiveException { 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/com/dataiku/hive/udf/arrays/UDFArraySubSequences.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2013 Dataiku 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the 5 | * "License"); you may not use this file except in compliance 6 | * with the License. 
You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.dataiku.hive.udf.arrays; 17 | 18 | import org.apache.hadoop.hive.ql.exec.Description; 19 | import org.apache.hadoop.hive.ql.exec.UDF; 20 | import org.apache.hadoop.io.Text; 21 | 22 | import java.util.ArrayList; 23 | import java.util.Collections; 24 | import java.util.List; 25 | import java.util.ListIterator; 26 | 27 | @Description(name="array_get", value="_FUNC_(array, N, MAX) - returns an array> with all the subsequences of length <= N from the array") 28 | public class UDFArraySubSequences extends UDF { 29 | private Text out = new Text(); 30 | 31 | 32 | 33 | public static List> mysubn(int n, int max, List li) { 34 | List> ret = new ArrayList>(); 35 | int size = li.size(); 36 | if (n == 0) { 37 | ret.add(new ArrayList()); 38 | return ret; 39 | } 40 | if (li.isEmpty()) { 41 | return ret; 42 | } 43 | if (size < n) { 44 | return ret; 45 | } 46 | if (size == n) { 47 | ret.add(li); 48 | return ret; 49 | } 50 | 51 | /* I use counters to actually keep track of where I am in the list, 52 | * but iterators to actually get the elements. This is because we can't 53 | * assume what type of list we are accessing. list.get(n) is O(1) for 54 | * ArrayList, but O(n) for LinkedList, so we'd like to minimize these 55 | * as much as possible. 56 | * The reason we need to keep the counters and ending array, 57 | * instead of going until the iterators run out of elements, 58 | * is we only want them to the point where they could still make a 59 | * subsequence. 60 | */ 61 | ArrayList> iters = new ArrayList>(n); 62 | ArrayList currElems = new ArrayList(n); 63 | int[] counters = new int[n]; 64 | int[] endings = new int[n]; 65 | // Set up our initial values 66 | for(int i = 0; i < n; i++) { 67 | iters.add(li.listIterator(i)); 68 | currElems.add(iters.get(i).next()); 69 | counters[i] = i; 70 | endings[i] = size - n + i; 71 | } 72 | // Go until the we don't have enough elements left to make a subsequence 73 | while(counters[0] <= endings[0] && ret.size() < max) { 74 | List sub = new ArrayList(); 75 | for(int i = 0; i < n; i++) { 76 | sub.add(currElems.get(i)); 77 | } 78 | ret.add(sub); 79 | int c = n - 1; 80 | // Here we figure out how many of the counters (indexes) need updating. 81 | while(c > 0) { 82 | if(counters[c] < endings[c]) 83 | break; 84 | else 85 | c--; 86 | } 87 | // Update the left-most counter (index) 88 | counters[c]++; 89 | if(iters.get(c).hasNext()) 90 | currElems.set(c, iters.get(c).next()); 91 | c++; 92 | // Starting from the next left-most counter (if there is one), 93 | // set counter to be 1 more than previous counter, and reset 94 | // the iterator to start there as well. 95 | while(c < n) { 96 | // Reset the counter, and reset the iterator 97 | // to the new starting position. 98 | counters[c] = counters[c-1] + 1; 99 | iters.set(c, li.listIterator(counters[c])); 100 | // We make sure the iterator has another element. 101 | // The only time this will return false is when we are completely done 102 | // and the outer while is over. 
103 | if(iters.get(c).hasNext()) 104 | currElems.set(c, iters.get(c).next()); 105 | c++; 106 | } 107 | } 108 | return ret; 109 | } 110 | 111 | 112 | public List> evaluate(List a, int n, int max) { 113 | return mysubn(n, max, a); 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /src/com/dataiku/hive/udf/arrays/UDAFCollectToArray.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2013 Dataiku 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the 5 | * "License"); you may not use this file except in compliance 6 | * with the License. You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.dataiku.hive.udf.arrays; 17 | 18 | import java.util.ArrayList; 19 | import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; 20 | import org.apache.hadoop.hive.ql.metadata.HiveException; 21 | import org.apache.hadoop.hive.ql.parse.SemanticException; 22 | import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver; 23 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; 24 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 25 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; 26 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; 27 | import org.apache.hadoop.hive.serde2.objectinspector.StandardListObjectInspector; 28 | import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; 29 | 30 | public class UDAFCollectToArray extends AbstractGenericUDAFResolver { 31 | @Override 32 | public GenericUDAFEvaluator getEvaluator(TypeInfo[] tis) throws SemanticException { 33 | if (tis.length != 1) { 34 | throw new UDFArgumentTypeException(tis.length - 1, "Exactly one argument is expected."); 35 | } 36 | return new CollectToArrayEvaluator(); 37 | } 38 | 39 | public static class CollectToArrayEvaluator extends GenericUDAFEvaluator { 40 | private ObjectInspector originalDataOI; 41 | private StandardListObjectInspector listOI; 42 | 43 | @Override 44 | public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException { 45 | super.init(m, parameters); 46 | 47 | /* Setup input OI */ 48 | if (m == Mode.PARTIAL1 || m == Mode.COMPLETE) { 49 | /* Input is original data */ 50 | originalDataOI = parameters[0]; 51 | } else if (m == Mode.PARTIAL2 || m == Mode.FINAL){ 52 | /* Input is list of original data */ 53 | listOI = (StandardListObjectInspector) parameters[0]; 54 | originalDataOI = listOI.getListElementObjectInspector(); 55 | } 56 | 57 | /* Output OI : always a list of original data */ 58 | return ObjectInspectorFactory 59 | .getStandardListObjectInspector(ObjectInspectorUtils.getStandardObjectInspector(originalDataOI)); 60 | } 61 | 62 | static class ArrayAggregationBuffer implements AggregationBuffer { 63 | ArrayList container = new ArrayList(); 64 | } 65 | 66 | @Override 67 | public void reset(AggregationBuffer ab) throws HiveException { 68 | ((ArrayAggregationBuffer) ab).container.clear(); 69 | } 70 | 71 | @Override 72 | public AggregationBuffer getNewAggregationBuffer() throws 
HiveException { 73 | return new ArrayAggregationBuffer(); 74 | } 75 | 76 | @Override 77 | public void iterate(AggregationBuffer ab, Object[] parameters) throws HiveException { 78 | assert (parameters.length == 1); 79 | Object p = parameters[0]; 80 | if (p != null) { 81 | ArrayAggregationBuffer agg = (ArrayAggregationBuffer) ab; 82 | agg.container.add(ObjectInspectorUtils.copyToStandardObject(p, this.originalDataOI)); 83 | } 84 | } 85 | 86 | @Override 87 | public Object terminatePartial(AggregationBuffer ab) throws HiveException { 88 | ArrayAggregationBuffer agg = (ArrayAggregationBuffer) ab; 89 | ArrayList ret = new ArrayList(); 90 | ret.addAll(agg.container); 91 | return ret; 92 | } 93 | 94 | @Override 95 | public void merge(AggregationBuffer ab, Object p) throws HiveException { 96 | ArrayAggregationBuffer agg = (ArrayAggregationBuffer) ab; 97 | @SuppressWarnings("unchecked") 98 | ArrayList partialResult = (ArrayList)listOI.getList(p); 99 | for(Object o : partialResult) { 100 | agg.container.add(ObjectInspectorUtils.copyToStandardObject(o, originalDataOI)); 101 | } 102 | } 103 | 104 | @Override 105 | public Object terminate(AggregationBuffer ab) throws HiveException { 106 | ArrayAggregationBuffer agg = (ArrayAggregationBuffer) ab; 107 | ArrayList ret = new ArrayList(); 108 | ret.addAll(agg.container); 109 | return ret; 110 | } 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /src/com/dataiku/hive/udf/maps/UDAFMapGroupSum.java: -------------------------------------------------------------------------------- 1 | package com.dataiku.hive.udf.maps; 2 | 3 | import com.google.common.collect.Maps; 4 | import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; 5 | import org.apache.hadoop.hive.ql.metadata.HiveException; 6 | import org.apache.hadoop.hive.ql.parse.SemanticException; 7 | import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver; 8 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; 9 | import org.apache.hadoop.hive.serde2.lazy.LazyFactory; 10 | import org.apache.hadoop.hive.serde2.lazy.LazyMap; 11 | import org.apache.hadoop.hive.serde2.lazy.objectinspector.LazyMapObjectInspector; 12 | import org.apache.hadoop.hive.serde2.objectinspector.*; 13 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector; 14 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; 15 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; 16 | import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; 17 | 18 | import java.rmi.MarshalledObject; 19 | import java.util.ArrayList; 20 | import java.util.HashMap; 21 | import java.util.Map; 22 | 23 | /** 24 | * Group a set of map and sum identical integer keys 25 | */ 26 | public class UDAFMapGroupSum extends AbstractGenericUDAFResolver { 27 | @Override 28 | public GenericUDAFEvaluator getEvaluator(TypeInfo[] tis) throws SemanticException { 29 | if (tis.length != 1) { 30 | throw new UDFArgumentTypeException(tis.length - 1, "Exactly one argument is expected."); 31 | } 32 | return new MapGroupSumEvaluator(); 33 | } 34 | 35 | public static class MapGroupSumEvaluator extends GenericUDAFEvaluator { 36 | private MapObjectInspector originalDataOI; 37 | private IntObjectInspector valueOI; 38 | private StringObjectInspector keyOI; 39 | 40 | 41 | @Override 42 | public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException { 43 | super.init(m, parameters); 44 | 45 
| originalDataOI = (MapObjectInspector) parameters[0]; 46 | keyOI = (StringObjectInspector) originalDataOI.getMapKeyObjectInspector(); 47 | valueOI = (IntObjectInspector) originalDataOI.getMapValueObjectInspector(); 48 | return ObjectInspectorFactory.getStandardMapObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector, 49 | PrimitiveObjectInspectorFactory.javaIntObjectInspector); 50 | 51 | 52 | 53 | // /* Setup input OI */ 54 | // if (m == Mode.PARTIAL1 || m == Mode.COMPLETE) { 55 | // /* Input is original data */ 56 | // originalDataOI = parameters[0]; 57 | // } else if (m == Mode.PARTIAL2 || m == Mode.FINAL){ 58 | // /* Input is list of original data */ 59 | // listOI = (StandardListObjectInspector) parameters[0]; 60 | // originalDataOI = listOI.getListElementObjectInspector(); 61 | // } 62 | // 63 | // /* Output OI : always a list of original data */ 64 | // return ObjectInspectorFactory 65 | // .getStandardListObjectInspector(ObjectInspectorUtils.getStandardObjectInspector(originalDataOI)); 66 | } 67 | 68 | static class MapBuffer implements AggregationBuffer { 69 | Map map = new HashMap(); 70 | } 71 | 72 | @Override 73 | public void reset(AggregationBuffer ab) throws HiveException { 74 | ((MapBuffer) ab).map.clear(); 75 | } 76 | 77 | @Override 78 | public AggregationBuffer getNewAggregationBuffer() throws HiveException { 79 | return new MapBuffer(); 80 | } 81 | 82 | protected void mapAppend(Map m, Map from) { 83 | if (from == null) { 84 | return; 85 | } 86 | for(Map.Entry entry : from.entrySet()) { 87 | Object okey = entry.getKey(); 88 | Object ovalue = entry.getValue(); 89 | if (okey == null || ovalue == null) continue; 90 | String key = keyOI.getPrimitiveJavaObject(entry.getKey()); 91 | Integer value = valueOI.get(entry.getValue()); 92 | if (m.containsKey(key)) { 93 | m.put(key, m.get(key) + value); 94 | } else { 95 | m.put(key, value); 96 | } 97 | } 98 | } 99 | 100 | @Override 101 | public void iterate(AggregationBuffer ab, Object[] parameters) throws HiveException { 102 | assert (parameters.length == 1); 103 | Object p = parameters[0]; 104 | if (p != null) { 105 | MapBuffer agg = (MapBuffer) ab; 106 | Map o = (Map) this.originalDataOI.getMap(p); 107 | mapAppend(agg.map, o); 108 | } 109 | } 110 | 111 | @Override 112 | public Object terminatePartial(AggregationBuffer ab) throws HiveException { 113 | MapBuffer agg = (MapBuffer) ab; 114 | return Maps.newHashMap(agg.map); 115 | } 116 | 117 | @Override 118 | public void merge(AggregationBuffer ab, Object p) throws HiveException { 119 | MapBuffer agg = (MapBuffer) ab; 120 | @SuppressWarnings("unchecked") 121 | Map obj = (Map) this.originalDataOI.getMap(p); 122 | mapAppend(agg.map, obj); 123 | } 124 | 125 | @Override 126 | public Object terminate(AggregationBuffer ab) throws HiveException { 127 | MapBuffer agg = (MapBuffer) ab; 128 | return Maps.newHashMap(agg.map); 129 | } 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dataiku Hive UDFs 2 | 3 | This is a collection of UDF and Storage Handlers for [Apache Hive](http://apache.hive.org). 
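To register the functions in a Hive session once the jar is built, you can source the bundled init script (a minimal sketch; adjust the jar path to your build output, and note that the function names are simply whatever you pass to CREATE TEMPORARY FUNCTION):

    hive -i hive-init.hql

or run the equivalent statements from hive-init.hql manually:

    ADD JAR dist/dataiku-hive-udf.jar;
    CREATE TEMPORARY FUNCTION collect_all as 'com.dataiku.hive.udf.arrays.UDAFCollectToArray';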
4 | 5 | ## Available UDFs 6 | 7 | ### Array operations 8 | 9 | #### array_count_distinct 10 | 11 | int array_count_distinct(array<string>) 12 | 13 | Counts the number of distinct values in the array 14 | 15 | #### array_count_equals 16 | 17 | int array_count_equals(array<int> haystack, int needle) 18 | int array_count_equals(array<double> haystack, double needle) 19 | int array_count_equals(array<string> haystack, string needle) 20 | 21 | Returns the number of times the needle is present in the haystack 22 | 23 | #### collect_to_array 24 | 25 | This is an aggregation function that gathers all input values and outputs them as an array. 26 | 27 | For example 28 | 29 | table page_views { 30 | int visitor_id; 31 | string page; 32 | } 33 | 34 | The query: 35 | 36 | select collect_to_array(page) from page_views group by visitor_id; 37 | 38 | produces: `array<string>`, the list of pages viewed for each visitor_id 39 | 40 | #### array_join 41 | 42 | array array_join(array<array>) 43 | 44 | Joins an array of arrays into a single array containing all elements. 45 | This is often used in combination with collect_to_array 46 | 47 | For example, if you have: 48 | 49 | table A { 50 | int product_id; 51 | int day; 52 | array<int> buying_customers; 53 | } 54 | 55 | collect_to_array(buying_customers) will therefore produce array<array<int>> 56 | 57 | To get the full list of customers for one product, you can use: 58 | 59 | SELECT array_join(collect_to_array(buying_customers)) FROM A GROUP BY product_id; 60 | 61 | ### Map operations 62 | 63 | #### count_to_map 64 | 65 | Converts an array to a map whose values count the occurrences of each element 66 | 67 | 68 | count_to_map(["yes", "no", "yes"]) => {"yes":2,"no":1} 69 | 70 | #### count_distinct_map 71 | 72 | For each group, generates a map keyed by a secondary column, counting the distinct values of a third column. 73 | 74 | 75 | select query, count_distinct_map(country, userid) as nusers_per_country FROM queries GROUP BY query; 76 | 77 | query country userid 78 | FOO FR X 79 | FOO FR X 80 | FOO FR Y 81 | FOO EN Z 82 | 83 | => FOO, {"FR":2, "EN":1} 84 | 85 | 86 | #### map_filter_lower_than 87 | 88 | Filters a map, keeping only the entries whose value is greater than or equal to the provided argument 89 | 90 | map_filter_lower_than({"yes":2, "no":1}, 2) => {"yes":2} 91 | 92 | 93 | #### map_filter_top_n 94 | 95 | Filters a map, keeping only the top N entries according to the value. In case of equality, a random 96 | selection of the elements is performed 97 | 98 | map_filter_top_n({"yes":2, "no":1, "maybe":2, "surely":5}, 3) => {"surely":5, "maybe":2, "yes":2} 99 | 100 | #### map_group_sum 101 | 102 | Aggregation function on maps that performs the union of the keys of the maps, and sums the values when a key 103 | exists in multiple maps 104 | 105 | 106 | CREATE TABLE docs { 107 | docid int; 108 | word_count map<string,int> 109 | } 110 | 111 | SELECT map_group_sum(word_count) FROM docs; -- Get the global word frequency 112 | 113 | ### Maths 114 | 115 | #### UDFExponentialSmoothingMovingAverage (moving_avg) 116 | 117 | moving_avg(period, value, window, divisor, position) 118 | Computes the moving average of a column at a particular position. 119 | 120 | Example: 121 | 122 | p v 123 | 4 40 124 | 5 60 125 | 6 0 126 | 7 10 127 | 8 20 128 | 9 50 129 | 10 100 130 | 11 10 131 | 132 | moving_avg(p, v, 4, 2, 11) returns: 133 | mean(10 * 1/(2^1) + 100 * 1/(2^2) + 50 * 1/(2^3) + 20 * 1/(2^4)) 134 | 135 | moving_avg(p, v, 2, 3, 11) returns: 136 | mean(10 * 1/(3^1) + 100 * 1/(3^2)) 137 | 138 | If a value of p is missing, it is taken as 0.
139 | moving_avg(p, v, 2, 3, 12) returns: 140 | mean(0 * 1/(3^1) + 10 * 1/(3^2)) 141 | 142 | 143 | ### Windowing functions 144 | 145 | #### Rank 146 | 147 | int rank(string in) 148 | 149 | While processing a stream of rows, rank will return the number of times it has previously seen the same value of `in`. 150 | 151 | For example, while processing a table: 152 | 153 | table a { 154 | string data; 155 | } 156 | 157 | with values: 158 | 159 | p1 160 | p1 161 | p2 162 | p2 163 | p2 164 | p3 165 | p4 166 | 167 | The query: 168 | 169 | select data, rank(data) from a; 170 | 171 | would return: 172 | 173 | p1 0 174 | p1 1 175 | p2 0 176 | p2 1 177 | p2 2 178 | p3 0 179 | p4 0 180 | 181 | Therefore, rank only makes sense on a sorted table. 182 | 183 | rank is very useful for sequence analysis. 184 | 185 | #### first_of_group, last_of_group 186 | 187 | These are aggregation functions. 188 | 189 | TYPE1 first_of_group(TYPE1 outColumn, TYPE2 sortColumn) 190 | TYPE1 last_of_group(TYPE1 outColumn, TYPE2 sortColumn) 191 | 192 | For each group, these functions will sort the rows of the group by `sortColumn`, and then 193 | output the value of `outColumn` for the first (resp. last) row, once sorted. 194 | 195 | These functions are very useful for processing tables with "updates". 196 | 197 | For example: 198 | 199 | table user { 200 | int id; 201 | int version; 202 | string email; 203 | string location; 204 | } 205 | 206 | To get the last recorded location for a given user, you can use: 207 | 208 | select last_of_group(location, version) FROM user GROUP BY id; 209 | 210 | You can use several first_of_group/last_of_group in the same query: 211 | 212 | select last_of_group(location, version), last_of_group(email, version) FROM user GROUP BY id; 213 | 214 | 215 | 216 | ## Storage Handlers 217 | 218 | ### XMLHiveStorageHandler 219 | 220 | 221 | XMLHiveStorageHandler creates a table backed by one or multiple XML files. 222 | 223 | In the example below, my_dir should contain XML files containing a MyTag tag. 224 | A table will be created with one row per tag, with the raw XML content of each tag inside. 225 | 226 | CREATE TABLE my_table (text string) 227 | STORED BY 'com.dataiku.hive.storage.XMLHiveStorageHandler' 228 | LOCATION '/my_dir' 229 | TBLPROPERTIES ( 230 | "xml.tag"="MyTag" 231 | ) 232 | 233 | Note that the storage handler does not perform any XML entity substitution (such as &gt; or unicode entities) 234 | 235 | 236 | ## Copyright and license 237 | 238 | Copyright 2013 Dataiku SAS. 239 | 240 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this work except in compliance with the License. You may obtain a copy of the License in the LICENSE file, or at: 241 | 242 | http://www.apache.org/licenses/LICENSE-2.0 243 | 244 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 245 | -------------------------------------------------------------------------------- /src/com/dataiku/hive/udf/maths/PrefixSumMovingAverage.java: -------------------------------------------------------------------------------- 1 | package com.dataiku.hive.udf.maths; 2 | 3 | /** 4 | * Author: Matthieu Scordia 5 | * Date: 04/03/14 6 | * Time: 15:12 7 | * 8 | * This class is called by UDFExponentialSmoothingMovingAverage to do the moving average. 
9 | * 10 | */ 11 | 12 | 13 | import java.util.ArrayList; 14 | import java.util.List; 15 | import java.util.Collections; 16 | import org.apache.hadoop.hive.serde2.io.DoubleWritable; 17 | import org.yecht.Data; 18 | 19 | public class PrefixSumMovingAverage { 20 | static class PrefixSumEntry implements Comparable 21 | { 22 | int period; 23 | double value; 24 | double prefixSum; 25 | double subsequenceTotal; 26 | double movingAverage; 27 | public int compareTo(Object other) 28 | { 29 | PrefixSumEntry o = (PrefixSumEntry)other; 30 | if (period < o.period) 31 | return -1; 32 | if (period > o.period) 33 | return 1; 34 | return 0; 35 | } 36 | } 37 | 38 | //class variables 39 | private int windowSize; 40 | private double diviseur; 41 | private int position; 42 | 43 | private ArrayList entries; 44 | 45 | public PrefixSumMovingAverage() 46 | { 47 | windowSize = 0; 48 | diviseur = 0.0; 49 | position = 0; 50 | } 51 | 52 | public void reset() 53 | { 54 | windowSize = 0; 55 | diviseur = 0.0; 56 | position = 0; 57 | entries = null; 58 | } 59 | 60 | public boolean isReady() 61 | { 62 | return (windowSize > 0); 63 | } 64 | 65 | /** 66 | * Sets the window for prefix sum computations 67 | * 68 | * @param window_size Size of the window for moving average 69 | * d is the divisor of the exponential smoothing. 70 | */ 71 | public void allocate(int window_size, double d, int p) { 72 | 73 | windowSize = window_size; 74 | diviseur = d; 75 | entries = new ArrayList(); 76 | position = p; 77 | } 78 | 79 | public double getDiviseur() { 80 | return diviseur; 81 | } 82 | 83 | public double getPosition() { 84 | return position; 85 | } 86 | 87 | 88 | @SuppressWarnings("unchecked") 89 | public void merge(List other) 90 | { 91 | 92 | if (other == null) 93 | return; 94 | 95 | // if this is an empty buffer, just copy in other 96 | // but deserialize the list 97 | if (windowSize == 0) 98 | { 99 | 100 | windowSize = (int)other.get(0).get(); 101 | diviseur = (double)other.get(1).get(); 102 | position = (int)other.get(2).get(); 103 | 104 | 105 | 106 | 107 | entries = new ArrayList(); 108 | // we're serialized as period, value, period, value 109 | for (int i = 3; i < other.size(); i+=2) 110 | { 111 | PrefixSumEntry e = new PrefixSumEntry(); 112 | e.period = (int)other.get(i).get(); 113 | e.value = other.get(i+1).get(); 114 | entries.add(e); 115 | } 116 | } 117 | 118 | // if we already have a buffer, we need to add these entries 119 | else 120 | { 121 | // we're serialized as period, value, period, value 122 | for (int i = 3; i < other.size(); i+=2) 123 | { 124 | PrefixSumEntry e = new PrefixSumEntry(); 125 | e.period = (int)other.get(i).get(); 126 | e.value = other.get(i+1).get(); 127 | entries.add(e); 128 | } 129 | } 130 | 131 | // sort and recompute 132 | Collections.sort(entries); 133 | 134 | // Compute the list of ponderation coeff for the moving average. 135 | 136 | 137 | // Compute the list of ponderation coeff for the moving average. 
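        // Weighting scheme (matching the README's moving_avg examples): the value observed at
        // period (position - j) receives weight listCoeff[j] = 1 / d^(j+1), where d is the divisor
        // ("diviseur"). The final moving average is sum(value * weight) / sum(weight) over the
        // window, and a period with no entry simply contributes 0 to the weighted sum.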
138 | 139 | double[] listCoeff = new double[windowSize]; 140 | double subdenom = 0.0; 141 | double coeffPond = 0.0; 142 | 143 | for (int i=1; i<=windowSize; i++){ 144 | coeffPond = 1/Math.pow(this.getDiviseur(),i); 145 | listCoeff[i-1]=coeffPond; 146 | subdenom += coeffPond; 147 | } 148 | 149 | // now do the subsequence totals and moving averages 150 | 151 | int lastEntry = entries.size()-1; 152 | 153 | double prefixSum = 0; 154 | 155 | int variationPos = 0; 156 | //System.out.println("beginning for"); 157 | for(int j = 0; j < windowSize; j++) 158 | { 159 | // my last entries: 160 | if(lastEntry-variationPos>=0){ 161 | PrefixSumEntry thisEntry = entries.get(lastEntry-variationPos); 162 | 163 | while (thisEntry.period>(getPosition()-j)){ 164 | variationPos+=1; 165 | thisEntry = entries.get(lastEntry-variationPos); 166 | //System.out.println(String.valueOf(thisEntry.period)); 167 | }; 168 | 169 | //System.out.println(String.valueOf(thisEntry.period) + " " + String.valueOf(thisEntry.value) +" "+ String.valueOf(variationPos)); 170 | //System.out.println(" test:"); 171 | //System.out.println(" "+ String.valueOf(thisEntry.period) + " == " + String.valueOf(getPosition()) +" - "+ String.valueOf(j)); 172 | if (thisEntry.period==(getPosition()-j)){ 173 | prefixSum += thisEntry.value * listCoeff[j]; 174 | variationPos+=1; 175 | } 176 | else { 177 | prefixSum += 0 * listCoeff[j]; 178 | 179 | } 180 | } 181 | else { 182 | prefixSum += 0 * listCoeff[j]; 183 | } 184 | 185 | } 186 | 187 | double movingAverage; 188 | PrefixSumEntry thisEntry = entries.get(lastEntry); 189 | movingAverage = prefixSum/subdenom; //Moving average is computed here! 190 | //System.out.println("result:"+String.valueOf(movingAverage)); 191 | thisEntry.movingAverage = movingAverage; 192 | 193 | 194 | 195 | } 196 | 197 | public int tableSize() 198 | { 199 | return entries.size(); 200 | } 201 | 202 | public PrefixSumEntry getEntry(int index) 203 | { 204 | return entries.get(index); 205 | } 206 | 207 | private boolean needsSorting(ArrayList entries) 208 | { 209 | PrefixSumEntry previous = null; 210 | for (PrefixSumEntry current:entries) 211 | { 212 | if (previous != null && current.compareTo(previous) < 0) 213 | return true; 214 | } 215 | return false; 216 | } 217 | 218 | @SuppressWarnings("unchecked") 219 | public void add(int period, double v) 220 | { 221 | //Add a new entry to the list and update table 222 | PrefixSumEntry e = new PrefixSumEntry(); 223 | e.period = period; 224 | e.value = v; 225 | entries.add(e); 226 | // do we need to ensure this is sorted? 
227 | //if (needsSorting(entries)) 228 | Collections.sort(entries); 229 | // update the table 230 | // prefixSums first 231 | 232 | } 233 | 234 | public ArrayList serialize() 235 | { 236 | ArrayList result = new ArrayList(); 237 | 238 | result.add(new DoubleWritable(windowSize)); 239 | result.add(new DoubleWritable(diviseur)); 240 | result.add(new DoubleWritable(position)); 241 | 242 | if (entries != null) 243 | { 244 | for (PrefixSumEntry i : entries) 245 | { 246 | result.add(new DoubleWritable(i.period)); 247 | result.add(new DoubleWritable(i.value)); 248 | } 249 | } 250 | return result; 251 | } 252 | } -------------------------------------------------------------------------------- /src/com/dataiku/hive/udf/window/UDAFFirstOrLastOfGroupAccordingTo.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2013 Dataiku 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the 5 | * "License"); you may not use this file except in compliance 6 | * with the License. You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.dataiku.hive.udf.window; 17 | import java.util.ArrayList; 18 | 19 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException; 20 | import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; 21 | import org.apache.hadoop.hive.ql.metadata.HiveException; 22 | import org.apache.hadoop.hive.ql.parse.SemanticException; 23 | import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver; 24 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; 25 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFParameterInfo; 26 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 27 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; 28 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; 29 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption; 30 | import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; 31 | import org.apache.hadoop.hive.serde2.objectinspector.StructField; 32 | import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; 33 | 34 | public abstract class UDAFFirstOrLastOfGroupAccordingTo extends AbstractGenericUDAFResolver { 35 | protected void checkParameters(GenericUDAFParameterInfo info) throws SemanticException { 36 | if (info.getParameterObjectInspectors().length != 2) { 37 | throw new UDFArgumentException("Two arguments are required"); 38 | } 39 | 40 | if (info.getParameterObjectInspectors()[0].getCategory() != ObjectInspector.Category.PRIMITIVE) { 41 | throw new UDFArgumentTypeException(0, 42 | "Only primitive type arguments are accepted but " 43 | + info.getParameterObjectInspectors()[0].getTypeName() + " was passed in"); 44 | } 45 | if (info.getParameterObjectInspectors()[1].getCategory() != ObjectInspector.Category.PRIMITIVE) { 46 | throw new UDFArgumentTypeException(1, 47 | "Only primitive type arguments are accepted but " 48 | + info.getParameterObjectInspectors()[1].getTypeName() + " was passed in"); 49 | } 50 | } 
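    /*
     * Common evaluator for first_of_group / last_of_group: the aggregation buffer keeps a single
     * (outColumn, sortColumn) pair; iterate() and merge() compare the incoming sort value with the
     * kept one and replace the pair whenever the subclass's needUpdate(cmp) returns true (cmp < 0
     * for first_of_group, cmp > 0 for last_of_group). Partial results are exchanged as a struct
     * with fields "out" and "sort".
     */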
51 |
52 |     public static abstract class BaseEvaluator extends GenericUDAFEvaluator {
53 |         static class UDAFFOGATBuffer implements AggregationBuffer {
54 |             Object outColKeptValue;
55 |             Object sortColKeptValue;
56 |         }
57 |
58 |         // For PARTIAL1 and COMPLETE: ObjectInspectors for original data
59 |         private ObjectInspector outColOI; // Also used as output inspector for FINAL
60 |         private ObjectInspector sortColOI;
61 |
62 |         // For PARTIAL2 and FINAL: ObjectInspectors for partial aggregations (structs of objects)
63 |         private StructObjectInspector soi;
64 |         StructField outField;
65 |         StructField sortField;
66 |
67 |
68 |         @Override
69 |         public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException {
70 |             super.init(m, parameters);
71 |
72 |             /* Init input inspectors. */
73 |             if (m == Mode.PARTIAL1 || m == Mode.COMPLETE) {
74 |                 /* For partial 1 and complete : original data */
75 |                 if (parameters.length != 2) throw new UDFArgumentException("Expected 2 arguments, got " + parameters.length);
76 |                 outColOI = (PrimitiveObjectInspector) (parameters[0]);
77 |                 sortColOI = (PrimitiveObjectInspector) (parameters[1]);
78 |             } else {
79 |                 /* For partial2 and final : struct containing partial results */
80 |                 if (parameters.length != 1) throw new UDFArgumentException("Expected a single partial aggregation struct, got " + parameters.length);
81 |                 soi = (StructObjectInspector) parameters[0];
82 |                 outField = soi.getStructFieldRef("out");
83 |                 outColOI = outField.getFieldObjectInspector();
84 |                 sortField = soi.getStructFieldRef("sort");
85 |                 sortColOI = sortField.getFieldObjectInspector();
86 |             }
87 |
88 |             /* Init output inspectors */
89 |             if (m == Mode.FINAL || m == Mode.COMPLETE) {
90 |                 return outColOI;
91 |             } else {
92 |                 /* The output of a partial aggregation is a struct containing the best sort value and the best out value */
93 |                 ArrayList<ObjectInspector> foi = new ArrayList<ObjectInspector>();
94 |                 foi.add(ObjectInspectorUtils.getStandardObjectInspector(outColOI, ObjectInspectorCopyOption.WRITABLE));
95 |                 foi.add(ObjectInspectorUtils.getStandardObjectInspector(sortColOI, ObjectInspectorCopyOption.WRITABLE));
96 |
97 |                 ArrayList<String> fname = new ArrayList<String>();
98 |                 fname.add("out");
99 |                 fname.add("sort");
100 |
101 |                 return ObjectInspectorFactory.getStandardStructObjectInspector(fname, foi);
102 |             }
103 |         }
104 |
105 |         @Override
106 |         public AggregationBuffer getNewAggregationBuffer() throws HiveException {
107 |             AggregationBuffer o = new UDAFFOGATBuffer();
108 |             //System.out.println(Thread.currentThread().getName() + ": NEW BUFFER " +o);
109 |             return o;
110 |         }
111 |
112 |         @Override
113 |         public void reset(AggregationBuffer buf) throws HiveException {
114 |             UDAFFOGATBuffer bbuf = (UDAFFOGATBuffer)buf;
115 |             //System.out.println(Thread.currentThread().getName() + ": RESET");
116 |             bbuf.outColKeptValue = null;
117 |             bbuf.sortColKeptValue = null;
118 |         }
119 |
120 |         @Override
121 |         public void iterate(AggregationBuffer buf, Object[] args) throws HiveException {
122 |             UDAFFOGATBuffer bbuf = (UDAFFOGATBuffer)buf;
123 |             Object outColVal = args[0];
124 |             Object sortColVal = args[1];
125 |             //System.out.println(Thread.currentThread().getName() + ": ITERATE ON " + outColVal + " (" + sortColVal + ") to " +bbuf);
126 |             //System.out.println("   BBUF HAS " + bbuf.outColKeptValue+ " " +bbuf.sortColKeptValue);
127 |             updateBuf(bbuf, outColVal, sortColVal);
128 |         }
129 |
130 |         @Override
131 |         public void merge(AggregationBuffer buf, Object toMerge) throws HiveException {
132 |             // System.out.println(Thread.currentThread().getName() + ": MERGE " + buf + " --" + toMerge + "-- soi is " +soi);
133 |
UDAFFOGATBuffer bbuf = (UDAFFOGATBuffer)buf; 134 | Object out = soi.getStructFieldData(toMerge, outField); 135 | Object sort = soi.getStructFieldData(toMerge, sortField); 136 | updateBuf(bbuf, out, sort); 137 | } 138 | 139 | protected abstract boolean needUpdate(int cmp); 140 | 141 | private void updateBuf(UDAFFOGATBuffer bbuf, Object outColVal, Object sortColVal) { 142 | if (bbuf.sortColKeptValue == null) { 143 | bbuf.sortColKeptValue = ObjectInspectorUtils.copyToStandardObject(sortColVal, sortColOI, ObjectInspectorCopyOption.WRITABLE); 144 | bbuf.outColKeptValue = ObjectInspectorUtils.copyToStandardObject(outColVal, outColOI, ObjectInspectorCopyOption.WRITABLE); 145 | } else { 146 | int cmp = ObjectInspectorUtils.compare(sortColVal, sortColOI, bbuf.sortColKeptValue, 147 | ObjectInspectorUtils.getStandardObjectInspector(sortColOI, ObjectInspectorCopyOption.WRITABLE) 148 | ); 149 | if (needUpdate(cmp)) { 150 | bbuf.sortColKeptValue = ObjectInspectorUtils.copyToStandardObject(sortColVal, sortColOI, ObjectInspectorCopyOption.WRITABLE); 151 | bbuf.outColKeptValue = ObjectInspectorUtils.copyToStandardObject(outColVal, outColOI, ObjectInspectorCopyOption.WRITABLE); 152 | } 153 | } 154 | } 155 | 156 | @Override 157 | public Object terminate(AggregationBuffer buf) throws HiveException { 158 | UDAFFOGATBuffer bbuf = (UDAFFOGATBuffer)buf; 159 | return bbuf.outColKeptValue; 160 | } 161 | 162 | @Override 163 | public Object terminatePartial(AggregationBuffer buf) throws HiveException { 164 | UDAFFOGATBuffer bbuf = (UDAFFOGATBuffer)buf; 165 | Object[] out = new Object[2]; 166 | out[0] = (bbuf.outColKeptValue); 167 | out[1] = (bbuf.sortColKeptValue); 168 | // System.out.println("TERMINATE PARTIAL RETURN " + out[0] + " -- " + out[1] + " soi is " + soi); 169 | return out; 170 | } 171 | } 172 | 173 | } 174 | -------------------------------------------------------------------------------- /src/com/dataiku/hive/storage/XMLHiveInputFormat.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */
17 |
18 |
19 | /**
20 |  * Adapted from XmlInputFormat from Mahout project
21 |  */
22 | package com.dataiku.hive.storage;
23 |
24 | import java.io.IOException;
25 | import java.io.InputStream;
26 | import java.util.Properties;
27 |
28 |
29 | import org.apache.commons.logging.Log;
30 | import org.apache.commons.logging.LogFactory;
31 | import org.apache.hadoop.fs.FSDataInputStream;
32 | import org.apache.hadoop.fs.FileSystem;
33 | import org.apache.hadoop.fs.Path;
34 | import org.apache.hadoop.fs.Seekable;
35 | import org.apache.hadoop.hive.ql.exec.Utilities;
36 | import org.apache.hadoop.io.DataOutputBuffer;
37 | import org.apache.hadoop.io.LongWritable;
38 | import org.apache.hadoop.io.Text;
39 | import org.apache.hadoop.io.compress.CompressionCodec;
40 | import org.apache.hadoop.io.compress.CompressionCodecFactory;
41 | import org.apache.hadoop.mapred.*;
42 |
43 |
44 | /**
45 |  * Reads records that are delimited by a specific begin/end tag.
46 |  */
47 | public class XMLHiveInputFormat extends TextInputFormat {
48 |
49 |     public static final String TAG_KEY = "xml.tag";
50 |
51 |     public static final Log LOG = LogFactory.getLog(XMLHiveInputFormat.class.getName());
52 |
53 |
54 |     @Override
55 |     public RecordReader<LongWritable, Text> getRecordReader(InputSplit inputSplit,
56 |                                                             JobConf jobConf,
57 |                                                             Reporter reporter) throws IOException {
58 |         return new XmlRecordReader((FileSplit) inputSplit, jobConf);
59 |     }
60 |
61 |     @Override
62 |     protected boolean isSplitable(FileSystem fs, Path file) {
63 |         return false;
64 |     }
65 |
66 |     /**
67 |      * XMLRecordReader class to read through a given xml document to output xml
68 |      * blocks as records as specified by the start tag and end tag
69 |      *
70 |      */
71 |     public static class XmlRecordReader implements
72 |             RecordReader<LongWritable, Text> {
73 |         private final byte[] startTag;
74 |         private final byte[] endTag;
75 |         private final long start;
76 |         private final long end;
77 |         private final InputStream fsin;
78 |         private final DataOutputBuffer buffer = new DataOutputBuffer();
79 |
80 |         public XmlRecordReader(FileSplit split, JobConf jobConf) throws IOException {
81 |             String tagKey = jobConf.get(TAG_KEY);
82 |             if (tagKey == null) {
83 |                 try {
84 |                     Properties tableProperties = Utilities.getMapRedWork(jobConf).getPathToPartitionInfo().get(getInputPaths(jobConf)[0].toString()).getTableDesc().getProperties();
85 |                     tagKey = tableProperties.getProperty(TAG_KEY);
86 |                 } catch (Exception e) {
87 |                     throw new IOException("Unable to retrieve value for " + TAG_KEY, e);
88 |                 }
89 |                 if (tagKey == null) {
90 |                     throw new IOException("Unable to retrieve value for " + TAG_KEY);
91 |                 }
92 |             }
93 |             String startTagString = "<" + tagKey;
94 |             String endTagString = "</" + tagKey + ">";
95 |             startTag = startTagString.getBytes("utf-8");
96 |             endTag = endTagString.getBytes("utf-8");
97 |
98 |             // open the file and seek to the start of the split
99 |             start = split.getStart();
100 |             end = start + split.getLength();
101 |             Path file = split.getPath();
102 |             FileSystem fs = file.getFileSystem(jobConf);
103 |
104 |
105 |             CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(jobConf);
106 |             CompressionCodec codec = compressionCodecs.getCodec(file);
107 |
108 |             if (codec != null) {
109 |                 fsin = codec.createInputStream(fs.open(split.getPath()));
110 |                 LOG.info("Initialized XmlRecordReader with codec " + codec.getClass().getName() + " with tag " + tagKey + " to " + split.getPath().toString() + " " + start + " " + end);
111 |             } else {
112 |                 fsin = fs.open(split.getPath());
113 |                 ((FSDataInputStream) fsin).seek(start);
114 |                 LOG.info("Initialized XmlRecordReader with no codec with tag " + tagKey + " to " + split.getPath().toString() + " " + start + " " + end);
115 |
116 |             }
117 |         }
118 |
119 |
120 |         protected long pos() throws IOException {
121 |             return ((Seekable) fsin).getPos();
122 |
123 |         }
124 |
125 |         protected boolean readUntilSlashOrOpenTag(LongWritable key, Text value, int b) throws IOException {
126 |             while (true) {
127 |                 if (b == -1) {
128 |                     return false;
129 |                 }
130 |                 writeToBuffer(b);
131 |                 if (b == (int) '/') {
132 |                     b = fsin.read();
133 |                     writeToBuffer(b);
134 |                     if (b == (int) '>') { // found "/>": the element is self-closing and already complete
135 |                         key.set(pos());
136 |                         value.set(buffer.getData(), 0, buffer.getLength());
137 |                         return true;
138 |                     }
139 |                 }
140 |
141 |                 if (b == (int) '>') {
142 |                     return false;
143 |                 }
144 |                 b = fsin.read();
145 |             }
146 |         }
147 |
148 |         @Override
149 |         public boolean next(LongWritable key, Text value) throws IOException {
150 |             if (true || pos() < end) { // we do not support split
151 |                 if (readUntilMatch(startTag, false)) {
152 |                     try {
153 |
154 |                         int b = fsin.read();
155 |                         if (Character.isLetterOrDigit(b)) {
156 |                             return next(key, value); // tag name only partially matched: skip and keep scanning
157 |                         }
158 |
159 |                         // the record starts with the start tag itself
160 |                         buffer.write(startTag);
161 |
162 |
163 |                         if (readUntilSlashOrOpenTag(key, value, b)) {
164 |                             //LOG.info("Key:" + key.toString() + " Value:" + value.toString());
165 |                             return true;
166 |                         }
167 |
168 |                         // Read until match tag...
169 |                         if (readUntilMatch(endTag, true)) {
170 |                             key.set(pos());
171 |                             value.set(buffer.getData(), 0, buffer.getLength());
172 |
173 |                             //String s = value.toString();
174 |                             //LOG.info("Key:" + key.toString() + " Begin:" + s.substring(0, Math.min(s.length(), 10))
175 |                             //+ " End:" + s.substring(s.length() - Math.min(s.length(), 10), s.length()) + "#");
176 |                             return true;
177 |                         }
178 |                     } finally {
179 |                         buffer.reset();
180 |                     }
181 |                 }
182 |             }
183 |             LOG.info("No Match startTag");
184 |
185 |             return false;
186 |         }
187 |
188 |         @Override
189 |         public LongWritable createKey() {
190 |             return new LongWritable();
191 |         }
192 |
193 |         @Override
194 |         public Text createValue() {
195 |             return new Text();
196 |         }
197 |
198 |         @Override
199 |         public long getPos() throws IOException {
200 |             return pos();
201 |         }
202 |
203 |         @Override
204 |         public void close() throws IOException {
205 |             fsin.close();
206 |         }
207 |
208 |         @Override
209 |         public float getProgress() throws IOException {
210 |             return (pos() - start) / (float) (end - start);
211 |         }
212 |
213 |         private void writeToBuffer(int b) throws IOException {
214 |             if (b == '\n') {
215 |                 buffer.write(' ');
216 |             } else if (b != '\r') {
217 |                 buffer.write(b);
218 |             }
219 |         }
220 |
221 |         private boolean readUntilMatch(byte[] match, boolean withinBlock) throws IOException {
222 |             int i = 0;
223 |             while (true) {
224 |                 int b = fsin.read();
225 |                 // end of file:
226 |                 if (b == -1) return false;
227 |                 // save to buffer:
228 |                 if (withinBlock) {
229 |                     writeToBuffer(b);
230 |                 }
231 |
232 |                 // check if we're matching:
233 |                 if (b == match[i]) {
234 |                     i++;
235 |                     if (i >= match.length) return true;
236 |                 } else i = 0;
237 |                 // see if we've passed the stop point:
238 |                 // if (!withinBlock && i == 0 && pos() >= end) return false; // we do not support split
239 |             }
240 |         }
241 |     }
242 | }
--------------------------------------------------------------------------------
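A usage sketch for the input format above: the reader pulls the element name from the xml.tag property (from the job configuration or, failing that, from the table properties) and emits one <tag>...</tag> or self-closing <tag .../> block per record. The SerDe wiring and the single-string-column layout below are assumptions for illustration only:

CREATE EXTERNAL TABLE products_xml (record STRING)
ROW FORMAT SERDE 'com.dataiku.hive.storage.XMLSerde'
STORED AS
  INPUTFORMAT 'com.dataiku.hive.storage.XMLHiveInputFormat'
  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION '/data/products'
TBLPROPERTIES ('xml.tag' = 'product');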
/src/com/dataiku/hive/udf/maps/UDAFCountDistinctToMap.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * Copyright 2013 Dataiku
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the
5 |  * "License"); you may not use this file except in compliance
6 |  * with the License. You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | package com.dataiku.hive.udf.maps;
17 |
18 |
19 |
20 | import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
21 | import org.apache.hadoop.hive.ql.metadata.HiveException;
22 | import org.apache.hadoop.hive.ql.parse.SemanticException;
23 | import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver;
24 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
25 | import org.apache.hadoop.hive.serde2.objectinspector.*;
26 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.*;
27 | import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
28 |
29 |
30 | import java.util.ArrayList;
31 | import java.util.HashMap;
32 | import java.util.List;
33 | import java.util.Map;
34 |
35 | public class UDAFCountDistinctToMap extends AbstractGenericUDAFResolver {
36 |     @Override
37 |     public GenericUDAFEvaluator getEvaluator(TypeInfo[] tis) throws SemanticException {
38 |         if (tis.length != 2) {
39 |             throw new UDFArgumentTypeException(tis.length - 1, "Exactly two arguments are expected.");
40 |         }
41 |         return new CountDistinctToMap();
42 |     }
43 |
44 |     /**
45 |      * count_distinct_to_map(K key, V value)
46 |      * (key, value)     - PARTIAL1 --> Map<K, List<V>>
47 |      * Map<K, List<V>>  - PARTIAL2 --> Map<K, List<V>>
48 |      * Map<K, List<V>>  - FINAL    --> Map<K, Integer>
49 |      * (key, value)     - COMPLETE --> Map<K, Integer>
50 |      */
51 |     public static class CountDistinctToMap extends GenericUDAFEvaluator {
52 |         private PrimitiveObjectInspector keyTypeOI;
53 |         private PrimitiveObjectInspector valueTypeOI;
54 |         private AbstractPrimitiveWritableObjectInspector keyOutputTypeOI;
55 |         private AbstractPrimitiveWritableObjectInspector valueOutputTypeOI;
56 |         private ListObjectInspector valueListInputTypeOI;
57 |         private StandardListObjectInspector valueListOutputTypeOI;
58 |         private MapObjectInspector intermediateMapInputTypeOI;
59 |         private StandardMapObjectInspector intermediateMapOutputTypeOI;
60 |         private StandardMapObjectInspector finalMapTypeOI;
61 |
62 |         @Override
63 |         public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException {
64 |             super.init(m, parameters);
65 |             /* Setup input OI */
66 |             if (m == Mode.PARTIAL1) {
67 |                 keyTypeOI = (PrimitiveObjectInspector) parameters[0];
68 |                 valueTypeOI = (PrimitiveObjectInspector) parameters[1];
69 |                 keyOutputTypeOI = (AbstractPrimitiveWritableObjectInspector) ObjectInspectorUtils.getStandardObjectInspector(keyTypeOI, ObjectInspectorUtils.ObjectInspectorCopyOption.WRITABLE);
70 |                 valueOutputTypeOI = (AbstractPrimitiveWritableObjectInspector) ObjectInspectorUtils.getStandardObjectInspector(valueTypeOI, ObjectInspectorUtils.ObjectInspectorCopyOption.WRITABLE);
71 |
72 |                 valueListOutputTypeOI = ObjectInspectorFactory.getStandardListObjectInspector(valueOutputTypeOI);
73 |                 intermediateMapOutputTypeOI = ObjectInspectorFactory.getStandardMapObjectInspector(keyOutputTypeOI, valueListOutputTypeOI);
74 |                 return intermediateMapOutputTypeOI;
75 |             } else if (m == Mode.COMPLETE) {
76 |                 keyTypeOI = (PrimitiveObjectInspector) parameters[0];
77 |                 valueTypeOI = (PrimitiveObjectInspector) parameters[1];
78 |                 keyOutputTypeOI = (AbstractPrimitiveWritableObjectInspector) ObjectInspectorUtils.getStandardObjectInspector(keyTypeOI, ObjectInspectorUtils.ObjectInspectorCopyOption.WRITABLE);
79 |                 valueOutputTypeOI = (AbstractPrimitiveWritableObjectInspector) ObjectInspectorUtils.getStandardObjectInspector(valueTypeOI, ObjectInspectorUtils.ObjectInspectorCopyOption.WRITABLE);
80 |
81 |                 finalMapTypeOI = ObjectInspectorFactory.getStandardMapObjectInspector(keyOutputTypeOI, PrimitiveObjectInspectorFactory.javaIntObjectInspector); // terminate() emits Integer counts
82 |                 /* Input is original data */
83 |                 return finalMapTypeOI;
84 |             } else if (m == Mode.PARTIAL2) {
85 |                 intermediateMapInputTypeOI = (MapObjectInspector) parameters[0];
86 |                 keyOutputTypeOI = (AbstractPrimitiveWritableObjectInspector) intermediateMapInputTypeOI.getMapKeyObjectInspector();
87 |                 valueListInputTypeOI = (ListObjectInspector) intermediateMapInputTypeOI.getMapValueObjectInspector();
88 |                 valueOutputTypeOI = (AbstractPrimitiveWritableObjectInspector) valueListInputTypeOI.getListElementObjectInspector();
89 |                 valueListOutputTypeOI = ObjectInspectorFactory.getStandardListObjectInspector(valueOutputTypeOI);
90 |                 intermediateMapOutputTypeOI = ObjectInspectorFactory.getStandardMapObjectInspector(keyOutputTypeOI, valueListOutputTypeOI);
91 |                 return intermediateMapOutputTypeOI;
92 |             } else if (m == Mode.FINAL) {
93 |                 intermediateMapInputTypeOI = (MapObjectInspector) parameters[0];
94 |                 keyOutputTypeOI = (AbstractPrimitiveWritableObjectInspector) intermediateMapInputTypeOI.getMapKeyObjectInspector();
95 |                 valueListInputTypeOI = (ListObjectInspector) intermediateMapInputTypeOI.getMapValueObjectInspector();
96 |                 valueOutputTypeOI = (AbstractPrimitiveWritableObjectInspector) valueListInputTypeOI.getListElementObjectInspector();
97 |                 valueListOutputTypeOI = ObjectInspectorFactory.getStandardListObjectInspector(valueOutputTypeOI);
98 |                 intermediateMapOutputTypeOI = ObjectInspectorFactory.getStandardMapObjectInspector(keyOutputTypeOI, valueListOutputTypeOI);
99 |                 finalMapTypeOI = ObjectInspectorFactory.getStandardMapObjectInspector(keyOutputTypeOI, PrimitiveObjectInspectorFactory.javaIntObjectInspector);
100 |                 return finalMapTypeOI;
101 |             } else {
102 |                 throw new IllegalArgumentException("Invalid mode");
103 |             }
104 |         }
105 |
106 |         class MapAgg implements AggregationBuffer {
107 |             Map<Object, ArrayList<Object>> content;
108 |
109 |             protected MapAgg() {
110 |                 content = new HashMap<Object, ArrayList<Object>>();
111 |             }
112 |
113 |             protected void reset() {
114 |                 content.clear();
115 |             }
116 |
117 |             protected void addEntry(Object k, Object v) {
118 |                 if (v == null) {
119 |                     return;
120 |                 }
121 |
122 |                 ArrayList<Object> vv = content.get(k);
123 |                 if (vv == null) {
124 |                     vv = new ArrayList<Object>();
125 |                     content.put(k, vv);
126 |
127 |                 }
128 |                 if (!vv.contains(v)) {
129 |                     vv.add(v);
130 |                 }
131 |
132 |             }
133 |
134 |             protected void iterate(Object[] parameters) {
135 |                 Object key = parameters[0];
136 |                 Object value = parameters[1];
137 |                 addEntry(keyOutputTypeOI.copyObject(keyTypeOI.getPrimitiveWritableObject(key)),
138 |                          valueOutputTypeOI.copyObject(valueTypeOI.getPrimitiveWritableObject(value)));
139 |             }
140 |
141 |             protected Object terminatePartial() {
142 |                 return content;
143 |             }
144 |
145 |             protected void merge(Object o) {
146 |                 Map<?, ?> map = intermediateMapInputTypeOI.getMap(o);
147 |                 for (Map.Entry<?, ?> entry : map.entrySet()) {
148 |                     Object k = entry.getKey();
149 |                     Object v = entry.getValue();
150 |                     for (Object oo : valueListInputTypeOI.getList(v)) {
151 |                         addEntry(keyOutputTypeOI.copyObject(k), valueOutputTypeOI.copyObject(oo));
152 |                     }
153 |                 }
154 |             }
155 |
156 |             protected Object terminate() {
157 |                 Map<Object, ArrayList<Object>> map = content;
158 |                 Map<Object, Integer> mapFinal = new HashMap<Object, Integer>();
159 |
160 |                 for (Map.Entry<Object, ArrayList<Object>> entry : content.entrySet()) {
161 |                     mapFinal.put(entry.getKey(), Integer.valueOf(entry.getValue().size()));
162 |                 }
163 |                 return mapFinal;
164 |             }
165 |
166 |         }
167 |
168 |         @Override
169 |         public void reset(AggregationBuffer ab) throws HiveException {
170 |             ((MapAgg) ab).reset();
171 |         }
172 |
173 |         @Override
174 |         public AggregationBuffer getNewAggregationBuffer() throws HiveException {
175 |             return new MapAgg();
176 |         }
177 |
178 |         @Override
179 |         public void iterate(AggregationBuffer ab, Object[] parameters) throws HiveException {
180 |             assert (parameters.length == 2);
181 |             ((MapAgg) ab).iterate(parameters);
182 |         }
183 |
184 |         @Override
185 |         public Object terminatePartial(AggregationBuffer ab) throws HiveException {
186 |             return ((MapAgg) ab).terminatePartial();
187 |         }
188 |
189 |         @Override
190 |         public void merge(AggregationBuffer ab, Object p) throws HiveException {
191 |             ((MapAgg) ab).merge(p);
192 |         }
193 |
194 |         @Override
195 |         public Object terminate(AggregationBuffer ab) throws HiveException {
196 |             return ((MapAgg) ab).terminate();
197 |         }
198 |     }
199 | }
200 |
--------------------------------------------------------------------------------
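What the aggregator above computes, sketched at the query level: for each group it collects, per key, the set of distinct values seen, and returns a map from key to the number of distinct values. Assuming it is registered as a temporary function, with the function, table and column names below being illustrative only:

CREATE TEMPORARY FUNCTION count_distinct_map
  AS 'com.dataiku.hive.udf.maps.UDAFCountDistinctToMap';
-- per user: category -> number of distinct products bought in that category
SELECT user_id, count_distinct_map(category, product_id) AS distinct_products_by_category
FROM orders
GROUP BY user_id;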
/src/com/dataiku/hive/udf/maths/UDFExponentialSmoothingMovingAverage.java:
--------------------------------------------------------------------------------
1 | package com.dataiku.hive.udf.maths;
2 |
3 | /**
4 |  * Author: Matthieu Scordia
5 |  * Date: 04/03/14
6 |  * Time: 15:12
7 |  */
8 |
9 | import java.util.ArrayList;
10 | import java.util.List;
11 |
12 | import org.apache.commons.logging.Log;
13 | import org.apache.commons.logging.LogFactory;
14 | import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver;
15 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
16 | import org.apache.hadoop.hive.ql.exec.Description;
17 | import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
18 | import org.apache.hadoop.hive.ql.metadata.HiveException;
19 | import org.apache.hadoop.hive.ql.parse.SemanticException;
20 | import org.apache.hadoop.hive.serde2.io.DoubleWritable;
21 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
22 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
23 | import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
24 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
25 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
26 | import org.apache.hadoop.hive.serde2.objectinspector.StandardListObjectInspector;
27 | import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
28 | import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
29 |
30 |
31 |
32 | /**
33 |  * UDFExponentialSmoothingMovingAverage
34 |  *
35 |  */
36 | @Description(name = "moving_avg", value = "_FUNC_(period, x, window, div, position) - Returns the moving mean of a set of numbers over a window of n observations, weighting observation i by 1/pow(div, i)")
37 | public class UDFExponentialSmoothingMovingAverage extends AbstractGenericUDAFResolver {
38 |
39 |     static final Log LOG = LogFactory.getLog(UDFExponentialSmoothingMovingAverage.class.getName());
40 |
41 |     @Override
42 |     public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException {
43 |
44 |         //System.out.println("check getEvaluator in");
45 |
46 |         // We need exactly five parameters
47 |         if (parameters.length != 5) {
48 |             throw new UDFArgumentTypeException(parameters.length - 1, "Moving Average requires 5 parameters");
49 |         }
50 |
51 |         // check the first parameter to make sure its type is numeric
52 |
53 |         if (parameters[0].getCategory() != ObjectInspector.Category.PRIMITIVE)
54 |         {
55 |             throw new UDFArgumentTypeException(0, "Only primitive, numeric types can have a moving average but " +
56 |                     parameters[0].getTypeName() + " was passed.");
57 |         }
58 |
59 |         // if it's a primitive, let's make sure it's numeric
60 |         switch (((PrimitiveTypeInfo) parameters[0]).getPrimitiveCategory()) {
61 |             // fall through all numeric primitives
62 |             case FLOAT:
63 |             case DOUBLE:
64 |             case INT:
65 |             case LONG:
66 |             case SHORT:
67 |                 break;
68 |             default:
69 |                 throw new UDFArgumentTypeException(0, "Only numeric type arguments (excluding bytes and timestamps) are accepted " +
70 |                         "but " + parameters[0].getTypeName() + " was passed.");
71 |         }
72 |
73 |
74 |         // check the second parameter
75 |         if (parameters[1].getCategory() != ObjectInspector.Category.PRIMITIVE)
76 |         {
77 |             throw new UDFArgumentTypeException(1, "Only primitive, numeric types can have a moving average but " +
78 |                     parameters[1].getTypeName() + " was passed.");
79 |         }
80 |
81 |         // if it's a primitive, let's make sure it's numeric
82 |         switch (((PrimitiveTypeInfo) parameters[1]).getPrimitiveCategory()) {
83 |             // fall through all numeric primitives
84 |             case FLOAT:
85 |             case DOUBLE:
86 |             case INT:
87 |             case LONG:
88 |             case SHORT:
89 |                 break;
90 |             default:
91 |                 throw new UDFArgumentTypeException(1, "Only numeric type arguments (excluding bytes and timestamps) are accepted " +
92 |                         "but " + parameters[1].getTypeName() + " was passed.");
93 |         }
94 |
95 |         // ensure that the window size is an integer
96 |         if (parameters[2].getCategory() != ObjectInspector.Category.PRIMITIVE)
97 |         {
98 |             throw new UDFArgumentTypeException(2, "ensure that the window size is an integer");
99 |         }
100 |
101 |         if (((PrimitiveTypeInfo) parameters[2]).getPrimitiveCategory() !=
102 |                 PrimitiveObjectInspector.PrimitiveCategory.INT)
103 |         {
104 |             throw new UDFArgumentTypeException(2, "ensure that the window size is an integer");
105 |         }
106 |
107 |
108 |         // ensure that the divisor is a double
109 |         if (parameters[3].getCategory() != ObjectInspector.Category.PRIMITIVE)
110 |         {
111 |             throw new UDFArgumentTypeException(3, "ensure that the divisor is a double");
112 |         }
113 |
114 |         if (((PrimitiveTypeInfo) parameters[3]).getPrimitiveCategory() !=
115 |                 PrimitiveObjectInspector.PrimitiveCategory.DOUBLE)
116 |         {
117 |             throw new UDFArgumentTypeException(3, "ensure that the divisor is a double");
118 |         }
119 |
120 |         // ensure that the position is an int
121 |         if (parameters[4].getCategory() != ObjectInspector.Category.PRIMITIVE)
122 |         {
123 |             throw new UDFArgumentTypeException(4, "ensure that the position is an int.");
124 |         }
125 |
126 |         if (((PrimitiveTypeInfo) parameters[4]).getPrimitiveCategory() !=
127 |                 PrimitiveObjectInspector.PrimitiveCategory.INT)
128 |         {
129 |             throw new UDFArgumentTypeException(4, "ensure that the position is an int.");
130 |         }
131 |
132 |         //System.out.println("check getEvaluator out");
133 |
134 |         return new GenericUDAFMovingAverageEvaluator();
135 |     }
136 |
137 |     public static class GenericUDAFMovingAverageEvaluator extends GenericUDAFEvaluator {
138 |
139 |         // input inspectors for PARTIAL1 and COMPLETE
140 |         private PrimitiveObjectInspector periodOI;
141 |         private PrimitiveObjectInspector inputOI;
142 |         private PrimitiveObjectInspector windowSizeOI;
143 |         private PrimitiveObjectInspector diviseurOI;
144 |         private PrimitiveObjectInspector positionOI;
145 |
146 |         // input inspectors for PARTIAL2 and FINAL
147 |         // list for MAs and one for residuals
148 |         private StandardListObjectInspector loi;
149 |
150 |
151 |         @Override
152 |         public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException {
153 |             super.init(m, parameters);
154 |
155 |             // initialize input inspectors
156 |             if (m == Mode.PARTIAL1 || m == Mode.COMPLETE)
157 |             {
158 |                 assert(parameters.length == 5);
159 |                 periodOI = (PrimitiveObjectInspector) parameters[0];
160 |                 inputOI = (PrimitiveObjectInspector) parameters[1];
161 |                 windowSizeOI = (PrimitiveObjectInspector) parameters[2];
162 |                 diviseurOI = (PrimitiveObjectInspector) parameters[3];
163 |                 positionOI = (PrimitiveObjectInspector) parameters[4];
164 |
165 |
166 |             }
167 |
168 |             else
169 |             {
170 |                 loi = (StandardListObjectInspector) parameters[0];
171 |             }
172 |
173 |             // init output object inspectors
174 |             if (m == Mode.PARTIAL1 || m == Mode.PARTIAL2) {
175 |                 // The output of a partial aggregation is a list of doubles representing the
176 |                 // moving average being constructed.
177 | // the first element in the list will be the window size 178 | // 179 | return ObjectInspectorFactory.getStandardListObjectInspector( 180 | PrimitiveObjectInspectorFactory.writableDoubleObjectInspector); 181 | } 182 | else { 183 | return PrimitiveObjectInspectorFactory.writableDoubleObjectInspector; 184 | } 185 | } 186 | 187 | @Override 188 | public Object terminatePartial(AggregationBuffer agg) throws HiveException { 189 | // return an ArrayList where the first parameter is the window size 190 | MaAgg myagg = (MaAgg) agg; 191 | return myagg.prefixSum.serialize(); 192 | 193 | } 194 | 195 | @Override 196 | public Object terminate(AggregationBuffer agg) throws HiveException { 197 | // final return value goes here 198 | MaAgg myagg = (MaAgg) agg; 199 | 200 | if (myagg.prefixSum.tableSize() < 1) 201 | { 202 | return null; 203 | } 204 | 205 | else 206 | { 207 | ArrayList result = new ArrayList(); 208 | DoubleWritable[] entry = new DoubleWritable[1]; 209 | entry[0] = new DoubleWritable(myagg.prefixSum.getEntry(myagg.prefixSum.tableSize()-1).movingAverage); 210 | return entry[0]; 211 | } 212 | 213 | } 214 | 215 | @SuppressWarnings("unchecked") 216 | @Override 217 | public void merge(AggregationBuffer agg, Object partial) throws HiveException { 218 | // if we're merging two separate sets we're creating one table that's doubly long 219 | if (partial != null) 220 | { 221 | MaAgg myagg = (MaAgg) agg; 222 | List partialMovingAverage = (List) loi.getList(partial); 223 | 224 | myagg.prefixSum.merge(partialMovingAverage); 225 | } 226 | 227 | } 228 | 229 | @Override 230 | public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException { 231 | 232 | assert (parameters.length == 5); 233 | 234 | if (parameters[0] == null || parameters[1] == null || parameters[2] == null || parameters[3] == null || parameters[4] == null) 235 | { 236 | return; 237 | } 238 | 239 | MaAgg myagg = (MaAgg) agg; 240 | 241 | // Parse out the window size just once if we haven't done so before. We need a window of at least 1, 242 | // otherwise there's no window. 243 | if (!myagg.prefixSum.isReady()) 244 | { 245 | int windowSize = PrimitiveObjectInspectorUtils.getInt(parameters[2], windowSizeOI); 246 | double diviseur = PrimitiveObjectInspectorUtils.getDouble(parameters[3], diviseurOI); 247 | int position = PrimitiveObjectInspectorUtils.getInt(parameters[4], positionOI); 248 | 249 | if (windowSize < 1) 250 | { 251 | throw new HiveException(getClass().getSimpleName() + " needs a window size >= 1"); 252 | } 253 | myagg.prefixSum.allocate(windowSize, diviseur, position); 254 | } 255 | 256 | //Add the current data point and compute the average 257 | int p = PrimitiveObjectInspectorUtils.getInt(parameters[0], periodOI); 258 | double v = PrimitiveObjectInspectorUtils.getDouble(parameters[1], inputOI); 259 | myagg.prefixSum.add(p,v); 260 | 261 | } 262 | 263 | // Aggregation buffer definition and manipulation methods 264 | static class MaAgg implements AggregationBuffer { 265 | PrefixSumMovingAverage prefixSum; 266 | }; 267 | 268 | @Override 269 | public AggregationBuffer getNewAggregationBuffer() throws HiveException { 270 | MaAgg result = new MaAgg(); 271 | reset(result); 272 | return result; 273 | } 274 | 275 | @Override 276 | public void reset(AggregationBuffer agg) throws HiveException { 277 | MaAgg myagg = (MaAgg) agg; 278 | myagg.prefixSum = new PrefixSumMovingAverage(); 279 | myagg.prefixSum.reset(); 280 | } 281 | } 282 | 283 | } --------------------------------------------------------------------------------
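To close, a query-level sketch of how the moving-average UDAF above is meant to be called. The registration name follows the @Description annotation (moving_avg); the argument order (period, value, window size, divisor, position) follows the parameter checks in getEvaluator; the table and column names are illustrative only:

CREATE TEMPORARY FUNCTION moving_avg
  AS 'com.dataiku.hive.udf.maths.UDFExponentialSmoothingMovingAverage';
SELECT product_id,
       moving_avg(day_index, daily_sales, 7, 2.0, 0) AS smoothed_sales
FROM daily_sales_by_product
GROUP BY product_id;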