├── .gitignore
├── hive-init.hql
├── ivysettings.xml
├── src
└── com
│ └── dataiku
│ └── hive
│ ├── udf
│ ├── maps
│ │ ├── UDFMapMaxKey.java
│ │ ├── UDFCountToMap.java
│ │ ├── UDFMapValueFilterLowerThan.java
│ │ ├── UDFMapValueFilterTopN.java
│ │ ├── UDAFMapGroupSum.java
│ │ └── UDAFCountDistinctToMap.java
│ ├── arrays
│ │ ├── UDFArrayCountDistinct.java
│ │ ├── UDFArrayGet.java
│ │ ├── UDFArrayIntSum.java
│ │ ├── UDFArrayCountEquals.java
│ │ ├── UDFArrayJoin.java
│ │ ├── UDFArraySubSequences.java
│ │ └── UDAFCollectToArray.java
│ ├── window
│ │ ├── Rank.java
│ │ ├── UDAFLastOfGroupAccordingTo.java
│ │ ├── UDAFFirstOfGroupAccordingTo.java
│ │ └── UDAFFirstOrLastOfGroupAccordingTo.java
│ ├── strings
│ │ └── UDFStringSubSequences.java
│ └── maths
│ │ ├── PrefixSumMovingAverage.java
│ │ └── UDFExponentialSmoothingMovingAverage.java
│ └── storage
│ ├── XMLHiveStorageHandler.java
│ ├── XMLSerde.java
│ └── XMLHiveInputFormat.java
├── ivy.xml
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | .classpath
2 | .project
3 | lib
4 | dist
5 | build
6 |
--------------------------------------------------------------------------------
/hive-init.hql:
--------------------------------------------------------------------------------
-- Registers the Dataiku Hive UDF jar and exposes a few of its functions
-- under short names. Run with: hive -i hive-init.hql
ADD JAR dist/dataiku-hive-udf.jar;

-- map<string,int> helpers: distinct-count aggregation, array->count map, top-N filter
CREATE TEMPORARY FUNCTION count_distinct_map as 'com.dataiku.hive.udf.maps.UDAFCountDistinctToMap';
CREATE TEMPORARY FUNCTION array_count_to_map as 'com.dataiku.hive.udf.maps.UDFCountToMap';
CREATE TEMPORARY FUNCTION map_filter_top as 'com.dataiku.hive.udf.maps.UDFMapValueFilterTopN';
-- array helper: aggregate a group's values into a single array
CREATE TEMPORARY FUNCTION collect_all as 'com.dataiku.hive.udf.arrays.UDAFCollectToArray';
--------------------------------------------------------------------------------
/ivysettings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/src/com/dataiku/hive/udf/maps/UDFMapMaxKey.java:
--------------------------------------------------------------------------------
1 | package com.dataiku.hive.udf.maps;
2 |
3 | import org.apache.hadoop.hive.ql.exec.UDF;
4 |
5 | import java.util.*;
6 |
7 | /**
8 | * Retrieve the key with the maximal value for a map
9 | */
10 | public class UDFMapMaxKey extends UDF {
11 | public String evaluate(Map map) {
12 | String maxKey = null;
13 | Integer maxValue = null;
14 | for(Map.Entry entry : map.entrySet()) {
15 | if (maxValue == null || entry.getValue() > maxValue) {
16 | maxKey = entry.getKey();
17 | maxValue = entry.getValue();
18 | }
19 | }
20 | return maxKey;
21 | }
22 | }
23 |
24 |
--------------------------------------------------------------------------------
/src/com/dataiku/hive/udf/maps/UDFCountToMap.java:
--------------------------------------------------------------------------------
1 | package com.dataiku.hive.udf.maps;
2 |
3 | import org.apache.hadoop.hive.ql.exec.UDF;
4 |
5 | import java.util.*;
6 |
7 | public class UDFCountToMap extends UDF {
8 |
9 |
10 | public Map evaluate(List a) {
11 | HashMap map= new HashMap();
12 | if (a == null) {
13 | return null;
14 | }
15 | for(String s : a) {
16 | if (s == null) {
17 | continue;
18 | }
19 | if (map.containsKey(s)) {
20 | map.put(s, map.get(s) + 1);
21 | } else {
22 | map.put(s, 1);
23 | }
24 | }
25 | return map;
26 | }
27 |
28 | }
29 |
--------------------------------------------------------------------------------
/src/com/dataiku/hive/udf/maps/UDFMapValueFilterLowerThan.java:
--------------------------------------------------------------------------------
1 | package com.dataiku.hive.udf.maps;
2 |
3 | import org.apache.hadoop.hive.ql.exec.UDF;
4 |
5 | import java.util.HashSet;
6 | import java.util.Map;
7 | import java.util.Set;
8 |
9 | /**
10 | */
11 | public class UDFMapValueFilterLowerThan extends UDF {
12 |
13 | Set toRemove = new HashSet();
14 |
15 | public Map evaluate(Map map, Integer minValue) {
16 |
17 | toRemove.clear();
18 | for(String s : map.keySet()) {
19 | if (map.get(s) < minValue) {
20 |
21 | toRemove.add(s);
22 | }
23 | }
24 |
25 | for(String s : toRemove) {
26 | map.remove(s);
27 | }
28 | return map;
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/src/com/dataiku/hive/udf/arrays/UDFArrayCountDistinct.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2013 Dataiku
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the
5 | * "License"); you may not use this file except in compliance
6 | * with the License. You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package com.dataiku.hive.udf.arrays;
17 | import java.util.HashSet;
18 | import java.util.List;
19 | import java.util.Set;
20 |
21 | import org.apache.hadoop.hive.ql.exec.UDF;
22 |
23 | public class UDFArrayCountDistinct extends UDF {
24 | Set stringSet = new HashSet();
25 |
26 | public int evaluate(List a) {
27 | if (a == null) {
28 | return 0;
29 | }
30 | stringSet.clear();
31 | stringSet.addAll(a);
32 | return stringSet.size();
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/src/com/dataiku/hive/udf/maps/UDFMapValueFilterTopN.java:
--------------------------------------------------------------------------------
1 | package com.dataiku.hive.udf.maps;
2 |
3 | import org.apache.hadoop.hive.ql.exec.UDF;
4 |
5 | import java.util.*;
6 |
7 | /**
8 | * Filter topN Elements from a map
9 | */
10 | public class UDFMapValueFilterTopN extends UDF {
11 | public Map evaluate(Map map, Integer n) {
12 |
13 | if (map.size() < n) {
14 | return map;
15 | }
16 | List list = new ArrayList(map.values());
17 | Collections.sort(list);
18 | int limit = list.get(list.size() - n);
19 | int count = 0;
20 | HashMap nm = new HashMap();
21 |
22 | for(Map.Entry entry : map.entrySet()) {
23 | if (entry.getValue() > limit) {
24 | nm.put(entry.getKey(), entry.getValue());
25 | }
26 | }
27 | for(Map.Entry entry : map.entrySet()) {
28 | if (nm.size() == n) {
29 | break;
30 | }
31 | if (entry.getValue() == limit) {
32 | nm.put(entry.getKey(), entry.getValue());
33 | }
34 | }
35 | return nm;
36 | }
37 | }
38 |
39 |
--------------------------------------------------------------------------------
/src/com/dataiku/hive/udf/arrays/UDFArrayGet.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2013 Dataiku
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the
5 | * "License"); you may not use this file except in compliance
6 | * with the License. You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package com.dataiku.hive.udf.arrays;
17 | import java.util.List;
18 |
19 | import org.apache.hadoop.hive.ql.exec.Description;
20 | import org.apache.hadoop.hive.ql.exec.UDF;
21 | import org.apache.hadoop.io.Text;
22 |
23 | @Description(name="array_get", value="_FUNC_(array, int) - returns the nth object in the array")
24 | public class UDFArrayGet extends UDF {
25 | private Text out = new Text();
26 |
27 | public Text evaluate(List a, int offset) {
28 | if (a == null) return null;
29 | out.set(a.get(offset));
30 | return out;
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/src/com/dataiku/hive/udf/window/Rank.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2013 Dataiku
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the
5 | * "License"); you may not use this file except in compliance
6 | * with the License. You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package com.dataiku.hive.udf.window;
17 | import org.apache.hadoop.hive.ql.exec.Description;
18 | import org.apache.hadoop.hive.ql.exec.UDF;
19 |
20 | @Description(name="rank", value="_FUNC_(string) - Returns the number of times the column had the same value in the previous records")
21 | public final class Rank extends UDF {
22 | private int counter;
23 | private String currentKey;
24 |
25 | public int evaluate(final String key) {
26 | if (!key.equalsIgnoreCase(currentKey)) {
27 | counter = 0;
28 | currentKey = key;
29 | }
30 | return counter++;
31 | }
32 | }
--------------------------------------------------------------------------------
/src/com/dataiku/hive/udf/arrays/UDFArrayIntSum.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2013 Dataiku
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the
5 | * "License"); you may not use this file except in compliance
6 | * with the License. You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package com.dataiku.hive.udf.arrays;
17 | import java.util.List;
18 |
19 | import org.apache.hadoop.hive.ql.exec.Description;
20 | import org.apache.hadoop.hive.ql.exec.UDF;
21 |
22 | @Description(name="array_int_sum", value="_FUNC_(array) - returns the sum of elements in the array")
23 | public class UDFArrayIntSum extends UDF {
24 | public int evaluate(List a) {
25 | if (a == null) return 0;
26 | int sum = 0;
27 | for (int i = 0; i < a.size(); i++) {
28 | Integer elt = a.get(i);
29 | if (elt != null) sum += elt;
30 | }
31 | return sum;
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/src/com/dataiku/hive/udf/window/UDAFLastOfGroupAccordingTo.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2013 Dataiku
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the
5 | * "License"); you may not use this file except in compliance
6 | * with the License. You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package com.dataiku.hive.udf.window;
17 | import org.apache.hadoop.hive.ql.exec.Description;
18 | import org.apache.hadoop.hive.ql.parse.SemanticException;
19 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
20 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFParameterInfo;
21 |
// UDAF "last_of_group": returns outputColumn's value for the row with the
// greatest sortColumn in the group. All real work (buffer handling,
// checkParameters, BaseEvaluator) lives in the shared base class
// UDAFFirstOrLastOfGroupAccordingTo; this subclass only picks the direction.
@Description(name="last_of_group", value="_FUNC_(outputColumn, sortColumn)")
public final class UDAFLastOfGroupAccordingTo extends UDAFFirstOrLastOfGroupAccordingTo {
    @Override
    public GenericUDAFEvaluator getEvaluator(GenericUDAFParameterInfo info) throws SemanticException {
        checkParameters(info); // validates the (outputColumn, sortColumn) argument pair
        return new FirstEvaluator();
    }

    // NOTE(review): despite its name (copy-pasted from the First variant)
    // this evaluator implements "last": cmp > 0 keeps the row whose sort key
    // is larger. Renaming the public nested class could break external
    // references, so it is only flagged here.
    public static class FirstEvaluator extends BaseEvaluator {
        @Override
        protected boolean needUpdate(int cmp) {
            return cmp > 0;
        }
    }

}
38 |
--------------------------------------------------------------------------------
/src/com/dataiku/hive/udf/window/UDAFFirstOfGroupAccordingTo.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2013 Dataiku
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the
5 | * "License"); you may not use this file except in compliance
6 | * with the License. You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package com.dataiku.hive.udf.window;
17 | import org.apache.hadoop.hive.ql.exec.Description;
18 | import org.apache.hadoop.hive.ql.parse.SemanticException;
19 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
20 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFParameterInfo;
21 |
// UDAF "first_of_group": returns outputColumn's value for the row with the
// smallest sortColumn in the group. All real work (buffer handling,
// checkParameters, BaseEvaluator) lives in the shared base class
// UDAFFirstOrLastOfGroupAccordingTo; this subclass only picks the direction.
@Description(name="first_of_group", value="_FUNC_(outputColumn, sortColumn)")
public final class UDAFFirstOfGroupAccordingTo extends UDAFFirstOrLastOfGroupAccordingTo {
    @Override
    public GenericUDAFEvaluator getEvaluator(GenericUDAFParameterInfo info) throws SemanticException {
        checkParameters(info); // validates the (outputColumn, sortColumn) argument pair
        return new FirstEvaluator();
    }

    // cmp < 0 keeps the row whose sort key is smaller, i.e. the "first" row.
    public static class FirstEvaluator extends BaseEvaluator {
        @Override
        protected boolean needUpdate(int cmp) {
            return cmp < 0;
        }
    }

}
38 |
--------------------------------------------------------------------------------
/src/com/dataiku/hive/storage/XMLHiveStorageHandler.java:
--------------------------------------------------------------------------------
1 | package com.dataiku.hive.storage;
2 |
3 | import org.apache.hadoop.conf.Configuration;
4 | import org.apache.hadoop.hive.metastore.HiveMetaHook;
5 | import org.apache.hadoop.hive.ql.metadata.DefaultStorageHandler;
6 | import org.apache.hadoop.hive.ql.metadata.HiveException;
7 | import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
8 | import org.apache.hadoop.hive.ql.plan.TableDesc;
9 | import org.apache.hadoop.hive.ql.security.authorization.HiveAuthorizationProvider;
10 | import org.apache.hadoop.hive.serde2.SerDe;
11 | import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
12 | import org.apache.hadoop.mapred.InputFormat;
13 | import org.apache.hadoop.mapred.OutputFormat;
14 |
15 | import java.util.Map;
16 | import java.util.Properties;
17 |
18 | /**
19 | */
20 | public class XMLHiveStorageHandler extends DefaultStorageHandler {
21 | @Override
22 | public Class extends InputFormat> getInputFormatClass() {
23 | return XMLHiveInputFormat.class;
24 | }
25 |
26 | @Override
27 | public Class extends SerDe> getSerDeClass() {
28 | return XMLSerde.class;
29 | }
30 |
31 | @Override
32 | public Class extends OutputFormat> getOutputFormatClass() {
33 | return org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat.class;
34 | }
35 |
36 |
37 |
38 | @Override
39 | public void configureInputJobProperties(TableDesc tableDesc, Map jobProperties) {
40 | super.configureInputJobProperties(tableDesc, jobProperties); //To change body of overridden methods use File | Settings | File Templates.
41 | Properties props = tableDesc.getProperties();
42 | jobProperties.put(XMLHiveInputFormat.TAG_KEY, props.getProperty(XMLHiveInputFormat.TAG_KEY));
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
/src/com/dataiku/hive/udf/arrays/UDFArrayCountEquals.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2013 Dataiku
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the
5 | * "License"); you may not use this file except in compliance
6 | * with the License. You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package com.dataiku.hive.udf.arrays;
17 | import java.util.List;
18 |
19 | import org.apache.hadoop.hive.ql.exec.Description;
20 | import org.apache.hadoop.hive.ql.exec.UDF;
21 |
22 | @Description(name="array_count_equals", value="_FUNC_(array, type needle) - Counts the number of times the needle appears in the array")
23 | public class UDFArrayCountEquals extends UDF {
24 | public int evaluate(List a, String needle) {
25 | if (a == null) return 0;
26 | if (needle == null) return a.size();
27 |
28 | int ret = 0;
29 | for (int i = 0; i < a.size(); i++) {
30 | if (needle.equals(a.get(i))) ret++;
31 | }
32 | return ret;
33 | }
34 |
35 | public int evaluate(List a, int needle) {
36 | if (a == null) return 0;
37 |
38 | int ret = 0;
39 | for (int i = 0; i < a.size(); i++) {
40 | if (needle == a.get(i)) ret++;
41 | }
42 | return ret;
43 | }
44 |
45 | public double evaluate(List a, double needle) {
46 | if (a == null) return 0;
47 |
48 | int ret = 0;
49 | for (int i = 0; i < a.size(); i++) {
50 | if (needle == a.get(i)) ret++;
51 | }
52 | return ret;
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
/ivy.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
--------------------------------------------------------------------------------
/src/com/dataiku/hive/storage/XMLSerde.java:
--------------------------------------------------------------------------------
1 | package com.dataiku.hive.storage;
2 |
3 | import org.apache.commons.logging.Log;
4 | import org.apache.commons.logging.LogFactory;
5 | import org.apache.hadoop.conf.Configuration;
6 | import org.apache.hadoop.hive.serde2.SerDe;
7 | import org.apache.hadoop.hive.serde2.SerDeException;
8 | import org.apache.hadoop.hive.serde2.SerDeStats;
9 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
10 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
11 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
12 | import org.apache.hadoop.io.Text;
13 | import org.apache.hadoop.io.Writable;
14 |
15 | import java.util.ArrayList;
16 | import java.util.List;
17 | import java.util.Properties;
18 |
19 | /**
20 | *
21 | * Serde for
22 | */
23 | public class XMLSerde implements SerDe {
24 |
25 |
26 | ObjectInspector oi;
27 | List row;
28 | public static final Log LOG = LogFactory.getLog(XMLSerde.class.getName());
29 |
30 | @Override
31 | public void initialize(Configuration entries, Properties properties) throws SerDeException {
32 | List columnNames = new ArrayList();
33 | columnNames.add("text");
34 |
35 | ArrayList columnOIs = new ArrayList(columnNames.size());
36 |
37 | columnOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
38 |
39 | oi = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, columnOIs);
40 | row = new ArrayList();
41 | row.add(null);
42 | }
43 |
44 | @Override
45 | public Object deserialize(Writable blob) throws SerDeException {
46 | Text rowText = (Text) blob;
47 | row.set(0, rowText.toString());
48 | return row;
49 | }
50 |
51 | @Override
52 | public ObjectInspector getObjectInspector() throws SerDeException {
53 | return oi;
54 | }
55 |
56 | @Override
57 | public SerDeStats getSerDeStats() {
58 | return null;
59 | }
60 |
61 | @Override
62 | public Class extends Writable> getSerializedClass() {
63 | return Text.class;
64 | }
65 |
66 | @Override
67 | public Writable serialize(Object o, ObjectInspector objectInspector) throws SerDeException {
68 | throw new SerDeException("Not implemented");
69 |
70 | }
71 | }
72 |
--------------------------------------------------------------------------------
/src/com/dataiku/hive/udf/arrays/UDFArrayJoin.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2013 Dataiku
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the
5 | * "License"); you may not use this file except in compliance
6 | * with the License. You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package com.dataiku.hive.udf.arrays;
17 |
18 | import java.util.ArrayList;
19 | import java.util.List;
20 |
21 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
22 | import org.apache.hadoop.hive.ql.metadata.HiveException;
23 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
24 | import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
25 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
26 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
27 |
28 | /**
29 | * Joins an array of arrays into a single array containing all elements.
30 | * No deduplication is performed
31 | */
32 | public class UDFArrayJoin extends GenericUDF {
33 | ListObjectInspector arrayInspector;
34 | ListObjectInspector elementsInspector;
35 |
36 | List