├── src
│   ├── main
│   │   └── java
│   │       └── com
│   │           └── github
│   │               └── aaronshan
│   │                   └── functions
│   │                       ├── fastuitl
│   │                       │   └── ints
│   │                       │       ├── IntComparator.java
│   │                       │       ├── AbstractIntComparator.java
│   │                       │       └── IntArrays.java
│   │                       ├── utils
│   │                       │   ├── Failures.java
│   │                       │   ├── MathUtils.java
│   │                       │   ├── json
│   │                       │   │   ├── JsonPath.java
│   │                       │   │   ├── JsonUtils.java
│   │                       │   │   └── JsonPathTokenizer.java
│   │                       │   ├── MapUtils.java
│   │                       │   ├── ArrayUtils.java
│   │                       │   └── ConfigUtils.java
│   │                       ├── model
│   │                       │   └── ChinaIdArea.java
│   │                       ├── geo
│   │                       │   ├── UDFGeoWgsToGcj.java
│   │                       │   ├── UDFGeoGcjToBd.java
│   │                       │   ├── UDFGeoGcjToWgs.java
│   │                       │   ├── UDFGeoGcjExtractWgs.java
│   │                       │   ├── UDFGeoBdToGcj.java
│   │                       │   └── UDFGeoWgsDistance.java
│   │                       ├── math
│   │                       │   ├── UDFMathInfinity.java
│   │                       │   ├── UDFMathIsNaN.java
│   │                       │   ├── UDFMathNaN.java
│   │                       │   ├── UDFMathIsFinite.java
│   │                       │   ├── UDFMathToBase.java
│   │                       │   ├── UDFMathNormalCdf.java
│   │                       │   ├── UDFMathInverseNormalCdf.java
│   │                       │   ├── UDFMathFromBase.java
│   │                       │   └── UDFMathIsInfinite.java
│   │                       ├── bitwise
│   │                       │   ├── UDFBitwiseNot.java
│   │                       │   ├── UDFBitwiseOr.java
│   │                       │   ├── UDFBitwiseAnd.java
│   │                       │   ├── UDFBitwiseXor.java
│   │                       │   └── UDFBitCount.java
│   │                       ├── regexp
│   │                       │   ├── re2j
│   │                       │   │   ├── Machine.java
│   │                       │   │   ├── DFAStateKey.java
│   │                       │   │   ├── SparseSet.java
│   │                       │   │   ├── PatternSyntaxException.java
│   │                       │   │   ├── DFAState.java
│   │                       │   │   ├── MachineInput.java
│   │                       │   │   ├── Inst.java
│   │                       │   │   └── SliceUtils.java
│   │                       │   ├── UDFRe2JRegexpLike.java
│   │                       │   ├── UDFRe2JRegexpExtract.java
│   │                       │   ├── UDFRe2JRegexpReplace.java
│   │                       │   ├── UDFRe2JRegexpSplit.java
│   │                       │   └── UDFRe2JRegexpExtractAll.java
│   │                       ├── card
│   │                       │   ├── UDFChinaIdCardArea.java
│   │                       │   ├── UDFChinaIdCardCity.java
│   │                       │   ├── UDFChinaIdCardGender.java
│   │                       │   ├── UDFChinaIdCardBirthday.java
│   │                       │   ├── UDFChinaIdCardInfo.java
│   │                       │   ├── UDFChinaIdCardProvince.java
│   │                       │   └── UDFChinaIdCardValid.java
│   │                       ├── string
│   │                       │   ├── UDFMd5.java
│   │                       │   ├── UDFSha256.java
│   │                       │   ├── UDFStringPosition.java
│   │                       │   ├── UDFStringNormalize.java
│   │                       │   ├── UDFCodePoint.java
│   │                       │   ├── UDFChineseToPinYin.java
│   │                       │   ├── UDFStringHammingDistance.java
│   │                       │   ├── UDFStringSplitToMap.java
│   │                       │   ├── UDFStringLevenshteinDistance.java
│   │                       │   └── UDFStringSplitToMultimap.java
│   │                       ├── url
│   │                       │   ├── UDFUrlDecode.java
│   │                       │   └── UDFUrlEncode.java
│   │                       ├── json
│   │                       │   ├── UDFJsonArrayGet.java
│   │                       │   ├── UDFJsonArrayLength.java
│   │                       │   ├── UDFJsonExtract.java
│   │                       │   ├── UDFJsonExtractScalar.java
│   │                       │   ├── UDFJsonSize.java
│   │                       │   ├── UDFJsonArrayExtract.java
│   │                       │   └── UDFJsonArrayExtractScalar.java
│   │                       ├── date
│   │                       │   ├── UDFDayOfYear.java
│   │                       │   ├── UDFDayOfWeek.java
│   │                       │   ├── UDFZodiacSignEn.java
│   │                       │   ├── UDFZodiacSignCn.java
│   │                       │   └── UDFTypeOfDay.java
│   │                       ├── map
│   │                       │   ├── UDFMapElementAt.java
│   │                       │   ├── UDFMapBuild.java
│   │                       │   └── UDFMapConcat.java
│   │                       └── array
│   │                           ├── UDFArrayReverse.java
│   │                           ├── UDFArrayMax.java
│   │                           ├── UDFArrayMin.java
│   │                           ├── UDFSequence.java
│   │                           ├── UDFArrayEquals.java
│   │                           ├── UDFArraySort.java
│   │                           ├── UDFArrayElementAt.java
│   │                           ├── UDFArrayShuffle.java
│   │                           └── UDFArrayDistinct.java
│   └── test
│       └── java
│           └── com
│               └── github
│                   └── aaronshan
│                       └── functions
│                           ├── geo
│                           │   └── UDFGeoBdToGcjTest.java
│                           ├── url
│                           │   ├── UDFUrlDecodeTest.java
│                           │   └── UDFUrlEncodeTest.java
│                           ├── date
│                           │   └── UDFDayOfYearTest.java
│                           ├── bitwise
│                           │   └── UDFBitCountTest.java
│                           ├── string
│                           │   ├── UDFStringSplitToMapTest.java
│                           │   └── UDFStringSplitToMultimapTest.java
│                           ├── array
│                           │   ├── UDFArrayShuffleTest.java
│                           │   ├── UDFArrayValueCountTest.java
│                           │   ├── UDFArrayContainsTest.java
│                           │   └── UDFArrayIntersectTest.java
│                           ├── regexp
│                           │   └── UDFRe2JRegexpExtractAllTest.java
│                           ├── map
│                           │   ├── UDFMapElementAtTest.java
│                           │   ├── UDFMapBuildTest.java
│                           │   ├── UDFMapConcatTest.java
│                           │   └── UDFMapEqualsTest.java
│                           └── math
│                               └── UDFMathCosineSimilarityTest.java
├── .gitignore
├── .travis.yml
└── README-geo.md

/src/main/java/com/github/aaronshan/functions/fastuitl/ints/IntComparator.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.fastuitl.ints;

import java.util.Comparator;

// Note: this code was forked from fastutil (http://fastutil.di.unimi.it/)
// Copyright (C) 2010-2013 Sebastiano Vigna
public interface IntComparator extends Comparator<Integer> {
    int compare(int var1, int var2);
}

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# kdiff3 ignore
*.orig

# maven ignore
target/

# eclipse ignore
.settings/
.project
.classpath

# idea ignore
.idea/
*.ipr
*.iml
*.iws

# temp ignore
*.log
*.cache
*.diff
*.patch
*.tmp

# system ignore
.DS_Store
Thumbs.db

# package ignore (optional)
# *.jar
# *.war
# *.zip
# *.tar
# *.tar.gz

--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/fastuitl/ints/AbstractIntComparator.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.fastuitl.ints;

// Note: this code was forked from fastutil (http://fastutil.di.unimi.it/)
// Copyright (C) 2010-2013 Sebastiano Vigna
public abstract class AbstractIntComparator implements IntComparator {
    protected AbstractIntComparator() {
    }

    public int compare(Integer ok1, Integer ok2) {
        return this.compare(ok1.intValue(), ok2.intValue());
    }

    public abstract int compare(int var1, int var2);
}

--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/utils/Failures.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.utils;

import org.apache.hadoop.hive.ql.metadata.HiveException;

import static java.lang.String.format;

public class Failures {
    private Failures() {}

    public static void checkCondition(boolean condition, String formatString, Object... args) throws HiveException {
        if (!condition) {
            throw new HiveException(format(formatString, args));
        }
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/utils/MathUtils.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.utils;

import org.apache.hadoop.hive.ql.metadata.HiveException;

import static com.github.aaronshan.functions.utils.Failures.checkCondition;
import static java.lang.Character.MAX_RADIX;
import static java.lang.Character.MIN_RADIX;

public class MathUtils {
    public static void checkRadix(long radix) throws HiveException {
        checkCondition(radix >= MIN_RADIX && radix <= MAX_RADIX, "Radix must be between %d and %d", MIN_RADIX, MAX_RADIX);
    }
}

--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: java

dist: trusty

jdk:
  - oraclejdk8
  - openjdk7

before_cache:
  # Make the cache stable between builds by removing build output
  - rm -rf $HOME/.m2/repository/cc/shanruifeng

cache:
  directories:
    - $HOME/.m2/repository

install:
  - wget http://www.datanucleus.org/downloads/maven2/javax/jdo/jdo2-api/2.3-ec/jdo2-api-2.3-ec.jar -O $HOME/jdo2-api-2.3-ec.jar
  - mvn install:install-file -DgroupId=javax.jdo -DartifactId=jdo2-api -Dversion=2.3-ec -Dpackaging=jar -Dfile=$HOME/jdo2-api-2.3-ec.jar

script:
  - jdk_switcher use oraclejdk8
  - mvn clean package

--------------------------------------------------------------------------------
/README-geo.md:
--------------------------------------------------------------------------------
## Coordinate systems on today's online maps
### Earth coordinates (WGS-84)
- The international standard, and the coordinate system of raw data read from GPS devices
- Used by international map providers

### Mars coordinates (GCJ-02), also called the National Survey Bureau system
- The Chinese standard; location data obtained from mobile devices sold in mainland China uses this system
- By regulation, every map published in China (including electronic maps) must obfuscate geographic positions with at least GCJ-02.

### Baidu coordinates (BD-09)
- Baidu's own standard, used by the Baidu SDK, Baidu Maps, and Baidu Geocoding
- (Things were already messy, and Baidu adds a second round of obfuscation on top of GCJ-02)

## Things to watch during development
- Getting latitude/longitude (GPS) coordinates from a device
  * With the Baidu SDK you can get Baidu (BD-09) or Mars (GCJ-02) coordinates; the default is BD-09
  * With the native iOS location library, the coordinates are WGS-84
  * With the AMap (Gaode) SDK, the coordinates are GCJ-02
- Coordinate systems used by online maps
  * GCJ-02 (Mars):
    + iOS Maps (backed by AMap)
    + Google Maps in mainland China
    + Soso, Aliyun, and AMap maps
  * BD-09:
    + Baidu Maps, naturally
  * WGS-84:
    + The international standard; maps outside China, such as Google Maps abroad and OSM, generally use it

--------------------------------------------------------------------------------
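The conversions the README above describes are implemented in GeoUtils, which is outside this excerpt. For orientation, here is a minimal sketch of the widely published GCJ-02 → BD-09 transform; the class and method names are hypothetical and this is an assumption about what GeoUtils.GCJ02ToBD09 computes, not the repo's actual code:

```java
// Hypothetical sketch of the published GCJ-02 -> BD-09 transform; GeoUtils
// itself is not shown in this excerpt, so treat this as an illustration only.
public final class GcjToBdSketch {
    private static final double X_PI = Math.PI * 3000.0 / 180.0;

    /** Returns {bdLat, bdLng} for a GCJ-02 coordinate. */
    public static double[] gcj02ToBd09(double gcjLat, double gcjLng) {
        double z = Math.sqrt(gcjLng * gcjLng + gcjLat * gcjLat) + 0.00002 * Math.sin(gcjLat * X_PI);
        double theta = Math.atan2(gcjLat, gcjLng) + 0.000003 * Math.cos(gcjLng * X_PI);
        return new double[] {z * Math.sin(theta) + 0.006, z * Math.cos(theta) + 0.0065};
    }
}
```

Feeding GCJ-02 (39.9087, 116.3976) through this transform lands near BD-09 (39.915, 116.404), which is consistent with the round trip that UDFGeoBdToGcjTest later in this dump expects.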
/src/main/java/com/github/aaronshan/functions/model/ChinaIdArea.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.model;

/**
 * @author ruifeng.shan
 * date: 2016-07-07
 * time: 18:20
 */
public class ChinaIdArea {
    private String province;
    private String city;
    private String area;

    public ChinaIdArea(String province, String city, String area) {
        this.province = province;
        this.city = city;
        this.area = area;
    }

    public String getProvince() {
        return province;
    }

    public String getCity() {
        return city;
    }

    public String getArea() {
        return area;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/geo/UDFGeoWgsToGcj.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.geo;

import com.github.aaronshan.functions.utils.GeoUtils;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

/**
 * @author ruifeng.shan
 * date: 2016-07-27
 * time: 17:03
 */
@Description(name = "wgs_to_gcj"
        , value = "_FUNC_(wgsLat, wgsLng) - Convert WGS-84 to GCJ-02."
        , extended = "Example:\n > select _FUNC_(wgsLat, wgsLng) from src;")
public class UDFGeoWgsToGcj extends UDF {
    private Text result = new Text();

    public Text evaluate(double wgsLat, double wgsLng) {
        result.set(GeoUtils.WGS84ToGCJ02(wgsLat, wgsLng));
        return result;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/math/UDFMathInfinity.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.math;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.DoubleWritable;

/**
 * @author ruifeng.shan
 * date: 18-7-23
 */
@Description(name = "infinity"
        , value = "_FUNC_() - Infinity."
        , extended = "Example:\n > select _FUNC_() from src;")
public class UDFMathInfinity extends UDF {
    private DoubleWritable result = new DoubleWritable();

    public UDFMathInfinity() {
    }

    public DoubleWritable evaluate() {
        result.set(Double.POSITIVE_INFINITY);
        return result;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/bitwise/UDFBitwiseNot.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.bitwise;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.LongWritable;

/**
 * @author ruifeng.shan
 * date: 2016-07-27
 * time: 15:50
 */
@Description(name = "bitwise_not"
        , value = "_FUNC_(x) - returns the bitwise NOT of x in 2's complement arithmetic."
        , extended = "Example:\n > select _FUNC_(9) from src;")
public class UDFBitwiseNot extends UDF {
    private LongWritable result = new LongWritable();

    public LongWritable evaluate(long num) {
        result.set(~num);
        return result;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/geo/UDFGeoGcjToBd.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.geo;

import com.github.aaronshan.functions.utils.GeoUtils;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

/**
 * @author ruifeng.shan
 * date: 2016-07-27
 * time: 17:02
 */
@Description(name = "gcj_to_bd"
        , value = "_FUNC_(gcjLat, gcjLng) - Convert GCJ-02 to BD-09."
        , extended = "Example:\n > select _FUNC_(gcjLat, gcjLng) from src;")
public class UDFGeoGcjToBd extends UDF {
    private Text result = new Text();

    public Text evaluate(double gcjLat, double gcjLng) {
        result.set(GeoUtils.GCJ02ToBD09(gcjLat, gcjLng));
        return result;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/geo/UDFGeoGcjToWgs.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.geo;

import com.github.aaronshan.functions.utils.GeoUtils;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

/**
 * @author ruifeng.shan
 * date: 2016-07-27
 * time: 17:04
 */
@Description(name = "gcj_to_wgs"
        , value = "_FUNC_(gcjLat, gcjLng) - Convert GCJ-02 to WGS-84."
        , extended = "Example:\n > select _FUNC_(gcjLat, gcjLng) from src;")
public class UDFGeoGcjToWgs extends UDF {
    private Text result = new Text();

    public Text evaluate(double gcjLat, double gcjLng) {
        result.set(GeoUtils.GCJ02ToWGS84(gcjLat, gcjLng));
        return result;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/bitwise/UDFBitwiseOr.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.bitwise;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.LongWritable;

/**
 * @author ruifeng.shan
 * date: 2016-07-27
 * time: 15:50
 */
@Description(name = "bitwise_or"
        , value = "_FUNC_(x, y) - returns the bitwise OR of x and y in 2's complement arithmetic."
        , extended = "Example:\n > select _FUNC_(x, y) from src;")
public class UDFBitwiseOr extends UDF {
    private LongWritable result = new LongWritable();

    public LongWritable evaluate(long left, long right) {
        result.set(left | right);
        return result;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/geo/UDFGeoGcjExtractWgs.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.geo;

import com.github.aaronshan.functions.utils.GeoUtils;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

/**
 * @author ruifeng.shan
 * date: 2016-07-27
 * time: 17:05
 */
@Description(name = "gcj_extract_wgs"
        , value = "_FUNC_(gcjLat, gcjLng) - Extract WGS-84 from GCJ-02 (exact inverse of wgs_to_gcj)."
        , extended = "Example:\n > select _FUNC_(gcjLat, gcjLng) from src;")
public class UDFGeoGcjExtractWgs extends UDF {
    private Text result = new Text();

    public Text evaluate(double gcjLat, double gcjLng) {
        result.set(GeoUtils.GCJ02ExtractWGS84(gcjLat, gcjLng));
        return result;
    }
}

--------------------------------------------------------------------------------
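gcj_to_wgs is usually the cheap approximate inverse (subtract the estimated offset once), while an "extract" variant like gcj_extract_wgs is commonly an exact inverse obtained by iterating the forward transform. GeoUtils.GCJ02ExtractWGS84 is not shown in this excerpt, so the sketch below is an assumption about the technique, parameterized over any forward WGS→GCJ transform:

```java
// Hypothetical sketch of exact GCJ-02 -> WGS-84 extraction by fixed-point
// iteration of the forward transform; not the repo's actual GeoUtils code.
public final class GcjExtractSketch {
    /** Any forward WGS-84 -> GCJ-02 transform, returning {gcjLat, gcjLng}. */
    public interface ForwardTransform {
        double[] wgsToGcj(double wgsLat, double wgsLng);
    }

    /** Returns {wgsLat, wgsLng} such that forward(wgs) reproduces the GCJ input. */
    public static double[] extractWgs(double gcjLat, double gcjLng, ForwardTransform forward) {
        double wgsLat = gcjLat;
        double wgsLng = gcjLng;
        for (int i = 0; i < 30; i++) {      // the residual shrinks quickly per round
            double[] gcj = forward.wgsToGcj(wgsLat, wgsLng);
            wgsLat -= gcj[0] - gcjLat;      // move the guess by the remaining offset
            wgsLng -= gcj[1] - gcjLng;
        }
        return new double[] {wgsLat, wgsLng};
    }
}
```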
/src/main/java/com/github/aaronshan/functions/bitwise/UDFBitwiseAnd.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.bitwise;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.LongWritable;

/**
 * @author ruifeng.shan
 * date: 2016-07-27
 * time: 15:49
 */
@Description(name = "bitwise_and"
        , value = "_FUNC_(x, y) - returns the bitwise AND of x and y in 2's complement arithmetic."
        , extended = "Example:\n > select _FUNC_(x, y) from src;")
public class UDFBitwiseAnd extends UDF {
    private LongWritable result = new LongWritable();

    public LongWritable evaluate(long left, long right) {
        result.set(left & right);
        return result;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/bitwise/UDFBitwiseXor.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.bitwise;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.LongWritable;

/**
 * @author ruifeng.shan
 * date: 2016-07-27
 * time: 15:50
 */
@Description(name = "bitwise_xor"
        , value = "_FUNC_(x, y) - returns the bitwise XOR of x and y in 2's complement arithmetic."
        , extended = "Example:\n > select _FUNC_(x, y) from src;")
public class UDFBitwiseXor extends UDF {
    private LongWritable result = new LongWritable();

    public LongWritable evaluate(long left, long right) {
        result.set(left ^ right);
        return result;
    }
}

--------------------------------------------------------------------------------
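The bitwise UDFs are thin wrappers over Java's long operators. A hypothetical JUnit sketch (not a file in this repo, but modeled on UDFBitCountTest below) makes the two's-complement behavior concrete:

```java
// Hypothetical test sketch showing the two's-complement semantics of the
// four bitwise UDFs above; class name is invented for illustration.
package com.github.aaronshan.functions.bitwise;

import org.junit.Assert;
import org.junit.Test;

public class BitwiseUdfSketchTest {
    @Test
    public void testBitwiseOps() {
        Assert.assertEquals(8L, new UDFBitwiseAnd().evaluate(12, 10).get());  // 1100 & 1010 = 1000
        Assert.assertEquals(14L, new UDFBitwiseOr().evaluate(12, 10).get());  // 1100 | 1010 = 1110
        Assert.assertEquals(6L, new UDFBitwiseXor().evaluate(12, 10).get());  // 1100 ^ 1010 = 0110
        Assert.assertEquals(-10L, new UDFBitwiseNot().evaluate(9).get());     // ~x == -(x + 1)
    }
}
```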
/src/test/java/com/github/aaronshan/functions/geo/UDFGeoBdToGcjTest.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.geo;

import org.apache.hadoop.io.Text;
import org.junit.Assert;
import org.junit.Test;

/**
 * @author ruifeng.shan
 * @date 2016-07-28
 * @time 17:39
 */
public class UDFGeoBdToGcjTest {
    protected void runTest(Text expect, double lat, double lng, UDFGeoBdToGcj udf) {
        Text res = udf.evaluate(lat, lng);
        Assert.assertNotNull(res);
        Assert.assertEquals("bd_to_gcj test", expect.toString(), res.toString());
    }

    @Test
    public void testBdToGcj() throws Exception {
        runTest(new Text("{\"lng\":116.39762729119315,\"lat\":39.90865673957631}"), 39.915, 116.404, new UDFGeoBdToGcj());
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/geo/UDFGeoBdToGcj.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.geo;

import com.github.aaronshan.functions.utils.GeoUtils;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

/**
 * @author ruifeng.shan
 * date: 2016-07-27
 * time: 17:02
 */
@Description(name = "bd_to_gcj"
        , value = "_FUNC_(bdLat, bdLng) - Convert BD-09 to GCJ-02."
        , extended = "Example:\n > select _FUNC_(bdLat, bdLng) from src;")
public class UDFGeoBdToGcj extends UDF {
    private Text result = new Text();

    public Text evaluate(double bdLat, double bdLng) {
        result.set(GeoUtils.BD09ToGCJ02(bdLat, bdLng));
        return result;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/geo/UDFGeoWgsDistance.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.geo;

import com.github.aaronshan.functions.utils.GeoUtils;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;

/**
 * @author ruifeng.shan
 * date: 2016-07-27
 * time: 17:01
 */
@Description(name = "wgs_distance"
        , value = "_FUNC_(lat1, lng1, lat2, lng2) - return WGS84 distance."
        , extended = "Example:\n > select _FUNC_(lat1, lng1, lat2, lng2) from src;")
public class UDFGeoWgsDistance extends UDF {
    private DoubleWritable result = new DoubleWritable();

    public DoubleWritable evaluate(double lat1, double lng1, double lat2, double lng2) {
        result.set(GeoUtils.WGS84Distance(lat1, lng1, lat2, lng2));
        return result;
    }
}

--------------------------------------------------------------------------------
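GeoUtils.WGS84Distance is not part of this excerpt; a typical implementation is the haversine great-circle distance. The sketch below is an assumption about the approach, not the repo's actual code:

```java
// Hypothetical sketch of a haversine distance between two WGS-84 points,
// in meters; class and constant names are invented for illustration.
public final class WgsDistanceSketch {
    private static final double EARTH_RADIUS_METERS = 6371008.8; // mean Earth radius

    public static double haversineMeters(double lat1, double lng1, double lat2, double lng2) {
        double dLat = Math.toRadians(lat2 - lat1);
        double dLng = Math.toRadians(lng2 - lng1);
        double a = Math.sin(dLat / 2) * Math.sin(dLat / 2)
                + Math.cos(Math.toRadians(lat1)) * Math.cos(Math.toRadians(lat2))
                * Math.sin(dLng / 2) * Math.sin(dLng / 2);
        return 2 * EARTH_RADIUS_METERS * Math.asin(Math.sqrt(a));
    }
}
```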
/src/test/java/com/github/aaronshan/functions/url/UDFUrlDecodeTest.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.url;

import org.apache.hadoop.io.Text;
import org.junit.Assert;
import org.junit.Test;

/**
 * @author ruifeng.shan
 * @date 2016-07-28
 * @time 17:57
 */
public class UDFUrlDecodeTest {
    protected void runTest(String value, Text exp, UDFUrlDecode udf) {
        Text res = udf.evaluate(value);
        if (exp == null) {
            Assert.assertNull(res);
        } else {
            Assert.assertNotNull(res);
            Assert.assertEquals("url_decode test", exp.toString(), res.toString());
        }
    }

    @Test
    public void testUrlDecode() throws Exception {
        runTest("http%3A%2F%2Fshanruifeng.cc%2F", new Text("http://shanruifeng.cc/"), new UDFUrlDecode());
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/regexp/re2j/Machine.java:
--------------------------------------------------------------------------------
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Original Go source here:
// http://code.google.com/p/go/source/browse/src/pkg/regexp/exec.go

package com.github.aaronshan.functions.regexp.re2j;

import com.github.aaronshan.functions.regexp.re2j.RE2.Anchor;

/**
 * A Machine matches an input string of Unicode characters against an RE2 instance.
 */
interface Machine {

    /**
     * Runs the machine over the input |in| starting at |pos| with the RE2 Anchor |anchor|.
     * |submatches| contains group positions after a successful match.
     *
     * @return reports whether a match was found.
     */
    boolean match(MachineInput in, int pos, Anchor anchor, int[] submatches);
}

--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardArea.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.card;

import com.github.aaronshan.functions.utils.CardUtils;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

/**
 * @author ruifeng.shan
 * date: 2016-07-25
 * time: 20:12
 */
@Description(name = "id_card_area"
        , value = "_FUNC_(string) - get area by given china id card."
        , extended = "Example:\n > select _FUNC_(string) from src;")
public class UDFChinaIdCardArea extends UDF {
    private Text result = new Text();

    public UDFChinaIdCardArea() {
    }

    public Text evaluate(Text idCard) {
        if (idCard == null) {
            return null;
        }
        result.set(CardUtils.getIdCardArea(idCard.toString()));
        return result;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardCity.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.card;

import com.github.aaronshan.functions.utils.CardUtils;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

/**
 * @author ruifeng.shan
 * date: 2016-07-25
 * time: 20:11
 */
@Description(name = "id_card_city"
        , value = "_FUNC_(string) - get city by given china id card."
        , extended = "Example:\n > select _FUNC_(string) from src;")
public class UDFChinaIdCardCity extends UDF {
    private Text result = new Text();

    public UDFChinaIdCardCity() {
    }

    public Text evaluate(Text idCard) {
        if (idCard == null) {
            return null;
        }
        result.set(CardUtils.getIdCardCity(idCard.toString()));
        return result;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardGender.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.card;

import com.github.aaronshan.functions.utils.CardUtils;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

/**
 * @author ruifeng.shan
 * date: 2016-07-25
 * time: 20:14
 */
@Description(name = "id_card_gender"
        , value = "_FUNC_(string) - get gender by given china id card."
        , extended = "Example:\n > select _FUNC_(string) from src;")
public class UDFChinaIdCardGender extends UDF {
    private Text result = new Text();

    public UDFChinaIdCardGender() {
    }

    public Text evaluate(Text idCard) {
        if (idCard == null) {
            return null;
        }
        result.set(CardUtils.getIdCardGender(idCard.toString()));
        return result;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardBirthday.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.card;

import com.github.aaronshan.functions.utils.CardUtils;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

/**
 * @author ruifeng.shan
 * date: 2016-07-25
 * time: 20:14
 */
@Description(name = "id_card_birthday"
        , value = "_FUNC_(string) - get birthday by given china id card."
        , extended = "Example:\n > select _FUNC_(string) from src;")
public class UDFChinaIdCardBirthday extends UDF {
    private Text result = new Text();

    public UDFChinaIdCardBirthday() {
    }

    public Text evaluate(Text idCard) {
        if (idCard == null) {
            return null;
        }
        result.set(CardUtils.getIdCardBirthday(idCard.toString()));
        return result;
    }
}

--------------------------------------------------------------------------------
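These card UDFs all delegate to CardUtils, which is outside this excerpt. The fields they read follow the 18-digit layout of GB 11643-1999: positions 1-6 are the region code, 7-14 the birth date (yyyyMMdd), 15-17 a sequence number (an odd 17th digit means male), and 18 a check digit. A hypothetical sketch of the birthday and gender extraction under that layout (names invented; not the repo's CardUtils):

```java
// Hypothetical helpers illustrating the GB 11643-1999 field layout; the
// repo's real logic lives in CardUtils, which is not shown here.
public final class IdCardLayoutSketch {
    /** Extracts yyyy-MM-dd from an 18-digit ID number. */
    public static String birthdayOf(String idCard) {
        String d = idCard.substring(6, 14);                       // yyyyMMdd
        return d.substring(0, 4) + "-" + d.substring(4, 6) + "-" + d.substring(6, 8);
    }

    /** Gender comes from the parity of the 17th digit. */
    public static String genderOf(String idCard) {
        return ((idCard.charAt(16) - '0') % 2 == 1) ? "male" : "female";
    }
}
```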
/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardInfo.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.card;

import com.github.aaronshan.functions.utils.CardUtils;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

/**
 * Created by ruifengshan on 16/3/22.
 */

// ID card number -> JSON description
@Description(name = "id_card_info"
        , value = "_FUNC_(string) - get all info by given china id card, output is json string."
        , extended = "Example:\n > select _FUNC_(string) from src;")
public class UDFChinaIdCardInfo extends UDF {
    private Text result = new Text();

    public UDFChinaIdCardInfo() {
    }

    public Text evaluate(Text idCard) {
        if (idCard == null) {
            return null;
        }
        result.set(CardUtils.getJsonOfChinaIdCard(idCard.toString()));
        return result;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardProvince.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.card;

import com.github.aaronshan.functions.utils.CardUtils;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

/**
 * @author ruifeng.shan
 * date: 2016-07-25
 * time: 19:42
 */
@Description(name = "id_card_province"
        , value = "_FUNC_(string) - get province by given china id card."
        , extended = "Example:\n > select _FUNC_(string) from src;")
public class UDFChinaIdCardProvince extends UDF {
    private Text result = new Text();

    public UDFChinaIdCardProvince() {
    }

    public Text evaluate(Text idCard) {
        if (idCard == null) {
            return null;
        }
        result.set(CardUtils.getIdCardProvince(idCard.toString()));
        return result;
    }
}

--------------------------------------------------------------------------------
/src/test/java/com/github/aaronshan/functions/url/UDFUrlEncodeTest.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.url;

import org.apache.hadoop.io.Text;
import org.junit.Assert;
import org.junit.Test;

/**
 * @author ruifeng.shan
 * @date 2016-07-28
 * @time 17:51
 */
public class UDFUrlEncodeTest {
    protected void runTest(String value, Text exp, UDFUrlEncode udf) {
        Text res = udf.evaluate(value);
        if (exp == null) {
            Assert.assertNull(res);
        } else {
            Assert.assertNotNull(res);
            Assert.assertEquals("url_encode test", exp.toString(), res.toString());
        }
    }

    @Test
    public void testUrlEncode() throws Exception {
        runTest("http://shanruifeng.cc/", new Text("http%3A%2F%2Fshanruifeng.cc%2F"), new UDFUrlEncode());
    }
}

--------------------------------------------------------------------------------
/src/test/java/com/github/aaronshan/functions/date/UDFDayOfYearTest.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.date;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.junit.Assert;
import org.junit.Test;

/**
 * @author ruifeng.shan
 * @date 2016-07-27
 * @time 20:20
 */
public class UDFDayOfYearTest {
    @Test
    public void testDayOfYear() throws Exception {
        Text dateString = new Text("2016-01-01");
        UDFDayOfYear udf = new UDFDayOfYear();

        runTest(dateString, new IntWritable(1), udf);
    }

    protected void runTest(Text dateString, IntWritable exp, UDFDayOfYear udf) {
        IntWritable res = udf.evaluate(dateString);
        if (exp == null) {
            Assert.assertNull(res);
        } else {
            Assert.assertNotNull(res);
            Assert.assertEquals("day_of_year test", exp.get(), res.get());
        }
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/string/UDFMd5.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.string;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

/**
 * Created by ruifengshan on 16/3/18.
 */
@Description(name = "md5"
        , value = "_FUNC_(string) - get md5 hash code by given input string."
        , extended = "Example:\n > select _FUNC_(string) from src;")
public class UDFMd5 extends UDF {
    private Text result = new Text();

    public UDFMd5() {
    }

    /**
     * md5 hash.
     *
     * @param text input string
     * @return md5 hash.
     */
    public Text evaluate(Text text) {
        if (text == null) {
            return null;
        }

        result.set(DigestUtils.md5Hex(text.toString()));
        return result;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/math/UDFMathIsNaN.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.math;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.DoubleWritable;

/**
 * @author ruifeng.shan
 * date: 18-7-23
 */
@Description(name = "is_nan"
        , value = "_FUNC_(double) - test if value is nan."
        , extended = "Example:\n > select _FUNC_(double) from src;")
public class UDFMathIsNaN extends UDF {
    BooleanWritable result = new BooleanWritable();

    public UDFMathIsNaN() {
    }

    public BooleanWritable evaluate(DoubleWritable num) {
        if (num == null) {
            result.set(false);
        } else {
            result.set(isNaN(num.get()));
        }
        return result;
    }

    private boolean isNaN(double v) {
        return (v != v);
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/math/UDFMathNaN.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.math;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.DoubleWritable;

/**
 * @author ruifeng.shan
 * date: 18-7-23
 */
@Description(name = "NaN"
        , value = "_FUNC_() - constant representing not-a-number."
        , extended = "Example:\n > select _FUNC_() from src;")
public class UDFMathNaN extends UDF {
    /**
     * A constant holding a Not-a-Number (NaN) value of type
     * {@code double}. It is equivalent to the value returned by
     * {@code Double.longBitsToDouble(0x7ff8000000000000L)}.
     */
    public static final double NaN = 0.0d / 0.0;

    private DoubleWritable result = new DoubleWritable();

    public UDFMathNaN() {
    }

    public DoubleWritable evaluate() {
        result.set(NaN);
        return result;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/card/UDFChinaIdCardValid.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.card;

import com.github.aaronshan.functions.utils.CardUtils;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.Text;

/**
 * @author ruifeng.shan
 * date: 2016-07-25
 * time: 20:15
 */
@Description(name = "is_valid_id_card"
        , value = "_FUNC_(string) - whether given china id card is valid or not."
        , extended = "Example:\n > select _FUNC_(string) from src;")
public class UDFChinaIdCardValid extends UDF {
    private BooleanWritable result = new BooleanWritable();

    public UDFChinaIdCardValid() {
    }

    public BooleanWritable evaluate(Text idCard) {
        if (idCard == null) {
            return null;
        }
        result.set(CardUtils.isValidIdCard(idCard.toString()));
        return result;
    }
}

--------------------------------------------------------------------------------
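Validation of an 18-digit number normally hinges on the ISO 7064 MOD 11-2 check digit in position 18. CardUtils.isValidIdCard is not shown in this excerpt, so the sketch below is an assumption about the checksum step only (region and date checks would come on top):

```java
// Hypothetical sketch of the ISO 7064 MOD 11-2 check digit carried by
// 18-digit Chinese ID numbers; not the repo's actual CardUtils code.
public final class IdCardChecksumSketch {
    private static final int[] WEIGHTS = {7, 9, 10, 5, 8, 4, 2, 1, 6, 3, 7, 9, 10, 5, 8, 4, 2};
    private static final char[] CHECK = {'1', '0', 'X', '9', '8', '7', '6', '5', '4', '3', '2'};

    public static boolean hasValidCheckDigit(String idCard) {
        if (idCard == null || idCard.length() != 18) {
            return false;
        }
        int sum = 0;
        for (int i = 0; i < 17; i++) {
            char c = idCard.charAt(i);
            if (c < '0' || c > '9') {
                return false;                      // first 17 positions must be digits
            }
            sum += (c - '0') * WEIGHTS[i];
        }
        return Character.toUpperCase(idCard.charAt(17)) == CHECK[sum % 11];
    }
}
```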
/src/main/java/com/github/aaronshan/functions/string/UDFSha256.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.string;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

/**
 * @author ruifeng.shan
 * date: 2016-07-25
 * time: 14:29
 */
@Description(name = "sha256"
        , value = "_FUNC_(string) - get sha256 hash code by given input string."
        , extended = "Example:\n > select _FUNC_(string) from src;")
public class UDFSha256 extends UDF {
    private Text result = new Text();

    public UDFSha256() {
    }

    /**
     * sha256 hash.
     *
     * @param text input string
     * @return sha256 hash.
     */
    public Text evaluate(Text text) {
        if (text == null) {
            return null;
        }

        result.set(DigestUtils.sha256Hex(text.toString()));
        return result;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/url/UDFUrlDecode.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.url;

import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

/**
 * @author ruifeng.shan
 * date: 2016-07-27
 * time: 16:04
 */
@Description(name = "url_decode"
        , value = "_FUNC_(value) - Unescape the URL encoded value. This function is the inverse of url_encode()"
        , extended = "Example:\n > select _FUNC_(value) from src;")
public class UDFUrlDecode extends UDF {
    private Text result = new Text();

    public Text evaluate(String value) {
        if (value == null) {
            return null;
        }
        try {
            result.set(URLDecoder.decode(value, "UTF-8"));
            return result;
        } catch (UnsupportedEncodingException e) {
            throw new AssertionError(e); // UTF-8 is always supported
        }
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/json/UDFJsonArrayGet.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.json;

import com.github.aaronshan.functions.utils.json.JsonUtils;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

/**
 * @author ruifeng.shan
 * date: 2016-07-25
 * time: 15:29
 */
@Description(name = "json_array_get", value = "_FUNC_(json_array, index) - returns the element at the specified index into the json_array. The index is zero-based."
        , extended = "Example:\n"
        + " > SELECT _FUNC_(json_array, index) FROM src LIMIT 1;")
public class UDFJsonArrayGet extends UDF {
    private Text result = new Text();

    public UDFJsonArrayGet() {
    }

    public Text evaluate(Text json, long index) {
        try {
            result.set(JsonUtils.jsonArrayGet(json.toString(), index));
            return result;
        } catch (Exception e) {
            return null;
        }
    }
}

--------------------------------------------------------------------------------
" 14 | , extended = "Example:\n" 15 | + " > SELECT _FUNC_(json_array, json_path) FROM src LIMIT 1;") 16 | public class UDFJsonArrayGet extends UDF { 17 | private Text result = new Text(); 18 | 19 | public UDFJsonArrayGet() { 20 | } 21 | 22 | public Text evaluate(Text json, long index) { 23 | try { 24 | result.set(JsonUtils.jsonArrayGet(json.toString(), index)); 25 | return result; 26 | } catch (Exception e) { 27 | return null; 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/math/UDFMathIsFinite.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.math; 2 | 3 | import org.apache.hadoop.hive.ql.exec.Description; 4 | import org.apache.hadoop.hive.ql.exec.UDF; 5 | import org.apache.hadoop.io.BooleanWritable; 6 | import org.apache.hadoop.io.DoubleWritable; 7 | 8 | /** 9 | * @author ruifeng.shan 10 | * date: 18-7-23 11 | */ 12 | @Description(name = "is_finite" 13 | , value = "_FUNC_(double) - test if value is finite." 14 | , extended = "Example:\n > select _FUNC_(double) from src;") 15 | public class UDFMathIsFinite extends UDF { 16 | public static final double MAX_VALUE = 1.7976931348623157E308D; 17 | BooleanWritable result = new BooleanWritable(); 18 | 19 | public UDFMathIsFinite() { 20 | } 21 | 22 | public BooleanWritable evaluate(DoubleWritable num) { 23 | if (num == null) { 24 | result.set(false); 25 | } else { 26 | result.set(isFinite(num.get())); 27 | } 28 | return result; 29 | } 30 | 31 | private boolean isFinite(double d) { 32 | return Math.abs(d) <= MAX_VALUE; 33 | } 34 | } -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/json/UDFJsonArrayLength.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.json; 2 | 3 | import com.github.aaronshan.functions.utils.json.JsonUtils; 4 | import org.apache.hadoop.hive.ql.exec.Description; 5 | import org.apache.hadoop.hive.ql.exec.UDF; 6 | import org.apache.hadoop.io.LongWritable; 7 | import org.apache.hadoop.io.Text; 8 | 9 | /** 10 | * @author ruifeng.shan 11 | * date: 2016-07-25 12 | * time: 14:57 13 | */ 14 | @Description(name = "json_array_length", value = "_FUNC_(json, json_path) - Returns the array length of json (a string containing a JSON array). 
" 15 | , extended = "Example:\n" 16 | + " > SELECT _FUNC_(json_array, json_path) FROM src LIMIT 1;") 17 | public class UDFJsonArrayLength extends UDF { 18 | private LongWritable result = new LongWritable(); 19 | 20 | public UDFJsonArrayLength() { 21 | } 22 | 23 | public LongWritable evaluate(Text text) { 24 | try { 25 | result.set(JsonUtils.jsonArrayLength(text.toString())); 26 | return result; 27 | } catch (Exception e) { 28 | return null; 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpLike.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.regexp; 2 | 3 | import io.airlift.slice.Slices; 4 | import org.apache.hadoop.hive.ql.exec.Description; 5 | import org.apache.hadoop.hive.ql.exec.UDF; 6 | import org.apache.hadoop.io.Text; 7 | 8 | /** 9 | * @author ruifeng.shan 10 | * date: 2018-07-27 11 | * time: 22:36 12 | */ 13 | @Description(name = "regexp_like" 14 | , value = "_FUNC_(string, string) - returns substrings matching a regular expression." 15 | , extended = "Example:\n > select _FUNC_(string, pattern) from src;") 16 | public class UDFRe2JRegexpLike extends UDF { 17 | private static Re2JRegexp re2JRegexp; 18 | 19 | public UDFRe2JRegexpLike() { 20 | 21 | } 22 | 23 | public boolean evaluate(Text text, Text pattern) { 24 | if (text == null) { 25 | return false; 26 | } 27 | 28 | if (re2JRegexp == null) { 29 | re2JRegexp = new Re2JRegexp(Integer.MAX_VALUE, 5, Slices.utf8Slice(pattern.toString())); 30 | } 31 | 32 | return re2JRegexp.matches(Slices.utf8Slice(text.toString())); 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /src/test/java/com/github/aaronshan/functions/bitwise/UDFBitCountTest.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.bitwise; 2 | 3 | import org.apache.hadoop.hive.ql.metadata.HiveException; 4 | import org.apache.hadoop.io.LongWritable; 5 | import org.junit.Assert; 6 | import org.junit.Test; 7 | 8 | /** 9 | * @author ruifeng.shan 10 | * @date 2016-07-28 11 | * @time 14:15 12 | */ 13 | public class UDFBitCountTest { 14 | protected void runTest(long num, long bits, LongWritable exp, UDFBitCount udf) throws HiveException { 15 | LongWritable res = udf.evaluate(num, bits); 16 | if (exp == null) { 17 | Assert.assertNull(res); 18 | } else { 19 | Assert.assertNotNull(res); 20 | Assert.assertEquals("bit_count test", exp.get(), res.get()); 21 | } 22 | } 23 | 24 | @Test 25 | public void testBitCount() throws Exception { 26 | runTest(9, 64, new LongWritable(2), new UDFBitCount()); 27 | runTest(9, 8, new LongWritable(2), new UDFBitCount()); 28 | runTest(-7, 64, new LongWritable(62), new UDFBitCount()); 29 | runTest(-7, 8, new LongWritable(6), new UDFBitCount()); 30 | } 31 | } -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/math/UDFMathToBase.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.math; 2 | 3 | import org.apache.hadoop.hive.ql.exec.Description; 4 | import org.apache.hadoop.hive.ql.exec.UDF; 5 | import org.apache.hadoop.hive.ql.metadata.HiveException; 6 | import org.apache.hadoop.io.LongWritable; 7 | import org.apache.hadoop.io.Text; 8 | 9 | import static 
/src/main/java/com/github/aaronshan/functions/utils/json/JsonPath.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.utils.json;

/**
 * @author ruifeng.shan
 * date: 2016-07-25
 * time: 15:15
 */
public class JsonPath {
    private final JsonExtract.JsonExtractor scalarExtractor;
    private final JsonExtract.JsonExtractor objectExtractor;
    private final JsonExtract.JsonExtractor sizeExtractor;

    public JsonPath(String pattern) {
        scalarExtractor = JsonExtract.generateExtractor(pattern, new JsonExtract.ScalarValueJsonExtractor());
        objectExtractor = JsonExtract.generateExtractor(pattern, new JsonExtract.JsonValueJsonExtractor());
        sizeExtractor = JsonExtract.generateExtractor(pattern, new JsonExtract.JsonSizeExtractor());
    }

    public JsonExtract.JsonExtractor getScalarExtractor() {
        return scalarExtractor;
    }

    public JsonExtract.JsonExtractor getObjectExtractor() {
        return objectExtractor;
    }

    public JsonExtract.JsonExtractor getSizeExtractor() {
        return sizeExtractor;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/json/UDFJsonExtract.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.json;

import com.github.aaronshan.functions.utils.json.JsonExtract;
import com.github.aaronshan.functions.utils.json.JsonPath;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

/**
 * @author ruifeng.shan
 * date: 2016-07-25
 * time: 16:26
 */
@Description(name = "json_extract", value = "_FUNC_(json, json_path) - extract JSON by the given json_path."
        , extended = "Example:\n"
        + " > SELECT _FUNC_(json, json_path) FROM src LIMIT 1;")
public class UDFJsonExtract extends UDF {
    private Text result = new Text();

    public UDFJsonExtract() {
    }

    public Text evaluate(Text json, Text path) {
        try {
            JsonPath jsonPath = new JsonPath(path.toString());
            String content = JsonExtract.extract(json.toString(), jsonPath.getObjectExtractor());
            result.set(content);
            return result;
        } catch (Exception e) {
            return null;
        }
    }
}

--------------------------------------------------------------------------------
" 15 | , extended = "Example:\n" 16 | + " > SELECT _FUNC_(json_array, json_path) FROM src LIMIT 1;") 17 | public class UDFJsonExtract extends UDF { 18 | private Text result = new Text(); 19 | 20 | public UDFJsonExtract() { 21 | } 22 | 23 | public Text evaluate(Text json, Text path) { 24 | try { 25 | JsonPath jsonPath = new JsonPath(path.toString()); 26 | String content = JsonExtract.extract(json.toString(), jsonPath.getObjectExtractor()); 27 | result.set(content); 28 | return result; 29 | } catch (Exception e) { 30 | return null; 31 | } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/math/UDFMathNormalCdf.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.math; 2 | 3 | import org.apache.commons.math3.special.Erf; 4 | import org.apache.hadoop.hive.ql.exec.Description; 5 | import org.apache.hadoop.hive.ql.exec.UDF; 6 | import org.apache.hadoop.hive.ql.metadata.HiveException; 7 | import org.apache.hadoop.io.DoubleWritable; 8 | 9 | import static com.github.aaronshan.functions.utils.Failures.checkCondition; 10 | 11 | /** 12 | * @author ruifeng.shan 13 | * date: 2018-07-26 14 | * time: 23:03 15 | */ 16 | @Description(name = "normal_cdf" 17 | , value = "_FUNC_(mean, sd, v) - normal cdf given a mean, standard deviation, and value." 18 | , extended = "Example:\n > select _FUNC_(mean, sd, v) from src;") 19 | public class UDFMathNormalCdf extends UDF { 20 | private DoubleWritable result = new DoubleWritable(); 21 | 22 | public UDFMathNormalCdf() { 23 | } 24 | 25 | public DoubleWritable evaluate(double mean, double standardDeviation, double value) throws HiveException { 26 | checkCondition(standardDeviation > 0, "standardDeviation must > 0"); 27 | result.set(0.5 * (1 + Erf.erf((value - mean) / (standardDeviation * Math.sqrt(2))))); 28 | return result; 29 | } 30 | } -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/math/UDFMathInverseNormalCdf.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.math; 2 | 3 | import org.apache.commons.math3.special.Erf; 4 | import org.apache.hadoop.hive.ql.exec.Description; 5 | import org.apache.hadoop.hive.ql.exec.UDF; 6 | import org.apache.hadoop.hive.ql.metadata.HiveException; 7 | import org.apache.hadoop.io.DoubleWritable; 8 | 9 | import static com.github.aaronshan.functions.utils.Failures.checkCondition; 10 | 11 | /** 12 | * @author ruifeng.shan 13 | * date: 2018-07-26 14 | * time: 23:04 15 | */ 16 | @Description(name = "inverse_normal_cdf" 17 | , value = "_FUNC_(mean, sd, p) - inverse of normal cdf given a mean, std, and probability." 
/src/main/java/com/github/aaronshan/functions/json/UDFJsonExtractScalar.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.json;

import com.github.aaronshan.functions.utils.json.JsonExtract;
import com.github.aaronshan.functions.utils.json.JsonPath;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

/**
 * @author ruifeng.shan
 * date: 2016-07-25
 * time: 16:26
 */
@Description(name = "json_extract_scalar", value = "_FUNC_(json, json_path) - like json_extract, but returns the result value as a string (as opposed to being encoded as JSON)."
        , extended = "Example:\n"
        + " > SELECT _FUNC_(json, json_path) FROM src LIMIT 1;")
public class UDFJsonExtractScalar extends UDF {
    private Text result = new Text();

    public UDFJsonExtractScalar() {
    }

    public Text evaluate(Text json, Text path) {
        try {
            JsonPath jsonPath = new JsonPath(path.toString());
            String content = JsonExtract.extract(json.toString(), jsonPath.getScalarExtractor());
            result.set(content);
            return result;
        } catch (Exception e) {
            return null;
        }
    }
}

--------------------------------------------------------------------------------
/src/test/java/com/github/aaronshan/functions/string/UDFStringSplitToMapTest.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.string;

import com.github.aaronshan.functions.utils.MapUtils;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.junit.Assert;
import org.junit.Test;

import java.util.HashMap;

public class UDFStringSplitToMapTest {
    @Test
    public void testStringSplitToMap() throws Exception {
        UDFStringSplitToMap udf = new UDFStringSplitToMap();

        GenericUDF.DeferredObject string = new GenericUDF.DeferredJavaObject("a=123,b=0.4");
        GenericUDF.DeferredObject entryDelimiter = new GenericUDF.DeferredJavaObject(",");
        GenericUDF.DeferredObject keyValueDelimiter = new GenericUDF.DeferredJavaObject("=");
        GenericUDF.DeferredObject[] args = {string, entryDelimiter, keyValueDelimiter};

        HashMap output = (HashMap) udf.evaluate(args);

        HashMap expect = Maps.newHashMap();
        expect.putAll(ImmutableMap.of("a", "123", "b", "0.4"));

        Assert.assertEquals("split_to_map() test", true, MapUtils.mapEquals(output, expect));
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/string/UDFStringPosition.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.string;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

/**
 * @author ruifeng.shan
 * date: 2018-07-27 12:08 PM
 */
@Description(name = "strpos"
        , value = "_FUNC_(string, substring) - returns the 1-based index of the first occurrence of a substring (or 0 if not found)."
        , extended = "Example:\n > select _FUNC_(string, substring) from src;")
public class UDFStringPosition extends UDF {
    private LongWritable result = new LongWritable(0);

    public UDFStringPosition() {
    }

    public LongWritable evaluate(Text text, Text subText) {
        // Reset first: the writable is shared across calls, so a stale value
        // from a previous row must not leak into a not-found result.
        result.set(0);
        if (text == null || subText == null) {
            return result;
        }

        // An empty substring matches at position 1. (The original check of
        // length == 1 misreported every single-character substring as 1.)
        if (subText.getLength() == 0) {
            result.set(1);
            return result;
        }

        int index = text.toString().indexOf(subText.toString());
        if (index < 0) {
            return result;
        }

        result.set(index + 1);
        return result;
    }
}

--------------------------------------------------------------------------------
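A hypothetical JUnit sketch (not a repo file) pinning down the 1-based strpos semantics after the fixes above:

```java
// Hypothetical test sketch for strpos: 1-based hit, 0 when absent, and 1
// for the empty substring. Class name is invented for illustration.
package com.github.aaronshan.functions.string;

import org.apache.hadoop.io.Text;
import org.junit.Assert;
import org.junit.Test;

public class UDFStringPositionSketchTest {
    @Test
    public void testStrpos() {
        UDFStringPosition udf = new UDFStringPosition();
        Assert.assertEquals(3L, udf.evaluate(new Text("hadoop"), new Text("do")).get());
        Assert.assertEquals(0L, udf.evaluate(new Text("hadoop"), new Text("xy")).get());
        Assert.assertEquals(1L, udf.evaluate(new Text("hadoop"), new Text("")).get());
    }
}
```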
/src/main/java/com/github/aaronshan/functions/string/UDFStringNormalize.java:
--------------------------------------------------------------------------------
package com.github.aaronshan.functions.string;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.io.Text;

import java.text.Normalizer;

/**
 * @author ruifeng.shan
 * date: 2018-07-27 12:37 PM
 */
@Description(name = "normalize"
        , value = "_FUNC_(string, string) - transforms the string to normalized form."
        , extended = "Example:\n > select _FUNC_(string, form_str) from src;")
public class UDFStringNormalize extends UDF {
    private Text result = new Text();

    public UDFStringNormalize() {
    }

    public Text evaluate(Text text, Text form) throws HiveException {
        if (text == null) {
            return null;
        }

        Normalizer.Form targetForm;
        try {
            targetForm = Normalizer.Form.valueOf(form.toString());
        } catch (IllegalArgumentException | NullPointerException e) {
            throw new HiveException("Normalization form must be one of [NFD, NFC, NFKD, NFKC]");
        }

        result.set(Normalizer.normalize(text.toString(), targetForm));
        return result;
    }
}

--------------------------------------------------------------------------------
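A hypothetical JUnit sketch (not a repo file) of what normalization does: "e" followed by U+0301 (combining acute accent) composes to the single code point U+00E9 under NFC:

```java
// Hypothetical test sketch for normalize(); class name is invented.
package com.github.aaronshan.functions.string;

import org.apache.hadoop.io.Text;
import org.junit.Assert;
import org.junit.Test;

public class UDFStringNormalizeSketchTest {
    @Test
    public void testNfcComposition() throws Exception {
        UDFStringNormalize udf = new UDFStringNormalize();
        Text composed = udf.evaluate(new Text("e\u0301"), new Text("NFC"));
        Assert.assertEquals("\u00e9", composed.toString()); // one code point now
    }
}
```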
20 | , extended = "Example:\n > select _FUNC_(value) from src;") 21 | public class UDFUrlEncode extends UDF { 22 | private Text result = new Text(); 23 | 24 | public Text evaluate(String value) { 25 | if (value == null) { 26 | return null; 27 | } 28 | Escaper escaper = UrlEscapers.urlFormParameterEscaper(); 29 | result.set(escaper.escape(value)); 30 | return result; 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/test/java/com/github/aaronshan/functions/array/UDFArrayShuffleTest.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.array; 2 | 3 | import com.github.aaronshan.functions.array.UDFArrayShuffle; 4 | import com.google.common.collect.ImmutableList; 5 | import org.apache.hadoop.hive.ql.metadata.HiveException; 6 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; 7 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 8 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; 9 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; 10 | import org.junit.Test; 11 | 12 | import java.util.List; 13 | 14 | /** 15 | * @author aaronshan 16 | * @date 2018-08-18 8:59 AM 17 | */ 18 | public class UDFArrayShuffleTest { 19 | @Test 20 | public void testArrayShuffle() throws HiveException { 21 | UDFArrayShuffle udf = new UDFArrayShuffle(); 22 | 23 | ObjectInspector arrayOI = ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaIntObjectInspector); 24 | ObjectInspector[] arguments = {arrayOI}; 25 | 26 | udf.initialize(arguments); 27 | 28 | List<Integer> array = ImmutableList.of(1,2,5,6); 29 | GenericUDF.DeferredObject arrayObj = new GenericUDF.DeferredJavaObject(array); 30 | GenericUDF.DeferredObject[] args = {arrayObj}; 31 | System.out.println(udf.evaluate(args)); // smoke test only: prints the shuffled list rather than asserting on it 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/math/UDFMathFromBase.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.math; 2 | 3 | import org.apache.hadoop.hive.ql.exec.Description; 4 | import org.apache.hadoop.hive.ql.exec.UDF; 5 | import org.apache.hadoop.hive.ql.metadata.HiveException; 6 | import org.apache.hadoop.io.LongWritable; 7 | import org.apache.hadoop.io.Text; 8 | 9 | import static com.github.aaronshan.functions.utils.MathUtils.checkRadix; 10 | import static java.lang.String.format; 11 | 12 | /** 13 | * @author ruifeng.shan 14 | * date: 18-7-23 15 | */ 16 | @Description(name = "from_base" 17 | , value = "_FUNC_(string, long) - convert a string in the given base to a number."
18 | , extended = "Example:\n > select _FUNC_(string, long) from src;") 19 | public class UDFMathFromBase extends UDF { 20 | private LongWritable result = new LongWritable(); 21 | 22 | public UDFMathFromBase() { 23 | } 24 | 25 | public LongWritable evaluate(Text value, LongWritable radix) throws HiveException { 26 | if (value == null || radix == null) { 27 | return null; 28 | } 29 | 30 | checkRadix(radix.get()); 31 | try { 32 | result.set(Long.parseLong(value.toString(), (int) radix.get())); 33 | } catch (NumberFormatException e) { 34 | throw new HiveException(format("Not a valid base-%d number: %s", radix.get(), value.toString()), e); 35 | } 36 | return result; 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/regexp/re2j/DFAStateKey.java: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The RE2 Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Original RE2 source here: 6 | // https://github.com/google/re2/blob/master/re2/dfa.cc 7 | 8 | package com.github.aaronshan.functions.regexp.re2j; 9 | 10 | import java.util.Arrays; 11 | 12 | import static com.github.aaronshan.functions.regexp.re2j.Utils.arrayFirstElementsEqual; 13 | 14 | final class DFAStateKey { 15 | private final int[] instIndexes; 16 | private final int nIndexes; 17 | private final int flag; 18 | 19 | DFAStateKey(int[] instIndexes, int nIndexes, int flag) { 20 | this.instIndexes = instIndexes; 21 | this.nIndexes = nIndexes; 22 | this.flag = flag; 23 | } 24 | 25 | @Override 26 | public boolean equals(Object o) { 27 | if (this == o) return true; 28 | if (o == null || getClass() != o.getClass()) return false; 29 | 30 | DFAStateKey that = (DFAStateKey) o; 31 | 32 | return nIndexes == that.nIndexes && flag == that.flag && arrayFirstElementsEqual(instIndexes, that.instIndexes, nIndexes); 33 | } 34 | 35 | @Override 36 | public int hashCode() { 37 | // hash only the first nIndexes entries so hashCode() stays consistent with equals(), 38 | // which ignores stale slots beyond nIndexes 39 | int result = 1; 40 | for (int i = 0; i < nIndexes; i++) { 41 | result = 31 * result + instIndexes[i]; 42 | } 43 | result = 31 * result + nIndexes; 44 | result = 31 * result + flag; 45 | return result; 46 | } 47 | } -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/regexp/re2j/SparseSet.java: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The RE2 Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Original RE2 source here: 6 | // https://github.com/google/re2/blob/master/util/sparse_set.h 7 | 8 | package com.github.aaronshan.functions.regexp.re2j; 9 | 10 | class SparseSet { 11 | private final int[] dense; // may contain stale entries in slots >= size 12 | private final int[] sparse; // may contain stale but in-bounds values.
13 | private int size; // of prefix of |dense| that is logically populated 14 | 15 | SparseSet(int n) { 16 | this.sparse = new int[n]; 17 | this.dense = new int[n]; 18 | } 19 | 20 | boolean contains(int i) { 21 | return sparse[i] < size && dense[sparse[i]] == i; 22 | } 23 | 24 | boolean isEmpty() { 25 | return size == 0; 26 | } 27 | 28 | void add(int i) { 29 | dense[size] = i; 30 | sparse[i] = size; 31 | size++; 32 | } 33 | 34 | void clear() { 35 | size = 0; 36 | } 37 | 38 | int getValueAt(int i) { 39 | if (i >= size) { 40 | throw new IndexOutOfBoundsException(String.format("Cannot get index %d. SparseSet is size %d", i, size)); 41 | } 42 | return dense[i]; 43 | } 44 | 45 | int getSize() { 46 | return size; 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/json/UDFJsonSize.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.json; 2 | 3 | import com.github.aaronshan.functions.utils.json.JsonExtract; 4 | import com.github.aaronshan.functions.utils.json.JsonPath; 5 | import org.apache.commons.logging.Log; 6 | import org.apache.commons.logging.LogFactory; 7 | import org.apache.hadoop.hive.ql.exec.Description; 8 | import org.apache.hadoop.hive.ql.exec.UDF; 9 | import org.apache.hadoop.io.LongWritable; 10 | import org.apache.hadoop.io.Text; 11 | 12 | /** 13 | * @author ruifeng.shan 14 | * date: 2016-07-25 15 | * time: 16:33 16 | */ 17 | @Description(name = "json_size", value = "_FUNC_(json, json_path) - like json_extract, but returns the size of the value. For objects or arrays, the size is the number of members, and the size of a scalar value is zero. " 18 | , extended = "Example:\n" 19 | + " > SELECT _FUNC_(json_array, json_path) FROM src LIMIT 1;") 20 | public class UDFJsonSize extends UDF { 21 | private LongWritable result = new LongWritable(); 22 | 23 | public UDFJsonSize() { 24 | } 25 | 26 | public LongWritable evaluate(Text json, Text path) { 27 | if (json == null || path == null) { // guard nulls explicitly instead of relying on the catch below to swallow an NPE 28 | return null; 29 | } 30 | try { 31 | JsonPath jsonPath = new JsonPath(path.toString()); 32 | Long size = JsonExtract.extract(json.toString(), jsonPath.getSizeExtractor()); 33 | result.set(size); 34 | return result; 35 | } catch (Exception e) { 36 | return null; 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/test/java/com/github/aaronshan/functions/string/UDFStringSplitToMultimapTest.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.string; 2 | 3 | import com.github.aaronshan.functions.utils.MapUtils; 4 | import com.google.common.collect.ImmutableList; 5 | import com.google.common.collect.ImmutableMap; 6 | import com.google.common.collect.Maps; 7 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; 8 | import org.junit.Assert; 9 | import org.junit.Test; 10 | 11 | import java.util.HashMap; 12 | import java.util.List; 13 | 14 | public class UDFStringSplitToMultimapTest { 15 | @Test 16 | public void testStringSplitToMultimap() throws Exception { 17 | UDFStringSplitToMultimap udf = new UDFStringSplitToMultimap(); 18 | 19 | GenericUDF.DeferredObject string = new GenericUDF.DeferredJavaObject("a=123,b=0.4,a=124"); 20 | GenericUDF.DeferredObject entryDelimiter = new GenericUDF.DeferredJavaObject(","); 21 | GenericUDF.DeferredObject keyValueDelimiter = new GenericUDF.DeferredJavaObject("="); 22 | GenericUDF.DeferredObject[] args = {string, entryDelimiter,
keyValueDelimiter}; 23 | 24 | HashMap<String, List<String>> output = (HashMap<String, List<String>>) udf.evaluate(args); 25 | 26 | HashMap<String, List<String>> expect = Maps.newHashMap(); 27 | expect.putAll(ImmutableMap.<String, List<String>>of("a", ImmutableList.of("123", "124"), "b", ImmutableList.of("0.4"))); 28 | 29 | Assert.assertEquals("split_to_multimap() test", true, MapUtils.mapEquals(output, expect)); 30 | } 31 | } -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpExtract.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.regexp; 2 | 3 | import io.airlift.slice.Slices; 4 | import org.apache.hadoop.hive.ql.exec.Description; 5 | import org.apache.hadoop.hive.ql.exec.UDF; 6 | import org.apache.hadoop.hive.ql.metadata.HiveException; 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.Text; 9 | 10 | /** 11 | * @author ruifeng.shan 12 | * date: 2018-07-27 13 | * time: 22:38 14 | */ 15 | @Description(name = "regexp_extract" 16 | , value = "_FUNC_(string, string) - returns substrings matching a regular expression." 17 | , extended = "Example:\n > select _FUNC_(string, pattern) from src;") 18 | public class UDFRe2JRegexpExtract extends UDF { 19 | private static Re2JRegexp re2JRegexp; 20 | private static String re2JRegexpPattern; // the pattern the cached Re2JRegexp was compiled from 21 | private Text result = new Text(); 22 | 23 | public UDFRe2JRegexpExtract() { 24 | 25 | } 26 | 27 | public Text evaluate(Text source, Text pattern) throws HiveException { 28 | return evaluate(source, pattern, new LongWritable(0)); 29 | } 30 | 31 | public Text evaluate(Text source, Text pattern, LongWritable groupIndex) throws HiveException { 32 | if (source == null) { 33 | return null; 34 | } 35 | 36 | // recompile when the pattern changes; caching only the first pattern seen would silently reuse it for other arguments 37 | if (re2JRegexp == null || !pattern.toString().equals(re2JRegexpPattern)) { 38 | re2JRegexpPattern = pattern.toString(); 39 | re2JRegexp = new Re2JRegexp(Integer.MAX_VALUE, 5, Slices.utf8Slice(re2JRegexpPattern)); 40 | } 41 | 42 | result.set(re2JRegexp.extract(Slices.utf8Slice(source.toString()), groupIndex.get()).toStringUtf8()); 43 | return result; 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/bitwise/UDFBitCount.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.bitwise; 2 | 3 | import org.apache.hadoop.hive.ql.exec.Description; 4 | import org.apache.hadoop.hive.ql.exec.UDF; 5 | import org.apache.hadoop.hive.ql.metadata.HiveException; 6 | import org.apache.hadoop.io.LongWritable; 7 | 8 | /** 9 | * @author ruifeng.shan 10 | * date: 2016-07-27 11 | * time: 15:49 12 | */ 13 | @Description(name = "bit_count" 14 | , value = "_FUNC_(x, bits) - count the number of bits set in x (treated as bits-bit signed integer) in 2’s complement representation." 15 | , extended = "Example:\n > select _FUNC_(9, 64) from src;") 16 | public class UDFBitCount extends UDF { 17 | private LongWritable result = new LongWritable(); 18 | 19 | public LongWritable evaluate(long num, long bits) throws HiveException { 20 | if (bits == 64) { 21 | result.set(Long.bitCount(num)); 22 | return result; 23 | } 24 | if (bits <= 1 || bits > 64) { 25 | throw new HiveException("Bits specified in bit_count must be between 2 and 64, got " + bits); 26 | } 27 | long lowBitsMask = (1L << (bits - 1)) - 1; // set the least (bits - 1) bits; must be 1L, or the int shift overflows for bits > 32 28 | if (num > lowBitsMask || num < ~lowBitsMask) { 29 | throw new HiveException("Number must be representable with the bits specified.
" + num + " can not be represented with " + bits + " bits"); 30 | } 31 | long mask = lowBitsMask | 0x8000000000000000L; // set the least (bits - 1) bits and the sign bit 32 | result.set(Long.bitCount(num & mask)); 33 | return result; 34 | } 35 | } -------------------------------------------------------------------------------- /src/test/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpExtractAllTest.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.regexp; 2 | 3 | import com.google.common.collect.ImmutableList; 4 | import com.google.common.collect.Iterables; 5 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException; 6 | import org.apache.hadoop.hive.ql.metadata.HiveException; 7 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; 8 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 9 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; 10 | import org.junit.Test; 11 | 12 | import java.util.ArrayList; 13 | 14 | import static org.junit.Assert.*; 15 | 16 | public class UDFRe2JRegexpExtractAllTest { 17 | @Test 18 | public void testUDFRe2JRegexpExtractAll() throws HiveException { 19 | UDFRe2JRegexpExtractAll udf = new UDFRe2JRegexpExtractAll(); 20 | 21 | ObjectInspector source = PrimitiveObjectInspectorFactory.javaStringObjectInspector; 22 | ObjectInspector pattern = PrimitiveObjectInspectorFactory.javaStringObjectInspector; 23 | ObjectInspector[] arguments = {source, pattern}; 24 | 25 | udf.initialize(arguments); 26 | 27 | GenericUDF.DeferredObject sourceObj = new GenericUDF.DeferredJavaObject("1a 2b 3c 6f"); 28 | GenericUDF.DeferredObject patternObj = new GenericUDF.DeferredJavaObject("\\d+"); 29 | GenericUDF.DeferredObject[] args = {sourceObj, patternObj}; 30 | 31 | ArrayList output = (ArrayList) udf.evaluate(args); 32 | assertTrue(Iterables.elementsEqual(ImmutableList.of("1", "2", "3", "6"), output)); 33 | } 34 | } -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/utils/MapUtils.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.utils; 2 | 3 | import java.util.Map; 4 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 5 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; 6 | 7 | /** 8 | * @author ruifeng.shan 9 | * date: 2016-07-27 10 | * time: 22:33 11 | */ 12 | public final class MapUtils { 13 | public static boolean mapEquals(Map left, Map right) { 14 | if (left == null || right == null) { 15 | if (left == null && right == null) { 16 | return true; 17 | } 18 | return false; 19 | } 20 | 21 | if (left.size() != right.size()) { 22 | return false; 23 | } 24 | 25 | for (K key : left.keySet()) { 26 | if (!left.get(key).equals(right.get(key))) { 27 | return false; 28 | } 29 | } 30 | 31 | return true; 32 | } 33 | 34 | public static boolean mapEquals(Map left, Map right, ObjectInspector valueOI) { 35 | if (left == null || right == null) { 36 | if (left == null && right == null) { 37 | return true; 38 | } 39 | return false; 40 | } 41 | 42 | if (left.size() != right.size()) { 43 | return false; 44 | } 45 | 46 | for (Object key : left.keySet()) { 47 | if (ObjectInspectorUtils.compare(left.get(key), valueOI, right.get(key), valueOI) != 0) { 48 | return false; 49 | } 50 | } 51 | 52 | return true; 53 | } 54 | } 55 | 
-------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/string/UDFCodePoint.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.string; 2 | 3 | import io.airlift.slice.Slice; 4 | import io.airlift.slice.Slices; 5 | import org.apache.hadoop.hive.ql.exec.Description; 6 | import org.apache.hadoop.hive.ql.exec.UDF; 7 | import org.apache.hadoop.hive.ql.metadata.HiveException; 8 | import org.apache.hadoop.io.LongWritable; 9 | import org.apache.hadoop.io.Text; 10 | 11 | import static com.github.aaronshan.functions.utils.Failures.checkCondition; 12 | import static io.airlift.slice.SliceUtf8.getCodePointAt; 13 | import static io.airlift.slice.SliceUtf8.countCodePoints; 14 | 15 | /** 16 | * @author ruifeng.shan 17 | * date: 2018-07-26 18 | * time: 23:23 19 | */ 20 | @Description(name = "codepoint" 21 | , value = "_FUNC_(string) - returns Unicode code point of a single character string." 22 | , extended = "Example:\n > select _FUNC_(string) from src;") 23 | public class UDFCodePoint extends UDF { 24 | private LongWritable result = new LongWritable(); 25 | 26 | public UDFCodePoint() { 27 | } 28 | 29 | /** 30 | * Returns the Unicode code point of a single-character string. 31 | * 32 | * @param text the input string 33 | * @return Unicode code point 34 | * @throws HiveException hive exception 35 | */ 36 | public LongWritable evaluate(Text text) throws HiveException { 37 | if (text == null) { 38 | return null; 39 | } 40 | 41 | Slice slice = Slices.utf8Slice(text.toString()); 42 | checkCondition(countCodePoints(slice) == 1, "Input string must be a single character string"); 43 | 44 | result.set(getCodePointAt(slice, 0)); 45 | return result; 46 | } 47 | } -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/math/UDFMathIsInfinite.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.math; 2 | 3 | import org.apache.hadoop.hive.ql.exec.Description; 4 | import org.apache.hadoop.hive.ql.exec.UDF; 5 | import org.apache.hadoop.io.BooleanWritable; 6 | import org.apache.hadoop.io.DoubleWritable; 7 | 8 | /** 9 | * @author ruifeng.shan 10 | * date: 18-7-23 11 | */ 12 | @Description(name = "is_infinite" 13 | , value = "_FUNC_(double) - test if value is infinite." 14 | , extended = "Example:\n > select _FUNC_(double) from src;") 15 | public class UDFMathIsInfinite extends UDF { 16 | /** 17 | * A constant holding the positive infinity of type 18 | * {@code double}. It is equal to the value returned by 19 | * {@code Double.longBitsToDouble(0x7ff0000000000000L)}. 20 | */ 21 | public static final double POSITIVE_INFINITY = 1.0 / 0.0; 22 | 23 | /** 24 | * A constant holding the negative infinity of type 25 | * {@code double}. It is equal to the value returned by 26 | * {@code Double.longBitsToDouble(0xfff0000000000000L)}.
27 | */ 28 | public static final double NEGATIVE_INFINITY = -1.0 / 0.0; 29 | 30 | BooleanWritable result = new BooleanWritable(); 31 | 32 | public UDFMathIsInfinite() { 33 | } 34 | 35 | public BooleanWritable evaluate(DoubleWritable num) { 36 | if (num == null) { 37 | result.set(false); 38 | } else { 39 | result.set(isInfinite(num.get())); 40 | } 41 | return result; 42 | } 43 | 44 | private boolean isInfinite(double v) { 45 | return (v == POSITIVE_INFINITY) || (v == NEGATIVE_INFINITY); 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpReplace.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.regexp; 2 | 3 | import io.airlift.slice.Slices; 4 | import org.apache.hadoop.hive.ql.exec.Description; 5 | import org.apache.hadoop.hive.ql.exec.UDF; 6 | import org.apache.hadoop.hive.ql.metadata.HiveException; 7 | import org.apache.hadoop.io.Text; 8 | 9 | /** 10 | * @author ruifeng.shan 11 | * date: 2018-07-27 12 | * time: 22:37 13 | */ 14 | @Description(name = "regexp_replace" 15 | , value = "_FUNC_(string, string) - removes substrings matching a regular expression\n" + 16 | "_FUNC_(string, string, string) - replaces substrings matching a regular expression by given string." 17 | , extended = "Example:\n > select _FUNC_(string, pattern) from src;\n" + 18 | "select _FUNC_(string, pattern, replacement) from src;") 19 | public class UDFRe2JRegexpReplace extends UDF { 20 | private static Re2JRegexp re2JRegexp; 21 | private static String re2JRegexpPattern; // the pattern the cached Re2JRegexp was compiled from 22 | private Text result = new Text(); 23 | 24 | public UDFRe2JRegexpReplace() { 25 | 26 | } 27 | 28 | public Text evaluate(Text source, Text pattern) throws HiveException { 29 | return evaluate(source, pattern, new Text(Slices.EMPTY_SLICE.toStringUtf8())); 30 | } 31 | 32 | public Text evaluate(Text source, Text pattern, Text replacement) throws HiveException { 33 | if (source == null) { 34 | return null; 35 | } 36 | 37 | // recompile when the pattern changes; caching only the first pattern seen would silently reuse it for other arguments 38 | if (re2JRegexp == null || !pattern.toString().equals(re2JRegexpPattern)) { 39 | re2JRegexpPattern = pattern.toString(); 40 | re2JRegexp = new Re2JRegexp(Integer.MAX_VALUE, 5, Slices.utf8Slice(re2JRegexpPattern)); 41 | } 42 | 43 | result.set(re2JRegexp.replace(Slices.utf8Slice(source.toString()), Slices.utf8Slice(replacement.toString())).toStringUtf8()); 44 | return result; 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/regexp/re2j/PatternSyntaxException.java: -------------------------------------------------------------------------------- 1 | // Copyright 2010 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package com.github.aaronshan.functions.regexp.re2j; 6 | 7 | /** 8 | * An exception thrown by the parser if the pattern was invalid. 9 | * <p>
10 | * Following {@code java.util.regex.PatternSyntaxException}, this is an 11 | * unchecked exception. 12 | */ 13 | public class PatternSyntaxException extends RuntimeException { 14 | 15 | private final String error; // the nature of the error 16 | private final String input; // the partial input at the point of error. 17 | 18 | public PatternSyntaxException(String error, String input) { 19 | super("error parsing regexp: " + error + ": `" + input + "`"); 20 | this.error = error; 21 | this.input = input; 22 | } 23 | 24 | public PatternSyntaxException(String error) { 25 | super("error parsing regexp: " + error); 26 | this.error = error; 27 | this.input = ""; 28 | } 29 | 30 | /** 31 | * Retrieves the error index. 32 | * 33 | * @return The approximate index in the pattern of the error, 34 | * or -1 if the index is not known 35 | */ 36 | public int getIndex() { 37 | return -1; 38 | } 39 | 40 | /** 41 | * Retrieves the description of the error. 42 | * 43 | * @return The description of the error 44 | */ 45 | public String getDescription() { 46 | return error; 47 | } 48 | 49 | /** 50 | * Retrieves the erroneous regular-expression pattern. 51 | * 52 | * @return The erroneous pattern 53 | */ 54 | public String getPattern() { 55 | return input; 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /src/test/java/com/github/aaronshan/functions/array/UDFArrayValueCountTest.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.array; 2 | 3 | import com.google.common.collect.ImmutableList; 4 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; 5 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 6 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; 7 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; 8 | import org.apache.hadoop.io.BooleanWritable; 9 | import org.apache.hadoop.io.LongWritable; 10 | import org.junit.Test; 11 | 12 | import java.util.List; 13 | 14 | import static org.junit.Assert.*; 15 | 16 | public class UDFArrayValueCountTest { 17 | @Test 18 | public void testArrayValueCount() throws Exception { 19 | UDFArrayValueCount udf = new UDFArrayValueCount(); 20 | 21 | ObjectInspector arrayOI = ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector); 22 | ObjectInspector valueOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; 23 | ObjectInspector[] arguments = {arrayOI, valueOI}; 24 | 25 | udf.initialize(arguments); 26 | List<String> array = ImmutableList.of("a", "b", "c", "a"); 27 | GenericUDF.DeferredObject arrayObj = new GenericUDF.DeferredJavaObject(array); 28 | GenericUDF.DeferredObject valueObj = new GenericUDF.DeferredJavaObject("a"); 29 | GenericUDF.DeferredObject[] args = {arrayObj, valueObj}; 30 | LongWritable output = (LongWritable) udf.evaluate(args); 31 | 32 | assertEquals("array_value_count() test", new LongWritable(2).get(), output.get()); 33 | 34 | // Try with null args 35 | GenericUDF.DeferredObject[] nullArgs = { new GenericUDF.DeferredJavaObject(null), new GenericUDF.DeferredJavaObject(null) }; 36 | output = (LongWritable) udf.evaluate(nullArgs); 37 | assertEquals("array_value_count() test", new LongWritable(0).get(), output.get()); 38 | } 39 | } --------------------------------------------------------------------------------
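As an aside, the GenericUDF tests in this dump all share one three-step flow: declare the argument types with ObjectInspectors, call initialize(), then pass values wrapped in DeferredJavaObject to evaluate(). A condensed, self-contained sketch of that flow (the helper class is hypothetical; the udf parameter stands in for any of the GenericUDF classes above):

import com.google.common.collect.ImmutableList;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class GenericUdfFlow {
    static Object run(GenericUDF udf) throws HiveException {
        // 1. describe the argument types, e.g. (array<string>, string)
        ObjectInspector elementOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
        ObjectInspector arrayOI = ObjectInspectorFactory.getStandardListObjectInspector(elementOI);
        udf.initialize(new ObjectInspector[] {arrayOI, elementOI});
        // 2. wrap the per-row argument values
        GenericUDF.DeferredObject[] args = {
                new GenericUDF.DeferredJavaObject(ImmutableList.of("a", "b")),
                new GenericUDF.DeferredJavaObject("a")
        };
        // 3. evaluate once, as Hive would for each row
        return udf.evaluate(args);
    }
}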
/src/test/java/com/github/aaronshan/functions/array/UDFArrayContainsTest.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.array; 2 | 3 | import com.google.common.collect.ImmutableList; 4 | import java.util.List; 5 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredJavaObject; 6 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject; 7 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 8 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; 9 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; 10 | import org.apache.hadoop.io.BooleanWritable; 11 | import org.junit.Test; 12 | 13 | import static org.junit.Assert.*; 14 | 15 | /** 16 | * @author ruifeng.shan 17 | * @date 2016-07-27 18 | * @time 20:29 19 | */ 20 | public class UDFArrayContainsTest { 21 | @Test 22 | public void testArrayContains() throws Exception { 23 | UDFArrayContains udf = new UDFArrayContains(); 24 | 25 | ObjectInspector arrayOI = ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector); 26 | ObjectInspector valueOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; 27 | ObjectInspector[] arguments = {arrayOI, valueOI}; 28 | 29 | udf.initialize(arguments); 30 | List<String> array = ImmutableList.of("a", "b", "c"); 31 | DeferredObject arrayObj = new DeferredJavaObject(array); 32 | DeferredObject valueObj = new DeferredJavaObject("a"); 33 | DeferredObject[] args = {arrayObj, valueObj}; 34 | BooleanWritable output = (BooleanWritable) udf.evaluate(args); 35 | 36 | assertEquals("array_contains() test", new BooleanWritable(true).get(), output.get()); 37 | 38 | // Try with null args 39 | DeferredObject[] nullArgs = { new DeferredJavaObject(null), new DeferredJavaObject(null) }; 40 | output = (BooleanWritable) udf.evaluate(nullArgs); 41 | assertEquals("array_contains() test", new BooleanWritable(false).get(), output.get()); 42 | } 43 | } -------------------------------------------------------------------------------- /src/test/java/com/github/aaronshan/functions/map/UDFMapElementAtTest.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.map; 2 | 3 | import com.github.aaronshan.functions.utils.MapUtils; 4 | import com.google.common.collect.ImmutableList; 5 | import com.google.common.collect.ImmutableMap; 6 | import com.google.common.collect.Maps; 7 | import java.util.LinkedHashMap; 8 | import java.util.List; 9 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredJavaObject; 10 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject; 11 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 12 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; 13 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; 14 | import org.junit.Test; 15 | 16 | import static org.junit.Assert.*; 17 | 18 | /** 19 | * @author ruifeng.shan 20 | * @date 2016-07-27 21 | * @time 23:23 22 | */ 23 | public class UDFMapElementAtTest { 24 | @Test 25 | public void testMapElementAt() throws Exception { 26 | UDFMapElementAt udf = new UDFMapElementAt(); 27 | ObjectInspector mapOI = ObjectInspectorFactory.getStandardMapObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector,
PrimitiveObjectInspectorFactory.javaStringObjectInspector); 28 | ObjectInspector keyOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; 29 | ObjectInspector[] arguments = {mapOI, keyOI}; 30 | udf.initialize(arguments); 31 | 32 | LinkedHashMap<String, String> map = Maps.newLinkedHashMap(); 33 | map.putAll(ImmutableMap.of("key1", "11", "key2", "12", "key3", "13")); 34 | DeferredObject mapObj = new DeferredJavaObject(map); 35 | DeferredObject keyObj = new DeferredJavaObject("key1"); 36 | DeferredObject[] args = {mapObj, keyObj}; 37 | assertEquals("map_element_at() test", "11", udf.evaluate(args)); 38 | 39 | keyObj = new DeferredJavaObject("key4"); 40 | DeferredObject[] args1 = {mapObj, keyObj}; 41 | assertEquals("map_element_at() test", null, udf.evaluate(args1)); 42 | } 43 | } -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/string/UDFChineseToPinYin.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.string; 2 | 3 | import net.sourceforge.pinyin4j.PinyinHelper; 4 | import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType; 5 | import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat; 6 | import net.sourceforge.pinyin4j.format.HanyuPinyinToneType; 7 | import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType; 8 | import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination; 9 | import org.apache.hadoop.hive.ql.exec.Description; 10 | import org.apache.hadoop.hive.ql.exec.UDF; 11 | import org.apache.hadoop.io.Text; 12 | 13 | import java.util.regex.Matcher; 14 | import java.util.regex.Pattern; 15 | 16 | /** 17 | * Author: ruifengshan 18 | * Date: 23/03/2015 19 | */ 20 | @Description(name = "pinyin" 21 | , value = "_FUNC_(string) - get pinyin by given chinese." 22 | , extended = "Example:\n > select _FUNC_(string) from src;") 23 | public class UDFChineseToPinYin extends UDF { 24 | private Text result = new Text(); 25 | 26 | public UDFChineseToPinYin() { 27 | 28 | } 29 | 30 | /** 31 | * Convert Chinese characters (han zi) to pinyin. 32 | * 33 | * @param chinese the Chinese input string 34 | * @return the pinyin of the Chinese string 35 | */ 36 | public Text evaluate(Text chinese) { 37 | if (chinese == null) { 38 | return null; 39 | } 40 | 41 | String pinyin = convertToPinyin(chinese.toString()); 42 | if (pinyin == null) { // pinyin4j could not convert with the configured output format 43 | return null; 44 | } 45 | result.set(pinyin); 46 | return result; 47 | } 48 | 49 | // convert Chinese characters to pinyin.
50 | public String convertToPinyin(String name) { 51 | HanyuPinyinOutputFormat pyFormat = new HanyuPinyinOutputFormat(); 52 | pyFormat.setCaseType(HanyuPinyinCaseType.LOWERCASE); 53 | pyFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE); 54 | pyFormat.setVCharType(HanyuPinyinVCharType.WITH_V); 55 | 56 | String result = null; 57 | try { 58 | result = PinyinHelper.toHanyuPinyinString(name, pyFormat, ""); 59 | } catch (BadHanyuPinyinOutputFormatCombination e) { 60 | return null; 61 | } 62 | 63 | return result; 64 | } 65 | 66 | } 67 | -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/date/UDFDayOfYear.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.date; 2 | 3 | import java.util.Calendar; 4 | import org.apache.hadoop.hive.ql.exec.Description; 5 | import org.apache.hadoop.hive.ql.exec.UDF; 6 | import org.apache.hadoop.hive.serde2.io.TimestampWritable; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.joda.time.LocalDate; 10 | import org.joda.time.format.DateTimeFormat; 11 | import org.joda.time.format.DateTimeFormatter; 12 | 13 | /** 14 | * @author ruifeng.shan 15 | * date: 2016-07-27 16 | * time: 15:58 17 | */ 18 | @Description(name = "day_of_year" 19 | , value = "_FUNC_(date) - returns the day of the year from x. The value ranges from 1 to 366." 20 | , extended = "Example:\n > select _FUNC_(date_string) from src;\n > select _FUNC_(date) from src;") 21 | public class UDFDayOfYear extends UDF { 22 | public final static DateTimeFormatter DEFAULT_DATE_FORMATTER = DateTimeFormat.forPattern("yyyy-MM-dd"); 23 | 24 | private IntWritable result = new IntWritable(); 25 | 26 | public UDFDayOfYear() { 27 | 28 | } 29 | 30 | /** 31 | * Get the day of year from a date string. 32 | * 33 | * @param dateString the dateString in the format of "yyyy-MM-dd". 34 | * @return an int from 1 to 366. null if the dateString is not a valid date 35 | * string.
36 | */ 37 | public IntWritable evaluate(Text dateString) { 38 | if (dateString == null) { 39 | return null; 40 | } 41 | 42 | try { 43 | LocalDate date = LocalDate.parse(dateString.toString(), DEFAULT_DATE_FORMATTER); 44 | 45 | result.set(date.getDayOfYear()); 46 | return result; 47 | } catch (Exception e) { 48 | return null; 49 | } 50 | } 51 | 52 | public IntWritable evaluate(TimestampWritable t) { 53 | if (t == null) { 54 | return null; 55 | } 56 | 57 | Calendar calendar = Calendar.getInstance(); 58 | calendar.setTime(t.getTimestamp()); 59 | LocalDate date = LocalDate.fromCalendarFields(calendar); 60 | result.set(date.getDayOfYear()); 61 | return result; 62 | } 63 | } -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/date/UDFDayOfWeek.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.date; 2 | 3 | import org.apache.hadoop.hive.ql.exec.Description; 4 | import org.apache.hadoop.hive.ql.exec.UDF; 5 | import org.apache.hadoop.hive.serde2.io.TimestampWritable; 6 | import org.apache.hadoop.io.IntWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.joda.time.LocalDate; 9 | import org.joda.time.format.DateTimeFormat; 10 | import org.joda.time.format.DateTimeFormatter; 11 | 12 | import java.util.Calendar; 13 | 14 | /** 15 | * @author ruifeng.shan 16 | * date: 15-8-31 17 | */ 18 | @Description(name = "day_of_week" 19 | , value = "_FUNC_(date) - day of week. Monday returns 1, Tuesday returns 2, ... Sunday returns 7." 20 | , extended = "Example:\n > select _FUNC_(date_string) from src;\n > select _FUNC_(date) from src;") 21 | public class UDFDayOfWeek extends UDF { 22 | public final static DateTimeFormatter DEFAULT_DATE_FORMATTER = DateTimeFormat.forPattern("yyyy-MM-dd"); 23 | 24 | private IntWritable result = new IntWritable(); 25 | 26 | public UDFDayOfWeek() { 27 | 28 | } 29 | 30 | /** 31 | * Get the day of week from a date string. 32 | * 33 | * @param dateString the dateString in the format of "yyyy-MM-dd". 34 | * @return an int from 1 to 7 (1 = Monday, 2 = Tuesday, … 7 = Sunday), matching Joda-Time's getDayOfWeek(). null if the dateString is not a valid date 35 | * string. 36 | */ 37 | public IntWritable evaluate(Text dateString) { 38 | if (dateString == null) { 39 | return null; 40 | } 41 | 42 | try { 43 | LocalDate date = LocalDate.parse(dateString.toString(), DEFAULT_DATE_FORMATTER); 44 | 45 | result.set(date.getDayOfWeek()); 46 | return result; 47 | } catch (Exception e) { 48 | return null; 49 | } 50 | } 51 | 52 | public IntWritable evaluate(TimestampWritable t) { 53 | if (t == null) { 54 | return null; 55 | } 56 | 57 | Calendar calendar = Calendar.getInstance(); 58 | calendar.setTime(t.getTimestamp()); 59 | LocalDate date = LocalDate.fromCalendarFields(calendar); 60 | result.set(date.getDayOfWeek()); 61 | return result; 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/regexp/re2j/DFAState.java: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The RE2 Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file.
4 | 5 | // Original RE2 source here: 6 | // https://github.com/google/re2/blob/master/re2/dfa.cc 7 | 8 | package com.github.aaronshan.functions.regexp.re2j; 9 | 10 | import static com.github.aaronshan.functions.regexp.re2j.DFA.FLAG_MATCH; 11 | import static java.lang.System.arraycopy; 12 | 13 | final class DFAState { 14 | public static final DFAState DEAD_STATE = new DFAState(StateType.DEAD); 15 | private final StateType type; // the state type. Lets us create DEAD_STATE and FULL_MATCH_STATE 16 | private final int[] instIndexes; // indexes into prog instructions for this state 17 | private final int flag; // empty width flags 18 | private final DFAState[] next = new DFAState[256]; // Maps bytes to the next state to follow 19 | public DFAState(int[] instIndexes, int nIndexes, int flag) { 20 | this.type = StateType.REGULAR; 21 | this.instIndexes = new int[nIndexes]; 22 | arraycopy(instIndexes, 0, this.instIndexes, 0, nIndexes); 23 | this.flag = flag; 24 | } 25 | 26 | private DFAState(StateType type) { 27 | this.type = type; 28 | this.instIndexes = new int[0]; 29 | this.flag = 0; 30 | } 31 | 32 | public StateType getType() { 33 | return type; 34 | } 35 | 36 | public int getFlag() { 37 | return flag; 38 | } 39 | 40 | public int[] getInstIndexes() { 41 | return instIndexes; 42 | } 43 | 44 | public boolean isMatch() { 45 | return (flag & FLAG_MATCH) != 0; 46 | } 47 | 48 | public boolean isDead() { 49 | return type == StateType.DEAD; 50 | } 51 | 52 | public DFAState getNextState(byte b) { 53 | return next[b & 0xff]; 54 | } 55 | 56 | public void setNextState(byte b, DFAState state) { 57 | next[b & 0xff] = state; 58 | } 59 | 60 | public enum StateType { 61 | DEAD, // no possible match out of this state 62 | REGULAR // all other states 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/test/java/com/github/aaronshan/functions/map/UDFMapBuildTest.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.map; 2 | 3 | import com.github.aaronshan.functions.utils.MapUtils; 4 | import com.google.common.collect.ImmutableList; 5 | import com.google.common.collect.ImmutableMap; 6 | import com.google.common.collect.Maps; 7 | import java.util.LinkedHashMap; 8 | import java.util.List; 9 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredJavaObject; 10 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject; 11 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 12 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; 13 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; 14 | import org.junit.Assert; 15 | import org.junit.Test; 16 | 17 | import static org.junit.Assert.*; 18 | 19 | /** 20 | * @author ruifeng.shan 21 | * @date 2016-07-27 22 | * @time 22:23 23 | */ 24 | public class UDFMapBuildTest { 25 | @Test 26 | public void testMapBuild() throws Exception { 27 | UDFMapBuild udf = new UDFMapBuild(); 28 | ObjectInspector keyArrayOI = ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector); 29 | ObjectInspector valueArrayOI = ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector); 30 | ObjectInspector[] arguments = {keyArrayOI, valueArrayOI}; 31 | udf.initialize(arguments); 32 | 33 | List<String> keyArray
= ImmutableList.of("key1", "key2", "key3"); 35 | List valueArray = ImmutableList.of("value1", "value2", "value3"); 36 | DeferredObject keyArrayObj = new DeferredJavaObject(keyArray); 37 | DeferredObject valueArrayObj = new DeferredJavaObject(valueArray); 38 | DeferredObject[] args = {keyArrayObj, valueArrayObj}; 39 | LinkedHashMap output = (LinkedHashMap) udf.evaluate(args); 40 | LinkedHashMap expect = Maps.newLinkedHashMap(); 41 | expect.putAll(ImmutableMap.of("key1", "value1", "key2", "value2", "key3", "value3")); 42 | 43 | Assert.assertEquals("map_build() test", true, MapUtils.mapEquals(output, expect)); 44 | } 45 | } -------------------------------------------------------------------------------- /src/test/java/com/github/aaronshan/functions/array/UDFArrayIntersectTest.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.array; 2 | 3 | import com.google.common.collect.ImmutableList; 4 | import com.google.common.collect.Iterables; 5 | import org.apache.hadoop.hive.ql.metadata.HiveException; 6 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; 7 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 8 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; 9 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; 10 | import org.junit.Test; 11 | 12 | import java.util.ArrayList; 13 | import java.util.List; 14 | 15 | import static org.junit.Assert.*; 16 | 17 | /** 18 | * @author ruifeng.shan 19 | * @date 2018-07-18 20 | * @time 13:00 21 | */ 22 | public class UDFArrayIntersectTest { 23 | @Test 24 | public void testArrayIntersect() throws HiveException { 25 | UDFArrayIntersect udf = new UDFArrayIntersect(); 26 | 27 | ObjectInspector leftArrayOI = ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaIntObjectInspector); 28 | ObjectInspector rightArrayOI = ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaIntObjectInspector); 29 | ObjectInspector[] arguments = {leftArrayOI, rightArrayOI}; 30 | 31 | udf.initialize(arguments); 32 | 33 | assertTrue(Iterables.elementsEqual(ImmutableList.of(1,2,5), evaluate(ImmutableList.of(0,1,2,3,4,5), ImmutableList.of(1,1,2,2,5,5), udf))); 34 | assertTrue(Iterables.elementsEqual(ImmutableList.of(1,2,3,4), evaluate(ImmutableList.of(0,1,2,3,4,4), ImmutableList.of(1,1,2,2,3,4), udf))); 35 | assertTrue(Iterables.elementsEqual(ImmutableList.of(1,2,3,4), evaluate(ImmutableList.of(0,1,1,2,3,4,4), ImmutableList.of(1,1,2,2,3,4), udf))); 36 | } 37 | 38 | private ArrayList evaluate(List leftArray, List rightArray, UDFArrayIntersect udf) throws HiveException { 39 | GenericUDF.DeferredObject leftArrayObj = new GenericUDF.DeferredJavaObject(leftArray); 40 | GenericUDF.DeferredObject rightArrayObj = new GenericUDF.DeferredJavaObject(rightArray); 41 | GenericUDF.DeferredObject[] args = {leftArrayObj, rightArrayObj}; 42 | ArrayList output = (ArrayList) udf.evaluate(args); 43 | return output; 44 | } 45 | } -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/date/UDFZodiacSignEn.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.date; 2 | 3 | import org.apache.hadoop.hive.ql.exec.Description; 4 | import org.apache.hadoop.hive.ql.exec.UDF; 5 | import org.apache.hadoop.io.IntWritable; 6 | 
import org.apache.hadoop.io.Text; 7 | import org.joda.time.DateTime; 8 | import org.joda.time.format.DateTimeFormat; 9 | import org.joda.time.format.DateTimeFormatter; 10 | 11 | import java.util.Date; 12 | 13 | /** 14 | * Created by ruifengshan on 16/3/18. 15 | */ 16 | @Description(name = "zodiac_en" 17 | , value = "_FUNC_(date) - from the input date string or separate month and day arguments, returns the sign of the Zodiac." 18 | , extended = "Example:\n > select _FUNC_(date_string) from src;\n > select _FUNC_(month, day) from src;") 19 | public class UDFZodiacSignEn extends UDF { 20 | public final static DateTimeFormatter DEFAULT_DATE_FORMATTER = DateTimeFormat.forPattern("yyyy-MM-dd"); 21 | private Text result = new Text(); 22 | 23 | public UDFZodiacSignEn() { 24 | } 25 | 26 | public Text evaluate(String birthday) { 27 | if (birthday == null) { 28 | return null; 29 | } 30 | DateTime dateTime = null; 31 | try { 32 | dateTime = DateTime.parse(birthday, DEFAULT_DATE_FORMATTER); 33 | } catch (Exception e) { 34 | return null; 35 | } 36 | 37 | return evaluate(dateTime.toDate()); 38 | } 39 | 40 | public Text evaluate(Date birthday) { 41 | if (birthday == null) { 42 | return null; 43 | } 44 | DateTime dateTime = new DateTime(birthday); 45 | return evaluate(new IntWritable(dateTime.getMonthOfYear()), new IntWritable(dateTime.getDayOfMonth())); 46 | } 47 | 48 | public Text evaluate(IntWritable month, IntWritable day) { 49 | if (month == null || day == null) { 50 | return null; 51 | } 52 | result.set(getZodiac(month.get(), day.get())); 53 | return result; 54 | } 55 | 56 | private String getZodiac(int month, int day) { 57 | String[] zodiacArray = {"Capricorn", "Aquarius", "Pisces", "Aries", "Taurus", "Gemini", "Cancer", "Leo", "Virgo", "Libra", "Scorpio", "Sagittarius"}; 58 | int[] splitDay = {19, 18, 20, 20, 20, 21, 22, 22, 22, 22, 21, 21}; // boundary day between adjacent signs 59 | int index = month; 60 | if (day <= splitDay[month - 1]) { 61 | index = index - 1; 62 | } else if (month == 12) { 63 | index = 0; 64 | } 65 | return zodiacArray[index]; 66 | } 67 | } -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/regexp/re2j/MachineInput.java: -------------------------------------------------------------------------------- 1 | // Copyright 2010 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Original Go source here: 6 | // http://code.google.com/p/go/source/browse/src/pkg/regexp/regexp.go 7 | 8 | package com.github.aaronshan.functions.regexp.re2j; 9 | 10 | import io.airlift.slice.Slice; 11 | import sun.misc.Unsafe; 12 | 13 | import java.lang.reflect.Field; 14 | 15 | /** 16 | * MachineInput represents the UTF-8 input text supplied to the Machine. It provides one-character 17 | * lookahead.
18 | */ 19 | final class MachineInput { 20 | 21 | static final byte EOF = -1; 22 | private static final Unsafe unsafe; 23 | 24 | static { 25 | try { 26 | // fetch the Unsafe instance via reflection 27 | Field field = Unsafe.class.getDeclaredField("theUnsafe"); 28 | field.setAccessible(true); 29 | unsafe = (Unsafe) field.get(null); 30 | if (unsafe == null) { 31 | throw new RuntimeException("Unsafe access not available"); 32 | } 33 | } catch (Exception e) { 34 | throw new RuntimeException(e); 35 | } 36 | } 37 | 38 | final Slice slice; 39 | final Object base; 40 | final long address; 41 | final int length; 42 | MachineInput(Slice slice) { 43 | this.slice = slice; 44 | this.base = slice.getBase(); 45 | this.address = slice.getAddress(); 46 | this.length = slice.length(); 47 | } 48 | 49 | static MachineInput fromUTF8(Slice slice) { 50 | return new MachineInput(slice); 51 | } 52 | 53 | // Returns the byte at the specified index. 54 | byte getByte(int i) { 55 | if (i >= length) { 56 | return EOF; 57 | } 58 | 59 | if (i < 0) { 60 | throw new IndexOutOfBoundsException("index less than zero (" + i + ")"); 61 | } 62 | 63 | return getByteUnchecked(i); 64 | } 65 | 66 | byte getByteUnchecked(int i) { 67 | return unsafe.getByte(base, address + i); 68 | } 69 | 70 | // Returns the index relative to |pos| at which |re2.prefix| is found 71 | // in this input stream, or a negative value if not found. 72 | int index(RE2 re2, int pos) { 73 | int i = Utils.indexOf(slice, re2.prefixUTF8, pos); 74 | return i < 0 ? i : i - pos; 75 | } 76 | 77 | // Returns the end position in the same units as step(). 78 | int endPos() { 79 | return length; 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/test/java/com/github/aaronshan/functions/map/UDFMapConcatTest.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.map; 2 | 3 | import com.github.aaronshan.functions.utils.MapUtils; 4 | import com.google.common.collect.ImmutableMap; 5 | import com.google.common.collect.Maps; 6 | import java.util.LinkedHashMap; 7 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredJavaObject; 8 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject; 9 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 10 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; 11 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; 12 | import org.junit.Assert; 13 | import org.junit.Test; 14 | 15 | import static org.junit.Assert.*; 16 | 17 | /** 18 | * @author ruifeng.shan 19 | * @date 2016-07-27 20 | * @time 23:06 21 | */ 22 | public class UDFMapConcatTest { 23 | @Test 24 | public void testMapConcat() throws Exception { 25 | UDFMapConcat udf = new UDFMapConcat(); 26 | ObjectInspector leftMapOI = ObjectInspectorFactory.getStandardMapObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector, PrimitiveObjectInspectorFactory.javaStringObjectInspector); 27 | ObjectInspector rightMapOI = ObjectInspectorFactory.getStandardMapObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector, PrimitiveObjectInspectorFactory.javaStringObjectInspector); 28 | ObjectInspector[] arguments = {leftMapOI, rightMapOI}; 29 | udf.initialize(arguments); 30 | 31 | LinkedHashMap<String, String> leftMap = Maps.newLinkedHashMap(); 32 | leftMap.putAll(ImmutableMap.of("key1", "11", "key2", "12",
"key3", "13")); 34 | LinkedHashMap rightMap = Maps.newLinkedHashMap(); 35 | rightMap.putAll(ImmutableMap.of("key3", "21", "key4", "22", "key5", "23")); 36 | 37 | DeferredObject leftMapObj = new DeferredJavaObject(leftMap); 38 | DeferredObject rightMapObj = new DeferredJavaObject(rightMap); 39 | DeferredObject[] args = {leftMapObj, rightMapObj}; 40 | LinkedHashMap output = (LinkedHashMap) udf.evaluate(args); 41 | LinkedHashMap expect = Maps.newLinkedHashMap(); 42 | expect.putAll(ImmutableMap.of("key1", "11", "key2", "12", "key3", "21", "key4", "22", "key5", "23")); 43 | 44 | Assert.assertEquals("map_concat() test", true, MapUtils.mapEquals(output, expect)); 45 | } 46 | } -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/date/UDFZodiacSignCn.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.date; 2 | 3 | import org.apache.hadoop.hive.ql.exec.Description; 4 | import org.apache.hadoop.hive.ql.exec.UDF; 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.joda.time.DateTime; 8 | import org.joda.time.format.DateTimeFormat; 9 | import org.joda.time.format.DateTimeFormatter; 10 | 11 | import java.util.Date; 12 | 13 | /** 14 | * Created by ruifengshan on 16/3/18. 15 | */ 16 | @Description(name = "zodiac_cn" 17 | , value = "_FUNC_(date) - from the input date string or separate month and day arguments, returns the sing of the Zodiac." 18 | , extended = "Example:\n > select _FUNC_(date_string) from src;\n > select _FUNC_(month, day) from src;") 19 | public class UDFZodiacSignCn extends UDF { 20 | public final static DateTimeFormatter DEFAULT_DATE_FORMATTER = DateTimeFormat.forPattern("yyyy-MM-dd"); 21 | private Text result = new Text(); 22 | 23 | public UDFZodiacSignCn() { 24 | } 25 | 26 | public Text evaluate(Text birthday) { 27 | if (birthday == null) { 28 | return null; 29 | } 30 | DateTime dateTime = null; 31 | try { 32 | dateTime = DateTime.parse(birthday.toString(), DEFAULT_DATE_FORMATTER); 33 | } catch (Exception e) { 34 | return null; 35 | } 36 | 37 | return evaluate(dateTime.toDate()); 38 | } 39 | 40 | public Text evaluate(Date birthday) { 41 | if (birthday == null) { 42 | return null; 43 | } 44 | DateTime dateTime = new DateTime(birthday); 45 | return evaluate(new IntWritable(dateTime.getMonthOfYear()), new IntWritable(dateTime.getDayOfMonth())); 46 | } 47 | 48 | public Text evaluate(IntWritable month, IntWritable day) { 49 | if (month == null || day == null) { 50 | return null; 51 | } 52 | result.set(getZodiac(month.get(), day.get())); 53 | return result; 54 | } 55 | 56 | private String getZodiac(int month, int day) { 57 | String[] zodiacArray = {"魔羯座", "水瓶座", "双鱼座", "白羊座", "金牛座", "双子座", "巨蟹座", "狮子座", "处女座", "天秤座", "天蝎座", "射手座"}; 58 | int[] splitDay = {19, 18, 20, 20, 20, 21, 22, 22, 22, 22, 21, 21}; // 两个星座分割日 59 | int index = month; 60 | // 所查询日期在分割日之前,索引-1,否则不变 61 | if (day <= splitDay[month - 1]) { 62 | index = index - 1; 63 | } else if (month == 12) { 64 | index = 0; 65 | } 66 | // 返回索引指向的星座string 67 | return zodiacArray[index]; 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/test/java/com/github/aaronshan/functions/map/UDFMapEqualsTest.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.map; 2 | 3 | import com.github.aaronshan.functions.utils.MapUtils; 4 
| import com.google.common.collect.ImmutableMap; 5 | import com.google.common.collect.Maps; 6 | import java.util.LinkedHashMap; 7 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredJavaObject; 8 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject; 9 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 10 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; 11 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; 12 | import org.apache.hadoop.io.BooleanWritable; 13 | import org.junit.Test; 14 | 15 | import static org.junit.Assert.*; 16 | 17 | /** 18 | * @author ruifeng.shan 19 | * @date 2016-07-27 20 | * @time 23:42 21 | */ 22 | public class UDFMapEqualsTest { 23 | @Test 24 | public void testMapEquals() throws Exception { 25 | UDFMapEquals udf = new UDFMapEquals(); 26 | ObjectInspector leftMapOI = ObjectInspectorFactory.getStandardMapObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector, PrimitiveObjectInspectorFactory.javaStringObjectInspector); 27 | ObjectInspector rightMapOI = ObjectInspectorFactory.getStandardMapObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector, PrimitiveObjectInspectorFactory.javaStringObjectInspector); 28 | ObjectInspector[] arguments = {leftMapOI, rightMapOI}; 29 | udf.initialize(arguments); 30 | 31 | LinkedHashMap<String, String> leftMap = Maps.newLinkedHashMap(); 32 | leftMap.putAll(ImmutableMap.of("key1", "11", "key2", "12", "key3", "13")); 33 | LinkedHashMap<String, String> rightMap = Maps.newLinkedHashMap(); 34 | rightMap.putAll(ImmutableMap.of("key3", "21", "key4", "22", "key5", "23")); 35 | 36 | DeferredObject leftMapObj = new DeferredJavaObject(leftMap); 37 | DeferredObject rightMapObj = new DeferredJavaObject(rightMap); 38 | DeferredObject[] args = {leftMapObj, rightMapObj}; 39 | BooleanWritable output = (BooleanWritable) udf.evaluate(args); 40 | 41 | assertEquals("map_equals() test", false, output.get()); 42 | 43 | rightMap = Maps.newLinkedHashMap(); 44 | rightMap.putAll(ImmutableMap.of("key1", "11", "key2", "12", "key3", "13")); 45 | rightMapObj = new DeferredJavaObject(rightMap); 46 | DeferredObject[] args1 = {leftMapObj, rightMapObj}; 47 | output = (BooleanWritable) udf.evaluate(args1); 48 | 49 | assertEquals("map_equals() test", true, output.get()); 50 | } 51 | } -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/utils/ArrayUtils.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.utils; 2 | 3 | import com.github.aaronshan.functions.fastuitl.ints.AbstractIntComparator; 4 | import com.github.aaronshan.functions.fastuitl.ints.IntComparator; 5 | import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; 6 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 7 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; 8 | 9 | /** 10 | * @author ruifeng.shan 11 | * date: 2016-07-26 12 | * time: 17:35 13 | */ 14 | public class ArrayUtils { 15 | public static IntComparator IntArrayCompare(final Object array, final ListObjectInspector arrayOI) { 16 | return new AbstractIntComparator() { 17 | @Override 18 | public int compare(int left, int right) { 19 | ObjectInspector arrayElementOI = arrayOI.getListElementObjectInspector(); 20 | Object leftArrayElement = arrayOI.getListElement(array, left); 21 | Object rightArrayElement =
arrayOI.getListElement(array, right); 22 | if (leftArrayElement == null && rightArrayElement == null) { 23 | return 0; 24 | } 25 | if (leftArrayElement == null) { 26 | return -1; 27 | } 28 | if (rightArrayElement == null) { 29 | return 1; 30 | } 31 | int result = ObjectInspectorUtils.compare(leftArrayElement, arrayElementOI, rightArrayElement, arrayElementOI); 32 | 33 | return result; 34 | } 35 | }; 36 | } 37 | 38 | public static boolean arrayEquals(Object left, Object right, ListObjectInspector arrayOI) { 39 | if (left == null || right == null) { 40 | if (left == null && right == null) { 41 | return true; 42 | } 43 | return false; 44 | } 45 | 46 | int leftArrayLength = arrayOI.getListLength(left); 47 | int rightArrayLength = arrayOI.getListLength(right); 48 | 49 | if (leftArrayLength != rightArrayLength) { 50 | return false; 51 | } 52 | 53 | ObjectInspector arrayElementOI = arrayOI.getListElementObjectInspector(); 54 | for (int i = 0; i < leftArrayLength; i++) { 55 | Object leftArrayElement = arrayOI.getListElement(left, i); 56 | Object rightArrayElement = arrayOI.getListElement(right, i); 57 | int compareValue = ObjectInspectorUtils.compare(leftArrayElement, arrayElementOI, rightArrayElement, arrayElementOI); 58 | if (compareValue != 0) { 59 | return false; 60 | } 61 | } 62 | 63 | return true; 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/string/UDFStringHammingDistance.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.string; 2 | 3 | import io.airlift.slice.Slice; 4 | import io.airlift.slice.Slices; 5 | import org.apache.hadoop.hive.ql.exec.Description; 6 | import org.apache.hadoop.hive.ql.exec.UDF; 7 | import org.apache.hadoop.hive.ql.metadata.HiveException; 8 | import org.apache.hadoop.io.LongWritable; 9 | import org.apache.hadoop.io.Text; 10 | 11 | import static com.github.aaronshan.functions.utils.Failures.checkCondition; 12 | import static io.airlift.slice.SliceUtf8.lengthOfCodePoint; 13 | import static io.airlift.slice.SliceUtf8.tryGetCodePointAt; 14 | 15 | /** 16 | * @author ruifeng.shan 17 | * date: 2018-07-26 18 | * time: 23:43 19 | */ 20 | @Description(name = "hamming_distance" 21 | , value = "_FUNC_(string, string) - computes Hamming distance between two strings." 22 | , extended = "Example:\n > select _FUNC_(string, string) from src;") 23 | public class UDFStringHammingDistance extends UDF { 24 | private LongWritable result = new LongWritable(0); 25 | 26 | public UDFStringHammingDistance() { 27 | } 28 | 29 | /** 30 | * hamming distance. 31 | * 32 | * @param leftText left string 33 | * @param rightText right string 34 | * @return hamming distance. 
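* <p>Illustrative example (added note, not from the original source): hamming_distance("abcd", "abed")
* compares the strings code point by code point and finds a single mismatch, so it returns 1.
* Multi-byte UTF-8 input works the same way: "中文" vs "中午" is also distance 1, because each
* position advances by the encoded byte length of its code point.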
35 | * @throws HiveException hive exception 36 | */ 37 | public LongWritable evaluate(Text leftText, Text rightText) throws HiveException { 38 | if (leftText == null || rightText == null) { 39 | return null; // null input yields null rather than a stale result from a previous row 40 | } 41 | 42 | Slice left = Slices.utf8Slice(leftText.toString()); 43 | Slice right = Slices.utf8Slice(rightText.toString()); 44 | int distance = 0; 45 | int leftPosition = 0; 46 | int rightPosition = 0; 47 | while (leftPosition < left.length() && rightPosition < right.length()) { 48 | int codePointLeft = tryGetCodePointAt(left, leftPosition); 49 | int codePointRight = tryGetCodePointAt(right, rightPosition); 50 | 51 | // if both code points are invalid, we do not care if they are equal 52 | // the following code treats them as equal if they happen to be of the same length 53 | if (codePointLeft != codePointRight) { 54 | distance++; 55 | } 56 | 57 | leftPosition += codePointLeft > 0 ? lengthOfCodePoint(codePointLeft) : -codePointLeft; 58 | rightPosition += codePointRight > 0 ? lengthOfCodePoint(codePointRight) : -codePointRight; 59 | } 60 | 61 | checkCondition(leftPosition == left.length() && rightPosition == right.length(), 62 | "The input strings to hamming_distance function must have the same length"); 63 | result.set(distance); 64 | 65 | return result; 66 | } 67 | } -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/utils/ConfigUtils.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.utils; 2 | 3 | import com.github.aaronshan.functions.model.ChinaIdArea; 4 | import com.google.common.base.Strings; 5 | import com.google.common.collect.Lists; 6 | import com.google.common.collect.Maps; 7 | import com.google.common.io.Closer; 8 | import java.io.BufferedReader; 9 | import java.io.IOException; 10 | import java.io.InputStream; 11 | import java.io.InputStreamReader; 12 | import java.util.ArrayList; 13 | import java.util.List; 14 | import java.util.Map; 15 | import org.slf4j.Logger; 16 | import org.slf4j.LoggerFactory; 17 | 18 | /** 19 | * @author ruifeng.shan 20 | * date: 2016-07-07 21 | * time: 16:21 22 | */ 23 | public class ConfigUtils { 24 | private static Logger logger = LoggerFactory.getLogger(ConfigUtils.class); 25 | 26 | public static List loadFile(String fileName) throws IOException { 27 | ArrayList strings = Lists.newArrayList(); 28 | Closer closer = Closer.create(); 29 | try { 30 | InputStream inputStream = ConfigUtils.class.getResourceAsStream(fileName); 31 | BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream)); 32 | closer.register(bufferedReader); 33 | String line; 34 | while ((line = bufferedReader.readLine()) != null) { 35 | if (Strings.isNullOrEmpty(line) || line.startsWith("#")) { 36 | continue; 37 | } 38 | strings.add(line); 39 | } 40 | } catch (IOException e) { 41 | logger.error("loadFile {} error.", fileName, e); // pass the exception as the throwable argument so the stack trace is logged 42 | throw e; 43 | } finally { 44 | closer.close(); 45 | } 46 | 47 | return strings; 48 | } 49 | 50 | public static Map getIdCardMap() { 51 | String fileName = "/china_p_c_a.config"; 52 | Map map = Maps.newHashMap(); 53 | try { 54 | List list = loadFile(fileName); 55 | for (String line : list) { 56 | String[] results = line.split("\t", 4); 57 | map.put(results[0], new ChinaIdArea(results[1], results[2], results[3])); 58 | } 59 | } catch (IOException e) { 60 | logger.error("get china id card map error.", e); 61 | return map; 62 | } 63 | 64 | return map; 65 | } 66 | 67 | public static Map getDayMap() { 68 | String fileName = "/china_day_type.config"; 69 | Map map = Maps.newHashMap(); 70 | try { 71 | List list = loadFile(fileName); 72 | for (String line : list) { 73 | String[] results = line.split("\t", 2); 74 | map.put(results[0], results[1]); 75 | } 76 | } catch (IOException e) { 77 | logger.error("get day map error.", e); 78 | return map; 79 | } 80 | 81 | return map; 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/test/java/com/github/aaronshan/functions/math/UDFMathCosineSimilarityTest.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.math; 2 | 3 | import com.google.common.collect.ImmutableMap; 4 | import com.google.common.collect.Maps; 5 | import org.apache.hadoop.hive.ql.metadata.HiveException; 6 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; 7 | import org.apache.hadoop.hive.serde2.io.DoubleWritable; 8 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 9 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; 10 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; 11 | import org.junit.Test; 12 | 13 | import java.util.LinkedHashMap; 14 | import java.util.Map; 15 | 16 | import static org.junit.Assert.*; 17 | 18 | public class UDFMathCosineSimilarityTest { 19 | 20 | @Test 21 | public void testCosineSimilarity() throws HiveException { 22 | Double result = getResult(ImmutableMap.of("a", 1.0, "b", 2.0), ImmutableMap.of("c", 1.0, "b", 3.0)); 23 | assertEquals(result, 2 * 3 / (Math.sqrt(5) * Math.sqrt(10)), 0.0); 24 | result = getResult(ImmutableMap.of("a", 1.0, "b", 2.0, "c", -1.0), ImmutableMap.of("c", 1.0, "b", 3.0)); 25 | assertEquals(result, (2 * 3 + (-1) * 1) / (Math.sqrt(1 + 4 + 1) * Math.sqrt(1 + 9)), 0.0); 26 | result = getResult(ImmutableMap.of("a", 1.0, "b", 2.0, "c", -1.0), ImmutableMap.of("d", 1.0, "e", 3.0)); 27 | assertEquals(result, 0.0, 0.0); 28 | result = getResult(null, ImmutableMap.of("c", 1.0, "b", 3.0)); 29 | assertEquals(result, null); 30 | LinkedHashMap leftMap = Maps.newLinkedHashMap(); 31 | leftMap.put("a", 1.0); 32 | leftMap.put("b", null); 33 | result = getResult(leftMap, ImmutableMap.of("c", 1.0, "b", 3.0)); 34 | assertEquals(result, null); 35 | } 36 | 37 | public Double getResult(Map leftMap, Map rightMap) throws HiveException { 38 | UDFMathCosineSimilarity udf = new UDFMathCosineSimilarity(); 39 | 40 | ObjectInspector leftMapOI = ObjectInspectorFactory.getStandardMapObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector, PrimitiveObjectInspectorFactory.javaDoubleObjectInspector); 41 | ObjectInspector rightMapOI = ObjectInspectorFactory.getStandardMapObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector, PrimitiveObjectInspectorFactory.javaDoubleObjectInspector); 42 | ObjectInspector[] arguments = {leftMapOI, rightMapOI}; 43 | udf.initialize(arguments); 44 | 45 | GenericUDF.DeferredObject leftMapObj = new GenericUDF.DeferredJavaObject(leftMap); 46 | GenericUDF.DeferredObject rightMapObj = new GenericUDF.DeferredJavaObject(rightMap); 47 | GenericUDF.DeferredObject[] args = {leftMapObj, rightMapObj}; 48 | DoubleWritable output = (DoubleWritable) udf.evaluate(args); 49 | return output == null ?
null : output.get(); 50 | } 51 | } -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/date/UDFTypeOfDay.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.date; 2 | 3 | import com.github.aaronshan.functions.utils.ConfigUtils; 4 | import java.util.Calendar; 5 | import java.util.Map; 6 | import org.apache.hadoop.hive.ql.exec.Description; 7 | import org.apache.hadoop.hive.ql.exec.UDF; 8 | import org.apache.hadoop.hive.serde2.io.TimestampWritable; 9 | import org.apache.hadoop.io.IntWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.joda.time.LocalDate; 12 | import org.joda.time.format.DateTimeFormat; 13 | import org.joda.time.format.DateTimeFormatter; 14 | 15 | /** 16 | * @author ruifeng.shan 17 | * date: 15-9-1 18 | */ 19 | @Description(name = "type_of_day" 20 | , value = "_FUNC_(date) - returns the type of a day in China: 1 for a statutory holiday, 2 for a normal weekend, 3 for a normal workday, 4 for a make-up workday (a weekend or holiday on which work is scheduled); returns null on error." 21 | , extended = "Example:\n > select _FUNC_(date_string) from src;\n > select _FUNC_(date) from src;") 22 | public class UDFTypeOfDay extends UDF { 23 | public final static DateTimeFormatter DEFAULT_DATE_FORMATTER = DateTimeFormat.forPattern("yyyy-MM-dd"); 24 | public final static Map dayMap = ConfigUtils.getDayMap(); 25 | private IntWritable result = new IntWritable(); 26 | 27 | public UDFTypeOfDay() { 28 | 29 | } 30 | 31 | /** 32 | * Get the day type of the given date. 33 | * 34 | * @param dateString the date string in the format of "yyyy-MM-dd". 35 | * @return 1: statutory holiday, 2: normal weekend, 3: normal workday, 4: make-up workday (weekend or holiday on which work is scheduled) 36 | */ 37 | public IntWritable evaluate(Text dateString) { 38 | if (dateString == null) { 39 | return null; 40 | } 41 | 42 | try { 43 | String value = dayMap.get(dateString.toString()); 44 | if (DayType.HOLIDAY.getCode().equalsIgnoreCase(value)) { 45 | result.set(1); 46 | } else if (DayType.WORKDAY.getCode().equalsIgnoreCase(value)) { 47 | result.set(4); 48 | } else { 49 | LocalDate date = LocalDate.parse(dateString.toString(), DEFAULT_DATE_FORMATTER); 50 | if (date.getDayOfWeek() < 6) { 51 | result.set(3); 52 | } else { 53 | result.set(2); 54 | } 55 | } 56 | 57 | return result; 58 | } catch (Exception e) { 59 | return null; 60 | } 61 | } 62 | 63 | public IntWritable evaluate(TimestampWritable t) { 64 | if (t == null) { 65 | return null; 66 | } 67 | 68 | try { 69 | Calendar calendar = Calendar.getInstance(); 70 | calendar.setTime(t.getTimestamp()); 71 | LocalDate date = LocalDate.fromCalendarFields(calendar); 72 | String dateString = date.toString(DEFAULT_DATE_FORMATTER); 73 | 74 | return evaluate(new Text(dateString)); 75 | } catch (Exception e) { 76 | return null; 77 | } 78 | } 79 | 80 | private enum DayType { 81 | HOLIDAY("holiday"), WORKDAY("workday"); 82 | 83 | private String code; 84 | 85 | private DayType(String code) { 86 | this.code = code; 87 | } 88 | 89 | public String getCode() { 90 | return this.code; 91 | } 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpSplit.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.regexp; 2 | 3 | import io.airlift.slice.Slices; 4 | import java.util.ArrayList; 5 | import org.apache.hadoop.hive.ql.exec.Description; 6 | import
org.apache.hadoop.hive.ql.exec.UDFArgumentException; 7 | import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; 8 | import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; 9 | import org.apache.hadoop.hive.ql.metadata.HiveException; 10 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; 11 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 12 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; 13 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; 14 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; 15 | 16 | /** 17 | * @author ruifeng.shan 18 | * date: 2018-07-27 19 | * time: 22:38 20 | */ 21 | @Description(name = "regexp_split" 22 | , value = "_FUNC_(string, string) - returns array of strings split by pattern." 23 | , extended = "Example:\n > select _FUNC_(string, pattern) from src;") 24 | public class UDFRe2JRegexpSplit extends GenericUDF { 25 | private static final int ARG_COUNT = 2; 26 | private transient ArrayList result = new ArrayList(); 27 | private transient Re2JRegexp re2JRegexp; 28 | 29 | public UDFRe2JRegexpSplit() { 30 | 31 | } 32 | 33 | @Override 34 | public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { 35 | // Check if two arguments were passed 36 | if (arguments.length != ARG_COUNT) { 37 | throw new UDFArgumentLengthException( 38 | "The function regexp_split(string, pattern) takes exactly " + ARG_COUNT + " arguments."); 39 | } 40 | 41 | for (int i = 0; i < ARG_COUNT; i++) { 42 | if (!ObjectInspectorUtils.compareTypes(PrimitiveObjectInspectorFactory.javaStringObjectInspector, arguments[i])) { 43 | throw new UDFArgumentTypeException(i, 44 | "\"" + PrimitiveObjectInspectorFactory.javaStringObjectInspector.getTypeName() + "\" " 45 | + "expected at function regexp_split, but " 46 | + "\"" + arguments[i].getTypeName() + "\" " 47 | + "is found"); 48 | } 49 | } 50 | 51 | ObjectInspector expectOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; 52 | 53 | return ObjectInspectorFactory.getStandardListObjectInspector(expectOI); 54 | } 55 | 56 | @Override 57 | public Object evaluate(DeferredObject[] arguments) throws HiveException { 58 | String source = (String) arguments[0].get(); 59 | String pattern = (String) arguments[1].get(); 60 | 61 | if (source == null) { 62 | return null; 63 | } 64 | 65 | if (re2JRegexp == null) { 66 | re2JRegexp = new Re2JRegexp(Integer.MAX_VALUE, 5, Slices.utf8Slice(pattern)); 67 | } 68 | 69 | result.clear(); 70 | result.addAll(re2JRegexp.split(Slices.utf8Slice(source))); 71 | 72 | return result; 73 | } 74 | 75 | @Override 76 | public String getDisplayString(String[] strings) { 77 | assert (strings.length == ARG_COUNT); 78 | return "regexp_split(" + strings[0] + ", " 79 | + strings[1] + ")"; 80 | } 81 | } -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/utils/json/JsonUtils.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.utils.json; 2 | 3 | 4 | import com.fasterxml.jackson.core.JsonFactory; 5 | import com.fasterxml.jackson.core.JsonParser; 6 | import com.fasterxml.jackson.core.JsonToken; 7 | import com.fasterxml.jackson.databind.MappingJsonFactory; 8 | import com.google.common.collect.Lists; 9 | import java.io.IOException; 10 | import java.util.List; 11 | 12 | import static 
com.fasterxml.jackson.core.JsonFactory.Feature.CANONICALIZE_FIELD_NAMES; 13 | import static com.fasterxml.jackson.core.JsonToken.*; 14 | 15 | /** 16 | * @author ruifeng.shan 17 | * date: 2016-07-25 18 | * time: 14:47 19 | */ 20 | public class JsonUtils { 21 | private static final JsonFactory JSON_FACTORY = new JsonFactory() 22 | .disable(CANONICALIZE_FIELD_NAMES); 23 | 24 | private static final JsonFactory MAPPING_JSON_FACTORY = new MappingJsonFactory() 25 | .disable(CANONICALIZE_FIELD_NAMES); 26 | 27 | public static Long jsonArrayLength(String jsonString) { 28 | try { 29 | JsonParser parser = JSON_FACTORY.createParser(jsonString); 30 | if (parser.nextToken() != START_ARRAY) { 31 | return null; 32 | } 33 | long length = 0; 34 | while (true) { 35 | JsonToken token = parser.nextToken(); 36 | if (token == null) { 37 | return null; 38 | } 39 | if (token == END_ARRAY) { 40 | return length; 41 | } 42 | parser.skipChildren(); 43 | 44 | length++; 45 | } 46 | } catch (IOException e) { 47 | return null; 48 | } 49 | } 50 | 51 | public static String jsonArrayGet(String json, long index) { 52 | try { 53 | JsonParser parser = MAPPING_JSON_FACTORY.createParser(json); 54 | 55 | if (parser.nextToken() != START_ARRAY) { 56 | return null; 57 | } 58 | 59 | List tokens = null; 60 | if (index < 0) { 61 | tokens = Lists.newArrayList(); 62 | } 63 | 64 | long count = 0; 65 | while (true) { 66 | JsonToken token = parser.nextToken(); 67 | if (token == null) { 68 | return null; 69 | } 70 | if (token == END_ARRAY) { 71 | if (tokens != null && count >= index * -1) { 72 | return tokens.get(0); 73 | } 74 | 75 | return null; 76 | } 77 | 78 | String arrayElement; 79 | if (token == START_OBJECT || token == START_ARRAY) { 80 | arrayElement = parser.readValueAsTree().toString(); 81 | } else { 82 | arrayElement = parser.getValueAsString(); 83 | } 84 | 85 | if (count == index) { 86 | return arrayElement == null ? null : arrayElement; 87 | } 88 | 89 | if (tokens != null) { 90 | tokens.add(arrayElement); 91 | 92 | if (count >= index * -1) { 93 | tokens.remove(0); 94 | } 95 | } 96 | 97 | count++; 98 | } 99 | } catch (IOException e) { 100 | return null; 101 | } 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/map/UDFMapElementAt.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.map; 2 | 3 | import java.util.Map; 4 | import org.apache.hadoop.hive.ql.exec.Description; 5 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException; 6 | import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; 7 | import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; 8 | import org.apache.hadoop.hive.ql.metadata.HiveException; 9 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; 10 | import org.apache.hadoop.hive.serde.serdeConstants; 11 | import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; 12 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 13 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; 14 | 15 | /** 16 | * @author ruifeng.shan 17 | * date: 2016-07-27 18 | * time: 15:38 19 | */ 20 | @Description(name = "map_element_at" 21 | , value = "_FUNC_(x, key) - returns value for given key, or NULL if the key is not contained in the map." 
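// Illustrative note (added, not in the original source): for a map m built as map_build(array('a','b'), array('1','2')),
// map_element_at(m, 'a') returns '1' and map_element_at(m, 'c') returns NULL, since evaluate() below simply
// delegates to Map.get on the map deserialized by the MapObjectInspector.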
22 | , extended = "Example:\n > select _FUNC_(map, key) from src;") 23 | public class UDFMapElementAt extends GenericUDF { 24 | private static final int ARG_COUNT = 2; // Number of arguments to this UDF 25 | private transient MapObjectInspector mapOI; 26 | private transient ObjectInspector keyOI; 27 | 28 | @Override 29 | public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { 30 | // Check if two arguments were passed 31 | if (arguments.length != ARG_COUNT) { 32 | throw new UDFArgumentLengthException( 33 | "The function map_element_at(map, key) takes exactly " + ARG_COUNT + " arguments."); 34 | } 35 | 36 | // Check if two argument is of category LIST 37 | if (!arguments[0].getCategory().equals(ObjectInspector.Category.MAP)) { 38 | throw new UDFArgumentTypeException(0, 39 | "\"" + serdeConstants.MAP_TYPE_NAME + "\" " 40 | + "expected at function map_element_at, but " 41 | + "\"" + arguments[0].getTypeName() + "\" " 42 | + "is found"); 43 | } 44 | 45 | mapOI = (MapObjectInspector) arguments[0]; 46 | keyOI = arguments[1]; 47 | 48 | ObjectInspector mapKeyOI = mapOI.getMapKeyObjectInspector(); 49 | ObjectInspector mapValueOI = mapOI.getMapValueObjectInspector(); 50 | 51 | // Check if map value type are of same value type 52 | if (!ObjectInspectorUtils.compareTypes(mapKeyOI, keyOI)) { 53 | throw new UDFArgumentTypeException(1, 54 | "\"" + mapKeyOI.getTypeName() + "\"" 55 | + " expected at function map_element_at key, but " 56 | + "\"" + keyOI.getTypeName() + "\"" 57 | + " is found"); 58 | } 59 | 60 | return mapValueOI; 61 | } 62 | 63 | @Override 64 | public Object evaluate(DeferredObject[] arguments) throws HiveException { 65 | Object mapObj = arguments[0].get(); 66 | Object keyObj = arguments[1].get(); 67 | 68 | Map map = mapOI.getMap(mapObj); 69 | if (map == null) { 70 | return null; 71 | } 72 | 73 | return map.get(keyObj); 74 | } 75 | 76 | @Override 77 | public String getDisplayString(String[] strings) { 78 | assert (strings.length == ARG_COUNT); 79 | return "map_element_at(" + strings[0] + ", " 80 | + strings[1] + ")"; 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/fastuitl/ints/IntArrays.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.fastuitl.ints; 2 | 3 | // Note: this code was forked from fastutil (http://fastutil.di.unimi.it/) 4 | // Copyright (C) 2010-2013 Sebastiano Vigna 5 | public class IntArrays { 6 | private static void selectionSort(int[] a, int from, int to, IntComparator comp) { 7 | for(int i = from; i < to - 1; ++i) { 8 | int m = i; 9 | 10 | int u; 11 | for(u = i + 1; u < to; ++u) { 12 | if(comp.compare(a[u], a[m]) < 0) { 13 | m = u; 14 | } 15 | } 16 | 17 | if(m != i) { 18 | u = a[i]; 19 | a[i] = a[m]; 20 | a[m] = u; 21 | } 22 | } 23 | } 24 | 25 | private static void swap(int[] x, int a, int b) { 26 | int t = x[a]; 27 | x[a] = x[b]; 28 | x[b] = t; 29 | } 30 | 31 | private static void vecSwap(int[] x, int a, int b, int n) { 32 | for(int i = 0; i < n; ++b) { 33 | swap(x, a, b); 34 | ++i; 35 | ++a; 36 | } 37 | 38 | } 39 | 40 | private static int med3(int[] x, int a, int b, int c, IntComparator comp) { 41 | int ab = comp.compare(x[a], x[b]); 42 | int ac = comp.compare(x[a], x[c]); 43 | int bc = comp.compare(x[b], x[c]); 44 | return ab < 0?(bc < 0?b:(ac < 0?c:a)):(bc > 0?b:(ac > 0?c:a)); 45 | } 46 | 47 | public static void quickSort(int[] x, int from, int to, 
IntComparator comp) { 48 | int len = to - from; 49 | if(len < 7) { 50 | selectionSort(x, from, to, comp); 51 | } else { 52 | int m = from + len / 2; 53 | int v; 54 | int a; 55 | int b; 56 | if(len > 7) { 57 | v = from; 58 | a = to - 1; 59 | if(len > 50) { 60 | b = len / 8; 61 | v = med3(x, from, from + b, from + 2 * b, comp); 62 | m = med3(x, m - b, m, m + b, comp); 63 | a = med3(x, a - 2 * b, a - b, a, comp); 64 | } 65 | 66 | m = med3(x, v, m, a, comp); 67 | } 68 | 69 | v = x[m]; 70 | a = from; 71 | b = from; 72 | int c = to - 1; 73 | int d = c; 74 | 75 | while(true) { 76 | int s; 77 | while(b > c || (s = comp.compare(x[b], v)) > 0) { 78 | for(; c >= b && (s = comp.compare(x[c], v)) >= 0; --c) { 79 | if(s == 0) { 80 | swap(x, c, d--); 81 | } 82 | } 83 | 84 | if(b > c) { 85 | s = Math.min(a - from, b - a); 86 | vecSwap(x, from, b - s, s); 87 | s = Math.min(d - c, to - d - 1); 88 | vecSwap(x, b, to - s, s); 89 | if((s = b - a) > 1) { 90 | quickSort(x, from, from + s, comp); 91 | } 92 | 93 | if((s = d - c) > 1) { 94 | quickSort(x, to - s, to, comp); 95 | } 96 | 97 | return; 98 | } 99 | 100 | swap(x, b++, c--); 101 | } 102 | 103 | if(s == 0) { 104 | swap(x, a++, b); 105 | } 106 | 107 | ++b; 108 | } 109 | } 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/array/UDFArrayReverse.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.array; 2 | 3 | import java.util.ArrayList; 4 | import org.apache.hadoop.hive.ql.exec.Description; 5 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException; 6 | import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; 7 | import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; 8 | import org.apache.hadoop.hive.ql.metadata.HiveException; 9 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; 10 | import org.apache.hadoop.hive.serde2.objectinspector.*; 11 | 12 | /** 13 | * @author ruifeng.shan 14 | * date: 2016-07-26 15 | * time: 18:03 16 | */ 17 | @Description(name = "array_reverse" 18 | , value = "_FUNC_(array) - reverses the elements of the array."
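// Illustrative note (added, not in the original source): array_reverse(array(1, 2, 3)) yields [3, 2, 1];
// evaluate() below walks the input from the last index down to 0, appending into a reused result list.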
19 | , extended = "Example:\n > select _FUNC_(array) from src;") 20 | public class UDFArrayReverse extends GenericUDF { 21 | private static final int ARG_COUNT = 1; // Number of arguments to this UDF 22 | private transient ListObjectInspector arrayOI; 23 | private transient ObjectInspector arrayElementOI; 24 | 25 | private transient ObjectInspectorConverters.Converter converter; 26 | private transient ArrayList result = new ArrayList(); 27 | 28 | public UDFArrayReverse() { 29 | } 30 | 31 | @Override 32 | public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { 33 | // Check if two arguments were passed 34 | if (arguments.length != ARG_COUNT) { 35 | throw new UDFArgumentLengthException( 36 | "The function array_reverse(array) takes exactly " + ARG_COUNT + " arguments."); 37 | } 38 | 39 | // Check if two argument is of category LIST 40 | if (!arguments[0].getCategory().equals(ObjectInspector.Category.LIST)) { 41 | throw new UDFArgumentTypeException(0, 42 | "\"" + org.apache.hadoop.hive.serde.serdeConstants.LIST_TYPE_NAME + "\" " 43 | + "expected at function array_reverse, but " 44 | + "\"" + arguments[0].getTypeName() + "\" " 45 | + "is found"); 46 | } 47 | 48 | arrayOI = (ListObjectInspector) arguments[0]; 49 | arrayElementOI = arrayOI.getListElementObjectInspector(); 50 | 51 | // Check if the comparison is supported for this type 52 | if (!ObjectInspectorUtils.compareSupported(arrayElementOI)) { 53 | throw new UDFArgumentException("The function array_reverse" 54 | + " does not support comparison for " 55 | + "\"" + arrayElementOI.getTypeName() + "\"" 56 | + " types"); 57 | } 58 | 59 | converter = ObjectInspectorConverters.getConverter(arrayElementOI, arrayElementOI); 60 | 61 | return ObjectInspectorFactory.getStandardListObjectInspector(arrayElementOI); 62 | } 63 | 64 | @Override 65 | public Object evaluate(DeferredObject[] arguments) throws HiveException { 66 | Object array = arguments[0].get(); 67 | int arrayLength = arrayOI.getListLength(array); 68 | 69 | // Check if array is null or empty 70 | if (array == null || arrayLength <= 0) { 71 | return null; 72 | } 73 | 74 | if (arrayLength == 1) { 75 | return array; 76 | } 77 | 78 | result.clear(); 79 | for (int i = arrayLength - 1; i >= 0; i--) { 80 | Object arrayElement = arrayOI.getListElement(array, i); 81 | result.add(arrayElement); 82 | } 83 | return result; 84 | } 85 | 86 | @Override 87 | public String getDisplayString(String[] strings) { 88 | assert (strings.length == ARG_COUNT); 89 | return "array_reverse(" + strings[0] + ")"; 90 | } 91 | } -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/json/UDFJsonArrayExtract.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.json; 2 | 3 | import com.github.aaronshan.functions.utils.json.JsonExtract; 4 | import com.github.aaronshan.functions.utils.json.JsonPath; 5 | import com.github.aaronshan.functions.utils.json.JsonUtils; 6 | import java.util.ArrayList; 7 | import org.apache.hadoop.hive.ql.exec.Description; 8 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException; 9 | import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; 10 | import org.apache.hadoop.hive.ql.metadata.HiveException; 11 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; 12 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 13 | import 
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters; 14 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; 15 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; 16 | import org.apache.hadoop.io.Text; 17 | 18 | /** 19 | * @author ruifeng.shan 20 | * date: 2016-07-25 21 | * time: 16:26 22 | */ 23 | @Description(name = "json_array_extract", value = "_FUNC_(json, json_path) - extract json array by given jsonPath. " 24 | , extended = "Example:\n" 25 | + " > SELECT _FUNC_(json_array, json_path) FROM src LIMIT 1;") 26 | public class UDFJsonArrayExtract extends GenericUDF { 27 | private ObjectInspectorConverters.Converter[] converters; 28 | 29 | public UDFJsonArrayExtract() { 30 | } 31 | 32 | @Override 33 | public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { 34 | if (arguments.length != 2) { 35 | throw new UDFArgumentLengthException( 36 | "The function json_array_extract(json, json_path) takes exactly 2 arguments."); 37 | } 38 | 39 | converters = new ObjectInspectorConverters.Converter[arguments.length]; 40 | for (int i = 0; i < arguments.length; i++) { 41 | converters[i] = ObjectInspectorConverters.getConverter(arguments[i], 42 | PrimitiveObjectInspectorFactory.writableStringObjectInspector); 43 | } 44 | 45 | return ObjectInspectorFactory 46 | .getStandardListObjectInspector(PrimitiveObjectInspectorFactory 47 | .writableStringObjectInspector); 48 | } 49 | 50 | @Override 51 | public Object evaluate(DeferredObject[] arguments) throws HiveException { 52 | assert (arguments.length == 2); 53 | 54 | if (arguments[0].get() == null || arguments[1].get() == null) { 55 | return null; 56 | } 57 | 58 | try { 59 | Text jsonText = (Text) converters[0].convert(arguments[0].get()); 60 | Text pathText = (Text) converters[1].convert(arguments[1].get()); 61 | String json = jsonText.toString(); 62 | 63 | Long length = JsonUtils.jsonArrayLength(json); 64 | if (length == null) { 65 | return null; 66 | } 67 | ArrayList ret = new ArrayList(length.intValue()); 68 | JsonPath jsonPath = new JsonPath(pathText.toString()); 69 | ret.clear(); 70 | for (int i = 0; i < length; i++) { 71 | String content = JsonUtils.jsonArrayGet(json, i); 72 | String result = JsonExtract.extract(content, jsonPath.getObjectExtractor()); 73 | ret.add(new Text(result)); 74 | } 75 | return ret; 76 | } catch (Exception e) { 77 | return null; 78 | } 79 | } 80 | 81 | @Override 82 | public String getDisplayString(String[] strings) { 83 | assert (strings.length == 2); 84 | return "json_array_extract(" + strings[0] + ", " + strings[1] + ")"; 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/map/UDFMapBuild.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.map; 2 | 3 | import java.util.LinkedHashMap; 4 | import org.apache.hadoop.hive.ql.exec.Description; 5 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException; 6 | import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; 7 | import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; 8 | import org.apache.hadoop.hive.ql.metadata.HiveException; 9 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; 10 | import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; 11 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 12 | import 
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; 13 | 14 | /** 15 | * @author ruifeng.shan 16 | * date: 2016-07-27 17 | * time: 15:39 18 | */ 19 | @Description(name = "map_build" 20 | , value = "_FUNC_(array, array) - returns a map created using the given key/value arrays." 21 | , extended = "Example:\n > select _FUNC_(array, array) from src;") 22 | public class UDFMapBuild extends GenericUDF { 23 | private static final int ARG_COUNT = 2; // Number of arguments to this UDF 24 | LinkedHashMap result = new LinkedHashMap(); 25 | private transient ListObjectInspector keyArrayOI; 26 | private transient ListObjectInspector valueArrayOI; 27 | 28 | @Override 29 | public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { 30 | // Check if two arguments were passed 31 | if (arguments.length != ARG_COUNT) { 32 | throw new UDFArgumentLengthException( 33 | "The function map_build(array, array) takes exactly " + ARG_COUNT + " arguments."); 34 | } 35 | 36 | // Check if both arguments are of category LIST 37 | for (int i = 0; i < 2; i++) { 38 | if (!arguments[i].getCategory().equals(ObjectInspector.Category.LIST)) { 39 | throw new UDFArgumentTypeException(i, 40 | "\"" + org.apache.hadoop.hive.serde.serdeConstants.LIST_TYPE_NAME + "\" " 41 | + "expected at function map_build, but " 42 | + "\"" + arguments[i].getTypeName() + "\" " 43 | + "is found"); 44 | } 45 | } 46 | 47 | keyArrayOI = (ListObjectInspector) arguments[0]; 48 | valueArrayOI = (ListObjectInspector) arguments[1]; 49 | 50 | ObjectInspector mapKeyOI = keyArrayOI.getListElementObjectInspector(); 51 | ObjectInspector mapValueOI = valueArrayOI.getListElementObjectInspector(); 52 | 53 | return ObjectInspectorFactory.getStandardMapObjectInspector(mapKeyOI, mapValueOI); 54 | } 55 | 56 | @Override 57 | public Object evaluate(DeferredObject[] arguments) throws HiveException { 58 | Object keyArray = arguments[0].get(); 59 | Object valueArray = arguments[1].get(); 60 | 61 | int keyArrayLength = keyArrayOI.getListLength(keyArray); 62 | int valueArrayLength = valueArrayOI.getListLength(valueArray); 63 | 64 | if (keyArray == null || valueArray == null || keyArrayLength <= 0 || valueArrayLength <= 0) { 65 | return null; 66 | } 67 | 68 | if (keyArrayLength != valueArrayLength) { 69 | throw new HiveException("key array length does not equal value array length!"); 70 | } 71 | 72 | result.clear(); 73 | for (int i = 0; i < keyArrayLength; i++) { 74 | result.put(keyArrayOI.getListElement(keyArray, i), valueArrayOI.getListElement(valueArray, i)); 75 | } 76 | 77 | return result; 78 | } 79 | 80 | @Override 81 | public String getDisplayString(String[] strings) { 82 | assert (strings.length == ARG_COUNT); 83 | return "map_build(" + strings[0] + ", " 84 | + strings[1] + ")"; 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/json/UDFJsonArrayExtractScalar.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.json; 2 | 3 | import com.github.aaronshan.functions.utils.json.JsonExtract; 4 | import com.github.aaronshan.functions.utils.json.JsonPath; 5 | import com.github.aaronshan.functions.utils.json.JsonUtils; 6 | import java.util.ArrayList; 7 | import org.apache.hadoop.hive.ql.exec.Description; 8 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException; 9 | import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; 10 | import
org.apache.hadoop.hive.ql.metadata.HiveException; 11 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; 12 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 13 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters; 14 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; 15 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; 16 | import org.apache.hadoop.io.Text; 17 | 18 | /** 19 | * @author ruifeng.shan 20 | * date: 2016-07-25 21 | * time: 15:33 22 | */ 23 | @Description(name = "json_array_extract_scalar", value = "_FUNC_(json, json_path) - extract json array by given jsonPath. but returns the result value as a string (as opposed to being encoded as JSON)." 24 | , extended = "Example:\n" 25 | + " > SELECT _FUNC_(json_array, json_path) FROM src LIMIT 1;") 26 | public class UDFJsonArrayExtractScalar extends GenericUDF { 27 | private ObjectInspectorConverters.Converter[] converters; 28 | 29 | public UDFJsonArrayExtractScalar() { 30 | } 31 | 32 | @Override 33 | public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { 34 | if (arguments.length != 2) { 35 | throw new UDFArgumentLengthException( 36 | "The function json_array_extract_scalar(json, json_path) takes exactly 2 arguments."); 37 | } 38 | 39 | converters = new ObjectInspectorConverters.Converter[arguments.length]; 40 | for (int i = 0; i < arguments.length; i++) { 41 | converters[i] = ObjectInspectorConverters.getConverter(arguments[i], 42 | PrimitiveObjectInspectorFactory.writableStringObjectInspector); 43 | } 44 | 45 | return ObjectInspectorFactory 46 | .getStandardListObjectInspector(PrimitiveObjectInspectorFactory 47 | .writableStringObjectInspector); 48 | } 49 | 50 | @Override 51 | public Object evaluate(DeferredObject[] arguments) throws HiveException { 52 | assert (arguments.length == 2); 53 | 54 | if (arguments[0].get() == null || arguments[1].get() == null) { 55 | return null; 56 | } 57 | 58 | try { 59 | Text jsonText = (Text) converters[0].convert(arguments[0].get()); 60 | Text pathText = (Text) converters[1].convert(arguments[1].get()); 61 | String json = jsonText.toString(); 62 | 63 | Long length = JsonUtils.jsonArrayLength(json); 64 | if (length == null) { 65 | return null; 66 | } 67 | ArrayList ret = new ArrayList(length.intValue()); 68 | JsonPath jsonPath = new JsonPath(pathText.toString()); 69 | ret.clear(); 70 | for (int i = 0; i < length; i++) { 71 | String content = JsonUtils.jsonArrayGet(json, i); 72 | String result = JsonExtract.extract(content, jsonPath.getScalarExtractor()); 73 | ret.add(new Text(result)); 74 | } 75 | return ret; 76 | } catch (Exception e) { 77 | return null; 78 | } 79 | } 80 | 81 | @Override 82 | public String getDisplayString(String[] strings) { 83 | assert (strings.length == 2); 84 | return "json_array_extract_scalar(" + strings[0] + ", " + strings[1] + ")"; 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/regexp/re2j/Inst.java: -------------------------------------------------------------------------------- 1 | // Copyright 2010 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
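// Illustrative note (added, not in the original source): byteRanges with length 1 encodes an exact byte match,
// otherwise it holds inclusive [lo, hi] pairs. With byteRanges = {0x61, 0x7a} ("a".."z"), matchByte((byte) 'm')
// finds 0x6d inside the single pair and returns true, while matchByte((byte) 'A') fails the lower-bound check.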
4 | 5 | // Original Go source here: 6 | // http://code.google.com/p/go/source/browse/src/pkg/regexp/syntax/prog.go 7 | 8 | package com.github.aaronshan.functions.regexp.re2j; 9 | 10 | /** 11 | * A single instruction in the regular expression virtual machine. 12 | * 13 | * @see http://swtch.com/~rsc/regexp/regexp2.html 14 | */ 15 | class Inst { 16 | 17 | Op op; 18 | int out; // all but MATCH, FAIL 19 | int arg; // ALT, ALT_MATCH, CAPTURE, EMPTY_WIDTH 20 | byte[] byteRanges; // length==1 => exact match. Otherwise a list of [lo,hi] pairs. hi is *inclusive*. 21 | Inst(Op op) { 22 | this.op = op; 23 | } 24 | 25 | // op() returns i.Op but merges all the byte special cases into BYTE 26 | // Beware "op" is a public field. 27 | Op op() { 28 | switch (op) { 29 | case BYTE1: 30 | return Op.BYTE; 31 | default: 32 | return op; 33 | } 34 | } 35 | 36 | // MatchByte returns true if the instruction matches (and consumes) b. 37 | // It should only be called when op == InstByte. 38 | boolean matchByte(byte b) { 39 | // Special case: single-byte slice is from literal string, not byte range. 40 | if (byteRanges.length == 1) { 41 | int b0 = byteRanges[0]; 42 | return b == b0; 43 | } 44 | 45 | // Search through all pairs. 46 | int byteInt = b & 0xff; 47 | for (int j = 0; j < byteRanges.length; j += 2) { 48 | if (byteInt < (byteRanges[j] & 0xff)) { 49 | return false; 50 | } 51 | if (byteInt <= (byteRanges[j + 1] & 0xff)) { 52 | return true; 53 | } 54 | } 55 | 56 | return false; 57 | } 58 | 59 | @Override 60 | public String toString() { 61 | switch (op) { 62 | case ALT: 63 | return "alt -> " + out + ", " + arg; 64 | case ALT_MATCH: 65 | return "altmatch -> " + out + ", " + arg; 66 | case CAPTURE: 67 | return "cap " + arg + " -> " + out; 68 | case EMPTY_WIDTH: 69 | return "empty " + arg + " -> " + out; 70 | case MATCH: 71 | return "match"; 72 | case FAIL: 73 | return "fail"; 74 | case NOP: 75 | return "nop -> " + out; 76 | case BYTE: 77 | return "byte " + appendBytes() + " -> " + out; 78 | case BYTE1: 79 | return "byte1 " + appendBytes() + " -> " + out; 80 | default: 81 | throw new IllegalStateException("unhandled case in Inst.toString"); 82 | } 83 | } 84 | 85 | private String appendBytes() { 86 | StringBuilder out = new StringBuilder(); 87 | if (byteRanges.length == 1) { 88 | out.append(byteRanges[0] & 0xff); 89 | } else { 90 | for (int i = 0; i < byteRanges.length; i += 2) { 91 | out.append("[") 92 | .append(byteRanges[i] & 0xff) 93 | .append(",") 94 | .append(byteRanges[i + 1] & 0xff) 95 | .append("]"); 96 | if (i < byteRanges.length - 2) { 97 | out.append(";"); 98 | } 99 | } 100 | } 101 | return out.toString(); 102 | } 103 | 104 | enum Op { 105 | ALT, 106 | ALT_MATCH, 107 | CAPTURE, 108 | EMPTY_WIDTH, 109 | FAIL, 110 | MATCH, 111 | NOP, 112 | BYTE, 113 | BYTE1 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/string/UDFStringSplitToMap.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.string; 2 | 3 | import com.google.common.base.Splitter; 4 | import java.util.HashMap; 5 | import java.util.Map; 6 | import org.apache.hadoop.hive.ql.exec.Description; 7 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException; 8 | import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; 9 | import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; 10 | import org.apache.hadoop.hive.ql.metadata.HiveException; 11 | import 
org.apache.hadoop.hive.ql.udf.generic.GenericUDF; 12 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 13 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; 14 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; 15 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; 16 | 17 | import static com.github.aaronshan.functions.utils.Failures.checkCondition; 18 | 19 | /** 20 | * @author ruifeng.shan 21 | * date: 2018-07-27 22 | * time: 00:04 23 | */ 24 | @Description(name = "split_to_map" 25 | , value = "_FUNC_(string, entryDelimiter, keyValueDelimiter) - splits the string by entryDelimiter and keyValueDelimiter and returns a map." 26 | , extended = "Example:\n > select _FUNC_('a=123,b=.4,c=,=d', ',', '=') from src;") 27 | public class UDFStringSplitToMap extends GenericUDF { 28 | private static final int ARG_COUNT = 3; // Number of arguments to this UDF 29 | HashMap result = new HashMap(); 30 | 31 | @Override 32 | public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { 33 | // Check if exactly three arguments were passed 34 | if (arguments.length != ARG_COUNT) { 35 | throw new UDFArgumentLengthException( 36 | "The function split_to_map(string, string, string) takes exactly " + ARG_COUNT + " arguments."); 37 | } 38 | 39 | // Check if all three arguments are strings 40 | for (int i = 0; i < 3; i++) { 41 | if (!ObjectInspectorUtils.compareTypes(PrimitiveObjectInspectorFactory.javaStringObjectInspector, arguments[i])) { 42 | throw new UDFArgumentTypeException(i, 43 | "\"" + PrimitiveObjectInspectorFactory.javaStringObjectInspector.getTypeName() + "\" " 44 | + "expected at function split_to_map, but " 45 | + "\"" + arguments[i].getTypeName() + "\" " 46 | + "is found"); 47 | } 48 | } 49 | 50 | ObjectInspector mapKeyOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; 51 | ObjectInspector mapValueOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; 52 | 53 | return ObjectInspectorFactory.getStandardMapObjectInspector(mapKeyOI, mapValueOI); 54 | } 55 | 56 | @Override 57 | public Object evaluate(DeferredObject[] arguments) throws HiveException { 58 | String string = (String) arguments[0].get(); 59 | String entryDelimiter = (String) arguments[1].get(); 60 | String keyValueDelimiter = (String) arguments[2].get(); 61 | 62 | checkCondition(entryDelimiter.length() > 0, "entryDelimiter is empty"); 63 | checkCondition(keyValueDelimiter.length() > 0, "keyValueDelimiter is empty"); 64 | checkCondition(!entryDelimiter.equals(keyValueDelimiter), "entryDelimiter and keyValueDelimiter must not be the same"); 65 | 66 | if (string == null) { 67 | return null; 68 | } 69 | 70 | result.clear(); 71 | Map map = Splitter.on(entryDelimiter).withKeyValueSeparator(keyValueDelimiter).split(string); 72 | result.putAll(map); 73 | 74 | return result; 75 | } 76 | 77 | @Override 78 | public String getDisplayString(String[] strings) { 79 | assert (strings.length == ARG_COUNT); 80 | return "split_to_map(" + strings[0] + ", " 81 | + strings[1] + ", " + strings[2] + ")"; 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/array/UDFArrayMax.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.array; 2 | 3 | import com.github.aaronshan.functions.fastuitl.ints.IntArrays; 4 | import org.apache.hadoop.hive.ql.exec.Description; 5 | import
org.apache.hadoop.hive.ql.exec.UDFArgumentException; 6 | import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; 7 | import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; 8 | import org.apache.hadoop.hive.ql.metadata.HiveException; 9 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; 10 | import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; 11 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 12 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; 13 | 14 | import static com.github.aaronshan.functions.utils.ArrayUtils.IntArrayCompare; 15 | 16 | /** 17 | * @author ruifeng.shan 18 | * date: 2016-07-26 19 | * time: 17:31 20 | */ 21 | @Description(name = "array_max" 22 | , value = "_FUNC_(array) - returns the maximum value of input array." 23 | , extended = "Example:\n > select _FUNC_(array) from src;") 24 | public class UDFArrayMax extends GenericUDF { 25 | private static final int INITIAL_SIZE = 128; 26 | private static final int ARG_COUNT = 1; // Number of arguments to this UDF 27 | private int[] positions = new int[INITIAL_SIZE]; 28 | private transient ListObjectInspector arrayOI; 29 | private transient ObjectInspector arrayElementOI; 30 | 31 | public UDFArrayMax() { 32 | } 33 | 34 | @Override 35 | public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { 36 | // Check if exactly one argument was passed 37 | if (arguments.length != ARG_COUNT) { 38 | throw new UDFArgumentLengthException( 39 | "The function array_max(array) takes exactly " + ARG_COUNT + " arguments."); 40 | } 41 | 42 | // Check if the argument is of category LIST 43 | if (!arguments[0].getCategory().equals(ObjectInspector.Category.LIST)) { 44 | throw new UDFArgumentTypeException(0, 45 | "\"" + org.apache.hadoop.hive.serde.serdeConstants.LIST_TYPE_NAME + "\" " 46 | + "expected at function array_max, but " 47 | + "\"" + arguments[0].getTypeName() + "\" " 48 | + "is found"); 49 | } 50 | 51 | arrayOI = (ListObjectInspector) arguments[0]; 52 | arrayElementOI = arrayOI.getListElementObjectInspector(); 53 | 54 | // Check if the comparison is supported for this type 55 | if (!ObjectInspectorUtils.compareSupported(arrayElementOI)) { 56 | throw new UDFArgumentException("The function array_max" 57 | + " does not support comparison for " 58 | + "\"" + arrayElementOI.getTypeName() + "\"" 59 | + " types"); 60 | } 61 | 62 | return arrayElementOI; 63 | } 64 | 65 | @Override 66 | public Object evaluate(DeferredObject[] arguments) throws HiveException { 67 | Object array = arguments[0].get(); 68 | int arrayLength = arrayOI.getListLength(array); 69 | 70 | // Check if array is null or empty 71 | if (array == null || arrayLength <= 0) { 72 | return null; 73 | } 74 | 75 | if (arrayLength == 1) { 76 | return arrayOI.getListElement(array, 0); // index 0, not positions[0]: positions may hold stale indices from a previous row 77 | } 78 | 79 | if (positions.length < arrayLength) { 80 | positions = new int[arrayLength]; 81 | } 82 | 83 | for (int i = 0; i < arrayLength; i++) { 84 | positions[i] = i; 85 | } 86 | 87 | IntArrays.quickSort(positions, 0, arrayLength, IntArrayCompare(array, arrayOI)); 88 | Object maxArrayElement = arrayOI.getListElement(array, positions[arrayLength - 1]); 89 | return maxArrayElement; 90 | } 91 | 92 | @Override 93 | public String getDisplayString(String[] strings) { 94 | assert (strings.length == ARG_COUNT); 95 | return "array_max(" + strings[0] + ")"; 96 | } 97 | } 98 | --------------------------------------------------------------------------------
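A minimal usage sketch (added note, not part of the repository): driving UDFArrayMax directly, in the same style as the GenericUDF tests elsewhere in this dump. The object-inspector factories and DeferredJavaObject are the same Hive classes those tests import; the integer list is a hypothetical input.

    UDFArrayMax udf = new UDFArrayMax();
    // list<int> object inspector describing the single array argument
    ObjectInspector arrayOI = ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaIntObjectInspector);
    udf.initialize(new ObjectInspector[] {arrayOI});
    // wrap a plain java.util.List as Hive's deferred argument
    GenericUDF.DeferredObject arg = new GenericUDF.DeferredJavaObject(com.google.common.collect.Lists.newArrayList(1, 3, 2));
    Object max = udf.evaluate(new GenericUDF.DeferredObject[] {arg}); // Integer 3: after sorting positions, the last position indexes the maximum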
/src/main/java/com/github/aaronshan/functions/array/UDFArrayMin.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.array; 2 | 3 | import com.github.aaronshan.functions.fastuitl.ints.IntArrays; 4 | import org.apache.hadoop.hive.ql.exec.Description; 5 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException; 6 | import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; 7 | import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; 8 | import org.apache.hadoop.hive.ql.metadata.HiveException; 9 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; 10 | import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; 11 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 12 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; 13 | 14 | import static com.github.aaronshan.functions.utils.ArrayUtils.IntArrayCompare; 15 | 16 | /** 17 | * @author ruifeng.shan 18 | * date: 2016-07-26 19 | * time: 17:32 20 | */ 21 | @Description(name = "array_min" 22 | , value = "_FUNC_(array) - returns the minimum value of input array." 23 | , extended = "Example:\n > select _FUNC_(array) from src;") 24 | public class UDFArrayMin extends GenericUDF { 25 | private static final int INITIAL_SIZE = 128; 26 | private static final int ARG_COUNT = 1; // Number of arguments to this UDF 27 | private int[] positions = new int[INITIAL_SIZE]; 28 | private transient ListObjectInspector arrayOI; 29 | private transient ObjectInspector arrayElementOI; 30 | 31 | public UDFArrayMin() { 32 | } 33 | 34 | @Override 35 | public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { 36 | // Check if exactly one argument was passed 37 | if (arguments.length != ARG_COUNT) { 38 | throw new UDFArgumentLengthException( 39 | "The function array_min(array) takes exactly " + ARG_COUNT + " arguments."); 40 | } 41 | 42 | // Check if the argument is of category LIST 43 | if (!arguments[0].getCategory().equals(ObjectInspector.Category.LIST)) { 44 | throw new UDFArgumentTypeException(0, 45 | "\"" + org.apache.hadoop.hive.serde.serdeConstants.LIST_TYPE_NAME + "\" " 46 | + "expected at function array_min, but " 47 | + "\"" + arguments[0].getTypeName() + "\" " 48 | + "is found"); 49 | } 50 | 51 | arrayOI = (ListObjectInspector) arguments[0]; 52 | arrayElementOI = arrayOI.getListElementObjectInspector(); 53 | 54 | // Check if the comparison is supported for this type 55 | if (!ObjectInspectorUtils.compareSupported(arrayElementOI)) { 56 | throw new UDFArgumentException("The function array_min" 57 | + " does not support comparison for " 58 | + "\"" + arrayElementOI.getTypeName() + "\"" 59 | + " types"); 60 | } 61 | 62 | return arrayElementOI; 63 | } 64 | 65 | @Override 66 | public Object evaluate(GenericUDF.DeferredObject[] arguments) throws HiveException { 67 | Object array = arguments[0].get(); 68 | int arrayLength = arrayOI.getListLength(array); 69 | 70 | // Check if array is null or empty 71 | if (array == null || arrayLength <= 0) { 72 | return null; 73 | } 74 | 75 | if (arrayLength == 1) { 76 | return arrayOI.getListElement(array, 0); // index 0, not positions[0]: positions may hold stale indices from a previous row 77 | } 78 | 79 | if (positions.length < arrayLength) { 80 | positions = new int[arrayLength]; 81 | } 82 | 83 | for (int i = 0; i < arrayLength; i++) { 84 | positions[i] = i; 85 | } 86 | 87 | IntArrays.quickSort(positions, 0, arrayLength, IntArrayCompare(array, arrayOI)); 88 | Object minArrayElement = arrayOI.getListElement(array,
positions[0]); 89 | return minArrayElement; 90 | } 91 | 92 | @Override 93 | public String getDisplayString(String[] strings) { 94 | assert (strings.length == ARG_COUNT); 95 | return "array_min(" + strings[0] + ")"; 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/array/UDFSequence.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.array; 2 | 3 | import com.google.common.collect.Lists; 4 | import org.apache.hadoop.hive.ql.exec.Description; 5 | import org.apache.hadoop.hive.ql.exec.UDF; 6 | import org.apache.hadoop.hive.ql.metadata.HiveException; 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.joda.time.DateTime; 10 | import org.joda.time.format.DateTimeFormat; 11 | import org.joda.time.format.DateTimeFormatter; 12 | 13 | import java.util.List; 14 | 15 | import static com.github.aaronshan.functions.utils.Failures.checkCondition; 16 | /** 17 | * @author aaron02 18 | * date: 2018-08-18 9:23 AM 19 | */ 20 | @Description(name = "sequence" 21 | , value = "_FUNC_(start, stop) - Generate a sequence of integers from start to stop.\n" + 22 | "_FUNC_(start, stop, step) - Generate a sequence of integers from start to stop, incrementing by step." 23 | , extended = "Example:\n > select _FUNC_(1, 5) from src;\n > select _FUNC_(1, 9, 4) from src;\n" + 24 | " > select _FUNC_('2016-04-12 00:00:00', '2016-04-14 00:00:00', 86400000) from src;") 25 | public class UDFSequence extends UDF { 26 | public final static DateTimeFormatter DEFAULT_DATE_FORMATTER = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss"); 27 | private static final long MAX_RESULT_ENTRIES = 10000; 28 | 29 | public UDFSequence() { 30 | 31 | } 32 | 33 | public Object evaluate(LongWritable start, LongWritable stop) throws HiveException { 34 | return fixedWidthSequence(start.get(), stop.get(), stop.get() >= start.get() ?
1 : -1, Long.class); 35 | } 36 | 37 | public Object evaluate(LongWritable start, LongWritable stop, LongWritable step) throws HiveException { 38 | return fixedWidthSequence(start.get(), stop.get(), step.get(), Long.class); 39 | } 40 | 41 | public Object evaluate(Text start, Text stop, long step) throws HiveException { 42 | long startMillis = DateTime.parse(start.toString(), DEFAULT_DATE_FORMATTER).getMillis(); 43 | long stopMillis = DateTime.parse(stop.toString(), DEFAULT_DATE_FORMATTER).getMillis(); 44 | return fixedWidthSequence(startMillis, stopMillis, step, String.class); 45 | } 46 | 47 | public static int toIntExact(long value) { 48 | if ((int)value != value) { 49 | throw new ArithmeticException("integer overflow"); 50 | } 51 | return (int)value; 52 | } 53 | 54 | private static Object fixedWidthSequence(long start, long stop, long step, Class type) throws HiveException { 55 | checkValidStep(start, stop, step); 56 | 57 | int length = toIntExact((stop - start) / step + 1L); 58 | checkMaxEntry(length); 59 | 60 | if (type == long.class || type == Long.class) { 61 | List result = Lists.newArrayList(); 62 | for (long i = 0, value = start; i < length; ++i, value += step) { 63 | result.add(value); 64 | } 65 | return result; 66 | } else if (type == String.class) { 67 | List result = Lists.newArrayList(); 68 | for (long i = 0, value = start; i < length; ++i, value += step) { 69 | DateTime dateTime = new DateTime(value); 70 | result.add(dateTime.toString(DEFAULT_DATE_FORMATTER)); 71 | } 72 | return result; 73 | } else { 74 | throw new HiveException("Unsupported class type: " + type); 75 | } 76 | } 77 | 78 | private static void checkValidStep(long start, long stop, long step) throws HiveException { 79 | checkCondition( 80 | step != 0, 81 | "step must not be zero"); 82 | checkCondition( 83 | step > 0 ?
84 | "sequence stop value must be greater than or equal to the start value if step is greater than zero; otherwise stop must be less than or equal to start");
85 | }
86 | 
87 | private static void checkMaxEntry(int length) throws HiveException {
88 | checkCondition(
89 | length <= MAX_RESULT_ENTRIES,
90 | "result of sequence function must not have more than 10000 entries");
91 | }
92 | }
93 | 
--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/array/UDFArrayEquals.java:
--------------------------------------------------------------------------------
1 | package com.github.aaronshan.functions.array;
2 | 
3 | import com.github.aaronshan.functions.utils.ArrayUtils;
4 | import org.apache.hadoop.hive.ql.exec.Description;
5 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
6 | import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
7 | import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
8 | import org.apache.hadoop.hive.ql.metadata.HiveException;
9 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
10 | import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
11 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
12 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
13 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
14 | import org.apache.hadoop.io.BooleanWritable;
15 | 
16 | /**
17 | * @author ruifeng.shan
18 | * date: 2016-09-08
19 | * time: 16:03
20 | */
21 | @Description(name = "array_equals"
22 | , value = "_FUNC_(array, array) - whether two arrays are equal or not."
23 | , extended = "Example:\n > select _FUNC_(array, array) from src;")
24 | public class UDFArrayEquals extends GenericUDF {
25 | private static final int ARG_COUNT = 2; // Number of arguments to this UDF
26 | private transient ListObjectInspector leftArrayOI;
27 | private transient ListObjectInspector rightArrayOI;
28 | private transient ObjectInspector leftArrayElementOI;
29 | private transient ObjectInspector rightArrayElementOI;
30 | 
31 | private BooleanWritable result;
32 | 
33 | public UDFArrayEquals() {
34 | }
35 | 
36 | @Override
37 | public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
38 | // Check if two arguments were passed
39 | if (arguments.length != ARG_COUNT) {
40 | throw new UDFArgumentLengthException(
41 | "The function array_equals(array, array) takes exactly " + ARG_COUNT + " arguments.");
42 | }
43 | 
44 | // Check if both arguments are of category LIST
45 | for (int i = 0; i < 2; i++) {
46 | if (!arguments[i].getCategory().equals(ObjectInspector.Category.LIST)) {
47 | throw new UDFArgumentTypeException(i,
48 | "\"" + org.apache.hadoop.hive.serde.serdeConstants.LIST_TYPE_NAME + "\" "
49 | + "expected at function array_equals, but "
50 | + "\"" + arguments[i].getTypeName() + "\" "
51 | + "is found");
52 | }
53 | }
54 | 
55 | leftArrayOI = (ListObjectInspector) arguments[0];
56 | rightArrayOI = (ListObjectInspector) arguments[1];
57 | 
58 | leftArrayElementOI = leftArrayOI.getListElementObjectInspector();
59 | rightArrayElementOI = rightArrayOI.getListElementObjectInspector();
60 | 
61 | // Check if the two arrays have the same element type
62 | if (!ObjectInspectorUtils.compareTypes(leftArrayElementOI, rightArrayElementOI)) {
63 | throw new UDFArgumentTypeException(1,
64 | "\"" + leftArrayElementOI.getTypeName() + "\""
65 | + " expected at 
function array_equals, but " 66 | + "\"" + rightArrayElementOI.getTypeName() + "\"" 67 | + " is found"); 68 | } 69 | 70 | // Check if the comparison is supported for this type 71 | if (!ObjectInspectorUtils.compareSupported(leftArrayElementOI)) { 72 | throw new UDFArgumentException("The function array_equals" 73 | + " does not support comparison for " 74 | + "\"" + leftArrayElementOI.getTypeName() + "\"" 75 | + " types"); 76 | } 77 | 78 | result = new BooleanWritable(false); 79 | return PrimitiveObjectInspectorFactory.writableBooleanObjectInspector; 80 | } 81 | 82 | @Override 83 | public Object evaluate(DeferredObject[] arguments) throws HiveException { 84 | Object leftArray = arguments[0].get(); 85 | Object rightArray = arguments[1].get(); 86 | 87 | boolean ret = ArrayUtils.arrayEquals(leftArray, rightArray, leftArrayOI); 88 | result.set(ret); 89 | 90 | return result; 91 | } 92 | 93 | @Override 94 | public String getDisplayString(String[] strings) { 95 | assert (strings.length == ARG_COUNT); 96 | return "array_equals(" + strings[0] + ", " 97 | + strings[1] + ")"; 98 | } 99 | } -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/array/UDFArraySort.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.array; 2 | 3 | import com.github.aaronshan.functions.fastuitl.ints.IntArrays; 4 | import java.util.ArrayList; 5 | 6 | import com.github.aaronshan.functions.utils.ArrayUtils; 7 | import org.apache.hadoop.hive.ql.exec.Description; 8 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException; 9 | import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; 10 | import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; 11 | import org.apache.hadoop.hive.ql.metadata.HiveException; 12 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; 13 | import org.apache.hadoop.hive.serde2.objectinspector.*; 14 | 15 | import static com.github.aaronshan.functions.utils.ArrayUtils.IntArrayCompare; 16 | 17 | /** 18 | * @author ruifeng.shan 19 | * date: 2016-07-26 20 | * time: 17:32 21 | */ 22 | @Description(name = "array_sort" 23 | , value = "_FUNC_(array) - sorts and returns the array. The elements of array must be orderable." 
24 | , extended = "Example:\n > select _FUNC_(array) from src;")
25 | public class UDFArraySort extends GenericUDF {
26 | private static final int INITIAL_SIZE = 128;
27 | private static final int ARG_COUNT = 1; // Number of arguments to this UDF
28 | private int[] positions = new int[INITIAL_SIZE];
29 | private transient ListObjectInspector arrayOI;
30 | private transient ObjectInspector arrayElementOI;
31 | 
32 | private transient ObjectInspectorConverters.Converter converter;
33 | private transient ArrayList<Object> result = new ArrayList<Object>();
34 | 
35 | public UDFArraySort() {
36 | }
37 | 
38 | @Override
39 | public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
40 | // Check that exactly one argument was passed
41 | if (arguments.length != ARG_COUNT) {
42 | throw new UDFArgumentLengthException(
43 | "The function array_sort(array) takes exactly " + ARG_COUNT + " arguments.");
44 | }
45 | 
46 | // Check if the argument is of category LIST
47 | if (!arguments[0].getCategory().equals(ObjectInspector.Category.LIST)) {
48 | throw new UDFArgumentTypeException(0,
49 | "\"" + org.apache.hadoop.hive.serde.serdeConstants.LIST_TYPE_NAME + "\" "
50 | + "expected at function array_sort, but "
51 | + "\"" + arguments[0].getTypeName() + "\" "
52 | + "is found");
53 | }
54 | 
55 | arrayOI = (ListObjectInspector) arguments[0];
56 | arrayElementOI = arrayOI.getListElementObjectInspector();
57 | 
58 | // Check if the comparison is supported for this type
59 | if (!ObjectInspectorUtils.compareSupported(arrayElementOI)) {
60 | throw new UDFArgumentException("The function array_sort"
61 | + " does not support comparison for "
62 | + "\"" + arrayElementOI.getTypeName() + "\""
63 | + " types");
64 | }
65 | 
66 | converter = ObjectInspectorConverters.getConverter(arrayElementOI, arrayElementOI);
67 | 
68 | return ObjectInspectorFactory.getStandardListObjectInspector(arrayElementOI);
69 | }
70 | 
71 | @Override
72 | public Object evaluate(DeferredObject[] arguments) throws HiveException {
73 | Object array = arguments[0].get();
74 | int arrayLength = arrayOI.getListLength(array);
75 | 
76 | // Check if array is null or empty
77 | if (array == null || arrayLength <= 0) {
78 | return null;
79 | }
80 | 
81 | if (arrayLength == 1) {
82 | return array;
83 | }
84 | 
85 | if (positions.length < arrayLength) {
86 | positions = new int[arrayLength];
87 | }
88 | 
89 | for (int i = 0; i < arrayLength; i++) {
90 | positions[i] = i;
91 | }
92 | 
93 | IntArrays.quickSort(positions, 0, arrayLength, ArrayUtils.IntArrayCompare(array, arrayOI));
94 | 
95 | result.clear();
96 | for (int i = 0; i < arrayLength; i++) {
97 | Object arrayElement = arrayOI.getListElement(array, positions[i]);
98 | result.add(arrayElement);
99 | }
100 | return result;
101 | }
102 | 
103 | @Override
104 | public String getDisplayString(String[] strings) {
105 | assert (strings.length == ARG_COUNT);
106 | return "array_sort(" + strings[0] + ")";
107 | }
108 | }
--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/utils/json/JsonPathTokenizer.java:
--------------------------------------------------------------------------------
1 | package com.github.aaronshan.functions.utils.json;
2 | 
3 | import com.google.common.collect.AbstractIterator;
4 | 
5 | import static java.lang.Character.isLetterOrDigit;
6 | import static java.lang.String.format;
7 | 
8 | /**
9 | * @author ruifeng.shan
10 | * date: 2016-07-25
11 | * time: 15:06
12 | */
13 | public class JsonPathTokenizer extends AbstractIterator<String> {
14 | private static final char QUOTE = '\"';
15 | private static final char DOT = '.';
16 | private static final char OPEN_BRACKET = '[';
17 | private static final char CLOSE_BRACKET = ']';
18 | private static final char UNICODE_CARET = '\u2038';
19 | 
20 | private final String path;
21 | private int index;
22 | 
23 | public JsonPathTokenizer(String path) {
24 | if (path == null) {
25 | throw new NullPointerException("path is null");
26 | }
27 | this.path = path;
28 | 
29 | if (path.isEmpty()) {
30 | throw invalidJsonPath();
31 | }
32 | 
33 | // skip the start token
34 | match('$');
35 | }
36 | 
37 | private static boolean isUnquotedPathCharacter(char c) {
38 | return c == ':' || isUnquotedSubscriptCharacter(c);
39 | }
40 | 
41 | private static boolean isUnquotedSubscriptCharacter(char c) {
42 | return c == '_' || isLetterOrDigit(c);
43 | }
44 | 
45 | @Override
46 | protected String computeNext() {
47 | if (!hasNextCharacter()) {
48 | return endOfData();
49 | }
50 | 
51 | if (tryMatch(DOT)) {
52 | return matchPathSegment();
53 | }
54 | 
55 | if (tryMatch(OPEN_BRACKET)) {
56 | String token = tryMatch(QUOTE) ? matchQuotedSubscript() : matchUnquotedSubscript();
57 | 
58 | match(CLOSE_BRACKET);
59 | return token;
60 | }
61 | 
62 | throw invalidJsonPath();
63 | }
64 | 
65 | private String matchPathSegment() {
66 | // seek until we see a special character or whitespace
67 | int start = index;
68 | while (hasNextCharacter() && isUnquotedPathCharacter(peekCharacter())) {
69 | nextCharacter();
70 | }
71 | int end = index;
72 | 
73 | String token = path.substring(start, end);
74 | 
75 | // an empty unquoted token is not allowed
76 | if (token.isEmpty()) {
77 | throw invalidJsonPath();
78 | }
79 | 
80 | return token;
81 | }
82 | 
83 | private String matchUnquotedSubscript() {
84 | // seek until we see a special character or whitespace
85 | int start = index;
86 | while (hasNextCharacter() && isUnquotedSubscriptCharacter(peekCharacter())) {
87 | nextCharacter();
88 | }
89 | int end = index;
90 | 
91 | String token = path.substring(start, end);
92 | 
93 | // an empty unquoted token is not allowed
94 | if (token.isEmpty()) {
95 | throw invalidJsonPath();
96 | }
97 | 
98 | return token;
99 | }
100 | 
101 | private String matchQuotedSubscript() {
102 | // quote has already been matched
103 | 
104 | // seek until we see the close quote
105 | int start = index;
106 | while (hasNextCharacter() && peekCharacter() != QUOTE) {
107 | nextCharacter();
108 | }
109 | int end = index;
110 | 
111 | String token = path.substring(start, end);
112 | 
113 | match(QUOTE);
114 | return token;
115 | }
116 | 
117 | private boolean hasNextCharacter() {
118 | return index < path.length();
119 | }
120 | 
121 | private void match(char expected) {
122 | if (!tryMatch(expected)) {
123 | throw invalidJsonPath();
124 | }
125 | }
126 | 
127 | private boolean tryMatch(char expected) {
128 | if (!hasNextCharacter() || peekCharacter() != expected) { // bounds guard so malformed paths fail as invalid rather than out-of-range
129 | return false;
130 | }
131 | index++;
132 | return true;
133 | }
134 | 
135 | private void nextCharacter() {
136 | index++;
137 | }
138 | 
139 | private char peekCharacter() {
140 | return path.charAt(index);
141 | }
142 | 
143 | private RuntimeException invalidJsonPath() {
144 | return new RuntimeException(format("Invalid JSON path: '%s'", path));
145 | }
146 | 
147 | @Override
148 | public String toString() {
149 | return path.substring(0, index) + UNICODE_CARET + path.substring(index);
150 | }
151 | }
--------------------------------------------------------------------------------
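A minimal usage sketch for the tokenizer above (not part of the source tree; the path literal is invented for illustration). JsonPathTokenizer inherits hasNext() and next() from Guava's AbstractIterator, so it can be driven like any Iterator<String>:

JsonPathTokenizer tokenizer = new JsonPathTokenizer("$.store.book[0][\"author name\"]");
while (tokenizer.hasNext()) {
    System.out.println(tokenizer.next()); // prints: store, book, 0, author name
}

Dotted segments, unquoted subscripts, and quoted subscripts all come back as bare token strings; the '$' root and the bracket and quote punctuation are consumed by the tokenizer itself.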
/src/main/java/com/github/aaronshan/functions/regexp/UDFRe2JRegexpExtractAll.java:
--------------------------------------------------------------------------------
1 | package com.github.aaronshan.functions.regexp;
2 | 
3 | import io.airlift.slice.Slices;
4 | import java.util.ArrayList;
5 | import org.apache.hadoop.hive.ql.exec.Description;
6 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
7 | import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
8 | import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
9 | import org.apache.hadoop.hive.ql.metadata.HiveException;
10 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
11 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
12 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
13 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
14 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
15 | 
16 | /**
17 | * @author ruifeng.shan
18 | * date: 2018-07-27
19 | * time: 22:38
20 | */
21 | @Description(name = "regexp_extract_all"
22 | , value = "_FUNC_(string, string) - string(s) extracted using the given pattern\n" +
23 | "_FUNC_(string, string, long) - group(s) extracted using the given pattern."
24 | , extended = "Example:\n > select _FUNC_(string, pattern) from src;")
25 | public class UDFRe2JRegexpExtractAll extends GenericUDF {
26 | private transient ArrayList<String> result = new ArrayList<String>();
27 | private transient Re2JRegexp re2JRegexp;
28 | 
29 | public UDFRe2JRegexpExtractAll() {
30 | 
31 | }
32 | 
33 | @Override
34 | public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
35 | // Check that 2 or 3 arguments were passed
36 | if (arguments.length != 2 && arguments.length != 3) {
37 | throw new UDFArgumentLengthException(
38 | "The function regexp_extract_all takes 2 or 3 arguments.");
39 | }
40 | 
41 | for (int i = 0; i < 2; i++) {
42 | if (!ObjectInspectorUtils.compareTypes(PrimitiveObjectInspectorFactory.javaStringObjectInspector, arguments[i])) {
43 | throw new UDFArgumentTypeException(i,
44 | "\"" + PrimitiveObjectInspectorFactory.javaStringObjectInspector.getTypeName() + "\" "
45 | + "expected at function regexp_extract_all, but "
46 | + "\"" + arguments[i].getTypeName() + "\" "
47 | + "is found");
48 | }
49 | }
50 | 
51 | if (arguments.length == 3) {
52 | if (!ObjectInspectorUtils.compareTypes(PrimitiveObjectInspectorFactory.javaLongObjectInspector, arguments[2])) {
53 | throw new UDFArgumentTypeException(2,
54 | "\"" + PrimitiveObjectInspectorFactory.javaLongObjectInspector.getTypeName() + "\" "
55 | + "expected at function regexp_extract_all, but "
56 | + "\"" + arguments[2].getTypeName() + "\" "
57 | + "is found");
58 | }
59 | }
60 | 
61 | ObjectInspector expectOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
62 | 
63 | return ObjectInspectorFactory.getStandardListObjectInspector(expectOI);
64 | }
65 | 
66 | @Override
67 | public Object evaluate(DeferredObject[] arguments) throws HiveException {
68 | String source = (String) arguments[0].get();
69 | String pattern = (String) arguments[1].get();
70 | Long groupIndex = 0L;
71 | if (arguments.length == 3) {
72 | groupIndex = (Long) arguments[2].get();
73 | }
74 | 
75 | if (source == null || pattern == null) {
76 | return null;
77 | }
78 | 
79 | if (re2JRegexp == null) {
80 | re2JRegexp = new Re2JRegexp(Integer.MAX_VALUE, 5, Slices.utf8Slice(pattern)); // compiled once from the first row's pattern and reused afterwards
81 | }
82 | 
83 | result.clear();
84 | 
result.addAll(re2JRegexp.extractAll(Slices.utf8Slice(source), groupIndex)); 85 | 86 | return result; 87 | } 88 | 89 | @Override 90 | public String getDisplayString(String[] strings) { 91 | assert (strings.length == 2 || strings.length == 3); 92 | if (strings.length == 2) { 93 | return "regexp_extract_all(" + strings[0] + ", " 94 | + strings[1] + ")"; 95 | } else { 96 | return "regexp_extract_all(" + strings[0] + ", " 97 | + strings[1] + ", " + strings[2] + ")"; 98 | } 99 | } 100 | } -------------------------------------------------------------------------------- /src/main/java/com/github/aaronshan/functions/array/UDFArrayElementAt.java: -------------------------------------------------------------------------------- 1 | package com.github.aaronshan.functions.array; 2 | 3 | import org.apache.hadoop.hive.ql.exec.Description; 4 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException; 5 | import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; 6 | import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; 7 | import org.apache.hadoop.hive.ql.metadata.HiveException; 8 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; 9 | import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; 10 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 11 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; 12 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; 13 | import org.apache.hadoop.io.IntWritable; 14 | 15 | /** 16 | * @author ruifeng.shan 17 | * date: 2016-07-27 18 | * time: 10:09 19 | */ 20 | @Description(name = "array_element_at" 21 | , value = "_FUNC_(array, index) - returns element of array at given index. If index < 0, element_at accesses elements from the last to the first." 
22 | , extended = "Example:\n > select _FUNC_(array, index) from src;")
23 | public class UDFArrayElementAt extends GenericUDF {
24 | 
25 | private static final int ARRAY_IDX = 0;
26 | private static final int INDEX_IDX = 1;
27 | private static final int ARG_COUNT = 2; // Number of arguments to this UDF
28 | private transient ObjectInspector indexOI;
29 | private transient ListObjectInspector arrayOI;
30 | private transient ObjectInspector arrayElementOI;
31 | 
32 | public UDFArrayElementAt() {
33 | 
34 | }
35 | 
36 | @Override
37 | public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
38 | // Check if two arguments were passed
39 | if (arguments.length != ARG_COUNT) {
40 | throw new UDFArgumentLengthException(
41 | "The function array_element_at(array, index) takes exactly " + ARG_COUNT + " arguments.");
42 | }
43 | 
44 | // Check if ARRAY_IDX argument is of category LIST
45 | if (!arguments[ARRAY_IDX].getCategory().equals(ObjectInspector.Category.LIST)) {
46 | throw new UDFArgumentTypeException(ARRAY_IDX,
47 | "\"" + org.apache.hadoop.hive.serde.serdeConstants.LIST_TYPE_NAME + "\" "
48 | + "expected at function array_element_at, but "
49 | + "\"" + arguments[ARRAY_IDX].getTypeName() + "\" "
50 | + "is found");
51 | }
52 | 
53 | arrayOI = (ListObjectInspector) arguments[ARRAY_IDX];
54 | arrayElementOI = arrayOI.getListElementObjectInspector();
55 | indexOI = arguments[INDEX_IDX];
56 | 
57 | ObjectInspector expectOI = PrimitiveObjectInspectorFactory.writableIntObjectInspector;
58 | 
59 | // Check that the index argument is of the expected int type
60 | if (!ObjectInspectorUtils.compareTypes(expectOI, indexOI)) {
61 | throw new UDFArgumentTypeException(INDEX_IDX,
62 | "\"" + expectOI.getTypeName() + "\""
63 | + " expected at function array_element_at, but "
64 | + "\"" + indexOI.getTypeName() + "\""
65 | + " is found");
66 | }
67 | 
68 | return arrayElementOI;
69 | }
70 | 
71 | @Override
72 | public Object evaluate(DeferredObject[] arguments) throws HiveException {
73 | Object array = arguments[ARRAY_IDX].get();
74 | IntWritable index = (IntWritable) arguments[INDEX_IDX].get();
75 | 
76 | int arrayLength = arrayOI.getListLength(array);
77 | 
78 | // Check if array is null or empty or index is null
79 | if (index == null || arrayLength <= 0) {
80 | return null;
81 | }
82 | 
83 | if (index.get() < 0) {
84 | int idx = arrayLength + index.get();
85 | if (idx >= 0) {
86 | return arrayOI.getListElement(array, idx);
87 | }
88 | } else {
89 | if (index.get() < arrayLength) {
90 | return arrayOI.getListElement(array, index.get());
91 | }
92 | }
93 | 
94 | return null;
95 | }
96 | 
97 | @Override
98 | public String getDisplayString(String[] strings) {
99 | assert (strings.length == ARG_COUNT);
100 | return "array_element_at(" + strings[ARRAY_IDX] + ", "
101 | + strings[INDEX_IDX] + ")";
102 | }
103 | }
--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/string/UDFStringLevenshteinDistance.java:
--------------------------------------------------------------------------------
1 | package com.github.aaronshan.functions.string;
2 | 
3 | import io.airlift.slice.Slice;
4 | import io.airlift.slice.Slices;
5 | import org.apache.hadoop.hive.ql.exec.Description;
6 | import org.apache.hadoop.hive.ql.exec.UDF;
7 | import org.apache.hadoop.hive.ql.metadata.HiveException;
8 | import org.apache.hadoop.io.LongWritable;
9 | import org.apache.hadoop.io.Text;
10 | 
11 | import static 
com.github.aaronshan.functions.utils.Failures.checkCondition; 12 | import static io.airlift.slice.SliceUtf8.getCodePointAt; 13 | import static io.airlift.slice.SliceUtf8.lengthOfCodePoint; 14 | import static io.airlift.slice.SliceUtf8.tryGetCodePointAt; 15 | 16 | /** 17 | * @author ruifeng.shan 18 | * date: 2018-07-26 19 | * time: 23:53 20 | */ 21 | @Description(name = "levenshtein_distance" 22 | , value = "_FUNC_(string, string) - computes Levenshtein distance between two strings." 23 | , extended = "Example:\n > select _FUNC_(string, string) from src;") 24 | public class UDFStringLevenshteinDistance extends UDF { 25 | private LongWritable result = new LongWritable(0); 26 | 27 | public UDFStringLevenshteinDistance() { 28 | } 29 | 30 | /** 31 | * Levenshtein distance. 32 | * 33 | * @param leftText left string 34 | * @param rightText right string 35 | * @return Levenshtein distance 36 | * @throws HiveException hive exception 37 | */ 38 | public LongWritable evaluate(Text leftText, Text rightText) throws HiveException { 39 | if (leftText == null || rightText == null) { 40 | return null; 41 | } 42 | 43 | Slice left = Slices.utf8Slice(leftText.toString()); 44 | Slice right = Slices.utf8Slice(rightText.toString()); 45 | int[] leftCodePoints = castToCodePoints(left); 46 | int[] rightCodePoints = castToCodePoints(right); 47 | 48 | if (leftCodePoints.length < rightCodePoints.length) { 49 | int[] tempCodePoints = leftCodePoints; 50 | leftCodePoints = rightCodePoints; 51 | rightCodePoints = tempCodePoints; 52 | } 53 | 54 | if (rightCodePoints.length == 0) { 55 | result.set(leftCodePoints.length); 56 | return result; 57 | } 58 | 59 | checkCondition((leftCodePoints.length * (rightCodePoints.length - 1)) <= 1000000, 60 | "The combined inputs for Levenshtein distance are too large"); 61 | 62 | int[] distances = new int[rightCodePoints.length]; 63 | for (int i = 0; i < rightCodePoints.length; i++) { 64 | distances[i] = i + 1; 65 | } 66 | 67 | for (int i = 0; i < leftCodePoints.length; i++) { 68 | int leftUpDistance = distances[0]; 69 | if (leftCodePoints[i] == rightCodePoints[0]) { 70 | distances[0] = i; 71 | } 72 | else { 73 | distances[0] = Math.min(i, distances[0]) + 1; 74 | } 75 | for (int j = 1; j < rightCodePoints.length; j++) { 76 | int leftUpDistanceNext = distances[j]; 77 | if (leftCodePoints[i] == rightCodePoints[j]) { 78 | distances[j] = leftUpDistance; 79 | } 80 | else { 81 | distances[j] = Math.min(distances[j - 1], Math.min(leftUpDistance, distances[j])) + 1; 82 | } 83 | leftUpDistance = leftUpDistanceNext; 84 | } 85 | } 86 | 87 | result.set(distances[rightCodePoints.length - 1]); 88 | 89 | return result; 90 | } 91 | 92 | private static int[] castToCodePoints(Slice slice) throws HiveException { 93 | int[] codePoints = new int[safeCountCodePoints(slice)]; 94 | int position = 0; 95 | for (int index = 0; index < codePoints.length; index++) { 96 | codePoints[index] = getCodePointAt(slice, position); 97 | position += lengthOfCodePoint(slice, position); 98 | } 99 | return codePoints; 100 | } 101 | 102 | private static int safeCountCodePoints(Slice slice) throws HiveException { 103 | int codePoints = 0; 104 | for (int position = 0; position < slice.length(); ) { 105 | int codePoint = tryGetCodePointAt(slice, position); 106 | if (codePoint < 0) { 107 | throw new HiveException("Invalid UTF-8 encoding in characters: " + slice.toStringUtf8()); 108 | } 109 | position += lengthOfCodePoint(codePoint); 110 | codePoints++; 111 | } 112 | return codePoints; 113 | } 114 | } 
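The loop above is the textbook edit-distance dynamic program compressed into a single row: distances[] holds the current row of the matrix (indexed by the shorter string after the swap), leftUpDistance carries the diagonal cell forward, and the checkCondition guard caps the table at roughly a million cell updates. A minimal call sketch, hypothetical rather than taken from the test suite, assuming a caller that declares throws HiveException:

UDFStringLevenshteinDistance udf = new UDFStringLevenshteinDistance();
LongWritable distance = udf.evaluate(new Text("kitten"), new Text("sitting"));
// distance.get() == 3: substitute k->s, substitute e->i, append g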
--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/array/UDFArrayShuffle.java:
--------------------------------------------------------------------------------
1 | package com.github.aaronshan.functions.array;
2 | 
3 | import org.apache.hadoop.hive.ql.exec.Description;
4 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
5 | import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
6 | import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
7 | import org.apache.hadoop.hive.ql.metadata.HiveException;
8 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
9 | import org.apache.hadoop.hive.serde2.objectinspector.*;
10 | 
11 | import java.util.ArrayList;
12 | import java.util.Random;
13 | 
14 | /**
15 | * @author aaron02
16 | * date: 2018-08-18 8:52 AM
17 | */
18 | @Description(name = "array_shuffle"
19 | , value = "_FUNC_(array) - Generates a random permutation of the given array."
20 | , extended = "Example:\n > select _FUNC_(array) from src;")
21 | public class UDFArrayShuffle extends GenericUDF {
22 | private static final int ARG_COUNT = 1; // Number of arguments to this UDF
23 | private transient ListObjectInspector arrayOI;
24 | private transient ObjectInspector arrayElementOI;
25 | 
26 | private transient ObjectInspectorConverters.Converter converter;
27 | private transient ArrayList<Object> result = new ArrayList<Object>();
28 | 
29 | private static final int INITIAL_LENGTH = 128;
30 | private int[] positions = new int[INITIAL_LENGTH];
31 | 
32 | public UDFArrayShuffle() {
33 | }
34 | 
35 | @Override
36 | public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
37 | // Check that exactly one argument was passed
38 | if (arguments.length != ARG_COUNT) {
39 | throw new UDFArgumentLengthException(
40 | "The function array_shuffle(array) takes exactly " + ARG_COUNT + " arguments.");
41 | }
42 | 
43 | // Check if the argument is of category LIST
44 | if (!arguments[0].getCategory().equals(ObjectInspector.Category.LIST)) {
45 | throw new UDFArgumentTypeException(0,
46 | "\"" + org.apache.hadoop.hive.serde.serdeConstants.LIST_TYPE_NAME + "\" "
47 | + "expected at function array_shuffle, but "
48 | + "\"" + arguments[0].getTypeName() + "\" "
49 | + "is found");
50 | }
51 | 
52 | arrayOI = (ListObjectInspector) arguments[0];
53 | arrayElementOI = arrayOI.getListElementObjectInspector();
54 | 
55 | // Check if the comparison is supported for this type
56 | if (!ObjectInspectorUtils.compareSupported(arrayElementOI)) {
57 | throw new UDFArgumentException("The function array_shuffle"
58 | + " does not support comparison for "
59 | + "\"" + arrayElementOI.getTypeName() + "\""
60 | + " types");
61 | }
62 | 
63 | converter = ObjectInspectorConverters.getConverter(arrayElementOI, arrayElementOI);
64 | 
65 | return ObjectInspectorFactory.getStandardListObjectInspector(arrayElementOI);
66 | }
67 | 
68 | @Override
69 | public Object evaluate(GenericUDF.DeferredObject[] arguments) throws HiveException {
70 | Object array = arguments[0].get();
71 | int arrayLength = arrayOI.getListLength(array);
72 | 
73 | // Check if array is null or empty
74 | if (array == null || arrayLength <= 0) {
75 | return null;
76 | }
77 | 
78 | if (arrayLength == 1) {
79 | return array;
80 | }
81 | 
82 | result.clear();
83 | 
84 | if (positions.length < arrayLength) {
85 | positions = new int[arrayLength];
86 | }
87 | for (int i = 0; i < arrayLength; i++) {
88 | positions[i] = i;
89 | }
90 | 
91 | // Fisher-Yates shuffle
92 | // Randomly swap a pair of positions
93 | Random random = new Random(); // one generator for the whole shuffle rather than a freshly seeded one per swap
94 | for (int i = arrayLength - 1; i > 0; i--) {
95 | int index = random.nextInt(i + 1);
96 | int swap = positions[i];
97 | positions[i] = positions[index];
98 | positions[index] = swap;
99 | }
100 | 
101 | for (int i = 0; i < arrayLength; i++) {
102 | Object arrayElement = arrayOI.getListElement(array, positions[i]);
103 | result.add(arrayElement);
104 | }
105 | return result;
106 | }
107 | 
108 | @Override
109 | public String getDisplayString(String[] strings) {
110 | assert (strings.length == ARG_COUNT);
111 | return "array_shuffle(" + strings[0] + ")";
112 | }
113 | }
--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/string/UDFStringSplitToMultimap.java:
--------------------------------------------------------------------------------
1 | package com.github.aaronshan.functions.string;
2 | 
3 | import com.google.common.base.Splitter;
4 | import com.google.common.collect.ArrayListMultimap;
5 | import com.google.common.collect.Lists;
6 | import com.google.common.collect.Multimap;
7 | import java.util.HashMap;
8 | import java.util.List;
9 | import org.apache.hadoop.hive.ql.exec.Description;
10 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
11 | import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
12 | import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
13 | import org.apache.hadoop.hive.ql.metadata.HiveException;
14 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
15 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
16 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
17 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
18 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
19 | 
20 | import static com.github.aaronshan.functions.utils.Failures.checkCondition;
21 | 
22 | /**
23 | * @author ruifeng.shan
24 | * date: 2018-07-27
25 | * time: 00:04
26 | */
27 | @Description(name = "split_to_multimap"
28 | , value = "_FUNC_(string, string, string) - creates a multimap by splitting a string into key/value pairs."
29 | , extended = "Example:\n > select _FUNC_('a=123,b=.4,c=,=d', ',', '=') from src;")
30 | public class UDFStringSplitToMultimap extends GenericUDF {
31 | private static final int ARG_COUNT = 3; // Number of arguments to this UDF
32 | HashMap<String, List<String>> result = new HashMap<String, List<String>>();
33 | 
34 | @Override
35 | public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
36 | // Check that exactly three arguments were passed
37 | if (arguments.length != ARG_COUNT) {
38 | throw new UDFArgumentLengthException(
39 | "The function split_to_multimap(string, string, string) takes exactly " + ARG_COUNT + " arguments.");
40 | }
41 | 
42 | // Check that each argument is a string
43 | for (int i = 0; i < 3; i++) {
44 | if (!ObjectInspectorUtils.compareTypes(PrimitiveObjectInspectorFactory.javaStringObjectInspector, arguments[i])) {
45 | throw new UDFArgumentTypeException(i,
46 | "\"" + PrimitiveObjectInspectorFactory.javaStringObjectInspector.getTypeName() + "\" "
47 | + "expected at function split_to_multimap, but "
48 | + "\"" + arguments[i].getTypeName() + "\" "
49 | + "is found");
50 | }
51 | }
52 | 
53 | ObjectInspector mapKeyOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
54 | ObjectInspector mapValueOI = ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
55 | 
56 | return ObjectInspectorFactory.getStandardMapObjectInspector(mapKeyOI, mapValueOI);
57 | }
58 | 
59 | @Override
60 | public Object evaluate(DeferredObject[] arguments) throws HiveException {
61 | String string = (String) arguments[0].get();
62 | String entryDelimiter = (String) arguments[1].get();
63 | String keyValueDelimiter = (String) arguments[2].get();
64 | 
65 | if (string == null || entryDelimiter == null || keyValueDelimiter == null) {
66 | return null;
67 | }
68 | 
69 | checkCondition(entryDelimiter.length() > 0, "entryDelimiter is empty");
70 | checkCondition(keyValueDelimiter.length() > 0, "keyValueDelimiter is empty");
71 | checkCondition(!entryDelimiter.equals(keyValueDelimiter), "entryDelimiter and keyValueDelimiter must not be the same");
72 | 
73 | Multimap<String, String> multimap = ArrayListMultimap.create();
74 | 
75 | result.clear();
76 | List<String> list = Splitter.on(entryDelimiter).splitToList(string);
77 | for (String str : list) {
78 | // Splitter treats the delimiter literally and keeps empty fields, so entries like "c=" split into exactly two parts
79 | List<String> fields = Splitter.on(keyValueDelimiter).splitToList(str);
80 | if (fields.size() != 2) {
81 | throw new HiveException("Key-value delimiter must appear exactly once in each entry. Bad input: " + string);
82 | }
83 | multimap.put(fields.get(0), fields.get(1));
84 | }
85 | 
86 | for (String key : multimap.keySet()) {
87 | result.put(key, Lists.newArrayList(multimap.get(key)));
88 | }
89 | 
90 | return result;
91 | }
92 | 
93 | @Override
94 | public String getDisplayString(String[] strings) {
95 | assert (strings.length == ARG_COUNT);
96 | return "split_to_multimap(" + strings[0] + ", "
97 | + strings[1] + ", " + strings[2] + ")";
98 | }
99 | }
--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/regexp/re2j/SliceUtils.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed under the Apache License, Version 2.0 (the "License");
3 | * you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS,
10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | * See the License for the specific language governing permissions and
12 | * limitations under the License.
13 | */
14 | package com.github.aaronshan.functions.regexp.re2j;
15 | 
16 | import io.airlift.slice.Slice;
17 | import io.airlift.slice.SliceOutput;
18 | 
19 | /**
20 | * Utility methods related to {@link Slice} class.
21 | */
22 | final class SliceUtils {
23 | 
24 | private SliceUtils() {
25 | }
26 | 
27 | static void appendReplacement(SliceOutput so, Slice replacement, Matcher matcher) {
28 | int idx = 0;
29 | 
30 | // Handle the following items:
31 | // 1. ${name};
32 | // 2. $0, $1, $123 (group 123, if exists; or group 12, if exists; or group 1);
33 | // 3. \\, \$, \t (literal 't').
34 | // 4. Anything that doesn't start with \ or $ is considered regular bytes
35 | while (idx < replacement.length()) {
36 | byte nextByte = replacement.getByte(idx);
37 | if (nextByte == '$') {
38 | idx++;
39 | if (idx == replacement.length()) {
40 | throw new IllegalArgumentException("Illegal replacement sequence: " + replacement.toStringUtf8());
41 | }
42 | nextByte = replacement.getByte(idx);
43 | int backref;
44 | if (nextByte == '{') { // case 1 in the above comment
45 | idx++;
46 | int startCursor = idx;
47 | while (idx < replacement.length()) {
48 | nextByte = replacement.getByte(idx);
49 | if (nextByte == '}') {
50 | break;
51 | }
52 | idx++;
53 | }
54 | String groupName = replacement.slice(startCursor, idx - startCursor).toStringUtf8();
55 | Integer namedGroupIndex = matcher.pattern().re2().namedGroupIndexes.get(groupName);
56 | if (namedGroupIndex == null) {
57 | throw new IndexOutOfBoundsException("Illegal replacement sequence: unknown group " + groupName);
58 | }
59 | backref = namedGroupIndex;
60 | idx++;
61 | } else { // case 2 in the above comment
62 | backref = nextByte - '0';
63 | if (backref < 0 || backref > 9) {
64 | throw new IllegalArgumentException("Illegal replacement sequence: " + replacement.toStringUtf8());
65 | }
66 | if (matcher.groupCount() < backref) {
67 | throw new IndexOutOfBoundsException("Illegal replacement sequence: unknown group " + backref);
68 | }
69 | idx++;
70 | while (idx < replacement.length()) { // Adaptive group number: find largest group num that is not greater than actual number of groups
71 | int nextDigit = replacement.getByte(idx) - '0';
72 | if (nextDigit < 0 || nextDigit > 9) {
73 | break;
74 | }
75 | int newBackref = (backref * 10) + nextDigit;
76 | if (matcher.groupCount() < newBackref) {
77 | break;
78 | }
79 | backref = newBackref;
80 | idx++;
81 | }
82 | }
83 | Slice group = matcher.group(backref);
84 | if (group != null) {
85 | so.writeBytes(group);
86 | }
87 | } else { // case 3 and 4 in the above comment
88 | if (nextByte == '\\') {
89 | idx++;
90 | if (idx == replacement.length()) {
91 | throw new IllegalArgumentException("Illegal replacement sequence: " + replacement.toStringUtf8());
92 | }
93 | nextByte = replacement.getByte(idx);
94 | }
95 | so.appendByte(nextByte);
96 | idx++;
97 | }
98 | }
99 | }
100 | }
101 | 
--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/map/UDFMapConcat.java:
--------------------------------------------------------------------------------
1 | package com.github.aaronshan.functions.map;
2 | 
3 | import java.util.LinkedHashMap;
4 | import java.util.Map;
5 | import org.apache.hadoop.hive.ql.exec.Description;
6 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
7 | import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
8 | import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
9 | import org.apache.hadoop.hive.ql.metadata.HiveException;
10 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
11 | import org.apache.hadoop.hive.serde.serdeConstants;
12 | import org.apache.hadoop.hive.serde2.objectinspector.*;
13 | 
14 | /**
15 | * @author ruifeng.shan
16 | * date: 2016-07-27
17 | * time: 15:40
18 | */
19 | @Description(name = "map_concat"
20 | , value = "_FUNC_(x, y) - returns the union of two maps. If a key is found in both x and y, that key’s value in the resulting map comes from y."
21 | , extended = "Example:\n > select _FUNC_(mapX, mapY) from src;")
22 | public class UDFMapConcat extends GenericUDF {
23 | private static final int ARG_COUNT = 2; // Number of arguments to this UDF
24 | LinkedHashMap<Object, Object> result = new LinkedHashMap<Object, Object>();
25 | private transient MapObjectInspector leftMapOI;
26 | private transient MapObjectInspector rightMapOI;
27 | 
28 | @Override
29 | public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
30 | // Check if two arguments were passed
31 | if (arguments.length != ARG_COUNT) {
32 | throw new UDFArgumentLengthException(
33 | "The function map_concat(map, map) takes exactly " + ARG_COUNT + " arguments.");
34 | }
35 | 
36 | // Check if both arguments are of category MAP
37 | for (int i = 0; i < 2; i++) {
38 | if (!arguments[i].getCategory().equals(ObjectInspector.Category.MAP)) {
39 | throw new UDFArgumentTypeException(i,
40 | "\"" + serdeConstants.MAP_TYPE_NAME + "\" "
41 | + "expected at function map_concat, but "
42 | + "\"" + arguments[i].getTypeName() + "\" "
43 | + "is found");
44 | }
45 | }
46 | 
47 | leftMapOI = (MapObjectInspector) arguments[0];
48 | rightMapOI = (MapObjectInspector) arguments[1];
49 | 
50 | ObjectInspector leftMapKeyOI = leftMapOI.getMapKeyObjectInspector();
51 | ObjectInspector leftMapValueOI = leftMapOI.getMapValueObjectInspector();
52 | ObjectInspector rightMapKeyOI = rightMapOI.getMapKeyObjectInspector();
53 | ObjectInspector rightMapValueOI = rightMapOI.getMapValueObjectInspector();
54 | 
55 | // Check if the two maps have the same key and value types
56 | if (!ObjectInspectorUtils.compareTypes(leftMapKeyOI, rightMapKeyOI)) {
57 | throw new UDFArgumentTypeException(1,
58 | "\"" + leftMapKeyOI.getTypeName() + "\""
59 | + " expected at function map_concat key, but "
60 | + "\"" + rightMapKeyOI.getTypeName() + "\""
61 | + " is found");
62 | }
63 | 
64 | if (!ObjectInspectorUtils.compareTypes(leftMapValueOI, rightMapValueOI)) {
65 | throw new UDFArgumentTypeException(1,
66 | "\"" + leftMapValueOI.getTypeName() + "\""
67 | + " expected at function map_concat value, but "
68 | + "\"" + rightMapValueOI.getTypeName() + "\""
69 | + " is found");
70 | }
71 | 
72 | return ObjectInspectorFactory.getStandardMapObjectInspector(leftMapKeyOI, leftMapValueOI);
73 | }
74 | 
75 | @Override
76 | public Object evaluate(DeferredObject[] arguments) throws HiveException {
77 | result.clear();
78 | Object leftMapObj = arguments[0].get();
79 | Object rightMapObj = arguments[1].get();
80 | 
81 | Map leftMap = leftMapOI.getMap(leftMapObj);
82 | Map rightMap = rightMapOI.getMap(rightMapObj); // use the right-hand inspector for the right-hand map
83 | 
84 | if (leftMap == null) {
85 | if (rightMap == null) {
86 | return null;
87 | }
88 | return rightMap;
89 | } else {
90 | if (rightMap == null) {
91 | return leftMap;
92 | }
93 | }
94 | 
95 | result.putAll(leftMap);
96 | result.putAll(rightMap);
97 | 
98 | return result;
99 | }
100 | 
101 | @Override
102 | public String getDisplayString(String[] strings) {
103 | assert (strings.length == ARG_COUNT);
104 | return "map_concat(" + strings[0] + ", "
105 | + strings[1] + ")";
106 | }
107 | }
108 | 
--------------------------------------------------------------------------------
/src/main/java/com/github/aaronshan/functions/array/UDFArrayDistinct.java:
--------------------------------------------------------------------------------
1 | package com.github.aaronshan.functions.array;
2 | 
3 | import com.github.aaronshan.functions.fastuitl.ints.IntArrays;
4 | import java.util.ArrayList;
5 | import org.apache.hadoop.hive.ql.exec.Description;
6 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
7 | import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
8 | import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
9 | import org.apache.hadoop.hive.ql.metadata.HiveException;
10 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
11 | import org.apache.hadoop.hive.serde2.objectinspector.*;
12 | 
13 | import static com.github.aaronshan.functions.utils.ArrayUtils.IntArrayCompare;
14 | 
15 | /**
16 | * @author ruifeng.shan
17 | * date: 2016-07-26
18 | * time: 17:29
19 | */
20 | @Description(name = "array_distinct"
21 | , value = "_FUNC_(array) - remove duplicate values from the array."
22 | , extended = "Example:\n > select _FUNC_(array) from src;")
23 | public class UDFArrayDistinct extends GenericUDF {
24 | private static final int INITIAL_SIZE = 128;
25 | private static final int ARG_COUNT = 1; // Number of arguments to this UDF
26 | private int[] positions = new int[INITIAL_SIZE];
27 | private transient ListObjectInspector arrayOI;
28 | private transient ObjectInspector arrayElementOI;
29 | 
30 | private transient ObjectInspectorConverters.Converter converter;
31 | private transient ArrayList<Object> result = new ArrayList<Object>();
32 | 
33 | public UDFArrayDistinct() {
34 | }
35 | 
36 | @Override
37 | public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
38 | // Check that exactly one argument was passed
39 | if (arguments.length != ARG_COUNT) {
40 | throw new UDFArgumentLengthException(
41 | "The function array_distinct(array) takes exactly " + ARG_COUNT + " arguments.");
42 | }
43 | 
44 | // Check if the argument is of category LIST
45 | if (!arguments[0].getCategory().equals(ObjectInspector.Category.LIST)) {
46 | throw new UDFArgumentTypeException(0,
47 | "\"" + org.apache.hadoop.hive.serde.serdeConstants.LIST_TYPE_NAME + "\" "
48 | + "expected at function array_distinct, but "
49 | + "\"" + arguments[0].getTypeName() + "\" "
50 | + "is found");
51 | }
52 | 
53 | arrayOI = (ListObjectInspector) arguments[0];
54 | arrayElementOI = arrayOI.getListElementObjectInspector();
55 | 
56 | // Check if the comparison is supported for this type
57 | if (!ObjectInspectorUtils.compareSupported(arrayElementOI)) {
58 | throw new UDFArgumentException("The function array_distinct"
59 | + " does not support comparison for "
60 | + "\"" + arrayElementOI.getTypeName() + "\""
61 | + " types");
62 | }
63 | 
64 | converter = ObjectInspectorConverters.getConverter(arrayElementOI, arrayElementOI);
65 | 
66 | return 
ObjectInspectorFactory.getStandardListObjectInspector(arrayElementOI); 67 | } 68 | 69 | @Override 70 | public Object evaluate(DeferredObject[] arguments) throws HiveException { 71 | Object array = arguments[0].get(); 72 | int arrayLength = arrayOI.getListLength(array); 73 | 74 | // Check if array is null or empty 75 | if (array == null || arrayLength <= 0) { 76 | return null; 77 | } 78 | 79 | if (arrayLength == 1) { 80 | return array; 81 | } 82 | 83 | if (positions.length < arrayLength) { 84 | positions = new int[arrayLength]; 85 | } 86 | 87 | for (int i = 0; i < arrayLength; i++) { 88 | positions[i] = i; 89 | } 90 | 91 | IntArrays.quickSort(positions, 0, arrayLength, IntArrayCompare(array, arrayOI)); 92 | 93 | result.clear(); 94 | Object lastElement = arrayOI.getListElement(array, positions[0]); 95 | result.add(lastElement); 96 | for (int i = 1; i < arrayLength; i++) { 97 | Object currentElement = arrayOI.getListElement(array, positions[i]); 98 | int compareValue = ObjectInspectorUtils.compare(lastElement, arrayElementOI, currentElement, arrayElementOI); 99 | if (compareValue == 0) { 100 | continue; 101 | } else { 102 | lastElement = currentElement; 103 | result.add(currentElement); 104 | } 105 | } 106 | 107 | return result; 108 | } 109 | 110 | @Override 111 | public String getDisplayString(String[] strings) { 112 | assert (strings.length == ARG_COUNT); 113 | return "array_distinct(" + strings[0] + ")"; 114 | } 115 | } --------------------------------------------------------------------------------
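A hypothetical driver (not in the source tree) showing how a GenericUDF like array_distinct above can be exercised outside Hive, in the style of the project's unit tests. It assumes the standard Hive object-inspector factories, Guava's Lists, and a caller that declares the checked Hive exceptions:

ObjectInspector elementOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
UDFArrayDistinct udf = new UDFArrayDistinct();
udf.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardListObjectInspector(elementOI)});
Object distinct = udf.evaluate(new GenericUDF.DeferredObject[] {
        new GenericUDF.DeferredJavaObject(Lists.newArrayList("b", "a", "b"))});
// distinct is ["a", "b"]: positions are sorted for comparison, so duplicates drop out
// and elements come back in sorted order rather than input order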