├── README.md
├── pom.xml
└── src
    ├── main
    │   └── java
    │       └── com
    │           └── jointhegrid
    │               └── udf
    │                   └── geoip
    │                       └── GenericUDFGeoIP.java
    └── test
        ├── java
        │   └── com
        │       └── jointhegrid
        │           └── udf
        │               └── geoip
        │                   └── GenericUDFGeoIPTest.java
        └── resources
            ├── hive-exec-log4j.properties
            ├── hive-log4j.properties
            └── hive-site.xml

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
hive-geoip
==========

GeoIP functions for Hive.

    add file GeoIP.dat;
    add jar geo-ip-java.jar;
    add jar hive-udf-geo-ip-jtg.jar;
    create temporary function geoip as 'com.jointhegrid.udf.geoip.GenericUDFGeoIP';
    select geoip(first, 'COUNTRY_NAME', './GeoIP.dat') from a;

Here `first` is a string column of IP addresses in table `a`. You need a GeoIP
database, separately licensed and extracted from the archive found here:
http://geolite.maxmind.com/download/geoip/database/GeoLiteCountry/GeoIP.dat.gz

--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.m6d</groupId>
  <artifactId>hive-geoip</artifactId>
  <name>hive-geoip</name>
  <version>1.0.0-SNAPSHOT</version>
  <description>GeoIP in hive</description>
  <packaging>jar</packaging>

  <dependencies>
    <dependency>
      <groupId>org.kohsuke</groupId>
      <artifactId>geoip</artifactId>
      <version>1.2.5</version>
    </dependency>
    <dependency>
      <groupId>com.jointhegrid</groupId>
      <artifactId>hive_test</artifactId>
      <version>4.0.0-SNAPSHOT</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-core</artifactId>
      <version>0.20.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-test</artifactId>
      <version>0.20.2</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.7</version>
      <scope>test</scope>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <plugin>
        <groupId>org.codehaus.mojo</groupId>
        <artifactId>wagon-maven-plugin</artifactId>
        <version>1.0-beta-3</version>
        <configuration>
          <serverId>apache-main</serverId>
          <url>http://www.apache.org/dist/hadoop/common/hadoop-0.20.2</url>
          <fromFile>hadoop-0.20.2.tar.gz</fromFile>
          <toDir>${project.build.directory}/hadoop</toDir>
        </configuration>
        <executions>
          <execution>
            <id>download-hadoop</id>
            <phase>pre-integration-test</phase>
            <goals>
              <goal>download-single</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <groupId>org.codehaus.mojo</groupId>
        <artifactId>exec-maven-plugin</artifactId>
        <version>1.2.1</version>
        <configuration>
          <executable>tar</executable>
          <arguments>
            <argument>-xf</argument>
            <argument>${project.build.directory}/hadoop/hadoop-0.20.2.tar.gz</argument>
            <argument>-C</argument>
            <argument>${project.build.directory}</argument>
          </arguments>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-eclipse-plugin</artifactId>
        <version>2.5.1</version>
        <configuration>
          <projectNameTemplate>[artifactId]</projectNameTemplate>
          <downloadSources>true</downloadSources>
          <downloadJavadocs>true</downloadJavadocs>
          <wtpversion>1.5</wtpversion>
          <buildcommands>
            <buildcommand>org.eclipse.jdt.core.javabuilder</buildcommand>
            <buildcommand>org.maven.ide.eclipse.maven2Builder</buildcommand>
          </buildcommands>
          <projectnatures>
            <projectnature>org.eclipse.jdt.core.javanature</projectnature>
            <projectnature>org.maven.ide.eclipse.maven2Nature</projectnature>
          </projectnatures>
        </configuration>
      </plugin>
      <plugin>
        <artifactId>maven-compiler-plugin</artifactId>
        <configuration>
          <source>1.6</source>
          <target>1.6</target>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>

--------------------------------------------------------------------------------
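Before registering the UDF in Hive, it can be worth sanity-checking the
org.kohsuke:geoip dependency and your extracted GeoIP.dat on their own. A
minimal sketch, assuming GeoIP.dat sits in the working directory; the class
name is hypothetical and not part of this repo:

// Hypothetical smoke test for the MaxMind dependency; assumes ./GeoIP.dat.
import com.maxmind.geoip.Country;
import com.maxmind.geoip.LookupService;
import java.io.File;
import java.io.IOException;

public class GeoIPSmokeTest {
  public static void main(String[] args) throws IOException {
    LookupService ls = new LookupService(new File("GeoIP.dat"),
        LookupService.GEOIP_MEMORY_CACHE);
    Country c = ls.getCountry("209.191.139.200");
    System.out.println(c.getName() + " / " + c.getCode());
    ls.close();
  }
}

If this prints a country name, the same jar and database file will work when
shipped to Hive with "add jar" and "add file".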
/src/main/java/com/jointhegrid/udf/geoip/GenericUDFGeoIP.java:
--------------------------------------------------------------------------------
package com.jointhegrid.udf.geoip;

import com.maxmind.geoip.Country;
import com.maxmind.geoip.Location;
import com.maxmind.geoip.LookupService;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.Set;
import java.util.concurrent.CopyOnWriteArraySet;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;

/**
 * GenericUDFGeoIP is a Hive user-defined function that looks up GeoIP
 * database information for a given IP address.
 *
 * Argument 0 should be an IP address, as a string or as a bigint.
 * Argument 1 should be one of the following property names:
 * COUNTRY_NAME, COUNTRY_CODE, AREA_CODE, CITY, DMA_CODE, LATITUDE,
 * LONGITUDE, METRO_CODE, POSTAL_CODE, REGION, ORG, ID.
 * Argument 2 should be the filename of your GeoIP database.
 *
 * <pre>
 * Usage:
 * add file GeoIP.dat;
 * add jar geo-ip-java.jar;
 * add jar hive-udf-geo-ip-jtg.jar;
 * create temporary function geoip as 'com.jointhegrid.udf.geoip.GenericUDFGeoIP';
 * select geoip(first, 'COUNTRY_NAME', './GeoIP.dat') from a;
 * </pre>
 *
 * @author ecapriolo
 */
@Description(
    name = "geoip",
    value = "_FUNC_(ip, property, database) - loads the database into the "
        + "GeoIP lookup service, then looks up 'property' of ip.")
public class GenericUDFGeoIP extends GenericUDF {

  private String ipString = null;
  private Long ipLong = null;
  private String property;
  private String database;
  private LookupService ls;

  private static final String COUNTRY_NAME = "COUNTRY_NAME";
  private static final String COUNTRY_CODE = "COUNTRY_CODE";
  private static final String AREA_CODE = "AREA_CODE";
  private static final String CITY = "CITY";
  private static final String DMA_CODE = "DMA_CODE";
  private static final String LATITUDE = "LATITUDE";
  private static final String LONGITUDE = "LONGITUDE";
  private static final String METRO_CODE = "METRO_CODE";
  private static final String POSTAL_CODE = "POSTAL_CODE";
  private static final String REGION = "REGION";
  private static final String ORG = "ORG";
  private static final String ID = "ID";

  // Properties answered by LookupService.getCountry().
  private static final Set<String> COUNTRY_PROPERTIES =
      new CopyOnWriteArraySet<String>(Arrays.asList(
          new String[] {COUNTRY_NAME, COUNTRY_CODE}));

  // Properties answered by LookupService.getLocation().
  private static final Set<String> LOCATION_PROPERTIES =
      new CopyOnWriteArraySet<String>(Arrays.asList(
          new String[] {AREA_CODE, CITY, DMA_CODE, LATITUDE, LONGITUDE,
              METRO_CODE, POSTAL_CODE, REGION}));

  PrimitiveObjectInspector[] argumentOIs;

  @Override
  public ObjectInspector initialize(ObjectInspector[] arguments)
      throws UDFArgumentException {

    argumentOIs = new PrimitiveObjectInspector[arguments.length];

    if (arguments.length != 3) {
      throw new UDFArgumentLengthException(
          "The function GenericUDFGeoIP('input', 'resultfield', 'datafile') "
          + "accepts 3 arguments.");
    }

    if (!(arguments[0] instanceof StringObjectInspector)
        && !(arguments[0] instanceof LongObjectInspector)) {
      throw new UDFArgumentTypeException(0,
          "The first parameter of GenericUDFGeoIP('input', 'resultfield', "
          + "'datafile') should be string or bigint.");
    }
    argumentOIs[0] = (PrimitiveObjectInspector) arguments[0];

    for (int i = 1; i < arguments.length; i++) {
      if (!(arguments[i] instanceof StringObjectInspector)) {
        throw new UDFArgumentTypeException(i,
            "The second and third parameters of GenericUDFGeoIP('input', "
            + "'resultfield', 'datafile') should be string.");
      }
      argumentOIs[i] = (StringObjectInspector) arguments[i];
    }
    return PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(
        PrimitiveCategory.STRING);
  }

  @Override
  public Object evaluate(DeferredObject[] arguments) throws HiveException {
    if (argumentOIs[0] instanceof LongObjectInspector) {
      this.ipLong = ((LongObjectInspector) argumentOIs[0]).get(arguments[0].get());
    } else {
      this.ipString = ((StringObjectInspector) argumentOIs[0])
          .getPrimitiveJavaObject(arguments[0].get());
    }
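    // Note: when the first argument is a bigint, LookupService expects
    // MaxMind's numeric IPv4 form, e.g. 209.191.139.200 encodes as
    // (209L << 24) | (191L << 16) | (139L << 8) | 200 = 3518991304L.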
    this.property = ((StringObjectInspector) argumentOIs[1])
        .getPrimitiveJavaObject(arguments[1].get());

    if (this.property != null) {
      this.property = this.property.toUpperCase();
    }

    // Lazily open the lookup service on the first row.
    if (ls == null) {
      if (argumentOIs.length == 3) {
        this.database = ((StringObjectInspector) argumentOIs[2])
            .getPrimitiveJavaObject(arguments[2].get());
        File f = new File(database);
        if (!f.exists()) {
          throw new HiveException(database + " does not exist");
        }
        try {
          ls = new LookupService(f, LookupService.GEOIP_MEMORY_CACHE);
        } catch (IOException ex) {
          throw new HiveException(ex);
        }
      }
      /* // how to do this??? (see the sketch after this file)
      if (argumentOIs.length == 2) {
        URL u = getClass().getClassLoader().getResource("GeoIP.dat");
        try {
          System.out.println("f exists? " + new File(u.getFile()).exists());
          ls = new LookupService(u.getFile());
        } catch (IOException ex) { throw new HiveException(ex); }
      }
      */
    }

    if (COUNTRY_PROPERTIES.contains(this.property)) {
      Country country = ipString != null ? ls.getCountry(ipString) : ls.getCountry(ipLong);
      if (country == null) {
        return null;
      } else if (this.property.equals(COUNTRY_NAME)) {
        return country.getName();
      } else if (this.property.equals(COUNTRY_CODE)) {
        return country.getCode();
      }
      assert (false);
    } else if (LOCATION_PROPERTIES.contains(this.property)) {
      Location loc = ipString != null ? ls.getLocation(ipString) : ls.getLocation(ipLong);
      if (loc == null) {
        return null;
      }
      if (this.property.equals(AREA_CODE)) {
        return loc.area_code + "";
      } else if (this.property.equals(CITY)) {
        return loc.city == null ? null : loc.city + "";
      } else if (this.property.equals(DMA_CODE)) {
        return loc.dma_code + "";
      } else if (this.property.equals(LATITUDE)) {
        return loc.latitude + "";
      } else if (this.property.equals(LONGITUDE)) {
        return loc.longitude + "";
      } else if (this.property.equals(METRO_CODE)) {
        return loc.metro_code + "";
      } else if (this.property.equals(POSTAL_CODE)) {
        return loc.postalCode == null ? null : loc.postalCode + "";
      } else if (this.property.equals(REGION)) {
        return loc.region == null ? null : loc.region + "";
      }
      assert (false);
    } else if (this.property.equals(ORG)) {
      return ipString != null ? ls.getOrg(ipString) : ls.getOrg(ipLong);
    } else if (this.property.equals(ID)) {
      // The declared return type is string, so stringify the int id.
      return ipString != null ? ls.getID(ipString) + "" : ls.getID(ipLong) + "";
    }

    return null;
  }

  @Override
  public String getDisplayString(String[] children) {
    assert (children.length == 3);
    return "GenericUDFGeoIP ( " + children[0] + ", " + children[1] + ", " + children[2] + " )";
  }
}

--------------------------------------------------------------------------------
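The commented-out block near the end of evaluate() asks how GeoIP.dat could be
loaded from the classpath in a two-argument variant. One plausible answer,
sketched here with a hypothetical class name: LookupService needs a real file,
so copy the classpath resource to a temporary file first.

// Sketch only: open a GeoIP database that ships as a classpath resource.
import com.maxmind.geoip.LookupService;
import java.io.*;

public class ClasspathGeoIP {
  public static LookupService fromClasspath(String resource) throws IOException {
    InputStream in = ClasspathGeoIP.class.getClassLoader()
        .getResourceAsStream(resource);
    if (in == null) {
      throw new FileNotFoundException(resource + " not found on classpath");
    }
    // LookupService wants a File, so spool the resource to a temp file.
    File tmp = File.createTempFile("geoip", ".dat");
    tmp.deleteOnExit();
    OutputStream out = new FileOutputStream(tmp);
    try {
      byte[] buf = new byte[8192];
      int n;
      while ((n = in.read(buf)) != -1) {
        out.write(buf, 0, n);
      }
    } finally {
      out.close();
      in.close();
    }
    return new LookupService(tmp, LookupService.GEOIP_MEMORY_CACHE);
  }
}

In the two-argument case, ls could then be initialized with
fromClasspath("GeoIP.dat") instead of a user-supplied path.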
/src/test/java/com/jointhegrid/udf/geoip/GenericUDFGeoIPTest.java:
--------------------------------------------------------------------------------
package com.jointhegrid.udf.geoip;

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;

import com.jointhegrid.hive_test.HiveTestService;

import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import static org.junit.Assert.*;

public class GenericUDFGeoIPTest extends HiveTestService {

  public GenericUDFGeoIPTest() throws IOException {
    super();
  }

  public void testCollect() throws Exception {
    Path p = new Path(this.ROOT_DIR, "rankfile");

    // Two rows: a real IP and a non-IP that should resolve to "N/A".
    FSDataOutputStream o = this.getFileSystem().create(p);
    BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(o));
    bw.write("209.191.139.200\n");
    bw.write("twelve\n");
    bw.close();

    String jarFile;
    jarFile = GenericUDFGeoIP.class.getProtectionDomain().getCodeSource().getLocation().getFile();
    client.execute("add jar " + jarFile);
    jarFile = com.maxmind.geoip.LookupService.class.getProtectionDomain().getCodeSource().getLocation().getFile();
    client.execute("add jar " + jarFile);
    // Download GeoIP.dat to /tmp first, or put it in the test resources.
    client.execute("add file /tmp/GeoIP.dat");

    client.execute("create temporary function geoip as 'com.jointhegrid.udf.geoip.GenericUDFGeoIP'");
    client.execute("create table ips (ip string) row format delimited"
        + " fields terminated by '09' lines terminated by '10'");
    client.execute("load data local inpath '" + p.toString() + "' into table ips");

    client.execute("select geoip(ip, 'COUNTRY_NAME', './GeoIP.dat') FROM ips");
    List<String> expected = Arrays.asList("United States", "N/A");
    assertEquals(expected, client.fetchAll());

    client.execute("drop table ips");
  }
}

--------------------------------------------------------------------------------
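The test above boots the full hive_test harness. For quicker iteration, the
UDF can also be driven straight through the GenericUDF API. A sketch only,
assuming a Hive version that ships GenericUDF.DeferredJavaObject and a
database at /tmp/GeoIP.dat; the driver class is hypothetical:

package com.jointhegrid.udf.geoip;

import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredJavaObject;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

// Hypothetical driver; not part of the repository.
public class GeoIPDriverSketch {
  public static void main(String[] args) throws Exception {
    GenericUDFGeoIP udf = new GenericUDFGeoIP();
    ObjectInspector stringOI =
        PrimitiveObjectInspectorFactory.javaStringObjectInspector;
    // Three string arguments: ip, property, database file.
    udf.initialize(new ObjectInspector[] {stringOI, stringOI, stringOI});
    DeferredObject[] row = new DeferredObject[] {
        new DeferredJavaObject("209.191.139.200"),
        new DeferredJavaObject("COUNTRY_NAME"),
        new DeferredJavaObject("/tmp/GeoIP.dat")};
    System.out.println(udf.evaluate(row)); // expected: United States
  }
}

If DeferredJavaObject is unavailable in the Hive version on the classpath, an
anonymous DeferredObject whose get() returns the value works the same way.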
/src/test/resources/hive-exec-log4j.properties:
--------------------------------------------------------------------------------
# Define some default values that can be overridden by system properties
hive.root.logger=INFO,FA
#hive.root.logger=DEBUG,console
hive.log.dir=/tmp/${user.name}
hive.log.file=${hive.query.id}.log

# Define the root logger from the system property "hive.root.logger".
log4j.rootLogger=${hive.root.logger}, EventCounter

# Logging threshold
log4j.threshold=DEBUG

#
# File Appender
#

log4j.appender.FA=org.apache.log4j.FileAppender
log4j.appender.FA.File=${hive.log.dir}/${hive.log.file}
log4j.appender.FA.layout=org.apache.log4j.PatternLayout

# Pattern format: Date LogLevel LoggerName LogMessage
#log4j.appender.FA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
# Debugging pattern format
log4j.appender.FA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n


#
# console
# Add "console" to the root logger above if you want to use this
#

log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n

# Custom logging levels
#log4j.logger.xxx=DEBUG

#
# Event Counter Appender
# Sends counts of logging messages at different severity levels to Hadoop Metrics.
#
log4j.appender.EventCounter=org.apache.hadoop.metrics.jvm.EventCounter


#log4j.category.DataNucleus=ERROR,FA
#log4j.category.Datastore=ERROR,FA
#log4j.category.Datastore.Schema=ERROR,FA
#log4j.category.JPOX.Datastore=ERROR,FA
#log4j.category.JPOX.Plugin=ERROR,FA
#log4j.category.JPOX.MetaData=ERROR,FA
#log4j.category.JPOX.Query=ERROR,FA
#log4j.category.JPOX.General=ERROR,FA
#log4j.category.JPOX.Enhancer=ERROR,FA

--------------------------------------------------------------------------------
/src/test/resources/hive-log4j.properties:
--------------------------------------------------------------------------------
# Define some default values that can be overridden by system properties
hive.root.logger=WARN,DRFA
#hive.root.logger=DEBUG,console
hive.log.dir=/tmp/${user.name}
hive.log.file=hive.log

# Define the root logger from the system property "hive.root.logger".
log4j.rootLogger=${hive.root.logger}, EventCounter

# Logging threshold
log4j.threshold=WARN

#
# Daily Rolling File Appender
#

log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender
log4j.appender.DRFA.File=${hive.log.dir}/${hive.log.file}

# Roll over at midnight
log4j.appender.DRFA.DatePattern=.yyyy-MM-dd

# 30-day backup
#log4j.appender.DRFA.MaxBackupIndex=30
log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout

# Pattern format: Date LogLevel LoggerName LogMessage
#log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
# Debugging pattern format
log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n


#
# console
# Add "console" to the root logger above if you want to use this
#

log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n

# Custom logging levels
#log4j.logger.xxx=DEBUG

#
# Event Counter Appender
# Sends counts of logging messages at different severity levels to Hadoop Metrics.
#
log4j.appender.EventCounter=org.apache.hadoop.metrics.jvm.EventCounter


log4j.category.DataNucleus=ERROR,DRFA
log4j.category.Datastore=ERROR,DRFA
log4j.category.Datastore.Schema=ERROR,DRFA
log4j.category.JPOX.Datastore=ERROR,DRFA
log4j.category.JPOX.Plugin=ERROR,DRFA
log4j.category.JPOX.MetaData=ERROR,DRFA
log4j.category.JPOX.Query=ERROR,DRFA
log4j.category.JPOX.General=ERROR,DRFA
log4j.category.JPOX.Enhancer=ERROR,DRFA

--------------------------------------------------------------------------------
/src/test/resources/hive-site.xml:
--------------------------------------------------------------------------------
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<configuration>

  <property>
    <name>hive.mapred.reduce.tasks.speculative.execution</name>
    <value>false</value>
    <description>Whether speculative execution for reducers should be turned on.</description>
  </property>

  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:derby:memory:metastore_db;create=true</value>
    <description>JDBC connect string for a JDBC metastore</description>
  </property>

  <property>
    <name>hive.metastore.warehouse.dir</name>
    <value>/tmp/warehouse</value>
    <description>location of default database for the warehouse</description>
  </property>

</configuration>
--------------------------------------------------------------------------------