├── README.md
├── pom.xml
└── src
    ├── main
    │   └── java
    │       └── com
    │           └── jointhegrid
    │               └── udf
    │                   └── geoip
    │                       └── GenericUDFGeoIP.java
    └── test
        ├── java
        │   └── com
        │       └── jointhegrid
        │           └── udf
        │               └── geoip
        │                   └── GenericUDFGeoIPTest.java
        └── resources
            ├── hive-exec-log4j.properties
            ├── hive-log4j.properties
            └── hive-site.xml
/README.md:
--------------------------------------------------------------------------------
1 | hive-geoip
2 | ==========
3 |
4 | GeoIP functions for Hive
5 |
6 | add file GeoIP.dat;
7 | add jar geo-ip-java.jar;
8 | add jar hive-udf-geo-ip-jtg.jar;
9 | create temporary function geoip as 'com.jointhegrid.udf.geoip.GenericUDFGeoIP';
10 | select geoip(first, 'COUNTRY_NAME', './GeoIP.dat' ) from a;
11 |
12 | You need a GeoIP database (separately licensed), extracted from the archive found here:
13 | http://geolite.maxmind.com/download/geoip/database/GeoLiteCountry/GeoIP.dat.gz
14 |
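15 | Example (the IP and expected value are taken from the bundled unit test;
16 | output assumes the GeoLite country database linked above):
17 |
18 |   select geoip('209.191.139.200', 'COUNTRY_NAME', './GeoIP.dat') from a;
19 |   > United States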
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |   <modelVersion>4.0.0</modelVersion>
6 |   <groupId>com.m6d</groupId>
7 |   <artifactId>hive-geoip</artifactId>
8 |   <name>hive-geoip</name>
9 |   <version>1.0.0-SNAPSHOT</version>
10 |   <description>GeoIP in hive</description>
11 |   <packaging>jar</packaging>
12 |
13 |   <dependencies>
14 |     <dependency>
15 |       <groupId>org.kohsuke</groupId>
16 |       <artifactId>geoip</artifactId>
17 |       <version>1.2.5</version>
18 |     </dependency>
19 |     <dependency>
20 |       <groupId>com.jointhegrid</groupId>
21 |       <artifactId>hive_test</artifactId>
22 |       <version>4.0.0-SNAPSHOT</version>
23 |     </dependency>
24 |     <dependency>
25 |       <groupId>org.apache.hadoop</groupId>
26 |       <artifactId>hadoop-core</artifactId>
27 |       <version>0.20.2</version>
28 |     </dependency>
29 |     <dependency>
30 |       <groupId>org.apache.hadoop</groupId>
31 |       <artifactId>hadoop-test</artifactId>
32 |       <version>0.20.2</version>
33 |     </dependency>
34 |     <dependency>
35 |       <groupId>junit</groupId>
36 |       <artifactId>junit</artifactId>
37 |       <version>4.7</version>
38 |       <scope>test</scope>
39 |     </dependency>
40 |   </dependencies>
41 |
42 |   <build>
43 |     <plugins>
44 |       <plugin>
45 |         <groupId>org.codehaus.mojo</groupId>
46 |         <artifactId>wagon-maven-plugin</artifactId>
47 |         <version>1.0-beta-3</version>
48 |         <configuration>
49 |           <serverId>apache-main</serverId>
50 |           <url>http://www.apache.org/dist/hadoop/common/hadoop-0.20.2</url>
51 |           <fromFile>hadoop-0.20.2.tar.gz</fromFile>
52 |           <toDir>${project.build.directory}/hadoop</toDir>
53 |         </configuration>
54 |         <executions>
55 |           <execution>
56 |             <id>download-hadoop</id>
57 |             <phase>pre-integration-test</phase>
58 |             <goals>
59 |               <goal>download-single</goal>
60 |             </goals>
61 |           </execution>
62 |         </executions>
63 |       </plugin>
64 |       <plugin>
65 |         <groupId>org.codehaus.mojo</groupId>
66 |         <artifactId>exec-maven-plugin</artifactId>
67 |         <version>1.2.1</version>
68 |         <configuration>
69 |           <executable>tar</executable>
70 |           <arguments>
71 |             <argument>-xf</argument>
72 |             <argument>${project.build.directory}/hadoop/hadoop-0.20.2.tar.gz</argument>
73 |             <argument>-C</argument>
74 |             <argument>${project.build.directory}</argument>
75 |           </arguments>
76 |         </configuration>
77 |       </plugin>
78 |       <plugin>
79 |         <groupId>org.apache.maven.plugins</groupId>
80 |         <artifactId>maven-eclipse-plugin</artifactId>
81 |         <version>2.5.1</version>
82 |         <configuration>
83 |           <projectNameTemplate>[artifactId]</projectNameTemplate>
84 |           <downloadSources>true</downloadSources>
85 |           <downloadJavadocs>true</downloadJavadocs>
86 |           <wtpversion>1.5</wtpversion>
87 |           <additionalBuildcommands>
88 |             <buildcommand>org.eclipse.jdt.core.javabuilder</buildcommand>
89 |             <buildcommand>org.maven.ide.eclipse.maven2Builder</buildcommand>
90 |           </additionalBuildcommands>
91 |           <additionalProjectnatures>
92 |             <projectnature>org.eclipse.jdt.core.javanature</projectnature>
93 |             <projectnature>org.maven.ide.eclipse.maven2Nature</projectnature>
94 |           </additionalProjectnatures>
95 |         </configuration>
96 |       </plugin>
97 |       <plugin>
98 |         <artifactId>maven-compiler-plugin</artifactId>
99 |         <configuration>
100 |           <source>1.6</source>
101 |           <target>1.6</target>
102 |         </configuration>
103 |       </plugin>
104 |     </plugins>
105 |   </build>
106 | </project>
--------------------------------------------------------------------------------
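Build notes: the wagon-maven-plugin execution above downloads the
hadoop-0.20.2 tarball from apache.org during the pre-integration-test
phase, and exec-maven-plugin is configured to untar it into target/,
evidently so the hive_test harness can run against a local Hadoop
distribution. A typical build (assuming network access for the
download) is:

  mvn clean install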
/src/main/java/com/jointhegrid/udf/geoip/GenericUDFGeoIP.java:
--------------------------------------------------------------------------------
1 | package com.jointhegrid.udf.geoip;
2 |
3 | import com.maxmind.geoip.LookupService;
4 | import com.maxmind.geoip.Location;
5 | import com.maxmind.geoip.Country;
6 | import java.io.File;
7 | import java.io.IOException;
8 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
9 | import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
10 | import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
11 | import org.apache.hadoop.hive.ql.exec.Description;
12 | import org.apache.hadoop.hive.ql.metadata.HiveException;
13 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
14 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
15 | import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
16 | import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
17 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
18 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
19 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
20 | import java.util.concurrent.CopyOnWriteArraySet;
21 | import java.util.Set;
22 | import java.util.Arrays;
23 |
24 | /**
25 |  * GenericUDFGeoIP is a Hive user-defined function that looks up
26 |  * GeoIP database information for a given IP address.
27 |  * argument 0 should be an IP string
28 |  * argument 1 should be one of the following values:
29 |  *   COUNTRY_NAME, COUNTRY_CODE, AREA_CODE, CITY, DMA_CODE, LATITUDE,
30 |  *   LONGITUDE, METRO_CODE, POSTAL_CODE, REGION, ORG, ID
31 |  * argument 2 should be the filename of your GeoIP database
32 | *
33 | *
34 | * Usage:
35 | * add file GeoIP.dat;
36 | * add jar geo-ip-java.jar;
37 | * add jar hive-udf-geo-ip-jtg.jar;
38 |  * create temporary function geoip as 'com.jointhegrid.udf.geoip.GenericUDFGeoIP';
39 | * select geoip(first, 'COUNTRY_NAME', './GeoIP.dat' ) from a;
40 | *
41 | * @author ecapriolo
42 | */
43 |
44 | @Description(
45 | name = "geoip",
46 | value = "_FUNC_(ip,property,database) - loads database into GEO-IP lookup "+
47 | "service, then looks up 'property' of ip. "
48 | )
49 |
50 | public class GenericUDFGeoIP extends GenericUDF {
51 |
52 | private String ipString = null;
53 | private Long ipLong = null;
54 | private String property;
55 | private String database;
56 | private LookupService ls;
57 |
58 | private static final String COUNTRY_NAME = "COUNTRY_NAME";
59 | private static final String COUNTRY_CODE = "COUNTRY_CODE";
60 | private static final String AREA_CODE = "AREA_CODE";
61 | private static final String CITY = "CITY";
62 | private static final String DMA_CODE = "DMA_CODE";
63 | private static final String LATITUDE = "LATITUDE";
64 | private static final String LONGITUDE = "LONGITUDE";
65 | private static final String METRO_CODE = "METRO_CODE";
66 | private static final String POSTAL_CODE = "POSTAL_CODE";
67 | private static final String REGION = "REGION";
68 | private static final String ORG = "ORG";
69 | private static final String ID = "ID";
70 |
71 |   private static final Set<String> COUNTRY_PROPERTIES =
72 |       new CopyOnWriteArraySet<String>(Arrays.asList(
73 |           new String[] {COUNTRY_NAME, COUNTRY_CODE}));
74 |
75 |   private static final Set<String> LOCATION_PROPERTIES =
76 |       new CopyOnWriteArraySet<String>(Arrays.asList(
77 |           new String[] {AREA_CODE, CITY, DMA_CODE, LATITUDE, LONGITUDE, METRO_CODE, POSTAL_CODE, REGION}));
78 |
79 | PrimitiveObjectInspector [] argumentOIs;
80 |
81 | @Override
82 | public ObjectInspector initialize(ObjectInspector[] arguments)
83 | throws UDFArgumentException {
84 |
85 | argumentOIs = new PrimitiveObjectInspector [arguments.length];
86 |
87 | if ( arguments.length != 3) {
88 | throw new UDFArgumentLengthException(
89 | "The function GenericUDFGeoIP( 'input', 'resultfield', 'datafile' ) "
90 | + " accepts 3 arguments.");
91 | }
92 |
93 |     if (!(arguments[0] instanceof StringObjectInspector) && !(arguments[0] instanceof LongObjectInspector)) {
94 |       throw new UDFArgumentTypeException(0,
95 |           "The first parameter of GenericUDFGeoIP('input', 'resultfield', 'datafile')"
96 |           + " should be a string or bigint.");
97 |     }
98 | argumentOIs[0] = (PrimitiveObjectInspector) arguments[0];
99 |
100 | for (int i = 1; i < arguments.length; i++) {
101 | if (!(arguments[i] instanceof StringObjectInspector )) {
102 |         throw new UDFArgumentTypeException(i,
103 |             "The second and third parameters of GenericUDFGeoIP('input', 'resultfield', 'datafile')"
104 |             + " should be string.");
105 | }
106 | argumentOIs[i] = (StringObjectInspector) arguments[i];
107 | }
108 | return PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(
109 | PrimitiveCategory.STRING);
110 | }
111 |
112 | @Override
113 | public Object evaluate(DeferredObject[] arguments) throws HiveException {
114 | if (argumentOIs[0] instanceof LongObjectInspector) {
115 | this.ipLong = ((LongObjectInspector)argumentOIs[0]).get(arguments[0].get());
116 | } else {
117 | this.ipString = ((StringObjectInspector)argumentOIs[0]).getPrimitiveJavaObject(arguments[0].get());
118 | }
119 | this.property = ((StringObjectInspector)argumentOIs[1]).getPrimitiveJavaObject(arguments[1].get());
120 |
121 | if (this.property != null) {
122 | this.property = this.property.toUpperCase();
123 | }
124 |
125 |     if (ls == null) {
126 |       if (argumentOIs.length == 3) {
127 |         this.database = ((StringObjectInspector)argumentOIs[2]).getPrimitiveJavaObject(arguments[2].get());
128 |         File f = new File(database);
129 |         if (!f.exists()) {
130 |           throw new HiveException(database + " does not exist");
131 |         }
132 |         try {
133 |           ls = new LookupService(f, LookupService.GEOIP_MEMORY_CACHE);
134 |         } catch (IOException ex) {
135 |           throw new HiveException(ex);
136 |         }
137 |       }
138 |       /* Unresolved: support a two-argument form that loads GeoIP.dat from
139 |        * the classpath rather than from a file path, e.g.:
140 |        *
141 |        *   URL u = getClass().getClassLoader().getResource("GeoIP.dat");
142 |        *   ls = new LookupService(u.getFile());
143 |        *
144 |        * LookupService(String) throws IOException, which would need to be
145 |        * rethrown as a HiveException as above.
146 |        */
147 |     } // end of lazy LookupService initialization
148 |
149 | if (COUNTRY_PROPERTIES.contains(this.property)) {
150 | Country country = ipString != null ? ls.getCountry(ipString) : ls.getCountry(ipLong);
151 | if (country == null) {
152 | return null;
153 | } else if (this.property.equals(COUNTRY_NAME)) {
154 | return country.getName();
155 | } else if (this.property.equals(COUNTRY_CODE)) {
156 | return country.getCode();
157 | }
158 | assert(false);
159 | } else if (LOCATION_PROPERTIES.contains(this.property)) {
160 | Location loc = ipString != null ? ls.getLocation(ipString) : ls.getLocation(ipLong);
161 | if (loc == null) {
162 | return null;
163 | }
164 | //country
165 | if (this.property.equals(AREA_CODE)) {
166 | return loc.area_code + "";
167 | } else if (this.property.equals(CITY)) {
168 | return loc.city == null ? null : loc.city + "";
169 | } else if (this.property.equals(DMA_CODE)) {
170 | return loc.dma_code + "";
171 | } else if (this.property.equals(LATITUDE)) {
172 | return loc.latitude + "";
173 | } else if (this.property.equals(LONGITUDE)) {
174 | return loc.longitude + "";
175 | } else if (this.property.equals(METRO_CODE)) {
176 | return loc.metro_code + "";
177 | } else if (this.property.equals(POSTAL_CODE)) {
178 | return loc.postalCode == null ? null : loc.postalCode + "";
179 | } else if (this.property.equals(REGION)) {
180 | return loc.region == null ? null : loc.region + "";
181 | }
182 | assert(false);
183 | } else if (this.property.equals(ORG)) {
184 | return ipString != null ? ls.getOrg(ipString) : ls.getOrg(ipLong);
185 | } else if (this.property.equals(ID)) {
186 |       // getID returns an int; stringify it to match the declared string return type.
187 |       return String.valueOf(ipString != null ? ls.getID(ipString) : ls.getID(ipLong));
187 | }
188 |
189 | return null;
190 | }
191 |
192 | @Override
193 | public String getDisplayString(String[] children) {
194 | assert(children.length == 3);
195 | return "GenericUDFGeoIP ( "+children[0]+", "+children[1]+", "+children[2]+" )";
196 | }
197 | }
198 |
--------------------------------------------------------------------------------
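For reference, a minimal standalone sketch of the lookup the UDF performs
per row. The class name GeoIPSketch and the ./GeoIP.dat path are
illustrative; the LookupService calls mirror those in evaluate() above.

  import com.maxmind.geoip.LookupService;
  import java.io.File;
  import java.io.IOException;

  public class GeoIPSketch {
    public static void main(String[] args) throws IOException {
      // Load the database fully into memory, as the UDF does on first call.
      LookupService ls = new LookupService(
          new File("./GeoIP.dat"), LookupService.GEOIP_MEMORY_CACHE);
      // Equivalent of: select geoip(ip, 'COUNTRY_NAME', './GeoIP.dat') ...
      System.out.println(ls.getCountry("209.191.139.200").getName());
      ls.close();
    }
  }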
/src/test/java/com/jointhegrid/udf/geoip/GenericUDFGeoIPTest.java:
--------------------------------------------------------------------------------
1 | package com.jointhegrid.udf.geoip;
2 |
3 | import java.io.OutputStreamWriter;
4 | import java.io.BufferedWriter;
5 | import org.apache.hadoop.fs.FSDataOutputStream;
6 | import org.apache.hadoop.fs.Path;
7 | import java.util.List;
8 | import java.util.Arrays;
9 | import java.io.IOException;
10 | import com.jointhegrid.hive_test.HiveTestService;
11 | import static org.junit.Assert.*;
12 |
13 | public class GenericUDFGeoIPTest extends HiveTestService {
14 |
15 |   public GenericUDFGeoIPTest() throws IOException {
16 |     super();
17 |   }
18 |
19 |   public void testCollect() throws Exception {
20 |     Path p = new Path(this.ROOT_DIR, "rankfile");
21 |
22 |     // One valid IP and one non-IP line, to exercise the lookup and N/A paths.
23 |     FSDataOutputStream o = this.getFileSystem().create(p);
24 |     BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(o));
25 |     bw.write("209.191.139.200\n");
26 |     bw.write("twelve\n");
27 |     bw.close();
28 |
29 |     String jarFile;
30 |     jarFile = GenericUDFGeoIP.class.getProtectionDomain().getCodeSource().getLocation().getFile();
31 |     client.execute("add jar " + jarFile);
32 |     jarFile = com.maxmind.geoip.LookupService.class.getProtectionDomain().getCodeSource().getLocation().getFile();
33 |     client.execute("add jar " + jarFile);
34 |     // Download GeoIP.dat to /tmp first, or put it in test resources (see README).
35 |     client.execute("add file /tmp/GeoIP.dat");
36 |
37 |     client.execute("create temporary function geoip as 'com.jointhegrid.udf.geoip.GenericUDFGeoIP'");
38 |     // Delimiters '09' and '10' are byte values: tab and newline.
39 |     client.execute("create table ips ( ip string) row format delimited fields terminated by '09' lines terminated by '10'");
40 |     client.execute("load data local inpath '" + p.toString() + "' into table ips");
41 |
42 |     client.execute("select geoip(ip, 'COUNTRY_NAME', './GeoIP.dat') FROM ips");
43 |     List<String> expected = Arrays.asList("United States", "N/A");
44 |     assertEquals(expected, client.fetchAll());
45 |
46 |     client.execute("drop table ips");
47 |   }
48 | }
--------------------------------------------------------------------------------
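Note: the test above requires the GeoLite country database at
/tmp/GeoIP.dat; it is separately licensed, so it is not bundled with the
repository. Using the URL from the README:

  wget http://geolite.maxmind.com/download/geoip/database/GeoLiteCountry/GeoIP.dat.gz
  gunzip GeoIP.dat.gz
  mv GeoIP.dat /tmp/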
/src/test/resources/hive-exec-log4j.properties:
--------------------------------------------------------------------------------
1 | # Define some default values that can be overridden by system properties
2 | hive.root.logger=INFO,FA
3 | #hive.root.logger=DEBUG,console
4 | hive.log.dir=/tmp/${user.name}
5 | hive.log.file=${hive.query.id}.log
6 |
7 | # Define the root logger to the system property "hadoop.root.logger".
8 | log4j.rootLogger=${hive.root.logger}, EventCounter
9 |
10 | # Logging Threshold
11 | log4j.threshold=DEBUG
12 |
13 | #
14 | # File Appender
15 | #
16 |
17 | log4j.appender.FA=org.apache.log4j.FileAppender
18 | log4j.appender.FA.File=${hive.log.dir}/${hive.log.file}
19 | log4j.appender.FA.layout=org.apache.log4j.PatternLayout
20 |
21 | # Pattern format: Date LogLevel LoggerName LogMessage
22 | #log4j.appender.FA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
23 | # Debugging Pattern format
24 | log4j.appender.FA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n
25 |
26 |
27 | #
28 | # console
29 | # Add "console" to rootlogger above if you want to use this
30 | #
31 |
32 | log4j.appender.console=org.apache.log4j.ConsoleAppender
33 | log4j.appender.console.target=System.err
34 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
35 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n
36 |
37 | #custom logging levels
38 | #log4j.logger.xxx=DEBUG
39 |
40 | #
41 | # Event Counter Appender
42 | # Sends counts of logging messages at different severity levels to Hadoop Metrics.
43 | #
44 | log4j.appender.EventCounter=org.apache.hadoop.metrics.jvm.EventCounter
45 |
46 |
47 | #log4j.category.DataNucleus=ERROR,FA
48 | #log4j.category.Datastore=ERROR,FA
49 | #log4j.category.Datastore.Schema=ERROR,FA
50 | #log4j.category.JPOX.Datastore=ERROR,FA
51 | #log4j.category.JPOX.Plugin=ERROR,FA
52 | #log4j.category.JPOX.MetaData=ERROR,FA
53 | #log4j.category.JPOX.Query=ERROR,FA
54 | #log4j.category.JPOX.General=ERROR,FA
55 | #log4j.category.JPOX.Enhancer=ERROR,FA
56 |
57 |
--------------------------------------------------------------------------------
/src/test/resources/hive-log4j.properties:
--------------------------------------------------------------------------------
1 | # Define some default values that can be overridden by system properties
2 | hive.root.logger=WARN,DRFA
3 | #hive.root.logger=DEBUG,console
4 | hive.log.dir=/tmp/${user.name}
5 | hive.log.file=hive.log
6 |
7 | # Define the root logger to the system property "hadoop.root.logger".
8 | log4j.rootLogger=${hive.root.logger}, EventCounter
9 |
10 | # Logging Threshold
11 | log4j.threshold=WARN
12 |
13 | #
14 | # Daily Rolling File Appender
15 | #
16 |
17 | log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender
18 | log4j.appender.DRFA.File=${hive.log.dir}/${hive.log.file}
19 |
20 | # Rollover at midnight
21 | log4j.appender.DRFA.DatePattern=.yyyy-MM-dd
22 |
23 | # 30-day backup
24 | #log4j.appender.DRFA.MaxBackupIndex=30
25 | log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout
26 |
27 | # Pattern format: Date LogLevel LoggerName LogMessage
28 | #log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
29 | # Debugging Pattern format
30 | log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n
31 |
32 |
33 | #
34 | # console
35 | # Add "console" to rootlogger above if you want to use this
36 | #
37 |
38 | log4j.appender.console=org.apache.log4j.ConsoleAppender
39 | log4j.appender.console.target=System.err
40 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
41 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n
42 |
43 | #custom logging levels
44 | #log4j.logger.xxx=DEBUG
45 |
46 | #
47 | # Event Counter Appender
48 | # Sends counts of logging messages at different severity levels to Hadoop Metrics.
49 | #
50 | log4j.appender.EventCounter=org.apache.hadoop.metrics.jvm.EventCounter
51 |
52 |
53 | log4j.category.DataNucleus=ERROR,DRFA
54 | log4j.category.Datastore=ERROR,DRFA
55 | log4j.category.Datastore.Schema=ERROR,DRFA
56 | log4j.category.JPOX.Datastore=ERROR,DRFA
57 | log4j.category.JPOX.Plugin=ERROR,DRFA
58 | log4j.category.JPOX.MetaData=ERROR,DRFA
59 | log4j.category.JPOX.Query=ERROR,DRFA
60 | log4j.category.JPOX.General=ERROR,DRFA
61 | log4j.category.JPOX.Enhancer=ERROR,DRFA
62 |
63 |
--------------------------------------------------------------------------------
/src/test/resources/hive-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | <configuration>
4 |
5 |   <property>
6 |     <name>hive.mapred.reduce.tasks.speculative.execution</name>
7 |     <value>false</value>
8 |     <description>Whether speculative execution for reducers should be turned on.</description>
9 |   </property>
10 |
11 |   <property>
12 |     <name>javax.jdo.option.ConnectionURL</name>
13 |     <value>jdbc:derby:memory:metastore_db;create=true</value>
14 |     <description>JDBC connect string for a JDBC metastore</description>
15 |   </property>
16 |
17 |   <property>
18 |     <name>hive.metastore.warehouse.dir</name>
19 |     <value>/tmp/warehouse</value>
20 |     <description>location of default database for the warehouse</description>
21 |   </property>
22 |
23 |   <!-- ... -->
24 |
25 | </configuration>
--------------------------------------------------------------------------------
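The hive-site.xml above points the metastore at an in-memory Derby
database (jdbc:derby:memory:metastore_db;create=true) and a /tmp
warehouse directory, so each test run starts from a clean metastore and
leaves nothing persistent behind.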