├── .gitignore
├── README.md
├── ch01
│   ├── assembly.xml
│   ├── data
│   │   └── sample.txt
│   ├── pom.xml
│   └── src
│       └── main
│           └── java
│               └── com
│                   └── packtpub
│                       └── esh
│                           ├── Driver.java
│                           ├── WordsMapper.java
│                           └── WordsReducer.java
├── ch02
│   ├── assembly.xml
│   ├── data
│   │   ├── network-logs.txt
│   │   └── tweets.csv
│   ├── pom.xml
│   └── src
│       └── main
│           └── java
│               └── com
│                   └── packtpub
│                       └── esh
│                           ├── nwlogs
│                           │   ├── DomainUtil.java
│                           │   ├── Driver.java
│                           │   └── NetworkLogsMapper.java
│                           ├── tweets2es
│                           │   ├── Driver.java
│                           │   └── Tweets2EsMapper.java
│                           └── tweets2hdfs
│                               ├── Driver.java
│                               └── Tweets2HdfsMapper.java
├── ch03
│   ├── data
│   │   └── setup-hrms.sh
│   └── exercise
│       └── avg-salary-by-city-request.sh
├── ch04
│   ├── assembly.xml
│   ├── data
│   │   └── consumer_complaints.csv
│   ├── pom.xml
│   ├── setup
│   │   ├── complaints-dashboard.json
│   │   └── setup-mappings.sh
│   └── src
│       └── main
│           └── java
│               └── com
│                   └── packtpub
│                       └── esh
│                           └── complaints
│                               ├── ComplaintsMapper.java
│                               └── Driver.java
├── ch05
│   ├── assembly.xml
│   ├── data
│   │   └── percolators.sh
│   ├── pom.xml
│   └── src
│       └── main
│           └── java
│               └── com
│                   └── packtpub
│                       └── esh
│                           └── streaming
│                               ├── ElasticSearchService.java
│                               ├── Topology.java
│                               ├── TweetsCollectorSpout.java
│                               └── TweetsParserBolt.java
├── ch07-spark
│   ├── assembly.xml
│   ├── pom.xml
│   └── src
│       └── main
│           └── java
│               └── com
│                   └── packtpub
│                       └── esh
│                           └── spark
│                               ├── Crime.java
│                               ├── SparkEsReader.java
│                               ├── SparkEsWriter.java
│                               ├── SparkSQLEsReader.java
│                               ├── SparkSQLEsWriterReflection.java
│                               └── SparkSQLEsWriterSchema.java
└── ch07
    ├── assembly.xml
    ├── data
    │   ├── crimes.json
    │   └── crimes_dataset.csv
    ├── pom.xml
    ├── scripts
    │   ├── es-reader.pig
    │   ├── es-reader.sql
    │   ├── es-writer.pig
    │   ├── es-writer.sql
    │   ├── lingual-cleanup.sh
    │   └── lingual-writer.sh
    └── src
        └── main
            └── java
                └── com
                    └── packtpub
                        └── esh
                            └── cascading
                                ├── CascadingEsReader.java
                                └── CascadingEsWriter.java

/.gitignore:
--------------------------------------------------------------------------------
*.iml
.classpath
.project
*.log
**/target
**/.settings
**/.idea
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ElasticSearch for Hadoop Book Source Code

### Check Prerequisites
- JDK 1.8

```sh
$ java -version
```
- Maven

```sh
$ mvn -version
```
- Make sure HDFS and YARN are running

```sh
$ jps

13386 SecondaryNameNode
13059 NameNode
13179 DataNode
13649 NodeManager
13528 ResourceManager
```
- Make sure Elasticsearch 1.7+ is up and running on port 9200
```sh
$ curl -XGET http://localhost:9200

{
  "status" : 200,
  "name" : "ES Hadoop Node",
  "cluster_name" : "eshadoopcluster",
  "version" : {
    "number" : "1.7.2",
    "build_hash" : "e43676b1385b8125d647f593f7202acbd816e8ec",
    "build_timestamp" : "2015-09-14T09:49:53Z",
    "build_snapshot" : false,
    "lucene_version" : "4.10.4"
  },
  "tagline" : "You Know, for Search"
}

```

### Build
- Open a terminal and switch to the chapter directory you want to build
- Execute:
```sh
$ cd ch01
$ mvn clean package
```
- Verify that a file matching the xxx-job.jar pattern has been generated
```sh
$ ls target
```
--------------------------------------------------------------------------------
/ch01/assembly.xml:
--------------------------------------------------------------------------------
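<!-- Packages the chapter's code as a single runnable job jar: the project's own
     classes are unpacked into the jar root, while runtime dependencies are bundled
     as jars under lib/ (the project artifact itself is excluded from that set). -->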
job
jar
false
false
runtime
lib
${groupId}:${artifactId}
true
${groupId}:${artifactId}
--------------------------------------------------------------------------------
/ch01/data/sample.txt:
--------------------------------------------------------------------------------
The key features of Elasticsearch for Apache Hadoop include:

Scalable Map/Reduce model
elasticsearch-hadoop is built around Map/Reduce: every operation done in elasticsearch-hadoop results in multiple Hadoop tasks (based on the number of target shards) that interact, in parallel, with Elasticsearch.
REST based
elasticsearch-hadoop uses the Elasticsearch REST interface for communication, allowing for flexible deployments by minimizing the number of ports needed to be open within a network.
Self contained
the library has been designed to be small and efficient. At around 300KB and no extra dependencies outside Hadoop itself, distributing elasticsearch-hadoop within your cluster is simple and fast.
Universal jar
whether you are using Hadoop 1.x or Hadoop 2.x, vanilla Apache Hadoop or a certain distro, the same elasticsearch-hadoop jar works transparently across all of them.
Memory and I/O efficient
elasticsearch-hadoop is focused on performance. From pull-based parsing, to bulk updates and direct conversion to/from native types, elasticsearch-hadoop keeps its memory and network I/O usage finely-tuned.
Adaptive I/O
elasticsearch-hadoop detects transport errors and retries automatically. If an Elasticsearch node dies, it re-routes the request to the available nodes (which are discovered automatically). Additionally, if Elasticsearch is overloaded, elasticsearch-hadoop detects the rejected data and resends it, until it is either processed or the user-defined policy applies.
Facilitates data co-location
elasticsearch-hadoop fully integrates with Hadoop, exposing its network access information and allowing co-located Elasticsearch and Hadoop clusters to be aware of each other and reduce network I/O.
Map/Reduce API support
At its core, elasticsearch-hadoop uses the low-level Map/Reduce API to read and write data to Elasticsearch, allowing for maximum integration flexibility and performance.
old(mapred) & new(mapreduce) Map/Reduce APIs supported
elasticsearch-hadoop automatically adjusts to your environment; one does not have to change between using the mapred or mapreduce APIs - both are supported, by the same classes, at the same time.
Hive support
Run Hive queries against Elasticsearch for advanced analytics and real-time responses. elasticsearch-hadoop exposes Elasticsearch as a Hive table so your scripts can crunch through data faster than ever.
Pig support
elasticsearch-hadoop supports Apache Pig, exposing Elasticsearch as a native Pig Storage. Run your Pig scripts against Elasticsearch without any modifications to your configuration or the Pig client.
Cascading support
Cascading is an application framework for Java developers to simply develop robust applications on Apache Hadoop. And with elasticsearch-hadoop, Cascading can run its flows directly onto Elasticsearch.
27 | -------------------------------------------------------------------------------- /ch01/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | es-hadoop-book-code 6 | ch01 7 | 0.0.1 8 | jar 9 | 10 | com.hadoop.app 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | junit 20 | junit 21 | 3.8.1 22 | test 23 | 24 | 25 | org.apache.hadoop 26 | hadoop-core 27 | 1.2.1 28 | provided 29 | 30 | 31 | org.apache.hadoop 32 | hadoop-hdfs 33 | 2.6.0 34 | 35 | 36 | org.elasticsearch 37 | elasticsearch-hadoop 38 | 2.1.0 39 | 40 | 41 | cascading-hadoop 42 | cascading 43 | 44 | 45 | cascading-local 46 | cascading 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | maven-assembly-plugin 56 | 2.2.1 57 | 58 | 59 | assembly.xml 60 | 61 | 62 | 63 | com.packtpub.esh.Driver 64 | 65 | 66 | 67 | 68 | 69 | make-assembly 70 | package 71 | 72 | single 73 | 74 | 75 | 76 | 77 | 78 | org.apache.maven.plugins 79 | maven-compiler-plugin 80 | 3.3 81 | 82 | 1.8 83 | 1.8 84 | 85 | 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /ch01/src/main/java/com/packtpub/esh/Driver.java: -------------------------------------------------------------------------------- 1 | package com.packtpub.esh; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Job; 8 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 9 | import org.elasticsearch.hadoop.mr.EsOutputFormat; 10 | 11 | 12 | public class Driver { 13 | 14 | public static void main(String[] args) throws Exception { 15 | Configuration conf = new Configuration(); 16 | // ElasticSearch Server nodes to point to 17 | conf.set("es.nodes", "localhost:9200"); 18 | // ElasticSearch index and type name in {indexName}/{typeName} format 19 | conf.set("es.resource", "eshadoop/wordcount"); 20 | 21 | // Create Job instance 22 | Job job = new Job(conf, "word count"); 23 | // set Driver class 24 | job.setJarByClass(Driver.class); 25 | job.setMapperClass(WordsMapper.class); 26 | job.setReducerClass(WordsReducer.class); 27 | job.setOutputKeyClass(Text.class); 28 | job.setOutputValueClass(IntWritable.class); 29 | // set OutputFormat to EsOutputFormat provided by ElasticSearch-Hadoop jar 30 | job.setOutputFormatClass(EsOutputFormat.class); 31 | 32 | FileInputFormat.addInputPath(job, new Path(args[0])); 33 | 34 | System.exit(job.waitForCompletion(true) ? 
0 : 1); 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /ch01/src/main/java/com/packtpub/esh/WordsMapper.java: -------------------------------------------------------------------------------- 1 | package com.packtpub.esh; 2 | 3 | import org.apache.hadoop.io.IntWritable; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.Mapper; 6 | 7 | import java.io.IOException; 8 | import java.util.StringTokenizer; 9 | 10 | public class WordsMapper extends Mapper { 11 | 12 | private final static IntWritable one = new IntWritable(1); 13 | 14 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 15 | StringTokenizer itr = new StringTokenizer(value.toString()); 16 | 17 | while (itr.hasMoreTokens()) { 18 | Text word = new Text(); 19 | word.set(itr.nextToken()); 20 | context.write(word, one); 21 | } 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /ch01/src/main/java/com/packtpub/esh/WordsReducer.java: -------------------------------------------------------------------------------- 1 | package com.packtpub.esh; 2 | 3 | import org.apache.hadoop.io.IntWritable; 4 | import org.apache.hadoop.io.MapWritable; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Reducer; 7 | 8 | import java.io.IOException; 9 | 10 | public class WordsReducer extends Reducer { 11 | 12 | @Override 13 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { 14 | // This represents our ES document 15 | MapWritable result = new MapWritable(); 16 | int sum = 0; 17 | for (IntWritable val : values) { 18 | sum += val.get(); 19 | } 20 | // Add "word" field to ES document 21 | result.put(new Text("word"), key); 22 | // Add "count" field to ES document 23 | result.put(new Text("count"), new IntWritable(sum)); 24 | context.write(key, result); 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /ch02/assembly.xml: -------------------------------------------------------------------------------- 1 | 2 | job 3 | 4 | jar 5 | 6 | false 7 | 8 | 9 | false 10 | runtime 11 | lib 12 | 13 | ${groupId}:${artifactId} 14 | 15 | 16 | 17 | true 18 | 19 | ${groupId}:${artifactId} 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /ch02/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | es-hadoop-book-code 6 | ch02 7 | 0.0.1 8 | jar 9 | 10 | com.hadoop.app 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | org.apache.hadoop 20 | hadoop-core 21 | 1.2.1 22 | provided 23 | 24 | 25 | org.apache.hadoop 26 | hadoop-hdfs 27 | 2.6.0 28 | 29 | 30 | org.elasticsearch 31 | elasticsearch-hadoop 32 | 2.1.0 33 | 34 | 35 | cascading-hadoop 36 | cascading 37 | 38 | 39 | cascading-local 40 | cascading 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | maven-assembly-plugin 50 | 2.2.1 51 | 52 | 53 | 54 | make-network-logs-job 55 | 56 | 57 | assembly.xml 58 | 59 | 60 | 61 | com.packtpub.esh.nwlogs.Driver 62 | 63 | 64 | ${artifactId}-${version}-nwlogs 65 | 66 | package 67 | 68 | single 69 | 70 | 71 | 72 | make-tweets2es-job 73 | 74 | 75 | assembly.xml 76 | 77 | 78 | 79 | com.packtpub.esh.tweets2es.Driver 80 | 81 | 82 | ${artifactId}-${version}-tweets2es 83 | 84 | package 85 | 86 | single 87 | 88 | 89 | 90 | make-tweets2hdfs-job 91 | 92 | 93 | 
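<!-- Third assembly execution: builds the tweets2hdfs job jar, with
     com.packtpub.esh.tweets2hdfs.Driver as the main class; the nwlogs and
     tweets2es job jars are produced by the two executions above. -->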
assembly.xml 94 | 95 | 96 | 97 | com.packtpub.esh.tweets2hdfs.Driver 98 | 99 | 100 | ${artifactId}-${version}-tweets2hdfs 101 | 102 | package 103 | 104 | 105 | single 106 | 107 | 108 | 109 | 110 | 111 | 112 | org.apache.maven.plugins 113 | maven-compiler-plugin 114 | 3.3 115 | 116 | 1.8 117 | 1.8 118 | 119 | 120 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /ch02/src/main/java/com/packtpub/esh/nwlogs/DomainUtil.java: -------------------------------------------------------------------------------- 1 | package com.packtpub.esh.nwlogs; 2 | 3 | import java.text.ParseException; 4 | import java.text.SimpleDateFormat; 5 | 6 | /** 7 | * Created by vishalshukla on 25/05/15. 8 | */ 9 | public class DomainUtil { 10 | /** 11 | * 12 | * @param url 13 | * @return 14 | */ 15 | public static String getHost(String url){ 16 | if(url == null || url.length() == 0) 17 | return ""; 18 | 19 | int doubleslash = url.indexOf("//"); 20 | if(doubleslash == -1) 21 | doubleslash = 0; 22 | else 23 | doubleslash += 2; 24 | 25 | int end = url.indexOf('/', doubleslash); 26 | end = end >= 0 ? end : url.length(); 27 | 28 | int port = url.indexOf(':', doubleslash); 29 | end = (port > 0 && port < end) ? port : end; 30 | 31 | return url.substring(doubleslash, end); 32 | } 33 | 34 | 35 | /** 36 | * 37 | * @param host 38 | * @return 39 | */ 40 | public static String getBaseDomain(String url) { 41 | String host = getHost(url); 42 | 43 | int startIndex = 0; 44 | int nextIndex = host.indexOf('.'); 45 | int lastIndex = host.lastIndexOf('.'); 46 | while (nextIndex < lastIndex) { 47 | startIndex = nextIndex + 1; 48 | nextIndex = host.indexOf('.', startIndex); 49 | } 50 | if (startIndex > 0) { 51 | return host.substring(startIndex); 52 | } else { 53 | return host; 54 | } 55 | } 56 | 57 | } 58 | -------------------------------------------------------------------------------- /ch02/src/main/java/com/packtpub/esh/nwlogs/Driver.java: -------------------------------------------------------------------------------- 1 | package com.packtpub.esh.nwlogs; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.hadoop.mapreduce.Job; 6 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 7 | import org.elasticsearch.hadoop.mr.EsOutputFormat; 8 | 9 | 10 | public class Driver { 11 | 12 | public static void main(String[] args) throws Exception { 13 | Configuration conf = new Configuration(); 14 | // ElasticSearch Server nodes to point to 15 | conf.set("es.nodes", "localhost:9200"); 16 | // ElasticSearch index and type name in {indexName}/{typeName} format 17 | conf.set("es.resource", "esh_network/network_logs_{action}"); 18 | 19 | // Create Job instance 20 | Job job = new Job(conf, "network monitor mapper"); 21 | // set Driver class 22 | job.setJarByClass(Driver.class); 23 | job.setMapperClass(NetworkLogsMapper.class); 24 | // set OutputFormat to EsOutputFormat provided by ElasticSearch-Hadoop jar 25 | job.setOutputFormatClass(EsOutputFormat.class); 26 | job.setNumReduceTasks(0); 27 | FileInputFormat.addInputPath(job, new Path(args[0])); 28 | 29 | System.exit(job.waitForCompletion(true) ? 
0 : 1); 30 | } 31 | 32 | } -------------------------------------------------------------------------------- /ch02/src/main/java/com/packtpub/esh/nwlogs/NetworkLogsMapper.java: -------------------------------------------------------------------------------- 1 | package com.packtpub.esh.nwlogs; 2 | 3 | import org.apache.hadoop.io.*; 4 | import org.apache.hadoop.mapreduce.Mapper; 5 | 6 | import java.io.IOException; 7 | import java.text.ParseException; 8 | import java.text.SimpleDateFormat; 9 | import java.util.Date; 10 | import java.util.StringTokenizer; 11 | import java.util.Locale; 12 | 13 | public class NetworkLogsMapper extends Mapper { 14 | 15 | 16 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 17 | MapWritable map = new MapWritable(); 18 | String line = value.toString().trim(); 19 | String[] parts = line.split("\" \\("); 20 | String keyVals = parts[0].substring(15, parts[0].length()).trim(); 21 | 22 | String srcIp = null; 23 | String destIp = null; 24 | String category = null; 25 | String action = null; 26 | String target = null; 27 | String serial = null; 28 | String ip = null; 29 | String timezone = null; 30 | Long time = null; 31 | 32 | int i = 0; 33 | StringTokenizer part1tokenizer = new StringTokenizer(keyVals); 34 | while (part1tokenizer.hasMoreTokens()) { 35 | String token = part1tokenizer.nextToken(); 36 | String keyPart = getKeyValue(token)[0]; 37 | String valuePart = getKeyValue(token)[1]; 38 | 39 | switch (keyPart) { 40 | case "src": 41 | srcIp = valuePart; 42 | break; 43 | case "dst": 44 | destIp = valuePart; 45 | break; 46 | case "id": 47 | category = valuePart; 48 | break; 49 | case "act": 50 | action = valuePart != null ? valuePart.toUpperCase() : null; 51 | break; 52 | case "msg": 53 | target = valuePart; 54 | break; 55 | } 56 | i++; 57 | } 58 | 59 | i = 0; 60 | if (parts.length > 1) { 61 | StringTokenizer part2Tokenizer = new StringTokenizer(parts[1], ","); 62 | while (part2Tokenizer.hasMoreTokens()) { 63 | String token = part2Tokenizer.nextToken(); 64 | String keyPart = getKeyValue(token)[0].trim(); 65 | String valuePart = getKeyValue(token)[1]; 66 | 67 | switch (keyPart) { 68 | case "sn": 69 | serial = valuePart; 70 | break; 71 | case "ip": 72 | ip = valuePart; 73 | break; 74 | case "tz": 75 | timezone = valuePart; 76 | break; 77 | case "time": 78 | String timeStr = valuePart; 79 | timeStr = timeStr.replaceAll("\\)", ""); 80 | SimpleDateFormat dateFormat = new SimpleDateFormat("EEE MMM dd hh:mm:ss yyyy",Locale.ENGLISH); 81 | try { 82 | time = dateFormat.parse(timeStr).getTime(); 83 | } catch (ParseException e) { 84 | e.printStackTrace(); 85 | } 86 | break; 87 | } 88 | i++; 89 | } 90 | } 91 | map.put(new Text("srcIp"), getWritableValue(srcIp)); 92 | map.put(new Text("destIp"), getWritableValue(destIp)); 93 | map.put(new Text("action"), getWritableValue(action)); 94 | map.put(new Text("category"), getWritableValue(category)); 95 | map.put(new Text("target"), getWritableValue(target)); 96 | map.put(new Text("serial"), getWritableValue(serial)); 97 | map.put(new Text("timezone"), getWritableValue(timezone)); 98 | map.put(new Text("ip"), getWritableValue(ip)); 99 | map.put(new Text("domain"),getWritableValue(getDomainName(target))); 100 | map.put(new Text("@timestamp"), time != null ? new LongWritable(time) : new LongWritable(new Date().getTime())); 101 | 102 | 103 | context.write(value, map); 104 | } 105 | 106 | private static WritableComparable getWritableValue(String value) { 107 | return value != null ? 
new Text(value) : NullWritable.get(); 108 | } 109 | 110 | public static String getDomainName(String url) { 111 | if(url==null) 112 | return null; 113 | return DomainUtil.getBaseDomain(url); 114 | } 115 | 116 | private static String[] getKeyValue(String token) { 117 | String[] values = token.split("="); 118 | String val = null; 119 | if (values.length >= 2) { 120 | val = values[1].trim(); 121 | val = val.replaceAll("\"", ""); 122 | } 123 | 124 | return new String[]{values[0],val}; 125 | } 126 | 127 | } 128 | -------------------------------------------------------------------------------- /ch02/src/main/java/com/packtpub/esh/tweets2es/Driver.java: -------------------------------------------------------------------------------- 1 | package com.packtpub.esh.tweets2es; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.hadoop.mapreduce.Job; 6 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 7 | import org.elasticsearch.hadoop.mr.EsOutputFormat; 8 | 9 | import java.io.IOException; 10 | 11 | public class Driver { 12 | public static void main(String args[]) throws IOException, ClassNotFoundException, InterruptedException { 13 | Configuration conf = new Configuration(); 14 | // ElasticSearch Server nodes to point to 15 | conf.set("es.nodes", "localhost:9200"); 16 | // ElasticSearch index and type name in {indexName}/{typeName} format 17 | conf.set("es.resource", "esh/tweets"); 18 | 19 | // Create Job instance 20 | Job job = new Job(conf, "tweets to es mapper"); 21 | // set Driver class 22 | job.setJarByClass(Driver.class); 23 | job.setMapperClass(Tweets2EsMapper.class); 24 | // set OutputFormat to EsOutputFormat provided by ElasticSearch-Hadoop jar 25 | job.setOutputFormatClass(EsOutputFormat.class); 26 | job.setNumReduceTasks(0); 27 | FileInputFormat.addInputPath(job, new Path(args[0])); 28 | 29 | System.exit(job.waitForCompletion(true) ? 
0 : 1); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /ch02/src/main/java/com/packtpub/esh/tweets2es/Tweets2EsMapper.java: -------------------------------------------------------------------------------- 1 | package com.packtpub.esh.tweets2es; 2 | 3 | import org.apache.hadoop.io.LongWritable; 4 | import org.apache.hadoop.io.MapWritable; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Mapper; 7 | 8 | import java.io.IOException; 9 | import java.text.ParseException; 10 | import java.text.SimpleDateFormat; 11 | import java.util.regex.Matcher; 12 | import java.util.regex.Pattern; 13 | import java.util.Locale; 14 | 15 | public class Tweets2EsMapper extends Mapper { 16 | 17 | 18 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 19 | MapWritable map = new MapWritable(); 20 | getMapWritable(value, map); 21 | context.write(value, map); 22 | } 23 | 24 | private void getMapWritable(Text value, MapWritable map) { 25 | String line = value.toString().trim(); 26 | String[] tokens = line.split("\","); 27 | String id = null; 28 | String text = null; 29 | String user = null; 30 | Long timestamp = null; 31 | 32 | for(int i=0;i { 14 | 15 | 16 | public void map(Object key, MapWritable value, Context context) throws IOException, InterruptedException { 17 | StringBuilder mappedValueBuilder = new StringBuilder(); 18 | mappedValueBuilder.append(getQuotedValue(value.get(new Text("tweetId")))+", "); 19 | mappedValueBuilder.append(getQuotedValue(value.get(new Text("text")))+", "); 20 | mappedValueBuilder.append(getQuotedValue(value.get(new Text("user")))+", "); 21 | mappedValueBuilder.append(getQuotedTimeValue(value.get(new Text("@timestamp")))); 22 | 23 | Text mappedValue = new Text(mappedValueBuilder.toString()); 24 | context.write(mappedValue, mappedValue); 25 | } 26 | 27 | private String getQuotedTimeValue(Writable writable) { 28 | Date timestamp = new Date(Long.parseLong(writable.toString())); 29 | SimpleDateFormat dateFormat = new SimpleDateFormat("EEE MMM dd hh:mm:ss zzz YYYY",Locale.ENGLISH); 30 | return "\""+dateFormat.format(timestamp)+"\""; 31 | } 32 | 33 | private String getQuotedValue(Writable value) { 34 | return "\""+value.toString()+"\""; 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /ch03/data/setup-hrms.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -XPUT http://localhost:9200/hrms 4 | 5 | curl -XPUT http://localhost:9200/hrms/candidate/_mapping?pretty -d '{ 6 | "properties": { 7 | "experience": { 8 | "type": "float" 9 | }, 10 | "firstName": { 11 | "type": "string" 12 | }, 13 | "lastName": { 14 | "type": "string" 15 | }, 16 | "birthDate" : { 17 | "type" : "date", 18 | "format" : "dd/MM/YYYY" 19 | }, 20 | "salary" : { 21 | "type" : "double" 22 | }, 23 | "skills": { 24 | "type": "string" 25 | }, 26 | "address" : { 27 | "type" : "object", 28 | "properties" : { 29 | "street" : {"type" : "string"}, 30 | "city" : { 31 | "type" : "string", 32 | "index" : "not_analyzed" 33 | }, 34 | "region" : {"type" : "string"}, 35 | "geo":{"type":"geo_point"} 36 | } 37 | } 38 | } 39 | }' 40 | 41 | curl -XPOST http://localhost:9200/hrms/candidate -d '{ 42 | "firstName": "Emerson", 43 | "lastName": "Atkins", 44 | "skills": ["Java","Hadoop","ElasticSearch","Kibana"], 45 | "experience": 8.5, 46 | "birthDate":"30/04/1987", 47 | "address" :{ 48 | "street" : "Ap #576-619 
Tincidunt Rd.", 49 | "city" : "Nagpur", 50 | "region": "MH", 51 | "geo": "15.97, 76.82" 52 | }, 53 | "salary":"120000" 54 | }' 55 | curl -XPOST http://localhost:9200/hrms/candidate -d ' 56 | { 57 | "firstName": "Jorden", 58 | "lastName": "Mclean", 59 | "birthDate": "11/03/1980", 60 | "experience": 19, 61 | "skills": ["Java","Hadoop","ElasticSearch","Kibana"], 62 | "address" :{ 63 | "street": "2751 Ut Rd.", 64 | "city": "Purral", 65 | "region": "SJ", 66 | "geo": "-80.61395, 21.93988" 67 | }, 68 | "comments":"Passionate Java and BigData developer", 69 | "salary":"150000" 70 | 71 | }' 72 | curl -XPOST http://localhost:9200/hrms/candidate -d ' 73 | { 74 | "firstName": "Aimee", 75 | "lastName": "Ramirez", 76 | "birthDate": "14/11/1973", 77 | "experience": 6, 78 | "skills": ["PHP","Magento","ElasticSearch","Kibana"], 79 | "address" :{ 80 | "street": "477-3861 Feugiat. Road", 81 | "city": "La Roche-sur-Yon", 82 | "region": "Pays de la Loire", 83 | "geo": "23.15, 72.33" 84 | }, 85 | "achievements":"Ethical hacking certification.", 86 | "salary":"80000" 87 | 88 | }' 89 | curl -XPOST http://localhost:9200/hrms/candidate -d ' 90 | { 91 | "firstName": "Rebekah", 92 | "lastName": "Chang", 93 | "birthDate": "12/04/1984", 94 | "experience": 13, 95 | "skills": ["Java","Spring","ElasticSearch"], 96 | "address" :{ 97 | "street": "138-8420 Semper Rd.", 98 | "city": "Mumbai", 99 | "region": "MH", 100 | "geo": "18.97, 72.82" 101 | }, 102 | "comments":"Ethical hacking certification.", 103 | "salary":"125000" 104 | }' 105 | 106 | curl -XPOST http://localhost:9200/hrms/candidate -d ' 107 | { 108 | "firstName": "Gray", 109 | "lastName": "Carson", 110 | "birthDate": "20/11/1989", 111 | "experience": 11, 112 | "skills": ["Python","R","Machine learning"], 113 | "address" :{ 114 | "street": "Ap #261-8043 Magna. Rd.", 115 | "city": "Parramatta", 116 | "region": "New South Wales", 117 | "geo": "24.15, 73.33" 118 | }, 119 | "comments":"Ethical hacking certification.", 120 | "salary":"115000" 121 | }' 122 | curl -XPOST http://localhost:9200/hrms/candidate -d ' 123 | { 124 | "firstName": "Eagan", 125 | "lastName": "Riddle", 126 | "birthDate": "03/12/1979", 127 | "experience": 17, 128 | "skills": ["Linux","Networking","VMWare","DevOps","Docker"], 129 | "address" :{ 130 | "street": "7138 Amet Avenue", 131 | "city": "New South Wales", 132 | "region": "OV", 133 | "geo": "-89.52962, -117.05619" 134 | }, 135 | "salary":"180000" 136 | }' 137 | curl -XPOST http://localhost:9200/hrms/candidate -d ' 138 | { 139 | "firstName": "Hamish", 140 | "lastName": "Mendez", 141 | "birthDate": "22/05/1988", 142 | "experience": 9, 143 | "skills": ["Ruby","Linux","Puppet","Chef","DevOps","Ansible","Shell Script","Groovy"], 144 | "address" :{ 145 | "street": "P.O. Box 452, 4375 Nam Road", 146 | "city": "Parramatta", 147 | "region": "Şa", 148 | "geo": "42.5772, 9.88647", 149 | "salary":"100000" 150 | } 151 | }' 152 | curl -XPOST http://localhost:9200/hrms/candidate -d ' 153 | { 154 | "firstName": "Bradley", 155 | "lastName": "Stark", 156 | "birthDate": "23/06/1981", 157 | "experience": 1, 158 | "skills": ["HTML","CSS","Photoshop","AngularJS","SQL"], 159 | "address" :{ 160 | "street": "Ap #695-608 Aliquet. 
St.", 161 | "city": "Newcastle", 162 | "region": "New South Wales", 163 | "geo": "3.00917, -152.95787" 164 | }, 165 | "achievements":"Secured 1st rank in university in Masters", 166 | "salary":"60000" 167 | }' 168 | curl -XPOST http://localhost:9200/hrms/candidate -d ' 169 | { 170 | "firstName": "Quemby", 171 | "lastName": "Cunningham", 172 | "birthDate": "02/09/1987", 173 | "experience": 5, 174 | "skills": ["Lucene","ElasticSearch","Java"], 175 | "address" :{ 176 | "street": "P.O. Box 751, 6709 Cras St.", 177 | "city": "Akron", 178 | "region": "Galicia", 179 | "geo": "70.68905, 56.43336" 180 | }, 181 | "salary":"80000" 182 | }' 183 | curl -XPOST http://localhost:9200/hrms/candidate -d ' 184 | { 185 | "firstName": "Elton", 186 | "lastName": "Harper", 187 | "birthDate": "30/11/1978", 188 | "experience": 10, 189 | "skills": ["Hadoop","Spark","Java","Linux"], 190 | "address" :{ 191 | "street": "8854 Fermentum Road", 192 | "city": "New South Wales", 193 | "region": "Noord Brabant", 194 | "geo": "67.74365, -31.22381" 195 | }, 196 | "salary":"110000" 197 | }' 198 | curl -XPOST http://localhost:9200/hrms/candidate -d ' 199 | { 200 | "firstName": "Hyacinth", 201 | "lastName": "Melendez", 202 | "birthDate": "08/01/1979", 203 | "experience": 11, 204 | "skills": ["Kibana","ElasticSearch","Java","Linux"], 205 | "address" :{ 206 | "street": "688-7523 Diam Rd.", 207 | "city": "Akron", 208 | "region": "Ohio", 209 | "geo": "40.68995, -123.71124" 210 | }, 211 | "salary":"120000" 212 | }' 213 | curl -XPOST http://localhost:9200/hrms/candidate -d ' 214 | { 215 | "firstName": "Forrest", 216 | "lastName": "Lawson", 217 | "birthDate": "13/03/1978", 218 | "experience": 4, 219 | "skills": ["Solr","Lucene","Java","Full-text search"], 220 | "address" :{ 221 | "street": "P.O. Box 146, 3183 Amet Avenue", 222 | "city": "Istanbul", 223 | "region": "Ist", 224 | "geo": "-10, 155" 225 | }, 226 | "salary":"70000" 227 | }' 228 | 229 | 230 | curl -XPOST http://localhost:9200/hrms/candidate -d ' 231 | { 232 | "firstName": "David", 233 | "lastName": "Lawson", 234 | "birthDate": "13/03/1968", 235 | "experience": 30, 236 | "skills": ["Lucene"], 237 | "address" :{ 238 | "street": "P.O. Box 146, 3183 Amet Avenue", 239 | "city": "Istanbul", 240 | "region": "Ist", 241 | "geo": "-9, 150" 242 | }, 243 | "salary":"200000" 244 | }' 245 | curl -XPOST http://localhost:9200/hrms/candidate -d ' 246 | { 247 | "firstName": "David", 248 | "lastName": "Lawson", 249 | "birthDate": "13/03/1968", 250 | "experience": 40, 251 | "skills": ["Lucene"], 252 | "address" :{ 253 | "street": "P.O. Box 146, 3183 Amet Avenue", 254 | "city": "Istanbul", 255 | "region": "Ist", 256 | "geo": "-9.68931, 151.66362" 257 | }, 258 | "salary":"250000" 259 | }' 260 | 261 | curl -XPOST http://localhost:9200/hrms/candidate -d ' 262 | { 263 | "firstName": "David", 264 | "lastName": "Lawson", 265 | "birthDate": "13/03/1968", 266 | "experience": 30, 267 | "skills": ["Lucene"], 268 | "address" :{ 269 | "street": "P.O. Box 146, 3183 Amet Avenue", 270 | "city": "Istanbul", 271 | "region": "Ist", 272 | "geo": "-9.68931, 151.66362" 273 | }, 274 | "salary":"210000" 275 | }' 276 | 277 | curl -XPOST http://localhost:9200/hrms/candidate -d ' 278 | { 279 | "firstName": "Kiran", 280 | "lastName": "Suthar", 281 | "birthDate": "13/03/1968", 282 | "experience": 35, 283 | "skills": ["Lucene"], 284 | "address" :{ 285 | "street": "P.O. 
Box 146, 3183 Amet Avenue", 286 | "city": "Istanbul", 287 | "region": "Ist", 288 | "geo": "-9.68931, 151.66362" 289 | }, 290 | "salary":"300000" 291 | }' 292 | 293 | curl -XPOST http://localhost:9200/hrms/candidate -d ' 294 | { 295 | "firstName": "David", 296 | "lastName": "Mackwan", 297 | "birthDate": "13/03/1968", 298 | "experience": 0, 299 | "skills": ["Java"], 300 | "address" :{ 301 | "street": "P.O. Box 146, 3183 Amet Avenue", 302 | "city": "Kota", 303 | "region": "Ist", 304 | "geo": "-9.68931, 151.66362" 305 | }, 306 | "salary":"40000" 307 | }' 308 | 309 | curl -XPOST http://localhost:9200/hrms/candidate -d ' 310 | { 311 | "firstName": "Pratik", 312 | "lastName": "Patel", 313 | "birthDate": "13/03/1968", 314 | "experience": 1, 315 | "skills": ["Java"], 316 | "address" :{ 317 | "street": "P.O. Box 146, 3183 Amet Avenue", 318 | "city": "Kota", 319 | "region": "Ist", 320 | "geo": "-9.68931, 151.66362" 321 | }, 322 | "salary":"65000" 323 | }' 324 | 325 | -------------------------------------------------------------------------------- /ch03/exercise/avg-salary-by-city-request.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | curl -XPOST http://localhost:9200/hrms/candidate/_search?pretty -d '{ 3 | "query": { 4 | "filtered": { 5 | "filter": { 6 | "bool": { 7 | "must": [ 8 | { 9 | "range": { 10 | "experience": { 11 | "gte": 5, 12 | "lte": 10 13 | } 14 | } 15 | }, 16 | { 17 | "terms": { 18 | "skills": [ 19 | "elasticsearch", 20 | "kibana", 21 | "lucene" 22 | ] 23 | } 24 | } 25 | ] 26 | } 27 | 28 | } 29 | } 30 | }, 31 | "aggs": { 32 | "by_city": { 33 | "terms": { 34 | "field": "address.city", 35 | "size": 5 36 | }, 37 | "aggs": { 38 | "by_skill": { 39 | "terms": { 40 | "field": "skills", 41 | "size": 5 42 | }, 43 | "aggs":{ 44 | "average": { 45 | "avg": { 46 | "field": "salary" 47 | } 48 | } 49 | } 50 | } 51 | } 52 | } 53 | }, 54 | "size": 0 55 | }' -------------------------------------------------------------------------------- /ch04/assembly.xml: -------------------------------------------------------------------------------- 1 | 2 | job 3 | 4 | jar 5 | 6 | false 7 | 8 | 9 | false 10 | runtime 11 | lib 12 | 13 | ${groupId}:${artifactId} 14 | 15 | 16 | 17 | true 18 | 19 | ${groupId}:${artifactId} 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /ch04/data/consumer_complaints.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vishalbrevitaz/eshadoop/7f564e1c781993916bb29175c5c7c0505c3cee4f/ch04/data/consumer_complaints.csv -------------------------------------------------------------------------------- /ch04/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | es-hadoop-book-code 6 | ch04 7 | 0.0.1 8 | jar 9 | 10 | com.hadoop.app 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | org.apache.hadoop 20 | hadoop-core 21 | 1.2.1 22 | provided 23 | 24 | 25 | org.apache.hadoop 26 | hadoop-hdfs 27 | 2.6.0 28 | 29 | 30 | org.elasticsearch 31 | elasticsearch-hadoop 32 | 2.0.2 33 | 34 | 35 | cascading-hadoop 36 | cascading 37 | 38 | 39 | cascading-local 40 | cascading 41 | 42 | 43 | 44 | 45 | org.apache.commons 46 | commons-csv 47 | 1.1 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | maven-assembly-plugin 56 | 2.2.1 57 | 58 | 59 | 60 | make-complaints-job 61 | 62 | 63 | assembly.xml 64 | 65 | 66 | 67 | com.packtpub.esh.complaints.Driver 68 | 69 | 
70 | ${artifactId}-${version}-complaints 71 | 72 | package 73 | 74 | single 75 | 76 | 77 | 78 | 79 | 80 | org.apache.maven.plugins 81 | maven-compiler-plugin 82 | 3.3 83 | 84 | 1.8 85 | 1.8 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /ch04/setup/complaints-dashboard.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "_id": "Complaints-Dashboard", 4 | "_type": "dashboard", 5 | "_source": { 6 | "title": "Complaints Dashboard", 7 | "hits": 0, 8 | "description": "", 9 | "panelsJSON": "[\n {\n \"col\": 7,\n \"id\": \"State-wise-issues-over-the-time\",\n \"row\": 1,\n \"size_x\": 6,\n \"size_y\": 3,\n \"type\": \"visualization\"\n },\n {\n \"col\": 9,\n \"id\": \"State-wise-product-wise-issues\",\n \"row\": 4,\n \"size_x\": 4,\n \"size_y\": 4,\n \"type\": \"visualization\"\n },\n {\n \"col\": 1,\n \"id\": \"Company-wise-issues\",\n \"row\": 4,\n \"size_x\": 8,\n \"size_y\": 4,\n \"type\": \"visualization\"\n },\n {\n \"col\": 1,\n \"id\": \"State-wise-company-wise-issues\",\n \"row\": 8,\n \"size_x\": 12,\n \"size_y\": 3,\n \"type\": \"visualization\"\n },\n {\n \"col\": 1,\n \"id\": \"Product-wise-issues-over-the-time\",\n \"row\": 1,\n \"size_x\": 6,\n \"size_y\": 3,\n \"type\": \"visualization\"\n },\n {\n \"id\": \"Geography-wise-issues\",\n \"type\": \"visualization\",\n \"size_x\": 12,\n \"size_y\": 5,\n \"col\": 1,\n \"row\": 11\n }\n]", 10 | "version": 1, 11 | "timeRestore": true, 12 | "timeTo": "now", 13 | "timeFrom": "now-5y", 14 | "kibanaSavedObjectMeta": { 15 | "searchSourceJSON": "{\n \"filter\": [\n {\n \"query\": {\n \"query_string\": {\n \"analyze_wildcard\": true,\n \"query\": \"*\"\n }\n }\n }\n ]\n}" 16 | } 17 | } 18 | }, 19 | { 20 | "_id": "State-wise-issues-over-the-time", 21 | "_type": "visualization", 22 | "_source": { 23 | "title": "State-wise issues over the time", 24 | "visState": "{\"type\":\"area\",\"params\":{\"addLegend\":true,\"addTooltip\":true,\"defaultYExtents\":false,\"mode\":\"stacked\",\"shareYAxis\":true,\"smoothLines\":false,\"scale\":\"linear\",\"interpolate\":\"linear\",\"times\":[],\"addTimeMarker\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"1\",\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"dateSent\",\"interval\":\"M\",\"customInterval\":\"2h\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"state\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{}}", 25 | "description": "", 26 | "version": 1, 27 | "kibanaSavedObjectMeta": { 28 | "searchSourceJSON": "{\"index\":\"esh_complaints\",\"query\":{\"query_string\":{\"analyze_wildcard\":true,\"query\":\"*\"}},\"filter\":[]}" 29 | } 30 | } 31 | }, 32 | { 33 | "_id": "Company-wise-issues", 34 | "_type": "visualization", 35 | "_source": { 36 | "title": "Company-wise issues", 37 | "visState": 
"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":true,\"mode\":\"stacked\",\"defaultYExtents\":false,\"scale\":\"linear\",\"times\":[],\"addTimeMarker\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"1\",\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"type\":\"terms\",\"schema\":\"segment\",\"params\":{\"field\":\"company\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\"}},{\"id\":\"3\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"issue.raw\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{}}", 38 | "description": "", 39 | "version": 1, 40 | "kibanaSavedObjectMeta": { 41 | "searchSourceJSON": "{\"index\":\"esh_complaints\",\"query\":{\"query_string\":{\"query\":\"*\",\"analyze_wildcard\":true}},\"filter\":[]}" 42 | } 43 | } 44 | }, 45 | { 46 | "_id": "State-wise-company-wise-issues", 47 | "_type": "visualization", 48 | "_source": { 49 | "title": "State-wise company-wise issues", 50 | "visState": "{\"type\":\"pie\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":true,\"isDonut\":false,\"defaultYExtents\":false},\"aggs\":[{\"id\":\"1\",\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"3\",\"type\":\"terms\",\"schema\":\"split\",\"params\":{\"field\":\"state\",\"size\":8,\"order\":\"desc\",\"orderBy\":\"1\",\"row\":false}},{\"id\":\"2\",\"type\":\"terms\",\"schema\":\"segment\",\"params\":{\"field\":\"company\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{}}", 51 | "description": "", 52 | "version": 1, 53 | "kibanaSavedObjectMeta": { 54 | "searchSourceJSON": "{\"index\":\"esh_complaints\",\"query\":{\"query_string\":{\"query\":\"*\",\"analyze_wildcard\":true}},\"filter\":[]}" 55 | } 56 | } 57 | }, 58 | { 59 | "_id": "State-wise-product-wise-issues", 60 | "_type": "visualization", 61 | "_source": { 62 | "title": "State-wise product-wise issues", 63 | "visState": "{\"type\":\"pie\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":true,\"isDonut\":true},\"aggs\":[{\"id\":\"1\",\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"3\",\"type\":\"terms\",\"schema\":\"segment\",\"params\":{\"field\":\"state\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\"}},{\"id\":\"2\",\"type\":\"terms\",\"schema\":\"segment\",\"params\":{\"field\":\"product\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{}}", 64 | "description": "", 65 | "version": 1, 66 | "kibanaSavedObjectMeta": { 67 | "searchSourceJSON": "{\"index\":\"esh_complaints\",\"query\":{\"query_string\":{\"query\":\"*\",\"analyze_wildcard\":true}},\"filter\":[]}" 68 | } 69 | } 70 | }, 71 | { 72 | "_id": "Geography-wise-issues", 73 | "_type": "visualization", 74 | "_source": { 75 | "title": "Geography-wise issues", 76 | "visState": "{\"type\":\"tile_map\",\"params\":{\"mapType\":\"Shaded Circle Markers\",\"isDesaturated\":false,\"heatMaxZoom\":16,\"heatMinOpacity\":\"0.31\",\"heatRadius\":\"12\",\"heatBlur\":\"11\",\"heatNormalizeData\":true,\"addTooltip\":true},\"aggs\":[{\"id\":\"1\",\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"type\":\"geohash_grid\",\"schema\":\"segment\",\"params\":{\"field\":\"location\",\"autoPrecision\":true,\"mapZoom\":3,\"mapCenter\":[38.69910391920755,-61.52343749999999],\"precision\":2}}],\"listeners\":{}}", 77 | "description": "", 78 | "version": 1, 79 | "kibanaSavedObjectMeta": { 80 | "searchSourceJSON": 
"{\"index\":\"esh_complaints\",\"query\":{\"query_string\":{\"analyze_wildcard\":true,\"query\":\"*\"}},\"filter\":[]}" 81 | } 82 | } 83 | }, 84 | { 85 | "_id": "Company-pie", 86 | "_type": "visualization", 87 | "_source": { 88 | "title": "Company pie", 89 | "visState": "{\"type\":\"pie\",\"params\":{\"addLegend\":true,\"addTooltip\":true,\"isDonut\":false,\"shareYAxis\":true,\"spyPerPage\":10},\"aggs\":[{\"id\":\"1\",\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"type\":\"terms\",\"schema\":\"segment\",\"params\":{\"field\":\"company\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{}}", 90 | "description": "", 91 | "version": 1, 92 | "kibanaSavedObjectMeta": { 93 | "searchSourceJSON": "{\"index\":\"esh_complaints\",\"query\":{\"query_string\":{\"query\":\"*\",\"analyze_wildcard\":true}},\"filter\":[]}" 94 | } 95 | } 96 | }, 97 | { 98 | "_id": "Product-wise-issues-over-the-time", 99 | "_type": "visualization", 100 | "_source": { 101 | "title": "Product-wise issues over the time", 102 | "visState": "{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":true,\"mode\":\"stacked\",\"defaultYExtents\":false,\"scale\":\"linear\",\"times\":[],\"addTimeMarker\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"1\",\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"dateSent\",\"interval\":\"M\",\"customInterval\":\"2h\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"product\",\"size\":3,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{}}", 103 | "description": "", 104 | "version": 1, 105 | "kibanaSavedObjectMeta": { 106 | "searchSourceJSON": "{\"index\":\"esh_complaints\",\"query\":{\"query_string\":{\"query\":\"*\",\"analyze_wildcard\":true}},\"filter\":[]}" 107 | } 108 | } 109 | } 110 | ] -------------------------------------------------------------------------------- /ch04/setup/setup-mappings.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | curl -XPUT http://localhost:9200/esh_complaints 3 | curl -XPUT http://localhost:9200/esh_complaints/complaints/_mapping -d ' 4 | { 5 | "complaints": { 6 | "properties": { 7 | "company": { 8 | "type": "string", 9 | "index": "not_analyzed" 10 | }, 11 | "companyResponse": { 12 | "type": "string", 13 | "index": "not_analyzed" 14 | }, 15 | "complaintId": { 16 | "type": "string", 17 | "index": "not_analyzed" 18 | }, 19 | "consumerDisputed": { 20 | "type": "boolean" 21 | }, 22 | "dateReceived": { 23 | "type": "date", 24 | "format": "MM/dd/yyyy||MM/dd/yy" 25 | }, 26 | "dateSent": { 27 | "type": "date", 28 | "format": "MM/dd/yyyy||MM/dd/yy" 29 | }, 30 | "issue.raw": { 31 | "type": "string", 32 | "index": "not_analyzed" 33 | }, 34 | "issue": { 35 | "type": "string" 36 | }, 37 | "location": { 38 | "type": "geo_point" 39 | }, 40 | "product": { 41 | "type": "string", 42 | "index": "not_analyzed", 43 | "fields": { 44 | "analyzed": { 45 | "type": "string" 46 | } 47 | } 48 | }, 49 | "state": { 50 | "type": "string", 51 | "index": "not_analyzed" 52 | }, 53 | "subissue": { 54 | "type": "string", 55 | "index": "not_analyzed", 56 | "fields": { 57 | "analyzed": { 58 | "type": "string" 59 | } 60 | } 61 | }, 62 | "submittedVia": { 63 | "type": "string", 64 | "index": "not_analyzed" 65 | }, 66 | "subproduct": { 67 | "type": "string", 68 | "index": "not_analyzed", 
69 | "fields": { 70 | "analyzed": { 71 | "type": "string" 72 | } 73 | } 74 | }, 75 | "timelyResponse": { 76 | "type": "boolean" 77 | }, 78 | "zip": { 79 | "type": "string", 80 | "index": "not_analyzed" 81 | } 82 | } 83 | } 84 | }' -------------------------------------------------------------------------------- /ch04/src/main/java/com/packtpub/esh/complaints/ComplaintsMapper.java: -------------------------------------------------------------------------------- 1 | package com.packtpub.esh.complaints; 2 | 3 | import org.apache.commons.csv.CSVFormat; 4 | import org.apache.commons.csv.CSVParser; 5 | import org.apache.commons.csv.CSVRecord; 6 | import org.apache.commons.lang.StringUtils; 7 | import org.apache.hadoop.io.*; 8 | import org.apache.hadoop.mapreduce.Mapper; 9 | 10 | import java.io.IOException; 11 | 12 | public class ComplaintsMapper extends Mapper { 13 | 14 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 15 | MapWritable map = new MapWritable(); 16 | String line = value.toString().trim(); 17 | CSVParser parser = CSVParser.parse(line, CSVFormat.RFC4180); 18 | for (CSVRecord csvRecord : parser) { 19 | String zip = csvRecord.get(0); 20 | Long complaintId = Long.parseLong(csvRecord.get(1)); 21 | String product = csvRecord.get(2); 22 | String subproduct = csvRecord.get(3); 23 | String issue = csvRecord.get(4); 24 | String subissue = csvRecord.get(5); 25 | String state = csvRecord.get(6); 26 | String submittedVia = csvRecord.get(7); 27 | String dateReceived = csvRecord.get(8); 28 | String dateSent = csvRecord.get(9); 29 | String company = csvRecord.get(10); 30 | String companyResponse = csvRecord.get(11); 31 | String timelyResponse = csvRecord.get(12); 32 | String consumerDisputed = csvRecord.get(13); 33 | String latitude = csvRecord.get(14); 34 | String longitude = csvRecord.get(15); 35 | 36 | map.put(new Text("zip"), getWritableValue(zip)); 37 | map.put(new Text("complaintId"), getWritableLongValue(complaintId)); 38 | map.put(new Text("product"), getWritableValue(product)); 39 | map.put(new Text("subproduct"), getWritableValue(subproduct)); 40 | map.put(new Text("issue"), getWritableValue(issue)); 41 | map.put(new Text("issue.raw"), getWritableValue(issue)); 42 | map.put(new Text("subissue"), getWritableValue(subissue)); 43 | map.put(new Text("state"), getWritableValue(state)); 44 | map.put(new Text("submittedVia"), getWritableValue(submittedVia)); 45 | map.put(new Text("dateReceived"), getWritableValue(dateReceived)); 46 | map.put(new Text("dateSent"), getWritableValue(dateSent)); 47 | map.put(new Text("company"), getWritableValue(company)); 48 | map.put(new Text("companyResponse"), getWritableValue(companyResponse)); 49 | map.put(new Text("timelyResponse"), getWritableBooleanValue(timelyResponse)); 50 | map.put(new Text("consumerDisputed"), getWritableBooleanValue(consumerDisputed)); 51 | if("na".equalsIgnoreCase(latitude) || "na".equalsIgnoreCase(longitude)){ 52 | map.put(new Text("location"), NullWritable.get()); 53 | }else{ 54 | map.put(new Text("location"), getWritableValue(latitude+", "+longitude)); 55 | } 56 | 57 | } 58 | 59 | context.write(value, map); 60 | } 61 | 62 | private static WritableComparable getWritableValue(String value) { 63 | return value != null ? new Text(value) : NullWritable.get(); 64 | } 65 | 66 | private Writable getWritableLongValue(Long value) { 67 | return value!=null ? 
new LongWritable(value) : NullWritable.get(); 68 | } 69 | 70 | private Writable getWritableBooleanValue(String value) { 71 | if(StringUtils.isEmpty(value) || "na".equalsIgnoreCase(value)){ 72 | return NullWritable.get(); 73 | } 74 | return "yes".equalsIgnoreCase(value) ? new BooleanWritable(true) : new BooleanWritable(false); 75 | } 76 | 77 | } 78 | -------------------------------------------------------------------------------- /ch04/src/main/java/com/packtpub/esh/complaints/Driver.java: -------------------------------------------------------------------------------- 1 | package com.packtpub.esh.complaints; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.hadoop.mapreduce.Job; 6 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 7 | import org.elasticsearch.hadoop.mr.EsOutputFormat; 8 | 9 | 10 | public class Driver { 11 | 12 | public static void main(String[] args) throws Exception { 13 | Configuration conf = new Configuration(); 14 | // ElasticSearch Server nodes to point to 15 | conf.set("es.nodes", "localhost:9200"); 16 | // ElasticSearch index and type name in {indexName}/{typeName} format 17 | conf.set("es.resource", "esh_complaints/complaints"); 18 | 19 | // Create Job instance 20 | Job job = new Job(conf, "complaints mapper"); 21 | // set Driver class 22 | job.setJarByClass(Driver.class); 23 | job.setMapperClass(ComplaintsMapper.class); 24 | // set OutputFormat to EsOutputFormat provided by ElasticSearch-Hadoop jar 25 | job.setOutputFormatClass(EsOutputFormat.class); 26 | job.setNumReduceTasks(0); 27 | FileInputFormat.addInputPath(job, new Path(args[0])); 28 | 29 | System.exit(job.waitForCompletion(true) ? 0 : 1); 30 | } 31 | 32 | } -------------------------------------------------------------------------------- /ch05/assembly.xml: -------------------------------------------------------------------------------- 1 | 2 | job 3 | 4 | jar 5 | 6 | false 7 | 8 | 9 | true 10 | runtime 11 | 12 | 13 | true 14 | 15 | ${groupId}:${artifactId} 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /ch05/data/percolators.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | curl -XPUT 'http://localhost:9200/es-storm/.percolator/1' -d '{ 4 | "query": { 5 | "match": { 6 | "tweet": "bigdata analytics hadoop spark elasticsearch nosql graphdb cassandra mongo mongodb datascience pig hive solar cloudera hortonworks iot" 7 | } 8 | } 9 | }'; 10 | 11 | 12 | curl -XPUT 'http://localhost:9200/es-storm/.percolator/2' -d '{ 13 | "query": { 14 | "match": { 15 | "tweet": "relational mysql postgres oracle " 16 | } 17 | } 18 | }'; 19 | 20 | 21 | curl -XPUT 'http://localhost:9200/es-storm/.percolator/3' -d '{ 22 | "query": { 23 | "match": { 24 | "tweet": "football socker tennis snooker chess cricket sports" 25 | } 26 | } 27 | } 28 | }'; 29 | 30 | 31 | curl -XPUT 'http://localhost:9200/es-storm/.percolator/4' -d '{ 32 | "query": { 33 | "match": { 34 | "tweet": "agile scrum xp " 35 | } 36 | } 37 | } 38 | }'; 39 | 40 | curl -XPUT 'http://localhost:9200/es-storm/.percolator/5' -d '{ 41 | "query": { 42 | "match": { 43 | "tweet": "business entrepreneur entrepreneurship biz designthinking startup" 44 | } 45 | } 46 | } 47 | }'; 48 | 49 | -------------------------------------------------------------------------------- /ch05/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | 
es-hadoop-book-code 6 | ch05 7 | 0.0.1 8 | jar 9 | 10 | com.hadoop.app 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | junit 20 | junit 21 | 3.8.1 22 | test 23 | 24 | 25 | org.apache.storm 26 | storm-core 27 | 0.9.4 28 | provided 29 | 30 | 31 | org.apache.storm 32 | storm-starter 33 | 0.9.4 34 | provided 35 | 36 | 37 | org.apache.httpcomponents 38 | httpclient 39 | 4.0-alpha4 40 | 41 | 42 | org.twitter4j 43 | twitter4j-core 44 | 3.0.6 45 | 46 | 47 | org.scribe 48 | scribe 49 | 1.3.7 50 | 51 | 52 | org.elasticsearch 53 | elasticsearch 54 | 1.7.0 55 | 56 | 57 | 58 | org.elasticsearch 59 | elasticsearch-hadoop 60 | 2.1.1 61 | 62 | 63 | cascading-hadoop 64 | cascading 65 | 66 | 67 | cascading-local 68 | cascading 69 | 70 | 71 | 72 | 73 | org.twitter4j 74 | twitter4j-stream 75 | 3.0.6 76 | 77 | 78 | org.elasticsearch 79 | elasticsearch-storm 80 | 2.1.1 81 | 82 | 83 | org.apache.kafka 84 | kafka_2.10 85 | 0.8.2-beta 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | maven-assembly-plugin 94 | 2.2.1 95 | 96 | 97 | assembly.xml 98 | 99 | 100 | 101 | com.packtpub.esh.streaming.Topology 102 | 103 | 104 | 105 | 106 | 107 | make-assembly 108 | package 109 | 110 | single 111 | 112 | 113 | 114 | 115 | 116 | org.apache.maven.plugins 117 | maven-compiler-plugin 118 | 3.3 119 | 120 | 1.8 121 | 1.8 122 | 123 | 124 | 125 | 126 | 127 | 128 | sonatype-oss 129 | http://oss.sonatype.org/content/repositories/snapshots 130 | true 131 | 132 | 133 | 134 | -------------------------------------------------------------------------------- /ch05/src/main/java/com/packtpub/esh/streaming/ElasticSearchService.java: -------------------------------------------------------------------------------- 1 | package com.packtpub.esh.streaming; 2 | 3 | 4 | import org.elasticsearch.action.ActionFuture; 5 | import org.elasticsearch.action.percolate.PercolateRequest; 6 | import org.elasticsearch.action.percolate.PercolateResponse; 7 | import org.elasticsearch.client.transport.TransportClient; 8 | import org.elasticsearch.common.settings.ImmutableSettings; 9 | import org.elasticsearch.common.settings.Settings; 10 | import org.elasticsearch.common.transport.InetSocketTransportAddress; 11 | 12 | import java.util.ArrayList; 13 | import java.util.List; 14 | import java.util.Map; 15 | 16 | /** 17 | * Created by vishalshukla on 11/07/15. 
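 * Thin wrapper around the Elasticsearch TransportClient (localhost:9300, cluster "eshadoopcluster")
 * used by TweetsParserBolt to percolate incoming tweet documents against the queries registered
 * in the es-storm index and return the IDs of the matching percolator queries.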
18 | */ 19 | public class ElasticSearchService { 20 | 21 | private static transient TransportClient client; 22 | 23 | static{ 24 | Settings settings = ImmutableSettings.settingsBuilder() 25 | .put("cluster.name", "eshadoopcluster").build(); 26 | client = new TransportClient(settings); 27 | client.addTransportAddress(new InetSocketTransportAddress("localhost", 9300)); 28 | } 29 | 30 | public ElasticSearchService(){ 31 | 32 | } 33 | 34 | public List percolate(Map map){ 35 | List ids = new ArrayList(); 36 | PercolateRequest request = new PercolateRequest(); 37 | request.indices("es-storm"); 38 | request.documentType("storm-tweets"); 39 | ActionFuture responseFuture = client.percolate(request.source(map)); 40 | PercolateResponse response = responseFuture.actionGet(); 41 | PercolateResponse.Match[] matches = response.getMatches(); 42 | for(PercolateResponse.Match match: matches){ 43 | ids.add(match.getId().toString()); 44 | } 45 | return ids; 46 | } 47 | 48 | } -------------------------------------------------------------------------------- /ch05/src/main/java/com/packtpub/esh/streaming/Topology.java: -------------------------------------------------------------------------------- 1 | package com.packtpub.esh.streaming; 2 | 3 | 4 | import backtype.storm.Config; 5 | import backtype.storm.LocalCluster; 6 | import backtype.storm.topology.TopologyBuilder; 7 | import org.elasticsearch.storm.EsBolt; 8 | 9 | import java.util.HashMap; 10 | import java.util.Map; 11 | 12 | 13 | public class Topology { 14 | 15 | public static void main(String[] args) throws InterruptedException { 16 | 17 | TopologyBuilder builder = new TopologyBuilder(); 18 | builder.setSpout("tweets-collector", new TweetsCollectorSpout(),1); 19 | builder.setBolt("tweets-parser-bolt", new TweetsParserBolt()) 20 | .shuffleGrouping("tweets-collector"); 21 | 22 | Map config = new HashMap(); 23 | config.put("es.nodes","localhost:9200"); 24 | config.put("es.storm.bolt.flush.entries.size",100); 25 | builder.setBolt("es-bolt", new EsBolt("es-storm/storm-tweets",config)) 26 | .shuffleGrouping("tweets-parser-bolt") 27 | .addConfiguration(Config.TOPOLOGY_TICK_TUPLE_FREQ_SECS, 2); 28 | 29 | LocalCluster cluster = new LocalCluster(); 30 | cluster.submitTopology("twitter-test", null, builder.createTopology()); 31 | } 32 | } -------------------------------------------------------------------------------- /ch05/src/main/java/com/packtpub/esh/streaming/TweetsCollectorSpout.java: -------------------------------------------------------------------------------- 1 | package com.packtpub.esh.streaming; 2 | 3 | 4 | 5 | import backtype.storm.Config; 6 | import backtype.storm.spout.SpoutOutputCollector; 7 | import backtype.storm.task.TopologyContext; 8 | import backtype.storm.topology.OutputFieldsDeclarer; 9 | import backtype.storm.topology.base.BaseRichSpout; 10 | import backtype.storm.tuple.Fields; 11 | import backtype.storm.tuple.Values; 12 | import backtype.storm.utils.Utils; 13 | import twitter4j.*; 14 | import twitter4j.auth.AccessToken; 15 | import twitter4j.conf.ConfigurationBuilder; 16 | 17 | import java.util.Map; 18 | import java.util.concurrent.LinkedBlockingQueue; 19 | 20 | @SuppressWarnings("serial") 21 | public class TweetsCollectorSpout extends BaseRichSpout { 22 | 23 | SpoutOutputCollector collector; 24 | LinkedBlockingQueue queue = null; 25 | TwitterStream twitterStream; 26 | 27 | // TODO: Initialize twitter credentials. 
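// Replace the placeholder values below with the OAuth consumer key/secret and access
// token/secret of a Twitter application; keyWords may list terms to track, and when it
// is left empty the spout falls back to the public sample stream (see open() below).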
28 | String consumerKey = "<>"; 29 | String consumerSecret = "<>"; 30 | String accessToken = "<>"; 31 | String accessTokenSecret = "<>"; 32 | String[] keyWords = {}; 33 | 34 | public TweetsCollectorSpout() { 35 | } 36 | 37 | @Override 38 | public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) { 39 | queue = new LinkedBlockingQueue(1000); 40 | this.collector = collector; 41 | 42 | StatusListener listener = new StatusListener() { 43 | 44 | public void onStatus(Status status) { 45 | queue.offer(status); 46 | } 47 | 48 | public void onDeletionNotice(StatusDeletionNotice sdn) { 49 | } 50 | 51 | public void onTrackLimitationNotice(int i) { 52 | } 53 | 54 | public void onScrubGeo(long l, long l1) { 55 | } 56 | 57 | public void onException(Exception ex) { 58 | } 59 | 60 | public void onStallWarning(StallWarning arg0) { 61 | 62 | } 63 | 64 | }; 65 | 66 | twitterStream = new TwitterStreamFactory( 67 | new ConfigurationBuilder().setJSONStoreEnabled(true).build()) 68 | .getInstance(); 69 | 70 | twitterStream.addListener(listener); 71 | twitterStream.setOAuthConsumer(consumerKey, consumerSecret); 72 | AccessToken token = new AccessToken(accessToken, accessTokenSecret); 73 | twitterStream.setOAuthAccessToken(token); 74 | 75 | if (keyWords.length == 0) { 76 | twitterStream.sample(); 77 | } 78 | else { 79 | FilterQuery query = new FilterQuery().track(keyWords); 80 | twitterStream.filter(query); 81 | } 82 | 83 | } 84 | 85 | public void nextTuple() { 86 | Status status = queue.poll(); 87 | if (status == null) { 88 | Utils.sleep(50); 89 | } else { 90 | collector.emit(new Values(status)); 91 | } 92 | } 93 | 94 | 95 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 96 | declarer.declare(new Fields("tweet")); 97 | } 98 | 99 | @Override 100 | public Map getComponentConfiguration() { 101 | Config config = new Config(); 102 | config.setMaxTaskParallelism(1); 103 | return config; 104 | } 105 | 106 | @Override 107 | public void close() { 108 | twitterStream.shutdown(); 109 | } 110 | 111 | 112 | @Override 113 | public void ack(Object id) { 114 | } 115 | 116 | @Override 117 | public void fail(Object id) { 118 | } 119 | 120 | 121 | } 122 | -------------------------------------------------------------------------------- /ch05/src/main/java/com/packtpub/esh/streaming/TweetsParserBolt.java: -------------------------------------------------------------------------------- 1 | package com.packtpub.esh.streaming; 2 | 3 | 4 | import backtype.storm.task.OutputCollector; 5 | import backtype.storm.task.TopologyContext; 6 | import backtype.storm.topology.OutputFieldsDeclarer; 7 | import backtype.storm.topology.base.BaseRichBolt; 8 | import backtype.storm.tuple.Fields; 9 | import backtype.storm.tuple.Tuple; 10 | import backtype.storm.tuple.Values; 11 | import twitter4j.HashtagEntity; 12 | import twitter4j.Status; 13 | import twitter4j.UserMentionEntity; 14 | 15 | import java.util.*; 16 | 17 | 18 | public class TweetsParserBolt extends BaseRichBolt { 19 | 20 | private static final long serialVersionUID = 3938843121119464326L; 21 | private OutputCollector collector; 22 | private static transient ElasticSearchService service = new ElasticSearchService(); 23 | 24 | public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) { 25 | this.collector = collector; 26 | } 27 | 28 | @Override 29 | public void execute(Tuple input) { 30 | 31 | String user = null; 32 | String userHandle = null; 33 | 34 | String location = null; 35 | String country = null; 36 | List 
hashtagList = new ArrayList(); 37 | List mentionList = new ArrayList(); 38 | 39 | Status status = (Status) input.getValueByField("tweet"); 40 | 41 | String tweet = status.getText(); 42 | String source = status.getSource(); 43 | Date createdDate = status.getCreatedAt(); 44 | HashtagEntity entities[] = status.getHashtagEntities(); 45 | long retweetCount = status.getRetweetCount(); 46 | long favoriteCount = status.getFavoriteCount(); 47 | UserMentionEntity mentions[] = status.getUserMentionEntities(); 48 | String lang = status.getLang(); 49 | 50 | // Extract hashtags 51 | if (entities != null) { 52 | for (HashtagEntity entity : entities) { 53 | String hashTag = entity.getText(); 54 | hashtagList.add(hashTag); 55 | } 56 | } 57 | 58 | if (status.getPlace() != null) { 59 | if (status.getPlace().getName() != null) { 60 | location = status.getPlace().getName(); 61 | } 62 | if (status.getPlace().getCountry() != null) { 63 | country = status.getPlace().getCountry(); 64 | } 65 | } 66 | 67 | if (status.getUser() != null && status.getUser().getName() != null) { 68 | user = status.getUser().getName(); 69 | userHandle = status.getUser().getScreenName(); 70 | } 71 | 72 | if (mentions != null) { 73 | for (UserMentionEntity mention : mentions) { 74 | String mentionName = mention.getScreenName(); 75 | mentionList.add(mentionName); 76 | } 77 | } 78 | 79 | String strHashtag = hashtagList.toString().replace("[", "").replace("]", ""); 80 | String strUserMention = mentionList.toString().replace("[", "").replace("]", ""); 81 | 82 | if ("en".equalsIgnoreCase(lang)) { 83 | System.out.println("Emitting : " + userHandle + " -> " + tweet); 84 | String categories = classify(tweet); 85 | collector.emit(input, new Values(user, userHandle, tweet, createdDate, location, country, strHashtag, source, lang, retweetCount, favoriteCount, strUserMention, categories)); 86 | } 87 | } 88 | 89 | private String classify(String tweet) { 90 | StringBuilder categoriesBuilder = new StringBuilder(); 91 | 92 | Map main = new HashMap(); 93 | Map doc = new HashMap(); 94 | doc.put("tweet", tweet); 95 | main.put("doc", doc); 96 | List ids = service.percolate(main); 97 | for (String id : ids) { 98 | categoriesBuilder.append(getCategoryName(id) + " "); 99 | } 100 | return categoriesBuilder.toString(); 101 | } 102 | 103 | public String getCategoryName(String id) { 104 | switch (id) { 105 | case "1": 106 | return "BigData"; 107 | case "2": 108 | return "Relational Database"; 109 | case "3": 110 | return "Sports"; 111 | case "4": 112 | return "Agile"; 113 | case "5": 114 | return "Business"; 115 | default: 116 | return "Other"; 117 | } 118 | } 119 | 120 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 121 | declarer.declare(new Fields("user", "userHandle", "tweet", 122 | "time", "location", "country", "hashtags", "source", 123 | "lang", "retweetCount", "favoriteCount", "mentions", "categories")); 124 | } 125 | 126 | } 127 | -------------------------------------------------------------------------------- /ch07-spark/assembly.xml: -------------------------------------------------------------------------------- 1 | 2 | job 3 | 4 | jar 5 | 6 | false 7 | 8 | 9 | true 10 | runtime 11 | 12 | 13 | true 14 | 15 | ${groupId}:${artifactId} 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /ch07-spark/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | es-hadoop-book-code 6 | ch07-spark 7 | 0.0.1 8 | jar 9 | 10 | com.hadoop.app 11 | 
http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | 20 | org.apache.spark 21 | spark-core_2.10 22 | 1.4.0 23 | 24 | 25 | org.apache.spark 26 | spark-sql_2.10 27 | 1.4.1 28 | 29 | 30 | com.google.guava 31 | guava-collections 32 | r03 33 | 34 | 35 | org.elasticsearch 36 | elasticsearch-hadoop 37 | 2.1.1 38 | 39 | 40 | org.apache.commons 41 | commons-csv 42 | 1.1 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | maven-assembly-plugin 51 | 2.5.5 52 | 53 | 54 | 55 | 56 | make-spark-writer-job 57 | 58 | 59 | assembly.xml 60 | 61 | 62 | 63 | com.packtpub.esh.spark.SparkEsWriter 64 | 65 | 66 | ${artifactId}-${version}-spark-writer 67 | 68 | package 69 | 70 | single 71 | 72 | 73 | 74 | make-spark-reader-job 75 | 76 | 77 | assembly.xml 78 | 79 | 80 | 81 | com.packtpub.esh.spark.SparkEsReader 82 | 83 | 84 | ${artifactId}-${version}-spark-reader 85 | 86 | package 87 | 88 | single 89 | 90 | 91 | 92 | make-sparksql-reflection-job 93 | 94 | 95 | assembly.xml 96 | 97 | 98 | 99 | com.packtpub.esh.spark.SparkSQLEsWriterReflection 100 | 101 | 102 | ${artifactId}-${version}-sparksql-reflection 103 | 104 | package 105 | 106 | single 107 | 108 | 109 | 110 | make-sparksql-schema-job 111 | 112 | 113 | assembly.xml 114 | 115 | 116 | 117 | com.packtpub.esh.spark.SparkSQLEsWriterReflection 118 | 119 | 120 | ${artifactId}-${version}-sparksql-schema 121 | 122 | package 123 | 124 | single 125 | 126 | 127 | 128 | make-sparksql-reader-job 129 | 130 | 131 | assembly.xml 132 | 133 | 134 | 135 | com.packtpub.esh.spark.SparkSQLEsReader 136 | 137 | 138 | ${artifactId}-${version}-sparksql-reader 139 | 140 | package 141 | 142 | single 143 | 144 | 145 | 146 | 147 | 148 | org.apache.maven.plugins 149 | maven-compiler-plugin 150 | 3.3 151 | 152 | 1.8 153 | 1.8 154 | 155 | 156 | 157 | 158 | 159 | 160 | -------------------------------------------------------------------------------- /ch07-spark/src/main/java/com/packtpub/esh/spark/Crime.java: -------------------------------------------------------------------------------- 1 | package com.packtpub.esh.spark; 2 | 3 | import java.io.Serializable; 4 | import java.text.ParseException; 5 | import java.text.SimpleDateFormat; 6 | import java.util.Date; 7 | import java.util.Map; 8 | import java.util.Locale; 9 | 10 | /** 11 | * Created by vishalshukla on 16/08/15. 
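 * A plain Serializable JavaBean: SparkEsWriter populates it from each CSV record, and
 * SparkSQLEsWriterReflection relies on these getters/setters for reflection-based
 * schema inference. A small construction sketch (values taken from the sample data,
 * shown only for illustration):
 *
 *   Crime c = new Crime();
 *   c.setId("10178221");
 *   c.setPrimaryType("NARCOTICS");
 *   c.setEventDate("08/02/15 23:58");   // parsed with the MM/dd/yy hh:mm pattern below
 *   c.setArrest(true);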
12 | * 13 | */ 14 | public class Crime implements Serializable { 15 | private String id; 16 | private String caseNumber; 17 | private Long eventDate; 18 | private String block; 19 | private String iucr; 20 | private String primaryType; 21 | private String description; 22 | private String location; 23 | private Boolean arrest; 24 | private Boolean domestic; 25 | private Map geoLocation; 26 | 27 | public String getId() { 28 | return id; 29 | } 30 | 31 | public void setId(String id) { 32 | this.id = id; 33 | } 34 | 35 | public String getCaseNumber() { 36 | return caseNumber; 37 | } 38 | 39 | public void setCaseNumber(String caseNumber) { 40 | this.caseNumber = caseNumber; 41 | } 42 | 43 | public Long getEventDate() { 44 | return eventDate; 45 | } 46 | 47 | public void setEventDate(String eventDate) throws ParseException { 48 | SimpleDateFormat format = new SimpleDateFormat("MM/dd/yy hh:mm",Locale.ENGLISH); 49 | Date date = format.parse(eventDate); 50 | this.eventDate = date.getTime(); 51 | } 52 | 53 | public void setEventDate(Long eventDate) { 54 | this.eventDate = eventDate; 55 | } 56 | 57 | public String getBlock() { 58 | return block; 59 | } 60 | 61 | public void setBlock(String block) { 62 | this.block = block; 63 | } 64 | 65 | public String getIucr() { 66 | return iucr; 67 | } 68 | 69 | public void setIucr(String iucr) { 70 | this.iucr = iucr; 71 | } 72 | 73 | public String getPrimaryType() { 74 | return primaryType; 75 | } 76 | 77 | public void setPrimaryType(String primaryType) { 78 | this.primaryType = primaryType; 79 | } 80 | 81 | public String getDescription() { 82 | return description; 83 | } 84 | 85 | public void setDescription(String description) { 86 | this.description = description; 87 | } 88 | 89 | public String getLocation() { 90 | return location; 91 | } 92 | 93 | public void setLocation(String location) { 94 | this.location = location; 95 | } 96 | 97 | public Boolean getArrest() { 98 | return arrest; 99 | } 100 | 101 | public void setArrest(Boolean arrest) { 102 | this.arrest = arrest; 103 | } 104 | 105 | public Boolean getDomestic() { 106 | return domestic; 107 | } 108 | 109 | public void setDomestic(Boolean domestic) { 110 | this.domestic = domestic; 111 | } 112 | 113 | public Map getGeoLocation() { 114 | return geoLocation; 115 | } 116 | 117 | public void setGeoLocation(Map geoLocation) { 118 | this.geoLocation = geoLocation; 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /ch07-spark/src/main/java/com/packtpub/esh/spark/SparkEsReader.java: -------------------------------------------------------------------------------- 1 | package com.packtpub.esh.spark; 2 | 3 | import org.apache.spark.SparkConf; 4 | import org.apache.spark.api.java.JavaRDD; 5 | import org.apache.spark.api.java.JavaSparkContext; 6 | import org.elasticsearch.spark.rdd.api.java.JavaEsSpark; 7 | 8 | import java.util.Map; 9 | /** 10 | * Created by vishalshukla on 16/08/15. 
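 *
 * Reads back the documents written by SparkEsWriter. JavaEsSpark.esRDD() accepts the
 * ES resource ("esh_spark/crimes") plus an optional query string, so the same reader
 * can be pointed at a different filter; an illustrative alternative (assuming the
 * "location" field indexed by the writer) would be:
 *
 *   "{\"query\" : { \"term\" : { \"location\" : \"street\" } } }"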
11 | */ 12 | public class SparkEsReader { 13 | 14 | public static void main(String args[]){ 15 | SparkConf conf = new SparkConf().setAppName("esh-spark").setMaster("local[4]"); 16 | JavaSparkContext context = new JavaSparkContext(conf); 17 | JavaRDD<Map<String, Object>> esRDD = 18 | JavaEsSpark.esRDD(context, "esh_spark/crimes", "{\"query\" : { \"term\" : { \"primaryType\" : \"theft\" } } }").values(); 19 | 20 | for(Map<String, Object> item: esRDD.collect()){ 21 | System.out.println(item); 22 | } 23 | 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /ch07-spark/src/main/java/com/packtpub/esh/spark/SparkEsWriter.java: -------------------------------------------------------------------------------- 1 | package com.packtpub.esh.spark; 2 | 3 | import org.apache.commons.csv.CSVFormat; 4 | import org.apache.commons.csv.CSVParser; 5 | import org.apache.commons.csv.CSVRecord; 6 | import org.apache.commons.lang.StringUtils; 7 | import org.apache.spark.SparkConf; 8 | import org.apache.spark.api.java.JavaRDD; 9 | import org.apache.spark.api.java.JavaSparkContext; 10 | import org.apache.spark.api.java.function.Function; 11 | import org.elasticsearch.spark.rdd.api.java.JavaEsSpark; 12 | 13 | import java.util.HashMap; 14 | import java.util.Map; 15 | 16 | /** 17 | * Created by vishalshukla on 14/08/15. 18 | */ 19 | public class SparkEsWriter { 20 | public static void main(String[] args) { 21 | 22 | SparkConf conf = new SparkConf().setAppName("esh-spark").setMaster("local[4]"); 23 | conf.set("es.index.auto.create", "true"); 24 | JavaSparkContext context = new JavaSparkContext(conf); 25 | 26 | JavaRDD<String> textFile = context.textFile("hdfs://localhost:9000/ch07/crimes_dataset.csv"); 27 | 28 | JavaRDD<Crime> dataSplits = textFile.map(line -> { 29 | CSVParser parser = CSVParser.parse(line, CSVFormat.RFC4180); 30 | Crime c = new Crime(); 31 | CSVRecord record = parser.getRecords().get(0); 32 | c.setId(record.get(0)); 33 | c.setCaseNumber(record.get(1)); 34 | c.setEventDate(record.get(2)); 35 | c.setBlock(record.get(3)); 36 | c.setIucr(record.get(4)); 37 | c.setPrimaryType(record.get(5)); 38 | c.setDescription(record.get(6)); 39 | c.setLocation(record.get(7)); 40 | c.setArrest(Boolean.parseBoolean(record.get(8))); 41 | c.setDomestic(Boolean.parseBoolean(record.get(9))); 42 | String lat = record.get(10); 43 | String lon = record.get(11); 44 | 45 | Map<String, Double> geoLocation = new HashMap<>(); 46 | geoLocation.put("lat", StringUtils.isEmpty(lat)?null:Double.parseDouble(lat)); 47 | geoLocation.put("lon", StringUtils.isEmpty(lon)?null:Double.parseDouble(lon)); 48 | c.setGeoLocation(geoLocation); 49 | return c; 50 | }); 51 | 52 | JavaEsSpark.saveToEs(dataSplits, "esh_spark/crimes"); 53 | 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /ch07-spark/src/main/java/com/packtpub/esh/spark/SparkSQLEsReader.java: -------------------------------------------------------------------------------- 1 | package com.packtpub.esh.spark; 2 | 3 | import org.apache.spark.SparkConf; 4 | import org.apache.spark.api.java.JavaSparkContext; 5 | import org.apache.spark.sql.DataFrame; 6 | import org.apache.spark.sql.Row; 7 | import org.apache.spark.sql.SQLContext; 8 | 9 | import java.util.HashMap; 10 | import java.util.Map; 11 | 12 | /** 13 | * Created by vishalshukla on 22/08/15.
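 *
 * Loads "esh_sparksql/crimes_reflection" as a DataFrame through the
 * org.elasticsearch.spark.sql data source. With "pushdown" set to "true", the WHERE
 * clause below is expected to be translated into an Elasticsearch query rather than
 * filtered inside Spark. An illustrative extra query against the same temp table
 * (columns assumed from the Crime bean) could be:
 *
 *   DataFrame arrests = sqlContext.sql(
 *       "SELECT primaryType, count(*) FROM crimes WHERE arrest = true GROUP BY primaryType");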
14 | */ 15 | public class SparkSQLEsReader { 16 | public static void main(String args[]) { 17 | SparkConf conf = new SparkConf().setAppName("esh-spark").setMaster("local[4]"); 18 | conf.set("es.index.auto.create", "true"); 19 | JavaSparkContext context = new JavaSparkContext(conf); 20 | 21 | SQLContext sqlContext = new SQLContext(context); 22 | Map options = new HashMap<>(); 23 | options.put("pushdown","true"); 24 | options.put("es.nodes","localhost"); 25 | 26 | DataFrame df = sqlContext.read() 27 | .options(options) 28 | .format("org.elasticsearch.spark.sql").load("esh_sparksql/crimes_reflection"); 29 | 30 | df.registerTempTable("crimes"); 31 | 32 | DataFrame theftCrimes = sqlContext.sql("SELECT * FROM crimes WHERE primaryType='THEFT'"); 33 | for(Row row: theftCrimes.javaRDD().collect()){ 34 | System.out.println(row); 35 | } 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /ch07-spark/src/main/java/com/packtpub/esh/spark/SparkSQLEsWriterReflection.java: -------------------------------------------------------------------------------- 1 | package com.packtpub.esh.spark; 2 | 3 | import org.apache.commons.csv.CSVFormat; 4 | import org.apache.commons.csv.CSVParser; 5 | import org.apache.commons.csv.CSVRecord; 6 | import org.apache.commons.lang.StringUtils; 7 | import org.apache.spark.SparkConf; 8 | import org.apache.spark.api.java.JavaRDD; 9 | import org.apache.spark.api.java.JavaSparkContext; 10 | import org.apache.spark.api.java.function.Function; 11 | import org.apache.spark.sql.DataFrame; 12 | import org.apache.spark.sql.SQLContext; 13 | import org.elasticsearch.spark.sql.api.java.JavaEsSparkSQL; 14 | 15 | import java.util.HashMap; 16 | import java.util.Map; 17 | 18 | /** 19 | * Created by vishalshukla on 22/08/15. 20 | */ 21 | public class SparkSQLEsWriterReflection { 22 | public static void main(String args[]) { 23 | SparkConf conf = new SparkConf().setAppName("esh-spark").setMaster("local[4]"); 24 | conf.set("es.index.auto.create", "true"); 25 | JavaSparkContext context = new JavaSparkContext(conf); 26 | 27 | JavaRDD textFile = context.textFile("hdfs://localhost:9000/ch07/crimes_dataset.csv"); 28 | 29 | JavaRDD dataSplits = textFile.map(line -> { 30 | CSVParser parser = CSVParser.parse(line, CSVFormat.RFC4180); 31 | Crime c = new Crime(); 32 | CSVRecord record = parser.getRecords().get(0); 33 | c.setId(record.get(0)); 34 | c.setCaseNumber(record.get(1)); 35 | c.setEventDate(record.get(2)); 36 | c.setBlock(record.get(3)); 37 | c.setIucr(record.get(4)); 38 | c.setPrimaryType(record.get(5)); 39 | c.setDescription(record.get(6)); 40 | c.setLocation(record.get(7)); 41 | c.setArrest(Boolean.parseBoolean(record.get(8))); 42 | c.setDomestic(Boolean.parseBoolean(record.get(9))); 43 | String lat = record.get(10); 44 | String lon = record.get(11); 45 | Map geoLocation = new HashMap<>(); 46 | geoLocation.put("lat", StringUtils.isEmpty(lat) ? null : Double.parseDouble(lat)); 47 | geoLocation.put("lon", StringUtils.isEmpty(lon) ? 
null : Double.parseDouble(lon)); 48 | c.setGeoLocation(geoLocation); 49 | return c; 50 | } 51 | ); 52 | 53 | SQLContext sqlContext = new SQLContext(context); 54 | DataFrame df = sqlContext.createDataFrame(dataSplits, Crime.class); 55 | 56 | JavaEsSparkSQL.saveToEs(df, "esh_sparksql/crimes_reflection"); 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /ch07-spark/src/main/java/com/packtpub/esh/spark/SparkSQLEsWriterSchema.java: -------------------------------------------------------------------------------- 1 | package com.packtpub.esh.spark; 2 | 3 | import org.apache.commons.csv.CSVFormat; 4 | import org.apache.commons.csv.CSVParser; 5 | import org.apache.commons.csv.CSVRecord; 6 | import org.apache.commons.lang.StringUtils; 7 | import org.apache.spark.SparkConf; 8 | import org.apache.spark.api.java.JavaRDD; 9 | import org.apache.spark.api.java.JavaSparkContext; 10 | import org.apache.spark.api.java.function.Function; 11 | import org.apache.spark.sql.DataFrame; 12 | import org.apache.spark.sql.Row; 13 | import org.apache.spark.sql.RowFactory; 14 | import org.apache.spark.sql.SQLContext; 15 | import org.apache.spark.sql.types.DataTypes; 16 | import org.apache.spark.sql.types.StructField; 17 | import org.apache.spark.sql.types.StructType; 18 | import org.elasticsearch.spark.sql.api.java.JavaEsSparkSQL; 19 | 20 | import java.sql.Timestamp; 21 | import java.text.SimpleDateFormat; 22 | import java.util.ArrayList; 23 | import java.util.Date; 24 | import java.util.List; 25 | import java.util.Locale; 26 | 27 | /** 28 | * Created by vishalshukla on 22/08/15. 29 | */ 30 | public class SparkSQLEsWriterSchema { 31 | public static void main(String args[]) { 32 | SparkConf conf = new SparkConf().setAppName("esh-spark").setMaster("local[4]"); 33 | conf.set("es.index.auto.create", "true"); 34 | JavaSparkContext context = new JavaSparkContext(conf); 35 | 36 | JavaRDD textFile = context.textFile("hdfs://localhost:9000/ch07/crimes_dataset.csv"); 37 | SQLContext sqlContext = new org.apache.spark.sql.SQLContext(context); 38 | 39 | List fields = new ArrayList<>(); 40 | fields.add(DataTypes.createStructField("id", DataTypes.StringType, true)); 41 | fields.add(DataTypes.createStructField("caseNumber", DataTypes.StringType, true)); 42 | fields.add(DataTypes.createStructField("eventDate", DataTypes.TimestampType, true)); 43 | fields.add(DataTypes.createStructField("block", DataTypes.StringType, true)); 44 | fields.add(DataTypes.createStructField("iucr", DataTypes.StringType, true)); 45 | fields.add(DataTypes.createStructField("primaryType", DataTypes.StringType, true)); 46 | fields.add(DataTypes.createStructField("description", DataTypes.StringType, true)); 47 | fields.add(DataTypes.createStructField("location", DataTypes.StringType, true)); 48 | fields.add(DataTypes.createStructField("arrest", DataTypes.BooleanType, true)); 49 | fields.add(DataTypes.createStructField("domestic", DataTypes.BooleanType, true)); 50 | 51 | 52 | List geoFields = new ArrayList<>(); 53 | geoFields.add(DataTypes.createStructField("lat", DataTypes.DoubleType, true)); 54 | geoFields.add(DataTypes.createStructField("lon", DataTypes.DoubleType, true)); 55 | StructType geoLocationSchema = DataTypes.createStructType(geoFields); 56 | fields.add(DataTypes.createStructField("geoLocation", geoLocationSchema, true)); 57 | 58 | StructType schema = DataTypes.createStructType(fields); 59 | 60 | JavaRDD rowRDD = textFile.map(line -> { 61 | CSVParser parser = CSVParser.parse(line, 
CSVFormat.RFC4180); 62 | CSVRecord record = parser.getRecords().get(0); 63 | SimpleDateFormat format = new SimpleDateFormat("MM/dd/yy hh:mm",Locale.ENGLISH); 64 | Date eventDate = format.parse(record.get(2)); 65 | 66 | Row geo = RowFactory.create(StringUtils.isEmpty(record.get(10)) ? null : Double.parseDouble(record.get(10)), 67 | StringUtils.isEmpty(record.get(11)) ? null : Double.parseDouble(record.get(11))); 68 | return RowFactory.create(record.get(0), record.get(1), 69 | new Timestamp(eventDate.getTime()), record.get(3), record.get(4), 70 | record.get(5), record.get(6), record.get(7), 71 | Boolean.parseBoolean(record.get(8)), 72 | Boolean.parseBoolean(record.get(9)), 73 | geo 74 | ); 75 | }); 76 | 77 | DataFrame df = sqlContext.createDataFrame(rowRDD, schema); 78 | df.registerTempTable("crime"); 79 | JavaEsSparkSQL.saveToEs(df, "esh_sparksql/crimes_schema"); 80 | } 81 | 82 | } 83 | -------------------------------------------------------------------------------- /ch07/assembly.xml: -------------------------------------------------------------------------------- 1 | 2 | job 3 | 4 | jar 5 | 6 | false 7 | 8 | 9 | true 10 | runtime 11 | 12 | 13 | true 14 | 15 | ${groupId}:${artifactId} 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /ch07/data/crimes.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": 10178221, 4 | "caseNumber": "HY366678", 5 | "eventDate": "08/02/15 23:58", 6 | "block": "042XX W MADISON ST", 7 | "iucr": 1811, 8 | "primaryType": "NARCOTICS", 9 | "description": "POSS: CANNABIS 30GMS OR LESS", 10 | "location": "SIDEWALK", 11 | "arrest": "TRUE", 12 | "domestic": "FALSE", 13 | "lat": 41.88076873, 14 | "lon": -87.73136165 15 | }, 16 | { 17 | "id": 10178234, 18 | "caseNumber": "HY366669", 19 | "eventDate": "08/02/15 23:55", 20 | "block": "035XX W VAN BUREN ST", 21 | "iucr": 486, 22 | "primaryType": "BATTERY", 23 | "description": "DOMESTIC BATTERY SIMPLE", 24 | "location": "SIDEWALK", 25 | "arrest": "TRUE", 26 | "domestic": "TRUE", 27 | "lat": 41.87530121, 28 | "lon": -87.71414439 29 | }, 30 | { 31 | "id": 10181024, 32 | "caseNumber": "HY367638", 33 | "eventDate": "08/02/15 23:47", 34 | "block": "069XX S WINCHESTER AVE", 35 | "iucr": 2826, 36 | "primaryType": "OTHER OFFENSE", 37 | "description": "HARASSMENT BY ELECTRONIC MEANS", 38 | "location": "OTHER", 39 | "arrest": "FALSE", 40 | "domestic": "FALSE", 41 | "lat": 41.76792578, 42 | "lon": -87.67225279 43 | }, 44 | { 45 | "id": 10178323, 46 | "caseNumber": "HY366697", 47 | "eventDate": "08/02/15 23:45", 48 | "block": "045XX S SAWYER AVE", 49 | "iucr": 486, 50 | "primaryType": "BATTERY", 51 | "description": "DOMESTIC BATTERY SIMPLE", 52 | "location": "RESIDENCE", 53 | "arrest": "FALSE", 54 | "domestic": "TRUE", 55 | "lat": 41.81151147, 56 | "lon": -87.70517785 57 | }, 58 | { 59 | "id": 10178252, 60 | "caseNumber": "HY366660", 61 | "eventDate": "08/02/15 23:45", 62 | "block": "041XX W GRENSHAW ST", 63 | "iucr": 1310, 64 | "primaryType": "CRIMINAL DAMAGE", 65 | "description": "TO PROPERTY", 66 | "location": "APARTMENT", 67 | "arrest": "FALSE", 68 | "domestic": "TRUE", 69 | "lat": 41.86713278, 70 | "lon": -87.72931615 71 | }, 72 | { 73 | "id": 10178286, 74 | "caseNumber": "HY366712", 75 | "eventDate": "08/02/15 23:45", 76 | "block": "043XX N KENMORE AVE", 77 | "iucr": 486, 78 | "primaryType": "BATTERY", 79 | "description": "DOMESTIC BATTERY SIMPLE", 80 | "location": "APARTMENT", 81 | "arrest": "TRUE", 82 | "domestic": "TRUE", 
83 | "lat": 41.9615215, 84 | "lon": -87.65618446 85 | }, 86 | { 87 | "id": 10178357, 88 | "caseNumber": "HY366662", 89 | "eventDate": "08/02/15 23:41", 90 | "block": "025XX W LAWRENCE AVE", 91 | "iucr": 486, 92 | "primaryType": "BATTERY", 93 | "description": "DOMESTIC BATTERY SIMPLE", 94 | "location": "SIDEWALK", 95 | "arrest": "FALSE", 96 | "domestic": "TRUE", 97 | "lat": 41.96870273, 98 | "lon": -87.69147181 99 | }, 100 | { 101 | "id": 10178260, 102 | "caseNumber": "HY366716", 103 | "eventDate": "08/02/15 23:38", 104 | "block": "065XX S SEELEY AVE", 105 | "iucr": 1340, 106 | "primaryType": "CRIMINAL DAMAGE", 107 | "description": "TO STATE SUP PROP", 108 | "location": "SCHOOL, PUBLIC, BUILDING", 109 | "arrest": "FALSE", 110 | "domestic": "FALSE", 111 | "lat": 41.77425422, 112 | "lon": -87.67513531 113 | }, 114 | { 115 | "id": 10178233, 116 | "caseNumber": "HY366661", 117 | "eventDate": "08/02/15 23:32", 118 | "block": "064XX S LOWE AVE", 119 | "iucr": 497, 120 | "primaryType": "BATTERY", 121 | "description": "AGGRAVATED DOMESTIC BATTERY: OTHER DANG WEAPON", 122 | "location": "APARTMENT", 123 | "arrest": "TRUE", 124 | "domestic": "TRUE", 125 | "lat": 41.77720468, 126 | "lon": -87.64065374 127 | }, 128 | { 129 | "id": 10178943, 130 | "caseNumber": "HY366951", 131 | "eventDate": "08/02/15 23:30", 132 | "block": "062XX W ROSCOE ST", 133 | "iucr": 910, 134 | "primaryType": "MOTOR VEHICLE THEFT", 135 | "description": "AUTOMOBILE", 136 | "location": "STREET", 137 | "arrest": "FALSE", 138 | "domestic": "FALSE", 139 | "lat": 41.94216025, 140 | "lon": -87.78166666 141 | }, 142 | { 143 | "id": 10179727, 144 | "caseNumber": "HY367927", 145 | "eventDate": "08/02/15 23:30", 146 | "block": "012XX W GRAND AVE", 147 | "iucr": 497, 148 | "primaryType": "BATTERY", 149 | "description": "AGGRAVATED DOMESTIC BATTERY: OTHER DANG WEAPON", 150 | "location": "APARTMENT", 151 | "arrest": "FALSE", 152 | "domestic": "TRUE", 153 | "lat": 41.89117935, 154 | "lon": -87.65860268 155 | }, 156 | { 157 | "id": 10178262, 158 | "caseNumber": "HY366663", 159 | "eventDate": "08/02/15 23:28", 160 | "block": "047XX S DAMEN AVE", 161 | "iucr": 1811, 162 | "primaryType": "NARCOTICS", 163 | "description": "POSS: CANNABIS 30GMS OR LESS", 164 | "location": "GAS STATION", 165 | "arrest": "TRUE", 166 | "domestic": "FALSE", 167 | "lat": 41.80832553, 168 | "lon": -87.67482701 169 | }, 170 | { 171 | "id": 10178278, 172 | "caseNumber": "HY366671", 173 | "eventDate": "08/02/15 23:20", 174 | "block": "005XX W ROOSEVELT RD", 175 | "iucr": 560, 176 | "primaryType": "ASSAULT", 177 | "description": "SIMPLE", 178 | "location": "CONVENIENCE STORE", 179 | "arrest": "FALSE", 180 | "domestic": "FALSE", 181 | "lat": 41.86714315, 182 | "lon": -87.63936798 183 | }, 184 | { 185 | "id": 10178335, 186 | "caseNumber": "HY366719", 187 | "eventDate": "08/02/15 23:19", 188 | "block": "034XX S LAWNDALE AVE", 189 | "iucr": "143A", 190 | "primaryType": "WEAPONS VIOLATION", 191 | "description": "UNLAWFUL POSS OF HANDGUN", 192 | "location": "STREET", 193 | "arrest": "TRUE", 194 | "domestic": "FALSE", 195 | "lat": 41.82991123, 196 | "lon": -87.71671383 197 | }, 198 | { 199 | "id": 10178241, 200 | "caseNumber": "HY366644", 201 | "eventDate": "08/02/15 23:17", 202 | "block": "0000X W 79TH ST", 203 | "iucr": 1310, 204 | "primaryType": "CRIMINAL DAMAGE", 205 | "description": "TO PROPERTY", 206 | "location": "RESTAURANT", 207 | "arrest": "FALSE", 208 | "domestic": "FALSE", 209 | "lat": 41.75081548, 210 | "lon": -87.62667814 211 | }, 212 | { 213 | "id": 10179681, 214 | 
"caseNumber": "HY367958", 215 | "eventDate": "08/02/15 23:15", 216 | "block": "125XX S WENTWORTH AVE", 217 | "iucr": 1121, 218 | "primaryType": "DECEPTIVE PRACTICE", 219 | "description": "COUNTERFEITING DOCUMENT", 220 | "location": "SIDEWALK", 221 | "arrest": "FALSE", 222 | "domestic": "FALSE", 223 | "lat": 41.66603296, 224 | "lon": -87.62722538 225 | }, 226 | { 227 | "id": 10178299, 228 | "caseNumber": "HY366676", 229 | "eventDate": "08/02/15 23:13", 230 | "block": "077XX S AVALON AVE", 231 | "iucr": 1310, 232 | "primaryType": "CRIMINAL DAMAGE", 233 | "description": "TO PROPERTY", 234 | "location": "RESIDENCE", 235 | "arrest": "FALSE", 236 | "domestic": "FALSE", 237 | "lat": 41.754208, 238 | "lon": -87.59448509 239 | }, 240 | { 241 | "id": 10178224, 242 | "caseNumber": "HY366674", 243 | "eventDate": "08/02/15 23:13", 244 | "block": "077XX S AVALON AVE", 245 | "iucr": 610, 246 | "primaryType": "BURGLARY", 247 | "description": "FORCIBLE ENTRY", 248 | "location": "APARTMENT", 249 | "arrest": "FALSE", 250 | "domestic": "FALSE", 251 | "lat": 41.754208, 252 | "lon": -87.59448509 253 | } 254 | ] -------------------------------------------------------------------------------- /ch07/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | es-hadoop-book-code 6 | ch07 7 | 0.0.1 8 | jar 9 | 10 | com.hadoop.app 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | cascading 20 | cascading-core 21 | 2.6.3 22 | 23 | 24 | cascading 25 | cascading-local 26 | 2.6.3 27 | 28 | 29 | cascading 30 | cascading-hadoop 31 | 2.6.3 32 | 33 | 34 | org.apache.hadoop 35 | hadoop-core 36 | 1.2.1 37 | provided 38 | 39 | 40 | 41 | org.elasticsearch 42 | elasticsearch-hadoop 43 | 2.1.1 44 | 45 | 46 | 47 | 48 | 49 | 50 | maven-assembly-plugin 51 | 2.2.1 52 | 53 | 54 | 55 | 56 | 57 | make-writer-job 58 | 59 | 60 | assembly.xml 61 | 62 | 63 | 64 | com.packtpub.esh.cascading.CascadingEsWriter 65 | 66 | 67 | ${artifactId}-${version}-cascading-writer 68 | 69 | package 70 | 71 | single 72 | 73 | 74 | 75 | 76 | make-reader-job 77 | 78 | 79 | assembly.xml 80 | 81 | 82 | 83 | com.packtpub.esh.cascading.CascadingEsReader 84 | 85 | 86 | ${artifactId}-${version}-cascading-reader 87 | 88 | package 89 | 90 | single 91 | 92 | 93 | 94 | 95 | 96 | org.apache.maven.plugins 97 | maven-compiler-plugin 98 | 3.3 99 | 100 | 1.8 101 | 1.8 102 | 103 | 104 | 105 | 106 | 107 | 108 | conjars 109 | http://conjars.org/repo/ 110 | 111 | 112 | 113 | 114 | -------------------------------------------------------------------------------- /ch07/scripts/es-reader.pig: -------------------------------------------------------------------------------- 1 | REGISTER hdfs://localhost:9000/lib/elasticsearch-hadoop-2.1.1.jar; 2 | 3 | ES = LOAD 'esh_pig/crimes' using org.elasticsearch.hadoop.pig.EsStorage('{"query" : { "term" : { "primaryType" : "theft" } } }'); 4 | 5 | dump ES; -------------------------------------------------------------------------------- /ch07/scripts/es-reader.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS theft_crimes; 2 | 3 | CREATE EXTERNAL TABLE IF NOT EXISTS theft_crimes ( 4 | id STRING, 5 | caseNumber STRING, 6 | eventDate DATE, 7 | block STRING, 8 | iucr STRING, 9 | primaryType STRING, 10 | description STRING, 11 | location STRING, 12 | arrest BOOLEAN, 13 | domestic BOOLEAN, 14 | geoLocation STRUCT) 15 | STORED BY 'org.elasticsearch.hadoop.hive.EsStorageHandler' 16 | TBLPROPERTIES('es.resource' = 
'esh_hive/crimes', 'es.query' = '{"query" : { "term" : { "primarytype" : "theft" } } }'); 17 | 18 | -- stream data from Elasticsearch 19 | SELECT location, count(*) as noOfCrimes FROM theft_crimes group by location; 20 | -------------------------------------------------------------------------------- /ch07/scripts/es-writer.pig: -------------------------------------------------------------------------------- 1 | 2 | REGISTER hdfs://localhost:9000/lib/elasticsearch-hadoop-2.1.1.jar; 3 | 4 | -- Match the reducer parallelism to the number of shards available 5 | SET default_parallel 5; 6 | 7 | -- Disable combining input splits 8 | SET pig.noSplitCombination TRUE; 9 | 10 | -- Load CSV file into SOURCE 11 | SOURCE = load '/ch07/crimes_dataset.csv' using PigStorage(',') as (id:chararray, caseNumber:chararray, 12 | date:datetime, block:chararray, iucr:chararray, primaryType:chararray, description:chararray, 13 | location:chararray, arrest:boolean, domestic:boolean, lat:double,lon:double); 14 | 15 | TARGET = foreach SOURCE generate id, caseNumber, 16 | date, block, iucr, primaryType, description, 17 | location, arrest, domestic, TOTUPLE(lon, lat) AS geoLocation; 18 | 19 | -- Store to ES index 20 | STORE TARGET INTO 'esh_pig/crimes' 21 | USING org.elasticsearch.hadoop.pig.EsStorage('es.http.timeout = 5m', 22 | 'es.index.auto.create = true', 23 | 'es.mapping.names=arrest:isArrest, domestic:isDomestic', 24 | 'es.mapping.id=id'); 25 | 26 | -------------------------------------------------------------------------------- /ch07/scripts/es-writer.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS source; 2 | DROP TABLE IF EXISTS crimes; 3 | 4 | CREATE EXTERNAL TABLE source ( 5 | id STRING, 6 | caseNumber STRING, 7 | eventDate DATE, 8 | block STRING, 9 | iucr STRING, 10 | primaryType STRING, 11 | description STRING, 12 | location STRING, 13 | arrest BOOLEAN, 14 | domestic BOOLEAN, 15 | lat DOUBLE, 16 | lon DOUBLE) 17 | ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' 18 | STORED AS TEXTFILE 19 | LOCATION '/ch07'; 20 | 21 | CREATE EXTERNAL TABLE crimes ( 22 | id STRING, 23 | caseNumber STRING, 24 | eventDate DATE, 25 | block STRING, 26 | iucr STRING, 27 | primaryType STRING, 28 | description STRING, 29 | location STRING, 30 | arrest BOOLEAN, 31 | domestic BOOLEAN, 32 | geoLocation STRUCT) 33 | STORED BY 'org.elasticsearch.hadoop.hive.EsStorageHandler' 34 | TBLPROPERTIES('es.resource' = 'esh_hive/crimes'); 35 | 36 | -- insert data to Elasticsearch from another table called 'source' 37 | INSERT OVERWRITE TABLE crimes 38 | SELECT s.id, s.caseNumber, s.eventDate, s.block, s.iucr, s.primaryType, s.description, s.location, s.arrest, s.domestic, named_struct('lat', cast(s.lat AS DOUBLE), 'lon', cast(s.lon AS DOUBLE)) 39 | FROM source s; 40 | 41 | 42 | -------------------------------------------------------------------------------- /ch07/scripts/lingual-cleanup.sh: -------------------------------------------------------------------------------- 1 | # Execute this only if you have already created schema and you want to start fresh again with lingual 2 | lingual catalog --schema esh --remove; 3 | lingual catalog --provider es --remove; -------------------------------------------------------------------------------- /ch07/scripts/lingual-writer.sh: -------------------------------------------------------------------------------- 1 | export LINGUAL_PLATFORM=hadoop 2 | # register {es} as a provider 3 | lingual catalog --init 4 | lingual catalog 
--provider --add /opt/lib/elasticsearch-hadoop-2.1.1.jar 5 | # add a custom schema (called 'titles') for querying 6 | 7 | lingual catalog --schema esh --add 8 | lingual catalog --schema esh --stereotype crimes --add \ 9 | --columns id,caseNumber,eventDate,block,iucr,primaryType,description,location,arrest,domestic,lat,lon --types string,string,string,string,string,string,string,string,string,string,string,string 10 | 11 | lingual catalog --schema esh --format es --add --provider es 12 | lingual catalog --schema esh --protocol es --add --provider es \ 13 | --properties=host=localhost 14 | lingual catalog --schema esh --table crimes --stereotype crimes \ 15 | -add esh_cascading/crimes --format es --provider es --protocol es -------------------------------------------------------------------------------- /ch07/src/main/java/com/packtpub/esh/cascading/CascadingEsReader.java: -------------------------------------------------------------------------------- 1 | package com.packtpub.esh.cascading; 2 | 3 | 4 | import cascading.flow.FlowConnector; 5 | import cascading.flow.local.LocalFlowConnector; 6 | import cascading.pipe.Pipe; 7 | import cascading.tap.Tap; 8 | import cascading.tap.local.StdOutTap; 9 | import org.elasticsearch.hadoop.cascading.EsTap; 10 | 11 | import java.util.Properties; 12 | 13 | 14 | public class CascadingEsReader { 15 | public static void main(String[] args) throws InterruptedException { 16 | 17 | Tap in = new EsTap("localhost",9200, "esh_cascading/crimes", 18 | "{\"query\" : { \"term\" : { \"primaryType\" : \"theft\" } } }"); 19 | 20 | Properties props = new Properties(); 21 | props.setProperty("es.nodes","localhost"); 22 | Tap out = new StdOutTap(new cascading.scheme.local.TextLine()); 23 | FlowConnector flow = new LocalFlowConnector(); 24 | 25 | Pipe fromEs = new Pipe("search-from-es"); 26 | flow.connect(in, out, fromEs).complete(); 27 | } 28 | } -------------------------------------------------------------------------------- /ch07/src/main/java/com/packtpub/esh/cascading/CascadingEsWriter.java: -------------------------------------------------------------------------------- 1 | package com.packtpub.esh.cascading; 2 | 3 | 4 | import cascading.flow.FlowConnector; 5 | import cascading.flow.hadoop.HadoopFlowConnector; 6 | import cascading.operation.expression.ExpressionFunction; 7 | import cascading.pipe.Each; 8 | import cascading.pipe.Pipe; 9 | import cascading.scheme.hadoop.TextDelimited; 10 | import cascading.tap.Tap; 11 | import cascading.tap.hadoop.Hfs; 12 | import cascading.tuple.Fields; 13 | import org.elasticsearch.hadoop.cascading.EsTap; 14 | 15 | import java.util.Properties; 16 | 17 | 18 | public class CascadingEsWriter { 19 | public static void main(String[] args) throws InterruptedException { 20 | Properties props = new Properties(); 21 | props.setProperty("es.mapping.id", "id"); 22 | FlowConnector flow = new HadoopFlowConnector(props); 23 | 24 | Fields inFields = new Fields("id", "caseNumber", "eventDate", "block", 25 | "iucr", "primaryType", "description","location","arrest","domestic", 26 | "lat", "lon" 27 | ); 28 | TextDelimited scheme = new TextDelimited(inFields, false, ",","\""); 29 | Tap in = new Hfs(scheme, "/ch07/crimes_dataset.csv"); 30 | 31 | String expression = "lat + \", \" + lon"; 32 | Fields location = new Fields( "geoLocation" ); 33 | ExpressionFunction locationFunction = new ExpressionFunction( location, expression, String.class ); 34 | 35 | Pipe toEs = new Pipe("to-Es"); 36 | toEs = new Each(toEs, locationFunction,Fields.ALL); 37 | 38 | Fields 
outFields = new Fields("id", "caseNumber", "eventDate", "block", 39 | "iucr", "primaryType", "description","location","arrest","domestic", 40 | "geoLocation" 41 | ); 42 | Tap out = new EsTap("localhost",9200, "esh_cascading/crimes", outFields); 43 | 44 | 45 | flow.connect(in, out, toEs).complete(); 46 | } 47 | } --------------------------------------------------------------------------------
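TweetsParserBolt.getCategoryName() maps percolator query ids "1" through "5" to category names, so queries with those ids need to exist in the es-storm index (the repository ships ch05/data/percolators.sh for this purpose). As an illustration only — the query body below is an assumption, not the book's original — one such query could also be registered from Java, reusing the same TransportClient settings as ElasticSearchService:

```java
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.InetSocketTransportAddress;

public class RegisterPercolatorQuery {
    public static void main(String[] args) {
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("cluster.name", "eshadoopcluster").build();
        TransportClient client = new TransportClient(settings);
        client.addTransportAddress(new InetSocketTransportAddress("localhost", 9300));

        // Percolator queries in Elasticsearch 1.x are plain documents indexed under the
        // reserved ".percolator" type; id "1" is the one TweetsParserBolt reports as "BigData".
        // The match query below is illustrative; the book's actual category queries may differ.
        String querySource = "{ \"query\" : { \"match\" : { \"tweet\" : \"hadoop elasticsearch bigdata\" } } }";
        client.prepareIndex("es-storm", ".percolator", "1")
                .setSource(querySource)
                .execute().actionGet();

        client.close();
    }
}
```

Once a query document indexed this way exists, the percolate() call in ElasticSearchService returns "1" for tweets matching it, and the bolt emits "BigData" in the categories field.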