├── .gitignore
├── README.md
├── ch01
├── assembly.xml
├── data
│ └── sample.txt
├── pom.xml
└── src
│ └── main
│ └── java
│ └── com
│ └── packtpub
│ └── esh
│ ├── Driver.java
│ ├── WordsMapper.java
│ └── WordsReducer.java
├── ch02
├── assembly.xml
├── data
│ ├── network-logs.txt
│ └── tweets.csv
├── pom.xml
└── src
│ └── main
│ └── java
│ └── com
│ └── packtpub
│ └── esh
│ ├── nwlogs
│ ├── DomainUtil.java
│ ├── Driver.java
│ └── NetworkLogsMapper.java
│ ├── tweets2es
│ ├── Driver.java
│ └── Tweets2EsMapper.java
│ └── tweets2hdfs
│ ├── Driver.java
│ └── Tweets2HdfsMapper.java
├── ch03
├── data
│ └── setup-hrms.sh
└── exercise
│ └── avg-salary-by-city-request.sh
├── ch04
├── assembly.xml
├── data
│ └── consumer_complaints.csv
├── pom.xml
├── setup
│ ├── complaints-dashboard.json
│ └── setup-mappings.sh
└── src
│ └── main
│ └── java
│ └── com
│ └── packtpub
│ └── esh
│ └── complaints
│ ├── ComplaintsMapper.java
│ └── Driver.java
├── ch05
├── assembly.xml
├── data
│ └── percolators.sh
├── pom.xml
└── src
│ └── main
│ └── java
│ └── com
│ └── packtpub
│ └── esh
│ └── streaming
│ ├── ElasticSearchService.java
│ ├── Topology.java
│ ├── TweetsCollectorSpout.java
│ └── TweetsParserBolt.java
├── ch07-spark
├── assembly.xml
├── pom.xml
└── src
│ └── main
│ └── java
│ └── com
│ └── packtpub
│ └── esh
│ └── spark
│ ├── Crime.java
│ ├── SparkEsReader.java
│ ├── SparkEsWriter.java
│ ├── SparkSQLEsReader.java
│ ├── SparkSQLEsWriterReflection.java
│ └── SparkSQLEsWriterSchema.java
└── ch07
├── assembly.xml
├── data
├── crimes.json
└── crimes_dataset.csv
├── pom.xml
├── scripts
├── es-reader.pig
├── es-reader.sql
├── es-writer.pig
├── es-writer.sql
├── lingual-cleanup.sh
└── lingual-writer.sh
└── src
└── main
└── java
└── com
└── packtpub
└── esh
└── cascading
├── CascadingEsReader.java
└── CascadingEsWriter.java
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | *.iml
3 | .classpath
4 | .project
5 | *.log
6 | **/target
7 | **/.settings
8 | **/.idea
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ElasticSearch for Hadoop Book Source Code
2 |
3 | ### Check Prerequisites
4 | - JDK 1.8
5 |
6 | ```sh
7 | $ java -version
8 | ```
9 | - Maven
10 |
11 | ```sh
12 | $ mvn -version
13 | ```
14 | - Make sure HDFS and YARN are running
15 |
16 | ```sh
17 | $ jps
18 |
19 | 13386 SecondaryNameNode
20 | 13059 NameNode
21 | 13179 DataNode
22 | 13649 NodeManager
23 | 13528 ResourceManager
24 | ```
25 | - Make sure Elasticsearch 1.7+ is up and running at 9200
26 | ```sh
27 | $ curl -XGET http://localhost:9200
28 |
29 | {
30 | "status" : 200,
31 | "name" : "ES Hadoop Node",
32 | "cluster_name" : "eshadoopcluster",
33 | "version" : {
34 | "number" : "1.7.2",
35 | "build_hash" : "e43676b1385b8125d647f593f7202acbd816e8ec",
36 | "build_timestamp" : "2015-09-14T09:49:53Z",
37 | "build_snapshot" : false,
38 | "lucene_version" : "4.10.4"
39 | },
40 | "tagline" : "You Know, for Search"
41 | }
42 |
43 | ```
44 |
45 | ### Build
 46 | - Open a terminal and switch to the chapter directory you want to build
47 | - Execute
48 | ```sh
49 | $ cd ch01
50 | $ mvn clean package
51 | ```
 52 | - Verify that a file matching the xxx-job.jar pattern is generated
53 | ```sh
54 | $ ls target
55 | ```
56 |
--------------------------------------------------------------------------------
/ch01/assembly.xml:
--------------------------------------------------------------------------------
1 |
2 | job
3 |
4 | jar
5 |
6 | false
7 |
8 |
9 | false
10 | runtime
11 | lib
12 |
13 | ${groupId}:${artifactId}
14 |
15 |
16 |
17 | true
18 |
19 | ${groupId}:${artifactId}
20 |
21 |
22 |
23 |
--------------------------------------------------------------------------------
/ch01/data/sample.txt:
--------------------------------------------------------------------------------
1 | The key features of Elasticsearch for Apache Hadoop include:
2 |
3 | Scalable Map/Reduce model
4 | elasticsearch-hadoop is built around Map/Reduce: every operation done in elasticsearch-hadoop results in multiple Hadoop tasks (based on the number of target shards) that interact, in parallel with Elasticsearch.
5 | REST based
6 | elasticsearch-hadoop uses Elasticsearch REST interface for communication, allowing for flexible deployments by minimizing the number of ports needed to be open within a network.
7 | Self contained
8 | the library has been designed to be small and efficient. At around 300KB and no extra dependencies outside Hadoop itself, distributing elasticsearch-hadoop within your cluster is simple and fast.
9 | Universal jar
10 | whether you are using Hadoop 1.x or Hadoop 2.x, vanilla Apache Hadoop or a certain distro, the same elasticsearch-hadoop jar works transparently across all of them.
11 | Memory and I/O efficient
 12 | elasticsearch-hadoop is focused on performance. From pull-based parsing, to bulk updates and direct conversion to/from native types, elasticsearch-hadoop keeps its memory and network I/O usage finely-tuned.
13 | Adaptive I/O
 14 | elasticsearch-hadoop detects transport errors and retries automatically. If an Elasticsearch node dies, the request is re-routed to the available nodes (which are discovered automatically). Additionally, if Elasticsearch is overloaded, elasticsearch-hadoop detects the rejected data and resends it, until it is either processed or the user-defined policy applies.
15 | Facilitates data co-location
16 | elasticsearch-hadoop fully integrates with Hadoop exposing its network access information, allowing co-located Elasticsearch and Hadoop clusters to be aware of each other and reduce network IO.
17 | Map/Reduce API support
18 | At its core, elasticsearch-hadoop uses the low-level Map/Reduce API to read and write data to Elasticsearch allowing for maximum integration flexibility and performance.
19 | old(mapred) & new(mapreduce) Map/Reduce APIs supported
20 | elasticsearch-hadoop automatically adjusts to your environment; one does not have to change between using the mapred or mapreduce APIs - both are supported, by the same classes, at the same time.
21 | Hive support
 22 | Run Hive queries against Elasticsearch for advanced analytics and real-time responses. elasticsearch-hadoop exposes Elasticsearch as a Hive table so your scripts can crunch through data faster than ever.
23 | Pig support
24 | elasticsearch-hadoop supports Apache Pig exposing Elasticsearch as a native Pig Storage. Run your Pig scripts against Elasticsearch without any modifications to your configuration or the Pig client.
25 | Cascading support
26 | Cascading is an application framework for Java developers to simply develop robust applications on Apache Hadoop. And with elasticsearch-hadoop, Cascading can run its flows directly onto Elasticsearch.
27 |
--------------------------------------------------------------------------------
/ch01/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 |
5 | es-hadoop-book-code
6 | ch01
7 | 0.0.1
8 | jar
9 |
10 | com.hadoop.app
11 | http://maven.apache.org
12 |
13 |
14 | UTF-8
15 |
16 |
17 |
18 |
19 | junit
20 | junit
21 | 3.8.1
22 | test
23 |
24 |
25 | org.apache.hadoop
26 | hadoop-core
27 | 1.2.1
28 | provided
29 |
30 |
31 | org.apache.hadoop
32 | hadoop-hdfs
33 | 2.6.0
34 |
35 |
36 | org.elasticsearch
37 | elasticsearch-hadoop
38 | 2.1.0
39 |
40 |
41 | cascading-hadoop
42 | cascading
43 |
44 |
45 | cascading-local
46 | cascading
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 | maven-assembly-plugin
56 | 2.2.1
57 |
58 |
59 | assembly.xml
60 |
61 |
62 |
63 | com.packtpub.esh.Driver
64 |
65 |
66 |
67 |
68 |
69 | make-assembly
70 | package
71 |
72 | single
73 |
74 |
75 |
76 |
77 |
78 | org.apache.maven.plugins
79 | maven-compiler-plugin
80 | 3.3
81 |
82 | 1.8
83 | 1.8
84 |
85 |
86 |
87 |
88 |
89 |
--------------------------------------------------------------------------------
/ch01/src/main/java/com/packtpub/esh/Driver.java:
--------------------------------------------------------------------------------
1 | package com.packtpub.esh;
2 |
3 | import org.apache.hadoop.conf.Configuration;
4 | import org.apache.hadoop.fs.Path;
5 | import org.apache.hadoop.io.IntWritable;
6 | import org.apache.hadoop.io.Text;
7 | import org.apache.hadoop.mapreduce.Job;
8 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
9 | import org.elasticsearch.hadoop.mr.EsOutputFormat;
10 |
11 |
12 | public class Driver {
13 |
14 | public static void main(String[] args) throws Exception {
15 | Configuration conf = new Configuration();
16 | // ElasticSearch Server nodes to point to
17 | conf.set("es.nodes", "localhost:9200");
18 | // ElasticSearch index and type name in {indexName}/{typeName} format
19 | conf.set("es.resource", "eshadoop/wordcount");
20 |
21 | // Create Job instance
22 | Job job = new Job(conf, "word count");
23 | // set Driver class
24 | job.setJarByClass(Driver.class);
25 | job.setMapperClass(WordsMapper.class);
26 | job.setReducerClass(WordsReducer.class);
27 | job.setOutputKeyClass(Text.class);
28 | job.setOutputValueClass(IntWritable.class);
29 | // set OutputFormat to EsOutputFormat provided by ElasticSearch-Hadoop jar
30 | job.setOutputFormatClass(EsOutputFormat.class);
31 |
32 | FileInputFormat.addInputPath(job, new Path(args[0]));
33 |
34 | System.exit(job.waitForCompletion(true) ? 0 : 1);
35 | }
36 |
37 | }
38 |
--------------------------------------------------------------------------------
/ch01/src/main/java/com/packtpub/esh/WordsMapper.java:
--------------------------------------------------------------------------------
1 | package com.packtpub.esh;
2 |
3 | import org.apache.hadoop.io.IntWritable;
4 | import org.apache.hadoop.io.Text;
5 | import org.apache.hadoop.mapreduce.Mapper;
6 |
7 | import java.io.IOException;
8 | import java.util.StringTokenizer;
9 |
10 | public class WordsMapper extends Mapper