├── .gitignore
├── README.md
├── ch01
├── assembly.xml
├── data
│ └── sample.txt
├── pom.xml
└── src
│ └── main
│ └── java
│ └── com
│ └── packtpub
│ └── esh
│ ├── Driver.java
│ ├── WordsMapper.java
│ └── WordsReducer.java
├── ch02
├── assembly.xml
├── data
│ ├── network-logs.txt
│ └── tweets.csv
├── pom.xml
└── src
│ └── main
│ └── java
│ └── com
│ └── packtpub
│ └── esh
│ ├── nwlogs
│ ├── DomainUtil.java
│ ├── Driver.java
│ └── NetworkLogsMapper.java
│ ├── tweets2es
│ ├── Driver.java
│ └── Tweets2EsMapper.java
│ └── tweets2hdfs
│ ├── Driver.java
│ └── Tweets2HdfsMapper.java
├── ch03
├── data
│ └── setup-hrms.sh
└── exercise
│ └── avg-salary-by-city-request.sh
├── ch04
├── assembly.xml
├── data
│ └── consumer_complaints.csv
├── pom.xml
├── setup
│ ├── complaints-dashboard.json
│ └── setup-mappings.sh
└── src
│ └── main
│ └── java
│ └── com
│ └── packtpub
│ └── esh
│ └── complaints
│ ├── ComplaintsMapper.java
│ └── Driver.java
├── ch05
├── assembly.xml
├── data
│ └── percolators.sh
├── pom.xml
└── src
│ └── main
│ └── java
│ └── com
│ └── packtpub
│ └── esh
│ └── streaming
│ ├── ElasticSearchService.java
│ ├── Topology.java
│ ├── TweetsCollectorSpout.java
│ └── TweetsParserBolt.java
├── ch07-spark
├── assembly.xml
├── pom.xml
└── src
│ └── main
│ └── java
│ └── com
│ └── packtpub
│ └── esh
│ └── spark
│ ├── Crime.java
│ ├── SparkEsReader.java
│ ├── SparkEsWriter.java
│ ├── SparkSQLEsReader.java
│ ├── SparkSQLEsWriterReflection.java
│ └── SparkSQLEsWriterSchema.java
└── ch07
├── assembly.xml
├── data
├── crimes.json
└── crimes_dataset.csv
├── pom.xml
├── scripts
├── es-reader.pig
├── es-reader.sql
├── es-writer.pig
├── es-writer.sql
├── lingual-cleanup.sh
└── lingual-writer.sh
└── src
└── main
└── java
└── com
└── packtpub
└── esh
└── cascading
├── CascadingEsReader.java
└── CascadingEsWriter.java
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | *.iml
3 | .classpath
4 | .project
5 | *.log
6 | **/target
7 | **/.settings
8 | **/.idea
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ElasticSearch for Hadoop Book Source Code
2 |
3 | ### Check Prerequisites
4 | - JDK 1.8
5 |
6 | ```sh
7 | $ java -version
8 | ```
9 | - Maven
10 |
11 | ```sh
12 | $ mvn -version
13 | ```
14 | - Make sure HDFS and YARN are running
15 |
16 | ```sh
17 | $ jps
18 |
19 | 13386 SecondaryNameNode
20 | 13059 NameNode
21 | 13179 DataNode
22 | 13649 NodeManager
23 | 13528 ResourceManager
24 | ```
25 | - Make sure Elasticsearch 1.7+ is up and running at 9200
26 | ```sh
27 | $ curl -XGET http://localhost:9200
28 |
29 | {
30 | "status" : 200,
31 | "name" : "ES Hadoop Node",
32 | "cluster_name" : "eshadoopcluster",
33 | "version" : {
34 | "number" : "1.7.2",
35 | "build_hash" : "e43676b1385b8125d647f593f7202acbd816e8ec",
36 | "build_timestamp" : "2015-09-14T09:49:53Z",
37 | "build_snapshot" : false,
38 | "lucene_version" : "4.10.4"
39 | },
40 | "tagline" : "You Know, for Search"
41 | }
42 |
43 | ```
44 |
45 | ### Build
 46 | - Open a terminal and switch to the chapter directory you want to build
47 | - Execute
48 | ```sh
49 | $ cd ch01
50 | $ mvn clean package
51 | ```
 52 | - Verify that a file matching the xxx-job.jar pattern is generated
53 | ```sh
54 | $ ls target
55 | ```
56 |
--------------------------------------------------------------------------------
/ch01/assembly.xml:
--------------------------------------------------------------------------------
1 |
2 | job
3 |
4 | jar
5 |
6 | false
7 |
8 |
9 | false
10 | runtime
11 | lib
12 |
13 | ${groupId}:${artifactId}
14 |
15 |
16 |
17 | true
18 |
19 | ${groupId}:${artifactId}
20 |
21 |
22 |
23 |
--------------------------------------------------------------------------------
/ch01/data/sample.txt:
--------------------------------------------------------------------------------
1 | The key features of Elasticsearch for Apache Hadoop include:
2 |
3 | Scalable Map/Reduce model
4 | elasticsearch-hadoop is built around Map/Reduce: every operation done in elasticsearch-hadoop results in multiple Hadoop tasks (based on the number of target shards) that interact, in parallel with Elasticsearch.
5 | REST based
6 | elasticsearch-hadoop uses Elasticsearch REST interface for communication, allowing for flexible deployments by minimizing the number of ports needed to be open within a network.
7 | Self contained
8 | the library has been designed to be small and efficient. At around 300KB and no extra dependencies outside Hadoop itself, distributing elasticsearch-hadoop within your cluster is simple and fast.
9 | Universal jar
10 | whether you are using Hadoop 1.x or Hadoop 2.x, vanilla Apache Hadoop or a certain distro, the same elasticsearch-hadoop jar works transparently across all of them.
11 | Memory and I/O efficient
 12 | elasticsearch-hadoop is focused on performance. From pull-based parsing, to bulk updates and direct conversion to/from native types, elasticsearch-hadoop keeps its memory and network I/O usage finely-tuned.
13 | Adaptive I/O
 14 | elasticsearch-hadoop detects transport errors and retries automatically. If an Elasticsearch node dies, the request is re-routed to the available nodes (which are discovered automatically). Additionally, if Elasticsearch is overloaded, elasticsearch-hadoop detects the rejected data and resends it, until it is either processed or the user-defined policy applies.
15 | Facilitates data co-location
16 | elasticsearch-hadoop fully integrates with Hadoop exposing its network access information, allowing co-located Elasticsearch and Hadoop clusters to be aware of each other and reduce network IO.
17 | Map/Reduce API support
18 | At its core, elasticsearch-hadoop uses the low-level Map/Reduce API to read and write data to Elasticsearch allowing for maximum integration flexibility and performance.
19 | old(mapred) & new(mapreduce) Map/Reduce APIs supported
20 | elasticsearch-hadoop automatically adjusts to your environment; one does not have to change between using the mapred or mapreduce APIs - both are supported, by the same classes, at the same time.
21 | Hive support
 22 | Run Hive queries against Elasticsearch for advanced analytics and real-time responses. elasticsearch-hadoop exposes Elasticsearch as a Hive table so your scripts can crunch through data faster than ever.
23 | Pig support
24 | elasticsearch-hadoop supports Apache Pig exposing Elasticsearch as a native Pig Storage. Run your Pig scripts against Elasticsearch without any modifications to your configuration or the Pig client.
25 | Cascading support
26 | Cascading is an application framework for Java developers to simply develop robust applications on Apache Hadoop. And with elasticsearch-hadoop, Cascading can run its flows directly onto Elasticsearch.
27 |
--------------------------------------------------------------------------------
/ch01/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 |
5 | es-hadoop-book-code
6 | ch01
7 | 0.0.1
8 | jar
9 |
10 | com.hadoop.app
11 | http://maven.apache.org
12 |
13 |
14 | UTF-8
15 |
16 |
17 |
18 |
19 | junit
20 | junit
21 | 3.8.1
22 | test
23 |
24 |
25 | org.apache.hadoop
26 | hadoop-core
27 | 1.2.1
28 | provided
29 |
30 |
31 | org.apache.hadoop
32 | hadoop-hdfs
33 | 2.6.0
34 |
35 |
36 | org.elasticsearch
37 | elasticsearch-hadoop
38 | 2.1.0
39 |
40 |
41 | cascading-hadoop
42 | cascading
43 |
44 |
45 | cascading-local
46 | cascading
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 | maven-assembly-plugin
56 | 2.2.1
57 |
58 |
59 | assembly.xml
60 |
61 |
62 |
63 | com.packtpub.esh.Driver
64 |
65 |
66 |
67 |
68 |
69 | make-assembly
70 | package
71 |
72 | single
73 |
74 |
75 |
76 |
77 |
78 | org.apache.maven.plugins
79 | maven-compiler-plugin
80 | 3.3
81 |
82 | 1.8
83 | 1.8
84 |
85 |
86 |
87 |
88 |
89 |
--------------------------------------------------------------------------------
/ch01/src/main/java/com/packtpub/esh/Driver.java:
--------------------------------------------------------------------------------
1 | package com.packtpub.esh;
2 |
3 | import org.apache.hadoop.conf.Configuration;
4 | import org.apache.hadoop.fs.Path;
5 | import org.apache.hadoop.io.IntWritable;
6 | import org.apache.hadoop.io.Text;
7 | import org.apache.hadoop.mapreduce.Job;
8 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
9 | import org.elasticsearch.hadoop.mr.EsOutputFormat;
10 |
11 |
12 | public class Driver {
13 |
14 | public static void main(String[] args) throws Exception {
15 | Configuration conf = new Configuration();
16 | // ElasticSearch Server nodes to point to
17 | conf.set("es.nodes", "localhost:9200");
18 | // ElasticSearch index and type name in {indexName}/{typeName} format
19 | conf.set("es.resource", "eshadoop/wordcount");
20 |
21 | // Create Job instance
22 | Job job = new Job(conf, "word count");
23 | // set Driver class
24 | job.setJarByClass(Driver.class);
25 | job.setMapperClass(WordsMapper.class);
26 | job.setReducerClass(WordsReducer.class);
27 | job.setOutputKeyClass(Text.class);
28 | job.setOutputValueClass(IntWritable.class);
29 | // set OutputFormat to EsOutputFormat provided by ElasticSearch-Hadoop jar
30 | job.setOutputFormatClass(EsOutputFormat.class);
31 |
32 | FileInputFormat.addInputPath(job, new Path(args[0]));
33 |
34 | System.exit(job.waitForCompletion(true) ? 0 : 1);
35 | }
36 |
37 | }
38 |
--------------------------------------------------------------------------------
/ch01/src/main/java/com/packtpub/esh/WordsMapper.java:
--------------------------------------------------------------------------------
1 | package com.packtpub.esh;
2 |
3 | import org.apache.hadoop.io.IntWritable;
4 | import org.apache.hadoop.io.Text;
5 | import org.apache.hadoop.mapreduce.Mapper;
6 |
7 | import java.io.IOException;
8 | import java.util.StringTokenizer;
9 |
10 | public class WordsMapper extends Mapper