├── LICENSE.md
├── README.md
├── pom.xml
└── src
    ├── main
        ├── java
        │   └── com
        │   │   └── inin
        │   │       └── analytics
        │   │           └── elasticsearch
        │   │               ├── BaseESMapper.java
        │   │               ├── BaseESReducer.java
        │   │               ├── ConfigParams.java
        │   │               ├── ESEmbededContainer.java
        │   │               ├── IndexingPostProcessor.java
        │   │               ├── ShardConfig.java
        │   │               ├── driver
        │   │                   └── Driver.java
        │   │               ├── example
        │   │                   ├── ExampleIndexingJob.java
        │   │                   ├── ExampleIndexingReducerImpl.java
        │   │                   ├── ExampleJobPrep.java
        │   │                   └── GenerateData.java
        │   │               ├── index
        │   │                   ├── rotation
        │   │                   │   ├── ElasticSearchIndexMetadata.java
        │   │                   │   ├── ElasticsearchIndexRotationManager.java
        │   │                   │   ├── ElasticsearchIndexRotationManagerNoop.java
        │   │                   │   ├── ElasticsearchIndexRotationManagerZookeeper.java
        │   │                   │   ├── ExampleElasticsearchIndexRotationStrategyZookeeper.java
        │   │                   │   └── RebuildPipelineState.java
        │   │                   ├── routing
        │   │                   │   ├── ElasticsearchRoutingStrategy.java
        │   │                   │   └── ElasticsearchRoutingStrategyV1.java
        │   │                   └── selector
        │   │                   │   ├── RealtimeIndexSelectionStrategy.java
        │   │                   │   └── RealtimeIndexSelectionStrategyLagged.java
        │   │               ├── transport
        │   │                   ├── BaseTransport.java
        │   │                   ├── HDFSSnapshotTransport.java
        │   │                   ├── LocalFSSnapshotTransport.java
        │   │                   ├── S3SnapshotTransport.java
        │   │                   └── SnapshotTransportStrategy.java
        │   │               └── util
        │   │                   ├── DateTimeDeserializer.java
        │   │                   ├── DateTimeSerializer.java
        │   │                   ├── GsonFactory.java
        │   │                   ├── LocalDateDeserializer.java
        │   │                   ├── LocalDateSerializer.java
        │   │                   └── MurmurHash.java
        └── resources
        │   ├── hadoop-job.xml
        │   └── logback.xml
    └── test
        └── java
            └── com
                └── inin
                    └── analytics
                        ├── DateSerializerTest.java
                        ├── ElasticsearchRoutingStrategyV1Test.java
                        └── IndexRotationStrategyZookeeperTest.java


/LICENSE.md:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Genesys Cloud Services, Inc.
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Elasticsearch-Lambda 
 2 | 
 3 | ## What is Lambda
 4 | 
 5 | For a primer on Lambda Architecture see [http://lambda-architecture.net/]. At Interactive Intelligence we're applying Lamda architecture to Elasticsearch. In our case that means we're streaming data into Elasticsearch in real time generated by Storm, and then re-building the whole thing every night in Hadoop. This allows some architecture nicities such as 
 6 | 
 7 |  * Changes to analyzers & tokenizers are rolled out for all historical data every night, automatically
 8 |  * New features & bug fixes that affect the data being indexed are rolled out every night, automatically. No data repair/backpopulation scripts are ever required.
 9 |  * Tune the # shards and shard routing strategies on data already written. Changes roll out every night, automatically.   
10 |  * With the button that rebuilds the cluster getting hit nightly, it is a well oiled button.
11 |  * If data gets corrupt, no heroics are required. Hit the rebuild button and grab a beer.
12 |  * Backups? Why bother when you can just hit the rebuild button?
13 |   
14 | Obviously there's a decent bit of work up front to get all this working. Being a fairly generic problem, we decided to open source our infastructure. 
15 | 
16 | ## Features
17 | 
18 | A new way to bulk load elasticsearch from hadoop
19 | 
20 |  * Build indexes offline, without touching your production cluster
21 |  * Run Elasticsearch unmodified, entirely within YARN
22 |  * Build snapshots of indexes without requiring enough disk space on task trackers to hold an entire index
23 |  * Load those indexes into your cluster using the snapshot restore functionality built into Elasticsearch
24 |  * .. and more to come. We're in the process of pulling as much into this repo as we can. 
25 |  
26 | 
27 | ## How it works
28 | 
29 | The meat is in BaseEsReducer where individual reducer tasks recieve all the data for a single shard of a single index. It creates an embeded Elasticsearch instance, bulk loads it locally in-jvm, and then creates a snapshot. Discovery is disabled and the elasticsearch instances do not form a cluster with each other. Once bulk loading a shard is complete it is flushed, optimized, snapshotted, and then transfered to a snapshot repository (S3, HDFS, or Local FS). After the job is complete, any shards that have no data get placeholder shards generated to make the index complete.   
30 | 
31 | By making reducers only responsible for a single shard worth of data at a time, the total disk space required on task trackers is roughly
32 | 
33 | (shard data) + (shard snapshot) * (num reducers per task tracker)   
34 | 
35 | After indexes have been generated they can be loaded in using the snapshot restore functionality built into Elasticsearch. The index promotion process maintains state in Zookeeper. This is in the process of being open sourced.
36 | 
37 | ## Maven
38 | ```XML
39 | <repository>
40 | <id>oss-sonatype</id>
41 | <name>oss-sonatype</name>
42 | <url>https://oss.sonatype.org/content/groups/public/</url>
43 | </repository>
44 | 
45 | <dependency>
46 | <artifactId>elasticsearch-lambda</artifactId>
47 | <groupId>com.inin.analytics</groupId>
48 | <version>1.0.25</version>
49 | </dependency>
50 | ```
51 | 
52 | ## Shard routing
53 | 
54 | In order to index 1 shard per reducer at a time, elasticsearch-lambda relies on manual shard routing. If you've got big indexes (probably why you're here), then you'll almost certainly want a custom routing strategy so that searches can hit a subset of shards.
55 | 
56 | To create your own you would implement the ElasticsearchRoutingStrategy interface and make use of it during the setup method of the ExampleJobPrep job. The default works as such:
57 | 
58 | ElasticsearchRoutingStrategyV1: Two parameters: numShards & numShardsPerOrg. A nieve apprach would be routing all data for 1 customer to 1 shard. To avoid hotspotting shards with large customers, this lets you spread the load across multiple shards. For example with 10 shards and 3 per customer, customer A might sit on shards 1,3,5 while customer B sits on shards 2,3,8. Setting the inputs to 10 & 10 would spread all customers evenly across all 10 shards. 
59 | 
60 | ## EMR Example Steps
61 | 
62 |  * generateExampleData 1000 hdfs:///tmp/test/data 
63 |  * examplePrep hdfs:///tmp/test/data/ hdfs:///tmp/test/json/ _rebuild_20141030012508 5 2
64 |  * esIndexRebuildExample hdfs:///tmp/test/json/ /media/ephemeral0/tmp/bulkload/ hdfs:///tmp/snapshotrepo/ my_backup /media/ephemeral0/tmp/esrawdata/ 1 5 100 hdfs:///tmp/manifest/
65 | 
66 |  
67 | ## Running Configs (for eclipse/IDE) 
68 | You can experiment via these run configs ran in series
69 |  
70 |  * com.inin.analytics.elasticsearch.driver.Driver
71 |  
72 | Lets build some dummy data
73 | 
74 |  * generateExampleData 1000 file:///tmp/data/part2
75 |  
76 | Prepare some data for the indexing job
77 | 
78 |  * examplePrep /tmp/data/part2 /tmp/datajson/ _rebuild_20141030012508 5 2
79 | 
80 | Build Elasticsearch indexes, snapshot them, and transport them to a snapshot repository on hdfs (s3 paths also allowed)
81 | 
82 |  * esIndexRebuildExample /tmp/datajson/ /tmp/bulkload110/ hdfs:///tmp/snapshotrepo110/ my_backup /tmp/esrawdata1010/ 1 5 2 /tmp/manifest110/
83 | 
84 | ### Can I use HDFS or NFS for Elasticsearch data?
85 | 
86 | Elasticsearch does not currently support backing it's data with HDFS, so this project makes use of local disks on the task trackers. Given that Solr Cloud already supports HDFS backed data, it's concievable that one day Elasticsearch might.
87 | 
88 | When considering NFS you must first consider how different hadoop distributions have implemented it. The apache version of hadoop implements NFS with large local disk buffers, so it may or may not save you any trouble. The Mapr NFS implementation is more native and performant. In our tests, running Elasticsearch on YARN and backing the data directories by NFS mounts backed by MapR-FS ran roughly half as fast. While impressive, it's up to you to balance the cost of using MapR for it's NFS cabilitiy to run Elasticsearch. Note, this requires substituting Elasticsearch's locking mechanism for a non-filesystem based implementation.
89 |  
90 | 


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
  1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  2 | 	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  3 | 	<modelVersion>4.0.0</modelVersion>
  4 | 
  5 | 	<groupId>com.inin.analytics</groupId>
  6 | 	<artifactId>elasticsearch-lambda</artifactId>
  7 | 	<packaging>jar</packaging>
  8 | 	<version>1.2.1</version>
  9 | 	<name>elasticsearch-lambda</name>
 10 | 	<description>Framework For Lambda Architecture on Elasticsearch</description>
 11 | 	<url>https://github.com/drewdahlke/elasticsearch-lambda</url>
 12 | 	<properties>
 13 | 		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 14 | 		<java-version>1.7</java-version>
 15 | 		<junit.version>4.11</junit.version>
 16 | 		<mockito.version>1.9.5</mockito.version>
 17 | 		<analytics.core.version>0.1.93-SNAPSHOT</analytics.core.version>
 18 | 		<hadoop.core.version>2.2.0</hadoop.core.version>
 19 | 		<apache.commons-io.version>2.4</apache.commons-io.version>
 20 | 		<apache.commons-lang.version>2.5</apache.commons-lang.version>
 21 | 		<commons.codec.version>1.4</commons.codec.version>
 22 | 		<curator.version>2.4.0</curator.version>
 23 | 		<java.io.tmpdir>/tmp</java.io.tmpdir>
 24 | 		<surefire.version>2.7.2</surefire.version>
 25 | 		<mrunit.version>1.0.0</mrunit.version>
 26 | 		<joda.version>2.3</joda.version>
 27 | 		<jackson.version>1.9.2</jackson.version>
 28 | 		<logback.version>1.0.13</logback.version>
 29 | 		<aws-java-sdk.version>1.9.38</aws-java-sdk.version>
 30 | 		<slf4j.version>1.7.5</slf4j.version>
 31 | 		<org.apache.avro.version>1.7.5</org.apache.avro.version>
 32 | 	</properties>
 33 | 	
 34 | 	<licenses>
 35 |   		<license>
 36 |     		<name>The Apache License, Version 2.0</name>
 37 |     		<url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
 38 |   		</license>
 39 | 	</licenses>
 40 | 	
 41 | 	<scm>
 42 |   		<connection>https://github.com/drewdahlke/elasticsearch-lambda.git</connection>
 43 |   		<developerConnection>https://github.com/drewdahlke/elasticsearch-lambda.git</developerConnection>
 44 |   		<url>https://github.com/drewdahlke/elasticsearch-lambda.git</url>
 45 | 	</scm>	
 46 | 
 47 | 	<!--External Repositories -->
 48 | 	<repositories>
 49 | 		<repository>
 50 | 			<id>repository.springframework.maven.release</id>
 51 | 			<name>Spring Framework Maven Release Repository</name>
 52 | 			<url>http://maven.springframework.org/release</url>
 53 | 			<snapshots>
 54 | 				<enabled>false</enabled>
 55 | 			</snapshots>
 56 | 		</repository>
 57 | 		<repository>
 58 | 			<id>conjars.org</id>
 59 | 			<url>http://conjars.org/repo</url>
 60 | 		</repository>
 61 | 		<repository>
 62 | 			<id>Codehaus Snapshots</id>
 63 | 			<url>http://repository.codehaus.org/</url>
 64 | 		</repository>
 65 | 		<repository>
 66 | 			<id>central</id>
 67 | 			<name>Maven Repository Switchboard</name>
 68 | 			<layout>default</layout>
 69 | 			<url>http://repo1.maven.org/maven2</url>
 70 | 			<snapshots>
 71 | 				<enabled>false</enabled>
 72 | 			</snapshots>
 73 | 		</repository>
 74 | 	    <repository>
 75 | 	        <id>oss-sonatype</id>
 76 | 	        <name>oss-sonatype</name>
 77 | 	        <url>https://oss.sonatype.org/content/groups/public</url>
 78 | 	    </repository>		
 79 | 	</repositories>
 80 | 
 81 | 
 82 | 	<dependencies>
 83 | 		<dependency>
 84 | 			<groupId>commons-codec</groupId>
 85 | 			<artifactId>commons-codec</artifactId>
 86 | 			<version>${commons.codec.version}</version>
 87 | 		</dependency>
 88 | 		<dependency>
 89 | 			<groupId>org.codehaus.jackson</groupId>
 90 | 			<artifactId>jackson-mapper-lgpl</artifactId>
 91 | 			<version>${jackson.version}</version>
 92 | 		</dependency>
 93 | 		<dependency>
 94 | 			<groupId>org.codehaus.jackson</groupId>
 95 | 			<artifactId>jackson-core-lgpl</artifactId>
 96 | 			<version>${jackson.version}</version>
 97 | 		</dependency>
 98 | 		<dependency>
 99 | 			<artifactId>jackson-mapper-asl</artifactId>
100 | 			<groupId>org.codehaus.jackson</groupId>
101 | 			<version>${jackson.version}</version>
102 | 		</dependency>
103 | 		<dependency>
104 | 			<groupId>org.codehaus.jackson</groupId>
105 | 			<artifactId>jackson-core-asl</artifactId>
106 | 			<version>${jackson.version}</version>
107 | 		</dependency>
108 | 		<dependency>
109 | 			<groupId>org.apache.avro</groupId>
110 | 			<artifactId>avro</artifactId>
111 | 			<version>${org.apache.avro.version}</version>
112 | 			<scope>provided</scope>
113 | 		</dependency>
114 | 
115 | 		<dependency>
116 | 			<groupId>us.monoid.web</groupId>
117 | 			<artifactId>resty</artifactId>
118 | 			<version>0.3.2</version>
119 | 		</dependency>
120 | 		<dependency>
121 | 			<groupId>joda-time</groupId>
122 | 			<artifactId>joda-time</artifactId>
123 | 			<version>${joda.version}</version>
124 | 		</dependency>
125 | 
126 | 
127 | 
128 | 		<dependency>
129 | 			<groupId>org.apache.curator</groupId>
130 | 			<artifactId>curator-recipes</artifactId>
131 | 			<version>${curator.version}</version>
132 | 			<exclusions>
133 | 				<exclusion>
134 | 					<groupId>org.apache.commons</groupId>
135 | 					<artifactId>commons-math</artifactId>
136 | 				</exclusion>
137 | 				<exclusion>
138 | 					<groupId>org.javassist</groupId>
139 | 					<artifactId>javassist</artifactId>
140 | 				</exclusion>
141 | 				<exclusion>
142 | 				    <artifactId>slf4j-log4j12</artifactId>
143 | 				    <groupId>org.slf4j</groupId>
144 | 				</exclusion>
145 | 				<exclusion>
146 | 				    <artifactId>log4j</artifactId>
147 | 				    <groupId>log4j</groupId>
148 | 				</exclusion>
149 | 			</exclusions>
150 | 		</dependency>
151 | 
152 | 		<dependency>
153 | 			<groupId>org.apache.curator</groupId>
154 | 			<artifactId>curator-framework</artifactId>
155 | 			<version>2.3.0</version>
156 | 			<exclusions>
157 | 				<exclusion>
158 | 					<artifactId>slf4j-api</artifactId>
159 | 					<groupId>org.slf4j</groupId>
160 | 				</exclusion>
161 | 			</exclusions>
162 | 		</dependency>
163 | 
164 | 		<dependency>
165 | 			<groupId>com.amazonaws</groupId>
166 | 			<artifactId>aws-java-sdk</artifactId>
167 | 			<version>${aws-java-sdk.version}</version>
168 | 			<exclusions>
169 | 				<exclusion>
170 | 					<groupId>com.fasterxml.jackson.core</groupId>
171 | 					<artifactId>jackson-core</artifactId>
172 | 				</exclusion>
173 | 				<exclusion>
174 | 					<groupId>com.fasterxml.jackson.core</groupId>
175 | 					<artifactId>jackson-annotations</artifactId>
176 | 				</exclusion>
177 | 				<!-- <exclusion> <groupId>org.apache.httpcomponents</groupId> <artifactId>httpclient</artifactId> 
178 | 					</exclusion> -->
179 | 				<exclusion>
180 | 					<groupId>commons-codec</groupId>
181 | 					<artifactId>commons-codec</artifactId>
182 | 				</exclusion>
183 | 				<exclusion>
184 | 					<artifactId>commons-logging</artifactId>
185 | 					<groupId>commons-logging</groupId>
186 | 				</exclusion>
187 | 			</exclusions>
188 | 		</dependency>
189 | 
190 | 
191 | 		<dependency>
192 | 			<groupId>org.apache.mrunit</groupId>
193 | 			<artifactId>mrunit</artifactId>
194 | 			<version>${mrunit.version}</version>
195 | 			<classifier>hadoop2</classifier>
196 | 			<scope>provided</scope>
197 | 		</dependency>
198 | 		<dependency>
199 | 			<groupId>org.elasticsearch</groupId>
200 | 			<artifactId>elasticsearch</artifactId>
201 | 			<version>1.6.0</version> 
202 | 		</dependency>
203 | 
204 | 		<dependency>
205 | 			<groupId>com.google.code.gson</groupId>
206 | 			<artifactId>gson</artifactId>
207 | 			<version>2.3.1</version>
208 | 			<scope>provided</scope>
209 | 		</dependency>
210 | 
211 | 		<!-- Logging -->
212 | 		<dependency>
213 | 			<groupId>org.slf4j</groupId>
214 | 			<artifactId>slf4j-api</artifactId>
215 | 			<version>${slf4j.version}</version>
216 | 		</dependency>
217 | 		<dependency>
218 | 			<groupId>org.slf4j</groupId>
219 | 			<artifactId>jcl-over-slf4j</artifactId>
220 | 			<version>${slf4j.version}</version>
221 | 		</dependency>
222 | 		<dependency>
223 | 			<groupId>org.slf4j</groupId>
224 | 			<artifactId>log4j-over-slf4j</artifactId>
225 | 			<version>${slf4j.version}</version>
226 | 		</dependency>
227 | 		<dependency>
228 | 			<groupId>org.slf4j</groupId>
229 | 			<artifactId>jul-to-slf4j</artifactId>
230 | 			<version>${slf4j.version}</version>
231 | 		</dependency>
232 | 		<dependency>
233 | 			<groupId>ch.qos.logback</groupId>
234 | 			<artifactId>logback-classic</artifactId>
235 | 			<version>${logback.version}</version>
236 | 		</dependency>
237 | 
238 | 		<dependency>
239 | 			<groupId>ch.qos.logback</groupId>
240 | 			<artifactId>logback-core</artifactId>
241 | 			<version>${logback.version}</version>
242 | 		</dependency>
243 | 		<!-- Logging -->
244 | 
245 | 		<dependency>
246 | 			<groupId>org.slf4j</groupId>
247 | 			<artifactId>slf4j-api</artifactId>
248 | 		</dependency>
249 | 	
250 | 		<!-- Utility packages -->
251 | 		<dependency>
252 | 			<groupId>commons-lang</groupId>
253 | 			<artifactId>commons-lang</artifactId>
254 | 			<version>${apache.commons-lang.version}</version>
255 | 			<scope>provided</scope>
256 | 		</dependency>
257 | 		<dependency>
258 | 			<groupId>commons-io</groupId>
259 | 			<artifactId>commons-io</artifactId>
260 | 			<version>${apache.commons-io.version}</version>
261 | 			<scope>provided</scope>
262 | 		</dependency>
263 | 		<dependency>
264 | 			<groupId>org.apache.commons</groupId>
265 | 			<artifactId>commons-compress</artifactId>
266 | 			<version>1.5</version>
267 | 			<scope>provided</scope>
268 | 		</dependency>
269 | 		<dependency>
270 | 			<groupId>commons-httpclient</groupId>
271 | 			<artifactId>commons-httpclient</artifactId>
272 | 			<version>3.1</version>
273 | 			<scope>provided</scope>
274 | 			<exclusions>
275 | 				<exclusion>
276 | 					<artifactId>commons-codec</artifactId>
277 | 					<groupId>commons-codec</groupId>
278 | 				</exclusion>
279 | 			</exclusions>
280 | 		</dependency>
281 | 
282 | 		<dependency>
283 | 			<groupId>org.slf4j</groupId>
284 | 			<artifactId>slf4j-api</artifactId>
285 | 			<version>1.7.5</version>
286 | 		</dependency>
287 | 
288 | 
289 | 		<dependency>
290 | 			<groupId>org.slf4j</groupId>
291 | 			<artifactId>jul-to-slf4j</artifactId>
292 | 			<version>${slf4j.version}</version>
293 | 		</dependency>
294 | 		<dependency>
295 | 			<groupId>org.slf4j</groupId>
296 | 			<artifactId>log4j-over-slf4j</artifactId>
297 | 			<version>${slf4j.version}</version>
298 | 		</dependency>
299 | 
300 | 		<dependency>
301 | 			<groupId>org.apache.curator</groupId>
302 | 			<artifactId>curator-test</artifactId>
303 | 			<version>${curator.version}</version>
304 | 			<exclusions>
305 | 				<exclusion>
306 | 					<groupId>org.apache.commons</groupId>
307 | 					<artifactId>commons-math</artifactId>
308 | 				</exclusion>
309 | 				<exclusion>
310 | 					<groupId>org.javassist</groupId>
311 | 					<artifactId>javassist</artifactId>
312 | 				</exclusion>
313 | 			</exclusions>
314 | 			<scope>provided</scope>
315 | 		</dependency>
316 | 		<dependency>
317 | 			<groupId>org.javassist</groupId>
318 | 			<artifactId>javassist</artifactId>
319 | 			<version>3.18.1-GA</version>
320 | 		</dependency>
321 | 
322 | 		<!-- Testing -->
323 | 		<dependency>
324 | 			<groupId>junit</groupId>
325 | 			<artifactId>junit</artifactId>
326 | 			<version>${junit.version}</version>
327 | 			<scope>test</scope>
328 | 		</dependency>
329 | 
330 | 		<dependency>
331 | 			<groupId>org.mockito</groupId>
332 | 			<artifactId>mockito-all</artifactId>
333 | 			<version>${mockito.version}</version>
334 | 			<scope>test</scope>
335 | 		</dependency>
336 | 
337 | 		<dependency>
338 | 			<groupId>org.apache.hadoop</groupId>
339 | 			<artifactId>hadoop-mapreduce-client-app</artifactId>
340 | 			<version>${hadoop.core.version}</version>
341 | 			<exclusions>
342 | 				<exclusion>
343 | 					<artifactId>jackson-core-asl</artifactId>
344 | 					<groupId>org.codehaus.jackson</groupId>
345 | 				</exclusion>
346 | 				<exclusion>
347 | 					<artifactId>jackson-mapper-asl</artifactId>
348 | 					<groupId>org.codehaus.jackson</groupId>
349 | 				</exclusion>
350 | 				<exclusion>
351 | 					<artifactId>avro</artifactId>
352 | 					<groupId>org.apache.avro</groupId>
353 | 				</exclusion>
354 | 				<exclusion>
355 | 				    <artifactId>slf4j-log4j12</artifactId>
356 | 				    <groupId>org.slf4j</groupId>
357 | 				</exclusion>
358 | 				<exclusion>
359 | 				    <artifactId>log4j</artifactId>
360 | 				    <groupId>log4j</groupId>
361 | 				</exclusion>
362 | 			</exclusions>
363 | 			<scope>provided</scope>
364 | 		</dependency>
365 | 		<dependency>
366 | 			<groupId>org.apache.hadoop</groupId>
367 | 			<artifactId>hadoop-common</artifactId>
368 | 			<version>${hadoop.core.version}</version>
369 | 			<exclusions>
370 | 				<exclusion>
371 | 					<artifactId>commons-codec</artifactId>
372 | 					<groupId>commons-codec</groupId>
373 | 				</exclusion>
374 | 				<exclusion>
375 | 					<artifactId>jackson-core-asl</artifactId>
376 | 					<groupId>org.codehaus.jackson</groupId>
377 | 				</exclusion>
378 | 				<exclusion>
379 | 					<artifactId>jackson-mapper-asl</artifactId>
380 | 					<groupId>org.codehaus.jackson</groupId>
381 | 				</exclusion>
382 | 				<exclusion>
383 | 					<artifactId>avro</artifactId>
384 | 					<groupId>org.apache.avro</groupId>
385 | 				</exclusion>
386 | 				<exclusion>
387 | 				    <artifactId>slf4j-log4j12</artifactId>
388 | 				    <groupId>org.slf4j</groupId>
389 | 				</exclusion>
390 | 				<exclusion>
391 | 				    <artifactId>log4j</artifactId>
392 | 				    <groupId>log4j</groupId>
393 | 				</exclusion>
394 | 			</exclusions>
395 | 			<scope>provided</scope>
396 | 		</dependency>
397 | 		<dependency>
398 | 			<groupId>org.apache.hadoop</groupId>
399 | 			<artifactId>hadoop-hdfs</artifactId>
400 | 			<version>${hadoop.core.version}</version>
401 | 			<exclusions>
402 | 				<exclusion>
403 | 					<artifactId>commons-codec</artifactId>
404 | 					<groupId>commons-codec</groupId>
405 | 				</exclusion>
406 | 				<exclusion>
407 | 					<artifactId>jackson-core-asl</artifactId>
408 | 					<groupId>org.codehaus.jackson</groupId>
409 | 				</exclusion>
410 | 				<exclusion>
411 | 					<artifactId>jackson-mapper-asl</artifactId>
412 | 					<groupId>org.codehaus.jackson</groupId>
413 | 				</exclusion>
414 | 				<exclusion>
415 | 				    <artifactId>log4j</artifactId>
416 | 				    <groupId>log4j</groupId>
417 | 				</exclusion>
418 | 			</exclusions>
419 | 			<scope>provided</scope>
420 | 		</dependency>
421 | 	</dependencies>
422 | 
423 |  	<developers>
424 |     	<developer>
425 |       		<name>Drew Dahlke</name>
426 |       		<email>justin.dahlke@gmail.com</email>
427 |       		<organization>Interactive Intelligence</organization>
428 |       		<organizationUrl>http://www.inin.com</organizationUrl>
429 |     	</developer>
430 |     	<developer>
431 |       		<name>Michael Mulligan</name>
432 |       		<email>Michael.Mulligan@inin.com</email>
433 |       		<organization>Interactive Intelligence</organization>
434 |       		<organizationUrl>http://www.inin.com</organizationUrl>
435 |     	</developer>
436 |   	</developers>
437 |   	
438 | 	<distributionManagement>
439 | 	  <snapshotRepository>
440 | 	    <id>ossrh</id>
441 | 	    <url>https://oss.sonatype.org/content/repositories/snapshots</url>
442 | 	  </snapshotRepository>
443 | 	  <repository>
444 | 	    <id>ossrh</id>
445 | 	    <url>https://oss.sonatype.org/service/local/staging/deploy/maven2/</url>
446 | 	  </repository>
447 | 	</distributionManagement>
448 | 	
449 |   	
450 | 	<build>
451 | 		<plugins>
452 | 	 		<plugin>
453 | 		      <groupId>org.sonatype.plugins</groupId>
454 | 		      <artifactId>nexus-staging-maven-plugin</artifactId>
455 | 		      <version>1.6.3</version>
456 | 		      <extensions>true</extensions>
457 | 		      <configuration>
458 | 		        <serverId>ossrh</serverId>
459 | 		        <nexusUrl>https://oss.sonatype.org/</nexusUrl>
460 | 		        <autoReleaseAfterClose>true</autoReleaseAfterClose>
461 | 		      </configuration>
462 | 		    </plugin>		
463 | 			 <plugin>
464 | 		      <groupId>org.apache.maven.plugins</groupId>
465 | 		      <artifactId>maven-source-plugin</artifactId>
466 | 		      <version>2.2.1</version>
467 | 		      <executions>
468 | 		        <execution>
469 | 		          <id>attach-sources</id>
470 | 		          <goals>
471 | 		            <goal>jar-no-fork</goal>
472 | 		          </goals>
473 | 		        </execution>
474 | 		      </executions>
475 | 		    </plugin>
476 | 		    <plugin>
477 | 		      <groupId>org.apache.maven.plugins</groupId>
478 | 		      <artifactId>maven-gpg-plugin</artifactId>
479 | 		      <version>1.5</version>
480 | 		       <configuration>
481 | 			      <executable>gpg</executable>
482 | 		       </configuration>
483 | 		      <executions>
484 | 		        <execution>
485 | 		          <id>sign-artifacts</id>
486 | 		          <phase>verify</phase>
487 | 		          <goals>
488 | 		            <goal>sign</goal>
489 | 		          </goals>
490 | 		        </execution>
491 | 		      </executions>
492 | 		    </plugin>	
493 |                 <plugin>
494 |                     <groupId>org.apache.maven.plugins</groupId>
495 |                     <artifactId>maven-javadoc-plugin</artifactId>
496 |                     <configuration>
497 |                         <!-- jdk8 started linting javadocs by default; ours are not fully compliant -->
498 |                         <additionalparam>-Xdoclint:none</additionalparam>
499 |                     </configuration>
500 |                 </plugin>		    	    
501 | 		    <plugin>
502 | 		      <groupId>org.apache.maven.plugins</groupId>
503 | 		      <artifactId>maven-javadoc-plugin</artifactId>
504 | 		      <version>2.9.1</version>
505 | 		      <executions>
506 | 		        <execution>
507 | 		          <id>attach-javadocs</id>
508 | 		          <goals>
509 | 		            <goal>jar</goal>
510 | 		          </goals>
511 | 		        </execution>
512 | 		      </executions>
513 | 		    </plugin>
514 | 			<plugin>
515 | 				<groupId>org.apache.maven.plugins</groupId>
516 | 				<artifactId>maven-surefire-plugin</artifactId>
517 | 				<version>${surefire.version}</version>
518 | 				<configuration>
519 | 					<workingDirectory>${java.io.tmpdir}</workingDirectory>
520 | 				</configuration>
521 | 			</plugin>
522 | 			<plugin>
523 | 				<groupId>org.apache.maven.plugins</groupId>
524 | 				<artifactId>maven-compiler-plugin</artifactId>
525 | 				<version>3.1</version>
526 | 				<configuration>
527 | 					<source>${java-version}</source>
528 | 					<target>${java-version}</target>
529 | 				</configuration>
530 | 			</plugin>
531 | 			<plugin>
532 | 				<artifactId>maven-assembly-plugin</artifactId>
533 | 				<version>2.2.1</version>
534 | 				<configuration>
535 | 					<descriptors>
536 | 						<descriptor>src/main/resources/hadoop-job.xml</descriptor>
537 | 					</descriptors>
538 | 					<archive>
539 | 						<manifest>
540 | 							<mainClass>com.inin.analytics.elasticsearch.driver.Driver</mainClass>
541 | 						</manifest>
542 | 					</archive>
543 | 				</configuration>
544 | 				<executions>
545 | 					<execution>
546 | 						<id>make-assembly</id>
547 | 						<phase>package</phase>
548 | 						<goals>
549 | 							<goal>single</goal>
550 | 						</goals>
551 | 					</execution>
552 | 				</executions>
553 | 			</plugin>
554 | 			<plugin>
555 | 				<groupId>org.codehaus.mojo</groupId>
556 | 				<artifactId>exec-maven-plugin</artifactId>
557 | 				<version>1.2.1</version>
558 | 				<executions>
559 | 					<execution>
560 | 						<goals>
561 | 							<goal>java</goal>
562 | 						</goals>
563 | 					</execution>
564 | 				</executions>
565 | 				<configuration>
566 | 					<mainClass>com.inin.analytics.elasticlearch.driver.Driver</mainClass>
567 | 				</configuration>
568 | 			</plugin>
569 | 		</plugins>
570 | 	</build>
571 | </project>
572 | 


--------------------------------------------------------------------------------
/src/main/java/com/inin/analytics/elasticsearch/BaseESMapper.java:
--------------------------------------------------------------------------------
 1 | package com.inin.analytics.elasticsearch;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | import org.apache.hadoop.io.Text;
 6 | import org.apache.hadoop.mapred.JobConf;
 7 | import org.apache.hadoop.mapred.Mapper;
 8 | import org.apache.hadoop.mapred.OutputCollector;
 9 | import org.apache.hadoop.mapred.Reporter;
10 | 
11 | /**
12 |  * This class acts as a passthrough routing data to the reducer. Your input should be keyed off of 
13 |  * [index name] | [index type] 
14 |  * 
15 |  * The values should be the raw json payloads to send to ES.
16 |  * @author drew
17 |  *
18 |  */
19 | public class BaseESMapper implements Mapper <Text, Text, Text, Text> {
20 | 	public void configure(JobConf job) {
21 | 	}
22 | 
23 | 	public void map(Text indexAndRouting, Text documentPayload, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
24 | 		/**
25 | 		 * Reducer key looks like this   [indexName]|[routing hash] value [doc type]|[doc id]|json
26 | 		 * 
27 | 		 */
28 | 		output.collect(indexAndRouting, documentPayload);
29 | 	} 
30 | 
31 | 	public void close() throws IOException {
32 | 	}
33 | 
34 | }


--------------------------------------------------------------------------------
/src/main/java/com/inin/analytics/elasticsearch/BaseESReducer.java:
--------------------------------------------------------------------------------
  1 | package com.inin.analytics.elasticsearch;
  2 | 
  3 | import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder;
  4 | 
  5 | import java.io.File;
  6 | import java.io.IOException;
  7 | import java.util.Arrays;
  8 | import java.util.Iterator;
  9 | 
 10 | import org.apache.commons.io.FileUtils;
 11 | import org.apache.commons.lang.StringUtils;
 12 | import org.apache.hadoop.io.NullWritable;
 13 | import org.apache.hadoop.io.Text;
 14 | import org.apache.hadoop.mapred.JobConf;
 15 | import org.apache.hadoop.mapred.OutputCollector;
 16 | import org.apache.hadoop.mapred.Reducer;
 17 | import org.apache.hadoop.mapred.Reporter;
 18 | import org.elasticsearch.action.ActionFuture;
 19 | import org.elasticsearch.action.admin.indices.delete.DeleteIndexRequest;
 20 | import org.elasticsearch.action.admin.indices.delete.DeleteIndexResponse;
 21 | import org.elasticsearch.action.index.IndexResponse;
 22 | import org.elasticsearch.cluster.metadata.IndexMetaData;
 23 | import com.inin.analytics.elasticsearch.transport.SnapshotTransportStrategy;
 24 | 
 25 | public abstract class BaseESReducer implements Reducer<Text, Text, NullWritable, Text> {
 26 | 	public static final char TUPLE_SEPARATOR = '|';
 27 | 	public static final char DIR_SEPARATOR = '/';
 28 | 	
 29 | 	public static enum JOB_COUNTER {
 30 | 		TIME_SPENT_INDEXING_MS, TIME_SPENT_FLUSHING_MS, TIME_SPENT_MERGING_MS, TIME_SPENT_SNAPSHOTTING_MS, TIME_SPENT_TRANSPORTING_SNAPSHOT_MS, INDEXING_DOC_FAIL, INDEX_DOC_CREATED, INDEX_DOC_NOT_CREATED
 31 | 	}
 32 | 	
 33 | 	// We prefix all snapshots with the word snapshot
 34 | 	public static final String SNAPSHOT_NAME = "snapshot";
 35 | 	
 36 | 	// The local filesystem location that ES will write the snapshot out to
 37 | 	private String snapshotWorkingLocation;
 38 | 	
 39 | 	// Where the snapshot will be moved to. Typical use case would be to throw it onto S3
 40 | 	private String snapshotFinalDestination;
 41 | 	
 42 | 	// The name of a snapshot repo. We'll enumerate that on each job run so that the repo names are unique across rebuilds
 43 | 	private String snapshotRepoName;
 44 | 	
 45 | 	// Local filesystem location where index data is built
 46 | 	private String esWorkingDir;
 47 | 	
 48 | 	// The partition of data this reducer is serving. Useful for making directories unique if running multiple reducers on a task tracker 
 49 | 	private String partition;
 50 | 	
 51 | 	// The container handles spinning up our embedded elasticsearch instance
 52 | 	private ESEmbededContainer esEmbededContainer;
 53 | 	
 54 | 	private ShardConfig shardConfig;
 55 | 		
 56 | 	// Hold onto some frequently generated objects to cut down on GC overhead 
 57 | 	private String indexType;
 58 | 	private String docId;
 59 | 	private String pre;
 60 | 	private String json;
 61 |    
 62 | 	@Override
 63 | 	public void configure(JobConf job) {
 64 | 		partition = job.get("mapred.task.partition");
 65 |         String attemptId = job.get("mapred.task.id");
 66 | 		
 67 | 		// If running multiple reducers on a node, the node needs a unique name & data directory hence the random number we append 
 68 | 		snapshotWorkingLocation = job.get(ConfigParams.SNAPSHOT_WORKING_LOCATION_CONFIG_KEY.toString()) + partition + attemptId + DIR_SEPARATOR;
 69 | 		snapshotFinalDestination = job.get(ConfigParams.SNAPSHOT_FINAL_DESTINATION.toString());
 70 | 		snapshotRepoName = job.get(ConfigParams.SNAPSHOT_REPO_NAME_CONFIG_KEY.toString());
 71 | 		esWorkingDir = job.get(ConfigParams.ES_WORKING_DIR.toString()) + partition + attemptId + DIR_SEPARATOR;
 72 | 		if(shardConfig == null) {
 73 | 		    shardConfig = getShardConfig(job);    
 74 | 		}
 75 | 	}
 76 | 	
 77 | 	public void setShardConfig(ShardConfig shardConfig) {
 78 |         this.shardConfig = shardConfig;
 79 |     }
 80 | 
 81 |     private void init(String index) {
 82 | 		String templateName = getTemplateName();
 83 | 		String templateJson = getTemplate();
 84 | 
 85 | 		ESEmbededContainer.Builder builder = new ESEmbededContainer.Builder()
 86 | 		.withNodeName("embededESTempLoaderNode" + partition)
 87 | 		.withWorkingDir(esWorkingDir)
 88 | 		.withClusterName("bulkLoadPartition:" + partition)
 89 | 		.withSnapshotWorkingLocation(snapshotWorkingLocation)
 90 | 		.withSnapshotRepoName(snapshotRepoName);
 91 | 		
 92 | 		if(templateName != null && templateJson != null) {
 93 | 			builder.withTemplate(templateName, templateJson);	
 94 | 		}
 95 | 		
 96 | 		if(esEmbededContainer == null) {
 97 | 			esEmbededContainer = builder.build();	
 98 | 		} 
 99 | 		
100 | 		// Create index
101 | 		esEmbededContainer.getNode().client().admin().indices().prepareCreate(index).setSettings(settingsBuilder()
102 | 		        .put("index.number_of_replicas", 0)
103 | 		        .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, shardConfig.getShardsForIndex(index))
104 | 		        ).get();
105 | 	}
106 | 	
107 | 	/**
108 | 	 * Provide the JSON contents of the index template. This is your hook for configuring ElasticSearch.
109 | 	 * 
110 | 	 * http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/indices-templates.html
111 | 	 * 
112 |      * @return String
113 | 	 */
114 | 	public abstract String getTemplate();
115 | 	
116 | 	
117 | 	/**
118 | 	 * Provide a ShardConfig which provides the number of shards per index and the number of 
119 | 	 * shards to split organizations across. The number can be uniform across indices or a mapping
120 | 	 * can be provided to enable per-index configuration values.
121 | 	 *  
122 | 	 * @param job
123 | 	 * @return ShardConfig
124 | 	 */
125 | 	public abstract ShardConfig getShardConfig(JobConf job);
126 | 	
127 | 	/**
128 | 	 * Provide an all lower case template name
129 | 	 *  
130 | 	 * @return String
131 | 	 */
132 | 	public abstract String getTemplateName();
133 | 
134 | 	@Override
135 | 	public void reduce(Text docMetaData, Iterator<Text> documentPayloads, OutputCollector<NullWritable, Text> output, final Reporter reporter) throws IOException {
136 | 		String[] pieces = StringUtils.split(docMetaData.toString(), TUPLE_SEPARATOR);
137 | 		String indexName = pieces[0];
138 | 		String routing = pieces[1]; 
139 | 		init(indexName);
140 | 
141 | 		long start = System.currentTimeMillis();
142 | 		while(documentPayloads.hasNext()) {
143 | 			Text line = documentPayloads.next();
144 | 			if(line == null) {
145 | 				continue;
146 | 			}
147 | 			
148 | 			pieces = StringUtils.split(line.toString(), TUPLE_SEPARATOR);
149 | 			indexType = pieces[0];
150 | 			docId = pieces[1];
151 | 			pre = indexType + TUPLE_SEPARATOR + docId + TUPLE_SEPARATOR;
152 | 			json = line.toString().substring(pre.length());
153 | 
154 | 			IndexResponse response = esEmbededContainer.getNode().client().prepareIndex(indexName, indexType).setId(docId).setRouting(routing).setSource(json).execute().actionGet();
155 | 			if(response.isCreated()) {
156 | 				reporter.incrCounter(JOB_COUNTER.INDEX_DOC_CREATED, 1l);
157 | 			} else {
158 | 				reporter.incrCounter(JOB_COUNTER.INDEX_DOC_NOT_CREATED, 1l);
159 | 			}
160 | 		}
161 | 
162 | 		reporter.incrCounter(JOB_COUNTER.TIME_SPENT_INDEXING_MS, System.currentTimeMillis() - start);
163 | 		
164 | 		snapshot(indexName, reporter);
165 | 		output.collect(NullWritable.get(), new Text(indexName));
166 | 	}
167 | 
168 | 	@Override
169 | 	public void close() throws IOException {
170 | 		if(esEmbededContainer != null) {
171 | 			esEmbededContainer.getNode().close();
172 | 			while(!esEmbededContainer.getNode().isClosed());
173 | 			FileUtils.deleteDirectory(new File(snapshotWorkingLocation));
174 | 		}
175 | 	}
176 | 
177 | 	public void snapshot(String index, Reporter reporter) throws IOException {
178 | 		esEmbededContainer.snapshot(Arrays.asList(index), SNAPSHOT_NAME, snapshotRepoName, reporter);
179 | 		
180 | 		// Delete the index to free up that space
181 | 		ActionFuture<DeleteIndexResponse> response = esEmbededContainer.getNode().client().admin().indices().delete(new DeleteIndexRequest(index));
182 | 		while(!response.isDone());
183 | 		
184 | 		// Move the shard snapshot to the destination
185 | 		long start = System.currentTimeMillis();
186 | 		SnapshotTransportStrategy.get(snapshotWorkingLocation, snapshotFinalDestination).execute(SNAPSHOT_NAME, index);
187 | 		reporter.incrCounter(JOB_COUNTER.TIME_SPENT_TRANSPORTING_SNAPSHOT_MS, System.currentTimeMillis() - start);
188 | 		
189 | 		esEmbededContainer.deleteSnapshot(SNAPSHOT_NAME, snapshotRepoName);
190 | 	}
191 | }


--------------------------------------------------------------------------------
/src/main/java/com/inin/analytics/elasticsearch/ConfigParams.java:
--------------------------------------------------------------------------------
 1 | package com.inin.analytics.elasticsearch;
 2 | 
 3 | public enum ConfigParams {
 4 | 	SNAPSHOT_WORKING_LOCATION_CONFIG_KEY,
 5 | 	SNAPSHOT_REPO_NAME_CONFIG_KEY,
 6 | 	SNAPSHOT_FINAL_DESTINATION,
 7 | 	ES_WORKING_DIR,
 8 | 	NUM_SHARDS_PER_INDEX,
 9 | 	NUM_SHARDS_PER_ORGANIZATION
10 | }
11 | 


--------------------------------------------------------------------------------
/src/main/java/com/inin/analytics/elasticsearch/ESEmbededContainer.java:
--------------------------------------------------------------------------------
  1 | package com.inin.analytics.elasticsearch;
  2 | 
  3 | import static org.elasticsearch.node.NodeBuilder.nodeBuilder;
  4 | 
  5 | import java.util.HashMap;
  6 | import java.util.List;
  7 | import java.util.Map;
  8 | 
  9 | import org.apache.hadoop.mapred.Reporter;
 10 | import org.elasticsearch.action.admin.cluster.snapshots.get.GetSnapshotsRequest;
 11 | import org.elasticsearch.action.admin.cluster.snapshots.get.GetSnapshotsResponse;
 12 | import org.elasticsearch.cluster.metadata.IndexMetaData;
 13 | import org.elasticsearch.common.settings.ImmutableSettings;
 14 | import org.elasticsearch.common.settings.Settings;
 15 | import org.elasticsearch.common.unit.TimeValue;
 16 | import org.elasticsearch.node.Node;
 17 | import org.elasticsearch.plugins.PluginsService;
 18 | import org.elasticsearch.snapshots.SnapshotInfo;
 19 | import org.slf4j.Logger;
 20 | import org.slf4j.LoggerFactory;
 21 | 
 22 | import com.google.common.base.Preconditions;
 23 | 
 24 | /**
 25 |  * Builds an embedded elasticsearch instance and configures it for you
 26 |  * 
 27 |  * @author drew
 28 |  *
 29 |  */
 30 | public class ESEmbededContainer {
 31 | 	private Node node;
 32 | 	private long DEFAULT_TIMEOUT_MS = 60 * 30 * 1000; 
 33 | 	private static Integer MAX_MERGED_SEGMENT_SIZE_MB = 256;
 34 | 	private static transient Logger logger = LoggerFactory.getLogger(ESEmbededContainer.class);
 35 | 	
 36 | 	public void snapshot(List<String> index, String snapshotName, String snapshotRepoName, Reporter reporter) {
 37 | 		snapshot(index, snapshotName, snapshotRepoName, DEFAULT_TIMEOUT_MS, reporter);
 38 | 	}
 39 | 	
 40 | 	/**
 41 | 	 * Flush, optimize, and snapshot an index. Block until complete.
 42 | 	 * 
 43 | 	 * @param indicies
 44 | 	 * @param snapshotName
 45 | 	 * @param snapshotRepoName
 46 | 	 * @param timeoutMS
 47 | 	 * @param reporter
 48 | 	 */
 49 | 	public void snapshot(List<String> indicies, String snapshotName, String snapshotRepoName, long timeoutMS, Reporter reporter) {
 50 | 		/* Flush & optimize before the snapshot.
 51 | 		 *  
 52 | 		 * TODO: Long operations could block longer that the container allows an operation to go
 53 | 		 * unresponsive b/f killing. We need to issue the request and poll the future waiting on the
 54 | 		 * operation to succeed, but update a counter or something to let the hadoop framework
 55 | 		 * know the process is still alive. 
 56 | 		 */  
 57 | 		TimeValue v = new TimeValue(timeoutMS);
 58 | 		for(String index : indicies) {
 59 | 			long start = System.currentTimeMillis();
 60 | 
 61 | 			// Flush
 62 | 			node.client().admin().indices().prepareFlush(index).get(v);
 63 | 			if(reporter != null) {
 64 | 				reporter.incrCounter(BaseESReducer.JOB_COUNTER.TIME_SPENT_FLUSHING_MS, System.currentTimeMillis() - start);
 65 | 			}
 66 | 
 67 | 			// Merge
 68 | 			start = System.currentTimeMillis();
 69 | 			node.client().admin().indices().prepareOptimize(index).get(v);
 70 | 			if(reporter != null) {
 71 | 				reporter.incrCounter(BaseESReducer.JOB_COUNTER.TIME_SPENT_MERGING_MS, System.currentTimeMillis() - start);
 72 | 			}
 73 | 		}
 74 | 
 75 | 		// Snapshot
 76 | 		long start = System.currentTimeMillis();
 77 | 		node.client().admin().cluster().prepareCreateSnapshot(snapshotRepoName, snapshotName).setIndices((String[]) indicies.toArray(new String[0])).execute();
 78 | 
 79 | 		// ES snapshot restore ignores timers and will block no more than 30s :( You have to block & poll to make sure it's done
 80 | 		blockForSnapshot(snapshotRepoName, indicies, timeoutMS);	
 81 | 		
 82 | 		if(reporter != null) {
 83 | 			reporter.incrCounter(BaseESReducer.JOB_COUNTER.TIME_SPENT_SNAPSHOTTING_MS, System.currentTimeMillis() - start);
 84 | 		}
 85 | 
 86 | 	}
 87 | 
 88 | 	/**
 89 | 	 * Block for index snapshots to be complete
 90 | 	 * 
 91 | 	 * @param snapshotRepoName
 92 | 	 * @param indicies
 93 | 	 * @param timeoutMS
 94 | 	 */
 95 | 	private void blockForSnapshot(String snapshotRepoName, List<String> indicies, long timeoutMS) {
 96 | 		long start = System.currentTimeMillis();
 97 | 		while(System.currentTimeMillis() - start < timeoutMS) {
 98 | 
 99 | 			GetSnapshotsResponse repos = node.client().admin().cluster().getSnapshots(new GetSnapshotsRequest(snapshotRepoName)).actionGet();
100 | 				for(SnapshotInfo i : repos.getSnapshots()) {
101 | 					if(i.state().completed() && i.successfulShards() == i.totalShards() && i.totalShards() >= indicies.size()) {
102 | 						logger.info("Snapshot completed {} out of {} indicies. Snapshot state {}. ", i.successfulShards(), i.totalShards(), i.state().completed());
103 | 						return;
104 | 					} else {
105 | 						logger.info("Snapshotted {} out of {} indicies, polling for completion. Snapshot state {}.", i.successfulShards(), i.totalShards(), i.state().completed());
106 | 					}
107 | 				}
108 | 			try {
109 | 				// Don't slam ES with snapshot status requests in a tight loop
110 | 				Thread.sleep(1000);
111 | 			} catch (InterruptedException e) {
112 | 				Thread.currentThread().interrupt();
113 | 			}
114 | 		}
115 | 	}
116 | 	
117 | 	public void deleteSnapshot(String snapshotName, String snapshotRepoName) {
118 | 		node.client().admin().cluster().prepareDeleteSnapshot(snapshotRepoName, snapshotName).execute().actionGet();
119 | 	}
120 | 
121 | 	public static class Builder {
122 | 		private ESEmbededContainer container;
123 | 		private String nodeName;
124 | 		private String workingDir;
125 | 		private String clusterName;
126 | 		private String templateName;
127 | 		private String templateSource;
128 | 		private String snapshotWorkingLocation;
129 | 		private String snapshotRepoName;
130 | 		private boolean memoryBackedIndex = false;
131 | 
132 | 		public ESEmbededContainer build() {
133 | 			Preconditions.checkNotNull(nodeName);
134 | 			Preconditions.checkNotNull(workingDir);
135 | 			Preconditions.checkNotNull(clusterName);
136 | 
137 | 			org.elasticsearch.common.settings.ImmutableSettings.Builder builder = ImmutableSettings.builder()
138 | 			.put("http.enabled", false) // Disable HTTP transport, we'll communicate inner-jvm
139 | 			.put("processors", 1) // We could experiment ramping this up to match # cores - num reducers per node
140 | 			.put("node.name", nodeName)
141 | 			.put("path.data", workingDir)
142 | 			.put("plugins." + PluginsService.LOAD_PLUGIN_FROM_CLASSPATH, true) // Allow plugins if they're bundled in with the uuberjar
143 | 			.put("index.refresh_interval", -1) 
144 | 			.put("index.translog.flush_threshold_size", "128mb") // Aggressive flushing helps keep the memory footprint below the yarn container max. TODO: Make configurable 
145 | 			.put("bootstrap.mlockall", true)
146 | 			.put("cluster.routing.allocation.disk.watermark.low", 99) // Nodes don't form a cluster, so routing allocations don't matter
147 | 			.put("cluster.routing.allocation.disk.watermark.high", 99)
148 | 			.put("index.load_fixed_bitset_filters_eagerly", false)
149 | 			.put("indices.store.throttle.type", "none") // Allow indexing to max out disk IO
150 | 			.put("indices.memory.index_buffer_size", "5%") // The default 10% is a bit large b/c it's calculated against JVM heap size & not Yarn container allocation. Choosing a good value here could be made smarter.
151 | 			.put("index.merge.policy.max_merged_segment", MAX_MERGED_SEGMENT_SIZE_MB + "mb") // The default 5gb segment max size is too large for the typical hadoop node
152 | 			//.put("index.merge.policy.max_merge_at_once", 10) 
153 | 			.put("index.merge.policy.segments_per_tier", 4)
154 | 			.put("index.merge.scheduler.max_thread_count", 1)
155 | 			.put("path.repo", snapshotWorkingLocation)
156 | 			.put("index.compound_format", false) // Explicitly disable compound files
157 | 			//.put("index.codec", "best_compression") // Lucene 5/ES 2.0 feature to play with when that's out
158 | 			.put("indices.fielddata.cache.size", "0%");
159 | 			
160 | 			if(memoryBackedIndex) {
161 | 				builder.put("index.store.type", "memory");
162 | 			}
163 | 			Settings nodeSettings = builder.build();
164 | 
165 | 			// Create the node
166 | 			container.setNode(nodeBuilder()
167 | 					.client(false) // It's a client + data node
168 | 					.local(true) // Tell ES cluster discovery to be inner-jvm only, disable HTTP based node discovery
169 | 					.clusterName(clusterName)
170 | 					.settings(nodeSettings)
171 | 					.build());
172 | 
173 | 			// Start ES
174 | 			container.getNode().start();
175 | 
176 | 			// Configure the cluster with an index template mapping
177 | 			if(templateName != null && templateSource != null) {
178 | 				container.getNode().client().admin().indices().preparePutTemplate(templateName).setSource(templateSource).get();	
179 | 			}
180 | 
181 | 			// Create the snapshot repo
182 | 			if(snapshotWorkingLocation != null && snapshotRepoName != null) {
183 | 				Map<String, Object> settings = new HashMap<>();
184 | 				settings.put("location", snapshotWorkingLocation);
185 | 				settings.put("compress", true);
186 | 				settings.put("max_snapshot_bytes_per_sec", "400mb"); // The default 20mb/sec is very slow for a local disk to disk snapshot
187 | 				container.getNode().client().admin().cluster().preparePutRepository(snapshotRepoName).setType("fs").setSettings(settings).get();
188 | 			}
189 | 
190 | 			return container;
191 | 		}
192 | 
193 | 		public Builder() {
194 | 			container = new ESEmbededContainer();
195 | 		}
196 | 
197 | 		/**
198 | 		 * @param nodeName
199 | 		 * @return Builder
200 | 		 */
201 | 		public Builder withNodeName(String nodeName) {
202 | 			this.nodeName = nodeName;
203 | 			return this;
204 | 		}
205 | 
206 | 		/**
207 | 		 * 
208 | 		 * @param workingDir
209 | 		 * @return Builder
210 | 		 */
211 | 		public Builder withWorkingDir(String workingDir) {
212 | 			this.workingDir = workingDir;
213 | 			return this;
214 | 		}
215 | 		
216 | 		/**
217 | 		 * 
218 | 		 * @param clusterName
219 | 		 * @return Builder
220 | 		 */
221 | 		public Builder withClusterName(String clusterName) {
222 | 			this.clusterName = clusterName;
223 | 			return this;
224 | 		}
225 | 
226 | 		/**
227 | 		 * 
228 | 		 * @param templateName
229 | 		 * @param templateSource
230 | 		 * @return Builder
231 | 		 */
232 | 		public Builder withTemplate(String templateName, String templateSource) {
233 | 			this.templateName = templateName;
234 | 			this.templateSource = templateSource;
235 | 			return this;
236 | 		}
237 | 
238 | 		/**
239 | 		 * 
240 | 		 * @param snapshotWorkingLocation
241 | 		 * @return Builder
242 | 		 */
243 | 		public Builder withSnapshotWorkingLocation(String snapshotWorkingLocation) {
244 | 			this.snapshotWorkingLocation = snapshotWorkingLocation;
245 | 			return this;
246 | 		}
247 | 
248 | 		/**
249 | 		 * 
250 | 		 * @param snapshotRepoName
251 | 		 * @return Builder
252 | 		 */
253 | 		public Builder withSnapshotRepoName(String snapshotRepoName) {
254 | 			this.snapshotRepoName = snapshotRepoName;
255 | 			return this;
256 | 		}
257 | 		
258 | 		/**
259 | 		 * 
260 | 		 * @param memoryBackedIndex
261 | 		 * @return Builder
262 | 		 */
263 | 		public Builder withInMemoryBackedIndexes(boolean memoryBackedIndex) {
264 | 			this.memoryBackedIndex = memoryBackedIndex;
265 | 			return this;
266 | 		}
267 | 
268 | 	}
269 | 
270 | 	/**
271 | 	 * 
272 | 	 * @return Node
273 | 	 */
274 | 	public Node getNode() {
275 | 		return node;
276 | 	}
277 | 
278 | 	public void setNode(Node node) {
279 | 		this.node = node;
280 | 	}
281 | 
282 | }
283 | 


--------------------------------------------------------------------------------
/src/main/java/com/inin/analytics/elasticsearch/IndexingPostProcessor.java:
--------------------------------------------------------------------------------
  1 | package com.inin.analytics.elasticsearch;
  2 | 
  3 | import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder;
  4 | 
  5 | import java.io.BufferedReader;
  6 | import java.io.File;
  7 | import java.io.FileNotFoundException;
  8 | import java.io.IOException;
  9 | import java.io.InputStreamReader;
 10 | import java.io.PrintWriter;
 11 | import java.util.ArrayList;
 12 | import java.util.HashMap;
 13 | import java.util.HashSet;
 14 | import java.util.List;
 15 | import java.util.Map;
 16 | import java.util.Set;
 17 | 
 18 | import org.apache.commons.io.FileUtils;
 19 | import org.apache.hadoop.conf.Configuration;
 20 | import org.apache.hadoop.fs.FileSystem;
 21 | import org.apache.hadoop.fs.FileUtil;
 22 | import org.apache.hadoop.fs.Path;
 23 | import org.elasticsearch.cluster.metadata.IndexMetaData;
 24 | import org.slf4j.Logger;
 25 | import org.slf4j.LoggerFactory;
 26 | 
 27 | import com.inin.analytics.elasticsearch.transport.BaseTransport;
 28 | import com.inin.analytics.elasticsearch.transport.SnapshotTransportStrategy;
 29 | 
 30 | public class IndexingPostProcessor {
 31 | 	private static transient Logger logger = LoggerFactory.getLogger(IndexingPostProcessor.class);
 32 | 	
 33 | 	/**
 34 | 	 * The job output in HDFS is just a manifest of indicies generated by the Job. Why? S3 is eventually consistent in some
 35 |      * zones. That means if you try to list the indicies you just generated by this job, you might miss some. Instead, we
 36 |      * have the job spit out tiny manifests. This method merges them together, de-dupes them, and if there's any shards that
 37 |      * didn't get generated because they have no data it puts a placeholder empty shard in it's place to satisfy ElasticSearch.
 38 |      * 
 39 | 	 * @param jobOutput
 40 | 	 * @param manifestFile
 41 | 	 * @param scratchDir
 42 | 	 * @param shardConfig
 43 | 	 * @param conf
 44 | 	 * @param reducerClass
 45 | 	 * @throws IOException
 46 | 	 * @throws InstantiationException
 47 | 	 * @throws IllegalAccessException
 48 | 	 */
 49 | 	public void execute(Path jobOutput, Path manifestFile, String scratchDir, ShardConfig shardConfig, Configuration conf, Class<? extends BaseESReducer> reducerClass) throws IOException, InstantiationException, IllegalAccessException {
 50 | 		FileSystem fs = FileSystem.get(conf);
 51 | 		ESEmbededContainer esEmbededContainer = null;
 52 | 		boolean rootManifestUploaded = false;
 53 | 		try{
 54 | 			Map<String, Integer> numShardsGenerated = new HashMap<String, Integer>();
 55 | 
 56 | 			// Each reducer spits out it's own manifest file, merge em all together into 1 file
 57 | 			FileUtil.copyMerge(fs, jobOutput, fs, manifestFile, false, conf, "");
 58 | 
 59 | 			// Read the merged file, de-duping entries as it reads
 60 | 			BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(manifestFile)));
 61 | 			String line;
 62 | 			line=br.readLine();
 63 | 			Set<String> indicies = new HashSet<>();
 64 | 			while (line != null){
 65 | 				indicies.add(line);
 66 | 				int count = numShardsGenerated.containsKey(line) ? numShardsGenerated.get(line) : 0;
 67 | 				numShardsGenerated.put(line, count + 1);
 68 | 				line=br.readLine();
 69 | 			}
 70 | 
 71 | 			File scratch = new File(scratchDir);
 72 | 			if(!scratch.exists()) {
 73 | 				// Make the dir if it doesn't exist
 74 | 				scratch.mkdirs();	
 75 | 			} else {
 76 | 				FileUtils.deleteDirectory(scratch);
 77 | 				scratch.mkdirs();
 78 | 			}
 79 | 			
 80 | 			esEmbededContainer = getESEmbededContainer(conf, reducerClass);
 81 | 
 82 | 			String scratchFile = scratchDir + "manifest";
 83 | 			PrintWriter writer = new PrintWriter(scratchFile, "UTF-8");
 84 | 			
 85 | 			// Create all the indexes
 86 | 			for(String index : indicies) {
 87 | 			     esEmbededContainer.getNode().client().admin().indices().prepareCreate(index).setSettings(settingsBuilder()
 88 | 			                .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, shardConfig.getShardsForIndex(index))).get();
 89 | 			}
 90 | 
 91 | 			// Snapshot it
 92 | 			List<String> indexesToSnapshot = new ArrayList<>();
 93 | 			indexesToSnapshot.addAll(indicies);
 94 | 			esEmbededContainer.snapshot(indexesToSnapshot, BaseESReducer.SNAPSHOT_NAME, conf.get(ConfigParams.SNAPSHOT_REPO_NAME_CONFIG_KEY.toString()), null);
 95 | 			
 96 | 			for(String index : indicies) {
 97 | 				try{
 98 | 					placeMissingIndexes(BaseESReducer.SNAPSHOT_NAME, esEmbededContainer, conf, index, shardConfig, !rootManifestUploaded);
 99 | 					// The root level manifests are the same on each one, so it need only be uploaded once
100 | 					rootManifestUploaded = true;
101 | 				} catch (FileNotFoundException | IllegalArgumentException e) {
102 | 					logger.error("Unable to include index " + index + " in the manifest because missing shards could not be generated", e);
103 | 					continue;
104 | 				}
105 | 
106 | 				// Re-write the manifest to local disk
107 | 				writer.println(index);	
108 | 			}
109 | 			
110 | 			// Clean up index from embedded instance
111 | 			for(String index : indicies) {
112 | 				esEmbededContainer.getNode().client().admin().indices().prepareDelete(index).execute();	
113 | 			}
114 | 
115 | 			writer.close();
116 | 
117 | 			// Move the manifest onto HDFS
118 | 			fs.copyFromLocalFile(new Path(scratchFile), manifestFile);
119 | 		} finally {
120 | 			if(esEmbededContainer != null) {
121 | 				esEmbededContainer.getNode().close();
122 | 				while(!esEmbededContainer.getNode().isClosed());
123 | 			}
124 | 			FileUtils.deleteDirectory(new File(conf.get(ConfigParams.SNAPSHOT_WORKING_LOCATION_CONFIG_KEY.toString())));
125 | 		}
126 | 	}
127 | 
128 | 	/**
129 | 	 * 
130 | 	 * @param snapshotName
131 | 	 * @param esEmbededContainer
132 | 	 * @param conf
133 | 	 * @param index
134 | 	 * @param shardConfig
135 | 	 * @param includeRootManifest
136 | 	 * @throws IOException
137 | 	 */
138 | 	public void placeMissingIndexes(String snapshotName, ESEmbededContainer esEmbededContainer, Configuration conf, String index, ShardConfig shardConfig, boolean includeRootManifest) throws IOException {
139 | 		BaseTransport transport = SnapshotTransportStrategy.get(conf.get(ConfigParams.SNAPSHOT_WORKING_LOCATION_CONFIG_KEY.toString()), conf.get(ConfigParams.SNAPSHOT_FINAL_DESTINATION.toString()));
140 | 		transport.placeMissingShards(snapshotName, index, shardConfig, includeRootManifest);			
141 | 	}
142 | 
143 | 	/**
144 | 	 * Returns a ESEmbededContainer configured for some local indexing
145 | 	 * 
146 | 	 * @param conf
147 | 	 * @param reducerClass
148 | 	 * @return
149 | 	 * @throws IOException
150 | 	 * @throws InstantiationException
151 | 	 * @throws IllegalAccessException
152 | 	 */
153 | 	private ESEmbededContainer getESEmbededContainer(Configuration conf, Class<? extends BaseESReducer> reducerClass) throws IOException, InstantiationException, IllegalAccessException {
154 | 		ESEmbededContainer esEmbededContainer = null;
155 | 		BaseESReducer red = reducerClass.newInstance();
156 | 		String templateName = red.getTemplateName();
157 | 		String templateJson = red.getTemplate();
158 | 		red.close();
159 | 		
160 | 		ESEmbededContainer.Builder builder = new ESEmbededContainer.Builder()
161 | 		.withNodeName("embededESTempLoaderNode")
162 | 		.withInMemoryBackedIndexes(true)
163 | 		.withWorkingDir(conf.get(ConfigParams.ES_WORKING_DIR.toString()))
164 | 		.withClusterName("bulkLoadPartition")
165 | 		.withSnapshotWorkingLocation(conf.get(ConfigParams.SNAPSHOT_WORKING_LOCATION_CONFIG_KEY.toString()))
166 | 		.withSnapshotRepoName(conf.get(ConfigParams.SNAPSHOT_REPO_NAME_CONFIG_KEY.toString()));
167 | 		
168 | 		if(templateName != null && templateJson != null) {
169 | 			builder.withTemplate(templateName, templateJson);	
170 | 		}
171 | 		
172 | 		esEmbededContainer = builder.build();
173 | 		return esEmbededContainer;
174 | 	}
175 | }
176 | 


--------------------------------------------------------------------------------
/src/main/java/com/inin/analytics/elasticsearch/ShardConfig.java:
--------------------------------------------------------------------------------
 1 | package com.inin.analytics.elasticsearch;
 2 | 
 3 | import java.util.HashMap;
 4 | import java.util.Map;
 5 | 
 6 | /**
 7 |  * While many ES clusters have a uniform number of shards per index, if your
 8 |  * indexing volume ebbs & flows it makes sense to vary the shard count from day
 9 |  * to day. Given that you're rebuilding indices for historical data one usage pattern
10 |  * has a script reach out to the ES cluster to retrieve the size of each index so
11 |  * a shard count can be be calculated.
12 |  * 
13 |  * For example, perhaps a desirable shard size is 10GB/shard. If the index on the cluster
14 |  * currently takes 25GB, it could be calculated that 3 shards for said index would be appropriate.
15 |  * 
16 |  * Note: Shard size metrics on live ES clusters aren't %100 accurate metric b/c it can vary during merge
17 |  * operations, but the technique is still useful nonetheless. Historical data tends to be less mutable
18 |  * leading to fewer merges. It's up to the implementer to determine an algorithm that fits their
19 |  * implementation. 
20 |  * 
21 |  * 
22 |  * @author drew
23 |  *
24 |  */
25 | public class ShardConfig {
26 |     private Map<String, Long> shardsPerIndex = new HashMap<>();
27 |     private Map<String, Long> shardsPerOrg = new HashMap<>();
28 |     private Long defaultShardsPerIndex = 5l;
29 |     private Long defaultShardsPerOrg = 2l;
30 |     
31 |     public ShardConfig(Map<String, Long> shardsPerIndex, Map<String, Long> shardsPerOrg) {
32 |         this.shardsPerIndex = shardsPerIndex;
33 |         this.shardsPerOrg = shardsPerOrg;
34 |     }
35 |         
36 |     public ShardConfig(Map<String, Long> shardsPerIndex, Map<String, Long> shardsPerOrg, Long defaultShardsPerIndex, Long defaultShardsPerOrg) {
37 |         this.shardsPerIndex = shardsPerIndex;
38 |         this.shardsPerOrg = shardsPerOrg;
39 |         this.defaultShardsPerIndex = defaultShardsPerIndex;
40 |         this.defaultShardsPerOrg = defaultShardsPerOrg;
41 |     }
42 |     
43 |     public ShardConfig() {}
44 |     
45 |     public ShardConfig(Long defaultShardsPerIndex, Long defaultShardsPerOrg) {
46 |         this.defaultShardsPerIndex = defaultShardsPerIndex;
47 |         this.defaultShardsPerOrg = defaultShardsPerOrg;
48 |     }
49 | 
50 |     public Long getShardsForIndex(String index) {
51 |         if(shardsPerIndex.containsKey(index)) {
52 |             return shardsPerIndex.get(index);
53 |         } 
54 |         
55 |         return defaultShardsPerIndex;
56 |     }
57 |     public Long getShardsForOrg(String index) {
58 |         if(shardsPerOrg.containsKey(index)) {
59 |             return shardsPerOrg.get(index);
60 |         }
61 |         return defaultShardsPerOrg;
62 |     }
63 |     
64 |     
65 | 
66 |     @Override
67 |     public String toString() {
68 |         return "ShardConfig [shardsPerIndex=" + shardsPerIndex + ", shardsPerOrg=" + shardsPerOrg
69 |                 + ", defaultShardsPerIndex=" + defaultShardsPerIndex + ", defaultShardsPerOrg=" + defaultShardsPerOrg
70 |                 + "]";
71 |     }
72 | 
73 | }
74 | 


--------------------------------------------------------------------------------
/src/main/java/com/inin/analytics/elasticsearch/driver/Driver.java:
--------------------------------------------------------------------------------
 1 | package com.inin.analytics.elasticsearch.driver;
 2 | 
 3 | import java.util.TimeZone;
 4 | 
 5 | import org.apache.hadoop.util.ProgramDriver;
 6 | import org.joda.time.DateTimeZone;
 7 | 
 8 | import com.inin.analytics.elasticsearch.example.ExampleIndexingJob;
 9 | import com.inin.analytics.elasticsearch.example.ExampleJobPrep;
10 | import com.inin.analytics.elasticsearch.example.GenerateData;
11 | 
12 | public class Driver extends ProgramDriver {
13 | 
14 |     public Driver() throws Throwable {
15 |             super();
16 |             addClass("generateExampleData", GenerateData.class, "Example job for how to build documents for elasticsearch indexing");
17 |             addClass("examplePrep", ExampleJobPrep.class, "Example job for how to build documents for elasticsearch indexing");
18 |             addClass("esIndexRebuildExample", ExampleIndexingJob.class, "Example job for how to rebuild elasticsearch indexes");
19 |     }
20 | 
21 |     public static void main(String[] args) throws Throwable {   	
22 |             DateTimeZone.setDefault(DateTimeZone.UTC);
23 |             TimeZone.setDefault(TimeZone.getTimeZone("GMT"));
24 |             Driver driver = new Driver();
25 |             driver.driver(args);
26 |             System.exit(0);
27 |     }
28 | }
29 | 


--------------------------------------------------------------------------------
/src/main/java/com/inin/analytics/elasticsearch/example/ExampleIndexingJob.java:
--------------------------------------------------------------------------------
 1 | package com.inin.analytics.elasticsearch.example;
 2 | 
 3 | import org.apache.commons.lang.StringUtils;
 4 | import org.apache.hadoop.conf.Configuration;
 5 | import org.apache.hadoop.fs.Path;
 6 | import org.apache.hadoop.io.Text;
 7 | import org.apache.hadoop.mapred.FileInputFormat;
 8 | import org.apache.hadoop.mapred.FileOutputFormat;
 9 | import org.apache.hadoop.mapred.JobClient;
10 | import org.apache.hadoop.mapred.JobConf;
11 | import org.apache.hadoop.mapred.SequenceFileInputFormat;
12 | import org.apache.hadoop.mapred.TextOutputFormat;
13 | import org.apache.hadoop.util.Tool;
14 | 
15 | import com.inin.analytics.elasticsearch.BaseESMapper;
16 | import com.inin.analytics.elasticsearch.ConfigParams;
17 | import com.inin.analytics.elasticsearch.IndexingPostProcessor;
18 | import com.inin.analytics.elasticsearch.ShardConfig;
19 | 
20 | public class ExampleIndexingJob implements Tool {
21 | 
22 | 	private static Configuration conf;
23 | 	public static int main(String[] args) throws Exception {
24 | 		if(args.length != 9) {
25 | 			System.err.println("Invalid # arguments. EG: loadES [pipe separated input] [snapshot working directory (fs/nfs)] [snapshot final destination (s3/nfs/hdfs)] [snapshot repo name] [elasticsearch working data location] [num reducers] [num shards per index] [num shards per organization] [manifest location]");
26 | 			return -1;
27 | 		}
28 | 
29 | 		String inputPath = args[0];
30 | 		String snapshotWorkingLocation = args[1];
31 | 		String snapshotFinalDestination = args[2];
32 | 		String snapshotRepoName = args[3];
33 | 		String esWorkingDir = args[4];
34 | 		Integer numReducers = new Integer(args[5]);
35 | 		Long numShardsPerIndex = new Long(args[6]);
36 | 		Long numShardsPerOrganization = new Long(args[7]);
37 | 		String manifestLocation = args[8];
38 | 
39 | 		// Remove trailing slashes from the destination 
40 | 		snapshotFinalDestination = StringUtils.stripEnd(snapshotFinalDestination, "/");
41 | 
42 | 		conf = new Configuration();
43 | 		conf.set(ConfigParams.SNAPSHOT_WORKING_LOCATION_CONFIG_KEY.toString(), snapshotWorkingLocation);
44 | 		conf.set(ConfigParams.SNAPSHOT_FINAL_DESTINATION.toString(), snapshotFinalDestination);
45 | 		conf.set(ConfigParams.SNAPSHOT_REPO_NAME_CONFIG_KEY.toString(), snapshotRepoName);
46 | 		conf.set(ConfigParams.ES_WORKING_DIR.toString(), esWorkingDir);
47 | 		conf.set(ConfigParams.NUM_SHARDS_PER_INDEX.toString(), numShardsPerIndex.toString());
48 | 		conf.set(ConfigParams.NUM_SHARDS_PER_ORGANIZATION.toString(), numShardsPerOrganization.toString());
49 | 
50 | 		JobConf job = new JobConf(conf, ExampleIndexingJob.class);
51 | 		job.setJobName("Elastic Search Offline Index Generator");
52 | 		job.setInputFormat(SequenceFileInputFormat.class);
53 | 		job.setOutputFormat(TextOutputFormat.class);
54 | 		job.setMapperClass(BaseESMapper.class);
55 | 		job.setReducerClass(ExampleIndexingReducerImpl.class);
56 | 		job.setMapOutputValueClass(Text.class);
57 | 		job.setMapOutputKeyClass(Text.class);
58 | 		job.setNumReduceTasks(numReducers);
59 | 		job.setSpeculativeExecution(false);
60 | 
61 | 		Path jobOutput = new Path(manifestLocation + "/raw/");
62 | 		Path manifestFile = new Path(manifestLocation + "manifest");
63 | 
64 | 		FileOutputFormat.setOutputPath(job, jobOutput);
65 | 
66 | 		// Set up inputs
67 | 		String[]inputFolders = StringUtils.split(inputPath, "|");
68 | 		for(String input : inputFolders) {
69 | 			FileInputFormat.addInputPath(job, new Path(input));
70 | 		}
71 | 
72 | 		JobClient.runJob(job);
73 | 		IndexingPostProcessor postProcessor = new IndexingPostProcessor();
74 | 		postProcessor.execute(jobOutput, manifestFile, esWorkingDir,  new ShardConfig(numShardsPerIndex, numShardsPerOrganization), conf, ExampleIndexingReducerImpl.class);
75 | 		return 0;
76 | 	}
77 | 
78 | 	@Override
79 | 	public void setConf(Configuration conf) {
80 | 		this.conf = conf;
81 | 	}
82 | 
83 | 	@Override
84 | 	public Configuration getConf() {
85 | 		return conf;
86 | 	}
87 | 
88 | 	@Override
89 | 	public int run(String[] args) throws Exception {
90 | 		return ExampleIndexingJob.main(args);
91 | 	}
92 | 
93 | }


--------------------------------------------------------------------------------
/src/main/java/com/inin/analytics/elasticsearch/example/ExampleIndexingReducerImpl.java:
--------------------------------------------------------------------------------
 1 | package com.inin.analytics.elasticsearch.example;
 2 | 
 3 | import org.apache.hadoop.mapred.JobConf;
 4 | 
 5 | import com.inin.analytics.elasticsearch.BaseESReducer;
 6 | import com.inin.analytics.elasticsearch.ConfigParams;
 7 | import com.inin.analytics.elasticsearch.ShardConfig;
 8 | 
 9 | public class ExampleIndexingReducerImpl extends BaseESReducer {
10 | 	
11 | 	/**
12 | 	 * Provide the JSON contents of the index template. This is your hook for configuring ElasticSearch.
13 | 	 * 
14 | 	 * http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/indices-templates.html
15 | 	 */
16 | 	public String getTemplate() {
17 | 		return null;
18 | 	}
19 | 
20 | 	/**
21 | 	 * Return a name for the index template. 
22 | 	 */
23 | 	@Override
24 | 	public String getTemplateName() {
25 | 		return null;
26 | 	}
27 | 
28 |     @Override
29 |     public ShardConfig getShardConfig(JobConf job) {
30 |         Long numShardsPerIndex = job.getLong(ConfigParams.NUM_SHARDS_PER_INDEX.name(), 5l);
31 |         Long numShardsPerOrganization = job.getLong(ConfigParams.NUM_SHARDS_PER_ORGANIZATION.name(), 2l);
32 |         return new ShardConfig(numShardsPerIndex, numShardsPerOrganization);
33 |     }
34 |     
35 | }


--------------------------------------------------------------------------------
/src/main/java/com/inin/analytics/elasticsearch/example/ExampleJobPrep.java:
--------------------------------------------------------------------------------
  1 | package com.inin.analytics.elasticsearch.example;
  2 | 
  3 | import java.io.IOException;
  4 | 
  5 | import org.apache.commons.lang.StringUtils;
  6 | import org.apache.hadoop.conf.Configuration;
  7 | import org.apache.hadoop.fs.Path;
  8 | import org.apache.hadoop.io.LongWritable;
  9 | import org.apache.hadoop.io.Text;
 10 | import org.apache.hadoop.mapreduce.Job;
 11 | import org.apache.hadoop.mapreduce.Mapper;
 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 13 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
 14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 15 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
 16 | import org.apache.hadoop.util.Tool;
 17 | 
 18 | import com.inin.analytics.elasticsearch.BaseESReducer;
 19 | import com.inin.analytics.elasticsearch.index.rotation.ElasticSearchIndexMetadata;
 20 | import com.inin.analytics.elasticsearch.index.routing.ElasticsearchRoutingStrategy;
 21 | import com.inin.analytics.elasticsearch.index.routing.ElasticsearchRoutingStrategyV1;
 22 | 
 23 | /**
 24 |  * Sample hadoop job for taking data from GenerateData.java and writes it out
 25 |  * into a format suitable for ExampleIndexingJob.java 
 26 |  */
 27 | public class ExampleJobPrep  implements Tool {
 28 | 	private static Configuration conf;
 29 | 	private static final String INDEX_TYPE = "conversation";
 30 | 	private static final String INDEX_SUFFIX_CONFIG = "indexSuffixConfigKey";
 31 | 
 32 | 	private static final String NUM_SHARDS_PER_CUSTOMER = "numShardsPerCustomer";
 33 | 	private static final String NUM_SHARDS = "numShards";
 34 | 
 35 | 	public static class DocMapper extends Mapper <LongWritable, Text, Text, Text> {
 36 | 		private ElasticsearchRoutingStrategy elasticsearchRoutingStrategy;
 37 | 
 38 | 		@Override
 39 | 		protected void setup(Context context) throws IOException, InterruptedException {
 40 | 			Integer numShardsPerOrg = context.getConfiguration().getInt(NUM_SHARDS_PER_CUSTOMER, 1);
 41 | 			Integer numShards = context.getConfiguration().getInt(NUM_SHARDS, 1);
 42 | 
 43 | 			ElasticSearchIndexMetadata indexMetadata = new ElasticSearchIndexMetadata();
 44 | 			indexMetadata.setNumShards(numShards);
 45 | 			indexMetadata.setNumShardsPerOrg(numShardsPerOrg);
 46 | 			elasticsearchRoutingStrategy = new ElasticsearchRoutingStrategyV1();
 47 | 			elasticsearchRoutingStrategy.configure(indexMetadata);
 48 | 		}
 49 | 
 50 | 		@Override
 51 | 		public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
 52 | 			String[] csv = StringUtils.split(value.toString(), ",");
 53 | 			String docId = csv[1];
 54 | 			String customerId = csv[0];
 55 | 			String json = "{\"customerId\":\"" + customerId + "\",\"color\":\"" + csv[2] + "\",\"id\":\"" + docId + "\",\"description\":\"" + csv[3] + "\"}";
 56 | 			String routingHash = elasticsearchRoutingStrategy.getRoutingHash(customerId, docId);
 57 | 
 58 | 			Text outputKey = new Text(INDEX_TYPE + BaseESReducer.TUPLE_SEPARATOR + routingHash);
 59 | 			Text outputValue = new Text(INDEX_TYPE + BaseESReducer.TUPLE_SEPARATOR + customerId + BaseESReducer.TUPLE_SEPARATOR + json);
 60 | 			context.write(outputKey, outputValue);
 61 | 		} 
 62 | 	}
 63 | 
 64 | 	public static boolean main(String[] args) throws Exception {
 65 | 		if(args.length != 5) {
 66 | 			System.err.println("Invalid # arguments. EG: loadES [pipe separated paths to source files containing segments & properties] [output location] [index name suffix] [numShardsPerIndex] [maxNumShardsPerCustomer (for routing)]");
 67 | 		}
 68 | 
 69 | 		String inputs = args[0];
 70 | 		String output = args[1];
 71 | 		String indexSuffix = args[2];
 72 | 		Integer numShards = new Integer(args[3]);
 73 | 		Integer numShardsPerCustomer = new Integer(args[4]);
 74 | 
 75 | 		conf = new Configuration();
 76 | 		Job job = Job.getInstance(conf, "Prep example");
 77 | 		job.setJarByClass(ExampleJobPrep.class);
 78 | 		job.setMapperClass(DocMapper.class);
 79 | 		job.setOutputKeyClass(Text.class);
 80 | 		job.setOutputValueClass(Text.class);
 81 | 		job.setInputFormatClass(SequenceFileInputFormat.class);
 82 | 		job.setOutputFormatClass(SequenceFileOutputFormat.class);
 83 | 
 84 | 		job.setNumReduceTasks(0);
 85 | 
 86 | 		job.getConfiguration().set(INDEX_SUFFIX_CONFIG, indexSuffix);
 87 | 		job.getConfiguration().set(NUM_SHARDS_PER_CUSTOMER, numShardsPerCustomer.toString());
 88 | 		job.getConfiguration().set(NUM_SHARDS, numShards.toString());
 89 | 
 90 | 		FileOutputFormat.setOutputPath(job, new Path(output));
 91 | 
 92 | 		// Set up inputs
 93 | 		String[]inputFolders = StringUtils.split(inputs, "|");
 94 | 		for(String input : inputFolders) {
 95 | 			FileInputFormat.addInputPath(job, new Path(input));	
 96 | 		}
 97 | 
 98 | 		return job.waitForCompletion(true);
 99 | 	}
100 | 
101 | 	@Override
102 | 	public void setConf(Configuration conf) {
103 | 		this.conf = conf;
104 | 	}
105 | 
106 | 	@Override
107 | 	public Configuration getConf() {
108 | 		return conf;
109 | 	}
110 | 
111 | 	@Override
112 | 	public int run(String[] args) throws Exception {
113 | 		boolean success = ExampleJobPrep.main(args);
114 | 		if(success) {
115 | 			return 0;
116 | 		} else {
117 | 			return 1;
118 | 		}
119 | 	}
120 | 
121 | }


--------------------------------------------------------------------------------
/src/main/java/com/inin/analytics/elasticsearch/example/GenerateData.java:
--------------------------------------------------------------------------------
 1 | package com.inin.analytics.elasticsearch.example;
 2 | 
 3 | import java.util.UUID;
 4 | 
 5 | import org.apache.commons.lang.RandomStringUtils;
 6 | import org.apache.hadoop.conf.Configuration;
 7 | import org.apache.hadoop.fs.FileSystem;
 8 | import org.apache.hadoop.fs.Path;
 9 | import org.apache.hadoop.io.LongWritable;
10 | import org.apache.hadoop.io.SequenceFile;
11 | import org.apache.hadoop.io.Text;
12 | 
13 | public class GenerateData {
14 | 	
15 | 	public static void main(String[] args) throws Exception {
16 | 		if(args.length != 2) {
17 | 			System.err.println("Invalid # arguments. EG: [num sample records] [output file]");
18 | 			return;
19 | 		}
20 | 
21 | 		Long numRecords = new Long(args[0]);
22 | 		
23 | 		String attribute = null;
24 | 		String customer = null;
25 | 		String customer1 = UUID.randomUUID().toString();
26 | 		String customer2 = UUID.randomUUID().toString();
27 | 		
28 | 		Configuration config = new Configuration();     
29 | 		FileSystem fs = FileSystem.get(config); 
30 | 		Path filenamePath = new Path(args[1]);  
31 | 		SequenceFile.Writer inputWriter = new SequenceFile.Writer(fs, config, filenamePath, LongWritable.class, Text.class);
32 | 
33 | 		for(Long x = 0l; x < numRecords; x++) {
34 | 			if(x % 2 == 0) {
35 | 				attribute = "yellow";
36 | 			} else {
37 | 				attribute = "blue";
38 | 			}
39 | 			
40 | 			if(x % 3 == 0) {
41 | 				customer = customer1;
42 | 			} else {
43 | 				customer = customer2;
44 | 			}
45 | 			
46 | 			inputWriter.append(new LongWritable(x), new Text(customer + "," + UUID.randomUUID().toString() + "," + attribute + "," + RandomStringUtils.randomAlphanumeric(15)));
47 | 		}
48 | 		
49 | 		inputWriter.close();
50 | 	}
51 | }
52 | 


--------------------------------------------------------------------------------
/src/main/java/com/inin/analytics/elasticsearch/index/rotation/ElasticSearchIndexMetadata.java:
--------------------------------------------------------------------------------
  1 | package com.inin.analytics.elasticsearch.index.rotation;
  2 | 
  3 | import org.joda.time.LocalDate;
  4 | 
  5 | import com.google.gson.annotations.Expose;
  6 | import com.google.gson.annotations.SerializedName;
  7 | 
  8 | /**
  9 |  * When an index has been rebuilt and we're substituting it in, this class holds onto
 10 |  * some metadata about the rebuilt index.
 11 |  * 
 12 |  * @author drew
 13 |  *
 14 |  */
 15 | public class ElasticSearchIndexMetadata {
 16 | 
 17 | 	/**
 18 | 	 * Name of the index when it was first created
 19 | 	 */
 20 | 	
 21 | 	@Expose
 22 | 	private String indexNameAtBirth;
 23 | 	
 24 | 	/**
 25 | 	 * Name of the index after it was rebuilt
 26 | 	 */
 27 | 	
 28 | 	@Expose
 29 | 	private String rebuiltIndexName;
 30 | 
 31 | 	/**
 32 | 	 * A shortened alias for the rebuilt index
 33 | 	 */
 34 | 	
 35 | 	@Expose
 36 | 	private String rebuiltIndexAlias;
 37 | 
 38 | 	/**
 39 | 	 * Date associated with the index (if applicable such as date partitioned data)
 40 | 	 */
 41 | 	
 42 | 	@SerializedName("indexLocalDate") 
 43 | 	@Expose
 44 | 	private LocalDate indexDate;
 45 | 	
 46 | 	/**
 47 | 	 * Number if shards in the index
 48 | 	 */
 49 | 	
 50 | 	@Expose
 51 | 	private int numShards;
 52 | 	
 53 | 	/**
 54 | 	 * Useful for the routing strategy, the number of shards an organization's data is split across within an index
 55 | 	 */
 56 | 	
 57 | 	@Expose
 58 | 	private int numShardsPerOrg;
 59 | 	
 60 | 	/**
 61 | 	 * Class name for the routing strategy
 62 | 	 */
 63 | 	
 64 | 	@Expose
 65 | 	private String routingStrategyClassName;
 66 | 	
 67 | 	public String getIndexNameAtBirth() {
 68 | 		return indexNameAtBirth;
 69 | 	}
 70 | 	public void setIndexNameAtBirth(String indexNameAtBirth) {
 71 | 		this.indexNameAtBirth = indexNameAtBirth;
 72 | 	}
 73 | 	public String getRebuiltIndexAlias() {
 74 | 		return rebuiltIndexAlias;
 75 | 	}
 76 | 	public void setRebuiltIndexAlias(String rebuiltIndexAlias) {
 77 | 		this.rebuiltIndexAlias = rebuiltIndexAlias;
 78 | 	}
 79 | 	public int getNumShardsPerOrg() {
 80 | 		return numShardsPerOrg;
 81 | 	}
 82 | 	public void setNumShardsPerOrg(int numShardsPerOrg) {
 83 | 		this.numShardsPerOrg = numShardsPerOrg;
 84 | 	}
 85 | 	public String getRoutingStrategyClassName() {
 86 | 		return routingStrategyClassName;
 87 | 	}
 88 | 	public void setRoutingStrategyClassName(String routingStrategyClassName) {
 89 | 		this.routingStrategyClassName = routingStrategyClassName;
 90 | 	}
 91 | 	public int getNumShards() {
 92 | 		return numShards;
 93 | 	}
 94 | 	public void setNumShards(int numShards) {
 95 | 		this.numShards = numShards;
 96 | 	}
 97 | 	public String getRebuiltIndexName() {
 98 | 		return rebuiltIndexName;
 99 | 	}
100 | 	public void setRebuiltIndexName(String rebuiltIndexName) {
101 | 		this.rebuiltIndexName = rebuiltIndexName;
102 | 	}
103 | 	public LocalDate getDate() {
104 | 		return indexDate;
105 | 	}
106 | 	public void setDate(LocalDate date) {
107 | 		this.indexDate = date;
108 | 	}
109 | 	@Override
110 | 	public String toString() {
111 | 		return "RotatedIndexMetadata [indexNameAtBirth=" + indexNameAtBirth
112 | 				+ ", rebuiltIndexName=" + rebuiltIndexName
113 | 				+ ", rebuiltIndexAlias=" + rebuiltIndexAlias + ", indexDate="
114 | 				+ indexDate + ", numShards=" + numShards + ", numShardsPerOrg="
115 | 				+ numShardsPerOrg + ", routingStrategyClassName="
116 | 				+ routingStrategyClassName + "]";
117 | 	}
118 | 	
119 | 	
120 | 	
121 | }
122 | 


--------------------------------------------------------------------------------
/src/main/java/com/inin/analytics/elasticsearch/index/rotation/ElasticsearchIndexRotationManager.java:
--------------------------------------------------------------------------------
 1 | package com.inin.analytics.elasticsearch.index.rotation;
 2 | 
 3 | 
 4 | /**
 5 |  * Swapping rebuilt indexes into an ES cluster with zero downtime requires holding 
 6 |  * onto some state about the indexes. For example, if you rebuilt you cluster every night 
 7 |  * then depending on data retention you could have multiple date stamped [on creation] versions
 8 |  * of an index. Here we keep track of what version is in use. 
 9 |  * 
10 |  * @author drew
11 |  *
12 |  */
13 | public interface ElasticsearchIndexRotationManager {
14 | 	
15 | 	/**
16 | 	 * Look up meta data for an index based on the original name it was created with
17 | 	 * @param indexNameAtBirth
18 | 	 * @return ElasticSearchIndexMetadata
19 | 	 */
20 | 	ElasticSearchIndexMetadata getElasticSearchIndexMetadata(String indexNameAtBirth);
21 | 	
22 | 	/**
23 | 	 * Once a rebuilt index has been loaded into ElasticSearch, this registers the metadata
24 | 	 * with zookeeper so that other parts of the system know they can start using the new index. 
25 | 	 * @param rotatedIndexMetadata
26 | 	 */
27 | 	void registerIndexAvailableOnRotation(ElasticSearchIndexMetadata rotatedIndexMetadata);
28 | 	
29 | 	
30 | 	/**
31 | 	 * Optional: When rebuilding indexes, hold onto the state of the rebuild process. This is 
32 | 	 * useful if you wish to defer writes to an index being rebuilt until it's done and swapped in.  
33 | 	 *  
34 | 	 * @param state
35 | 	 */
36 | 	void updateRebuildPipelineState(RebuildPipelineState state);
37 | 	
38 | 	/**
39 | 	 * Get the current state of the index rebuild process. Again, this is optional and relies on updateRebuildPipelineState being used.
40 | 	 * 
41 | 	 * @return RebuildPipelineState
42 | 	 */
43 | 	RebuildPipelineState getRebuildPipelineState();
44 | }
45 | 


--------------------------------------------------------------------------------
/src/main/java/com/inin/analytics/elasticsearch/index/rotation/ElasticsearchIndexRotationManagerNoop.java:
--------------------------------------------------------------------------------
 1 | package com.inin.analytics.elasticsearch.index.rotation;
 2 | 
 3 | 
 4 | 
 5 | public class ElasticsearchIndexRotationManagerNoop implements ElasticsearchIndexRotationManager {
 6 | 
 7 | 
 8 | 	@Override
 9 | 	public void registerIndexAvailableOnRotation(ElasticSearchIndexMetadata rotatedIndexMetadata) {
10 | 		
11 | 	}
12 | 
13 | 	@Override
14 | 	public void updateRebuildPipelineState(RebuildPipelineState state) {
15 | 		
16 | 	}
17 | 
18 | 	@Override
19 | 	public RebuildPipelineState getRebuildPipelineState() {
20 | 		return RebuildPipelineState.COMPLETE;
21 | 	}
22 | 
23 | 	@Override
24 | 	public ElasticSearchIndexMetadata getElasticSearchIndexMetadata(String indexNameAtBirth) {
25 | 		ElasticSearchIndexMetadata rotatedIndexMetadata = new ElasticSearchIndexMetadata();
26 | 		rotatedIndexMetadata.setIndexNameAtBirth(indexNameAtBirth);
27 | 		return rotatedIndexMetadata;
28 | 	}
29 | 
30 | }
31 | 


--------------------------------------------------------------------------------
/src/main/java/com/inin/analytics/elasticsearch/index/rotation/ElasticsearchIndexRotationManagerZookeeper.java:
--------------------------------------------------------------------------------
  1 | package com.inin.analytics.elasticsearch.index.rotation;
  2 | 
  3 | import java.util.HashMap;
  4 | import java.util.Map;
  5 | import java.util.Map.Entry;
  6 | 
  7 | import org.apache.commons.lang.StringUtils;
  8 | import org.apache.curator.framework.CuratorFramework;
  9 | import org.apache.curator.framework.listen.Listenable;
 10 | import org.apache.curator.framework.recipes.cache.ChildData;
 11 | import org.apache.curator.framework.recipes.cache.NodeCache;
 12 | import org.apache.curator.framework.state.ConnectionState;
 13 | import org.apache.curator.framework.state.ConnectionStateListener;
 14 | import org.apache.zookeeper.KeeperException;
 15 | import org.apache.zookeeper.KeeperException.Code;
 16 | import org.slf4j.Logger;
 17 | import org.slf4j.LoggerFactory;
 18 | 
 19 | import com.google.common.base.Preconditions;
 20 | import com.google.gson.Gson;
 21 | import com.inin.analytics.elasticsearch.util.GsonFactory;
 22 | 
 23 | public abstract class ElasticsearchIndexRotationManagerZookeeper implements ElasticsearchIndexRotationManager {
 24 | 
 25 | 	protected abstract String getBaseZnode();
 26 | 	protected abstract String getRebuildStateZnode();
 27 | 	
 28 | 	protected CuratorFramework curator;
 29 | 	
 30 | 	protected static transient Logger logger = LoggerFactory.getLogger(ElasticsearchIndexRotationManagerZookeeper.class);
 31 | 	protected Map<String, NodeCache> indexNameCache = new HashMap<>();
 32 | 	protected NodeCache rebuildStateCache;
 33 | 	protected Gson gson;
 34 | 	protected static final String FAIL_MESSAGE = "Failed getting routing strategy from zookeeper for ";
 35 | 	protected Listenable<ConnectionStateListener> connectionStateListener;
 36 | 	
 37 | 
 38 | 
 39 | 	public void setCurator(CuratorFramework curator) {
 40 | 		this.curator = curator;
 41 | 	}
 42 | 
 43 | 	public void init() {
 44 | 		Preconditions.checkNotNull(curator, "curator is a required dependency");
 45 | 		gson = GsonFactory.buildGsonBuilder().excludeFieldsWithoutExposeAnnotation().create();
 46 | 
 47 | 		// AR-1785 Create watcher to rebuild nodeCache after ZK reconnects from a connection blip 
 48 | 		connectionStateListener = curator.getConnectionStateListenable();
 49 | 		connectionStateListener.addListener(new ConnectionStateListener() {
 50 | 			@Override
 51 | 			public void stateChanged(CuratorFramework curator, ConnectionState state) {
 52 | 				if (state.equals(ConnectionState.RECONNECTED) && indexNameCache != null) {
 53 | 					for(Entry<String, NodeCache> nodeCachePair : indexNameCache.entrySet()) {
 54 | 						try {
 55 | 							// nodeCache stops updating after a connection blip nukes its listener. I'd almost consider that a bug in curator, but for now this is the advised workaround.  
 56 | 							logger.info("ZK connection reconnect detected, rebuilding curator nodeCache for " + nodeCachePair.getKey());
 57 | 							nodeCachePair.getValue().rebuild();
 58 | 						} catch (Exception e) {
 59 | 							logger.warn("Failed to rebuild nodeCache after ZK reconnect ", e);
 60 | 						}
 61 | 					}
 62 | 				}
 63 | 			}
 64 | 		});
 65 | 	}
 66 | 
 67 | 	protected void ensureNodePathExists(String zkPath) throws Exception {
 68 | 		try {
 69 | 			String[] pieces = StringUtils.split(zkPath, "/");
 70 | 			String znode = "";
 71 | 			for(String piece : pieces) {
 72 | 				znode = znode + "/" + piece;
 73 | 				try {
 74 | 					curator.getData().forPath(znode);
 75 | 				} catch (KeeperException e) {
 76 | 					Code errorCode = e.code();
 77 | 					if(errorCode.equals(Code.NONODE)) {
 78 | 						curator.create().forPath(znode);
 79 | 					}
 80 | 				}
 81 | 			}
 82 | 		} catch (Exception e) {
 83 | 			throw new IllegalStateException("Failed to initialize ElasticsearchIndexRotationStrategy because base ZK nodes could not be created", e);
 84 | 		}
 85 | 	}
 86 | 
 87 | 
 88 | 	/**
 89 | 	 * Register that an index is on the hadoop rebuild train. We'll store the index name that the live data
 90 | 	 * would normally flow into with the alias to the rebuilt index. EG
 91 | 	 * 
 92 | 	 * New index at birth looks like
 93 | 	 * c140101
 94 | 	 * 
 95 | 	 * Rebuilt index 
 96 | 	 * c140101_reubild_82389238923
 97 | 	 * 
 98 | 	 * Rebuilt index alias 
 99 | 	 * c140101_r
100 | 	 * 
101 | 	 * @param bucket
102 | 	 */
103 | 	
104 | 	@Override
105 | 	public void registerIndexAvailableOnRotation(ElasticSearchIndexMetadata rotatedIndexMetadata) {
106 | 		String indexNameZnode = getBaseZnode() + rotatedIndexMetadata.getIndexNameAtBirth();
107 | 		try {
108 | 			ensureNodePathExists(indexNameZnode);
109 | 			// Persisting metadata in json
110 | 			curator.setData().forPath(indexNameZnode, gson.toJson(rotatedIndexMetadata).getBytes());
111 | 		} catch (Exception e) {
112 | 			throw new IllegalStateException("Unable to register znode " + indexNameZnode, e);
113 | 		}
114 | 	}
115 | 	
116 | 	/**
117 | 	 * @param String 
118 | 	 * @return ElasticSearchIndexMetadata
119 | 	 */
120 | 	public ElasticSearchIndexMetadata getElasticSearchIndexMetadata(String indexNameAtBirth) {
121 | 		String znode = getBaseZnode() + indexNameAtBirth;
122 | 		try {
123 | 			if(!indexNameCache.containsKey(indexNameAtBirth)) {
124 | 				createNodeCacheForName(znode, indexNameAtBirth);
125 | 			}
126 | 			ChildData cd = indexNameCache.get(indexNameAtBirth).getCurrentData();
127 | 			
128 | 			if(cd != null) {
129 | 				String json = new String(cd.getData());
130 | 				return gson.fromJson(json, ElasticSearchIndexMetadata.class);
131 | 			}
132 | 		} catch (Exception e) {
133 | 			throw new IllegalStateException("Error retrieving znode, unable to maintain index metadata ", e);
134 | 		}
135 | 
136 | 		ElasticSearchIndexMetadata metadata = new ElasticSearchIndexMetadata();
137 | 		metadata.setIndexNameAtBirth(indexNameAtBirth);
138 | 		return metadata;
139 | 	}
140 | 	
141 | 
142 | 	protected synchronized void createNodeCacheForName(String zkPath, final String indexName) throws Exception {
143 | 		final NodeCache nodeCache = new NodeCache(curator, zkPath);
144 | 		nodeCache.start(true);
145 | 		indexNameCache.put(indexName, nodeCache);
146 | 	}
147 | 
148 | 	@Override
149 | 	public void updateRebuildPipelineState(RebuildPipelineState state) {
150 | 		try {
151 | 			ensureNodePathExists(getRebuildStateZnode());
152 | 			// Persisting metadata in json
153 | 			curator.setData().forPath(getRebuildStateZnode(), state.name().getBytes());
154 | 		} catch (Exception e) {
155 | 			throw new IllegalStateException("Unable to register state " + state, e);
156 | 		}
157 | 	}
158 | 
159 | 	/**
160 | 	 * @return RebuildPipelineState
161 | 	 */
162 | 	@Override
163 | 	public RebuildPipelineState getRebuildPipelineState() {
164 | 		if(rebuildStateCache == null) {
165 | 			rebuildStateCache = new NodeCache(curator, getRebuildStateZnode());
166 | 			try {
167 | 				rebuildStateCache.start(true);
168 | 			} catch (Exception e) {
169 | 				throw new IllegalStateException("Unable to get pipeline rebuild state", e);
170 | 			}
171 | 		}
172 | 		ChildData cd = rebuildStateCache.getCurrentData();
173 | 		if(cd != null) {
174 | 			return RebuildPipelineState.valueOf(new String(cd.getData()));
175 | 		} else {
176 | 			// COMPLETE ~= NOT_RUNNING, so if it's never been ran that's what we default to. At some point we'll want to fix the enum.
177 | 			return RebuildPipelineState.COMPLETE;
178 | 		}
179 | 	}
180 | 
181 | 
182 | }
183 | 


--------------------------------------------------------------------------------
/src/main/java/com/inin/analytics/elasticsearch/index/rotation/ExampleElasticsearchIndexRotationStrategyZookeeper.java:
--------------------------------------------------------------------------------
 1 | package com.inin.analytics.elasticsearch.index.rotation;
 2 | 
 3 | public class ExampleElasticsearchIndexRotationStrategyZookeeper extends ElasticsearchIndexRotationManagerZookeeper  {
 4 | 	private static final String INDEX_NAME_BASE_ZNODE = "/example/index/alias/";
 5 | 	private static final String INDEX_REBUILD_PIPELINE_STATE_ZNODE = "/example/index/rebuild/pipeline/state";
 6 | 	
 7 | 	@Override
 8 | 	protected String getBaseZnode() {
 9 | 		return INDEX_NAME_BASE_ZNODE;
10 | 	}
11 | 	@Override
12 | 	protected String getRebuildStateZnode() {
13 | 		return INDEX_REBUILD_PIPELINE_STATE_ZNODE;
14 | 	}
15 | }


--------------------------------------------------------------------------------
/src/main/java/com/inin/analytics/elasticsearch/index/rotation/RebuildPipelineState.java:
--------------------------------------------------------------------------------
1 | package com.inin.analytics.elasticsearch.index.rotation;
2 | 
3 | public enum RebuildPipelineState {
4 | 	RUNNING, // Pipeline is running
5 | 	COMPLETE  // Pipeline is done running
6 | }
7 | 


--------------------------------------------------------------------------------
/src/main/java/com/inin/analytics/elasticsearch/index/routing/ElasticsearchRoutingStrategy.java:
--------------------------------------------------------------------------------
 1 | package com.inin.analytics.elasticsearch.index.routing;
 2 | 
 3 | import com.inin.analytics.elasticsearch.index.rotation.ElasticSearchIndexMetadata;
 4 | 
 5 | public interface ElasticsearchRoutingStrategy extends java.io.Serializable {
 6 | 	String getRoutingHash(String orgId, String convId);
 7 | 	String[] getPossibleRoutingHashes(String orgId);
 8 | 	void configure(ElasticSearchIndexMetadata rotatedIndexMetadata);
 9 | 	int getNumShardsPerOrg();
10 | 	int getNumShards();
11 | }
12 | 


--------------------------------------------------------------------------------
/src/main/java/com/inin/analytics/elasticsearch/index/routing/ElasticsearchRoutingStrategyV1.java:
--------------------------------------------------------------------------------
  1 | package com.inin.analytics.elasticsearch.index.routing;
  2 | 
  3 | import java.util.HashMap;
  4 | import java.util.Map;
  5 | 
  6 | import com.google.common.base.Preconditions;
  7 | import com.inin.analytics.elasticsearch.index.rotation.ElasticSearchIndexMetadata;
  8 | import com.inin.analytics.elasticsearch.util.MurmurHash;
  9 | 
 10 | 
 11 | /**
 12 |  * This routing strategy for elasticsearch. Read up on what routing does here
 13 |  * http://www.elasticsearch.org/blog/customizing-your-document-routing/ 
 14 |  * 
 15 |  * Perhaps you have 10 shards per index and you don't wish to query every shard 
 16 |  * every time you do a search against an org. A simple sharding strategy would
 17 |  * put all the data for 1 org on 1 shard using consistant hashing on orgId. However that 
 18 |  * has the potential to hotspot some shards if an org with a lot of data comes
 19 |  * through.
 20 |  * 
 21 |  * This attempts to alleviate that by making the # subset of shards configurable. EG
 22 |  * numShards = 10, numShardsPerOrg = 3, all of an org's data will be split to one of 
 23 |  * 3 shards. Which one of the 3 is determined by hashing the conversationId.
 24 |  *  
 25 |  * Note: DO NOT CHANGE THIS CLASS. It's immutable once it's been used to generate ES indexes
 26 |  * so changing it affects data routing and will make data appear unavailable b/c its looking
 27 |  * in the wrong shard. The correct thing to do is to make a newer version of this class,
 28 |  * say ElasticsearchRoutingStrategyV2 and see to it that the hadoop jobs to rebuild
 29 |  * the ES indexes not only use it, but update zookeeper with which implementation
 30 |  * indexes were built with. That way you can evolve the routing strategy without breaking
 31 |  * anything.
 32 |  * 
 33 |  * @author drew
 34 |  *
 35 |  */
 36 | public class ElasticsearchRoutingStrategyV1 implements ElasticsearchRoutingStrategy, java.io.Serializable {
 37 | 	private static final long serialVersionUID = 1L;
 38 | 	private int numShardsPerOrg;
 39 | 	private int numShards;
 40 | 	private Map<Integer, Integer> shardToRout = new HashMap<>();
 41 | 
 42 | 	/**
 43 | 	 * Adapted from DjbHashFunction & PlainOperationRouting in Elasticsearch. This is the default hashing 
 44 | 	 * Algorithm for doc routing. We need this to reverse engineer routing strings that rout to
 45 | 	 * the shard we want. 
 46 | 	 * 
 47 | 	 * @param value
 48 | 	 * @return
 49 | 	 */
 50 | 	public int hash(String value) {
 51 | 		long hash = 5381;
 52 | 		for (int i = 0; i < value.length(); i++) {
 53 | 			hash = ((hash << 5) + hash) + value.charAt(i);
 54 | 		}
 55 | 
 56 | 		return Math.abs((int) hash % numShards);
 57 | 	}
 58 | 
 59 | 	public void init() {
 60 | 		Integer x = 0;
 61 | 		while(shardToRout.size() < numShards) {
 62 | 			Integer hash = hash(x.toString());
 63 | 			if(shardToRout.get(x) == null) {
 64 | 				shardToRout.put(x, hash);
 65 | 			}
 66 | 			x++;
 67 | 		}
 68 | 	}
 69 | 
 70 | 	public ElasticsearchRoutingStrategyV1() {
 71 | 
 72 | 	}
 73 | 
 74 | 	@Override
 75 | 	public void configure(ElasticSearchIndexMetadata rotatedIndexMetadata) {
 76 | 		Preconditions.checkNotNull(rotatedIndexMetadata.getNumShardsPerOrg(), "Num shards per org must not be null with " + this.getClass().getSimpleName());
 77 | 		Preconditions.checkNotNull(rotatedIndexMetadata.getNumShards(), "Num shards must not be null with " + this.getClass().getSimpleName());
 78 | 		this.numShardsPerOrg = rotatedIndexMetadata.getNumShardsPerOrg();
 79 | 		this.numShards = rotatedIndexMetadata.getNumShards();
 80 | 		init();
 81 | 	}
 82 | 
 83 | 	@Override
 84 | 	public int getNumShardsPerOrg() {
 85 | 		return numShardsPerOrg;
 86 | 	}
 87 | 
 88 | 	@Override
 89 | 	public int getNumShards() {
 90 | 		return numShards;
 91 | 	}
 92 | 	
 93 | 
 94 | 	public Map<Integer, Integer> getShardToRout() {
 95 | 		return shardToRout;
 96 | 	}
 97 | 
 98 | 	/**
 99 | 	 * For an orgId & convId, get the shard routing for a document.  
100 | 	 * 
101 | 	 * Note: ES re-hashes routing values so shard 1 wont necessarily mean 
102 | 	 * your data ends up in shard 1. However, if you realize that 
103 | 	 * then you're in a bad place. 
104 | 	 * 
105 | 	 * 
106 | 	 * @param orgId
107 | 	 * @param convId
108 | 	 * @param numShards
109 | 	 * @param numShardsPerOrg
110 | 	 * @return
111 | 	 */
112 | 
113 | 	@Override
114 | 	public String getRoutingHash(String orgId, String convId) {
115 | 		Preconditions.checkArgument(numShards >= numShardsPerOrg, "Misconfigured, numShards must be >= numShardsPerOrg");
116 | 		int orgIdHash = getOrgIdHash(orgId, numShards);
117 | 		int convIdHash = Math.abs(MurmurHash.getInstance().hash(convId.getBytes(), 0)) % numShardsPerOrg;
118 | 
119 | 		int shard = orgIdHash + convIdHash;
120 | 		while(shard >= numShards) {
121 | 			shard -= numShards;
122 | 		}
123 | 
124 | 		return shardToRout.get(shard).toString();
125 | 	}
126 | 
127 | 	/**
128 | 	 * When searching data for an Org, you may desire to only search the shards
129 | 	 * which hold data for that Org. This gives you a list of possible shard routings.
130 | 	 * 
131 | 	 * @param orgId
132 | 	 * @param numShards
133 | 	 * @param numShardsPerOrg
134 | 	 * @return
135 | 	 */
136 | 	@Override
137 | 	public String[] getPossibleRoutingHashes(String orgId) {
138 | 		int orgIdHash = getOrgIdHash(orgId, numShards);
139 | 		String[] possibleShards = new String[numShardsPerOrg];
140 | 		for(int x = 0; x < numShardsPerOrg; x ++) {
141 | 			int shard = orgIdHash + x;
142 | 			while(shard >= numShards) {
143 | 				shard -= numShards;
144 | 			}
145 | 			possibleShards[x] = shardToRout.get(shard).toString();
146 | 		}
147 | 		return possibleShards;
148 | 	}
149 | 
150 | 	private int getOrgIdHash(String orgId, int numShards) {
151 | 		return Math.abs(MurmurHash.getInstance().hash(orgId.getBytes(), 0)) % numShards;
152 | 	}
153 | 
154 | 	@Override
155 | 	public int hashCode() {
156 | 		final int prime = 31;
157 | 		int result = 1;
158 | 		result = prime * result + numShards;
159 | 		result = prime * result + numShardsPerOrg;
160 | 		return result;
161 | 	}
162 | 
163 | 	@Override
164 | 	public boolean equals(Object obj) {
165 | 		if (this == obj)
166 | 			return true;
167 | 		if (obj == null)
168 | 			return false;
169 | 		if (getClass() != obj.getClass())
170 | 			return false;
171 | 		ElasticsearchRoutingStrategyV1 other = (ElasticsearchRoutingStrategyV1) obj;
172 | 		if (numShards != other.numShards)
173 | 			return false;
174 | 		if (numShardsPerOrg != other.numShardsPerOrg)
175 | 			return false;
176 | 		return true;
177 | 	}
178 | 
179 |     @Override
180 |     public String toString() {
181 |         return "ElasticsearchRoutingStrategyV1 [numShardsPerOrg=" + numShardsPerOrg + ", numShards=" + numShards
182 |                 + ", shardToRout=" + shardToRout + "]";
183 |     }
184 | 
185 | 	
186 | }
187 | 


--------------------------------------------------------------------------------
/src/main/java/com/inin/analytics/elasticsearch/index/selector/RealtimeIndexSelectionStrategy.java:
--------------------------------------------------------------------------------
 1 | package com.inin.analytics.elasticsearch.index.selector;
 2 | 
 3 | import java.util.List;
 4 | import java.util.Set;
 5 | 
 6 | import com.inin.analytics.elasticsearch.index.rotation.ElasticSearchIndexMetadata;
 7 | import com.inin.analytics.elasticsearch.index.routing.ElasticsearchRoutingStrategy;
 8 | 
 9 | public interface RealtimeIndexSelectionStrategy {
10 | 	ElasticsearchRoutingStrategy get(ElasticSearchIndexMetadata rotatedIndexMetadata);
11 | 	Set<ElasticsearchRoutingStrategy> getRoutingStrategiesForIndicies(List<ElasticSearchIndexMetadata> indices);
12 | 
13 | 	@Deprecated // Use getRoutingStrategiesForIndicies
14 | 	ElasticsearchRoutingStrategy getRoutingStrategyForIndicies(List<ElasticSearchIndexMetadata> indices);
15 | 	String getIndexWritable(ElasticSearchIndexMetadata rotatedIndexMetadata);
16 | 	String getIndexReadable(ElasticSearchIndexMetadata rotatedIndexMetadata);
17 | 
18 | }
19 | 


--------------------------------------------------------------------------------
/src/main/java/com/inin/analytics/elasticsearch/index/selector/RealtimeIndexSelectionStrategyLagged.java:
--------------------------------------------------------------------------------
  1 | package com.inin.analytics.elasticsearch.index.selector;
  2 | 
  3 | import java.util.HashSet;
  4 | import java.util.List;
  5 | import java.util.Set;
  6 | 
  7 | import org.joda.time.DateTime;
  8 | import org.slf4j.Logger;
  9 | import org.slf4j.LoggerFactory;
 10 | 
 11 | import com.inin.analytics.elasticsearch.index.rotation.ElasticSearchIndexMetadata;
 12 | import com.inin.analytics.elasticsearch.index.routing.ElasticsearchRoutingStrategy;
 13 | 
 14 | 
 15 | /**
 16 |  * For a realtime engine feeding data into elasticsearch or pulling data out, some strategies
 17 |  * around which index to use are employed here. For example, we know if live data is streaming
 18 |  * into today's index that we may want to ignore today's index that got rebuilt in hadoop because
 19 |  * the data invariably changed from the second the job started. This employs an x day lag on
 20 |  * switching over reads/writes to hadoop generated indexes.
 21 |  *  
 22 |  * @author drew
 23 |  *
 24 |  */
 25 | public class RealtimeIndexSelectionStrategyLagged implements RealtimeIndexSelectionStrategy {
 26 | 
 27 | 	private int LAG = 2;
 28 | 	protected static final String FAIL_MESSAGE = "Failed getting routing strategy from zookeeper for ";
 29 | 	private static transient Logger logger = LoggerFactory.getLogger(RealtimeIndexSelectionStrategyLagged.class);
 30 | 	
 31 | 
 32 | 	/**
 33 | 	 * 
 34 | 	 * @param lAG
 35 | 	 */
 36 | 	public RealtimeIndexSelectionStrategyLagged(int lAG) {
 37 | 		super();
 38 | 		LAG = lAG;
 39 | 	}
 40 | 
 41 | 	public ElasticsearchRoutingStrategy get(ElasticSearchIndexMetadata rotatedIndexMetadata) {
 42 | 		
 43 | 		DateTime now = new DateTime();
 44 | 		if(rotatedIndexMetadata != null && rotatedIndexMetadata.getRoutingStrategyClassName() != null && !rotatedIndexMetadata.getDate().isAfter(now.minusDays(LAG).toLocalDate())) {
 45 | 			ElasticsearchRoutingStrategy strategy;
 46 | 			try {
 47 | 				strategy = (ElasticsearchRoutingStrategy) Class.forName(rotatedIndexMetadata.getRoutingStrategyClassName()).newInstance();
 48 | 				strategy.configure(rotatedIndexMetadata);
 49 | 				return strategy;
 50 | 			} catch (InstantiationException e) {
 51 | 				logger.error(FAIL_MESSAGE + rotatedIndexMetadata, e);
 52 | 			} catch (IllegalAccessException e) {
 53 | 				logger.error(FAIL_MESSAGE + rotatedIndexMetadata, e);
 54 | 			} catch (ClassNotFoundException e) {
 55 | 				logger.error(FAIL_MESSAGE + rotatedIndexMetadata, e);
 56 | 			}
 57 | 		}
 58 | 		return null;
 59 | 	}
 60 | 	
 61 | 	/**
 62 | 	 * Check if the routing strategy is the same for all the indicies. This is useful 
 63 | 	 * when searching because all indexes have to have the same routing strategy in order to 
 64 | 	 * cut down the # of shards to search. If they differ, then you have to hit the whole index.
 65 | 	 * 
 66 | 	 * Returns null if there's no common routing strategy
 67 | 	 * 
 68 | 	 * @param indices
 69 | 	 * @return
 70 | 	 */
 71 | 	public Set<ElasticsearchRoutingStrategy> getRoutingStrategiesForIndicies(List<ElasticSearchIndexMetadata> indices) {
 72 | 	    Set<ElasticsearchRoutingStrategy> strategies = new HashSet<>();
 73 | 		ElasticsearchRoutingStrategy routingStrategy = null;
 74 | 		for(ElasticSearchIndexMetadata index : indices) {
 75 | 			if(index.getRoutingStrategyClassName() == null) {
 76 | 				// If the routing strategy isn't set, then there can be no common strategy
 77 | 				routingStrategy = null;
 78 | 				break;
 79 | 			} else if(routingStrategy == null) {
 80 | 			    strategies.add(get(index));
 81 | 			} else {
 82 | 				ElasticsearchRoutingStrategy routingStrategy2 = get(index);
 83 | 				if(routingStrategy2 == null || !routingStrategy2.equals(routingStrategy)) {
 84 | 					routingStrategy = null;
 85 | 					break;
 86 | 				}
 87 | 			}
 88 | 		}
 89 | 		return strategies;
 90 | 	}
 91 | 	
 92 | 	/**
 93 | 	 * Warning, this method only works if all indices contain the same number of shards. Please
 94 | 	 * move to getRoutingStrategiesForIndicies(..)
 95 | 	 */
 96 | 	@Deprecated
 97 | 	@Override
 98 | 	public ElasticsearchRoutingStrategy getRoutingStrategyForIndicies(List<ElasticSearchIndexMetadata> indices) {
 99 | 	    ElasticsearchRoutingStrategy routingStrategy = null;
100 | 	    for(ElasticSearchIndexMetadata index : indices) {
101 | 	        if(index.getRoutingStrategyClassName() == null) {
102 | 	            // If the routing strategy isn't set, then there can be no common strategy
103 | 	            routingStrategy = null;
104 | 	            break;
105 | 	        } else if(routingStrategy == null) {
106 | 	            routingStrategy = get(index);
107 | 	        } else {
108 | 	            ElasticsearchRoutingStrategy routingStrategy2 = get(index);
109 | 	            if(routingStrategy2 == null || !routingStrategy2.equals(routingStrategy)) {
110 | 	                routingStrategy = null;
111 | 	                break;
112 | 	            }
113 | 	        }
114 | 	    }
115 | 	    return routingStrategy;
116 | 	}
117 | 
118 | 	public String getIndexWritable(ElasticSearchIndexMetadata rotatedIndexMetadata) {
119 | 		DateTime now = new DateTime();
120 | 		if(rotatedIndexMetadata.getRebuiltIndexAlias() == null || (rotatedIndexMetadata.getDate() != null && rotatedIndexMetadata.getDate().isAfter(now.minusDays(LAG).toLocalDate()))) {
121 | 			// Only use rotated indexes for data that's ROTATION_LAG_DAYS old
122 | 			return rotatedIndexMetadata.getIndexNameAtBirth();
123 | 		}
124 | 		
125 | 		if(rotatedIndexMetadata.getRebuiltIndexName() == null) {
126 | 			// For backwards compatibility (we didn't always write the rebuiltIndexName to zookeeper), we'll fall back on the alias
127 | 			return rotatedIndexMetadata.getRebuiltIndexAlias();
128 | 		} 
129 | 
130 | 		// Ideally you write to the index by it's full name
131 | 		return rotatedIndexMetadata.getRebuiltIndexName();	
132 | 	}
133 | 
134 | 	/**
135 | 	 * When reading from ES, searches may hit many indexes. To help avoid hitting the URL size limit
136 | 	 * when doing searches that hit a large # of indexes, we alias them with something short. 
137 | 	 */
138 | 	public String getIndexReadable(ElasticSearchIndexMetadata rotatedIndexMetadata) {
139 | 		DateTime now = new DateTime();
140 | 		if(rotatedIndexMetadata.getRebuiltIndexAlias() == null || (rotatedIndexMetadata.getDate() != null && rotatedIndexMetadata.getDate().isAfter(now.minusDays(LAG).toLocalDate()))) {
141 | 			// Only use rotated indexes for data that's ROTATION_LAG_DAYS old
142 | 			return rotatedIndexMetadata.getIndexNameAtBirth();
143 | 		}
144 | 
145 | 		return rotatedIndexMetadata.getRebuiltIndexAlias();
146 | 	}
147 | 
148 | }
149 | 


--------------------------------------------------------------------------------
/src/main/java/com/inin/analytics/elasticsearch/transport/BaseTransport.java:
--------------------------------------------------------------------------------
  1 | package com.inin.analytics.elasticsearch.transport;
  2 | 
  3 | import java.io.File;
  4 | import java.io.FilenameFilter;
  5 | import java.io.IOException;
  6 | 
  7 | import org.apache.commons.io.FileUtils;
  8 | 
  9 | import com.google.common.base.Preconditions;
 10 | import com.inin.analytics.elasticsearch.BaseESReducer;
 11 | import com.inin.analytics.elasticsearch.ShardConfig;
 12 | import com.inin.analytics.elasticsearch.transport.SnapshotTransportStrategy.STORAGE_SYSTEMS;
 13 | 
 14 | public abstract class BaseTransport {
 15 | 	protected String snapshotWorkingLocation;
 16 | 	protected String snapshotFinalDestination;
 17 | 	private DirectoryFilter directoryFilter = new DirectoryFilter();
 18 | 	
 19 | 	public BaseTransport(String snapshotWorkingLocation, String snapshotFinalDestination) {
 20 | 		this.snapshotWorkingLocation = snapshotWorkingLocation;
 21 | 		this.snapshotFinalDestination = snapshotFinalDestination;
 22 | 		Preconditions.checkNotNull(snapshotWorkingLocation);
 23 | 		Preconditions.checkNotNull(snapshotFinalDestination);
 24 | 	}
 25 | 	
 26 | 	protected abstract void init();
 27 | 	protected abstract void close();
 28 | 	protected abstract void transferFile(boolean deleteSource, String destination, String filename, String localDirectory) throws IOException;
 29 | 	protected abstract void transferDir(String destination, String localShardPath, String shard) throws IOException;
 30 | 	protected abstract boolean checkExists(String destination, Integer shardNumber) throws IOException;
 31 | 
 32 | 	/**
 33 | 	 * Transport a snapshot sitting on the local filesystem to a remote repository. Snapshots are stiched together
 34 | 	 * shard by shard because we're snapshotting 1 shard at a time. 
 35 | 	 * 
 36 | 	 * @param snapshotName
 37 | 	 * @param index
 38 | 	 * @param shardNumber
 39 | 	 * @throws IOException
 40 | 	 */
 41 | 	public void execute(String snapshotName, String index) throws IOException {
 42 | 		init();
 43 | 		// Figure out which shard has all the data
 44 | 		String largestShard = getShardSource(index);
 45 | 
 46 | 		String destination = removeStorageSystemFromPath(snapshotFinalDestination);
 47 | 
 48 | 		// Upload top level manifests
 49 | 		transferFile(false, destination, "metadata-" + snapshotName, snapshotWorkingLocation);
 50 | 		transferFile(false, destination, "snapshot-" + snapshotName, snapshotWorkingLocation);
 51 | 		transferFile(false, destination, "index", snapshotWorkingLocation);
 52 | 
 53 | 		
 54 | 		// Upload per-index manifests
 55 | 		String indexManifestSource =  snapshotWorkingLocation + "indices" + BaseESReducer.DIR_SEPARATOR + index;
 56 | 		String indexManifestDestination = destination + BaseESReducer.DIR_SEPARATOR + "indices" + BaseESReducer.DIR_SEPARATOR + index;
 57 | 		
 58 | 		transferFile(false, indexManifestDestination, "snapshot-" + snapshotName, indexManifestSource);
 59 | 		
 60 | 		// Cleanup shard data
 61 | 		cleanEmptyShards(index, largestShard);
 62 | 		
 63 | 		// Upload shard data
 64 | 		String shardSource = snapshotWorkingLocation + "indices" + BaseESReducer.DIR_SEPARATOR + index + BaseESReducer.DIR_SEPARATOR + largestShard;
 65 | 		
 66 | 		String shardDestination = destination + BaseESReducer.DIR_SEPARATOR + "indices" + BaseESReducer.DIR_SEPARATOR + index + BaseESReducer.DIR_SEPARATOR;
 67 | 		transferDir(shardDestination, shardSource, largestShard);
 68 | 		close();
 69 | 	}
 70 | 	
 71 | 	public void placeMissingShards(String snapshotName, String index, ShardConfig shardConfig, boolean includeRootManifest) throws IOException {
 72 | 		init();
 73 | 		String destination = removeStorageSystemFromPath(snapshotFinalDestination);
 74 | 		
 75 | 		if(includeRootManifest) {
 76 | 			// Upload top level manifests
 77 | 			transferFile(false, destination, "metadata-" + snapshotName, snapshotWorkingLocation);
 78 | 			transferFile(false, destination, "snapshot-" + snapshotName, snapshotWorkingLocation);
 79 | 			transferFile(false, destination, "index", snapshotWorkingLocation);
 80 | 		}
 81 | 		
 82 | 		for(int shard = 0; shard < shardConfig.getShardsForIndex(index); shard++) {
 83 | 			String indexDestination = destination + BaseESReducer.DIR_SEPARATOR + "indices" + BaseESReducer.DIR_SEPARATOR + index + BaseESReducer.DIR_SEPARATOR  ;
 84 | 			if(!checkExists(indexDestination, shard)) {
 85 | 				// Upload shard data
 86 | 				String shardSource = snapshotWorkingLocation + "indices" + BaseESReducer.DIR_SEPARATOR + index + BaseESReducer.DIR_SEPARATOR + shard;
 87 | 				transferDir(indexDestination, shardSource, new Integer(shard).toString());
 88 | 			}
 89 | 		}
 90 | 		close();
 91 | 	}
 92 | 	
 93 | 	/**
 94 | 	 * Rip out filesystem specific stuff off the path EG s3:// 
 95 | 	 * @param s
 96 | 	 * @return s
 97 | 	 */
 98 | 	private String removeStorageSystemFromPath(String s) {
 99 | 		for(STORAGE_SYSTEMS storageSystem : SnapshotTransportStrategy.STORAGE_SYSTEMS.values()) {
100 | 			s = s.replaceFirst(storageSystem.name() + "://", "");			
101 | 		}
102 | 
103 | 		return s;
104 | 	}
105 | 	
106 | 	/**
107 | 	 * We've snapshotted an index with all data routed to a single shard (1 shard per reducer). Problem is 
108 | 	 * we don't know which shard # it routed all the data to. We can determine that by picking 
109 | 	 * out the largest shard folder and renaming it to the shard # we want it to be.
110 | 	 */
111 | 	private String getShardSource(String index) throws IOException {
112 | 		// Get a list of shards in the snapshot
113 | 		String baseIndexLocation = snapshotWorkingLocation + "indices" + BaseESReducer.DIR_SEPARATOR + index + BaseESReducer.DIR_SEPARATOR;
114 | 		File file = new File(baseIndexLocation);
115 | 		String[] shardDirectories = file.list(directoryFilter);
116 | 		
117 | 		// Figure out which shard has all the data in it. Since we've routed all data to it, there'll only be one
118 | 		Long biggestDirLength = null;
119 | 		String biggestDir = null;
120 | 		for(String directory : shardDirectories) {
121 | 			File curDir = new File(baseIndexLocation + directory);
122 |                         long curDirLength = FileUtils.sizeOfDirectory(curDir);
123 | 			if(biggestDirLength == null || biggestDirLength < curDirLength) {
124 | 				biggestDir = directory;
125 | 				biggestDirLength = curDirLength;
126 | 			}
127 | 		}
128 | 		
129 | 		return biggestDir;
130 | 	}
131 | 	
132 | 	/**
133 | 	 * We're building 1 shard at a time. Therefore each snapshot has a bunch of empty
134 | 	 * shards and 1 shard with all the data in it. This deletes all the empty shard folders
135 | 	 * for you.
136 | 	 * 
137 | 	 * @param index
138 | 	 * @param biggestDir
139 | 	 * @throws IOException
140 | 	 */
141 | 	private void cleanEmptyShards(String index, String biggestDir) throws IOException {
142 | 		String baseIndexLocation = snapshotWorkingLocation + "indices" + BaseESReducer.DIR_SEPARATOR + index + BaseESReducer.DIR_SEPARATOR;
143 | 		File file = new File(baseIndexLocation);
144 | 		String[] shardDirectories = file.list(directoryFilter);
145 | 		
146 | 		// Remove the empty shards
147 | 		for(String directory : shardDirectories) {
148 | 			if(!directory.equals(biggestDir)) {
149 | 				FileUtils.deleteDirectory(new File(baseIndexLocation + directory));
150 | 			}
151 | 		}
152 | 	}
153 | 	
154 | 	private class DirectoryFilter implements FilenameFilter {
155 | 		
156 | 		@Override
157 | 		public boolean accept(File current, String name) {
158 | 			return new File(current, name).isDirectory();
159 | 		}
160 | 	}
161 | }
162 | 


--------------------------------------------------------------------------------
/src/main/java/com/inin/analytics/elasticsearch/transport/HDFSSnapshotTransport.java:
--------------------------------------------------------------------------------
 1 | package com.inin.analytics.elasticsearch.transport;
 2 | 
 3 | import java.io.File;
 4 | import java.io.FileNotFoundException;
 5 | import java.io.IOException;
 6 | 
 7 | import org.apache.commons.lang.StringUtils;
 8 | import org.apache.hadoop.conf.Configuration;
 9 | import org.apache.hadoop.fs.FileSystem;
10 | import org.apache.hadoop.fs.Path;
11 | import org.apache.hadoop.hdfs.server.namenode.LeaseExpiredException;
12 | import org.apache.hadoop.ipc.RemoteException;
13 | import org.slf4j.Logger;
14 | import org.slf4j.LoggerFactory;
15 | 
16 | import com.inin.analytics.elasticsearch.BaseESReducer;
17 | 
18 | public class HDFSSnapshotTransport  extends BaseTransport {
19 | 	private FileSystem hdfsFileSystem;
20 | 	private static transient Logger logger = LoggerFactory.getLogger(HDFSSnapshotTransport.class);
21 | 	
22 | 	public HDFSSnapshotTransport(String snapshotWorkingLocation, String snapshotFinalDestination) {
23 | 		super(snapshotWorkingLocation, snapshotFinalDestination);
24 | 	}
25 | 
26 | 	@Override
27 | 	protected void init() {
28 | 	    Configuration conf = new Configuration();
29 | 	    try {
30 | 			hdfsFileSystem = FileSystem.get(conf);
31 | 		} catch (IOException e) {
32 | 			throw new IllegalStateException("Unable to initialize HDFSSnapshotTransport because of ", e);
33 | 		}
34 | 	}
35 | 
36 | 	@Override
37 | 	protected void close() {
38 | 		
39 | 	}
40 | 	
41 | 	private void ensurePathExists(String destination) throws IOException {
42 | 		String[] pieces = StringUtils.split(destination, BaseESReducer.DIR_SEPARATOR);
43 | 		
44 | 		String path = "";
45 | 		for(String piece : pieces) {
46 | 			if(StringUtils.isEmpty(piece)) {
47 | 				continue;
48 | 			}
49 | 			path = path + BaseESReducer.DIR_SEPARATOR + piece;
50 | 			if(!hdfsFileSystem.exists(new Path(path))) {
51 | 				try{
52 | 					hdfsFileSystem.mkdirs(new Path(path));	
53 | 				} catch (IOException e) {
54 | 					logger.warn("Unable to create path " + path + " likely because it was created in another reducer thread.");
55 | 				}
56 | 			}
57 | 		}
58 | 	}
59 | 
60 | 	@Override
61 | 	protected void transferFile(boolean deleteSource, String destination, String filename, String localDirectory) throws IOException {
62 | 		Path source = new Path(localDirectory + BaseESReducer.DIR_SEPARATOR + filename);
63 | 		ensurePathExists(destination);
64 | 
65 | 		try{
66 | 			hdfsFileSystem.copyFromLocalFile(deleteSource, true, source, new Path(destination + BaseESReducer.DIR_SEPARATOR + filename));	
67 | 		}
68 | 		catch(LeaseExpiredException | RemoteException e) {
69 | 			// This is an expected race condition where 2 reducers are trying to write the manifest files at the same time. That's okay, it only has to succeed once. 
70 | 			logger.warn("Exception from 2 reducers writing the same file concurrently. One writer failed to obtain a lease. Destination " + destination + " filename " + filename + " localDirectory " + localDirectory, e);
71 | 		}
72 | 	}
73 | 
74 | 	@Override
75 | 	protected void transferDir(String destination, String localShardPath, String shard) throws IOException {
76 | 		destination = destination + shard + BaseESReducer.DIR_SEPARATOR;
77 | 		ensurePathExists(destination);
78 | 		try{
79 | 			File[] files = new File(localShardPath).listFiles();
80 | 			for (File file : files) {
81 | 				transferFile(true, destination, file.getName(), localShardPath);
82 | 			}
83 | 		} catch(FileNotFoundException e) {
84 | 			throw new FileNotFoundException("Exception copying " + localShardPath + " to " + destination);
85 | 		} 
86 | 	}
87 | 
88 | 	@Override
89 | 	protected boolean checkExists(String destination, Integer shardNumber) throws IOException {
90 | 		return hdfsFileSystem.exists(new Path(destination + shardNumber));
91 | 	}
92 | }
93 | 


--------------------------------------------------------------------------------
/src/main/java/com/inin/analytics/elasticsearch/transport/LocalFSSnapshotTransport.java:
--------------------------------------------------------------------------------
 1 | package com.inin.analytics.elasticsearch.transport;
 2 | 
 3 | import java.io.File;
 4 | import java.io.IOException;
 5 | 
 6 | import org.apache.commons.io.FileUtils;
 7 | 
 8 | import com.google.common.base.Preconditions;
 9 | import com.inin.analytics.elasticsearch.BaseESReducer;
10 | 
11 | /**
12 |  * Move the snapshot to locally connected storage
13 |  * 
14 |  * @author drew
15 |  *
16 |  */
17 | public class LocalFSSnapshotTransport extends BaseTransport {
18 | 
19 | 	public LocalFSSnapshotTransport(String snapshotWorkingLocation, String snapshotFinalDestination) {
20 | 		super(snapshotWorkingLocation, snapshotFinalDestination);
21 | 	}
22 | 
23 | 	@Override
24 | 	protected void init() {
25 | 		// no-op
26 | 	}
27 | 
28 | 	@Override
29 | 	protected void close() {
30 | 		// no-op
31 | 	}
32 | 
33 | 	@Override
34 | 	protected void transferFile(boolean deleteSource, String destination, String filename, String localDirectory) throws IOException {
35 | 		File source = new File(localDirectory + BaseESReducer.DIR_SEPARATOR + filename);
36 | 		Preconditions.checkArgument(source.exists(), "Could not find source file: " + source.getAbsolutePath()); 
37 | 
38 | 		File destinationDir = new File(destination);
39 | 		FileUtils.forceMkdir(destinationDir);
40 | 		FileUtils.copyFileToDirectory(source, destinationDir);
41 | 		if(deleteSource) {
42 | 			source.delete();
43 | 		}
44 | 	}
45 | 
46 | 	@Override
47 | 	protected void transferDir(String destination, String source, String shard) throws IOException {
48 | 		File sourceDir = new File(source);
49 | 		Preconditions.checkArgument(sourceDir.exists(), "Could not find dir: " + source); 
50 | 		
51 | 		File destinationDir = new File(destination + shard);
52 | 		FileUtils.forceMkdir(destinationDir);
53 | 		FileUtils.copyDirectory(sourceDir, destinationDir);
54 | 	}
55 | 
56 | 	@Override
57 | 	protected boolean checkExists(String destination, Integer shardNumber) throws IOException {
58 | 		File destinationDir = new File(destination + shardNumber);
59 | 		return destinationDir.exists();
60 | 	}
61 | 
62 | }
63 | 


--------------------------------------------------------------------------------
/src/main/java/com/inin/analytics/elasticsearch/transport/S3SnapshotTransport.java:
--------------------------------------------------------------------------------
  1 | package com.inin.analytics.elasticsearch.transport;
  2 | 
  3 | import java.io.File;
  4 | import java.io.FileInputStream;
  5 | import java.io.FileNotFoundException;
  6 | import java.io.IOException;
  7 | import java.util.concurrent.Executors;
  8 | import java.util.concurrent.ThreadFactory;
  9 | import java.util.concurrent.ThreadPoolExecutor;
 10 | 
 11 | import org.apache.commons.lang.StringUtils;
 12 | import org.slf4j.Logger;
 13 | import org.slf4j.LoggerFactory;
 14 | 
 15 | import com.amazonaws.ClientConfiguration;
 16 | import com.amazonaws.auth.DefaultAWSCredentialsProviderChain;
 17 | import com.amazonaws.regions.Regions;
 18 | import com.amazonaws.services.s3.AmazonS3Client;
 19 | import com.amazonaws.services.s3.model.ListObjectsRequest;
 20 | import com.amazonaws.services.s3.model.ObjectListing;
 21 | import com.amazonaws.services.s3.model.ObjectMetadata;
 22 | import com.amazonaws.services.s3.model.S3ObjectSummary;
 23 | import com.amazonaws.services.s3.transfer.MultipleFileUpload;
 24 | import com.amazonaws.services.s3.transfer.ObjectMetadataProvider;
 25 | import com.amazonaws.services.s3.transfer.Transfer.TransferState;
 26 | import com.amazonaws.services.s3.transfer.TransferManager;
 27 | import com.amazonaws.services.s3.transfer.Upload;
 28 | import com.google.common.base.Preconditions;
 29 | import com.inin.analytics.elasticsearch.BaseESReducer;
 30 | 
 31 | /**
 32 |  * Stitch together snapshots of ES shards as it pushes files to S3. If we could get the
 33 |  * snapshot gateway S3 plugin working we could potentially use that instead of using the aws sdk
 34 |  * directly, but there's some things holding that up. 
 35 |  * 
 36 |  * 1) That plugin needs some work to support MFA (at the time of writing it does not)
 37 |  * 2) We'll have to repackage ES with the plugin because installing plugins via the
 38 |  * embeded client isn't supported very well.
 39 |  * 3) This job is actually creating franken-snapshots. We're snapshotting each shard separately to reduce
 40 |  * the storage footprint per reducer task and then merging the snapshots together. To use the snapshot gateway
 41 |  * we would need a way to have the whole index on a single task tracker node. That means
 42 |  * beefy task trackers or mounting either some NFS or Ephemeral disks. Doing 1 shard at a time shrinks the problem.
 43 |  * 
 44 |  * @author drew
 45 |  *
 46 |  */
 47 | public class S3SnapshotTransport extends BaseTransport {
 48 | 	private static transient Logger logger = LoggerFactory.getLogger(S3SnapshotTransport.class);
 49 | 	private static final int S3_TRANSFER_THREAD_COUNT = 128;
 50 | 	private TransferManager tx;
 51 | 	private ObjectMetadataProvider objectMetadataProvider;
 52 | 
 53 | 	/**
 54 | 	 * The default S3 thread pool in the aws sdk is 10 threads. ES Snapshots can be 100s of files, so parallelizing that
 55 | 	 * is advised. 
 56 | 	 * 
 57 | 	 * @return
 58 | 	 */
 59 | 	public static ThreadPoolExecutor createDefaultExecutorService() {
 60 | 		ThreadFactory threadFactory = new ThreadFactory() {
 61 | 			private int threadCount = 1;
 62 | 
 63 | 			public Thread newThread(Runnable r) {
 64 | 				Thread thread = new Thread(r);
 65 | 				thread.setName("s3-transfer-manager-worker-" + threadCount++);
 66 | 				return thread;
 67 | 			}
 68 | 		};
 69 | 		return (ThreadPoolExecutor)Executors.newFixedThreadPool(S3_TRANSFER_THREAD_COUNT, threadFactory);
 70 | 	}
 71 | 
 72 | 	public S3SnapshotTransport(String snapshotWorkingLocation, String snapshotFinalDestination) {
 73 | 		super(snapshotWorkingLocation, snapshotFinalDestination);
 74 | 	}
 75 | 
 76 | 	public static AmazonS3Client getS3Client() {
 77 | 		return (Regions.getCurrentRegion() != null) ?
 78 | 				Regions.getCurrentRegion().createClient(AmazonS3Client.class,
 79 | 						new DefaultAWSCredentialsProviderChain(),
 80 | 						new ClientConfiguration()) :
 81 | 							new AmazonS3Client();
 82 | 	}
 83 | 
 84 | 	@Override
 85 | 	protected void init() {
 86 | 		tx = new TransferManager(getS3Client(), createDefaultExecutorService());
 87 | 		
 88 | 		objectMetadataProvider = new ObjectMetadataProvider() {
 89 | 			@Override
 90 | 			public void provideObjectMetadata(File file, ObjectMetadata metadata) {
 91 | 				metadata.setSSEAlgorithm("AES256");
 92 | 				metadata.setContentLength(file.length());
 93 | 			}
 94 | 		};
 95 | 	}
 96 | 	
 97 | 	@Override
 98 | 	protected void close() {
 99 | 		tx.shutdownNow();	
100 | 	}
101 | 
102 | 	protected void transferDir(String shardDestinationBucket, String localShardPath, String shard) {
103 | 		MultipleFileUpload mfu = tx.uploadDirectory(shardDestinationBucket + shard, null, new File(localShardPath), true, objectMetadataProvider);
104 | 		
105 | 		/**
106 | 		 * TODO: Hadoop has a configurable timeout for how long a reducer can be non-responsive (usually 600s). If 
107 | 		 * this takes >600s hadoop will kill the task. We need to ping the reporter to let it know it's alive
108 | 		 * in the case where the file transfer is taking a while.
109 | 		 */
110 | 		while(!mfu.isDone()) {
111 | 			logger.info("Transfering to S3 completed %" + mfu.getProgress().getPercentTransferred());
112 | 			try {
113 | 				Thread.sleep(1000);
114 | 			} catch (InterruptedException e) {
115 | 				Thread.currentThread().interrupt();
116 | 			}
117 | 		}
118 | 	}
119 | 	
120 | 	protected void transferFile(boolean deleteSource, String bucket, String filename, String localDirectory) {
121 | 		File source = new File(localDirectory + BaseESReducer.DIR_SEPARATOR + filename);
122 | 		Preconditions.checkArgument(source.exists(), "Could not find source file: " + source.getAbsolutePath());
123 | 		logger.info("Transfering + " + source + " to " + bucket + " with key " + filename);
124 | 		FileInputStream fis;
125 | 		try {
126 | 			fis = new FileInputStream(source);
127 | 			ObjectMetadata objectMetadata = new ObjectMetadata();
128 | 			objectMetadata.setSSEAlgorithm("AES256");
129 | 			objectMetadata.setContentLength(source.length());
130 | 			Upload upload = tx.upload(bucket, filename, fis, objectMetadata);
131 | 			
132 | 			while(!upload.isDone());
133 | 			Preconditions.checkState(upload.getState().equals(TransferState.Completed), "File " + filename + " failed to upload with state: " + upload.getState());
134 | 			if(deleteSource) {
135 | 				source.delete();
136 | 			}
137 | 		} catch (FileNotFoundException e) {
138 | 			// Exception should never be thrown because the precondition above has already validated existence of file
139 | 			logger.error("Filename could not be found " + filename, e);
140 | 		}
141 | 	}
142 | 
143 | 	@Override
144 | 	protected boolean checkExists(String destination, Integer shard) throws IOException {
145 | 		// Break that s3 path into bucket & key 
146 | 		String[] pieces = StringUtils.split(destination, "/");
147 | 		String bucket = pieces[0];
148 | 		String key = destination.substring(bucket.length() + 1);
149 | 		
150 | 		// AWS SDK doesn't have an "exists" method so you have to list and check if the key is there. Thanks Obama
151 | 		ObjectListing objects = tx.getAmazonS3Client().listObjects(new ListObjectsRequest().withBucketName(bucket).withPrefix(key));
152 | 
153 | 		for (S3ObjectSummary objectSummary: objects.getObjectSummaries()) {
154 | 			if (objectSummary.getKey().startsWith(key + shard)) {
155 | 				return true;
156 | 			}
157 | 		}
158 | 		return false;
159 | 	}
160 | 
161 | }
162 | 


--------------------------------------------------------------------------------
/src/main/java/com/inin/analytics/elasticsearch/transport/SnapshotTransportStrategy.java:
--------------------------------------------------------------------------------
 1 | package com.inin.analytics.elasticsearch.transport;
 2 | 
 3 | public class SnapshotTransportStrategy {
 4 | 	
 5 | 	public enum STORAGE_SYSTEMS {
 6 | 		s3,
 7 | 		hdfs
 8 | 	}
 9 | 
10 | 	/**
11 | 	 * Given a source & destination, return an appropriate transport implementation
12 | 	 * 
13 | 	 * @param snapshotWorkingLocation
14 | 	 * @param snapshotFinalDestination
15 | 	 * @return BaseTransport
16 | 	 */
17 | 	public static BaseTransport get(String snapshotWorkingLocation, String snapshotFinalDestination) {
18 | 		BaseTransport trasport = null;
19 | 		if(snapshotFinalDestination.startsWith(STORAGE_SYSTEMS.s3.name())) {
20 | 			trasport = new S3SnapshotTransport(snapshotWorkingLocation, snapshotFinalDestination);
21 | 		} else if(snapshotFinalDestination.startsWith(STORAGE_SYSTEMS.hdfs.name())) {
22 | 				trasport = new HDFSSnapshotTransport(snapshotWorkingLocation, snapshotFinalDestination);
23 | 		} else {
24 | 			trasport = new LocalFSSnapshotTransport(snapshotWorkingLocation, snapshotFinalDestination);
25 | 		}
26 | 		return trasport;
27 | 	}
28 | }
29 | 


--------------------------------------------------------------------------------
/src/main/java/com/inin/analytics/elasticsearch/util/DateTimeDeserializer.java:
--------------------------------------------------------------------------------
 1 | package com.inin.analytics.elasticsearch.util;
 2 | 
 3 | import java.lang.reflect.Type;
 4 | 
 5 | import org.joda.time.DateTime;
 6 | import org.joda.time.DateTimeZone;
 7 | 
 8 | import com.google.gson.JsonDeserializationContext;
 9 | import com.google.gson.JsonDeserializer;
10 | import com.google.gson.JsonElement;
11 | import com.google.gson.JsonParseException;
12 | import com.google.gson.JsonPrimitive;
13 | 
14 | public class DateTimeDeserializer implements JsonDeserializer<DateTime> {
15 | 	public DateTime deserialize(JsonElement json, Type typeOfT,
16 | 			JsonDeserializationContext context) throws JsonParseException {
17 | 		JsonPrimitive primitive = json.getAsJsonPrimitive();
18 | 		if(primitive.isNumber()) {
19 | 			return new DateTime(primitive.getAsLong()).withZone(DateTimeZone.UTC);
20 | 		} else {
21 | 			return new DateTime(primitive.getAsString()).withZone(DateTimeZone.UTC);
22 | 		}
23 | 	}
24 | }


--------------------------------------------------------------------------------
/src/main/java/com/inin/analytics/elasticsearch/util/DateTimeSerializer.java:
--------------------------------------------------------------------------------
 1 | package com.inin.analytics.elasticsearch.util;
 2 | 
 3 | import java.lang.reflect.Type;
 4 | 
 5 | import org.joda.time.DateTime;
 6 | 
 7 | import com.google.gson.JsonElement;
 8 | import com.google.gson.JsonPrimitive;
 9 | import com.google.gson.JsonSerializationContext;
10 | import com.google.gson.JsonSerializer;
11 | 
12 | public class DateTimeSerializer implements JsonSerializer<DateTime> {
13 | 	
14 | 	@Override
15 | 	public JsonElement serialize(DateTime src, Type typeOfSrc,
16 | 			JsonSerializationContext context) {
17 | 		return new JsonPrimitive(src.toString());
18 | 	}
19 | 
20 | }
21 | 


--------------------------------------------------------------------------------
/src/main/java/com/inin/analytics/elasticsearch/util/GsonFactory.java:
--------------------------------------------------------------------------------
 1 | package com.inin.analytics.elasticsearch.util;
 2 | 
 3 | import org.joda.time.DateTime;
 4 | import org.joda.time.LocalDate;
 5 | 
 6 | import com.google.gson.Gson;
 7 | import com.google.gson.GsonBuilder;
 8 | 
 9 | public class GsonFactory {
10 |     public static Gson buildGsonParser() {
11 |         return buildGsonBuilder().create();
12 |     }
13 | 
14 |     public static GsonBuilder buildGsonBuilder() {
15 |         GsonBuilder builder = new GsonBuilder();
16 |         builder.registerTypeAdapter(DateTime.class, new DateTimeSerializer());
17 |         builder.registerTypeAdapter(DateTime.class, new DateTimeDeserializer());
18 |         builder.registerTypeAdapter(LocalDate.class, new LocalDateSerializer());
19 |         builder.registerTypeAdapter(LocalDate.class, new LocalDateDeserializer());
20 |         return builder;
21 |     }
22 | }
23 | 


--------------------------------------------------------------------------------
/src/main/java/com/inin/analytics/elasticsearch/util/LocalDateDeserializer.java:
--------------------------------------------------------------------------------
 1 | package com.inin.analytics.elasticsearch.util;
 2 | 
 3 | import java.lang.reflect.Type;
 4 | 
 5 | 
 6 | import org.joda.time.LocalDate;
 7 | 
 8 | import com.google.gson.JsonDeserializationContext;
 9 | import com.google.gson.JsonDeserializer;
10 | import com.google.gson.JsonElement;
11 | import com.google.gson.JsonParseException;
12 | import com.google.gson.JsonPrimitive;
13 | 
14 | public class LocalDateDeserializer implements JsonDeserializer<LocalDate> {
15 | 	public LocalDate deserialize(JsonElement json, Type typeOfT,
16 | 			JsonDeserializationContext context) throws JsonParseException {
17 | 		JsonPrimitive primitive = json.getAsJsonPrimitive();
18 | 		return new LocalDate(primitive.getAsString());
19 | 	}
20 | }


--------------------------------------------------------------------------------
/src/main/java/com/inin/analytics/elasticsearch/util/LocalDateSerializer.java:
--------------------------------------------------------------------------------
 1 | package com.inin.analytics.elasticsearch.util;
 2 | 
 3 | import java.lang.reflect.Type;
 4 | 
 5 | import org.joda.time.LocalDate;
 6 | 
 7 | import com.google.gson.JsonElement;
 8 | import com.google.gson.JsonPrimitive;
 9 | import com.google.gson.JsonSerializationContext;
10 | import com.google.gson.JsonSerializer;
11 | 
12 | public class LocalDateSerializer implements JsonSerializer<LocalDate> {
13 | 	
14 | 	@Override
15 | 	public JsonElement serialize(LocalDate src, Type typeOfSrc,
16 | 			JsonSerializationContext context) {
17 | 		return new JsonPrimitive(src.toString());
18 | 	}
19 | 
20 | }
21 | 


--------------------------------------------------------------------------------
/src/main/java/com/inin/analytics/elasticsearch/util/MurmurHash.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to the Apache Software Foundation (ASF) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The ASF licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.inin.analytics.elasticsearch.util;
19 | 
20 | public class MurmurHash  {
21 | 	private static MurmurHash _instance = new MurmurHash();
22 | 
23 | 	public static MurmurHash getInstance() {
24 | 		return _instance;
25 | 	}
26 | 
27 | 	public int hash(byte[] data, int length, int seed) {
28 | 		int m = 0x5bd1e995;
29 | 		int r = 24;
30 | 
31 | 		int h = seed ^ length;
32 | 
33 | 		int len_4 = length >> 2;
34 | 
35 | 		for (int i = 0; i < len_4; i++) {
36 | 			int i_4 = i << 2;
37 | 			int k = data[i_4 + 3];
38 | 			k = k << 8;
39 | 			k = k | (data[i_4 + 2] & 0xff);
40 | 			k = k << 8;
41 | 			k = k | (data[i_4 + 1] & 0xff);
42 | 			k = k << 8;
43 | 			k = k | (data[i_4 + 0] & 0xff);
44 | 			k *= m;
45 | 			k ^= k >>> r;
46 | 			k *= m;
47 | 			h *= m;
48 | 			h ^= k;
49 | 		}
50 | 
51 | 		// avoid calculating modulo
52 | 		int len_m = len_4 << 2;
53 | 		int left = length - len_m;
54 | 
55 | 		if (left != 0) {
56 | 			if (left >= 3) {
57 | 				h ^= (int) data[length - 3] << 16;
58 | 			}
59 | 			if (left >= 2) {
60 | 				h ^= (int) data[length - 2] << 8;
61 | 			}
62 | 			if (left >= 1) {
63 | 				h ^= (int) data[length - 1];
64 | 			}
65 | 
66 | 			h *= m;
67 | 		}
68 | 
69 | 		h ^= h >>> 13;
70 | 				h *= m;
71 | 				h ^= h >>> 15;
72 | 
73 | 			return h;
74 | 	}
75 | 	
76 | 	  /**
77 | 	   * Calculate a hash using all bytes from the input argument,
78 | 	   * and a provided seed value.
79 | 	   * @param bytes input bytes
80 | 	   * @param initval seed value
81 | 	   * @return hash value
82 | 	   */
83 | 	  public int hash(byte[] bytes, int initval) {
84 | 	    return hash(bytes, bytes.length, initval);
85 | 	  }
86 | }
87 | 


--------------------------------------------------------------------------------
/src/main/resources/hadoop-job.xml:
--------------------------------------------------------------------------------
 1 | <assembly>
 2 |   <id>job</id>
 3 |   <formats>
 4 |     <format>jar</format>
 5 |   </formats>
 6 |   <includeBaseDirectory>false</includeBaseDirectory>
 7 |   <dependencySets>
 8 |     <dependencySet>
 9 |       <unpack>false</unpack>
10 |       <scope>runtime</scope>
11 |       <outputDirectory>lib</outputDirectory>
12 |       <excludes>
13 |         <exclude>${groupId}:${artifactId}</exclude>
14 |       </excludes>
15 |     </dependencySet>
16 |     <dependencySet>
17 |       <unpack>true</unpack>
18 |       <includes>
19 |         <include>${groupId}:${artifactId}</include>
20 |       </includes>
21 |     </dependencySet>
22 |   </dependencySets>
23 | </assembly>
24 | 


--------------------------------------------------------------------------------
/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <configuration>
 3 | 
 4 |     <property name="service.name" value="analytics-hadoop" />
 5 |     <property name="log.dir" value="${LOG_DIR:-/var/log/inin}" />
 6 | 
 7 |     <logger name="org.apache.hadoop.security.UserGroupInformation" level="INFO" />
 8 |     <logger name="com.netflix.config" level="WARN" />
 9 |     <logger name="org.apache.cassandra" level="WARN" />
10 | 	<logger name="org.apache.zookeeper" level="WARN" />
11 | 	<logger name="me.prettyprint.cassandra" level="WARN" />
12 |     <logger name="org.apache.cassandra" level="WARN" />
13 |     <logger name="com.inin.analytics" level="WARN" />
14 |     <logger name="com.inin.analytics.hadoop" level="INFO" />
15 |     
16 |     <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
17 |         <encoder>
18 |             <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger - %msg%n</pattern>
19 |         </encoder>
20 |     </appender>
21 |     
22 |     <root level="info">
23 |         <appender-ref ref="STDOUT" />
24 |         <appender-ref ref="service" />
25 |     </root>
26 | </configuration>
27 | 


--------------------------------------------------------------------------------
/src/test/java/com/inin/analytics/DateSerializerTest.java:
--------------------------------------------------------------------------------
 1 | package com.inin.analytics;
 2 | 
 3 | import static org.junit.Assert.*;
 4 | 
 5 | import org.joda.time.LocalDate;
 6 | import org.junit.Test;
 7 | 
 8 | import com.google.gson.Gson;
 9 | import com.inin.analytics.elasticsearch.index.rotation.ElasticSearchIndexMetadata;
10 | import com.inin.analytics.elasticsearch.index.routing.ElasticsearchRoutingStrategyV1;
11 | import com.inin.analytics.elasticsearch.util.GsonFactory;
12 | 
13 | public class DateSerializerTest {
14 | 
15 | 	@Test
16 | 	public void test() {
17 | 		ElasticSearchIndexMetadata indexMetadata = new ElasticSearchIndexMetadata();
18 | 		indexMetadata.setDate(new LocalDate("2015-02-09"));
19 | 		indexMetadata.setIndexNameAtBirth("test");
20 | 		indexMetadata.setNumShards(2);
21 | 		indexMetadata.setNumShardsPerOrg(3);
22 | 		indexMetadata.setRebuiltIndexAlias("alias");
23 | 		indexMetadata.setRoutingStrategyClassName(ElasticsearchRoutingStrategyV1.class.getName());
24 | 		Gson gson = GsonFactory.buildGsonBuilder().excludeFieldsWithoutExposeAnnotation().create();
25 | 		String json = gson.toJson(indexMetadata);
26 | 		
27 | 		assertEquals(json, "{\"indexNameAtBirth\":\"test\",\"rebuiltIndexAlias\":\"alias\",\"indexLocalDate\":\"2015-02-09\",\"numShards\":2,\"numShardsPerOrg\":3,\"routingStrategyClassName\":\"com.inin.analytics.elasticsearch.index.routing.ElasticsearchRoutingStrategyV1\"}");
28 | 		ElasticSearchIndexMetadata indexMetadataDeserialized = gson.fromJson(json, ElasticSearchIndexMetadata.class);
29 | 		assertEquals(indexMetadata.getDate(), indexMetadataDeserialized.getDate());
30 | 		assertEquals(indexMetadata.getIndexNameAtBirth(), indexMetadataDeserialized.getIndexNameAtBirth());
31 | 		assertEquals(indexMetadata.getNumShards(), indexMetadataDeserialized.getNumShards());
32 | 		assertEquals(indexMetadata.getNumShardsPerOrg(), indexMetadataDeserialized.getNumShardsPerOrg());
33 | 		assertEquals(indexMetadata.getRebuiltIndexAlias(), indexMetadataDeserialized.getRebuiltIndexAlias());
34 | 		assertEquals(indexMetadata.getRebuiltIndexName(), indexMetadataDeserialized.getRebuiltIndexName());
35 | 		assertEquals(indexMetadata.getRoutingStrategyClassName(), indexMetadataDeserialized.getRoutingStrategyClassName());
36 | 	}
37 | 	
38 | 	@Test
39 | 	public void testBackwardsCompatibility() {
40 | 		// We had a case where the indexDate was serialized wrong. This has since been fixed, but this test verifies that the indexDate is ignored and left null rather than blowing up on deserialize
41 | 		String oldJson = "\n" + 
42 | 				"\n" + 
43 | 				"{\n" + 
44 | 				"  \"indexNameAtBirth\": \"c141031\",\n" + 
45 | 				"  \"rebuiltIndexName\": \"c141031_build_399_20150206012509\",\n" + 
46 | 				"  \"rebuiltIndexAlias\": \"c141031r\",\n" + 
47 | 				"  \"indexDate\": {\n" + 
48 | 				"    \"iLocalMillis\": 1414713600000,\n" + 
49 | 				"    \"iChronology\": {\n" + 
50 | 				"      \"iBase\": {\n" + 
51 | 				"        \"iMinDaysInFirstWeek\": 4\n" + 
52 | 				"      }\n" + 
53 | 				"    }\n" + 
54 | 				"  },\n" + 
55 | 				"  \"numShards\": 5,\n" + 
56 | 				"  \"numShardsPerOrg\": 2,\n" + 
57 | 				"  \"routingStrategyClassName\": \"com.inin.analytics.elasticsearch.index.routing.ElasticsearchRoutingStrategyV1\"\n" + 
58 | 				"}\n" + 
59 | 				"\n" + 
60 | 				"";
61 | 		Gson gson = GsonFactory.buildGsonBuilder().excludeFieldsWithoutExposeAnnotation().create();
62 | 		ElasticSearchIndexMetadata indexMetadataDeserialized = gson.fromJson(oldJson, ElasticSearchIndexMetadata.class);
63 | 		assertNull(indexMetadataDeserialized.getDate());
64 | 		assertEquals("c141031", indexMetadataDeserialized.getIndexNameAtBirth());
65 | 		assertEquals(5, indexMetadataDeserialized.getNumShards());
66 | 		assertEquals(2, indexMetadataDeserialized.getNumShardsPerOrg());
67 | 		assertEquals("c141031r", indexMetadataDeserialized.getRebuiltIndexAlias());
68 | 		assertEquals("c141031_build_399_20150206012509", indexMetadataDeserialized.getRebuiltIndexName());
69 | 		assertEquals("com.inin.analytics.elasticsearch.index.routing.ElasticsearchRoutingStrategyV1", indexMetadataDeserialized.getRoutingStrategyClassName());
70 | 	}
71 | 	
72 | 	@Test
73 | 	public void testMissingDate() {
74 | 		// We had a case where the indexDate was serialized wrong. This has since been fixed, but this test verifies that the indexDate is ignored and left null rather than blowing up on deserialize
75 | 		String oldJson = "\n" + 
76 | 				"\n" + 
77 | 				"{\n" + 
78 | 				"  \"indexNameAtBirth\": \"c141031\",\n" + 
79 | 				"  \"rebuiltIndexName\": \"c141031_build_399_20150206012509\",\n" + 
80 | 				"  \"rebuiltIndexAlias\": \"c141031r\",\n" + 
81 | 				"  \"numShards\": 5,\n" + 
82 | 				"  \"numShardsPerOrg\": 2,\n" + 
83 | 				"  \"routingStrategyClassName\": \"com.inin.analytics.elasticsearch.index.routing.ElasticsearchRoutingStrategyV1\"\n" + 
84 | 				"}\n" + 
85 | 				"\n" + 
86 | 				"";
87 | 		Gson gson = GsonFactory.buildGsonBuilder().excludeFieldsWithoutExposeAnnotation().create();
88 | 		ElasticSearchIndexMetadata indexMetadataDeserialized = gson.fromJson(oldJson, ElasticSearchIndexMetadata.class);
89 | 		assertNull(indexMetadataDeserialized.getDate());
90 | 		assertEquals("c141031", indexMetadataDeserialized.getIndexNameAtBirth());
91 | 		assertEquals(5, indexMetadataDeserialized.getNumShards());
92 | 		assertEquals(2, indexMetadataDeserialized.getNumShardsPerOrg());
93 | 		assertEquals("c141031r", indexMetadataDeserialized.getRebuiltIndexAlias());
94 | 		assertEquals("c141031_build_399_20150206012509", indexMetadataDeserialized.getRebuiltIndexName());
95 | 		assertEquals("com.inin.analytics.elasticsearch.index.routing.ElasticsearchRoutingStrategyV1", indexMetadataDeserialized.getRoutingStrategyClassName());
96 | 	}
97 | }
98 | 


--------------------------------------------------------------------------------
/src/test/java/com/inin/analytics/ElasticsearchRoutingStrategyV1Test.java:
--------------------------------------------------------------------------------
  1 | package com.inin.analytics;
  2 | 
  3 | import static org.junit.Assert.assertEquals;
  4 | import static org.junit.Assert.assertTrue;
  5 | 
  6 | import java.util.HashSet;
  7 | import java.util.Set;
  8 | 
  9 | import org.junit.Before;
 10 | import org.junit.Test;
 11 | 
 12 | import com.inin.analytics.elasticsearch.index.rotation.ElasticSearchIndexMetadata;
 13 | import com.inin.analytics.elasticsearch.index.routing.ElasticsearchRoutingStrategyV1;
 14 | 
 15 | public class ElasticsearchRoutingStrategyV1Test {
 16 | 
 17 | 	private Set<String> orgIds = new HashSet<>();
 18 | 	private Set<String> convIds = new HashSet<>();
 19 | 	
 20 | 	@Before
 21 | 	public void setUp() throws Exception {
 22 | 		// Note: this could be randomly generated, but if a test is to fail I want it to fail consistently 
 23 | 		orgIds.add("ed1121bf-5e61-4ac5-ad99-c24f8c4f79db");
 24 | 		orgIds.add("b8864a7e-98d9-4bef-af1e-54c8bea7ae40");
 25 | 		orgIds.add("decccc4f-2c96-4f4c-890f-eb1433ff4c90");
 26 | 		orgIds.add("1650943b-b125-41cf-9729-3bd3e164da16");
 27 | 		orgIds.add("005a22cc-afbb-4bbe-97e9-6f1447293ed7");
 28 | 		orgIds.add("e29469e1-02a1-4d63-91ef-40affca740a8");
 29 | 		orgIds.add("400cdb2f-7573-444e-9612-e218ff1c8387");
 30 | 		orgIds.add("aec66b84-6c34-466b-8991-031cba01241b");
 31 | 		orgIds.add("53adf13e-ce20-4112-9809-6aa29c60dfa5");
 32 | 		orgIds.add("f7f8ff19-81bf-49b1-a896-e996674d5a1f");
 33 | 		orgIds.add("2eb8db9f-d3ae-4d9a-ac78-55cb792e0d2d");
 34 | 		orgIds.add("3b984743-49bd-47d9-b38f-da3f822db03a");
 35 | 		orgIds.add("b68edfd1-305f-4d31-9443-605ba05eb5cc");
 36 | 		orgIds.add("0c8ce21d-3bb5-4dab-9531-1e2f3320259e");
 37 | 		orgIds.add("254f6bec-8b3d-48d2-976a-ba4a3517558b");
 38 | 		
 39 | 		convIds.add("0a3fe8fa-0291-4a28-87c7-2eeeda2295cd");
 40 | 		convIds.add("38b261be-23c4-4fe6-846c-f06231ddf82f");
 41 | 		convIds.add("3e4602bb-9c72-4174-b29f-b72dee356716");
 42 | 		convIds.add("3ff398ac-b832-4085-9ba3-0d2756c03f21");
 43 | 		convIds.add("8773bd12-3fdc-452f-b440-60bee40fadfc");
 44 | 		convIds.add("a0f20cbe-19a4-4aff-833d-c0919d6cfe73");
 45 | 		convIds.add("de48d484-23ce-4e0d-b465-de91b2f6ad72");
 46 | 		convIds.add("be57d96e-7ee8-4bba-bc35-15e347b69bed");
 47 | 		convIds.add("7cb1b182-b64a-4815-bc61-87714dbd0431");
 48 | 		convIds.add("8b9bbfbc-34dc-45f4-8dee-d82a44fc9995");
 49 | 		convIds.add("60ecef71-0a30-4798-aae7-63f6c1df0cf0");
 50 | 		convIds.add("64d0431b-bb68-4045-8fff-5ae2ed4eed51");
 51 | 		convIds.add("2e8df74f-3536-4044-aa13-1c1b273ab29f");
 52 | 	}
 53 | 	
 54 | 	@Test
 55 | 	public void testOrgOn7ShardsHashes() {
 56 | 		ElasticSearchIndexMetadata indexMetadata = new ElasticSearchIndexMetadata();
 57 | 		indexMetadata.setNumShards(10);
 58 | 		indexMetadata.setNumShardsPerOrg(7);
 59 | 		
 60 | 		ElasticsearchRoutingStrategyV1 strategy = new ElasticsearchRoutingStrategyV1();
 61 | 		strategy.configure(indexMetadata);
 62 | 		
 63 | 		
 64 | 		for(String orgId : orgIds) {
 65 | 			Set<String> routingHashs = new HashSet<>();
 66 | 			for(String convId : convIds) {
 67 | 				String routingHash = strategy.getRoutingHash(orgId, convId);
 68 | 				routingHashs.add(routingHash);
 69 | 			}
 70 | 			// Data was spread across #numShardsPerOrg shards
 71 | 			assertEquals(routingHashs.size(), 7);
 72 | 			
 73 | 			// Possible hashes contain the routing hashes
 74 | 			String[] possibleHashes = strategy.getPossibleRoutingHashes(orgId);
 75 | 			assertEquals(possibleHashes.length, 7);
 76 | 			for(String possibleHash : possibleHashes) {
 77 | 				assertTrue(routingHashs.contains(possibleHash));
 78 | 			}
 79 | 		}
 80 | 	}
 81 | 	
 82 | 	@Test
 83 | 	public void testOrgOn1ShardsHashes() {
 84 | 		ElasticSearchIndexMetadata indexMetadata = new ElasticSearchIndexMetadata();
 85 | 		indexMetadata.setNumShards(5);
 86 | 		indexMetadata.setNumShardsPerOrg(1);
 87 | 
 88 | 		ElasticsearchRoutingStrategyV1 strategy = new ElasticsearchRoutingStrategyV1();
 89 | 		strategy.configure(indexMetadata);
 90 | 		for(String orgId : orgIds) {
 91 | 			Set<String> routingHashs = new HashSet<>();
 92 | 			for(String convId : convIds) {
 93 | 				String routingHash = strategy.getRoutingHash(orgId, convId);
 94 | 				routingHashs.add(routingHash);
 95 | 			}
 96 | 			// Data was spread across #numShardsPerOrg shards
 97 | 			assertEquals(routingHashs.size(), 1);
 98 | 			
 99 | 			// Possible hashes contain the routing hashes
100 | 			String[] possibleHashes = strategy.getPossibleRoutingHashes(orgId);
101 | 			assertEquals(possibleHashes.length, 1);
102 | 			for(String possibleHash : possibleHashes) {
103 | 				assertTrue(routingHashs.contains(possibleHash));
104 | 			}
105 | 		}
106 | 	}
107 | 	
108 | 	@Test
109 | 	public void testSingleShardIndex() {
110 | 	    ElasticSearchIndexMetadata indexMetadata = new ElasticSearchIndexMetadata();
111 | 	    indexMetadata.setNumShards(1);
112 | 	    indexMetadata.setNumShardsPerOrg(1);
113 | 
114 | 	    ElasticsearchRoutingStrategyV1 strategy = new ElasticsearchRoutingStrategyV1();
115 | 	    strategy.configure(indexMetadata);
116 | 	    Set<String> routingHashs = new HashSet<>();
117 | 	    for(String orgId : orgIds) {
118 | 	        for(String convId : convIds) {
119 | 	            String routingHash = strategy.getRoutingHash(orgId, convId);
120 | 	            routingHashs.add(routingHash);
121 | 	        }
122 | 	    }
123 | 	    assertEquals(routingHashs.size(), 1);
124 | 	}
125 | 
126 | 	@Test
127 | 	public void testOrgDistribution() {
128 | 		ElasticSearchIndexMetadata indexMetadata = new ElasticSearchIndexMetadata();
129 | 		indexMetadata.setNumShards(5);
130 | 		indexMetadata.setNumShardsPerOrg(1);
131 | 		ElasticsearchRoutingStrategyV1 strategy = new ElasticsearchRoutingStrategyV1();
132 | 		strategy.configure(indexMetadata);
133 | 		
134 | 		Set<String> routingHashs = new HashSet<>();
135 | 		for(String orgId : orgIds) {
136 | 			routingHashs.add(strategy.getRoutingHash(orgId, "713729b0-91d1-4006-9317-8db4bc113be4"));
137 | 		}
138 | 		assertEquals(routingHashs.size(), indexMetadata.getNumShards());
139 | 	}
140 | 
141 | }
142 | 


--------------------------------------------------------------------------------
/src/test/java/com/inin/analytics/IndexRotationStrategyZookeeperTest.java:
--------------------------------------------------------------------------------
  1 | package com.inin.analytics;
  2 | 
  3 | import static org.junit.Assert.assertEquals;
  4 | import static org.junit.Assert.fail;
  5 | 
  6 | import java.io.IOException;
  7 | 
  8 | import org.apache.curator.framework.CuratorFramework;
  9 | import org.apache.curator.framework.CuratorFrameworkFactory;
 10 | import org.apache.curator.retry.ExponentialBackoffRetry;
 11 | import org.apache.curator.test.TestingServer;
 12 | import org.joda.time.DateTime;
 13 | import org.junit.After;
 14 | import org.junit.Before;
 15 | import org.junit.Test;
 16 | import org.junit.runner.RunWith;
 17 | import org.mockito.runners.MockitoJUnitRunner;
 18 | 
 19 | import com.inin.analytics.elasticsearch.index.rotation.ExampleElasticsearchIndexRotationStrategyZookeeper;
 20 | import com.inin.analytics.elasticsearch.index.rotation.RebuildPipelineState;
 21 | import com.inin.analytics.elasticsearch.index.rotation.ElasticSearchIndexMetadata;
 22 | import com.inin.analytics.elasticsearch.index.routing.ElasticsearchRoutingStrategyV1;
 23 | import com.inin.analytics.elasticsearch.index.selector.RealtimeIndexSelectionStrategyLagged;
 24 | 
 25 | 
 26 | @RunWith(MockitoJUnitRunner.class)
 27 | public class IndexRotationStrategyZookeeperTest {
 28 | 
 29 | 	private static final Integer ZOOKEEPER_EMBEDED_PORT = 2181;
 30 | 	private TestingServer zk = null;
 31 | 	private ExampleElasticsearchIndexRotationStrategyZookeeper rotation;
 32 | 
 33 | 	@Before
 34 | 	public void setup() throws Exception {
 35 | 		zk = new TestingServer(ZOOKEEPER_EMBEDED_PORT);
 36 | 		rotation = new ExampleElasticsearchIndexRotationStrategyZookeeper();
 37 | 		rotation.setCurator(getCurator());
 38 | 		rotation.init();
 39 | 	}
 40 | 
 41 | 	public CuratorFramework getCurator() throws InterruptedException {
 42 | 		ExponentialBackoffRetry retryPolicy = new ExponentialBackoffRetry(1000, 3);
 43 | 		CuratorFramework curator = CuratorFrameworkFactory.newClient("localhost", 10000, 10000, retryPolicy);
 44 | 		curator.start();
 45 | 		curator.getZookeeperClient().blockUntilConnectedOrTimedOut();
 46 | 		return curator;
 47 | 	}
 48 | 	
 49 | 	@After
 50 | 	public void close() throws IOException {
 51 | 		zk.close();
 52 | 	}
 53 | 
 54 | 	@Test
 55 | 	public final void testRegisterAndGet() throws Exception {
 56 | 		DateTime now = new DateTime();
 57 | 					
 58 | 		ElasticSearchIndexMetadata metaData = new ElasticSearchIndexMetadata();
 59 | 		metaData.setIndexNameAtBirth("a");
 60 | 		metaData.setNumShardsPerOrg(2);
 61 | 		metaData.setRebuiltIndexAlias("b");
 62 | 		metaData.setRebuiltIndexName("b");
 63 | 		
 64 | 		metaData.setRoutingStrategyClassName(ElasticsearchRoutingStrategyV1.class.getName());
 65 | 		rotation.registerIndexAvailableOnRotation(metaData);
 66 | 
 67 | 		// NodeCache curator recipe is async so we loop for up to 1sec waiting for the watcher to react. Polling sucks, but it beats Thread.sleep(1000) and generally happens in a few ms.
 68 | 		long timer = System.currentTimeMillis();
 69 | 		while(true) {
 70 | 			ElasticSearchIndexMetadata readMetaData = rotation.getElasticSearchIndexMetadata(metaData.getIndexNameAtBirth());
 71 | 			assertEquals(readMetaData.getIndexNameAtBirth(), metaData.getIndexNameAtBirth());
 72 | 			
 73 | 			
 74 | 			if(readMetaData.getRebuiltIndexName() != null) {
 75 | 				// Assert that rotation lag is accounted for
 76 | 				assertEquals(readMetaData.getRebuiltIndexName(), metaData.getRebuiltIndexName());
 77 | 
 78 | 				break;
 79 | 			}
 80 | 			if(System.currentTimeMillis() - timer > 1000) {
 81 | 				fail("NodeCache failed to update with latest value in a reasonable amount of time");
 82 | 			}
 83 | 		}
 84 | 	}
 85 | 	
 86 | 	@Test
 87 | 	public void testRealtimeIndexSelectionStrategyLagged() {
 88 | 		DateTime now = new DateTime();
 89 | 		ElasticSearchIndexMetadata metaData = new ElasticSearchIndexMetadata();
 90 | 		metaData.setIndexNameAtBirth("a");
 91 | 		metaData.setNumShardsPerOrg(2);
 92 | 		metaData.setRebuiltIndexAlias("b");
 93 | 		metaData.setRebuiltIndexName("bb");
 94 | 		metaData.setRoutingStrategyClassName(ElasticsearchRoutingStrategyV1.class.getName());
 95 | 		
 96 | 		RealtimeIndexSelectionStrategyLagged strategy = new RealtimeIndexSelectionStrategyLagged(2);
 97 | 		metaData.setDate(now.toLocalDate());
 98 | 		assertEquals(strategy.getIndexReadable(metaData), "a");
 99 | 		assertEquals(strategy.getIndexWritable(metaData), "a");
100 | 
101 | 		metaData.setDate(now.minusDays(1).toLocalDate());
102 | 		assertEquals(strategy.getIndexReadable(metaData), "a");
103 | 		assertEquals(strategy.getIndexWritable(metaData), "a");
104 | 
105 | 		metaData.setDate(now.minusDays(2).toLocalDate());
106 | 		assertEquals(strategy.getIndexReadable(metaData), "b");
107 | 		assertEquals(strategy.getIndexWritable(metaData), "bb");
108 | 
109 | 		metaData.setDate(now.minusDays(3).toLocalDate());
110 | 		assertEquals(strategy.getIndexReadable(metaData), "b");
111 | 		assertEquals(strategy.getIndexWritable(metaData), "bb");
112 | 		
113 | 		metaData.setRebuiltIndexName(null);
114 | 		metaData.setRebuiltIndexAlias(null);
115 | 		assertEquals(strategy.getIndexWritable(metaData), "a");
116 | 	}
117 | 	
118 | 	@Test
119 | 	public void testRealtimeIndexSelectionStrategyLagged_BackwardsCompatibility() {
120 | 		DateTime now = new DateTime();
121 | 		ElasticSearchIndexMetadata metaData = new ElasticSearchIndexMetadata();
122 | 		metaData.setIndexNameAtBirth("a");
123 | 		metaData.setNumShardsPerOrg(2);
124 | 		metaData.setRebuiltIndexAlias("b");
125 | 		metaData.setRoutingStrategyClassName(ElasticsearchRoutingStrategyV1.class.getName());
126 | 		
127 | 		RealtimeIndexSelectionStrategyLagged strategy = new RealtimeIndexSelectionStrategyLagged(2);
128 | 		metaData.setDate(now.minusDays(2).toLocalDate());
129 | 		assertEquals(strategy.getIndexReadable(metaData), "b");
130 | 		assertEquals(strategy.getIndexWritable(metaData), "b");
131 | 	}
132 | 	
133 | 	@Test
134 | 	public void testRebuildPipelineStateRunning() {
135 | 		rotation.updateRebuildPipelineState(RebuildPipelineState.RUNNING);
136 | 		assertEquals(rotation.getRebuildPipelineState(), RebuildPipelineState.RUNNING);
137 | 	}
138 | 
139 | 	@Test
140 | 	public void testRebuildPipelineStateComplete() {
141 | 		rotation.updateRebuildPipelineState(RebuildPipelineState.COMPLETE);
142 | 		assertEquals(rotation.getRebuildPipelineState(), RebuildPipelineState.COMPLETE);
143 | 	}
144 | 
145 | }
146 | 


--------------------------------------------------------------------------------