├── .travis.yml ├── .settings ├── org.eclipse.core.resources.prefs ├── org.eclipse.m2e.core.prefs └── org.eclipse.jdt.core.prefs ├── .gitignore ├── NOTICE ├── .classpath ├── .project ├── src ├── main │ ├── assemblies │ │ └── esplugin.xml │ ├── resources │ │ └── es-plugin.properties │ └── java │ │ └── com │ │ └── github │ │ └── lbroudoux │ │ └── elasticsearch │ │ └── river │ │ └── s3 │ │ ├── river │ │ ├── TikaHolder.java │ │ ├── S3RiverModule.java │ │ ├── S3RiverFeedDefinition.java │ │ ├── S3RiverUtil.java │ │ └── S3River.java │ │ ├── plugin │ │ └── S3RiverPlugin.java │ │ ├── connector │ │ ├── S3ObjectSummaries.java │ │ └── S3Connector.java │ │ └── rest │ │ └── S3ManageAction.java ├── itest │ └── java │ │ └── com │ │ └── github │ │ └── lbroudoux │ │ └── elasticsearch │ │ └── river │ │ └── s3 │ │ └── connector │ │ └── S3ConnectorTest.java └── test │ └── java │ └── com │ └── github │ └── lbroudoux │ └── elasticsearch │ └── river │ └── s3 │ └── river │ └── S3RiverUtilTest.java ├── pom.xml ├── LICENSE └── README.md /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | jdk: 3 | - openjdk7 4 | - oraclejdk8 -------------------------------------------------------------------------------- /.settings/org.eclipse.core.resources.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | encoding/=UTF-8 3 | -------------------------------------------------------------------------------- /.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | 3 | # Package Files # 4 | *.jar 5 | *.war 6 | *.ear 7 | 8 | # IntelliJ 9 | .idea 10 | target 11 | *.iml 12 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright (c) Laurent Broudoux - 2013 2 | 3 | This product includes software developed by The Apache Software 4 | Foundation (http://www.apache.org/). 
-------------------------------------------------------------------------------- /.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6 4 | org.eclipse.jdt.core.compiler.compliance=1.6 5 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 6 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 7 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 8 | org.eclipse.jdt.core.compiler.source=1.6 9 | -------------------------------------------------------------------------------- /.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | es-amazon-s3-river 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | org.eclipse.m2e.core.maven2Builder 15 | 16 | 17 | 18 | 19 | 20 | org.eclipse.jdt.core.javanature 21 | org.eclipse.m2e.core.maven2Nature 22 | 23 | 24 | -------------------------------------------------------------------------------- /src/main/assemblies/esplugin.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | bin 4 | 5 | zip 6 | 7 | false 8 | 9 | 10 | false 11 | / 12 | true 13 | true 14 | 15 | org.elasticsearch:elasticsearch:jar 16 | junit:junit 17 | log4j:log4j 18 | 19 | 20 | 21 | 22 | 23 | ${project.build.directory}/ 24 | / 25 | 26 | ${project.name}-${project.version}.jar 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /src/main/resources/es-plugin.properties: -------------------------------------------------------------------------------- 1 | ################################################################ 2 | # Licensed to Laurent Broudoux (the "Author") under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. Author licenses this 6 | # file to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | ################################################################ 19 | plugin=com.github.lbroudoux.elasticsearch.river.s3.plugin.S3RiverPlugin -------------------------------------------------------------------------------- /src/main/java/com/github/lbroudoux/elasticsearch/river/s3/river/TikaHolder.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Laurent Broudoux (the "Author") under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. 
Author licenses this 6 | * file to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | package com.github.lbroudoux.elasticsearch.river.s3.river; 20 | 21 | import org.apache.tika.Tika; 22 | /** 23 | * Simple singleton holder for Apache Tika. 24 | * @author laurent 25 | */ 26 | public class TikaHolder { 27 | 28 | private static final Tika tika = new Tika(); 29 | 30 | /** @return This holder's singleton Tika instance. */ 31 | public static Tika tika(){ 32 | return tika; 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/com/github/lbroudoux/elasticsearch/river/s3/river/S3RiverModule.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Laurent Broudoux (the "Author") under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. Author licenses this 6 | * file to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | package com.github.lbroudoux.elasticsearch.river.s3.river; 20 | 21 | import org.elasticsearch.common.inject.AbstractModule; 22 | import org.elasticsearch.river.River; 23 | /** 24 | * Injection module binding the S3 river implementation as an eager singleton. 25 | * @author laurent 26 | */ 27 | public class S3RiverModule extends AbstractModule{ 28 | 29 | @Override 30 | protected void configure(){ 31 | bind(River.class).to(S3River.class).asEagerSingleton(); 32 | } 33 | } 34 |
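For illustration, here is a minimal, hypothetical usage sketch (it is not one of the repository's files) showing how the `TikaHolder` singleton above is typically used to turn the raw bytes of a downloaded S3 object into indexable plain text:

```java
import java.io.ByteArrayInputStream;
import java.io.IOException;

import org.apache.tika.exception.TikaException;

import com.github.lbroudoux.elasticsearch.river.s3.river.TikaHolder;

public class TikaExtractionSketch {

   /** Extract plain text from raw file bytes using the shared Tika facade. */
   public static String extractText(byte[] fileContent) throws IOException, TikaException {
      // Tika auto-detects the media type (PDF, Word, ...) and parses it to a String.
      return TikaHolder.tika().parseToString(new ByteArrayInputStream(fileContent));
   }
}
```

-------------------------------------------------------------------------------- /src/itest/java/com/github/lbroudoux/elasticsearch/river/s3/connector/S3ConnectorTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Laurent Broudoux (the "Author") under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. Author licenses this 6 | * file to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied.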
See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | package com.github.lbroudoux.elasticsearch.river.s3.connector; 20 | 21 | import com.amazonaws.services.s3.model.AmazonS3Exception; 22 | 23 | import org.junit.Test; 24 | /** 25 | * @author laurent 26 | */ 27 | public class S3ConnectorTest{ 28 | 29 | @Test(expected = AmazonS3Exception.class) 30 | public void shouldNotConnectUserBucketWithBadSecretKey() { 31 | S3Connector connector = new S3Connector("AKIAITHNRLFUUVPFBKZQ", "azerty"); 32 | connector.connectUserBucket("famillebroudoux", "papiers/"); 33 | } 34 | 35 | @Test(expected = AmazonS3Exception.class) 36 | public void shouldNotConnectUserBucketWithBadBucket() { 37 | S3Connector connector = new S3Connector("AKIAITHNRLFUUVPFBKZQ", ""); 38 | connector.connectUserBucket("azerty", "papiers/"); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/com/github/lbroudoux/elasticsearch/river/s3/plugin/S3RiverPlugin.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Laurent Broudoux (the "Author") under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. Author licenses this 6 | * file to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | package com.github.lbroudoux.elasticsearch.river.s3.plugin; 20 | 21 | import org.elasticsearch.common.inject.Module; 22 | import org.elasticsearch.plugins.AbstractPlugin; 23 | import org.elasticsearch.rest.RestModule; 24 | import org.elasticsearch.river.RiversModule; 25 | 26 | import com.github.lbroudoux.elasticsearch.river.s3.rest.S3ManageAction; 27 | import com.github.lbroudoux.elasticsearch.river.s3.river.S3RiverModule; 28 | /** 29 | * Amazon S3 River plugin definition. 30 | * @author laurent 31 | */ 32 | public class S3RiverPlugin extends AbstractPlugin{ 33 | 34 | @Override 35 | public String name(){ 36 | return "river-amazon-s3"; 37 | } 38 | 39 | @Override 40 | public String description(){ 41 | return "River Amazon S3 Plugin"; 42 | } 43 | 44 | @Override 45 | public void processModule(Module module){ 46 | if (module instanceof RiversModule){ 47 | ((RiversModule) module).registerRiver("amazon-s3", S3RiverModule.class); 48 | } 49 | if (module instanceof RestModule) { 50 | ((RestModule) module).addRestAction(S3ManageAction.class); 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/com/github/lbroudoux/elasticsearch/river/s3/connector/S3ObjectSummaries.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Laurent Broudoux (the "Author") under one 3 | * or more contributor license agreements. 
See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. Author licenses this 6 | * file to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | package com.github.lbroudoux.elasticsearch.river.s3.connector; 20 | 21 | import java.io.Serializable; 22 | import java.util.List; 23 | 24 | import com.amazonaws.services.s3.model.S3ObjectSummary; 25 | /** 26 | * This is a simple wrapper for carrying picked up summaries of S3 bucket objects 27 | * that match the last modification date criteria, along with the keys of all objects 28 | * regardless of their modification date. 29 | * @author laurent 30 | */ 31 | public class S3ObjectSummaries implements Serializable{ 32 | 33 | /** Default serial version UID. */ 34 | private static final long serialVersionUID = 1L; 35 | 36 | private Long lastScanTime; 37 | 38 | private List<String> keys; 39 | private List<S3ObjectSummary> pickedSummaries; 40 | 41 | 42 | public S3ObjectSummaries(Long lastScanTime, List<S3ObjectSummary> summaries, List<String> keys){ 43 | this.lastScanTime = lastScanTime; 44 | this.pickedSummaries = summaries; 45 | this.keys = keys; 46 | } 47 | 48 | public Long getLastScanTime(){ 49 | return lastScanTime; 50 | } 51 | 52 | public List<String> getKeys(){ 53 | return keys; 54 | } 55 | 56 | public List<S3ObjectSummary> getPickedSummaries(){ 57 | return pickedSummaries; 58 | } 59 | } 60 |
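A short illustrative sketch (hypothetical credentials, bucket and prefix; presumably close to the loop the S3River implementation runs on each update tick) of how `S3Connector` and this `S3ObjectSummaries` wrapper fit together:

```java
import java.util.List;

import com.amazonaws.services.s3.model.S3ObjectSummary;

import com.github.lbroudoux.elasticsearch.river.s3.connector.S3Connector;
import com.github.lbroudoux.elasticsearch.river.s3.connector.S3ObjectSummaries;

public class ScanCycleSketch {

   public static void main(String[] args) {
      // Placeholder credentials and bucket coordinates.
      S3Connector connector = new S3Connector("<ACCESS_KEY>", "<SECRET_KEY>");
      connector.connectUserBucket("mybucket", "Work/");

      // A null lastScanTime means a first scan: every object is picked.
      S3ObjectSummaries summaries = connector.getObjectSummaries(null);

      // Objects modified since the last scan are candidates for (re-)indexing ...
      for (S3ObjectSummary summary : summaries.getPickedSummaries()) {
         byte[] content = connector.getContent(summary);
         // ... hand content over to Tika / Elasticsearch here.
      }

      // ... while the full key list allows detecting deletions between two scans.
      List<String> allKeys = summaries.getKeys();

      // Persist this timestamp and pass it to the next getObjectSummaries() call.
      Long nextScanTime = summaries.getLastScanTime();
   }
}
```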
-------------------------------------------------------------------------------- /src/test/java/com/github/lbroudoux/elasticsearch/river/s3/river/S3RiverUtilTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Laurent Broudoux (the "Author") under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. Author licenses this 6 | * file to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | package com.github.lbroudoux.elasticsearch.river.s3.river; 20 | 21 | import static junit.framework.Assert.*; 22 | 23 | import java.util.Arrays; 24 | import java.util.List; 25 | 26 | import org.junit.Test; 27 | /** 28 | * Test case for S3RiverUtil class. 29 | * @author laurent 30 | */ 31 | public class S3RiverUtilTest { 32 | 33 | @Test 34 | public void shouldSayIsIndexable() { 35 | List<String> includes = Arrays.asList("*.pdf"); 36 | List<String> excludes = Arrays.asList("*.mkv"); 37 | assertTrue(S3RiverUtil.isIndexable("mydoc.pdf", includes, excludes)); 38 | } 39 | 40 | @Test 41 | public void shouldNotSayIsIndexable() { 42 | List<String> includes = Arrays.asList("*.pdf"); 43 | List<String> excludes = Arrays.asList("*.mkv"); 44 | assertFalse(S3RiverUtil.isIndexable("mymovie.mkv", includes, excludes)); 45 | } 46 | 47 | @Test 48 | public void shouldSayIsIndexableWhenNoSpec() { 49 | // No inclusion nor exclusion rules specified: everything is indexable. 50 | assertTrue(S3RiverUtil.isIndexable("mydoc.pdf", null, null)); 51 | } 52 | 53 | @Test 54 | public void shouldSayIsIndexableWhenInclusionsOnly() { 55 | List<String> includes = Arrays.asList("*.pdf"); 56 | List<String> excludes = Arrays.asList(); 57 | // mydoc in inclusions. 58 | assertTrue(S3RiverUtil.isIndexable("mydoc.pdf", includes, excludes)); 59 | } 60 | 61 | @Test 62 | public void shouldNotSayIsIndexableWhenInclusionsOnly() { 63 | List<String> includes = Arrays.asList("*.pdf"); 64 | List<String> excludes = Arrays.asList(); 65 | // mymovie not in inclusions. 66 | assertFalse(S3RiverUtil.isIndexable("mymovie.mkv", includes, excludes)); 67 | } 68 | 69 | @Test 70 | public void shouldSayIsIndexableWhenExclusionsOnly() { 71 | List<String> includes = Arrays.asList(); 72 | List<String> excludes = Arrays.asList("*.mkv"); 73 | // mydoc not in exclusions. 74 | assertTrue(S3RiverUtil.isIndexable("mydoc.pdf", includes, excludes)); 75 | } 76 | 77 | @Test 78 | public void shouldNotSayIsIndexableWhenExclusionsOnly() { 79 | List<String> includes = Arrays.asList(); 80 | List<String> excludes = Arrays.asList("*.mkv"); 81 | // mymovie in exclusions. 82 | assertFalse(S3RiverUtil.isIndexable("mymovie.mkv", includes, excludes)); 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/main/java/com/github/lbroudoux/elasticsearch/river/s3/rest/S3ManageAction.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Laurent Broudoux (the "Author") under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. Author licenses this 6 | * file to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License.
18 | */ 19 | package com.github.lbroudoux.elasticsearch.river.s3.rest; 20 | 21 | import java.io.IOException; 22 | 23 | import org.elasticsearch.client.Client; 24 | import org.elasticsearch.common.inject.Inject; 25 | import org.elasticsearch.common.settings.Settings; 26 | import org.elasticsearch.common.xcontent.XContentBuilder; 27 | import org.elasticsearch.common.xcontent.XContentBuilderString; 28 | import org.elasticsearch.rest.BaseRestHandler; 29 | import org.elasticsearch.rest.BytesRestResponse; 30 | import org.elasticsearch.rest.RestChannel; 31 | import org.elasticsearch.rest.RestController; 32 | import org.elasticsearch.rest.RestRequest; 33 | import org.elasticsearch.rest.RestStatus; 34 | 35 | import org.elasticsearch.rest.RestRequest.Method; 36 | 37 | import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; 38 | /** 39 | * REST actions definition for starting and stopping an Amazon S3 river. 40 | * @author laurent 41 | */ 42 | public class S3ManageAction extends BaseRestHandler{ 43 | 44 | /** The constant for 'start river' command. */ 45 | public static final String START_COMMAND = "_start"; 46 | /** The constant for 'stop river' command. */ 47 | public static final String STOP_COMMAND = "_stop"; 48 | 49 | @Inject 50 | public S3ManageAction(Settings settings, Client client, RestController controller){ 51 | super(settings, controller, client); 52 | 53 | // Define S3 REST endpoints. 54 | controller.registerHandler(Method.GET, "/_s3/{rivername}/{command}", this); 55 | } 56 | 57 | @Override 58 | public void handleRequest(RestRequest request, RestChannel channel, Client client) throws Exception{ 59 | if (logger.isDebugEnabled()){ 60 | logger.debug("REST S3ManageAction called"); 61 | } 62 | 63 | String rivername = request.param("rivername"); 64 | String command = request.param("command"); 65 | 66 | String status = null; 67 | if (START_COMMAND.equals(command)){ 68 | status = "STARTED"; 69 | } else if (STOP_COMMAND.equals(command)){ 70 | status = "STOPPED"; 71 | } 72 | 73 | try{ 74 | if (status != null){ 75 | XContentBuilder xb = jsonBuilder() 76 | .startObject() 77 | .startObject("amazon-s3") 78 | .field("feedname", rivername) 79 | .field("status", status) 80 | .endObject() 81 | .endObject(); 82 | client.prepareIndex("_river", rivername, "_s3status").setSource(xb).execute().actionGet(); 83 | } 84 | 85 | XContentBuilder builder = jsonBuilder(); 86 | builder 87 | .startObject() 88 | .field(new XContentBuilderString("ok"), true) 89 | .endObject(); 90 | channel.sendResponse(new BytesRestResponse(RestStatus.OK, builder)); 91 | } catch (IOException e) { 92 | onFailure(request, channel, e); 93 | } 94 | } 95 | 96 | /** */ 97 | private void onFailure(RestRequest request, RestChannel channel, Exception e) throws Exception{ 98 | try{ 99 | channel.sendResponse(new BytesRestResponse(channel, e)); 100 | } catch (IOException ioe){ 101 | logger.error("Sending failure response fails !", e); 102 | channel.sendResponse(new BytesRestResponse(RestStatus.INTERNAL_SERVER_ERROR)); 103 | } 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /src/main/java/com/github/lbroudoux/elasticsearch/river/s3/river/S3RiverFeedDefinition.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Laurent Broudoux (the "Author") under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. 
Author licenses this 6 | * file to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | package com.github.lbroudoux.elasticsearch.river.s3.river; 20 | 21 | import java.util.List; 22 | /** 23 | * A definition bean wrapping the river feed settings. 24 | * @author laurent 25 | */ 26 | public class S3RiverFeedDefinition{ 27 | 28 | private String feedname; 29 | private String bucket; 30 | private String pathPrefix; 31 | private String downloadHost; 32 | private int updateRate; 33 | private List<String> includes; 34 | private List<String> excludes; 35 | private String accessKey; 36 | private String secretKey; 37 | private boolean useIAMRoleForEC2; 38 | private boolean jsonSupport; 39 | private double indexedCharsRatio = 0; 40 | 41 | public S3RiverFeedDefinition(String feedname, String bucket, String pathPrefix, String downloadHost, int updateRate, 42 | List<String> includes, List<String> excludes, String accessKey, String secretKey, boolean useIAMRoleForEC2, 43 | boolean jsonSupport, double indexedCharsRatio) { 44 | this.feedname = feedname; 45 | this.bucket = bucket; 46 | this.pathPrefix = pathPrefix; 47 | this.downloadHost = downloadHost; 48 | this.updateRate = updateRate; 49 | this.includes = includes; 50 | this.excludes = excludes; 51 | this.accessKey = accessKey; 52 | this.secretKey = secretKey; 53 | this.useIAMRoleForEC2 = useIAMRoleForEC2; 54 | this.jsonSupport = jsonSupport; 55 | this.indexedCharsRatio = indexedCharsRatio; 56 | } 57 | 58 | public String getFeedname() { 59 | return feedname; 60 | } 61 | public void setFeedname(String feedname) { 62 | this.feedname = feedname; 63 | } 64 | 65 | public String getBucket() { 66 | return bucket; 67 | } 68 | public void setBucket(String bucket) { 69 | this.bucket = bucket; 70 | } 71 | 72 | public String getPathPrefix() { 73 | return pathPrefix; 74 | } 75 | public void setPathPrefix(String pathPrefix) { 76 | this.pathPrefix = pathPrefix; 77 | } 78 | 79 | public String getDownloadHost() { 80 | return downloadHost; 81 | } 82 | public void setDownloadHost(String downloadHost) { 83 | this.downloadHost = downloadHost; 84 | } 85 | 86 | public int getUpdateRate() { 87 | return updateRate; 88 | } 89 | public void setUpdateRate(int updateRate) { 90 | this.updateRate = updateRate; 91 | } 92 | 93 | public List<String> getIncludes() { 94 | return includes; 95 | } 96 | public void setIncludes(List<String> includes) { 97 | this.includes = includes; 98 | } 99 | 100 | public List<String> getExcludes() { 101 | return excludes; 102 | } 103 | public void setExcludes(List<String> excludes) { 104 | this.excludes = excludes; 105 | } 106 | 107 | public String getAccessKey() { 108 | return accessKey; 109 | } 110 | public void setAccessKey(String accessKey) { 111 | this.accessKey = accessKey; 112 | } 113 | 114 | public String getSecretKey() { 115 | return secretKey; 116 | } 117 | public void setSecretKey(String secretKey) { 118 | this.secretKey = secretKey; 119 | } 120 | 121 | public boolean isUseIAMRoleForEC2() { 122 | return useIAMRoleForEC2; 123 | } 124 | 125 | public boolean isJsonSupport(){ return jsonSupport; } 126 | 127 | public double getIndexedCharsRatio() { 128 | return indexedCharsRatio; 129 | } 130 | public void setIndexedCharsRatio(double indexedCharsRatio) { 131 | this.indexedCharsRatio = indexedCharsRatio; 132 | } 133 | } 134 |
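To make the settings wiring concrete, here is a small hypothetical sketch (not found in the repository) of how the river creation example from the README would materialize as an `S3RiverFeedDefinition`; every literal value is a placeholder taken from that example:

```java
import java.util.Arrays;

import com.github.lbroudoux.elasticsearch.river.s3.river.S3RiverFeedDefinition;

public class FeedDefinitionSketch {

   public static S3RiverFeedDefinition sampleFeed() {
      return new S3RiverFeedDefinition(
            "My Amazon S3 feed",             // feedname ("name" in the river settings)
            "myownbucket",                   // bucket
            "Work/",                         // pathPrefix
            null,                            // downloadHost (optional CloudFront-like vhost)
            900000,                          // updateRate in ms ("update_rate")
            Arrays.asList("*.doc", "*.pdf"), // includes
            Arrays.asList("*.zip", "*.gz"),  // excludes
            "AAAAAAAAAAAAAAAA",              // accessKey
            "BBBBBBBBBBBBBBBB",              // secretKey
            false,                           // useIAMRoleForEC2
            false,                           // jsonSupport
            0);                              // indexedCharsRatio
   }
}
```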
-------------------------------------------------------------------------------- /src/main/java/com/github/lbroudoux/elasticsearch/river/s3/river/S3RiverUtil.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Laurent Broudoux (the "Author") under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. Author licenses this 6 | * file to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | package com.github.lbroudoux.elasticsearch.river.s3.river; 20 | 21 | import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; 22 | 23 | import java.util.*; 24 | 25 | import org.elasticsearch.common.Strings; 26 | import org.elasticsearch.common.xcontent.XContentBuilder; 27 | import org.elasticsearch.common.xcontent.support.XContentMapValues; 28 | /** 29 | * Utility class for Amazon S3 indexing management. 30 | * @author laurent 31 | */ 32 | public class S3RiverUtil{ 33 | 34 | public static final String INDEX_TYPE_DOC = "doc"; 35 | 36 | public static final String DOC_FIELD_TITLE = "title"; 37 | public static final String DOC_FIELD_MODIFIED_DATE = "modifiedDate"; 38 | public static final String DOC_FIELD_SOURCE_URL = "source_url"; 39 | public static final String DOC_FIELD_METADATA = "metadata"; 40 | 41 | /** 42 | * Build mapping description for Amazon S3 files. 43 | * @param type The name of type for S3 files 44 | * @return A content builder for mapping information 45 | * @throws Exception if something goes wrong 46 | */ 47 | public static XContentBuilder buildS3FileMapping(String type) throws Exception{ 48 | XContentBuilder xbMapping = jsonBuilder().prettyPrint().startObject() 49 | .startObject(type).startObject("properties") 50 | .startObject(DOC_FIELD_TITLE).field("type", "string").field("analyzer","keyword").endObject() 51 | .startObject(DOC_FIELD_MODIFIED_DATE).field("type", "date").endObject() 52 | .startObject(DOC_FIELD_SOURCE_URL).field("type", "string").endObject() 53 | .startObject(DOC_FIELD_METADATA).field("type", "object").endObject() 54 | .startObject("file") 55 | .startObject("properties") 56 | .startObject("title").field("type", "string").field("store", "yes").endObject() 57 | .startObject("file").field("type", "string") 58 | .field("term_vector", "with_positions_offsets") 59 | .field("store", "yes") 60 | .endObject() 61 | .startObject("metadata").field("type", "object").field("store", "yes").endObject() 62 | .endObject() 63 | .endObject() 64 | .endObject().endObject().endObject(); 65 | return xbMapping; 66 | } 67 |
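/*
 * For reference, calling buildS3FileMapping(INDEX_TYPE_DOC) above yields a mapping roughly
 * equivalent to the following JSON (reconstructed by hand from the builder calls):
 *
 * {
 *   "doc": {
 *     "properties": {
 *       "title":        {"type": "string", "analyzer": "keyword"},
 *       "modifiedDate": {"type": "date"},
 *       "source_url":   {"type": "string"},
 *       "metadata":     {"type": "object"},
 *       "file": {
 *         "properties": {
 *           "title":    {"type": "string", "store": "yes"},
 *           "file":     {"type": "string", "term_vector": "with_positions_offsets", "store": "yes"},
 *           "metadata": {"type": "object", "store": "yes"}
 *         }
 *       }
 *     }
 *   }
 * }
 */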
68 | /** 69 | * Extract array from settings (array or comma-delimited String) 70 | * @param settings Settings 71 | * @param path Path to settings definition 72 | * @return Array of settings 73 | */ 74 | @SuppressWarnings("unchecked") 75 | public static String[] buildArrayFromSettings(Map<String, Object> settings, String path){ 76 | String[] includes; 77 | 78 | // We manage comma separated format and arrays 79 | if (XContentMapValues.isArray(XContentMapValues.extractValue(path, settings))) { 80 | List<String> includesarray = (List<String>) XContentMapValues.extractValue(path, settings); 81 | int i = 0; 82 | includes = new String[includesarray.size()]; 83 | for (String include : includesarray) { 84 | includes[i++] = trimAllWhitespace(include); 85 | } 86 | } else { 87 | String includedef = (String) XContentMapValues.extractValue(path, settings); 88 | includes = Strings.commaDelimitedListToStringArray(trimAllWhitespace(includedef)); 89 | } 90 | 91 | String[] uniquelist = removeDuplicateStrings(includes); 92 | 93 | return uniquelist; 94 | } 95 | 96 | /** 97 | * Tells if an Amazon S3 file is indexable from its key (file name), based on includes 98 | * and excludes rules. 99 | * @return true if file should be indexed, false otherwise 100 | */ 101 | public static boolean isIndexable(String key, List<String> includes, List<String> excludes){ 102 | // If no rules are specified, we index everything! 103 | if ((includes == null || includes.isEmpty()) 104 | && (excludes == null || excludes.isEmpty())){ 105 | return true; 106 | } 107 | 108 | // Exclude rules: whatever the include rules are, we should exclude matching files. 109 | if (excludes != null){ 110 | for (String exclude : excludes){ 111 | String regex = exclude.replace("?", ".?").replace("*", ".*?"); 112 | if (key.matches(regex)){ 113 | return false; 114 | } 115 | } 116 | } 117 | 118 | // Include rules: we should add the document only if it matches an include rule. 119 | if (includes == null || includes.isEmpty()){ 120 | return true; 121 | } 122 | if (includes != null){ 123 | for (String include : includes){ 124 | String regex = include.replace("?", ".?").replace("*", ".*?"); 125 | if (key.matches(regex)){ 126 | return true; 127 | } 128 | } 129 | } 130 | 131 | return false; 132 | } 133 |
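/*
 * Illustration of the wildcard rules (hypothetical values, mirroring S3RiverUtilTest):
 *
 *   isIndexable("papers/mydoc.pdf", Arrays.asList("*.pdf"), Arrays.asList("*.mkv"))   // -> true
 *   isIndexable("movies/mymovie.mkv", Arrays.asList("*.pdf"), Arrays.asList("*.mkv")) // -> false
 *
 * Each rule is translated to a regex before matching (e.g. "*.pdf" becomes ".*?.pdf"),
 * and exclusion rules always win over inclusion rules.
 */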
134 | /** 135 | * Trim all whitespace from the given String: leading, trailing, and in between characters. 136 | * @param str the String to check 137 | * @return the trimmed String 138 | * @see java.lang.Character#isWhitespace 139 | */ 140 | public static String trimAllWhitespace(String str) { 141 | if (!Strings.hasLength(str)) { 142 | return str; 143 | } 144 | StringBuilder sb = new StringBuilder(str); 145 | int index = 0; 146 | while (sb.length() > index) { 147 | if (Character.isWhitespace(sb.charAt(index))) { 148 | sb.deleteCharAt(index); 149 | } else { 150 | index++; 151 | } 152 | } 153 | return sb.toString(); 154 | } 155 | 156 | /** 157 | * Remove duplicate Strings from the given array. Also sorts the array, as it uses a TreeSet. 158 | * @param array the String array 159 | * @return an array without duplicates, in natural sort order 160 | */ 161 | public static String[] removeDuplicateStrings(String[] array) { 162 | if (array == null || array.length == 0) { 163 | return array; 164 | } 165 | Set<String> set = new TreeSet<String>(); 166 | set.addAll(Arrays.asList(array)); 167 | return Strings.toStringArray(set); 168 | } 169 | } 170 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | com.github.lbroudoux.elasticsearch 4 | amazon-s3-river 5 | 1.6.1-SNAPSHOT 6 | jar 7 | 8 | 9 | org.sonatype.oss 10 | oss-parent 11 | 7 12 | 13 | 14 | 15 | 16 | The Apache Software License, Version 2.0 17 | http://www.apache.org/licenses/LICENSE-2.0.txt 18 | repo 19 | 20 | 21 | 22 | 23 | laurent 24 | Laurent Broudoux 25 | laurent.broudoux@gmail.com 26 | http://lbroudoux.wordpress.com 27 | +1 28 | 29 | 30 | 31 | scm:git@github.com:lbroudoux/es-amazon-s3-river.git 32 | scm:git:git@github.com:lbroudoux/es-amazon-s3-river.git 33 | scm:git:git@github.com:lbroudoux/es-amazon-s3-river.git 34 | HEAD 35 | 36 | 37 | GitHub 38 | https://github.com/lbroudoux/es-amazon-s3-river/issues/ 39 | 40 | 41 | 42 | UTF-8 43 | 1.6.2 44 | 1.6 45 | 46 | 47 | 48 | 49 | org.elasticsearch 50 | elasticsearch 51 | ${elasticsearch.version} 52 | 53 | 54 | org.apache.tika 55 | tika-core 56 | ${tika.version} 57 | 58 | 59 | org.apache.tika 60 | tika-parsers 61 | ${tika.version} 62 | 63 | 64 | com.amazonaws 65 | aws-java-sdk 66 | 1.6.12 67 | 68 | 69 | junit 70 | junit 71 | 4.1 72 | test 73 | 74 | 75 | org.slf4j 76 | slf4j-log4j12 77 | 1.5.6 78 | test 79 | 80 | 81 | 82 | 83 | 84 | 85 | org.apache.maven.plugins 86 | maven-release-plugin 87 | 2.4 88 | 89 | 90 | org.apache.maven.plugins 91 | maven-source-plugin 92 | 2.2.1 93 | 94 | 95 | attach-sources 96 | package 97 | 98 | jar-no-fork 99 | 100 | 101 | 102 | 103 | 104 | org.apache.maven.plugins 105 | maven-javadoc-plugin 106 | 2.9 107 | 108 | 109 | attach-javadoc 110 | package 111 | 112 | jar 113 | 114 | 115 | 116 | 117 | 118 | 119 | org.apache.maven.plugins 120 | maven-jar-plugin 121 | 2.4 122 | 123 | 124 | 125 | 126 | org.apache.maven.plugins 127 | maven-dependency-plugin 128 | 2.6 129 | 130 | 131 | copy-dependencies 132 | package 133 | 134 | copy-dependencies 135 | 136 | 137 | ${project.build.directory}/lib 138 | 139 | 140 | 141 | 142 | 143 | org.apache.maven.plugins 144 | maven-surefire-plugin 145 | 2.12.4 146 | 147 | true 148 | 149 | 150 | 151 | surefire-test 152 | test 153 | 154 | test 155 | 156 | 157 | false 158 | 159 | ${project.build.directory}/classes/conf 160 | ${project.build.directory}/lib 161 | 162 | 163 | **/itest/** 164 | 165 | 166 | 167 | 168 | surefire-itest 169 | integration-test 170 | 171 | test 172 | 173 | 174 | true 175 | 176 |
${project.build.directory}/classes/conf 177 | ${project.build.directory}/lib 178 | 179 | 180 | **/itest/** 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | org.apache.maven.plugins 190 | maven-assembly-plugin 191 | 2.4 192 | 193 | false 194 | 195 | 196 | ${basedir}/src/main/assemblies/esplugin.xml 197 | 198 | 199 | 200 | 201 | generate-release-plugin 202 | package 203 | 204 | single 205 | 206 | 207 | 208 | 209 | 210 | org.apache.maven.plugins 211 | maven-compiler-plugin 212 | 3.0 213 | 214 | 1.6 215 | 1.6 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | release 224 | 225 | 226 | 227 | org.apache.maven.plugins 228 | maven-gpg-plugin 229 | 1.4 230 | 231 | 232 | sign-artifacts 233 | verify 234 | 235 | sign 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | -------------------------------------------------------------------------------- /src/main/java/com/github/lbroudoux/elasticsearch/river/s3/connector/S3Connector.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Laurent Broudoux (the "Author") under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. Author licenses this 6 | * file to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | package com.github.lbroudoux.elasticsearch.river.s3.connector; 20 | 21 | import java.io.ByteArrayOutputStream; 22 | import java.io.IOException; 23 | import java.io.InputStream; 24 | import java.util.ArrayList; 25 | import java.util.Collections; 26 | import java.util.List; 27 | import java.util.Map; 28 | 29 | import com.amazonaws.auth.InstanceProfileCredentialsProvider; 30 | import com.amazonaws.services.s3.model.*; 31 | import org.elasticsearch.common.logging.ESLogger; 32 | import org.elasticsearch.common.logging.Loggers; 33 | 34 | import com.amazonaws.auth.AWSCredentials; 35 | import com.amazonaws.auth.BasicAWSCredentials; 36 | import com.amazonaws.services.s3.AmazonS3Client; 37 | import com.github.lbroudoux.elasticsearch.river.s3.river.S3RiverFeedDefinition; 38 | /** 39 | * This is a connector for querying and retrieving files or folders from 40 | * an Amazon S3 bucket. Credentials or an IAM role are required for connecting to the remote bucket. 41 | * @author laurent 42 | */ 43 | public class S3Connector{ 44 | 45 | private static final ESLogger logger = Loggers.getLogger(S3Connector.class); 46 | 47 | private final String accessKey; 48 | private final String secretKey; 49 | private boolean useIAMRoleForEC2 = false; 50 | private String bucketName; 51 | private String pathPrefix; 52 | private AmazonS3Client s3Client; 53 | 54 | /** 55 | * Create a S3Connector without explicit security credentials. This is helpful if you want 56 | * to use IAM Roles as described here http://docs.aws.amazon.com/AWSSdkDocsJava/latest/DeveloperGuide/java-dg-roles.html.
57 | */ 58 | public S3Connector(boolean useIAMRoleForEC2) { 59 | this.accessKey = null; 60 | this.secretKey = null; 61 | this.useIAMRoleForEC2 = useIAMRoleForEC2; 62 | } 63 | 64 | /** 65 | * Create a S3Connector with provided security credentials. 66 | * @param accessKey The AWS access key such as provided by AWS console 67 | * @param secretKey The AWS secret key such as provided by AWS console 68 | */ 69 | public S3Connector(String accessKey, String secretKey){ 70 | this.accessKey = accessKey; 71 | this.secretKey = secretKey; 72 | } 73 | 74 | /** 75 | * Connect to the specified bucket using the previously given access key and secret key. 76 | * @param bucketName Name of the bucket to connect to 77 | * @param pathPrefix Prefix that will be later used for filtering documents 78 | * @throws AmazonS3Exception when access or secret keys are wrong or bucket does not exist 79 | */ 80 | public void connectUserBucket(String bucketName, String pathPrefix) throws AmazonS3Exception{ 81 | this.bucketName = bucketName; 82 | this.pathPrefix = pathPrefix; 83 | if (accessKey != null && secretKey != null) { 84 | AWSCredentials credentials = new BasicAWSCredentials(accessKey, secretKey); 85 | s3Client = new AmazonS3Client(credentials); 86 | } else if (useIAMRoleForEC2) { 87 | // Force usage of IAM Role process as described in 88 | // http://docs.aws.amazon.com/AWSSdkDocsJava/latest/DeveloperGuide/java-dg-roles.html. 89 | s3Client = new AmazonS3Client(new InstanceProfileCredentialsProvider()); 90 | } else { 91 | // Default credentials retrieval or IAM Role process as described in 92 | // http://docs.aws.amazon.com/AWSSdkDocsJava/latest/DeveloperGuide/java-dg-roles.html. 93 | s3Client = new AmazonS3Client(); 94 | } 95 | // Getting location seems odd as we don't use it later and doesBucketExist() seems 96 | // more appropriate... However, the latter returns true even for non-existing buckets! 97 | s3Client.getBucketLocation(bucketName); 98 | } 99 | 100 | /** 101 | * Select and retrieve summaries of objects in the bucket under the given path prefix 102 | * that have a modification date newer than lastScanTime. 103 | * @param lastScanTime Last modification date filter 104 | * @return Summaries of picked objects. 105 | */ 106 | public S3ObjectSummaries getObjectSummaries(Long lastScanTime){ 107 | if (logger.isDebugEnabled()){ 108 | logger.debug("Getting bucket changes since {}", lastScanTime); 109 | } 110 | List<String> keys = new ArrayList<String>(); 111 | List<S3ObjectSummary> result = new ArrayList<S3ObjectSummary>(); 112 | 113 | // Store the scan time to return before doing big queries...
114 | Long lastScanTimeToReturn = System.currentTimeMillis(); 115 | if (lastScanTime == null){ 116 | lastScanTime = 0L; 117 | } 118 | 119 | ListObjectsRequest request = new ListObjectsRequest().withBucketName(bucketName) 120 | .withPrefix(pathPrefix); 121 | ObjectListing listing = s3Client.listObjects(request); 122 | logger.debug("Listing: {}", listing); 123 | while (!listing.getObjectSummaries().isEmpty() || listing.isTruncated()){ 124 | List<S3ObjectSummary> summaries = listing.getObjectSummaries(); 125 | if (logger.isDebugEnabled()){ 126 | logger.debug("Found {} items in this listObjects page", summaries.size()); 127 | } 128 | for (S3ObjectSummary summary : summaries){ 129 | if (logger.isDebugEnabled()){ 130 | logger.debug("Getting {} last modified on {}", summary.getKey(), summary.getLastModified()); 131 | } 132 | keys.add(summary.getKey()); 133 | if (summary.getLastModified().getTime() > lastScanTime){ 134 | logger.debug(" Picked!"); 135 | result.add(summary); 136 | } 137 | } 138 | listing = s3Client.listNextBatchOfObjects(listing); 139 | } 140 | 141 | // Wrap results and latest scan time. 142 | return new S3ObjectSummaries(lastScanTimeToReturn, result, keys); 143 | } 144 | 145 | public Map<String, String> getS3UserMetadata(String key){ 146 | return Collections.unmodifiableMap(s3Client.getObjectMetadata(bucketName, key).getUserMetadata()); 147 | } 148 | 149 | /** 150 | * Download Amazon S3 file as byte array. 151 | * @param summary The summary of the S3 Object to download 152 | * @return This file bytes or null if something goes wrong. 153 | */ 154 | public byte[] getContent(S3ObjectSummary summary){ 155 | if (logger.isDebugEnabled()){ 156 | logger.debug("Downloading file content from {}", summary.getKey()); 157 | } 158 | // Retrieve object corresponding to key into bucket. 159 | S3Object object = s3Client.getObject(bucketName, summary.getKey()); 160 | 161 | InputStream is = null; 162 | ByteArrayOutputStream bos = null; 163 | 164 | try{ 165 | // Get input stream on S3 Object. 166 | is = object.getObjectContent(); 167 | bos = new ByteArrayOutputStream(); 168 | 169 | byte[] buffer = new byte[4096]; 170 | int len; 171 | while ((len = is.read(buffer)) != -1) { 172 | bos.write(buffer, 0, len); 173 | } 174 | 175 | // Flush and return result. 176 | bos.flush(); 177 | return bos.toByteArray(); 178 | } catch (IOException e) { 179 | logger.error("Error while downloading file content", e); 180 | return null; 181 | } finally { 182 | if (bos != null){ 183 | try{ 184 | bos.close(); 185 | } catch (IOException e) { 186 | } 187 | } 188 | if (is != null){ 189 | try{ 190 | is.close(); 191 | } catch (IOException e) { 192 | } 193 | } 194 | } 195 | } 196 | 197 | /** 198 | * Get the download url of this S3 object. May return null if the 199 | * object bucket and key cannot be converted to a URL. 200 | * @param summary A S3 object 201 | * @param feedDefinition The holder of S3 feed definition. 202 | * @return The resource url if possible (access is subject to AWS credential) 203 | */ 204 | public String getDownloadUrl(S3ObjectSummary summary, S3RiverFeedDefinition feedDefinition){ 205 | String resourceUrl = s3Client.getResourceUrl(summary.getBucketName(), summary.getKey()); 206 | // If a download host (actually a vhost such as cloudfront offers) is specified, use it to 207 | // recreate a vhosted resource url. This is done by substituting the generic host name in the url.
207 | if (resourceUrl != null && feedDefinition.getDownloadHost() != null){ 208 | int hostPosEnd = resourceUrl.indexOf("s3.amazonaws.com/") + "s3.amazonaws.com".length(); 209 | String vhostResourceUrl = feedDefinition.getDownloadHost() + resourceUrl.substring(hostPosEnd); 210 | return vhostResourceUrl; 211 | } 212 | return resourceUrl; 213 | } 214 | } 215 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | es-amazon-s3-river 2 | ================== 3 | 4 | Amazon S3 river for Elasticsearch 5 | 6 | This river plugin helps to index documents from a Amazon S3 account buckets. 7 | 8 | *WARNING*: For 0.0.1 released version, you need to have the [Attachment Plugin](https://github.com/elasticsearch/elasticsearch-mapper-attachments). 

*WARNING*: Starting from 0.0.2, you no longer need the [Attachment Plugin](https://github.com/elasticsearch/elasticsearch-mapper-attachments): we now use [Tika](http://tika.apache.org/) directly, see [issue #2](https://github.com/lbroudoux/es-amazon-s3-river/issues/2).

Versions
--------

| Amazon S3 River Plugin   | Elasticsearch   | Attachment Plugin | Tika |
|--------------------------|-----------------|-------------------|------|
| master (1.6.1-SNAPSHOT)  | 1.6.x and 1.7.x | Not used anymore  | 1.6  |
| 1.6.0                    | 1.6.x and 1.7.x | Not used anymore  | 1.6  |
| 1.4.1                    | 1.4.x and 1.5.x | Not used anymore  | 1.6  |
| 1.4.0                    | 1.4.x and 1.5.x | Not used anymore  | 1.6  |
| 1.3.0                    | 1.3.x           | Not used anymore  | 1.4  |
| 1.2.0                    | 1.2.x           | Not used anymore  | 1.4  |
| 0.0.4                    | 1.0.x and 1.1.x | Not used anymore  | 1.4  |
| 0.0.3                    | 1.0.0           | Not used anymore  | 1.4  |
| 0.0.2                    | 0.90.0          | Not used anymore  | 1.4  |
| 0.0.1                    | 0.90.0          | 1.7.0             |      |
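
To double-check which plugin version is actually installed on a running node, you can, for instance, list installed plugins through the cat API (assuming an Elasticsearch 1.x cluster):

```sh
$ curl 'http://localhost:9200/_cat/plugins?v'
```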

Build Status
------------

Travis CI [![Build Status](https://travis-ci.org/lbroudoux/es-amazon-s3-river.png?branch=master)](https://travis-ci.org/lbroudoux/es-amazon-s3-river)


Getting Started
===============

Installation
------------

Just install it as a regular Elasticsearch plugin by typing:

```sh
$ bin/plugin --install com.github.lbroudoux.elasticsearch/amazon-s3-river/1.6.0
```

This will do the job:

```
-> Installing com.github.lbroudoux.elasticsearch/amazon-s3-river/1.6.0...
Trying http://download.elasticsearch.org/com.github.lbroudoux.elasticsearch/amazon-s3-river/amazon-s3-river-1.6.0.zip...
Trying http://search.maven.org/remotecontent?filepath=com/github/lbroudoux/elasticsearch/amazon-s3-river/1.6.0/amazon-s3-river-1.6.0.zip...
Downloading ......DONE
Installed amazon-s3-river
```


Get Amazon AWS credentials (accessKey and secretKey)
----------------------------------------------------

First, you need to log in to the Amazon AWS account owning the S3 bucket and retrieve your security credentials by visiting this [page](https://portal.aws.amazon.com/gp/aws/securityCredentials).

Once done, note your `accessKey` and `secretKey`.


Creating an Amazon S3 river
---------------------------

We first create an index to store our *documents* (optional):

```sh
$ curl -XPUT 'http://localhost:9200/mys3docs/' -d '{}'
```

We then create the river with the following properties:

* accessKey: AAAAAAAAAAAAAAAA
* secretKey: BBBBBBBBBBBBBBBB
* Amazon S3 bucket to index: `myownbucket`
* Path prefix to index within this bucket: `Work/` (optional; if specified, it should be an existing path including the trailing `/`)
* Update rate: every 15 minutes (15 * 60 * 1000 = 900000 ms)
* Only get docs like `*.doc` and `*.pdf`
* Don't index `*.zip` and `*.gz`

```sh
$ curl -XPUT 'http://localhost:9200/_river/mys3docs/_meta' -d '{
  "type": "amazon-s3",
  "amazon-s3": {
    "accessKey": "AAAAAAAAAAAAAAAA",
    "secretKey": "BBBBBBBBBBBBBBBB",
    "name": "My Amazon S3 feed",
    "bucket": "myownbucket",
    "pathPrefix": "Work/",
    "update_rate": 900000,
    "includes": "*.doc,*.pdf",
    "excludes": "*.zip,*.gz"
  }
}'
```

By default, the river uses an index with the same name as the river (`mys3docs` in the above example).

*From 0.0.2 version*

The `source_url` of documents is now stored within the Elasticsearch index, so that you can later access the whole document content from your application (this is indeed a use case coming from [Scrutmydocs](http://www.scrutmydocs.org)).

By default, the plugin uses what is called the *resourceUrl* of an S3 bucket document. If the document has been made public within S3, it can be accessed directly from your browser. If not, the stored url is intended to be used by a regular S3 client holding credentials that are allowed to access the document.

Another option to easily distribute S3 content is to set up a Web proxy in front of S3, such as CloudFront (see
[Serving Private Content With CloudFront](http://docs.aws.amazon.com/AmazonCloudFront/latest/DeveloperGuide/PrivateContent.html)).
In that latter case, you'll want to rewrite `source_url`, substituting the S3 part with your own host name. This
plugin allows you to do that by specifying a `download_host` river property.
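
As an illustration, here is a minimal sketch of such a river declaration (the `download_host` value is hypothetical):

```sh
$ curl -XPUT 'http://localhost:9200/_river/mys3docs/_meta' -d '{
  "type": "amazon-s3",
  "amazon-s3": {
    "accessKey": "AAAAAAAAAAAAAAAA",
    "secretKey": "BBBBBBBBBBBBBBBB",
    "name": "My Amazon S3 feed",
    "bucket": "myownbucket",
    "pathPrefix": "Work/",
    "update_rate": 900000,
    "download_host": "http://docs.example.com"
  }
}'
```

Stored `source_url` values should then point to your proxy host instead of the plain S3 one.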


Specifying index options
------------------------

Index options can be specified when creating an amazon-s3 river. The properties are the following:

* Index name: `amazondocs`
* Type of documents: `doc`
* Size of an indexation bulk: 50 (default is 100)

You'll have to use them as follows when creating a river:

```sh
$ curl -XPUT 'http://localhost:9200/_river/mys3docs/_meta' -d '{
  "type": "amazon-s3",
  "amazon-s3": {
    "accessKey": "AAAAAAAAAAAAAAAA",
    "secretKey": "BBBBBBBBBBBBBBBB",
    "name": "My Amazon S3 feed",
    "bucket": "myownbucket",
    "pathPrefix": "Work/",
    "update_rate": 900000,
    "includes": "*.doc,*.pdf",
    "excludes": "*.zip,*.gz"
  },
  "index": {
    "index": "amazondocs",
    "type": "doc",
    "bulk_size": 50
  }
}'
```

Indexing Json documents
-----------------------

*From 0.0.4 version*

If you want to index Json files directly, without parsing them through Tika, you can set the `json_support` configuration option to `true` like this:

```sh
$ curl -XPUT 'http://localhost:9200/_river/mys3docs/_meta' -d '{
  "type": "amazon-s3",
  "amazon-s3": {
    "accessKey": "AAAAAAAAAAAAAAAA",
    "secretKey": "BBBBBBBBBBBBBBBB",
    "name": "My Amazon S3 feed",
    "bucket": "myownbucket",
    "pathPrefix": "Jsons/",
    "update_rate": 900000,
    "json_support": true,
    "includes": "*.json"
  }
}'
```

Be sure to correctly use `includes` or `excludes` in your river configuration so that only Json documents are retrieved.

When `json_support` is enabled and you did not define a mapping prior to creating the river, the river *will not* automatically generate a mapping as described below in the Advanced section; Elasticsearch will then guess the mapping by itself.


Advanced
========

Management actions
------------------

If you need to stop a river, you can call the `_s3` endpoint with your river name followed by the `_stop` command, like this:

```sh
GET _s3/mys3docs/_stop
```

To restart the river from the previous point, just call the corresponding `_start` endpoint:

```sh
GET _s3/mys3docs/_start
```

Extracted characters
--------------------

*From 1.4.1 version*

By default, this river plugin extracts only a limited number of characters per document (up to 100000, which is the default limit used with Tika). This may not be sufficient for big documents. You can override this limit using the `indexed_chars_ratio` river option like this:

```sh
$ curl -XPUT 'http://localhost:9200/_river/mys3docs/_meta' -d '{
  "type": "amazon-s3",
  "amazon-s3": {
    "accessKey": "AAAAAAAAAAAAAAAA",
    "secretKey": "BBBBBBBBBBBBBBBB",
    "name": "My Amazon S3 feed",
    "bucket": "myownbucket",
    "pathPrefix": "Work/",
    "indexed_chars_ratio": 1
  }
}'
```

`indexed_chars_ratio` must be a positive double number. Setting `indexed_chars_ratio` to `x` makes the river take the
file size, multiply it by `x` and pass the result to Tika as the extraction limit; a value of `1` thus extracts exactly
as many characters as the file size.
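
As a quick worked example (the file size is illustrative): with `"indexed_chars_ratio": 0.5`, a 300000-byte document makes the river request `round(300000 * 0.5) = 150000` characters from Tika.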

Along the same lines, a value of `0.8` will extract 20% fewer characters than the file size, while a value of `1.5`
will extract 50% more characters than the file size (think of compressed files).

Note that Tika requires allocating an in-memory data structure to extract text, so setting `indexed_chars_ratio` to a
high number will require more memory!


Credential keys security and IAM Role
-------------------------------------

*From 1.4.1 version*

Transferring `accessKey` and `secretKey` as river creation options is not always applicable, depending on your context,
and may lead to exposure of these keys. From the 1.4.1 version, you now have the ability to:

* either use the default credential retrieval process that checks system variables and configuration files,
* or force the usage of an IAM Role if your nodes are running directly on an Amazon EC2 instance.

We recommend checking http://docs.aws.amazon.com/AWSSdkDocsJava/latest/DeveloperGuide/java-dg-roles.html for an
explanation of the credential retrieval process.

The behaviour of this river plugin is now the following:

* `accessKey` and `secretKey` are no longer mandatory fields. If they are not provided at river creation, the river
will just try to connect to your S3 bucket using the default provider chain,
* the new option `use_EC2_IAM` can be set to `true` to force the usage of the EC2 IAM Role.

In action, this leads to something like the following when creating the river:

```sh
$ curl -XPUT 'http://localhost:9200/_river/mys3docs/_meta' -d '{
  "type": "amazon-s3",
  "amazon-s3": {
    "name": "My Amazon S3 feed",
    "bucket": "myownbucket",
    "pathPrefix": "Work/",
    "use_EC2_IAM": true
  }
}'
```

Autogenerated mapping
---------------------

When the river detects a new type, it automatically creates a mapping for this type.

```javascript
{
  "doc" : {
    "properties" : {
      "title" : {
        "type" : "string",
        "analyzer" : "keyword"
      },
      "modifiedDate" : {
        "type" : "date",
        "format" : "dateOptionalTime"
      },
      "file" : {
        "type" : "attachment",
        "fields" : {
          "file" : {
            "type" : "string",
            "store" : "yes",
            "term_vector" : "with_positions_offsets"
          },
          "title" : {
            "type" : "string",
            "store" : "yes"
          }
        }
      }
    }
  }
}
```

*From 0.0.2 version*

We now use Tika directly instead of the mapper-attachments plugin.

```javascript
{
  "doc" : {
    "properties" : {
      "title" : {
        "type" : "string",
        "analyzer" : "keyword"
      },
      "modifiedDate" : {
        "type" : "date",
        "format" : "dateOptionalTime"
      },
      "source_url" : {
        "type" : "string"
      },
      "file" : {
        "properties" : {
          "file" : {
            "type" : "string",
            "store" : "yes",
            "term_vector" : "with_positions_offsets"
          },
          "title" : {
            "type" : "string",
            "store" : "yes"
          }
        }
      }
    }
  }
}
```


License
=======

```
This software is licensed under the Apache 2 license, quoted below.

Copyright 2013-2015 Laurent Broudoux

Licensed under the Apache License, Version 2.0 (the "License"); you may not
use this file except in compliance with the License. You may obtain a copy of
the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
License for the specific language governing permissions and limitations under
the License.
```
--------------------------------------------------------------------------------
/src/main/java/com/github/lbroudoux/elasticsearch/river/s3/river/S3River.java:
--------------------------------------------------------------------------------
/*
 * Licensed to Laurent Broudoux (the "Author") under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Author licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package com.github.lbroudoux.elasticsearch.river.s3.river;

import java.util.*;

import com.amazonaws.services.s3.model.AmazonS3Exception;
import org.apache.tika.metadata.Metadata;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.action.admin.indices.mapping.put.PutMappingResponse;
import org.elasticsearch.action.bulk.*;
import org.elasticsearch.action.get.GetResponse;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.client.Client;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.block.ClusterBlockException;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.cluster.metadata.MappingMetaData;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.io.stream.BytesStreamInput;
import org.elasticsearch.common.util.concurrent.EsExecutors;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.support.XContentMapValues;
import org.elasticsearch.indices.IndexAlreadyExistsException;
import org.elasticsearch.river.AbstractRiverComponent;
import org.elasticsearch.river.River;
import org.elasticsearch.river.RiverName;
import org.elasticsearch.river.RiverSettings;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.threadpool.ThreadPool;

import com.amazonaws.services.s3.model.S3ObjectSummary;
import com.github.lbroudoux.elasticsearch.river.s3.connector.S3ObjectSummaries;
import com.github.lbroudoux.elasticsearch.river.s3.connector.S3Connector;

import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;

/**
 * A River component for scanning and indexing Amazon S3 documents into Elasticsearch.
 * @author laurent
 */
public class S3River extends AbstractRiverComponent implements River{

   private final Client client;

   private final ThreadPool threadPool;

   private final String indexName;

   private final String typeName;

   private final int bulkSize;

   private RiverStatus riverStatus;

   private volatile Thread feedThread;

   private volatile BulkProcessor bulkProcessor;

   private volatile boolean closed = false;

   private final S3RiverFeedDefinition feedDefinition;

   private final S3Connector s3;


   @Inject
   @SuppressWarnings({ "unchecked" })
   protected S3River(RiverName riverName, RiverSettings settings, Client client, ThreadPool threadPool) throws Exception{
      super(riverName, settings);
      this.client = client;
      this.threadPool = threadPool;
      this.riverStatus = RiverStatus.UNKNOWN;

      // Deal with connector settings.
      if (settings.settings().containsKey("amazon-s3")){
         Map<String, Object> feed = (Map<String, Object>) settings.settings().get("amazon-s3");

         // Retrieve feed settings.
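         // Assumed defaults when options are omitted below: update_rate falls back to
         // 15 minutes, json_support to false and indexed_chars_ratio to 0.0 (which
         // later translates into the fixed limit of 100000 extracted characters).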
         String feedname = XContentMapValues.nodeStringValue(feed.get("name"), null);
         String bucket = XContentMapValues.nodeStringValue(feed.get("bucket"), null);
         String pathPrefix = XContentMapValues.nodeStringValue(feed.get("pathPrefix"), null);
         String downloadHost = XContentMapValues.nodeStringValue(feed.get("download_host"), null);
         int updateRate = XContentMapValues.nodeIntegerValue(feed.get("update_rate"), 15 * 60 * 1000);
         boolean jsonSupport = XContentMapValues.nodeBooleanValue(feed.get("json_support"), false);
         double indexedCharsRatio = XContentMapValues.nodeDoubleValue(feed.get("indexed_chars_ratio"), 0.0);

         String[] includes = S3RiverUtil.buildArrayFromSettings(settings.settings(), "amazon-s3.includes");
         String[] excludes = S3RiverUtil.buildArrayFromSettings(settings.settings(), "amazon-s3.excludes");

         // Retrieve connection settings.
         String accessKey = XContentMapValues.nodeStringValue(feed.get("accessKey"), null);
         String secretKey = XContentMapValues.nodeStringValue(feed.get("secretKey"), null);
         boolean useIAMRoleForEC2 = XContentMapValues.nodeBooleanValue(feed.get("use_EC2_IAM"), false);

         feedDefinition = new S3RiverFeedDefinition(feedname, bucket, pathPrefix, downloadHost,
               updateRate, Arrays.asList(includes), Arrays.asList(excludes), accessKey, secretKey, useIAMRoleForEC2,
               jsonSupport, indexedCharsRatio);
      } else {
         logger.error("You didn't define the amazon-s3 settings. Exiting... See https://github.com/lbroudoux/es-amazon-s3-river");
         indexName = null;
         typeName = null;
         bulkSize = 100;
         feedDefinition = null;
         s3 = null;
         return;
      }

      // Deal with index settings if provided.
      if (settings.settings().containsKey("index")) {
         Map<String, Object> indexSettings = (Map<String, Object>) settings.settings().get("index");

         indexName = XContentMapValues.nodeStringValue(indexSettings.get("index"), riverName.name());
         typeName = XContentMapValues.nodeStringValue(indexSettings.get("type"), S3RiverUtil.INDEX_TYPE_DOC);
         bulkSize = XContentMapValues.nodeIntegerValue(indexSettings.get("bulk_size"), 100);
      } else {
         indexName = riverName.name();
         typeName = S3RiverUtil.INDEX_TYPE_DOC;
         bulkSize = 100;
      }

      // We need to connect to Amazon S3 after ensuring mandatory settings are here.
      if (feedDefinition.getBucket() == null){
         logger.error("Amazon S3 bucket should not be null. Please fix this.");
         throw new IllegalArgumentException("Amazon S3 bucket should not be null.");
      }
      // Connect using the appropriate authentication process.
      if (feedDefinition.getAccessKey() == null && feedDefinition.getSecretKey() == null) {
         s3 = new S3Connector(feedDefinition.isUseIAMRoleForEC2());
      } else {
         s3 = new S3Connector(feedDefinition.getAccessKey(), feedDefinition.getSecretKey());
      }
      try {
         s3.connectUserBucket(feedDefinition.getBucket(), feedDefinition.getPathPrefix());
      } catch (AmazonS3Exception ase){
         logger.error("Exception while connecting Amazon S3 user bucket. "
               + "Either access key, secret key, IAM Role or bucket name are incorrect");
         throw ase;
      }

      this.riverStatus = RiverStatus.INITIALIZED;
   }

   @Override
   public void start(){
      if (logger.isInfoEnabled()){
         logger.info("Starting amazon s3 river scanning");
      }

      this.riverStatus = RiverStatus.STARTING;
      // Let's start this in another thread so we won't block the start process.
      threadPool.generic().execute(new Runnable() {
         @Override
         public void run() {
            // We are first waiting for a yellow state at least.
            logger.debug("Waiting for yellow status");
            client.admin().cluster().prepareHealth("_river").setWaitForYellowStatus().get();
            logger.debug("Yellow or green status received");

            try {
               // Create the index if it doesn't exist.
               if (!client.admin().indices().prepareExists(indexName).execute().actionGet().isExists()) {
                  client.admin().indices().prepareCreate(indexName).execute().actionGet();
               }
            } catch (Exception e) {
               if (ExceptionsHelper.unwrapCause(e) instanceof IndexAlreadyExistsException){
                  // That's fine.
               } else if (ExceptionsHelper.unwrapCause(e) instanceof ClusterBlockException){
                  // Ok, not recovered yet..., lets start indexing and hope we recover by the first bulk.
               } else {
                  logger.warn("failed to create index [{}], disabling river...", e, indexName);
                  return;
               }
            }

            try {
               // If needed, we create the new mapping for files.
               if (!feedDefinition.isJsonSupport()) {
                  pushMapping(indexName, typeName, S3RiverUtil.buildS3FileMapping(typeName));
               }
            } catch (Exception e) {
               logger.warn("Failed to create mapping for [{}/{}], disabling river...",
                     e, indexName, typeName);
               return;
            }

            // Creating bulk processor.
            bulkProcessor = BulkProcessor.builder(client, new BulkProcessor.Listener() {
               @Override
               public void beforeBulk(long id, BulkRequest request) {
                  logger.debug("Going to execute new bulk composed of {} actions", request.numberOfActions());
               }

               @Override
               public void afterBulk(long id, BulkRequest request, BulkResponse response) {
                  logger.debug("Executed bulk composed of {} actions", request.numberOfActions());
                  if (response.hasFailures()) {
                     logger.warn("There were failures while executing bulk: {}", response.buildFailureMessage());
                     if (logger.isDebugEnabled()) {
                        for (BulkItemResponse item : response.getItems()) {
                           if (item.isFailed()) {
                              logger.debug("Error for {}/{}/{} for {} operation: {}", item.getIndex(),
                                    item.getType(), item.getId(), item.getOpType(), item.getFailureMessage());
                           }
                        }
                     }
                  }
               }

               @Override
               public void afterBulk(long id, BulkRequest request, Throwable throwable) {
                  logger.warn("Error executing bulk", throwable);
               }
            })
            .setBulkActions(bulkSize)
            .build();

            // Start the dedicated scanner thread for this feed.
            feedThread = EsExecutors.daemonThreadFactory(settings.globalSettings(), "fs_slurper")
                  .newThread(new S3Scanner(feedDefinition));
            feedThread.start();
            riverStatus = RiverStatus.RUNNING;
         }
      });

   }

   @Override
   public void close(){
      if (logger.isInfoEnabled()){
         logger.info("Closing amazon s3 river");
      }
      closed = true;
      riverStatus = RiverStatus.STOPPING;

      // We have to close the Thread.
      if (feedThread != null){
         feedThread.interrupt();
      }
      riverStatus = RiverStatus.STOPPED;
   }

   /**
    * Check if a mapping already exists in an index.
    * @param index Index name
    * @param type Mapping name
    * @return true if mapping exists
    */
   private boolean isMappingExist(String index, String type) {
      ClusterState cs = client.admin().cluster().prepareState()
            .setIndices(index).execute().actionGet()
            .getState();
      // Check index metadata existence.
      IndexMetaData imd = cs.getMetaData().index(index);
      if (imd == null){
         return false;
      }
      // Check mapping metadata existence.
      MappingMetaData mdd = imd.mapping(type);
      return mdd != null;
   }

   private void pushMapping(String index, String type, XContentBuilder xcontent) throws Exception {
      if (logger.isTraceEnabled()){
         logger.trace("pushMapping(" + index + ", " + type + ")");
      }

      // If the type does not exist, we create it.
      boolean mappingExist = isMappingExist(index, type);
      if (!mappingExist) {
         logger.debug("Mapping [" + index + "]/[" + type + "] doesn't exist. Creating it.");

         // Use the provided mapping definition if any.
         if (xcontent != null){
            if (logger.isTraceEnabled()){
               logger.trace("Mapping for [" + index + "]/[" + type + "]=" + xcontent.string());
            }
            // Create type and mapping.
            PutMappingResponse response = client.admin().indices()
                  .preparePutMapping(index)
                  .setType(type)
                  .setSource(xcontent)
                  .execute().actionGet();
            if (!response.isAcknowledged()){
               throw new Exception("Could not define mapping for type [" + index + "]/[" + type + "].");
            } else if (logger.isDebugEnabled()){
               // We can only reach this point when the mapping didn't exist before.
               logger.debug("Mapping definition for [" + index + "]/[" + type + "] successfully created.");
            }
         } else {
            if (logger.isDebugEnabled()){
               logger.debug("No mapping definition for [" + index + "]/[" + type + "]. Ignoring.");
            }
         }
      } else {
         if (logger.isDebugEnabled()){
            logger.debug("Mapping [" + index + "]/[" + type + "] already exists and mergeMapping is not set.");
         }
      }
      if (logger.isTraceEnabled()){
         logger.trace("/pushMapping(" + index + ", " + type + ")");
      }
   }

   /** A Runnable that periodically scans the S3 bucket and indexes its new content. */
   private class S3Scanner implements Runnable{

      private BulkRequestBuilder bulk;
      private S3RiverFeedDefinition feedDefinition;

      public S3Scanner(S3RiverFeedDefinition feedDefinition){
         this.feedDefinition = feedDefinition;
      }

      @Override
      public void run(){
         while (true){
            if (closed){
               return;
            }

            try{
               if (isStarted()){
                  // Scan folder starting from last changes id, then record the new one.
                  Long lastScanTime = getLastScanTimeFromRiver("_lastScanTime");
                  lastScanTime = scan(lastScanTime);
                  updateRiver("_lastScanTime", lastScanTime);
               } else {
                  logger.info("Amazon S3 River is disabled for {}", riverName().name());
               }
            } catch (Exception e){
               logger.warn("Error while indexing content from {}", feedDefinition.getBucket());
               if (logger.isDebugEnabled()){
                  logger.debug("Exception for folder {} is {}", feedDefinition.getBucket(), e);
                  e.printStackTrace();
               }
            }

            try {
               if (logger.isDebugEnabled()){
                  logger.debug("Amazon S3 river is going to sleep for {} ms", feedDefinition.getUpdateRate());
               }
               Thread.sleep(feedDefinition.getUpdateRate());
            } catch (InterruptedException ie){
               // Interrupted while sleeping (likely because the river is closing):
               // loop around and honor the closed flag.
            }
         }
      }

      private boolean isStarted(){
         // Refresh index before querying it.
         client.admin().indices().prepareRefresh("_river").execute().actionGet();
         GetResponse isStartedGetResponse = client.prepareGet("_river", riverName().name(), "_s3status").execute().actionGet();
         try{
            if (!isStartedGetResponse.isExists()){
               XContentBuilder xb = jsonBuilder().startObject()
                     .startObject("amazon-s3")
                        .field("feedname", feedDefinition.getFeedname())
                        .field("status", "STARTED").endObject()
                     .endObject();
               client.prepareIndex("_river", riverName.name(), "_s3status").setSource(xb).execute();
               return true;
            } else {
               String status = (String) XContentMapValues.extractValue("amazon-s3.status", isStartedGetResponse.getSourceAsMap());
               if ("STOPPED".equals(status)){
                  return false;
               }
            }
         } catch (Exception e){
            logger.warn("failed to get status for " + riverName().name() + ", throttling....", e);
         }
         return true;
      }

      @SuppressWarnings("unchecked")
      private Long getLastScanTimeFromRiver(String lastScanTimeField){
         Long result = null;
         try {
            // Refresh the _river index so that we read the latest stored scan time.
            client.admin().indices().prepareRefresh("_river").execute().actionGet();
            GetResponse lastSeqGetResponse = client.prepareGet("_river", riverName().name(),
                  lastScanTimeField).execute().actionGet();
            if (lastSeqGetResponse.isExists()) {
               Map<String, Object> fsState = (Map<String, Object>) lastSeqGetResponse.getSourceAsMap().get("amazon-s3");

               if (fsState != null){
                  Object lastScanTime = fsState.get(lastScanTimeField);
                  if (lastScanTime != null){
                     try{
                        result = Long.parseLong(lastScanTime.toString());
                     } catch (NumberFormatException nfe){
                        logger.warn("Last recorded lastScanTime is not a Long {}", lastScanTime.toString());
                     }
                  }
               }
            } else {
               // This is the first call, just log in debug mode.
               if (logger.isDebugEnabled()){
                  logger.debug("{} doesn't exist", lastScanTimeField);
               }
            }
         } catch (Exception e) {
            logger.warn("failed to get _lastScanTimeField, throttling....", e);
         }

         if (logger.isDebugEnabled()){
            logger.debug("lastScanTimeField: {}", result);
         }
         return result;
      }

      /** Scan the Amazon S3 bucket for last changes. */
      private Long scan(Long lastScanTime) throws Exception{
         if (logger.isDebugEnabled()){
            logger.debug("Starting scanning of bucket {} since {}", feedDefinition.getBucket(), lastScanTime);
         }
         S3ObjectSummaries summaries = s3.getObjectSummaries(lastScanTime);

         // Store the ids of already indexed files.
         List<String> previousFileIds = getAlreadyIndexFileIds();

         // Browse changes and check if they are indexable before starting.
         for (S3ObjectSummary summary : summaries.getPickedSummaries()){
            if (S3RiverUtil.isIndexable(summary.getKey(), feedDefinition.getIncludes(), feedDefinition.getExcludes())){
               indexFile(summary);
            }
         }

         // Now, because we do not get changes but only present files, we should
         // compare previously indexed files with the latest ones to extract deleted ones...
         // But before, we need to produce a list of index ids corresponding to S3 keys.
         List<String> summariesIds = new ArrayList<String>();
         for (String key : summaries.getKeys()){
            summariesIds.add(buildIndexIdFromS3Key(key));
         }
         for (String previousFileId : previousFileIds){
            if (!summariesIds.contains(previousFileId)){
               esDelete(indexName, typeName, previousFileId);
            }
         }

         return summaries.getLastScanTime();
      }

      /** Retrieve the ids of files already present in the index. */
      private List<String> getAlreadyIndexFileIds(){
         List<String> fileIds = new ArrayList<String>();
         // TODO: Should later be optimized for only retrieving ids and getting
         // over the 5000 hits limitation.
         SearchResponse response = client
               .prepareSearch(indexName)
               .setSearchType(SearchType.QUERY_AND_FETCH)
               .setTypes(typeName)
               .setFrom(0)
               .setSize(5000)
               .execute().actionGet();
         if (response.getHits() != null && response.getHits().getHits() != null){
            for (SearchHit hit : response.getHits().getHits()){
               fileIds.add(hit.getId());
            }
         }
         return fileIds;
      }

      /** Index an Amazon S3 file by retrieving its content and building the suitable Json content. */
      private String indexFile(S3ObjectSummary summary){
         if (logger.isDebugEnabled()){
            logger.debug("Trying to index '{}'", summary.getKey());
         }

         try{
            // Build a unique id from the S3 summary key.
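            // Note that '/' is simply mapped to '-', so two keys differing only in
            // those characters (e.g. 'a/b' and 'a-b') would collide on the same id.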
            String fileId = buildIndexIdFromS3Key(summary.getKey());

            if (feedDefinition.isJsonSupport()){
               esIndex(indexName, typeName, summary.getKey(), s3.getContent(summary));
            } else {
               byte[] fileContent = s3.getContent(summary);

               if (fileContent != null) {
                  // Compute the number of chars to extract.
                  // See https://github.com/lbroudoux/es-amazon-s3-river/issues/36
                  int indexedChars = 100000;
                  if (feedDefinition.getIndexedCharsRatio() > 0) {
                     indexedChars = (int) Math.round(fileContent.length * feedDefinition.getIndexedCharsRatio());
                  }

                  // Parse content using Tika directly.
                  Metadata fileMetadata = new Metadata();
                  String parsedContent = TikaHolder.tika().parseToString(
                        new BytesStreamInput(fileContent), fileMetadata, indexedChars);

                  // Store Tika metadata into a map.
                  Map<String, String> fileMetadataMap = new HashMap<String, String>();
                  for (String key : fileMetadata.names()) {
                     fileMetadataMap.put(key, fileMetadata.get(key));
                  }

                  esIndex(indexName, typeName, fileId,
                        jsonBuilder()
                           .startObject()
                              .field(S3RiverUtil.DOC_FIELD_TITLE, summary.getKey().substring(summary.getKey().lastIndexOf('/') + 1))
                              .field(S3RiverUtil.DOC_FIELD_MODIFIED_DATE, summary.getLastModified().getTime())
                              .field(S3RiverUtil.DOC_FIELD_SOURCE_URL, s3.getDownloadUrl(summary, feedDefinition))
                              .field(S3RiverUtil.DOC_FIELD_METADATA, s3.getS3UserMetadata(summary.getKey()))
                              .startObject("file")
                                 .field("_name", summary.getKey().substring(summary.getKey().lastIndexOf('/') + 1))
                                 .field("title", summary.getKey().substring(summary.getKey().lastIndexOf('/') + 1))
                                 .field("file", parsedContent)
                                 .field("metadata", fileMetadataMap)
                              .endObject()
                           .endObject()
                        );
                  return fileId;
               }
            }
         } catch (Exception e) {
            logger.warn("Can not index " + summary.getKey() + " : " + e.getMessage());
         }
         return null;
      }

      /** Build a unique id from the S3 unique summary key. */
      private String buildIndexIdFromS3Key(String key){
         return key.replace('/', '-');
      }

      /** Update the river's last scan time value. */
      private void updateRiver(String lastScanTimeField, Long lastScanTime) throws Exception{
         if (logger.isDebugEnabled()){
            logger.debug("Updating lastScanTimeField: {}", lastScanTime);
         }

         // We store the last update date and some stats.
         XContentBuilder xb = jsonBuilder()
               .startObject()
                  .startObject("amazon-s3")
                     .field("feedname", feedDefinition.getFeedname())
                     .field(lastScanTimeField, lastScanTime)
                  .endObject()
               .endObject();
         esIndex("_river", riverName.name(), lastScanTimeField, xb);
      }

      /** Add an IndexRequest to the bulk processor. */
      private void esIndex(String index, String type, String id, XContentBuilder xb) throws Exception{
         if (logger.isDebugEnabled()){
            logger.debug("Indexing in ES " + index + ", " + type + ", " + id);
         }
         if (logger.isTraceEnabled()){
            logger.trace("Json indexed : {}", xb.string());
         }
         bulkProcessor.add(client.prepareIndex(index, type, id).setSource(xb).request());
      }

      /** Add an IndexRequest to the bulk processor.
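       * This variant receives the raw JSON bytes of a document and indexes them
       * as-is; it is used when json_support is enabled.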
       */
      private void esIndex(String index, String type, String id, byte[] json) throws Exception{
         if (logger.isDebugEnabled()){
            logger.debug("Indexing in ES " + index + ", " + type + ", " + id);
         }
         if (logger.isTraceEnabled()){
            logger.trace("Json indexed : {}", new String(json));
         }
         bulkProcessor.add(client.prepareIndex(index, type, id).setSource(json).request());
      }

      /** Add a DeleteRequest to the bulk processor. */
      private void esDelete(String index, String type, String id) throws Exception{
         if (logger.isDebugEnabled()){
            logger.debug("Deleting from ES " + index + ", " + type + ", " + id);
         }
         bulkProcessor.add(client.prepareDelete(index, type, id).request());
      }
   }

   private enum RiverStatus {
      UNKNOWN,
      INITIALIZED,
      STARTING,
      RUNNING,
      STOPPING,
      STOPPED;
   }
}
--------------------------------------------------------------------------------