├── .travis.yml ├── .settings ├── org.eclipse.core.resources.prefs ├── org.eclipse.m2e.core.prefs └── org.eclipse.jdt.core.prefs ├── .gitignore ├── NOTICE ├── .classpath ├── .project ├── src ├── main │ ├── assemblies │ │ └── esplugin.xml │ ├── resources │ │ └── es-plugin.properties │ └── java │ │ └── com │ │ └── github │ │ └── lbroudoux │ │ └── elasticsearch │ │ └── river │ │ └── s3 │ │ ├── river │ │ ├── TikaHolder.java │ │ ├── S3RiverModule.java │ │ ├── S3RiverFeedDefinition.java │ │ ├── S3RiverUtil.java │ │ └── S3River.java │ │ ├── plugin │ │ └── S3RiverPlugin.java │ │ ├── connector │ │ ├── S3ObjectSummaries.java │ │ └── S3Connector.java │ │ └── rest │ │ └── S3ManageAction.java ├── itest │ └── java │ │ └── com │ │ └── github │ │ └── lbroudoux │ │ └── elasticsearch │ │ └── river │ │ └── s3 │ │ └── connector │ │ └── S3ConnectorTest.java └── test │ └── java │ └── com │ └── github │ └── lbroudoux │ └── elasticsearch │ └── river │ └── s3 │ └── river │ └── S3RiverUtilTest.java ├── pom.xml ├── LICENSE └── README.md /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | jdk: 3 | - openjdk7 4 | - oraclejdk8 -------------------------------------------------------------------------------- /.settings/org.eclipse.core.resources.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | encoding/=UTF-8 3 | -------------------------------------------------------------------------------- /.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | 3 | # Package Files # 4 | *.jar 5 | *.war 6 | *.ear 7 | 8 | # IntelliJ 9 | .idea 10 | target 11 | *.iml 12 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright (c) Laurent Broudoux - 2013 2 | 3 | This product includes software developed by The Apache Software 4 | Foundation (http://www.apache.org/). 
-------------------------------------------------------------------------------- /.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6 4 | org.eclipse.jdt.core.compiler.compliance=1.6 5 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 6 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 7 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 8 | org.eclipse.jdt.core.compiler.source=1.6 9 | -------------------------------------------------------------------------------- /.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | es-amazon-s3-river 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | org.eclipse.m2e.core.maven2Builder 15 | 16 | 17 | 18 | 19 | 20 | org.eclipse.jdt.core.javanature 21 | org.eclipse.m2e.core.maven2Nature 22 | 23 | 24 | -------------------------------------------------------------------------------- /src/main/assemblies/esplugin.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | bin 4 | 5 | zip 6 | 7 | false 8 | 9 | 10 | false 11 | / 12 | true 13 | true 14 | 15 | org.elasticsearch:elasticsearch:jar 16 | junit:junit 17 | log4j:log4j 18 | 19 | 20 | 21 | 22 | 23 | ${project.build.directory}/ 24 | / 25 | 26 | ${project.name}-${project.version}.jar 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /src/main/resources/es-plugin.properties: -------------------------------------------------------------------------------- 1 | ################################################################ 2 | # Licensed to Laurent Broudoux (the "Author") under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. Author licenses this 6 | # file to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | ################################################################ 19 | plugin=com.github.lbroudoux.elasticsearch.river.s3.plugin.S3RiverPlugin -------------------------------------------------------------------------------- /src/main/java/com/github/lbroudoux/elasticsearch/river/s3/river/TikaHolder.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Laurent Broudoux (the "Author") under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. 
Author licenses this 6 | * file to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | package com.github.lbroudoux.elasticsearch.river.s3.river; 20 | 21 | import org.apache.tika.Tika; 22 | /** 23 | * Simple singleton holder for Apache Tika. 24 | * @author laurent 25 | */ 26 | public class TikaHolder { 27 | 28 | private static final Tika tika = new Tika(); 29 | 30 | /** @return This holder's singleton Tika instance. */ 31 | public static Tika tika(){ 32 | return tika; 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/com/github/lbroudoux/elasticsearch/river/s3/river/S3RiverModule.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Laurent Broudoux (the "Author") under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. Author licenses this 6 | * file to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | package com.github.lbroudoux.elasticsearch.river.s3.river; 20 | 21 | import org.elasticsearch.common.inject.AbstractModule; 22 | import org.elasticsearch.river.River; 23 | /** 24 | * Injection module binding the S3 river implementation as an eager singleton. 25 | * @author laurent 26 | */ 27 | public class S3RiverModule extends AbstractModule{ 28 | 29 | @Override 30 | protected void configure(){ 31 | bind(River.class).to(S3River.class).asEagerSingleton(); 32 | } 33 | } 34 |
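For illustration, here is a minimal, hypothetical usage sketch (it is not one of the repository's files) showing how the `TikaHolder` singleton above is typically used to turn the raw bytes of a downloaded S3 object into indexable plain text:

```java
import java.io.ByteArrayInputStream;
import java.io.IOException;

import org.apache.tika.exception.TikaException;

import com.github.lbroudoux.elasticsearch.river.s3.river.TikaHolder;

public class TikaExtractionSketch {

   /** Extract plain text from raw file bytes using the shared Tika facade. */
   public static String extractText(byte[] fileContent) throws IOException, TikaException {
      // Tika auto-detects the media type (PDF, Word, ...) and parses it to a String.
      return TikaHolder.tika().parseToString(new ByteArrayInputStream(fileContent));
   }
}
```

-------------------------------------------------------------------------------- /src/itest/java/com/github/lbroudoux/elasticsearch/river/s3/connector/S3ConnectorTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Laurent Broudoux (the "Author") under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. Author licenses this 6 | * file to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied.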
See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | package com.github.lbroudoux.elasticsearch.river.s3.connector; 20 | 21 | import com.amazonaws.services.s3.model.AmazonS3Exception; 22 | 23 | import org.junit.Test; 24 | /** 25 | * @author laurent 26 | */ 27 | public class S3ConnectorTest{ 28 | 29 | @Test(expected = AmazonS3Exception.class) 30 | public void shouldNotConnectUserBucketWithBadSecretKey() { 31 | S3Connector connector = new S3Connector("AKIAITHNRLFUUVPFBKZQ", "azerty"); 32 | connector.connectUserBucket("famillebroudoux", "papiers/"); 33 | } 34 | 35 | @Test(expected = AmazonS3Exception.class) 36 | public void shouldNotConnectUserBucketWithBadBucket() { 37 | S3Connector connector = new S3Connector("AKIAITHNRLFUUVPFBKZQ", ""); 38 | connector.connectUserBucket("azerty", "papiers/"); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/com/github/lbroudoux/elasticsearch/river/s3/plugin/S3RiverPlugin.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Laurent Broudoux (the "Author") under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. Author licenses this 6 | * file to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | package com.github.lbroudoux.elasticsearch.river.s3.plugin; 20 | 21 | import org.elasticsearch.common.inject.Module; 22 | import org.elasticsearch.plugins.AbstractPlugin; 23 | import org.elasticsearch.rest.RestModule; 24 | import org.elasticsearch.river.RiversModule; 25 | 26 | import com.github.lbroudoux.elasticsearch.river.s3.rest.S3ManageAction; 27 | import com.github.lbroudoux.elasticsearch.river.s3.river.S3RiverModule; 28 | /** 29 | * Amazon S3 River plugin definition. 30 | * @author laurent 31 | */ 32 | public class S3RiverPlugin extends AbstractPlugin{ 33 | 34 | @Override 35 | public String name(){ 36 | return "river-amazon-s3"; 37 | } 38 | 39 | @Override 40 | public String description(){ 41 | return "River Amazon S3 Plugin"; 42 | } 43 | 44 | @Override 45 | public void processModule(Module module){ 46 | if (module instanceof RiversModule){ 47 | ((RiversModule) module).registerRiver("amazon-s3", S3RiverModule.class); 48 | } 49 | if (module instanceof RestModule) { 50 | ((RestModule) module).addRestAction(S3ManageAction.class); 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/com/github/lbroudoux/elasticsearch/river/s3/connector/S3ObjectSummaries.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Laurent Broudoux (the "Author") under one 3 | * or more contributor license agreements. 
See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. Author licenses this 6 | * file to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | package com.github.lbroudoux.elasticsearch.river.s3.connector; 20 | 21 | import java.io.Serializable; 22 | import java.util.List; 23 | 24 | import com.amazonaws.services.s3.model.S3ObjectSummary; 25 | /** 26 | * This is a simple wrapper for carrying picked up summaries of S3 bucket objects 27 | * that match the last modification date criteria, along with the keys of all objects 28 | * regardless of their modification date. 29 | * @author laurent 30 | */ 31 | public class S3ObjectSummaries implements Serializable{ 32 | 33 | /** Default serial version UID. */ 34 | private static final long serialVersionUID = 1L; 35 | 36 | private Long lastScanTime; 37 | 38 | private List<String> keys; 39 | private List<S3ObjectSummary> pickedSummaries; 40 | 41 | 42 | public S3ObjectSummaries(Long lastScanTime, List<S3ObjectSummary> summaries, List<String> keys){ 43 | this.lastScanTime = lastScanTime; 44 | this.pickedSummaries = summaries; 45 | this.keys = keys; 46 | } 47 | 48 | public Long getLastScanTime(){ 49 | return lastScanTime; 50 | } 51 | 52 | public List<String> getKeys(){ 53 | return keys; 54 | } 55 | 56 | public List<S3ObjectSummary> getPickedSummaries(){ 57 | return pickedSummaries; 58 | } 59 | } 60 |
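A short illustrative sketch (hypothetical credentials, bucket and prefix; presumably close to the loop the S3River implementation runs on each update tick) of how `S3Connector` and this `S3ObjectSummaries` wrapper fit together:

```java
import java.util.List;

import com.amazonaws.services.s3.model.S3ObjectSummary;

import com.github.lbroudoux.elasticsearch.river.s3.connector.S3Connector;
import com.github.lbroudoux.elasticsearch.river.s3.connector.S3ObjectSummaries;

public class ScanCycleSketch {

   public static void main(String[] args) {
      // Placeholder credentials and bucket coordinates.
      S3Connector connector = new S3Connector("<ACCESS_KEY>", "<SECRET_KEY>");
      connector.connectUserBucket("mybucket", "Work/");

      // A null lastScanTime means a first scan: every object is picked.
      S3ObjectSummaries summaries = connector.getObjectSummaries(null);

      // Objects modified since the last scan are candidates for (re-)indexing ...
      for (S3ObjectSummary summary : summaries.getPickedSummaries()) {
         byte[] content = connector.getContent(summary);
         // ... hand content over to Tika / Elasticsearch here.
      }

      // ... while the full key list allows detecting deletions between two scans.
      List<String> allKeys = summaries.getKeys();

      // Persist this timestamp and pass it to the next getObjectSummaries() call.
      Long nextScanTime = summaries.getLastScanTime();
   }
}
```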
-------------------------------------------------------------------------------- /src/test/java/com/github/lbroudoux/elasticsearch/river/s3/river/S3RiverUtilTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Laurent Broudoux (the "Author") under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. Author licenses this 6 | * file to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | package com.github.lbroudoux.elasticsearch.river.s3.river; 20 | 21 | import static junit.framework.Assert.*; 22 | 23 | import java.util.Arrays; 24 | import java.util.List; 25 | 26 | import org.junit.Test; 27 | /** 28 | * Test case for S3RiverUtil class. 29 | * @author laurent 30 | */ 31 | public class S3RiverUtilTest { 32 | 33 | @Test 34 | public void shouldSayIsIndexable() { 35 | List<String> includes = Arrays.asList("*.pdf"); 36 | List<String> excludes = Arrays.asList("*.mkv"); 37 | assertTrue(S3RiverUtil.isIndexable("mydoc.pdf", includes, excludes)); 38 | } 39 | 40 | @Test 41 | public void shouldNotSayIsIndexable() { 42 | List<String> includes = Arrays.asList("*.pdf"); 43 | List<String> excludes = Arrays.asList("*.mkv"); 44 | assertFalse(S3RiverUtil.isIndexable("mymovie.mkv", includes, excludes)); 45 | } 46 | 47 | @Test 48 | public void shouldSayIsIndexableWhenNoSpec() { 49 | // No inclusion nor exclusion rules specified: everything is indexable. 50 | assertTrue(S3RiverUtil.isIndexable("mydoc.pdf", null, null)); 51 | } 52 | 53 | @Test 54 | public void shouldSayIsIndexableWhenInclusionsOnly() { 55 | List<String> includes = Arrays.asList("*.pdf"); 56 | List<String> excludes = Arrays.asList(); 57 | // mydoc in inclusions. 58 | assertTrue(S3RiverUtil.isIndexable("mydoc.pdf", includes, excludes)); 59 | } 60 | 61 | @Test 62 | public void shouldNotSayIsIndexableWhenInclusionsOnly() { 63 | List<String> includes = Arrays.asList("*.pdf"); 64 | List<String> excludes = Arrays.asList(); 65 | // mymovie not in inclusions. 66 | assertFalse(S3RiverUtil.isIndexable("mymovie.mkv", includes, excludes)); 67 | } 68 | 69 | @Test 70 | public void shouldSayIsIndexableWhenExclusionsOnly() { 71 | List<String> includes = Arrays.asList(); 72 | List<String> excludes = Arrays.asList("*.mkv"); 73 | // mydoc not in exclusions. 74 | assertTrue(S3RiverUtil.isIndexable("mydoc.pdf", includes, excludes)); 75 | } 76 | 77 | @Test 78 | public void shouldNotSayIsIndexableWhenExclusionsOnly() { 79 | List<String> includes = Arrays.asList(); 80 | List<String> excludes = Arrays.asList("*.mkv"); 81 | // mymovie in exclusions. 82 | assertFalse(S3RiverUtil.isIndexable("mymovie.mkv", includes, excludes)); 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/main/java/com/github/lbroudoux/elasticsearch/river/s3/rest/S3ManageAction.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Laurent Broudoux (the "Author") under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. Author licenses this 6 | * file to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License.
18 | */ 19 | package com.github.lbroudoux.elasticsearch.river.s3.rest; 20 | 21 | import java.io.IOException; 22 | 23 | import org.elasticsearch.client.Client; 24 | import org.elasticsearch.common.inject.Inject; 25 | import org.elasticsearch.common.settings.Settings; 26 | import org.elasticsearch.common.xcontent.XContentBuilder; 27 | import org.elasticsearch.common.xcontent.XContentBuilderString; 28 | import org.elasticsearch.rest.BaseRestHandler; 29 | import org.elasticsearch.rest.BytesRestResponse; 30 | import org.elasticsearch.rest.RestChannel; 31 | import org.elasticsearch.rest.RestController; 32 | import org.elasticsearch.rest.RestRequest; 33 | import org.elasticsearch.rest.RestStatus; 34 | 35 | import org.elasticsearch.rest.RestRequest.Method; 36 | 37 | import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; 38 | /** 39 | * REST actions definition for starting and stopping an Amazon S3 river. 40 | * @author laurent 41 | */ 42 | public class S3ManageAction extends BaseRestHandler{ 43 | 44 | /** The constant for 'start river' command. */ 45 | public static final String START_COMMAND = "_start"; 46 | /** The constant for 'stop river' command. */ 47 | public static final String STOP_COMMAND = "_stop"; 48 | 49 | @Inject 50 | public S3ManageAction(Settings settings, Client client, RestController controller){ 51 | super(settings, controller, client); 52 | 53 | // Define S3 REST endpoints. 54 | controller.registerHandler(Method.GET, "/_s3/{rivername}/{command}", this); 55 | } 56 | 57 | @Override 58 | public void handleRequest(RestRequest request, RestChannel channel, Client client) throws Exception{ 59 | if (logger.isDebugEnabled()){ 60 | logger.debug("REST S3ManageAction called"); 61 | } 62 | 63 | String rivername = request.param("rivername"); 64 | String command = request.param("command"); 65 | 66 | String status = null; 67 | if (START_COMMAND.equals(command)){ 68 | status = "STARTED"; 69 | } else if (STOP_COMMAND.equals(command)){ 70 | status = "STOPPED"; 71 | } 72 | 73 | try{ 74 | if (status != null){ 75 | XContentBuilder xb = jsonBuilder() 76 | .startObject() 77 | .startObject("amazon-s3") 78 | .field("feedname", rivername) 79 | .field("status", status) 80 | .endObject() 81 | .endObject(); 82 | client.prepareIndex("_river", rivername, "_s3status").setSource(xb).execute().actionGet(); 83 | } 84 | 85 | XContentBuilder builder = jsonBuilder(); 86 | builder 87 | .startObject() 88 | .field(new XContentBuilderString("ok"), true) 89 | .endObject(); 90 | channel.sendResponse(new BytesRestResponse(RestStatus.OK, builder)); 91 | } catch (IOException e) { 92 | onFailure(request, channel, e); 93 | } 94 | } 95 | 96 | /** */ 97 | private void onFailure(RestRequest request, RestChannel channel, Exception e) throws Exception{ 98 | try{ 99 | channel.sendResponse(new BytesRestResponse(channel, e)); 100 | } catch (IOException ioe){ 101 | logger.error("Sending failure response fails !", e); 102 | channel.sendResponse(new BytesRestResponse(RestStatus.INTERNAL_SERVER_ERROR)); 103 | } 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /src/main/java/com/github/lbroudoux/elasticsearch/river/s3/river/S3RiverFeedDefinition.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Laurent Broudoux (the "Author") under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. 
Author licenses this 6 | * file to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | package com.github.lbroudoux.elasticsearch.river.s3.river; 20 | 21 | import java.util.List; 22 | /** 23 | * A definition bean wrapping the river feed settings. 24 | * @author laurent 25 | */ 26 | public class S3RiverFeedDefinition{ 27 | 28 | private String feedname; 29 | private String bucket; 30 | private String pathPrefix; 31 | private String downloadHost; 32 | private int updateRate; 33 | private List<String> includes; 34 | private List<String> excludes; 35 | private String accessKey; 36 | private String secretKey; 37 | private boolean useIAMRoleForEC2; 38 | private boolean jsonSupport; 39 | private double indexedCharsRatio = 0; 40 | 41 | public S3RiverFeedDefinition(String feedname, String bucket, String pathPrefix, String downloadHost, int updateRate, 42 | List<String> includes, List<String> excludes, String accessKey, String secretKey, boolean useIAMRoleForEC2, 43 | boolean jsonSupport, double indexedCharsRatio) { 44 | this.feedname = feedname; 45 | this.bucket = bucket; 46 | this.pathPrefix = pathPrefix; 47 | this.downloadHost = downloadHost; 48 | this.updateRate = updateRate; 49 | this.includes = includes; 50 | this.excludes = excludes; 51 | this.accessKey = accessKey; 52 | this.secretKey = secretKey; 53 | this.useIAMRoleForEC2 = useIAMRoleForEC2; 54 | this.jsonSupport = jsonSupport; 55 | this.indexedCharsRatio = indexedCharsRatio; 56 | } 57 | 58 | public String getFeedname() { 59 | return feedname; 60 | } 61 | public void setFeedname(String feedname) { 62 | this.feedname = feedname; 63 | } 64 | 65 | public String getBucket() { 66 | return bucket; 67 | } 68 | public void setBucket(String bucket) { 69 | this.bucket = bucket; 70 | } 71 | 72 | public String getPathPrefix() { 73 | return pathPrefix; 74 | } 75 | public void setPathPrefix(String pathPrefix) { 76 | this.pathPrefix = pathPrefix; 77 | } 78 | 79 | public String getDownloadHost() { 80 | return downloadHost; 81 | } 82 | public void setDownloadHost(String downloadHost) { 83 | this.downloadHost = downloadHost; 84 | } 85 | 86 | public int getUpdateRate() { 87 | return updateRate; 88 | } 89 | public void setUpdateRate(int updateRate) { 90 | this.updateRate = updateRate; 91 | } 92 | 93 | public List<String> getIncludes() { 94 | return includes; 95 | } 96 | public void setIncludes(List<String> includes) { 97 | this.includes = includes; 98 | } 99 | 100 | public List<String> getExcludes() { 101 | return excludes; 102 | } 103 | public void setExcludes(List<String> excludes) { 104 | this.excludes = excludes; 105 | } 106 | 107 | public String getAccessKey() { 108 | return accessKey; 109 | } 110 | public void setAccessKey(String accessKey) { 111 | this.accessKey = accessKey; 112 | } 113 | 114 | public String getSecretKey() { 115 | return secretKey; 116 | } 117 | public void setSecretKey(String secretKey) { 118 | this.secretKey = secretKey; 119 | } 120 | 121 | public boolean isUseIAMRoleForEC2() { 122 | return useIAMRoleForEC2; 123 | } 124 | 125 | public boolean isJsonSupport(){ return jsonSupport; } 126 | 127 | public double getIndexedCharsRatio() { 128 | return indexedCharsRatio; 129 | } 130 | public void setIndexedCharsRatio(double indexedCharsRatio) { 131 | this.indexedCharsRatio = indexedCharsRatio; 132 | } 133 | } 134 |
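To make the settings wiring concrete, here is a small hypothetical sketch (not found in the repository) of how the river creation example from the README would materialize as an `S3RiverFeedDefinition`; every literal value is a placeholder taken from that example:

```java
import java.util.Arrays;

import com.github.lbroudoux.elasticsearch.river.s3.river.S3RiverFeedDefinition;

public class FeedDefinitionSketch {

   public static S3RiverFeedDefinition sampleFeed() {
      return new S3RiverFeedDefinition(
            "My Amazon S3 feed",             // feedname ("name" in the river settings)
            "myownbucket",                   // bucket
            "Work/",                         // pathPrefix
            null,                            // downloadHost (optional CloudFront-like vhost)
            900000,                          // updateRate in ms ("update_rate")
            Arrays.asList("*.doc", "*.pdf"), // includes
            Arrays.asList("*.zip", "*.gz"),  // excludes
            "AAAAAAAAAAAAAAAA",              // accessKey
            "BBBBBBBBBBBBBBBB",              // secretKey
            false,                           // useIAMRoleForEC2
            false,                           // jsonSupport
            0);                              // indexedCharsRatio
   }
}
```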
-------------------------------------------------------------------------------- /src/main/java/com/github/lbroudoux/elasticsearch/river/s3/river/S3RiverUtil.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Laurent Broudoux (the "Author") under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. Author licenses this 6 | * file to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | package com.github.lbroudoux.elasticsearch.river.s3.river; 20 | 21 | import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; 22 | 23 | import java.util.*; 24 | 25 | import org.elasticsearch.common.Strings; 26 | import org.elasticsearch.common.xcontent.XContentBuilder; 27 | import org.elasticsearch.common.xcontent.support.XContentMapValues; 28 | /** 29 | * Utility class for Amazon S3 indexing management. 30 | * @author laurent 31 | */ 32 | public class S3RiverUtil{ 33 | 34 | public static final String INDEX_TYPE_DOC = "doc"; 35 | 36 | public static final String DOC_FIELD_TITLE = "title"; 37 | public static final String DOC_FIELD_MODIFIED_DATE = "modifiedDate"; 38 | public static final String DOC_FIELD_SOURCE_URL = "source_url"; 39 | public static final String DOC_FIELD_METADATA = "metadata"; 40 | 41 | /** 42 | * Build mapping description for Amazon S3 files. 43 | * @param type The name of type for S3 files 44 | * @return A content builder for mapping information 45 | * @throws Exception if something goes wrong 46 | */ 47 | public static XContentBuilder buildS3FileMapping(String type) throws Exception{ 48 | XContentBuilder xbMapping = jsonBuilder().prettyPrint().startObject() 49 | .startObject(type).startObject("properties") 50 | .startObject(DOC_FIELD_TITLE).field("type", "string").field("analyzer","keyword").endObject() 51 | .startObject(DOC_FIELD_MODIFIED_DATE).field("type", "date").endObject() 52 | .startObject(DOC_FIELD_SOURCE_URL).field("type", "string").endObject() 53 | .startObject(DOC_FIELD_METADATA).field("type", "object").endObject() 54 | .startObject("file") 55 | .startObject("properties") 56 | .startObject("title").field("type", "string").field("store", "yes").endObject() 57 | .startObject("file").field("type", "string") 58 | .field("term_vector", "with_positions_offsets") 59 | .field("store", "yes") 60 | .endObject() 61 | .startObject("metadata").field("type", "object").field("store", "yes").endObject() 62 | .endObject() 63 | .endObject() 64 | .endObject().endObject().endObject(); 65 | return xbMapping; 66 | } 67 |
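/*
 * For reference, calling buildS3FileMapping(INDEX_TYPE_DOC) above yields a mapping roughly
 * equivalent to the following JSON (reconstructed by hand from the builder calls):
 *
 * {
 *   "doc": {
 *     "properties": {
 *       "title":        {"type": "string", "analyzer": "keyword"},
 *       "modifiedDate": {"type": "date"},
 *       "source_url":   {"type": "string"},
 *       "metadata":     {"type": "object"},
 *       "file": {
 *         "properties": {
 *           "title":    {"type": "string", "store": "yes"},
 *           "file":     {"type": "string", "term_vector": "with_positions_offsets", "store": "yes"},
 *           "metadata": {"type": "object", "store": "yes"}
 *         }
 *       }
 *     }
 *   }
 * }
 */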
68 | /** 69 | * Extract array from settings (array or comma-delimited String) 70 | * @param settings Settings 71 | * @param path Path to settings definition 72 | * @return Array of settings 73 | */ 74 | @SuppressWarnings("unchecked") 75 | public static String[] buildArrayFromSettings(Map<String, Object> settings, String path){ 76 | String[] includes; 77 | 78 | // We manage comma separated format and arrays 79 | if (XContentMapValues.isArray(XContentMapValues.extractValue(path, settings))) { 80 | List<String> includesarray = (List<String>) XContentMapValues.extractValue(path, settings); 81 | int i = 0; 82 | includes = new String[includesarray.size()]; 83 | for (String include : includesarray) { 84 | includes[i++] = trimAllWhitespace(include); 85 | } 86 | } else { 87 | String includedef = (String) XContentMapValues.extractValue(path, settings); 88 | includes = Strings.commaDelimitedListToStringArray(trimAllWhitespace(includedef)); 89 | } 90 | 91 | String[] uniquelist = removeDuplicateStrings(includes); 92 | 93 | return uniquelist; 94 | } 95 | 96 | /** 97 | * Tells if an Amazon S3 file is indexable from its key (file name), based on includes 98 | * and excludes rules. 99 | * @return true if file should be indexed, false otherwise 100 | */ 101 | public static boolean isIndexable(String key, List<String> includes, List<String> excludes){ 102 | // If no rules are specified, we index everything! 103 | if ((includes == null || includes.isEmpty()) 104 | && (excludes == null || excludes.isEmpty())){ 105 | return true; 106 | } 107 | 108 | // Exclude rules: whatever the include rules are, we should exclude matching files. 109 | if (excludes != null){ 110 | for (String exclude : excludes){ 111 | String regex = exclude.replace("?", ".?").replace("*", ".*?"); 112 | if (key.matches(regex)){ 113 | return false; 114 | } 115 | } 116 | } 117 | 118 | // Include rules: we should add the document only if it matches an include rule. 119 | if (includes == null || includes.isEmpty()){ 120 | return true; 121 | } 122 | if (includes != null){ 123 | for (String include : includes){ 124 | String regex = include.replace("?", ".?").replace("*", ".*?"); 125 | if (key.matches(regex)){ 126 | return true; 127 | } 128 | } 129 | } 130 | 131 | return false; 132 | } 133 |
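/*
 * Illustration of the wildcard rules (hypothetical values, mirroring S3RiverUtilTest):
 *
 *   isIndexable("papers/mydoc.pdf", Arrays.asList("*.pdf"), Arrays.asList("*.mkv"))   // -> true
 *   isIndexable("movies/mymovie.mkv", Arrays.asList("*.pdf"), Arrays.asList("*.mkv")) // -> false
 *
 * Each rule is translated to a regex before matching (e.g. "*.pdf" becomes ".*?.pdf"),
 * and exclusion rules always win over inclusion rules.
 */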
134 | /** 135 | * Trim all whitespace from the given String: leading, trailing, and in between characters. 136 | * @param str the String to check 137 | * @return the trimmed String 138 | * @see java.lang.Character#isWhitespace 139 | */ 140 | public static String trimAllWhitespace(String str) { 141 | if (!Strings.hasLength(str)) { 142 | return str; 143 | } 144 | StringBuilder sb = new StringBuilder(str); 145 | int index = 0; 146 | while (sb.length() > index) { 147 | if (Character.isWhitespace(sb.charAt(index))) { 148 | sb.deleteCharAt(index); 149 | } else { 150 | index++; 151 | } 152 | } 153 | return sb.toString(); 154 | } 155 | 156 | /** 157 | * Remove duplicate Strings from the given array. Also sorts the array, as it uses a TreeSet. 158 | * @param array the String array 159 | * @return an array without duplicates, in natural sort order 160 | */ 161 | public static String[] removeDuplicateStrings(String[] array) { 162 | if (array == null || array.length == 0) { 163 | return array; 164 | } 165 | Set<String> set = new TreeSet<String>(); 166 | set.addAll(Arrays.asList(array)); 167 | return Strings.toStringArray(set); 168 | } 169 | } 170 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | com.github.lbroudoux.elasticsearch 4 | amazon-s3-river 5 | 1.6.1-SNAPSHOT 6 | jar 7 | 8 | 9 | org.sonatype.oss 10 | oss-parent 11 | 7 12 | 13 | 14 | 15 | 16 | The Apache Software License, Version 2.0 17 | http://www.apache.org/licenses/LICENSE-2.0.txt 18 | repo 19 | 20 | 21 | 22 | 23 | laurent 24 | Laurent Broudoux 25 | laurent.broudoux@gmail.com 26 | http://lbroudoux.wordpress.com 27 | +1 28 | 29 | 30 | 31 | scm:git@github.com:lbroudoux/es-amazon-s3-river.git 32 | scm:git:git@github.com:lbroudoux/es-amazon-s3-river.git 33 | scm:git:git@github.com:lbroudoux/es-amazon-s3-river.git 34 | HEAD 35 | 36 | 37 | GitHub 38 | https://github.com/lbroudoux/es-amazon-s3-river/issues/ 39 | 40 | 41 | 42 | UTF-8 43 | 1.6.2 44 | 1.6 45 | 46 | 47 | 48 | 49 | org.elasticsearch 50 | elasticsearch 51 | ${elasticsearch.version} 52 | 53 | 54 | org.apache.tika 55 | tika-core 56 | ${tika.version} 57 | 58 | 59 | org.apache.tika 60 | tika-parsers 61 | ${tika.version} 62 | 63 | 64 | com.amazonaws 65 | aws-java-sdk 66 | 1.6.12 67 | 68 | 69 | junit 70 | junit 71 | 4.1 72 | test 73 | 74 | 75 | org.slf4j 76 | slf4j-log4j12 77 | 1.5.6 78 | test 79 | 80 | 81 | 82 | 83 | 84 | 85 | org.apache.maven.plugins 86 | maven-release-plugin 87 | 2.4 88 | 89 | 90 | org.apache.maven.plugins 91 | maven-source-plugin 92 | 2.2.1 93 | 94 | 95 | attach-sources 96 | package 97 | 98 | jar-no-fork 99 | 100 | 101 | 102 | 103 | 104 | org.apache.maven.plugins 105 | maven-javadoc-plugin 106 | 2.9 107 | 108 | 109 | attach-javadoc 110 | package 111 | 112 | jar 113 | 114 | 115 | 116 | 117 | 118 | 119 | org.apache.maven.plugins 120 | maven-jar-plugin 121 | 2.4 122 | 123 | 124 | 125 | 126 | org.apache.maven.plugins 127 | maven-dependency-plugin 128 | 2.6 129 | 130 | 131 | copy-dependencies 132 | package 133 | 134 | copy-dependencies 135 | 136 | 137 | ${project.build.directory}/lib 138 | 139 | 140 | 141 | 142 | 143 | org.apache.maven.plugins 144 | maven-surefire-plugin 145 | 2.12.4 146 | 147 | true 148 | 149 | 150 | 151 | surefire-test 152 | test 153 | 154 | test 155 | 156 | 157 | false 158 | 159 | ${project.build.directory}/classes/conf 160 | ${project.build.directory}/lib 161 | 162 | 163 | **/itest/** 164 | 165 | 166 | 167 | 168 | surefire-itest 169 | integration-test 170 | 171 | test 172 | 173 | 174 | true 175 | 176 |
${project.build.directory}/classes/conf 177 | ${project.build.directory}/lib 178 | 179 | 180 | **/itest/** 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | org.apache.maven.plugins 190 | maven-assembly-plugin 191 | 2.4 192 | 193 | false 194 | 195 | 196 | ${basedir}/src/main/assemblies/esplugin.xml 197 | 198 | 199 | 200 | 201 | generate-release-plugin 202 | package 203 | 204 | single 205 | 206 | 207 | 208 | 209 | 210 | org.apache.maven.plugins 211 | maven-compiler-plugin 212 | 3.0 213 | 214 | 1.6 215 | 1.6 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | release 224 | 225 | 226 | 227 | org.apache.maven.plugins 228 | maven-gpg-plugin 229 | 1.4 230 | 231 | 232 | sign-artifacts 233 | verify 234 | 235 | sign 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | -------------------------------------------------------------------------------- /src/main/java/com/github/lbroudoux/elasticsearch/river/s3/connector/S3Connector.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Laurent Broudoux (the "Author") under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. Author licenses this 6 | * file to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | package com.github.lbroudoux.elasticsearch.river.s3.connector; 20 | 21 | import java.io.ByteArrayOutputStream; 22 | import java.io.IOException; 23 | import java.io.InputStream; 24 | import java.util.ArrayList; 25 | import java.util.Collections; 26 | import java.util.List; 27 | import java.util.Map; 28 | 29 | import com.amazonaws.auth.InstanceProfileCredentialsProvider; 30 | import com.amazonaws.services.s3.model.*; 31 | import org.elasticsearch.common.logging.ESLogger; 32 | import org.elasticsearch.common.logging.Loggers; 33 | 34 | import com.amazonaws.auth.AWSCredentials; 35 | import com.amazonaws.auth.BasicAWSCredentials; 36 | import com.amazonaws.services.s3.AmazonS3Client; 37 | import com.github.lbroudoux.elasticsearch.river.s3.river.S3RiverFeedDefinition; 38 | /** 39 | * This is a connector for querying and retrieving files or folders from 40 | * an Amazon S3 bucket. Credentials or an IAM role are required for connecting to the remote bucket. 41 | * @author laurent 42 | */ 43 | public class S3Connector{ 44 | 45 | private static final ESLogger logger = Loggers.getLogger(S3Connector.class); 46 | 47 | private final String accessKey; 48 | private final String secretKey; 49 | private boolean useIAMRoleForEC2 = false; 50 | private String bucketName; 51 | private String pathPrefix; 52 | private AmazonS3Client s3Client; 53 | 54 | /** 55 | * Create a S3Connector without explicit security credentials. This is helpful if you want 56 | * to use IAM Roles as described here http://docs.aws.amazon.com/AWSSdkDocsJava/latest/DeveloperGuide/java-dg-roles.html.
57 | */ 58 | public S3Connector(boolean useIAMRoleForEC2) { 59 | this.accessKey = null; 60 | this.secretKey = null; 61 | this.useIAMRoleForEC2 = useIAMRoleForEC2; 62 | } 63 | 64 | /** 65 | * Create a S3Connector with provided security credentials. 66 | * @param accessKey The AWS access key such as provided by AWS console 67 | * @param secretKey The AWS secret key such as provided by AWS console 68 | */ 69 | public S3Connector(String accessKey, String secretKey){ 70 | this.accessKey = accessKey; 71 | this.secretKey = secretKey; 72 | } 73 | 74 | /** 75 | * Connect to the specified bucket using the previously given access key and secret key. 76 | * @param bucketName Name of the bucket to connect to 77 | * @param pathPrefix Prefix that will be later used for filtering documents 78 | * @throws AmazonS3Exception when access or secret keys are wrong or bucket does not exist 79 | */ 80 | public void connectUserBucket(String bucketName, String pathPrefix) throws AmazonS3Exception{ 81 | this.bucketName = bucketName; 82 | this.pathPrefix = pathPrefix; 83 | if (accessKey != null && secretKey != null) { 84 | AWSCredentials credentials = new BasicAWSCredentials(accessKey, secretKey); 85 | s3Client = new AmazonS3Client(credentials); 86 | } else if (useIAMRoleForEC2) { 87 | // Force usage of IAM Role process as described in 88 | // http://docs.aws.amazon.com/AWSSdkDocsJava/latest/DeveloperGuide/java-dg-roles.html. 89 | s3Client = new AmazonS3Client(new InstanceProfileCredentialsProvider()); 90 | } else { 91 | // Default credentials retrieval or IAM Role process as described in 92 | // http://docs.aws.amazon.com/AWSSdkDocsJava/latest/DeveloperGuide/java-dg-roles.html. 93 | s3Client = new AmazonS3Client(); 94 | } 95 | // Getting location seems odd as we don't use it later and doesBucketExist() seems 96 | // more appropriate... However, the latter returns true even for non-existing buckets! 97 | s3Client.getBucketLocation(bucketName); 98 | } 99 | 100 | /** 101 | * Select and retrieve summaries of objects in the bucket under the given path prefix 102 | * that have a modification date newer than lastScanTime. 103 | * @param lastScanTime Last modification date filter 104 | * @return Summaries of picked objects. 105 | */ 106 | public S3ObjectSummaries getObjectSummaries(Long lastScanTime){ 107 | if (logger.isDebugEnabled()){ 108 | logger.debug("Getting bucket changes since {}", lastScanTime); 109 | } 110 | List<String> keys = new ArrayList<String>(); 111 | List<S3ObjectSummary> result = new ArrayList<S3ObjectSummary>(); 112 | 113 | // Store the scan time to return before doing big queries...
114 | Long lastScanTimeToReturn = System.currentTimeMillis(); 115 | if (lastScanTime == null){ 116 | lastScanTime = 0L; 117 | } 118 | 119 | ListObjectsRequest request = new ListObjectsRequest().withBucketName(bucketName) 120 | .withPrefix(pathPrefix); 121 | ObjectListing listing = s3Client.listObjects(request); 122 | logger.debug("Listing: {}", listing); 123 | while (!listing.getObjectSummaries().isEmpty() || listing.isTruncated()){ 124 | List<S3ObjectSummary> summaries = listing.getObjectSummaries(); 125 | if (logger.isDebugEnabled()){ 126 | logger.debug("Found {} items in this listObjects page", summaries.size()); 127 | } 128 | for (S3ObjectSummary summary : summaries){ 129 | if (logger.isDebugEnabled()){ 130 | logger.debug("Getting {} last modified on {}", summary.getKey(), summary.getLastModified()); 131 | } 132 | keys.add(summary.getKey()); 133 | if (summary.getLastModified().getTime() > lastScanTime){ 134 | logger.debug(" Picked!"); 135 | result.add(summary); 136 | } 137 | } 138 | listing = s3Client.listNextBatchOfObjects(listing); 139 | } 140 | 141 | // Wrap results and latest scan time. 142 | return new S3ObjectSummaries(lastScanTimeToReturn, result, keys); 143 | } 144 | 145 | public Map<String, String> getS3UserMetadata(String key){ 146 | return Collections.unmodifiableMap(s3Client.getObjectMetadata(bucketName, key).getUserMetadata()); 147 | } 148 | 149 | /** 150 | * Download Amazon S3 file as byte array. 151 | * @param summary The summary of the S3 Object to download 152 | * @return This file bytes or null if something goes wrong. 153 | */ 154 | public byte[] getContent(S3ObjectSummary summary){ 155 | if (logger.isDebugEnabled()){ 156 | logger.debug("Downloading file content from {}", summary.getKey()); 157 | } 158 | // Retrieve object corresponding to key into bucket. 159 | S3Object object = s3Client.getObject(bucketName, summary.getKey()); 160 | 161 | InputStream is = null; 162 | ByteArrayOutputStream bos = null; 163 | 164 | try{ 165 | // Get input stream on S3 Object. 166 | is = object.getObjectContent(); 167 | bos = new ByteArrayOutputStream(); 168 | 169 | byte[] buffer = new byte[4096]; 170 | int len; 171 | while ((len = is.read(buffer)) != -1) { 172 | bos.write(buffer, 0, len); 173 | } 174 | 175 | // Flush and return result. 176 | bos.flush(); 177 | return bos.toByteArray(); 178 | } catch (IOException e) { 179 | logger.error("Error while downloading file content", e); 180 | return null; 181 | } finally { 182 | if (bos != null){ 183 | try{ 184 | bos.close(); 185 | } catch (IOException e) { 186 | } 187 | } 188 | if (is != null){ 189 | try{ 190 | is.close(); 191 | } catch (IOException e) { 192 | } 193 | } 194 | } 195 | } 196 | 197 | /** 198 | * Get the download url of this S3 object. May return null if the 199 | * object bucket and key cannot be converted to a URL. 200 | * @param summary A S3 object 201 | * @param feedDefinition The holder of S3 feed definition. 202 | * @return The resource url if possible (access is subject to AWS credential) 203 | */ 204 | public String getDownloadUrl(S3ObjectSummary summary, S3RiverFeedDefinition feedDefinition){ 205 | String resourceUrl = s3Client.getResourceUrl(summary.getBucketName(), summary.getKey()); 206 | // If a download host (actually a vhost such as cloudfront offers) is specified, use it to 207 | // recreate a vhosted resource url. This is done by substituting the generic host name in the url.
207 | if (resourceUrl != null && feedDefinition.getDownloadHost() != null){ 208 | int hostPosEnd = resourceUrl.indexOf("s3.amazonaws.com/") + "s3.amazonaws.com".length(); 209 | String vhostResourceUrl = feedDefinition.getDownloadHost() + resourceUrl.substring(hostPosEnd); 210 | return vhostResourceUrl; 211 | } 212 | return resourceUrl; 213 | } 214 | } 215 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | es-amazon-s3-river 2 | ================== 3 | 4 | Amazon S3 river for Elasticsearch 5 | 6 | This river plugin helps to index documents from a Amazon S3 account buckets. 7 | 8 | *WARNING*: For 0.0.1 released version, you need to have the [Attachment Plugin](https://github.com/elasticsearch/elasticsearch-mapper-attachments). 

*WARNING*: Starting from 0.0.2, you no longer need the [Attachment Plugin](https://github.com/elasticsearch/elasticsearch-mapper-attachments): we now use [Tika](http://tika.apache.org/) directly, see [issue #2](https://github.com/lbroudoux/es-amazon-s3-river/issues/2).

Versions
--------

| Amazon S3 River Plugin   | Elasticsearch   | Attachment Plugin | Tika |
|--------------------------|-----------------|-------------------|------|
| master (1.6.1-SNAPSHOT)  | 1.6.x and 1.7.x | Not used anymore  | 1.6  |
| 1.6.0                    | 1.6.x and 1.7.x | Not used anymore  | 1.6  |
| 1.4.1                    | 1.4.x and 1.5.x | Not used anymore  | 1.6  |
| 1.4.0                    | 1.4.x and 1.5.x | Not used anymore  | 1.6  |
| 1.3.0                    | 1.3.x           | Not used anymore  | 1.4  |
| 1.2.0                    | 1.2.x           | Not used anymore  | 1.4  |
| 0.0.4                    | 1.0.x and 1.1.x | Not used anymore  | 1.4  |
| 0.0.3                    | 1.0.0           | Not used anymore  | 1.4  |
| 0.0.2                    | 0.90.0          | Not used anymore  | 1.4  |
| 0.0.1                    | 0.90.0          | 1.7.0             |      |
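
To double-check which plugin version is actually installed on a running node, you can, for instance, list installed plugins through the cat API (assuming an Elasticsearch 1.x cluster):

```sh
$ curl 'http://localhost:9200/_cat/plugins?v'
```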

Build Status
------------

Travis CI [![Build Status](https://travis-ci.org/lbroudoux/es-amazon-s3-river.png?branch=master)](https://travis-ci.org/lbroudoux/es-amazon-s3-river)


Getting Started
===============

Installation
------------

Just install it as a regular Elasticsearch plugin by typing:

```sh
$ bin/plugin --install com.github.lbroudoux.elasticsearch/amazon-s3-river/1.6.0
```

This will do the job:

```
-> Installing com.github.lbroudoux.elasticsearch/amazon-s3-river/1.6.0...
Trying http://download.elasticsearch.org/com.github.lbroudoux.elasticsearch/amazon-s3-river/amazon-s3-river-1.6.0.zip...
Trying http://search.maven.org/remotecontent?filepath=com/github/lbroudoux/elasticsearch/amazon-s3-river/1.6.0/amazon-s3-river-1.6.0.zip...
Downloading ......DONE
Installed amazon-s3-river
```


Get Amazon AWS credentials (accessKey and secretKey)
----------------------------------------------------

First, you need to log in to the Amazon AWS account owning the S3 bucket and retrieve your security credentials by visiting this [page](https://portal.aws.amazon.com/gp/aws/securityCredentials).

Once done, note your `accessKey` and `secretKey`.


Creating an Amazon S3 river
---------------------------

We first create an index to store our *documents* (optional):

```sh
$ curl -XPUT 'http://localhost:9200/mys3docs/' -d '{}'
```

We then create the river with the following properties:

* accessKey: AAAAAAAAAAAAAAAA
* secretKey: BBBBBBBBBBBBBBBB
* Amazon S3 bucket to index: `myownbucket`
* Path prefix to index within this bucket: `Work/` (optional; if specified, it should be an existing path including the trailing `/`)
* Update rate: every 15 minutes (15 * 60 * 1000 = 900000 ms)
* Only get docs like `*.doc` and `*.pdf`
* Don't index `*.zip` and `*.gz`

```sh
$ curl -XPUT 'http://localhost:9200/_river/mys3docs/_meta' -d '{
  "type": "amazon-s3",
  "amazon-s3": {
    "accessKey": "AAAAAAAAAAAAAAAA",
    "secretKey": "BBBBBBBBBBBBBBBB",
    "name": "My Amazon S3 feed",
    "bucket": "myownbucket",
    "pathPrefix": "Work/",
    "update_rate": 900000,
    "includes": "*.doc,*.pdf",
    "excludes": "*.zip,*.gz"
  }
}'
```

By default, the river uses an index with the same name as the river (`mys3docs` in the above example).

*From 0.0.2 version*

The `source_url` of documents is now stored within the Elasticsearch index, so that you can later access the whole document content from your application (this is indeed a use case coming from [Scrutmydocs](http://www.scrutmydocs.org)).

By default, the plugin uses what is called the *resourceUrl* of an S3 bucket document. If the document has been made public within S3, it can be accessed directly from your browser. If not, the stored url is intended to be used by a regular S3 client holding credentials that are allowed to access the document.

Another option to easily distribute S3 content is to set up a Web proxy in front of S3, such as CloudFront (see
[Serving Private Content With CloudFront](http://docs.aws.amazon.com/AmazonCloudFront/latest/DeveloperGuide/PrivateContent.html)).
In that latter case, you'll want to rewrite `source_url`, substituting the S3 part with your own host name. This
plugin allows you to do that by specifying a `download_host` river property.
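
As an illustration, here is a minimal sketch of such a river declaration (the `download_host` value is hypothetical):

```sh
$ curl -XPUT 'http://localhost:9200/_river/mys3docs/_meta' -d '{
  "type": "amazon-s3",
  "amazon-s3": {
    "accessKey": "AAAAAAAAAAAAAAAA",
    "secretKey": "BBBBBBBBBBBBBBBB",
    "name": "My Amazon S3 feed",
    "bucket": "myownbucket",
    "pathPrefix": "Work/",
    "update_rate": 900000,
    "download_host": "http://docs.example.com"
  }
}'
```

Stored `source_url` values should then point to your proxy host instead of the plain S3 one.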


Specifying index options
------------------------

Index options can be specified when creating an amazon-s3 river. The properties are the following:

* Index name: `amazondocs`
* Type of documents: `doc`
* Size of an indexation bulk: 50 (default is 100)

You'll have to use them as follows when creating a river:

```sh
$ curl -XPUT 'http://localhost:9200/_river/mys3docs/_meta' -d '{
  "type": "amazon-s3",
  "amazon-s3": {
    "accessKey": "AAAAAAAAAAAAAAAA",
    "secretKey": "BBBBBBBBBBBBBBBB",
    "name": "My Amazon S3 feed",
    "bucket": "myownbucket",
    "pathPrefix": "Work/",
    "update_rate": 900000,
    "includes": "*.doc,*.pdf",
    "excludes": "*.zip,*.gz"
  },
  "index": {
    "index": "amazondocs",
    "type": "doc",
    "bulk_size": 50
  }
}'
```

Indexing Json documents
-----------------------

*From 0.0.4 version*

If you want to index Json files directly, without parsing them through Tika, you can set the `json_support` configuration option to `true` like this:

```sh
$ curl -XPUT 'http://localhost:9200/_river/mys3docs/_meta' -d '{
  "type": "amazon-s3",
  "amazon-s3": {
    "accessKey": "AAAAAAAAAAAAAAAA",
    "secretKey": "BBBBBBBBBBBBBBBB",
    "name": "My Amazon S3 feed",
    "bucket": "myownbucket",
    "pathPrefix": "Jsons/",
    "update_rate": 900000,
    "json_support": true,
    "includes": "*.json"
  }
}'
```

Be sure to correctly use `includes` or `excludes` in your river configuration so that only Json documents are retrieved.

When `json_support` is enabled and you did not define a mapping prior to creating the river, the river *will not* automatically generate a mapping as described below in the Advanced section; Elasticsearch will then guess the mapping by itself.


Advanced
========

Management actions
------------------

If you need to stop a river, you can call the `_s3` endpoint with your river name followed by the `_stop` command, like this:

```sh
GET _s3/mys3docs/_stop
```

To restart the river from the previous point, just call the corresponding `_start` endpoint:

```sh
GET _s3/mys3docs/_start
```

Extracted characters
--------------------

*From 1.4.1 version*

By default, this river plugin extracts only a limited number of characters per document (up to 100000, which is the default limit used with Tika). This may not be sufficient for big documents. You can override this limit using the `indexed_chars_ratio` river option like this:

```sh
$ curl -XPUT 'http://localhost:9200/_river/mys3docs/_meta' -d '{
  "type": "amazon-s3",
  "amazon-s3": {
    "accessKey": "AAAAAAAAAAAAAAAA",
    "secretKey": "BBBBBBBBBBBBBBBB",
    "name": "My Amazon S3 feed",
    "bucket": "myownbucket",
    "pathPrefix": "Work/",
    "indexed_chars_ratio": 1
  }
}'
```

`indexed_chars_ratio` must be a positive double number. Setting `indexed_chars_ratio` to `x` makes the river take the
file size, multiply it by `x` and pass the result to Tika as the extraction limit; a value of `1` thus extracts exactly
as many characters as the file size.
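
As a quick worked example (the file size is illustrative): with `"indexed_chars_ratio": 0.5`, a 300000-byte document makes the river request `round(300000 * 0.5) = 150000` characters from Tika.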

Along the same lines, a value of `0.8` will extract 20% fewer characters than the file size, while a value of `1.5`
will extract 50% more characters than the file size (think of compressed files).

Note that Tika requires allocating an in-memory data structure to extract text, so setting `indexed_chars_ratio` to a
high number will require more memory!


Credential keys security and IAM Role
-------------------------------------

*From 1.4.1 version*

Transferring `accessKey` and `secretKey` as river creation options is not always applicable, depending on your context,
and may lead to exposure of these keys. From the 1.4.1 version, you now have the ability to:

* either use the default credential retrieval process that checks system variables and configuration files,
* or force the usage of an IAM Role if your nodes are running directly on an Amazon EC2 instance.

We recommend checking http://docs.aws.amazon.com/AWSSdkDocsJava/latest/DeveloperGuide/java-dg-roles.html for an
explanation of the credential retrieval process.

The behaviour of this river plugin is now the following:

* `accessKey` and `secretKey` are no longer mandatory fields. If they are not provided at river creation, the river
will just try to connect to your S3 bucket using the default provider chain,
* the new option `use_EC2_IAM` can be set to `true` to force the usage of the EC2 IAM Role.

In action, this leads to something like the following when creating the river:

```sh
$ curl -XPUT 'http://localhost:9200/_river/mys3docs/_meta' -d '{
  "type": "amazon-s3",
  "amazon-s3": {
    "name": "My Amazon S3 feed",
    "bucket": "myownbucket",
    "pathPrefix": "Work/",
    "use_EC2_IAM": true
  }
}'
```

Autogenerated mapping
---------------------

When the river detects a new type, it automatically creates a mapping for this type.

```javascript
{
  "doc" : {
    "properties" : {
      "title" : {
        "type" : "string",
        "analyzer" : "keyword"
      },
      "modifiedDate" : {
        "type" : "date",
        "format" : "dateOptionalTime"
      },
      "file" : {
        "type" : "attachment",
        "fields" : {
          "file" : {
            "type" : "string",
            "store" : "yes",
            "term_vector" : "with_positions_offsets"
          },
          "title" : {
            "type" : "string",
            "store" : "yes"
          }
        }
      }
    }
  }
}
```

*From 0.0.2 version*

We now use Tika directly instead of the mapper-attachments plugin.

```javascript
{
  "doc" : {
    "properties" : {
      "title" : {
        "type" : "string",
        "analyzer" : "keyword"
      },
      "modifiedDate" : {
        "type" : "date",
        "format" : "dateOptionalTime"
      },
      "source_url" : {
        "type" : "string"
      },
      "file" : {
        "properties" : {
          "file" : {
            "type" : "string",
            "store" : "yes",
            "term_vector" : "with_positions_offsets"
          },
          "title" : {
            "type" : "string",
            "store" : "yes"
          }
        }
      }
    }
  }
}
```


License
=======

```
This software is licensed under the Apache 2 license, quoted below.

Copyright 2013-2015 Laurent Broudoux

Licensed under the Apache License, Version 2.0 (the "License"); you may not
use this file except in compliance with the License. You may obtain a copy of
the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
License for the specific language governing permissions and limitations under
the License.
```
--------------------------------------------------------------------------------
/src/main/java/com/github/lbroudoux/elasticsearch/river/s3/river/S3River.java:
--------------------------------------------------------------------------------
/*
 * Licensed to Laurent Broudoux (the "Author") under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Author licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package com.github.lbroudoux.elasticsearch.river.s3.river;

import java.util.*;

import com.amazonaws.services.s3.model.AmazonS3Exception;
import org.apache.tika.metadata.Metadata;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.action.admin.indices.mapping.put.PutMappingResponse;
import org.elasticsearch.action.bulk.*;
import org.elasticsearch.action.get.GetResponse;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.client.Client;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.block.ClusterBlockException;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.cluster.metadata.MappingMetaData;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.io.stream.BytesStreamInput;
import org.elasticsearch.common.util.concurrent.EsExecutors;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.support.XContentMapValues;
import org.elasticsearch.indices.IndexAlreadyExistsException;
import org.elasticsearch.river.AbstractRiverComponent;
import org.elasticsearch.river.River;
import org.elasticsearch.river.RiverName;
import org.elasticsearch.river.RiverSettings;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.threadpool.ThreadPool;

import com.amazonaws.services.s3.model.S3ObjectSummary;
import com.github.lbroudoux.elasticsearch.river.s3.connector.S3ObjectSummaries;
import com.github.lbroudoux.elasticsearch.river.s3.connector.S3Connector;

import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;

/**
 * A River component for scanning and indexing Amazon S3 documents into Elasticsearch.
 * @author laurent
 */
public class S3River extends AbstractRiverComponent implements River{

   private final Client client;

   private final ThreadPool threadPool;

   private final String indexName;

   private final String typeName;

   private final int bulkSize;

   private RiverStatus riverStatus;

   private volatile Thread feedThread;

   private volatile BulkProcessor bulkProcessor;

   private volatile boolean closed = false;

   private final S3RiverFeedDefinition feedDefinition;

   private final S3Connector s3;


   @Inject
   @SuppressWarnings({ "unchecked" })
   protected S3River(RiverName riverName, RiverSettings settings, Client client, ThreadPool threadPool) throws Exception{
      super(riverName, settings);
      this.client = client;
      this.threadPool = threadPool;
      this.riverStatus = RiverStatus.UNKNOWN;

      // Deal with connector settings.
      if (settings.settings().containsKey("amazon-s3")){
         Map<String, Object> feed = (Map<String, Object>) settings.settings().get("amazon-s3");

         // Retrieve feed settings.
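         // Assumed defaults when options are omitted below: update_rate falls back to
         // 15 minutes, json_support to false and indexed_chars_ratio to 0.0 (which
         // later translates into the fixed limit of 100000 extracted characters).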
         String feedname = XContentMapValues.nodeStringValue(feed.get("name"), null);
         String bucket = XContentMapValues.nodeStringValue(feed.get("bucket"), null);
         String pathPrefix = XContentMapValues.nodeStringValue(feed.get("pathPrefix"), null);
         String downloadHost = XContentMapValues.nodeStringValue(feed.get("download_host"), null);
         int updateRate = XContentMapValues.nodeIntegerValue(feed.get("update_rate"), 15 * 60 * 1000);
         boolean jsonSupport = XContentMapValues.nodeBooleanValue(feed.get("json_support"), false);
         double indexedCharsRatio = XContentMapValues.nodeDoubleValue(feed.get("indexed_chars_ratio"), 0.0);

         String[] includes = S3RiverUtil.buildArrayFromSettings(settings.settings(), "amazon-s3.includes");
         String[] excludes = S3RiverUtil.buildArrayFromSettings(settings.settings(), "amazon-s3.excludes");

         // Retrieve connection settings.
         String accessKey = XContentMapValues.nodeStringValue(feed.get("accessKey"), null);
         String secretKey = XContentMapValues.nodeStringValue(feed.get("secretKey"), null);
         boolean useIAMRoleForEC2 = XContentMapValues.nodeBooleanValue(feed.get("use_EC2_IAM"), false);

         feedDefinition = new S3RiverFeedDefinition(feedname, bucket, pathPrefix, downloadHost,
               updateRate, Arrays.asList(includes), Arrays.asList(excludes), accessKey, secretKey, useIAMRoleForEC2,
               jsonSupport, indexedCharsRatio);
      } else {
         logger.error("You didn't define the amazon-s3 settings. Exiting... See https://github.com/lbroudoux/es-amazon-s3-river");
         indexName = null;
         typeName = null;
         bulkSize = 100;
         feedDefinition = null;
         s3 = null;
         return;
      }

      // Deal with index settings if provided.
      if (settings.settings().containsKey("index")) {
         Map<String, Object> indexSettings = (Map<String, Object>) settings.settings().get("index");

         indexName = XContentMapValues.nodeStringValue(indexSettings.get("index"), riverName.name());
         typeName = XContentMapValues.nodeStringValue(indexSettings.get("type"), S3RiverUtil.INDEX_TYPE_DOC);
         bulkSize = XContentMapValues.nodeIntegerValue(indexSettings.get("bulk_size"), 100);
      } else {
         indexName = riverName.name();
         typeName = S3RiverUtil.INDEX_TYPE_DOC;
         bulkSize = 100;
      }

      // We need to connect to Amazon S3 after ensuring mandatory settings are here.
      if (feedDefinition.getBucket() == null){
         logger.error("Amazon S3 bucket should not be null. Please fix this.");
         throw new IllegalArgumentException("Amazon S3 bucket should not be null.");
      }
      // Connect using the appropriate authentication process.
      if (feedDefinition.getAccessKey() == null && feedDefinition.getSecretKey() == null) {
         s3 = new S3Connector(feedDefinition.isUseIAMRoleForEC2());
      } else {
         s3 = new S3Connector(feedDefinition.getAccessKey(), feedDefinition.getSecretKey());
      }
      try {
         s3.connectUserBucket(feedDefinition.getBucket(), feedDefinition.getPathPrefix());
      } catch (AmazonS3Exception ase){
         logger.error("Exception while connecting Amazon S3 user bucket. "
               + "Either access key, secret key, IAM Role or bucket name are incorrect");
         throw ase;
      }

      this.riverStatus = RiverStatus.INITIALIZED;
   }

   @Override
   public void start(){
      if (logger.isInfoEnabled()){
         logger.info("Starting amazon s3 river scanning");
      }

      this.riverStatus = RiverStatus.STARTING;
      // Let's start this in another thread so we won't block the start process.
      threadPool.generic().execute(new Runnable() {
         @Override
         public void run() {
            // We are first waiting for a yellow state at least.
            logger.debug("Waiting for yellow status");
            client.admin().cluster().prepareHealth("_river").setWaitForYellowStatus().get();
            logger.debug("Yellow or green status received");

            try {
               // Create the index if it doesn't exist.
               if (!client.admin().indices().prepareExists(indexName).execute().actionGet().isExists()) {
                  client.admin().indices().prepareCreate(indexName).execute().actionGet();
               }
            } catch (Exception e) {
               if (ExceptionsHelper.unwrapCause(e) instanceof IndexAlreadyExistsException){
                  // That's fine.
               } else if (ExceptionsHelper.unwrapCause(e) instanceof ClusterBlockException){
                  // Ok, not recovered yet..., lets start indexing and hope we recover by the first bulk.
               } else {
                  logger.warn("failed to create index [{}], disabling river...", e, indexName);
                  return;
               }
            }

            try {
               // If needed, we create the new mapping for files.
               if (!feedDefinition.isJsonSupport()) {
                  pushMapping(indexName, typeName, S3RiverUtil.buildS3FileMapping(typeName));
               }
            } catch (Exception e) {
               logger.warn("Failed to create mapping for [{}/{}], disabling river...",
                     e, indexName, typeName);
               return;
            }

            // Creating bulk processor.
            bulkProcessor = BulkProcessor.builder(client, new BulkProcessor.Listener() {
               @Override
               public void beforeBulk(long id, BulkRequest request) {
                  logger.debug("Going to execute new bulk composed of {} actions", request.numberOfActions());
               }

               @Override
               public void afterBulk(long id, BulkRequest request, BulkResponse response) {
                  logger.debug("Executed bulk composed of {} actions", request.numberOfActions());
                  if (response.hasFailures()) {
                     logger.warn("There were failures while executing bulk: {}", response.buildFailureMessage());
                     if (logger.isDebugEnabled()) {
                        for (BulkItemResponse item : response.getItems()) {
                           if (item.isFailed()) {
                              logger.debug("Error for {}/{}/{} for {} operation: {}", item.getIndex(),
                                    item.getType(), item.getId(), item.getOpType(), item.getFailureMessage());
                           }
                        }
                     }
                  }
               }

               @Override
               public void afterBulk(long id, BulkRequest request, Throwable throwable) {
                  logger.warn("Error executing bulk", throwable);
               }
            })
            .setBulkActions(bulkSize)
            .build();

            // Start the dedicated scanner thread for this feed.
            feedThread = EsExecutors.daemonThreadFactory(settings.globalSettings(), "fs_slurper")
                  .newThread(new S3Scanner(feedDefinition));
            feedThread.start();
            riverStatus = RiverStatus.RUNNING;
         }
      });

   }

   @Override
   public void close(){
      if (logger.isInfoEnabled()){
         logger.info("Closing amazon s3 river");
      }
      closed = true;
      riverStatus = RiverStatus.STOPPING;

      // We have to close the Thread.
      if (feedThread != null){
         feedThread.interrupt();
      }
      riverStatus = RiverStatus.STOPPED;
   }

   /**
    * Check if a mapping already exists in an index.
    * @param index Index name
    * @param type Mapping name
    * @return true if mapping exists
    */
   private boolean isMappingExist(String index, String type) {
      ClusterState cs = client.admin().cluster().prepareState()
            .setIndices(index).execute().actionGet()
            .getState();
      // Check index metadata existence.
      IndexMetaData imd = cs.getMetaData().index(index);
      if (imd == null){
         return false;
      }
      // Check mapping metadata existence.
      MappingMetaData mdd = imd.mapping(type);
      return mdd != null;
   }

   private void pushMapping(String index, String type, XContentBuilder xcontent) throws Exception {
      if (logger.isTraceEnabled()){
         logger.trace("pushMapping(" + index + ", " + type + ")");
      }

      // If the type does not exist, we create it.
      boolean mappingExist = isMappingExist(index, type);
      if (!mappingExist) {
         logger.debug("Mapping [" + index + "]/[" + type + "] doesn't exist. Creating it.");

         // Use the provided mapping definition if any.
         if (xcontent != null){
            if (logger.isTraceEnabled()){
               logger.trace("Mapping for [" + index + "]/[" + type + "]=" + xcontent.string());
            }
            // Create type and mapping.
            PutMappingResponse response = client.admin().indices()
                  .preparePutMapping(index)
                  .setType(type)
                  .setSource(xcontent)
                  .execute().actionGet();
            if (!response.isAcknowledged()){
               throw new Exception("Could not define mapping for type [" + index + "]/[" + type + "].");
            } else if (logger.isDebugEnabled()){
               // We can only reach this point when the mapping didn't exist before.
               logger.debug("Mapping definition for [" + index + "]/[" + type + "] successfully created.");
            }
         } else {
            if (logger.isDebugEnabled()){
               logger.debug("No mapping definition for [" + index + "]/[" + type + "]. Ignoring.");
            }
         }
      } else {
         if (logger.isDebugEnabled()){
            logger.debug("Mapping [" + index + "]/[" + type + "] already exists and mergeMapping is not set.");
         }
      }
      if (logger.isTraceEnabled()){
         logger.trace("/pushMapping(" + index + ", " + type + ")");
      }
   }

   /** A Runnable that periodically scans the S3 bucket and indexes its new content. */
   private class S3Scanner implements Runnable{

      private BulkRequestBuilder bulk;
      private S3RiverFeedDefinition feedDefinition;

      public S3Scanner(S3RiverFeedDefinition feedDefinition){
         this.feedDefinition = feedDefinition;
      }

      @Override
      public void run(){
         while (true){
            if (closed){
               return;
            }

            try{
               if (isStarted()){
                  // Scan folder starting from last changes id, then record the new one.
                  Long lastScanTime = getLastScanTimeFromRiver("_lastScanTime");
                  lastScanTime = scan(lastScanTime);
                  updateRiver("_lastScanTime", lastScanTime);
               } else {
                  logger.info("Amazon S3 River is disabled for {}", riverName().name());
               }
            } catch (Exception e){
               logger.warn("Error while indexing content from {}", feedDefinition.getBucket());
               if (logger.isDebugEnabled()){
                  logger.debug("Exception for folder {} is {}", feedDefinition.getBucket(), e);
                  e.printStackTrace();
               }
            }

            try {
               if (logger.isDebugEnabled()){
                  logger.debug("Amazon S3 river is going to sleep for {} ms", feedDefinition.getUpdateRate());
               }
               Thread.sleep(feedDefinition.getUpdateRate());
            } catch (InterruptedException ie){
               // Interrupted while sleeping (likely because the river is closing):
               // loop around and honor the closed flag.
            }
         }
      }

      private boolean isStarted(){
         // Refresh index before querying it.
         client.admin().indices().prepareRefresh("_river").execute().actionGet();
         GetResponse isStartedGetResponse = client.prepareGet("_river", riverName().name(), "_s3status").execute().actionGet();
         try{
            if (!isStartedGetResponse.isExists()){
               XContentBuilder xb = jsonBuilder().startObject()
                     .startObject("amazon-s3")
                        .field("feedname", feedDefinition.getFeedname())
                        .field("status", "STARTED").endObject()
                     .endObject();
               client.prepareIndex("_river", riverName.name(), "_s3status").setSource(xb).execute();
               return true;
            } else {
               String status = (String) XContentMapValues.extractValue("amazon-s3.status", isStartedGetResponse.getSourceAsMap());
               if ("STOPPED".equals(status)){
                  return false;
               }
            }
         } catch (Exception e){
            logger.warn("failed to get status for " + riverName().name() + ", throttling....", e);
         }
         return true;
      }

      @SuppressWarnings("unchecked")
      private Long getLastScanTimeFromRiver(String lastScanTimeField){
         Long result = null;
         try {
            // Refresh the _river index so that we read the latest stored scan time.
            client.admin().indices().prepareRefresh("_river").execute().actionGet();
            GetResponse lastSeqGetResponse = client.prepareGet("_river", riverName().name(),
                  lastScanTimeField).execute().actionGet();
            if (lastSeqGetResponse.isExists()) {
               Map<String, Object> fsState = (Map<String, Object>) lastSeqGetResponse.getSourceAsMap().get("amazon-s3");

               if (fsState != null){
                  Object lastScanTime = fsState.get(lastScanTimeField);
                  if (lastScanTime != null){
                     try{
                        result = Long.parseLong(lastScanTime.toString());
                     } catch (NumberFormatException nfe){
                        logger.warn("Last recorded lastScanTime is not a Long {}", lastScanTime.toString());
                     }
                  }
               }
            } else {
               // This is the first call, just log in debug mode.
               if (logger.isDebugEnabled()){
                  logger.debug("{} doesn't exist", lastScanTimeField);
               }
            }
         } catch (Exception e) {
            logger.warn("failed to get _lastScanTimeField, throttling....", e);
         }

         if (logger.isDebugEnabled()){
            logger.debug("lastScanTimeField: {}", result);
         }
         return result;
      }

      /** Scan the Amazon S3 bucket for last changes. */
      private Long scan(Long lastScanTime) throws Exception{
         if (logger.isDebugEnabled()){
            logger.debug("Starting scanning of bucket {} since {}", feedDefinition.getBucket(), lastScanTime);
         }
         S3ObjectSummaries summaries = s3.getObjectSummaries(lastScanTime);

         // Store the ids of already indexed files.
         List<String> previousFileIds = getAlreadyIndexFileIds();

         // Browse changes and check if they are indexable before starting.
         for (S3ObjectSummary summary : summaries.getPickedSummaries()){
            if (S3RiverUtil.isIndexable(summary.getKey(), feedDefinition.getIncludes(), feedDefinition.getExcludes())){
               indexFile(summary);
            }
         }

         // Now, because we do not get changes but only present files, we should
         // compare previously indexed files with the latest ones to extract deleted ones...
         // But before, we need to produce a list of index ids corresponding to S3 keys.
         List<String> summariesIds = new ArrayList<String>();
         for (String key : summaries.getKeys()){
            summariesIds.add(buildIndexIdFromS3Key(key));
         }
         for (String previousFileId : previousFileIds){
            if (!summariesIds.contains(previousFileId)){
               esDelete(indexName, typeName, previousFileId);
            }
         }

         return summaries.getLastScanTime();
      }

      /** Retrieve the ids of files already present in the index. */
      private List<String> getAlreadyIndexFileIds(){
         List<String> fileIds = new ArrayList<String>();
         // TODO: Should later be optimized for only retrieving ids and getting
         // over the 5000 hits limitation.
         SearchResponse response = client
               .prepareSearch(indexName)
               .setSearchType(SearchType.QUERY_AND_FETCH)
               .setTypes(typeName)
               .setFrom(0)
               .setSize(5000)
               .execute().actionGet();
         if (response.getHits() != null && response.getHits().getHits() != null){
            for (SearchHit hit : response.getHits().getHits()){
               fileIds.add(hit.getId());
            }
         }
         return fileIds;
      }

      /** Index an Amazon S3 file by retrieving its content and building the suitable Json content. */
      private String indexFile(S3ObjectSummary summary){
         if (logger.isDebugEnabled()){
            logger.debug("Trying to index '{}'", summary.getKey());
         }

         try{
            // Build a unique id from the S3 summary key.
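            // Note that '/' is simply mapped to '-', so two keys differing only in
            // those characters (e.g. 'a/b' and 'a-b') would collide on the same id.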
            String fileId = buildIndexIdFromS3Key(summary.getKey());

            if (feedDefinition.isJsonSupport()){
               esIndex(indexName, typeName, summary.getKey(), s3.getContent(summary));
            } else {
               byte[] fileContent = s3.getContent(summary);

               if (fileContent != null) {
                  // Compute the number of chars to extract.
                  // See https://github.com/lbroudoux/es-amazon-s3-river/issues/36
                  int indexedChars = 100000;
                  if (feedDefinition.getIndexedCharsRatio() > 0) {
                     indexedChars = (int) Math.round(fileContent.length * feedDefinition.getIndexedCharsRatio());
                  }

                  // Parse content using Tika directly.
                  Metadata fileMetadata = new Metadata();
                  String parsedContent = TikaHolder.tika().parseToString(
                        new BytesStreamInput(fileContent), fileMetadata, indexedChars);

                  // Store Tika metadata into a map.
                  Map<String, String> fileMetadataMap = new HashMap<String, String>();
                  for (String key : fileMetadata.names()) {
                     fileMetadataMap.put(key, fileMetadata.get(key));
                  }

                  esIndex(indexName, typeName, fileId,
                        jsonBuilder()
                           .startObject()
                              .field(S3RiverUtil.DOC_FIELD_TITLE, summary.getKey().substring(summary.getKey().lastIndexOf('/') + 1))
                              .field(S3RiverUtil.DOC_FIELD_MODIFIED_DATE, summary.getLastModified().getTime())
                              .field(S3RiverUtil.DOC_FIELD_SOURCE_URL, s3.getDownloadUrl(summary, feedDefinition))
                              .field(S3RiverUtil.DOC_FIELD_METADATA, s3.getS3UserMetadata(summary.getKey()))
                              .startObject("file")
                                 .field("_name", summary.getKey().substring(summary.getKey().lastIndexOf('/') + 1))
                                 .field("title", summary.getKey().substring(summary.getKey().lastIndexOf('/') + 1))
                                 .field("file", parsedContent)
                                 .field("metadata", fileMetadataMap)
                              .endObject()
                           .endObject()
                        );
                  return fileId;
               }
            }
         } catch (Exception e) {
            logger.warn("Can not index " + summary.getKey() + " : " + e.getMessage());
         }
         return null;
      }

      /** Build a unique id from the S3 unique summary key. */
      private String buildIndexIdFromS3Key(String key){
         return key.replace('/', '-');
      }

      /** Update the river's last scan time value. */
      private void updateRiver(String lastScanTimeField, Long lastScanTime) throws Exception{
         if (logger.isDebugEnabled()){
            logger.debug("Updating lastScanTimeField: {}", lastScanTime);
         }

         // We store the last update date and some stats.
         XContentBuilder xb = jsonBuilder()
               .startObject()
                  .startObject("amazon-s3")
                     .field("feedname", feedDefinition.getFeedname())
                     .field(lastScanTimeField, lastScanTime)
                  .endObject()
               .endObject();
         esIndex("_river", riverName.name(), lastScanTimeField, xb);
      }

      /** Add an IndexRequest to the bulk processor. */
      private void esIndex(String index, String type, String id, XContentBuilder xb) throws Exception{
         if (logger.isDebugEnabled()){
            logger.debug("Indexing in ES " + index + ", " + type + ", " + id);
         }
         if (logger.isTraceEnabled()){
            logger.trace("Json indexed : {}", xb.string());
         }
         bulkProcessor.add(client.prepareIndex(index, type, id).setSource(xb).request());
      }

      /** Add an IndexRequest to the bulk processor.
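       * This variant receives the raw JSON bytes of a document and indexes them
       * as-is; it is used when json_support is enabled.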
       */
      private void esIndex(String index, String type, String id, byte[] json) throws Exception{
         if (logger.isDebugEnabled()){
            logger.debug("Indexing in ES " + index + ", " + type + ", " + id);
         }
         if (logger.isTraceEnabled()){
            logger.trace("Json indexed : {}", new String(json));
         }
         bulkProcessor.add(client.prepareIndex(index, type, id).setSource(json).request());
      }

      /** Add a DeleteRequest to the bulk processor. */
      private void esDelete(String index, String type, String id) throws Exception{
         if (logger.isDebugEnabled()){
            logger.debug("Deleting from ES " + index + ", " + type + ", " + id);
         }
         bulkProcessor.add(client.prepareDelete(index, type, id).request());
      }
   }

   private enum RiverStatus {
      UNKNOWN,
      INITIALIZED,
      STARTING,
      RUNNING,
      STOPPING,
      STOPPED;
   }
}
--------------------------------------------------------------------------------