├── gradle.properties ├── settings.gradle ├── src ├── main │ ├── bin │ │ ├── ingest-opennlp-env.bat │ │ ├── ingest-opennlp-env │ │ ├── download-models │ │ └── download-models.bat │ ├── resources │ │ └── plugin-descriptor.properties │ └── java │ │ └── de │ │ └── spinscale │ │ └── elasticsearch │ │ └── ingest │ │ └── opennlp │ │ ├── ExtractedEntities.java │ │ ├── IngestOpenNlpPlugin.java │ │ ├── OpenNlpModelDownloader.java │ │ ├── OpenNlpProcessor.java │ │ └── OpenNlpService.java └── test │ └── java │ └── de │ └── spinscale │ └── elasticsearch │ └── ingest │ └── opennlp │ ├── OpenNlpServiceTests.java │ ├── OpenNlpThreadSafeTests.java │ ├── OpenNlpProcessorTests.java │ └── OpenNlpPluginIntegrationTests.java ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── NOTICE.txt ├── .gitignore ├── Dockerfile ├── gradlew.bat ├── gradlew ├── LICENSE.txt └── README.md /gradle.properties: -------------------------------------------------------------------------------- 1 | elasticsearchVersion = 8.5.0 2 | -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'ingest-opennlp' 2 | -------------------------------------------------------------------------------- /src/main/bin/ingest-opennlp-env.bat: -------------------------------------------------------------------------------- 1 | 2 | set ES_CLASSPATH=!ES_CLASSPATH!;!ES_HOME!/plugins/ingest-opennlp/* 3 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spinscale/elasticsearch-ingest-opennlp/HEAD/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /NOTICE.txt: 
-------------------------------------------------------------------------------- 1 | Alexander Reelsen 2 | Copyright 2016-2017 Alexander Reelsen 3 | 4 | The models have been downloaded from http://opennlp.sourceforge.net/models-1.5/ 5 | -------------------------------------------------------------------------------- /src/main/bin/ingest-opennlp-env: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # include the ingest-opennlp plugin jars in the classpath 4 | ES_CLASSPATH="$ES_CLASSPATH:$ES_HOME/plugins/ingest-opennlp/*" 5 | 6 | -------------------------------------------------------------------------------- /src/main/bin/download-models: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ES_MAIN_CLASS=de.spinscale.elasticsearch.ingest.opennlp.OpenNlpModelDownloader \ 4 | ES_ADDITIONAL_SOURCES="ingest-opennlp/ingest-opennlp-env" \ 5 | "`dirname "$0"`"/../elasticsearch-cli \ 6 | "$@" 7 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-7.4.2-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /src/main/resources/plugin-descriptor.properties: -------------------------------------------------------------------------------- 1 | name=ingest-opennlp 2 | version=@version@ 3 | elasticsearch.version=@elasticsearchVersion@ 4 | classname=de.spinscale.elasticsearch.ingest.opennlp.IngestOpenNlpPlugin 5 | description=Ingest processor doing language detection for fields 6 | java.version=17 7 | extended.plugins= 8 | has.native.controller=false 9 | 
-------------------------------------------------------------------------------- /src/main/bin/download-models.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | setlocal enabledelayedexpansion 4 | setlocal enableextensions 5 | 6 | set ES_MAIN_CLASS=de.spinscale.elasticsearch.ingest.opennlp.OpenNlpModelDownloader 7 | set ES_ADDITIONAL_SOURCES=ingest-opennlp\ingest-opennlp-env 8 | call "%~dp0..\elasticsearch-cli.bat" ^ 9 | %%* ^ 10 | || exit /b 1 11 | 12 | endlocal 13 | endlocal 14 | 15 | 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | .gradle/ 3 | build/ 4 | out/ 5 | *.swp 6 | *-execution-hints.log 7 | *-execution-times.log 8 | 9 | # ignore the downloaded model files in git 10 | src/test/resources/models/ 11 | 12 | # intellij 13 | *.iml 14 | *.ipr 15 | *.iws 16 | 17 | # eclipse 18 | .project 19 | .classpath 20 | eclipse-build 21 | */.project 22 | */.classpath 23 | */eclipse-build 24 | .settings 25 | !/.settings/org.eclipse.core.resources.prefs 26 | !/.settings/org.eclipse.jdt.core.prefs 27 | !/.settings/org.eclipse.jdt.ui.prefs 28 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.elastic.co/elasticsearch/elasticsearch:8.5.0 2 | 3 | ADD ./src/test/resources/models/en-ner-persons.bin /usr/share/elasticsearch/config/ingest-opennlp/ 4 | ADD ./src/test/resources/models/en-ner-locations.bin /usr/share/elasticsearch/config/ingest-opennlp/ 5 | ADD ./src/test/resources/models/en-ner-dates.bin /usr/share/elasticsearch/config/ingest-opennlp/ 6 | 7 | ENV ES_SETTING_INGEST_OPENNLP_MODEL_FILE_NAMES=en-ner-persons.bin 8 | ENV ES_SETTING_INGEST_OPENNLP_MODEL_FILE_LOCATIONS=en-ner-locations.bin 9 | ENV 
ES_SETTING_INGEST_OPENNLP_MODEL_FILE_DATES=en-ner-dates.bin 10 | 11 | ADD build/distribution/elasticsearch-ingest-opennlp.zip /elasticsearch-ingest-opennlp.zip 12 | RUN /usr/share/elasticsearch/bin/elasticsearch-plugin install --batch file:///elasticsearch-ingest-opennlp.zip 13 | EXPOSE 9200 14 | -------------------------------------------------------------------------------- /src/main/java/de/spinscale/elasticsearch/ingest/opennlp/ExtractedEntities.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright [2016] [Alexander Reelsen] 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | * 16 | */ 17 | 18 | package de.spinscale.elasticsearch.ingest.opennlp; 19 | 20 | import opennlp.tools.util.Span; 21 | import org.elasticsearch.common.util.set.Sets; 22 | 23 | import java.util.HashSet; 24 | import java.util.Set; 25 | 26 | /** 27 | * Helper class to contain the extracted spans/tokens of a field 28 | */ 29 | class ExtractedEntities { 30 | 31 | private final String[] tokens; 32 | private final Span[] spans; 33 | private final HashSet entities; 34 | 35 | ExtractedEntities(String[] tokens, Span[] spans) { 36 | this.tokens = tokens; 37 | this.spans = spans; 38 | this.entities = Sets.newHashSet(Span.spansToStrings(spans, tokens)); 39 | } 40 | 41 | Set getEntityValues() { 42 | return entities; 43 | } 44 | 45 | String[] getTokens() { 46 | return tokens; 47 | } 48 | 49 | Span[] getSpans() { 50 | return spans; 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/main/java/de/spinscale/elasticsearch/ingest/opennlp/IngestOpenNlpPlugin.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright [2016] [Alexander Reelsen] 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | * 16 | */ 17 | 18 | package de.spinscale.elasticsearch.ingest.opennlp; 19 | 20 | import org.elasticsearch.common.settings.Setting; 21 | import org.elasticsearch.common.settings.Setting.Property; 22 | import org.elasticsearch.ingest.Processor; 23 | import org.elasticsearch.plugins.IngestPlugin; 24 | import org.elasticsearch.plugins.Plugin; 25 | 26 | import java.nio.file.Path; 27 | import java.util.Arrays; 28 | import java.util.Collections; 29 | import java.util.List; 30 | import java.util.Map; 31 |
/**
 * Plugin entry point: registers the {@code ingest.opennlp.model.file.*}
 * node settings and the {@code opennlp} ingest processor.
 */
public class IngestOpenNlpPlugin extends Plugin implements IngestPlugin {

    static final String NAME = "ingest-opennlp";

    /**
     * Prefix setting: every key below {@code ingest.opennlp.model.file.} is a
     * node-scoped string (the tests and the Dockerfile set it to a model file name).
     */
    static final Setting.AffixSetting<String> MODEL_FILE_SETTINGS =
        Setting.prefixKeySetting("ingest.opennlp.model.file.", key -> Setting.simpleString(key, Property.NodeScope));

    /**
     * @return the settings this plugin contributes, i.e. only {@link #MODEL_FILE_SETTINGS}
     */
    @Override
    public List<Setting<?>> getSettings() {
        return Arrays.asList(MODEL_FILE_SETTINGS);
    }

    /**
     * Creates and starts an {@link OpenNlpService} rooted at the
     * {@code ingest-opennlp} directory below the node's config directory and
     * exposes it through a single processor factory.
     *
     * @param parameters ingest parameters; only {@code parameters.env} is used
     * @return a one-entry map from {@link OpenNlpProcessor#TYPE} to its factory
     */
    @Override
    public Map<String, Processor.Factory> getProcessors(Processor.Parameters parameters) {
        Path configDirectory = parameters.env.configFile().resolve("ingest-opennlp");
        OpenNlpService openNlpService = new OpenNlpService(configDirectory, parameters.env.settings());
        openNlpService.start();

        return Collections.singletonMap(OpenNlpProcessor.TYPE, new OpenNlpProcessor.Factory(openNlpService));
    }
}
-------------------------------------------------------------------------------- /src/test/java/de/spinscale/elasticsearch/ingest/opennlp/OpenNlpServiceTests.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright [2016] [Alexander Reelsen] 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | */ 17 | 18 | package de.spinscale.elasticsearch.ingest.opennlp; 19 | 20 | import org.elasticsearch.common.settings.Settings; 21 | import org.junit.jupiter.api.Test; 22 | 23 | import java.nio.file.Paths; 24 | 25 | import static org.assertj.core.api.Assertions.assertThat; 26 |
/*
 * Important: You need to run gradle from the command line first
 * to download the models referenced here
 */
public class OpenNlpServiceTests {

    /**
     * Starts a service with the three sample models configured and checks that
     * each model extracts the expected entities from one example sentence.
     */
    @Test
    public void testThatModelsCanBeLoaded() {
        Settings.Builder modelSettings = Settings.builder();
        modelSettings.put("ingest.opennlp.model.file.names", "en-ner-persons.bin");
        modelSettings.put("ingest.opennlp.model.file.locations", "en-ner-locations.bin");
        modelSettings.put("ingest.opennlp.model.file.dates", "en-ner-dates.bin");

        OpenNlpService service = new OpenNlpService(Paths.get("src/test/resources/models/"), modelSettings.build());
        service.start();

        // persons model
        ExtractedEntities persons = service.find("Kobe Bryant was one of the best basketball players of all time.", "names");
        assertThat(persons.getEntityValues()).hasSize(1).contains("Kobe Bryant");

        // locations model
        ExtractedEntities places = service.find("Munich is really an awesome city, but New York is as well.", "locations");
        assertThat(places.getEntityValues()).hasSize(2).contains("Munich", "New York");

        // dates model
        ExtractedEntities dates = service.find("Yesterday has been the hottest day of the year.", "dates");
        assertThat(dates.getEntityValues()).hasSize(1).contains("Yesterday");
    }
}
-------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 15 | @rem 16 | 17 | @if "%DEBUG%" == "" @echo off 18 | @rem ########################################################################## 19 | @rem 20 | @rem Gradle startup script for Windows 21 | @rem 22 | @rem ########################################################################## 23 | 24 | @rem Set local scope for the variables with windows NT shell 25 | if "%OS%"=="Windows_NT" setlocal 26 | 27 | set DIRNAME=%~dp0 28 | if "%DIRNAME%" == "" set DIRNAME=. 29 | set APP_BASE_NAME=%~n0 30 | set APP_HOME=%DIRNAME% 31 | 32 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 33 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 34 | 35 | @rem Find java.exe 36 | if defined JAVA_HOME goto findJavaFromJavaHome 37 | 38 | set JAVA_EXE=java.exe 39 | %JAVA_EXE% -version >NUL 2>&1 40 | if "%ERRORLEVEL%" == "0" goto init 41 | 42 | echo. 
43 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 44 | echo. 45 | echo Please set the JAVA_HOME variable in your environment to match the 46 | echo location of your Java installation. 47 | 48 | goto fail 49 | 50 | :findJavaFromJavaHome 51 | set JAVA_HOME=%JAVA_HOME:"=% 52 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 53 | 54 | if exist "%JAVA_EXE%" goto init 55 | 56 | echo. 57 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 58 | echo. 59 | echo Please set the JAVA_HOME variable in your environment to match the 60 | echo location of your Java installation. 61 | 62 | goto fail 63 | 64 | :init 65 | @rem Get command-line arguments, handling Windows variants 66 | 67 | if not "%OS%" == "Windows_NT" goto win9xME_args 68 | 69 | :win9xME_args 70 | @rem Slurp the command line arguments. 71 | set CMD_LINE_ARGS= 72 | set _SKIP=2 73 | 74 | :win9xME_args_slurp 75 | if "x%~1" == "x" goto execute 76 | 77 | set CMD_LINE_ARGS=%* 78 | 79 | :execute 80 | @rem Setup the command line 81 | 82 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 83 | 84 | @rem Execute Gradle 85 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 86 | 87 | :end 88 | @rem End local scope for the variables with windows NT shell 89 | if "%ERRORLEVEL%"=="0" goto mainEnd 90 | 91 | :fail 92 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 93 | rem the _cmd.exe /c_ return code! 
94 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 95 | exit /b 1 96 | 97 | :mainEnd 98 | if "%OS%"=="Windows_NT" endlocal 99 | 100 | :omega 101 | -------------------------------------------------------------------------------- /src/main/java/de/spinscale/elasticsearch/ingest/opennlp/OpenNlpModelDownloader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright [2016] [Alexander Reelsen] 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | * 16 | */ 17 | 18 | package de.spinscale.elasticsearch.ingest.opennlp; 19 | 20 | import joptsimple.OptionSet; 21 | import org.elasticsearch.cli.ProcessInfo; 22 | import org.elasticsearch.cli.Terminal; 23 | import org.elasticsearch.common.cli.EnvironmentAwareCommand; 24 | import org.elasticsearch.core.SuppressForbidden; 25 | import org.elasticsearch.env.Environment; 26 | 27 | import java.io.InputStream; 28 | import java.net.URL; 29 | import java.net.URLConnection; 30 | import java.nio.file.Files; 31 | import java.nio.file.Path; 32 |
/**
 * Command line tool that downloads three English named-entity models
 * (persons, locations, dates) into the {@code ingest-opennlp} directory below
 * the Elasticsearch config directory and prints matching settings.
 */
public class OpenNlpModelDownloader extends EnvironmentAwareCommand {

    public static void main(String[] args) throws Exception {
        exit(new OpenNlpModelDownloader().main(args, Terminal.DEFAULT, ProcessInfo.fromSystem()));
    }

    public OpenNlpModelDownloader() {
        super("Downloads three sample models for named entity recognition for dates, locations and persons");
    }

    /**
     * Ensures the plugin's config directory exists, downloads each model that
     * is not present yet and prints the settings that reference the files.
     *
     * @throws Exception if the directory cannot be created or a download fails
     */
    @Override
    public void execute(Terminal terminal, OptionSet options, Environment env, ProcessInfo processInfo) throws Exception {
        Path configDirectoryPath = env.configFile().resolve(IngestOpenNlpPlugin.NAME).normalize().toAbsolutePath();
        if (Files.exists(configDirectoryPath) == false) {
            Files.createDirectories(configDirectoryPath);
        }

        // remote files use singular names, local copies are stored with plural names
        String baseUrl = "http://opennlp.sourceforge.net/models-1.5/";
        download(baseUrl + "en-ner-person.bin", configDirectoryPath.resolve("en-ner-persons.bin"), terminal);
        download(baseUrl + "en-ner-location.bin", configDirectoryPath.resolve("en-ner-locations.bin"), terminal);
        download(baseUrl + "en-ner-date.bin", configDirectoryPath.resolve("en-ner-dates.bin"), terminal);

        terminal.println("\nyou can use the following configuration settings now\n");

        terminal.println("ingest.opennlp.model.file.persons: en-ner-persons.bin");
        terminal.println("ingest.opennlp.model.file.dates: en-ner-dates.bin");
        terminal.println("ingest.opennlp.model.file.locations: en-ner-locations.bin");
    }

    /**
     * Downloads {@code url} to {@code filename} unless that file already exists.
     *
     * <p>The data is first written to a {@code .part} sibling and only renamed to
     * {@code filename} after the stream was read completely. Writing directly to
     * the target would leave a truncated model behind when the transfer aborts,
     * and the next run would then skip it as "existed already".
     *
     * @param url      remote location of the model
     * @param filename local target file
     * @param terminal receives progress output
     * @throws Exception if connecting, reading or writing fails; the partial file is removed
     */
    @SuppressForbidden(reason = "we have to download the models, so we have to open a socket")
    private void download(String url, Path filename, Terminal terminal) throws Exception {
        terminal.print(Terminal.Verbosity.NORMAL,"Downloading " + filename.getFileName() + " model... ");
        if (Files.exists(filename)) {
            terminal.println("not downloading, existed already.");
            return;
        }

        URLConnection connection = new URL(url).openConnection();
        connection.connect();

        // same directory as the target, so the final move is a plain rename
        Path partial = filename.resolveSibling(filename.getFileName() + ".part");
        try {
            try (InputStream inputStream = connection.getInputStream()) {
                // REPLACE_EXISTING: overwrite leftovers of an earlier aborted run
                Files.copy(inputStream, partial, java.nio.file.StandardCopyOption.REPLACE_EXISTING);
            }
            Files.move(partial, filename);
        } finally {
            // no-op after a successful move, cleans up after a failure
            Files.deleteIfExists(partial);
        }
        terminal.println("done");
    }
}
-------------------------------------------------------------------------------- /src/test/java/de/spinscale/elasticsearch/ingest/opennlp/OpenNlpThreadSafeTests.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright [2016] [Alexander Reelsen] 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | * 16 | */ 17 | 18 | package de.spinscale.elasticsearch.ingest.opennlp; 19 | 20 | import org.elasticsearch.common.settings.Settings; 21 | import org.elasticsearch.common.util.iterable.Iterables; 22 | import org.junit.jupiter.api.AfterEach; 23 | import org.junit.jupiter.api.BeforeEach; 24 | import org.junit.jupiter.api.Test; 25 | 26 | import java.nio.file.Path; 27 | import java.util.ArrayList; 28 | import java.util.List; 29 | import java.util.concurrent.CountDownLatch; 30 | import java.util.concurrent.ExecutorService; 31 | import java.util.concurrent.Executors; 32 | import java.util.concurrent.TimeUnit; 33 | 34 | import static org.assertj.core.api.Assertions.assertThat; 35 |
/**
 * Calls one shared {@link OpenNlpService} from a pool of ten threads and
 * verifies that every call returns the city that was part of its own input.
 */
public class OpenNlpThreadSafeTests {

    private static final Settings settings = Settings.builder()
            .put("ingest.opennlp.model.file.locations", "en-ner-locations.bin")
            .build();
    private OpenNlpService service;
    private ExecutorService executorService;

    @BeforeEach
    public void setup() throws Exception {
        service = new OpenNlpService(Path.of("src/test/resources/models/"), settings).start();
        executorService = Executors.newFixedThreadPool(10);
    }

    @AfterEach
    public void tearDown() throws Exception {
        executorService.shutdownNow();
        executorService.awaitTermination(10, TimeUnit.SECONDS);
    }

    @Test
    public void testThatOpenNlpServiceIsThreadSafe() throws Exception {
        int runs = 1000;
        CountDownLatch latch = new CountDownLatch(runs);
        List<OpennlpRunnable> runnables = new ArrayList<>();

        String[] cities = new String[] {"Munich", "Stockholm", "Madrid", "San Francisco", "Cologne", "Paris", "London", "Amsterdam"};

        for (int i = 0; i < runs; i++) {
            // Index by the loop counter: "runs % cities.length" is constant, so
            // every task used cities[0] and concurrent calls could never be
            // observed to mix up results.
            // NOTE(review): assumes the locations model recognises each of these
            // cities in the test sentence - confirm when running with the model.
            String city = cities[i % cities.length];

            OpennlpRunnable runnable = new OpennlpRunnable(i, city, latch);
            runnables.add(runnable);
            executorService.submit(runnable);
        }

        // fail on timeout instead of asserting on results of unfinished tasks
        assertThat(latch.await(30, TimeUnit.SECONDS)).as("all tasks finished in time").isTrue();
        for (OpennlpRunnable runnable : runnables) {
            runnable.assertResultIsCorrect();
        }
    }

    /**
     * One extraction task; stores the first extracted location, or the
     * exception that prevented extraction, for later inspection.
     */
    private class OpennlpRunnable implements Runnable {

        private final int idx;
        final String city;
        private final CountDownLatch latch;
        String result;
        Exception failure;

        OpennlpRunnable(int idx, String city, CountDownLatch latch) {
            this.idx = idx;
            this.city = city;
            this.latch = latch;
        }

        @Override
        public void run() {
            try {
                ExtractedEntities locations = service.find(city + " is really an awesome city, but others are as well.", "locations");
                if (locations.getEntityValues().size() > 0) {
                    result = Iterables.get(locations.getEntityValues(), 0);
                }
            } catch (Exception e) {
                // keep the cause so the assertion can report it instead of a bare null result
                failure = e;
            } finally {
                latch.countDown();
            }
        }

        private void assertResultIsCorrect() {
            assertThat(failure).as("exception in run %d (%s)", idx, city).isNull();
            assertThat(result).as("result of run %d", idx).isEqualTo(city);
        }
    }
}
-------------------------------------------------------------------------------- /src/main/java/de/spinscale/elasticsearch/ingest/opennlp/OpenNlpProcessor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright [2016] [Alexander Reelsen] 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | * 16 | */ 17 | 18 | package de.spinscale.elasticsearch.ingest.opennlp; 19 | 20 | import org.elasticsearch.common.Strings; 21 | import org.elasticsearch.ingest.AbstractProcessor; 22 | import org.elasticsearch.ingest.IngestDocument; 23 | import org.elasticsearch.ingest.Processor; 24 | 25 | import java.util.ArrayList; 26 | import java.util.HashMap; 27 | import java.util.HashSet; 28 | import java.util.Iterator; 29 | import java.util.List; 30 | import java.util.Map; 31 | import java.util.Set; 32 | 33 | import static org.elasticsearch.ingest.ConfigurationUtils.readOptionalList; 34 | import static org.elasticsearch.ingest.ConfigurationUtils.readOptionalStringProperty; 35 | import static org.elasticsearch.ingest.ConfigurationUtils.readStringProperty; 36 |
/**
 * Ingest processor that runs the configured OpenNLP models over a source
 * field, stores the extracted entities per model name in a target field and
 * optionally writes an annotated version of the text to another field.
 */
public class OpenNlpProcessor extends AbstractProcessor {

    static final String TYPE = "opennlp";

    private final OpenNlpService openNlpService;
    private final String sourceField;
    private final String targetField;
    private final String annotatedTextField;
    private final Set<String> fields;

    OpenNlpProcessor(OpenNlpService openNlpService, String tag, String sourceField, String targetField, String annotatedTextField,
                     Set<String> fields, String description) {
        super(tag, description);
        this.openNlpService = openNlpService;
        this.sourceField = sourceField;
        this.targetField = targetField;
        this.annotatedTextField = annotatedTextField;
        this.fields = fields;
    }

    /**
     * Extracts entities from the source field and writes them to the target field.
     *
     * <p>Nothing is written when the source text is empty. Values already present
     * in the target field are kept and merged with the new ones per key.
     *
     * @param ingestDocument the document to enrich
     * @return the same document instance
     */
    @Override
    public IngestDocument execute(IngestDocument ingestDocument) {
        String content = ingestDocument.getFieldValue(sourceField, String.class);
        if (Strings.hasLength(content) == false) {
            return ingestDocument;
        }

        Map<String, Set<String>> entities = new HashMap<>();
        mergeExisting(entities, ingestDocument, targetField);

        List<ExtractedEntities> extractedEntities = new ArrayList<>();
        for (String field : fields) {
            ExtractedEntities data = openNlpService.find(content, field);
            extractedEntities.add(data);
            merge(entities, field, data.getEntityValues());
        }

        // convert set to list, otherwise toXContent serialization in simulate pipeline fails
        Map<String, List<String>> entitiesToStore = new HashMap<>();
        for (Map.Entry<String, Set<String>> entry : entities.entrySet()) {
            entitiesToStore.put(entry.getKey(), new ArrayList<>(entry.getValue()));
        }

        ingestDocument.setFieldValue(targetField, entitiesToStore);

        if (Strings.hasLength(annotatedTextField) && extractedEntities.isEmpty() == false) {
            String annotatedText = OpenNlpService.createAnnotatedText(content, extractedEntities);
            ingestDocument.setFieldValue(annotatedTextField, annotatedText);
        }

        return ingestDocument;
    }

    @Override
    public String getType() {
        return TYPE;
    }

    /**
     * Builds {@link OpenNlpProcessor} instances from pipeline configuration.
     *
     * <p>Options read: {@code field} (required), {@code target_field} (default
     * {@code entities}), {@code annotated_text_field} (optional) and
     * {@code fields} (optional; when missing or empty, all models known to the
     * service are used).
     */
    public static final class Factory implements Processor.Factory {

        private final OpenNlpService openNlpService;

        Factory(OpenNlpService openNlpService) {
            this.openNlpService = openNlpService;
        }

        @Override
        public OpenNlpProcessor create(Map<String, Processor.Factory> registry, String processorTag, String description,
                                       Map<String, Object> config) {
            String field = readStringProperty(TYPE, processorTag, config, "field");
            String targetField = readStringProperty(TYPE, processorTag, config, "target_field", "entities");
            String annotatedTextField = readOptionalStringProperty(TYPE, processorTag, config, "annotated_text_field");
            List<String> fields = readOptionalList(TYPE, processorTag, config, "fields");
            final Set<String> foundFields = fields == null || fields.size() == 0 ? openNlpService.getModels() : new HashSet<>(fields);
            return new OpenNlpProcessor(openNlpService, processorTag, field, targetField, annotatedTextField, foundFields, description);
        }
    }

    /**
     * Copies the entries already stored at {@code targetField} into {@code entities}.
     *
     * <p>The document is not modified here; the target field is written once by
     * {@link #execute(IngestDocument)} after extraction succeeded, so a failing
     * extraction does not leave an empty map behind in the document.
     *
     * <p>Note: values written by this processor are lists, so the "sets" loaded
     * here may really be lists at runtime; they are only read as collections.
     */
    private static void mergeExisting(Map<String, Set<String>> entities, IngestDocument ingestDocument, String targetField) {
        if (ingestDocument.hasField(targetField)) {
            @SuppressWarnings("unchecked")
            Map<String, Set<String>> existing = ingestDocument.getFieldValue(targetField, Map.class);
            entities.putAll(existing);
        }
    }

    /**
     * Adds {@code values} to whatever {@code map} already holds for {@code key}.
     *
     * <p>The combined result is a fresh set: {@code values} belongs to the caller
     * (here the {@link ExtractedEntities} instance later used for the annotated
     * text) and must not be modified. Empty {@code values} leave the map untouched.
     */
    private static void merge(Map<String, Set<String>> map, String key, Set<String> values) {
        if (values.size() == 0) return;

        Set<String> merged = new HashSet<>(values);
        if (map.containsKey(key)) {
            merged.addAll(map.get(key));
        }

        map.put(key, merged);
    }
}
-------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | # 4 | # Copyright 2015 the original author or authors. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # https://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
#

##############################################################################
##
## Gradle start up script for UN*X
##
## Overview of what the visible part of this script does:
##   1. resolves APP_HOME by following symlinks on $0
##   2. detects the platform (Cygwin, MSYS, Darwin, NonStop)
##   3. picks the java executable from JAVA_HOME or the PATH
##   4. raises the file descriptor limit where supported
##   5. converts paths for Cygwin/MSYS
##   6. assembles the JVM arguments and exec's the Gradle wrapper main class
##
##############################################################################

# Attempt to set APP_HOME
# Resolve links: $0 may be a link
PRG="$0"
# Need this for relative symlinks.
while [ -h "$PRG" ] ; do
    # The link target is parsed out of the "ls -ld" output (the text after "-> ").
    ls=`ls -ld "$PRG"`
    link=`expr "$ls" : '.*-> \(.*\)$'`
    if expr "$link" : '/.*' > /dev/null; then
        # absolute link target
        PRG="$link"
    else
        # relative link target, resolve it against the directory of the link
        PRG=`dirname "$PRG"`"/$link"
    fi
done
# Remember the current directory, determine the physical script directory, go back.
SAVED="`pwd`"
cd "`dirname \"$PRG\"`/" >/dev/null
APP_HOME="`pwd -P`"
cd "$SAVED" >/dev/null

APP_NAME="Gradle"
APP_BASE_NAME=`basename "$0"`

# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'

# Use the maximum available, or set MAX_FD != -1 to use that value.
MAX_FD="maximum"

# Prints a warning. NOTE(review): the message goes to stdout, not stderr.
warn () {
    echo "$*"
}

# Prints an error message surrounded by blank lines (also on stdout) and exits with status 1.
die () {
    echo
    echo "$*"
    echo
    exit 1
}

# OS specific support (must be 'true' or 'false').
cygwin=false
msys=false
darwin=false
nonstop=false
case "`uname`" in
  CYGWIN* )
    cygwin=true
    ;;
  Darwin* )
    darwin=true
    ;;
  MINGW* )
    msys=true
    ;;
  NONSTOP* )
    nonstop=true
    ;;
esac

CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar

# Determine the Java command to use to start the JVM.
if [ -n "$JAVA_HOME" ] ; then
    if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
        # IBM's JDK on AIX uses strange locations for the executables
        JAVACMD="$JAVA_HOME/jre/sh/java"
    else
        JAVACMD="$JAVA_HOME/bin/java"
    fi
    if [ ! -x "$JAVACMD" ] ; then
        die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME

Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
    fi
else
    # No JAVA_HOME: fall back to whatever "java" is found on the PATH.
    JAVACMD="java"
    which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.

Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi

# Increase the maximum file descriptors if we can.
if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
    MAX_FD_LIMIT=`ulimit -H -n`
    if [ $? -eq 0 ] ; then
        # "maximum"/"max" means: use the hard limit that was just queried
        if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
            MAX_FD="$MAX_FD_LIMIT"
        fi
        ulimit -n $MAX_FD
        if [ $? -ne 0 ] ; then
            warn "Could not set maximum file descriptor limit: $MAX_FD"
        fi
    else
        warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
    fi
fi

# For Darwin, add options to specify how the application appears in the dock
if $darwin; then
    GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
fi

# For Cygwin or MSYS, switch paths to Windows format before running java
if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
    APP_HOME=`cygpath --path --mixed "$APP_HOME"`
    CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
    JAVACMD=`cygpath --unix "$JAVACMD"`

    # We build the pattern for arguments to be converted via cygpath
    ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
    SEP=""
    # Join all top-level directories with "|" so they form a regex alternation.
    for dir in $ROOTDIRSRAW ; do
        ROOTDIRS="$ROOTDIRS$SEP$dir"
        SEP="|"
    done
    OURCYGPATTERN="(^($ROOTDIRS))"
    # Add a user-defined pattern to the cygpath arguments
    if [ "$GRADLE_CYGPATTERN" != "" ] ; then
        OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
    fi
    # Now convert the arguments - kludge to limit ourselves to /bin/sh
    # Each argument is stored in a numbered variable (args0, args1, ...) because
    # plain /bin/sh has no arrays.
    i=0
    for arg in "$@" ; do
        CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
        CHECK2=`echo "$arg"|egrep -c "^-"`                                 ### Determine if an option

        if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then                    ### Added a condition
            eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
        else
            eval `echo args$i`="\"$arg\""
        fi
        i=`expr $i + 1`
    done
    # Rebuild the positional parameters from the converted values.
    # NOTE(review): only 0 to 9 arguments are handled here; with ten or more no branch
    # matches and the original, unconverted "$@" stays in place.
    case $i in
        0) set -- ;;
        1) set -- "$args0" ;;
        2) set -- "$args0" "$args1" ;;
        3) set -- "$args0" "$args1" "$args2" ;;
        4) set -- "$args0" "$args1" "$args2" "$args3" ;;
        5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
        6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
        7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
        8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
        9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
    esac
fi

# Escape application args
# Wraps every argument in single quotes (escaping embedded single quotes) and ends
# each with " \" so that the result can be passed through the "eval" below.
save () {
    for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
    echo " "
}
APP_ARGS=`save "$@"`

# Collect all arguments for the java command, following the shell quoting and substitution rules
eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"

# Replace this shell process with the JVM running the Gradle wrapper.
exec "$JAVACMD" "$@"
--------------------------------------------------------------------------------
/src/main/java/de/spinscale/elasticsearch/ingest/opennlp/OpenNlpService.java:
--------------------------------------------------------------------------------
/*
 * Copyright [2016] [Alexander Reelsen]
 *
 * Licensed under the
Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | */ 17 | 18 | package de.spinscale.elasticsearch.ingest.opennlp; 19 | 20 | import opennlp.tools.namefind.NameFinderME; 21 | import opennlp.tools.namefind.TokenNameFinderModel; 22 | import opennlp.tools.tokenize.SimpleTokenizer; 23 | import opennlp.tools.util.Span; 24 | import org.apache.logging.log4j.LogManager; 25 | import org.apache.logging.log4j.Logger; 26 | import org.elasticsearch.ElasticsearchException; 27 | import org.elasticsearch.common.StopWatch; 28 | import org.elasticsearch.common.Strings; 29 | import org.elasticsearch.common.settings.Settings; 30 | 31 | import java.io.IOException; 32 | import java.io.InputStream; 33 | import java.nio.file.Files; 34 | import java.nio.file.Path; 35 | import java.util.ArrayList; 36 | import java.util.Arrays; 37 | import java.util.List; 38 | import java.util.Map; 39 | import java.util.Optional; 40 | import java.util.Set; 41 | import java.util.concurrent.ConcurrentHashMap; 42 | 43 | /** 44 | * OpenNLP name finders are not thread safe, so we load them via a thread local hack 45 | */ 46 | public class OpenNlpService { 47 | 48 | private static final Logger logger = LogManager.getLogger(OpenNlpService.class); 49 | private final Path configDirectory; 50 | private Settings settings; 51 | 52 | private ThreadLocal threadLocal = new ThreadLocal<>(); 53 | private Map nameFinderModels = new ConcurrentHashMap<>(); 54 | 55 | OpenNlpService(Path configDirectory, Settings 
settings) { 56 | this.configDirectory = configDirectory; 57 | this.settings = settings; 58 | } 59 | 60 | Set getModels() { 61 | return IngestOpenNlpPlugin.MODEL_FILE_SETTINGS.getAsMap(settings).keySet(); 62 | } 63 | 64 | protected OpenNlpService start() { 65 | StopWatch sw = new StopWatch("models-loading"); 66 | Map settingsMap = IngestOpenNlpPlugin.MODEL_FILE_SETTINGS.getAsMap(settings); 67 | for (Map.Entry entry : settingsMap.entrySet()) { 68 | String name = entry.getKey(); 69 | sw.start(name); 70 | Path path = configDirectory.resolve(entry.getValue()); 71 | try (InputStream is = Files.newInputStream(path)) { 72 | nameFinderModels.put(name, new TokenNameFinderModel(is)); 73 | } catch (IOException e) { 74 | // this means a broken configuration, throw an exception and exit 75 | // otherwise users will ask why enrichment does not work 76 | throw new ElasticsearchException(e); 77 | } 78 | sw.stop(); 79 | } 80 | 81 | if (settingsMap.keySet().size() == 0) { 82 | logger.error("Did not load any models for ingest-opennlp plugin, none configured"); 83 | } else { 84 | logger.info("Read models in [{}] for {}", sw.totalTime(), settingsMap.keySet()); 85 | } 86 | 87 | return this; 88 | } 89 | 90 | public ExtractedEntities find(String content, String field) { 91 | try { 92 | if (!nameFinderModels.containsKey(field)) { 93 | throw new ElasticsearchException("Could not find field [{}], possible values {}", field, nameFinderModels.keySet()); 94 | } 95 | TokenNameFinderModel finderModel = nameFinderModels.get(field); 96 | if (threadLocal.get() == null || !threadLocal.get().equals(finderModel)) { 97 | threadLocal.set(finderModel); 98 | } 99 | 100 | String[] tokens = SimpleTokenizer.INSTANCE.tokenize(content); 101 | Span[] spans = new NameFinderME(finderModel).find(tokens); 102 | 103 | return new ExtractedEntities(tokens, spans); 104 | } finally { 105 | threadLocal.remove(); 106 | } 107 | } 108 | 109 | static String createAnnotatedText(String content, List extractedEntities) { 110 | // 
these spans contain the real offset of each word in start/end variables! 111 | // the spans of the method argument contain the offset of each token, as mentioned in tokens! 112 | Span[] spansWithRealOffsets = SimpleTokenizer.INSTANCE.tokenizePos(content); 113 | 114 | List spansList = new ArrayList<>(); 115 | extractedEntities.stream() 116 | .map(ExtractedEntities::getSpans) 117 | .forEach(s -> spansList.addAll(Arrays.asList(s))); 118 | 119 | Span[] spans = NameFinderME.dropOverlappingSpans(spansList.toArray(new Span[0])); 120 | String[] tokens = extractedEntities.get(0).getTokens(); 121 | 122 | // shortcut if there is no enrichment to be done 123 | if (spans.length == 0) { 124 | return content; 125 | } 126 | 127 | StringBuilder builder = new StringBuilder(); 128 | for (int i = 0; i < tokens.length; i++) { 129 | final int idx = i; 130 | String token = tokens[i]; 131 | 132 | final Optional optionalSpan = Arrays.stream(spans).filter(s -> s.getStart() == idx).findFirst(); 133 | if (optionalSpan.isPresent()) { 134 | Span span = optionalSpan.get(); 135 | int start = span.getStart(); 136 | int end = span.getEnd(); 137 | String type = span.getType(); 138 | 139 | String[] spanTokens = new String[end - start]; 140 | int spanPosition = 0; 141 | for (int tokenPosition = start ; tokenPosition < end; tokenPosition++) { 142 | spanTokens[spanPosition++] = tokens[tokenPosition]; 143 | } 144 | String entityString = Strings.arrayToDelimitedString(spanTokens, " "); 145 | 146 | builder.append("["); 147 | builder.append(entityString); 148 | builder.append("]("); 149 | builder.append(Strings.capitalize(type)); 150 | builder.append("_"); 151 | builder.append(entityString); 152 | builder.append(")"); 153 | i = end - 1; 154 | } else { 155 | builder.append(token); 156 | } 157 | 158 | // only append a whitespace, if the offsets actually differ 159 | if (i < tokens.length - 1) { 160 | if (spansWithRealOffsets[i].getEnd() != spansWithRealOffsets[i+1].getStart()) { 161 | builder.append(" "); 162 
| } 163 | } 164 | } 165 | 166 | return builder.toString(); 167 | } 168 | } 169 | -------------------------------------------------------------------------------- /src/test/java/de/spinscale/elasticsearch/ingest/opennlp/OpenNlpProcessorTests.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright [2016] [Alexander Reelsen] 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | * 16 | */ 17 | 18 | package de.spinscale.elasticsearch.ingest.opennlp; 19 | 20 | import org.elasticsearch.action.ingest.SimulateProcessorResult; 21 | import org.elasticsearch.common.settings.Settings; 22 | import org.elasticsearch.core.PathUtils; 23 | import org.elasticsearch.core.Tuple; 24 | import org.elasticsearch.index.VersionType; 25 | import org.elasticsearch.ingest.IngestDocument; 26 | import org.elasticsearch.ingest.Processor; 27 | import org.elasticsearch.xcontent.ToXContent; 28 | import org.elasticsearch.xcontent.XContentBuilder; 29 | import org.elasticsearch.xcontent.XContentFactory; 30 | import org.junit.jupiter.api.BeforeAll; 31 | import org.junit.jupiter.api.Test; 32 | 33 | import java.nio.file.Path; 34 | import java.util.*; 35 | 36 | import static org.assertj.core.api.Assertions.assertThat; 37 | 38 | public class OpenNlpProcessorTests { 39 | 40 | private static OpenNlpService service; 41 | 42 | @BeforeAll 43 | public static void createOpenNlpService() throws Exception { 44 | Settings 
settings = Settings.builder() 45 | .put("ingest.opennlp.model.file.names", "en-ner-persons.bin") 46 | .put("ingest.opennlp.model.file.locations", "en-ner-locations.bin") 47 | .put("ingest.opennlp.model.file.dates", "en-ner-dates.bin") 48 | .build(); 49 | 50 | Path path = PathUtils.get(OpenNlpProcessorTests.class.getResource("/models/en-ner-persons.bin").toURI()); 51 | service = new OpenNlpService(path.getParent(), settings).start(); 52 | } 53 | 54 | @Test 55 | public void testThatExtractionsWork() throws Exception { 56 | OpenNlpProcessor processor = new OpenNlpProcessor(service, null, "source_field", "target_field", 57 | null, new HashSet<>(Arrays.asList("names", "dates", "locations")), "description"); 58 | 59 | Map entityData = getIngestDocumentData(processor); 60 | 61 | assertThatHasElements(entityData, "names", "Kobe Bryant", "Michael Jordan"); 62 | assertThatHasElements(entityData, "dates", "Yesterday"); 63 | assertThatHasElements(entityData, "locations", "Munich", "New York"); 64 | } 65 | 66 | @Test 67 | public void testThatFieldsCanBeExcluded() throws Exception { 68 | OpenNlpProcessor processor = new OpenNlpProcessor(service, null, "source_field", "target_field", 69 | null, new HashSet<>(Arrays.asList("dates")), "description"); 70 | 71 | Map entityData = getIngestDocumentData(processor); 72 | 73 | assertThat(entityData).doesNotContainKey("locations"); 74 | assertThat(entityData).doesNotContainKey("names"); 75 | assertThatHasElements(entityData, "dates", "Yesterday"); 76 | } 77 | 78 | @Test 79 | public void testThatExistingValuesAreMergedWithoutDuplicates() throws Exception { 80 | OpenNlpProcessor processor = new OpenNlpProcessor(service, null, "source_field", "target_field", 81 | null, new HashSet<>(Arrays.asList("names", "dates", "locations")), "description"); 82 | 83 | IngestDocument ingestDocument = getIngestDocument(); 84 | 85 | Map entityData = new HashMap<>(); 86 | entityData.put("names", Arrays.asList("Magic Johnson", "Kobe Bryant")); 87 | 
entityData.put("locations", Arrays.asList("Paris", "Munich")); 88 | entityData.put("dates", Arrays.asList("Today", "Yesterday")); 89 | 90 | ingestDocument.setFieldValue("target_field", entityData); 91 | 92 | ingestDocument = processor.execute(ingestDocument); 93 | 94 | entityData = getIngestDocumentData(ingestDocument); 95 | 96 | assertThatHasElements(entityData, "names", "Magic Johnson", "Kobe Bryant", "Michael Jordan"); 97 | assertThatHasElements(entityData, "dates", "Today", "Yesterday"); 98 | assertThatHasElements(entityData, "locations", "Paris", "Munich", "New York"); 99 | } 100 | 101 | @Test 102 | public void testConstructorNoFieldsSpecified() throws Exception { 103 | Map config = new HashMap<>(); 104 | config.put("field", "source_field"); 105 | config.put("target_field", "target_field"); 106 | 107 | OpenNlpProcessor.Factory factory = new OpenNlpProcessor.Factory(service); 108 | Map registry = Collections.emptyMap(); 109 | OpenNlpProcessor processor = factory.create(registry, null, "description", config); 110 | 111 | Map entityData = getIngestDocumentData(processor); 112 | 113 | assertThatHasElements(entityData, "names", "Kobe Bryant", "Michael Jordan"); 114 | assertThatHasElements(entityData, "dates", "Yesterday"); 115 | assertThatHasElements(entityData, "locations", "Munich", "New York"); 116 | } 117 | 118 | @Test 119 | public void testToXContent() throws Exception { 120 | OpenNlpProcessor processor = new OpenNlpProcessor(service, null, "source_field", "target_field", 121 | null, new HashSet<>(Arrays.asList("names", "dates", "locations")), "description"); 122 | 123 | IngestDocument ingestDocument = getIngestDocument(); 124 | processor.execute(ingestDocument); 125 | 126 | SimulateProcessorResult result = new SimulateProcessorResult("type", "tag", "description", 127 | ingestDocument, Tuple.tuple("data", true)); 128 | 129 | try (XContentBuilder builder = XContentFactory.jsonBuilder()) { 130 | result.toXContent(builder, ToXContent.EMPTY_PARAMS); 131 | } 132 | 
} 133 | 134 | @Test 135 | public void testAnnotatedText() throws Exception { 136 | Map config = new HashMap<>(); 137 | config.put("field", "source_field"); 138 | config.put("annotated_text_field", "my_annotated_text_field"); 139 | 140 | OpenNlpProcessor.Factory factory = new OpenNlpProcessor.Factory(service); 141 | Map registry = Collections.emptyMap(); 142 | OpenNlpProcessor processor = factory.create(registry, null, "description", config); 143 | 144 | IngestDocument ingestDocument = processor.execute(getIngestDocument()); 145 | String content = ingestDocument.getFieldValue("my_annotated_text_field", String.class); 146 | assertThat(content).endsWith("[Kobe Bryant](Person_Kobe Bryant) was one of the best basketball players of all times. Not even" + 147 | " [Michael Jordan](Person_Michael Jordan) has ever scored 81 points in one game. [Munich](Location_Munich) is really" + 148 | " an awesome city, but [New York](Location_New York) is as well. [Yesterday](Date_Yesterday) has been the hottest" + 149 | " day of the year."); 150 | } 151 | 152 | private Map getIngestDocumentData(OpenNlpProcessor processor) throws Exception { 153 | IngestDocument ingestDocument = getIngestDocument(); 154 | return getIngestDocumentData(processor.execute(ingestDocument)); 155 | } 156 | 157 | private IngestDocument getIngestDocument() throws Exception { 158 | return getIngestDocument("Kobe Bryant was one of the best basketball players of all times. Not even Michael Jordan has ever " + 159 | "scored 81 points in one game. Munich is really an awesome city, but New York is as well. 
Yesterday has been the " + 160 | "hottest day of the year."); 161 | } 162 | 163 | private IngestDocument getIngestDocument(String content) throws Exception { 164 | Map document = new HashMap<>(); 165 | document.put("source_field", content); 166 | return new IngestDocument("my-index", "my-id", 1L, null, VersionType.INTERNAL, document); 167 | } 168 | 169 | private Map getIngestDocumentData(IngestDocument ingestDocument) throws Exception { 170 | @SuppressWarnings("unchecked") 171 | Map data = (Map) ingestDocument.getSourceAndMetadata().get("target_field"); 172 | return data; 173 | } 174 | 175 | private void assertThatHasElements(Map entityData, String field, String ... items) { 176 | List values = getValues(entityData, field); 177 | assertThat(values).hasSize(items.length); 178 | assertThat(values).contains(items); 179 | } 180 | 181 | private List getValues(Map entityData, String field) { 182 | assertThat(entityData).containsKey(field); 183 | assertThat(entityData.get(field)).isInstanceOf(List.class); 184 | @SuppressWarnings("unchecked") 185 | List values = (List) entityData.get(field); 186 | return values; 187 | } 188 | } 189 | -------------------------------------------------------------------------------- /src/test/java/de/spinscale/elasticsearch/ingest/opennlp/OpenNlpPluginIntegrationTests.java: -------------------------------------------------------------------------------- 1 | package de.spinscale.elasticsearch.ingest.opennlp; 2 | 3 | import co.elastic.clients.elasticsearch.ElasticsearchClient; 4 | import co.elastic.clients.elasticsearch.core.GetResponse; 5 | import co.elastic.clients.elasticsearch.ingest.SimulateResponse; 6 | import co.elastic.clients.elasticsearch.ingest.simulate.Document; 7 | import co.elastic.clients.elasticsearch.ingest.simulate.PipelineSimulation; 8 | import co.elastic.clients.json.JsonData; 9 | import co.elastic.clients.json.jackson.JacksonJsonpMapper; 10 | import co.elastic.clients.transport.ElasticsearchTransport; 11 | import 
co.elastic.clients.transport.rest_client.RestClientTransport;
import jakarta.json.JsonString;
import org.apache.http.HttpHost;
import org.elasticsearch.client.RestClient;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.slf4j.LoggerFactory;
import org.testcontainers.containers.GenericContainer;
import org.testcontainers.containers.output.Slf4jLogConsumer;
import org.testcontainers.containers.wait.strategy.LogMessageWaitStrategy;
import org.testcontainers.images.builder.ImageFromDockerfile;

import java.io.IOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.file.Paths;
import java.util.List;
import java.util.Map;

import static org.assertj.core.api.Assertions.assertThat;

/**
 * Builds the image from the project's Dockerfile, starts it as a single node and runs the
 * opennlp ingest processor through the HTTP API of that node.
 */
@Tag("slow")
public class OpenNlpPluginIntegrationTests {

    private static String text = "Kobe Bryant was one of the best basketball players of all times. Not even Michael Jordan has ever scored 81 points in one game. Munich is really an awesome city, but New York is as well. Yesterday has been the hottest day of the year.";
    private static GenericContainer<?> container;
    private static RestClient restClient;
    private static ElasticsearchClient client;

    @BeforeAll
    public static void startContainer() {
        ImageFromDockerfile image = new ImageFromDockerfile().withDockerfile(Paths.get("./Dockerfile"));
        container = new GenericContainer<>(image);
        container.addEnv("discovery.type", "single-node");
        container.addEnv("xpack.security.enabled", "false");
        container.addExposedPorts(9200);
        // wait until the node logged that it has started
        container.setWaitStrategy(new LogMessageWaitStrategy().withRegEx(".*(\"message\":\\s?\"started[\\s?|\"].*|] started\n$)"));

        container.start();
        container.followOutput(new Slf4jLogConsumer(LoggerFactory.getLogger(OpenNlpPluginIntegrationTests.class)));

        // Create the low-level client
        restClient = RestClient.builder(new HttpHost("localhost", container.getMappedPort(9200))).build();
        ElasticsearchTransport transport = new RestClientTransport(restClient, new JacksonJsonpMapper());
        client = new ElasticsearchClient(transport);
    }

    @AfterAll
    public static void stopContainer() throws IOException {
        if (restClient != null) {
            restClient.close();
        }
        if (container != null) {
            container.close();
        }
    }

    @Test
    public void testNodesInfo() throws Exception {
        HttpRequest request = HttpRequest.newBuilder()
                .GET()
                .uri(URI.create(endpoint() + "/_nodes"))
                .build();

        // named httpClient, so the static Elasticsearch client field is not shadowed
        HttpClient httpClient = HttpClient.newHttpClient();
        HttpResponse<String> response = httpClient.send(request, HttpResponse.BodyHandlers.ofString());

        assertThat(response.body()).contains("\"name\":\"ingest-opennlp\"");
        assertThat(response.body()).contains("\"type\":\"opennlp\"");
    }

    @Test
    @SuppressWarnings("unchecked")
    public void testProcessor() throws Exception {
        putPipeline("""
                { "processors": [ { "opennlp" : { "field" : "field1" } } ] }
                """);

        Map<String, Object> source = indexAndGetSource();

        assertThat(source).containsKey("entities");
        Map<String, Object> entities = (Map<String, Object>) source.get("entities");
        assertThat(entities).containsOnlyKeys("dates", "locations", "names");
        List<String> dates = (List<String>) entities.get("dates");
        assertThat(dates).containsOnly("Yesterday");
        List<String> names = (List<String>) entities.get("names");
        assertThat(names).containsOnly("Kobe Bryant", "Michael Jordan");
        List<String> locations = (List<String>) entities.get("locations");
        assertThat(locations).containsOnly("Munich", "New York");
    }

    @Test
    @SuppressWarnings("unchecked")
    public void testProcessorSelectFields() throws Exception {
        putPipeline("""
                { "processors": [ { "opennlp" : { "field" : "field1", "fields": ["locations"] } } ] }
                """);

        Map<String, Object> source = indexAndGetSource();

        assertThat(source).containsKey("entities");
        Map<String, Object> entities = (Map<String, Object>) source.get("entities");
        assertThat(entities).containsOnlyKeys("locations");
        List<String> locations = (List<String>) entities.get("locations");
        assertThat(locations).containsOnly("Munich", "New York");
    }

    @Test
    public void testProcessorAnnotatedTextOutput() throws Exception {
        putPipeline("""
                { "processors": [ { "opennlp" : { "field" : "field1", "annotated_text_field" : "annotated_text" } } ] }
                """);

        Map<String, Object> source = indexAndGetSource();

        assertThat(source).containsKey("annotated_text");
        String expectedAnnotatedText = "[Kobe Bryant](Person_Kobe Bryant) was one of the best basketball players of all times. Not even [Michael Jordan](Person_Michael Jordan) has ever scored 81 points in one game. [Munich](Location_Munich) is really an awesome city, but [New York](Location_New York) is as well. [Yesterday](Date_Yesterday) has been the hottest day of the year.";
        assertThat(source.get("annotated_text").toString()).isEqualTo(expectedAnnotatedText);
    }

    @Test
    public void testSimulatePipeline() throws Exception {
        putPipeline("""
                { "processors": [ { "opennlp" : { "field" : "field1" } } ] }
                """);

        SimulateResponse response = client.ingest().simulate(b -> b
                .id("my_pipeline")
                .docs(Document.of(d -> d.source(JsonData.of(Map.of("field1", text)))))
        );

        assertThat(response.docs()).hasSize(1);
        PipelineSimulation simulation = response.docs().get(0);
        assertThat(simulation.doc().source()).hasEntrySatisfying("field1", jsonData -> {
            assertThat(jsonData.to(String.class)).isEqualTo(text);
        });

        assertThat(simulation.doc().source()).hasEntrySatisfying("entities", data -> {
            Map entities = data.to(Map.class);
            assertThat(entities).hasSize(3);
            assertThat(entities).containsOnlyKeys("dates", "names", "locations");
            assertThat(entities.get("dates")).asList().map(o -> ((JsonString) o).getString()).contains("Yesterday");
            assertThat(entities.get("names")).asList().map(o -> ((JsonString) o).getString()).contains("Kobe Bryant", "Michael Jordan");
            assertThat(entities.get("locations")).asList().map(o -> ((JsonString) o).getString()).contains("Munich", "New York");
        });
    }

    /** Returns the base URL of the HTTP port mapped for the container. */
    private static String endpoint() {
        return String.format("http://localhost:%s", container.getMappedPort(9200));
    }

    /**
     * Stores the given JSON body as pipeline <code>my_pipeline</code> and fails right away, if the
     * node did not accept it. Otherwise a rejected pipeline would only show up later on as a
     * misleading failure when indexing or asserting on the document.
     */
    private static void putPipeline(String body) throws IOException, InterruptedException {
        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create(endpoint() + "/_ingest/pipeline/my_pipeline"))
                .header("Content-Type", "application/json")
                .PUT(HttpRequest.BodyPublishers.ofString(body))
                .build();
        HttpResponse<String> response = HttpClient.newHttpClient().send(request, HttpResponse.BodyHandlers.ofString());
        assertThat(response.statusCode()).as("pipeline creation response: %s", response.body()).isEqualTo(200);
    }

    /**
     * Indexes the test text as document <code>1</code> of index <code>test</code> through
     * <code>my_pipeline</code> and returns the source of the stored document.
     */
    @SuppressWarnings({"unchecked", "rawtypes"})
    private static Map<String, Object> indexAndGetSource() throws IOException {
        client.index(b -> b.index("test")
                .id("1")
                .pipeline("my_pipeline")
                .document(Map.of("field1", text))
        );

        GetResponse<Map> response = client.get(b -> b.index("test").id("1"), Map.class);

        assertThat(response.found()).isTrue();
        return (Map<String, Object>) response.source();
    }
}

-------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Elasticsearch OpenNLP Ingest Processor 2 | 3 | **Note: Maintainer wanted. I cannot guarantee to keep maintaining this, so if you want to take over as a maintainer, please fork the repo, provide new packages per release and give me a ping to link to your repository!** 4 | 5 | I wrote an [opennlp mapping plugin](https://github.com/spinscale/elasticsearch-opennlp-plugin) a couple of years ago and people asked me why I did not update it. The main reason was that it was a bad architectural choice, as mentioned in the [opennlp plugin README](https://github.com/spinscale/elasticsearch-opennlp-plugin#elasticsearch-opennlp-plugin). With the introduction of ingest processors in Elasticsearch 5.0 this problem has been resolved. 6 | 7 | This processor does named/date/location/'whatever you have a model for' entity recognition and adds the output to the JSON document before it is stored. 
8 | 9 | This plugin is also intended to show you, that using gradle as a build system makes it very easy to reuse the testing facilities that elasticsearch already provides. First, you can run regular tests, but by adding a rest test, the plugin will be packaged and unzipped against elasticsearch, allowing you to execute a real end-to-end test, by just adding a java test class. 10 | 11 | ## Installation 12 | 13 | | ES | Command | 14 | | ----- | ------- | 15 | | 8.5.0 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/8.5.0.1/ingest-opennlp-8.5.0.1.zip` | 16 | | 8.4.3 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/8.4.3.1/ingest-opennlp-8.4.3.1.zip` | 17 | | 8.4.2 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/8.4.2.1/ingest-opennlp-8.4.2.1.zip` | 18 | | 8.4.1 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/8.4.1.1/ingest-opennlp-8.4.1.1.zip` | 19 | | 8.4.0 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/8.4.0.1/ingest-opennlp-8.4.0.1.zip` | 20 | | 8.3.3 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/8.3.3.1/ingest-opennlp-8.3.3.1.zip` | 21 | | 8.3.2 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/8.3.2.1/ingest-opennlp-8.3.2.1.zip` | 22 | | 8.3.1 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/8.3.1.1/ingest-opennlp-8.3.1.1.zip` | 23 | | 8.3.0 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/8.3.0.1/ingest-opennlp-8.3.0.1.zip` | 24 | | 8.2.3 | `bin/elasticsearch-plugin install 
https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/8.2.3.1/ingest-opennlp-8.2.3.1.zip` | 25 | | 8.2.2 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/8.2.2.1/ingest-opennlp-8.2.2.1.zip` | 26 | | 8.2.1 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/8.2.1.1/ingest-opennlp-8.2.1.1.zip` | 27 | | 8.2.0 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/8.2.0.1/ingest-opennlp-8.2.0.1.zip` | 28 | | 8.1.3 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/8.1.3.1/ingest-opennlp-8.1.3.1.zip` | 29 | | 8.1.2 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/8.1.2.1/ingest-opennlp-8.1.2.1.zip` | 30 | | 8.1.1 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/8.1.1.1/ingest-opennlp-8.1.1.1.zip` | 31 | | 8.1.0 | No release due to dependency issue in Elasticsearch | 32 | | 8.0.1 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/8.0.1.1/ingest-opennlp-8.0.1.1.zip` | 33 | | 8.0.0 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/8.0.0.1/ingest-opennlp-8.0.0.1.zip` | 34 | | 8.0.0-rc1 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/8.0.0-rc1.1/ingest-opennlp-8.0.0-rc1.1.zip` | 35 | | 8.0.0-beta1 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/8.0.0-beta1.1/ingest-opennlp-8.0.0-beta1.1.zip` | 36 | | 8.0.0-alpha2 | `bin/elasticsearch-plugin install 
https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/8.0.0-alpha2.1/ingest-opennlp-8.0.0-alpha2.1.zip` | 37 | | 7.17.6 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.17.6.1/ingest-opennlp-7.17.6.1.zip` | 38 | | 7.17.5 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.17.5.1/ingest-opennlp-7.17.5.1.zip` | 39 | | 7.17.4 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.17.4.1/ingest-opennlp-7.17.4.1.zip` | 40 | | 7.17.3 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.17.3.1/ingest-opennlp-7.17.3.1.zip` | 41 | | 7.17.2 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.17.2.1/ingest-opennlp-7.17.2.1.zip` | 42 | | 7.17.1 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.17.1.1/ingest-opennlp-7.17.1.1.zip` | 43 | | 7.17.0 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.17.0.1/ingest-opennlp-7.17.0.1.zip` | 44 | | 7.16.3 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.16.3.1/ingest-opennlp-7.16.3.1.zip` | 45 | | 7.16.2 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.16.2.1/ingest-opennlp-7.16.2.1.zip` | 46 | | 7.16.1 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.16.1.1/ingest-opennlp-7.16.1.1.zip` | 47 | | 7.16.0 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.16.0.1/ingest-opennlp-7.16.0.1.zip` | 48 | | 7.15.2 | `bin/elasticsearch-plugin 
install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.15.2.1/ingest-opennlp-7.15.2.1.zip` | 49 | | 7.15.1 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.15.1.1/ingest-opennlp-7.15.1.1.zip` | 50 | | 7.15.0 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.15.0.1/ingest-opennlp-7.15.0.1.zip` | 51 | | 7.14.2 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.14.2.1/ingest-opennlp-7.14.2.1.zip` | 52 | | 7.14.1 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.14.1.1/ingest-opennlp-7.14.1.1.zip` | 53 | | 7.14.0 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.14.0.1/ingest-opennlp-7.14.0.1.zip` | 54 | | 7.13.4 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.13.4.1/ingest-opennlp-7.13.4.1.zip` | 55 | | 7.13.3 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.13.3.1/ingest-opennlp-7.13.3.1.zip` | 56 | | 7.13.2 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.13.2.1/ingest-opennlp-7.13.2.1.zip` | 57 | | 7.13.1 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.13.1.1/ingest-opennlp-7.13.1.1.zip` | 58 | | 7.13.0 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.13.0.1/ingest-opennlp-7.13.0.1.zip` | 59 | | 7.12.1 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.12.1.1/ingest-opennlp-7.12.1.1.zip` | 60 | | 7.12.0 | `bin/elasticsearch-plugin install 
https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.12.0.1/ingest-opennlp-7.12.0.1.zip` | 61 | | 7.11.2 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.11.2.1/ingest-opennlp-7.11.2.1.zip` | 62 | | 7.11.1 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.11.1.1/ingest-opennlp-7.11.1.1.zip` | 63 | | 7.11.0 | No release due to issues with Elasticsearch dependencies | 64 | | 7.10.2 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.10.2.1/ingest-opennlp-7.10.2.1.zip` | 65 | | 7.10.1 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.10.1.1/ingest-opennlp-7.10.1.1.zip` | 66 | | 7.10.0 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.10.0.1/ingest-opennlp-7.10.0.1.zip` | 67 | | 7.9.3 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.9.3.1/ingest-opennlp-7.9.3.1.zip` | 68 | | 7.9.2 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.9.2.1/ingest-opennlp-7.9.2.1.zip` | 69 | | 7.9.1 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.9.1.1/ingest-opennlp-7.9.1.1.zip` | 70 | | 7.9.0 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.9.0.1/ingest-opennlp-7.9.0.1.zip` | 71 | | 7.8.1 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.8.1.1/ingest-opennlp-7.8.1.1.zip` | 72 | | 7.8.0 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.8.0.1/ingest-opennlp-7.8.0.1.zip` | 
73 | | 7.7.1 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.7.1.1/ingest-opennlp-7.7.1.1.zip` | 74 | | 7.7.0 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.7.0.1/ingest-opennlp-7.7.0.1.zip` | 75 | | 7.6.2 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.6.2.1/ingest-opennlp-7.6.2.1.zip` | 76 | | 7.6.1 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.6.1.1/ingest-opennlp-7.6.1.1.zip` | 77 | | 7.6.0 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.6.0.1/ingest-opennlp-7.6.0.1.zip` | 78 | | 7.5.2 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.5.2.1/ingest-opennlp-7.5.2.1.zip` | 79 | | 7.5.1 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.5.1.1/ingest-opennlp-7.5.1.1.zip` | 80 | | 7.5.0 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.5.0.1/ingest-opennlp-7.5.0.1.zip` | 81 | | 7.4.2 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.4.2.1/ingest-opennlp-7.4.2.1.zip` | 82 | | 7.4.1 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.4.1.1/ingest-opennlp-7.4.1.1.zip` | 83 | | 7.4.0 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.4.0.1/ingest-opennlp-7.4.0.1.zip` | 84 | | 7.3.2 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.3.2.1/ingest-opennlp-7.3.2.1.zip` | 85 | | 7.3.1 | `bin/elasticsearch-plugin 
install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.3.1.1/ingest-opennlp-7.3.1.1.zip` | 86 | | 7.3.0 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.3.0.1/ingest-opennlp-7.3.0.1.zip` | 87 | | 7.2.1 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.2.1.1/ingest-opennlp-7.2.1.1.zip` | 88 | | 7.2.0 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.2.0.1/ingest-opennlp-7.2.0.1.zip` | 89 | | 7.1.1 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.1.1.1/ingest-opennlp-7.1.1.1.zip` | 90 | | 7.1.0 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.1.0.1/ingest-opennlp-7.1.0.1.zip` | 91 | | 7.0.1 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.0.1.1/ingest-opennlp-7.0.1.1.zip` | 92 | | 7.0.0 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/7.0.0.1/ingest-opennlp-7.0.0.1.zip` | 93 | | 6.8.23| `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.8.23.1/ingest-opennlp-6.8.23.1.zip` | 94 | | 6.8.22| `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.8.22.1/ingest-opennlp-6.8.22.1.zip` | 95 | | 6.8.21| `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.8.21.1/ingest-opennlp-6.8.21.1.zip` | 96 | | 6.8.20| `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.8.20.1/ingest-opennlp-6.8.20.1.zip` | 97 | | 6.8.19| `bin/elasticsearch-plugin install 
https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.8.19.1/ingest-opennlp-6.8.19.1.zip` | 98 | | 6.8.18| `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.8.18.1/ingest-opennlp-6.8.18.1.zip` | 99 | | 6.8.17| `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.8.17.1/ingest-opennlp-6.8.17.1.zip` | 100 | | 6.8.16| `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.8.16.1/ingest-opennlp-6.8.16.1.zip` | 101 | | 6.8.15| `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.8.15.1/ingest-opennlp-6.8.15.1.zip` | 102 | | 6.8.14| `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.8.14.1/ingest-opennlp-6.8.14.1.zip` | 103 | | 6.8.13| `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.8.13.1/ingest-opennlp-6.8.13.1.zip` | 104 | | 6.8.12| `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.8.12.1/ingest-opennlp-6.8.12.1.zip` | 105 | | 6.8.11| `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.8.11.1/ingest-opennlp-6.8.11.1.zip` | 106 | | 6.8.10| `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.8.10.1/ingest-opennlp-6.8.10.1.zip` | 107 | | 6.8.9 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.8.9.1/ingest-opennlp-6.8.9.1.zip` | 108 | | 6.8.8 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.8.8.1/ingest-opennlp-6.8.8.1.zip` | 109 | | 6.8.7 | `bin/elasticsearch-plugin install 
https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.8.7.1/ingest-opennlp-6.8.7.1.zip` | 110 | | 6.8.6 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.8.6.1/ingest-opennlp-6.8.6.1.zip` | 111 | | 6.8.5 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.8.5.1/ingest-opennlp-6.8.5.1.zip` | 112 | | 6.8.4 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.8.4.1/ingest-opennlp-6.8.4.1.zip` | 113 | | 6.8.3 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.8.3.1/ingest-opennlp-6.8.3.1.zip` | 114 | | 6.8.2 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.8.2.1/ingest-opennlp-6.8.2.1.zip` | 115 | | 6.8.1 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.8.1.1/ingest-opennlp-6.8.1.1.zip` | 116 | | 6.8.0 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.8.0.1/ingest-opennlp-6.8.0.1.zip` | 117 | | 6.7.2 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.7.2.1/ingest-opennlp-6.7.2.1.zip` | 118 | | 6.7.1 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.7.1.1/ingest-opennlp-6.7.1.1.zip` | 119 | | 6.7.0 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.7.0.1/ingest-opennlp-6.7.0.1.zip` | 120 | | 6.6.2 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.6.2.1/ingest-opennlp-6.6.2.1.zip` | 121 | | 6.6.1 | `bin/elasticsearch-plugin install 
https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.6.1.1/ingest-opennlp-6.6.1.1.zip` | 122 | | 6.6.0 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.6.0.1/ingest-opennlp-6.6.0.1.zip` | 123 | | 6.5.4 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.5.4.1/ingest-opennlp-6.5.4.1.zip` | 124 | | 6.5.3 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.5.3.1/ingest-opennlp-6.5.3.1.zip` | 125 | | 6.5.2 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.5.2.1/ingest-opennlp-6.5.2.1.zip` | 126 | | 6.5.1 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.5.1.1/ingest-opennlp-6.5.1.1.zip` | 127 | | 6.5.0 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.5.0.1/ingest-opennlp-6.5.0.1.zip` | 128 | | 6.4.3 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.4.3.1/ingest-opennlp-6.4.3.1.zip` | 129 | | 6.4.2 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.4.2.1/ingest-opennlp-6.4.2.1.zip` | 130 | | 6.4.1 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.4.1.1/ingest-opennlp-6.4.1.1.zip` | 131 | | 6.4.0 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.4.0.1/ingest-opennlp-6.4.0.1.zip` | 132 | | 6.3.2 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.3.2.1/ingest-opennlp-6.3.2.1.zip` | 133 | | 6.3.1 | `bin/elasticsearch-plugin install 
https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.3.1.1/ingest-opennlp-6.3.1.1.zip` | 134 | | 6.3.0 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.3.0.1/ingest-opennlp-6.3.0.1.zip` | 135 | | 6.2.4 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.2.4.1/ingest-opennlp-6.2.4.1.zip` | 136 | | 6.2.3 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.2.3.1/ingest-opennlp-6.2.3.1.zip` | 137 | | 6.2.2 | `bin/elasticsearch-plugin install https://github.com/spinscale/elasticsearch-ingest-opennlp/releases/download/6.2.2.1/ingest-opennlp-6.2.2.1.zip` | 138 | | 5.2.0 | `bin/elasticsearch-plugin install https://oss.sonatype.org/content/repositories/releases/de/spinscale/elasticsearch/plugin/ingest/ingest-opennlp/5.2.0.1/ingest-opennlp-5.2.0.1.zip` | 139 | | 5.1.2 | `bin/elasticsearch-plugin install https://oss.sonatype.org/content/repositories/releases/de/spinscale/elasticsearch/plugin/ingest/ingest-opennlp/5.1.2.1/ingest-opennlp-5.1.2.1.zip` | 140 | | 5.1.1 | `bin/elasticsearch-plugin install https://oss.sonatype.org/content/repositories/releases/de/spinscale/elasticsearch/plugin/ingest/ingest-opennlp/5.1.1.1/ingest-opennlp-5.1.1.1.zip` | 141 | 142 | **IMPORTANT**: If you are running this plugin with Elasticsearch 6.5.2 or 143 | newer, you need to download the NER models from sourceforge after 144 | installation. 
145 | 146 | To download the models, run the following under Linux and macOS (the script is located in 147 | the `bin` directory of your Elasticsearch installation): 148 | 149 | ``` 150 | bin/ingest-opennlp/download-models 151 | ``` 152 | 153 | If you are using Windows, please use the following command: 154 | 155 | ``` 156 | bin\ingest-opennlp\download-models.bat 157 | ``` 158 | 159 | 160 | ## Usage 161 | 162 | This is how you configure a pipeline with support for opennlp. 163 | 164 | You can add the following lines to the `config/elasticsearch.yml` (as those models are shipped by default, they are easy to enable). The models are looked up in the `config/ingest-opennlp/` directory. 165 | 166 | ``` 167 | ingest.opennlp.model.file.persons: en-ner-persons.bin 168 | ingest.opennlp.model.file.dates: en-ner-dates.bin 169 | ingest.opennlp.model.file.locations: en-ner-locations.bin 170 | ``` 171 | 172 | Now fire up Elasticsearch and configure a pipeline: 173 | 174 | ``` 175 | PUT _ingest/pipeline/opennlp-pipeline 176 | { 177 | "description": "A pipeline to do named entity extraction", 178 | "processors": [ 179 | { 180 | "opennlp" : { 181 | "field" : "my_field" 182 | } 183 | } 184 | ] 185 | } 186 | 187 | PUT /my-index/_doc/1?pipeline=opennlp-pipeline 188 | { 189 | "my_field" : "Kobe Bryant was one of the best basketball players of all times. Not even Michael Jordan has ever scored 81 points in one game. Munich is really an awesome city, but New York is as well. Yesterday has been the hottest day of the year." 190 | } 191 | 192 | # response will contain an entities field with locations, dates and persons 193 | GET /my-index/_doc/1 194 | ``` 195 | 196 | You can also specify only certain named entities in the processor, i.e. 
if you only want to extract persons 197 | 198 | 199 | ``` 200 | PUT _ingest/pipeline/opennlp-pipeline 201 | { 202 | "description": "A pipeline to do named entity extraction", 203 | "processors": [ 204 | { 205 | "opennlp" : { 206 | "field" : "my_field", 207 | "fields" : [ "persons" ] 208 | } 209 | } 210 | ] 211 | } 212 | ``` 213 | 214 | You can also emit text in the format used by the [annotated text plugin](https://www.elastic.co/guide/en/elasticsearch/plugins/current/mapper-annotated-text.html). 215 | 216 | ``` 217 | PUT _ingest/pipeline/opennlp-pipeline 218 | { 219 | "description": "A pipeline to do named entity extraction", 220 | "processors": [ 221 | { 222 | "opennlp" : { 223 | "field" : "my_field", 224 | "annotated_text_field" : "my_annotated_text_field" 225 | } 226 | } 227 | ] 228 | } 229 | ``` 230 | 231 | **Note: The creation of annotated text field syntax is only supported when running on Elasticsearch 7.0.1 onwards** 232 | 233 | 234 | ## Configuration 235 | 236 | You can configure your own models per field; the setting for this is prefixed with `ingest.opennlp.model.file.`. So you can configure any model with any field name by specifying a name and a path to the model file, like the three examples below: 237 | 238 | | Parameter | Use | 239 | | --- | --- | 240 | | ingest.opennlp.model.file.names | Configure the file for named entity recognition for the field `names` | 241 | | ingest.opennlp.model.file.dates | Configure the file for date entity recognition for the field `dates` | 242 | | ingest.opennlp.model.file.persons | Configure the file for person entity recognition for the field `persons` | 243 | | ingest.opennlp.model.file.WHATEVER | Configure the file for WHATEVER entity recognition for the field WHATEVER | 244 | 245 | ## Development setup & running tests 246 | 247 | In order to install this plugin, you need to create a zip distribution first by running 248 | 249 | ```bash 250 | ./gradlew clean check 251 | ``` 252 | 253 | This will produce a zip file in `build/distributions`. 
As part of the build, the models are packaged into the zip file, but they need to be downloaded beforehand. There is a special task in the `build.gradle` which downloads the models in case they don't exist. 254 | 255 | After building the zip file, you can install it like this 256 | 257 | ```bash 258 | bin/elasticsearch-plugin install file:///path/to/elasticsearch-ingest-opennlp/build/distributions/ingest-opennlp-X.Y.Z-SNAPSHOT.zip 259 | ``` 260 | 261 | Ensure that you have the models downloaded before testing. 262 | 263 | ## Bugs & TODO 264 | 265 | * A couple of groovy build mechanisms from core are disabled. See the `build.gradle` for further explanations 266 | * Only the most basic NLP functions are exposed, please fork and add your own code to this! 267 | 268 | --------------------------------------------------------------------------------