├── .gitignore ├── .travis.yml ├── CONTRIBUTING.md ├── LICENSE.txt ├── README.md ├── docker ├── .dockerignore ├── .gitignore ├── README.md ├── pom.xml ├── test.sh └── verify.sh ├── docs └── Rosette-API-Plugin-for-Elasticsearch-Doc-Enrichment.md ├── plugin ├── pom.xml └── src │ ├── main │ ├── assemblies │ │ └── plugin.xml │ ├── java │ │ └── com │ │ │ └── rosette │ │ │ └── elasticsearch │ │ │ ├── CategoriesProcessor.java │ │ │ ├── EntitiesProcessor.java │ │ │ ├── LanguageProcessor.java │ │ │ ├── NameTranslationProcessor.java │ │ │ ├── RosetteAbstractProcessor.java │ │ │ ├── RosetteApiWrapper.java │ │ │ ├── RosetteTextAnalysisPlugin.java │ │ │ └── SentimentProcessor.java │ └── resources │ │ ├── plugin-descriptor.properties │ │ └── plugin-security.policy │ └── test │ ├── java │ └── com │ │ └── rosette │ │ └── elasticsearch │ │ ├── CategoriesProcessorTest.java │ │ ├── EntitiesProcessorTest.java │ │ ├── LanguageProcessorTest.java │ │ ├── MockRosetteInitialization.java │ │ ├── NameTranslationProcessorTest.java │ │ ├── RosetteAbstractProcessorTest.java │ │ ├── RosetteTextAnalysisPluginIT.java │ │ └── SentimentProcessorTest.java │ └── resources │ ├── elasticsearch.version │ ├── it_processors │ ├── all.json │ ├── categories.json │ ├── entities.json │ ├── entities_sentiment.json │ ├── language.json │ ├── sentiment.json │ ├── translate_eng.json │ └── translate_rus.json │ └── mock_responses │ ├── categories_response.json │ ├── entities_response.json │ ├── language_response.json │ ├── name-translation_response.json │ ├── sentiment_adm_response.json │ └── sentiment_response.json ├── pom.xml └── tools ├── README.md └── release.sh /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | **/.DS_Store 3 | **/.idea/* 4 | *.iml 5 | target/ 6 | pom.xml.tag 7 | pom.xml.releaseBackup 8 | pom.xml.next 9 | release.properties 10 | classpath 11 | .checkstyle 12 | .ruleset 13 | .pmd 14 | .project 15 | .settings 16 | build.log 17 | 
dependency-reduced-pom.xml 18 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | 3 | script: mvn -B install 4 | 5 | jdk: 6 | - openjdk8 7 | - openjdk11 8 | 9 | services: 10 | - docker 11 | 12 | sudo: required 13 | 14 | notifications: 15 | slack: 16 | on_failure: always 17 | on_success: always 18 | secure: YUeHcLIxXFLOokSc8ljuRWfA8q02+1Us8dalaRLWatJXqnGTGasbPc7PPeg0/hesjJA6j4KTohbtLCjt0Eto+wR5ENG5Yeyy7tP3FlWPA2AQJ4Xl71zqrKO9q8C/FDw6dKJxXeEbACN7HFvBETrQ6643bNbz559FQA2iQOLi6fk4fMJs04GQBGj1zosvCVJTafzRuReYau5kfyBGqnVBDTnmoNYirgMFAyeg8mTM5NAUWPfDdl4r6DzdXKIGV05wgLzDOdDwhJmBcrFAKss5xv49aIbPTzCyQbOqNgOBUy1sXjNI8RMRgkUtFnU+FRQXFx5jsCJwvwdWBHyFjxKaJvz14ZYqKb8i4GRLdUj95FbWKvJnmexaoCkA9MbyqO6SLtviqhTl8oZjLUdfMX7htJgJtjM0u1A+ZbaHH7NsTNVtfdlCYTQ4M1ZZHy/cTPQ08OIe62nrbcQiCWuOTxukhZnRO5pctmI9BZGVhGNSxssDlM5vFMacQMXBEHGKq8PqtyLP7bzJriX1rSUFXwsCp5SYqozvcrvsy7wOK5mHhiUDQmadXNQXb3evha7uJ+u402AlkCObLULLoAs6yRZzL1g9C8hNRp8gplj7+ni9rvGtr4vZi7Gp/HnENhqUcP7/LbY6RKH9npFm6QlYoaLDOcFC6Ky4JnK749NtHqP3+bY= 19 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | For those who wish to contribute to this project, here are some guidelines. 4 | 5 | Fork, then clone the repo: 6 | 7 | git clone git@github.com:your-username/rosette-elasticsearch-plugin.git 8 | 9 | Set up your environment: 10 | 11 | - Java 8 12 | - Maven 13 | - Get a [Rosette API key](https://developer.rosette.com/signup) if you don't have one 14 | - export ROSETTE_API_KEY="\" 15 | 16 | Make sure you can successfully build and tests pass: 17 | 18 | mvn clean install 19 | 20 | Make your change. Add tests for your change. 
Make sure everything still passes: 21 | 22 | mvn clean install 23 | 24 | Push to your fork and [submit a pull request][pr]. 25 | 26 | [pr]: https://github.com/rosette-api/rosette-elasticsearch-plugin/compare/ 27 | 28 | We'll take a look and may suggest some changes or improvements or alternatives. 29 | 30 | To increase the chance that your pull request is accepted, make sure to write tests, clean commented code, and [good commit messages][commit]. 31 | 32 | [commit]: http://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html 33 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | --- 4 | 5 | [![Build Status](https://travis-ci.org/rosette-api/rosette-elasticsearch-plugin.svg?branch=master)](https://travis-ci.org/rosette-api/rosette-elasticsearch-plugin) 6 | [![Maven Central](https://img.shields.io/maven-central/v/com.rosette.elasticsearch/rosette-elasticsearch-plugin?color=blue)](https://mvnrepository.com/artifact/com.rosette.elasticsearch/rosette-elasticsearch-plugin) 7 | 8 | # Rosette Plugin for Elasticsearch 9 | 10 | A Document Enrichment plugin that brings the Rosette API to Elasticsearch. 11 | 12 | This ingest plugin allows Elasticsearch users to perform Language Identification, Sentiment Analysis, Entity Extraction, 13 | Categorization, and Name Translation on documents as they're indexed. 
14 | 15 | ## Rosette API 16 | The Rosette Text Analytics Platform uses natural language processing, statistical modeling, and machine learning to 17 | analyze unstructured and semi-structured text across 364 language-encoding-script combinations, revealing valuable 18 | information and actionable data. Rosette provides endpoints for extracting entities and relationships, translating and 19 | comparing the similarity of names, categorizing and adding linguistic tags to text and more. 20 | 21 | ## Rosette API Access 22 | - Rosette Cloud [Sign Up](https://developer.rosette.com/signup) 23 | - Rosette Enterprise [Evaluation](https://www.rosette.com/product-eval/) 24 | 25 | ## Quick Start 26 | 27 | ## How to Install 28 | There are two common ways to install the plugin into Elasticsearch. (Make sure the version of the plugin matches the version of Elasticsearch you are using!) 29 | 30 | - Download the desired version of the plugin from the Releases tab on GitHub 31 | - Install using: `bin/elasticsearch-plugin install file:///<path-to-plugin-zip>` 32 | - Install from a deployed Maven artifact: 33 | `bin/elasticsearch-plugin install com.rosette.elasticsearch:rosette-elasticsearch-plugin:<version>` 34 | 35 | 36 | #### Note on Versioning: 37 | The plugin uses semantic versioning. The first three numbers describe which version of Elasticsearch this version of the plugin is compatible with, and the last number indicates the version of the plugin within that Elasticsearch version. 38 | 39 | For instance, `5.3.0.1` is the second patch version of the plugin for Elasticsearch 5.3.0. 40 | 41 | ## How to Build 42 | Building the plugin requires a Rosette API key. If you don’t already have a Rosette API developer account, head over to [developer.rosette.com](https://developer.rosette.com/signup) to get your free Rosette API key. 43 | 44 | Place the key in the ROSETTE_API_KEY environment variable (i.e.
`export ROSETTE_API_KEY=<your-api-key>`) 45 | 46 | Then run `mvn clean install` 47 | 48 | The plugin zip can then be found in `plugin/target/releases/` ready to be installed into the appropriate version of Elasticsearch. 49 | 50 | You can also [Test with Docker](docker/README.md) 51 | 52 | #### Documentation & Support 53 | - [Full Plugin Documentation](docs/Rosette-API-Plugin-for-Elasticsearch-Doc-Enrichment.md) 54 | - [Rosette Platform API](https://developer.rosette.com/features-and-functions) 55 | - [Rosette Platform Release Notes](https://support.rosette.com/hc/en-us/articles/360018354971-Release-Notes) 56 | - [Support](https://support.rosette.com) 57 | - [Plugin License: Apache 2.0](LICENSE.txt) 58 | -------------------------------------------------------------------------------- /docker/.dockerignore: -------------------------------------------------------------------------------- 1 | .gitignore 2 | -------------------------------------------------------------------------------- /docker/.gitignore: -------------------------------------------------------------------------------- 1 | Dockerfile 2 | plugins 3 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | Testing the plugin 2 | ================== 3 | 4 | Run `mvn clean install` from the top level directory first, then: 5 | 6 | ``` 7 | mvn docker:build 8 | ROSETTE_API_KEY=<your-api-key> mvn docker:run 9 | 10 | 11 | 13 | 14 | ./test.sh <host> <port> 15 | ``` 16 | ================== 17 | 18 | There is a known issue where the docker container logs an error message for unknown reasons: 19 | ``` 20 | [ERROR] DOCKER> Cannot process chunk response: java.io.IOException: Bad file descriptor 21 | ``` 22 | This can be ignored.
23 | -------------------------------------------------------------------------------- /docker/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 17 | 18 | 4.0.0 19 | com.rosette.elasticsearch 20 | rosette-elasticsearch-docker 21 | pom 22 | 23 | rosette-elasticsearch-parent 24 | com.rosette.elasticsearch 25 | 7.17.0.1-SNAPSHOT 26 | .. 27 | 28 | 29 | 30 | com.rosette.elasticsearch 31 | rosette-elasticsearch-plugin 32 | ${project.version} 33 | zip 34 | 35 | 36 | 37 | ${project.build.directory}/curl-output.txt 38 | 0.36.0 39 | 40 | 41 | verify 42 | 43 | 44 | io.fabric8 45 | docker-maven-plugin 46 | ${docker-maven-plugin.version} 47 | 48 | true 49 | true 50 | 51 | 52 | docker-elasticsearch 53 | basistechnologycorporation/rosette-elasticsearch 54 | 55 | docker.elastic.co/elasticsearch/elasticsearch:${elasticsearch.version} 56 | 57 | find /plugins -name "*.zip" -exec /usr/share/elasticsearch/bin/elasticsearch-plugin install --batch file://{} \; 58 | 59 | 60 | 61 | /plugins 62 | 63 | copy-plugin 64 | 65 | 66 | 67 | *:zip 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | http://${host.ip}:${docker.port} 78 | GET 79 | 200..399 80 | 81 | 82 | 83 | 84 | ${docker.port}:9200 85 | 86 | 87 | ${env.ROSETTE_API_KEY} 88 | _local_ 89 | 0.0.0.0 90 | false 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | start 99 | pre-integration-test 100 | 101 | build 102 | start 103 | 104 | 105 | 106 | stop 107 | post-integration-test 108 | 109 | stop 110 | 111 | 112 | 113 | 114 | 115 | org.codehaus.mojo 116 | exec-maven-plugin 117 | 118 | 119 | run_IT 120 | 121 | exec 122 | 123 | integration-test 124 | 125 | ./test.sh 126 | 127 | ${host.ip} 128 | ${docker.port} 129 | 130 | ${curl.output} 131 | 132 | 133 | 134 | verify_IT_success 135 | 136 | exec 137 | 138 | verify 139 | 140 | ./verify.sh 141 | 142 | ${curl.output} 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | set-url-localhost 153 | 154 | 155 | !env.DOCKER_HOST 156 | 157 | 158 | 159 | 160 | 161 | 
org.codehaus.mojo 162 | build-helper-maven-plugin 163 | ${build-helper-maven-plugin.version} 164 | 165 | 166 | get-local-ip 167 | initialize 168 | 169 | local-ip 170 | 171 | 172 | host.ip 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | set-url-from-docker-host 182 | 183 | 184 | env.DOCKER_HOST 185 | 186 | 187 | 188 | 189 | 190 | org.codehaus.mojo 191 | build-helper-maven-plugin 192 | ${build-helper-maven-plugin.version} 193 | 194 | 195 | validate 196 | regex-property 197 | 198 | regex-property 199 | 200 | 201 | host.ip 202 | ${env.DOCKER_HOST} 203 | ^tcp://(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3}):\d{1,5}$ 204 | $1.$2.$3.$4 205 | true 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | -------------------------------------------------------------------------------- /docker/test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Usage: test.sh <host> <port> -- every request below is sent to "$1:$2/...". 3 | code=0 4 | command_counter=0 5 | set -x 6 | 7 | # Each request ends with '|| ((code++))' so that a failing curl only bumps the 8 | # failure counter; the script keeps going, so the maven build continues and stops 9 | # the docker image. The final count is echoed as "exit: N" for later verification. 10 | 11 | curl -fsSL -H 'Content-Type: application/json' -XPUT "$1:$2/_ingest/pipeline/my_pipeline" -d' 12 | { 13 | "processors": [ 14 | { 15 | "ros_language" : { 16 | "field" : "text", 17 | "target_field" : "language" 18 | } 19 | } 20 | ] 21 | } 22 | ' || ((code++)) 23 | echo 24 | echo "After command ${command_counter} our error code count is ${code}." 25 | command_counter=$((command_counter + 1)) 26 | 27 | curl -fsSL -H 'Content-Type: application/json' -XPOST "$1:$2/indexname/mappingName?pipeline=my_pipeline&pretty" -d' 28 | { 29 | "text" : "This is a document containing English text" 30 | } 31 | ' || ((code++)) 32 | echo "After command ${command_counter} our error code count is ${code}."
33 | command_counter=$((command_counter + 1)) 34 | 35 | curl -fsSL -H "Content-Type: application/json" -XPUT "$1:$2/_ingest/pipeline/rosapi?pretty" -d' 36 | { 37 | "processors": [ 38 | { 39 | "ros_language" : { 40 | "field" : "text", 41 | "target_field" : "language" 42 | } 43 | }, 44 | { 45 | "ros_categories" : { 46 | "field" : "text", 47 | "target_field" : "category" 48 | } 49 | }, 50 | { 51 | "ros_sentiment" : { 52 | "field" : "text", 53 | "target_field" : "sentiment" 54 | } 55 | }, 56 | { 57 | "ros_entities" : { 58 | "field" : "text", 59 | "target_field" : "entities_sentiment", 60 | "include_sentiment" : true, 61 | "include_offsets" : true, 62 | "include_translation" : true, 63 | "translation_language" : "eng" 64 | } 65 | }, 66 | { 67 | "ros_entities" : { 68 | "field" : "text", 69 | "target_field" : "entities", 70 | "include_sentiment" : false, 71 | "include_offsets" : false, 72 | "include_translation" : false, 73 | "translation_language" : "eng" 74 | } 75 | }, 76 | { 77 | "ros_name_translation" : { 78 | "field" : "name", 79 | "target_field" : "translation", 80 | "target_language" : "rus" 81 | } 82 | } 83 | ] 84 | } 85 | ' || ((code++)) 86 | echo "After command ${command_counter} our error code count is ${code}."
87 | command_counter=$((command_counter + 1)) 88 | 89 | # Same processors as the rosapi pipeline above minus ros_categories, since categorization only supports English 90 | curl -fsSL -H "Content-Type: application/json" -XPUT "$1:$2/_ingest/pipeline/rosapi_jpn?pretty" -d' 91 | { 92 | "processors": [ 93 | { 94 | "ros_language" : { 95 | "field" : "text", 96 | "target_field" : "language" 97 | } 98 | }, 99 | { 100 | "ros_sentiment" : { 101 | "field" : "text", 102 | "target_field" : "sentiment" 103 | } 104 | }, 105 | { 106 | "ros_entities" : { 107 | "field" : "text", 108 | "target_field" : "entities_sentiment", 109 | "include_sentiment" : true, 110 | "include_offsets" : true, 111 | "include_translation" : true, 112 | "translation_language" : "eng" 113 | } 114 | }, 115 | { 116 | "ros_entities" : { 117 | "field" : "text", 118 | "target_field" : "entities", 119 | "include_sentiment" : false, 120 | "include_offsets" : false, 121 | "include_translation" : false, 122 | "translation_language" : "eng" 123 | } 124 | }, 125 | { 126 | "ros_name_translation" : { 127 | "field" : "name", 128 | "target_field" : "translation", 129 | "target_language" : "rus" 130 | } 131 | } 132 | ] 133 | } 134 | ' || ((code++)) 135 | echo "After command ${command_counter} our error code count is ${code}." 136 | command_counter=$((command_counter + 1)) 137 | 138 | curl -fsSL -H "Content-Type: application/json" -XPUT "$1:$2/test_idx?include_type_name=true&pretty" -d' 139 | { 140 | "mappings": { 141 | "rosette": { 142 | "properties": { 143 | "text" : { "type" : "text" }, 144 | "name" : { "type" : "text" }, 145 | "language" : { "type" : "keyword" }, 146 | "category" : { "type" : "keyword" }, 147 | "sentiment" : { "type" : "keyword" }, 148 | "entities" : { "type" : "nested" }, 149 | "translation" : { "type" : "text" } 150 | } 151 | } 152 | } 153 | } 154 | ' || ((code++)) 155 | echo "After command ${command_counter} our error code count is ${code}."
156 | command_counter=$((command_counter + 1)) 157 | 158 | curl -fsSL -H "Content-Type: application/json" -XPUT "$1:$2/test_idx/rosette/1?pretty&refresh=true&pipeline=rosapi" -d' 159 | { 160 | "text": "Original Ghostbuster Dan Aykroyd, who also co-wrote the 1984 Ghostbusters film, couldn’t be more pleased with the new all-female Ghostbusters cast, telling The Hollywood Reporter, “The Aykroyd family is delighted by this inheritance of the Ghostbusters torch by these most magnificent women in comedy.”" 161 | } 162 | ' || ((code++)) 163 | echo "After command ${command_counter} our error code count is ${code}." 164 | command_counter=$((command_counter + 1)) 165 | 166 | curl -fsSL -H "Content-Type: application/json" -XPUT "$1:$2/test_idx/rosette/2?pretty&refresh=true&pipeline=rosapi_jpn" -d' 167 | { 168 | "text": "バングラデシュ政府、ロヒンギャ難民の島への移動を計画
\nバングラデシュ政府、ロヒンギャ難民の島への移動を計画\n\nテンガール・チャール島は約10年前に、メグナ川の堆積土で形成され、高潮の際には数十センチの水に囲まれてしまう。道路や堤防などは築かれておらず、島を記載する地図はあまりない。\n\n約30キロ西には60万人が住むハティア島があり、現在の難民キャンプからの移動には9時間かかる。\n\nある地元政府関係者はAFP通信に対し、テンガール・チャール島について、「島に行けるのは冬のみで、海賊たちの隠れ家になっている」と語った。島を洪水から守るため植樹が行われているが、完了するまでには少なくとも10年がかかるという。同関係者は、「モンスーンの季節には完全に水浸しになってしまう」と話し、「あそこに住まわせるというのは、ひどいアイデアだ」と指摘した。\n\nImage caption 移住が計画されているテンガール・チャール島はハティア(Hatiya)島の近くにある\n\nミャンマーでは、ロヒンギャの人々は国境を接するバングラデシュからの不法移民として扱われており、国籍の取得ができずにいる。\n\n" 169 | } 170 | ' || ((code++)) 171 | echo "After command ${command_counter} our error code count is ${code}." 172 | command_counter=$((command_counter + 1)) 173 | 174 | curl -fsSL -H "Content-Type: application/json" -XPUT "$1:$2/test_idx/rosette/3?pretty&refresh=true&pipeline=rosapi" -d' 175 | { 176 | "text" : "Vladimir Vladimirovich Nabokov was a Russian-American novelist and entomologist. His first nine novels were in Russian, and he achieved international prominence after he began writing English prose.", 177 | "name" : "Vladimir Nabokov" 178 | } 179 | ' || ((code++)) 180 | echo "After command ${command_counter} our error code count is ${code}." 181 | command_counter=$((command_counter + 1)) 182 | 183 | sleep 3 184 | 185 | curl -fsSL -H "Content-Type: application/json" -XPOST "$1:$2/test_idx/_search?pretty" -d' 186 | { 187 | "query": { 188 | "constant_score" : { 189 | "filter" : { 190 | "exists" : {"field" : "language"} 191 | } 192 | } 193 | } 194 | } 195 | ' || ((code++)) 196 | echo "After command ${command_counter} our error code count is ${code}." 197 | 198 | set +x 199 | # verify.sh decides whether this script succeeded by reading the captured 200 | # output file and checking that its last line is "exit: 0". That check has 201 | # failed sporadically; the suspected (unconfirmed) cause is output 202 | # buffering, so the brief sleep that follows is a best-effort attempt to 203 | # make the final "exit: N" line reliably the last line of the output.
204 | sleep 2 205 | echo "exit: $code" 206 | -------------------------------------------------------------------------------- /docker/verify.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Usage: verify.sh <output-file>. Exits 1 unless the file's last line is "exit: 0" and it contains no exception or nonzero "failed" count. 3 | if [[ "$(tail -n 1 "$1" 2>/dev/null)" != "exit: 0" ]] ; then 4 | echo "There are test failures. Exiting..." 5 | cat "$1" 6 | exit 1 7 | fi 8 | # Responses requested with ?pretty are printed as "failed" : N, so allow whitespace around the colon. 9 | if grep -E -i 'exception|"failed"[[:space:]]*:[[:space:]]*[1-9]' "$1" ; then 10 | echo "Test query failed! See $1 for details. Exiting..." 11 | cat "$1" 12 | exit 1 13 | fi 14 | -------------------------------------------------------------------------------- /docs/Rosette-API-Plugin-for-Elasticsearch-Doc-Enrichment.md: -------------------------------------------------------------------------------- 1 | # Rosette API Plugin for Elasticsearch for Document Enrichment 2 | 3 | ## Overview 4 | Basis Technology has written a plugin for Elasticsearch as a means of calling the Rosette API endpoints at indexing time, to annotate unstructured textual fields in a document with text analytic results in separate “metadata” fields. This document enrichment from Rosette allows refinement of search results through these Rosette functions: 5 | - **Language identification** - tag the language of each document 6 | - **Sentiment analysis** - tag the sentiment of each document or sentiment surrounding each entity (person, location, organization, etc.) 7 | - **Categorization** - tag each document with its primary topic (sports, home/garden, politics, etc.) 8 | - **Entity extraction and linking** - find the key entities in each document for faceted searching and link entities to Wikidata entries 9 | - **Name translation** - Translate names into English from 11 languages such as Arabic, Chinese, Korean, Japanese, and Russian. 10 | 11 | Note: There are two other Rosette plugins for Elasticsearch which offer these NLP functions through the Rosette SDK.
12 | - *Identity Resolution Plugin:* Fuzzy name matching (across 11 types of variations and across languages and scripts) 13 | - *Multilingual Search Enablement Plugin:* Text pre-processing (lemmatization, tokenization, noun decompounding, etc.) to enable search in 40+ languages, while enhancing precision and recall 14 | See [Rosette’s Elasticsearch Plugins](https://www.rosette.com/elastic/) or contact info@rosette.com for more information. 15 | 16 | ## Quick Start Guide 17 | ### Overview 18 | Rosette functionality is called through an ingest node of Elasticsearch that pre-processes documents before indexing takes place. You define a pipeline that specifies the series of processors that transforms or enriches the document. See the [Ingest APIs of Elasticsearch](https://www.elastic.co/guide/en/elasticsearch/reference/master/ingest-apis.html) for more about how to create, add, or delete pipelines. 19 | ### Version Compatibility 20 | The plugin uses semantic versioning. The first three numbers indicate the version of Elasticsearch that the plugin is compatible with, and the last number indicates the version of the plugin within that Elasticsearch version. 21 | For example, 5.3.1.1 is the second patch version of the plugin for Elasticsearch 5.3.1. 22 | 23 | ### Installation 24 | 1. Install Elasticsearch 25 | (Make sure the Elasticsearch version is compatible with the Document Enrichment Plugin or the plugin will not install.) 26 | 27 | 2. Install the Rosette API plugin (where x.x.x.x stands for the version number) by navigating to the elasticsearch-x.x.x root directory and running the following. 28 | ```sh 29 | bin/elasticsearch-plugin install file:///path/to/rosette-elasticsearch-plugin-x.x.x.x.zip 30 | ``` 31 | Use the absolute file path to refer to the plugin zip. You may be prompted to grant permissions necessary for the plugin to function. The Document Enrichment plugin is now in plugins/rosapi. 32 | 33 | 3. Input your Rosette API key. 
If you don’t already have one, [sign up for a free trial or paid Rosette API plan](https://developer.rosette.com/signup), or for those who need greater speed or security, contact our sales team (sales@basistech.com) to learn about our on-premise version of Rosette API. You can set the key in one of two ways: 34 | * as an environment variable `export ROSETTE_API_KEY=` 35 | * as an Elasticsearch setting `ingest.rosette.api_key: ` in the `config/elasticsearch.yml` file 36 | 37 | There is also an option to specify an alternative URL to use for on-premise installations of Rosette API. Once again this is either via an environment variable `export ROSETTE_API_URL=` or via a config setting `ingest.rosette.api_url: ` 38 | 39 | ### Configuration 40 | Each Rosette function is implemented as an ingest processor, which is configured as part of an ingest pipeline. Ingest pipelines are specified when indexing a document. 41 | 42 | For example, here's a simple pipeline that runs language identification: 43 | * First create the pipeline: 44 | ```sh 45 | curl -XPUT "http://localhost:9200/_ingest/pipeline/lang_id" -H 'Content-Type: application/json' -d ' 46 | { 47 | "processors": [ 48 | { 49 | "ros_language" : 50 | { "field" : "text", "target_field" : "language" } 51 | } 52 | ] 53 | }' 54 | ``` 55 | * Then index a document with that pipeline: 56 | ```sh 57 | curl -XPOST "http://localhost:9200/indexname/mappingName?pipeline=lang_id" -H 'Content-Type: application/json' -d ' 58 | { "text" : "This is a document containing English text" }' 59 | ``` 60 | See the [Elasticsearch Ingest configuration](https://www.elastic.co/guide/en/elasticsearch/reference/master/ingest.html) for more details. 61 | 62 | ## How It Works: Rosette Processors 63 | Below are details of how to call each Rosette processor through the plugin. 
Note that entity extraction, entity linking, and entity-level sentiment analysis can be completed with one call, but categorization, name translation, and sentiment analysis are each separate calls. (See more about [Rosette API pricing plans](https://www.rosette.com/pricing/), which range from free to high-call volume plans.) 64 | 65 | For full details of acceptable parameter values see the [online Rosette API documentation](https://developer.rosette.com/features-and-functions) after you have [signed up to receive an API key](https://developer.rosette.com/). 66 | 67 | ### Language Identification 68 | 69 | **Function:** 70 | Given a text field, Rosette detects the language it is most likely to be, and indexes the [identified language](https://developer.rosette.com/features-and-functions#language-support26) in the record. 71 | 72 | **Parameters:** 73 | 74 | |Name | Required | Default | Description | 75 | |--------|-----------|----------------|--------------| 76 | |field | yes | | Field containing input text| 77 | |target_field | no | ros_language | Field to hold output| 78 | 79 | **Examples:** 80 | 81 | Configuration: 82 | ```sh 83 | { 84 | "ros_language" : { 85 | "field" : "text", 86 | "target_field" : "language" 87 | } 88 | } 89 | ``` 90 | Output: 91 | ```sh 92 | { 93 | "text" : "This is English", 94 | "language" : "eng" 95 | } 96 | ``` 97 | ### Entity Extraction, Linking, and Entity-Level Sentiment 98 | 99 | **Function:** 100 | Extracts entities (identifies 18 [entity types](https://developer.rosette.com/features-and-functions#-entity-types) in [20 languages](https://developer.rosette.com/features-and-functions#language-support24)) from a body of text and stores them along with their QID (Wikidata ID number) and entity type.
101 | 102 | Optionally, Rosette can translate the entity mentions to English ([9 supported languages](https://developer.rosette.com/features-and-functions#language-support44)) and determine the sentiment (pos, neg, or neu) surrounding an entity. 103 | 104 | **Parameters:** 105 | 106 | Name | Required | Default | Description 107 | --------|-----------|------------|-------------------- 108 | field | yes | | Field containing input text 109 | target_field | no | ros_entities | Field to hold output object 110 | include_translation | no | false | Boolean indicating whether entity mentions should be translated 111 | translation_language | no | eng | Target language to translate entity mentions into 112 | include_sentiment | no | false | Boolean indicating whether to include entity-level sentiment 113 | include_offsets | no | false | Boolean indicating whether to include entity offsets 114 | 115 | **Examples:** 116 | 117 | **Configuration:** 118 | ```sh 119 | { 120 | "ros_entities" : { 121 | "field" : "text", 122 | "target_field" : "entities", 123 | "include_translation" : true, 124 | "translation_language" : "eng", 125 | "include_sentiment" : true, 126 | "include_offsets" : true 127 | } 128 | } 129 | ``` 130 | **Output:** 131 | ```sh 132 | { 133 | "text" : "Bill Murray will appear in new Ghostbusters film.", 134 | "entities" : [ 135 | { 136 | "mention" : "Bill Murray", 137 | "type" : "PERSON", 138 | "entityId" : "Q29250", 139 | "translation" : "Bill Murray", 140 | "sentiment" : "neu", 141 | "count" : 1, 142 | "offsets" : [...] 143 | }, 144 | { 145 | "mention" : "Ghostbusters", 146 | "type" : "PRODUCT", 147 | "entityId" : "Q108745", 148 | "translation" : "Ghostbusters", 149 | "sentiment" : "neu", 150 | "count" : 1, 151 | "offsets" : [...] 152 | } 153 | ] 154 | } 155 | ``` 156 | ### Sentiment Analysis 157 | 158 | **Function:** 159 | Rosette detects the overall sentiment of a body of text as negative (neg), neutral (neu) or positive (pos).
See the list of [supported languages](https://developer.rosette.com/features-and-functions#language-support). 160 | 161 | **Parameters:** 162 | 163 | Name | Required | Default | Description 164 | --------|-----------|------------|--------------------------------------------- 165 | field | yes | | Field containing input text 166 | target_field | no | ros_sentiment | Field to hold output object 167 | 168 | **Examples:** 169 | 170 | **Configuration:** 171 | ```sh 172 | { 173 | "ros_sentiment" : { 174 | "field" : "text", 175 | "target_field" : "sentiment" 176 | } 177 | } 178 | ``` 179 | **Output:** 180 | ```sh 181 | { 182 | "text" : "Original Ghostbuster Dan Aykroyd, who also co-wrote the 1984 Ghostbusters film, couldn’t be more pleased with the new all-female Ghostbusters cast, telling The Hollywood Reporter", 183 | "sentiment" : "pos" 184 | } 185 | ``` 186 | ### Categorization 187 | 188 | **Function:** 189 | Rosette classifies a text field as a member of a general category. Default categories are the tier 1 categories of the IAB Quality Assurance Guidelines (QAG) Taxonomy.
(https://www.iab.com/guidelines/iab-quality-assurance-guidelines-qag-taxonomy/) 190 | 191 | **Parameters:** 192 | 193 | Name | Required | Default | Description 194 | ---------|-----------|----------|------------------------------------------- 195 | field | yes | | Field containing input text 196 | target_field | no | ros_category | Field to hold output ([output values](https://developer.rosette.com/features-and-functions#categorization)) 197 | 198 | **Examples:** 199 | 200 | **Configuration:** 201 | ```sh 202 | { 203 | "ros_categories" : { 204 | "field" : "text", 205 | "target_field" : "category" 206 | } 207 | } 208 | ``` 209 | **Output:** 210 | ```sh 211 | { 212 | "text" : "This is an article about the arts.", 213 | "category" : "ARTS_AND_ENTERTAINMENT" 214 | } 215 | ``` 216 | ### Name Translation 217 | 218 | **Function:** 219 | Accepts a field that it assumes is a name (of a person, location, or organization) and translates the name to the target language. 220 | 221 | A name such as Ichiro Suzuki is of “language origin” Japanese, while the “script” is English, whereas 鈴木一郎 is of “language origin” Japanese and “script” Japanese. 222 | 223 | **Parameters:** 224 | 225 | Name | Required | Default | Description 226 | --------|-----------|-------------------|-------------------------------- 227 | field | yes | | Field containing input text 228 | target_field | no | ros_translation | Field to hold output object 229 | target_language | no | eng | Language to translate to ([language codes](https://developer.rosette.com/features-and-functions#language-support44)) 230 | target_script | no |Zyyy (Unknown) |Script to translate to ([script codes](https://developer.rosette.com/features-and-functions#language-support44)) 231 | entity_type | no | PERSON | Entity type of the name being translated: PERSON (default), LOCATION, or ORGANIZATION 232 | source_language | no | xxx (Unknown) | Language of use of the name being translated—that is, which language is the name written in. 
([language codes](https://developer.rosette.com/features-and-functions#language-support44)) 233 | source_script | no | Zyyy (Unknown) | Script of the name being translated ([script codes](https://developer.rosette.com/features-and-functions#language-support44)) 234 | source_language_of_origin | no | xxx (Unknown) | Language of origin of the name being translated ([language codes](https://developer.rosette.com/features-and-functions#language-support44)) 235 | 236 | **Examples:** 237 | 238 | **Configuration:** 239 | ```sh 240 | { 241 | "ros_name_translation" : { 242 | "field" : "name", 243 | "target_field" : "translation", 244 | "target_language" : "eng", 245 | "entity_type" : "PERSON" 246 | } 247 | } 248 | ``` 249 | **Output:** 250 | ```sh 251 | { 252 | "name" : "マット・デイモン", 253 | "translation" : "Matt Damon" 254 | } 255 | ``` 256 | ### Sample Ingest Pipeline 257 | ```sh 258 | { 259 | "description" : "Illustrative ingest pipeline that runs Rosette Api on documents indexed in the text field. Errors in sentiment (usually due to limited language support) send a document to a separate index. Errors in categorization (also usually language support) are ignored completely.
Overall errors won't stop the ingest process but an error field will be populated with the cause of the error.", 260 | "processors" : [ 261 | { 262 | "ros_language" : { 263 | "field" : "text", 264 | "target_field" : "language" 265 | } 266 | }, 267 | { 268 | "ros_entities" : { 269 | "field" : "text", 270 | "target_field" : "entities", 271 | "include_translation" : true, 272 | "include_sentiment" : true 273 | } 274 | }, 275 | { 276 | "ros_sentiment" : { 277 | "field" : "text", 278 | "target_field" : "sentiment", 279 | "on_failure" : [ 280 | { 281 | "set" : { 282 | "field" : "_index", 283 | "value" : "no_sent_index" 284 | } 285 | } 286 | ] 287 | } 288 | }, 289 | { 290 | "ros_categories" : { 291 | "field" : "text", 292 | "target_field" : "category", 293 | "ignore_failure" : true 294 | } 295 | } 296 | ], 297 | "on_failure" : [ 298 | { 299 | "set" : { 300 | "field" : "error", 301 | "value" : "{{ _ingest.on_failure_message }}" 302 | } 303 | } 304 | ] 305 | } 306 | ``` 307 | 308 | 309 | 310 | 311 | 312 | 313 | -------------------------------------------------------------------------------- /plugin/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 17 | 18 | 4.0.0 19 | com.rosette.elasticsearch 20 | rosette-elasticsearch-plugin 21 | 22 | rosette-elasticsearch-parent 23 | com.rosette.elasticsearch 24 | 7.17.0.1-SNAPSHOT 25 | ..
26 | 27 | 2017 28 | http://rosette-api.github.io/${project.artifactId} 29 | 30 | scm:git:git@github.com:rosette-api/rosette-elasticsearch-plugin.git 31 | scm:git:git@github.com:rosette-api/rosette-elasticsearch-plugin.git 32 | HEAD 33 | 34 | Elasticsearch analysis plugin powered by Rosette API 35 | 36 | 37 | site 38 | scm:git:git@github.com:rosette-api/${project.artifactId}.git 39 | 40 | 41 | 42 | 5.3.0 43 | 1.15 44 | 3.12.0 45 | 3.0.2 46 | 2.10.13 47 | 5.0.4 48 | 2.2.14 49 | 5.11.2 50 | /rest/worker/v1/ 51 | 2.2 52 | 1.7.32 53 | 54 | 55 | 56 | com.basistech.rosette 57 | rosette-api 58 | 59 | 60 | commons-codec 61 | commons-codec 62 | 63 | 64 | 65 | 66 | commons-codec 67 | commons-codec 68 | ${commons-codec.version} 69 | 70 | 71 | org.slf4j 72 | slf4j-api 73 | 74 | 75 | org.apache.logging.log4j 76 | log4j-slf4j-impl 77 | ${log4j.version} 78 | 79 | 80 | org.elasticsearch 81 | elasticsearch 82 | provided 83 | 84 | 85 | net.sf.jopt-simple 86 | jopt-simple 87 | 88 | 89 | joda-time 90 | joda-time 91 | 92 | 93 | org.hamcrest 94 | hamcrest 95 | 96 | 97 | 98 | 99 | org.apache.logging.log4j 100 | log4j-api 101 | provided 102 | 103 | 104 | org.elasticsearch.test 105 | framework 106 | test 107 | 108 | 109 | org.apache.commons 110 | commons-lang3 111 | 112 | 113 | 114 | commons-logging 115 | commons-logging 116 | 117 | 118 | commons-codec 119 | commons-codec 120 | 121 | 122 | org.hamcrest 123 | hamcrest 124 | 125 | 126 | 127 | 128 | org.apache.logging.log4j 129 | log4j-core 130 | test 131 | 132 | 133 | org.mock-server 134 | mockserver-client-java 135 | ${mockserver.version} 136 | test 137 | 138 | 139 | 140 | io.swagger 141 | swagger-core 142 | 143 | 144 | commons-codec 145 | commons-codec 146 | 147 | 148 | jakarta.validation 149 | jakarta.validation-api 150 | 151 | 152 | jakarta.xml.bind 153 | jakarta.xml.bind-api 154 | 155 | 156 | javax.validation 157 | validation-api 158 | 159 | 160 | javax.xml.bind 161 | jaxb-api 162 | 163 | 164 | com.github.java-json-tools 165 | 
json-schema-validator 166 | 167 | 168 | org.apache.commons 169 | commons-lang3 170 | 171 | 172 | org.slf4j 173 | slf4j-ext 174 | 175 | 176 | 177 | 178 | org.mock-server 179 | mockserver-netty 180 | ${mockserver.version} 181 | test 182 | 183 | 184 | ch.qos.logback 185 | logback-classic 186 | 187 | 188 | com.github.java-json-tools 189 | json-schema-validator 190 | 191 | 192 | com.google.code.findbugs 193 | jsr305 194 | 195 | 196 | joda-time 197 | joda-time 198 | 199 | 200 | net.sf.jopt-simple 201 | jopt-simple 202 | 203 | 204 | org.apache.commons 205 | commons-lang3 206 | 207 | 208 | org.hamcrest 209 | hamcrest 210 | 211 | 212 | org.slf4j 213 | slf4j-ext 214 | 215 | 216 | 217 | 218 | com.github.java-json-tools 219 | json-schema-validator 220 | ${json-schema-validator.version} 221 | test 222 | 223 | 224 | com.google.code.findbugs 225 | jsr305 226 | 227 | 228 | joda-time 229 | joda-time 230 | 231 | 232 | 233 | 234 | com.google.code.findbugs 235 | jsr305 236 | ${findbugs.version} 237 | test 238 | 239 | 240 | net.sf.jopt-simple 241 | jopt-simple 242 | ${jopt-simple.version} 243 | test 244 | 245 | 246 | org.apache.commons 247 | commons-lang3 248 | ${commons-lang3.version} 249 | test 250 | 251 | 252 | org.hamcrest 253 | hamcrest 254 | ${hamcrest.version} 255 | test 256 | 257 | 258 | org.slf4j 259 | slf4j-ext 260 | ${slf4j-ext.version} 261 | test 262 | 263 | 264 | joda-time 265 | joda-time 266 | ${joda-time.version} 267 | test 268 | 269 | 270 | 271 | install 272 | 273 | 274 | src/main/resources 275 | true 276 | 277 | plugin-descriptor.properties 278 | 279 | 280 | 281 | src/main/resources 282 | false 283 | 284 | plugin-security.policy 285 | 286 | 287 | 288 | 289 | 290 | src/test/resources 291 | true 292 | 293 | 294 | 295 | 296 | org.codehaus.mojo 297 | build-helper-maven-plugin 298 | ${build-helper-maven-plugin.version} 299 | 300 | 301 | reserve-mockserver-port 302 | generate-sources 303 | 304 | reserve-network-port 305 | 306 | 307 | 308 | mockserver.port 309 | 310 | 311 | 
312 | 313 | 314 | 315 | org.apache.maven.plugins 316 | maven-dependency-plugin 317 | 318 | 319 | set-dep-properties 320 | generate-resources 321 | 322 | properties 323 | 324 | 325 | 326 | 327 | 328 | 329 | org.apache.maven.plugins 330 | maven-shade-plugin 331 | 332 | 333 | package 334 | 335 | shade 336 | 337 | 338 | true 339 | true 340 | 341 | 342 | 343 | 344 | META-INF/maven/dependencies.properties 345 | 346 | 347 | 348 | 349 | 350 | com.basistech:adm-json 351 | com.basistech.rosette:rosette-api 352 | com.basistech.rosette:rosette-api-json 353 | com.basistech:common-api-jackson 354 | com.fasterxml.jackson.core:* 355 | 356 | 357 | 358 | 359 | com.fasterxml 360 | com.basistech.shaded.com.fasterxml 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | org.apache.maven.plugins 369 | maven-assembly-plugin 370 | 371 | ${project.artifactId}-${project.version} 372 | false 373 | ${project.build.directory}/releases/ 374 | 375 | ${basedir}/src/main/assemblies/plugin.xml 376 | 377 | 378 | 379 | 380 | package 381 | 382 | single 383 | 384 | 385 | 386 | 387 | 388 | org.mock-server 389 | mockserver-maven-plugin 390 | ${mockserver.version} 391 | 392 | ${mockserver.port} 393 | INFO 394 | com.rosette.elasticsearch.MockRosetteInitialization 395 | 396 | 397 | 398 | process-test-classes 399 | process-test-classes 400 | 401 | start 402 | 403 | 404 | 405 | prepare-package 406 | prepare-package 407 | 408 | stop 409 | 410 | 411 | 412 | 413 | 414 | org.apache.maven.plugins 415 | maven-surefire-plugin 416 | 417 | 418 | false 419 | 420 | 421 | http://localhost:${mockserver.port}${mockserver.baseurl} 422 | 423 | 424 | 425 | 426 | org.apache.maven.plugins 427 | maven-failsafe-plugin 428 | 429 | 430 | 431 | integration-test 432 | verify 433 | 434 | 435 | 436 | 437 | com.basistech:adm-json 438 | com.basistech.rosette:rosette-api 439 | com.basistech.rosette:rosette-api-json 440 | com.basistech:common-api-jackson 441 | 442 | 443 | false 444 | 445 | 446 | 447 | 448 | 449 | 450 | 451 | 452 | 453 | 
biz.aQute.bnd 454 | bnd-maven-plugin 455 | ${bnd-maven-plugin.version} 456 | 457 | 458 | 459 | bnd-process 460 | 461 | 462 | 463 | 464 | 465 | 466 | 467 | 468 | 469 | 470 | org.apache.maven.plugins 471 | maven-javadoc-plugin 472 | ${maven-javadoc-plugin.version} 473 | 474 | ${jdk.version} 475 | true 476 | 477 | 478 | 479 | non-aggregate 480 | 481 | javadoc 482 | 483 | 484 | 485 | aggregate 486 | 487 | aggregate 488 | 489 | 490 | 491 | 492 | 493 | 494 | 495 | -------------------------------------------------------------------------------- /plugin/src/main/assemblies/plugin.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | plugin 4 | 5 | zip 6 | 7 | false 8 | 9 | 10 | ${project.basedir}/src/main/resources/plugin-descriptor.properties 11 | true 12 | 13 | 14 | ${project.basedir}/src/main/resources/plugin-security.policy 15 | false 16 | 17 | 18 | 19 | 20 | true 21 | false 22 | 23 | 24 | com.fasterxml.jackson.core:jackson-core 25 | com.basistech:adm-json 26 | com.basistech.rosette:rosette-api 27 | com.basistech.rosette:rosette-api-json 28 | com.basistech:common-api-jackson 29 | com.fasterxml.jackson.core:* 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /plugin/src/main/java/com/rosette/elasticsearch/CategoriesProcessor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Basis Technology Corp. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.rosette.elasticsearch; 17 | 18 | import com.basistech.rosette.api.HttpRosetteAPIException; 19 | import com.basistech.rosette.apimodel.CategoriesOptions; 20 | import com.basistech.rosette.apimodel.CategoriesResponse; 21 | import com.basistech.rosette.apimodel.DocumentRequest; 22 | import org.apache.logging.log4j.Logger; 23 | import org.elasticsearch.ElasticsearchException; 24 | import org.elasticsearch.common.Strings; 25 | import org.elasticsearch.common.logging.Loggers; 26 | import org.elasticsearch.ingest.ConfigurationUtils; 27 | import org.elasticsearch.ingest.IngestDocument; 28 | import org.elasticsearch.ingest.Processor; 29 | import java.security.AccessController; 30 | import java.security.PrivilegedAction; 31 | import java.util.Map; 32 | 33 | import static com.basistech.rosette.api.common.AbstractRosetteAPI.CATEGORIES_SERVICE_PATH; 34 | 35 | public class CategoriesProcessor extends RosetteAbstractProcessor { 36 | 37 | public static final String TYPE = "ros_categories"; 38 | private static final Logger LOGGER = Loggers 39 | .getLogger(CategoriesProcessor.class, CategoriesProcessor.class.getName()); 40 | 41 | CategoriesProcessor(RosetteApiWrapper rosAPI, String tag, String description, String inputField, 42 | String targetField) { 43 | super(rosAPI, tag, description, TYPE, inputField, targetField); 44 | } 45 | 46 | @Override 47 | public void processDocument(String inputText, IngestDocument ingestDocument) throws Exception { 48 | // call /categories endpoint and set the top result in the field 49 | DocumentRequest request = DocumentRequest.builder() 50 | .content(inputText).build(); 51 | CategoriesResponse response; 52 | try { 53 | // RosApi client binding's Jackson needs elevated privilege 54 | response = AccessController.doPrivileged((PrivilegedAction) () -> 55 | rosAPI.getHttpRosetteAPI().perform(CATEGORIES_SERVICE_PATH, 
request, CategoriesResponse.class) 56 | ); 57 | } catch (HttpRosetteAPIException ex) { 58 | LOGGER.error(ex.getErrorResponse().getMessage()); 59 | throw new ElasticsearchException(ex.getErrorResponse().getMessage(), ex); 60 | } 61 | 62 | if (response.getCategories() != null 63 | && !response.getCategories().isEmpty() 64 | && response.getCategories().get(0) != null 65 | && !Strings.isNullOrEmpty(response.getCategories().get(0).getLabel())) { 66 | ingestDocument.setFieldValue(targetField, response.getCategories().get(0).getLabel()); 67 | } else { 68 | throw new ElasticsearchException(TYPE + " ingest processor failed to categorize document."); 69 | } 70 | } 71 | 72 | public static final class Factory implements Processor.Factory { 73 | private RosetteApiWrapper rosAPI; 74 | 75 | Factory(RosetteApiWrapper rosAPI) { 76 | this.rosAPI = rosAPI; 77 | } 78 | 79 | @Override 80 | public Processor create(Map registry, String processorTag, 81 | String processorDescription, Map config) throws Exception { 82 | String inputField = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "field"); 83 | String targetField = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, 84 | Parameters.TARGET_FIELD.name, Parameters.TARGET_FIELD.defaultValue); 85 | return new CategoriesProcessor(rosAPI, processorTag, processorDescription, inputField, targetField); 86 | } 87 | } 88 | 89 | enum Parameters { 90 | TARGET_FIELD("target_field", "ros_category"); 91 | 92 | String name; 93 | String defaultValue; 94 | 95 | Parameters(String name, String defaultValue) { 96 | this.name = name; 97 | this.defaultValue = defaultValue; 98 | } 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /plugin/src/main/java/com/rosette/elasticsearch/EntitiesProcessor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Basis Technology Corp. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.rosette.elasticsearch; 17 | 18 | import com.basistech.rosette.api.HttpRosetteAPIException; 19 | import com.basistech.rosette.apimodel.DocumentRequest; 20 | import com.basistech.rosette.apimodel.EntitiesOptions; 21 | import com.basistech.rosette.apimodel.NameTranslationRequest; 22 | import com.basistech.rosette.apimodel.NameTranslationResponse; 23 | import com.basistech.rosette.apimodel.SentimentOptions; 24 | import com.basistech.rosette.dm.AnnotatedText; 25 | import com.basistech.rosette.dm.Entity; 26 | import com.basistech.rosette.dm.Mention; 27 | import com.basistech.util.LanguageCode; 28 | import org.apache.logging.log4j.Logger; 29 | import org.elasticsearch.ElasticsearchException; 30 | import org.elasticsearch.common.logging.Loggers; 31 | import org.elasticsearch.ingest.ConfigurationUtils; 32 | import org.elasticsearch.ingest.IngestDocument; 33 | import org.elasticsearch.ingest.Processor; 34 | 35 | import java.security.AccessController; 36 | import java.security.PrivilegedAction; 37 | import java.util.ArrayList; 38 | import java.util.HashMap; 39 | import java.util.List; 40 | import java.util.Map; 41 | import java.util.stream.Collectors; 42 | 43 | import static com.basistech.rosette.api.common.AbstractRosetteAPI.ENTITIES_SERVICE_PATH; 44 | import static com.basistech.rosette.api.common.AbstractRosetteAPI.NAME_TRANSLATION_SERVICE_PATH; 45 | import static 
com.basistech.rosette.api.common.AbstractRosetteAPI.SENTIMENT_SERVICE_PATH; 46 | 47 | public class EntitiesProcessor extends RosetteAbstractProcessor { 48 | 49 | public static final String TYPE = "ros_entities"; 50 | 51 | private static final Logger LOGGER = Loggers.getLogger(EntitiesProcessor.class, EntitiesProcessor.class.getName()); 52 | 53 | private boolean includeOffsets; 54 | private boolean doTranslate; 55 | private LanguageCode translateLanguage; 56 | private boolean doSentiment; 57 | 58 | EntitiesProcessor(RosetteApiWrapper rosAPI, String tag, String description, String inputField, String targetField, 59 | boolean includeOffsets, boolean doTranslate, LanguageCode translateLanguage, 60 | boolean doSentiment) { 61 | super(rosAPI, tag, description, TYPE, inputField, targetField); 62 | this.includeOffsets = includeOffsets; 63 | this.doTranslate = doTranslate; 64 | this.translateLanguage = translateLanguage; 65 | this.doSentiment = doSentiment; 66 | } 67 | 68 | @Override 69 | public void processDocument(String inputText, IngestDocument ingestDocument) throws Exception { 70 | //Need to use the ADM for entities so we get offsets 71 | AnnotatedText adm; 72 | 73 | //If entity level sentiment is desired, use the entity information from the ASCENT call 74 | try { 75 | //SENTIMENT 76 | if (doSentiment) { 77 | DocumentRequest sentrequest = DocumentRequest.builder() 78 | .content(inputText).build(); 79 | adm = AccessController.doPrivileged((PrivilegedAction) () -> 80 | rosAPI.getHttpRosetteAPI().perform(SENTIMENT_SERVICE_PATH, sentrequest) 81 | ); 82 | } else { 83 | //REX 84 | DocumentRequest entityrequest = DocumentRequest.builder() 85 | .content(inputText).build(); 86 | adm = AccessController.doPrivileged((PrivilegedAction) () -> 87 | rosAPI.getHttpRosetteAPI().perform(ENTITIES_SERVICE_PATH, entityrequest) 88 | ); 89 | } 90 | } catch (HttpRosetteAPIException ex) { 91 | LOGGER.error(ex.getErrorResponse().getMessage()); 92 | throw new 
ElasticsearchException(ex.getErrorResponse().getMessage(), ex); 93 | } 94 | 95 | List> entities = adm.getEntities().stream().map(this::processEntity) 96 | .collect(Collectors.toList()); 97 | 98 | ingestDocument.setFieldValue(targetField, entities); 99 | } 100 | 101 | public static final class Factory implements Processor.Factory { 102 | private RosetteApiWrapper rosAPI; 103 | 104 | Factory(RosetteApiWrapper rosAPI) { 105 | this.rosAPI = rosAPI; 106 | } 107 | 108 | @Override 109 | public Processor create(Map registry, String processorTag, 110 | String processorDescription, Map config) throws Exception { 111 | 112 | String inputField = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "field"); 113 | String targetField = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, 114 | Parameters.TARGET_FIELD.name, Parameters.TARGET_FIELD.defaultValue); 115 | boolean includeOffsets = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, 116 | Parameters.OFFSETS.name, Boolean.parseBoolean(Parameters.OFFSETS.defaultValue)); 117 | boolean doTranslate = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, 118 | Parameters.TRANSLATE.name, Boolean.parseBoolean(Parameters.TRANSLATE.defaultValue)); 119 | LanguageCode translateLanguage = LanguageCode.lookupByISO639(ConfigurationUtils 120 | .readStringProperty(TYPE, processorTag, config, 121 | Parameters.TRANSLATE_LANGUAGE.name, Parameters.TRANSLATE_LANGUAGE.defaultValue)); 122 | boolean doSentiment = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, 123 | Parameters.SENTIMENT.name, Boolean.parseBoolean(Parameters.SENTIMENT.defaultValue)); 124 | 125 | return new EntitiesProcessor(rosAPI, processorTag, processorDescription, inputField, targetField, 126 | includeOffsets, doTranslate, translateLanguage, doSentiment); 127 | } 128 | } 129 | 130 | //Transforms the Entity object into a Map that ES can serialize. 
131 | // Calls RNT on each head mention for a translation (if requested) 132 | private HashMap processEntity(Entity entity) { 133 | List mentions = entity.getMentions(); 134 | String headMention = mentions.get(entity.getHeadMentionIndex()).getNormalized(); 135 | String type = entity.getType(); 136 | 137 | HashMap toReturn = new HashMap<>(); 138 | 139 | toReturn.put("mention", headMention); 140 | toReturn.put("entityId", entity.getEntityId()); 141 | toReturn.put("type", type); 142 | toReturn.put("count", mentions.size()); 143 | 144 | if (includeOffsets) { 145 | List> offsets = new ArrayList<>(); 146 | for (Mention mention : mentions) { 147 | Map offsetMap = new HashMap<>(); 148 | offsetMap.put("start", mention.getStartOffset()); 149 | offsetMap.put("end", mention.getEndOffset()); 150 | offsets.add(offsetMap); 151 | } 152 | toReturn.put("offsets", offsets); 153 | } 154 | 155 | //RNT 156 | if (doTranslate 157 | && ("PERSON".equalsIgnoreCase(type) 158 | || "LOCATION".equalsIgnoreCase(type) 159 | || "ORGANIZATION".equalsIgnoreCase(type))) { 160 | NameTranslationRequest rntrequest = NameTranslationRequest.builder() 161 | .name(headMention) 162 | .targetLanguage(translateLanguage) 163 | .entityType(type) 164 | .build(); 165 | 166 | NameTranslationResponse rntresponse; 167 | try { 168 | rntresponse = AccessController.doPrivileged((PrivilegedAction) () -> 169 | rosAPI.getHttpRosetteAPI().perform(NAME_TRANSLATION_SERVICE_PATH, rntrequest, 170 | NameTranslationResponse.class) 171 | ); 172 | } catch (HttpRosetteAPIException ex) { 173 | LOGGER.error(ex.getErrorResponse().getMessage()); 174 | throw new ElasticsearchException(ex.getErrorResponse().getMessage(), ex); 175 | } 176 | toReturn.put("translation", rntresponse.getTranslation()); 177 | } 178 | 179 | if (entity.getSentiment() != null) { 180 | toReturn.put("sentiment", entity.getSentiment().get(0).getLabel()); 181 | } 182 | 183 | return toReturn; 184 | } 185 | 186 | enum Parameters { 187 | TARGET_FIELD("target_field", 
"ros_entities"), 188 | OFFSETS("include_offsets", "false"), 189 | TRANSLATE("include_translation", "false"), 190 | TRANSLATE_LANGUAGE("translation_language", "eng"), 191 | SENTIMENT("include_sentiment", "false"); 192 | 193 | String name; 194 | String defaultValue; 195 | 196 | Parameters(String name, String defaultValue) { 197 | this.name = name; 198 | this.defaultValue = defaultValue; 199 | } 200 | } 201 | } 202 | -------------------------------------------------------------------------------- /plugin/src/main/java/com/rosette/elasticsearch/LanguageProcessor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Basis Technology Corp. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package com.rosette.elasticsearch; 17 | 18 | import com.basistech.rosette.api.HttpRosetteAPIException; 19 | import com.basistech.rosette.apimodel.DocumentRequest; 20 | import com.basistech.rosette.apimodel.LanguageOptions; 21 | import com.basistech.rosette.apimodel.LanguageResponse; 22 | import org.apache.logging.log4j.Logger; 23 | import org.elasticsearch.ElasticsearchException; 24 | import org.elasticsearch.common.logging.Loggers; 25 | import org.elasticsearch.ingest.ConfigurationUtils; 26 | import org.elasticsearch.ingest.IngestDocument; 27 | import org.elasticsearch.ingest.Processor; 28 | 29 | import java.security.AccessController; 30 | import java.security.PrivilegedAction; 31 | import java.util.Map; 32 | 33 | import static com.basistech.rosette.api.common.AbstractRosetteAPI.LANGUAGE_SERVICE_PATH; 34 | 35 | public class LanguageProcessor extends RosetteAbstractProcessor { 36 | 37 | public static final String TYPE = "ros_language"; 38 | 39 | private static final Logger LOGGER = Loggers.getLogger(LanguageProcessor.class, LanguageProcessor.class.getName()); 40 | 41 | LanguageProcessor(RosetteApiWrapper rosAPI, String tag, String description, String inputField, String targetField) { 42 | super(rosAPI, tag, description, TYPE, inputField, targetField); 43 | } 44 | 45 | @Override 46 | public void processDocument(String inputText, IngestDocument ingestDocument) throws Exception { 47 | // call /language endpoint and set the result in the field 48 | DocumentRequest request = DocumentRequest.builder() 49 | .content(inputText).build(); 50 | LanguageResponse response; 51 | try { 52 | // RosApi client binding's Jackson needs elevated privilege 53 | response = AccessController.doPrivileged((PrivilegedAction) () -> 54 | rosAPI.getHttpRosetteAPI().perform(LANGUAGE_SERVICE_PATH, request, 55 | LanguageResponse.class) 56 | ); 57 | } catch (HttpRosetteAPIException ex) { 58 | LOGGER.error(ex.getErrorResponse().getMessage()); 59 | throw new 
ElasticsearchException(ex.getErrorResponse().getMessage(), ex); 60 | } 61 | 62 | if (response.getLanguageDetections() != null 63 | && !response.getLanguageDetections().isEmpty() 64 | && response.getLanguageDetections().get(0) != null 65 | && response.getLanguageDetections().get(0).getLanguage() != null) { 66 | ingestDocument.setFieldValue(targetField, response.getLanguageDetections().get(0).getLanguage().ISO639_3()); 67 | } else { 68 | throw new ElasticsearchException(TYPE + " ingest processor failed to guess language of document."); 69 | } 70 | } 71 | 72 | public static final class Factory implements Processor.Factory { 73 | private RosetteApiWrapper rosAPI; 74 | 75 | Factory(RosetteApiWrapper rosAPI) { 76 | this.rosAPI = rosAPI; 77 | } 78 | 79 | @Override 80 | public Processor create(Map registry, String processorTag, 81 | String processorDescription, Map config) throws Exception { 82 | String inputField = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "field"); 83 | String targetField = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, 84 | Parameters.TARGET_FIELD.name, Parameters.TARGET_FIELD.defaultValue); 85 | return new LanguageProcessor(rosAPI, processorTag, processorDescription, inputField, targetField); 86 | } 87 | } 88 | 89 | enum Parameters { 90 | TARGET_FIELD("target_field", "ros_language"); 91 | 92 | String name; 93 | String defaultValue; 94 | 95 | Parameters(String name, String defaultValue) { 96 | this.name = name; 97 | this.defaultValue = defaultValue; 98 | } 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /plugin/src/main/java/com/rosette/elasticsearch/NameTranslationProcessor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Basis Technology Corp. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.rosette.elasticsearch; 17 | 18 | import com.basistech.rosette.api.HttpRosetteAPIException; 19 | import com.basistech.rosette.api.common.AbstractRosetteAPI; 20 | import com.basistech.rosette.apimodel.NameTranslationRequest; 21 | import com.basistech.rosette.apimodel.NameTranslationResponse; 22 | import com.basistech.util.ISO15924; 23 | import com.basistech.util.LanguageCode; 24 | import org.apache.logging.log4j.Logger; 25 | import org.elasticsearch.ElasticsearchException; 26 | import org.elasticsearch.common.logging.Loggers; 27 | import org.elasticsearch.ingest.ConfigurationUtils; 28 | import org.elasticsearch.ingest.IngestDocument; 29 | import org.elasticsearch.ingest.Processor; 30 | 31 | import java.security.AccessController; 32 | import java.security.PrivilegedAction; 33 | import java.util.Map; 34 | 35 | public class NameTranslationProcessor extends RosetteAbstractProcessor { 36 | 37 | public static final String TYPE = "ros_name_translation"; 38 | 39 | private static final Logger LOGGER = Loggers 40 | .getLogger(NameTranslationProcessor.class, NameTranslationProcessor.class.getName()); 41 | 42 | private LanguageCode targetLanguage; 43 | private ISO15924 targetScript; 44 | private String entityType; 45 | private LanguageCode sourceLanguage; 46 | private ISO15924 sourceScript; 47 | private LanguageCode sourceOrigin; 48 | 49 | NameTranslationProcessor(RosetteApiWrapper rosAPI, String tag, String description, String inputField, 50 | String targetField, LanguageCode targetLanguage, ISO15924 targetScript, 
String entityType, 51 | LanguageCode sourceLanguage, ISO15924 sourceScript, LanguageCode sourceOrigin) { 52 | super(rosAPI, tag, description, TYPE, inputField, targetField); 53 | this.targetLanguage = targetLanguage; 54 | this.targetScript = targetScript; 55 | this.entityType = entityType; 56 | this.sourceLanguage = sourceLanguage; 57 | this.sourceScript = sourceScript; 58 | this.sourceOrigin = sourceOrigin; 59 | } 60 | 61 | @Override 62 | public void processDocument(String inputText, IngestDocument ingestDocument) throws Exception { 63 | // call /name-translation endpoint and set the result in the field 64 | NameTranslationRequest request = NameTranslationRequest.builder() 65 | .name(inputText) 66 | .targetLanguage(targetLanguage) 67 | .entityType(entityType) 68 | .targetScript(targetScript) 69 | .sourceLanguageOfUse(sourceLanguage) 70 | .sourceLanguageOfOrigin(sourceOrigin) 71 | .sourceScript(sourceScript).build(); 72 | 73 | NameTranslationResponse response; 74 | try { 75 | // RosApi client binding's Jackson needs elevated privilege 76 | response = AccessController.doPrivileged((PrivilegedAction) () -> 77 | rosAPI.getHttpRosetteAPI().perform(AbstractRosetteAPI.NAME_TRANSLATION_SERVICE_PATH, request, 78 | NameTranslationResponse.class) 79 | ); 80 | } catch (HttpRosetteAPIException ex) { 81 | LOGGER.error(ex.getErrorResponse().getMessage()); 82 | throw new ElasticsearchException(ex.getErrorResponse().getMessage(), ex); 83 | } 84 | 85 | ingestDocument.setFieldValue(targetField, response.getTranslation()); 86 | } 87 | 88 | public static final class Factory implements Processor.Factory { 89 | private RosetteApiWrapper rosAPI; 90 | 91 | Factory(RosetteApiWrapper rosAPI) { 92 | this.rosAPI = rosAPI; 93 | } 94 | 95 | @Override 96 | public Processor create(Map registry, String processorTag, 97 | String processorDescription, Map config) throws Exception { 98 | 99 | String inputField = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "field"); 100 | String 
targetField = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, 101 | Parameters.TARGET_FIELD.name, Parameters.TARGET_FIELD.defaultValue); 102 | LanguageCode targetLanguage = LanguageCode.lookupByISO639(ConfigurationUtils 103 | .readStringProperty(TYPE, processorTag, config, 104 | Parameters.TARGET_LANGUAGE.name, Parameters.TARGET_LANGUAGE.defaultValue)); 105 | ISO15924 targetScript = ISO15924.lookupByCode4(ConfigurationUtils 106 | .readStringProperty(TYPE, processorTag, config, 107 | Parameters.TARGET_SCRIPT.name, Parameters.TARGET_SCRIPT.defaultValue)); 108 | String entityType = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, 109 | Parameters.ENTITY_TYPE.name, Parameters.ENTITY_TYPE.defaultValue); 110 | 111 | LanguageCode sourceLanguage = LanguageCode.lookupByISO639(ConfigurationUtils 112 | .readStringProperty(TYPE, processorTag, config, 113 | Parameters.SOURCE_LANGUAGE.name, Parameters.SOURCE_LANGUAGE.defaultValue)); 114 | ISO15924 sourceScript = ISO15924.lookupByCode4(ConfigurationUtils 115 | .readStringProperty(TYPE, processorTag, config, 116 | Parameters.SOURCE_SCRIPT.name, Parameters.SOURCE_SCRIPT.defaultValue)); 117 | LanguageCode sourceOrigin = LanguageCode.lookupByISO639(ConfigurationUtils 118 | .readStringProperty(TYPE, processorTag, config, 119 | Parameters.SOURCE_LANGUAGE_ORIGIN.name, Parameters.SOURCE_LANGUAGE_ORIGIN.defaultValue)); 120 | 121 | return new NameTranslationProcessor(rosAPI, processorTag, processorDescription, inputField, targetField, 122 | targetLanguage, targetScript, entityType, sourceLanguage, sourceScript, sourceOrigin); 123 | } 124 | } 125 | 126 | enum Parameters { 127 | TARGET_FIELD("target_field", "ros_translation"), 128 | TARGET_LANGUAGE("target_language", "eng"), 129 | TARGET_SCRIPT("target_script", "Zyyy"), 130 | ENTITY_TYPE("entity_type", "PERSON"), 131 | SOURCE_LANGUAGE("source_language", "xxx"), 132 | SOURCE_SCRIPT("source_script", "Zyyy"), 133 | 
SOURCE_LANGUAGE_ORIGIN("source_language_of_origin", "xxx"); 134 | 135 | String name; 136 | String defaultValue; 137 | 138 | Parameters(String name, String defaultValue) { 139 | this.name = name; 140 | this.defaultValue = defaultValue; 141 | } 142 | } 143 | } 144 | -------------------------------------------------------------------------------- /plugin/src/main/java/com/rosette/elasticsearch/RosetteAbstractProcessor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 Basis Technology Corp. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package com.rosette.elasticsearch; 17 | 18 | import org.elasticsearch.ElasticsearchException; 19 | import org.elasticsearch.SpecialPermission; 20 | import org.elasticsearch.common.Strings; 21 | import org.elasticsearch.ingest.AbstractProcessor; 22 | import org.elasticsearch.ingest.IngestDocument; 23 | 24 | /** 25 | * Class that holds code shared by all Rosette ingest processors 26 | */ 27 | public abstract class RosetteAbstractProcessor extends AbstractProcessor { 28 | 29 | protected String inputField; 30 | protected String targetField; 31 | protected RosetteApiWrapper rosAPI; 32 | protected final String processorType; 33 | 34 | RosetteAbstractProcessor(RosetteApiWrapper rosAPI, String tag, String description, String processorType, 35 | String inputField, String targetField) { 36 | super(tag, description); 37 | this.inputField = inputField; 38 | this.targetField = targetField; 39 | this.rosAPI = rosAPI; 40 | this.processorType = processorType; 41 | } 42 | 43 | @Override 44 | public IngestDocument execute(IngestDocument ingestDocument) throws Exception { 45 | if (ingestDocument.hasField(targetField)) { 46 | throw new ElasticsearchException("Document already contains data in target field for this ingest " 47 | + "processor: " + processorType); 48 | } 49 | if (!ingestDocument.hasField(inputField)) { 50 | //Do nothing 51 | return ingestDocument; 52 | } 53 | 54 | String inputText = ingestDocument.getFieldValue(inputField, String.class); 55 | 56 | if (Strings.isNullOrEmpty(inputText)) { 57 | //Do nothing 58 | return ingestDocument; 59 | } 60 | 61 | SecurityManager sm = System.getSecurityManager(); 62 | if (sm != null) { 63 | sm.checkPermission(new SpecialPermission()); 64 | } 65 | 66 | processDocument(inputText, ingestDocument); 67 | return ingestDocument; 68 | } 69 | 70 | @Override 71 | public String getType() { 72 | return this.processorType; 73 | } 74 | 75 | /** 76 | * Performs processor specific modifications to the document. 
77 | * @param inputText value of the field in the document that was specified as the input field to process 78 | * @param ingestDocument document to be ingested 79 | * @throws Exception when something goes wrong 80 | */ 81 | protected abstract void processDocument(String inputText, IngestDocument ingestDocument) throws Exception; 82 | } 83 | -------------------------------------------------------------------------------- /plugin/src/main/java/com/rosette/elasticsearch/RosetteApiWrapper.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Basis Technology Corp. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package com.rosette.elasticsearch; 17 | 18 | import java.io.IOException; 19 | import java.io.InputStream; 20 | import java.util.Properties; 21 | 22 | import org.apache.logging.log4j.Logger; 23 | import org.elasticsearch.ElasticsearchException; 24 | import org.elasticsearch.common.Strings; 25 | 26 | import com.basistech.rosette.api.HttpRosetteAPI; 27 | import org.elasticsearch.common.logging.Loggers; 28 | 29 | //Configures and holds on to the shared Rosette API client 30 | public final class RosetteApiWrapper { 31 | 32 | private static final Logger LOGGER = Loggers.getLogger(RosetteApiWrapper.class, RosetteApiWrapper.class.getName()); 33 | 34 | private static final String APP_HEADER; 35 | static { 36 | Properties props = new Properties(); 37 | String appHeader = ""; 38 | try (InputStream ins = RosetteApiWrapper.class.getClassLoader() 39 | .getResourceAsStream("plugin-descriptor.properties")) { 40 | props.load(ins); 41 | String pluginName = props.getProperty("classname") 42 | .substring(props.getProperty("classname").lastIndexOf('.') + 1); 43 | String pluginVersion = props.getProperty("version"); 44 | String elasticVersion = props.getProperty("elasticsearch.version"); 45 | appHeader = String.format("%s-%s/ElasticSearch-%s", pluginName, pluginVersion, elasticVersion); 46 | } catch (IOException e) { 47 | // unreachable or the plugin is broken 48 | } finally { 49 | APP_HEADER = appHeader; 50 | } 51 | } 52 | 53 | // TODO: revisit this when we use embedded client 54 | private HttpRosetteAPI httpRosetteAPI; 55 | 56 | RosetteApiWrapper() { 57 | this(null, null); 58 | } 59 | 60 | RosetteApiWrapper(String apiKey, String altUrl) { 61 | if (Strings.isNullOrEmpty(apiKey)) { 62 | apiKey = System.getenv("ROSETTE_API_KEY"); 63 | } 64 | 65 | if (Strings.isNullOrEmpty(altUrl)) { 66 | altUrl = System.getenv("ROSETTE_API_URL"); 67 | } 68 | 69 | if ((HttpRosetteAPI.DEFAULT_URL_BASE.equalsIgnoreCase(altUrl) || Strings.isNullOrEmpty(altUrl)) 70 | && 
Strings.isNullOrEmpty(apiKey)) { 71 | throw new ElasticsearchException("Rosette plugin requires setting an API Key either via the '" 72 | + RosetteTextAnalysisPlugin.ROSETTE_API_KEY.getKey() 73 | + "' setting, or the 'ROSETTE_API_KEY' environment variable."); 74 | } 75 | 76 | HttpRosetteAPI.Builder clientBuilder = new HttpRosetteAPI.Builder(); 77 | clientBuilder.key(apiKey).additionalHeader("X-RosetteAPI-App", APP_HEADER); 78 | if (!Strings.isNullOrEmpty(altUrl)) { 79 | LOGGER.info("Using alternative URL for Rosette API at : {} ", altUrl); 80 | clientBuilder.url(altUrl); 81 | } 82 | httpRosetteAPI = clientBuilder.build(); 83 | } 84 | 85 | public HttpRosetteAPI getHttpRosetteAPI() { 86 | return httpRosetteAPI; 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /plugin/src/main/java/com/rosette/elasticsearch/RosetteTextAnalysisPlugin.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Basis Technology Corp. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package com.rosette.elasticsearch; 17 | 18 | import java.util.Arrays; 19 | import java.util.HashMap; 20 | import java.util.List; 21 | import java.util.Map; 22 | 23 | import org.elasticsearch.common.settings.Setting; 24 | import org.elasticsearch.ingest.Processor; 25 | import org.elasticsearch.plugins.IngestPlugin; 26 | import org.elasticsearch.plugins.MapperPlugin; 27 | import org.elasticsearch.plugins.Plugin; 28 | 29 | public class RosetteTextAnalysisPlugin extends Plugin implements MapperPlugin, IngestPlugin { 30 | 31 | public static final Setting ROSETTE_API_KEY = 32 | Setting.simpleString("ingest.rosette.api_key", Setting.Property.NodeScope, Setting.Property.Filtered); 33 | public static final Setting ROSETTE_API_URL = 34 | Setting.simpleString("ingest.rosette.api_url", Setting.Property.NodeScope, Setting.Property.Filtered); 35 | 36 | @Override 37 | public List> getSettings() { 38 | return Arrays.asList(ROSETTE_API_KEY, ROSETTE_API_URL); 39 | } 40 | 41 | @Override 42 | public Map getProcessors(Processor.Parameters parameters) { 43 | String key = ROSETTE_API_KEY.get(parameters.env.settings()); 44 | String altURL = ROSETTE_API_URL.get(parameters.env.settings()); 45 | //As this method is called at Node startup, this should ensure only one instance of the api client 46 | RosetteApiWrapper rosAPI = new RosetteApiWrapper(key, altURL); 47 | 48 | Map processors = new HashMap<>(); 49 | processors.put(LanguageProcessor.TYPE, new LanguageProcessor.Factory(rosAPI)); 50 | processors.put(CategoriesProcessor.TYPE, new CategoriesProcessor.Factory(rosAPI)); 51 | processors.put(SentimentProcessor.TYPE, new SentimentProcessor.Factory(rosAPI)); 52 | processors.put(NameTranslationProcessor.TYPE, new NameTranslationProcessor.Factory(rosAPI)); 53 | processors.put(EntitiesProcessor.TYPE, new EntitiesProcessor.Factory(rosAPI)); 54 | return processors; 55 | } 56 | } 57 | -------------------------------------------------------------------------------- 
/plugin/src/main/java/com/rosette/elasticsearch/SentimentProcessor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Basis Technology Corp. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.rosette.elasticsearch; 17 | 18 | import com.basistech.rosette.api.HttpRosetteAPIException; 19 | import com.basistech.rosette.apimodel.DocumentRequest; 20 | import com.basistech.rosette.apimodel.SentimentOptions; 21 | import com.basistech.rosette.apimodel.SentimentResponse; 22 | import org.apache.logging.log4j.Logger; 23 | import org.elasticsearch.ElasticsearchException; 24 | import org.elasticsearch.common.Strings; 25 | import org.elasticsearch.common.logging.Loggers; 26 | import org.elasticsearch.ingest.ConfigurationUtils; 27 | import org.elasticsearch.ingest.IngestDocument; 28 | import org.elasticsearch.ingest.Processor; 29 | 30 | import java.security.AccessController; 31 | import java.security.PrivilegedAction; 32 | import java.util.Map; 33 | 34 | import static com.basistech.rosette.api.common.AbstractRosetteAPI.SENTIMENT_SERVICE_PATH; 35 | 36 | public class SentimentProcessor extends RosetteAbstractProcessor { 37 | 38 | public static final String TYPE = "ros_sentiment"; 39 | 40 | private static final Logger LOGGER = Loggers.getLogger(SentimentProcessor.class, 41 | SentimentProcessor.class.getName()); 42 | 43 | SentimentProcessor(RosetteApiWrapper 
rosAPI, String tag, String description, String inputField, 44 | String targetField) { 45 | super(rosAPI, tag, description, TYPE, inputField, targetField); 46 | } 47 | 48 | @Override 49 | public void processDocument(String inputText, IngestDocument ingestDocument) throws Exception { 50 | // call /sentiment endpoint and set the top result in the field 51 | DocumentRequest request = DocumentRequest.builder() 52 | .content(inputText).build(); 53 | SentimentResponse response; 54 | try { 55 | // RosApi client binding's Jackson needs elevated privilege 56 | response = AccessController.doPrivileged((PrivilegedAction) () -> 57 | rosAPI.getHttpRosetteAPI().perform(SENTIMENT_SERVICE_PATH, request, 58 | SentimentResponse.class) 59 | ); 60 | } catch (HttpRosetteAPIException ex) { 61 | LOGGER.error(ex.getErrorResponse().getMessage()); 62 | throw new ElasticsearchException(ex.getErrorResponse().getMessage(), ex); 63 | } 64 | 65 | if (response.getDocument() != null 66 | && !Strings.isNullOrEmpty(response.getDocument().getLabel())) { 67 | ingestDocument.setFieldValue(targetField, response.getDocument().getLabel()); 68 | } else { 69 | throw new ElasticsearchException(TYPE + " ingest processor failed to determine sentiment of document."); 70 | } 71 | } 72 | 73 | public static final class Factory implements Processor.Factory { 74 | private RosetteApiWrapper rosAPI; 75 | 76 | Factory(RosetteApiWrapper rosAPI) { 77 | this.rosAPI = rosAPI; 78 | } 79 | 80 | @Override 81 | public Processor create(Map registry, String processorTag, 82 | String processorDescription, Map config) throws Exception { 83 | String inputField = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "field"); 84 | String targetField = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, 85 | Parameters.TARGET_FIELD.name, Parameters.TARGET_FIELD.defaultValue); 86 | return new SentimentProcessor(rosAPI, processorTag, processorDescription, inputField, targetField); 87 | } 88 | } 89 | 90 | enum 
Parameters { 91 | TARGET_FIELD("target_field", "ros_sentiment"); 92 | 93 | String name; 94 | String defaultValue; 95 | 96 | Parameters(String name, String defaultValue) { 97 | this.name = name; 98 | this.defaultValue = defaultValue; 99 | } 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /plugin/src/main/resources/plugin-descriptor.properties: -------------------------------------------------------------------------------- 1 | description=${project.description} 2 | version=${project.version} 3 | name=rosapi 4 | classname=com.rosette.elasticsearch.RosetteTextAnalysisPlugin 5 | java.version=${jdk.version} 6 | elasticsearch.version=${elasticsearch.version} 7 | -------------------------------------------------------------------------------- /plugin/src/main/resources/plugin-security.policy: -------------------------------------------------------------------------------- 1 | grant { 2 | permission java.lang.RuntimePermission "accessDeclaredMembers"; 3 | permission java.lang.reflect.ReflectPermission "suppressAccessChecks"; 4 | permission java.net.SocketPermission "*", "connect,resolve"; 5 | }; 6 | -------------------------------------------------------------------------------- /plugin/src/test/java/com/rosette/elasticsearch/CategoriesProcessorTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 Basis Technology Corp. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.rosette.elasticsearch; 17 | 18 | import org.elasticsearch.ingest.IngestDocument; 19 | import org.elasticsearch.ingest.RandomDocumentPicks; 20 | import org.elasticsearch.test.ESSingleNodeTestCase; 21 | import org.hamcrest.MatcherAssert; 22 | import org.hamcrest.Matchers; 23 | import org.junit.Test; 24 | 25 | import java.util.HashMap; 26 | import java.util.Map; 27 | 28 | public class CategoriesProcessorTest extends ESSingleNodeTestCase { 29 | 30 | @Test 31 | public void testCategories() throws Exception { 32 | CategoriesProcessor processor = new CategoriesProcessor(new RosetteApiWrapper(), randomUnicodeOfLength(10), 33 | "description", "text", "category"); 34 | 35 | String inputText = "The people played lots of sports like soccer and hockey. The score was very high. " 36 | + "Touchdown!"; 37 | 38 | Map document = new HashMap<>(); 39 | document.put("text", inputText); 40 | IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); 41 | processor.execute(ingestDocument); 42 | 43 | MatcherAssert.assertThat(ingestDocument.getSourceAndMetadata().get("text"), Matchers.equalTo(inputText)); 44 | MatcherAssert.assertThat(ingestDocument.getSourceAndMetadata().get("category"), Matchers.equalTo("SPORTS")); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /plugin/src/test/java/com/rosette/elasticsearch/EntitiesProcessorTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 Basis Technology Corp. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.rosette.elasticsearch; 17 | 18 | import com.basistech.util.LanguageCode; 19 | import org.elasticsearch.ingest.IngestDocument; 20 | import org.elasticsearch.ingest.RandomDocumentPicks; 21 | import org.elasticsearch.test.ESSingleNodeTestCase; 22 | import org.hamcrest.Matchers; 23 | import org.hamcrest.MatcherAssert; 24 | import org.junit.Test; 25 | 26 | import java.util.HashMap; 27 | import java.util.List; 28 | import java.util.Map; 29 | 30 | public class EntitiesProcessorTest extends ESSingleNodeTestCase { 31 | 32 | private static final String INPUTTEXT = "Original Ghostbuster Dan Aykroyd, who also co-wrote the 1984 Ghostbusters " 33 | + "film, couldn’t be more pleased with the new all-female Ghostbusters cast, telling The Hollywood " 34 | + "Reporter, “The Aykroyd family is delighted by this inheritance of the Ghostbusters torch by these " 35 | + "most magnificent women in comedy."; 36 | 37 | @Test 38 | public void testEntities() throws Exception { 39 | EntitiesProcessor processor = new EntitiesProcessor(new RosetteApiWrapper(), randomUnicodeOfLength(10), 40 | "description", "text", "entities", false, false, 41 | LanguageCode.ENGLISH, false); 42 | 43 | Map document = new HashMap<>(); 44 | document.put("text", INPUTTEXT); 45 | IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); 46 | processor.execute(ingestDocument); 47 | 48 | Map source = ingestDocument.getSourceAndMetadata(); 49 | assertFalse("Entity processor failed to return entities", 
((List)source.get("entities")).isEmpty()); 50 | 51 | Map entity = (Map)((List)source.get("entities")).get(0); 52 | MatcherAssert.assertThat(entity.get("mention"), Matchers.equalTo("Dan Aykroyd")); 53 | //There shouldn't be any sentiment, translations, or offsets 54 | MatcherAssert.assertThat(entity.get("sentiment"), Matchers.nullValue()); 55 | MatcherAssert.assertThat(entity.get("translation"), Matchers.nullValue()); 56 | MatcherAssert.assertThat(entity.get("offsets"), Matchers.nullValue()); 57 | } 58 | 59 | @Test 60 | public void testOffsets() throws Exception { 61 | EntitiesProcessor processor = new EntitiesProcessor(new RosetteApiWrapper(), randomUnicodeOfLength(10), 62 | "description", "text", "entities", true, false, 63 | LanguageCode.ENGLISH, false); 64 | 65 | Map document = new HashMap<>(); 66 | document.put("text", INPUTTEXT); 67 | IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); 68 | processor.execute(ingestDocument); 69 | 70 | Map source = ingestDocument.getSourceAndMetadata(); 71 | assertFalse("Entity processor failed to return entities", ((List)source.get("entities")).isEmpty()); 72 | Map entity = (Map)((List)source.get("entities")).get(0); 73 | assertFalse("Entities are missing offsets", ((List)entity.get("offsets")).isEmpty()); 74 | assertFalse("Entity offsets are empty", ((Map)((List)entity.get("offsets")).get(0)).isEmpty()); 75 | 76 | MatcherAssert.assertThat(entity.get("sentiment"), Matchers.nullValue()); 77 | MatcherAssert.assertThat(entity.get("translation"), Matchers.nullValue()); 78 | } 79 | 80 | @Test 81 | public void testSentiment() throws Exception { 82 | EntitiesProcessor processor = new EntitiesProcessor(new RosetteApiWrapper(), randomUnicodeOfLength(10), 83 | "description", "text", "entities", false, false, 84 | LanguageCode.ENGLISH, true); 85 | 86 | Map document = new HashMap<>(); 87 | document.put("text", INPUTTEXT); 88 | IngestDocument ingestDocument = 
RandomDocumentPicks.randomIngestDocument(random(), document); 89 | processor.execute(ingestDocument); 90 | 91 | Map source = ingestDocument.getSourceAndMetadata(); 92 | assertFalse("Entity processor failed to return entities", ((List)source.get("entities")).isEmpty()); 93 | Map entity = (Map)((List)source.get("entities")).get(0); 94 | MatcherAssert.assertThat(entity.get("sentiment"), Matchers.anything()); 95 | 96 | MatcherAssert.assertThat(entity.get("translation"), Matchers.nullValue()); 97 | MatcherAssert.assertThat(entity.get("offsets"), Matchers.nullValue()); 98 | } 99 | 100 | @Test 101 | public void testTranslate() throws Exception { 102 | EntitiesProcessor processor = new EntitiesProcessor(new RosetteApiWrapper(), randomUnicodeOfLength(10), 103 | "description", "text", "entities", false, true, 104 | LanguageCode.KOREAN, false); 105 | 106 | Map document = new HashMap<>(); 107 | document.put("text", INPUTTEXT); 108 | IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); 109 | processor.execute(ingestDocument); 110 | 111 | Map source = ingestDocument.getSourceAndMetadata(); 112 | assertFalse("Entity processor failed to return entities", ((List)source.get("entities")).isEmpty()); 113 | Map entity = (Map)((List)source.get("entities")).get(0); 114 | MatcherAssert.assertThat(entity.get("translation"), Matchers.anything()); 115 | 116 | MatcherAssert.assertThat(entity.get("sentiment"), Matchers.nullValue()); 117 | MatcherAssert.assertThat(entity.get("offsets"), Matchers.nullValue()); 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /plugin/src/test/java/com/rosette/elasticsearch/LanguageProcessorTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 Basis Technology Corp. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.rosette.elasticsearch; 17 | 18 | import org.elasticsearch.ingest.IngestDocument; 19 | import org.elasticsearch.ingest.RandomDocumentPicks; 20 | import org.elasticsearch.test.ESSingleNodeTestCase; 21 | import org.hamcrest.MatcherAssert; 22 | import org.hamcrest.Matchers; 23 | import org.junit.Test; 24 | 25 | import java.util.HashMap; 26 | import java.util.Map; 27 | 28 | public class LanguageProcessorTest extends ESSingleNodeTestCase { 29 | 30 | @Test 31 | public void testLangId() throws Exception { 32 | LanguageProcessor processor = new LanguageProcessor(new RosetteApiWrapper(), randomUnicodeOfLength(10), 33 | "description", "text", "language"); 34 | 35 | String inputText = "This is a very English document. 
It should be identified as English."; 36 | 37 | Map document = new HashMap<>(); 38 | document.put("text", inputText); 39 | IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); 40 | processor.execute(ingestDocument); 41 | 42 | MatcherAssert.assertThat(ingestDocument.getSourceAndMetadata().get("text"), Matchers.equalTo(inputText)); 43 | MatcherAssert.assertThat(ingestDocument.getSourceAndMetadata().get("language"), Matchers.equalTo("eng")); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /plugin/src/test/java/com/rosette/elasticsearch/MockRosetteInitialization.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 Basis Technology Corp. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package com.rosette.elasticsearch; 17 | 18 | import org.apache.http.HttpHeaders; 19 | import org.mockserver.client.MockServerClient; 20 | import org.mockserver.model.Header; 21 | import org.mockserver.model.HttpRequest; 22 | import org.mockserver.model.HttpResponse; 23 | import org.mockserver.model.Parameter; 24 | import org.mockserver.client.initialize.PluginExpectationInitializer; 25 | 26 | import java.io.BufferedReader; 27 | import java.io.IOException; 28 | import java.io.InputStream; 29 | import java.io.InputStreamReader; 30 | import java.nio.charset.StandardCharsets; 31 | 32 | //Mock-server is launched before unit tests run and serves up a mocked json response for each endpoint 33 | public class MockRosetteInitialization implements PluginExpectationInitializer { 34 | 35 | @Override 36 | public void initializeExpectations(MockServerClient mockServerClient) { 37 | String baseURL = System.getProperty("mockserver.baseurl", "/rest/worker/v1/"); 38 | //Specific case for when the EntityProcessor calls sentiment expecting an ADM 39 | //This has to be called before the other sentiment endpoint is added 40 | addSentimentADM(baseURL, mockServerClient); 41 | 42 | addEndpoint(baseURL, "categories", mockServerClient); 43 | addEndpoint(baseURL, "sentiment", mockServerClient); 44 | addEndpoint(baseURL, "language", mockServerClient); 45 | addEndpoint(baseURL, "entities", mockServerClient); 46 | addEndpoint(baseURL, "name-translation", mockServerClient); 47 | } 48 | 49 | private void addEndpoint(String baseURL, String endpointName, MockServerClient mockServerClient) { 50 | try (InputStream is = getClass().getClassLoader() 51 | .getResourceAsStream("mock_responses/" + endpointName + "_response.json")) { 52 | String response = getStringFromResource(is); 53 | mockServerClient.when(HttpRequest.request() 54 | .withMethod("POST") 55 | .withPath(baseURL + endpointName)) 56 | .respond(HttpResponse.response() 57 | .withStatusCode(200) 58 | .withHeaders( 59 | new 
Header(HttpHeaders.CONTENT_TYPE, "application/json") 60 | ) 61 | .withBody(response)); 62 | } catch (IOException ioe) { 63 | throw new RuntimeException(ioe); 64 | } 65 | } 66 | 67 | private void addSentimentADM(String baseURL, MockServerClient mockServerClient) { 68 | try (InputStream is = getClass().getClassLoader() 69 | .getResourceAsStream("mock_responses/sentiment_adm_response.json")) { 70 | String response = getStringFromResource(is); 71 | mockServerClient.when(HttpRequest.request() 72 | .withMethod("POST") 73 | .withPath(baseURL + "sentiment").withQueryStringParameter(new Parameter("output", "rosette"))) 74 | .respond(HttpResponse.response() 75 | .withStatusCode(200) 76 | .withHeaders( 77 | new Header(HttpHeaders.CONTENT_TYPE, "application/json") 78 | ) 79 | .withBody(response)); 80 | } catch (IOException ioe) { 81 | throw new RuntimeException(ioe); 82 | } 83 | } 84 | 85 | private String getStringFromResource(InputStream is) throws IOException { 86 | StringBuilder sb = new StringBuilder(); 87 | String line; 88 | try (BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8.name()))) { 89 | while ((line = br.readLine()) != null) { 90 | sb.append(line); 91 | } 92 | } 93 | return sb.toString(); 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /plugin/src/test/java/com/rosette/elasticsearch/NameTranslationProcessorTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 Basis Technology Corp. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.rosette.elasticsearch; 17 | 18 | import com.basistech.util.ISO15924; 19 | import com.basistech.util.LanguageCode; 20 | import org.elasticsearch.ingest.IngestDocument; 21 | import org.elasticsearch.ingest.RandomDocumentPicks; 22 | import org.elasticsearch.test.ESSingleNodeTestCase; 23 | import org.hamcrest.MatcherAssert; 24 | import org.hamcrest.Matchers; 25 | import org.junit.Test; 26 | 27 | import java.util.HashMap; 28 | import java.util.Map; 29 | 30 | public class NameTranslationProcessorTest extends ESSingleNodeTestCase { 31 | 32 | @Test 33 | public void testTranslateToEnglish() throws Exception { 34 | NameTranslationProcessor processor = new NameTranslationProcessor(new RosetteApiWrapper(), 35 | randomUnicodeOfLength(10), "description", "text", "translation", 36 | LanguageCode.ENGLISH, ISO15924.Latn, "PERSON", LanguageCode.RUSSIAN, ISO15924.Cyrl, 37 | LanguageCode.UNKNOWN); 38 | 39 | String inputText = "Владимир Путин"; 40 | 41 | Map document = new HashMap<>(); 42 | document.put("text", inputText); 43 | IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); 44 | processor.execute(ingestDocument); 45 | 46 | MatcherAssert.assertThat(ingestDocument.getSourceAndMetadata().get("text"), Matchers.equalTo(inputText)); 47 | MatcherAssert.assertThat(ingestDocument.getSourceAndMetadata().get("translation"), 48 | Matchers.equalTo("Vladimir Putin")); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- 
/plugin/src/test/java/com/rosette/elasticsearch/RosetteAbstractProcessorTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 Basis Technology Corp. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.rosette.elasticsearch; 17 | 18 | import org.elasticsearch.ElasticsearchException; 19 | import org.elasticsearch.ingest.IngestDocument; 20 | import org.elasticsearch.ingest.RandomDocumentPicks; 21 | import org.elasticsearch.test.ESSingleNodeTestCase; 22 | import org.hamcrest.MatcherAssert; 23 | import org.hamcrest.Matchers; 24 | import org.junit.Test; 25 | 26 | import java.util.HashMap; 27 | import java.util.Map; 28 | 29 | public class RosetteAbstractProcessorTest extends ESSingleNodeTestCase { 30 | 31 | class MockProcessor extends RosetteAbstractProcessor { 32 | MockProcessor(RosetteApiWrapper rosAPI, String tag, String description, String inputField, String targetField) { 33 | super(rosAPI, tag, description, "mock_processor", inputField, targetField); 34 | } 35 | 36 | @Override 37 | public void processDocument(String inputText, IngestDocument ingestDocument) throws Exception { 38 | ingestDocument.setFieldValue(targetField, "Processed!"); 39 | } 40 | } 41 | 42 | @Test 43 | public void testEmptyField() throws Exception { 44 | MockProcessor processor = new MockProcessor(new RosetteApiWrapper(), randomUnicodeOfLength(10), 45 | "description", "text", 
"target"); 46 | 47 | //Process document with an empty "text" field 48 | Map document = new HashMap<>(); 49 | IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); 50 | processor.execute(ingestDocument); 51 | 52 | //Nothing should get placed in the target field 53 | MatcherAssert.assertThat("Processor should not process empty ingest field", 54 | ingestDocument.getSourceAndMetadata().get("target"), Matchers.nullValue()); 55 | } 56 | 57 | @Test(expected = ElasticsearchException.class) 58 | public void testOverwrite() throws Exception { 59 | MockProcessor processor = new MockProcessor(new RosetteApiWrapper(), randomUnicodeOfLength(10), 60 | "description", "text", "target"); 61 | 62 | //Process document with a value already in the target field 63 | Map document = new HashMap<>(); 64 | document.put("text", "input text"); 65 | document.put("target", "don't overwrite me!"); 66 | IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); 67 | 68 | //We expect an exception to be thrown 69 | processor.execute(ingestDocument); 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /plugin/src/test/java/com/rosette/elasticsearch/RosetteTextAnalysisPluginIT.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 Basis Technology Corp. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.rosette.elasticsearch; 17 | 18 | import org.elasticsearch.action.admin.cluster.node.info.NodeInfo; 19 | import org.elasticsearch.action.admin.cluster.node.info.NodesInfoRequest; 20 | import org.elasticsearch.action.admin.cluster.node.info.NodesInfoResponse; 21 | import org.elasticsearch.action.admin.cluster.node.info.PluginsAndModules; 22 | import org.elasticsearch.action.index.IndexResponse; 23 | import org.elasticsearch.action.support.master.AcknowledgedResponse; 24 | import org.elasticsearch.action.search.SearchResponse; 25 | import org.elasticsearch.common.bytes.BytesArray; 26 | import org.elasticsearch.common.settings.Settings; 27 | import org.elasticsearch.xcontent.XContentFactory; 28 | import org.elasticsearch.xcontent.XContentType; 29 | import org.elasticsearch.index.query.QueryBuilders; 30 | import org.elasticsearch.plugins.Plugin; 31 | import org.elasticsearch.plugins.PluginInfo; 32 | import org.elasticsearch.rest.RestStatus; 33 | import org.elasticsearch.test.ESIntegTestCase; 34 | import org.elasticsearch.test.hamcrest.ElasticsearchAssertions; 35 | import org.hamcrest.MatcherAssert; 36 | import org.hamcrest.Matchers; 37 | import org.junit.Test; 38 | 39 | import java.io.BufferedReader; 40 | import java.io.IOException; 41 | import java.io.InputStream; 42 | import java.io.InputStreamReader; 43 | import java.nio.charset.StandardCharsets; 44 | import java.util.Collection; 45 | import java.util.Collections; 46 | import java.util.List; 47 | import java.util.Map; 48 | 49 | //Tests all processors against an running embedded ES instance using the deployed Rosette API 50 | public class RosetteTextAnalysisPluginIT extends ESIntegTestCase { 51 | 52 | @Override 53 | protected Settings nodeSettings(int nodeOrdinal, Settings otherSettings) { 54 | return Settings.builder() 55 | .put(super.nodeSettings(nodeOrdinal, otherSettings)) 56 | 
.put(RosetteTextAnalysisPlugin.ROSETTE_API_KEY.getKey(), System.getProperty("rosette.api.key", "")) 57 | .build(); 58 | } 59 | 60 | @Override 61 | protected Collection> nodePlugins() { 62 | return Collections.singleton(RosetteTextAnalysisPlugin.class); 63 | } 64 | 65 | @Test 66 | public void testPluginIsLoaded() throws Exception { 67 | NodesInfoResponse response = client().admin().cluster().prepareNodesInfo() 68 | .addMetric(NodesInfoRequest.Metric.PLUGINS.metricName()).get(); 69 | for (NodeInfo nodeInfo : response.getNodes()) { 70 | boolean pluginFound = false; 71 | for (PluginInfo pluginInfo : nodeInfo.getInfo(PluginsAndModules.class).getPluginInfos()) { 72 | if (pluginInfo.getName().equals(RosetteTextAnalysisPlugin.class.getName())) { 73 | pluginFound = true; 74 | break; 75 | } 76 | } 77 | assertTrue(pluginFound); 78 | } 79 | } 80 | 81 | //Tests the language processor 82 | @Test 83 | public void testLanguage() throws Exception { 84 | 85 | String inputText = "This is a very English document. It should be identified as English."; 86 | 87 | SearchResponse response = exercisePipeline(inputText, "language"); 88 | 89 | //Check the source for the expected language 90 | MatcherAssert.assertThat(response.getHits().getAt(0).getSourceAsMap() 91 | .get(LanguageProcessor.Parameters.TARGET_FIELD.defaultValue), Matchers.equalTo("eng")); 92 | } 93 | 94 | @Test 95 | public void testCategories() throws Exception { 96 | 97 | String inputText = "The people played lots of sports like soccer and hockey. The score was very high. 
" 98 | + "Touchdown!"; 99 | 100 | SearchResponse response = exercisePipeline(inputText, "categories"); 101 | 102 | //Check the source for the expected category 103 | MatcherAssert.assertThat(response.getHits().getAt(0).getSourceAsMap() 104 | .get(CategoriesProcessor.Parameters.TARGET_FIELD.defaultValue), Matchers.equalTo("SPORTS")); 105 | } 106 | 107 | @Test 108 | public void testSentiment() throws Exception { 109 | 110 | String inputText = "I love this sentence so much I want to marry it!"; 111 | 112 | SearchResponse response = exercisePipeline(inputText, "sentiment"); 113 | 114 | //Check the source for the expected sentiment 115 | MatcherAssert.assertThat(response.getHits().getAt(0).getSourceAsMap() 116 | .get(SentimentProcessor.Parameters.TARGET_FIELD.defaultValue), Matchers.equalTo("pos")); 117 | } 118 | 119 | @Test 120 | public void testTranslateToEnglish() throws Exception { 121 | 122 | String inputText = "Владимир Путин"; 123 | 124 | SearchResponse response = exercisePipeline(inputText, "translate_eng"); 125 | 126 | //Check the source for the expected English translation 127 | MatcherAssert.assertThat(response.getHits().getAt(0).getSourceAsMap() 128 | .get(NameTranslationProcessor.Parameters.TARGET_FIELD.defaultValue), 129 | Matchers.equalTo("Vladimir Putin")); 130 | } 131 | 132 | @Test 133 | public void testTranslateFromEnglish() throws Exception { 134 | String inputText = "Vladimir Putin"; 135 | 136 | SearchResponse response = exercisePipeline(inputText, "translate_rus"); 137 | 138 | //Check the source for the expected Russian translation 139 | MatcherAssert.assertThat(response.getHits().getAt(0).getSourceAsMap() 140 | .get(NameTranslationProcessor.Parameters.TARGET_FIELD.defaultValue), 141 | Matchers.equalTo("Владимир Путин")); 142 | } 143 | 144 | @Test 145 | public void testEntities() throws Exception { 146 | 147 | String inputText = "Original Ghostbuster Dan Aykroyd, who also co-wrote the 1984 Ghostbusters film, couldn’t " 148 | + "be more pleased with 
the new all-female Ghostbusters cast, telling The Hollywood Reporter, “The " 149 | + "Aykroyd family is delighted by this inheritance of the Ghostbusters torch by these most magnificent " 150 | + "women in comedy.”"; 151 | 152 | SearchResponse response = exercisePipeline(inputText, "entities"); 153 | 154 | //Check the source for the expected entity result 155 | assertFalse(((List)response.getHits().getAt(0).getSourceAsMap() 156 | .get(EntitiesProcessor.Parameters.TARGET_FIELD.defaultValue)).isEmpty()); 157 | Map entity = (Map)((List)response.getHits().getAt(0).getSourceAsMap() 158 | .get(EntitiesProcessor.Parameters.TARGET_FIELD.defaultValue)).get(0); 159 | MatcherAssert.assertThat(entity.get("mention"), Matchers.equalTo("Original Ghostbuster Dan Aykroyd")); 160 | } 161 | 162 | @Test 163 | public void testEntitiesWithSentiment() throws Exception { 164 | 165 | String inputText = "Original Ghostbuster Dan Aykroyd, who also co-wrote the 1984 Ghostbusters film, couldn’t " 166 | + "be more pleased with the new all-female Ghostbusters cast, telling The Hollywood Reporter, “The " 167 | + "Aykroyd family is delighted by this inheritance of the Ghostbusters torch by these most magnificent " 168 | + "women in comedy.”"; 169 | 170 | SearchResponse response = exercisePipeline(inputText, "entities_sentiment"); 171 | 172 | //Check the source for the expected entity level sentiment 173 | assertFalse(((List)response.getHits().getAt(0).getSourceAsMap() 174 | .get(EntitiesProcessor.Parameters.TARGET_FIELD.defaultValue)).isEmpty()); 175 | Map entity = (Map)((List)response.getHits().getAt(0).getSourceAsMap() 176 | .get(EntitiesProcessor.Parameters.TARGET_FIELD.defaultValue)).get(0); 177 | MatcherAssert.assertThat(entity.get("mention"), Matchers.equalTo("Original Ghostbuster Dan Aykroyd")); 178 | MatcherAssert.assertThat(entity.get("sentiment"), Matchers.equalTo("pos")); 179 | } 180 | 181 | //Test that all (or most) of the processors work together 182 | @Test 183 | public void 
testAll() throws Exception { 184 | 185 | String inputText = "Original Ghostbuster Dan Aykroyd, who also co-wrote the 1984 Ghostbusters film, couldn’t " 186 | + "be more pleased with the new all-female Ghostbusters cast, telling The Hollywood Reporter, “The " 187 | + "Aykroyd family is delighted by this inheritance of the Ghostbusters torch by these most magnificent " 188 | + "women in comedy.”"; 189 | 190 | SearchResponse response = exercisePipeline(inputText, "all"); 191 | 192 | //Check the source for the expected entity result 193 | Map source = response.getHits().getAt(0).getSourceAsMap(); 194 | MatcherAssert.assertThat(source.get(LanguageProcessor.Parameters.TARGET_FIELD.defaultValue), 195 | Matchers.equalTo("eng")); 196 | MatcherAssert.assertThat(source.get(CategoriesProcessor.Parameters.TARGET_FIELD.defaultValue), 197 | Matchers.equalTo("ARTS_AND_ENTERTAINMENT")); 198 | MatcherAssert.assertThat(source.get(SentimentProcessor.Parameters.TARGET_FIELD.defaultValue), 199 | Matchers.equalTo("pos")); 200 | 201 | assertFalse(((List)source.get(EntitiesProcessor.Parameters.TARGET_FIELD.defaultValue)).isEmpty()); 202 | Map entity = (Map)((List)source.get(EntitiesProcessor.Parameters.TARGET_FIELD.defaultValue)).get(0); 203 | MatcherAssert.assertThat(entity.get("mention"), Matchers.equalTo("Original Ghostbuster Dan Aykroyd")); 204 | } 205 | 206 | private SearchResponse exercisePipeline(String inputText, String pipelineName) throws IOException { 207 | 208 | //Add the ingest pipeline 209 | AcknowledgedResponse pipelineResponse = client().admin().cluster() 210 | .preparePutPipeline(pipelineName, getProcessorConfig(pipelineName), XContentType.JSON).get(); 211 | assertTrue("Failed to add ingest pipeline", pipelineResponse.isAcknowledged()); 212 | 213 | //Add a document that uses the ingest pipeline 214 | IndexResponse indexResponse = client().prepareIndex("test", "test").setPipeline(pipelineName) 215 | .setSource(XContentFactory.jsonBuilder().startObject().field("text", 
inputText) 216 | .endObject()).get(); 217 | assertEquals("Failed to index document correctly", RestStatus.CREATED, indexResponse.status()); 218 | //Force index refresh 219 | refresh("test"); 220 | 221 | //Find the document 222 | SearchResponse response = client().prepareSearch("test").setQuery(QueryBuilders.matchAllQuery()).get(); 223 | ElasticsearchAssertions.assertNoFailures(response); 224 | 225 | return response; 226 | } 227 | 228 | private BytesArray getProcessorConfig(String name) throws IOException { 229 | try (InputStream is = getClass().getClassLoader().getResourceAsStream("it_processors/" + name + ".json")) { 230 | StringBuilder sb = new StringBuilder(); 231 | String line; 232 | try (BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8.name()))) { 233 | while ((line = br.readLine()) != null) { 234 | sb.append(line); 235 | } 236 | } 237 | return new BytesArray(sb.toString()); 238 | } 239 | } 240 | } 241 | -------------------------------------------------------------------------------- /plugin/src/test/java/com/rosette/elasticsearch/SentimentProcessorTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 Basis Technology Corp. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package com.rosette.elasticsearch; 17 | 18 | import org.elasticsearch.ingest.IngestDocument; 19 | import org.elasticsearch.ingest.RandomDocumentPicks; 20 | import org.elasticsearch.test.ESSingleNodeTestCase; 21 | import org.hamcrest.MatcherAssert; 22 | import org.hamcrest.Matchers; 23 | import org.junit.Test; 24 | 25 | import java.util.HashMap; 26 | import java.util.Map; 27 | 28 | public class SentimentProcessorTest extends ESSingleNodeTestCase { 29 | 30 | @Test 31 | public void testSentiment() throws Exception { 32 | SentimentProcessor processor = new SentimentProcessor(new RosetteApiWrapper(), randomUnicodeOfLength(10), 33 | "description", "text", "sentiment"); 34 | 35 | String inputText = "I love this sentence so much I want to marry it!"; 36 | 37 | Map document = new HashMap<>(); 38 | document.put("text", inputText); 39 | IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); 40 | processor.execute(ingestDocument); 41 | 42 | MatcherAssert.assertThat(ingestDocument.getSourceAndMetadata().get("text"), Matchers.equalTo(inputText)); 43 | MatcherAssert.assertThat(ingestDocument.getSourceAndMetadata().get("sentiment"), Matchers.equalTo("pos")); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /plugin/src/test/resources/elasticsearch.version: -------------------------------------------------------------------------------- 1 | ${elasticsearch.version} 2 | -------------------------------------------------------------------------------- /plugin/src/test/resources/it_processors/all.json: -------------------------------------------------------------------------------- 1 | { "processors" : 2 | [ 3 | { 4 | "ros_categories" : { "field": "text" } 5 | }, 6 | { 7 | "ros_language" : { "field": "text" } 8 | }, 9 | { 10 | "ros_sentiment" : { "field": "text" } 11 | }, 12 | { 13 | "ros_entities": { 14 | "field": "text", 15 | "include_offsets": false, 16 | "include_translation": 
false, 17 | "include_sentiment": false 18 | } 19 | } 20 | ] 21 | } -------------------------------------------------------------------------------- /plugin/src/test/resources/it_processors/categories.json: -------------------------------------------------------------------------------- 1 | { "processors" : [ { "ros_categories" : { "field": "text" } } ] } -------------------------------------------------------------------------------- /plugin/src/test/resources/it_processors/entities.json: -------------------------------------------------------------------------------- 1 | { "processors" : [ { "ros_entities" : { "field": "text", "include_offsets": true, "include_translation": true, "include_sentiment": false } } ] } -------------------------------------------------------------------------------- /plugin/src/test/resources/it_processors/entities_sentiment.json: -------------------------------------------------------------------------------- 1 | { "processors" : [ { "ros_entities" : { "field": "text", "include_offsets": true, "include_translation": true, "include_sentiment": true } } ] } -------------------------------------------------------------------------------- /plugin/src/test/resources/it_processors/language.json: -------------------------------------------------------------------------------- 1 | { "processors" : [ { "ros_language" : { "field": "text" } } ] } -------------------------------------------------------------------------------- /plugin/src/test/resources/it_processors/sentiment.json: -------------------------------------------------------------------------------- 1 | { "processors" : [ { "ros_sentiment" : { "field": "text" } } ] } -------------------------------------------------------------------------------- /plugin/src/test/resources/it_processors/translate_eng.json: -------------------------------------------------------------------------------- 1 | { "processors" : [ { "ros_name_translation" : { "field": "text" } } ] } 
-------------------------------------------------------------------------------- /plugin/src/test/resources/it_processors/translate_rus.json: -------------------------------------------------------------------------------- 1 | { "processors" : [ { "ros_name_translation" : { "field": "text", "target_language": "rus" } } ] } -------------------------------------------------------------------------------- /plugin/src/test/resources/mock_responses/categories_response.json: -------------------------------------------------------------------------------- 1 | { 2 | "categories": [ 3 | { 4 | "label": "SPORTS", 5 | "confidence": 0.07876023835418125 6 | } 7 | ] 8 | } -------------------------------------------------------------------------------- /plugin/src/test/resources/mock_responses/entities_response.json: -------------------------------------------------------------------------------- 1 | { 2 | "version":"1.1.0", 3 | "data":"Original Ghostbuster Dan Aykroyd, who also co-wrote the 1984 Ghostbusters film, couldn’t be more pleased with the new all-female Ghostbusters cast, telling The Hollywood Reporter, “The Aykroyd family is delighted by this inheritance of the Ghostbusters torch by these most magnificent women in comedy.", 4 | "attributes":{ 5 | "scriptRegion":{ 6 | "type":"list", 7 | "itemType":"scriptRegion", 8 | "items":[ 9 | { 10 | "startOffset":0, 11 | "endOffset":301, 12 | "script":"Latn" 13 | } 14 | ] 15 | }, 16 | "sentence":{ 17 | "type":"list", 18 | "itemType":"sentence", 19 | "items":[ 20 | { 21 | "startOffset":0, 22 | "endOffset":301 23 | } 24 | ] 25 | }, 26 | "languageDetection":{ 27 | "type":"languageDetection", 28 | "startOffset":0, 29 | "endOffset":301, 30 | "detectionResults":[ 31 | { 32 | "language":"eng", 33 | "encoding":"UTF-16BE", 34 | "script":"Latn", 35 | "confidence":0.02120806448228041 36 | } 37 | ] 38 | }, 39 | "token":{ 40 | "type":"list", 41 | "itemType":"token", 42 | "items":[ 43 | { 44 | "startOffset":0, 45 | "endOffset":8, 46 | 
"text":"Original" 47 | }, 48 | { 49 | "startOffset":9, 50 | "endOffset":20, 51 | "text":"Ghostbuster" 52 | }, 53 | { 54 | "startOffset":21, 55 | "endOffset":24, 56 | "text":"Dan" 57 | }, 58 | { 59 | "startOffset":25, 60 | "endOffset":32, 61 | "text":"Aykroyd" 62 | }, 63 | { 64 | "startOffset":32, 65 | "endOffset":33, 66 | "text":"," 67 | }, 68 | { 69 | "startOffset":34, 70 | "endOffset":37, 71 | "text":"who" 72 | }, 73 | { 74 | "startOffset":38, 75 | "endOffset":42, 76 | "text":"also" 77 | }, 78 | { 79 | "startOffset":43, 80 | "endOffset":45, 81 | "text":"co" 82 | }, 83 | { 84 | "startOffset":45, 85 | "endOffset":46, 86 | "text":"-" 87 | }, 88 | { 89 | "startOffset":46, 90 | "endOffset":51, 91 | "text":"wrote" 92 | }, 93 | { 94 | "startOffset":52, 95 | "endOffset":55, 96 | "text":"the" 97 | }, 98 | { 99 | "startOffset":56, 100 | "endOffset":60, 101 | "text":"1984" 102 | }, 103 | { 104 | "startOffset":61, 105 | "endOffset":73, 106 | "text":"Ghostbusters" 107 | }, 108 | { 109 | "startOffset":74, 110 | "endOffset":78, 111 | "text":"film" 112 | }, 113 | { 114 | "startOffset":78, 115 | "endOffset":79, 116 | "text":"," 117 | }, 118 | { 119 | "startOffset":80, 120 | "endOffset":88, 121 | "text":"couldn’t" 122 | }, 123 | { 124 | "startOffset":89, 125 | "endOffset":91, 126 | "text":"be" 127 | }, 128 | { 129 | "startOffset":92, 130 | "endOffset":96, 131 | "text":"more" 132 | }, 133 | { 134 | "startOffset":97, 135 | "endOffset":104, 136 | "text":"pleased" 137 | }, 138 | { 139 | "startOffset":105, 140 | "endOffset":109, 141 | "text":"with" 142 | }, 143 | { 144 | "startOffset":110, 145 | "endOffset":113, 146 | "text":"the" 147 | }, 148 | { 149 | "startOffset":114, 150 | "endOffset":117, 151 | "text":"new" 152 | }, 153 | { 154 | "startOffset":118, 155 | "endOffset":121, 156 | "text":"all" 157 | }, 158 | { 159 | "startOffset":121, 160 | "endOffset":122, 161 | "text":"-" 162 | }, 163 | { 164 | "startOffset":122, 165 | "endOffset":128, 166 | "text":"female" 167 | }, 168 | { 169 | 
"startOffset":129, 170 | "endOffset":141, 171 | "text":"Ghostbusters" 172 | }, 173 | { 174 | "startOffset":142, 175 | "endOffset":146, 176 | "text":"cast" 177 | }, 178 | { 179 | "startOffset":146, 180 | "endOffset":147, 181 | "text":"," 182 | }, 183 | { 184 | "startOffset":148, 185 | "endOffset":155, 186 | "text":"telling" 187 | }, 188 | { 189 | "startOffset":156, 190 | "endOffset":159, 191 | "text":"The" 192 | }, 193 | { 194 | "startOffset":160, 195 | "endOffset":169, 196 | "text":"Hollywood" 197 | }, 198 | { 199 | "startOffset":170, 200 | "endOffset":178, 201 | "text":"Reporter" 202 | }, 203 | { 204 | "startOffset":178, 205 | "endOffset":179, 206 | "text":"," 207 | }, 208 | { 209 | "startOffset":180, 210 | "endOffset":181, 211 | "text":"“" 212 | }, 213 | { 214 | "startOffset":181, 215 | "endOffset":184, 216 | "text":"The" 217 | }, 218 | { 219 | "startOffset":185, 220 | "endOffset":192, 221 | "text":"Aykroyd" 222 | }, 223 | { 224 | "startOffset":193, 225 | "endOffset":199, 226 | "text":"family" 227 | }, 228 | { 229 | "startOffset":200, 230 | "endOffset":202, 231 | "text":"is" 232 | }, 233 | { 234 | "startOffset":203, 235 | "endOffset":212, 236 | "text":"delighted" 237 | }, 238 | { 239 | "startOffset":213, 240 | "endOffset":215, 241 | "text":"by" 242 | }, 243 | { 244 | "startOffset":216, 245 | "endOffset":220, 246 | "text":"this" 247 | }, 248 | { 249 | "startOffset":221, 250 | "endOffset":232, 251 | "text":"inheritance" 252 | }, 253 | { 254 | "startOffset":233, 255 | "endOffset":235, 256 | "text":"of" 257 | }, 258 | { 259 | "startOffset":236, 260 | "endOffset":239, 261 | "text":"the" 262 | }, 263 | { 264 | "startOffset":240, 265 | "endOffset":252, 266 | "text":"Ghostbusters" 267 | }, 268 | { 269 | "startOffset":253, 270 | "endOffset":258, 271 | "text":"torch" 272 | }, 273 | { 274 | "startOffset":259, 275 | "endOffset":261, 276 | "text":"by" 277 | }, 278 | { 279 | "startOffset":262, 280 | "endOffset":267, 281 | "text":"these" 282 | }, 283 | { 284 | 
"startOffset":268, 285 | "endOffset":272, 286 | "text":"most" 287 | }, 288 | { 289 | "startOffset":273, 290 | "endOffset":284, 291 | "text":"magnificent" 292 | }, 293 | { 294 | "startOffset":285, 295 | "endOffset":290, 296 | "text":"women" 297 | }, 298 | { 299 | "startOffset":291, 300 | "endOffset":293, 301 | "text":"in" 302 | }, 303 | { 304 | "startOffset":294, 305 | "endOffset":300, 306 | "text":"comedy" 307 | }, 308 | { 309 | "startOffset":300, 310 | "endOffset":301, 311 | "text":"." 312 | } 313 | ] 314 | }, 315 | "entities":{ 316 | "type":"list", 317 | "itemType":"entities", 318 | "items":[ 319 | { 320 | "mentions":[ 321 | { 322 | "startOffset":21, 323 | "endOffset":32, 324 | "source":"kb-linker", 325 | "normalized":"Dan Aykroyd" 326 | }, 327 | { 328 | "startOffset":185, 329 | "endOffset":192, 330 | "source":"statistical", 331 | "subsource":"/data/roots/rex/7.24.1.c58.3/data/statistical/eng/model-LE.bin", 332 | "normalized":"Aykroyd" 333 | } 334 | ], 335 | "headMentionIndex":0, 336 | "type":"PERSON", 337 | "entityId":"Q105221" 338 | }, 339 | { 340 | "mentions":[ 341 | { 342 | "startOffset":61, 343 | "endOffset":73, 344 | "source":"kb-linker", 345 | "normalized":"Ghostbusters" 346 | }, 347 | { 348 | "startOffset":129, 349 | "endOffset":141, 350 | "source":"kb-linker", 351 | "normalized":"Ghostbusters" 352 | }, 353 | { 354 | "startOffset":240, 355 | "endOffset":252, 356 | "source":"kb-linker", 357 | "normalized":"Ghostbusters" 358 | } 359 | ], 360 | "headMentionIndex":0, 361 | "type":"PRODUCT", 362 | "entityId":"Q108745" 363 | }, 364 | { 365 | "mentions":[ 366 | { 367 | "startOffset":156, 368 | "endOffset":178, 369 | "source":"kb-linker", 370 | "normalized":"The Hollywood Reporter" 371 | } 372 | ], 373 | "headMentionIndex":0, 374 | "type":"ORGANIZATION", 375 | "entityId":"Q61503" 376 | } 377 | ] 378 | } 379 | }, 380 | "documentMetadata":{ 381 | "processedBy":[ 382 | "whole-document-language@10.28.73.67", 383 | "entity-extraction@10.28.77.104" 384 | ] 385 | } 386 
| } -------------------------------------------------------------------------------- /plugin/src/test/resources/mock_responses/language_response.json: -------------------------------------------------------------------------------- 1 | { 2 | "languageDetections": [ 3 | { 4 | "language": "eng", 5 | "confidence": 0.5936566252009456 6 | }, 7 | { 8 | "language": "por", 9 | "confidence": 0.07179441867600847 10 | }, 11 | { 12 | "language": "ita", 13 | "confidence": 0.05314874058696064 14 | }, 15 | { 16 | "language": "fra", 17 | "confidence": 0.04732445026624705 18 | }, 19 | { 20 | "language": "spa", 21 | "confidence": 0.04249449401047532 22 | } 23 | ] 24 | } -------------------------------------------------------------------------------- /plugin/src/test/resources/mock_responses/name-translation_response.json: -------------------------------------------------------------------------------- 1 | { 2 | "translation": "Vladimir Putin", 3 | "targetLanguage": "eng", 4 | "targetScript": "Latn", 5 | "targetScheme": "IC", 6 | "confidence": 0.45912207430901864 7 | } -------------------------------------------------------------------------------- /plugin/src/test/resources/mock_responses/sentiment_adm_response.json: -------------------------------------------------------------------------------- 1 | { 2 | "version":"1.1.0", 3 | "data":"Original Ghostbuster Dan Aykroyd, who also co-wrote the 1984 Ghostbusters film, couldn’t be more pleased with the new all-female Ghostbusters cast, telling The Hollywood Reporter, “The Aykroyd family is delighted by this inheritance of the Ghostbusters torch by these most magnificent women in comedy.", 4 | "attributes":{ 5 | "sentence":{ 6 | "type":"list", 7 | "itemType":"sentence", 8 | "items":[ 9 | { 10 | "startOffset":0, 11 | "endOffset":301 12 | } 13 | ] 14 | }, 15 | "languageDetection":{ 16 | "type":"languageDetection", 17 | "startOffset":0, 18 | "endOffset":301, 19 | "detectionResults":[ 20 | { 21 | "language":"eng", 22 | "encoding":"UTF-16BE", 
23 | "script":"Latn", 24 | "confidence":0.02120806448228041 25 | } 26 | ] 27 | }, 28 | "scriptRegion":{ 29 | "type":"list", 30 | "itemType":"scriptRegion", 31 | "items":[ 32 | { 33 | "startOffset":0, 34 | "endOffset":301, 35 | "script":"Latn" 36 | } 37 | ] 38 | }, 39 | "sentimentResults":{ 40 | "type":"list", 41 | "itemType":"categorizerResults", 42 | "items":[ 43 | { 44 | "label":"pos", 45 | "score":0.6234125839546323, 46 | "confidence":0.7962072011038756, 47 | "explanationSet":[ 48 | "pleased", 49 | "hollywood", 50 | "wrote", 51 | "*POS_LEX*", 52 | "new" 53 | ] 54 | } 55 | ] 56 | }, 57 | "token":{ 58 | "type":"list", 59 | "itemType":"token", 60 | "items":[ 61 | { 62 | "startOffset":0, 63 | "endOffset":8, 64 | "text":"Original" 65 | }, 66 | { 67 | "startOffset":9, 68 | "endOffset":20, 69 | "text":"Ghostbuster" 70 | }, 71 | { 72 | "startOffset":21, 73 | "endOffset":24, 74 | "text":"Dan" 75 | }, 76 | { 77 | "startOffset":25, 78 | "endOffset":32, 79 | "text":"Aykroyd" 80 | }, 81 | { 82 | "startOffset":32, 83 | "endOffset":33, 84 | "text":"," 85 | }, 86 | { 87 | "startOffset":34, 88 | "endOffset":37, 89 | "text":"who" 90 | }, 91 | { 92 | "startOffset":38, 93 | "endOffset":42, 94 | "text":"also" 95 | }, 96 | { 97 | "startOffset":43, 98 | "endOffset":45, 99 | "text":"co" 100 | }, 101 | { 102 | "startOffset":45, 103 | "endOffset":46, 104 | "text":"-" 105 | }, 106 | { 107 | "startOffset":46, 108 | "endOffset":51, 109 | "text":"wrote" 110 | }, 111 | { 112 | "startOffset":52, 113 | "endOffset":55, 114 | "text":"the" 115 | }, 116 | { 117 | "startOffset":56, 118 | "endOffset":60, 119 | "text":"1984" 120 | }, 121 | { 122 | "startOffset":61, 123 | "endOffset":73, 124 | "text":"Ghostbusters" 125 | }, 126 | { 127 | "startOffset":74, 128 | "endOffset":78, 129 | "text":"film" 130 | }, 131 | { 132 | "startOffset":78, 133 | "endOffset":79, 134 | "text":"," 135 | }, 136 | { 137 | "startOffset":80, 138 | "endOffset":88, 139 | "text":"couldn’t" 140 | }, 141 | { 142 | "startOffset":89, 
143 | "endOffset":91, 144 | "text":"be" 145 | }, 146 | { 147 | "startOffset":92, 148 | "endOffset":96, 149 | "text":"more" 150 | }, 151 | { 152 | "startOffset":97, 153 | "endOffset":104, 154 | "text":"pleased" 155 | }, 156 | { 157 | "startOffset":105, 158 | "endOffset":109, 159 | "text":"with" 160 | }, 161 | { 162 | "startOffset":110, 163 | "endOffset":113, 164 | "text":"the" 165 | }, 166 | { 167 | "startOffset":114, 168 | "endOffset":117, 169 | "text":"new" 170 | }, 171 | { 172 | "startOffset":118, 173 | "endOffset":121, 174 | "text":"all" 175 | }, 176 | { 177 | "startOffset":121, 178 | "endOffset":122, 179 | "text":"-" 180 | }, 181 | { 182 | "startOffset":122, 183 | "endOffset":128, 184 | "text":"female" 185 | }, 186 | { 187 | "startOffset":129, 188 | "endOffset":141, 189 | "text":"Ghostbusters" 190 | }, 191 | { 192 | "startOffset":142, 193 | "endOffset":146, 194 | "text":"cast" 195 | }, 196 | { 197 | "startOffset":146, 198 | "endOffset":147, 199 | "text":"," 200 | }, 201 | { 202 | "startOffset":148, 203 | "endOffset":155, 204 | "text":"telling" 205 | }, 206 | { 207 | "startOffset":156, 208 | "endOffset":159, 209 | "text":"The" 210 | }, 211 | { 212 | "startOffset":160, 213 | "endOffset":169, 214 | "text":"Hollywood" 215 | }, 216 | { 217 | "startOffset":170, 218 | "endOffset":178, 219 | "text":"Reporter" 220 | }, 221 | { 222 | "startOffset":178, 223 | "endOffset":179, 224 | "text":"," 225 | }, 226 | { 227 | "startOffset":180, 228 | "endOffset":181, 229 | "text":"“" 230 | }, 231 | { 232 | "startOffset":181, 233 | "endOffset":184, 234 | "text":"The" 235 | }, 236 | { 237 | "startOffset":185, 238 | "endOffset":192, 239 | "text":"Aykroyd" 240 | }, 241 | { 242 | "startOffset":193, 243 | "endOffset":199, 244 | "text":"family" 245 | }, 246 | { 247 | "startOffset":200, 248 | "endOffset":202, 249 | "text":"is" 250 | }, 251 | { 252 | "startOffset":203, 253 | "endOffset":212, 254 | "text":"delighted" 255 | }, 256 | { 257 | "startOffset":213, 258 | "endOffset":215, 259 | 
"text":"by" 260 | }, 261 | { 262 | "startOffset":216, 263 | "endOffset":220, 264 | "text":"this" 265 | }, 266 | { 267 | "startOffset":221, 268 | "endOffset":232, 269 | "text":"inheritance" 270 | }, 271 | { 272 | "startOffset":233, 273 | "endOffset":235, 274 | "text":"of" 275 | }, 276 | { 277 | "startOffset":236, 278 | "endOffset":239, 279 | "text":"the" 280 | }, 281 | { 282 | "startOffset":240, 283 | "endOffset":252, 284 | "text":"Ghostbusters" 285 | }, 286 | { 287 | "startOffset":253, 288 | "endOffset":258, 289 | "text":"torch" 290 | }, 291 | { 292 | "startOffset":259, 293 | "endOffset":261, 294 | "text":"by" 295 | }, 296 | { 297 | "startOffset":262, 298 | "endOffset":267, 299 | "text":"these" 300 | }, 301 | { 302 | "startOffset":268, 303 | "endOffset":272, 304 | "text":"most" 305 | }, 306 | { 307 | "startOffset":273, 308 | "endOffset":284, 309 | "text":"magnificent" 310 | }, 311 | { 312 | "startOffset":285, 313 | "endOffset":290, 314 | "text":"women" 315 | }, 316 | { 317 | "startOffset":291, 318 | "endOffset":293, 319 | "text":"in" 320 | }, 321 | { 322 | "startOffset":294, 323 | "endOffset":300, 324 | "text":"comedy" 325 | }, 326 | { 327 | "startOffset":300, 328 | "endOffset":301, 329 | "text":"." 
330 | } 331 | ] 332 | }, 333 | "entities":{ 334 | "type":"list", 335 | "itemType":"entities", 336 | "items":[ 337 | { 338 | "mentions":[ 339 | { 340 | "startOffset":21, 341 | "endOffset":32, 342 | "source":"kb-linker", 343 | "normalized":"Dan Aykroyd" 344 | }, 345 | { 346 | "startOffset":185, 347 | "endOffset":192, 348 | "source":"statistical", 349 | "subsource":"/data/roots/rex/7.24.1.c58.3/data/statistical/eng/model-LE.bin", 350 | "normalized":"Aykroyd" 351 | } 352 | ], 353 | "headMentionIndex":0, 354 | "type":"PERSON", 355 | "entityId":"Q105221", 356 | "sentiment":[ 357 | { 358 | "label":"pos", 359 | "score":0.2378945518285036, 360 | "confidence":0.6385089278441162, 361 | "explanationSet":[ 362 | "hollywood", 363 | "wrote", 364 | "*POS_LEX*", 365 | "delighted", 366 | "reporter" 367 | ] 368 | } 369 | ] 370 | }, 371 | { 372 | "mentions":[ 373 | { 374 | "startOffset":61, 375 | "endOffset":73, 376 | "source":"kb-linker", 377 | "normalized":"Ghostbusters" 378 | }, 379 | { 380 | "startOffset":129, 381 | "endOffset":141, 382 | "source":"kb-linker", 383 | "normalized":"Ghostbusters" 384 | }, 385 | { 386 | "startOffset":240, 387 | "endOffset":252, 388 | "source":"kb-linker", 389 | "normalized":"Ghostbusters" 390 | } 391 | ], 392 | "headMentionIndex":0, 393 | "type":"PRODUCT", 394 | "entityId":"Q108745", 395 | "sentiment":[ 396 | { 397 | "label":"pos", 398 | "score":0.6744932839646935, 399 | "confidence":0.8111117726721541, 400 | "explanationSet":[ 401 | "pleased", 402 | "hollywood", 403 | "wrote", 404 | "*POS_LEX*", 405 | "new" 406 | ] 407 | } 408 | ] 409 | }, 410 | { 411 | "mentions":[ 412 | { 413 | "startOffset":156, 414 | "endOffset":178, 415 | "source":"kb-linker", 416 | "normalized":"The Hollywood Reporter" 417 | } 418 | ], 419 | "headMentionIndex":0, 420 | "type":"ORGANIZATION", 421 | "entityId":"Q61503", 422 | "sentiment":[ 423 | { 424 | "label":"pos", 425 | "score":-0.02252599410712719, 426 | "confidence":0.5338094035254866, 427 | "explanationSet":[ 428 | 
"*POS_LEX*", 429 | "new", 430 | "delighted", 431 | "reporter", 432 | "family" 433 | ] 434 | } 435 | ] 436 | } 437 | ] 438 | } 439 | }, 440 | "documentMetadata":{ 441 | "processedBy":[ 442 | "whole-document-language@10.28.73.206", 443 | "entity-extraction@10.28.177.218", 444 | "sentiment@10.28.177.218" 445 | ] 446 | } 447 | } -------------------------------------------------------------------------------- /plugin/src/test/resources/mock_responses/sentiment_response.json: -------------------------------------------------------------------------------- 1 | { 2 | "document": { 3 | "label": "pos", 4 | "confidence": 0.6410158927447778 5 | }, 6 | "entities": [] 7 | } -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 17 | 18 | 4.0.0 19 | com.rosette.elasticsearch 20 | rosette-elasticsearch-parent 21 | 7.17.0.1-SNAPSHOT 22 | pom 23 | 24 | open-source-parent 25 | com.basistech 26 | 9.0.1 27 | 28 | 29 | 2017 30 | http://rosette-api.github.io/rosette-elasticsearch-plugin 31 | 32 | scm:git:git@github.com:rosette-api/rosette-elasticsearch-plugin.git 33 | scm:git:git@github.com:rosette-api/rosette-elasticsearch-plugin.git 34 | HEAD 35 | 36 | Elasticsearch plugin parent 37 | 38 | 39 | site 40 | scm:git:git@github.com:rosette-api/rosette-elasticsearch-plugin.git 41 | 42 | 43 | 44 | 1.8 45 | ${jdk.version} 46 | ${jdk.version} 47 | 3.2.0 48 | 7.17.0 49 | 2.17.1 50 | 3.0.0-M3 51 | 1.6 52 | 3.3.0 53 | 54 | https://s01.oss.sonatype.org 55 | 1.20.0 56 | 57 | 58 | plugin 59 | docker 60 | 61 | 62 | 63 | 64 | com.basistech.rosette 65 | rosette-api 66 | ${rosette.api.binding.version} 67 | 68 | 69 | org.elasticsearch 70 | elasticsearch 71 | ${elasticsearch.version} 72 | 73 | 74 | org.apache.logging.log4j 75 | log4j-api 76 | ${log4j.version} 77 | 78 | 79 | org.apache.logging.log4j 80 | log4j-core 81 | ${log4j.version} 82 | 83 | 84 | org.elasticsearch.test 85 | 
framework 86 | ${elasticsearch.version} 87 | 88 | 89 | org.apache.commons 90 | commons-lang3 91 | 92 | 93 | 94 | 95 | 96 | 97 | install 98 | 99 | 100 | org.apache.maven.plugins 101 | maven-enforcer-plugin 102 | ${maven-enforcer-plugin.version} 103 | 104 | 105 | enforce-maven 106 | 107 | enforce 108 | 109 | 110 | 111 | 112 | 3.6.3 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | org.apache.maven.plugins 121 | maven-compiler-plugin 122 | 123 | utf-8 124 | ${jdk.version} 125 | ${jdk.version} 126 | true 127 | true 128 | 129 | 130 | 131 | org.apache.maven.plugins 132 | maven-javadoc-plugin 133 | ${maven-javadoc-plugin.version} 134 | 135 | ${jdk.version} 136 | true 137 | 138 | 139 | 140 | aggregate 141 | 142 | aggregate 143 | 144 | site 145 | 146 | 147 | plain 148 | 149 | jar 150 | 151 | package 152 | 153 | 154 | 155 | 156 | org.apache.maven.plugins 157 | maven-source-plugin 158 | 159 | 160 | attach-sources 161 | verify 162 | 163 | jar-no-fork 164 | 165 | 166 | 167 | 168 | 169 | org.apache.maven.plugins 170 | maven-release-plugin 171 | 172 | @{project.version} 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | release 181 | 182 | 183 | 184 | org.apache.maven.plugins 185 | maven-gpg-plugin 186 | ${maven-gpg-plugin.version} 187 | 188 | 189 | 190 | sign 191 | 192 | 193 | 194 | --pinentry-mode 195 | loopback 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | org.apache.maven.plugins 209 | maven-javadoc-plugin 210 | ${maven-javadoc-plugin.version} 211 | 212 | ${jdk.version} 213 | true 214 | 215 | 216 | 217 | non-aggregate 218 | 219 | javadoc 220 | 221 | 222 | 223 | aggregate 224 | 225 | aggregate 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | # Releasing the Elasticsearch plugin 2 | 3 | ## Requirements 4 | 5 | To release you need: 6 | - an OSSRH account 7 | - gpg 
installed and a PGP key published 8 | - credentials for the above added to your Maven settings file 9 | - personal OAuth access token with full repo permissions 10 | 11 | The process for this is described [here](https://github.com/RosetteTextAnalytics/rosapi1.5/blob/master/doc/release-binding.md#request-access-to-ossrh-if-not-already-done). 12 | 13 | ## Releasing 14 | 15 | A single script takes care of running the Maven release process as well as creating the release on GitHub. 16 | 17 | From the root directory of the repo, run: 18 | `./tools/release.sh ${ELASTIC_VERSION} ${GITHUB_ACCESS_TOKEN}` 19 | 20 | As an example: 21 | `./tools/release.sh 5.6.2 012345abcdef` 22 | 23 | If successful, head to https://oss.sonatype.org/#stagingRepositories to release the staged artifacts. 24 | -------------------------------------------------------------------------------- /tools/release.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #Expects two arguments: 4 | #- ELASTIC_VERSION: The version of elastic for which you are building the plugin 5 | #- GITHUB_ACCESS_TOKEN: Personal OAuth access token with full repo permissions. 6 | # 7 | # ./tools/release.sh ${ELASTIC_VERSION} ${GITHUB_ACCESS_TOKEN} 8 | 9 | set -e 10 | 11 | ELASTIC_VERSION=$1 12 | ACCESS_TOKEN=$2 13 | 14 | echo "**" 15 | echo "* Set versions so they will be incremented correctly."
# ---------------------------------------------------------------------------
# Stage 1: bump versions, run the Maven release, and verify what it produced.
#
# Reads ELASTIC_VERSION and ACCESS_TOKEN, which the top of this script assigns
# from $1 and $2. Leaves ${version} (the released plugin version) set for the
# GitHub release stage that follows.
# ---------------------------------------------------------------------------
echo "**"

# Abort before touching the POMs or git history when an argument is missing.
# An empty ELASTIC_VERSION would otherwise be written out as ".0-SNAPSHOT",
# and a missing token would only surface after the Maven release had run.
: "${ELASTIC_VERSION:?usage: ./tools/release.sh ELASTIC_VERSION GITHUB_ACCESS_TOKEN}"
: "${ACCESS_TOKEN:?usage: ./tools/release.sh ELASTIC_VERSION GITHUB_ACCESS_TOKEN}"

# The arguments are quoted so that the [..] version range is passed to Maven
# literally instead of being treated as a shell glob pattern.
mvn versions:update-property -Dproperty=elasticsearch.version "-DnewVersion=[${ELASTIC_VERSION}]" -DallowDowngrade -DgenerateBackupPoms=false
mvn versions:set "-DnewVersion=${ELASTIC_VERSION}.0-SNAPSHOT" -DgenerateBackupPoms=false
git commit -a -m "Auto-update Elasticsearch to ${ELASTIC_VERSION}"

echo "**"
echo "* First running mvn release:prepare release:perform"
echo "* (You may be asked for your ssh password)"
echo "**"
mvn -Prelease release:prepare release:perform --batch-mode

echo "**"
echo "* Now adding the release to the github repo"
echo "**"

# Read the plugin version from the filtered plugin descriptor.
descriptor="plugin/target/classes/plugin-descriptor.properties"
version=$(sed -n 's/^version=\(.*\)/\1/p' "${descriptor}")

# The plugin version is the Elasticsearch version plus one extra component
# (the snapshot set above is "${ELASTIC_VERSION}.0-SNAPSHOT"), so it must be
# non-empty and start with "${ELASTIC_VERSION}.". The previous test compared
# the two for equality and reported an error on a match, so it never fired.
case "${version}" in
    "${ELASTIC_VERSION}".?*)
        ;;
    *)
        echo "Error: version ${version} does not match Elastic version ${ELASTIC_VERSION}"
        exit 1
        ;;
esac

# Confirm the release tag is visible on GitHub. The HTTP status is checked
# rather than the page body, which is HTML and not a stable string to compare.
tag_url="https://github.com/rosette-api/rosette-elasticsearch-plugin/releases/tag/${version}"
tag_status=$(curl -sS -L -o /dev/null -w '%{http_code}' "${tag_url}")

if [ "${tag_status}" != "200" ]; then
    echo "FAILURE: Failed to find release tag for ${version}. Did mvn release run successfully?"
    echo "(HTTP status ${tag_status} from ${tag_url})"
    exit 1
fi

# Authentication uses the access token passed as the second argument; there is
# no password prompt.
echo "* Creating the github release using the supplied access token..."
# ---------------------------------------------------------------------------
# Stage 2: create the GitHub release for ${version} and attach the plugin zip.
#
# Reads ELASTIC_VERSION, ACCESS_TOKEN and version, all set earlier in this
# script.
# ---------------------------------------------------------------------------

# Fail with a clear message instead of sending a malformed request.
: "${ACCESS_TOKEN:?GitHub access token is not set}"
: "${version:?plugin version is not set}"

notes="Release compatible with Elasticsearch ${ELASTIC_VERSION}"

# Create the release. --fail is deliberately NOT used here: on an error the
# API's JSON body explains what went wrong, so it is echoed and the missing
# upload_url is used to detect failure.
# The payload is built by string concatenation, which is only safe while
# ${version} and ${notes} contain no quotes or backslashes.
response=$(curl -sS \
    -H "Content-Type: application/json" \
    -H "Authorization: token ${ACCESS_TOKEN}" \
    -d '{ "tag_name": "'"${version}"'", "name" : "'"rosette-elasticsearch-plugin-${version}"'", "body" : "'"${notes}"'" }' \
    https://api.github.com/repos/rosette-api/rosette-elasticsearch-plugin/releases)
echo "${response}"

# upload_url is a URI template ending in "{?name,label}"; strip the template
# suffix to get the base URL for asset uploads.
uploadurl=$(echo "${response}" | sed -n 's/.*"upload_url": "\(.*\){?name,label}",/\1/p')

if [ -z "${uploadurl}" ]; then
    echo "* Failed to create new release in github. Verify correct github credentials and that the release doesn't already exist. Aborting."
    exit 1
fi

filename="plugin/target/releases/rosette-elasticsearch-plugin-${version}.zip"
if [ ! -f "${filename}" ]; then
    echo "* Plugin package ${filename} not found. Aborting."
    exit 1
fi

fullurl="${uploadurl}?name=$(basename "${filename}")"
echo "* Adding plugin zip package to release assets..."
# --fail makes curl exit non-zero on an HTTP error status, so a rejected
# upload is reported instead of falling through to the success banner.
if ! response=$(curl -sS --fail \
    -H "Content-Type: application/zip" \
    -H "Authorization: token ${ACCESS_TOKEN}" \
    --data-binary @"${filename}" \
    "${fullurl}"); then
    echo "* Failed to upload ${filename} to the github release. Aborting."
    exit 1
fi

echo "**"
echo "* Release success!"
echo "* MAKE SURE to head to https://oss.sonatype.org/#stagingRepositories to release the staged artifacts."
echo "* Verify the tag and release look correct at https://github.com/rosette-api/rosette-elasticsearch-plugin."