├── LICENSE.txt ├── README.md ├── elasticsearch-analysis-annotation-0.9.zip ├── pom.xml └── src └── main ├── assemblies └── plugin.xml ├── java └── org │ └── elasticsearch │ ├── index │ └── analysis │ │ └── annotation │ │ ├── AnnotationAnalysisBinderProcessor.java │ │ ├── AnnotationAnalyzerProvider.java │ │ └── InlineAnnotationFilterFactory.java │ ├── indices │ └── analysis │ │ └── annotation │ │ ├── AnnotationIndicesAnalysis.java │ │ └── AnnotationIndicesAnalysisModule.java │ └── plugin │ └── analysis │ └── annotation │ ├── AnnotationAnalysisPlugin.java │ ├── AnnotationAnalyzer.java │ └── InlineAnnotationFilter.java └── resources └── es-plugin.properties /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 
26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. 
You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. 
(Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | elasticsearch-analysis-annotation 2 | ================================= 3 | 4 | Analysis plugin for ElasticSearch providing capability for processing inline 5 | annotations in documents. 6 | 7 | Description 8 | ----------- 9 | 10 | Inline annotations are considered to be simple semantic informations inlined in 11 | source text, that are removed from the indexed text and injected as synonyms at 12 | positions of the words they are related to. 13 | 14 | This plugin provides analyzer `AnnotationAnalyzer` as well as filter 15 | `InlineAnnotationFilter`. 16 | `AnnotationAnalyzer` is composed of `WhitespaceTokenizer`, `LowerCaseFilter` and 17 | `InlineAnnotationFilter` (with default settings). 
 18 | More sophisticated analyzers (equivalent to StandardAnalyzer or SnowballAnalyzer) 19 | can be configured via configuration file elasticsearch.yml or web API. 20 | 21 | 22 | Example 23 | ------- 24 | Let's say we have this document 25 | ``` 26 | "Mozart[artist] was born[lifeEvent] in Salzburg[city;Austria]" 27 | ``` 28 | 29 | If we parse this with StandardAnalyzer equivalent with annotation analysis added to it 30 | we get these tokens - some are omitted because a StopFilter is used. 31 | ``` 32 | | [austria] 33 | [artist] | | [lifeevent] | | [city] 34 | mozart | | born | | salzburg 35 | ``` 36 | 37 | If we use StandardAnalyzer the result would be 38 | ``` 39 | mozart | artist | | born | lifeevent | | salzburg | city | austria 40 | ``` 41 | 42 | 43 | Installation 44 | ------------ 45 | This plugin follows conventions for elasticsearch plugins, thus can be installed 46 | in a standard manner - see http://www.elasticsearch.org/guide/reference/modules/plugins/ 47 | 48 | 49 | Using this plugin 50 | ----------------- 51 | To use those custom analyzers/filters you need to either modify `elasticsearch.yml` 52 | configuration file - see http://www.elasticsearch.org/guide/reference/index-modules/analysis/ or specify 53 | index mapping via elasticsearch API. 54 | 55 | The following example configuration contains definitions for analyzers based on behaviour of 56 | StandardAnalyzer and SnowballAnalyzer. 57 | 58 | *Please note that standard_annotation and snowball_annotation analyzers use standard tokenizer, 59 | which removes all non-alphanumeric characters and thus makes it impossible to process inline 60 | annotations marked with [,],; (which are used in default behaviour of InlineAnnotationFilter).* 61 | 62 | For this purpose we need to use mapping char filter, which remaps those special characters to 63 | their equivalent, which will be accepted by standard tokenizer as part of the token. 
64 | 65 | ``` 66 | index : 67 | analysis : 68 | char_filter : 69 | annotation_remap : 70 | type : mapping 71 | mappings : ["[=>__annotation_start__", "]=>__annotation_end__",";=>__annotation_delimiter__"] 72 | analyzer : 73 | standard_annotation : 74 | type : custom 75 | tokenizer : standard 76 | char_filter : annotation_remap 77 | filter : [standard, lowercase, annotation_filter, stop] 78 | snowball_annotation : 79 | type : custom 80 | tokenizer : standard 81 | char_filter : annotation_remap 82 | filter : [standard, lowercase, annotation_filter, stop, snowball] 83 | filter : 84 | annotation_filter : 85 | type : annotation_filter 86 | start : __annotation_start__ 87 | end : __annotation_end__ 88 | delimiter : __annotation_delimiter__ 89 | ``` 90 | 91 | To test the analyzer you can query the following 92 | http://localhost:9200/test/_analyze?analyzer=annotation&text="Mozart[city;Salzburg]" 93 | 94 | Limitation 95 | ---------- 96 | Another thing to keep in mind is that you can't use word-delimiting characters inside annotations. 97 | The whole string would be treated as two tokens which would result in unexpected behaviour. 98 | 99 | Customization 100 | ------------- 101 | The InlineAnnotationFilter can be slightly customized. 
 102 | 103 | List of supported options 104 | + `start` - start delimiter for inline annotation 105 | + `end` - end delimiter for inline annotation 106 | + `prefix` - string to be prepended to synonym, that is created from inline annotation 107 | + `suffix` - string to be appended to synonym, that is created from inline annotation 108 | + `token-type` - token type of synonym 109 | + `delimiter` - delimiter for multiple inline annotations 110 | 111 | Example providing default values 112 | ``` 113 | index : 114 | analysis : 115 | analyzer : 116 | annotation : 117 | type : annotation 118 | start : [ 119 | end : ] 120 | prefix : [ 121 | suffix : ] 122 | token-type: synonym 123 | delimiter : ; 124 | ``` 125 | 126 | 127 | Elasticsearch version 128 | --------------------- 129 | This plugin was successfully tested on elasticsearch version 0.90.2 -------------------------------------------------------------------------------- /elasticsearch-analysis-annotation-0.9.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samekmichal/elasticsearch-analysis-annotation/99bd3dfdb1c9bb722b7e0dfb1645f694da8f974f/elasticsearch-analysis-annotation-0.9.zip -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | elasticsearch-analysis-annotation 6 | 4.0.0 7 | org.elasticsearch 8 | elasticsearch-analysis-annotation 9 | 1.0 10 | jar 11 | 2013 12 | 13 | 14 | The Apache Software License, Version 2.0 15 | http://www.apache.org/licenses/LICENSE-2.0.txt 16 | repo 17 | 18 | 19 | 20 | 21 | org.sonatype.oss 22 | oss-parent 23 | 7 24 | 25 | 26 | 27 | 0.90.1 28 | 4.3.0 29 | 30 | 31 | 32 | 33 | sonatype 34 | http://oss.sonatype.org/content/repositories/releases/ 35 | 36 | 37 | 38 | 39 | 40 | org.elasticsearch 41 | elasticsearch 42 | ${elasticsearch.version} 43 | compile 44 | 45 | 46 | 47 | org.apache.lucene 48 | 
lucene-core 49 | ${lucene.version} 50 | 51 | 52 | 53 | 54 | org.testng 55 | testng 56 | 6.3.1 57 | test 58 | 59 | 60 | 61 | 62 | org.apache.lucene 63 | lucene-analyzers-common 64 | ${lucene.version} 65 | 66 | 67 | com.google.guava 68 | guava 69 | 14.0.1 70 | 71 | 72 | 73 | 74 | 75 | 76 | org.apache.maven.plugins 77 | maven-compiler-plugin 78 | 2.3.2 79 | 80 | 1.6 81 | 1.6 82 | 83 | 84 | 85 | org.apache.maven.plugins 86 | maven-surefire-plugin 87 | 2.11 88 | 89 | 90 | **/*Tests.java 91 | 92 | 93 | 94 | 95 | org.apache.maven.plugins 96 | maven-source-plugin 97 | 2.1.2 98 | 99 | 100 | attach-sources 101 | 102 | jar 103 | 104 | 105 | 106 | 107 | 108 | maven-assembly-plugin 109 | 2.3 110 | 111 | false 112 | ${project.build.directory}/releases/ 113 | 114 | ${basedir}/src/main/assemblies/plugin.xml 115 | 116 | 117 | 118 | 119 | package 120 | 121 | single 122 | 123 | 124 | 125 | 126 | 127 | 128 | Analysis plugin for ElasticSearch providing capability of handling inline annotations such as: 129 | "Mozart[artist]" or "Salzburg[city;Austria]" 130 | 131 | -------------------------------------------------------------------------------- /src/main/assemblies/plugin.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | plugin 4 | 5 | zip 6 | 7 | false 8 | 9 | 10 | / 11 | true 12 | true 13 | 14 | org.elasticsearch:elasticsearch 15 | 16 | 17 | 18 | 19 | 20 | ${project.build.directory}/ 21 | / 22 | 23 | elasticsearch-${project.name}-${elasticsearch.version}.jar 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/annotation/AnnotationAnalysisBinderProcessor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to ElasticSearch and Shay Banon under one 3 | * or more contributor license agreements. 
See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. ElasticSearch licenses this 6 | * file to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | package org.elasticsearch.index.analysis.annotation; 21 | 22 | import org.elasticsearch.index.analysis.AnalysisModule; 23 | 24 | /** 25 | */ 26 | public class AnnotationAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor { 27 | 28 | @Override 29 | public void processAnalyzers(AnalyzersBindings analyzersBindings) { 30 | analyzersBindings.processAnalyzer("annotation", AnnotationAnalyzerProvider.class); 31 | } 32 | @Override 33 | public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) { 34 | tokenFiltersBindings.processTokenFilter("annotation_filter", InlineAnnotationFilterFactory.class); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/annotation/AnnotationAnalyzerProvider.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to ElasticSearch and Shay Banon under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. 
ElasticSearch licenses this 6 | * file to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | package org.elasticsearch.index.analysis.annotation; 21 | 22 | import org.elasticsearch.common.inject.Inject; 23 | import org.elasticsearch.common.inject.assistedinject.Assisted; 24 | import org.elasticsearch.common.settings.Settings; 25 | import org.elasticsearch.env.Environment; 26 | import org.elasticsearch.index.Index; 27 | import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider; 28 | import org.elasticsearch.index.settings.IndexSettings; 29 | import org.elasticsearch.plugin.analysis.annotation.AnnotationAnalyzer; 30 | import org.elasticsearch.plugin.analysis.annotation.InlineAnnotationFilter; 31 | 32 | /** 33 | */ 34 | public class AnnotationAnalyzerProvider extends AbstractIndexAnalyzerProvider { 35 | 36 | private final AnnotationAnalyzer analyzer; 37 | private final String name; 38 | 39 | @Inject 40 | public AnnotationAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) { 41 | super(index, indexSettings, name, settings); 42 | 43 | this.name = name; 44 | 45 | InlineAnnotationFilter.settings(settings, name); 46 | 47 | analyzer = new AnnotationAnalyzer(version); 48 | } 49 | 50 | @Override 51 | public AnnotationAnalyzer get() { 52 | return this.analyzer; 53 | } 54 | } 55 | 
-------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/annotation/InlineAnnotationFilterFactory.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to ElasticSearch and Shay Banon under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. ElasticSearch licenses this 6 | * file to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | 20 | package org.elasticsearch.index.analysis.annotation; 21 | 22 | import org.apache.lucene.analysis.TokenStream; 23 | import org.elasticsearch.common.inject.Inject; 24 | import org.elasticsearch.common.inject.assistedinject.Assisted; 25 | import org.elasticsearch.common.settings.Settings; 26 | import org.elasticsearch.index.Index; 27 | import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; 28 | import org.elasticsearch.index.settings.IndexSettings; 29 | import org.elasticsearch.plugin.analysis.annotation.InlineAnnotationFilter; 30 | 31 | 32 | 33 | public class InlineAnnotationFilterFactory extends AbstractTokenFilterFactory { 34 | 35 | @Inject public InlineAnnotationFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { 36 | super(index, indexSettings, name, settings); 37 | 38 | InlineAnnotationFilter.settings(settings, name); 39 | } 40 | 41 | @Override public TokenStream create(TokenStream tokenStream) { 42 | return new InlineAnnotationFilter(tokenStream); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/indices/analysis/annotation/AnnotationIndicesAnalysis.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to ElasticSearch and Shay Banon under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. ElasticSearch licenses this 6 | * file to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | package org.elasticsearch.indices.analysis.annotation; 21 | 22 | import org.apache.lucene.analysis.TokenStream; 23 | import org.elasticsearch.common.component.AbstractComponent; 24 | import org.elasticsearch.common.inject.Inject; 25 | import org.elasticsearch.common.lucene.Lucene; 26 | import org.elasticsearch.common.settings.Settings; 27 | import org.elasticsearch.index.analysis.AnalyzerScope; 28 | import org.elasticsearch.index.analysis.PreBuiltAnalyzerProviderFactory; 29 | import org.elasticsearch.index.analysis.PreBuiltTokenFilterFactoryFactory; 30 | import org.elasticsearch.index.analysis.TokenFilterFactory; 31 | import org.elasticsearch.indices.analysis.IndicesAnalysisService; 32 | import org.elasticsearch.plugin.analysis.annotation.AnnotationAnalyzer; 33 | import org.elasticsearch.plugin.analysis.annotation.InlineAnnotationFilter; 34 | 35 | /** 36 | * Registers indices level analysis components so, if not explicitly configured, 37 | * will be shared among all indices. 
38 | */ 39 | public class AnnotationIndicesAnalysis extends AbstractComponent { 40 | 41 | @Inject 42 | public AnnotationIndicesAnalysis(Settings settings, 43 | IndicesAnalysisService indicesAnalysisService) { 44 | super(settings); 45 | indicesAnalysisService.analyzerProviderFactories().put( 46 | "default", 47 | new PreBuiltAnalyzerProviderFactory("default", 48 | AnalyzerScope.INDICES, new AnnotationAnalyzer( 49 | Lucene.ANALYZER_VERSION))); 50 | 51 | indicesAnalysisService.tokenFilterFactories().put("annotation_filter", 52 | new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { 53 | @Override 54 | public String name() { 55 | return "annotation_filter"; 56 | } 57 | 58 | @Override 59 | public TokenStream create(TokenStream tokenStream) { 60 | return new InlineAnnotationFilter(tokenStream); 61 | } 62 | })); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/indices/analysis/annotation/AnnotationIndicesAnalysisModule.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to ElasticSearch and Shay Banon under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. ElasticSearch licenses this 6 | * file to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | 20 | package org.elasticsearch.indices.analysis.annotation; 21 | 22 | import org.elasticsearch.common.inject.AbstractModule; 23 | 24 | /** 25 | */ 26 | public class AnnotationIndicesAnalysisModule extends AbstractModule { 27 | 28 | @Override 29 | protected void configure() { 30 | bind(AnnotationIndicesAnalysis.class).asEagerSingleton(); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/plugin/analysis/annotation/AnnotationAnalysisPlugin.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to ElasticSearch and Shay Banon under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. ElasticSearch licenses this 6 | * file to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | 20 | package org.elasticsearch.plugin.analysis.annotation; 21 | 22 | //import com.google.common.collect.ImmutableList; 23 | import org.elasticsearch.common.collect.ImmutableList; 24 | import org.elasticsearch.common.inject.Module; 25 | import org.elasticsearch.index.analysis.AnalysisModule; 26 | import org.elasticsearch.index.analysis.annotation.AnnotationAnalysisBinderProcessor; 27 | import org.elasticsearch.indices.analysis.annotation.AnnotationIndicesAnalysisModule; 28 | import org.elasticsearch.plugins.AbstractPlugin; 29 | 30 | import java.util.Collection; 31 | 32 | /** 33 | * 34 | */ 35 | public class AnnotationAnalysisPlugin extends AbstractPlugin { 36 | 37 | @Override 38 | public String name() { 39 | return "analysis-annotation"; 40 | } 41 | 42 | @Override 43 | public String description() { 44 | return "Inline annotations analysis support"; 45 | } 46 | 47 | @Override 48 | public Collection> modules() { 49 | return ImmutableList.>of(AnnotationIndicesAnalysisModule.class); 50 | } 51 | 52 | public void onModule(AnalysisModule module) { 53 | module.addProcessor(new AnnotationAnalysisBinderProcessor()); 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/plugin/analysis/annotation/AnnotationAnalyzer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to ElasticSearch and Shay Banon under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. ElasticSearch licenses this 6 | * file to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | package org.elasticsearch.plugin.analysis.annotation; 21 | 22 | import java.io.Reader; 23 | 24 | import org.apache.lucene.analysis.Analyzer; 25 | import org.apache.lucene.analysis.TokenStream; 26 | import org.apache.lucene.analysis.Tokenizer; 27 | import org.apache.lucene.analysis.core.LowerCaseFilter; 28 | import org.apache.lucene.analysis.core.WhitespaceTokenizer; 29 | import org.apache.lucene.util.Version; 30 | 31 | /** 32 | * Analyzer for inline annotations composed of WhitespaceTokenizer, LowerCaseFilter, 33 | * and InlineAnnotationFilter (default settings). 
34 | * 35 | * @author Michal Samek, samek.michal @ gmail.com 36 | * 37 | */ 38 | public class AnnotationAnalyzer extends Analyzer { 39 | 40 | private final Version version; 41 | 42 | public AnnotationAnalyzer(Version version) { 43 | this.version = version; 44 | } 45 | 46 | 47 | @Override 48 | protected TokenStreamComponents createComponents(String fieldName, 49 | Reader reader) { 50 | Tokenizer source = new WhitespaceTokenizer(version, reader); 51 | TokenStream filter = new LowerCaseFilter(version, source); 52 | filter = new InlineAnnotationFilter(filter); 53 | 54 | return new TokenStreamComponents(source, filter); 55 | } 56 | 57 | } 58 | 59 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/plugin/analysis/annotation/InlineAnnotationFilter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to ElasticSearch and Shay Banon under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. ElasticSearch licenses this 6 | * file to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | 20 | package org.elasticsearch.plugin.analysis.annotation; 21 | 22 | import java.io.IOException; 23 | import java.util.Stack; 24 | 25 | import org.apache.lucene.analysis.TokenFilter; 26 | import org.apache.lucene.analysis.TokenStream; 27 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 28 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; 29 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 30 | import org.apache.lucene.util.AttributeSource; 31 | import org.elasticsearch.ElasticSearchIllegalArgumentException; 32 | import org.elasticsearch.common.settings.Settings; 33 | 34 | /** 35 | * Characters surrounded by '[' SYNONYM_START_DELIMITER and ']' are considered 36 | * to be a synonym to the word located before the opening '['. 37 | * eg 38 | * "W. A. Mozart[artist]" 39 | * "Salzburg[city;Austria]" 40 | * 41 | * The surrounding brackets can be changed by assigning desired characters to 42 | * SYNONYM_START_DELIMITER or SYNONYM_END_DELIMITER respectively. 43 | * 44 | * If more synonyms are present, they are delimited by SYNONYMS_DELIMITER, which 45 | * is by default ';'. 46 | * 47 | * If it is desired to surround the resulting synonym with prefix/suffix, set 48 | * SYNONYMS_PREFIX or SYNONYMS_SUFIX. 
49 | * 50 | * @author Michal Samek, samek.michal @ gmail.com 51 | * 52 | */ 53 | 54 | public class InlineAnnotationFilter extends TokenFilter { 55 | public static String SYNONYM_TOKEN_TYPE = ""; 56 | 57 | public static String SYNONYM_START_DELIMITER = "["; 58 | public static String SYNONYM_END_DELIMITER = "]"; 59 | public static String SYNONYMS_DELIMITER = ";"; 60 | public static String SYNONYM_PREFIX = "["; 61 | public static String SYNONYM_SUFFIX = "]"; 62 | 63 | private Stack synonymStack; 64 | private AttributeSource.State current; 65 | private final CharTermAttribute termAtt; 66 | private final PositionIncrementAttribute posIncrAtt; 67 | private final TypeAttribute typeAtt; 68 | 69 | 70 | public InlineAnnotationFilter(TokenStream input) { 71 | super(input); 72 | synonymStack = new Stack(); 73 | this.termAtt = addAttribute(CharTermAttribute.class); 74 | this.posIncrAtt = addAttribute(PositionIncrementAttribute.class); 75 | this.typeAtt = addAttribute(TypeAttribute.class); 76 | } 77 | 78 | @Override 79 | public boolean incrementToken() throws IOException { 80 | if (synonymStack.size() > 0) { 81 | popAliasFromStack(); 82 | return true; 83 | } 84 | 85 | if (!input.incrementToken()) { 86 | return false; 87 | } 88 | 89 | if (addAliasesToStack()) { 90 | if (termAtt.length() == 0) 91 | { 92 | popAliasFromStack(); 93 | } 94 | } 95 | 96 | current = captureState(); 97 | return true; 98 | } 99 | 100 | private void popAliasFromStack() 101 | { 102 | String syn = SYNONYM_PREFIX + synonymStack.pop() + SYNONYM_SUFFIX; 103 | restoreState(current); 104 | termAtt.copyBuffer(syn.toCharArray(), 0, syn.length()); 105 | typeAtt.setType(SYNONYM_TOKEN_TYPE); 106 | posIncrAtt.setPositionIncrement(0); 107 | } 108 | 109 | 110 | /** 111 | * Checks whether current token has synonyms appended. If it has, then they 112 | * are pushed on the synonymStack. 
113 | * 114 | * @return true if synonyms were found, otherwise false 115 | */ 116 | private boolean addAliasesToStack() { 117 | String buffer = termAtt.toString(); 118 | String synonyms = null; 119 | int length = buffer.length(); 120 | 121 | searchingLoop: 122 | for (int i = 0; i < length; i++) { 123 | if (buffer.startsWith(SYNONYM_START_DELIMITER,i)) { 124 | 125 | // It might not be necessary to search for closing delimiter 126 | int synonyms_start = i + SYNONYM_START_DELIMITER.length(); 127 | for (int j = synonyms_start; j < length; j++) { 128 | if (buffer.startsWith(SYNONYM_END_DELIMITER,j)) { 129 | synonyms = buffer.substring(synonyms_start, j); 130 | termAtt.setLength(i); 131 | break searchingLoop; 132 | } 133 | } 134 | } 135 | } 136 | 137 | // No synonyms have been found 138 | if (synonyms == null) { 139 | return false; 140 | } 141 | 142 | 143 | int beginIndex = 0; 144 | int endIndex = -1; 145 | while ((endIndex = synonyms.indexOf(SYNONYMS_DELIMITER, beginIndex)) != -1) { 146 | synonymStack.push(synonyms.substring(beginIndex, endIndex).trim()); 147 | beginIndex = endIndex+SYNONYMS_DELIMITER.length(); 148 | } 149 | 150 | // Single synonym, which is not ended by SYNONYMS_DELIMITER, eq [artist] 151 | // Last synonym, which is not ended by SYNONYMS_DELIMITER, eq [city;Austria] 152 | // For [city;Austria;] the beginIndex will be set to index, that is equal to the string length 153 | if (beginIndex < synonyms.length() && synonyms.length() > 0) { 154 | synonymStack.push(synonyms.substring(beginIndex).trim()); 155 | } 156 | return true; 157 | } 158 | 159 | 160 | /** 161 | * Process settings passed by ElasticSearch during initialization. 
162 | * Recognised settings are: 163 | * start - start delimiter for inline annotation 164 | * end - end delimiter for inline annotation 165 | * prefix - string to be prepended to synonym, that is created from inline annotation 166 | * suffix - string to be apended to synonym, that is created from inline annotation 167 | * token-type - token type of synonym 168 | * delimiter - delimiter for multiple inline annotations 169 | * @param settings 170 | * @param name - logical name of the analyzer 171 | */ 172 | public static void settings(Settings settings, String name) { 173 | String start_delim, end_delim, syn_prefix, syn_suffix, delimiter, token_type; 174 | start_delim = settings.get("start"); 175 | end_delim = settings.get("end"); 176 | syn_prefix = settings.get("prefix"); 177 | syn_suffix = settings.get("suffix"); 178 | delimiter = settings.get("delimiter"); 179 | token_type = settings.get("token-type"); 180 | 181 | 182 | if (start_delim != null) { 183 | if (start_delim.length() == 0) { 184 | throw new ElasticSearchIllegalArgumentException( 185 | "Analyzer " + name + " has invalid settings: start " + 186 | "delimiter cannot be empty string"); 187 | } 188 | InlineAnnotationFilter.SYNONYM_START_DELIMITER = start_delim; 189 | } 190 | 191 | if (end_delim != null) { 192 | if (end_delim.length() == 0) { 193 | throw new ElasticSearchIllegalArgumentException( 194 | "Analyzer " + name + " has invalid settings: end " + 195 | "delimiter cannot be empty string"); 196 | } 197 | InlineAnnotationFilter.SYNONYM_END_DELIMITER = end_delim; 198 | } 199 | 200 | if (syn_prefix != null) { 201 | InlineAnnotationFilter.SYNONYM_PREFIX = syn_prefix; 202 | } 203 | 204 | if (syn_suffix != null) { 205 | InlineAnnotationFilter.SYNONYM_SUFFIX = syn_suffix; 206 | } 207 | 208 | if (delimiter != null) { 209 | if (delimiter.length() == 0) { 210 | throw new ElasticSearchIllegalArgumentException( 211 | "Analyzer " + name + " has invalid settings: " + 212 | "delimiter cannot be empty string"); 213 | } 
214 | InlineAnnotationFilter.SYNONYMS_DELIMITER = delimiter; 215 | } 216 | 217 | if (token_type != null) { 218 | InlineAnnotationFilter.SYNONYM_TOKEN_TYPE = token_type; 219 | } 220 | } 221 | } 222 | -------------------------------------------------------------------------------- /src/main/resources/es-plugin.properties: -------------------------------------------------------------------------------- 1 | plugin=org.elasticsearch.plugin.analysis.annotation.AnnotationAnalysisPlugin --------------------------------------------------------------------------------