├── src ├── test │ ├── resources │ │ ├── synonym.txt │ │ └── log4j2.xml │ └── java │ │ └── com.bellszhu.elasticsearch.plugin │ │ └── DynamicSynonymPluginTest.java └── main │ ├── resources │ ├── plugin-security.policy │ └── plugin-descriptor.properties │ ├── java │ └── com │ │ └── bellszhu │ │ └── elasticsearch │ │ └── plugin │ │ ├── synonym │ │ └── analysis │ │ │ ├── SynonymFile.java │ │ │ ├── AbsSynonymFilter.java │ │ │ ├── DynamicSynonymGraphTokenFilterFactory.java │ │ │ ├── LocalSynonymFile.java │ │ │ ├── DynamicSynonymTokenFilterFactory.java │ │ │ ├── RemoteSynonymFile.java │ │ │ ├── DynamicSynonymGraphFilter.java │ │ │ └── DynamicSynonymFilter.java │ │ └── DynamicSynonymPlugin.java │ └── assemblies │ └── plugin.xml ├── .dockerignore ├── .gitignore ├── Dockerfile ├── Makefile ├── .github └── workflows │ └── build-and-publish-docker-image.yml ├── README.md └── pom.xml /src/test/resources/synonym.txt: -------------------------------------------------------------------------------- 1 | 金拱门,肯德基,KFC 2 | america, united states, meiguo -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # Ignore everything 2 | * 3 | 4 | # Except 5 | !pom.xml 6 | !src/main -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .settings 2 | .classpath 3 | .project 4 | .idea 5 | .DS_Store 6 | /target 7 | *.iml -------------------------------------------------------------------------------- /src/main/resources/plugin-security.policy: -------------------------------------------------------------------------------- 1 | grant { 2 | permission java.net.SocketPermission "*", "connect,resolve"; 3 | }; -------------------------------------------------------------------------------- /src/test/resources/log4j2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM maven:3.9.2-eclipse-temurin-8-alpine AS build 2 | 3 | RUN apk update \ 4 | && apk add zip 5 | 6 | WORKDIR /app 7 | 8 | COPY . 
./ 9 | 10 | RUN mvn package \ 11 | && unzip target/releases/elasticsearch-analysis-dynamic-synonym-*.zip -d target/extracted 12 | 13 | FROM docker.elastic.co/elasticsearch/elasticsearch:8.7.1 14 | 15 | COPY --from=build --chown=elasticsearch:elasticsearch /app/target/extracted /usr/share/elasticsearch/plugins/dynamic-synonym/ 16 | -------------------------------------------------------------------------------- /src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/SynonymFile.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | package com.bellszhu.elasticsearch.plugin.synonym.analysis; 5 | 6 | import java.io.Reader; 7 | 8 | import org.apache.lucene.analysis.synonym.SynonymMap; 9 | 10 | /** 11 | * @author bellszhu 12 | */ 13 | public interface SynonymFile { 14 | 15 | SynonymMap reloadSynonymMap(); 16 | 17 | boolean isNeedReloadSynonymMap(); 18 | 19 | Reader getReader(); 20 | 21 | } -------------------------------------------------------------------------------- /src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/AbsSynonymFilter.java: -------------------------------------------------------------------------------- 1 | package com.bellszhu.elasticsearch.plugin.synonym.analysis; 2 | 3 | import org.apache.lucene.analysis.TokenFilter; 4 | import org.apache.lucene.analysis.TokenStream; 5 | import org.apache.lucene.analysis.synonym.SynonymMap; 6 | 7 | /** 8 | * @author bellszhu 9 | */ 10 | public abstract class AbsSynonymFilter extends TokenFilter { 11 | /** 12 | * Construct a token stream filtering the given input. 13 | * 14 | * @param input the input token stream to filter 15 | */ 16 | protected AbsSynonymFilter(TokenStream input) { 17 | super(input); 18 | } 19 | 20 | abstract void update(SynonymMap synonymMap); 21 | } 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | image_host ?= 2 | 3 | image_tag ?= 1.0.0 4 | 5 | .PHONY: help build_image push_image build_image_and_push 6 | 7 | help: ## Display this help message. 8 | @echo "Please use \`make <target>\` where <target> is one of" 9 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; \ 10 | {printf "\033[36m%-40s\033[0m %s\n", $$1, $$2}' 11 | 12 | build_image: ## Build docker image. Use `image_host` to override the default image host and `image_tag` to do the same for image tag. 13 | docker build -t $(image_host)dynamic-synonym-elasticsearch:$(image_tag) -t $(image_host)dynamic-synonym-elasticsearch:latest . 14 | 15 | push_image: ## Push docker image. Use `image_host` to override the default image host. 16 | docker push -a $(image_host)dynamic-synonym-elasticsearch 17 | 18 | build_image_and_push: build_image push_image ## Build and push docker image. Use `image_host` to override the default image host and `image_tag` to do the same for image tag. 
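For example, to build and push against a private registry (host and tag are placeholders; `image_host` is prepended verbatim to the image name, so it must end with a slash):

    make build_image image_host=registry.example.com/ image_tag=1.2.3
    make push_image image_host=registry.example.com/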
-------------------------------------------------------------------------------- /.github/workflows/build-and-publish-docker-image.yml: -------------------------------------------------------------------------------- 1 | name: Build and push docker image 2 | 3 | on: 4 | push: 5 | tags: [ v*.*.* ] 6 | 7 | jobs: 8 | build-n-publish: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Clone repository 12 | uses: actions/checkout@v2 13 | 14 | - name: Get tag version 15 | if: startsWith(github.ref, 'refs/tags/') 16 | id: get_tag_version 17 | run: echo "TAG=${GITHUB_REF/refs\/tags\//}" >> "$GITHUB_OUTPUT" 18 | 19 | - name: Docker - authenticate 20 | if: startsWith(github.ref, 'refs/tags') 21 | uses: docker/login-action@v1 22 | with: 23 | username: ${{ secrets.DOCKER_USERNAME }} 24 | password: ${{ secrets.DOCKER_PASSWORD }} 25 | 26 | - name: Docker - build and push images 27 | if: startsWith(github.ref, 'refs/tags') 28 | run: make build_image_and_push image_host=${{ env.IMAGE_HOST_SLASH_APPENDED }} image_tag=${{ steps.get_tag_version.outputs.TAG }} -------------------------------------------------------------------------------- /src/main/assemblies/plugin.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0"?> 2 | <assembly> 3 | <id>-</id> 4 | <formats> 5 | <format>zip</format> 6 | </formats> 7 | <includeBaseDirectory>false</includeBaseDirectory> 8 | <files> 9 | <file> 10 | <source>${project.basedir}/src/main/resources/plugin-descriptor.properties</source> 11 | <filtered>true</filtered> 12 | </file> 13 | <file> 14 | <source>${project.basedir}/src/main/resources/plugin-security.policy</source> 15 | <filtered>true</filtered> 16 | </file> 17 | </files> 18 | <dependencySets> 19 | <dependencySet> 20 | <useProjectArtifact>true</useProjectArtifact> 21 | <useTransitiveFiltering>true</useTransitiveFiltering> 22 | <excludes> 23 | <exclude>org.elasticsearch:elasticsearch</exclude> 24 | </excludes> 25 | </dependencySet> 26 | <dependencySet> 27 | <useProjectArtifact>true</useProjectArtifact> 28 | <useTransitiveFiltering>true</useTransitiveFiltering> 29 | <includes> 30 | <include>org.apache.httpcomponents.client5:httpclient5</include> 31 | </includes> 32 | </dependencySet> 33 | </dependencySets> 34 | </assembly> -------------------------------------------------------------------------------- /src/main/java/com/bellszhu/elasticsearch/plugin/DynamicSynonymPlugin.java: -------------------------------------------------------------------------------- 1 | package com.bellszhu.elasticsearch.plugin; 2 | 3 | import static org.elasticsearch.plugins.AnalysisPlugin.requiresAnalysisSettings; 4 | 5 | import java.util.HashMap; 6 | import java.util.Map; 7 | 8 | import org.elasticsearch.index.analysis.TokenFilterFactory; 9 | import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; 10 | import org.elasticsearch.plugins.AnalysisPlugin; 11 | import org.elasticsearch.plugins.Plugin; 12 | 13 | import com.bellszhu.elasticsearch.plugin.synonym.analysis.DynamicSynonymGraphTokenFilterFactory; 14 | import com.bellszhu.elasticsearch.plugin.synonym.analysis.DynamicSynonymTokenFilterFactory; 15 | 16 | 17 | /** 18 | * @author bellszhu 19 | */ 20 | public class DynamicSynonymPlugin extends Plugin implements AnalysisPlugin { 21 | 22 | @Override 23 | public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() { 24 | Map<String, AnalysisProvider<TokenFilterFactory>> extra = new HashMap<>(); 25 | extra.put("dynamic_synonym", requiresAnalysisSettings((indexSettings, env, name, settings) -> new DynamicSynonymTokenFilterFactory(env, name, settings))); 26 | extra.put("dynamic_synonym_graph", requiresAnalysisSettings((indexSettings, env, name, settings) -> new DynamicSynonymGraphTokenFilterFactory(env, name, settings))); 27 | return extra; 28 | } 29 | } -------------------------------------------------------------------------------- /src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymGraphTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package com.bellszhu.elasticsearch.plugin.synonym.analysis; 2 | 3 | import java.io.IOException; 4 | import java.util.List; 5 | import 
java.util.function.Function; 6 | 7 | import org.apache.lucene.analysis.Analyzer; 8 | import org.apache.lucene.analysis.TokenStream; 9 | import org.elasticsearch.common.settings.Settings; 10 | import org.elasticsearch.env.Environment; 11 | import org.elasticsearch.index.analysis.AnalysisMode; 12 | import org.elasticsearch.index.analysis.CharFilterFactory; 13 | import org.elasticsearch.index.analysis.TokenFilterFactory; 14 | import org.elasticsearch.index.analysis.TokenizerFactory; 15 | 16 | public class DynamicSynonymGraphTokenFilterFactory extends DynamicSynonymTokenFilterFactory { 17 | 18 | public DynamicSynonymGraphTokenFilterFactory( 19 | Environment env, String name, Settings settings 20 | ) throws IOException { 21 | super(env, name, settings); 22 | } 23 | 24 | @Override 25 | public TokenStream create(TokenStream tokenStream) { 26 | throw new IllegalStateException( 27 | "Call createPerAnalyzerSynonymGraphFactory to specialize this factory for an analysis chain first" 28 | ); 29 | } 30 | 31 | @Override 32 | public TokenFilterFactory getChainAwareTokenFilterFactory( 33 | TokenizerFactory tokenizer, List<CharFilterFactory> charFilters, 34 | List<TokenFilterFactory> previousTokenFilters, 35 | Function<String, TokenFilterFactory> allFilters 36 | ) { 37 | final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters); 38 | synonymMap = buildSynonyms(analyzer); 39 | final String name = name(); 40 | return new TokenFilterFactory() { 41 | @Override 42 | public String name() { 43 | return name; 44 | } 45 | 46 | @Override 47 | public TokenStream create(TokenStream tokenStream) { 48 | // fst is null means no synonyms 49 | if (synonymMap.fst == null) { 50 | return tokenStream; 51 | } 52 | DynamicSynonymGraphFilter dynamicSynonymGraphFilter = new DynamicSynonymGraphFilter( 53 | tokenStream, synonymMap, false); 54 | dynamicSynonymFilters.put(dynamicSynonymGraphFilter, 1); 55 | 56 | return dynamicSynonymGraphFilter; 57 | } 58 | 59 | @Override 60 | public AnalysisMode getAnalysisMode() { 61 | return analysisMode; 62 | } 63 | }; 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dynamic Synonym for ElasticSearch 2 | 3 | The dynamic synonym plugin adds a synonym token filter that reloads the synonym file (local file or remote file) at given intervals (default 60s). 4 | 5 | ## Version 6 | 7 | | dynamic synonym version | ES version | 8 | |-------------------------|---------------| 9 | | master | 8.x -> master | 10 | | 7.4.2 | 7.4.2 | 11 | | 6.1.4 | 6.1.4 | 12 | | 5.2.0 | 5.2.0 | 13 | | 5.1.1 | 5.1.1 | 14 | | 2.3.0 | 2.3.0 | 15 | | 2.2.0 | 2.2.0 | 16 | | 2.1.0 | 2.1.0 | 17 | | 2.0.0 | 2.0.0 | 18 | | 1.6.0 | 1.6.X | 19 | 20 | ## Installation 21 | 22 | 1. `mvn package` 23 | 24 | 2. Copy and unzip `target/releases/elasticsearch-analysis-dynamic-synonym-{version}.zip` to `your-es-root/plugins/dynamic-synonym` 25 | 26 | ## Example 27 | 28 | ```json 29 | { 30 | "index" : { 31 | "analysis" : { 32 | "analyzer" : { 33 | "synonym" : { 34 | "tokenizer" : "whitespace", 35 | "filter" : ["remote_synonym"] 36 | } 37 | }, 38 | "filter" : { 39 | "remote_synonym" : { 40 | "type" : "dynamic_synonym", 41 | "synonyms_path" : "http://host:port/synonym.txt", 42 | "interval": 30 43 | }, 44 | "local_synonym" : { 45 | "type" : "dynamic_synonym", 46 | "synonyms_path" : "synonym.txt" 47 | }, 48 | "synonym_graph" : { 49 | "type" : "dynamic_synonym_graph", 50 | "synonyms_path" : "http://host:port/synonym.txt" 51 | } 52 | } 53 | } 54 | } 55 | } 56 | ``` 57 | ### Configuration 58 | 59 | `type`: `dynamic_synonym` or `dynamic_synonym_graph`, *mandatory* 60 | 61 | `synonyms_path`: A file path relative to the Elasticsearch config directory, or a URL, *mandatory* 62 | 63 | `interval`: Refresh interval in seconds for the synonym file, default: `60`, *optional* 64 | 65 | `ignore_case`: Ignore case in the synonyms file, default: `false`, *optional* 66 | 67 | `expand`: Expand equivalent synonyms (with `expand: true`, a rule like `a, b` maps each term to both), default: `true`, *optional* 68 | 69 | `lenient`: Ignore malformed synonym rules instead of failing, default: `false`, *optional* 70 | 71 | `format`: Synonym file format, default: `''` (Solr format), *optional*. For WordNet structure this can be set to `'wordnet'` 72 | 73 | 74 | ## Update mechanism 75 | 76 | * Local files: determined by the modification time of the file; if it has changed, the synonyms will be reloaded. 77 | * Remote files: reads the `Last-Modified` and `ETag` HTTP headers. If one of these changes, the synonyms will be reloaded. 78 | 79 | **Note:** The synonym file must be a UTF-8 encoded text file.
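Since the remote update mechanism is driven entirely by those two headers, any HTTP server that returns `Last-Modified`/`ETag` on both `HEAD` and `GET` can serve the synonym file. Below is a minimal illustrative sketch using the JDK's built-in server (class name, port, and file location are assumptions, not part of the plugin):

```java
import com.sun.net.httpserver.HttpServer;

import java.net.InetSocketAddress;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.Instant;
import java.time.ZoneOffset;
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;

public class SynonymFileServer {
    public static void main(String[] args) throws Exception {
        Path file = Path.of("synonym.txt");
        HttpServer server = HttpServer.create(new InetSocketAddress(8080), 0);
        server.createContext("/synonym.txt", exchange -> {
            long mtime = Files.getLastModifiedTime(file).toMillis();
            // The plugin polls with HEAD and reloads when either header changes.
            exchange.getResponseHeaders().set("Last-Modified",
                    DateTimeFormatter.RFC_1123_DATE_TIME.format(
                            ZonedDateTime.ofInstant(Instant.ofEpochMilli(mtime), ZoneOffset.UTC)));
            exchange.getResponseHeaders().set("ETag", "\"" + mtime + "\"");
            exchange.getResponseHeaders().set("Content-Type", "text/plain; charset=UTF-8");
            if ("HEAD".equalsIgnoreCase(exchange.getRequestMethod())) {
                exchange.sendResponseHeaders(200, -1); // headers only
            } else {
                byte[] body = Files.readAllBytes(file);
                exchange.sendResponseHeaders(200, body.length);
                exchange.getResponseBody().write(body);
            }
            exchange.close();
        });
        server.start();
    }
}
```

With `synonyms_path` set to `http://localhost:8080/synonym.txt`, edits to the local `synonym.txt` are picked up on the plugin's next poll.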
-------------------------------------------------------------------------------- /src/main/resources/plugin-descriptor.properties: -------------------------------------------------------------------------------- 1 | # Elasticsearch plugin descriptor file 2 | # This file must exist as 'plugin-descriptor.properties' at 3 | # the root directory of all plugins. 4 | # 5 | # A plugin can be 'site', 'jvm', or both. 6 | # 7 | ### example site plugin for "foo": 8 | # 9 | # foo.zip <-- zip file for the plugin, with this structure: 10 | # _site/ <-- the contents that will be served 11 | # plugin-descriptor.properties <-- example contents below: 12 | # 13 | # site=true 14 | # description=My cool plugin 15 | # version=1.0 16 | # 17 | ### example jvm plugin for "foo" 18 | # 19 | # foo.zip <-- zip file for the plugin, with this structure: 20 | # <arbitrary name1>.jar <-- classes, resources, dependencies 21 | # <arbitrary nameN>.jar <-- any number of jars 22 | # plugin-descriptor.properties <-- example contents below: 23 | # 24 | # jvm=true 25 | # classname=foo.bar.BazPlugin 26 | # description=My cool plugin 27 | # version=2.0.0-rc1 28 | # elasticsearch.version=2.0 29 | # java.version=1.7 30 | # 31 | ### mandatory elements for all plugins: 32 | # 33 | # 'description': simple summary of the plugin 34 | description=${project.description} 35 | # 36 | # 'version': plugin's version 37 | version=${project.version} 38 | # 39 | # 'name': the plugin name 40 | name=${elasticsearch.plugin.name} 41 | 42 | ### mandatory elements for site plugins: 43 | # 44 | # 'site': set to true to indicate contents of the _site/ 45 | # directory in the root of the plugin should be served. 
46 | # site=${elasticsearch.plugin.site} 47 | # 48 | ### mandatory elements for jvm plugins : 49 | # 50 | # 'jvm': true if the 'classname' class should be loaded 51 | # from jar files in the root directory of the plugin. 52 | # Note that only jar files in the root directory are 53 | # added to the classpath for the plugin! If you need 54 | # other resources, package them into a resources jar. 55 | # jvm=${elasticsearch.plugin.jvm} 56 | # 57 | # 'classname': the name of the class to load, fully-qualified. 58 | classname=${elasticsearch.plugin.classname} 59 | # 60 | # 'java.version' version of java the code is built against 61 | # use the system property java.specification.version 62 | # version string must be a sequence of nonnegative decimal integers 63 | # separated by "."'s and may have leading zeros 64 | java.version=${maven.compiler.target} 65 | # 66 | # 'elasticsearch.version' version of elasticsearch compiled against 67 | # You will have to release a new version of the plugin for each new 68 | # elasticsearch release. This version is checked when the plugin 69 | # is loaded so Elasticsearch will refuse to start in the presence of 70 | # plugins with the incorrect elasticsearch.version. 71 | elasticsearch.version=${elasticsearch.version} 72 | # 73 | ### deprecated elements for jvm plugins : 74 | # 75 | # 'isolated': true if the plugin should have its own classloader. 76 | # passing false is deprecated, and only intended to support plugins 77 | # that have hard dependencies against each other. If this is 78 | # not specified, then the plugin is isolated by default. 79 | # isolated=${elasticsearch.plugin.isolated} 80 | #
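For reference, after Maven resource filtering the rendered descriptor for this build would look roughly like this (values taken from pom.xml; a sketch, not generated output):

    description=Analysis-plugin for synonym
    version=8.7.1
    name=analysis-dynamic-synonym
    classname=com.bellszhu.elasticsearch.plugin.DynamicSynonymPlugin
    java.version=17
    elasticsearch.version=8.7.1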
-------------------------------------------------------------------------------- /src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/LocalSynonymFile.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | package com.bellszhu.elasticsearch.plugin.synonym.analysis; 5 | 6 | import java.io.BufferedReader; 7 | import java.io.File; 8 | import java.io.IOException; 9 | import java.io.InputStreamReader; 10 | import java.io.Reader; 11 | import java.io.StringReader; 12 | import java.nio.charset.StandardCharsets; 13 | import java.nio.file.Files; 14 | import java.nio.file.Path; 15 | 16 | import org.apache.logging.log4j.LogManager; 17 | import org.apache.logging.log4j.Logger; 18 | import org.apache.lucene.analysis.Analyzer; 19 | import org.apache.lucene.analysis.synonym.SynonymMap; 20 | import org.elasticsearch.env.Environment; 21 | 22 | 23 | /** 24 | * @author bellszhu 25 | */ 26 | public class LocalSynonymFile implements SynonymFile { 27 | 28 | private static final Logger logger = LogManager.getLogger("dynamic-synonym"); 29 | 30 | private String format; 31 | 32 | private boolean expand; 33 | 34 | private boolean lenient; 35 | 36 | private Analyzer analyzer; 37 | 38 | private Environment env; 39 | 40 | /** 41 | * Local file path relative to the config directory 42 | */ 43 | private String location; 44 | 45 | private Path synonymFilePath; 46 | 47 | private long lastModified; 48 | 49 | LocalSynonymFile(Environment env, Analyzer analyzer, boolean expand, boolean lenient, 50 | String format, String location) { 51 | this.analyzer = analyzer; 52 | this.expand = expand; 53 | this.lenient = lenient; 54 | this.format = format; 55 | this.env = env; 56 | this.location = location; 57 | 58 | this.synonymFilePath = deepSearch(); 59 | isNeedReloadSynonymMap(); // primes lastModified so the monitor only reloads on later changes 60 | } 61 | 62 | @Override 63 | public SynonymMap reloadSynonymMap() { 64 | try { 65 | logger.debug("start reload local synonym from {}.", synonymFilePath); 66 | Reader rulesReader = getReader(); 67 | SynonymMap.Builder parser = RemoteSynonymFile.getSynonymParser( 68 | rulesReader, format, expand, lenient, analyzer); 69 | return parser.build(); 70 | } catch (Exception e) { 71 | logger.error("reload local synonym {} error!", synonymFilePath, e); 72 | throw new IllegalArgumentException( 73 | "could not reload local synonyms file to build synonyms", e); 74 | } 75 | 76 | } 77 | 78 | /* 79 | The file may be deleted while it is being read; return an empty 80 | reader if it no longer exists. 81 | A low-probability event. 82 | */ 83 | public Reader getReader() { 84 | if (!Files.exists(synonymFilePath)) { 85 | return new StringReader(""); 86 | } 87 | try (BufferedReader br = new BufferedReader(new InputStreamReader( 88 | synonymFilePath.toUri().toURL().openStream(), StandardCharsets.UTF_8))) { 89 | StringBuilder sb = new StringBuilder(); 90 | String line; 91 | while ((line = br.readLine()) != null) { 92 | // logger.info("reload local synonym: {}", line); 93 | sb.append(line).append(System.getProperty("line.separator")); 94 | } 95 | return new StringReader(sb.toString()); 96 | } catch (IOException e) { 97 | logger.error("get local synonym reader {} error!", location, e); 98 | // throw new IllegalArgumentException( 99 | // "IOException while reading local synonyms file", e); 100 | // Fix #54 Returns blank if synonym file has been deleted. 101 | return new StringReader(""); 102 | } 103 | } 104 | 105 | @Override 106 | public boolean isNeedReloadSynonymMap() { 107 | try { 108 | /* 109 | If the file does not exist, it will be scanned every time 110 | until the file is restored. 111 | */ 112 | if (!Files.exists(synonymFilePath) && !Files.exists(synonymFilePath = deepSearch())) { 113 | return false; 114 | } 115 | File synonymFile = synonymFilePath.toFile(); 116 | if (synonymFile.exists() 117 | && lastModified < synonymFile.lastModified()) { 118 | lastModified = synonymFile.lastModified(); 119 | return true; 120 | } 121 | } catch (Exception e) { 122 | logger.error("check need reload local synonym {} error!", location, e); 123 | } 124 | 125 | return false; 126 | } 127 | 128 | /** 129 | * Deep search for the synonym file. 130 | * Step 1. Treat the 'synonyms_path' parameter as an absolute path 131 | * Step 2. Query the es config path 132 | * Step 3. Query in current relative path 133 | *

134 | * Override this method to extend the search path 135 | * 136 | * @return the synonym path. 137 | */ 138 | protected Path deepSearch() { 139 | return env.configFile().resolve(location); 140 | // // TODO 141 | // SpecialPermission.check(); 142 | // return AccessController.doPrivileged((PrivilegedAction<Path>) () -> { 143 | // return env.configFile().resolve(location); 144 | //// // access denied:java.io.FilePermission 145 | //// Path path; 146 | //// // Load setting config as absolute path 147 | //// if (Files.exists(Paths.get(location))) { // access denied:java.io.FilePermission 148 | //// path = Paths.get(location); 149 | //// // Load from setting config path 150 | //// } else if (Files.exists(env.configFile().resolve(location))) { 151 | //// path = env.configFile().resolve(location); 152 | //// // Load from current relative path 153 | //// } else { 154 | //// URL url = getClass().getClassLoader().getResource(location); 155 | //// if (url != null) { 156 | //// path = Paths.get(url.getFile()); 157 | //// } else { 158 | //// path = env.configFile().resolve(location); 159 | //// } 160 | //// } 161 | //// return path; 162 | // }); 163 | } 164 | } -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 2 | xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 3 | <modelVersion>4.0.0</modelVersion> 4 | 5 | <groupId>com.bellszhu.elasticsearch</groupId> 6 | <artifactId>elasticsearch-analysis-dynamic-synonym</artifactId> 7 | <version>8.7.1</version> 8 | <packaging>jar</packaging> 9 | <name>elasticsearch-dynamic-synonym</name> 10 | <description>Analysis-plugin for synonym</description> 11 | 12 | <properties> 13 | <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> 14 | <elasticsearch.version>${project.version}</elasticsearch.version> 15 | <maven.compiler.target>17</maven.compiler.target> 16 | <elasticsearch.plugin.name>analysis-dynamic-synonym</elasticsearch.plugin.name> 17 | <elasticsearch.assembly.descriptor>${project.basedir}/src/main/assemblies/plugin.xml</elasticsearch.assembly.descriptor> 18 | 19 | <elasticsearch.plugin.classname>com.bellszhu.elasticsearch.plugin.DynamicSynonymPlugin</elasticsearch.plugin.classname> 20 | 21 | <elasticsearch.plugin.jvm>true</elasticsearch.plugin.jvm> 22 | </properties> 23 | 24 | <licenses> 25 | <license> 26 | <name>The Apache Software License, Version 2.0</name> 27 | <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url> 28 | <distribution>repo</distribution> 29 | </license> 30 | </licenses> 31 | 32 | <parent> 33 | <groupId>org.sonatype.oss</groupId> 34 | <artifactId>oss-parent</artifactId> 35 | <version>9</version> 36 | </parent> 37 | 38 | <scm> 39 | <connection>scm:git:git@github.com:bells/elasticsearch-analysis-dynamic-synonym.git</connection> 40 | <developerConnection>scm:git:git@github.com:bells/elasticsearch-analysis-dynamic-synonym.git</developerConnection> 41 | 42 | <url>https://github.com/bells/elasticsearch-analysis-dynamic-synonym</url> 43 | </scm> 44 | 45 | <repositories> 46 | <repository> 47 | <id>central</id> 48 | <url>https://repo1.maven.org/maven2</url> 49 | <releases> 50 | <enabled>true</enabled> 51 | </releases> 52 | <snapshots> 53 | <enabled>true</enabled> 54 | </snapshots> 55 | </repository> 56 | <repository> 57 | <id>codelibs.org</id> 58 | <name>CodeLibs Repository</name> 59 | <url>https://maven.codelibs.org/</url> 60 | </repository> 61 | </repositories> 62 | 63 | <dependencies> 64 | <dependency> 65 | <groupId>org.elasticsearch</groupId> 66 | <artifactId>elasticsearch</artifactId> 67 | <version>${elasticsearch.version}</version> 68 | </dependency> 69 | <dependency> 70 | <groupId>org.codelibs.elasticsearch.module</groupId> 71 | <artifactId>analysis-common</artifactId> 72 | <version>${project.version}</version> 73 | </dependency> 74 | <dependency> 75 | <groupId>junit</groupId> 76 | <artifactId>junit</artifactId> 77 | <version>4.13.1</version> 78 | <scope>test</scope> 79 | </dependency> 80 | <dependency> 81 | <groupId>org.apache.httpcomponents.client5</groupId> 82 | <artifactId>httpclient5</artifactId> 83 | <version>5.2.1</version> 84 | </dependency> 85 | <dependency> 86 | <groupId>org.apache.logging.log4j</groupId> 87 | <artifactId>log4j-core</artifactId> 88 | <version>2.17.1</version> 89 | <scope>provided</scope> 90 | </dependency> 91 | <dependency> 92 | <groupId>org.apache.logging.log4j</groupId> 93 | <artifactId>log4j-api</artifactId> 94 | <version>2.20.0</version> 95 | <scope>provided</scope> 96 | </dependency> 97 | <dependency> 98 | <groupId>org.codelibs</groupId> 99 | <artifactId>elasticsearch-cluster-runner</artifactId> 100 | <version>${project.version}.0</version> 101 | <scope>test</scope> 102 | </dependency> 103 | </dependencies> 104 | 105 | <build> 106 | <plugins> 107 | 108 | <plugin> 109 | <groupId>org.apache.maven.plugins</groupId> 110 | <artifactId>maven-compiler-plugin</artifactId> 111 | <version>3.11.0</version> 112 | <configuration> 113 | <source>${maven.compiler.target}</source> 114 | <target>${maven.compiler.target}</target> 115 | </configuration> 116 | </plugin> 117 | <plugin> 118 | <groupId>org.apache.maven.plugins</groupId> 119 | <artifactId>maven-surefire-plugin</artifactId> 120 | <version>3.1.0</version> 121 | <configuration> 122 | <includes> 123 | <include>**/*Tests.java</include> 124 | </includes> 125 | </configuration> 126 | </plugin> 127 | <plugin> 128 | <groupId>org.apache.maven.plugins</groupId> 129 | <artifactId>maven-enforcer-plugin</artifactId> 130 | <version>3.3.0</version> 131 | </plugin> 132 | <plugin> 133 | <groupId>org.apache.maven.plugins</groupId> 134 | <artifactId>maven-resources-plugin</artifactId> 135 | <version>3.3.1</version> 136 | </plugin> 137 | <plugin> 138 | <groupId>org.apache.maven.plugins</groupId> 139 | <artifactId>maven-source-plugin</artifactId> 140 | <version>3.3.0</version> 141 | <executions> 142 | <execution> 143 | <id>attach-sources</id> 144 | <goals> 145 | <goal>jar</goal> 146 | </goals> 147 | </execution> 148 | </executions> 149 | </plugin> 150 | <plugin> 151 | <artifactId>maven-assembly-plugin</artifactId> 152 | <configuration> 153 | <appendAssemblyId>false</appendAssemblyId> 154 | <outputDirectory>${project.build.directory}/releases/</outputDirectory> 155 | <descriptors> 156 | <descriptor>${basedir}/src/main/assemblies/plugin.xml</descriptor> 157 | </descriptors> 158 | <archive> 159 | <manifest> 160 | <mainClass>fully.qualified.MainClass</mainClass> 161 | </manifest> 162 | </archive> 163 | </configuration> 164 | <executions> 165 | <execution> 166 | <phase>package</phase> 167 | <goals> 168 | <goal>single</goal> 169 | </goals> 170 | </execution> 171 | </executions> 172 | </plugin> 173 | 174 | </plugins> 175 | </build> 176 | </project>
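With the assembly configuration above, `mvn package` writes the plugin zip to `target/releases/`. Besides the copy-and-unzip route from the README, the zip can also be installed with the standard Elasticsearch plugin CLI (paths are illustrative):

    mvn package
    your-es-root/bin/elasticsearch-plugin install \
        file:///absolute/path/to/target/releases/elasticsearch-analysis-dynamic-synonym-8.7.1.zip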
-------------------------------------------------------------------------------- /src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package com.bellszhu.elasticsearch.plugin.synonym.analysis; 2 | 3 | 4 | import java.io.IOException; 5 | import java.util.List; 6 | import java.util.Map; 7 | import java.util.WeakHashMap; 8 | import java.util.concurrent.Executors; 9 | import java.util.concurrent.ScheduledExecutorService; 10 | import java.util.concurrent.ScheduledFuture; 11 | import java.util.concurrent.TimeUnit; 12 | import java.util.concurrent.atomic.AtomicInteger; 13 | import java.util.function.Function; 14 | 15 | import org.apache.logging.log4j.LogManager; 16 | import org.apache.logging.log4j.Logger; 17 | import org.apache.lucene.analysis.Analyzer; 18 | import org.apache.lucene.analysis.TokenStream; 19 | import org.apache.lucene.analysis.synonym.SynonymMap; 20 | import org.elasticsearch.common.settings.Settings; 21 | import org.elasticsearch.env.Environment; 22 | import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; 23 | import org.elasticsearch.index.analysis.AnalysisMode; 24 | import org.elasticsearch.index.analysis.CharFilterFactory; 25 | import org.elasticsearch.index.analysis.CustomAnalyzer; 26 | import org.elasticsearch.index.analysis.TokenFilterFactory; 27 | import org.elasticsearch.index.analysis.TokenizerFactory; 28 | 29 | /** 30 | * @author bellszhu 31 | */ 32 | public class DynamicSynonymTokenFilterFactory extends 33 | AbstractTokenFilterFactory { 34 | 35 | private static final Logger logger = LogManager.getLogger("dynamic-synonym"); 36 | 37 | /** 38 | * Static id generator 39 | */ 40 | private static final AtomicInteger id = new AtomicInteger(1); 41 | private static final ScheduledExecutorService pool = Executors.newScheduledThreadPool(1, r -> { 42 | Thread thread = new Thread(r); 43 | thread.setName("monitor-synonym-Thread-" + id.getAndAdd(1)); 44 | return thread; 45 | }); 46 | private volatile ScheduledFuture<?> scheduledFuture; 47 | 48 | private final String location; 49 | private final boolean expand; 50 | private final boolean lenient; 51 | private final String format; 52 | private final int interval; 53 | protected SynonymMap synonymMap; 54 | protected Map<AbsSynonymFilter, Integer> dynamicSynonymFilters = new WeakHashMap<>(); 55 | protected final Environment environment; 56 | protected final AnalysisMode analysisMode; 57 | 58 | public DynamicSynonymTokenFilterFactory( 59 | Environment env, 60 | String name, 61 | Settings settings 62 | ) throws IOException { 63 | super(name, settings); 64 | 65 | this.location = settings.get("synonyms_path"); 66 | if (this.location == null) { 67 | throw new IllegalArgumentException( 68 | "dynamic synonym requires `synonyms_path` to be configured"); 69 | } 70 | // `ignore_case` is accepted for backwards compatibility but currently has no effect 71 | if (settings.get("ignore_case") != null) { } 72 | 73 | this.interval = settings.getAsInt("interval", 60); 74 | this.expand = settings.getAsBoolean("expand", true); 75 | this.lenient = settings.getAsBoolean("lenient", false); 76 | this.format = settings.get("format", ""); 77 | boolean updateable = settings.getAsBoolean("updateable", false); 78 | this.analysisMode = updateable ? AnalysisMode.SEARCH_TIME : AnalysisMode.ALL; 79 | this.environment = env; 80 | } 81 | 82 | @Override 83 | public AnalysisMode getAnalysisMode() { 84 | return this.analysisMode; 85 | } 86 | 87 | 88 | @Override 89 | public TokenStream create(TokenStream tokenStream) { 90 | throw new IllegalStateException( 91 | "Call getChainAwareTokenFilterFactory to specialize this factory for an analysis chain first"); 92 | } 93 | 94 | public TokenFilterFactory getChainAwareTokenFilterFactory( 95 | TokenizerFactory tokenizer, 96 | List<CharFilterFactory> charFilters, 97 | List<TokenFilterFactory> previousTokenFilters, 98 | Function<String, TokenFilterFactory> allFilters 99 | ) { 100 | final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters); 101 | synonymMap = buildSynonyms(analyzer); 102 | final String name = name(); 103 | return new TokenFilterFactory() { 104 | @Override 105 | public String name() { 106 | return name; 107 | } 108 | 109 | @Override 110 | public TokenStream create(TokenStream tokenStream) { 111 | // fst is null means no synonyms 112 | if (synonymMap.fst == null) { 113 | return tokenStream; 114 | } 115 | DynamicSynonymFilter dynamicSynonymFilter = new DynamicSynonymFilter(tokenStream, synonymMap, false); 116 | dynamicSynonymFilters.put(dynamicSynonymFilter, 1); 117 | 118 | return dynamicSynonymFilter; 119 | } 120 | 121 | @Override 122 | public TokenFilterFactory getSynonymFilter() { 123 | // In order to allow chained synonym filters, we return IDENTITY here to 124 | // ensure that synonyms don't get applied to the synonym map itself, 125 | // which doesn't support stacked input tokens 126 | return IDENTITY_FILTER; 127 | } 128 | 129 | @Override 130 | public AnalysisMode getAnalysisMode() { 131 | return analysisMode; 132 | } 133 | }; 134 | } 135 | 136 | Analyzer buildSynonymAnalyzer( 137 | TokenizerFactory tokenizer, 138 | List<CharFilterFactory> charFilters, 139 | List<TokenFilterFactory> tokenFilters 140 | ) { 141 | return new CustomAnalyzer( 142 | tokenizer, 143 | charFilters.toArray(new CharFilterFactory[0]), 144 | tokenFilters.stream().map(TokenFilterFactory::getSynonymFilter).toArray(TokenFilterFactory[]::new) 145 | ); 146 | } 147 | 148 | SynonymMap buildSynonyms(Analyzer analyzer) { 149 | try { 150 | return getSynonymFile(analyzer).reloadSynonymMap(); 151 | } catch (Exception e) { 152 | logger.error("failed to build synonyms", e); 153 | throw new IllegalArgumentException("failed to build synonyms", e); 154 | } 155 | } 156 | 157 | SynonymFile getSynonymFile(Analyzer analyzer) { 158 | try { 159 | SynonymFile synonymFile; 160 | if (location.startsWith("http://") || location.startsWith("https://")) { 161 | synonymFile = new RemoteSynonymFile( 162 | environment, analyzer, expand, lenient, format, location); 163 | } else { 164 | synonymFile = new LocalSynonymFile( 165 | environment, analyzer, expand, lenient, format, location); 166 | } 167 | if (scheduledFuture == null) { 168 | scheduledFuture = pool.scheduleAtFixedRate(new Monitor(synonymFile), 169 | interval, interval, TimeUnit.SECONDS); 170 | } 171 | return synonymFile; 172 | } catch (Exception e) { 173 | logger.error("failed to get synonyms: " + location, e); 174 | throw new IllegalArgumentException("failed to get synonyms : " + location, e); 175 | } 176 | } 177 | 178 | public class Monitor implements Runnable { 179 | 180 | private final SynonymFile synonymFile; 181 | 182 | Monitor(SynonymFile synonymFile) { 183 | this.synonymFile = synonymFile; 184 | } 185 | 186 | @Override 187 | public void run() { 188 | try { 189 | logger.debug("===== Monitor ======="); 190 | if 
(synonymFile.isNeedReloadSynonymMap()) { 191 | synonymMap = synonymFile.reloadSynonymMap(); 192 | for (AbsSynonymFilter dynamicSynonymFilter : dynamicSynonymFilters.keySet()) { 193 | dynamicSynonymFilter.update(synonymMap); 194 | logger.debug("success reload synonym"); 195 | } 196 | } 197 | } catch (Exception e) { 198 | // keep the monitor thread alive; a failed check is simply retried on the next tick 199 | logger.error("Monitor error", e); 200 | } 201 | } 202 | } 203 | 204 | } 205 | -------------------------------------------------------------------------------- /src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/RemoteSynonymFile.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | package com.bellszhu.elasticsearch.plugin.synonym.analysis; 5 | 6 | import java.io.BufferedReader; 7 | import java.io.IOException; 8 | import java.io.InputStreamReader; 9 | import java.io.Reader; 10 | import java.io.StringReader; 11 | import java.text.ParseException; 12 | import java.util.Objects; 13 | import java.util.concurrent.TimeUnit; 14 | 15 | import org.apache.hc.client5.http.classic.methods.HttpGet; 16 | import org.apache.hc.client5.http.classic.methods.HttpHead; 17 | import org.apache.hc.client5.http.classic.methods.HttpUriRequest; 18 | import org.apache.hc.client5.http.config.RequestConfig; 19 | import org.apache.hc.client5.http.impl.classic.CloseableHttpClient; 20 | import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse; 21 | import org.apache.hc.client5.http.impl.classic.HttpClients; 22 | import org.apache.hc.core5.http.message.StatusLine; 23 | import org.apache.logging.log4j.LogManager; 24 | import org.apache.logging.log4j.Logger; 25 | import org.apache.lucene.analysis.Analyzer; 26 | import org.apache.lucene.analysis.synonym.SynonymMap; 27 | import org.elasticsearch.analysis.common.ESSolrSynonymParser; 28 | import org.elasticsearch.analysis.common.ESWordnetSynonymParser; 29 | import org.elasticsearch.env.Environment; 30 | 31 | /** 32 | * @author bellszhu 33 | */ 34 | public class RemoteSynonymFile implements SynonymFile { 35 | 36 | private static final String LAST_MODIFIED_HEADER = "Last-Modified"; 37 | private static final String ETAG_HEADER = "ETag"; 38 | 39 | private static final Logger logger = LogManager.getLogger("dynamic-synonym"); 40 | 41 | private CloseableHttpClient httpclient; 42 | 43 | private String format; 44 | 45 | private boolean expand; 46 | 47 | private boolean lenient; 48 | 49 | private Analyzer analyzer; 50 | 51 | private Environment env; 52 | 53 | /** 54 | * Remote URL address 55 | */ 56 | private String location; 57 | 58 | private String lastModified; 59 | 60 | private String eTags; 61 | 62 | RemoteSynonymFile(Environment env, Analyzer analyzer, 63 | boolean expand, boolean lenient, String format, String location) { 64 | this.analyzer = analyzer; 65 | this.expand = expand; 66 | this.lenient = lenient; 67 | this.format = format; 68 | this.env = env; 69 | this.location = location; 70 | 71 | this.httpclient = HttpClients.createDefault(); 72 | 73 | isNeedReloadSynonymMap(); // primes lastModified/eTags so the monitor only reloads on later changes 74 | } 75 | 76 | static SynonymMap.Builder getSynonymParser( 77 | Reader rulesReader, String format, boolean expand, boolean lenient, Analyzer analyzer 78 | ) throws IOException, ParseException { 79 | SynonymMap.Builder parser; 80 | if ("wordnet".equalsIgnoreCase(format)) { 81 | parser = new ESWordnetSynonymParser(true, expand, lenient, analyzer); 82 | ((ESWordnetSynonymParser) parser).parse(rulesReader); 83 | } else { 84 | parser = new ESSolrSynonymParser(true, expand, lenient, analyzer); 85 | ((ESSolrSynonymParser) parser).parse(rulesReader); 86 | } 87 | return parser; 88 | }
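/*
 * Illustrative rule samples for the two formats handled by getSynonymParser
 * (these lines are a sketch, not part of the original source):
 *
 *   Solr format (format=""):
 *     # equivalent terms, expanded in both directions when expand=true
 *     ipod, i-pod, i pod
 *     # explicit one-way mapping
 *     personal computer => pc
 *
 *   WordNet format (format="wordnet"):
 *     s(100000001,1,'ipod',n,1,0).
 *     s(100000001,2,'i-pod',n,1,0).
 */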
89 | 90 | @Override 91 | public SynonymMap reloadSynonymMap() { 92 | Reader rulesReader = null; 93 | try { 94 | logger.debug("start reload remote synonym from {}.", location); 95 | rulesReader = getReader(); 96 | SynonymMap.Builder parser; 97 | 98 | parser = getSynonymParser(rulesReader, format, expand, lenient, analyzer); 99 | return parser.build(); 100 | } catch (Exception e) { 101 | logger.error("reload remote synonym {} error!", location, e); 102 | throw new IllegalArgumentException( 103 | "could not reload remote synonyms file to build synonyms", 104 | e); 105 | } finally { 106 | if (rulesReader != null) { 107 | try { 108 | rulesReader.close(); 109 | } catch (Exception e) { 110 | logger.error("failed to close rulesReader", e); 111 | } 112 | } 113 | } 114 | } 115 | 116 | private CloseableHttpResponse executeHttpRequest(HttpUriRequest httpUriRequest) { 117 | try { 118 | return httpclient.execute(httpUriRequest); 119 | } catch (IOException e) { 120 | logger.error("Unable to execute HTTP request.", e); 121 | } 122 | return null; 123 | } 124 | 125 | /** 126 | * Download custom terms from a remote server 127 | */ 128 | public Reader getReader() { 129 | Reader reader; 130 | RequestConfig rc = RequestConfig.custom() 131 | .setConnectionRequestTimeout(10 * 1000, TimeUnit.MILLISECONDS) 132 | .setResponseTimeout(60 * 1000, TimeUnit.MILLISECONDS) 133 | .build(); 134 | CloseableHttpResponse response = null; 135 | BufferedReader br = null; 136 | HttpGet get = new HttpGet(location); 137 | get.setConfig(rc); 138 | try { 139 | response = executeHttpRequest(get); 140 | if (response == null) { // request failed and was already logged 141 | return new StringReader(""); 142 | } 143 | StatusLine statusLine = new StatusLine(response); 144 | if (statusLine.getStatusCode() == 200) { 145 | String charset = "UTF-8"; // default; honour an explicit charset in Content-Type 146 | String contentType = response.getEntity().getContentType(); 147 | if (contentType != null && contentType.contains("charset=")) { 148 | charset = contentType.substring(contentType.lastIndexOf('=') + 1); 149 | } 150 | 151 | br = new BufferedReader(new InputStreamReader(response.getEntity().getContent(), charset)); 152 | StringBuilder sb = new StringBuilder(); 153 | String line; 154 | while ((line = br.readLine()) != null) { 155 | logger.debug("reload remote synonym: {}", line); 156 | sb.append(line).append(System.getProperty("line.separator")); 157 | } 158 | reader = new StringReader(sb.toString()); 159 | } else reader = new StringReader(""); 160 | } catch (Exception e) { 161 | logger.error("get remote synonym reader {} error!", location, e); 162 | // throw new IllegalArgumentException( 163 | // "Exception while reading remote synonyms file", e); 164 | // Fix #54: fall back to a harmless no-op rule if the remote 165 | // file has been deleted.
166 | reader = new StringReader("1=>1"); 167 | } finally { 168 | try { 169 | if (br != null) { 170 | br.close(); 171 | } 172 | } catch (IOException e) { 173 | logger.error("failed to close bufferedReader", e); 174 | } 175 | try { 176 | if (response != null) { 177 | response.close(); 178 | } 179 | } catch (IOException e) { 180 | logger.error("failed to close http response", e); 181 | } 182 | } 183 | return reader; 184 | } 185 | 186 | @Override 187 | public boolean isNeedReloadSynonymMap() { 188 | logger.debug("==== isNeedReloadSynonymMap ===="); 189 | RequestConfig rc = RequestConfig.custom() 190 | .setConnectionRequestTimeout(10 * 1000, TimeUnit.MILLISECONDS) 191 | .setResponseTimeout(15 * 1000, TimeUnit.MILLISECONDS) 192 | .build(); 193 | HttpHead head = new HttpHead(location); 194 | head.setConfig(rc); 195 | 196 | // set conditional request headers so an unchanged file can answer 304 197 | if (lastModified != null) { 198 | head.setHeader("If-Modified-Since", lastModified); 199 | } 200 | if (eTags != null) { 201 | head.setHeader("If-None-Match", eTags); 202 | } 203 | 204 | CloseableHttpResponse response = null; 205 | try { 206 | response = executeHttpRequest(head); 207 | if (response == null) { // request failed and was already logged 208 | return false; 209 | } 210 | StatusLine statusLine = new StatusLine(response); 211 | if (statusLine.getStatusCode() == 200) { // only a 200 carries fresh validators 212 | String newLastModified = response.getLastHeader(LAST_MODIFIED_HEADER) == null 213 | ? null : response.getLastHeader(LAST_MODIFIED_HEADER).getValue(); 214 | String newETags = response.getLastHeader(ETAG_HEADER) == null 215 | ? null : response.getLastHeader(ETAG_HEADER).getValue(); 216 | if (!Objects.equals(newLastModified, lastModified) || !Objects.equals(newETags, eTags)) { 217 | lastModified = newLastModified; 218 | eTags = newETags; 219 | return true; 220 | } 221 | } else if (statusLine.getStatusCode() == 304) { 222 | return false; 223 | } else { 224 | logger.info("remote synonym {} return bad code {}", location, 225 | statusLine.getStatusCode()); 226 | } 227 | } catch (Exception e) { 228 | logger.error("check need reload remote synonym {} error!", location, e); 229 | return false; 230 | } finally { 231 | try { 232 | if (response != null) { 233 | response.close(); 234 | } 235 | } catch (IOException e) { 236 | logger.error("failed to close http response", e); 237 | } 238 | } 239 | return false; 240 | } 241 | } 242 | -------------------------------------------------------------------------------- /src/test/java/com.bellszhu.elasticsearch.plugin/DynamicSynonymPluginTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, guanquan.wang@yandex.com All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.bellszhu.elasticsearch.plugin; 18 | 19 | import org.codelibs.elasticsearch.runner.ElasticsearchClusterRunner; 20 | import org.elasticsearch.action.ActionFuture; 21 | import org.elasticsearch.action.admin.indices.analyze.AnalyzeAction; 22 | import org.elasticsearch.common.settings.Settings; 23 | import org.elasticsearch.xcontent.XContentType; 24 | import org.junit.After; 25 | import org.junit.Before; 26 | import org.junit.Test; 27 | 28 | import java.io.IOException; 29 | import java.util.List; 30 | import java.util.concurrent.TimeUnit; 31 | 32 | import static org.codelibs.elasticsearch.runner.ElasticsearchClusterRunner.newConfigs; 33 | 34 | /** 35 | * Create by guanquan.wang at 2019-09-18 16:55 36 | */ 37 | public class DynamicSynonymPluginTest { 38 | private ElasticsearchClusterRunner runner; 39 | 40 | @Before 41 | public void setUp() { 42 | // create runner instance 43 | runner = new ElasticsearchClusterRunner(); 44 | // create ES nodes 45 | runner.build(newConfigs() 46 | .numOfNode(1) // Create a test node, default number of node is 3. 47 | .pluginTypes("com.bellszhu.elasticsearch.plugin.DynamicSynonymPlugin") 48 | ); 49 | } 50 | 51 | @After 52 | public void tearDown() throws IOException { 53 | // close runner 54 | runner.close(); 55 | // delete all files 56 | runner.clean(); 57 | } 58 | 59 | private void createIndexWithLocalSynonym(String indexName, String synonymType, String localPath) { 60 | final String indexSettings = "{\n" + 61 | " \"index\":{\n" + 62 | " \"analysis\":{\n" + 63 | " \"filter\":{\n" + 64 | " \"local_synonym\": {\n" + 65 | " \"type\": \"" + synonymType + "\",\n" + 66 | " \"synonyms_path\": \"" + localPath + "\",\n" + 67 | " \"interval\": \"10\"\n" + 68 | " }"+ 69 | " },\n" + 70 | " \"char_filter\":{\n" + 71 | " \"my_char_filter\":{\n" + 72 | " \"pattern\":\"[- /]\",\n" + 73 | " \"type\":\"pattern_replace\",\n" + 74 | " \"replacement\":\"\"\n" + 75 | " }\n" + 76 | " },\n" + 77 | " \"analyzer\":{\n" + 78 | " \"synonym_analyzer\":{\n" + 79 | " \"filter\":[\n" + 80 | " \"lowercase\",\n" + 81 | " \"asciifolding\",\n" + 82 | " \"local_synonym\"\n" + 83 | " ],\n" + 84 | " \"type\":\"custom\",\n" + 85 | " \"tokenizer\":\"keyword\"\n" + 86 | " }\n" + 87 | " }\n" + 88 | " }\n" + 89 | " }\n" + 90 | "}"; 91 | 92 | runner.createIndex(indexName, Settings.builder().loadFromSource(indexSettings, XContentType.JSON).build()); 93 | // wait for yellow status 94 | runner.ensureYellow(); 95 | } 96 | 97 | private void createIndexWithRemoteSynonym(String indexName) { 98 | final String indexSettings = "{\n" + 99 | " \"index\":{\n" + 100 | " \"analysis\":{\n" + 101 | " \"filter\":{\n" + 102 | " \"remote_synonym\": {\n" + 103 | " \"type\": \"dynamic_synonym\",\n" + 104 | " \"synonyms_path\": \"http://localhost:8080/api/synonym\",\n" + 105 | " \"interval\": \"10\"\n" + 106 | " }"+ 107 | " },\n" + 108 | " \"char_filter\":{\n" + 109 | " \"my_char_filter\":{\n" + 110 | " \"pattern\":\"[- /]\",\n" + 111 | " \"type\":\"pattern_replace\",\n" + 112 | " \"replacement\":\"\"\n" + 113 | " }\n" + 114 | " },\n" + 115 | " \"analyzer\":{\n" + 116 | " \"synonym_analyzer\":{\n" + 117 | " \"filter\":[\n" + 118 | " \"lowercase\",\n" + 119 | " \"asciifolding\",\n" + 120 | " \"remote_synonym\"\n" + 121 | " ],\n" + 122 | " \"type\":\"custom\",\n" + 123 | " \"tokenizer\":\"keyword\"\n" + 124 | " }\n" + 125 | " }\n" + 126 | " }\n" + 127 | " }\n" + 128 | "}"; 129 | 130 | runner.createIndex(indexName, Settings.builder().loadFromSource(indexSettings, XContentType.JSON).build()); 131 | // wait 
for yellow status 132 | runner.ensureYellow(); 133 | } 134 | 135 | private synchronized void analyzer(String indexName) throws InterruptedException { 136 | List<AnalyzeAction.AnalyzeToken> tokens = tokens(indexName, "肯德基"); 137 | for (AnalyzeAction.AnalyzeToken token : tokens) { 138 | System.out.println(token.getTerm() + " => " + token.getType()); 139 | } 140 | 141 | /* 142 | Wait one minute to modify the synonym file and run again. 143 | */ 144 | wait(1000 * 60); 145 | 146 | tokens = tokens(indexName, "金拱门"); 147 | for (AnalyzeAction.AnalyzeToken token : tokens) { 148 | System.out.println(token.getTerm() + " => " + token.getType()); 149 | } 150 | 151 | tokens = tokens(indexName, "america"); 152 | for (AnalyzeAction.AnalyzeToken token : tokens) { 153 | System.out.println(token.getTerm() + " => " + token.getType()); 154 | } 155 | } 156 | 157 | private List<AnalyzeAction.AnalyzeToken> tokens(String indexName, String text) { 158 | AnalyzeAction.Request analyzeRequest = new AnalyzeAction.Request(indexName); 159 | analyzeRequest.text(text); 160 | analyzeRequest.analyzer("synonym_analyzer"); 161 | ActionFuture<AnalyzeAction.Response> actionFuture = runner.admin().indices().analyze(analyzeRequest); 162 | AnalyzeAction.Response response = actionFuture.actionGet(10L, TimeUnit.SECONDS); 163 | return response.getTokens(); 164 | } 165 | 166 | @Test 167 | public void testLocalAbsolute() { 168 | String index = "test_local_absolute"; 169 | String path = "target/test-classes/synonym.txt"; 170 | // create an index 171 | createIndexWithLocalSynonym(index, "dynamic_synonym", path); 172 | 173 | String text = "肯德基"; 174 | List<AnalyzeAction.AnalyzeToken> analyzeTokens = tokens(index, text); 175 | for (AnalyzeAction.AnalyzeToken token : analyzeTokens) { 176 | System.out.println(token.getTerm() + " => " + token.getType()); 177 | } 178 | 179 | assert analyzeTokens.size() == 3; 180 | for (AnalyzeAction.AnalyzeToken token : analyzeTokens) { 181 | String key = token.getTerm(); 182 | if (text.equalsIgnoreCase(key)) { 183 | assert token.getType().equalsIgnoreCase("word"); 184 | } else { 185 | assert token.getType().equalsIgnoreCase("synonym"); 186 | } 187 | } 188 | } 189 | 190 | @Test 191 | public void testGraphLocalAbsolute() { 192 | String index = "test_local_absolute"; 193 | String path = "target/test-classes/synonym.txt"; 194 | // create an index 195 | createIndexWithLocalSynonym(index, "dynamic_synonym_graph", path); 196 | 197 | String text = "肯德基"; 198 | List<AnalyzeAction.AnalyzeToken> analyzeTokens = tokens(index, text); 199 | for (AnalyzeAction.AnalyzeToken token : analyzeTokens) { 200 | System.out.println(token.getTerm() + " => " + token.getType()); 201 | } 202 | 203 | assert analyzeTokens.size() == 3; 204 | for (AnalyzeAction.AnalyzeToken token : analyzeTokens) { 205 | String key = token.getTerm(); 206 | if (text.equalsIgnoreCase(key)) { 207 | assert token.getType().equalsIgnoreCase("word"); 208 | } else { 209 | assert token.getType().equalsIgnoreCase("synonym"); 210 | } 211 | } 212 | } 213 | 214 | @Test 215 | public void testLocal() { 216 | String index = "test_local_relative"; 217 | String relativePath = "synonym.txt"; 218 | // create an index 219 | createIndexWithLocalSynonym(index, "dynamic_synonym", relativePath); 220 | 221 | String text = "kfc"; 222 | List<AnalyzeAction.AnalyzeToken> analyzeTokens = tokens(index, text); 223 | for (AnalyzeAction.AnalyzeToken token : analyzeTokens) { 224 | System.out.println(token.getTerm() + " => " + token.getType()); 225 | } 226 | 227 | assert analyzeTokens.size() == 3; 228 | for (AnalyzeAction.AnalyzeToken token : analyzeTokens) { 229 | String key = token.getTerm(); 230 | if 
(text.equalsIgnoreCase(key)) { 231 | assert token.getType().equalsIgnoreCase("word"); 232 | } else { 233 | assert token.getType().equalsIgnoreCase("synonym"); 234 | } 235 | } 236 | } 237 | 238 | @Test 239 | public void testRemote() throws InterruptedException { 240 | String index = "test_remote"; 241 | // create an index 242 | createIndexWithRemoteSynonym(index); 243 | 244 | analyzer(index); 245 | } 246 | } 247 | -------------------------------------------------------------------------------- /src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymGraphFilter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.bellszhu.elasticsearch.plugin.synonym.analysis; 19 | 20 | import java.io.IOException; 21 | import java.util.ArrayList; 22 | import java.util.LinkedList; 23 | import java.util.List; 24 | 25 | import org.apache.lucene.analysis.TokenStream; 26 | import org.apache.lucene.analysis.core.FlattenGraphFilter; 27 | import org.apache.lucene.analysis.synonym.SynonymFilter; 28 | import org.apache.lucene.analysis.synonym.SynonymMap; 29 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 30 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 31 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; 32 | import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; 33 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 34 | import org.apache.lucene.store.ByteArrayDataInput; 35 | import org.apache.lucene.util.BytesRef; 36 | import org.apache.lucene.util.CharsRefBuilder; 37 | import org.apache.lucene.util.RollingBuffer; 38 | import org.apache.lucene.util.fst.FST; 39 | 40 | // TODO: maybe we should resolve token -> wordID then run 41 | // FST on wordIDs, for better perf? 42 | 43 | // TODO: a more efficient approach would be Aho/Corasick's 44 | // algorithm 45 | // http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm 46 | // It improves over the current approach here 47 | // because it does not fully re-start matching at every 48 | // token. For example if one pattern is "a b c x" 49 | // and another is "b c d" and the input is "a b c d", on 50 | // trying to parse "a b c x" but failing when you got to x, 51 | // rather than starting over again your really should 52 | // immediately recognize that "b c d" matches at the next 53 | // input. I suspect this won't matter that much in 54 | // practice, but it's possible on some set of synonyms it 55 | // will. 
We'd have to modify Aho/Corasick to enforce our 56 | // conflict resolving (eg greedy matching) because that algo 57 | // finds all matches. This really amounts to adding a .* 58 | // closure to the FST and then determinizing it. 59 | // 60 | // Another possible solution is described at http://www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps 61 | 62 | /** Applies single- or multi-token synonyms from a {@link SynonymMap} 63 | * to an incoming {@link TokenStream}, producing a fully correct graph 64 | * output. This is a replacement for {@link SynonymFilter}, which produces 65 | * incorrect graphs for multi-token synonyms. 66 | * 67 | *

However, if you use this during indexing, you must follow it with 68 | * {@link FlattenGraphFilter} to squash tokens on top of one another 69 | * like {@link SynonymFilter}, because the indexer can't directly 70 | * consume a graph. To get fully correct positional queries when your 71 | * synonym replacements are multiple tokens, you should instead apply 72 | * synonyms using this {@code TokenFilter} at query time and translate 73 | * the resulting graph to a {@code TermAutomatonQuery} e.g. using 74 | * {@code TokenStreamToTermAutomatonQuery}. 75 | * 76 | *

NOTE: this cannot consume an incoming graph; results will 77 | * be undefined. 78 | * 79 | * @lucene.experimental */ 80 | 81 | public final class DynamicSynonymGraphFilter extends AbsSynonymFilter { 82 | 83 | public static final String TYPE_SYNONYM = "SYNONYM"; 84 | 85 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); 86 | private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); 87 | private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class); 88 | private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); 89 | private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); 90 | 91 | private SynonymMap synonyms; 92 | private final boolean ignoreCase; 93 | 94 | private FST<BytesRef> fst; 95 | 96 | private FST.BytesReader fstReader; 97 | private FST.Arc<BytesRef> scratchArc; 98 | private final ByteArrayDataInput bytesReader = new ByteArrayDataInput(); 99 | private final BytesRef scratchBytes = new BytesRef(); 100 | private final CharsRefBuilder scratchChars = new CharsRefBuilder(); 101 | private final LinkedList<BufferedOutputToken> outputBuffer = new LinkedList<>(); 102 | 103 | private int nextNodeOut; 104 | private int lastNodeOut; 105 | private int maxLookaheadUsed; 106 | 107 | // For testing: 108 | private int captureCount; 109 | 110 | private boolean liveToken; 111 | 112 | // Start/end offset of the current match: 113 | private int matchStartOffset; 114 | private int matchEndOffset; 115 | 116 | // True once the input TokenStream is exhausted: 117 | private boolean finished; 118 | 119 | private int lookaheadNextRead; 120 | private int lookaheadNextWrite; 121 | 122 | private RollingBuffer<BufferedInputToken> lookahead = new RollingBuffer<BufferedInputToken>() { 123 | @Override 124 | protected BufferedInputToken newInstance() { 125 | return new BufferedInputToken(); 126 | } 127 | }; 128 | 129 | static class BufferedInputToken implements RollingBuffer.Resettable { 130 | final CharsRefBuilder term = new CharsRefBuilder(); 131 | State state; 132 | int startOffset = -1; 133 | int endOffset = -1; 134 | 135 | @Override 136 | public void reset() { 137 | state = null; 138 | term.clear(); 139 | 140 | // Intentionally invalid to ferret out bugs: 141 | startOffset = -1; 142 | endOffset = -1; 143 | } 144 | } 145 | 146 | static class BufferedOutputToken { 147 | final String term; 148 | 149 | // Non-null if this was an incoming token: 150 | final State state; 151 | 152 | final int startNode; 153 | final int endNode; 154 | 155 | public BufferedOutputToken(State state, String term, int startNode, int endNode) { 156 | this.state = state; 157 | this.term = term; 158 | this.startNode = startNode; 159 | this.endNode = endNode; 160 | } 161 | } 162 | 
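/*
 * Minimal index-time wiring sketch (not part of the original source): as the
 * class javadoc above explains, the graph output must be flattened before it
 * reaches the indexer.
 *
 *   TokenStream ts = tokenizer;                                // any Tokenizer
 *   ts = new DynamicSynonymGraphFilter(ts, synonymMap, false); // apply synonyms
 *   ts = new FlattenGraphFilter(ts);                           // required at index time
 */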
163 | /** 164 | * Apply previously built synonyms to incoming tokens. 165 | * @param input input tokenstream 166 | * @param synonyms synonym map 167 | * @param ignoreCase case-folds input for matching with {@link Character#toLowerCase(int)}. 168 | * Note, if you set this to true, it's your responsibility to lowercase 169 | * the input entries when you create the {@link SynonymMap} 170 | */ 171 | public DynamicSynonymGraphFilter(TokenStream input, SynonymMap synonyms, boolean ignoreCase) { 172 | super(input); 173 | update(synonyms); 174 | this.ignoreCase = ignoreCase; 175 | } 176 | 177 | @Override 178 | public boolean incrementToken() throws IOException { 179 | //System.out.println("\nS: incrToken lastNodeOut=" + lastNodeOut + " nextNodeOut=" + nextNodeOut); 180 | 181 | assert lastNodeOut <= nextNodeOut; 182 | 183 | if (outputBuffer.isEmpty() == false) { 184 | // We still have pending outputs from a prior synonym match: 185 | releaseBufferedToken(); 186 | //System.out.println(" syn: ret buffered=" + this); 187 | assert liveToken == false; 188 | return true; 189 | } 190 | 191 | // Try to parse a new synonym match at the current token: 192 | 193 | if (parse()) { 194 | // A new match was found: 195 | releaseBufferedToken(); 196 | //System.out.println(" syn: after parse, ret buffered=" + this); 197 | assert liveToken == false; 198 | return true; 199 | } 200 | 201 | if (lookaheadNextRead == lookaheadNextWrite) { 202 | 203 | // Fast path: parse pulled one token, but it didn't match 204 | // the start for any synonym, so we now return it "live" w/o having 205 | // cloned all of its atts: 206 | if (finished) { 207 | //System.out.println(" syn: ret END"); 208 | return false; 209 | } 210 | 211 | assert liveToken; 212 | liveToken = false; 213 | 214 | // NOTE: no need to change posInc since it's relative, i.e. whatever 215 | // node our output is upto will just increase by the incoming posInc. 216 | // We also don't need to change posLen, but only because we cannot 217 | // consume a graph, so the incoming token can never span a future 218 | // synonym match. 
219 | 220 | } else { 221 | // We still have buffered lookahead tokens from a previous 222 | // parse attempt that required lookahead; just replay them now: 223 | //System.out.println(" restore buffer"); 224 | assert lookaheadNextRead < lookaheadNextWrite: "read=" + lookaheadNextRead + " write=" + lookaheadNextWrite; 225 | BufferedInputToken token = lookahead.get(lookaheadNextRead); 226 | lookaheadNextRead++; 227 | 228 | restoreState(token.state); 229 | 230 | lookahead.freeBefore(lookaheadNextRead); 231 | 232 | //System.out.println(" after restore offset=" + offsetAtt.startOffset() + "-" + offsetAtt.endOffset()); 233 | assert liveToken == false; 234 | } 235 | 236 | lastNodeOut += posIncrAtt.getPositionIncrement(); 237 | nextNodeOut = lastNodeOut + posLenAtt.getPositionLength(); 238 | 239 | //System.out.println(" syn: ret lookahead=" + this); 240 | 241 | return true; 242 | } 243 | 244 | private void releaseBufferedToken() throws IOException { 245 | //System.out.println(" releaseBufferedToken"); 246 | 247 | BufferedOutputToken token = outputBuffer.pollFirst(); 248 | 249 | if (token.state != null) { 250 | // This is an original input token (keepOrig=true case): 251 | //System.out.println(" hasState"); 252 | restoreState(token.state); 253 | //System.out.println(" startOffset=" + offsetAtt.startOffset() + " endOffset=" + offsetAtt.endOffset()); 254 | } else { 255 | clearAttributes(); 256 | //System.out.println(" no state"); 257 | termAtt.append(token.term); 258 | 259 | // We better have a match already: 260 | assert matchStartOffset != -1; 261 | 262 | offsetAtt.setOffset(matchStartOffset, matchEndOffset); 263 | //System.out.println(" startOffset=" + matchStartOffset + " endOffset=" + matchEndOffset); 264 | typeAtt.setType(TYPE_SYNONYM); 265 | } 266 | 267 | //System.out.println(" lastNodeOut=" + lastNodeOut); 268 | //System.out.println(" term=" + termAtt); 269 | 270 | posIncrAtt.setPositionIncrement(token.startNode - lastNodeOut); 271 | lastNodeOut = token.startNode; 272 | posLenAtt.setPositionLength(token.endNode - token.startNode); 273 | } 274 | 275 | /** Scans the next input token(s) to see if a synonym matches. Returns true 276 | * if a match was found. 
*/ 277 | private boolean parse() throws IOException { 278 | // System.out.println(Thread.currentThread().getName() + ": S: parse: " + System.identityHashCode(this)); 279 | 280 | // Holds the longest match we've seen so far: 281 | BytesRef matchOutput = null; 282 | int matchInputLength = 0; 283 | 284 | BytesRef pendingOutput = fst.outputs.getNoOutput(); 285 | fst.getFirstArc(scratchArc); 286 | 287 | assert scratchArc.output() == fst.outputs.getNoOutput(); 288 | 289 | // How many tokens in the current match 290 | int matchLength = 0; 291 | boolean doFinalCapture = false; 292 | 293 | int lookaheadUpto = lookaheadNextRead; 294 | matchStartOffset = -1; 295 | 296 | byToken: 297 | while (true) { 298 | //System.out.println(" cycle lookaheadUpto=" + lookaheadUpto + " maxPos=" + lookahead.getMaxPos()); 299 | 300 | // Pull next token's chars: 301 | final char[] buffer; 302 | final int bufferLen; 303 | final int inputEndOffset; 304 | 305 | if (lookaheadUpto <= lookahead.getMaxPos()) { 306 | // Still in our lookahead buffer 307 | BufferedInputToken token = lookahead.get(lookaheadUpto); 308 | lookaheadUpto++; 309 | buffer = token.term.chars(); 310 | bufferLen = token.term.length(); 311 | inputEndOffset = token.endOffset; 312 | //System.out.println(" use buffer now max=" + lookahead.getMaxPos()); 313 | if (matchStartOffset == -1) { 314 | matchStartOffset = token.startOffset; 315 | } 316 | } else { 317 | 318 | // We used up our lookahead buffer of input tokens 319 | // -- pull next real input token: 320 | 321 | assert finished || liveToken == false; 322 | 323 | if (finished) { 324 | //System.out.println(" break: finished"); 325 | break; 326 | } else if (input.incrementToken()) { 327 | //System.out.println(" input.incrToken"); 328 | liveToken = true; 329 | buffer = termAtt.buffer(); 330 | bufferLen = termAtt.length(); 331 | if (matchStartOffset == -1) { 332 | matchStartOffset = offsetAtt.startOffset(); 333 | } 334 | inputEndOffset = offsetAtt.endOffset(); 335 | 336 | lookaheadUpto++; 337 | } else { 338 | // No more input tokens 339 | finished = true; 340 | //System.out.println(" break: now set finished"); 341 | break; 342 | } 343 | } 344 | 345 | matchLength++; 346 | //System.out.println(" cycle term=" + new String(buffer, 0, bufferLen)); 347 | 348 | // Run each char in this token through the FST: 349 | int bufUpto = 0; 350 | while (bufUpto < bufferLen) { 351 | final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen); 352 | if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null) { 353 | break byToken; 354 | } 355 | 356 | // Accum the output 357 | pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output()); 358 | bufUpto += Character.charCount(codePoint); 359 | } 360 | 361 | assert bufUpto == bufferLen; 362 | 363 | // OK, entire token matched; now see if this is a final 364 | // state in the FST (a match): 365 | if (scratchArc.isFinal()) { 366 | matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput()); 367 | matchInputLength = matchLength; 368 | matchEndOffset = inputEndOffset; 369 | //System.out.println(" ** match"); 370 | } 371 | 372 | // See if the FST can continue matching (ie, needs to 373 | // see the next input token): 374 | if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null) { 375 | // No further rules can match here; we're done 376 | // searching for matching rules starting at the 377 | // current input position. 
378 | break; 379 | } else { 380 | // More matching is possible -- accum the output (if 381 | // any) of the WORD_SEP arc: 382 | pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output()); 383 | doFinalCapture = true; 384 | if (liveToken) { 385 | capture(); 386 | } 387 | } 388 | } 389 | 390 | if (doFinalCapture && liveToken && finished == false) { 391 | // Must capture the final token if we captured any prior tokens: 392 | capture(); 393 | } 394 | 395 | if (matchOutput != null) { 396 | 397 | if (liveToken) { 398 | // Single input token synonym; we must buffer it now: 399 | capture(); 400 | } 401 | 402 | // There is a match! 403 | bufferOutputTokens(matchOutput, matchInputLength); 404 | lookaheadNextRead += matchInputLength; 405 | //System.out.println(" precmatch; set lookaheadNextRead=" + lookaheadNextRead + " now max=" + lookahead.getMaxPos()); 406 | lookahead.freeBefore(lookaheadNextRead); 407 | //System.out.println(" match; set lookaheadNextRead=" + lookaheadNextRead + " now max=" + lookahead.getMaxPos()); 408 | return true; 409 | } else { 410 | //System.out.println(" no match; lookaheadNextRead=" + lookaheadNextRead); 411 | return false; 412 | } 413 | 414 | //System.out.println(" parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite); 415 | } 416 | 417 | /** Expands the output graph into the necessary tokens, adding 418 | * synonyms as side paths parallel to the input tokens, and 419 | * buffers them in the output token buffer. */ 420 | private void bufferOutputTokens(BytesRef bytes, int matchInputLength) { 421 | bytesReader.reset(bytes.bytes, bytes.offset, bytes.length); 422 | 423 | final int code = bytesReader.readVInt(); 424 | final boolean keepOrig = (code & 0x1) == 0; 425 | //System.out.println(" buffer: keepOrig=" + keepOrig + " matchInputLength=" + matchInputLength); 426 | 427 | // How many nodes along all paths; we need this to assign the 428 | // node ID for the final end node where all paths merge back: 429 | int totalPathNodes; 430 | if (keepOrig) { 431 | assert matchInputLength > 0; 432 | totalPathNodes = matchInputLength - 1; 433 | } else { 434 | totalPathNodes = 0; 435 | } 436 | 437 | // How many synonyms we will insert over this match: 438 | final int count = code >>> 1; 439 | 440 | // TODO: we could encode this instead into the FST: 441 | 442 | // 1st pass: count how many new nodes we need 443 | List<List<String>> paths = new ArrayList<>(); 444 | for(int outputIDX=0;outputIDX<count;outputIDX++) { 445 | int wordID = bytesReader.readVInt(); 446 | synonyms.words.get(wordID, scratchBytes); 447 | scratchChars.copyUTF8Bytes(scratchBytes); 448 | int lastStart = 0; 449 | 450 | List<String> path = new ArrayList<>(); 451 | paths.add(path); 452 | int chEnd = scratchChars.length(); 453 | for(int chUpto=0; chUpto<=chEnd; chUpto++) { 454 | if (chUpto == chEnd || scratchChars.charAt(chUpto) == SynonymMap.WORD_SEPARATOR) { 455 | path.add(new String(scratchChars.chars(), lastStart, chUpto - lastStart)); 456 | lastStart = 1 + chUpto; 457 | } 458 | } 459 | 460 | assert path.size() > 0; 461 | totalPathNodes += path.size() - 1; 462 | } 463 | //System.out.println(" totalPathNodes=" + totalPathNodes); 464 | 465 | // 2nd pass: buffer tokens for the graph fragment 466 | 467 | // NOTE: totalPathNodes will be 0 in the case where the matched 468 | // input is a single token and all outputs are also a single token 469 | 470 | // We "spawn" a side-path for each of the outputs for this matched 471 | // synonym, all ending back at this end node: 472 | 473 | int startNode = nextNodeOut; 474 | 475 | int endNode = startNode + totalPathNodes + 1; 476 | //System.out.println(" " + paths.size() + " new side-paths"); 477 | 478 | // First, fanout all tokens departing start
node for these new side paths: 479 | int newNodeCount = 0; 480 | for(List<String> path : paths) { 481 | int pathEndNode; 482 | //System.out.println(" path size=" + path.size()); 483 | if (path.size() == 1) { 484 | // Single token output, so there are no intermediate nodes: 485 | pathEndNode = endNode; 486 | } else { 487 | pathEndNode = nextNodeOut + newNodeCount + 1; 488 | newNodeCount += path.size() - 1; 489 | } 490 | outputBuffer.add(new BufferedOutputToken(null, path.get(0), startNode, pathEndNode)); 491 | } 492 | 493 | // We must do the original tokens last, else the offsets "go backwards": 494 | if (keepOrig) { 495 | BufferedInputToken token = lookahead.get(lookaheadNextRead); 496 | int inputEndNode; 497 | if (matchInputLength == 1) { 498 | // Single token matched input, so there are no intermediate nodes: 499 | inputEndNode = endNode; 500 | } else { 501 | inputEndNode = nextNodeOut + newNodeCount + 1; 502 | } 503 | 504 | //System.out.println(" keepOrig first token: " + token.term); 505 | 506 | outputBuffer.add(new BufferedOutputToken(token.state, token.term.toString(), startNode, inputEndNode)); 507 | } 508 | 509 | nextNodeOut = endNode; 510 | 511 | // Do full side-path for each syn output: 512 | for(int pathID=0;pathID<paths.size();pathID++) { 513 | List<String> path = paths.get(pathID); 514 | if (path.size() > 1) { 515 | int lastNode = outputBuffer.get(pathID).endNode; 516 | for(int i=1;i<path.size()-1;i++) { 517 | outputBuffer.add(new BufferedOutputToken(null, path.get(i), lastNode, lastNode+1)); 518 | lastNode++; 519 | } 520 | outputBuffer.add(new BufferedOutputToken(null, path.get(path.size()-1), lastNode, endNode)); 521 | } 522 | } 523 | 524 | if (keepOrig && matchInputLength > 1) { 525 | // Do full "side path" with the original tokens: 526 | int lastNode = outputBuffer.get(paths.size()).endNode; 527 | for(int i=1;i<matchInputLength-1;i++) { 528 | outputBuffer.add(new BufferedOutputToken(lookahead.get(lookaheadNextRead + i).state, lookahead.get(lookaheadNextRead + i).term.toString(), lastNode, lastNode + 1)); 529 | lastNode++; 530 | } 531 | 532 | // Last token points to our endNode: 533 | outputBuffer.add(new BufferedOutputToken(lookahead.get(lookaheadNextRead + matchInputLength - 1).state, lookahead.get(lookaheadNextRead + matchInputLength - 1).term.toString(), lastNode, endNode)); 534 | } 535 | } 536 | 537 | /** Buffers the current input token into the lookahead buffer. */ 538 | private void capture() { 539 | assert liveToken; 540 | liveToken = false; 541 | BufferedInputToken token = lookahead.get(lookaheadNextWrite); 542 | lookaheadNextWrite++; 543 | 544 | token.state = captureState(); 545 | token.startOffset = offsetAtt.startOffset(); 546 | token.endOffset = offsetAtt.endOffset(); 547 | assert token.term.length() == 0; 548 | token.term.append(termAtt.buffer(), 0, termAtt.length()); 549 | 550 | captureCount++; 551 | maxLookaheadUsed = Math.max(maxLookaheadUsed, lookahead.getBufferSize()); 552 | } 553 | 554 | @Override 555 | public void reset() throws IOException { 556 | super.reset(); 557 | lookahead.reset(); 558 | lookaheadNextWrite = 0; 559 | lookaheadNextRead = 0; 560 | finished = false; 561 | liveToken = false; 562 | outputBuffer.clear(); 563 | maxLookaheadUsed = 0; 564 | } 565 | 566 | void update(SynonymMap synonymMap) { 567 | this.synonyms = synonymMap; 568 | this.fst = synonyms.fst; 569 | if (fst == null) { 570 | throw new IllegalArgumentException("fst must be non-null"); 571 | } 572 | this.fstReader = fst.getBytesReader(); 573 | scratchArc = new FST.Arc<>(); 574 | } 575 | 576 | // for testing 577 | int getCaptureCount() { 578 | return captureCount; 579 | } 580 | 581 | // for testing 582 | int getMaxLookaheadUsed() { 583 | return maxLookaheadUsed; 584 | } 585 | } 586 |
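The sketch below is illustrative only and is not part of the repository: it drives DynamicSynonymGraphFilter with a hand-built SynonymMap to show how a multi-token synonym comes out as a side path parallel to the original token. The demo class name and the sample rule are invented; everything else uses standard Lucene APIs.

    // GraphFilterDemo.java -- hypothetical demo, not in the plugin source.
    package com.bellszhu.elasticsearch.plugin.synonym.analysis;

    import java.io.StringReader;

    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.synonym.SynonymMap;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
    import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
    import org.apache.lucene.util.CharsRef;
    import org.apache.lucene.util.CharsRefBuilder;

    public class GraphFilterDemo {
        public static void main(String[] args) throws Exception {
            // Rule: "dns" -> "domain name system"; join() glues the output
            // tokens with SynonymMap.WORD_SEPARATOR, and includeOrig=true
            // keeps the original token stacked next to the expansion:
            SynonymMap.Builder builder = new SynonymMap.Builder(true);
            CharsRef output = SynonymMap.Builder.join(
                    new String[]{"domain", "name", "system"}, new CharsRefBuilder());
            builder.add(new CharsRef("dns"), output, true);

            WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
            tokenizer.setReader(new StringReader("the dns server"));
            DynamicSynonymGraphFilter filter =
                    new DynamicSynonymGraphFilter(tokenizer, builder.build(), true);

            CharTermAttribute term = filter.addAttribute(CharTermAttribute.class);
            PositionIncrementAttribute posInc = filter.addAttribute(PositionIncrementAttribute.class);
            PositionLengthAttribute posLen = filter.addAttribute(PositionLengthAttribute.class);

            filter.reset();
            while (filter.incrementToken()) {
                // The kept original "dns" is emitted with posLen=3, spanning
                // the three-token side path built by bufferOutputTokens():
                System.out.println(term + " posInc=" + posInc.getPositionIncrement()
                        + " posLen=" + posLen.getPositionLength());
            }
            filter.end();
            filter.close();
        }
    }

-------------------------------------------------------------------------------- /src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymFilter.java: -------------------------------------------------------------------------------- 1 | package com.bellszhu.elasticsearch.plugin.synonym.analysis; 2 | 3 | /* 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License.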
18 | */ 19 | 20 | import java.io.IOException; 21 | import java.util.Arrays; 22 | 23 | import org.apache.lucene.analysis.TokenStream; 24 | import org.apache.lucene.analysis.synonym.SynonymMap; 25 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 26 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 27 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; 28 | import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; 29 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 30 | import org.apache.lucene.store.ByteArrayDataInput; 31 | import org.apache.lucene.util.ArrayUtil; 32 | import org.apache.lucene.util.AttributeSource; 33 | import org.apache.lucene.util.BytesRef; 34 | import org.apache.lucene.util.CharsRef; 35 | import org.apache.lucene.util.CharsRefBuilder; 36 | import org.apache.lucene.util.RamUsageEstimator; 37 | import org.apache.lucene.util.fst.FST; 38 | 39 | /** 40 | * Matches single or multi word synonyms in a token stream. This token stream 41 | * cannot properly handle position increments != 1, ie, you should place this 42 | * filter before filtering out stop words. 43 | * 44 | * <p> 45 | * Note that with the current implementation, parsing is greedy, so whenever 46 | * multiple parses would apply, the rule starting the earliest and parsing the 47 | * most tokens wins. For example if you have these rules: 48 | * 49 | * <pre> 50 |  *   a -> x 51 |  *   a b -> y 52 |  *   b c d -> z 53 |  * </pre> 54 | * <p> 55 | * Then input <code>a b c d e</code> parses to <code>y b c 56 | * d</code>, ie the 2nd rule "wins" because it started earliest and matched the 57 | * most input tokens of other rules starting at that point. 58 | * </p> 59 | * 60 | * <p> 61 | * A future improvement to this filter could allow non-greedy parsing, such that 62 | * the 3rd rule would win, and also separately allow multiple parses, such that 63 | * all 3 rules would match, perhaps even on a rule by rule basis. 64 | * </p> 65 | * 66 | * <p> 67 | * <b>NOTE</b>: when a match occurs, the output tokens associated with the 68 | * matching rule are "stacked" on top of the input stream (if the rule had 69 | * <code>keepOrig=true</code>) and also on top of another matched rule's output 70 | * tokens. This is not a correct solution, as really the output should be an 71 | * arbitrary graph/lattice. For example, with the above match, you would expect 72 | * an exact <code>PhraseQuery</code> <code>"y b 73 | * c"</code> to match the parsed tokens, but it will fail to do so. This 74 | * limitation is necessary because Lucene's TokenStream (and index) cannot yet 75 | * represent an arbitrary graph. 76 | * </p> 77 | * 78 | * <p> 79 | * <b>NOTE</b>: If multiple incoming tokens arrive on the same position, only 80 | * the first token at that position is used for parsing. Subsequent tokens 81 | * simply pass through and are not parsed. A future improvement would be to 82 | * allow these tokens to also be matched. 83 | * </p> 84 | */ 85 | 86 | // TODO: maybe we should resolve token -> wordID then run 87 | // FST on wordIDs, for better perf? 88 | 89 | // TODO: a more efficient approach would be Aho/Corasick's 90 | // algorithm 91 | // http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm 92 | // It improves over the current approach here 93 | // because it does not fully re-start matching at every 94 | // token. For example if one pattern is "a b c x" 95 | // and another is "b c d" and the input is "a b c d", on 96 | // trying to parse "a b c x" but failing when you got to x, 97 | // rather than starting over again you really should 98 | // immediately recognize that "b c d" matches at the next 99 | // input. I suspect this won't matter that much in 100 | // practice, but it's possible on some set of synonyms it 101 | // will. We'd have to modify Aho/Corasick to enforce our 102 | // conflict resolving (eg greedy matching) because that algo 103 | // finds all matches. This really amounts to adding a .* 104 | // closure to the FST and then determinizing it. 105 | // 106 | // Another possible solution is described at 107 | // http://www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps 108 | 109 | public final class DynamicSynonymFilter extends AbsSynonymFilter { 110 | 111 | private static final String TYPE_SYNONYM = "SYNONYM"; 112 | private final boolean ignoreCase; 113 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); 114 | private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); 115 | private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class); 116 | 117 | // TODO: we should set PositionLengthAttr too... 118 | private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); 119 | private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); 120 | private final ByteArrayDataInput bytesReader = new ByteArrayDataInput(); 121 | private final BytesRef scratchBytes = new BytesRef(); 122 | private final CharsRefBuilder scratchChars = new CharsRefBuilder(); 123 | private SynonymMap synonyms; 124 | private int rollBufferSize; 125 | 126 | private int captureCount; 127 | // How many future input tokens have already been matched 128 | // to a synonym; because the matching is "greedy" we don't 129 | // try to do any more matching for such tokens: 130 | private int inputSkipCount; 131 | 132 | // Rolling buffer, holding pending input tokens we had to 133 | // clone because we needed to look ahead, indexed by 134 | // position: 135 | private PendingInput[] futureInputs; 136 | // Rolling buffer, holding stack of pending synonym 137 | // outputs, indexed by position: 138 | private PendingOutputs[] futureOutputs; 139 | 140 | // Where (in rolling buffers) to write next input saved state: 141 | private int nextWrite; 142 | 143 | // Where (in rolling buffers) to read next input saved state: 144 | private int nextRead; 145 | 146 | // True once we've read last token 147 | private boolean finished; 148 | 149 | private FST.Arc<BytesRef> scratchArc; 150 | 151 | private FST<BytesRef> fst; 152 | 153 | private FST.BytesReader fstReader; 154 | /* 155 | * This is the core of this TokenFilter: it locates the synonym matches and 156 | * buffers up the results into futureInputs/Outputs. 157 | * 158 | * NOTE: this calls input.incrementToken and does not capture the state if 159 | * no further tokens were checked.
So caller must then forward state to our 160 | * caller, or capture: 161 | */ 162 | private int lastStartOffset; 163 | private int lastEndOffset; 164 | 165 | /** 166 | * @param input input tokenstream 167 | * @param synonyms synonym map 168 | * @param ignoreCase case-folds input for matching with 169 | * {@link Character#toLowerCase(int)}. Note, if you set this to 170 | * true, it's your responsibility to lowercase the input entries 171 | * when you create the {@link SynonymMap} 172 | */ 173 | DynamicSynonymFilter(TokenStream input, SynonymMap synonyms, 174 | boolean ignoreCase) { 175 | super(input); 176 | this.ignoreCase = ignoreCase; 177 | update(synonyms); 178 | } 179 | 180 | private void capture() { 181 | captureCount++; 182 | final PendingInput input = futureInputs[nextWrite]; 183 | 184 | input.state = captureState(); 185 | input.consumed = false; 186 | input.term.copyChars(termAtt.buffer(), 0, termAtt.length()); 187 | 188 | nextWrite = rollIncr(nextWrite); 189 | 190 | // Buffer head should never catch up to tail: 191 | assert nextWrite != nextRead; 192 | } 193 | 194 | private void parse() throws IOException { 195 | 196 | assert inputSkipCount == 0; 197 | 198 | int curNextRead = nextRead; 199 | 200 | // Holds the longest match we've seen so far: 201 | BytesRef matchOutput = null; 202 | int matchInputLength = 0; 203 | int matchEndOffset = -1; 204 | 205 | BytesRef pendingOutput = fst.outputs.getNoOutput(); 206 | fst.getFirstArc(scratchArc); 207 | 208 | assert scratchArc.output() == fst.outputs.getNoOutput(); 209 | 210 | int tokenCount = 0; 211 | 212 | byToken: 213 | while (true) { 214 | 215 | // Pull next token's chars: 216 | final char[] buffer; 217 | final int bufferLen; 218 | 219 | int inputEndOffset = 0; 220 | 221 | if (curNextRead == nextWrite) { 222 | 223 | // We used up our lookahead buffer of input tokens 224 | // -- pull next real input token: 225 | if (finished) { 226 | break; 227 | } else { 228 | assert futureInputs[nextWrite].consumed; 229 | // Not correct: a syn match whose output is longer 230 | // than its input can set future inputs keepOrig 231 | // to true: 232 | if (input.incrementToken()) { 233 | buffer = termAtt.buffer(); 234 | bufferLen = termAtt.length(); 235 | final PendingInput input = futureInputs[nextWrite]; 236 | lastStartOffset = input.startOffset = offsetAtt 237 | .startOffset(); 238 | lastEndOffset = input.endOffset = offsetAtt.endOffset(); 239 | inputEndOffset = input.endOffset; 240 | if (nextRead != nextWrite) { 241 | capture(); 242 | } else { 243 | input.consumed = false; 244 | } 245 | 246 | } else { 247 | // No more input tokens 248 | finished = true; 249 | break; 250 | } 251 | } 252 | } else { 253 | // Still in our lookahead 254 | buffer = futureInputs[curNextRead].term.chars(); 255 | bufferLen = futureInputs[curNextRead].term.length(); 256 | inputEndOffset = futureInputs[curNextRead].endOffset; 257 | } 258 | 259 | tokenCount++; 260 | 261 | // Run each char in this token through the FST: 262 | int bufUpto = 0; 263 | while (bufUpto < bufferLen) { 264 | final int codePoint = Character.codePointAt(buffer, bufUpto, 265 | bufferLen); 266 | if (fst.findTargetArc( 267 | ignoreCase ?
Character.toLowerCase(codePoint) 268 | : codePoint, scratchArc, scratchArc, fstReader) == null) { 269 | break byToken; 270 | } 271 | 272 | // Accum the output 273 | pendingOutput = fst.outputs.add(pendingOutput, 274 | scratchArc.output()); 275 | bufUpto += Character.charCount(codePoint); 276 | } 277 | 278 | // OK, entire token matched; now see if this is a final 279 | // state: 280 | if (scratchArc.isFinal()) { 281 | matchOutput = fst.outputs.add(pendingOutput, 282 | scratchArc.nextFinalOutput()); 283 | matchInputLength = tokenCount; 284 | matchEndOffset = inputEndOffset; 285 | } 286 | 287 | // See if the FST wants to continue matching (ie, needs to 288 | // see the next input token): 289 | if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, 290 | scratchArc, fstReader) == null) { 291 | // No further rules can match here; we're done 292 | // searching for matching rules starting at the 293 | // current input position. 294 | break; 295 | } else { 296 | // More matching is possible -- accum the output (if 297 | // any) of the WORD_SEP arc: 298 | pendingOutput = fst.outputs.add(pendingOutput, 299 | scratchArc.output()); 300 | if (nextRead == nextWrite) { 301 | capture(); 302 | } 303 | } 304 | 305 | curNextRead = rollIncr(curNextRead); 306 | } 307 | 308 | if (nextRead == nextWrite && !finished) { 309 | nextWrite = rollIncr(nextWrite); 310 | } 311 | 312 | if (matchOutput != null) { 313 | inputSkipCount = matchInputLength; 314 | addOutput(matchOutput, matchInputLength, matchEndOffset); 315 | } else if (nextRead != nextWrite) { 316 | // Even though we had no match here, we set to 1 317 | // because we need to skip current input token before 318 | // trying to match again: 319 | inputSkipCount = 1; 320 | } else { 321 | assert finished; 322 | } 323 | 324 | } 325 | 326 | // Interleaves all output tokens onto the futureOutputs: 327 | private void addOutput(BytesRef bytes, int matchInputLength, 328 | int matchEndOffset) { 329 | bytesReader.reset(bytes.bytes, bytes.offset, bytes.length); 330 | 331 | final int code = bytesReader.readVInt(); 332 | final boolean keepOrig = (code & 0x1) == 0; 333 | final int count = code >>> 1; 334 | for (int outputIDX = 0; outputIDX < count; outputIDX++) { 335 | synonyms.words.get(bytesReader.readVInt(), scratchBytes); 336 | scratchChars.copyUTF8Bytes(scratchBytes); 337 | int lastStart = 0; 338 | final int chEnd = lastStart + scratchChars.length(); 339 | int outputUpto = nextRead; 340 | for (int chIDX = lastStart; chIDX <= chEnd; chIDX++) { 341 | if (chIDX == chEnd 342 | || scratchChars.charAt(chIDX) == SynonymMap.WORD_SEPARATOR) { 343 | final int outputLen = chIDX - lastStart; 344 | // Caller is not allowed to have empty string in 345 | // the output: 346 | assert outputLen > 0 : "output contains empty string: " 347 | + scratchChars; 348 | final int endOffset; 349 | final int posLen; 350 | if (chIDX == chEnd && lastStart == 0) { 351 | // This rule had a single output token, so, we set 352 | // this output's endOffset to the current 353 | // endOffset (ie, endOffset of the last input 354 | // token it matched): 355 | endOffset = matchEndOffset; 356 | posLen = keepOrig ? 
matchInputLength : 1; 357 | } else { 358 | // This rule has more than one output token; we 359 | // can't pick any particular endOffset for this 360 | // case, so, we inherit the endOffset for the 361 | // input token which this output overlaps: 362 | endOffset = -1; 363 | posLen = 1; 364 | } 365 | futureOutputs[outputUpto].add(scratchChars.chars(), 366 | lastStart, outputLen, endOffset, posLen); 367 | lastStart = 1 + chIDX; 368 | outputUpto = rollIncr(outputUpto); 369 | assert futureOutputs[outputUpto].posIncr == 1 : "outputUpto=" 370 | + outputUpto + " vs nextWrite=" + nextWrite; 371 | } 372 | } 373 | } 374 | 375 | int upto = nextRead; 376 | for (int idx = 0; idx < matchInputLength; idx++) { 377 | futureInputs[upto].keepOrig |= keepOrig; 378 | futureInputs[upto].matched = true; 379 | upto = rollIncr(upto); 380 | } 381 | } 382 | 383 | // ++ mod rollBufferSize 384 | private int rollIncr(int count) { 385 | count++; 386 | if (count == rollBufferSize) { 387 | return 0; 388 | } else { 389 | return count; 390 | } 391 | } 392 | 393 | @Override 394 | public boolean incrementToken() throws IOException { 395 | 396 | while (true) { 397 | 398 | // First play back any buffered future inputs/outputs 399 | // w/o running parsing again: 400 | while (inputSkipCount != 0) { 401 | 402 | // At each position, we first output the original 403 | // token 404 | 405 | // TODO: maybe just a PendingState class, holding 406 | // both input & outputs? 407 | final PendingInput input = futureInputs[nextRead]; 408 | final PendingOutputs outputs = futureOutputs[nextRead]; 409 | 410 | if (!input.consumed && (input.keepOrig || !input.matched)) { 411 | if (input.state != null) { 412 | // Return a previously saved token (because we 413 | // had to lookahead): 414 | restoreState(input.state); 415 | } else { 416 | // Pass-through case: return token we just pulled 417 | // but didn't capture: 418 | assert inputSkipCount == 1 : "inputSkipCount=" 419 | + inputSkipCount + " nextRead=" + nextRead; 420 | } 421 | input.reset(); 422 | if (outputs.count > 0) { 423 | outputs.posIncr = 0; 424 | } else { 425 | nextRead = rollIncr(nextRead); 426 | inputSkipCount--; 427 | } 428 | return true; 429 | } else if (outputs.upto < outputs.count) { 430 | // Still have pending outputs to replay at this 431 | // position 432 | input.reset(); 433 | final int posIncr = outputs.posIncr; 434 | final CharsRef output = outputs.pullNext(); 435 | clearAttributes(); 436 | termAtt.copyBuffer(output.chars, output.offset, 437 | output.length); 438 | typeAtt.setType(TYPE_SYNONYM); 439 | int endOffset = outputs.getLastEndOffset(); 440 | if (endOffset == -1) { 441 | endOffset = input.endOffset; 442 | } 443 | offsetAtt.setOffset(input.startOffset, endOffset); 444 | posIncrAtt.setPositionIncrement(posIncr); 445 | posLenAtt.setPositionLength(outputs.getLastPosLength()); 446 | if (outputs.count == 0) { 447 | // Done with the buffered input and all outputs at 448 | // this position 449 | nextRead = rollIncr(nextRead); 450 | inputSkipCount--; 451 | } 452 | return true; 453 | } else { 454 | // Done with the buffered input and all outputs at 455 | // this position 456 | input.reset(); 457 | nextRead = rollIncr(nextRead); 458 | inputSkipCount--; 459 | } 460 | } 461 | 462 | if (finished && nextRead == nextWrite) { 463 | // End case: if any output syns went beyond end of 464 | // input stream, enumerate them now: 465 | final PendingOutputs outputs = futureOutputs[nextRead]; 466 | if (outputs.upto < outputs.count) { 467 | final int posIncr = outputs.posIncr; 468 | final 
CharsRef output = outputs.pullNext(); 469 | futureInputs[nextRead].reset(); 470 | if (outputs.count == 0) { 471 | nextWrite = nextRead = rollIncr(nextRead); 472 | } 473 | clearAttributes(); 474 | // Keep offset from last input token: 475 | offsetAtt.setOffset(lastStartOffset, lastEndOffset); 476 | termAtt.copyBuffer(output.chars, output.offset, 477 | output.length); 478 | typeAtt.setType(TYPE_SYNONYM); 479 | posIncrAtt.setPositionIncrement(posIncr); 480 | return true; 481 | } else { 482 | return false; 483 | } 484 | } 485 | 486 | // Find new synonym matches: 487 | parse(); 488 | } 489 | } 490 | 491 | @Override 492 | public void reset() throws IOException { 493 | 494 | super.reset(); 495 | captureCount = 0; 496 | finished = false; 497 | inputSkipCount = 0; 498 | nextRead = nextWrite = 0; 499 | 500 | // In normal usage these resets would not be needed, 501 | // since they reset-as-they-are-consumed, but the app 502 | // may not consume all input tokens (or we might hit an 503 | // exception), in which case we have leftover state 504 | // here: 505 | for (PendingInput input : futureInputs) { 506 | input.reset(); 507 | } 508 | for (PendingOutputs output : futureOutputs) { 509 | output.reset(); 510 | } 511 | } 512 | 513 | void update(SynonymMap synonymMap) { 514 | this.synonyms = synonymMap; 515 | this.fst = synonyms.fst; 516 | if (fst == null) { 517 | throw new IllegalArgumentException("fst must be non-null"); 518 | } 519 | this.fstReader = fst.getBytesReader(); 520 | 521 | // Must be 1+ so that when roll buffer is at full 522 | // lookahead we can distinguish this full buffer from 523 | // the empty buffer: 524 | rollBufferSize = 1 + synonyms.maxHorizontalContext; 525 | 526 | futureInputs = new PendingInput[rollBufferSize]; 527 | futureOutputs = new PendingOutputs[rollBufferSize]; 528 | for (int pos = 0; pos < rollBufferSize; pos++) { 529 | futureInputs[pos] = new PendingInput(); 530 | futureOutputs[pos] = new PendingOutputs(); 531 | } 532 | 533 | scratchArc = new FST.Arc<>(); 534 | } 535 | 536 | // Hold all buffered (read ahead) stacked input tokens for 537 | // a future position. 
When multiple tokens are at the 538 | // same position, we only store (and match against) the 539 | // term for the first token at the position, but capture 540 | // state for (and enumerate) all other tokens at this 541 | // position: 542 | private static class PendingInput { 543 | final CharsRefBuilder term = new CharsRefBuilder(); 544 | AttributeSource.State state; 545 | boolean keepOrig; 546 | boolean matched; 547 | boolean consumed = true; 548 | int startOffset; 549 | int endOffset; 550 | 551 | void reset() { 552 | state = null; 553 | consumed = true; 554 | keepOrig = false; 555 | matched = false; 556 | } 557 | } 558 | 559 | // Holds pending output synonyms for one future position: 560 | private static class PendingOutputs { 561 | CharsRefBuilder[] outputs; 562 | int[] endOffsets; 563 | int[] posLengths; 564 | int upto; 565 | int count; 566 | int posIncr = 1; 567 | int lastEndOffset; 568 | int lastPosLength; 569 | 570 | PendingOutputs() { 571 | outputs = new CharsRefBuilder[1]; 572 | endOffsets = new int[1]; 573 | posLengths = new int[1]; 574 | } 575 | 576 | void reset() { 577 | upto = count = 0; 578 | posIncr = 1; 579 | } 580 | 581 | CharsRef pullNext() { 582 | assert upto < count; 583 | lastEndOffset = endOffsets[upto]; 584 | lastPosLength = posLengths[upto]; 585 | final CharsRefBuilder result = outputs[upto++]; 586 | posIncr = 0; 587 | if (upto == count) { 588 | reset(); 589 | } 590 | return result.get(); 591 | } 592 | 593 | int getLastEndOffset() { 594 | return lastEndOffset; 595 | } 596 | 597 | int getLastPosLength() { 598 | return lastPosLength; 599 | } 600 | 601 | void add(char[] output, int offset, int len, int endOffset, 602 | int posLength) { 603 | if (count == outputs.length) { 604 | outputs = Arrays.copyOf(outputs, ArrayUtil.oversize(1 + count, 605 | RamUsageEstimator.NUM_BYTES_OBJECT_REF)); 606 | } 607 | if (count == endOffsets.length) { 608 | final int[] next = new int[ArrayUtil.oversize(1 + count, 609 | Integer.BYTES)]; 610 | System.arraycopy(endOffsets, 0, next, 0, count); 611 | endOffsets = next; 612 | } 613 | if (count == posLengths.length) { 614 | final int[] next = new int[ArrayUtil.oversize(1 + count, 615 | Integer.BYTES)]; 616 | System.arraycopy(posLengths, 0, next, 0, count); 617 | posLengths = next; 618 | } 619 | if (outputs[count] == null) { 620 | outputs[count] = new CharsRefBuilder(); 621 | } 622 | outputs[count].copyChars(output, offset, len); 623 | // endOffset can be -1, in which case we should simply 624 | // use the endOffset of the input token, or X >= 0, in 625 | // which case we use X as the endOffset for this output 626 | endOffsets[count] = endOffset; 627 | posLengths[count] = posLength; 628 | count++; 629 | } 630 | } 631 | 632 | } 633 | --------------------------------------------------------------------------------
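
The sketch below is illustrative only and is not part of the repository: it shows the point of the package-private update(SynonymMap) hook seen above, namely that a freshly built map can be swapped into a live filter in place, which is what the plugin's factory classes do on their reload schedule when a local or remote synonym file changes. The demo class name and the sample rules are invented; the class must sit in this package because the constructor and update() are package-private.

    // HotReloadDemo.java -- hypothetical demo, not in the plugin source.
    package com.bellszhu.elasticsearch.plugin.synonym.analysis;

    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.synonym.SynonymMap;
    import org.apache.lucene.util.CharsRef;

    public class HotReloadDemo {
        public static void main(String[] args) throws Exception {
            // Initial map, as if parsed from the synonym file at startup:
            SynonymMap.Builder v1 = new SynonymMap.Builder(true);
            v1.add(new CharsRef("laptop"), new CharsRef("notebook"), true);

            // The token filter is created once for the analysis chain:
            DynamicSynonymFilter filter = new DynamicSynonymFilter(
                    new WhitespaceTokenizer(), v1.build(), true);

            // Later, when the monitor detects a changed file, a new map is
            // built and pushed into the same filter instance; update() swaps
            // the FST and re-sizes the rolling buffers, so no analyzer or
            // index settings need to be rebuilt:
            SynonymMap.Builder v2 = new SynonymMap.Builder(true);
            v2.add(new CharsRef("laptop"), new CharsRef("notebook"), true);
            v2.add(new CharsRef("laptop"), new CharsRef("ultrabook"), true);
            filter.update(v2.build());
        }
    }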