├── .gitignore ├── LICENSE ├── README.md ├── docker-compose.yml ├── pom.xml └── src ├── main ├── assemblies │ └── plugin.xml ├── java │ └── org │ │ └── elasticsearch │ │ ├── index │ │ └── analysis │ │ │ ├── ThaiChub2Analyzer.java │ │ │ ├── ThaiChub2AnalyzerProvider.java │ │ │ ├── ThaiChub2Tokenizer.java │ │ │ ├── ThaiChub2TokenizerFactory.java │ │ │ └── Veer66Wordcut.java │ │ └── plugin │ │ └── analysis │ │ └── thaichub2 │ │ └── AnalysisThaiChub2Plugin.java ├── plugin-metadata │ └── plugin-security.policy └── resources │ ├── dictionary.txt │ └── plugin-descriptor.properties └── test └── java └── org └── elasticseach └── index └── analysis └── ThaiChub2TokenizerTest.java /.gitignore: -------------------------------------------------------------------------------- 1 | /*.iml 2 | .idea/ 3 | ## File-based project format: 4 | *.ipr 5 | *.iws 6 | ## Plugin-specific files: 7 | # IntelliJ 8 | out/ 9 | # mpeltonen/sbt-idea plugin 10 | .idea_modules/ 11 | # JIRA plugin 12 | atlassian-ide-plugin.xml 13 | # Crashlytics plugin (for Android Studio and IntelliJ) 14 | com_crashlytics_export_strings.xml 15 | 16 | #OSX 17 | 18 | .DS_Store 19 | .AppleDouble 20 | .LSOverride 21 | # Icon must end with two \r 22 | Icon 23 | # Thumbnails 24 | ._* 25 | # Files that might appear on external disk 26 | .Spotlight-V100 27 | .Trashes 28 | # Directories potentially created on remote AFP share 29 | .AppleDB 30 | .AppleDesktop 31 | Network Trash Folder 32 | Temporary Items 33 | .apdisk 34 | 35 | *.log.* 36 | data 37 | *.log 38 | 39 | target/ 40 | /bin/ 41 | .env 42 | *.iml 43 | .idea -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 
9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. 
For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Thai Analysis Plugin for Elasticsearch 2 | 3 | The Thaichub2 (thai-chub-chub) Analysis Plugin integrates the Thai word segmentation modules into Elasticsearch. 
4 | 5 | ## Installation on Elasticsearch 6 | 7 | - Download a release zip from the [release page](https://github.com/tlefsad/elasticsearch-analysis-thaichub2/releases) matching your ES version (currently only version 7.6.2 is supported). 8 | 9 | - Install it with this command, replacing the path with the location of the downloaded zip 10 | 11 | ```sh 12 | ./bin/elasticsearch-plugin install --batch file:///path/to/analysis-thaichub2-7.6.2.zip 13 | ``` 14 | 15 | - Restart Elasticsearch 16 | 17 | ## Sample Usage 18 | 19 | Sample request 20 | 21 | ``` 22 | POST _analyze 23 | { 24 | "analyzer": "thaichub2_analyzer", 25 | "text": "นมตรามะลิ" 26 | } 27 | ``` 28 | 29 | Result 30 | 31 | ``` 32 | { 33 | "tokens" : [ 34 | { 35 | "token" : "นม", 36 | "start_offset" : 0, 37 | "end_offset" : 2, 38 | "type" : "word", 39 | "position" : 0 40 | }, 41 | { 42 | "token" : "ตรา", 43 | "start_offset" : 2, 44 | "end_offset" : 5, 45 | "type" : "word", 46 | "position" : 1 47 | }, 48 | { 49 | "token" : "มะลิ", 50 | "start_offset" : 5, 51 | "end_offset" : 9, 52 | "type" : "word", 53 | "position" : 2 54 | } 55 | ] 56 | } 57 | ``` 58 | 59 | ## Thanks 60 | - [Vee Satayamas](https://github.com/veer66) for the Thai word segmentation library. 
61 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | elasticsearch: 4 | image: docker.elastic.co/elasticsearch/elasticsearch:7.6.2 5 | container_name: elasticsearch 6 | environment: 7 | - discovery.type=single-node 8 | ports: 9 | - 9200:9200 10 | - 9300:9300 11 | volumes: 12 | - .:/elasticsearch-analysis-thaichub2 13 | command: sh -c './bin/elasticsearch-plugin install --batch file:///elasticsearch-analysis-thaichub2/target/releases/analysis-thaichub2-7.6.2.zip && exec /usr/local/bin/docker-entrypoint.sh elasticsearch' 14 | kibana: 15 | image: docker.elastic.co/kibana/kibana:7.6.2 16 | container_name: kibana 17 | depends_on: 18 | - elasticsearch 19 | ports: 20 | - 5601:5601 21 | links: 22 | - elasticsearch:elasticsearch 23 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | org.elasticsearch 8 | analysis-thaichub2 9 | 7.6.2 10 | http://maven.apache.org 11 | jar 12 | 13 | 7.6.2 14 | UTF-8 15 | 13 16 | 13 17 | analysis-thaichub2 18 | org.elasticsearch.plugin.analysis.thaichub2.AnalysisThaiChub2Plugin 19 | true 20 | 21 | 22 | 23 | oss.sonatype.org 24 | https://oss.sonatype.org/content/repositories/snapshots 25 | 26 | 27 | oss.sonatype.org 28 | https://oss.sonatype.org/service/local/staging/deploy/maven2/ 29 | 30 | 31 | 32 | 33 | clojars 34 | Clojars repository 35 | https://clojars.org/repo 36 | 37 | 38 | oss.sonatype.org 39 | OSS Sonatype 40 | 41 | true 42 | 43 | 44 | true 45 | 46 | http://oss.sonatype.org/content/repositories/releases/ 47 | 48 | 49 | 50 | 51 | 52 | org.elasticsearch 53 | elasticsearch 54 | ${elasticsearch.version} 55 | compile 56 | 57 | 58 | org.testng 59 | testng 60 | 6.3.1 61 | test 62 | 63 | 64 | org.hamcrest 65 | hamcrest-core 66 | 
1.3.RC2 67 | test 68 | 69 | 70 | org.hamcrest 71 | hamcrest-library 72 | 1.3.RC2 73 | test 74 | 75 | 76 | rocks.veer66 77 | prefix-tree-x 78 | 1.1-SNAPSHOT 79 | 80 | 81 | rocks.veer66 82 | wordcut-x 83 | 1.1-SNAPSHOT 84 | 85 | 86 | junit 87 | junit 88 | 4.11 89 | test 90 | 91 | 92 | commons-io 93 | commons-io 94 | 2.5 95 | 96 | 97 | 98 | 99 | 100 | org.apache.maven.plugins 101 | maven-compiler-plugin 102 | 3.5.1 103 | 104 | ${maven.compiler.target} 105 | ${maven.compiler.target} 106 | 107 | 108 | 109 | org.apache.maven.plugins 110 | maven-surefire-plugin 111 | 2.19.1 112 | 113 | 114 | org.apache.maven.plugins 115 | maven-source-plugin 116 | 2.1.2 117 | 118 | 119 | attach-sources 120 | 121 | jar 122 | 123 | 124 | 125 | 126 | 127 | maven-assembly-plugin 128 | 2.3 129 | 130 | false 131 | ${project.build.directory}/releases/ 132 | 133 | ${basedir}/src/main/assemblies/plugin.xml 134 | 135 | 136 | 137 | 138 | package 139 | 140 | single 141 | 142 | 143 | 144 | 145 | 146 | org.apache.maven.plugins 147 | maven-jar-plugin 148 | 3.1.0 149 | 150 | 151 | 152 | 153 | 154 | release 155 | 156 | 157 | 158 | org.sonatype.plugins 159 | nexus-staging-maven-plugin 160 | 1.6.3 161 | true 162 | 163 | oss 164 | https://oss.sonatype.org/ 165 | true 166 | 167 | 168 | 169 | org.apache.maven.plugins 170 | maven-release-plugin 171 | 2.1 172 | 173 | true 174 | false 175 | release 176 | deploy 177 | 178 | 179 | 180 | org.apache.maven.plugins 181 | maven-compiler-plugin 182 | 3.5.1 183 | 184 | ${maven.compiler.target} 185 | ${maven.compiler.target} 186 | 187 | 188 | 189 | org.apache.maven.plugins 190 | maven-gpg-plugin 191 | 1.5 192 | 193 | 194 | sign-artifacts 195 | verify 196 | 197 | sign 198 | 199 | 200 | 201 | 202 | 203 | org.apache.maven.plugins 204 | maven-source-plugin 205 | 2.2.1 206 | 207 | 208 | attach-sources 209 | 210 | jar-no-fork 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | -------------------------------------------------------------------------------- 
/src/main/assemblies/plugin.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | plugin 4 | 5 | zip 6 | 7 | false 8 | 9 | 10 | 11 | ${project.basedir}/src/main/plugin-metadata 12 | 13 | plugin-security.policy 14 | 15 | / 16 | true 17 | 18 | 19 | 20 | 21 | ${project.basedir}/src/main/resources/plugin-descriptor.properties 22 | / 23 | true 24 | 25 | 26 | 27 | 28 | / 29 | true 30 | true 31 | 32 | org.elasticsearch:elasticsearch 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/ThaiChub2Analyzer.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import org.apache.lucene.analysis.Analyzer; 4 | 5 | public final class ThaiChub2Analyzer extends Analyzer{ 6 | public ThaiChub2Analyzer() { 7 | } 8 | 9 | @Override 10 | protected TokenStreamComponents createComponents(String fieldName) { 11 | return new TokenStreamComponents(new ThaiChub2Tokenizer()); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/ThaiChub2AnalyzerProvider.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import org.elasticsearch.common.settings.Settings; 4 | import org.elasticsearch.env.Environment; 5 | import org.elasticsearch.index.IndexSettings; 6 | 7 | public class ThaiChub2AnalyzerProvider extends AbstractIndexAnalyzerProvider{ 8 | 9 | private final ThaiChub2Analyzer analyzer; 10 | 11 | public ThaiChub2AnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { 12 | super(indexSettings, name, settings); 13 | this.analyzer = new ThaiChub2Analyzer(); 14 | } 15 | 16 | @Override 17 | public ThaiChub2Analyzer get() { 18 | return this.analyzer; 19 | } 20 | 
21 | } 22 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/ThaiChub2Tokenizer.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import rocks.veer66.Wordcut; 4 | 5 | import org.apache.commons.io.IOUtils; 6 | import org.apache.lucene.analysis.Tokenizer; 7 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 8 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; 9 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 10 | 11 | import java.io.IOException; 12 | import java.util.List; 13 | import java.util.concurrent.CopyOnWriteArrayList; 14 | 15 | 16 | public class ThaiChub2Tokenizer extends Tokenizer{ 17 | 18 | private static final int DEFAULT_BUFFER_SIZE = 256; 19 | 20 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); 21 | private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); 22 | private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); 23 | private String inputText; 24 | 25 | private final List pending = new CopyOnWriteArrayList<>(); 26 | private int offset = 0; 27 | private int pos = 0; 28 | 29 | private final Wordcut wordcut = Veer66Wordcut.wordcut; 30 | 31 | public ThaiChub2Tokenizer() { 32 | this(DEFAULT_BUFFER_SIZE); 33 | } 34 | 35 | public ThaiChub2Tokenizer(int bufferSize) { 36 | super(); 37 | termAtt.resizeBuffer(bufferSize); 38 | } 39 | 40 | private void tokenize() throws IOException { 41 | inputText = IOUtils.toString(input); 42 | final List result = this.wordcut.segmentToStrList(inputText); 43 | if (result != null) { 44 | pending.addAll(result); 45 | } 46 | } 47 | 48 | @Override 49 | public final boolean incrementToken() throws IOException { 50 | while (pending.size() == 0) { 51 | tokenize(); 52 | if (pending.size() == 0) { 53 
| return false; 54 | } 55 | } 56 | 57 | clearAttributes(); 58 | 59 | for (int i = pos; i < pending.size(); i++) { 60 | pos++; 61 | final String word = pending.get(i); 62 | if (accept(word)) { 63 | posIncrAtt.setPositionIncrement(1); 64 | final int length = word.length(); 65 | termAtt.copyBuffer(word.toCharArray(), 0, length); 66 | final int start = inputText.indexOf(word, offset); 67 | offsetAtt.setOffset(correctOffset(start), offset = correctOffset(start + length)); 68 | return true; 69 | } 70 | } 71 | return false; 72 | } 73 | 74 | private boolean accept(String word) { 75 | for (int i = 0; i < word.length(); i++) { 76 | char c = word.charAt(i); 77 | if (Character.isWhitespace(c)) { 78 | return false; 79 | } 80 | } 81 | return true; 82 | } 83 | 84 | @Override 85 | public final void end() { 86 | final int finalOffset = correctOffset(offset); 87 | offsetAtt.setOffset(finalOffset, finalOffset); 88 | } 89 | 90 | @Override 91 | public void reset() throws IOException { 92 | super.reset(); 93 | pos = 0; 94 | offset = 0; 95 | pending.clear(); 96 | } 97 | 98 | } 99 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/ThaiChub2TokenizerFactory.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import org.apache.lucene.analysis.Tokenizer; 4 | import org.elasticsearch.common.settings.Settings; 5 | import org.elasticsearch.env.Environment; 6 | import org.elasticsearch.index.IndexSettings; 7 | 8 | public class ThaiChub2TokenizerFactory extends AbstractTokenizerFactory{ 9 | 10 | public ThaiChub2TokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { 11 | super(indexSettings, settings, name); 12 | } 13 | 14 | @Override 15 | public Tokenizer create() { 16 | return new ThaiChub2Tokenizer(); 17 | } 18 | } 19 | 
-------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/Veer66Wordcut.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import rocks.veer66.Wordcut; 4 | 5 | import java.io.IOException; 6 | import java.net.URL; 7 | 8 | public class Veer66Wordcut { 9 | 10 | private static final URL url = Veer66Wordcut.class.getResource("/dictionary.txt"); 11 | public static Wordcut wordcut; 12 | 13 | static { 14 | try { 15 | wordcut = Wordcut.fromDixUrl(url); 16 | } catch (IOException e) { 17 | e.printStackTrace(); 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/plugin/analysis/thaichub2/AnalysisThaiChub2Plugin.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.plugin.analysis.thaichub2; 2 | 3 | import static java.util.Collections.singletonMap; 4 | 5 | import org.apache.lucene.analysis.Analyzer; 6 | import org.elasticsearch.index.analysis.AnalyzerProvider; 7 | import org.elasticsearch.index.analysis.TokenizerFactory; 8 | import org.elasticsearch.index.analysis.ThaiChub2AnalyzerProvider; 9 | import org.elasticsearch.index.analysis.ThaiChub2TokenizerFactory; 10 | import org.elasticsearch.indices.analysis.AnalysisModule; 11 | import org.elasticsearch.plugins.AnalysisPlugin; 12 | import org.elasticsearch.plugins.Plugin; 13 | 14 | import java.util.Map; 15 | 16 | public class AnalysisThaiChub2Plugin extends Plugin implements AnalysisPlugin{ 17 | 18 | @Override 19 | public Map>> getAnalyzers() { 20 | return singletonMap("thaichub2_analyzer", ThaiChub2AnalyzerProvider::new); 21 | } 22 | 23 | @Override 24 | public Map> getTokenizers() { 25 | return singletonMap("thaichub2_tokenizer", ThaiChub2TokenizerFactory::new); 26 | } 27 | 28 | } 29 | 
-------------------------------------------------------------------------------- /src/main/plugin-metadata/plugin-security.policy: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Elasticsearch under one or more contributor 3 | * license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright 5 | * ownership. Elasticsearch licenses this file to you under 6 | * the Apache License, Version 2.0 (the "License"); you may 7 | * not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | grant { 21 | permission java.security.AllPermission "*"; 22 | }; 23 | -------------------------------------------------------------------------------- /src/main/resources/plugin-descriptor.properties: -------------------------------------------------------------------------------- 1 | # Elasticsearch plugin descriptor file 2 | # This file must exist as 'plugin-descriptor.properties' at 3 | # the root directory of all plugins. 4 | # 5 | # A plugin can be 'site', 'jvm', or both. 
6 | # 7 | ### example site plugin for "foo": 8 | # 9 | # foo.zip <-- zip file for the plugin, with this structure: 10 | # _site/ <-- the contents that will be served 11 | # plugin-descriptor.properties <-- example contents below: 12 | # 13 | # site=true 14 | # description=My cool plugin 15 | # version=1.0 16 | # 17 | ### example jvm plugin for "foo" 18 | # 19 | # foo.zip <-- zip file for the plugin, with this structure: 20 | # .jar <-- classes, resources, dependencies 21 | # .jar <-- any number of jars 22 | # plugin-descriptor.properties <-- example contents below: 23 | # 24 | # jvm=true 25 | # classname=foo.bar.BazPlugin 26 | # description=My cool plugin 27 | # version=2.0.0-rc1 28 | # elasticsearch.version=2.0 29 | # java.version=1.7 30 | # 31 | ### mandatory elements for all plugins: 32 | # 33 | # 'description': simple summary of the plugin 34 | description=${project.description} 35 | # 36 | # 'version': plugin's version 37 | version=${project.version} 38 | # 39 | # 'name': the plugin name 40 | name=${elasticsearch.plugin.name} 41 | 42 | # 43 | # 'classname': the name of the class to load, fully-qualified. 44 | classname=${elasticsearch.plugin.classname} 45 | # 46 | # 'java.version' version of java the code is built against 47 | # use the system property java.specification.version 48 | # version string must be a sequence of nonnegative decimal integers 49 | # separated by "."'s and may have leading zeros 50 | java.version=${maven.compiler.target} 51 | # 52 | # 'elasticsearch.version' version of elasticsearch compiled against 53 | # You will have to release a new version of the plugin for each new 54 | # elasticsearch release. This version is checked when the plugin 55 | # is loaded so Elasticsearch will refuse to start in the presence of 56 | # plugins with the incorrect elasticsearch.version. 
57 | elasticsearch.version=${elasticsearch.version} -------------------------------------------------------------------------------- /src/test/java/org/elasticseach/index/analysis/ThaiChub2TokenizerTest.java: -------------------------------------------------------------------------------- 1 | package org.elasticseach.index.analysis; 2 | 3 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 4 | import org.elasticsearch.index.analysis.ThaiChub2Tokenizer; 5 | import org.junit.Assert; 6 | import org.testng.annotations.Test; 7 | 8 | import java.io.IOException; 9 | import java.io.StringReader; 10 | import java.util.ArrayList; 11 | import java.util.Arrays; 12 | import java.util.List; 13 | 14 | public class ThaiChub2TokenizerTest { 15 | 16 | @Test 17 | public void TestTokenizer() throws IOException { 18 | String[] input = {"คลิปดำ","คลิปขาว"}; 19 | 20 | List expected = Arrays.asList("คลิป,ดำ", "คลิป,ขาว"); 21 | 22 | List results = new ArrayList<>(); 23 | 24 | for (String value : input) { 25 | List list = new ArrayList<>(); 26 | StringReader sr = new StringReader(value); 27 | 28 | ThaiChub2Tokenizer tokenizer = new ThaiChub2Tokenizer(); 29 | tokenizer.setReader(sr); 30 | tokenizer.reset(); 31 | 32 | boolean hasnext = tokenizer.incrementToken(); 33 | while (hasnext) { 34 | CharTermAttribute ta = tokenizer.getAttribute(CharTermAttribute.class); 35 | list.add(ta.toString()); 36 | System.out.println(ta.toString()); 37 | hasnext = tokenizer.incrementToken(); 38 | } 39 | results.add(String.join(",", list)); 40 | } 41 | Assert.assertEquals(expected, results); 42 | } 43 | } 44 | --------------------------------------------------------------------------------