├── .gitignore
├── LICENSE
├── README.md
├── build.gradle
├── docker
├── Dockerfile
└── docker-compose.yml
├── elasticsearch-jaso-analyzer.iml
├── gradle
└── wrapper
│ ├── gradle-wrapper.jar
│ └── gradle-wrapper.properties
├── gradlew
├── gradlew.bat
└── src
├── main
├── java
│ └── org
│ │ └── elasticsearch
│ │ ├── analysis
│ │ ├── BaseTokenizer.java
│ │ ├── JasoAnalyzer.java
│ │ ├── JasoDecomposer.java
│ │ ├── JasoTokenizer.java
│ │ └── TokenizerOptions.java
│ │ ├── common
│ │ └── config.java
│ │ ├── index
│ │ └── analysis
│ │ │ ├── JasoAnalyzerProvider.java
│ │ │ └── JasoTokenizerFactory.java
│ │ └── plugin
│ │ └── analysis
│ │ └── JasoAnalysisPlugin.java
└── resources
│ ├── log4j2.xml
│ └── plugin-descriptor.properties
└── test
└── java
└── org
└── elasticsearch
└── analysis
├── JasoTest.java
├── JasoTokenizerTest.java
└── TestCaseVO.java
/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | .idea/
3 | .git/
4 | .gradle/
5 | .DS_Store
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Choi ilkyu
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Korean Jaso Analyzer for Elasticsearch 8.6.2
2 | (자동완성 플러그인)
3 |
4 | ## Build & Packaging
5 |
6 | ###### 터미널 환경에서 자바 버전은 17로 변경해야합니다.
7 | ~~~shell
8 | $ sh gradlew clean build buildPluginZip
9 | ~~~
10 |
11 | ###### 자동완성용 한글 자소분석기입니다. elasticsearch 8.6.2 에서 테스트 되었습니다
12 |
13 | ## 도커 컨테이너에서 elasticsearch, kibana 설치/실행
14 | ```
15 | #플러그인이 자동으로 설치된다.
16 | cd docker
17 | docker-compose up -d
18 | ```
19 |
20 | ## 직접설치
21 |
22 | ###### *설치*
23 | ```
24 | bin/elasticsearch-plugin install https://github.com/netcrazy/elasticsearch-jaso-analyzer/releases/download/v8.6.2/jaso-analyzer-plugin-8.6.2-plugin.zip
25 | ```
26 |
27 | ###### *삭제 (필요시)*
28 | ```
29 | bin/elasticsearch-plugin remove jaso-analyzer
30 | ```
31 |
32 | ###### *인덱스 삭제 (필요시)*
33 | ```
34 | curl -XDELETE 'http://localhost:9200/jaso'
35 | ```
36 |
37 | ###### *Korean Jaso Analyzer 설정 및 인덱스 생성 (기본 자소검색용)*
38 | ```
39 | curl -XPUT -H 'Content-Type: application/json' localhost:9200/jaso -d '{
40 | "settings": {
41 | "index": {
42 | "analysis": {
43 | "filter": {
44 | "suggest_filter": {
45 | "type": "edge_ngram",
46 | "min_gram": 1,
47 | "max_gram": 50
48 | }
49 | },
50 | "analyzer": {
51 | "suggest_search_analyzer": {
52 | "type": "custom",
53 | "tokenizer": "jaso_tokenizer"
54 | },
55 | "suggest_index_analyzer": {
56 | "type": "custom",
57 | "tokenizer": "jaso_tokenizer",
58 | "filter": [
59 | "suggest_filter"
60 | ]
61 | }
62 | }
63 | }
64 | }
65 | }
66 | }'
67 | ```
68 |
69 | ###### *Korean Jaso Analyzer 설정 및 인덱스 생성 (한,영오타 및 초성토큰 추출이 필요할 때..)*
70 | ```
71 | curl -XPUT -H 'Content-Type: application/json' http://localhost:9200/jaso/ -d '{
72 | "settings": {
73 | "index": {
74 | "analysis": {
75 | "filter": {
76 | "suggest_filter": {
77 | "type": "edge_ngram",
78 | "min_gram": 1,
79 | "max_gram": 50
80 | }
81 | },
82 | "tokenizer": {
83 | "jaso_search_tokenizer": {
84 | "type": "jaso_tokenizer",
85 | "mistype": true,
86 | "chosung": false
87 | },
88 | "jaso_index_tokenizer": {
89 | "type": "jaso_tokenizer",
90 | "mistype": true,
91 | "chosung": true
92 | }
93 | },
94 | "analyzer": {
95 | "suggest_search_analyzer": {
96 | "type": "custom",
97 | "tokenizer": "jaso_search_tokenizer"
98 | },
99 | "suggest_index_analyzer": {
100 | "type": "custom",
101 | "tokenizer": "jaso_index_tokenizer",
102 | "filter": [
103 | "suggest_filter"
104 | ]
105 | }
106 | }
107 | }
108 | }
109 | }
110 | }'
111 | ```
112 |
113 | ###### *인덱스 맵핑*
114 | ```
115 | curl -XPUT -H 'Content-Type: application/json' http://localhost:9200/jaso/_mapping -d '{
116 | "properties": {
117 | "name": {
118 | "type": "text",
119 | "store": true,
120 | "analyzer": "suggest_index_analyzer",
121 | "search_analyzer": "suggest_search_analyzer"
122 | }
123 | }
124 | }'
125 | ```
126 |
127 |
128 | ###### *인덱스타임 분석기 테스트*
129 | ```
130 | curl -XPOST -H 'Content-Type: application/json' http://localhost:9200/jaso/_analyze?pretty=true -d '{
131 | "analyzer" : "suggest_index_analyzer",
132 | "text" : "최일규 Hello"
133 | }'
134 | ```
135 |
136 |
137 | ###### *쿼리타임 분석기 테스트*
138 | ```
139 | curl -XPOST -H 'Content-Type: application/json' http://localhost:9200/jaso/_analyze?pretty=true -d '{
140 | "analyzer" : "suggest_search_analyzer",
141 | "text" : "쵱"
142 | }'
143 | ```
144 |
145 |
146 | ###### *문서생성*
147 | ```
148 | curl -XPOST -H 'Content-Type: application/json' http://localhost:9200/jaso/_doc?pretty=true -d '{
149 | "name":"최일규 Hello"
150 | }'
151 |
152 | curl -XPOST -H 'Content-Type: application/json' http://localhost:9200/jaso/_doc?pretty=true -d '{
153 | "name":"초아"
154 | }'
155 | ```
156 |
157 | ###### *문서검색*
158 | ```
159 | curl -XPOST -H 'Content-Type: application/json' http://localhost:9200/jaso/_search?pretty=true -d '{
160 | "query" : {
161 | "match" : { "name" : "초" }
162 | }
163 | }'
164 |
165 | curl -XPOST -H 'Content-Type: application/json' http://localhost:9200/jaso/_search?pretty=true -d '{
166 | "query" : {
167 | "match" : { "name" : "ㅊㅇㄱ" }
168 | }
169 | }'
170 | ```
171 |
--------------------------------------------------------------------------------
/build.gradle:
--------------------------------------------------------------------------------
1 | apply plugin: 'java'
2 |
3 | compileJava {
4 | sourceCompatibility = JavaVersion.VERSION_17
5 | targetCompatibility = JavaVersion.VERSION_17
6 | }
7 |
8 | version = '8.6.2'
9 | jar {
10 | manifest {
11 | attributes 'Implementation-Title': 'Elasticsearch Jaso Analyzer Plugin',
12 | 'Implementation-Version': version
13 | }
14 | }
15 |
16 | repositories {
17 | mavenCentral()
18 | }
19 |
20 | dependencies {
21 | implementation group: 'org.elasticsearch', name: 'elasticsearch', version: version
22 | implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.1'
23 | implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.1'
24 | implementation group: 'org.slf4j', name: 'slf4j-api', version: '1.7.32'
25 | testImplementation group: 'junit', name: 'junit', version: '4.+'
26 | }
27 |
28 | test {
29 | systemProperties 'property': 'value'
30 | }
31 |
// Packages the plugin jar and the plugin descriptor/resources into
// build/distributions/jaso-analyzer-plugin-<version>-plugin.zip.
task buildPluginZip(type: Zip, dependsOn: [':jar']) {
    // archiveBaseName/archiveClassifier replace the deprecated baseName/classifier
    // properties; the resulting file name is unchanged.
    archiveBaseName = 'jaso-analyzer-plugin'
    archiveClassifier = 'plugin'
    from 'build/libs'
    from 'src/main/resources'

    // Copy the distribution into docker/ only AFTER the zip has been written.
    // A bare copy { } here runs while the task is being configured, i.e. before
    // the archive exists, so the Docker build context would receive nothing
    // (or a stale zip from an earlier build).
    doLast {
        copy {
            from "build/distributions"
            into "docker"
        }
    }
}
43 |
44 | artifacts {
45 | archives buildPluginZip
46 | }
47 |
48 | [ compileJava, compileTestJava ]*.options*.encoding = 'UTF-8'
49 | [ compileJava, compileTestJava ]*.options*.compilerArgs = ['-Xlint:-options']
--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM docker.elastic.co/elasticsearch/elasticsearch:8.6.2
2 |
3 | COPY jaso-analyzer-plugin-8.6.2-plugin.zip /tmp/
4 | RUN /usr/share/elasticsearch/bin/elasticsearch-plugin install file:///tmp/jaso-analyzer-plugin-8.6.2-plugin.zip
--------------------------------------------------------------------------------
/docker/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '2.2'
2 | services:
3 | elasticsearch:
4 | build:
5 | context: .
6 | dockerfile: ./Dockerfile
7 | container_name: es-node-01
8 | environment:
9 | - cluster.name=es-docker-cluster
10 | - node.name=es01
11 | - xpack.security.enabled=false
12 | - discovery.type=single-node
13 | ulimits:
14 | memlock:
15 | soft: -1
16 | hard: -1
17 | nofile:
18 | soft: 262144
19 | hard: 262144
20 | cap_add:
21 | - IPC_LOCK
22 | volumes:
23 | - data01:/usr/share/elasticsearch/data
24 | ports:
25 | - "9200:9200"
26 | - "9300:9300"
27 | networks:
28 | - es-net
29 |
30 | kibana:
31 | container_name: kibana
32 | image: docker.elastic.co/kibana/kibana:8.6.2
33 | environment:
34 | ELASTICSEARCH_URL: http://es-node-01:9200
35 | ELASTICSEARCH_HOSTS: http://es-node-01:9200
36 | ports:
37 | - "5601:5601"
38 | depends_on:
39 | - elasticsearch
40 | networks:
41 | - es-net
42 |
43 | volumes:
44 | data01:
45 | driver: local
46 | data02:
47 | driver: local
48 |
49 | networks:
50 | es-net:
51 | driver: bridge
--------------------------------------------------------------------------------
/elasticsearch-jaso-analyzer.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/gradle/wrapper/gradle-wrapper.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netcrazy/elasticsearch-jaso-analyzer/d7204d0f698040bd53c9816e6daee9d09a38f127/gradle/wrapper/gradle-wrapper.jar
--------------------------------------------------------------------------------
/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
1 | distributionBase=GRADLE_USER_HOME
2 | distributionPath=wrapper/dists
3 | distributionUrl=https\://services.gradle.org/distributions/gradle-7.4.2-bin.zip
4 | zipStoreBase=GRADLE_USER_HOME
5 | zipStorePath=wrapper/dists
6 |
--------------------------------------------------------------------------------
/gradlew:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env sh
2 |
3 | #
4 | # Copyright 2015 the original author or authors.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # https://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | ##############################################################################
20 | ##
21 | ## Gradle start up script for UN*X
22 | ##
23 | ##############################################################################
24 |
25 | # Attempt to set APP_HOME
26 | # Resolve links: $0 may be a link
27 | PRG="$0"
28 | # Need this for relative symlinks.
29 | while [ -h "$PRG" ] ; do
30 | ls=`ls -ld "$PRG"`
31 | link=`expr "$ls" : '.*-> \(.*\)$'`
32 | if expr "$link" : '/.*' > /dev/null; then
33 | PRG="$link"
34 | else
35 | PRG=`dirname "$PRG"`"/$link"
36 | fi
37 | done
38 | SAVED="`pwd`"
39 | cd "`dirname \"$PRG\"`/" >/dev/null
40 | APP_HOME="`pwd -P`"
41 | cd "$SAVED" >/dev/null
42 |
43 | APP_NAME="Gradle"
44 | APP_BASE_NAME=`basename "$0"`
45 |
46 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
47 | DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
48 |
49 | # Use the maximum available, or set MAX_FD != -1 to use that value.
50 | MAX_FD="maximum"
51 |
52 | warn () {
53 | echo "$*"
54 | }
55 |
56 | die () {
57 | echo
58 | echo "$*"
59 | echo
60 | exit 1
61 | }
62 |
63 | # OS specific support (must be 'true' or 'false').
64 | cygwin=false
65 | msys=false
66 | darwin=false
67 | nonstop=false
68 | case "`uname`" in
69 | CYGWIN* )
70 | cygwin=true
71 | ;;
72 | Darwin* )
73 | darwin=true
74 | ;;
75 | MINGW* )
76 | msys=true
77 | ;;
78 | NONSTOP* )
79 | nonstop=true
80 | ;;
81 | esac
82 |
83 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
84 |
85 |
86 | # Determine the Java command to use to start the JVM.
87 | if [ -n "$JAVA_HOME" ] ; then
88 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
89 | # IBM's JDK on AIX uses strange locations for the executables
90 | JAVACMD="$JAVA_HOME/jre/sh/java"
91 | else
92 | JAVACMD="$JAVA_HOME/bin/java"
93 | fi
94 | if [ ! -x "$JAVACMD" ] ; then
95 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
96 |
97 | Please set the JAVA_HOME variable in your environment to match the
98 | location of your Java installation."
99 | fi
100 | else
101 | JAVACMD="java"
102 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
103 |
104 | Please set the JAVA_HOME variable in your environment to match the
105 | location of your Java installation."
106 | fi
107 |
108 | # Increase the maximum file descriptors if we can.
109 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
110 | MAX_FD_LIMIT=`ulimit -H -n`
111 | if [ $? -eq 0 ] ; then
112 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
113 | MAX_FD="$MAX_FD_LIMIT"
114 | fi
115 | ulimit -n $MAX_FD
116 | if [ $? -ne 0 ] ; then
117 | warn "Could not set maximum file descriptor limit: $MAX_FD"
118 | fi
119 | else
120 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
121 | fi
122 | fi
123 |
124 | # For Darwin, add options to specify how the application appears in the dock
125 | if $darwin; then
126 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
127 | fi
128 |
129 | # For Cygwin or MSYS, switch paths to Windows format before running java
130 | if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
131 | APP_HOME=`cygpath --path --mixed "$APP_HOME"`
132 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
133 |
134 | JAVACMD=`cygpath --unix "$JAVACMD"`
135 |
136 | # We build the pattern for arguments to be converted via cygpath
137 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
138 | SEP=""
139 | for dir in $ROOTDIRSRAW ; do
140 | ROOTDIRS="$ROOTDIRS$SEP$dir"
141 | SEP="|"
142 | done
143 | OURCYGPATTERN="(^($ROOTDIRS))"
144 | # Add a user-defined pattern to the cygpath arguments
145 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then
146 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
147 | fi
148 | # Now convert the arguments - kludge to limit ourselves to /bin/sh
149 | i=0
150 | for arg in "$@" ; do
151 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
152 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
153 |
154 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
155 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
156 | else
157 | eval `echo args$i`="\"$arg\""
158 | fi
159 | i=`expr $i + 1`
160 | done
161 | case $i in
162 | 0) set -- ;;
163 | 1) set -- "$args0" ;;
164 | 2) set -- "$args0" "$args1" ;;
165 | 3) set -- "$args0" "$args1" "$args2" ;;
166 | 4) set -- "$args0" "$args1" "$args2" "$args3" ;;
167 | 5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
168 | 6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
169 | 7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
170 | 8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
171 | 9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
172 | esac
173 | fi
174 |
175 | # Escape application args
176 | save () {
177 | for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
178 | echo " "
179 | }
180 | APP_ARGS=`save "$@"`
181 |
182 | # Collect all arguments for the java command, following the shell quoting and substitution rules
183 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
184 |
185 | exec "$JAVACMD" "$@"
186 |
--------------------------------------------------------------------------------
/gradlew.bat:
--------------------------------------------------------------------------------
1 | @rem
2 | @rem Copyright 2015 the original author or authors.
3 | @rem
4 | @rem Licensed under the Apache License, Version 2.0 (the "License");
5 | @rem you may not use this file except in compliance with the License.
6 | @rem You may obtain a copy of the License at
7 | @rem
8 | @rem https://www.apache.org/licenses/LICENSE-2.0
9 | @rem
10 | @rem Unless required by applicable law or agreed to in writing, software
11 | @rem distributed under the License is distributed on an "AS IS" BASIS,
12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | @rem See the License for the specific language governing permissions and
14 | @rem limitations under the License.
15 | @rem
16 |
17 | @if "%DEBUG%" == "" @echo off
18 | @rem ##########################################################################
19 | @rem
20 | @rem Gradle startup script for Windows
21 | @rem
22 | @rem ##########################################################################
23 |
24 | @rem Set local scope for the variables with windows NT shell
25 | if "%OS%"=="Windows_NT" setlocal
26 |
27 | set DIRNAME=%~dp0
28 | if "%DIRNAME%" == "" set DIRNAME=.
29 | set APP_BASE_NAME=%~n0
30 | set APP_HOME=%DIRNAME%
31 |
32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter.
33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
34 |
35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
37 |
38 | @rem Find java.exe
39 | if defined JAVA_HOME goto findJavaFromJavaHome
40 |
41 | set JAVA_EXE=java.exe
42 | %JAVA_EXE% -version >NUL 2>&1
43 | if "%ERRORLEVEL%" == "0" goto execute
44 |
45 | echo.
46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
47 | echo.
48 | echo Please set the JAVA_HOME variable in your environment to match the
49 | echo location of your Java installation.
50 |
51 | goto fail
52 |
53 | :findJavaFromJavaHome
54 | set JAVA_HOME=%JAVA_HOME:"=%
55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe
56 |
57 | if exist "%JAVA_EXE%" goto execute
58 |
59 | echo.
60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
61 | echo.
62 | echo Please set the JAVA_HOME variable in your environment to match the
63 | echo location of your Java installation.
64 |
65 | goto fail
66 |
67 | :execute
68 | @rem Setup the command line
69 |
70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
71 |
72 |
73 | @rem Execute Gradle
74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*
75 |
76 | :end
77 | @rem End local scope for the variables with windows NT shell
78 | if "%ERRORLEVEL%"=="0" goto mainEnd
79 |
80 | :fail
81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
82 | rem the _cmd.exe /c_ return code!
83 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
84 | exit /b 1
85 |
86 | :mainEnd
87 | if "%OS%"=="Windows_NT" endlocal
88 |
89 | :omega
90 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/analysis/BaseTokenizer.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.analysis;
2 |
3 | import org.apache.lucene.analysis.Tokenizer;
4 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
5 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
6 | import org.apache.lucene.analysis.CharacterUtils;
7 | import org.apache.lucene.analysis.CharacterUtils.CharacterBuffer;
8 |
9 | import java.io.*;
10 |
11 | /**
12 | * Base 자소 토크나이저 구현
13 | *
14 | * @author 최일규
15 | * @since 2016-02-10
16 | */
17 | public abstract class BaseTokenizer extends Tokenizer {
18 |
19 | private final TokenizerOptions options;
20 |
21 | private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
22 | private static final int MAX_WORD_LEN = 2048;
23 | private static final int IO_BUFFER_SIZE = 4096;
24 |
25 | private final CharTermAttribute termAtt;
26 | private final OffsetAttribute offsetAtt;
27 |
28 | private final CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE);
29 |
30 | protected BaseTokenizer(TokenizerOptions options) {
31 | this.options = options;
32 |
33 | termAtt = addAttribute(CharTermAttribute.class);
34 | offsetAtt = addAttribute(OffsetAttribute.class);
35 |
36 | offset = 0;
37 | bufferIndex = 0;
38 | dataLen = 0;
39 | finalOffset = 0;
40 | }
41 |
42 | protected boolean isTokenChar(int c) {
43 | throw new UnsupportedOperationException("Subclasses of CharTokenizer must implement isTokenChar(int)");
44 | }
45 |
46 | protected int normalize(int c) {
47 | return c;
48 | }
49 |
50 | /**
51 | * lucene 4.2x의 경우 데이터가 있으면 자소분리 후 true가 떨어지나, 여기서는 false로 떨어져 ioBuffer사이즈 상태로 조건변경 (CharacterUtils.fill)
52 | *
53 | * @author 최일규
54 | * @since 2014-07-11
55 | */
56 | @Override
57 | public final boolean incrementToken() throws IOException {
58 | clearAttributes();
59 |
60 | int length = 0;
61 | int start = -1; // this variable is always initialized
62 | char[] buffer = termAtt.buffer();
63 | while (true) {
64 | if (bufferIndex >= dataLen) {
65 |
66 | offset += dataLen;
67 | CharacterUtils.fill(ioBuffer, jasoDecompose(input, this.options));
68 |
69 | //버퍼사이즈가 있으면 분석한다. (return false일때까지... 재귀호출)
70 | if (ioBuffer.getLength() == 0) {
71 | dataLen = 0; // so next offset += dataLen won't decrement offset
72 | if (length > 0) {
73 | break;
74 | } else {
75 | finalOffset = correctOffset(offset);
76 | return false;
77 | }
78 | }
79 | dataLen = ioBuffer.getLength();
80 | bufferIndex = 0;
81 | }
82 | // use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone
83 | final int c = Character.codePointAt(ioBuffer.getBuffer(), bufferIndex, dataLen);
84 | bufferIndex += Character.charCount(c);
85 |
86 | // if it's a token char
87 | if (isTokenChar(c)) {
88 |
89 | // start of token
90 | if (length == 0) {
91 | assert start == -1;
92 | start = offset + bufferIndex - 1;
93 |
94 | // check if a supplementary could run out of bounds
95 | } else if (length >= buffer.length - 1) {
96 |
97 | // make sure a supplementary fits in the buffer
98 | buffer = termAtt.resizeBuffer(2 + length);
99 | }
100 |
101 | // buffer it, normalized
102 | length += Character.toChars(normalize(c), buffer, length);
103 | if (length >= MAX_WORD_LEN) {
104 | break;
105 | }
106 | } else if (length > 0) {
107 | // return 'em
108 | break;
109 | }
110 | }
111 |
112 | termAtt.setLength(length);
113 | assert start != -1;
114 | offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(start + length));
115 | return true;
116 | }
117 |
118 | @Override
119 | public final void end() {
120 | // set final offset
121 | offsetAtt.setOffset(finalOffset, finalOffset);
122 | }
123 |
124 | /**
125 | * Reader -> String -> 자소변환 -> String -> Reader
126 | */
127 | public static Reader jasoDecompose(Reader in, TokenizerOptions options) {
128 | Writer writer = new StringWriter();
129 | JasoDecomposer decomposer = new JasoDecomposer();
130 | char[] buffer = new char[2048];
131 | String temp;
132 |
133 | try {
134 | int n;
135 | while ((n = in.read(buffer)) != -1) {
136 | writer.write(buffer, 0, n);
137 | }
138 | temp = writer.toString();
139 | temp = decomposer.runJasoDecompose(temp, options);
140 | // System.out.println(temp);
141 | in = new StringReader(temp);
142 | } catch (Exception e) {
143 | StringWriter errors = new StringWriter();
144 | e.printStackTrace(new PrintWriter(errors));
145 | }
146 | return in;
147 | }
148 |
149 | @Override
150 | public void reset() throws IOException {
151 | super.reset();
152 | bufferIndex = 0;
153 | offset = 0;
154 | dataLen = 0;
155 | finalOffset = 0;
156 | ioBuffer.reset(); // make sure to reset the IO buffer!!
157 | }
158 | }
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/analysis/JasoAnalyzer.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.analysis;
2 |
3 | import org.apache.lucene.analysis.Analyzer;
4 |
5 | /**
6 | * JasoAnalyzer
7 | *
8 | * @author 최일규
9 | * @since 2018-03-21
10 | */
11 | public class JasoAnalyzer extends Analyzer {
12 | public JasoAnalyzer() {
13 | }
14 |
15 | @Override
16 | protected Analyzer.TokenStreamComponents createComponents(final String fieldName) {
17 | return new Analyzer.TokenStreamComponents(new JasoTokenizer(TokenizerOptions.create("jaso_analyzer")));
18 | }
19 | }
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/analysis/JasoDecomposer.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.analysis;
2 |
3 | /**
4 | * 자동완성용 자소분해 (자소분해 with WhiteSpace)
5 | *
6 | * @author 최일규
7 | * @since 2016-02-10
8 | */
9 | public class JasoDecomposer {
10 |
11 | //초성(19자) ㄱ ㄲ ㄴ ㄷ ㄸ ㄹ ㅁ ㅂ ㅃ ㅅ ㅆ ㅇ ㅈ ㅉ ㅊ ㅋ ㅌ ㅍ ㅎ
12 | static String[] chosungKor = {"ㄱ", "ㄱㄱ", "ㄴ", "ㄷ", "ㄷㄷ", "ㄹ", "ㅁ", "ㅂ", "ㅂㅂ", "ㅅ", "ㅅㅅ", "ㅇ", "ㅈ", "ㅈㅈ", "ㅊ", "ㅋ", "ㅌ", "ㅍ", "ㅎ"};
13 | //중성(21자) ㅏ ㅐ ㅑ ㅒ ㅓ ㅔ ㅕ ㅖ ㅗ ㅘ(9) ㅙ(10) ㅚ(11) ㅛ ㅜ ㅝ(14) ㅞ(15) ㅟ(16) ㅠ ㅡ ㅢ(19) ㅣ
14 | static String[] jungsungKor = {"ㅏ", "ㅐ", "ㅑ", "ㅒ", "ㅓ", "ㅔ", "ㅕ", "ㅖ", "ㅗ", "ㅗㅏ", "ㅗㅐ", "ㅗㅣ", "ㅛ", "ㅜ", "ㅜㅓ", "ㅜㅔ", "ㅜㅣ", "ㅠ", "ㅡ", "ㅡㅣ", "ㅣ"};
15 | //종성(28자) <없음> ㄱ ㄲ ㄳ(3) ㄴ ㄵ(5) ㄶ(6) ㄷ ㄹ ㄺ(9) ㄻ(10) ㄼ(11) ㄽ(12) ㄾ(13) ㄿ(14) ㅀ(15) ㅁ ㅂ ㅄ(18) ㅅ ㅆ ㅇ ㅈ ㅊ ㅋ ㅌ ㅍ ㅎ
16 | static String[] jongsungKor = {" ", "ㄱ", "ㄱㄱ", "ㄱㅅ", "ㄴ", "ㄴㅈ", "ㄴㅎ", "ㄷ", "ㄹ", "ㄹㄱ", "ㄹㅁ", "ㄹㅂ", "ㄹㅅ", "ㄹㅌ", "ㄹㅍ", "ㄹㅎ", "ㅁ", "ㅂ", "ㅂㅅ", "ㅅ", "ㅅㅅ", "ㅇ", "ㅈ", "ㅊ", "ㅋ", "ㅌ", "ㅍ", "ㅎ"};
17 |
18 | static String[] chosungEng = {"r", "R", "s", "e", "E", "f", "a", "q", "Q", "t", "T", "d", "w", "W", "c", "z", "x", "v", "g"};
19 | static String[] jungsungEng = {"k", "o", "i", "O", "j", "p", "u", "P", "h", "hk", "ho", "hl", "y", "n", "nj", "np", "nl", "b", "m", "ml", "l"};
20 | static String[] jongsungEng = {"", "r", "R", "rt", "s", "sw", "sg", "e", "f", "fr", "fa", "fq", "ft", "fx", "fv", "fg", "a", "q", "qt", "t", "T", "d", "w", "c", "z", "x", "v", "g"};
21 |
22 | static String[] mistyping = {"ㅁ", "ㅠ", "ㅊ", "ㅇ", "ㄷ", "ㄹ", "ㅎ", "ㅗ", "ㅑ", "ㅓ", "ㅏ", "ㅣ", "ㅡ", "ㅜ", "ㅐ", "ㅔ", "ㅂ", "ㄱ", "ㄴ", "ㅅ", "ㅕ", "ㅍ", "ㅈ", "ㅌ", "ㅛ", "ㅋ"};
23 |
24 | public String runJasoDecompose(String originStr, TokenizerOptions options) {
25 |
26 | if (!originStr.isEmpty()) {
27 |
28 | //lowercase 처리
29 | originStr = originStr.toLowerCase();
30 |
31 | char[] termBuffer = originStr.toCharArray();
32 | StringBuilder korBuffer = new StringBuilder();
33 | StringBuilder engBuffer = new StringBuilder();
34 | StringBuilder chosungBuffer = new StringBuilder();
35 | StringBuilder mistypingBuffer = new StringBuilder();
36 | StringBuilder etcBuffer = new StringBuilder();
37 | StringBuilder returnBuffer = new StringBuilder();
38 |
39 | //첫글자가 한글일때만 초성분해
40 | boolean firstCharType = false;
41 | if (termBuffer.length > 0)
42 | firstCharType = isHangul(Character.toString(termBuffer[0]));
43 |
44 | //자소포함여부
45 | boolean jaso = isJaso(originStr);
46 | //한글포함여부
47 | boolean hangul = isHangul(originStr);
48 | //영문포함여부
49 | boolean english = isEnglish(originStr);
50 |
51 | int strLen = originStr.length();
52 |
53 | int cho;
54 | int jung;
55 | int jong;
56 | for (char ch : termBuffer) {
57 | //가(AC00)~힣(D7A3) 에 속한 글자면 분해
58 | if (ch >= 0xAC00 && ch <= 0xD7A3 && !jaso) {
59 | //Unicode 값으로 환산한다.
60 | int uniValue = ch - 0xAC00;
61 |
62 | jong = uniValue % 28; //종성
63 | cho = ((uniValue - jong) / 28) / 21; //초성
64 | jung = ((uniValue - jong) / 28) % 21; //중성
65 |
66 | //한글초성
67 | korBuffer.append(chosungKor[cho]);
68 |
69 | //한글에 대한 초성처리 (일반적으로 색인시 초성을 담는다.)
70 | if (options.isChosung() && firstCharType) {
71 | //초성은 2자이상일때 포함
72 | if (strLen >= 2)
73 | chosungBuffer.append(chosungKor[cho]);
74 | }
75 |
76 | //한글문장에 대한 영문오타처리 (ㄱ -> r)
77 | if (options.isMistype()) {
78 | engBuffer.append(chosungEng[cho].toLowerCase());
79 | }
80 |
81 | //한글중성
82 | korBuffer.append(jungsungKor[jung]);
83 |
84 | //한글문장에 대한 영문오타처리 (ㅏ-> k)
85 | if (options.isMistype()) {
86 | engBuffer.append(jungsungEng[jung].toLowerCase());
87 | }
88 |
89 | //받침이 있으면
90 | if (jong != 0) {
91 | korBuffer.append(jongsungKor[jong]);
92 |
93 | //한글문장에 대한 영문오타처리 (ㄲ -> R)
94 | if (options.isMistype()) {
95 | engBuffer.append(jongsungEng[jong].toLowerCase());
96 | }
97 | }
98 | } else {
99 |
100 | if (options.isMistype()) {
101 | if (!jaso) {
102 | if (hangul) {
103 | korBuffer.append(ch);
104 | }
105 | engBuffer.append(ch);
106 | }
107 | } else {
108 | if (!jaso) {
109 | if (hangul) {
110 | korBuffer.append(ch);
111 | } else {
112 | engBuffer.append(ch);
113 | }
114 | }
115 | }
116 |
117 | //영문문장에 대한 한글오타처리 (hello -> ㅗ디ㅣㅐ)
118 | if (options.isMistype() && !hangul) {
119 | int index;
120 | if (ch >= 0x61 && ch <= 0x7A) {
121 | //소문자
122 | index = (int) ch - 97;
123 | mistypingBuffer.append(mistyping[index]);
124 | } else if (ch >= 0x41 && ch <= 0x5A) {
125 | //대문자
126 | index = (int) ch - 65;
127 | mistypingBuffer.append(mistyping[index]);
128 | } else {
129 | if (english)
130 | mistypingBuffer.append(ch);
131 | }
132 | }
133 | }
134 |
135 | //추가적인 예외상황으로 추가 토큰처리 (ㅗ디ㅣㅐ -> ㅗㄷㅣㅣㅐ 자소분해)
136 | if (jaso) {
137 |
138 | if (ch >= 0xAC00 && ch <= 0xD7A3) {
139 | //Unicode 값으로 환산한다.
140 | int uniValue = ch - 0xAC00;
141 |
142 | jong = uniValue % 28; //종성
143 | cho = ((uniValue - jong) / 28) / 21; //초성
144 | jung = ((uniValue - jong) / 28) % 21; //중성
145 |
146 | etcBuffer.append(chosungKor[cho]);
147 | etcBuffer.append(jungsungKor[jung]);
148 | //받침이 있으면
149 | if (jong != 0) {
150 | etcBuffer.append(jongsungKor[jong]);
151 | }
152 | } else if (isJaso(Character.toString(ch))) {
153 | //복자음 강제분리
154 | switch (ch) {
155 | case 'ㄲ':
156 | etcBuffer.append("ㄱㄱ");
157 | break;
158 | case 'ㄳ':
159 | etcBuffer.append("ㄱㅅ");
160 | break;
161 | case 'ㄵ':
162 | etcBuffer.append("ㄴㅈ");
163 | break;
164 | case 'ㄶ':
165 | etcBuffer.append("ㄴㅎ");
166 | break;
167 | case 'ㄺ':
168 | etcBuffer.append("ㄹㄱ");
169 | break;
170 | case 'ㄻ':
171 | etcBuffer.append("ㄹㅁ");
172 | break;
173 | case 'ㄼ':
174 | etcBuffer.append("ㄹㅂ");
175 | break;
176 | case 'ㄽ':
177 | etcBuffer.append("ㄹㅅ");
178 | break;
179 | case 'ㄾ':
180 | etcBuffer.append("ㄹㅌ");
181 | break;
182 | case 'ㄿ':
183 | etcBuffer.append("ㄹㅍ");
184 | break;
185 | case 'ㅀ':
186 | etcBuffer.append("ㄹㅎ");
187 | break;
188 | case 'ㅄ':
189 | etcBuffer.append("ㅂㅅ");
190 | break;
191 | case 'ㄸ':
192 | etcBuffer.append("ㄷㄷ");
193 | break;
194 | case 'ㅃ':
195 | etcBuffer.append("ㅂㅂ");
196 | break;
197 | case 'ㅆ':
198 | etcBuffer.append("ㅅㅅ");
199 | break;
200 | case 'ㅉ':
201 | etcBuffer.append("ㅈㅈ");
202 | break;
203 | default:
204 | etcBuffer.append(ch);
205 | }
206 | } else {
207 | etcBuffer.append(ch);
208 | }
209 | }
210 | }
211 |
212 | //결과 조합
213 |
214 | //공백을 붙인 전체 문자열 (한글)
215 | if (korBuffer.indexOf(" ") != -1) {
216 | if (korBuffer.length() > 0) {
217 | returnBuffer.append(korBuffer.toString().replaceAll(" ", ""));
218 | returnBuffer.append(" ");
219 | }
220 | }
221 |
222 | //공백으로 분리된 문자열 (한글)
223 | if (korBuffer.length() > 0) {
224 | returnBuffer.append(korBuffer.toString());
225 | returnBuffer.append(" ");
226 | }
227 |
228 | //공백을 붙인 전체 문자열 (영문)
229 | if (engBuffer.indexOf(" ") != -1) {
230 | if (engBuffer.length() > 0) {
231 | returnBuffer.append(engBuffer.toString().replaceAll(" ", ""));
232 | returnBuffer.append(" ");
233 | }
234 | }
235 |
236 | //공백으로 분리된 문자열 (영문)
237 | if (engBuffer.length() > 0) {
238 | returnBuffer.append(engBuffer.toString());
239 | returnBuffer.append(" ");
240 | }
241 |
242 | //공백을 붙인 전체 문자열 (오타)
243 | if (mistypingBuffer.indexOf(" ") != -1) {
244 | if (mistypingBuffer.length() > 0) {
245 | returnBuffer.append(mistypingBuffer.toString().replaceAll(" ", ""));
246 | returnBuffer.append(" ");
247 | }
248 | }
249 |
250 | //공백으로 분리된 문자열 (오타)
251 | if (mistypingBuffer.length() > 0) {
252 | returnBuffer.append(mistypingBuffer);
253 | returnBuffer.append(" ");
254 | }
255 |
256 | if (chosungBuffer.length() > 0) {
257 | returnBuffer.append(chosungBuffer);
258 | returnBuffer.append(" ");
259 | }
260 |
261 | if (etcBuffer.length() > 0) {
262 | returnBuffer.append(etcBuffer);
263 | returnBuffer.append(" ");
264 | }
265 |
266 | return returnBuffer.toString().trim();
267 | } else {
268 | return "";
269 | }
270 | }
271 |
272 | /**
273 |  * Reports whether {@code str} contains at least one Hangul character: a compatibility jamo (ㄱ-ㅎ, ㅏ-ㅣ) or a precomposed syllable (가-힣). find() replaces the former whole-string matches() call, whose '.' cannot cross line terminators and therefore rejected any input containing one.
274 |  */
275 | private boolean isHangul(String str) {
276 |     return java.util.regex.Pattern.compile("[ㄱ-ㅎㅏ-ㅣ가-힣]").matcher(str).find();
277 | }
278 |
279 | /**
280 |  * Reports whether {@code str} contains at least one ASCII letter (a-z or A-Z). find() replaces the former whole-string matches() call, whose '.' cannot cross line terminators and therefore rejected any input containing one.
281 |  */
282 | private boolean isEnglish(String str) {
283 |     return java.util.regex.Pattern.compile("[a-zA-Z]").matcher(str).find();
284 | }
285 |
286 | /**
287 |  * Reports whether {@code str} contains at least one standalone compatibility jamo (consonant ㄱ-ㅎ or vowel ㅏ-ㅣ). find() replaces the former whole-string matches() call, whose '.' cannot cross line terminators and therefore rejected any input containing one.
288 |  */
289 | private boolean isJaso(String str) {
290 |     return java.util.regex.Pattern.compile("[ㄱ-ㅎㅏ-ㅣ]").matcher(str).find();
291 | }
292 | }
293 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/analysis/JasoTokenizer.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.analysis;
2 |
3 | import org.elasticsearch.common.config;
4 |
5 | /**
6 |  * Jaso tokenizer: a {@link BaseTokenizer} that cuts its input at the ASCII space character.
7 |  *
8 |  * @author Choi ilkyu (최일규)
9 |  * @since 2018-03-21
10 |  */
11 | public final class JasoTokenizer extends BaseTokenizer {
12 |
13 |     /**
14 |      * Creates a tokenizer configured with the given options.
15 |      *
16 |      * @param options tokenizer options, handed unchanged to {@link BaseTokenizer}
17 |      */
18 |     public JasoTokenizer(TokenizerOptions options) {
19 |         super(options);
20 |     }
21 |
22 |     /**
23 |      * Reports whether {@code c} belongs to a token, i.e. is not the split character.
24 |      * Only {@link config#WHITESPACE_CHAR} splits; tabs, newlines and every other character for
25 |      * which {@link Character#isWhitespace(int)} is true stay inside the token.
26 |      *
27 |      * @param c the character value to classify
28 |      * @return {@code true} unless {@code c} equals {@link config#WHITESPACE_CHAR}
29 |      */
30 |     @Override
31 |     protected boolean isTokenChar(int c) {
32 |         return !isSplit(c);
33 |     }
34 |
35 |     /**
36 |      * Reports whether {@code c} is the split character.
37 |      * The comparison uses the full int value: narrowing to char first would keep only the low
38 |      * 16 bits, so a value such as 0x10020 would be mistaken for a space (0x0020).
39 |      * NOTE(review): whether BaseTokenizer passes values above 0xFFFF is not visible here.
40 |      */
41 |     private boolean isSplit(int c) {
42 |         return c == config.WHITESPACE_CHAR;
43 |     }
44 | }
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/analysis/TokenizerOptions.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.analysis;
2 |
3 | /**
4 |  * Options that control which extra tokens the tokenizer extracts.
5 |  *
6 |  * @author Choi ilkyu (최일규)
7 |  * @since 2016-02-12
8 |  */
9 | public class TokenizerOptions {
10 |
11 |     /** Default for extracting Korean/English mistype tokens (hello -> ㅗㄷㅣㅣㅐ, 최일규 -> chldlfrb). */
12 |     public static final boolean MISTYPE = false;
13 |
14 |     /** Default for extracting initial-consonant (chosung) search tokens (최일규 -> ㅊㅇㄱ). */
15 |     public static final boolean CHOSUNG = false;
16 |
17 |     private final String name;
18 |     private boolean mistype;
19 |     private boolean chosung;
20 |
21 |     /** Starts every instance from the class-level defaults. */
22 |     private TokenizerOptions(String name) {
23 |         this.name = name;
24 |         this.mistype = MISTYPE;
25 |         this.chosung = CHOSUNG;
26 |     }
27 |
28 |     /**
29 |      * Creates an options object with both flags at their defaults.
30 |      *
31 |      * @param name name stored in the options and returned by {@link #getName()}
32 |      * @return a new options instance
33 |      */
34 |     public static TokenizerOptions create(String name) {
35 |         return new TokenizerOptions(name);
36 |     }
37 |
38 |     /** @return the name given to {@link #create(String)} */
39 |     public String getName() {
40 |         return this.name;
41 |     }
42 |
43 |     /** @return whether mistype tokens are extracted */
44 |     public boolean isMistype() {
45 |         return this.mistype;
46 |     }
47 |
48 |     /** @param mistype whether mistype tokens are extracted */
49 |     public void setMistype(boolean mistype) {
50 |         this.mistype = mistype;
51 |     }
52 |
53 |     /** @return whether chosung tokens are extracted */
54 |     public boolean isChosung() {
55 |         return this.chosung;
56 |     }
57 |
58 |     /** @param chosung whether chosung tokens are extracted */
59 |     public void setChosung(boolean chosung) {
60 |         this.chosung = chosung;
61 |     }
62 | }
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/common/config.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.common;
2 |
3 | /**
4 | * Global shared constants.
5 | *
6 | * @author Choi ilkyu (최일규)
7 | * @since 2016-02-03
8 | */
9 | public class config {
10 | public static final char WHITESPACE_CHAR = ' '; // single ASCII space; JasoTokenizer.isSplit compares input characters against it
11 | }
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/JasoAnalyzerProvider.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | import org.elasticsearch.analysis.JasoAnalyzer;
4 | import org.elasticsearch.common.settings.Settings;
5 | import org.elasticsearch.env.Environment;
6 | import org.elasticsearch.index.IndexSettings;
7 |
8 | /**
9 |  * Analyzer provider that hands out one {@link JasoAnalyzer} instance created at construction.
10 |  *
11 |  * @author Choi ilkyu (최일규)
12 |  * @since 2018-03-21
13 |  */
14 | public class JasoAnalyzerProvider extends AbstractIndexAnalyzerProvider<JasoAnalyzer> {
15 |     private final JasoAnalyzer analyzer;
16 |
17 |     /**
18 |      * Creates the provider and its analyzer. The four-parameter shape is what allows
19 |      * {@code JasoAnalyzerProvider::new} to be registered by the plugin's getAnalyzers().
20 |      *
21 |      * @param indexSettings index-level settings; not used by this provider
22 |      * @param environment   node environment; not used by this provider
23 |      * @param name          analyzer name, forwarded to the superclass
24 |      * @param settings      analyzer settings, forwarded to the superclass
25 |      */
26 |     public JasoAnalyzerProvider(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
27 |         super(name, settings);
28 |         analyzer = new JasoAnalyzer();
29 |     }
30 |
31 |     /**
32 |      * @return the analyzer created in the constructor; the same instance on every call
33 |      */
34 |     @Override
35 |     public JasoAnalyzer get() {
36 |         return analyzer;
37 |     }
38 | }
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/JasoTokenizerFactory.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | import org.apache.lucene.analysis.Tokenizer;
4 | import org.elasticsearch.analysis.JasoTokenizer;
5 | import org.elasticsearch.analysis.TokenizerOptions;
6 | import org.elasticsearch.common.settings.Settings;
7 | import org.elasticsearch.index.IndexSettings;
8 | import org.elasticsearch.env.Environment;
9 |
10 | /**
11 | * JasoTokenizerFactory
12 | *
13 | * @author 최일규
14 | * @since 2018-03-21
15 | */
16 | public class JasoTokenizerFactory extends AbstractTokenizerFactory {
17 |
18 | private final TokenizerOptions options;
19 |
20 | public JasoTokenizerFactory(IndexSettings indexSettings,
21 | Environment environment,
22 | String name,
23 | Settings settings) {
24 |
25 | super(indexSettings, settings, name);
26 |
27 | this.options = TokenizerOptions.create(name);
28 | this.options.setMistype(settings.getAsBoolean("mistype", TokenizerOptions.MISTYPE));
29 | this.options.setChosung(settings.getAsBoolean("chosung", TokenizerOptions.CHOSUNG));
30 | }
31 |
32 | @Override
33 | public Tokenizer create() {
34 | return new JasoTokenizer(this.options);
35 | }
36 | }
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/analysis/JasoAnalysisPlugin.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.analysis;
2 |
3 | import org.elasticsearch.index.analysis.JasoTokenizerFactory;
4 | import org.elasticsearch.index.analysis.*;
5 | import org.elasticsearch.plugins.AnalysisPlugin;
6 | import org.elasticsearch.plugins.Plugin;
7 |
8 | import java.util.Map;
9 |
10 | import org.elasticsearch.indices.analysis.AnalysisModule;
11 |
12 | import static java.util.Collections.singletonMap;
13 |
14 | import org.apache.lucene.analysis.Analyzer;
15 |
16 | /**
17 |  * Elasticsearch analysis plugin that registers the Jaso tokenizer and the Jaso analyzer.
18 |  *
19 |  * @author Choi ilkyu (최일규)
20 |  * @since 2018-03-21
21 |  */
22 | public class JasoAnalysisPlugin extends Plugin implements AnalysisPlugin {
23 |
24 |     /**
25 |      * Registers {@link JasoTokenizerFactory} under the name {@code jaso_tokenizer}.
26 |      *
27 |      * @return a single-entry map from the tokenizer name to its factory provider
28 |      */
29 |     @Override
30 |     public Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> getTokenizers() {
31 |         return singletonMap("jaso_tokenizer", JasoTokenizerFactory::new);
32 |     }
33 |
34 |     /**
35 |      * Registers {@link JasoAnalyzerProvider} under the name {@code jaso_analyzer}.
36 |      *
37 |      * @return a single-entry map from the analyzer name to its provider
38 |      */
39 |     @Override
40 |     public Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
41 |         return singletonMap("jaso_analyzer", JasoAnalyzerProvider::new);
42 |     }
43 | }
--------------------------------------------------------------------------------
/src/main/resources/log4j2.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/src/main/resources/plugin-descriptor.properties:
--------------------------------------------------------------------------------
1 | description=Jaso Korean Text Analyzer
2 | version=8.6.2
3 | name=jaso-analyzer
4 | classname=org.elasticsearch.plugin.analysis.JasoAnalysisPlugin
5 | java.version=17
6 | elasticsearch.version=8.6.2
--------------------------------------------------------------------------------
/src/test/java/org/elasticsearch/analysis/JasoTest.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.analysis;
2 |
3 | import junit.framework.TestCase;
4 |
5 | /**
6 |  * Unit test for the autocomplete (jaso decomposition) feature.
7 |  *
8 |  * @author Choi ilkyu (최일규)
9 |  * @since 2016-02-03
10 |  */
11 | public class JasoTest extends TestCase {
12 |
13 |     /** Decomposing the double consonants ㅆㄲ with both options enabled must yield ㅅㅅㄱㄱ. */
14 |     public void testJasoDecomposer() {
15 |         final TokenizerOptions bothEnabled = TokenizerOptions.create("testJasoDecomposer");
16 |         bothEnabled.setMistype(true);
17 |         bothEnabled.setChosung(true);
18 |
19 |         final JasoDecomposer decomposer = new JasoDecomposer();
20 |
21 |         assertEquals("ㅅㅅㄱㄱ", decomposer.runJasoDecompose("ㅆㄲ", bothEnabled));
22 |     }
23 | }
--------------------------------------------------------------------------------
/src/test/java/org/elasticsearch/analysis/JasoTokenizerTest.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.analysis;
2 |
3 | import junit.framework.TestCase;
4 | import org.apache.lucene.analysis.Tokenizer;
5 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
6 |
7 | import java.io.IOException;
8 | import java.io.StringReader;
9 | import java.util.ArrayList;
10 | import java.util.List;
11 |
12 | /**
13 | * 토크나이저 유닛테스트
14 | *
15 | * @author 최일규
16 | * @since 2016-02-11
17 | */
18 | public class JasoTokenizerTest extends TestCase {
19 |
20 | public void testTokenizer() throws IOException {
21 |
22 | long start = System.currentTimeMillis();
23 | TokenizerOptions options = TokenizerOptions.create("testTokenizer");
24 |
25 | //한영오타에 대한 토큰 추출여부 (hello -> ㅗㄷㅣㅣㅐ, 최일규 -> chldlfrb)
26 | options.setMistype(false);
27 |
28 | //초성검색을 위한 토큰 추출여부 (최일규 -> ㅊㅇㄱ)
29 | options.setChosung(false);
30 |
31 | List testCase = new ArrayList();
32 |
33 | if (options.isMistype() && options.isChosung()) {
34 |
35 | testCase.add(new TestCaseVO("최일규", "ㅊㅗㅣㅇㅣㄹㄱㅠ/chldlfrb/ㅊㅇㄱ"));
36 | testCase.add(new TestCaseVO("소녀시대", "ㅅㅗㄴㅕㅅㅣㄷㅐ/thsutleo/ㅅㄴㅅㄷ"));
37 | testCase.add(new TestCaseVO("Hello", "hello/ㅗㄷㅣㅣㅐ"));
38 | testCase.add(new TestCaseVO("Hello~", "hello~/ㅗㄷㅣㅣㅐ~"));
39 | testCase.add(new TestCaseVO("무조건 해피엔딩", "ㅁㅜㅈㅗㄱㅓㄴㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ/ㅁㅜㅈㅗㄱㅓㄴ/ㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ/anwhrjsgovldpseld/anwhrjs/govldpseld/ㅁㅈㄱㅎㅍㅇㄷ"));
40 | testCase.add(new TestCaseVO("아디다스 운동화", "ㅇㅏㄷㅣㄷㅏㅅㅡㅇㅜㄴㄷㅗㅇㅎㅗㅏ/ㅇㅏㄷㅣㄷㅏㅅㅡ/ㅇㅜㄴㄷㅗㅇㅎㅗㅏ/dkelektmdnsehdghk/dkelektm/dnsehdghk/ㅇㄷㄷㅅㅇㄷㅎ"));
41 | testCase.add(new TestCaseVO("투데이특가", "ㅌㅜㄷㅔㅇㅣㅌㅡㄱㄱㅏ/xnepdlxmrrk/ㅌㄷㅇㅌㄱ"));
42 |
43 | } else if (options.isMistype() && !options.isChosung()) {
44 |
45 | testCase.add(new TestCaseVO("최일규", "ㅊㅗㅣㅇㅣㄹㄱㅠ/chldlfrb"));
46 | testCase.add(new TestCaseVO("소녀시대", "ㅅㅗㄴㅕㅅㅣㄷㅐ/thsutleo"));
47 | testCase.add(new TestCaseVO("Hello", "hello/ㅗㄷㅣㅣㅐ"));
48 | testCase.add(new TestCaseVO("Hello~", "hello~/ㅗㄷㅣㅣㅐ~"));
49 | testCase.add(new TestCaseVO("무조건 해피엔딩", "ㅁㅜㅈㅗㄱㅓㄴㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ/ㅁㅜㅈㅗㄱㅓㄴ/ㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ/anwhrjsgovldpseld/anwhrjs/govldpseld"));
50 | testCase.add(new TestCaseVO("아디다스 운동화", "ㅇㅏㄷㅣㄷㅏㅅㅡㅇㅜㄴㄷㅗㅇㅎㅗㅏ/ㅇㅏㄷㅣㄷㅏㅅㅡ/ㅇㅜㄴㄷㅗㅇㅎㅗㅏ/dkelektmdnsehdghk/dkelektm/dnsehdghk"));
51 | testCase.add(new TestCaseVO("투데이특가", "ㅌㅜㄷㅔㅇㅣㅌㅡㄱㄱㅏ/xnepdlxmrrk"));
52 |
53 | } else if (!options.isMistype() && options.isChosung()) {
54 |
55 | testCase.add(new TestCaseVO("최일규", "ㅊㅗㅣㅇㅣㄹㄱㅠ/ㅊㅇㄱ"));
56 | testCase.add(new TestCaseVO("소녀시대", "ㅅㅗㄴㅕㅅㅣㄷㅐ/ㅅㄴㅅㄷ"));
57 | testCase.add(new TestCaseVO("Hello", "hello"));
58 | testCase.add(new TestCaseVO("Hello~", "hello~"));
59 | testCase.add(new TestCaseVO("무조건 해피엔딩", "ㅁㅜㅈㅗㄱㅓㄴㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ/ㅁㅜㅈㅗㄱㅓㄴ/ㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ/ㅁㅈㄱㅎㅍㅇㄷ"));
60 | testCase.add(new TestCaseVO("아디다스 운동화", "ㅇㅏㄷㅣㄷㅏㅅㅡㅇㅜㄴㄷㅗㅇㅎㅗㅏ/ㅇㅏㄷㅣㄷㅏㅅㅡ/ㅇㅜㄴㄷㅗㅇㅎㅗㅏ/ㅇㄷㄷㅅㅇㄷㅎ"));
61 | testCase.add(new TestCaseVO("투데이특가", "ㅌㅜㄷㅔㅇㅣㅌㅡㄱㄱㅏ/ㅌㄷㅇㅌㄱ"));
62 |
63 | } else if (!options.isMistype() && !options.isChosung()) {
64 |
65 | testCase.add(new TestCaseVO("최일규", "ㅊㅗㅣㅇㅣㄹㄱㅠ"));
66 | testCase.add(new TestCaseVO("소녀시대", "ㅅㅗㄴㅕㅅㅣㄷㅐ"));
67 | testCase.add(new TestCaseVO("Hello", "hello"));
68 | testCase.add(new TestCaseVO("Hello~", "hello~"));
69 | testCase.add(new TestCaseVO("무조건 해피엔딩", "ㅁㅜㅈㅗㄱㅓㄴㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ/ㅁㅜㅈㅗㄱㅓㄴ/ㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ"));
70 | testCase.add(new TestCaseVO("아디다스 운동화", "ㅇㅏㄷㅣㄷㅏㅅㅡㅇㅜㄴㄷㅗㅇㅎㅗㅏ/ㅇㅏㄷㅣㄷㅏㅅㅡ/ㅇㅜㄴㄷㅗㅇㅎㅗㅏ"));
71 | testCase.add(new TestCaseVO("투데이특가", "ㅌㅜㄷㅔㅇㅣㅌㅡㄱㄱㅏ"));
72 | }
73 |
74 | for (TestCaseVO vo : testCase) {
75 |
76 | StringReader reader = new StringReader(vo.getOrigin());
77 |
78 | Tokenizer tokenizer = new JasoTokenizer(options);
79 | tokenizer.setReader(reader);
80 | CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
81 |
82 | tokenizer.reset();
83 |
84 | StringBuffer sb = new StringBuffer();
85 |
86 | while (tokenizer.incrementToken()) {
87 | if (sb.length() > 0) sb.append('/');
88 | sb.append(termAtt.toString());
89 | }
90 |
91 | TestCase.assertEquals(vo.getCompare(), sb.toString());
92 | tokenizer.close();
93 |
94 | System.out.printf("%s => %s%n", vo.getOrigin(), sb);
95 | }
96 |
97 | long end = System.currentTimeMillis();
98 | System.out.println("실행 시간 : " + (end - start) / 1000.0);
99 | }
100 | }
--------------------------------------------------------------------------------
/src/test/java/org/elasticsearch/analysis/TestCaseVO.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.analysis;
2 |
3 | /**
4 |  * Immutable test-case value object: an input text paired with the string it is compared against.
5 |  *
6 |  * @author Choi ilkyu (최일규)
7 |  * @since 2016-02-13
8 |  */
9 | public class TestCaseVO {
10 |
11 |     /** Text fed to the tokenizer under test. */
12 |     private final String origin;
13 |     /** Expected result the tokenizer output is compared with. */
14 |     private final String compare;
15 |
16 |     /**
17 |      * @param origin  input text
18 |      * @param compare expected result for that input
19 |      */
20 |     public TestCaseVO(String origin, String compare) {
21 |         this.compare = compare;
22 |         this.origin = origin;
23 |     }
24 |
25 |     /** @return the expected result */
26 |     public String getCompare() {
27 |         return this.compare;
28 |     }
29 |
30 |     /** @return the input text */
31 |     public String getOrigin() {
32 |         return this.origin;
33 |     }
34 | }
35 |
--------------------------------------------------------------------------------