├── .gitignore
├── LICENSE
├── README.md
├── build.gradle
├── docker
    ├── Dockerfile
    └── docker-compose.yml
├── elasticsearch-jaso-analyzer.iml
├── gradle
    └── wrapper
    │   ├── gradle-wrapper.jar
    │   └── gradle-wrapper.properties
├── gradlew
├── gradlew.bat
└── src
    ├── main
        ├── java
        │   └── org
        │   │   └── elasticsearch
        │   │       ├── analysis
        │   │           ├── BaseTokenizer.java
        │   │           ├── JasoAnalyzer.java
        │   │           ├── JasoDecomposer.java
        │   │           ├── JasoTokenizer.java
        │   │           └── TokenizerOptions.java
        │   │       ├── common
        │   │           └── config.java
        │   │       ├── index
        │   │           └── analysis
        │   │           │   ├── JasoAnalyzerProvider.java
        │   │           │   └── JasoTokenizerFactory.java
        │   │       └── plugin
        │   │           └── analysis
        │   │               └── JasoAnalysisPlugin.java
        └── resources
        │   ├── log4j2.xml
        │   └── plugin-descriptor.properties
    └── test
        └── java
            └── org
                └── elasticsearch
                    └── analysis
                        ├── JasoTest.java
                        ├── JasoTokenizerTest.java
                        └── TestCaseVO.java


/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | .idea/
3 | .git/
4 | .gradle/
5 | .DS_Store


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Choi ilkyu
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Korean Jaso Analyzer for Elasticsearch 8.6.2 
  2 |   (자동완성 플러그인)
  3 | 
  4 | ## Build & Packaging
  5 | 
  6 | ###### 터미널 환경에서 자바 버전은 17로 변경해야 합니다.
  7 | ~~~shell
  8 | $ sh gradlew clean build buildPluginZip
  9 | ~~~
 10 | 
 11 | ###### 자동완성용 한글 자소분석기입니다. elasticsearch 8.6.2 에서 테스트 되었습니다
 12 | 
 13 | ## 도커 컨테이너에서 elasticsearch, kibana 설치/실행
 14 | ```
 15 | #플러그인이 자동으로 설치된다.
 16 | cd docker
 17 | docker-compose up -d
 18 | ```
 19 | 
 20 | ## 직접설치
 21 | 
 22 | ###### *설치*
 23 | ```
 24 | bin/elasticsearch-plugin install https://github.com/netcrazy/elasticsearch-jaso-analyzer/releases/download/v8.6.2/jaso-analyzer-plugin-8.6.2-plugin.zip
 25 | ```
 26 | 
 27 | ###### *삭제 (필요시)*
 28 | ```
 29 | bin/elasticsearch-plugin remove jaso-analyzer
 30 | ```
 31 | 
 32 | ###### *인덱스 삭제 (필요시)*
 33 | ```
 34 | curl -XDELETE 'http://localhost:9200/jaso'
 35 | ```
 36 | 
 37 | ###### *Korean Jaso Analyzer 설정 및 인덱스 생성 (기본 자소검색용)*
 38 | ```
 39 | curl -XPUT -H 'Content-Type: application/json' localhost:9200/jaso -d '{
 40 |   "settings": {
 41 |     "index": {
 42 |       "analysis": {
 43 |         "filter": {
 44 |           "suggest_filter": {
 45 |             "type": "edge_ngram",
 46 |             "min_gram": 1,
 47 |             "max_gram": 50
 48 |           }
 49 |         },
 50 |         "analyzer": {
 51 |           "suggest_search_analyzer": {
 52 |             "type": "custom",
 53 |             "tokenizer": "jaso_tokenizer"
 54 |           },
 55 |           "suggest_index_analyzer": {
 56 |             "type": "custom",
 57 |             "tokenizer": "jaso_tokenizer",
 58 |             "filter": [
 59 |               "suggest_filter"
 60 |             ]
 61 |           }
 62 |         }
 63 |       }
 64 |     }
 65 |   }
 66 | }'
 67 | ```
 68 | 
 69 | ###### *Korean Jaso Analyzer 설정 및 인덱스 생성 (한,영오타 및 초성토큰 추출이 필요할 때..)*
 70 | ```
 71 | curl -XPUT -H 'Content-Type: application/json' http://localhost:9200/jaso/ -d '{
 72 |   "settings": {
 73 |     "index": {
 74 |       "analysis": {
 75 |         "filter": {
 76 |           "suggest_filter": {
 77 |             "type": "edge_ngram",
 78 |             "min_gram": 1,
 79 |             "max_gram": 50
 80 |           }
 81 |         },
 82 |         "tokenizer": {
 83 |           "jaso_search_tokenizer": {
 84 |             "type": "jaso_tokenizer",
 85 |             "mistype": true,
 86 |             "chosung": false
 87 |           },
 88 |           "jaso_index_tokenizer": {
 89 |             "type": "jaso_tokenizer",
 90 |             "mistype": true,
 91 |             "chosung": true
 92 |           }
 93 |         },
 94 |         "analyzer": {
 95 |           "suggest_search_analyzer": {
 96 |             "type": "custom",
 97 |             "tokenizer": "jaso_search_tokenizer"
 98 |           },
 99 |           "suggest_index_analyzer": {
100 |             "type": "custom",
101 |             "tokenizer": "jaso_index_tokenizer",
102 |             "filter": [
103 |               "suggest_filter"
104 |             ]
105 |           }
106 |         }
107 |       }
108 |     }
109 |   }
110 | }'
111 | ```
112 | 
113 | ###### *인덱스 맵핑*
114 | ```
115 | curl -XPUT -H 'Content-Type: application/json' http://localhost:9200/jaso/_mapping -d '{
116 |   "properties": {
117 |     "name": {
118 |       "type": "text",
119 |       "store": true,
120 |       "analyzer": "suggest_index_analyzer",
121 |       "search_analyzer": "suggest_search_analyzer"
122 |     }
123 |   }
124 | }'
125 | ```
126 | 
127 | 
128 | ###### *인덱스타임 분석기 테스트*
129 | ```
130 | curl -XPOST -H 'Content-Type: application/json' http://localhost:9200/jaso/_analyze?pretty=true -d '{
131 |     "analyzer" : "suggest_index_analyzer",
132 |     "text" : "최일규 Hello"
133 | }'
134 | ```
135 | 
136 | 
137 | ###### *쿼리타임 분석기 테스트*
138 | ```
139 | curl -XPOST -H 'Content-Type: application/json' http://localhost:9200/jaso/_analyze?pretty=true -d '{
140 |     "analyzer" : "suggest_search_analyzer",
141 |     "text" : "쵱"
142 | }'
143 | ```
144 | 
145 | 
146 | ###### *문서생성*
147 | ```
148 | curl -XPOST -H 'Content-Type: application/json' http://localhost:9200/jaso/_doc?pretty=true -d '{
149 |     "name":"최일규 Hello"
150 | }'
151 | 
152 | curl -XPOST -H 'Content-Type: application/json' http://localhost:9200/jaso/_doc?pretty=true -d '{
153 |     "name":"초아"
154 | }'
155 | ```
156 | 
157 | ###### *문서검색*
158 | ```
159 | curl -XPOST -H 'Content-Type: application/json' http://localhost:9200/jaso/_search?pretty=true -d '{
160 |     "query" : {
161 |         "match" : { "name" : "초" }
162 |     }
163 | }'
164 | 
165 | curl -XPOST -H 'Content-Type: application/json' http://localhost:9200/jaso/_search?pretty=true -d '{
166 |     "query" : {
167 |         "match" : { "name" : "ㅊㅇㄱ" }
168 |     }
169 | }'
170 | ```
171 | 


--------------------------------------------------------------------------------
/build.gradle:
--------------------------------------------------------------------------------
 1 | apply plugin: 'java'
 2 | 
 3 | // Compile for Java 17 (the JDK version the README asks for).
 4 | compileJava {
 5 |     sourceCompatibility = JavaVersion.VERSION_17
 6 |     targetCompatibility = JavaVersion.VERSION_17
 7 | }
 8 | 
 9 | // Also used below as the version of the Elasticsearch dependency.
10 | version = '8.6.2'
11 | jar {
12 |     manifest {
13 |         attributes 'Implementation-Title': 'Elasticsearch Jaso Analyzer Plugin',
14 |                    'Implementation-Version': version
15 |     }
16 | }
17 | 
18 | repositories {
19 |     mavenCentral()
20 | }
21 | 
22 | dependencies {
23 |     implementation group: 'org.elasticsearch', name: 'elasticsearch', version: version
24 |     implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.1'
25 |     implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.1'
26 |     implementation group: 'org.slf4j', name: 'slf4j-api', version: '1.7.32'
27 |     testImplementation group: 'junit', name: 'junit', version: '4.+'
28 | }
29 | 
30 | test {
31 |     systemProperties 'property': 'value'
32 | }
33 | 
34 | // Packages the plugin jar and the resources into
35 | // jaso-analyzer-plugin-<version>-plugin.zip, the file name docker/Dockerfile copies.
36 | task buildPluginZip(type: Zip, dependsOn: [':jar']) {
37 |     // archiveBaseName/archiveClassifier replace the deprecated baseName/classifier properties.
38 |     archiveBaseName = 'jaso-analyzer-plugin'
39 |     archiveClassifier = 'plugin'
40 |     from 'build/libs'
41 |     from 'src/main/resources'
42 | 
43 |     // Copy the finished archive next to the Dockerfile. This has to happen in doLast:
44 |     // a bare copy { } in this closure runs while the build is being configured, i.e.
45 |     // before the zip has been written.
46 |     doLast {
47 |         def pluginZip = archiveFile.get().asFile
48 |         project.copy {
49 |             from pluginZip
50 |             into 'docker'
51 |         }
52 |     }
53 | }
54 | 
55 | artifacts {
56 |     archives buildPluginZip
57 | }
58 | 
59 | [ compileJava, compileTestJava ]*.options*.encoding = 'UTF-8'
60 | [ compileJava, compileTestJava ]*.options*.compilerArgs = ['-Xlint:-options']


--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM docker.elastic.co/elasticsearch/elasticsearch:8.6.2
2 | 
3 | COPY jaso-analyzer-plugin-8.6.2-plugin.zip /tmp/
4 | RUN /usr/share/elasticsearch/bin/elasticsearch-plugin install file:///tmp/jaso-analyzer-plugin-8.6.2-plugin.zip


--------------------------------------------------------------------------------
/docker/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | version: '2.2'
 2 | services:
 3 |   elasticsearch:
 4 |     build:
 5 |       context: .
 6 |       dockerfile: ./Dockerfile
 7 |     container_name: es-node-01
 8 |     environment:
 9 |       - cluster.name=es-docker-cluster
10 |       - node.name=es01
11 |       - xpack.security.enabled=false
12 |       - discovery.type=single-node
13 |     ulimits:
14 |       memlock:
15 |         soft: -1
16 |         hard: -1
17 |       nofile:
18 |         soft: 262144
19 |         hard: 262144
20 |     cap_add:
21 |       - IPC_LOCK
22 |     volumes:
23 |       - data01:/usr/share/elasticsearch/data
24 |     ports:
25 |       - "9200:9200"
26 |       - "9300:9300"
27 |     networks:
28 |       - es-net
29 | 
30 |   kibana:
31 |     container_name: kibana
32 |     image: docker.elastic.co/kibana/kibana:8.6.2
33 |     environment:
34 |       ELASTICSEARCH_URL: http://es-node-01:9200
35 |       ELASTICSEARCH_HOSTS: http://es-node-01:9200
36 |     ports:
37 |       - "5601:5601"
38 |     depends_on:
39 |       - elasticsearch
40 |     networks:
41 |       - es-net
42 | 
43 | volumes:
44 |   data01:
45 |     driver: local
46 |   data02:
47 |     driver: local
48 | 
49 | networks:
50 |   es-net:
51 |     driver: bridge


--------------------------------------------------------------------------------
/elasticsearch-jaso-analyzer.iml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <module external.linked.project.id="elasticsearch-jaso-analyzer" external.linked.project.path="$MODULE_DIR$" external.root.project.path="$MODULE_DIR$" external.system.id="GRADLE" external.system.module.group="" external.system.module.version="8.6.2" type="JAVA_MODULE" version="4">
 3 |   <component name="NewModuleRootManager" inherit-compiler-output="true">
 4 |     <exclude-output />
 5 |     <content url="file://$MODULE_DIR$">
 6 |       <excludeFolder url="file://$MODULE_DIR$/.gradle" />
 7 |       <excludeFolder url="file://$MODULE_DIR$/build" />
 8 |     </content>
 9 |     <orderEntry type="inheritedJdk" />
10 |     <orderEntry type="sourceFolder" forTests="false" />
11 |   </component>
12 | </module>


--------------------------------------------------------------------------------
/gradle/wrapper/gradle-wrapper.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netcrazy/elasticsearch-jaso-analyzer/d7204d0f698040bd53c9816e6daee9d09a38f127/gradle/wrapper/gradle-wrapper.jar


--------------------------------------------------------------------------------
/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
1 | distributionBase=GRADLE_USER_HOME
2 | distributionPath=wrapper/dists
3 | distributionUrl=https\://services.gradle.org/distributions/gradle-7.4.2-bin.zip
4 | zipStoreBase=GRADLE_USER_HOME
5 | zipStorePath=wrapper/dists
6 | 


--------------------------------------------------------------------------------
/gradlew:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env sh
  2 | 
  3 | #
  4 | # Copyright 2015 the original author or authors.
  5 | #
  6 | # Licensed under the Apache License, Version 2.0 (the "License");
  7 | # you may not use this file except in compliance with the License.
  8 | # You may obtain a copy of the License at
  9 | #
 10 | #      https://www.apache.org/licenses/LICENSE-2.0
 11 | #
 12 | # Unless required by applicable law or agreed to in writing, software
 13 | # distributed under the License is distributed on an "AS IS" BASIS,
 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 | # See the License for the specific language governing permissions and
 16 | # limitations under the License.
 17 | #
 18 | 
 19 | ##############################################################################
 20 | ##
 21 | ##  Gradle start up script for UN*X
 22 | ##
 23 | ##############################################################################
 24 | 
 25 | # Attempt to set APP_HOME
 26 | # Resolve links: $0 may be a link
 27 | PRG="$0"
 28 | # Need this for relative symlinks.
 29 | while [ -h "$PRG" ] ; do
 30 |     ls=`ls -ld "$PRG"`
 31 |     link=`expr "$ls" : '.*-> \(.*\)$'`
 32 |     if expr "$link" : '/.*' > /dev/null; then
 33 |         PRG="$link"
 34 |     else
 35 |         PRG=`dirname "$PRG"`"/$link"
 36 |     fi
 37 | done
 38 | SAVED="`pwd`"
 39 | cd "`dirname \"$PRG\"`/" >/dev/null
 40 | APP_HOME="`pwd -P`"
 41 | cd "$SAVED" >/dev/null
 42 | 
 43 | APP_NAME="Gradle"
 44 | APP_BASE_NAME=`basename "$0"`
 45 | 
 46 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
 47 | DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
 48 | 
 49 | # Use the maximum available, or set MAX_FD != -1 to use that value.
 50 | MAX_FD="maximum"
 51 | 
 52 | warn () {
 53 |     echo "$*"
 54 | }
 55 | 
 56 | die () {
 57 |     echo
 58 |     echo "$*"
 59 |     echo
 60 |     exit 1
 61 | }
 62 | 
 63 | # OS specific support (must be 'true' or 'false').
 64 | cygwin=false
 65 | msys=false
 66 | darwin=false
 67 | nonstop=false
 68 | case "`uname`" in
 69 |   CYGWIN* )
 70 |     cygwin=true
 71 |     ;;
 72 |   Darwin* )
 73 |     darwin=true
 74 |     ;;
 75 |   MINGW* )
 76 |     msys=true
 77 |     ;;
 78 |   NONSTOP* )
 79 |     nonstop=true
 80 |     ;;
 81 | esac
 82 | 
 83 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
 84 | 
 85 | 
 86 | # Determine the Java command to use to start the JVM.
 87 | if [ -n "$JAVA_HOME" ] ; then
 88 |     if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
 89 |         # IBM's JDK on AIX uses strange locations for the executables
 90 |         JAVACMD="$JAVA_HOME/jre/sh/java"
 91 |     else
 92 |         JAVACMD="$JAVA_HOME/bin/java"
 93 |     fi
 94 |     if [ ! -x "$JAVACMD" ] ; then
 95 |         die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
 96 | 
 97 | Please set the JAVA_HOME variable in your environment to match the
 98 | location of your Java installation."
 99 |     fi
100 | else
101 |     JAVACMD="java"
102 |     which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
103 | 
104 | Please set the JAVA_HOME variable in your environment to match the
105 | location of your Java installation."
106 | fi
107 | 
108 | # Increase the maximum file descriptors if we can.
109 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
110 |     MAX_FD_LIMIT=`ulimit -H -n`
111 |     if [ $? -eq 0 ] ; then
112 |         if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
113 |             MAX_FD="$MAX_FD_LIMIT"
114 |         fi
115 |         ulimit -n $MAX_FD
116 |         if [ $? -ne 0 ] ; then
117 |             warn "Could not set maximum file descriptor limit: $MAX_FD"
118 |         fi
119 |     else
120 |         warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
121 |     fi
122 | fi
123 | 
124 | # For Darwin, add options to specify how the application appears in the dock
125 | if $darwin; then
126 |     GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
127 | fi
128 | 
129 | # For Cygwin or MSYS, switch paths to Windows format before running java
130 | if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
131 |     APP_HOME=`cygpath --path --mixed "$APP_HOME"`
132 |     CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
133 | 
134 |     JAVACMD=`cygpath --unix "$JAVACMD"`
135 | 
136 |     # We build the pattern for arguments to be converted via cygpath
137 |     ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
138 |     SEP=""
139 |     for dir in $ROOTDIRSRAW ; do
140 |         ROOTDIRS="$ROOTDIRS$SEP$dir"
141 |         SEP="|"
142 |     done
143 |     OURCYGPATTERN="(^($ROOTDIRS))"
144 |     # Add a user-defined pattern to the cygpath arguments
145 |     if [ "$GRADLE_CYGPATTERN" != "" ] ; then
146 |         OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
147 |     fi
148 |     # Now convert the arguments - kludge to limit ourselves to /bin/sh
149 |     i=0
150 |     for arg in "$@" ; do
151 |         CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
152 |         CHECK2=`echo "$arg"|egrep -c "^-"`                                 ### Determine if an option
153 | 
154 |         if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then                    ### Added a condition
155 |             eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
156 |         else
157 |             eval `echo args$i`="\"$arg\""
158 |         fi
159 |         i=`expr $i + 1`
160 |     done
161 |     case $i in
162 |         0) set -- ;;
163 |         1) set -- "$args0" ;;
164 |         2) set -- "$args0" "$args1" ;;
165 |         3) set -- "$args0" "$args1" "$args2" ;;
166 |         4) set -- "$args0" "$args1" "$args2" "$args3" ;;
167 |         5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
168 |         6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
169 |         7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
170 |         8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
171 |         9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
172 |     esac
173 | fi
174 | 
175 | # Escape application args
176 | save () {
177 |     for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
178 |     echo " "
179 | }
180 | APP_ARGS=`save "$@"`
181 | 
182 | # Collect all arguments for the java command, following the shell quoting and substitution rules
183 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
184 | 
185 | exec "$JAVACMD" "$@"
186 | 


--------------------------------------------------------------------------------
/gradlew.bat:
--------------------------------------------------------------------------------
 1 | @rem
 2 | @rem Copyright 2015 the original author or authors.
 3 | @rem
 4 | @rem Licensed under the Apache License, Version 2.0 (the "License");
 5 | @rem you may not use this file except in compliance with the License.
 6 | @rem You may obtain a copy of the License at
 7 | @rem
 8 | @rem      https://www.apache.org/licenses/LICENSE-2.0
 9 | @rem
10 | @rem Unless required by applicable law or agreed to in writing, software
11 | @rem distributed under the License is distributed on an "AS IS" BASIS,
12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | @rem See the License for the specific language governing permissions and
14 | @rem limitations under the License.
15 | @rem
16 | 
17 | @if "%DEBUG%" == "" @echo off
18 | @rem ##########################################################################
19 | @rem
20 | @rem  Gradle startup script for Windows
21 | @rem
22 | @rem ##########################################################################
23 | 
24 | @rem Set local scope for the variables with windows NT shell
25 | if "%OS%"=="Windows_NT" setlocal
26 | 
27 | set DIRNAME=%~dp0
28 | if "%DIRNAME%" == "" set DIRNAME=.
29 | set APP_BASE_NAME=%~n0
30 | set APP_HOME=%DIRNAME%
31 | 
32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter.
33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
34 | 
35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
37 | 
38 | @rem Find java.exe
39 | if defined JAVA_HOME goto findJavaFromJavaHome
40 | 
41 | set JAVA_EXE=java.exe
42 | %JAVA_EXE% -version >NUL 2>&1
43 | if "%ERRORLEVEL%" == "0" goto execute
44 | 
45 | echo.
46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
47 | echo.
48 | echo Please set the JAVA_HOME variable in your environment to match the
49 | echo location of your Java installation.
50 | 
51 | goto fail
52 | 
53 | :findJavaFromJavaHome
54 | set JAVA_HOME=%JAVA_HOME:"=%
55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe
56 | 
57 | if exist "%JAVA_EXE%" goto execute
58 | 
59 | echo.
60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
61 | echo.
62 | echo Please set the JAVA_HOME variable in your environment to match the
63 | echo location of your Java installation.
64 | 
65 | goto fail
66 | 
67 | :execute
68 | @rem Setup the command line
69 | 
70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
71 | 
72 | 
73 | @rem Execute Gradle
74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*
75 | 
76 | :end
77 | @rem End local scope for the variables with windows NT shell
78 | if "%ERRORLEVEL%"=="0" goto mainEnd
79 | 
80 | :fail
81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
82 | rem the _cmd.exe /c_ return code!
83 | if  not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
84 | exit /b 1
85 | 
86 | :mainEnd
87 | if "%OS%"=="Windows_NT" endlocal
88 | 
89 | :omega
90 | 


--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/analysis/BaseTokenizer.java:
--------------------------------------------------------------------------------
  1 | package org.elasticsearch.analysis;
  2 | 
  3 | import org.apache.lucene.analysis.Tokenizer;
  4 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  5 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
  6 | import org.apache.lucene.analysis.CharacterUtils;
  7 | import org.apache.lucene.analysis.CharacterUtils.CharacterBuffer;
  8 | 
  9 | import java.io.*;
 10 | 
 11 | /**
 12 |  * Base implementation of the jaso (Korean letter) tokenizer.
 13 |  *
 14 |  * <p>The complete input is read and decomposed by {@link JasoDecomposer}; the decomposed text
 15 |  * is then cut into tokens made of the characters accepted by {@link #isTokenChar(int)}.
 16 |  * The offsets reported through {@link OffsetAttribute} are positions in the decomposed text
 17 |  * (passed through {@code correctOffset}).
 18 |  *
 19 |  * @author 최일규
 20 |  * @since 2016-02-10
 21 |  */
 22 | public abstract class BaseTokenizer extends Tokenizer {
 23 | 
 24 |     /** A token is emitted as soon as it has grown to this many chars. */
 25 |     private static final int MAX_WORD_LEN = 2048;
 26 |     /** Capacity of the buffer the decomposed text is read through. */
 27 |     private static final int IO_BUFFER_SIZE = 4096;
 28 | 
 29 |     private final TokenizerOptions options;
 30 | 
 31 |     private final CharTermAttribute termAtt;
 32 |     private final OffsetAttribute offsetAtt;
 33 | 
 34 |     private final CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE);
 35 | 
 36 |     private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
 37 | 
 38 |     /**
 39 |      * Reader over the decomposed form of the current input. It is created by the first buffer
 40 |      * fill of a stream and dropped by {@link #reset()}, so that every later fill continues
 41 |      * where the previous one stopped.
 42 |      */
 43 |     private Reader decomposedInput;
 44 | 
 45 |     /**
 46 |      * @param options options handed to {@link #jasoDecompose(Reader, TokenizerOptions)}
 47 |      */
 48 |     protected BaseTokenizer(TokenizerOptions options) {
 49 |         this.options = options;
 50 | 
 51 |         termAtt = addAttribute(CharTermAttribute.class);
 52 |         offsetAtt = addAttribute(OffsetAttribute.class);
 53 |     }
 54 | 
 55 |     /**
 56 |      * Tells whether a code point of the decomposed text is part of a token. Subclasses must
 57 |      * override this method; this default implementation always throws.
 58 |      *
 59 |      * @param c the code point to test
 60 |      * @return {@code true} if the code point belongs to a token
 61 |      * @throws UnsupportedOperationException always, unless overridden
 62 |      */
 63 |     protected boolean isTokenChar(int c) {
 64 |         throw new UnsupportedOperationException("Subclasses of CharTokenizer must implement isTokenChar(int)");
 65 |     }
 66 | 
 67 |     /**
 68 |      * Hook applied to every token code point before it is stored in the term.
 69 |      *
 70 |      * @param c the code point to normalize
 71 |      * @return the code point to store; this implementation returns {@code c} unchanged
 72 |      */
 73 |     protected int normalize(int c) {
 74 |         return c;
 75 |     }
 76 | 
 77 |     /**
 78 |      * Produces the next token of the decomposed text.
 79 |      *
 80 |      * <p>Original note (2014-07-11, 최일규): with Lucene 4.2x {@code CharacterUtils.fill}
 81 |      * returned {@code true} after the decomposition when data was present, but here it returns
 82 |      * {@code false}, so the end of the input is detected from the length of {@code ioBuffer}.
 83 |      *
 84 |      * @return {@code true} if a token was produced, {@code false} at the end of the input
 85 |      * @throws IOException if reading the decomposed text fails
 86 |      */
 87 |     @Override
 88 |     public final boolean incrementToken() throws IOException {
 89 |         clearAttributes();
 90 | 
 91 |         int length = 0;
 92 |         int start = -1; // this variable is always initialized
 93 |         char[] buffer = termAtt.buffer();
 94 |         while (true) {
 95 |             if (bufferIndex >= dataLen) {
 96 | 
 97 |                 offset += dataLen;
 98 |                 // Decompose only once per stream. The first call drains `input` completely, so
 99 |                 // decomposing again on a refill would lose everything beyond the first
100 |                 // IO_BUFFER_SIZE chars of the decomposed text.
101 |                 if (decomposedInput == null) {
102 |                     decomposedInput = jasoDecompose(input, this.options);
103 |                 }
104 |                 CharacterUtils.fill(ioBuffer, decomposedInput);
105 | 
106 |                 // Keep analysing while the buffer has content (this method is called again
107 |                 // and again until it returns false).
108 |                 if (ioBuffer.getLength() == 0) {
109 |                     dataLen = 0; // so next offset += dataLen won't decrement offset
110 |                     if (length > 0) {
111 |                         break;
112 |                     } else {
113 |                         finalOffset = correctOffset(offset);
114 |                         return false;
115 |                     }
116 |                 }
117 |                 dataLen = ioBuffer.getLength();
118 |                 bufferIndex = 0;
119 |             }
120 |             // read a whole code point so that surrogate pairs stay together
121 |             final int c = Character.codePointAt(ioBuffer.getBuffer(), bufferIndex, dataLen);
122 |             final int charCount = Character.charCount(c);
123 |             bufferIndex += charCount;
124 | 
125 |             // if it's a token char
126 |             if (isTokenChar(c)) {
127 | 
128 |                 // start of token
129 |                 if (length == 0) {
130 |                     assert start == -1;
131 |                     // bufferIndex already points past c, so step back by its full width
132 |                     // (2 chars for a supplementary code point, not always 1)
133 |                     start = offset + bufferIndex - charCount;
134 | 
135 |                     // check if a supplementary could run out of bounds
136 |                 } else if (length >= buffer.length - 1) {
137 | 
138 |                     // make sure a supplementary fits in the buffer
139 |                     buffer = termAtt.resizeBuffer(2 + length);
140 |                 }
141 | 
142 |                 // buffer it, normalized
143 |                 length += Character.toChars(normalize(c), buffer, length);
144 |                 if (length >= MAX_WORD_LEN) {
145 |                     break;
146 |                 }
147 |             } else if (length > 0) {
148 |                 // a non-token char ends the current token
149 |                 break;
150 |             }
151 |         }
152 | 
153 |         termAtt.setLength(length);
154 |         assert start != -1;
155 |         offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(start + length));
156 |         return true;
157 |     }
158 | 
159 |     /**
160 |      * Ends the stream and publishes the final offset.
161 |      */
162 |     @Override
163 |     public final void end() {
164 |         // The TokenStream contract asks overriding classes to call super.end() first. The
165 |         // signature of this method declares no checked exception, so an IOException is wrapped.
166 |         try {
167 |             super.end();
168 |         } catch (IOException e) {
169 |             throw new UncheckedIOException(e);
170 |         }
171 |         // set final offset
172 |         offsetAtt.setOffset(finalOffset, finalOffset);
173 |     }
174 | 
175 |     /**
176 |      * Reads {@code in} to its end, runs the text through {@link JasoDecomposer} and returns a
177 |      * reader over the result (Reader -> String -> jaso decomposition -> String -> Reader).
178 |      *
179 |      * <p>This is best effort: if reading or decomposing throws, the exception is dropped and
180 |      * {@code in} itself is returned, in whatever state the failure left it.
181 |      *
182 |      * @param in      the reader to consume
183 |      * @param options options passed on to {@code JasoDecomposer.runJasoDecompose}
184 |      * @return a reader over the decomposed text, or {@code in} after a failure
185 |      */
186 |     public static Reader jasoDecompose(Reader in, TokenizerOptions options) {
187 |         JasoDecomposer decomposer = new JasoDecomposer();
188 |         StringBuilder text = new StringBuilder();
189 |         char[] chunk = new char[2048];
190 | 
191 |         try {
192 |             int n;
193 |             while ((n = in.read(chunk)) != -1) {
194 |                 text.append(chunk, 0, n);
195 |             }
196 |             return new StringReader(decomposer.runJasoDecompose(text.toString(), options));
197 |         } catch (Exception ignored) {
198 |             // deliberately ignored: fall back to the reader that was passed in
199 |             return in;
200 |         }
201 |     }
202 | 
203 |     /**
204 |      * Resets the tokenizer state so that the next call to {@link #incrementToken()} starts
205 |      * decomposing the current input from scratch.
206 |      */
207 |     @Override
208 |     public void reset() throws IOException {
209 |         super.reset();
210 |         bufferIndex = 0;
211 |         offset = 0;
212 |         dataLen = 0;
213 |         finalOffset = 0;
214 |         decomposedInput = null; // the next stream gets its own decomposed reader
215 |         ioBuffer.reset(); // make sure to reset the IO buffer!!
216 |     }
217 | }


--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/analysis/JasoAnalyzer.java:
--------------------------------------------------------------------------------
 1 | package org.elasticsearch.analysis;
 2 | 
 3 | import org.apache.lucene.analysis.Analyzer;
 4 | 
 5 | /**
 6 |  * Lucene {@link Analyzer} whose components consist of a single {@link JasoTokenizer}
 7 |  * created with the options {@code TokenizerOptions.create("jaso_analyzer")}.
 8 |  *
 9 |  * @author 최일규
10 |  * @since 2018-03-21
11 |  */
12 | public class JasoAnalyzer extends Analyzer {
13 | 
14 |     /** Creates the analyzer; the class declares no fields of its own. */
15 |     public JasoAnalyzer() {
16 |         super();
17 |     }
18 | 
19 |     /**
20 |      * Builds the analysis components for a field.
21 |      *
22 |      * @param fieldName name of the field being analyzed; not used by this implementation
23 |      * @return components wrapping a newly created {@link JasoTokenizer}
24 |      */
25 |     @Override
26 |     protected TokenStreamComponents createComponents(final String fieldName) {
27 |         final JasoTokenizer tokenizer = new JasoTokenizer(TokenizerOptions.create("jaso_analyzer"));
28 |         return new TokenStreamComponents(tokenizer);
29 |     }
30 | }


--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/analysis/JasoDecomposer.java:
--------------------------------------------------------------------------------
  1 | package org.elasticsearch.analysis;
  2 | 
  3 | /**
  4 |  * 자동완성용 자소분해 (자소분해 with WhiteSpace)
  5 |  *
  6 |  * @author 최일규
  7 |  * @since 2016-02-10
  8 |  */
  9 | public class JasoDecomposer {
 10 | 
 11 |     //초성(19자) ㄱ ㄲ ㄴ ㄷ ㄸ ㄹ ㅁ ㅂ ㅃ ㅅ ㅆ ㅇ ㅈ ㅉ ㅊ ㅋ ㅌ ㅍ ㅎ
 12 |     static String[] chosungKor = {"ㄱ", "ㄱㄱ", "ㄴ", "ㄷ", "ㄷㄷ", "ㄹ", "ㅁ", "ㅂ", "ㅂㅂ", "ㅅ", "ㅅㅅ", "ㅇ", "ㅈ", "ㅈㅈ", "ㅊ", "ㅋ", "ㅌ", "ㅍ", "ㅎ"};
 13 |     //중성(21자) ㅏ ㅐ ㅑ ㅒ ㅓ ㅔ ㅕ ㅖ ㅗ ㅘ(9) ㅙ(10) ㅚ(11) ㅛ ㅜ ㅝ(14) ㅞ(15) ㅟ(16) ㅠ ㅡ ㅢ(19) ㅣ
 14 |     static String[] jungsungKor = {"ㅏ", "ㅐ", "ㅑ", "ㅒ", "ㅓ", "ㅔ", "ㅕ", "ㅖ", "ㅗ", "ㅗㅏ", "ㅗㅐ", "ㅗㅣ", "ㅛ", "ㅜ", "ㅜㅓ", "ㅜㅔ", "ㅜㅣ", "ㅠ", "ㅡ", "ㅡㅣ", "ㅣ"};
 15 |     //종성(28자) <없음> ㄱ ㄲ ㄳ(3) ㄴ ㄵ(5) ㄶ(6) ㄷ ㄹ ㄺ(9) ㄻ(10) ㄼ(11) ㄽ(12) ㄾ(13) ㄿ(14) ㅀ(15) ㅁ ㅂ ㅄ(18) ㅅ ㅆ ㅇ ㅈ ㅊ ㅋ ㅌ ㅍ ㅎ
 16 |     static String[] jongsungKor = {" ", "ㄱ", "ㄱㄱ", "ㄱㅅ", "ㄴ", "ㄴㅈ", "ㄴㅎ", "ㄷ", "ㄹ", "ㄹㄱ", "ㄹㅁ", "ㄹㅂ", "ㄹㅅ", "ㄹㅌ", "ㄹㅍ", "ㄹㅎ", "ㅁ", "ㅂ", "ㅂㅅ", "ㅅ", "ㅅㅅ", "ㅇ", "ㅈ", "ㅊ", "ㅋ", "ㅌ", "ㅍ", "ㅎ"};
 17 | 
 18 |     static String[] chosungEng = {"r", "R", "s", "e", "E", "f", "a", "q", "Q", "t", "T", "d", "w", "W", "c", "z", "x", "v", "g"};
 19 |     static String[] jungsungEng = {"k", "o", "i", "O", "j", "p", "u", "P", "h", "hk", "ho", "hl", "y", "n", "nj", "np", "nl", "b", "m", "ml", "l"};
 20 |     static String[] jongsungEng = {"", "r", "R", "rt", "s", "sw", "sg", "e", "f", "fr", "fa", "fq", "ft", "fx", "fv", "fg", "a", "q", "qt", "t", "T", "d", "w", "c", "z", "x", "v", "g"};
 21 | 
 22 |     static String[] mistyping = {"ㅁ", "ㅠ", "ㅊ", "ㅇ", "ㄷ", "ㄹ", "ㅎ", "ㅗ", "ㅑ", "ㅓ", "ㅏ", "ㅣ", "ㅡ", "ㅜ", "ㅐ", "ㅔ", "ㅂ", "ㄱ", "ㄴ", "ㅅ", "ㅕ", "ㅍ", "ㅈ", "ㅌ", "ㅛ", "ㅋ"};
 23 | 
 24 |     public String runJasoDecompose(String originStr, TokenizerOptions options) {
 25 | 
 26 |         if (!originStr.isEmpty()) {
 27 | 
 28 |             //lowercase 처리
 29 |             originStr = originStr.toLowerCase();
 30 | 
 31 |             char[] termBuffer = originStr.toCharArray();
 32 |             StringBuilder korBuffer = new StringBuilder();
 33 |             StringBuilder engBuffer = new StringBuilder();
 34 |             StringBuilder chosungBuffer = new StringBuilder();
 35 |             StringBuilder mistypingBuffer = new StringBuilder();
 36 |             StringBuilder etcBuffer = new StringBuilder();
 37 |             StringBuilder returnBuffer = new StringBuilder();
 38 | 
 39 |             //첫글자가 한글일때만 초성분해
 40 |             boolean firstCharType = false;
 41 |             if (termBuffer.length > 0)
 42 |                 firstCharType = isHangul(Character.toString(termBuffer[0]));
 43 | 
 44 |             //자소포함여부
 45 |             boolean jaso = isJaso(originStr);
 46 |             //한글포함여부
 47 |             boolean hangul = isHangul(originStr);
 48 |             //영문포함여부
 49 |             boolean english = isEnglish(originStr);
 50 | 
 51 |             int strLen = originStr.length();
 52 | 
 53 |             int cho;
 54 |             int jung;
 55 |             int jong;
 56 |             for (char ch : termBuffer) {
 57 |                 //가(AC00)~힣(D7A3) 에 속한 글자면 분해
 58 |                 if (ch >= 0xAC00 && ch <= 0xD7A3 && !jaso) {
 59 |                     //Unicode 값으로 환산한다.
 60 |                     int uniValue = ch - 0xAC00;
 61 | 
 62 |                     jong = uniValue % 28;                   //종성
 63 |                     cho = ((uniValue - jong) / 28) / 21;    //초성
 64 |                     jung = ((uniValue - jong) / 28) % 21;   //중성
 65 | 
 66 |                     //한글초성
 67 |                     korBuffer.append(chosungKor[cho]);
 68 | 
 69 |                     //한글에 대한 초성처리 (일반적으로 색인시 초성을 담는다.)
 70 |                     if (options.isChosung() && firstCharType) {
 71 |                         //초성은 2자이상일때 포함
 72 |                         if (strLen >= 2)
 73 |                             chosungBuffer.append(chosungKor[cho]);
 74 |                     }
 75 | 
 76 |                     //한글문장에 대한 영문오타처리 (ㄱ -> r)
 77 |                     if (options.isMistype()) {
 78 |                         engBuffer.append(chosungEng[cho].toLowerCase());
 79 |                     }
 80 | 
 81 |                     //한글중성
 82 |                     korBuffer.append(jungsungKor[jung]);
 83 | 
 84 |                     //한글문장에 대한 영문오타처리 (ㅏ-> k)
 85 |                     if (options.isMistype()) {
 86 |                         engBuffer.append(jungsungEng[jung].toLowerCase());
 87 |                     }
 88 | 
 89 |                     //받침이 있으면
 90 |                     if (jong != 0) {
 91 |                         korBuffer.append(jongsungKor[jong]);
 92 | 
 93 |                         //한글문장에 대한 영문오타처리 (ㄲ -> R)
 94 |                         if (options.isMistype()) {
 95 |                             engBuffer.append(jongsungEng[jong].toLowerCase());
 96 |                         }
 97 |                     }
 98 |                 } else {
 99 | 
100 |                     if (options.isMistype()) {
101 |                         if (!jaso) {
102 |                             if (hangul) {
103 |                                 korBuffer.append(ch);
104 |                             }
105 |                             engBuffer.append(ch);
106 |                         }
107 |                     } else {
108 |                         if (!jaso) {
109 |                             if (hangul) {
110 |                                 korBuffer.append(ch);
111 |                             } else {
112 |                                 engBuffer.append(ch);
113 |                             }
114 |                         }
115 |                     }
116 | 
117 |                     //영문문장에 대한 한글오타처리 (hello -> ㅗ디ㅣㅐ)
118 |                     if (options.isMistype() && !hangul) {
119 |                         int index;
120 |                         if (ch >= 0x61 && ch <= 0x7A) {
121 |                             //소문자
122 |                             index = (int) ch - 97;
123 |                             mistypingBuffer.append(mistyping[index]);
124 |                         } else if (ch >= 0x41 && ch <= 0x5A) {
125 |                             //대문자
126 |                             index = (int) ch - 65;
127 |                             mistypingBuffer.append(mistyping[index]);
128 |                         } else {
129 |                             if (english)
130 |                                 mistypingBuffer.append(ch);
131 |                         }
132 |                     }
133 |                 }
134 | 
135 |                 //추가적인 예외상황으로 추가 토큰처리 (ㅗ디ㅣㅐ -> ㅗㄷㅣㅣㅐ 자소분해)
136 |                 if (jaso) {
137 | 
138 |                     if (ch >= 0xAC00 && ch <= 0xD7A3) {
139 |                         //Unicode 값으로 환산한다.
140 |                         int uniValue = ch - 0xAC00;
141 | 
142 |                         jong = uniValue % 28;                   //종성
143 |                         cho = ((uniValue - jong) / 28) / 21;    //초성
144 |                         jung = ((uniValue - jong) / 28) % 21;   //중성
145 | 
146 |                         etcBuffer.append(chosungKor[cho]);
147 |                         etcBuffer.append(jungsungKor[jung]);
148 |                         //받침이 있으면
149 |                         if (jong != 0) {
150 |                             etcBuffer.append(jongsungKor[jong]);
151 |                         }
152 |                     } else if (isJaso(Character.toString(ch))) {
153 |                         //복자음 강제분리
154 |                         switch (ch) {
155 |                             case 'ㄲ':
156 |                                 etcBuffer.append("ㄱㄱ");
157 |                                 break;
158 |                             case 'ㄳ':
159 |                                 etcBuffer.append("ㄱㅅ");
160 |                                 break;
161 |                             case 'ㄵ':
162 |                                 etcBuffer.append("ㄴㅈ");
163 |                                 break;
164 |                             case 'ㄶ':
165 |                                 etcBuffer.append("ㄴㅎ");
166 |                                 break;
167 |                             case 'ㄺ':
168 |                                 etcBuffer.append("ㄹㄱ");
169 |                                 break;
170 |                             case 'ㄻ':
171 |                                 etcBuffer.append("ㄹㅁ");
172 |                                 break;
173 |                             case 'ㄼ':
174 |                                 etcBuffer.append("ㄹㅂ");
175 |                                 break;
176 |                             case 'ㄽ':
177 |                                 etcBuffer.append("ㄹㅅ");
178 |                                 break;
179 |                             case 'ㄾ':
180 |                                 etcBuffer.append("ㄹㅌ");
181 |                                 break;
182 |                             case 'ㄿ':
183 |                                 etcBuffer.append("ㄹㅍ");
184 |                                 break;
185 |                             case 'ㅀ':
186 |                                 etcBuffer.append("ㄹㅎ");
187 |                                 break;
188 |                             case 'ㅄ':
189 |                                 etcBuffer.append("ㅂㅅ");
190 |                                 break;
191 |                             case 'ㄸ':
192 |                                 etcBuffer.append("ㄷㄷ");
193 |                                 break;
194 |                             case 'ㅃ':
195 |                                 etcBuffer.append("ㅂㅂ");
196 |                                 break;
197 |                             case 'ㅆ':
198 |                                 etcBuffer.append("ㅅㅅ");
199 |                                 break;
200 |                             case 'ㅉ':
201 |                                 etcBuffer.append("ㅈㅈ");
202 |                                 break;
203 |                             default:
204 |                                 etcBuffer.append(ch);
205 |                         }
206 |                     } else {
207 |                         etcBuffer.append(ch);
208 |                     }
209 |                 }
210 |             }
211 | 
212 |             //결과 조합
213 | 
214 |             //공백을 붙인 전체 문자열 (한글)
215 |             if (korBuffer.indexOf(" ") != -1) {
216 |                 if (korBuffer.length() > 0) {
217 |                     returnBuffer.append(korBuffer.toString().replaceAll(" ", ""));
218 |                     returnBuffer.append(" ");
219 |                 }
220 |             }
221 | 
222 |             //공백으로 분리된 문자열 (한글)
223 |             if (korBuffer.length() > 0) {
224 |                 returnBuffer.append(korBuffer.toString());
225 |                 returnBuffer.append(" ");
226 |             }
227 | 
228 |             //공백을 붙인 전체 문자열 (영문)
229 |             if (engBuffer.indexOf(" ") != -1) {
230 |                 if (engBuffer.length() > 0) {
231 |                     returnBuffer.append(engBuffer.toString().replaceAll(" ", ""));
232 |                     returnBuffer.append(" ");
233 |                 }
234 |             }
235 | 
236 |             //공백으로 분리된 문자열 (영문)
237 |             if (engBuffer.length() > 0) {
238 |                 returnBuffer.append(engBuffer.toString());
239 |                 returnBuffer.append(" ");
240 |             }
241 | 
242 |             //공백을 붙인 전체 문자열 (오타)
243 |             if (mistypingBuffer.indexOf(" ") != -1) {
244 |                 if (mistypingBuffer.length() > 0) {
245 |                     returnBuffer.append(mistypingBuffer.toString().replaceAll(" ", ""));
246 |                     returnBuffer.append(" ");
247 |                 }
248 |             }
249 | 
250 |             //공백으로 분리된 문자열 (오타)
251 |             if (mistypingBuffer.length() > 0) {
252 |                 returnBuffer.append(mistypingBuffer);
253 |                 returnBuffer.append(" ");
254 |             }
255 | 
256 |             if (chosungBuffer.length() > 0) {
257 |                 returnBuffer.append(chosungBuffer);
258 |                 returnBuffer.append(" ");
259 |             }
260 | 
261 |             if (etcBuffer.length() > 0) {
262 |                 returnBuffer.append(etcBuffer);
263 |                 returnBuffer.append(" ");
264 |             }
265 | 
266 |             return returnBuffer.toString().trim();
267 |         } else {
268 |             return "";
269 |         }
270 |     }
271 | 
272 |     /**
273 |      * 문자열에 한글포함 여부
274 |      */
275 |     private boolean isHangul(String str) {
276 |         return str.matches(".*[ㄱ-ㅎㅏ-ㅣ가-힣]+.*");
277 |     }
278 | 
279 |     /**
280 |      * 문자열에 영문포함 여부
281 |      */
282 |     private boolean isEnglish(String str) {
283 |         return str.matches(".*[a-zA-Z]+.*");
284 |     }
285 | 
286 |     /**
287 |      * 문자열에 초성,중성 포함 여부
288 |      */
289 |     private boolean isJaso(String str) {
290 |         return str.matches(".*[ㄱ-ㅎㅏ-ㅣ]+.*");
291 |     }
292 | }
293 | 


--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/analysis/JasoTokenizer.java:
--------------------------------------------------------------------------------
 1 | package org.elasticsearch.analysis;
 2 | 
 3 | import org.elasticsearch.common.config;
 4 | 
 5 | /**
 6 |  * 자소 토크나이저 구현
 7 |  *
 8 |  * @author 최일규
 9 |  * @since 2018-03-21
10 |  */
11 | public final class JasoTokenizer extends BaseTokenizer {
12 | 
13 |     /**
14 |      * 자소 토크나이저 생성자
15 |      *
16 |      * @param options 토크나이저 옵션
17 |      */
18 |     public JasoTokenizer(TokenizerOptions options) {
19 |         super(options);
20 |     }
21 | 
22 |     /**
23 |      * Collects only characters which do not satisfy
24 |      * {@link Character#isWhitespace(int)}.
25 |      */
26 |     @Override
27 |     protected boolean isTokenChar(int c) {
28 |         return !isSplit(c);
29 |     }
30 | 
31 |     /**
32 |      * White Space로 토큰분해
33 |      */
34 |     private boolean isSplit(int c) {
35 |         return (char) c == config.WHITESPACE_CHAR;
36 |     }
37 | }


--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/analysis/TokenizerOptions.java:
--------------------------------------------------------------------------------
 1 | package org.elasticsearch.analysis;
 2 | 
 3 | /**
 4 |  * 토크나이저 옵션
 5 |  *
 6 |  * @author 최일규
 7 |  * @since 2016-02-12
 8 |  */
 9 | public class TokenizerOptions {
10 | 
11 |     //한영오타에 대한 토큰 추출여부 (hello -> ㅗㄷㅣㅣㅐ, 최일규 -> chldlfrb)
12 |     public final static boolean MISTYPE = false;
13 | 
14 |     //초성검색을 위한 토큰 추출여부 (최일규 -> ㅊㅇㄱ)
15 |     public final static boolean CHOSUNG = false;
16 | 
17 |     private boolean mistype = MISTYPE;
18 |     private boolean chosung = CHOSUNG;
19 | 
20 |     private String name = null;
21 | 
22 |     public static TokenizerOptions create(String name) {
23 |         return new TokenizerOptions(name);
24 |     }
25 | 
26 |     private TokenizerOptions(String name) {
27 |         this.name = name;
28 |     }
29 | 
30 |     public String getName() {
31 |         return name;
32 |     }
33 | 
34 |     public boolean isMistype() {
35 |         return mistype;
36 |     }
37 | 
38 |     public void setMistype(boolean mistype) {
39 |         this.mistype = mistype;
40 |     }
41 | 
42 |     public boolean isChosung() {
43 |         return chosung;
44 |     }
45 | 
46 |     public void setChosung(boolean chosung) {
47 |         this.chosung = chosung;
48 |     }
49 | }


--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/common/config.java:
--------------------------------------------------------------------------------
 1 | package org.elasticsearch.common;
 2 | 
 3 | /**
 4 |  * 글로벌 공통변수
 5 |  *
 6 |  * @author 최일규
 7 |  * @since 2016-02-03
 8 |  */
 9 | public class config {
10 |     public static final char WHITESPACE_CHAR = ' ';
11 | }


--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/JasoAnalyzerProvider.java:
--------------------------------------------------------------------------------
 1 | package org.elasticsearch.index.analysis;
 2 | 
 3 | import org.elasticsearch.analysis.JasoAnalyzer;
 4 | import org.elasticsearch.common.settings.Settings;
 5 | import org.elasticsearch.env.Environment;
 6 | import org.elasticsearch.index.IndexSettings;
 7 | 
 8 | /**
 9 |  * JasoAnalyzerProvider
10 |  *
11 |  * @author 최일규
12 |  * @since 2018-03-21
13 |  */
14 | public class JasoAnalyzerProvider extends AbstractIndexAnalyzerProvider<JasoAnalyzer> {
15 |     private final JasoAnalyzer analyzer;
16 | 
17 |     public JasoAnalyzerProvider(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
18 |         super(name, settings);
19 |         analyzer = new JasoAnalyzer();
20 |     }
21 | 
22 |     @Override
23 |     public JasoAnalyzer get() {
24 |         return analyzer;
25 |     }
26 | }


--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/JasoTokenizerFactory.java:
--------------------------------------------------------------------------------
 1 | package org.elasticsearch.index.analysis;
 2 | 
 3 | import org.apache.lucene.analysis.Tokenizer;
 4 | import org.elasticsearch.analysis.JasoTokenizer;
 5 | import org.elasticsearch.analysis.TokenizerOptions;
 6 | import org.elasticsearch.common.settings.Settings;
 7 | import org.elasticsearch.index.IndexSettings;
 8 | import org.elasticsearch.env.Environment;
 9 | 
10 | /**
11 |  * JasoTokenizerFactory
12 |  *
13 |  * @author 최일규
14 |  * @since 2018-03-21
15 |  */
16 | public class JasoTokenizerFactory extends AbstractTokenizerFactory {
17 | 
18 |     private final TokenizerOptions options;
19 | 
20 |     public JasoTokenizerFactory(IndexSettings indexSettings,
21 |                                 Environment environment,
22 |                                 String name,
23 |                                 Settings settings) {
24 | 
25 |         super(indexSettings, settings, name);
26 | 
27 |         this.options = TokenizerOptions.create(name);
28 |         this.options.setMistype(settings.getAsBoolean("mistype", TokenizerOptions.MISTYPE));
29 |         this.options.setChosung(settings.getAsBoolean("chosung", TokenizerOptions.CHOSUNG));
30 |     }
31 | 
32 |     @Override
33 |     public Tokenizer create() {
34 |         return new JasoTokenizer(this.options);
35 |     }
36 | }


--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/analysis/JasoAnalysisPlugin.java:
--------------------------------------------------------------------------------
 1 | package org.elasticsearch.plugin.analysis;
 2 | 
 3 | import org.elasticsearch.index.analysis.JasoTokenizerFactory;
 4 | import org.elasticsearch.index.analysis.*;
 5 | import org.elasticsearch.plugins.AnalysisPlugin;
 6 | import org.elasticsearch.plugins.Plugin;
 7 | 
 8 | import java.util.Map;
 9 | 
10 | import org.elasticsearch.indices.analysis.AnalysisModule;
11 | 
12 | import static java.util.Collections.singletonMap;
13 | 
14 | import org.apache.lucene.analysis.Analyzer;
15 | 
16 | /**
17 |  * JasoAnalysisPlugin
18 |  *
19 |  * @author 최일규
20 |  * @since 2018-03-21
21 |  */
22 | public class JasoAnalysisPlugin extends Plugin implements AnalysisPlugin {
23 | 
24 |     @Override
25 |     public Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> getTokenizers() {
26 |         return singletonMap("jaso_tokenizer", JasoTokenizerFactory::new);
27 |     }
28 | 
29 |     @Override
30 |     public Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
31 |         return singletonMap("jaso_analyzer", JasoAnalyzerProvider::new);
32 |     }
33 | }


--------------------------------------------------------------------------------
/src/main/resources/log4j2.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <Configuration status="WARN">
 3 |     <Appenders>
 4 |         <Console name="Console" target="SYSTEM_OUT">
 5 |             <PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
 6 |         </Console>
 7 |     </Appenders>
 8 |     <Loggers>
 9 |         <Root level="error">
10 |             <AppenderRef ref="Console"/>
11 |         </Root>
12 |     </Loggers>
13 | </Configuration>


--------------------------------------------------------------------------------
/src/main/resources/plugin-descriptor.properties:
--------------------------------------------------------------------------------
1 | description=Jaso Korean Text Analyzer
2 | version=8.6.2
3 | name=jaso-analyzer
4 | classname=org.elasticsearch.plugin.analysis.JasoAnalysisPlugin
5 | java.version=17
6 | elasticsearch.version=8.6.2


--------------------------------------------------------------------------------
/src/test/java/org/elasticsearch/analysis/JasoTest.java:
--------------------------------------------------------------------------------
 1 | package org.elasticsearch.analysis;
 2 | 
 3 | import junit.framework.TestCase;
 4 | 
 5 | /**
 6 |  * 자동완성 기능 유닛테스트
 7 |  *
 8 |  * @author 최일규
 9 |  * @since 2016-02-03
10 |  */
11 | public class JasoTest extends TestCase {
12 | 
13 |     public void testJasoDecomposer() {
14 |         TokenizerOptions options = TokenizerOptions.create("testJasoDecomposer");
15 |         options.setMistype(true);
16 |         options.setChosung(true);
17 | 
18 |         JasoDecomposer aa = new JasoDecomposer();
19 | 
20 |         String expected = "ㅅㅅㄱㄱ";
21 |         String actual = aa.runJasoDecompose("ㅆㄲ", options);
22 |         assertEquals(expected, actual);
23 |     }
24 | }


--------------------------------------------------------------------------------
/src/test/java/org/elasticsearch/analysis/JasoTokenizerTest.java:
--------------------------------------------------------------------------------
  1 | package org.elasticsearch.analysis;
  2 | 
  3 | import junit.framework.TestCase;
  4 | import org.apache.lucene.analysis.Tokenizer;
  5 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  6 | 
  7 | import java.io.IOException;
  8 | import java.io.StringReader;
  9 | import java.util.ArrayList;
 10 | import java.util.List;
 11 | 
 12 | /**
 13 |  * 토크나이저 유닛테스트
 14 |  *
 15 |  * @author 최일규
 16 |  * @since 2016-02-11
 17 |  */
 18 | public class JasoTokenizerTest extends TestCase {
 19 | 
 20 |     public void testTokenizer() throws IOException {
 21 | 
 22 |         long start = System.currentTimeMillis();
 23 |         TokenizerOptions options = TokenizerOptions.create("testTokenizer");
 24 | 
 25 |         //한영오타에 대한 토큰 추출여부 (hello -> ㅗㄷㅣㅣㅐ, 최일규 -> chldlfrb)
 26 |         options.setMistype(false);
 27 | 
 28 |         //초성검색을 위한 토큰 추출여부 (최일규 -> ㅊㅇㄱ)
 29 |         options.setChosung(false);
 30 | 
 31 |         List<TestCaseVO> testCase = new ArrayList<TestCaseVO>();
 32 | 
 33 |         if (options.isMistype() && options.isChosung()) {
 34 | 
 35 |             testCase.add(new TestCaseVO("최일규", "ㅊㅗㅣㅇㅣㄹㄱㅠ/chldlfrb/ㅊㅇㄱ"));
 36 |             testCase.add(new TestCaseVO("소녀시대", "ㅅㅗㄴㅕㅅㅣㄷㅐ/thsutleo/ㅅㄴㅅㄷ"));
 37 |             testCase.add(new TestCaseVO("Hello", "hello/ㅗㄷㅣㅣㅐ"));
 38 |             testCase.add(new TestCaseVO("Hello~", "hello~/ㅗㄷㅣㅣㅐ~"));
 39 |             testCase.add(new TestCaseVO("무조건 해피엔딩", "ㅁㅜㅈㅗㄱㅓㄴㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ/ㅁㅜㅈㅗㄱㅓㄴ/ㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ/anwhrjsgovldpseld/anwhrjs/govldpseld/ㅁㅈㄱㅎㅍㅇㄷ"));
 40 |             testCase.add(new TestCaseVO("아디다스 운동화", "ㅇㅏㄷㅣㄷㅏㅅㅡㅇㅜㄴㄷㅗㅇㅎㅗㅏ/ㅇㅏㄷㅣㄷㅏㅅㅡ/ㅇㅜㄴㄷㅗㅇㅎㅗㅏ/dkelektmdnsehdghk/dkelektm/dnsehdghk/ㅇㄷㄷㅅㅇㄷㅎ"));
 41 |             testCase.add(new TestCaseVO("투데이특가", "ㅌㅜㄷㅔㅇㅣㅌㅡㄱㄱㅏ/xnepdlxmrrk/ㅌㄷㅇㅌㄱ"));
 42 | 
 43 |         } else if (options.isMistype() && !options.isChosung()) {
 44 | 
 45 |             testCase.add(new TestCaseVO("최일규", "ㅊㅗㅣㅇㅣㄹㄱㅠ/chldlfrb"));
 46 |             testCase.add(new TestCaseVO("소녀시대", "ㅅㅗㄴㅕㅅㅣㄷㅐ/thsutleo"));
 47 |             testCase.add(new TestCaseVO("Hello", "hello/ㅗㄷㅣㅣㅐ"));
 48 |             testCase.add(new TestCaseVO("Hello~", "hello~/ㅗㄷㅣㅣㅐ~"));
 49 |             testCase.add(new TestCaseVO("무조건 해피엔딩", "ㅁㅜㅈㅗㄱㅓㄴㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ/ㅁㅜㅈㅗㄱㅓㄴ/ㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ/anwhrjsgovldpseld/anwhrjs/govldpseld"));
 50 |             testCase.add(new TestCaseVO("아디다스 운동화", "ㅇㅏㄷㅣㄷㅏㅅㅡㅇㅜㄴㄷㅗㅇㅎㅗㅏ/ㅇㅏㄷㅣㄷㅏㅅㅡ/ㅇㅜㄴㄷㅗㅇㅎㅗㅏ/dkelektmdnsehdghk/dkelektm/dnsehdghk"));
 51 |             testCase.add(new TestCaseVO("투데이특가", "ㅌㅜㄷㅔㅇㅣㅌㅡㄱㄱㅏ/xnepdlxmrrk"));
 52 | 
 53 |         } else if (!options.isMistype() && options.isChosung()) {
 54 | 
 55 |             testCase.add(new TestCaseVO("최일규", "ㅊㅗㅣㅇㅣㄹㄱㅠ/ㅊㅇㄱ"));
 56 |             testCase.add(new TestCaseVO("소녀시대", "ㅅㅗㄴㅕㅅㅣㄷㅐ/ㅅㄴㅅㄷ"));
 57 |             testCase.add(new TestCaseVO("Hello", "hello"));
 58 |             testCase.add(new TestCaseVO("Hello~", "hello~"));
 59 |             testCase.add(new TestCaseVO("무조건 해피엔딩", "ㅁㅜㅈㅗㄱㅓㄴㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ/ㅁㅜㅈㅗㄱㅓㄴ/ㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ/ㅁㅈㄱㅎㅍㅇㄷ"));
 60 |             testCase.add(new TestCaseVO("아디다스 운동화", "ㅇㅏㄷㅣㄷㅏㅅㅡㅇㅜㄴㄷㅗㅇㅎㅗㅏ/ㅇㅏㄷㅣㄷㅏㅅㅡ/ㅇㅜㄴㄷㅗㅇㅎㅗㅏ/ㅇㄷㄷㅅㅇㄷㅎ"));
 61 |             testCase.add(new TestCaseVO("투데이특가", "ㅌㅜㄷㅔㅇㅣㅌㅡㄱㄱㅏ/ㅌㄷㅇㅌㄱ"));
 62 | 
 63 |         } else if (!options.isMistype() && !options.isChosung()) {
 64 | 
 65 |             testCase.add(new TestCaseVO("최일규", "ㅊㅗㅣㅇㅣㄹㄱㅠ"));
 66 |             testCase.add(new TestCaseVO("소녀시대", "ㅅㅗㄴㅕㅅㅣㄷㅐ"));
 67 |             testCase.add(new TestCaseVO("Hello", "hello"));
 68 |             testCase.add(new TestCaseVO("Hello~", "hello~"));
 69 |             testCase.add(new TestCaseVO("무조건 해피엔딩", "ㅁㅜㅈㅗㄱㅓㄴㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ/ㅁㅜㅈㅗㄱㅓㄴ/ㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ"));
 70 |             testCase.add(new TestCaseVO("아디다스 운동화", "ㅇㅏㄷㅣㄷㅏㅅㅡㅇㅜㄴㄷㅗㅇㅎㅗㅏ/ㅇㅏㄷㅣㄷㅏㅅㅡ/ㅇㅜㄴㄷㅗㅇㅎㅗㅏ"));
 71 |             testCase.add(new TestCaseVO("투데이특가", "ㅌㅜㄷㅔㅇㅣㅌㅡㄱㄱㅏ"));
 72 |         }
 73 | 
 74 |         for (TestCaseVO vo : testCase) {
 75 | 
 76 |             StringReader reader = new StringReader(vo.getOrigin());
 77 | 
 78 |             Tokenizer tokenizer = new JasoTokenizer(options);
 79 |             tokenizer.setReader(reader);
 80 |             CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
 81 | 
 82 |             tokenizer.reset();
 83 | 
 84 |             StringBuffer sb = new StringBuffer();
 85 | 
 86 |             while (tokenizer.incrementToken()) {
 87 |                 if (sb.length() > 0) sb.append('/');
 88 |                 sb.append(termAtt.toString());
 89 |             }
 90 | 
 91 |             TestCase.assertEquals(vo.getCompare(), sb.toString());
 92 |             tokenizer.close();
 93 | 
 94 |             System.out.printf("%s => %s%n", vo.getOrigin(), sb);
 95 |         }
 96 | 
 97 |         long end = System.currentTimeMillis();
 98 |         System.out.println("실행 시간 : " + (end - start) / 1000.0);
 99 |     }
100 | }


--------------------------------------------------------------------------------
/src/test/java/org/elasticsearch/analysis/TestCaseVO.java:
--------------------------------------------------------------------------------
 1 | package org.elasticsearch.analysis;
 2 | 
 3 | /**
 4 |  * 테스트케이스 VO
 5 |  *
 6 |  * @author 최일규
 7 |  * @since 2016-02-13
 8 |  */
 9 | public class TestCaseVO {
10 | 
11 |     private final String origin;
12 |     private final String compare;
13 | 
14 |     public TestCaseVO(String origin, String compare) {
15 |         this.origin = origin;
16 |         this.compare = compare;
17 |     }
18 | 
19 |     public String getOrigin() {
20 |         return origin;
21 |     }
22 | 
23 |     public String getCompare() {
24 |         return compare;
25 |     }
26 | }
27 | 


--------------------------------------------------------------------------------