├── .gitignore ├── LICENSE ├── README.md ├── build.gradle ├── docker ├── Dockerfile └── docker-compose.yml ├── elasticsearch-jaso-analyzer.iml ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat └── src ├── main ├── java │ └── org │ │ └── elasticsearch │ │ ├── analysis │ │ ├── BaseTokenizer.java │ │ ├── JasoAnalyzer.java │ │ ├── JasoDecomposer.java │ │ ├── JasoTokenizer.java │ │ └── TokenizerOptions.java │ │ ├── common │ │ └── config.java │ │ ├── index │ │ └── analysis │ │ │ ├── JasoAnalyzerProvider.java │ │ │ └── JasoTokenizerFactory.java │ │ └── plugin │ │ └── analysis │ │ └── JasoAnalysisPlugin.java └── resources │ ├── log4j2.xml │ └── plugin-descriptor.properties └── test └── java └── org └── elasticsearch └── analysis ├── JasoTest.java ├── JasoTokenizerTest.java └── TestCaseVO.java /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | .idea/ 3 | .git/ 4 | .gradle/ 5 | .DS_Store -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Choi ilkyu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Korean Jaso Analyzer for Elasticsearch 8.6.2 2 | (자동완성 플러그인) 3 | 4 | ## Build & Packaging 5 | 6 | ###### 터미널 환경에서 자바 버전은 17로 변경해야합니다. 7 | ~~~shell 8 | $ sh gradlew clean build buildPluginZip 9 | ~~~ 10 | 11 | ###### 자동완성용 한글 자소분석기입니다. elasticsearch 8.6.2 에서 테스트 되었습니다 12 | 13 | ## 도커 컨테이너에서 elasticsearch, kibana 설치/실행 14 | ``` 15 | #플러그인이 자동으로 설치된다. 
16 | cd docker 17 | docker-compose up -d 18 | ``` 19 | 20 | ## 직접설치 21 | 22 | ###### *설치* 23 | ``` 24 | bin/elasticsearch-plugin install https://github.com/netcrazy/elasticsearch-jaso-analyzer/releases/download/v8.6.2/jaso-analyzer-plugin-8.6.2-plugin.zip 25 | ``` 26 | 27 | ###### *삭제 (필요시)* 28 | ``` 29 | bin/elasticsearch-plugin remove jaso-analyzer 30 | ``` 31 | 32 | ###### *인덱스 삭제 (필요시)* 33 | ``` 34 | curl -XDELETE 'http://localhost:9200/jaso' 35 | ``` 36 | 37 | ###### *Korean Jaso Analyzer 설정 및 인덱스 생성 (기본 자소검색용)* 38 | ``` 39 | curl -XPUT -H 'Content-Type: application/json' localhost:9200/jaso -d '{ 40 | "settings": { 41 | "index": { 42 | "analysis": { 43 | "filter": { 44 | "suggest_filter": { 45 | "type": "edge_ngram", 46 | "min_gram": 1, 47 | "max_gram": 50 48 | } 49 | }, 50 | "analyzer": { 51 | "suggest_search_analyzer": { 52 | "type": "custom", 53 | "tokenizer": "jaso_tokenizer" 54 | }, 55 | "suggest_index_analyzer": { 56 | "type": "custom", 57 | "tokenizer": "jaso_tokenizer", 58 | "filter": [ 59 | "suggest_filter" 60 | ] 61 | } 62 | } 63 | } 64 | } 65 | } 66 | }' 67 | ``` 68 | 69 | ###### *Korean Jaso Analyzer 설정 및 인덱스 생성 (한,영오타 및 초성토큰 추출이 필요할 때..)* 70 | ``` 71 | curl -XPUT -H 'Content-Type: application/json' http://localhost:9200/jaso/ -d '{ 72 | "settings": { 73 | "index": { 74 | "analysis": { 75 | "filter": { 76 | "suggest_filter": { 77 | "type": "edge_ngram", 78 | "min_gram": 1, 79 | "max_gram": 50 80 | } 81 | }, 82 | "tokenizer": { 83 | "jaso_search_tokenizer": { 84 | "type": "jaso_tokenizer", 85 | "mistype": true, 86 | "chosung": false 87 | }, 88 | "jaso_index_tokenizer": { 89 | "type": "jaso_tokenizer", 90 | "mistype": true, 91 | "chosung": true 92 | } 93 | }, 94 | "analyzer": { 95 | "suggest_search_analyzer": { 96 | "type": "custom", 97 | "tokenizer": "jaso_search_tokenizer" 98 | }, 99 | "suggest_index_analyzer": { 100 | "type": "custom", 101 | "tokenizer": "jaso_index_tokenizer", 102 | "filter": [ 103 | "suggest_filter" 104 | ] 105 | } 106 | } 107 | 
} 108 | } 109 | } 110 | }' 111 | ``` 112 | 113 | ###### *인덱스 맵핑* 114 | ``` 115 | curl -XPUT -H 'Content-Type: application/json' http://localhost:9200/jaso/_mapping -d '{ 116 | "properties": { 117 | "name": { 118 | "type": "text", 119 | "store": true, 120 | "analyzer": "suggest_index_analyzer", 121 | "search_analyzer": "suggest_search_analyzer" 122 | } 123 | } 124 | }' 125 | ``` 126 | 127 | 128 | ###### *인덱스타임 분석기 테스트* 129 | ``` 130 | curl -XPOST -H 'Content-Type: application/json' http://localhost:9200/jaso/_analyze?pretty=true -d '{ 131 | "analyzer" : "suggest_index_analyzer", 132 | "text" : "최일규 Hello" 133 | }' 134 | ``` 135 | 136 | 137 | ###### *쿼리타임 분석기 테스트* 138 | ``` 139 | curl -XPOST -H 'Content-Type: application/json' http://localhost:9200/jaso/_analyze?pretty=true -d '{ 140 | "analyzer" : "suggest_search_analyzer", 141 | "text" : "쵱" 142 | }' 143 | ``` 144 | 145 | 146 | ###### *문서생성* 147 | ``` 148 | curl -XPOST -H 'Content-Type: application/json' http://localhost:9200/jaso/_doc?pretty=true -d '{ 149 | "name":"최일규 Hello" 150 | }' 151 | 152 | curl -XPOST -H 'Content-Type: application/json' http://localhost:9200/jaso/_doc?pretty=true -d '{ 153 | "name":"초아" 154 | }' 155 | ``` 156 | 157 | ###### *문서검색* 158 | ``` 159 | curl -XPOST -H 'Content-Type: application/json' http://localhost:9200/jaso/_search?pretty=true -d '{ 160 | "query" : { 161 | "match" : { "name" : "초" } 162 | } 163 | }' 164 | 165 | curl -XPOST -H 'Content-Type: application/json' http://localhost:9200/jaso/_search?pretty=true -d '{ 166 | "query" : { 167 | "match" : { "name" : "ㅊㅇㄱ" } 168 | } 169 | }' 170 | ``` 171 | -------------------------------------------------------------------------------- /build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'java' 2 | 3 | compileJava { 4 | sourceCompatibility = JavaVersion.VERSION_17 5 | targetCompatibility = JavaVersion.VERSION_17 6 | } 7 | 8 | version = '8.6.2' 9 | jar { 10 | manifest { 11 | 
attributes 'Implementation-Title': 'Elasticsearch Jaso Analyzer Plugin', 12 | 'Implementation-Version': version 13 | } 14 | } 15 | 16 | repositories { 17 | mavenCentral() 18 | } 19 | 20 | dependencies { 21 | implementation group: 'org.elasticsearch', name: 'elasticsearch', version: version 22 | implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.1' 23 | implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.1' 24 | implementation group: 'org.slf4j', name: 'slf4j-api', version: '1.7.32' 25 | testImplementation group: 'junit', name: 'junit', version: '4.+' 26 | } 27 | 28 | test { 29 | systemProperties 'property': 'value' 30 | } 31 | 32 | task buildPluginZip(type: Zip, dependsOn:[':jar']) { 33 | baseName = 'jaso-analyzer-plugin' 34 | classifier = 'plugin' 35 | from 'build/libs' 36 | from 'src/main/resources' 37 | 38 | copy { 39 | from "build/distributions" 40 | into "docker" 41 | } 42 | } 43 | 44 | artifacts { 45 | archives buildPluginZip 46 | } 47 | 48 | [ compileJava, compileTestJava ]*.options*.encoding = 'UTF-8' 49 | [ compileJava, compileTestJava ]*.options*.compilerArgs = ['-Xlint:-options'] -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.elastic.co/elasticsearch/elasticsearch:8.6.2 2 | 3 | COPY jaso-analyzer-plugin-8.6.2-plugin.zip /tmp/ 4 | RUN /usr/share/elasticsearch/bin/elasticsearch-plugin install file:///tmp/jaso-analyzer-plugin-8.6.2-plugin.zip -------------------------------------------------------------------------------- /docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2.2' 2 | services: 3 | elasticsearch: 4 | build: 5 | context: . 
6 | dockerfile: ./Dockerfile 7 | container_name: es-node-01 8 | environment: 9 | - cluster.name=es-docker-cluster 10 | - node.name=es01 11 | - xpack.security.enabled=false 12 | - discovery.type=single-node 13 | ulimits: 14 | memlock: 15 | soft: -1 16 | hard: -1 17 | nofile: 18 | soft: 262144 19 | hard: 262144 20 | cap_add: 21 | - IPC_LOCK 22 | volumes: 23 | - data01:/usr/share/elasticsearch/data 24 | ports: 25 | - "9200:9200" 26 | - "9300:9300" 27 | networks: 28 | - es-net 29 | 30 | kibana: 31 | container_name: kibana 32 | image: docker.elastic.co/kibana/kibana:8.6.2 33 | environment: 34 | ELASTICSEARCH_URL: http://es-node-01:9200 35 | ELASTICSEARCH_HOSTS: http://es-node-01:9200 36 | ports: 37 | - "5601:5601" 38 | depends_on: 39 | - elasticsearch 40 | networks: 41 | - es-net 42 | 43 | volumes: 44 | data01: 45 | driver: local 46 | data02: 47 | driver: local 48 | 49 | networks: 50 | es-net: 51 | driver: bridge -------------------------------------------------------------------------------- /elasticsearch-jaso-analyzer.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/netcrazy/elasticsearch-jaso-analyzer/d7204d0f698040bd53c9816e6daee9d09a38f127/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-7.4.2-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | 
-------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | # 4 | # Copyright 2015 the original author or authors. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # https://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | ############################################################################## 20 | ## 21 | ## Gradle start up script for UN*X 22 | ## 23 | ############################################################################## 24 | 25 | # Attempt to set APP_HOME 26 | # Resolve links: $0 may be a link 27 | PRG="$0" 28 | # Need this for relative symlinks. 29 | while [ -h "$PRG" ] ; do 30 | ls=`ls -ld "$PRG"` 31 | link=`expr "$ls" : '.*-> \(.*\)$'` 32 | if expr "$link" : '/.*' > /dev/null; then 33 | PRG="$link" 34 | else 35 | PRG=`dirname "$PRG"`"/$link" 36 | fi 37 | done 38 | SAVED="`pwd`" 39 | cd "`dirname \"$PRG\"`/" >/dev/null 40 | APP_HOME="`pwd -P`" 41 | cd "$SAVED" >/dev/null 42 | 43 | APP_NAME="Gradle" 44 | APP_BASE_NAME=`basename "$0"` 45 | 46 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 47 | DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' 48 | 49 | # Use the maximum available, or set MAX_FD != -1 to use that value. 
50 | MAX_FD="maximum" 51 | 52 | warn () { 53 | echo "$*" 54 | } 55 | 56 | die () { 57 | echo 58 | echo "$*" 59 | echo 60 | exit 1 61 | } 62 | 63 | # OS specific support (must be 'true' or 'false'). 64 | cygwin=false 65 | msys=false 66 | darwin=false 67 | nonstop=false 68 | case "`uname`" in 69 | CYGWIN* ) 70 | cygwin=true 71 | ;; 72 | Darwin* ) 73 | darwin=true 74 | ;; 75 | MINGW* ) 76 | msys=true 77 | ;; 78 | NONSTOP* ) 79 | nonstop=true 80 | ;; 81 | esac 82 | 83 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 84 | 85 | 86 | # Determine the Java command to use to start the JVM. 87 | if [ -n "$JAVA_HOME" ] ; then 88 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 89 | # IBM's JDK on AIX uses strange locations for the executables 90 | JAVACMD="$JAVA_HOME/jre/sh/java" 91 | else 92 | JAVACMD="$JAVA_HOME/bin/java" 93 | fi 94 | if [ ! -x "$JAVACMD" ] ; then 95 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 96 | 97 | Please set the JAVA_HOME variable in your environment to match the 98 | location of your Java installation." 99 | fi 100 | else 101 | JAVACMD="java" 102 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 103 | 104 | Please set the JAVA_HOME variable in your environment to match the 105 | location of your Java installation." 106 | fi 107 | 108 | # Increase the maximum file descriptors if we can. 109 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then 110 | MAX_FD_LIMIT=`ulimit -H -n` 111 | if [ $? -eq 0 ] ; then 112 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then 113 | MAX_FD="$MAX_FD_LIMIT" 114 | fi 115 | ulimit -n $MAX_FD 116 | if [ $? 
-ne 0 ] ; then 117 | warn "Could not set maximum file descriptor limit: $MAX_FD" 118 | fi 119 | else 120 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" 121 | fi 122 | fi 123 | 124 | # For Darwin, add options to specify how the application appears in the dock 125 | if $darwin; then 126 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" 127 | fi 128 | 129 | # For Cygwin or MSYS, switch paths to Windows format before running java 130 | if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then 131 | APP_HOME=`cygpath --path --mixed "$APP_HOME"` 132 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` 133 | 134 | JAVACMD=`cygpath --unix "$JAVACMD"` 135 | 136 | # We build the pattern for arguments to be converted via cygpath 137 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` 138 | SEP="" 139 | for dir in $ROOTDIRSRAW ; do 140 | ROOTDIRS="$ROOTDIRS$SEP$dir" 141 | SEP="|" 142 | done 143 | OURCYGPATTERN="(^($ROOTDIRS))" 144 | # Add a user-defined pattern to the cygpath arguments 145 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then 146 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" 147 | fi 148 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 149 | i=0 150 | for arg in "$@" ; do 151 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` 152 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option 153 | 154 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition 155 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` 156 | else 157 | eval `echo args$i`="\"$arg\"" 158 | fi 159 | i=`expr $i + 1` 160 | done 161 | case $i in 162 | 0) set -- ;; 163 | 1) set -- "$args0" ;; 164 | 2) set -- "$args0" "$args1" ;; 165 | 3) set -- "$args0" "$args1" "$args2" ;; 166 | 4) set -- "$args0" "$args1" "$args2" "$args3" ;; 167 | 5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; 168 | 6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; 169 | 
7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; 170 | 8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; 171 | 9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 172 | esac 173 | fi 174 | 175 | # Escape application args 176 | save () { 177 | for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done 178 | echo " " 179 | } 180 | APP_ARGS=`save "$@"` 181 | 182 | # Collect all arguments for the java command, following the shell quoting and substitution rules 183 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" 184 | 185 | exec "$JAVACMD" "$@" 186 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 
15 | @rem 16 | 17 | @if "%DEBUG%" == "" @echo off 18 | @rem ########################################################################## 19 | @rem 20 | @rem Gradle startup script for Windows 21 | @rem 22 | @rem ########################################################################## 23 | 24 | @rem Set local scope for the variables with windows NT shell 25 | if "%OS%"=="Windows_NT" setlocal 26 | 27 | set DIRNAME=%~dp0 28 | if "%DIRNAME%" == "" set DIRNAME=. 29 | set APP_BASE_NAME=%~n0 30 | set APP_HOME=%DIRNAME% 31 | 32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter. 33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi 34 | 35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 37 | 38 | @rem Find java.exe 39 | if defined JAVA_HOME goto findJavaFromJavaHome 40 | 41 | set JAVA_EXE=java.exe 42 | %JAVA_EXE% -version >NUL 2>&1 43 | if "%ERRORLEVEL%" == "0" goto execute 44 | 45 | echo. 46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 47 | echo. 48 | echo Please set the JAVA_HOME variable in your environment to match the 49 | echo location of your Java installation. 50 | 51 | goto fail 52 | 53 | :findJavaFromJavaHome 54 | set JAVA_HOME=%JAVA_HOME:"=% 55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 56 | 57 | if exist "%JAVA_EXE%" goto execute 58 | 59 | echo. 60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 61 | echo. 62 | echo Please set the JAVA_HOME variable in your environment to match the 63 | echo location of your Java installation. 
64 | 65 | goto fail 66 | 67 | :execute 68 | @rem Setup the command line 69 | 70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 71 | 72 | 73 | @rem Execute Gradle 74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* 75 | 76 | :end 77 | @rem End local scope for the variables with windows NT shell 78 | if "%ERRORLEVEL%"=="0" goto mainEnd 79 | 80 | :fail 81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 82 | rem the _cmd.exe /c_ return code! 83 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 84 | exit /b 1 85 | 86 | :mainEnd 87 | if "%OS%"=="Windows_NT" endlocal 88 | 89 | :omega 90 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/analysis/BaseTokenizer.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.analysis; 2 | 3 | import org.apache.lucene.analysis.Tokenizer; 4 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 5 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 6 | import org.apache.lucene.analysis.CharacterUtils; 7 | import org.apache.lucene.analysis.CharacterUtils.CharacterBuffer; 8 | 9 | import java.io.*; 10 | 11 | /** 12 | * Base 자소 토크나이저 구현 13 | * 14 | * @author 최일규 15 | * @since 2016-02-10 16 | */ 17 | public abstract class BaseTokenizer extends Tokenizer { 18 | 19 | private final TokenizerOptions options; 20 | 21 | private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0; 22 | private static final int MAX_WORD_LEN = 2048; 23 | private static final int IO_BUFFER_SIZE = 4096; 24 | 25 | private final CharTermAttribute termAtt; 26 | private final OffsetAttribute offsetAtt; 27 | 28 | private final CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE); 29 | 30 | protected 
BaseTokenizer(TokenizerOptions options) { 31 | this.options = options; 32 | 33 | termAtt = addAttribute(CharTermAttribute.class); 34 | offsetAtt = addAttribute(OffsetAttribute.class); 35 | 36 | offset = 0; 37 | bufferIndex = 0; 38 | dataLen = 0; 39 | finalOffset = 0; 40 | } 41 | 42 | protected boolean isTokenChar(int c) { 43 | throw new UnsupportedOperationException("Subclasses of CharTokenizer must implement isTokenChar(int)"); 44 | } 45 | 46 | protected int normalize(int c) { 47 | return c; 48 | } 49 | 50 | /** 51 | * lucene 4.2x의 경우 데이터가 있으면 자소분리 후 true가 떨어지나, 여기서는 false로 떨어져 ioBuffer사이즈 상태로 조건변경 (CharacterUtils.fill) 52 | * 53 | * @author 최일규 54 | * @since 2014-07-11 55 | */ 56 | @Override 57 | public final boolean incrementToken() throws IOException { 58 | clearAttributes(); 59 | 60 | int length = 0; 61 | int start = -1; // this variable is always initialized 62 | char[] buffer = termAtt.buffer(); 63 | while (true) { 64 | if (bufferIndex >= dataLen) { 65 | 66 | offset += dataLen; 67 | CharacterUtils.fill(ioBuffer, jasoDecompose(input, this.options)); 68 | 69 | //버퍼사이즈가 있으면 분석한다. (return false일때까지... 
재귀호출) 70 | if (ioBuffer.getLength() == 0) { 71 | dataLen = 0; // so next offset += dataLen won't decrement offset 72 | if (length > 0) { 73 | break; 74 | } else { 75 | finalOffset = correctOffset(offset); 76 | return false; 77 | } 78 | } 79 | dataLen = ioBuffer.getLength(); 80 | bufferIndex = 0; 81 | } 82 | // use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone 83 | final int c = Character.codePointAt(ioBuffer.getBuffer(), bufferIndex, dataLen); 84 | bufferIndex += Character.charCount(c); 85 | 86 | // if it's a token char 87 | if (isTokenChar(c)) { 88 | 89 | // start of token 90 | if (length == 0) { 91 | assert start == -1; 92 | start = offset + bufferIndex - 1; 93 | 94 | // check if a supplementary could run out of bounds 95 | } else if (length >= buffer.length - 1) { 96 | 97 | // make sure a supplementary fits in the buffer 98 | buffer = termAtt.resizeBuffer(2 + length); 99 | } 100 | 101 | // buffer it, normalized 102 | length += Character.toChars(normalize(c), buffer, length); 103 | if (length >= MAX_WORD_LEN) { 104 | break; 105 | } 106 | } else if (length > 0) { 107 | // return 'em 108 | break; 109 | } 110 | } 111 | 112 | termAtt.setLength(length); 113 | assert start != -1; 114 | offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(start + length)); 115 | return true; 116 | } 117 | 118 | @Override 119 | public final void end() { 120 | // set final offset 121 | offsetAtt.setOffset(finalOffset, finalOffset); 122 | } 123 | 124 | /** 125 | * Reader -> String -> 자소변환 -> String -> Reader 126 | */ 127 | public static Reader jasoDecompose(Reader in, TokenizerOptions options) { 128 | Writer writer = new StringWriter(); 129 | JasoDecomposer decomposer = new JasoDecomposer(); 130 | char[] buffer = new char[2048]; 131 | String temp; 132 | 133 | try { 134 | int n; 135 | while ((n = in.read(buffer)) != -1) { 136 | writer.write(buffer, 0, n); 137 | } 138 | temp = writer.toString(); 139 | temp = 
decomposer.runJasoDecompose(temp, options); 140 | // System.out.println(temp); 141 | in = new StringReader(temp); 142 | } catch (Exception e) { 143 | StringWriter errors = new StringWriter(); 144 | e.printStackTrace(new PrintWriter(errors)); 145 | } 146 | return in; 147 | } 148 | 149 | @Override 150 | public void reset() throws IOException { 151 | super.reset(); 152 | bufferIndex = 0; 153 | offset = 0; 154 | dataLen = 0; 155 | finalOffset = 0; 156 | ioBuffer.reset(); // make sure to reset the IO buffer!! 157 | } 158 | } -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/analysis/JasoAnalyzer.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.analysis; 2 | 3 | import org.apache.lucene.analysis.Analyzer; 4 | 5 | /** 6 | * JasoAnalyzer 7 | * 8 | * @author 최일규 9 | * @since 2018-03-21 10 | */ 11 | public class JasoAnalyzer extends Analyzer { 12 | public JasoAnalyzer() { 13 | } 14 | 15 | @Override 16 | protected Analyzer.TokenStreamComponents createComponents(final String fieldName) { 17 | return new Analyzer.TokenStreamComponents(new JasoTokenizer(TokenizerOptions.create("jaso_analyzer"))); 18 | } 19 | } -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/analysis/JasoDecomposer.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.analysis; 2 | 3 | /** 4 | * 자동완성용 자소분해 (자소분해 with WhiteSpace) 5 | * 6 | * @author 최일규 7 | * @since 2016-02-10 8 | */ 9 | public class JasoDecomposer { 10 | 11 | //초성(19자) ㄱ ㄲ ㄴ ㄷ ㄸ ㄹ ㅁ ㅂ ㅃ ㅅ ㅆ ㅇ ㅈ ㅉ ㅊ ㅋ ㅌ ㅍ ㅎ 12 | static String[] chosungKor = {"ㄱ", "ㄱㄱ", "ㄴ", "ㄷ", "ㄷㄷ", "ㄹ", "ㅁ", "ㅂ", "ㅂㅂ", "ㅅ", "ㅅㅅ", "ㅇ", "ㅈ", "ㅈㅈ", "ㅊ", "ㅋ", "ㅌ", "ㅍ", "ㅎ"}; 13 | //중성(21자) ㅏ ㅐ ㅑ ㅒ ㅓ ㅔ ㅕ ㅖ ㅗ ㅘ(9) ㅙ(10) ㅚ(11) ㅛ ㅜ ㅝ(14) ㅞ(15) ㅟ(16) ㅠ ㅡ ㅢ(19) ㅣ 14 | static String[] jungsungKor = {"ㅏ", "ㅐ", "ㅑ", "ㅒ", "ㅓ", "ㅔ", 
"ㅕ", "ㅖ", "ㅗ", "ㅗㅏ", "ㅗㅐ", "ㅗㅣ", "ㅛ", "ㅜ", "ㅜㅓ", "ㅜㅔ", "ㅜㅣ", "ㅠ", "ㅡ", "ㅡㅣ", "ㅣ"}; 15 | //종성(28자) <없음> ㄱ ㄲ ㄳ(3) ㄴ ㄵ(5) ㄶ(6) ㄷ ㄹ ㄺ(9) ㄻ(10) ㄼ(11) ㄽ(12) ㄾ(13) ㄿ(14) ㅀ(15) ㅁ ㅂ ㅄ(18) ㅅ ㅆ ㅇ ㅈ ㅊ ㅋ ㅌ ㅍ ㅎ 16 | static String[] jongsungKor = {" ", "ㄱ", "ㄱㄱ", "ㄱㅅ", "ㄴ", "ㄴㅈ", "ㄴㅎ", "ㄷ", "ㄹ", "ㄹㄱ", "ㄹㅁ", "ㄹㅂ", "ㄹㅅ", "ㄹㅌ", "ㄹㅍ", "ㄹㅎ", "ㅁ", "ㅂ", "ㅂㅅ", "ㅅ", "ㅅㅅ", "ㅇ", "ㅈ", "ㅊ", "ㅋ", "ㅌ", "ㅍ", "ㅎ"}; 17 | 18 | static String[] chosungEng = {"r", "R", "s", "e", "E", "f", "a", "q", "Q", "t", "T", "d", "w", "W", "c", "z", "x", "v", "g"}; 19 | static String[] jungsungEng = {"k", "o", "i", "O", "j", "p", "u", "P", "h", "hk", "ho", "hl", "y", "n", "nj", "np", "nl", "b", "m", "ml", "l"}; 20 | static String[] jongsungEng = {"", "r", "R", "rt", "s", "sw", "sg", "e", "f", "fr", "fa", "fq", "ft", "fx", "fv", "fg", "a", "q", "qt", "t", "T", "d", "w", "c", "z", "x", "v", "g"}; 21 | 22 | static String[] mistyping = {"ㅁ", "ㅠ", "ㅊ", "ㅇ", "ㄷ", "ㄹ", "ㅎ", "ㅗ", "ㅑ", "ㅓ", "ㅏ", "ㅣ", "ㅡ", "ㅜ", "ㅐ", "ㅔ", "ㅂ", "ㄱ", "ㄴ", "ㅅ", "ㅕ", "ㅍ", "ㅈ", "ㅌ", "ㅛ", "ㅋ"}; 23 | 24 | public String runJasoDecompose(String originStr, TokenizerOptions options) { 25 | 26 | if (!originStr.isEmpty()) { 27 | 28 | //lowercase 처리 29 | originStr = originStr.toLowerCase(); 30 | 31 | char[] termBuffer = originStr.toCharArray(); 32 | StringBuilder korBuffer = new StringBuilder(); 33 | StringBuilder engBuffer = new StringBuilder(); 34 | StringBuilder chosungBuffer = new StringBuilder(); 35 | StringBuilder mistypingBuffer = new StringBuilder(); 36 | StringBuilder etcBuffer = new StringBuilder(); 37 | StringBuilder returnBuffer = new StringBuilder(); 38 | 39 | //첫글자가 한글일때만 초성분해 40 | boolean firstCharType = false; 41 | if (termBuffer.length > 0) 42 | firstCharType = isHangul(Character.toString(termBuffer[0])); 43 | 44 | //자소포함여부 45 | boolean jaso = isJaso(originStr); 46 | //한글포함여부 47 | boolean hangul = isHangul(originStr); 48 | //영문포함여부 49 | boolean english = isEnglish(originStr); 50 | 51 | int strLen = originStr.length(); 52 | 53 | int cho; 
54 | int jung; 55 | int jong; 56 | for (char ch : termBuffer) { 57 | //가(AC00)~힣(D7A3) 에 속한 글자면 분해 58 | if (ch >= 0xAC00 && ch <= 0xD7A3 && !jaso) { 59 | //Unicode 값으로 환산한다. 60 | int uniValue = ch - 0xAC00; 61 | 62 | jong = uniValue % 28; //종성 63 | cho = ((uniValue - jong) / 28) / 21; //초성 64 | jung = ((uniValue - jong) / 28) % 21; //중성 65 | 66 | //한글초성 67 | korBuffer.append(chosungKor[cho]); 68 | 69 | //한글에 대한 초성처리 (일반적으로 색인시 초성을 담는다.) 70 | if (options.isChosung() && firstCharType) { 71 | //초성은 2자이상일때 포함 72 | if (strLen >= 2) 73 | chosungBuffer.append(chosungKor[cho]); 74 | } 75 | 76 | //한글문장에 대한 영문오타처리 (ㄱ -> r) 77 | if (options.isMistype()) { 78 | engBuffer.append(chosungEng[cho].toLowerCase()); 79 | } 80 | 81 | //한글중성 82 | korBuffer.append(jungsungKor[jung]); 83 | 84 | //한글문장에 대한 영문오타처리 (ㅏ-> k) 85 | if (options.isMistype()) { 86 | engBuffer.append(jungsungEng[jung].toLowerCase()); 87 | } 88 | 89 | //받침이 있으면 90 | if (jong != 0) { 91 | korBuffer.append(jongsungKor[jong]); 92 | 93 | //한글문장에 대한 영문오타처리 (ㄲ -> R) 94 | if (options.isMistype()) { 95 | engBuffer.append(jongsungEng[jong].toLowerCase()); 96 | } 97 | } 98 | } else { 99 | 100 | if (options.isMistype()) { 101 | if (!jaso) { 102 | if (hangul) { 103 | korBuffer.append(ch); 104 | } 105 | engBuffer.append(ch); 106 | } 107 | } else { 108 | if (!jaso) { 109 | if (hangul) { 110 | korBuffer.append(ch); 111 | } else { 112 | engBuffer.append(ch); 113 | } 114 | } 115 | } 116 | 117 | //영문문장에 대한 한글오타처리 (hello -> ㅗ디ㅣㅐ) 118 | if (options.isMistype() && !hangul) { 119 | int index; 120 | if (ch >= 0x61 && ch <= 0x7A) { 121 | //소문자 122 | index = (int) ch - 97; 123 | mistypingBuffer.append(mistyping[index]); 124 | } else if (ch >= 0x41 && ch <= 0x5A) { 125 | //대문자 126 | index = (int) ch - 65; 127 | mistypingBuffer.append(mistyping[index]); 128 | } else { 129 | if (english) 130 | mistypingBuffer.append(ch); 131 | } 132 | } 133 | } 134 | 135 | //추가적인 예외상황으로 추가 토큰처리 (ㅗ디ㅣㅐ -> ㅗㄷㅣㅣㅐ 자소분해) 136 | if (jaso) { 137 | 138 | if (ch >= 
0xAC00 && ch <= 0xD7A3) { 139 | //Unicode 값으로 환산한다. 140 | int uniValue = ch - 0xAC00; 141 | 142 | jong = uniValue % 28; //종성 143 | cho = ((uniValue - jong) / 28) / 21; //초성 144 | jung = ((uniValue - jong) / 28) % 21; //중성 145 | 146 | etcBuffer.append(chosungKor[cho]); 147 | etcBuffer.append(jungsungKor[jung]); 148 | //받침이 있으면 149 | if (jong != 0) { 150 | etcBuffer.append(jongsungKor[jong]); 151 | } 152 | } else if (isJaso(Character.toString(ch))) { 153 | //복자음 강제분리 154 | switch (ch) { 155 | case 'ㄲ': 156 | etcBuffer.append("ㄱㄱ"); 157 | break; 158 | case 'ㄳ': 159 | etcBuffer.append("ㄱㅅ"); 160 | break; 161 | case 'ㄵ': 162 | etcBuffer.append("ㄴㅈ"); 163 | break; 164 | case 'ㄶ': 165 | etcBuffer.append("ㄴㅎ"); 166 | break; 167 | case 'ㄺ': 168 | etcBuffer.append("ㄹㄱ"); 169 | break; 170 | case 'ㄻ': 171 | etcBuffer.append("ㄹㅁ"); 172 | break; 173 | case 'ㄼ': 174 | etcBuffer.append("ㄹㅂ"); 175 | break; 176 | case 'ㄽ': 177 | etcBuffer.append("ㄹㅅ"); 178 | break; 179 | case 'ㄾ': 180 | etcBuffer.append("ㄹㅌ"); 181 | break; 182 | case 'ㄿ': 183 | etcBuffer.append("ㄹㅍ"); 184 | break; 185 | case 'ㅀ': 186 | etcBuffer.append("ㄹㅎ"); 187 | break; 188 | case 'ㅄ': 189 | etcBuffer.append("ㅂㅅ"); 190 | break; 191 | case 'ㄸ': 192 | etcBuffer.append("ㄷㄷ"); 193 | break; 194 | case 'ㅃ': 195 | etcBuffer.append("ㅂㅂ"); 196 | break; 197 | case 'ㅆ': 198 | etcBuffer.append("ㅅㅅ"); 199 | break; 200 | case 'ㅉ': 201 | etcBuffer.append("ㅈㅈ"); 202 | break; 203 | default: 204 | etcBuffer.append(ch); 205 | } 206 | } else { 207 | etcBuffer.append(ch); 208 | } 209 | } 210 | } 211 | 212 | //결과 조합 213 | 214 | //공백을 붙인 전체 문자열 (한글) 215 | if (korBuffer.indexOf(" ") != -1) { 216 | if (korBuffer.length() > 0) { 217 | returnBuffer.append(korBuffer.toString().replaceAll(" ", "")); 218 | returnBuffer.append(" "); 219 | } 220 | } 221 | 222 | //공백으로 분리된 문자열 (한글) 223 | if (korBuffer.length() > 0) { 224 | returnBuffer.append(korBuffer.toString()); 225 | returnBuffer.append(" "); 226 | } 227 | 228 | //공백을 붙인 전체 문자열 (영문) 229 | if 
(engBuffer.indexOf(" ") != -1) { 230 | if (engBuffer.length() > 0) { 231 | returnBuffer.append(engBuffer.toString().replaceAll(" ", "")); 232 | returnBuffer.append(" "); 233 | } 234 | } 235 | 236 | //공백으로 분리된 문자열 (영문) 237 | if (engBuffer.length() > 0) { 238 | returnBuffer.append(engBuffer.toString()); 239 | returnBuffer.append(" "); 240 | } 241 | 242 | //공백을 붙인 전체 문자열 (오타) 243 | if (mistypingBuffer.indexOf(" ") != -1) { 244 | if (mistypingBuffer.length() > 0) { 245 | returnBuffer.append(mistypingBuffer.toString().replaceAll(" ", "")); 246 | returnBuffer.append(" "); 247 | } 248 | } 249 | 250 | //공백으로 분리된 문자열 (오타) 251 | if (mistypingBuffer.length() > 0) { 252 | returnBuffer.append(mistypingBuffer); 253 | returnBuffer.append(" "); 254 | } 255 | 256 | if (chosungBuffer.length() > 0) { 257 | returnBuffer.append(chosungBuffer); 258 | returnBuffer.append(" "); 259 | } 260 | 261 | if (etcBuffer.length() > 0) { 262 | returnBuffer.append(etcBuffer); 263 | returnBuffer.append(" "); 264 | } 265 | 266 | return returnBuffer.toString().trim(); 267 | } else { 268 | return ""; 269 | } 270 | } 271 | 272 | /** 273 | * 문자열에 한글포함 여부 274 | */ 275 | private boolean isHangul(String str) { 276 | return str.matches(".*[ㄱ-ㅎㅏ-ㅣ가-힣]+.*"); 277 | } 278 | 279 | /** 280 | * 문자열에 영문포함 여부 281 | */ 282 | private boolean isEnglish(String str) { 283 | return str.matches(".*[a-zA-Z]+.*"); 284 | } 285 | 286 | /** 287 | * 문자열에 초성,중성 포함 여부 288 | */ 289 | private boolean isJaso(String str) { 290 | return str.matches(".*[ㄱ-ㅎㅏ-ㅣ]+.*"); 291 | } 292 | } 293 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/analysis/JasoTokenizer.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.analysis; 2 | 3 | import org.elasticsearch.common.config; 4 | 5 | /** 6 | * 자소 토크나이저 구현 7 | * 8 | * @author 최일규 9 | * @since 2018-03-21 10 | */ 11 | public final class JasoTokenizer extends BaseTokenizer { 
/**
 * Tokenizer options: a name plus two switches selecting which extra token
 * variants are extracted.
 *
 * @author 최일규
 * @since 2016-02-12
 */
public class TokenizerOptions {

    /** Default for extracting Korean/English keyboard-mistype tokens (hello -> ㅗㄷㅣㅣㅐ, 최일규 -> chldlfrb). */
    public static final boolean MISTYPE = false;

    /** Default for extracting initial-consonant (chosung) search tokens (최일규 -> ㅊㅇㄱ). */
    public static final boolean CHOSUNG = false;

    private final String name;
    private boolean mistype;
    private boolean chosung;

    /**
     * Builds an options object with both switches at their defaults.
     *
     * @param optionsName name stored as-is; may be {@code null}
     */
    private TokenizerOptions(String optionsName) {
        name = optionsName;
        mistype = MISTYPE;
        chosung = CHOSUNG;
    }

    /**
     * Static factory for a new options object using the default switches.
     *
     * @param name name to attach to the options
     * @return a fresh {@code TokenizerOptions}
     */
    public static TokenizerOptions create(String name) {
        return new TokenizerOptions(name);
    }

    /** @return the name given at creation */
    public String getName() {
        return this.name;
    }

    /** @return whether keyboard-mistype tokens are extracted */
    public boolean isMistype() {
        return this.mistype;
    }

    /** @param mistype {@code true} to extract keyboard-mistype tokens */
    public void setMistype(boolean mistype) {
        this.mistype = mistype;
    }

    /** @return whether initial-consonant (chosung) tokens are extracted */
    public boolean isChosung() {
        return this.chosung;
    }

    /** @param chosung {@code true} to extract initial-consonant (chosung) tokens */
    public void setChosung(boolean chosung) {
        this.chosung = chosung;
    }
}
-------------------------------------------------------------------------------- 1 | package org.elasticsearch.common; 2 | 3 | /** 4 | * 글로벌 공통변수 5 | * 6 | * @author 최일규 7 | * @since 2016-02-03 8 | */ 9 | public class config { 10 | public static final char WHITESPACE_CHAR = ' '; 11 | } -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/JasoAnalyzerProvider.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import org.elasticsearch.analysis.JasoAnalyzer; 4 | import org.elasticsearch.common.settings.Settings; 5 | import org.elasticsearch.env.Environment; 6 | import org.elasticsearch.index.IndexSettings; 7 | 8 | /** 9 | * JasoAnalyzerProvider 10 | * 11 | * @author 최일규 12 | * @since 2018-03-21 13 | */ 14 | public class JasoAnalyzerProvider extends AbstractIndexAnalyzerProvider { 15 | private final JasoAnalyzer analyzer; 16 | 17 | public JasoAnalyzerProvider(IndexSettings indexSettings, Environment environment, String name, Settings settings) { 18 | super(name, settings); 19 | analyzer = new JasoAnalyzer(); 20 | } 21 | 22 | @Override 23 | public JasoAnalyzer get() { 24 | return analyzer; 25 | } 26 | } -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/JasoTokenizerFactory.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import org.apache.lucene.analysis.Tokenizer; 4 | import org.elasticsearch.analysis.JasoTokenizer; 5 | import org.elasticsearch.analysis.TokenizerOptions; 6 | import org.elasticsearch.common.settings.Settings; 7 | import org.elasticsearch.index.IndexSettings; 8 | import org.elasticsearch.env.Environment; 9 | 10 | /** 11 | * JasoTokenizerFactory 12 | * 13 | * @author 최일규 14 | * @since 2018-03-21 15 | */ 16 | public class 
JasoTokenizerFactory extends AbstractTokenizerFactory { 17 | 18 | private final TokenizerOptions options; 19 | 20 | public JasoTokenizerFactory(IndexSettings indexSettings, 21 | Environment environment, 22 | String name, 23 | Settings settings) { 24 | 25 | super(indexSettings, settings, name); 26 | 27 | this.options = TokenizerOptions.create(name); 28 | this.options.setMistype(settings.getAsBoolean("mistype", TokenizerOptions.MISTYPE)); 29 | this.options.setChosung(settings.getAsBoolean("chosung", TokenizerOptions.CHOSUNG)); 30 | } 31 | 32 | @Override 33 | public Tokenizer create() { 34 | return new JasoTokenizer(this.options); 35 | } 36 | } -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/plugin/analysis/JasoAnalysisPlugin.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.plugin.analysis; 2 | 3 | import org.elasticsearch.index.analysis.JasoTokenizerFactory; 4 | import org.elasticsearch.index.analysis.*; 5 | import org.elasticsearch.plugins.AnalysisPlugin; 6 | import org.elasticsearch.plugins.Plugin; 7 | 8 | import java.util.Map; 9 | 10 | import org.elasticsearch.indices.analysis.AnalysisModule; 11 | 12 | import static java.util.Collections.singletonMap; 13 | 14 | import org.apache.lucene.analysis.Analyzer; 15 | 16 | /** 17 | * JasoAnalysisPlugin 18 | * 19 | * @author 최일규 20 | * @since 2018-03-21 21 | */ 22 | public class JasoAnalysisPlugin extends Plugin implements AnalysisPlugin { 23 | 24 | @Override 25 | public Map> getTokenizers() { 26 | return singletonMap("jaso_tokenizer", JasoTokenizerFactory::new); 27 | } 28 | 29 | @Override 30 | public Map>> getAnalyzers() { 31 | return singletonMap("jaso_analyzer", JasoAnalyzerProvider::new); 32 | } 33 | } -------------------------------------------------------------------------------- /src/main/resources/log4j2.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /src/main/resources/plugin-descriptor.properties: -------------------------------------------------------------------------------- 1 | description=Jaso Korean Text Analyzer 2 | version=8.6.2 3 | name=jaso-analyzer 4 | classname=org.elasticsearch.plugin.analysis.JasoAnalysisPlugin 5 | java.version=17 6 | elasticsearch.version=8.6.2 -------------------------------------------------------------------------------- /src/test/java/org/elasticsearch/analysis/JasoTest.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.analysis; 2 | 3 | import junit.framework.TestCase; 4 | 5 | /** 6 | * 자동완성 기능 유닛테스트 7 | * 8 | * @author 최일규 9 | * @since 2016-02-03 10 | */ 11 | public class JasoTest extends TestCase { 12 | 13 | public void testJasoDecomposer() { 14 | TokenizerOptions options = TokenizerOptions.create("testJasoDecomposer"); 15 | options.setMistype(true); 16 | options.setChosung(true); 17 | 18 | JasoDecomposer aa = new JasoDecomposer(); 19 | 20 | String expected = "ㅅㅅㄱㄱ"; 21 | String actual = aa.runJasoDecompose("ㅆㄲ", options); 22 | assertEquals(expected, actual); 23 | } 24 | } -------------------------------------------------------------------------------- /src/test/java/org/elasticsearch/analysis/JasoTokenizerTest.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.analysis; 2 | 3 | import junit.framework.TestCase; 4 | import org.apache.lucene.analysis.Tokenizer; 5 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 6 | 7 | import java.io.IOException; 8 | import java.io.StringReader; 9 | import java.util.ArrayList; 10 | import java.util.List; 11 | 12 | /** 13 | * 토크나이저 유닛테스트 14 | * 15 | * @author 최일규 16 | * @since 
2016-02-11 17 | */ 18 | public class JasoTokenizerTest extends TestCase { 19 | 20 | public void testTokenizer() throws IOException { 21 | 22 | long start = System.currentTimeMillis(); 23 | TokenizerOptions options = TokenizerOptions.create("testTokenizer"); 24 | 25 | //한영오타에 대한 토큰 추출여부 (hello -> ㅗㄷㅣㅣㅐ, 최일규 -> chldlfrb) 26 | options.setMistype(false); 27 | 28 | //초성검색을 위한 토큰 추출여부 (최일규 -> ㅊㅇㄱ) 29 | options.setChosung(false); 30 | 31 | List testCase = new ArrayList(); 32 | 33 | if (options.isMistype() && options.isChosung()) { 34 | 35 | testCase.add(new TestCaseVO("최일규", "ㅊㅗㅣㅇㅣㄹㄱㅠ/chldlfrb/ㅊㅇㄱ")); 36 | testCase.add(new TestCaseVO("소녀시대", "ㅅㅗㄴㅕㅅㅣㄷㅐ/thsutleo/ㅅㄴㅅㄷ")); 37 | testCase.add(new TestCaseVO("Hello", "hello/ㅗㄷㅣㅣㅐ")); 38 | testCase.add(new TestCaseVO("Hello~", "hello~/ㅗㄷㅣㅣㅐ~")); 39 | testCase.add(new TestCaseVO("무조건 해피엔딩", "ㅁㅜㅈㅗㄱㅓㄴㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ/ㅁㅜㅈㅗㄱㅓㄴ/ㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ/anwhrjsgovldpseld/anwhrjs/govldpseld/ㅁㅈㄱㅎㅍㅇㄷ")); 40 | testCase.add(new TestCaseVO("아디다스 운동화", "ㅇㅏㄷㅣㄷㅏㅅㅡㅇㅜㄴㄷㅗㅇㅎㅗㅏ/ㅇㅏㄷㅣㄷㅏㅅㅡ/ㅇㅜㄴㄷㅗㅇㅎㅗㅏ/dkelektmdnsehdghk/dkelektm/dnsehdghk/ㅇㄷㄷㅅㅇㄷㅎ")); 41 | testCase.add(new TestCaseVO("투데이특가", "ㅌㅜㄷㅔㅇㅣㅌㅡㄱㄱㅏ/xnepdlxmrrk/ㅌㄷㅇㅌㄱ")); 42 | 43 | } else if (options.isMistype() && !options.isChosung()) { 44 | 45 | testCase.add(new TestCaseVO("최일규", "ㅊㅗㅣㅇㅣㄹㄱㅠ/chldlfrb")); 46 | testCase.add(new TestCaseVO("소녀시대", "ㅅㅗㄴㅕㅅㅣㄷㅐ/thsutleo")); 47 | testCase.add(new TestCaseVO("Hello", "hello/ㅗㄷㅣㅣㅐ")); 48 | testCase.add(new TestCaseVO("Hello~", "hello~/ㅗㄷㅣㅣㅐ~")); 49 | testCase.add(new TestCaseVO("무조건 해피엔딩", "ㅁㅜㅈㅗㄱㅓㄴㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ/ㅁㅜㅈㅗㄱㅓㄴ/ㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ/anwhrjsgovldpseld/anwhrjs/govldpseld")); 50 | testCase.add(new TestCaseVO("아디다스 운동화", "ㅇㅏㄷㅣㄷㅏㅅㅡㅇㅜㄴㄷㅗㅇㅎㅗㅏ/ㅇㅏㄷㅣㄷㅏㅅㅡ/ㅇㅜㄴㄷㅗㅇㅎㅗㅏ/dkelektmdnsehdghk/dkelektm/dnsehdghk")); 51 | testCase.add(new TestCaseVO("투데이특가", "ㅌㅜㄷㅔㅇㅣㅌㅡㄱㄱㅏ/xnepdlxmrrk")); 52 | 53 | } else if (!options.isMistype() && options.isChosung()) { 54 | 55 | testCase.add(new TestCaseVO("최일규", "ㅊㅗㅣㅇㅣㄹㄱㅠ/ㅊㅇㄱ")); 56 | testCase.add(new TestCaseVO("소녀시대", "ㅅㅗㄴㅕㅅㅣㄷㅐ/ㅅㄴㅅㄷ")); 57 | testCase.add(new 
TestCaseVO("Hello", "hello")); 58 | testCase.add(new TestCaseVO("Hello~", "hello~")); 59 | testCase.add(new TestCaseVO("무조건 해피엔딩", "ㅁㅜㅈㅗㄱㅓㄴㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ/ㅁㅜㅈㅗㄱㅓㄴ/ㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ/ㅁㅈㄱㅎㅍㅇㄷ")); 60 | testCase.add(new TestCaseVO("아디다스 운동화", "ㅇㅏㄷㅣㄷㅏㅅㅡㅇㅜㄴㄷㅗㅇㅎㅗㅏ/ㅇㅏㄷㅣㄷㅏㅅㅡ/ㅇㅜㄴㄷㅗㅇㅎㅗㅏ/ㅇㄷㄷㅅㅇㄷㅎ")); 61 | testCase.add(new TestCaseVO("투데이특가", "ㅌㅜㄷㅔㅇㅣㅌㅡㄱㄱㅏ/ㅌㄷㅇㅌㄱ")); 62 | 63 | } else if (!options.isMistype() && !options.isChosung()) { 64 | 65 | testCase.add(new TestCaseVO("최일규", "ㅊㅗㅣㅇㅣㄹㄱㅠ")); 66 | testCase.add(new TestCaseVO("소녀시대", "ㅅㅗㄴㅕㅅㅣㄷㅐ")); 67 | testCase.add(new TestCaseVO("Hello", "hello")); 68 | testCase.add(new TestCaseVO("Hello~", "hello~")); 69 | testCase.add(new TestCaseVO("무조건 해피엔딩", "ㅁㅜㅈㅗㄱㅓㄴㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ/ㅁㅜㅈㅗㄱㅓㄴ/ㅎㅐㅍㅣㅇㅔㄴㄷㅣㅇ")); 70 | testCase.add(new TestCaseVO("아디다스 운동화", "ㅇㅏㄷㅣㄷㅏㅅㅡㅇㅜㄴㄷㅗㅇㅎㅗㅏ/ㅇㅏㄷㅣㄷㅏㅅㅡ/ㅇㅜㄴㄷㅗㅇㅎㅗㅏ")); 71 | testCase.add(new TestCaseVO("투데이특가", "ㅌㅜㄷㅔㅇㅣㅌㅡㄱㄱㅏ")); 72 | } 73 | 74 | for (TestCaseVO vo : testCase) { 75 | 76 | StringReader reader = new StringReader(vo.getOrigin()); 77 | 78 | Tokenizer tokenizer = new JasoTokenizer(options); 79 | tokenizer.setReader(reader); 80 | CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class); 81 | 82 | tokenizer.reset(); 83 | 84 | StringBuffer sb = new StringBuffer(); 85 | 86 | while (tokenizer.incrementToken()) { 87 | if (sb.length() > 0) sb.append('/'); 88 | sb.append(termAtt.toString()); 89 | } 90 | 91 | TestCase.assertEquals(vo.getCompare(), sb.toString()); 92 | tokenizer.close(); 93 | 94 | System.out.printf("%s => %s%n", vo.getOrigin(), sb); 95 | } 96 | 97 | long end = System.currentTimeMillis(); 98 | System.out.println("실행 시간 : " + (end - start) / 1000.0); 99 | } 100 | } -------------------------------------------------------------------------------- /src/test/java/org/elasticsearch/analysis/TestCaseVO.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.analysis; 2 | 3 | /** 4 | * 테스트케이스 VO 5 | * 6 | * @author 최일규 7 | * @since 2016-02-13 8 | */ 9 | 
public class TestCaseVO {

    // Immutable pair: the text fed to the tokenizer and the token string expected back.
    private final String origin;
    private final String compare;

    /**
     * @param origin  input text of the test case
     * @param compare expected result to compare against
     */
    public TestCaseVO(String origin, String compare) {
        this.compare = compare;
        this.origin = origin;
    }

    /** @return the expected result given at construction */
    public String getCompare() {
        return this.compare;
    }

    /** @return the input text given at construction */
    public String getOrigin() {
        return this.origin;
    }
}