├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── build.gradle ├── gradle.properties ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat ├── settings.gradle ├── src ├── main │ ├── java │ │ ├── com │ │ │ └── coccoc │ │ │ │ ├── Token.java │ │ │ │ └── Tokenizer.java │ │ └── org │ │ │ ├── apache │ │ │ └── lucene │ │ │ │ └── analysis │ │ │ │ └── vi │ │ │ │ ├── VietnameseAnalyzer.java │ │ │ │ └── VietnameseTokenizer.java │ │ │ └── elasticsearch │ │ │ ├── index │ │ │ └── analysis │ │ │ │ ├── VietnameseAnalyzerProvider.java │ │ │ │ ├── VietnameseStopTokenFilterFactory.java │ │ │ │ └── VietnameseTokenizerFactory.java │ │ │ └── plugin │ │ │ └── analysis │ │ │ └── vi │ │ │ └── AnalysisViPlugin.java │ ├── jni │ │ └── Tokenizer.cpp │ ├── plugin-metadata │ │ └── plugin-security.policy │ └── resources │ │ └── stopwords.txt └── test │ └── java │ └── org │ └── elasticsearch │ └── index │ └── analysis │ └── VietnameseAnalysisIntegrationTests.java └── tokenizer.gradle /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled class file 2 | *.class 3 | 4 | # Log file 5 | *.log 6 | 7 | # BlueJ files 8 | *.ctxt 9 | 10 | # Mobile Tools for Java (J2ME) 11 | .mtj.tmp/ 12 | 13 | # Package Files # 14 | *.jar 15 | *.war 16 | *.nar 17 | *.ear 18 | *.zip 19 | *.tar.gz 20 | *.rar 21 | 22 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 23 | hs_err_pid* 24 | 25 | # Gradle 26 | .gradle 27 | build 28 | !gradle-wrapper.jar 29 | 30 | # Eclipse project files 31 | .project 32 | .classpath 33 | .settings 34 | 35 | # IDE 36 | .vscode 37 | .idea 38 | *.iml 39 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "coccoc-tokenizer"] 2 | path = coccoc-tokenizer 3 | url = https://github.com/coccoc/coccoc-tokenizer 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 
31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 
98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 
160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Vietnamese Analysis Plugin for Elasticsearch 2 | 3 | The Vietnamese Analysis plugin integrates Vietnamese language analysis into Elasticsearch. 4 | The plugin provides the following functions: 5 | 6 | Analyzer: `vi_analyzer`. Tokenizer: `vi_tokenizer`. Filter: `vi_stop`. The `vi_analyzer` itself is composed of the `vi_tokenizer` and the `vi_stop` filter. 7 | 8 | The tokenizer uses [coccoc-tokenizer](https://github.com/coccoc/coccoc-tokenizer) for tokenization. 9 | 10 | ## Installation 11 | 12 | Choose a version from the [releases](https://github.com/sun-asterisk-research/elasticsearch-analysis-vi/releases) page to install: 13 | 14 | ```sh 15 | elasticsearch-plugin install https://github.com/sun-asterisk-research/elasticsearch-analysis-vi/releases/download/<version>/<plugin-zip> 16 | ``` 17 | 18 | Or [build from source](#build-from-source) and install from a plugin bundle: 19 | 20 | ```sh 21 | elasticsearch-plugin install file:///path/to/plugin 22 | ``` 23 | 24 | ## Supported versions 25 | 26 | | Branch | Elasticsearch version | 27 | |--------|-----------------------| 28 | | master | 7.4+ | 29 | | 7.3 | 7.0 - 7.3 | 30 | 31 | ## Build from source 32 | 33 | You need the following build dependencies: `JDK` (11 or later), `make`, `cmake`, and `libstdc++`. Beware of your `libstdc++` version: a library built against a newer `libstdc++` will not load on systems that ship an older one. 34 | 35 | First update the git submodules: 36 | 37 | ```sh 38 | git submodule update --init 39 | ``` 40 | 41 | Build and bundle the plugin: 42 | 43 | ```sh 44 | ./gradlew assemble 45 | ``` 46 | 47 | To build for a different Elasticsearch version, add `-PelasticsearchVersion=<version>` to your build command. Also note the [branch and supported versions](#supported-versions).
For example, to build for Elasticsearch 7.3.1: 48 | 49 | ```sh 50 | ./gradlew assemble -PelasticsearchVersion=7.3.1 51 | ``` 52 | 53 | To run tests: 54 | 55 | ```sh 56 | ./gradlew check 57 | ``` 58 | -------------------------------------------------------------------------------- /build.gradle: -------------------------------------------------------------------------------- 1 | buildscript { 2 | repositories { 3 | mavenCentral() 4 | jcenter() 5 | } 6 | 7 | dependencies { 8 | classpath "org.elasticsearch.gradle:build-tools:${elasticsearchVersion}" 9 | } 10 | } 11 | 12 | apply plugin: 'java' 13 | apply plugin: 'elasticsearch.esplugin' 14 | apply from: 'tokenizer.gradle' 15 | 16 | version = "${pluginVersion}-es${versions.elasticsearch}" 17 | 18 | esplugin { 19 | name 'analysis-vi' 20 | version "${pluginVersion}" 21 | description 'Elasticsearch Vietnamese Analysis Plugin' 22 | classname 'org.elasticsearch.plugin.analysis.vi.AnalysisViPlugin' 23 | licenseFile rootProject.file('LICENSE') 24 | noticeFile rootProject.file('README.md') 25 | } 26 | 27 | integTestRunner { 28 | include 'org/elasticsearch/index/analysis/*Tests.class' 29 | } 30 | 31 | test.enabled = false 32 | licenseHeaders.enabled = false 33 | -------------------------------------------------------------------------------- /gradle.properties: -------------------------------------------------------------------------------- 1 | pluginVersion=1.0.0 2 | elasticsearchVersion=7.5.1 3 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sun-asterisk-research/elasticsearch-analysis-vi/84b7e5301461ce633f8b6e8aa52104dc37d51af4/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.0.1-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | # 4 | # Copyright 2015 the original author or authors. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # https://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | ############################################################################## 20 | ## 21 | ## Gradle start up script for UN*X 22 | ## 23 | ############################################################################## 24 | 25 | # Attempt to set APP_HOME 26 | # Resolve links: $0 may be a link 27 | PRG="$0" 28 | # Need this for relative symlinks. 
29 | while [ -h "$PRG" ] ; do 30 | ls=`ls -ld "$PRG"` 31 | link=`expr "$ls" : '.*-> \(.*\)$'` 32 | if expr "$link" : '/.*' > /dev/null; then 33 | PRG="$link" 34 | else 35 | PRG=`dirname "$PRG"`"/$link" 36 | fi 37 | done 38 | SAVED="`pwd`" 39 | cd "`dirname \"$PRG\"`/" >/dev/null 40 | APP_HOME="`pwd -P`" 41 | cd "$SAVED" >/dev/null 42 | 43 | APP_NAME="Gradle" 44 | APP_BASE_NAME=`basename "$0"` 45 | 46 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 47 | DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' 48 | 49 | # Use the maximum available, or set MAX_FD != -1 to use that value. 50 | MAX_FD="maximum" 51 | 52 | warn () { 53 | echo "$*" 54 | } 55 | 56 | die () { 57 | echo 58 | echo "$*" 59 | echo 60 | exit 1 61 | } 62 | 63 | # OS specific support (must be 'true' or 'false'). 64 | cygwin=false 65 | msys=false 66 | darwin=false 67 | nonstop=false 68 | case "`uname`" in 69 | CYGWIN* ) 70 | cygwin=true 71 | ;; 72 | Darwin* ) 73 | darwin=true 74 | ;; 75 | MINGW* ) 76 | msys=true 77 | ;; 78 | NONSTOP* ) 79 | nonstop=true 80 | ;; 81 | esac 82 | 83 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 84 | 85 | # Determine the Java command to use to start the JVM. 86 | if [ -n "$JAVA_HOME" ] ; then 87 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 88 | # IBM's JDK on AIX uses strange locations for the executables 89 | JAVACMD="$JAVA_HOME/jre/sh/java" 90 | else 91 | JAVACMD="$JAVA_HOME/bin/java" 92 | fi 93 | if [ ! -x "$JAVACMD" ] ; then 94 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 95 | 96 | Please set the JAVA_HOME variable in your environment to match the 97 | location of your Java installation." 98 | fi 99 | else 100 | JAVACMD="java" 101 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 102 | 103 | Please set the JAVA_HOME variable in your environment to match the 104 | location of your Java installation." 105 | fi 106 | 107 | # Increase the maximum file descriptors if we can. 108 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then 109 | MAX_FD_LIMIT=`ulimit -H -n` 110 | if [ $? -eq 0 ] ; then 111 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then 112 | MAX_FD="$MAX_FD_LIMIT" 113 | fi 114 | ulimit -n $MAX_FD 115 | if [ $? 
-ne 0 ] ; then 116 | warn "Could not set maximum file descriptor limit: $MAX_FD" 117 | fi 118 | else 119 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" 120 | fi 121 | fi 122 | 123 | # For Darwin, add options to specify how the application appears in the dock 124 | if $darwin; then 125 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" 126 | fi 127 | 128 | # For Cygwin or MSYS, switch paths to Windows format before running java 129 | if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then 130 | APP_HOME=`cygpath --path --mixed "$APP_HOME"` 131 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` 132 | JAVACMD=`cygpath --unix "$JAVACMD"` 133 | 134 | # We build the pattern for arguments to be converted via cygpath 135 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` 136 | SEP="" 137 | for dir in $ROOTDIRSRAW ; do 138 | ROOTDIRS="$ROOTDIRS$SEP$dir" 139 | SEP="|" 140 | done 141 | OURCYGPATTERN="(^($ROOTDIRS))" 142 | # Add a user-defined pattern to the cygpath arguments 143 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then 144 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" 145 | fi 146 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 147 | i=0 148 | for arg in "$@" ; do 149 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` 150 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option 151 | 152 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition 153 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` 154 | else 155 | eval `echo args$i`="\"$arg\"" 156 | fi 157 | i=$((i+1)) 158 | done 159 | case $i in 160 | (0) set -- ;; 161 | (1) set -- "$args0" ;; 162 | (2) set -- "$args0" "$args1" ;; 163 | (3) set -- "$args0" "$args1" "$args2" ;; 164 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;; 165 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; 166 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; 167 | (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; 168 | (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; 169 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 170 | esac 171 | fi 172 | 173 | # Escape application args 174 | save () { 175 | for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done 176 | echo " " 177 | } 178 | APP_ARGS=$(save "$@") 179 | 180 | # Collect all arguments for the java command, following the shell quoting and substitution rules 181 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" 182 | 183 | # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong 184 | if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then 185 | cd "$(dirname "$0")" 186 | fi 187 | 188 | exec "$JAVACMD" "$@" 189 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 
6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 15 | @rem 16 | 17 | @if "%DEBUG%" == "" @echo off 18 | @rem ########################################################################## 19 | @rem 20 | @rem Gradle startup script for Windows 21 | @rem 22 | @rem ########################################################################## 23 | 24 | @rem Set local scope for the variables with windows NT shell 25 | if "%OS%"=="Windows_NT" setlocal 26 | 27 | set DIRNAME=%~dp0 28 | if "%DIRNAME%" == "" set DIRNAME=. 29 | set APP_BASE_NAME=%~n0 30 | set APP_HOME=%DIRNAME% 31 | 32 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 33 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 34 | 35 | @rem Find java.exe 36 | if defined JAVA_HOME goto findJavaFromJavaHome 37 | 38 | set JAVA_EXE=java.exe 39 | %JAVA_EXE% -version >NUL 2>&1 40 | if "%ERRORLEVEL%" == "0" goto init 41 | 42 | echo. 43 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 44 | echo. 45 | echo Please set the JAVA_HOME variable in your environment to match the 46 | echo location of your Java installation. 47 | 48 | goto fail 49 | 50 | :findJavaFromJavaHome 51 | set JAVA_HOME=%JAVA_HOME:"=% 52 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 53 | 54 | if exist "%JAVA_EXE%" goto init 55 | 56 | echo. 57 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 58 | echo. 59 | echo Please set the JAVA_HOME variable in your environment to match the 60 | echo location of your Java installation. 61 | 62 | goto fail 63 | 64 | :init 65 | @rem Get command-line arguments, handling Windows variants 66 | 67 | if not "%OS%" == "Windows_NT" goto win9xME_args 68 | 69 | :win9xME_args 70 | @rem Slurp the command line arguments. 71 | set CMD_LINE_ARGS= 72 | set _SKIP=2 73 | 74 | :win9xME_args_slurp 75 | if "x%~1" == "x" goto execute 76 | 77 | set CMD_LINE_ARGS=%* 78 | 79 | :execute 80 | @rem Setup the command line 81 | 82 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 83 | 84 | @rem Execute Gradle 85 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 86 | 87 | :end 88 | @rem End local scope for the variables with windows NT shell 89 | if "%ERRORLEVEL%"=="0" goto mainEnd 90 | 91 | :fail 92 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 93 | rem the _cmd.exe /c_ return code! 
94 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 95 | exit /b 1 96 | 97 | :mainEnd 98 | if "%OS%"=="Windows_NT" endlocal 99 | 100 | :omega 101 | -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'elasticsearch-analysis-vi' 2 | -------------------------------------------------------------------------------- /src/main/java/com/coccoc/Token.java: -------------------------------------------------------------------------------- 1 | package com.coccoc; 2 | 3 | public final class Token { 4 | public enum Type { 5 | WORD, 6 | NUMBER; 7 | 8 | private static Type[] values = null; 9 | 10 | static { 11 | Type.values = Type.values(); 12 | } 13 | 14 | public static Type fromInt(int i) { 15 | return Type.values[i]; 16 | } 17 | } 18 | 19 | private final String text; 20 | private final Type type; 21 | private final int originalStart; 22 | private final int originalEnd; 23 | 24 | public Token(String text, int type, int originalStart, int originalEnd) { 25 | this(text, Type.fromInt(type), originalStart, originalEnd); 26 | } 27 | 28 | public Token(String text, Type type, int originalStart, int originalEnd) { 29 | this.text = text; 30 | this.type = type; 31 | this.originalStart = originalStart; 32 | this.originalEnd = originalEnd; 33 | } 34 | 35 | public String getText() { 36 | return text; 37 | } 38 | 39 | public int getLength() { 40 | return text.length(); 41 | } 42 | 43 | public Type getType() { 44 | return type; 45 | } 46 | 47 | public int getOriginalStart() { 48 | return originalStart; 49 | } 50 | 51 | public int getOriginalEnd() { 52 | return originalEnd; 53 | } 54 | 55 | public String toString() { 56 | return text; 57 | } 58 | 59 | public char[] toCharArray() { 60 | return text.toCharArray(); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/main/java/com/coccoc/Tokenizer.java: -------------------------------------------------------------------------------- 1 | package com.coccoc; 2 | 3 | import java.io.IOException; 4 | import java.io.Reader; 5 | import java.net.URI; 6 | import java.net.URISyntaxException; 7 | import java.nio.ByteBuffer; 8 | import java.nio.ByteOrder; 9 | import java.nio.IntBuffer; 10 | import java.security.AccessController; 11 | import java.security.PrivilegedAction; 12 | import java.util.ArrayList; 13 | import java.util.List; 14 | 15 | public class Tokenizer { 16 | public enum Mode { 17 | NORMAL(0), 18 | HOST(1), 19 | URL(2); 20 | 21 | private int value; 22 | 23 | Mode(int value) { 24 | this.value = value; 25 | } 26 | } 27 | 28 | private static final String libPath; 29 | private static final String dictPath; 30 | 31 | private native ByteBuffer[] segment(String text, int tokenizeOption); 32 | private native void freeMemory(ByteBuffer p); 33 | private native void initialize(String dictPath) throws RuntimeException; 34 | 35 | private static Tokenizer instance; 36 | 37 | static { 38 | try { 39 | URI file = Tokenizer.class.getProtectionDomain().getCodeSource().getLocation().toURI(); 40 | libPath = file.resolve("lib/libcoccoc_tokenizer_jni.so").getPath(); 41 | dictPath = file.resolve("dicts").getPath(); 42 | } catch (URISyntaxException e) { 43 | throw new RuntimeException("Could not initialize Tokenizer"); 44 | } 45 | } 46 | 47 | public static Tokenizer getInstance() { 48 | if (instance == null) { 49 | instance = new Tokenizer(); 50 | } 51 | 52 | return instance; 53 | } 54 | 55 | private 
Tokenizer() { 56 | AccessController.doPrivileged((PrivilegedAction) () -> { 57 | System.load(libPath); 58 | return null; 59 | }); 60 | initialize(dictPath); 61 | } 62 | 63 | public List tokenize(String text, Mode mode) { 64 | if (text == null) { 65 | throw new IllegalArgumentException("text is null"); 66 | } 67 | 68 | ByteBuffer[] segmentResults = segment(text, mode.value); 69 | 70 | IntBuffer normalizedChars = segmentResults[0].order(ByteOrder.nativeOrder()).asIntBuffer(); 71 | IntBuffer rawTokens = segmentResults[1].order(ByteOrder.nativeOrder()).asIntBuffer(); 72 | ByteBuffer pointers = segmentResults[2]; 73 | 74 | StringBuilder sb = new StringBuilder(); 75 | 76 | while (normalizedChars.hasRemaining()) { 77 | sb.appendCodePoint(normalizedChars.get()); 78 | } 79 | 80 | String normalizedText = sb.toString(); 81 | 82 | int tokensCount = rawTokens.capacity() / 6; 83 | List tokens = new ArrayList(tokensCount); 84 | 85 | for (int i = 0; i < tokensCount; i++) { 86 | int offset = i * 6; 87 | int normalizedStart = rawTokens.get(offset); 88 | int normalizedEnd = rawTokens.get(offset + 1); 89 | int originalStart = rawTokens.get(offset + 2); 90 | int originalEnd = rawTokens.get(offset + 3); 91 | 92 | String tokenText = normalizedText.substring(normalizedStart, normalizedEnd); 93 | int tokenType = rawTokens.get(offset + 4); 94 | 95 | tokens.add(new Token(tokenText, tokenType, originalStart, originalEnd)); 96 | } 97 | 98 | freeMemory(pointers); 99 | 100 | return tokens; 101 | } 102 | 103 | public List tokenize(Reader input, Mode mode) throws IOException { 104 | char[] buffer = new char[1024]; 105 | StringBuilder sb = new StringBuilder(); 106 | int numCharsRead; 107 | while ((numCharsRead = input.read(buffer, 0, buffer.length)) != -1) { 108 | sb.append(buffer, 0, numCharsRead); 109 | } 110 | 111 | return tokenize(sb.toString(), mode); 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /src/main/java/org/apache/lucene/analysis/vi/VietnameseAnalyzer.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.vi; 2 | 3 | import java.io.IOException; 4 | 5 | import com.coccoc.Tokenizer.Mode; 6 | 7 | import org.apache.lucene.analysis.CharArraySet; 8 | import org.apache.lucene.analysis.StopwordAnalyzerBase; 9 | import org.apache.lucene.analysis.TokenStream; 10 | import org.apache.lucene.analysis.Tokenizer; 11 | import org.apache.lucene.analysis.core.StopFilter; 12 | 13 | public class VietnameseAnalyzer extends StopwordAnalyzerBase { 14 | private final Mode mode; 15 | 16 | public VietnameseAnalyzer(Mode mode, CharArraySet stopwords) { 17 | super(stopwords); 18 | this.mode = mode; 19 | } 20 | 21 | public static CharArraySet getDefaultStopSet() { 22 | return DefaultSetHolder.DEFAULT_STOP_SET; 23 | } 24 | 25 | private static class DefaultSetHolder { 26 | static final CharArraySet DEFAULT_STOP_SET; 27 | 28 | static { 29 | try { 30 | DEFAULT_STOP_SET = loadStopwordSet(true, VietnameseAnalyzer.class, "/stopwords.txt", "#"); 31 | } catch (IOException e) { 32 | throw new RuntimeException("Unable to load default stopword set"); 33 | } 34 | } 35 | } 36 | 37 | @Override 38 | protected TokenStreamComponents createComponents(String fieldName) { 39 | Tokenizer tokenizer = new VietnameseTokenizer(mode); 40 | TokenStream stream = new StopFilter(tokenizer, stopwords); 41 | 42 | return new TokenStreamComponents(tokenizer, stream); 43 | } 44 | } 45 | 
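For orientation only (not a file in this repository): a minimal sketch of driving the analyzer above through the plain Lucene `TokenStream` API. It assumes the coccoc JNI library and dictionaries produced by `tokenizer.gradle` are resolvable next to the compiled classes; the `AnalyzerDemo` class name and the sample sentence are purely illustrative.

```java
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.vi.VietnameseAnalyzer;

import com.coccoc.Tokenizer.Mode;

public final class AnalyzerDemo {
    public static void main(String[] args) throws Exception {
        // Same composition as createComponents(): vi_tokenizer followed by the vi_stop filter.
        VietnameseAnalyzer analyzer = new VietnameseAnalyzer(Mode.NORMAL, VietnameseAnalyzer.getDefaultStopSet());
        try (TokenStream stream = analyzer.tokenStream("content", "Công nghệ thông tin Việt Nam")) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                // Multi-syllable Vietnamese words come back as single tokens; stopwords are removed.
                System.out.println(term.toString());
            }
            stream.end();
        }
        analyzer.close();
    }
}
```

Because the analyzer delegates segmentation to the singleton `com.coccoc.Tokenizer`, the native library must be loadable before the first token stream is created.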
-------------------------------------------------------------------------------- /src/main/java/org/apache/lucene/analysis/vi/VietnameseTokenizer.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.vi; 2 | 3 | import java.io.IOException; 4 | import java.util.Iterator; 5 | import java.util.Locale; 6 | 7 | import com.coccoc.Token; 8 | import com.coccoc.Tokenizer.Mode; 9 | 10 | import org.apache.lucene.analysis.Tokenizer; 11 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 12 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 13 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 14 | 15 | public class VietnameseTokenizer extends Tokenizer { 16 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); 17 | private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); 18 | private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); 19 | 20 | private Mode tokenizeMode; 21 | private com.coccoc.Tokenizer tokenizer; 22 | private Iterator tokens; 23 | 24 | private int currentOffset = 0; 25 | 26 | public VietnameseTokenizer(Mode tokenizeMode) { 27 | this.tokenizeMode = tokenizeMode; 28 | this.tokenizer = com.coccoc.Tokenizer.getInstance(); 29 | } 30 | 31 | @Override 32 | public boolean incrementToken() throws IOException { 33 | clearAttributes(); 34 | 35 | if (tokens.hasNext()) { 36 | final Token token = tokens.next(); 37 | final int tokenLength = token.getLength(); 38 | final int start = correctOffset(token.getOriginalStart()); 39 | final int end = correctOffset(token.getOriginalEnd()); 40 | 41 | termAtt.copyBuffer(token.toCharArray(), 0, tokenLength); 42 | typeAtt.setType(token.getType().name().toLowerCase(Locale.ROOT)); 43 | offsetAtt.setOffset(start, end); 44 | currentOffset = end; 45 | 46 | return true; 47 | } 48 | 49 | return false; 50 | } 51 | 52 | @Override 53 | public void end() throws IOException { 54 | super.end(); 55 | int finalOffset = correctOffset(currentOffset); 56 | offsetAtt.setOffset(finalOffset, finalOffset); 57 | } 58 | 59 | @Override 60 | public void reset() throws IOException { 61 | super.reset(); 62 | currentOffset = 0; 63 | tokens = tokenizer.tokenize(input, tokenizeMode).iterator(); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/VietnameseAnalyzerProvider.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import com.coccoc.Tokenizer.Mode; 4 | 5 | import org.apache.lucene.analysis.CharArraySet; 6 | import org.apache.lucene.analysis.vi.VietnameseAnalyzer; 7 | import org.elasticsearch.common.settings.Settings; 8 | import org.elasticsearch.env.Environment; 9 | import org.elasticsearch.index.IndexSettings; 10 | 11 | public class VietnameseAnalyzerProvider extends AbstractIndexAnalyzerProvider { 12 | private final VietnameseAnalyzer analyzer; 13 | 14 | public VietnameseAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { 15 | super(indexSettings, name, settings); 16 | 17 | final Mode tokenizeMode = VietnameseTokenizerFactory.getTokenizeMode(settings); 18 | final CharArraySet stopwords = Analysis.parseStopWords(env, settings, VietnameseAnalyzer.getDefaultStopSet()); 19 | analyzer = new VietnameseAnalyzer(tokenizeMode, stopwords); 20 | } 21 | 22 | @Override 23 | public 
VietnameseAnalyzer get() { 24 | return this.analyzer; 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/VietnameseStopTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import java.util.Collections; 4 | import java.util.Map; 5 | import java.util.Set; 6 | 7 | import org.apache.lucene.analysis.CharArraySet; 8 | import org.apache.lucene.analysis.TokenStream; 9 | import org.apache.lucene.analysis.core.StopFilter; 10 | import org.apache.lucene.analysis.vi.VietnameseAnalyzer; 11 | import org.apache.lucene.search.suggest.analyzing.SuggestStopFilter; 12 | import org.elasticsearch.common.settings.Settings; 13 | import org.elasticsearch.env.Environment; 14 | import org.elasticsearch.index.IndexSettings; 15 | 16 | public class VietnameseStopTokenFilterFactory extends AbstractTokenFilterFactory { 17 | private static final Map<String, Set<?>> NAMED_STOP_WORDS; 18 | private final CharArraySet stopwords; 19 | private final boolean ignoreCase; 20 | private final boolean removeTrailing; 21 | 22 | static { 23 | NAMED_STOP_WORDS = Collections.singletonMap("_vietnamese_", VietnameseAnalyzer.getDefaultStopSet()); 24 | } 25 | 26 | public VietnameseStopTokenFilterFactory( 27 | IndexSettings indexSettings, Environment env, String name, Settings settings 28 | ) { 29 | super(indexSettings, name, settings); 30 | this.ignoreCase = settings.getAsBoolean("ignore_case", false); 31 | this.removeTrailing = settings.getAsBoolean("remove_trailing", true); 32 | this.stopwords = Analysis.parseWords( 33 | env, settings, "stopwords", VietnameseAnalyzer.getDefaultStopSet(), NAMED_STOP_WORDS, ignoreCase 34 | ); 35 | } 36 | 37 | @Override 38 | public TokenStream create(TokenStream tokenStream) { 39 | return removeTrailing 40 | ?
new StopFilter(tokenStream, stopwords) 41 | : new SuggestStopFilter(tokenStream, stopwords); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/VietnameseTokenizerFactory.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import java.util.Locale; 4 | 5 | import com.coccoc.Tokenizer.Mode; 6 | 7 | import org.apache.lucene.analysis.Tokenizer; 8 | import org.apache.lucene.analysis.vi.VietnameseTokenizer; 9 | import org.elasticsearch.common.settings.Settings; 10 | import org.elasticsearch.env.Environment; 11 | import org.elasticsearch.index.IndexSettings; 12 | 13 | public class VietnameseTokenizerFactory extends AbstractTokenizerFactory { 14 | private final Mode tokenizeMode; 15 | 16 | public VietnameseTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { 17 | super(indexSettings, settings, name); 18 | 19 | tokenizeMode = getTokenizeMode(settings); 20 | } 21 | 22 | public static Mode getTokenizeMode(Settings settings) 23 | { 24 | String modeSetting = settings.get("mode", "normal").toUpperCase(Locale.ROOT); 25 | return Mode.valueOf(modeSetting); 26 | } 27 | 28 | public Tokenizer create() { 29 | return new VietnameseTokenizer(tokenizeMode); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/plugin/analysis/vi/AnalysisViPlugin.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.plugin.analysis.vi; 2 | 3 | import java.util.Collections; 4 | import java.util.HashMap; 5 | import java.util.Map; 6 | 7 | import org.apache.lucene.analysis.Analyzer; 8 | import org.elasticsearch.index.analysis.AnalyzerProvider; 9 | import org.elasticsearch.index.analysis.TokenFilterFactory; 10 | import org.elasticsearch.index.analysis.TokenizerFactory; 11 | import org.elasticsearch.index.analysis.VietnameseAnalyzerProvider; 12 | import org.elasticsearch.index.analysis.VietnameseStopTokenFilterFactory; 13 | import org.elasticsearch.index.analysis.VietnameseTokenizerFactory; 14 | import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; 15 | import org.elasticsearch.plugins.AnalysisPlugin; 16 | import org.elasticsearch.plugins.Plugin; 17 | 18 | public class AnalysisViPlugin extends Plugin implements AnalysisPlugin { 19 | @Override 20 | public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() { 21 | Map<String, AnalysisProvider<TokenFilterFactory>> extra = new HashMap<>(); 22 | extra.put("vi_stop", VietnameseStopTokenFilterFactory::new); 23 | 24 | return extra; 25 | } 26 | 27 | @Override 28 | public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() { 29 | return Collections.singletonMap("vi_tokenizer", VietnameseTokenizerFactory::new); 30 | } 31 | 32 | @Override 33 | public Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() { 34 | return Collections.singletonMap("vi_analyzer", VietnameseAnalyzerProvider::new); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/jni/Tokenizer.cpp: -------------------------------------------------------------------------------- 1 | #include <jni.h> 2 | #include "com_coccoc_Tokenizer.h" 3 | 4 | static jclass java_nio_ByteBuffer; 5 | static jint JNI_VERSION = JNI_VERSION_10; 6 | 7 | jint JNI_OnLoad(JavaVM *vm, void *reserved) 8 | { 9 | JNIEnv* env; 10 | if (vm->GetEnv(reinterpret_cast<void **>(&env), JNI_VERSION) != JNI_OK) { 11 | return JNI_ERR; 12 | } 13 | 14 | java_nio_ByteBuffer =
static_cast(env->NewGlobalRef(env->FindClass("java/nio/ByteBuffer"))); 15 | 16 | return JNI_VERSION; 17 | } 18 | 19 | void JNI_OnUnload(JavaVM *vm, void *reserved) 20 | { 21 | JNIEnv* env; 22 | vm->GetEnv(reinterpret_cast(&env), JNI_VERSION); 23 | 24 | env->DeleteGlobalRef(java_nio_ByteBuffer); 25 | } 26 | 27 | /** 28 | * Segment a document and return an array of direct ByteBuffer referring 29 | * to segmentation result vectors. Further processing happens in Java code. 30 | * The method returns 3 ByteBuffer. The first one is the normalized text. 31 | * The second one contains Token structs (see token.hpp). The last one contains 32 | * pointers to dynamically created vectors, used for clean up when done. 33 | */ 34 | JNIEXPORT jobjectArray JNICALL Java_com_coccoc_Tokenizer_segment( 35 | JNIEnv *env, jobject obj, jstring jni_text, jint tokenize_option) 36 | { 37 | const jchar *jtext = env->GetStringCritical(jni_text, nullptr); 38 | int text_length = env->GetStringLength(jni_text); 39 | 40 | // Use pointer to avoid automatic deallocation 41 | // Must call `freeMemory` when done to clean up 42 | std::vector< uint32_t > *normalized = new std::vector< uint32_t >(); 43 | normalized->reserve(text_length); 44 | 45 | std::vector< int > original_pos; 46 | Tokenizer::instance().normalize_for_tokenization(jtext, text_length, *normalized, original_pos, true); 47 | env->ReleaseStringCritical(jni_text, jtext); 48 | 49 | // Use pointer here too 50 | std::vector< Token > *tokens = new std::vector< Token >(); 51 | // space_positions is only used when `for_transforming` is true? 52 | std::vector< int > space_positions; 53 | 54 | Tokenizer::instance().handle_tokenization_request< Token >( 55 | *normalized, *tokens, space_positions, original_pos, false, tokenize_option); 56 | 57 | for (size_t i = 0; i < tokens->size(); ++i) 58 | { 59 | tokens->at(i).original_start += original_pos[tokens->at(i).normalized_start]; 60 | tokens->at(i).original_end += original_pos[tokens->at(i).normalized_end]; 61 | } 62 | 63 | // Keep pointers to original vectors in another array so we can clean up later 64 | // When done, pass this pointer (ByteBuffer) to `freeMemory` to clean up 65 | int64_t *p = new int64_t[2]; 66 | p[0] = (int64_t) normalized; 67 | p[1] = (int64_t) tokens; 68 | 69 | jobjectArray results = env->NewObjectArray(3, java_nio_ByteBuffer, nullptr); 70 | 71 | env->SetObjectArrayElement(results, 0, env->NewDirectByteBuffer(normalized->data(), normalized->size() * 4)); 72 | env->SetObjectArrayElement(results, 1, env->NewDirectByteBuffer(tokens->data(), tokens->size() * 6 * 4)); 73 | env->SetObjectArrayElement(results, 2, env->NewDirectByteBuffer(p, 0)); 74 | 75 | return results; 76 | } 77 | 78 | JNIEXPORT void JNICALL Java_com_coccoc_Tokenizer_freeMemory(JNIEnv *env, jobject obj, jobject res_pointer) 79 | { 80 | // Cast each object pointer to their respective type, must be careful 81 | int64_t *p = static_cast(env->GetDirectBufferAddress(res_pointer)); 82 | delete (std::vector< uint32_t > *) (p[0]); 83 | delete (std::vector< Token > *) (p[1]); 84 | delete[](int64_t *) p; 85 | } 86 | 87 | JNIEXPORT void JNICALL Java_com_coccoc_Tokenizer_initialize(JNIEnv *env, jobject obj, jstring jni_dict_path) 88 | { 89 | const char *dict_path = env->GetStringUTFChars(jni_dict_path, nullptr); 90 | int status_code = Tokenizer::instance().initialize(std::string(dict_path)); 91 | 92 | if (status_code != 0) { 93 | jclass java_lang_RuntimeException = env->FindClass("java/lang/RuntimeException"); 94 | 95 | 
env->ThrowNew(java_lang_RuntimeException, "Could not load dictionary"); 96 | } 97 | 98 | env->ReleaseStringUTFChars(jni_dict_path, dict_path); 99 | } 100 | -------------------------------------------------------------------------------- /src/main/plugin-metadata/plugin-security.policy: -------------------------------------------------------------------------------- 1 | grant { 2 | permission java.lang.RuntimePermission "loadLibrary.*"; 3 | }; 4 | -------------------------------------------------------------------------------- /src/main/resources/stopwords.txt: -------------------------------------------------------------------------------- 1 | bị 2 | bởi 3 | cả 4 | các 5 | cái 6 | cần 7 | càng 8 | chỉ 9 | chiếc 10 | cho 11 | chứ 12 | chưa 13 | chuyện 14 | có 15 | có thể 16 | cứ 17 | của 18 | cùng 19 | cũng 20 | đã 21 | đang 22 | để 23 | đến nỗi 24 | đều 25 | điều 26 | do 27 | đó 28 | được 29 | dưới 30 | gì 31 | khi 32 | không 33 | là 34 | lại 35 | lên 36 | lúc 37 | mà 38 | mỗi 39 | một cách 40 | này 41 | nên 42 | nếu 43 | ngay 44 | nhiều 45 | như 46 | nhưng 47 | những 48 | nơi 49 | nữa 50 | phải 51 | qua 52 | ra 53 | rằng 54 | rất 55 | rồi 56 | sau 57 | sẽ 58 | so 59 | sự 60 | tại 61 | theo 62 | thì 63 | trên 64 | trước 65 | từ 66 | từng 67 | và 68 | vẫn 69 | vào 70 | vậy 71 | vì 72 | việc 73 | với 74 | vừa 75 | vâng 76 | à 77 | ừ 78 | từ 79 | -------------------------------------------------------------------------------- /src/test/java/org/elasticsearch/index/analysis/VietnameseAnalysisIntegrationTests.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import java.util.Collection; 4 | import java.util.Collections; 5 | 6 | import org.elasticsearch.action.admin.cluster.node.info.NodeInfo; 7 | import org.elasticsearch.action.admin.cluster.node.info.NodesInfoResponse; 8 | import org.elasticsearch.plugin.analysis.vi.AnalysisViPlugin; 9 | import org.elasticsearch.plugins.Plugin; 10 | import org.elasticsearch.test.ESIntegTestCase; 11 | 12 | import static org.hamcrest.Matchers.is; 13 | 14 | public class VietnameseAnalysisIntegrationTests extends ESIntegTestCase { 15 | @Override 16 | protected Collection> nodePlugins() { 17 | return Collections.singleton(AnalysisViPlugin.class); 18 | } 19 | 20 | public void testPluginIsLoaded() throws Exception { 21 | NodesInfoResponse response = client().admin().cluster().prepareNodesInfo().setPlugins(true).get(); 22 | 23 | for (NodeInfo nodeInfo : response.getNodes()) { 24 | boolean pluginLoaded = nodeInfo.getPlugins().getPluginInfos() 25 | .stream().anyMatch(plugin -> plugin.getName().equals("analysis-vi")); 26 | 27 | assertThat(pluginLoaded, is(true)); 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /tokenizer.gradle: -------------------------------------------------------------------------------- 1 | def JAVA_HOME = System.getenv('JAVA_HOME') ?: System.getenv('JDK_HOME') 2 | 3 | task configureTokenizer(type:Exec) { 4 | outputs.dir 'build/tokenizer' 5 | 6 | workingDir 'build/tokenizer' 7 | commandLine 'cmake', '../../coccoc-tokenizer' 8 | } 9 | 10 | task compileDict(type:Exec) { 11 | outputs.dir 'build/tokenizer' 12 | outputs.files 'multiterm_trie.dump', 'nontone_pair_freq_map.dump', 'syllable_trie.dump' 13 | 14 | dependsOn 'configureTokenizer' 15 | workingDir 'build/tokenizer' 16 | commandLine 'make', 'compile_dict' 17 | } 18 | 19 | task compileTokenizer(type:Exec) { 20 | outputs.dirs 'build/lib' 21 | 22 | 
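  // Links the JNI bridge (src/main/jni/Tokenizer.cpp) against the coccoc-tokenizer sources and the
  // JNI headers that compileJava emits into build/headers (see the '-h' compiler flag below),
  // producing build/lib/libcoccoc_tokenizer_jni.so, which com.coccoc.Tokenizer loads at runtime.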
dependsOn 'configureTokenizer' 23 | dependsOn 'compileJava' 24 | 25 | commandLine 'g++', '-Wall', '-Werror', '-Wno-deprecated', '-shared', '-std=c++11', '-O3', '-DNDEBUG', '-ggdb', '-fPIC', 26 | '-I', 'coccoc-tokenizer', 27 | '-I', 'build/headers', 28 | '-I', "build/tokenizer/auto", 29 | '-I', "${JAVA_HOME}/include", 30 | '-I', "${JAVA_HOME}/include/linux", 31 | '-o', 'build/lib/libcoccoc_tokenizer_jni.so', 32 | 'src/main/jni/Tokenizer.cpp' 33 | } 34 | 35 | compileJava { 36 | options.compilerArgs += ['-h', file('build/headers')] 37 | } 38 | 39 | bundlePlugin { 40 | dependsOn 'compileDict' 41 | dependsOn 'compileTokenizer' 42 | 43 | from('build/lib') { 44 | into 'lib' 45 | } 46 | 47 | from('build/tokenizer') { 48 | include '*.dump' 49 | into 'dicts' 50 | } 51 | 52 | from('coccoc-tokenizer/dicts/vn_lang_tool') { 53 | into 'dicts' 54 | } 55 | } 56 | --------------------------------------------------------------------------------