├── .gitignore
├── .gitmodules
├── LICENSE
├── README.md
├── build.gradle
├── gradle.properties
├── gradle
│   └── wrapper
│       ├── gradle-wrapper.jar
│       └── gradle-wrapper.properties
├── gradlew
├── gradlew.bat
├── settings.gradle
├── src
│   ├── main
│   │   ├── java
│   │   │   ├── com
│   │   │   │   └── coccoc
│   │   │   │       ├── Token.java
│   │   │   │       └── Tokenizer.java
│   │   │   └── org
│   │   │       ├── apache
│   │   │       │   └── lucene
│   │   │       │       └── analysis
│   │   │       │           └── vi
│   │   │       │               ├── VietnameseAnalyzer.java
│   │   │       │               └── VietnameseTokenizer.java
│   │   │       └── elasticsearch
│   │   │           ├── index
│   │   │           │   └── analysis
│   │   │           │       ├── VietnameseAnalyzerProvider.java
│   │   │           │       ├── VietnameseStopTokenFilterFactory.java
│   │   │           │       └── VietnameseTokenizerFactory.java
│   │   │           └── plugin
│   │   │               └── analysis
│   │   │                   └── vi
│   │   │                       └── AnalysisViPlugin.java
│   │   ├── jni
│   │   │   └── Tokenizer.cpp
│   │   ├── plugin-metadata
│   │   │   └── plugin-security.policy
│   │   └── resources
│   │       └── stopwords.txt
│   └── test
│       └── java
│           └── org
│               └── elasticsearch
│                   └── index
│                       └── analysis
│                           └── VietnameseAnalysisIntegrationTests.java
└── tokenizer.gradle
/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled class file
2 | *.class
3 |
4 | # Log file
5 | *.log
6 |
7 | # BlueJ files
8 | *.ctxt
9 |
10 | # Mobile Tools for Java (J2ME)
11 | .mtj.tmp/
12 |
13 | # Package Files #
14 | *.jar
15 | *.war
16 | *.nar
17 | *.ear
18 | *.zip
19 | *.tar.gz
20 | *.rar
21 |
22 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
23 | hs_err_pid*
24 |
25 | # Gradle
26 | .gradle
27 | build
28 | !gradle-wrapper.jar
29 |
30 | # Eclipse project files
31 | .project
32 | .classpath
33 | .settings
34 |
35 | # IDE
36 | .vscode
37 | .idea
38 | *.iml
39 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "coccoc-tokenizer"]
2 | path = coccoc-tokenizer
3 | url = https://github.com/coccoc/coccoc-tokenizer
4 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 |
9 | This version of the GNU Lesser General Public License incorporates
10 | the terms and conditions of version 3 of the GNU General Public
11 | License, supplemented by the additional permissions listed below.
12 |
13 | 0. Additional Definitions.
14 |
15 | As used herein, "this License" refers to version 3 of the GNU Lesser
16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
17 | General Public License.
18 |
19 | "The Library" refers to a covered work governed by this License,
20 | other than an Application or a Combined Work as defined below.
21 |
22 | An "Application" is any work that makes use of an interface provided
23 | by the Library, but which is not otherwise based on the Library.
24 | Defining a subclass of a class defined by the Library is deemed a mode
25 | of using an interface provided by the Library.
26 |
27 | A "Combined Work" is a work produced by combining or linking an
28 | Application with the Library. The particular version of the Library
29 | with which the Combined Work was made is also called the "Linked
30 | Version".
31 |
32 | The "Minimal Corresponding Source" for a Combined Work means the
33 | Corresponding Source for the Combined Work, excluding any source code
34 | for portions of the Combined Work that, considered in isolation, are
35 | based on the Application, and not on the Linked Version.
36 |
37 | The "Corresponding Application Code" for a Combined Work means the
38 | object code and/or source code for the Application, including any data
39 | and utility programs needed for reproducing the Combined Work from the
40 | Application, but excluding the System Libraries of the Combined Work.
41 |
42 | 1. Exception to Section 3 of the GNU GPL.
43 |
44 | You may convey a covered work under sections 3 and 4 of this License
45 | without being bound by section 3 of the GNU GPL.
46 |
47 | 2. Conveying Modified Versions.
48 |
49 | If you modify a copy of the Library, and, in your modifications, a
50 | facility refers to a function or data to be supplied by an Application
51 | that uses the facility (other than as an argument passed when the
52 | facility is invoked), then you may convey a copy of the modified
53 | version:
54 |
55 | a) under this License, provided that you make a good faith effort to
56 | ensure that, in the event an Application does not supply the
57 | function or data, the facility still operates, and performs
58 | whatever part of its purpose remains meaningful, or
59 |
60 | b) under the GNU GPL, with none of the additional permissions of
61 | this License applicable to that copy.
62 |
63 | 3. Object Code Incorporating Material from Library Header Files.
64 |
65 | The object code form of an Application may incorporate material from
66 | a header file that is part of the Library. You may convey such object
67 | code under terms of your choice, provided that, if the incorporated
68 | material is not limited to numerical parameters, data structure
69 | layouts and accessors, or small macros, inline functions and templates
70 | (ten or fewer lines in length), you do both of the following:
71 |
72 | a) Give prominent notice with each copy of the object code that the
73 | Library is used in it and that the Library and its use are
74 | covered by this License.
75 |
76 | b) Accompany the object code with a copy of the GNU GPL and this license
77 | document.
78 |
79 | 4. Combined Works.
80 |
81 | You may convey a Combined Work under terms of your choice that,
82 | taken together, effectively do not restrict modification of the
83 | portions of the Library contained in the Combined Work and reverse
84 | engineering for debugging such modifications, if you also do each of
85 | the following:
86 |
87 | a) Give prominent notice with each copy of the Combined Work that
88 | the Library is used in it and that the Library and its use are
89 | covered by this License.
90 |
91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license
92 | document.
93 |
94 | c) For a Combined Work that displays copyright notices during
95 | execution, include the copyright notice for the Library among
96 | these notices, as well as a reference directing the user to the
97 | copies of the GNU GPL and this license document.
98 |
99 | d) Do one of the following:
100 |
101 | 0) Convey the Minimal Corresponding Source under the terms of this
102 | License, and the Corresponding Application Code in a form
103 | suitable for, and under terms that permit, the user to
104 | recombine or relink the Application with a modified version of
105 | the Linked Version to produce a modified Combined Work, in the
106 | manner specified by section 6 of the GNU GPL for conveying
107 | Corresponding Source.
108 |
109 | 1) Use a suitable shared library mechanism for linking with the
110 | Library. A suitable mechanism is one that (a) uses at run time
111 | a copy of the Library already present on the user's computer
112 | system, and (b) will operate properly with a modified version
113 | of the Library that is interface-compatible with the Linked
114 | Version.
115 |
116 | e) Provide Installation Information, but only if you would otherwise
117 | be required to provide such information under section 6 of the
118 | GNU GPL, and only to the extent that such information is
119 | necessary to install and execute a modified version of the
120 | Combined Work produced by recombining or relinking the
121 | Application with a modified version of the Linked Version. (If
122 | you use option 4d0, the Installation Information must accompany
123 | the Minimal Corresponding Source and Corresponding Application
124 | Code. If you use option 4d1, you must provide the Installation
125 | Information in the manner specified by section 6 of the GNU GPL
126 | for conveying Corresponding Source.)
127 |
128 | 5. Combined Libraries.
129 |
130 | You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 |
136 | a) Accompany the combined library with a copy of the same work based
137 | on the Library, uncombined with any other library facilities,
138 | conveyed under the terms of this License.
139 |
140 | b) Give prominent notice with the combined library that part of it
141 | is a work based on the Library, and explaining where to find the
142 | accompanying uncombined form of the same work.
143 |
144 | 6. Revised Versions of the GNU Lesser General Public License.
145 |
146 | The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 |
151 | Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 |
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Vietnamese Analysis Plugin for Elasticsearch
2 |
3 | The Vietnamese Analysis plugin integrates Vietnamese language analysis into Elasticsearch.
4 | It provides the following components: an analyzer (`vi_analyzer`), a tokenizer (`vi_tokenizer`) and a stop token filter (`vi_stop`).
5 |
6 | The `vi_analyzer` itself is composed of the `vi_tokenizer` and the `vi_stop` filter.
7 |
8 | The tokenizer uses [coccoc-tokenizer](https://github.com/coccoc/coccoc-tokenizer) for tokenization.
9 |
10 | ## Installation
11 |
12 | Choose a version from the [releases](https://github.com/sun-asterisk-research/elasticsearch-analysis-vi/releases) page to install:
13 |
14 | ```sh
15 | elasticsearch-plugin install https://github.com/sun-asterisk-research/elasticsearch-analysis-vi/releases/download/<version>/<plugin-zip>
16 | ```
17 |
18 | Or [build from source](#build-from-source) and install the resulting plugin bundle from a local file:
19 |
20 | ```sh
21 | elasticsearch-plugin install file:///path/to/plugin
22 | ```
23 |
24 | ## Supported versions
25 |
26 | | Branch | Elasticsearch version |
27 | |--------|-----------------------|
28 | | master | 7.4+ |
29 | | 7.3 | 7.0 - 7.3 |
30 |
31 | ## Build from source
32 |
33 | You need the following build dependencies: `JDK` (at least JDK 11), `make`, `cmake` and `libstdc++`. Beware of your `libstdc++` version: a plugin built against a `libstdc++` that is too new will not load on older systems.
34 |
35 | First update the git submodules:
36 |
37 | ```sh
38 | git submodule update --init
39 | ```
40 |
41 | Build and bundle the plugin:
42 |
43 | ```sh
44 | ./gradlew assemble
45 | ```
46 |
47 | To build for a different Elasticsearch version, add `-PelasticsearchVersion=<version>` to your build command. Also note the [branch and supported versions](#supported-versions). For example, to build for Elasticsearch 7.3.1:
48 |
49 | ```sh
50 | ./gradlew assemble -PelasticsearchVersion=7.3.1
51 | ```
52 |
53 | To run tests:
54 |
55 | ```sh
56 | ./gradlew check
57 | ```
58 |
--------------------------------------------------------------------------------
/build.gradle:
--------------------------------------------------------------------------------
1 | buildscript {
2 | repositories {
3 | mavenCentral()
4 | jcenter()
5 | }
6 |
7 | dependencies {
8 | classpath "org.elasticsearch.gradle:build-tools:${elasticsearchVersion}"
9 | }
10 | }
11 |
12 | apply plugin: 'java'
13 | apply plugin: 'elasticsearch.esplugin'
14 | apply from: 'tokenizer.gradle'
15 |
16 | version = "${pluginVersion}-es${versions.elasticsearch}"
17 |
18 | esplugin {
19 | name 'analysis-vi'
20 | version "${pluginVersion}"
21 | description 'Elasticsearch Vietnamese Analysis Plugin'
22 | classname 'org.elasticsearch.plugin.analysis.vi.AnalysisViPlugin'
23 | licenseFile rootProject.file('LICENSE')
24 | noticeFile rootProject.file('README.md')
25 | }
26 |
27 | integTestRunner {
28 | include 'org/elasticsearch/index/analysis/*Tests.class'
29 | }
30 |
31 | test.enabled = false
32 | licenseHeaders.enabled = false
33 |
--------------------------------------------------------------------------------
/gradle.properties:
--------------------------------------------------------------------------------
1 | pluginVersion=1.0.0
2 | elasticsearchVersion=7.5.1
3 |
--------------------------------------------------------------------------------
/gradle/wrapper/gradle-wrapper.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sun-asterisk-research/elasticsearch-analysis-vi/84b7e5301461ce633f8b6e8aa52104dc37d51af4/gradle/wrapper/gradle-wrapper.jar
--------------------------------------------------------------------------------
/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
1 | distributionBase=GRADLE_USER_HOME
2 | distributionPath=wrapper/dists
3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.0.1-bin.zip
4 | zipStoreBase=GRADLE_USER_HOME
5 | zipStorePath=wrapper/dists
6 |
--------------------------------------------------------------------------------
/gradlew:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env sh
2 |
3 | #
4 | # Copyright 2015 the original author or authors.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # https://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | ##############################################################################
20 | ##
21 | ## Gradle start up script for UN*X
22 | ##
23 | ##############################################################################
24 |
25 | # Attempt to set APP_HOME
26 | # Resolve links: $0 may be a link
27 | PRG="$0"
28 | # Need this for relative symlinks.
29 | while [ -h "$PRG" ] ; do
30 | ls=`ls -ld "$PRG"`
31 | link=`expr "$ls" : '.*-> \(.*\)$'`
32 | if expr "$link" : '/.*' > /dev/null; then
33 | PRG="$link"
34 | else
35 | PRG=`dirname "$PRG"`"/$link"
36 | fi
37 | done
38 | SAVED="`pwd`"
39 | cd "`dirname \"$PRG\"`/" >/dev/null
40 | APP_HOME="`pwd -P`"
41 | cd "$SAVED" >/dev/null
42 |
43 | APP_NAME="Gradle"
44 | APP_BASE_NAME=`basename "$0"`
45 |
46 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
47 | DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
48 |
49 | # Use the maximum available, or set MAX_FD != -1 to use that value.
50 | MAX_FD="maximum"
51 |
52 | warn () {
53 | echo "$*"
54 | }
55 |
56 | die () {
57 | echo
58 | echo "$*"
59 | echo
60 | exit 1
61 | }
62 |
63 | # OS specific support (must be 'true' or 'false').
64 | cygwin=false
65 | msys=false
66 | darwin=false
67 | nonstop=false
68 | case "`uname`" in
69 | CYGWIN* )
70 | cygwin=true
71 | ;;
72 | Darwin* )
73 | darwin=true
74 | ;;
75 | MINGW* )
76 | msys=true
77 | ;;
78 | NONSTOP* )
79 | nonstop=true
80 | ;;
81 | esac
82 |
83 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
84 |
85 | # Determine the Java command to use to start the JVM.
86 | if [ -n "$JAVA_HOME" ] ; then
87 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
88 | # IBM's JDK on AIX uses strange locations for the executables
89 | JAVACMD="$JAVA_HOME/jre/sh/java"
90 | else
91 | JAVACMD="$JAVA_HOME/bin/java"
92 | fi
93 | if [ ! -x "$JAVACMD" ] ; then
94 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
95 |
96 | Please set the JAVA_HOME variable in your environment to match the
97 | location of your Java installation."
98 | fi
99 | else
100 | JAVACMD="java"
101 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
102 |
103 | Please set the JAVA_HOME variable in your environment to match the
104 | location of your Java installation."
105 | fi
106 |
107 | # Increase the maximum file descriptors if we can.
108 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
109 | MAX_FD_LIMIT=`ulimit -H -n`
110 | if [ $? -eq 0 ] ; then
111 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
112 | MAX_FD="$MAX_FD_LIMIT"
113 | fi
114 | ulimit -n $MAX_FD
115 | if [ $? -ne 0 ] ; then
116 | warn "Could not set maximum file descriptor limit: $MAX_FD"
117 | fi
118 | else
119 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
120 | fi
121 | fi
122 |
123 | # For Darwin, add options to specify how the application appears in the dock
124 | if $darwin; then
125 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
126 | fi
127 |
128 | # For Cygwin or MSYS, switch paths to Windows format before running java
129 | if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
130 | APP_HOME=`cygpath --path --mixed "$APP_HOME"`
131 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
132 | JAVACMD=`cygpath --unix "$JAVACMD"`
133 |
134 | # We build the pattern for arguments to be converted via cygpath
135 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
136 | SEP=""
137 | for dir in $ROOTDIRSRAW ; do
138 | ROOTDIRS="$ROOTDIRS$SEP$dir"
139 | SEP="|"
140 | done
141 | OURCYGPATTERN="(^($ROOTDIRS))"
142 | # Add a user-defined pattern to the cygpath arguments
143 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then
144 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
145 | fi
146 | # Now convert the arguments - kludge to limit ourselves to /bin/sh
147 | i=0
148 | for arg in "$@" ; do
149 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
150 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
151 |
152 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
153 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
154 | else
155 | eval `echo args$i`="\"$arg\""
156 | fi
157 | i=$((i+1))
158 | done
159 | case $i in
160 | (0) set -- ;;
161 | (1) set -- "$args0" ;;
162 | (2) set -- "$args0" "$args1" ;;
163 | (3) set -- "$args0" "$args1" "$args2" ;;
164 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
165 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
166 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
167 | (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
168 | (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
169 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
170 | esac
171 | fi
172 |
173 | # Escape application args
174 | save () {
175 | for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
176 | echo " "
177 | }
178 | APP_ARGS=$(save "$@")
179 |
180 | # Collect all arguments for the java command, following the shell quoting and substitution rules
181 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
182 |
183 | # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong
184 | if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then
185 | cd "$(dirname "$0")"
186 | fi
187 |
188 | exec "$JAVACMD" "$@"
189 |
--------------------------------------------------------------------------------
/gradlew.bat:
--------------------------------------------------------------------------------
1 | @rem
2 | @rem Copyright 2015 the original author or authors.
3 | @rem
4 | @rem Licensed under the Apache License, Version 2.0 (the "License");
5 | @rem you may not use this file except in compliance with the License.
6 | @rem You may obtain a copy of the License at
7 | @rem
8 | @rem https://www.apache.org/licenses/LICENSE-2.0
9 | @rem
10 | @rem Unless required by applicable law or agreed to in writing, software
11 | @rem distributed under the License is distributed on an "AS IS" BASIS,
12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | @rem See the License for the specific language governing permissions and
14 | @rem limitations under the License.
15 | @rem
16 |
17 | @if "%DEBUG%" == "" @echo off
18 | @rem ##########################################################################
19 | @rem
20 | @rem Gradle startup script for Windows
21 | @rem
22 | @rem ##########################################################################
23 |
24 | @rem Set local scope for the variables with windows NT shell
25 | if "%OS%"=="Windows_NT" setlocal
26 |
27 | set DIRNAME=%~dp0
28 | if "%DIRNAME%" == "" set DIRNAME=.
29 | set APP_BASE_NAME=%~n0
30 | set APP_HOME=%DIRNAME%
31 |
32 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
33 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
34 |
35 | @rem Find java.exe
36 | if defined JAVA_HOME goto findJavaFromJavaHome
37 |
38 | set JAVA_EXE=java.exe
39 | %JAVA_EXE% -version >NUL 2>&1
40 | if "%ERRORLEVEL%" == "0" goto init
41 |
42 | echo.
43 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
44 | echo.
45 | echo Please set the JAVA_HOME variable in your environment to match the
46 | echo location of your Java installation.
47 |
48 | goto fail
49 |
50 | :findJavaFromJavaHome
51 | set JAVA_HOME=%JAVA_HOME:"=%
52 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe
53 |
54 | if exist "%JAVA_EXE%" goto init
55 |
56 | echo.
57 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
58 | echo.
59 | echo Please set the JAVA_HOME variable in your environment to match the
60 | echo location of your Java installation.
61 |
62 | goto fail
63 |
64 | :init
65 | @rem Get command-line arguments, handling Windows variants
66 |
67 | if not "%OS%" == "Windows_NT" goto win9xME_args
68 |
69 | :win9xME_args
70 | @rem Slurp the command line arguments.
71 | set CMD_LINE_ARGS=
72 | set _SKIP=2
73 |
74 | :win9xME_args_slurp
75 | if "x%~1" == "x" goto execute
76 |
77 | set CMD_LINE_ARGS=%*
78 |
79 | :execute
80 | @rem Setup the command line
81 |
82 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
83 |
84 | @rem Execute Gradle
85 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
86 |
87 | :end
88 | @rem End local scope for the variables with windows NT shell
89 | if "%ERRORLEVEL%"=="0" goto mainEnd
90 |
91 | :fail
92 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
93 | rem the _cmd.exe /c_ return code!
94 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
95 | exit /b 1
96 |
97 | :mainEnd
98 | if "%OS%"=="Windows_NT" endlocal
99 |
100 | :omega
101 |
--------------------------------------------------------------------------------
/settings.gradle:
--------------------------------------------------------------------------------
1 | rootProject.name = 'elasticsearch-analysis-vi'
2 |
--------------------------------------------------------------------------------
/src/main/java/com/coccoc/Token.java:
--------------------------------------------------------------------------------
1 | package com.coccoc;
2 |
3 | public final class Token {
4 | public enum Type {
5 | WORD,
6 | NUMBER;
7 |
8 | private static Type[] values = null;
9 |
10 | static {
11 | Type.values = Type.values();
12 | }
13 |
14 | public static Type fromInt(int i) {
15 | return Type.values[i];
16 | }
17 | }
18 |
19 | private final String text;
20 | private final Type type;
21 | private final int originalStart;
22 | private final int originalEnd;
23 |
24 | public Token(String text, int type, int originalStart, int originalEnd) {
25 | this(text, Type.fromInt(type), originalStart, originalEnd);
26 | }
27 |
28 | public Token(String text, Type type, int originalStart, int originalEnd) {
29 | this.text = text;
30 | this.type = type;
31 | this.originalStart = originalStart;
32 | this.originalEnd = originalEnd;
33 | }
34 |
35 | public String getText() {
36 | return text;
37 | }
38 |
39 | public int getLength() {
40 | return text.length();
41 | }
42 |
43 | public Type getType() {
44 | return type;
45 | }
46 |
47 | public int getOriginalStart() {
48 | return originalStart;
49 | }
50 |
51 | public int getOriginalEnd() {
52 | return originalEnd;
53 | }
54 |
55 | public String toString() {
56 | return text;
57 | }
58 |
59 | public char[] toCharArray() {
60 | return text.toCharArray();
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/src/main/java/com/coccoc/Tokenizer.java:
--------------------------------------------------------------------------------
1 | package com.coccoc;
2 |
3 | import java.io.IOException;
4 | import java.io.Reader;
5 | import java.net.URI;
6 | import java.net.URISyntaxException;
7 | import java.nio.ByteBuffer;
8 | import java.nio.ByteOrder;
9 | import java.nio.IntBuffer;
10 | import java.security.AccessController;
11 | import java.security.PrivilegedAction;
12 | import java.util.ArrayList;
13 | import java.util.List;
14 |
15 | public class Tokenizer {
16 | public enum Mode {
17 | NORMAL(0),
18 | HOST(1),
19 | URL(2);
20 |
21 | private int value;
22 |
23 | Mode(int value) {
24 | this.value = value;
25 | }
26 | }
27 |
28 | private static final String libPath;
29 | private static final String dictPath;
30 |
31 | private native ByteBuffer[] segment(String text, int tokenizeOption);
32 | private native void freeMemory(ByteBuffer p);
33 | private native void initialize(String dictPath) throws RuntimeException;
34 |
35 | private static Tokenizer instance;
36 |
37 | static {
38 | try {
39 | URI file = Tokenizer.class.getProtectionDomain().getCodeSource().getLocation().toURI();
40 | libPath = file.resolve("lib/libcoccoc_tokenizer_jni.so").getPath();
41 | dictPath = file.resolve("dicts").getPath();
42 | } catch (URISyntaxException e) {
43 | throw new RuntimeException("Could not initialize Tokenizer");
44 | }
45 | }
46 |
47 | public static Tokenizer getInstance() {
48 | if (instance == null) {
49 | instance = new Tokenizer();
50 | }
51 |
52 | return instance;
53 | }
54 |
55 | private Tokenizer() {
56 | AccessController.doPrivileged((PrivilegedAction<Void>) () -> {
57 | System.load(libPath);
58 | return null;
59 | });
60 | initialize(dictPath);
61 | }
62 |
63 | public List<Token> tokenize(String text, Mode mode) {
64 | if (text == null) {
65 | throw new IllegalArgumentException("text is null");
66 | }
67 |
68 | ByteBuffer[] segmentResults = segment(text, mode.value);
69 |
70 | IntBuffer normalizedChars = segmentResults[0].order(ByteOrder.nativeOrder()).asIntBuffer();
71 | IntBuffer rawTokens = segmentResults[1].order(ByteOrder.nativeOrder()).asIntBuffer();
72 | ByteBuffer pointers = segmentResults[2];
73 |
74 | StringBuilder sb = new StringBuilder();
75 |
76 | while (normalizedChars.hasRemaining()) {
77 | sb.appendCodePoint(normalizedChars.get());
78 | }
79 |
80 | String normalizedText = sb.toString();
81 |
82 | int tokensCount = rawTokens.capacity() / 6;
83 | List<Token> tokens = new ArrayList<>(tokensCount);
84 |
85 | for (int i = 0; i < tokensCount; i++) {
86 | int offset = i * 6;
87 | int normalizedStart = rawTokens.get(offset);
88 | int normalizedEnd = rawTokens.get(offset + 1);
89 | int originalStart = rawTokens.get(offset + 2);
90 | int originalEnd = rawTokens.get(offset + 3);
91 |
92 | String tokenText = normalizedText.substring(normalizedStart, normalizedEnd);
93 | int tokenType = rawTokens.get(offset + 4);
94 |
95 | tokens.add(new Token(tokenText, tokenType, originalStart, originalEnd));
96 | }
97 |
98 | freeMemory(pointers);
99 |
100 | return tokens;
101 | }
102 |
103 | public List<Token> tokenize(Reader input, Mode mode) throws IOException {
104 | char[] buffer = new char[1024];
105 | StringBuilder sb = new StringBuilder();
106 | int numCharsRead;
107 | while ((numCharsRead = input.read(buffer, 0, buffer.length)) != -1) {
108 | sb.append(buffer, 0, numCharsRead);
109 | }
110 |
111 | return tokenize(sb.toString(), mode);
112 | }
113 | }
114 |
--------------------------------------------------------------------------------
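A minimal usage sketch of the JNI-backed tokenizer above. The `TokenizerDemo` class name and sample sentence are hypothetical, and the call only works where `lib/libcoccoc_tokenizer_jni.so` and the `dicts/` directory sit next to the jar, as resolved in the static initializer.

```java
import java.util.List;

import com.coccoc.Token;
import com.coccoc.Tokenizer;

public class TokenizerDemo {
    public static void main(String[] args) {
        // getInstance() loads the native library and dictionaries resolved
        // relative to the jar location, so run this against an installed bundle.
        Tokenizer tokenizer = Tokenizer.getInstance();

        // Segment a sentence in NORMAL mode and print each token with its
        // original offsets and type (word or number).
        List<Token> tokens = tokenizer.tokenize("Công nghệ thông tin Việt Nam", Tokenizer.Mode.NORMAL);
        for (Token token : tokens) {
            System.out.printf("%s [%d..%d] %s%n",
                    token.getText(), token.getOriginalStart(), token.getOriginalEnd(), token.getType());
        }
    }
}
```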
/src/main/java/org/apache/lucene/analysis/vi/VietnameseAnalyzer.java:
--------------------------------------------------------------------------------
1 | package org.apache.lucene.analysis.vi;
2 |
3 | import java.io.IOException;
4 |
5 | import com.coccoc.Tokenizer.Mode;
6 |
7 | import org.apache.lucene.analysis.CharArraySet;
8 | import org.apache.lucene.analysis.StopwordAnalyzerBase;
9 | import org.apache.lucene.analysis.TokenStream;
10 | import org.apache.lucene.analysis.Tokenizer;
11 | import org.apache.lucene.analysis.core.StopFilter;
12 |
13 | public class VietnameseAnalyzer extends StopwordAnalyzerBase {
14 | private final Mode mode;
15 |
16 | public VietnameseAnalyzer(Mode mode, CharArraySet stopwords) {
17 | super(stopwords);
18 | this.mode = mode;
19 | }
20 |
21 | public static CharArraySet getDefaultStopSet() {
22 | return DefaultSetHolder.DEFAULT_STOP_SET;
23 | }
24 |
25 | private static class DefaultSetHolder {
26 | static final CharArraySet DEFAULT_STOP_SET;
27 |
28 | static {
29 | try {
30 | DEFAULT_STOP_SET = loadStopwordSet(true, VietnameseAnalyzer.class, "/stopwords.txt", "#");
31 | } catch (IOException e) {
32 | throw new RuntimeException("Unable to load default stopword set");
33 | }
34 | }
35 | }
36 |
37 | @Override
38 | protected TokenStreamComponents createComponents(String fieldName) {
39 | Tokenizer tokenizer = new VietnameseTokenizer(mode);
40 | TokenStream stream = new StopFilter(tokenizer, stopwords);
41 |
42 | return new TokenStreamComponents(tokenizer, stream);
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
/src/main/java/org/apache/lucene/analysis/vi/VietnameseTokenizer.java:
--------------------------------------------------------------------------------
1 | package org.apache.lucene.analysis.vi;
2 |
3 | import java.io.IOException;
4 | import java.util.Iterator;
5 | import java.util.Locale;
6 |
7 | import com.coccoc.Token;
8 | import com.coccoc.Tokenizer.Mode;
9 |
10 | import org.apache.lucene.analysis.Tokenizer;
11 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
12 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
13 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
14 |
15 | public class VietnameseTokenizer extends Tokenizer {
16 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
17 | private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
18 | private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
19 |
20 | private Mode tokenizeMode;
21 | private com.coccoc.Tokenizer tokenizer;
22 | private Iterator<Token> tokens;
23 |
24 | private int currentOffset = 0;
25 |
26 | public VietnameseTokenizer(Mode tokenizeMode) {
27 | this.tokenizeMode = tokenizeMode;
28 | this.tokenizer = com.coccoc.Tokenizer.getInstance();
29 | }
30 |
31 | @Override
32 | public boolean incrementToken() throws IOException {
33 | clearAttributes();
34 |
35 | if (tokens.hasNext()) {
36 | final Token token = tokens.next();
37 | final int tokenLength = token.getLength();
38 | final int start = correctOffset(token.getOriginalStart());
39 | final int end = correctOffset(token.getOriginalEnd());
40 |
41 | termAtt.copyBuffer(token.toCharArray(), 0, tokenLength);
42 | typeAtt.setType(token.getType().name().toLowerCase(Locale.ROOT));
43 | offsetAtt.setOffset(start, end);
44 | currentOffset = end;
45 |
46 | return true;
47 | }
48 |
49 | return false;
50 | }
51 |
52 | @Override
53 | public void end() throws IOException {
54 | super.end();
55 | int finalOffset = correctOffset(currentOffset);
56 | offsetAtt.setOffset(finalOffset, finalOffset);
57 | }
58 |
59 | @Override
60 | public void reset() throws IOException {
61 | super.reset();
62 | currentOffset = 0;
63 | tokens = tokenizer.tokenize(input, tokenizeMode).iterator();
64 | }
65 | }
66 |
--------------------------------------------------------------------------------
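To see how the tokenizer and the stop filter combine at the Lucene level, here is a hedged sketch; the `AnalyzeDemo` class name and sample text are illustrative, and it has the same native-library requirement as the tokenizer itself.

```java
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.vi.VietnameseAnalyzer;

import com.coccoc.Tokenizer.Mode;

public class AnalyzeDemo {
    public static void main(String[] args) throws Exception {
        // Build the analyzer with the bundled default stop set (stopwords.txt).
        try (VietnameseAnalyzer analyzer = new VietnameseAnalyzer(Mode.NORMAL, VietnameseAnalyzer.getDefaultStopSet());
             TokenStream stream = analyzer.tokenStream("content", "Công nghệ thông tin Việt Nam")) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);

            // Standard Lucene consumption loop: reset, iterate, end.
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term + " [" + offset.startOffset() + ", " + offset.endOffset() + ")");
            }
            stream.end();
        }
    }
}
```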
/src/main/java/org/elasticsearch/index/analysis/VietnameseAnalyzerProvider.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | import com.coccoc.Tokenizer.Mode;
4 |
5 | import org.apache.lucene.analysis.CharArraySet;
6 | import org.apache.lucene.analysis.vi.VietnameseAnalyzer;
7 | import org.elasticsearch.common.settings.Settings;
8 | import org.elasticsearch.env.Environment;
9 | import org.elasticsearch.index.IndexSettings;
10 |
11 | public class VietnameseAnalyzerProvider extends AbstractIndexAnalyzerProvider<VietnameseAnalyzer> {
12 | private final VietnameseAnalyzer analyzer;
13 |
14 | public VietnameseAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
15 | super(indexSettings, name, settings);
16 |
17 | final Mode tokenizeMode = VietnameseTokenizerFactory.getTokenizeMode(settings);
18 | final CharArraySet stopwords = Analysis.parseStopWords(env, settings, VietnameseAnalyzer.getDefaultStopSet());
19 | analyzer = new VietnameseAnalyzer(tokenizeMode, stopwords);
20 | }
21 |
22 | @Override
23 | public VietnameseAnalyzer get() {
24 | return this.analyzer;
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/VietnameseStopTokenFilterFactory.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | import java.util.Collections;
4 | import java.util.Map;
5 | import java.util.Set;
6 |
7 | import org.apache.lucene.analysis.CharArraySet;
8 | import org.apache.lucene.analysis.TokenStream;
9 | import org.apache.lucene.analysis.core.StopFilter;
10 | import org.apache.lucene.analysis.vi.VietnameseAnalyzer;
11 | import org.apache.lucene.search.suggest.analyzing.SuggestStopFilter;
12 | import org.elasticsearch.common.settings.Settings;
13 | import org.elasticsearch.env.Environment;
14 | import org.elasticsearch.index.IndexSettings;
15 |
16 | public class VietnameseStopTokenFilterFactory extends AbstractTokenFilterFactory {
17 | private static final Map<String, Set<?>> NAMED_STOP_WORDS;
18 | private final CharArraySet stopwords;
19 | private final boolean ignoreCase;
20 | private final boolean removeTrailing;
21 |
22 | static {
23 | NAMED_STOP_WORDS = Collections.singletonMap("_vietnamese_", VietnameseAnalyzer.getDefaultStopSet());
24 | }
25 |
26 | public VietnameseStopTokenFilterFactory(
27 | IndexSettings indexSettings, Environment env, String name, Settings settings
28 | ) {
29 | super(indexSettings, name, settings);
30 | this.ignoreCase = settings.getAsBoolean("ignore_case", false);
31 | this.removeTrailing = settings.getAsBoolean("remove_trailing", true);
32 | this.stopwords = Analysis.parseWords(
33 | env, settings, "stopwords", VietnameseAnalyzer.getDefaultStopSet(), NAMED_STOP_WORDS, ignoreCase
34 | );
35 | }
36 |
37 | @Override
38 | public TokenStream create(TokenStream tokenStream) {
39 | return removeTrailing
40 | ? new StopFilter(tokenStream, stopwords)
41 | : new SuggestStopFilter(tokenStream, stopwords);
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/VietnameseTokenizerFactory.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | import java.util.Locale;
4 |
5 | import com.coccoc.Tokenizer.Mode;
6 |
7 | import org.apache.lucene.analysis.Tokenizer;
8 | import org.apache.lucene.analysis.vi.VietnameseTokenizer;
9 | import org.elasticsearch.common.settings.Settings;
10 | import org.elasticsearch.env.Environment;
11 | import org.elasticsearch.index.IndexSettings;
12 |
13 | public class VietnameseTokenizerFactory extends AbstractTokenizerFactory {
14 | private final Mode tokenizeMode;
15 |
16 | public VietnameseTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
17 | super(indexSettings, settings, name);
18 |
19 | tokenizeMode = getTokenizeMode(settings);
20 | }
21 |
22 | public static Mode getTokenizeMode(Settings settings)
23 | {
24 | String modeSetting = settings.get("mode", "normal").toUpperCase(Locale.ROOT);
25 | return Mode.valueOf(modeSetting);
26 | }
27 |
28 | public Tokenizer create() {
29 | return new VietnameseTokenizer(tokenizeMode);
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/analysis/vi/AnalysisViPlugin.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.analysis.vi;
2 |
3 | import java.util.Collections;
4 | import java.util.HashMap;
5 | import java.util.Map;
6 |
7 | import org.apache.lucene.analysis.Analyzer;
8 | import org.elasticsearch.index.analysis.AnalyzerProvider;
9 | import org.elasticsearch.index.analysis.TokenFilterFactory;
10 | import org.elasticsearch.index.analysis.TokenizerFactory;
11 | import org.elasticsearch.index.analysis.VietnameseAnalyzerProvider;
12 | import org.elasticsearch.index.analysis.VietnameseStopTokenFilterFactory;
13 | import org.elasticsearch.index.analysis.VietnameseTokenizerFactory;
14 | import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
15 | import org.elasticsearch.plugins.AnalysisPlugin;
16 | import org.elasticsearch.plugins.Plugin;
17 |
18 | public class AnalysisViPlugin extends Plugin implements AnalysisPlugin {
19 | @Override
20 | public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
21 | Map<String, AnalysisProvider<TokenFilterFactory>> extra = new HashMap<>();
22 | extra.put("vi_stop", VietnameseStopTokenFilterFactory::new);
23 |
24 | return extra;
25 | }
26 |
27 | @Override
28 | public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
29 | return Collections.singletonMap("vi_tokenizer", VietnameseTokenizerFactory::new);
30 | }
31 |
32 | @Override
33 | public Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
34 | return Collections.singletonMap("vi_analyzer", VietnameseAnalyzerProvider::new);
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/src/main/jni/Tokenizer.cpp:
--------------------------------------------------------------------------------
1 | #include <tokenizer/tokenizer.hpp>
2 | #include "com_coccoc_Tokenizer.h"
3 |
4 | static jclass java_nio_ByteBuffer;
5 | static jint JNI_VERSION = JNI_VERSION_10;
6 |
7 | jint JNI_OnLoad(JavaVM *vm, void *reserved)
8 | {
9 | JNIEnv* env;
10 | if (vm->GetEnv(reinterpret_cast<void **>(&env), JNI_VERSION) != JNI_OK) {
11 | return JNI_ERR;
12 | }
13 |
14 | java_nio_ByteBuffer = static_cast<jclass>(env->NewGlobalRef(env->FindClass("java/nio/ByteBuffer")));
15 |
16 | return JNI_VERSION;
17 | }
18 |
19 | void JNI_OnUnload(JavaVM *vm, void *reserved)
20 | {
21 | JNIEnv* env;
22 | vm->GetEnv(reinterpret_cast<void **>(&env), JNI_VERSION);
23 |
24 | env->DeleteGlobalRef(java_nio_ByteBuffer);
25 | }
26 |
27 | /**
28 | * Segment a document and return an array of direct ByteBuffers referring
29 | * to the segmentation result vectors. Further processing happens in Java code.
30 | * The method returns 3 ByteBuffers: the first holds the normalized text,
31 | * the second contains Token structs (see token.hpp), and the last contains
32 | * pointers to the dynamically allocated vectors, used for cleanup when done.
33 | */
34 | JNIEXPORT jobjectArray JNICALL Java_com_coccoc_Tokenizer_segment(
35 | JNIEnv *env, jobject obj, jstring jni_text, jint tokenize_option)
36 | {
37 | const jchar *jtext = env->GetStringCritical(jni_text, nullptr);
38 | int text_length = env->GetStringLength(jni_text);
39 |
40 | // Use pointer to avoid automatic deallocation
41 | // Must call `freeMemory` when done to clean up
42 | std::vector< uint32_t > *normalized = new std::vector< uint32_t >();
43 | normalized->reserve(text_length);
44 |
45 | std::vector< int > original_pos;
46 | Tokenizer::instance().normalize_for_tokenization(jtext, text_length, *normalized, original_pos, true);
47 | env->ReleaseStringCritical(jni_text, jtext);
48 |
49 | // Use pointer here too
50 | std::vector< Token > *tokens = new std::vector< Token >();
51 | // space_positions is only used when `for_transforming` is true?
52 | std::vector< int > space_positions;
53 |
54 | Tokenizer::instance().handle_tokenization_request< Token >(
55 | *normalized, *tokens, space_positions, original_pos, false, tokenize_option);
56 |
57 | for (size_t i = 0; i < tokens->size(); ++i)
58 | {
59 | tokens->at(i).original_start += original_pos[tokens->at(i).normalized_start];
60 | tokens->at(i).original_end += original_pos[tokens->at(i).normalized_end];
61 | }
62 |
63 | // Keep pointers to original vectors in another array so we can clean up later
64 | // When done, pass this pointer (ByteBuffer) to `freeMemory` to clean up
65 | int64_t *p = new int64_t[2];
66 | p[0] = (int64_t) normalized;
67 | p[1] = (int64_t) tokens;
68 |
69 | jobjectArray results = env->NewObjectArray(3, java_nio_ByteBuffer, nullptr);
70 |
71 | env->SetObjectArrayElement(results, 0, env->NewDirectByteBuffer(normalized->data(), normalized->size() * 4));
72 | env->SetObjectArrayElement(results, 1, env->NewDirectByteBuffer(tokens->data(), tokens->size() * 6 * 4));
73 | env->SetObjectArrayElement(results, 2, env->NewDirectByteBuffer(p, 0));
74 |
75 | return results;
76 | }
77 |
78 | JNIEXPORT void JNICALL Java_com_coccoc_Tokenizer_freeMemory(JNIEnv *env, jobject obj, jobject res_pointer)
79 | {
80 | // Cast each object pointer to their respective type, must be careful
81 | int64_t *p = static_cast<int64_t *>(env->GetDirectBufferAddress(res_pointer));
82 | delete (std::vector< uint32_t > *) (p[0]);
83 | delete (std::vector< Token > *) (p[1]);
84 | delete[](int64_t *) p;
85 | }
86 |
87 | JNIEXPORT void JNICALL Java_com_coccoc_Tokenizer_initialize(JNIEnv *env, jobject obj, jstring jni_dict_path)
88 | {
89 | const char *dict_path = env->GetStringUTFChars(jni_dict_path, nullptr);
90 | int status_code = Tokenizer::instance().initialize(std::string(dict_path));
91 |
92 | if (status_code != 0) {
93 | jclass java_lang_RuntimeException = env->FindClass("java/lang/RuntimeException");
94 |
95 | env->ThrowNew(java_lang_RuntimeException, "Could not load dictionary");
96 | }
97 |
98 | env->ReleaseStringUTFChars(jni_dict_path, dict_path);
99 | }
100 |
--------------------------------------------------------------------------------
/src/main/plugin-metadata/plugin-security.policy:
--------------------------------------------------------------------------------
1 | grant {
2 | permission java.lang.RuntimePermission "loadLibrary.*";
3 | };
4 |
--------------------------------------------------------------------------------
/src/main/resources/stopwords.txt:
--------------------------------------------------------------------------------
1 | bị
2 | bởi
3 | cả
4 | các
5 | cái
6 | cần
7 | càng
8 | chỉ
9 | chiếc
10 | cho
11 | chứ
12 | chưa
13 | chuyện
14 | có
15 | có thể
16 | cứ
17 | của
18 | cùng
19 | cũng
20 | đã
21 | đang
22 | để
23 | đến nỗi
24 | đều
25 | điều
26 | do
27 | đó
28 | được
29 | dưới
30 | gì
31 | khi
32 | không
33 | là
34 | lại
35 | lên
36 | lúc
37 | mà
38 | mỗi
39 | một cách
40 | này
41 | nên
42 | nếu
43 | ngay
44 | nhiều
45 | như
46 | nhưng
47 | những
48 | nơi
49 | nữa
50 | phải
51 | qua
52 | ra
53 | rằng
54 | rất
55 | rồi
56 | sau
57 | sẽ
58 | so
59 | sự
60 | tại
61 | theo
62 | thì
63 | trên
64 | trước
65 | từ
66 | từng
67 | và
68 | vẫn
69 | vào
70 | vậy
71 | vì
72 | việc
73 | với
74 | vừa
75 | vâng
76 | à
77 | ừ
78 | từ
79 |
--------------------------------------------------------------------------------
/src/test/java/org/elasticsearch/index/analysis/VietnameseAnalysisIntegrationTests.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | import java.util.Collection;
4 | import java.util.Collections;
5 |
6 | import org.elasticsearch.action.admin.cluster.node.info.NodeInfo;
7 | import org.elasticsearch.action.admin.cluster.node.info.NodesInfoResponse;
8 | import org.elasticsearch.plugin.analysis.vi.AnalysisViPlugin;
9 | import org.elasticsearch.plugins.Plugin;
10 | import org.elasticsearch.test.ESIntegTestCase;
11 |
12 | import static org.hamcrest.Matchers.is;
13 |
14 | public class VietnameseAnalysisIntegrationTests extends ESIntegTestCase {
15 | @Override
16 | protected Collection<Class<? extends Plugin>> nodePlugins() {
17 | return Collections.singleton(AnalysisViPlugin.class);
18 | }
19 |
20 | public void testPluginIsLoaded() throws Exception {
21 | NodesInfoResponse response = client().admin().cluster().prepareNodesInfo().setPlugins(true).get();
22 |
23 | for (NodeInfo nodeInfo : response.getNodes()) {
24 | boolean pluginLoaded = nodeInfo.getPlugins().getPluginInfos()
25 | .stream().anyMatch(plugin -> plugin.getName().equals("analysis-vi"));
26 |
27 | assertThat(pluginLoaded, is(true));
28 | }
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
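A possible companion test, sketched here rather than taken from the repository, wires the registered names into index analysis settings. It assumes the usual `ESIntegTestCase` helpers (`prepareCreate`, `ensureGreen`) and `ElasticsearchAssertions.assertAcked`; the class and test names are hypothetical.

```java
package org.elasticsearch.index.analysis;

import java.util.Collection;
import java.util.Collections;

import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.plugin.analysis.vi.AnalysisViPlugin;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.test.ESIntegTestCase;

import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;

public class VietnameseAnalysisSettingsTests extends ESIntegTestCase {
    @Override
    protected Collection<Class<? extends Plugin>> nodePlugins() {
        return Collections.singleton(AnalysisViPlugin.class);
    }

    public void testCreateIndexWithVietnameseAnalysis() throws Exception {
        // Build a custom analyzer from the plugin's registered tokenizer and stop filter.
        Settings settings = Settings.builder()
                .put("index.analysis.analyzer.my_vi.type", "custom")
                .put("index.analysis.analyzer.my_vi.tokenizer", "vi_tokenizer")
                .putList("index.analysis.analyzer.my_vi.filter", "vi_stop")
                .build();

        assertAcked(prepareCreate("test").setSettings(settings));
        ensureGreen("test");
    }
}
```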
/tokenizer.gradle:
--------------------------------------------------------------------------------
1 | def JAVA_HOME = System.getenv('JAVA_HOME') ?: System.getenv('JDK_HOME')
2 |
3 | task configureTokenizer(type:Exec) {
4 | outputs.dir 'build/tokenizer'
5 |
6 | workingDir 'build/tokenizer'
7 | commandLine 'cmake', '../../coccoc-tokenizer'
8 | }
9 |
10 | task compileDict(type:Exec) {
11 | outputs.dir 'build/tokenizer'
12 | outputs.files 'multiterm_trie.dump', 'nontone_pair_freq_map.dump', 'syllable_trie.dump'
13 |
14 | dependsOn 'configureTokenizer'
15 | workingDir 'build/tokenizer'
16 | commandLine 'make', 'compile_dict'
17 | }
18 |
19 | task compileTokenizer(type:Exec) {
20 | outputs.dirs 'build/lib'
21 |
22 | dependsOn 'configureTokenizer'
23 | dependsOn 'compileJava'
24 |
25 | commandLine 'g++', '-Wall', '-Werror', '-Wno-deprecated', '-shared', '-std=c++11', '-O3', '-DNDEBUG', '-ggdb', '-fPIC',
26 | '-I', 'coccoc-tokenizer',
27 | '-I', 'build/headers',
28 | '-I', "build/tokenizer/auto",
29 | '-I', "${JAVA_HOME}/include",
30 | '-I', "${JAVA_HOME}/include/linux",
31 | '-o', 'build/lib/libcoccoc_tokenizer_jni.so',
32 | 'src/main/jni/Tokenizer.cpp'
33 | }
34 |
35 | compileJava {
36 | options.compilerArgs += ['-h', file('build/headers')]
37 | }
38 |
39 | bundlePlugin {
40 | dependsOn 'compileDict'
41 | dependsOn 'compileTokenizer'
42 |
43 | from('build/lib') {
44 | into 'lib'
45 | }
46 |
47 | from('build/tokenizer') {
48 | include '*.dump'
49 | into 'dicts'
50 | }
51 |
52 | from('coccoc-tokenizer/dicts/vn_lang_tool') {
53 | into 'dicts'
54 | }
55 | }
56 |
--------------------------------------------------------------------------------