├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── build.gradle ├── gradle.properties ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat ├── settings.gradle ├── src ├── main │ ├── java │ │ ├── com │ │ │ └── coccoc │ │ │ │ ├── Token.java │ │ │ │ └── Tokenizer.java │ │ └── org │ │ │ ├── apache │ │ │ └── lucene │ │ │ │ └── analysis │ │ │ │ └── vi │ │ │ │ ├── VietnameseAnalyzer.java │ │ │ │ └── VietnameseTokenizer.java │ │ │ └── elasticsearch │ │ │ ├── index │ │ │ └── analysis │ │ │ │ ├── VietnameseAnalyzerProvider.java │ │ │ │ ├── VietnameseStopTokenFilterFactory.java │ │ │ │ └── VietnameseTokenizerFactory.java │ │ │ └── plugin │ │ │ └── analysis │ │ │ └── vi │ │ │ └── AnalysisViPlugin.java │ ├── jni │ │ └── Tokenizer.cpp │ ├── plugin-metadata │ │ └── plugin-security.policy │ └── resources │ │ └── stopwords.txt └── test │ └── java │ └── org │ └── elasticsearch │ └── index │ └── analysis │ └── VietnameseAnalysisIntegrationTests.java └── tokenizer.gradle /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled class file 2 | *.class 3 | 4 | # Log file 5 | *.log 6 | 7 | # BlueJ files 8 | *.ctxt 9 | 10 | # Mobile Tools for Java (J2ME) 11 | .mtj.tmp/ 12 | 13 | # Package Files # 14 | *.jar 15 | *.war 16 | *.nar 17 | *.ear 18 | *.zip 19 | *.tar.gz 20 | *.rar 21 | 22 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 23 | hs_err_pid* 24 | 25 | # Gradle 26 | .gradle 27 | build 28 | !gradle-wrapper.jar 29 | 30 | # Eclipse project files 31 | .project 32 | .classpath 33 | .settings 34 | 35 | # IDE 36 | .vscode 37 | .idea 38 | *.iml 39 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "coccoc-tokenizer"] 2 | path = coccoc-tokenizer 3 | url = https://github.com/coccoc/coccoc-tokenizer 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 
31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 
98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 
160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Vietnamese Analysis Plugin for Elasticsearch 2 | 3 | The Vietnamese Analysis plugin integrates Vietnamese language analysis into Elasticsearch. 4 | The plugin provides the following functions: 5 | 6 | Analyzer: `vi_analyzer`. Tokenizer: `vi_tokenizer`. Filter: `vi_stop`. The `vi_analyzer` itself is composed of the `vi_tokenizer` and the `vi_stop` filter. 7 | 8 | The tokenizer uses [coccoc-tokenizer](https://github.com/coccoc/coccoc-tokenizer) for tokenization. 9 | 10 | ## Installation 11 | 12 | Choose a version from the [releases](https://github.com/sun-asterisk-research/elasticsearch-analysis-vi/releases) page to install: 13 | 14 | ```sh 15 | elasticsearch-plugin install https://github.com/sun-asterisk-research/elasticsearch-analysis-vi/releases/download/<version>/<plugin-zip> 16 | ``` 17 | 18 | Or [build from source](#build-from-source) and install from a plugin bundle: 19 | 20 | ```sh 21 | elasticsearch-plugin install file:///path/to/plugin 22 | ``` 23 | 24 | ## Supported versions 25 | 26 | | Branch | Elasticsearch version | 27 | |--------|-----------------------| 28 | | master | 7.4+ | 29 | | 7.3 | 7.0 - 7.3 | 30 | 31 | ## Build from source 32 | 33 | You need the following build dependencies: `JDK` (11 or later), `make`, `cmake`, and `libstdc++`. Beware of your `libstdc++` version: a library built against a newer `libstdc++` will not load on systems that ship an older one. 34 | 35 | First update the git submodules: 36 | 37 | ```sh 38 | git submodule update --init 39 | ``` 40 | 41 | Build and bundle the plugin: 42 | 43 | ```sh 44 | ./gradlew assemble 45 | ``` 46 | 47 | To build for a different Elasticsearch version, add `-PelasticsearchVersion=<version>` to your build command. Also note the [branch and supported versions](#supported-versions).
For example, to build for Elasticsearch 7.3.1: 48 | 49 | ```sh 50 | ./gradlew assemble -PelasticsearchVersion=7.3.1 51 | ``` 52 | 53 | To run tests: 54 | 55 | ```sh 56 | ./gradlew check 57 | ``` 58 | -------------------------------------------------------------------------------- /build.gradle: -------------------------------------------------------------------------------- 1 | buildscript { 2 | repositories { 3 | mavenCentral() 4 | jcenter() 5 | } 6 | 7 | dependencies { 8 | classpath "org.elasticsearch.gradle:build-tools:${elasticsearchVersion}" 9 | } 10 | } 11 | 12 | apply plugin: 'java' 13 | apply plugin: 'elasticsearch.esplugin' 14 | apply from: 'tokenizer.gradle' 15 | 16 | version = "${pluginVersion}-es${versions.elasticsearch}" 17 | 18 | esplugin { 19 | name 'analysis-vi' 20 | version "${pluginVersion}" 21 | description 'Elasticsearch Vietnamese Analysis Plugin' 22 | classname 'org.elasticsearch.plugin.analysis.vi.AnalysisViPlugin' 23 | licenseFile rootProject.file('LICENSE') 24 | noticeFile rootProject.file('README.md') 25 | } 26 | 27 | integTestRunner { 28 | include 'org/elasticsearch/index/analysis/*Tests.class' 29 | } 30 | 31 | test.enabled = false 32 | licenseHeaders.enabled = false 33 | -------------------------------------------------------------------------------- /gradle.properties: -------------------------------------------------------------------------------- 1 | pluginVersion=1.0.0 2 | elasticsearchVersion=7.5.1 3 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sun-asterisk-research/elasticsearch-analysis-vi/84b7e5301461ce633f8b6e8aa52104dc37d51af4/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.0.1-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | # 4 | # Copyright 2015 the original author or authors. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # https://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | ############################################################################## 20 | ## 21 | ## Gradle start up script for UN*X 22 | ## 23 | ############################################################################## 24 | 25 | # Attempt to set APP_HOME 26 | # Resolve links: $0 may be a link 27 | PRG="$0" 28 | # Need this for relative symlinks. 
29 | while [ -h "$PRG" ] ; do 30 | ls=`ls -ld "$PRG"` 31 | link=`expr "$ls" : '.*-> \(.*\)$'` 32 | if expr "$link" : '/.*' > /dev/null; then 33 | PRG="$link" 34 | else 35 | PRG=`dirname "$PRG"`"/$link" 36 | fi 37 | done 38 | SAVED="`pwd`" 39 | cd "`dirname \"$PRG\"`/" >/dev/null 40 | APP_HOME="`pwd -P`" 41 | cd "$SAVED" >/dev/null 42 | 43 | APP_NAME="Gradle" 44 | APP_BASE_NAME=`basename "$0"` 45 | 46 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 47 | DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' 48 | 49 | # Use the maximum available, or set MAX_FD != -1 to use that value. 50 | MAX_FD="maximum" 51 | 52 | warn () { 53 | echo "$*" 54 | } 55 | 56 | die () { 57 | echo 58 | echo "$*" 59 | echo 60 | exit 1 61 | } 62 | 63 | # OS specific support (must be 'true' or 'false'). 64 | cygwin=false 65 | msys=false 66 | darwin=false 67 | nonstop=false 68 | case "`uname`" in 69 | CYGWIN* ) 70 | cygwin=true 71 | ;; 72 | Darwin* ) 73 | darwin=true 74 | ;; 75 | MINGW* ) 76 | msys=true 77 | ;; 78 | NONSTOP* ) 79 | nonstop=true 80 | ;; 81 | esac 82 | 83 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 84 | 85 | # Determine the Java command to use to start the JVM. 86 | if [ -n "$JAVA_HOME" ] ; then 87 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 88 | # IBM's JDK on AIX uses strange locations for the executables 89 | JAVACMD="$JAVA_HOME/jre/sh/java" 90 | else 91 | JAVACMD="$JAVA_HOME/bin/java" 92 | fi 93 | if [ ! -x "$JAVACMD" ] ; then 94 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 95 | 96 | Please set the JAVA_HOME variable in your environment to match the 97 | location of your Java installation." 98 | fi 99 | else 100 | JAVACMD="java" 101 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 102 | 103 | Please set the JAVA_HOME variable in your environment to match the 104 | location of your Java installation." 105 | fi 106 | 107 | # Increase the maximum file descriptors if we can. 108 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then 109 | MAX_FD_LIMIT=`ulimit -H -n` 110 | if [ $? -eq 0 ] ; then 111 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then 112 | MAX_FD="$MAX_FD_LIMIT" 113 | fi 114 | ulimit -n $MAX_FD 115 | if [ $? 
-ne 0 ] ; then 116 | warn "Could not set maximum file descriptor limit: $MAX_FD" 117 | fi 118 | else 119 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" 120 | fi 121 | fi 122 | 123 | # For Darwin, add options to specify how the application appears in the dock 124 | if $darwin; then 125 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" 126 | fi 127 | 128 | # For Cygwin or MSYS, switch paths to Windows format before running java 129 | if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then 130 | APP_HOME=`cygpath --path --mixed "$APP_HOME"` 131 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` 132 | JAVACMD=`cygpath --unix "$JAVACMD"` 133 | 134 | # We build the pattern for arguments to be converted via cygpath 135 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` 136 | SEP="" 137 | for dir in $ROOTDIRSRAW ; do 138 | ROOTDIRS="$ROOTDIRS$SEP$dir" 139 | SEP="|" 140 | done 141 | OURCYGPATTERN="(^($ROOTDIRS))" 142 | # Add a user-defined pattern to the cygpath arguments 143 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then 144 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" 145 | fi 146 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 147 | i=0 148 | for arg in "$@" ; do 149 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` 150 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option 151 | 152 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition 153 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` 154 | else 155 | eval `echo args$i`="\"$arg\"" 156 | fi 157 | i=$((i+1)) 158 | done 159 | case $i in 160 | (0) set -- ;; 161 | (1) set -- "$args0" ;; 162 | (2) set -- "$args0" "$args1" ;; 163 | (3) set -- "$args0" "$args1" "$args2" ;; 164 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;; 165 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; 166 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; 167 | (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; 168 | (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; 169 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 170 | esac 171 | fi 172 | 173 | # Escape application args 174 | save () { 175 | for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done 176 | echo " " 177 | } 178 | APP_ARGS=$(save "$@") 179 | 180 | # Collect all arguments for the java command, following the shell quoting and substitution rules 181 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" 182 | 183 | # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong 184 | if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then 185 | cd "$(dirname "$0")" 186 | fi 187 | 188 | exec "$JAVACMD" "$@" 189 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 
6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 15 | @rem 16 | 17 | @if "%DEBUG%" == "" @echo off 18 | @rem ########################################################################## 19 | @rem 20 | @rem Gradle startup script for Windows 21 | @rem 22 | @rem ########################################################################## 23 | 24 | @rem Set local scope for the variables with windows NT shell 25 | if "%OS%"=="Windows_NT" setlocal 26 | 27 | set DIRNAME=%~dp0 28 | if "%DIRNAME%" == "" set DIRNAME=. 29 | set APP_BASE_NAME=%~n0 30 | set APP_HOME=%DIRNAME% 31 | 32 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 33 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 34 | 35 | @rem Find java.exe 36 | if defined JAVA_HOME goto findJavaFromJavaHome 37 | 38 | set JAVA_EXE=java.exe 39 | %JAVA_EXE% -version >NUL 2>&1 40 | if "%ERRORLEVEL%" == "0" goto init 41 | 42 | echo. 43 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 44 | echo. 45 | echo Please set the JAVA_HOME variable in your environment to match the 46 | echo location of your Java installation. 47 | 48 | goto fail 49 | 50 | :findJavaFromJavaHome 51 | set JAVA_HOME=%JAVA_HOME:"=% 52 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 53 | 54 | if exist "%JAVA_EXE%" goto init 55 | 56 | echo. 57 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 58 | echo. 59 | echo Please set the JAVA_HOME variable in your environment to match the 60 | echo location of your Java installation. 61 | 62 | goto fail 63 | 64 | :init 65 | @rem Get command-line arguments, handling Windows variants 66 | 67 | if not "%OS%" == "Windows_NT" goto win9xME_args 68 | 69 | :win9xME_args 70 | @rem Slurp the command line arguments. 71 | set CMD_LINE_ARGS= 72 | set _SKIP=2 73 | 74 | :win9xME_args_slurp 75 | if "x%~1" == "x" goto execute 76 | 77 | set CMD_LINE_ARGS=%* 78 | 79 | :execute 80 | @rem Setup the command line 81 | 82 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 83 | 84 | @rem Execute Gradle 85 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 86 | 87 | :end 88 | @rem End local scope for the variables with windows NT shell 89 | if "%ERRORLEVEL%"=="0" goto mainEnd 90 | 91 | :fail 92 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 93 | rem the _cmd.exe /c_ return code! 
94 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 95 | exit /b 1 96 | 97 | :mainEnd 98 | if "%OS%"=="Windows_NT" endlocal 99 | 100 | :omega 101 | -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'elasticsearch-analysis-vi' 2 | -------------------------------------------------------------------------------- /src/main/java/com/coccoc/Token.java: -------------------------------------------------------------------------------- 1 | package com.coccoc; 2 | 3 | public final class Token { 4 | public enum Type { 5 | WORD, 6 | NUMBER; 7 | 8 | private static Type[] values = null; 9 | 10 | static { 11 | Type.values = Type.values(); 12 | } 13 | 14 | public static Type fromInt(int i) { 15 | return Type.values[i]; 16 | } 17 | } 18 | 19 | private final String text; 20 | private final Type type; 21 | private final int originalStart; 22 | private final int originalEnd; 23 | 24 | public Token(String text, int type, int originalStart, int originalEnd) { 25 | this(text, Type.fromInt(type), originalStart, originalEnd); 26 | } 27 | 28 | public Token(String text, Type type, int originalStart, int originalEnd) { 29 | this.text = text; 30 | this.type = type; 31 | this.originalStart = originalStart; 32 | this.originalEnd = originalEnd; 33 | } 34 | 35 | public String getText() { 36 | return text; 37 | } 38 | 39 | public int getLength() { 40 | return text.length(); 41 | } 42 | 43 | public Type getType() { 44 | return type; 45 | } 46 | 47 | public int getOriginalStart() { 48 | return originalStart; 49 | } 50 | 51 | public int getOriginalEnd() { 52 | return originalEnd; 53 | } 54 | 55 | public String toString() { 56 | return text; 57 | } 58 | 59 | public char[] toCharArray() { 60 | return text.toCharArray(); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/main/java/com/coccoc/Tokenizer.java: -------------------------------------------------------------------------------- 1 | package com.coccoc; 2 | 3 | import java.io.IOException; 4 | import java.io.Reader; 5 | import java.net.URI; 6 | import java.net.URISyntaxException; 7 | import java.nio.ByteBuffer; 8 | import java.nio.ByteOrder; 9 | import java.nio.IntBuffer; 10 | import java.security.AccessController; 11 | import java.security.PrivilegedAction; 12 | import java.util.ArrayList; 13 | import java.util.List; 14 | 15 | public class Tokenizer { 16 | public enum Mode { 17 | NORMAL(0), 18 | HOST(1), 19 | URL(2); 20 | 21 | private int value; 22 | 23 | Mode(int value) { 24 | this.value = value; 25 | } 26 | } 27 | 28 | private static final String libPath; 29 | private static final String dictPath; 30 | 31 | private native ByteBuffer[] segment(String text, int tokenizeOption); 32 | private native void freeMemory(ByteBuffer p); 33 | private native void initialize(String dictPath) throws RuntimeException; 34 | 35 | private static Tokenizer instance; 36 | 37 | static { 38 | try { 39 | URI file = Tokenizer.class.getProtectionDomain().getCodeSource().getLocation().toURI(); 40 | libPath = file.resolve("lib/libcoccoc_tokenizer_jni.so").getPath(); 41 | dictPath = file.resolve("dicts").getPath(); 42 | } catch (URISyntaxException e) { 43 | throw new RuntimeException("Could not initialize Tokenizer"); 44 | } 45 | } 46 | 47 | public static Tokenizer getInstance() { 48 | if (instance == null) { 49 | instance = new Tokenizer(); 50 | } 51 | 52 | return instance; 53 | } 54 | 55 | private 
Tokenizer() { 56 | AccessController.doPrivileged((PrivilegedAction) () -> { 57 | System.load(libPath); 58 | return null; 59 | }); 60 | initialize(dictPath); 61 | } 62 | 63 | public List tokenize(String text, Mode mode) { 64 | if (text == null) { 65 | throw new IllegalArgumentException("text is null"); 66 | } 67 | 68 | ByteBuffer[] segmentResults = segment(text, mode.value); 69 | 70 | IntBuffer normalizedChars = segmentResults[0].order(ByteOrder.nativeOrder()).asIntBuffer(); 71 | IntBuffer rawTokens = segmentResults[1].order(ByteOrder.nativeOrder()).asIntBuffer(); 72 | ByteBuffer pointers = segmentResults[2]; 73 | 74 | StringBuilder sb = new StringBuilder(); 75 | 76 | while (normalizedChars.hasRemaining()) { 77 | sb.appendCodePoint(normalizedChars.get()); 78 | } 79 | 80 | String normalizedText = sb.toString(); 81 | 82 | int tokensCount = rawTokens.capacity() / 6; 83 | List tokens = new ArrayList(tokensCount); 84 | 85 | for (int i = 0; i < tokensCount; i++) { 86 | int offset = i * 6; 87 | int normalizedStart = rawTokens.get(offset); 88 | int normalizedEnd = rawTokens.get(offset + 1); 89 | int originalStart = rawTokens.get(offset + 2); 90 | int originalEnd = rawTokens.get(offset + 3); 91 | 92 | String tokenText = normalizedText.substring(normalizedStart, normalizedEnd); 93 | int tokenType = rawTokens.get(offset + 4); 94 | 95 | tokens.add(new Token(tokenText, tokenType, originalStart, originalEnd)); 96 | } 97 | 98 | freeMemory(pointers); 99 | 100 | return tokens; 101 | } 102 | 103 | public List tokenize(Reader input, Mode mode) throws IOException { 104 | char[] buffer = new char[1024]; 105 | StringBuilder sb = new StringBuilder(); 106 | int numCharsRead; 107 | while ((numCharsRead = input.read(buffer, 0, buffer.length)) != -1) { 108 | sb.append(buffer, 0, numCharsRead); 109 | } 110 | 111 | return tokenize(sb.toString(), mode); 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /src/main/java/org/apache/lucene/analysis/vi/VietnameseAnalyzer.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.vi; 2 | 3 | import java.io.IOException; 4 | 5 | import com.coccoc.Tokenizer.Mode; 6 | 7 | import org.apache.lucene.analysis.CharArraySet; 8 | import org.apache.lucene.analysis.StopwordAnalyzerBase; 9 | import org.apache.lucene.analysis.TokenStream; 10 | import org.apache.lucene.analysis.Tokenizer; 11 | import org.apache.lucene.analysis.core.StopFilter; 12 | 13 | public class VietnameseAnalyzer extends StopwordAnalyzerBase { 14 | private final Mode mode; 15 | 16 | public VietnameseAnalyzer(Mode mode, CharArraySet stopwords) { 17 | super(stopwords); 18 | this.mode = mode; 19 | } 20 | 21 | public static CharArraySet getDefaultStopSet() { 22 | return DefaultSetHolder.DEFAULT_STOP_SET; 23 | } 24 | 25 | private static class DefaultSetHolder { 26 | static final CharArraySet DEFAULT_STOP_SET; 27 | 28 | static { 29 | try { 30 | DEFAULT_STOP_SET = loadStopwordSet(true, VietnameseAnalyzer.class, "/stopwords.txt", "#"); 31 | } catch (IOException e) { 32 | throw new RuntimeException("Unable to load default stopword set"); 33 | } 34 | } 35 | } 36 | 37 | @Override 38 | protected TokenStreamComponents createComponents(String fieldName) { 39 | Tokenizer tokenizer = new VietnameseTokenizer(mode); 40 | TokenStream stream = new StopFilter(tokenizer, stopwords); 41 | 42 | return new TokenStreamComponents(tokenizer, stream); 43 | } 44 | } 45 | 
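For orientation only (not a file in this repository): a minimal sketch of driving the analyzer above through the plain Lucene `TokenStream` API. It assumes the coccoc JNI library and dictionaries produced by `tokenizer.gradle` are resolvable next to the compiled classes; the `AnalyzerDemo` class name and the sample sentence are purely illustrative.

```java
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.vi.VietnameseAnalyzer;

import com.coccoc.Tokenizer.Mode;

public final class AnalyzerDemo {
    public static void main(String[] args) throws Exception {
        // Same composition as createComponents(): vi_tokenizer followed by the vi_stop filter.
        VietnameseAnalyzer analyzer = new VietnameseAnalyzer(Mode.NORMAL, VietnameseAnalyzer.getDefaultStopSet());
        try (TokenStream stream = analyzer.tokenStream("content", "Công nghệ thông tin Việt Nam")) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                // Multi-syllable Vietnamese words come back as single tokens; stopwords are removed.
                System.out.println(term.toString());
            }
            stream.end();
        }
        analyzer.close();
    }
}
```

Because the analyzer delegates segmentation to the singleton `com.coccoc.Tokenizer`, the native library must be loadable before the first token stream is created.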
-------------------------------------------------------------------------------- /src/main/java/org/apache/lucene/analysis/vi/VietnameseTokenizer.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.vi; 2 | 3 | import java.io.IOException; 4 | import java.util.Iterator; 5 | import java.util.Locale; 6 | 7 | import com.coccoc.Token; 8 | import com.coccoc.Tokenizer.Mode; 9 | 10 | import org.apache.lucene.analysis.Tokenizer; 11 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 12 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 13 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 14 | 15 | public class VietnameseTokenizer extends Tokenizer { 16 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); 17 | private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); 18 | private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); 19 | 20 | private Mode tokenizeMode; 21 | private com.coccoc.Tokenizer tokenizer; 22 | private Iterator tokens; 23 | 24 | private int currentOffset = 0; 25 | 26 | public VietnameseTokenizer(Mode tokenizeMode) { 27 | this.tokenizeMode = tokenizeMode; 28 | this.tokenizer = com.coccoc.Tokenizer.getInstance(); 29 | } 30 | 31 | @Override 32 | public boolean incrementToken() throws IOException { 33 | clearAttributes(); 34 | 35 | if (tokens.hasNext()) { 36 | final Token token = tokens.next(); 37 | final int tokenLength = token.getLength(); 38 | final int start = correctOffset(token.getOriginalStart()); 39 | final int end = correctOffset(token.getOriginalEnd()); 40 | 41 | termAtt.copyBuffer(token.toCharArray(), 0, tokenLength); 42 | typeAtt.setType(token.getType().name().toLowerCase(Locale.ROOT)); 43 | offsetAtt.setOffset(start, end); 44 | currentOffset = end; 45 | 46 | return true; 47 | } 48 | 49 | return false; 50 | } 51 | 52 | @Override 53 | public void end() throws IOException { 54 | super.end(); 55 | int finalOffset = correctOffset(currentOffset); 56 | offsetAtt.setOffset(finalOffset, finalOffset); 57 | } 58 | 59 | @Override 60 | public void reset() throws IOException { 61 | super.reset(); 62 | currentOffset = 0; 63 | tokens = tokenizer.tokenize(input, tokenizeMode).iterator(); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/VietnameseAnalyzerProvider.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import com.coccoc.Tokenizer.Mode; 4 | 5 | import org.apache.lucene.analysis.CharArraySet; 6 | import org.apache.lucene.analysis.vi.VietnameseAnalyzer; 7 | import org.elasticsearch.common.settings.Settings; 8 | import org.elasticsearch.env.Environment; 9 | import org.elasticsearch.index.IndexSettings; 10 | 11 | public class VietnameseAnalyzerProvider extends AbstractIndexAnalyzerProvider { 12 | private final VietnameseAnalyzer analyzer; 13 | 14 | public VietnameseAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { 15 | super(indexSettings, name, settings); 16 | 17 | final Mode tokenizeMode = VietnameseTokenizerFactory.getTokenizeMode(settings); 18 | final CharArraySet stopwords = Analysis.parseStopWords(env, settings, VietnameseAnalyzer.getDefaultStopSet()); 19 | analyzer = new VietnameseAnalyzer(tokenizeMode, stopwords); 20 | } 21 | 22 | @Override 23 | public 
VietnameseAnalyzer get() { 24 | return this.analyzer; 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/VietnameseStopTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import java.util.Collections; 4 | import java.util.Map; 5 | import java.util.Set; 6 | 7 | import org.apache.lucene.analysis.CharArraySet; 8 | import org.apache.lucene.analysis.TokenStream; 9 | import org.apache.lucene.analysis.core.StopFilter; 10 | import org.apache.lucene.analysis.vi.VietnameseAnalyzer; 11 | import org.apache.lucene.search.suggest.analyzing.SuggestStopFilter; 12 | import org.elasticsearch.common.settings.Settings; 13 | import org.elasticsearch.env.Environment; 14 | import org.elasticsearch.index.IndexSettings; 15 | 16 | public class VietnameseStopTokenFilterFactory extends AbstractTokenFilterFactory { 17 | private static final Map<String, Set<?>> NAMED_STOP_WORDS; 18 | private final CharArraySet stopwords; 19 | private final boolean ignoreCase; 20 | private final boolean removeTrailing; 21 | 22 | static { 23 | NAMED_STOP_WORDS = Collections.singletonMap("_vietnamese_", VietnameseAnalyzer.getDefaultStopSet()); 24 | } 25 | 26 | public VietnameseStopTokenFilterFactory( 27 | IndexSettings indexSettings, Environment env, String name, Settings settings 28 | ) { 29 | super(indexSettings, name, settings); 30 | this.ignoreCase = settings.getAsBoolean("ignore_case", false); 31 | this.removeTrailing = settings.getAsBoolean("remove_trailing", true); 32 | this.stopwords = Analysis.parseWords( 33 | env, settings, "stopwords", VietnameseAnalyzer.getDefaultStopSet(), NAMED_STOP_WORDS, ignoreCase 34 | ); 35 | } 36 | 37 | @Override 38 | public TokenStream create(TokenStream tokenStream) { 39 | return removeTrailing 40 | ?
new StopFilter(tokenStream, stopwords) 41 | : new SuggestStopFilter(tokenStream, stopwords); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/VietnameseTokenizerFactory.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import java.util.Locale; 4 | 5 | import com.coccoc.Tokenizer.Mode; 6 | 7 | import org.apache.lucene.analysis.Tokenizer; 8 | import org.apache.lucene.analysis.vi.VietnameseTokenizer; 9 | import org.elasticsearch.common.settings.Settings; 10 | import org.elasticsearch.env.Environment; 11 | import org.elasticsearch.index.IndexSettings; 12 | 13 | public class VietnameseTokenizerFactory extends AbstractTokenizerFactory { 14 | private final Mode tokenizeMode; 15 | 16 | public VietnameseTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { 17 | super(indexSettings, settings, name); 18 | 19 | tokenizeMode = getTokenizeMode(settings); 20 | } 21 | 22 | public static Mode getTokenizeMode(Settings settings) 23 | { 24 | String modeSetting = settings.get("mode", "normal").toUpperCase(Locale.ROOT); 25 | return Mode.valueOf(modeSetting); 26 | } 27 | 28 | public Tokenizer create() { 29 | return new VietnameseTokenizer(tokenizeMode); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/plugin/analysis/vi/AnalysisViPlugin.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.plugin.analysis.vi; 2 | 3 | import java.util.Collections; 4 | import java.util.HashMap; 5 | import java.util.Map; 6 | 7 | import org.apache.lucene.analysis.Analyzer; 8 | import org.elasticsearch.index.analysis.AnalyzerProvider; 9 | import org.elasticsearch.index.analysis.TokenFilterFactory; 10 | import org.elasticsearch.index.analysis.TokenizerFactory; 11 | import org.elasticsearch.index.analysis.VietnameseAnalyzerProvider; 12 | import org.elasticsearch.index.analysis.VietnameseStopTokenFilterFactory; 13 | import org.elasticsearch.index.analysis.VietnameseTokenizerFactory; 14 | import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; 15 | import org.elasticsearch.plugins.AnalysisPlugin; 16 | import org.elasticsearch.plugins.Plugin; 17 | 18 | public class AnalysisViPlugin extends Plugin implements AnalysisPlugin { 19 | @Override 20 | public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() { 21 | Map<String, AnalysisProvider<TokenFilterFactory>> extra = new HashMap<>(); 22 | extra.put("vi_stop", VietnameseStopTokenFilterFactory::new); 23 | 24 | return extra; 25 | } 26 | 27 | @Override 28 | public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() { 29 | return Collections.singletonMap("vi_tokenizer", VietnameseTokenizerFactory::new); 30 | } 31 | 32 | @Override 33 | public Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() { 34 | return Collections.singletonMap("vi_analyzer", VietnameseAnalyzerProvider::new); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/jni/Tokenizer.cpp: -------------------------------------------------------------------------------- 1 | #include <jni.h> 2 | #include "com_coccoc_Tokenizer.h" 3 | 4 | static jclass java_nio_ByteBuffer; 5 | static jint JNI_VERSION = JNI_VERSION_10; 6 | 7 | jint JNI_OnLoad(JavaVM *vm, void *reserved) 8 | { 9 | JNIEnv* env; 10 | if (vm->GetEnv(reinterpret_cast<void **>(&env), JNI_VERSION) != JNI_OK) { 11 | return JNI_ERR; 12 | } 13 | 14 | java_nio_ByteBuffer =
static_cast(env->NewGlobalRef(env->FindClass("java/nio/ByteBuffer"))); 15 | 16 | return JNI_VERSION; 17 | } 18 | 19 | void JNI_OnUnload(JavaVM *vm, void *reserved) 20 | { 21 | JNIEnv* env; 22 | vm->GetEnv(reinterpret_cast(&env), JNI_VERSION); 23 | 24 | env->DeleteGlobalRef(java_nio_ByteBuffer); 25 | } 26 | 27 | /** 28 | * Segment a document and return an array of direct ByteBuffer referring 29 | * to segmentation result vectors. Further processing happens in Java code. 30 | * The method returns 3 ByteBuffer. The first one is the normalized text. 31 | * The second one contains Token structs (see token.hpp). The last one contains 32 | * pointers to dynamically created vectors, used for clean up when done. 33 | */ 34 | JNIEXPORT jobjectArray JNICALL Java_com_coccoc_Tokenizer_segment( 35 | JNIEnv *env, jobject obj, jstring jni_text, jint tokenize_option) 36 | { 37 | const jchar *jtext = env->GetStringCritical(jni_text, nullptr); 38 | int text_length = env->GetStringLength(jni_text); 39 | 40 | // Use pointer to avoid automatic deallocation 41 | // Must call `freeMemory` when done to clean up 42 | std::vector< uint32_t > *normalized = new std::vector< uint32_t >(); 43 | normalized->reserve(text_length); 44 | 45 | std::vector< int > original_pos; 46 | Tokenizer::instance().normalize_for_tokenization(jtext, text_length, *normalized, original_pos, true); 47 | env->ReleaseStringCritical(jni_text, jtext); 48 | 49 | // Use pointer here too 50 | std::vector< Token > *tokens = new std::vector< Token >(); 51 | // space_positions is only used when `for_transforming` is true? 52 | std::vector< int > space_positions; 53 | 54 | Tokenizer::instance().handle_tokenization_request< Token >( 55 | *normalized, *tokens, space_positions, original_pos, false, tokenize_option); 56 | 57 | for (size_t i = 0; i < tokens->size(); ++i) 58 | { 59 | tokens->at(i).original_start += original_pos[tokens->at(i).normalized_start]; 60 | tokens->at(i).original_end += original_pos[tokens->at(i).normalized_end]; 61 | } 62 | 63 | // Keep pointers to original vectors in another array so we can clean up later 64 | // When done, pass this pointer (ByteBuffer) to `freeMemory` to clean up 65 | int64_t *p = new int64_t[2]; 66 | p[0] = (int64_t) normalized; 67 | p[1] = (int64_t) tokens; 68 | 69 | jobjectArray results = env->NewObjectArray(3, java_nio_ByteBuffer, nullptr); 70 | 71 | env->SetObjectArrayElement(results, 0, env->NewDirectByteBuffer(normalized->data(), normalized->size() * 4)); 72 | env->SetObjectArrayElement(results, 1, env->NewDirectByteBuffer(tokens->data(), tokens->size() * 6 * 4)); 73 | env->SetObjectArrayElement(results, 2, env->NewDirectByteBuffer(p, 0)); 74 | 75 | return results; 76 | } 77 | 78 | JNIEXPORT void JNICALL Java_com_coccoc_Tokenizer_freeMemory(JNIEnv *env, jobject obj, jobject res_pointer) 79 | { 80 | // Cast each object pointer to their respective type, must be careful 81 | int64_t *p = static_cast(env->GetDirectBufferAddress(res_pointer)); 82 | delete (std::vector< uint32_t > *) (p[0]); 83 | delete (std::vector< Token > *) (p[1]); 84 | delete[](int64_t *) p; 85 | } 86 | 87 | JNIEXPORT void JNICALL Java_com_coccoc_Tokenizer_initialize(JNIEnv *env, jobject obj, jstring jni_dict_path) 88 | { 89 | const char *dict_path = env->GetStringUTFChars(jni_dict_path, nullptr); 90 | int status_code = Tokenizer::instance().initialize(std::string(dict_path)); 91 | 92 | if (status_code != 0) { 93 | jclass java_lang_RuntimeException = env->FindClass("java/lang/RuntimeException"); 94 | 95 | 
env->ThrowNew(java_lang_RuntimeException, "Could not load dictionary"); 96 | } 97 | 98 | env->ReleaseStringUTFChars(jni_dict_path, dict_path); 99 | } 100 | -------------------------------------------------------------------------------- /src/main/plugin-metadata/plugin-security.policy: -------------------------------------------------------------------------------- 1 | grant { 2 | permission java.lang.RuntimePermission "loadLibrary.*"; 3 | }; 4 | -------------------------------------------------------------------------------- /src/main/resources/stopwords.txt: -------------------------------------------------------------------------------- 1 | bị 2 | bởi 3 | cả 4 | các 5 | cái 6 | cần 7 | càng 8 | chỉ 9 | chiếc 10 | cho 11 | chứ 12 | chưa 13 | chuyện 14 | có 15 | có thể 16 | cứ 17 | của 18 | cùng 19 | cũng 20 | đã 21 | đang 22 | để 23 | đến nỗi 24 | đều 25 | điều 26 | do 27 | đó 28 | được 29 | dưới 30 | gì 31 | khi 32 | không 33 | là 34 | lại 35 | lên 36 | lúc 37 | mà 38 | mỗi 39 | một cách 40 | này 41 | nên 42 | nếu 43 | ngay 44 | nhiều 45 | như 46 | nhưng 47 | những 48 | nơi 49 | nữa 50 | phải 51 | qua 52 | ra 53 | rằng 54 | rất 55 | rồi 56 | sau 57 | sẽ 58 | so 59 | sự 60 | tại 61 | theo 62 | thì 63 | trên 64 | trước 65 | từ 66 | từng 67 | và 68 | vẫn 69 | vào 70 | vậy 71 | vì 72 | việc 73 | với 74 | vừa 75 | vâng 76 | à 77 | ừ 78 | từ 79 | -------------------------------------------------------------------------------- /src/test/java/org/elasticsearch/index/analysis/VietnameseAnalysisIntegrationTests.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import java.util.Collection; 4 | import java.util.Collections; 5 | 6 | import org.elasticsearch.action.admin.cluster.node.info.NodeInfo; 7 | import org.elasticsearch.action.admin.cluster.node.info.NodesInfoResponse; 8 | import org.elasticsearch.plugin.analysis.vi.AnalysisViPlugin; 9 | import org.elasticsearch.plugins.Plugin; 10 | import org.elasticsearch.test.ESIntegTestCase; 11 | 12 | import static org.hamcrest.Matchers.is; 13 | 14 | public class VietnameseAnalysisIntegrationTests extends ESIntegTestCase { 15 | @Override 16 | protected Collection> nodePlugins() { 17 | return Collections.singleton(AnalysisViPlugin.class); 18 | } 19 | 20 | public void testPluginIsLoaded() throws Exception { 21 | NodesInfoResponse response = client().admin().cluster().prepareNodesInfo().setPlugins(true).get(); 22 | 23 | for (NodeInfo nodeInfo : response.getNodes()) { 24 | boolean pluginLoaded = nodeInfo.getPlugins().getPluginInfos() 25 | .stream().anyMatch(plugin -> plugin.getName().equals("analysis-vi")); 26 | 27 | assertThat(pluginLoaded, is(true)); 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /tokenizer.gradle: -------------------------------------------------------------------------------- 1 | def JAVA_HOME = System.getenv('JAVA_HOME') ?: System.getenv('JDK_HOME') 2 | 3 | task configureTokenizer(type:Exec) { 4 | outputs.dir 'build/tokenizer' 5 | 6 | workingDir 'build/tokenizer' 7 | commandLine 'cmake', '../../coccoc-tokenizer' 8 | } 9 | 10 | task compileDict(type:Exec) { 11 | outputs.dir 'build/tokenizer' 12 | outputs.files 'multiterm_trie.dump', 'nontone_pair_freq_map.dump', 'syllable_trie.dump' 13 | 14 | dependsOn 'configureTokenizer' 15 | workingDir 'build/tokenizer' 16 | commandLine 'make', 'compile_dict' 17 | } 18 | 19 | task compileTokenizer(type:Exec) { 20 | outputs.dirs 'build/lib' 21 | 22 | 
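  // Links the JNI bridge (src/main/jni/Tokenizer.cpp) against the coccoc-tokenizer sources and the
  // JNI headers that compileJava emits into build/headers (see the '-h' compiler flag below),
  // producing build/lib/libcoccoc_tokenizer_jni.so, which com.coccoc.Tokenizer loads at runtime.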
dependsOn 'configureTokenizer' 23 | dependsOn 'compileJava' 24 | 25 | commandLine 'g++', '-Wall', '-Werror', '-Wno-deprecated', '-shared', '-std=c++11', '-O3', '-DNDEBUG', '-ggdb', '-fPIC', 26 | '-I', 'coccoc-tokenizer', 27 | '-I', 'build/headers', 28 | '-I', "build/tokenizer/auto", 29 | '-I', "${JAVA_HOME}/include", 30 | '-I', "${JAVA_HOME}/include/linux", 31 | '-o', 'build/lib/libcoccoc_tokenizer_jni.so', 32 | 'src/main/jni/Tokenizer.cpp' 33 | } 34 | 35 | compileJava { 36 | options.compilerArgs += ['-h', file('build/headers')] 37 | } 38 | 39 | bundlePlugin { 40 | dependsOn 'compileDict' 41 | dependsOn 'compileTokenizer' 42 | 43 | from('build/lib') { 44 | into 'lib' 45 | } 46 | 47 | from('build/tokenizer') { 48 | include '*.dump' 49 | into 'dicts' 50 | } 51 | 52 | from('coccoc-tokenizer/dicts/vn_lang_tool') { 53 | into 'dicts' 54 | } 55 | } 56 | --------------------------------------------------------------------------------