├── .gitignore ├── README.md ├── build.gradle ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat ├── models └── README.md ├── settings.gradle └── src ├── main ├── java │ └── org │ │ ├── elasticsearch │ │ ├── index │ │ │ ├── ThulacAnalyzer.java │ │ │ ├── ThulacAnalyzerProvider.java │ │ │ ├── ThulacTokenizer.java │ │ │ └── ThulacTokenizerFactory.java │ │ ├── plugin │ │ │ └── analysis │ │ │ │ └── ThulacAnalysisPlugin.java │ │ └── thulac │ │ │ ├── Configuration.java │ │ │ ├── ThulacLiteSegment.java │ │ │ ├── ThulacLiteTokenizerScanner.java │ │ │ ├── postprocess │ │ │ ├── DictionaryPassBuilder.java │ │ │ ├── DoubleWordPassBuilder.java │ │ │ ├── FilterPassBuilder.java │ │ │ ├── NegWordPassBuilder.java │ │ │ ├── SpecialPassBuilder.java │ │ │ ├── TimeWordPassBuilder.java │ │ │ └── VerbPassBuilder.java │ │ │ └── preprocess │ │ │ ├── ConvertT2SPassBuilder.java │ │ │ └── PreProcessPassBuilder.java │ │ └── thunlp │ │ └── thulac │ │ ├── Thulac.java │ │ ├── cb │ │ ├── AlphaBeta.java │ │ ├── CBModel.java │ │ ├── CBNGramFeature.java │ │ ├── CBTaggingDecoder.java │ │ └── Node.java │ │ ├── data │ │ ├── Dat.java │ │ ├── DatMaker.java │ │ ├── POCGraph.java │ │ └── TaggedWord.java │ │ ├── io │ │ ├── IInputProvider.java │ │ ├── IOutputHandler.java │ │ ├── IProgramStateListener.java │ │ ├── ReaderInputProvider.java │ │ ├── StringInputProvider.java │ │ ├── StringOutputHandler.java │ │ └── WriterOutputHandler.java │ │ ├── main │ │ └── Main.java │ │ ├── postprocess │ │ ├── DictionaryPass.java │ │ ├── DoubleWordPass.java │ │ ├── FilterPass.java │ │ ├── IPostprocessPass.java │ │ ├── NegWordPass.java │ │ ├── SpecialPass.java │ │ ├── TimeWordPass.java │ │ └── VerbPass.java │ │ ├── preprocess │ │ ├── ConvertT2SPass.java │ │ ├── IPreprocessPass.java │ │ └── PreProcessPass.java │ │ └── util │ │ ├── BufferUtils.java │ │ ├── CodePointUtils.java │ │ ├── IOUtils.java │ │ └── StringUtils.java └── resources │ ├── plugin-descriptor.properties │ └── 
plugin-security.policy └── test ├── java └── TestThulac.java └── resources └── input /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### macOS template 3 | # General 4 | .DS_Store 5 | .AppleDouble 6 | .LSOverride 7 | 8 | # Icon must end with two \r 9 | Icon 10 | 11 | # Thumbnails 12 | ._* 13 | 14 | # Files that might appear in the root of a volume 15 | .DocumentRevisions-V100 16 | .fseventsd 17 | .Spotlight-V100 18 | .TemporaryItems 19 | .Trashes 20 | .VolumeIcon.icns 21 | .com.apple.timemachine.donotpresent 22 | 23 | # Directories potentially created on remote AFP share 24 | .AppleDB 25 | .AppleDesktop 26 | Network Trash Folder 27 | Temporary Items 28 | .apdisk 29 | ### Eclipse template 30 | 31 | .metadata 32 | bin/ 33 | tmp/ 34 | *.tmp 35 | *.bak 36 | *.swp 37 | *~.nib 38 | local.properties 39 | .settings/ 40 | .loadpath 41 | .recommenders 42 | 43 | # External tool builders 44 | .externalToolBuilders/ 45 | 46 | # Locally stored "Eclipse launch configurations" 47 | *.launch 48 | 49 | # PyDev specific (Python IDE for Eclipse) 50 | *.pydevproject 51 | 52 | # CDT-specific (C/C++ Development Tooling) 53 | .cproject 54 | 55 | # Java annotation processor (APT) 56 | .factorypath 57 | 58 | # PDT-specific (PHP Development Tools) 59 | .buildpath 60 | 61 | # sbteclipse plugin 62 | .target 63 | 64 | # Tern plugin 65 | .tern-project 66 | 67 | # TeXlipse plugin 68 | .texlipse 69 | 70 | # STS (Spring Tool Suite) 71 | .springBeans 72 | 73 | # Code Recommenders 74 | .recommenders/ 75 | 76 | # Scala IDE specific (Scala & Java development for Eclipse) 77 | .cache-main 78 | .scala_dependencies 79 | .worksheet 80 | ### Java template 81 | # Compiled class file 82 | *.class 83 | 84 | # Log file 85 | *.log 86 | 87 | # BlueJ files 88 | *.ctxt 89 | 90 | # Mobile Tools for Java (J2ME) 91 | .mtj.tmp/ 92 | 93 | # Package Files # 94 | *.jar 95 | *.war 96 | *.ear 97 | *.zip 98 | *.tar.gz 99 | 
*.rar 100 | 101 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 102 | hs_err_pid* 103 | ### Gradle template 104 | .gradle 105 | /build/ 106 | 107 | # Ignore Gradle GUI config 108 | gradle-app.setting 109 | 110 | # Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored) 111 | !gradle-wrapper.jar 112 | 113 | # Cache of project 114 | .gradletasknamecache 115 | 116 | # # Work around https://youtrack.jetbrains.com/issue/IDEA-116898 117 | # gradle/wrapper/gradle-wrapper.properties 118 | ### JetBrains template 119 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 120 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 121 | 122 | .idea 123 | 124 | # User-specific stuff: 125 | .idea/**/workspace.xml 126 | .idea/**/tasks.xml 127 | .idea/dictionaries 128 | 129 | # Sensitive or high-churn files: 130 | .idea/**/dataSources/ 131 | .idea/**/dataSources.ids 132 | .idea/**/dataSources.xml 133 | .idea/**/dataSources.local.xml 134 | .idea/**/sqlDataSources.xml 135 | .idea/**/dynamic.xml 136 | .idea/**/uiDesigner.xml 137 | 138 | # Gradle: 139 | .idea/**/gradle.xml 140 | .idea/**/libraries 141 | 142 | # CMake 143 | cmake-build-debug/ 144 | 145 | # Mongo Explorer plugin: 146 | .idea/**/mongoSettings.xml 147 | 148 | ## File-based project format: 149 | *.iws 150 | 151 | ## Plugin-specific files: 152 | 153 | # IntelliJ 154 | out/ 155 | 156 | # mpeltonen/sbt-idea plugin 157 | .idea_modules/ 158 | 159 | # JIRA plugin 160 | atlassian-ide-plugin.xml 161 | 162 | # Cursive Clojure plugin 163 | .idea/replstate.xml 164 | 165 | # Crashlytics plugin (for Android Studio and IntelliJ) 166 | com_crashlytics_export_strings.xml 167 | crashlytics.properties 168 | crashlytics-build.properties 169 | fabric.properties 170 | 171 | .classpath 172 | .project 173 | models/*.bin 174 | models/*.txt 175 | models/model* 176 | models/*.dat 177 | *.iml 178 | .idea 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # THULAC Analysis for Elasticsearch 2 | 采用[THULAC](https://github.com/thunlp/THULAC-Java)实现的[Elasticsearch](https://www.elastic.co)中文分词插件。 3 | 4 | 版本 5 | -------- 6 | 7 | Plugin 版本 | ES 版本 | THULAC 版本 | Link 8 | -----------|-----------|----------|------------ 9 | master | 7.x -> master | lite | 10 | 7.9.1 | 7.9.1 | lite |[下载](https://github.com/microbun/elasticsearch-thulac-plugin/releases/download/7.9.1/elasticsearch-thulac-plugin-7.9.1.zip) 11 | 6.4.1-181027 | 6.4.1 | lite |[下载](https://github.com/microbun/elasticsearch-thulac-plugin/releases/download/6.4.1-181027/elasticsearch-thulac-plugin-6.4.1-181027.zip) 12 | 6.4.0-181027 | 6.4.0 | lite |[下载](https://github.com/microbun/elasticsearch-thulac-plugin/releases/download/6.4.0-181027/elasticsearch-thulac-plugin-6.4.0-181027.zip) 13 | 6.3.0-181027 | 6.3.0 | lite |[下载](https://github.com/microbun/elasticsearch-thulac-plugin/releases/download/6.3.0-181027/elasticsearch-thulac-plugin-6.3.0-181027.zip) 14 | 6.2.0-181027 | 6.2.0 | lite |[下载](https://github.com/microbun/elasticsearch-thulac-plugin/releases/download/6.2.0-181027/elasticsearch-thulac-plugin-6.2.0-181027.zip) 15 | 6.1.0-181027 | 6.1.0 | lite |[下载](https://github.com/microbun/elasticsearch-thulac-plugin/releases/download/6.1.0-181027/elasticsearch-thulac-plugin-6.1.0-181027.zip) 16 | 17 | 18 | 下载安装 19 | -------- 20 | 直接下载已经打包好的插件,解压到elasticsearch的plugins目录下即可。 21 | 22 | 编译安装 23 | -------- 24 | 1.编译打包 25 | 26 | ```bash 27 | git clone git@github.com:microbun/elasticsearch-thulac-plugin.git 28 | cd elasticsearch-thulac-plugin 29 | ./gradlew release 30 | ``` 31 | 32 | 2.安装到elasticsearch 33 | ``` 34 | cp build/distributions/elasticsearch-thulac-plugin-7.9.1.zip ${ES_HOME}/plugins 35 | cd ${ES_HOME}/plugins 36 | unzip elasticsearch-thulac-plugin-7.9.1.zip 37 | rm 
elasticsearch-thulac-plugin-7.9.1.zip 38 | ``` 39 | 解压后在plugins目录下会有一个thulac文件夹。 40 | ``` 41 | thulac 42 | |-elasticsearch-thulac-plugin-7.9.1.jar 43 | |-models #算法模型目录 44 | |-plugin-descriptor.properties 45 | |-plugin.xml 46 | ``` 47 | 48 | 3.由于THULAC的模型太大,插件中没有包含模型数据,可以在[THULAC](https://github.com/thunlp/THULAC-Java) 下载模型(lite),将模型拷贝到models中。 49 | 50 | 51 | 示例 52 | -------- 53 | #### 1.创建索引 54 | 55 | 1.1 使用默认分词方式 56 | ```bash 57 | curl -H "Content-Type:application/json" -XPUT http://localhost:9200/index -d' 58 | { 59 | "mappings": { 60 | "properties": { 61 | "text": { 62 | "type": "text", 63 | "analyzer": "thulac" 64 | } 65 | } 66 | } 67 | } 68 | ' 69 | ``` 70 | 71 | 1.2 自定义分词器 72 | ```bash 73 | curl -H "Content-Type:application/json" -XPUT http://localhost:9200/index -d' 74 | { 75 | "settings": { 76 | "analysis": { 77 | "tokenizer": { 78 | "custom_thulac_tokenizer": { 79 | "type": "thulac", 80 | "user_dict": "userdict.txt", 81 | "t2s": true, 82 | "filter": false 83 | } 84 | }, 85 | "analyzer": { 86 | "custom_thulac_analyzer": { 87 | "tokenizer": "custom_thulac_tokenizer", 88 | "filter": [ 89 | "lowercase" 90 | ] 91 | } 92 | } 93 | } 94 | }, 95 | "mappings": { 96 | "properties": { 97 | "text": { 98 | "type": "text", 99 | "analyzer": "custom_thulac_analyzer" 100 | } 101 | } 102 | } 103 | }' 104 | ``` 105 | 106 | | 参数名称 | 含义 | 值 | 107 | | --- | --- |---| 108 | | t2s | 将句子从繁体转化为简体。默认:true | false/true | 109 | | filter | 使用过滤器去除一些没有意义的词语,例如“可以”。默认:false | false/true | 110 | | user_dict | 自定义词典路径,每一个词一行,UTF8编码,相对路径和绝对路径.
相对路径:userdict.txt 会加载 ${ES_HOME}/plugins/thulac/models/userdict.txt文件
绝对路径:/home/elasticsearch/userdict.txt
默认:userdict.txt | | 111 | 112 | #### 2.查看索引 113 | ```bash 114 | curl http://localhost:9200/index 115 | ``` 116 | 117 | #### 3.测试分词效果 118 | ```bash 119 | curl -H "Content-Type:application/json" -XPOST http://localhost:9200/index/_analyze -d' 120 | { 121 | "analyzer":"thulac", 122 | "text":"我是中国人" 123 | } 124 | ' 125 | ``` 126 | 127 | #### 4.删除索引 128 | ``` 129 | curl -XDELETE http://localhost:9200/index 130 | ``` 131 | -------------------------------------------------------------------------------- /build.gradle: -------------------------------------------------------------------------------- 1 | group 'org.elasticsearch.thulac' 2 | version '7.9.1' 3 | 4 | apply plugin: 'java' 5 | 6 | sourceCompatibility = 1.8 7 | 8 | repositories { 9 | mavenCentral() 10 | } 11 | 12 | configurations { 13 | wagon 14 | distJars { 15 | extendsFrom runtime 16 | exclude group: 'org.elasticsearch' 17 | exclude group: 'lucene-core' 18 | exclude group: 'org.apache.logging.log4j' 19 | exclude group: 'lucene-analyzers-common' 20 | exclude group: 'org.apache.commons' 21 | } 22 | } 23 | 24 | sourceSets { 25 | main { 26 | java { 27 | srcDir "src/main/java" 28 | } 29 | resources { 30 | srcDir "src/main/resources" 31 | include "**/*" 32 | } 33 | } 34 | } 35 | 36 | dependencies { 37 | testCompile group: 'junit', name: 'junit', version: '4.11' 38 | compile 'org.elasticsearch:elasticsearch:7.9.1' 39 | } 40 | 41 | task release_full(type: Zip, dependsOn: [':jar']) { 42 | into('thulac') { 43 | from configurations.distJars 44 | from 'build/libs' 45 | from 'build/resources/main/plugin.xml' 46 | from 'build/resources/main/plugin-descriptor.properties' 47 | from 'build/resources/main/plugin-security.policy' 48 | } 49 | from('models') { 50 | include "**/*" 51 | into ('thulac/models') 52 | } 53 | } 54 | 55 | task release_lite(type: Zip, dependsOn: [':jar']) { 56 | into('thulac') { 57 | from configurations.distJars 58 | from 'build/libs' 59 | from 'build/resources/main/plugin.xml' 60 | from 
'build/resources/main/plugin-descriptor.properties' 61 | from 'build/resources/main/plugin-security.policy' 62 | } 63 | from('models') { 64 | include "README.md" 65 | include "userdict.txt" 66 | into ('thulac/models') 67 | } 68 | } -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microbun/elasticsearch-thulac-plugin/ddc29e6eb21fb08c80c7faa36ed85e55922d985a/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | #Sun Dec 17 15:22:11 CST 2017 2 | distributionBase=GRADLE_USER_HOME 3 | distributionPath=wrapper/dists 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | distributionUrl=https\://services.gradle.org/distributions/gradle-3.1-all.zip 7 | -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ############################################################################## 4 | ## 5 | ## Gradle start up script for UN*X 6 | ## 7 | ############################################################################## 8 | 9 | # Attempt to set APP_HOME 10 | # Resolve links: $0 may be a link 11 | PRG="$0" 12 | # Need this for relative symlinks. 
13 | while [ -h "$PRG" ] ; do 14 | ls=`ls -ld "$PRG"` 15 | link=`expr "$ls" : '.*-> \(.*\)$'` 16 | if expr "$link" : '/.*' > /dev/null; then 17 | PRG="$link" 18 | else 19 | PRG=`dirname "$PRG"`"/$link" 20 | fi 21 | done 22 | SAVED="`pwd`" 23 | cd "`dirname \"$PRG\"`/" >/dev/null 24 | APP_HOME="`pwd -P`" 25 | cd "$SAVED" >/dev/null 26 | 27 | APP_NAME="Gradle" 28 | APP_BASE_NAME=`basename "$0"` 29 | 30 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 31 | DEFAULT_JVM_OPTS="" 32 | 33 | # Use the maximum available, or set MAX_FD != -1 to use that value. 34 | MAX_FD="maximum" 35 | 36 | warn ( ) { 37 | echo "$*" 38 | } 39 | 40 | die ( ) { 41 | echo 42 | echo "$*" 43 | echo 44 | exit 1 45 | } 46 | 47 | # OS specific support (must be 'true' or 'false'). 48 | cygwin=false 49 | msys=false 50 | darwin=false 51 | nonstop=false 52 | case "`uname`" in 53 | CYGWIN* ) 54 | cygwin=true 55 | ;; 56 | Darwin* ) 57 | darwin=true 58 | ;; 59 | MINGW* ) 60 | msys=true 61 | ;; 62 | NONSTOP* ) 63 | nonstop=true 64 | ;; 65 | esac 66 | 67 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 68 | 69 | # Determine the Java command to use to start the JVM. 70 | if [ -n "$JAVA_HOME" ] ; then 71 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 72 | # IBM's JDK on AIX uses strange locations for the executables 73 | JAVACMD="$JAVA_HOME/jre/sh/java" 74 | else 75 | JAVACMD="$JAVA_HOME/bin/java" 76 | fi 77 | if [ ! -x "$JAVACMD" ] ; then 78 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 79 | 80 | Please set the JAVA_HOME variable in your environment to match the 81 | location of your Java installation." 82 | fi 83 | else 84 | JAVACMD="java" 85 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 86 | 87 | Please set the JAVA_HOME variable in your environment to match the 88 | location of your Java installation." 
89 | fi 90 | 91 | # Increase the maximum file descriptors if we can. 92 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then 93 | MAX_FD_LIMIT=`ulimit -H -n` 94 | if [ $? -eq 0 ] ; then 95 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then 96 | MAX_FD="$MAX_FD_LIMIT" 97 | fi 98 | ulimit -n $MAX_FD 99 | if [ $? -ne 0 ] ; then 100 | warn "Could not set maximum file descriptor limit: $MAX_FD" 101 | fi 102 | else 103 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" 104 | fi 105 | fi 106 | 107 | # For Darwin, add options to specify how the application appears in the dock 108 | if $darwin; then 109 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" 110 | fi 111 | 112 | # For Cygwin, switch paths to Windows format before running java 113 | if $cygwin ; then 114 | APP_HOME=`cygpath --path --mixed "$APP_HOME"` 115 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` 116 | JAVACMD=`cygpath --unix "$JAVACMD"` 117 | 118 | # We build the pattern for arguments to be converted via cygpath 119 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` 120 | SEP="" 121 | for dir in $ROOTDIRSRAW ; do 122 | ROOTDIRS="$ROOTDIRS$SEP$dir" 123 | SEP="|" 124 | done 125 | OURCYGPATTERN="(^($ROOTDIRS))" 126 | # Add a user-defined pattern to the cygpath arguments 127 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then 128 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" 129 | fi 130 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 131 | i=0 132 | for arg in "$@" ; do 133 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` 134 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option 135 | 136 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition 137 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` 138 | else 139 | eval `echo args$i`="\"$arg\"" 140 | fi 141 | i=$((i+1)) 142 | done 143 | case $i in 144 | (0) set -- ;; 145 | (1) 
set -- "$args0" ;; 146 | (2) set -- "$args0" "$args1" ;; 147 | (3) set -- "$args0" "$args1" "$args2" ;; 148 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;; 149 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; 150 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; 151 | (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; 152 | (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; 153 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 154 | esac 155 | fi 156 | 157 | # Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules 158 | function splitJvmOpts() { 159 | JVM_OPTS=("$@") 160 | } 161 | eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS 162 | JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME" 163 | 164 | # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong 165 | if [[ "$(uname)" == "Darwin" ]] && [[ "$HOME" == "$PWD" ]]; then 166 | cd "$(dirname "$0")" 167 | fi 168 | 169 | exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@" 170 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @if "%DEBUG%" == "" @echo off 2 | @rem ########################################################################## 3 | @rem 4 | @rem Gradle startup script for Windows 5 | @rem 6 | @rem ########################################################################## 7 | 8 | @rem Set local scope for the variables with windows NT shell 9 | if "%OS%"=="Windows_NT" setlocal 10 | 11 | set DIRNAME=%~dp0 12 | if "%DIRNAME%" == "" set DIRNAME=. 13 | set APP_BASE_NAME=%~n0 14 | set APP_HOME=%DIRNAME% 15 | 16 | @rem Add default JVM options here. 
You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 17 | set DEFAULT_JVM_OPTS= 18 | 19 | @rem Find java.exe 20 | if defined JAVA_HOME goto findJavaFromJavaHome 21 | 22 | set JAVA_EXE=java.exe 23 | %JAVA_EXE% -version >NUL 2>&1 24 | if "%ERRORLEVEL%" == "0" goto init 25 | 26 | echo. 27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 28 | echo. 29 | echo Please set the JAVA_HOME variable in your environment to match the 30 | echo location of your Java installation. 31 | 32 | goto fail 33 | 34 | :findJavaFromJavaHome 35 | set JAVA_HOME=%JAVA_HOME:"=% 36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 37 | 38 | if exist "%JAVA_EXE%" goto init 39 | 40 | echo. 41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 42 | echo. 43 | echo Please set the JAVA_HOME variable in your environment to match the 44 | echo location of your Java installation. 45 | 46 | goto fail 47 | 48 | :init 49 | @rem Get command-line arguments, handling Windows variants 50 | 51 | if not "%OS%" == "Windows_NT" goto win9xME_args 52 | 53 | :win9xME_args 54 | @rem Slurp the command line arguments. 55 | set CMD_LINE_ARGS= 56 | set _SKIP=2 57 | 58 | :win9xME_args_slurp 59 | if "x%~1" == "x" goto execute 60 | 61 | set CMD_LINE_ARGS=%* 62 | 63 | :execute 64 | @rem Setup the command line 65 | 66 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 67 | 68 | @rem Execute Gradle 69 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 70 | 71 | :end 72 | @rem End local scope for the variables with windows NT shell 73 | if "%ERRORLEVEL%"=="0" goto mainEnd 74 | 75 | :fail 76 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 77 | rem the _cmd.exe /c_ return code! 
78 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 79 | exit /b 1 80 | 81 | :mainEnd 82 | if "%OS%"=="Windows_NT" endlocal 83 | 84 | :omega 85 | -------------------------------------------------------------------------------- /models/README.md: -------------------------------------------------------------------------------- 1 | #算法模型 放倒当前目录下 2 | 模型列表 3 | cws_dat.bin 4 | cws_label.txt 5 | cws_model.bin 6 | idiom.dat 7 | model_c_dat.bin 8 | model_c_label.txt 9 | model_c_model.bin 10 | model_w 11 | neg.dat 12 | ns.dat 13 | singlepun.dat 14 | t2s.dat 15 | time.dat 16 | vD.dat 17 | vM.dat 18 | xu.dat -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'elasticsearch-thulac-plugin' 2 | 3 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/ThulacAnalyzer.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index; 2 | 3 | import org.apache.lucene.analysis.Analyzer; 4 | import org.elasticsearch.thulac.Configuration; 5 | 6 | /** 7 | * Created by micro on 2017-12-17. 
8 | */ 9 | public class ThulacAnalyzer extends Analyzer { 10 | 11 | private Configuration configuration; 12 | 13 | public ThulacAnalyzer(Configuration configuration) { 14 | this.configuration = configuration; 15 | } 16 | 17 | @Override 18 | protected TokenStreamComponents createComponents(String fieldName) { 19 | return new TokenStreamComponents(new ThulacTokenizer(configuration)); 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/ThulacAnalyzerProvider.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index; 2 | 3 | import org.elasticsearch.common.settings.Settings; 4 | import org.elasticsearch.env.Environment; 5 | import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider; 6 | import org.elasticsearch.thulac.Configuration; 7 | 8 | /** 9 | * Created by micro on 2017-12-17. 10 | */ 11 | public class ThulacAnalyzerProvider extends AbstractIndexAnalyzerProvider { 12 | 13 | private ThulacAnalyzer thulacAnalyzer; 14 | 15 | /** 16 | * Constructs a new analyzer component, with the index name and its settings and the analyzer name. 
17 | * 18 | * @param indexSettings the settings and the name of the index 19 | * @param name The analyzer name 20 | * @param settings 21 | */ 22 | public ThulacAnalyzerProvider(IndexSettings indexSettings, Environment environment, String name, Settings settings) { 23 | super(indexSettings, name, settings); 24 | Configuration configuration = new Configuration(environment,indexSettings, settings); 25 | thulacAnalyzer = new ThulacAnalyzer(configuration); 26 | } 27 | 28 | @Override 29 | public ThulacAnalyzer get() { 30 | return thulacAnalyzer; 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/ThulacTokenizer.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index; 2 | 3 | import org.apache.lucene.analysis.Tokenizer; 4 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 5 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 6 | import org.elasticsearch.thulac.Configuration; 7 | import org.elasticsearch.thulac.ThulacLiteTokenizerScanner; 8 | import org.thunlp.thulac.data.TaggedWord; 9 | 10 | import java.io.IOException; 11 | 12 | /** 13 | * Created by micro on 2017-12-17. 
14 | */ 15 | public class ThulacTokenizer extends Tokenizer { 16 | 17 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); 18 | private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); 19 | // private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); 20 | private ThulacLiteTokenizerScanner scanner; 21 | private int endPosition; 22 | 23 | 24 | public ThulacTokenizer(Configuration configuration) { 25 | try { 26 | scanner = new ThulacLiteTokenizerScanner(configuration); 27 | } catch (IOException e) { 28 | throw new IllegalArgumentException("thulac configuration error", e); 29 | } 30 | } 31 | 32 | @Override 33 | public boolean incrementToken() { 34 | clearAttributes(); 35 | if (scanner.hasNext()) { 36 | TaggedWord token = scanner.next(); 37 | termAtt.append(token.word); 38 | termAtt.setLength(token.word.length()); 39 | offsetAtt.setOffset(token.startOffset, token.endOffset); 40 | endPosition = token.endOffset; 41 | return true; 42 | } 43 | return false; 44 | } 45 | 46 | 47 | @Override 48 | public final void end() throws IOException { 49 | super.end(); 50 | int finalOffset = correctOffset(this.endPosition); 51 | offsetAtt.setOffset(finalOffset, finalOffset); 52 | } 53 | 54 | // @Override 55 | // public void close() throws IOException { 56 | // super.close(); 57 | // scanner.reset(input); 58 | // } 59 | 60 | @Override 61 | public void reset() throws IOException { 62 | super.reset(); 63 | scanner.reset(input); 64 | } 65 | 66 | 67 | } 68 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/ThulacTokenizerFactory.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index; 2 | 3 | import org.apache.lucene.analysis.Tokenizer; 4 | import org.elasticsearch.common.settings.Settings; 5 | import org.elasticsearch.env.Environment; 6 | import 
org.elasticsearch.index.analysis.AbstractTokenizerFactory; 7 | import org.elasticsearch.thulac.Configuration; 8 | 9 | /** 10 | * Created by micro on 2017-12-17. 11 | */ 12 | public class ThulacTokenizerFactory extends AbstractTokenizerFactory { 13 | 14 | private Configuration configuration; 15 | 16 | public ThulacTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { 17 | super(indexSettings, settings, name); 18 | configuration = new Configuration(environment, indexSettings, settings); 19 | } 20 | 21 | @Override 22 | public Tokenizer create() { 23 | return new ThulacTokenizer(configuration); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/plugin/analysis/ThulacAnalysisPlugin.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.plugin.analysis; 2 | 3 | import org.apache.lucene.analysis.Analyzer; 4 | import org.elasticsearch.index.ThulacAnalyzerProvider; 5 | import org.elasticsearch.index.ThulacTokenizerFactory; 6 | import org.elasticsearch.index.analysis.AnalyzerProvider; 7 | import org.elasticsearch.index.analysis.TokenizerFactory; 8 | import org.elasticsearch.indices.analysis.AnalysisModule; 9 | import org.elasticsearch.plugins.AnalysisPlugin; 10 | import org.elasticsearch.plugins.Plugin; 11 | 12 | import java.util.HashMap; 13 | import java.util.Map; 14 | 15 | /** 16 | * @author Microbun on 2017/12/17. 
17 | */ 18 | public class ThulacAnalysisPlugin extends Plugin implements AnalysisPlugin { 19 | 20 | public Map> getTokenizers() { 21 | Map> extra = new HashMap<>(); 22 | extra.put("thulac", ThulacTokenizerFactory::new); 23 | return extra; 24 | } 25 | 26 | public Map>> getAnalyzers() { 27 | Map>> extra = new HashMap<>(); 28 | extra.put("thulac", ThulacAnalyzerProvider::new); 29 | return extra; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/thulac/Configuration.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.thulac; 2 | 3 | import org.apache.logging.log4j.Logger; 4 | import org.elasticsearch.common.logging.Loggers; 5 | import org.elasticsearch.common.settings.Settings; 6 | import org.elasticsearch.env.Environment; 7 | import org.elasticsearch.index.IndexSettings; 8 | 9 | import java.nio.file.FileSystems; 10 | import java.nio.file.Path; 11 | 12 | /** 13 | * Created by micro on 2017-12-17. 
14 | */ 15 | public class Configuration { 16 | 17 | String userDict = "userdict.txt"; 18 | boolean t2s = false; 19 | boolean segOnly = true; 20 | boolean filter = false; 21 | Path modelPath = FileSystems.getDefault().getPath("models/"); 22 | private Environment environment; 23 | private IndexSettings indexSettings; 24 | private Settings settings; 25 | private Logger logger = Loggers.getLogger(getClass(),"thulac"); 26 | 27 | public Configuration() { 28 | } 29 | 30 | public Configuration(Environment environment, IndexSettings indexSettings, Settings settings) { 31 | this.environment = environment; 32 | this.indexSettings = indexSettings; 33 | this.settings = settings; 34 | userDict = settings.get("user_dict", "userdict.txt"); 35 | t2s = settings.getAsBoolean("t2s", true); 36 | // segOnly = settings.getAsBoolean("seg_only", true); 37 | filter = settings.getAsBoolean("filter", false); 38 | modelPath = environment.pluginsFile().resolve("thulac/models"); 39 | // logger.info("thulac settings: path={}", modelPath.toAbsolutePath().toString()); 40 | // logger.info("thulac settings: user_dict={} use_t2s={} seg_only={} use_filter={} ", userDict, t2s, segOnly, useFilter); 41 | } 42 | 43 | public Environment getEnvironment() { 44 | return environment; 45 | } 46 | 47 | public IndexSettings getIndexSettings() { 48 | return indexSettings; 49 | } 50 | 51 | public Settings getSettings() { 52 | return settings; 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/thulac/ThulacLiteSegment.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.thulac; 2 | 3 | import org.apache.logging.log4j.Logger; 4 | import org.elasticsearch.common.logging.Loggers; 5 | import org.elasticsearch.thulac.postprocess.*; 6 | import org.elasticsearch.thulac.preprocess.ConvertT2SPassBuilder; 7 | import org.elasticsearch.thulac.preprocess.PreProcessPassBuilder; 8 | 
import org.thunlp.thulac.cb.CBTaggingDecoder; 9 | import org.thunlp.thulac.data.POCGraph; 10 | import org.thunlp.thulac.data.TaggedWord; 11 | import org.thunlp.thulac.postprocess.IPostprocessPass; 12 | import org.thunlp.thulac.preprocess.IPreprocessPass; 13 | 14 | import java.io.IOException; 15 | import java.nio.file.Files; 16 | import java.nio.file.Path; 17 | import java.nio.file.Paths; 18 | import java.util.ArrayList; 19 | import java.util.HashMap; 20 | import java.util.List; 21 | import java.util.Map; 22 | import java.util.concurrent.ConcurrentHashMap; 23 | 24 | public class ThulacLiteSegment { 25 | 26 | private static final Map decoder = new HashMap<>(); 27 | 28 | private static final Map cache = new ConcurrentHashMap<>(); 29 | ; 30 | private CBTaggingDecoder taggingDecoder; 31 | // preprocess passes 32 | private List pre = new ArrayList<>(); 33 | // postprocess passes 34 | private List post = new ArrayList<>(); 35 | 36 | private ThulacLiteSegment(Configuration configuration) throws IOException { 37 | synchronized (decoder) { 38 | init(configuration); 39 | } 40 | } 41 | 42 | public static ThulacLiteSegment getInstance(Configuration configuration) throws IOException { 43 | ThulacLiteSegment segment; 44 | if (cache.containsKey(configuration)) { 45 | segment = cache.get(configuration); 46 | } else { 47 | segment = new ThulacLiteSegment(configuration); 48 | cache.put(configuration, segment); 49 | } 50 | return segment; 51 | } 52 | 53 | private void init(Configuration configuration) throws IOException { 54 | // segmentation 55 | // load model 56 | String prefix = configuration.segOnly ? "cws_" : "model_c_"; 57 | if (!decoder.containsKey(prefix)) { 58 | CBTaggingDecoder temp = new CBTaggingDecoder(); 59 | temp.threshold = configuration.segOnly ? 
0 : 10000; 60 | temp.loadFiles( 61 | join(configuration.modelPath, prefix + "model.bin"), 62 | join(configuration.modelPath, prefix + "dat.bin"), 63 | join(configuration.modelPath, prefix + "label.txt")); 64 | temp.setLabelTrans(); 65 | decoder.put(prefix, temp); 66 | } 67 | taggingDecoder = decoder.get(prefix); 68 | 69 | //pre pass 70 | pre.add(PreProcessPassBuilder.getInstance()); 71 | if (configuration.t2s) { 72 | pre.add(ConvertT2SPassBuilder.getInstance(join(configuration.modelPath, "t2s.dat"))); 73 | } 74 | 75 | //post pass 76 | post.add(DictionaryPassBuilder.getInstance(join(configuration.modelPath, "ns.dat"), "ns", false)); 77 | post.add(DictionaryPassBuilder.getInstance(join(configuration.modelPath, "idiom.dat"), "i", false)); 78 | post.add(DictionaryPassBuilder.getInstance(join(configuration.modelPath, "singlepun.dat"), "w", false)); 79 | post.add(TimeWordPassBuilder.getInstance()); 80 | post.add(DoubleWordPassBuilder.getInstance()); 81 | post.add(SpecialPassBuilder.getInstance()); 82 | post.add(NegWordPassBuilder.getInstance(join(configuration.modelPath, "neg.dat"))); 83 | if (configuration.userDict != null) { 84 | String path = configuration.userDict; 85 | if (!Paths.get(path).isAbsolute()) { 86 | path = join(configuration.modelPath, configuration.userDict); 87 | } 88 | if (Files.exists(Paths.get(path))) { 89 | post.add(DictionaryPassBuilder.getInstance(path, "uw", true)); 90 | } else { 91 | if (!configuration.userDict.equals("userdict.txt")) { 92 | throw new IllegalArgumentException("not exists user_dict[" + path + "]"); 93 | } 94 | } 95 | } 96 | } 97 | 98 | 99 | private String join(Path path, String... 
more) { 100 | return Paths.get(path.toAbsolutePath().toString(), more).toAbsolutePath().toString(); 101 | } 102 | 103 | public List segment(String raw) { 104 | List words = new ArrayList<>(); 105 | POCGraph graph = new POCGraph(); 106 | for (IPreprocessPass pass : pre) { 107 | raw = pass.process(raw, graph); 108 | } 109 | taggingDecoder.segment(raw, graph, words); 110 | for (IPostprocessPass pass : post) { 111 | pass.process(words); 112 | } 113 | return words; 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/thulac/ThulacLiteTokenizerScanner.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.thulac; 2 | 3 | 4 | import org.thunlp.thulac.data.TaggedWord; 5 | 6 | import java.io.IOException; 7 | import java.io.Reader; 8 | import java.util.Iterator; 9 | import java.util.List; 10 | 11 | /** 12 | * Created by micro on 2017-12-17. 13 | */ 14 | public class ThulacLiteTokenizerScanner implements Iterator { 15 | 16 | 17 | // private Logger logger; 18 | private ThulacLiteSegment segment; 19 | private Iterator tokens; 20 | 21 | public ThulacLiteTokenizerScanner(Configuration configuration) throws IOException { 22 | // logger = Loggers.getLogger(getClass(), configuration.getSettings()); 23 | segment = ThulacLiteSegment.getInstance(configuration); 24 | } 25 | 26 | @Override 27 | public boolean hasNext() { 28 | return tokens.hasNext(); 29 | } 30 | 31 | @Override 32 | public TaggedWord next() { 33 | return tokens.next(); 34 | } 35 | 36 | @Override 37 | public void remove() { 38 | tokens.remove(); 39 | } 40 | 41 | public void reset(Reader reader) { 42 | String raw; 43 | try { 44 | StringBuilder bdr = new StringBuilder(); 45 | int size = 1024; 46 | char[] buf = new char[size]; 47 | while ((size = reader.read(buf, 0, size)) != -1) { 48 | bdr.append(new String(buf, 0, size)); 49 | } 50 | raw = bdr.toString(); 51 | List words = 
segment.segment(raw);
            tokens = words.iterator();
        } catch (IOException e) {
            // Fail fast instead of printStackTrace(): swallowing the error left
            // 'tokens' null or stale and surfaced later as an NPE in hasNext().
            throw new java.io.UncheckedIOException(e);
        }
    }
}
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/thulac/postprocess/DictionaryPassBuilder.java:
--------------------------------------------------------------------------------
package org.elasticsearch.thulac.postprocess;

import org.thunlp.thulac.postprocess.DictionaryPass;

import java.io.IOException;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

/** Caches {@link DictionaryPass} instances keyed by (file, tag, isTxt). */
public class DictionaryPassBuilder {

    private static Map<String, DictionaryPass> cache = new ConcurrentHashMap<>();

    private DictionaryPassBuilder() {
    }

    public static DictionaryPass getInstance(String dictFile, String tag, boolean isTxt) throws IOException {
        String key = dictFile + "#" + tag + "#" + isTxt;
        DictionaryPass existing = cache.get(key);
        if (existing == null) {
            // putIfAbsent keeps exactly one cached instance if two threads race here
            DictionaryPass created = new DictionaryPass(dictFile, tag, isTxt);
            DictionaryPass raced = cache.putIfAbsent(key, created);
            existing = raced != null ? raced : created;
        }
        return existing;
    }
}
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/thulac/postprocess/DoubleWordPassBuilder.java:
--------------------------------------------------------------------------------
package org.elasticsearch.thulac.postprocess;

import org.thunlp.thulac.postprocess.DoubleWordPass;

/** Shares a single stateless {@link DoubleWordPass} instance. */
public class DoubleWordPassBuilder {

    private static DoubleWordPass instance = new DoubleWordPass();

    private DoubleWordPassBuilder() {
    }

    public static DoubleWordPass getInstance() {
        return instance;
    }
}
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/thulac/postprocess/FilterPassBuilder.java:
--------------------------------------------------------------------------------
package org.elasticsearch.thulac.postprocess;

import org.thunlp.thulac.postprocess.FilterPass;

import java.io.IOException;
import
java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

/** Caches {@link FilterPass} instances keyed by the pair of data files. */
public class FilterPassBuilder {

    private static Map<String, FilterPass> cache = new ConcurrentHashMap<>();

    private FilterPassBuilder() {
    }

    public static FilterPass getInstance(String xuDatFile, String timeDatFile) throws IOException {
        String key = xuDatFile + "#" + timeDatFile;
        FilterPass existing = cache.get(key);
        if (existing == null) {
            // putIfAbsent keeps exactly one cached instance if two threads race here
            FilterPass created = new FilterPass(xuDatFile, timeDatFile);
            FilterPass raced = cache.putIfAbsent(key, created);
            existing = raced != null ? raced : created;
        }
        return existing;
    }
}
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/thulac/postprocess/NegWordPassBuilder.java:
--------------------------------------------------------------------------------
package org.elasticsearch.thulac.postprocess;

import org.thunlp.thulac.postprocess.NegWordPass;

import java.io.IOException;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

/** Caches {@link NegWordPass} instances keyed by data file. */
public class NegWordPassBuilder {

    private static Map<String, NegWordPass> cache = new ConcurrentHashMap<>();

    private NegWordPassBuilder() {
    }

    public static NegWordPass getInstance(String negDatFile) throws IOException {
        NegWordPass existing = cache.get(negDatFile);
        if (existing == null) {
            // putIfAbsent keeps exactly one cached instance if two threads race here
            NegWordPass created = new NegWordPass(negDatFile);
            NegWordPass raced = cache.putIfAbsent(negDatFile, created);
            existing = raced != null ? raced : created;
        }
        return existing;
    }
}
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/thulac/postprocess/SpecialPassBuilder.java:
--------------------------------------------------------------------------------
package org.elasticsearch.thulac.postprocess;

import org.thunlp.thulac.postprocess.SpecialPass;

/** Shares a single stateless {@link SpecialPass} instance. */
public class SpecialPassBuilder {

    private static SpecialPass instance = new SpecialPass();

    private SpecialPassBuilder() {
    }

    public static SpecialPass getInstance() {
        return instance;
    }
}
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/thulac/postprocess/TimeWordPassBuilder.java:
-------------------------------------------------------------------------------- 1 | package org.elasticsearch.thulac.postprocess; 2 | 3 | import org.thunlp.thulac.postprocess.TimeWordPass; 4 | 5 | public class TimeWordPassBuilder { 6 | private static TimeWordPass instance = new TimeWordPass(); 7 | 8 | public static TimeWordPass getInstance() { 9 | return instance; 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/thulac/postprocess/VerbPassBuilder.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.thulac.postprocess; 2 | 3 | import org.thunlp.thulac.postprocess.VerbPass; 4 | 5 | import java.io.IOException; 6 | import java.util.Map; 7 | import java.util.concurrent.ConcurrentHashMap; 8 | 9 | public class VerbPassBuilder { 10 | private static Map cache = new ConcurrentHashMap<>(); 11 | 12 | public static VerbPass getInstance(String vMFile, String vDFile) throws IOException { 13 | String key = vMFile + "#" + vDFile; 14 | if (!cache.containsKey(key)) { 15 | cache.put(key, new VerbPass(vMFile, vDFile)); 16 | } 17 | return cache.get(key); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/thulac/preprocess/ConvertT2SPassBuilder.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.thulac.preprocess; 2 | 3 | import org.thunlp.thulac.preprocess.ConvertT2SPass; 4 | 5 | import java.io.IOException; 6 | import java.util.Map; 7 | import java.util.concurrent.ConcurrentHashMap; 8 | 9 | public class ConvertT2SPassBuilder { 10 | 11 | private static Map cache = new ConcurrentHashMap<>(); 12 | 13 | public static ConvertT2SPass getInstance(String file) throws IOException { 14 | String key = file; 15 | if (!cache.containsKey(key)) { 16 | cache.put(key, new ConvertT2SPass(file)); 17 | } 18 | return 
cache.get(key); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/thulac/preprocess/PreProcessPassBuilder.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.thulac.preprocess; 2 | 3 | import org.thunlp.thulac.preprocess.PreProcessPass; 4 | 5 | public class PreProcessPassBuilder { 6 | 7 | private static PreProcessPass instance = new PreProcessPass(); 8 | 9 | public static PreProcessPass getInstance() { 10 | return instance; 11 | } 12 | 13 | 14 | } 15 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/Thulac.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac; 2 | 3 | import org.thunlp.thulac.cb.CBTaggingDecoder; 4 | import org.thunlp.thulac.data.POCGraph; 5 | import org.thunlp.thulac.data.TaggedWord; 6 | import org.thunlp.thulac.io.IInputProvider; 7 | import org.thunlp.thulac.io.IOutputHandler; 8 | import org.thunlp.thulac.io.StringOutputHandler; 9 | import org.thunlp.thulac.postprocess.DictionaryPass; 10 | import org.thunlp.thulac.postprocess.DoubleWordPass; 11 | import org.thunlp.thulac.postprocess.FilterPass; 12 | import org.thunlp.thulac.postprocess.IPostprocessPass; 13 | import org.thunlp.thulac.postprocess.NegWordPass; 14 | import org.thunlp.thulac.postprocess.SpecialPass; 15 | import org.thunlp.thulac.postprocess.TimeWordPass; 16 | import org.thunlp.thulac.preprocess.ConvertT2SPass; 17 | import org.thunlp.thulac.preprocess.IPreprocessPass; 18 | import org.thunlp.thulac.preprocess.PreProcessPass; 19 | import org.thunlp.thulac.util.IOUtils; 20 | 21 | import java.io.File; 22 | import java.io.FileNotFoundException; 23 | import java.io.IOException; 24 | import java.util.ArrayList; 25 | import java.util.List; 26 | import java.util.Vector; 27 | 28 | /** 29 | * The central class which acts as 
core of the THULAC API. It provides several
 * convenient methods that make things easier for users.
 */
public class Thulac {
	/**
	 * Run the segmentation program with argument {@code segOnly}, taking input from the
	 * given {@link String} and return the segmented output as a {@link String}.
	 *
	 * @param input
	 * 		The input {@link String}.
	 * @param segOnly
	 * 		Whether to output only segments.
	 *
	 * @return The segmented output as a {@link String}.
	 *
	 * @throws java.io.IOException
	 * 		If one of the model files fails to load.
	 */
	public static String split(String input, boolean segOnly) throws IOException {
		StringOutputHandler outputProvider = IOUtils.outputToString();
		IInputProvider inputProvider = IOUtils.inputFromString(input);
		split(inputProvider, outputProvider, segOnly);
		return outputProvider.getString();
	}

	/**
	 * Run the segmentation program with argument {@code segOnly}, taking input from the
	 * file named {@code inputFile} and writing the segmented output to the file named
	 * {@code outputFile}. This method returns directly if either {@code inputFile} or
	 * {@code outputFile} is null.
	 *
	 * @param inputFile
	 * 		The name of the input file.
	 * @param outputFile
	 * 		The name of the output file.
	 * @param segOnly
	 * 		Whether to output only segments.
	 *
	 * @throws java.io.IOException
	 * 		If one of the model files fails to load or either the input file or the output
	 * 		file is {@code null}.
	 */
	public static void split(String inputFile, String outputFile, boolean segOnly)
			throws IOException {
		if (inputFile == null || outputFile == null) return;
		IInputProvider input = IOUtils.inputFromFile(inputFile);
		IOutputHandler output = IOUtils.outputToFile(outputFile);
		split(input, output, segOnly);
	}

	/**
	 * Run the segmentation program with argument {@code segOnly}, taking input from the
	 * given {@link java.io.File} and writing the segmented output to the given
	 * {@link java.io.File}.
	 *
	 * @param input
	 * 		The input {@link java.io.File}.
	 * @param output
	 * 		The output {@link java.io.File}.
	 * @param segOnly
	 * 		Whether to output only segments.
	 *
	 * @throws java.io.IOException
	 * 		If one of the model files fails to load or either the input file or the output
	 * 		file is {@code null}.
	 */
	public static void split(File input, File output, boolean segOnly)
			throws IOException {
		if (input == null) throw new FileNotFoundException("input == null!");
		if (output == null) throw new FileNotFoundException("output == null!");
		IInputProvider inputProvider = IOUtils.inputFromFile(input);
		IOutputHandler outputHandler = IOUtils.outputToFile(output);
		split(inputProvider, outputHandler, segOnly);
	}

	/**
	 * Run the segmentation program with argument {@code segOnly} and default values
	 * for all others.
	 *
	 * @param input
	 * 		The {@link IInputProvider} instance to provide input.
	 * @param output
	 * 		The {@link IOutputHandler} instance to handle output.
	 * @param segOnly
	 * 		Whether to output only segments.
	 *
	 * @throws java.io.IOException
	 * 		If I/O of either {@code input}, {@code output} or one of the model files
	 * 		resulted in an exception.
	 */
	public static void split(IInputProvider input, IOutputHandler output, boolean segOnly)
			throws IOException {
		split("models/", '_', null, false, segOnly, false, input, output);
	}

	/**
	 * Run the segmentation program with full arguments.
	 *
	 * @param modelDir
	 * 		The directory under which the model files are located.
	 * @param separator
	 * 		The separator to use to separate words and tags.
	 * @param userDict
	 * 		The optional file name of the user-specified dictionary.
	 * @param useT2S
	 * 		Whether to transfer traditional Chinese to simplified Chinese before
	 * 		segmentation.
	 * @param segOnly
	 * 		Whether to output only segments.
	 * @param useFilter
	 * 		Whether to use filters while processing.
	 * @param input
	 * 		The {@link IInputProvider} instance to provide input.
	 * @param output
	 * 		The {@link IOutputHandler} instance to handle output.
	 *
	 * @throws java.io.IOException
	 * 		If I/O of either {@code input}, {@code output} or one of the model files
	 * 		resulted in an exception.
	 */
	public static void split(
			String modelDir, char separator, String userDict,
			boolean useT2S, boolean segOnly, boolean useFilter,
			IInputProvider input, IOutputHandler output) throws IOException {
		try {
			input.onProgramStart();
			output.onProgramStart();

			// segmentation: "cws_*" models segment only, "model_c_*" also POS-tag
			CBTaggingDecoder taggingDecoder = new CBTaggingDecoder();
			taggingDecoder.threshold = segOnly ? 0 : 10000;
			String prefix = modelDir + (segOnly ? "cws_" : "model_c_");
			taggingDecoder.loadFiles(prefix + "model.bin",
					prefix + "dat.bin",
					prefix + "label.txt");
			taggingDecoder.setLabelTrans();

			// preprocess passes
			List<IPreprocessPass> pre = new ArrayList<>();
			pre.add(new PreProcessPass());
			if (useT2S) pre.add(new ConvertT2SPass(modelDir + "t2s.dat"));

			// postprocess passes
			List<IPostprocessPass> post = new ArrayList<>();
			post.add(new DictionaryPass(modelDir + "ns.dat", "ns", false));
			post.add(new DictionaryPass(modelDir + "idiom.dat", "i", false));
			post.add(new DictionaryPass(modelDir + "singlepun.dat", "w", false));
			post.add(new TimeWordPass());
			post.add(new DoubleWordPass());
			post.add(new SpecialPass());
			post.add(new NegWordPass(modelDir + "neg.dat"));
			if (userDict != null) post.add(new DictionaryPass(userDict, "uw", true));
			if (useFilter)
				post.add(new FilterPass(modelDir + "xu.dat", modelDir + "time.dat"));

			// main loop; ArrayList instead of the legacy synchronized Vector —
			// the list is confined to this thread
			List<TaggedWord> words = new ArrayList<>();
			POCGraph graph = new POCGraph();
			for (List<String> lineSegments = input.provideInput();
				 lineSegments != null;
				 lineSegments = input.provideInput()) {
				output.handleLineStart();
				for (String raw : lineSegments) {
					for (IPreprocessPass pass : pre) raw = pass.process(raw, graph);
					taggingDecoder.segment(raw, graph, words);
					for (IPostprocessPass pass : post) pass.process(words);

					output.handleLineSegment(words, segOnly, separator);
				}
				output.handleLineEnd();
			}
		} finally { // close resources even when program crashes
			input.onProgramEnd();
			output.onProgramEnd();
		}
	}
}
--------------------------------------------------------------------------------
/src/main/java/org/thunlp/thulac/cb/AlphaBeta.java:
--------------------------------------------------------------------------------
package
org.thunlp.thulac.cb;


// a structure for alphas and betas used by the dynamic-programming decoder
public class AlphaBeta {

	public int value;
	public int nodeId;
	public int labelId;

	public AlphaBeta() {
		super();
		this.value = 0;
		this.nodeId = -2; // -2 marks "not yet computed"
		this.labelId = 0;
	}

	public AlphaBeta(int value, int nodeId, int labelId) {
		super();
		this.value = value;
		this.nodeId = nodeId;
		this.labelId = labelId;
	}

	/**
	 * Viterbi-style decode over the node lattice: fills {@code alphas} with the best
	 * score per (node, label), writes the best label per node into {@code result},
	 * and returns the best final score.
	 *
	 * BUG FIX: the original "all labels allowed" branches used
	 * {@code while (k != l_size) { ... if (preAlpha.nodeId == -2) continue; ... k++; }},
	 * where the {@code continue} skipped the trailing {@code k++} and spun forever
	 * whenever an uninitialized alpha (-2) was encountered. Rewritten as {@code for}
	 * loops so {@code continue} still increments {@code k}.
	 */
	public static int dbDecode(
			int l_size, int[] llWeights, int nodeCount, Node[] nodes, int[] values,
			AlphaBeta[] alphas,
			int[] result, int[][] preLabels, int[][] allowedLabelLists) {
		int nodeId;
		int[] pNodeId;
		int[] pPreLabel;
		int[] pAllowedLabel;
		int k;
		int j;
		AlphaBeta tmp;
		AlphaBeta best = new AlphaBeta();
		best.nodeId = -1;
		AlphaBeta preAlpha;

		int score;
		int index = 0;
		int index2 = 0;
		int index3 = 0;

		// reset all alphas to "not computed"
		for (int i = 0; i < nodeCount * l_size; i++) {
			alphas[i] = new AlphaBeta();
			alphas[i].nodeId = -2;
		}
		for (int i = 0; i < nodeCount; i++) {
			pAllowedLabel = allowedLabelLists != null ? allowedLabelLists[i] : null;
			j = -1;
			int maxValue = 0;
			boolean hasMaxValue = false;
			if (pAllowedLabel != null) {
				// only the labels listed (terminated by -1) are considered for node i
				index = 0;
				while ((j = pAllowedLabel[index]) != -1) {
					index++;
					if (!hasMaxValue || (maxValue < values[i * l_size + j])) {
						hasMaxValue = true;
						maxValue = values[i * l_size + j];
					}
				}
				index = 0;
				j = -1;
				while ((j = pAllowedLabel[index]) != -1) {
					index++;
					tmp = alphas[i * l_size + j];
					tmp.value = 0;
					pNodeId = nodes[i].predecessors;
					pPreLabel = preLabels != null ? preLabels[j] : null;
					index2 = 0;
					while ((nodeId = pNodeId[index2]) >= 0) {
						index2++;
						if (pPreLabel != null) {
							index3 = 0;
							while ((k = pPreLabel[index3]) != -1) {
								index3++;
								preAlpha = alphas[nodeId * l_size + k];
								if (preAlpha.nodeId == -2) continue;
								score = preAlpha.value + llWeights[k * l_size + j];
								if ((tmp.nodeId < 0) || (score > tmp.value)) {
									tmp.value = score;
									tmp.nodeId = nodeId;
									tmp.labelId = k;
								}
							}
						} else {
							for (k = 0; k < l_size; k++) { // BUG FIX: was a while loop whose continue skipped k++
								preAlpha = alphas[nodeId * l_size + k];
								if (preAlpha.nodeId == -2) continue;
								score = preAlpha.value + llWeights[k * l_size + j];
								if ((tmp.nodeId < 0) || (score > tmp.value)) {
									tmp.value = score;
									tmp.nodeId = nodeId;
									tmp.labelId = k;
								}
							}
						}
					}
					tmp.value += values[i * l_size + j];
					if ((nodes[i].type == 1) || (nodes[i].type == 3)) {
						tmp.nodeId = -1; // path start marker
					}
					if (nodes[i].type >= 2) { // possible path end: track global best
						if ((best.nodeId == -1) || best.value < tmp.value) {
							best.value = tmp.value;
							best.nodeId = i;
							best.labelId = j;
						}
					}
				}

			} else {
				// all labels 0..l_size-1 are allowed for node i
				j++;
				while (j != l_size) {
					if (!hasMaxValue || (maxValue < values[i * l_size + j])) {
						hasMaxValue = true;
						maxValue = values[i * l_size + j];
					}
					j++;
				}
				j = 0;
				while (j != l_size) {
					tmp = alphas[i * l_size + j];
					tmp.value = 0;
					pNodeId = nodes[i].predecessors;
					pPreLabel = preLabels != null ? preLabels[j] : null;
					index2 = 0;
					while ((nodeId = pNodeId[index2]) >= 0) {
						index2++;
						if (pPreLabel != null) {
							index3 = 0;
							while ((k = pPreLabel[index3]) != -1) {
								index3++;
								preAlpha = alphas[nodeId * l_size + k];
								if (preAlpha.nodeId == -2) continue;
								score = preAlpha.value + llWeights[k * l_size + j];
								if ((tmp.nodeId < 0) || (score > tmp.value)) {
									tmp.value = score;
									tmp.nodeId = nodeId;
									tmp.labelId = k;
								}
							}
						} else {
							for (k = 0; k < l_size; k++) { // BUG FIX: was a while loop whose continue skipped k++
								preAlpha = alphas[nodeId * l_size + k];
								if (preAlpha.nodeId == -2) continue;
								score = preAlpha.value + llWeights[k * l_size + j];
								if ((tmp.nodeId < 0) || (score > tmp.value)) {
									tmp.value = score;
									tmp.nodeId = nodeId;
									tmp.labelId = k;
								}
							}
						}
					}
					tmp.value += values[i * l_size + j];
					if ((nodes[i].type == 1) || (nodes[i].type == 3)) {
						tmp.nodeId = -1;
					}
					if (nodes[i].type >= 2) {
						if ((best.nodeId == -1) || best.value < tmp.value) {
							best.value = tmp.value;
							best.nodeId = i;
							best.labelId = j;
						}
					}
					j++;
				}

			}
		}
		// back-trace the best path into result
		tmp = best;
		while (tmp.nodeId >= 0) {
			result[tmp.nodeId] = tmp.labelId;
			tmp = alphas[tmp.nodeId * l_size + tmp.labelId];
		}
		return best.value;
	}
}
--------------------------------------------------------------------------------
/src/main/java/org/thunlp/thulac/cb/CBModel.java:
--------------------------------------------------------------------------------
package org.thunlp.thulac.cb;

import org.thunlp.thulac.util.BufferUtils;

import java.io.FileInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.IntBuffer;
import
java.nio.channels.FileChannel;

/** Weight model loaded from a little-endian binary file: header (l_size, f_size) then weights. */
public class CBModel {

	public int l_size; // size of the labels
	public int f_size; // size of the features

	public int[] ll_weights; // weights of (label, label)
	public int[] fl_weights; // weights of (feature, label)

	public CBModel(String filename) throws IOException {
		// try-with-resources: the original leaked the stream/channel if a read threw
		try (FileInputStream in = new FileInputStream(filename);
			 FileChannel channel = in.getChannel()) {

			ByteBuffer header = ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN);
			header.clear();
			channel.read(header);
			header.flip();
			IntBuffer intHeader = header.asIntBuffer();
			this.l_size = intHeader.get();
			this.f_size = intHeader.get();

			int llSize = this.l_size * this.l_size, flSize = this.l_size * this.f_size;
			this.ll_weights = new int[llSize];
			this.fl_weights = new int[flSize];
			ByteBuffer buf = ByteBuffer.allocate(64 * 1024).order(ByteOrder.LITTLE_ENDIAN);
			buf.clear();
			BufferUtils.readInts(channel, buf, this.ll_weights, this.fl_weights);
		}
	}
}
--------------------------------------------------------------------------------
/src/main/java/org/thunlp/thulac/cb/CBNGramFeature.java:
--------------------------------------------------------------------------------
package org.thunlp.thulac.cb;

import org.thunlp.thulac.data.Dat;

import java.util.Vector;

/** Computes n-gram feature scores per character position into the shared values array. */
public class CBNGramFeature {

	private static final int SENTENCE_BOUNDARY = '#';

	private int separator;
	private int maxLength;
	private int[] uniBases;
	private int[] biBases;
	private int[] values;
	private int datSize;
	private int[] dat;
	private CBModel model;

	public CBNGramFeature(Dat myDat, CBModel model, int[] values) {
		this.separator = ' ';
		this.datSize = myDat.datSize;
		this.dat = myDat.dat;
		this.model = model;
		this.maxLength = 20000;
		this.uniBases = new int[this.maxLength + 2];
		this.biBases = new int[this.maxLength + 4];
		this.values = values;
	}

	// Adds the feature weights of (base + del) to values[valueOffset..]; the l_size == 4
	// case is unrolled as a fast path.
	private void addValues(int valueOffset, int base, int del) {
		int ind = this.dat[base << 1] + del;
		if (ind >= this.datSize || this.dat[(ind << 1) + 1] != base) return;
		int offset = this.dat[ind << 1];
		int weightOffset = offset * this.model.l_size;
		if (this.model.l_size == 4) {
			this.values[valueOffset] += this.model.fl_weights[weightOffset];
			this.values[valueOffset + 1] += this.model.fl_weights[weightOffset + 1];
			this.values[valueOffset + 2] += this.model.fl_weights[weightOffset + 2];
			this.values[valueOffset + 3] += this.model.fl_weights[weightOffset + 3];
		} else for (int i = 0; i < this.model.l_size; i++) {
			this.values[valueOffset + i] += this.model.fl_weights[weightOffset + i];
		}
	}

	// Returns [uniBase, biBase] for the pair (ch1, ch2); -1 when a base is absent in dat.
	// ASCII printable characters are mapped to their full-width forms first.
	private Vector<Integer> findBases(int datSize, int ch1, int ch2) {
		Vector<Integer> result = new Vector<>();
		int uniBase;
		int biBase;
		if (ch1 > 32 && ch1 < 128) ch1 += 65248;
		if (ch2 > 32 && ch2 < 128) ch2 += 65248;
		if (ch1 >= datSize || this.dat[(ch1 << 1) + 1] != 0) {
			uniBase = -1;
			biBase = -1;
			result.clear();
			result.add(uniBase);
			result.add(biBase);
			return result;
		}
		uniBase = this.dat[ch1 << 1] + this.separator;
		int ind = this.dat[ch1 << 1] + ch2;
		if (ind >= datSize || this.dat[(ind << 1) + 1] != ch1) {
			biBase = -1;
			result.clear();
			result.add(uniBase);
			result.add(biBase);
			return result;
		}
		biBase = this.dat[ind << 1] + this.separator;
		result.clear();
		result.add(uniBase);
		result.add(biBase);
		return result;
	}

	/** Fills values for the first {@code len} chars of {@code sequence}; returns 0 on success. */
	public int putValues(String sequence, int len) {
		if (len >= this.maxLength) {
			System.err.println("Length larger than maxLength.");
			return 1;
		}

		Vector<Integer> result = this.findBases(this.datSize, SENTENCE_BOUNDARY,
				SENTENCE_BOUNDARY);
		this.uniBases[0] = result.get(0);
		this.biBases[0] = result.get(1);

		result = this.findBases(this.datSize, SENTENCE_BOUNDARY, sequence.charAt(0));
		// NOTE(review): uniBases[0] is re-assigned here (same '#' base as above, so the
		// value is identical); confirm against upstream whether uniBases[1] was intended.
		this.uniBases[0] = result.get(0);
		this.biBases[1] = result.get(1);
		for (int i = 0; i + 1 < len; i++) {
			result = this.findBases(this.datSize, sequence.charAt(i),
					sequence.charAt(i + 1));
			this.uniBases[i + 1] = result.get(0);
			this.biBases[i + 2] = result.get(1);
		}

		result = this.findBases(this.datSize, (int) sequence.charAt(len - 1),
				SENTENCE_BOUNDARY);
		this.uniBases[len] = result.get(0);
		this.biBases[len + 1] = result.get(1);

		result = this.findBases(this.datSize, SENTENCE_BOUNDARY, SENTENCE_BOUNDARY);
		this.uniBases[len + 1] = result.get(0);
		this.biBases[len + 2] = result.get(1);

		int base;
		for (int i = 0; i < len; i++) {
			int valueOffset = i * this.model.l_size;
			if ((base = this.uniBases[i + 1]) != -1) {
				this.addValues(valueOffset, base, 49);
			}
			if ((base = this.uniBases[i]) != -1) {
				this.addValues(valueOffset, base, 50);
			}
			if ((base = this.uniBases[i + 2]) != -1) {
				this.addValues(valueOffset, base, 51);
			}
			if ((base = this.biBases[i + 1]) != -1) {
				this.addValues(valueOffset, base, 49);
			}
			if ((base = this.biBases[i + 2]) != -1) {
				this.addValues(valueOffset, base, 50);
			}
			if ((base = this.biBases[i]) != -1) {
				this.addValues(valueOffset, base, 51);
			}
			if ((base = this.biBases[i + 3]) != -1) {
				this.addValues(valueOffset, base, 52);
			}
		}
		return 0;
	}
}
--------------------------------------------------------------------------------
/src/main/java/org/thunlp/thulac/cb/CBTaggingDecoder.java:
--------------------------------------------------------------------------------
package org.thunlp.thulac.cb;

import org.thunlp.thulac.data.Dat;
import org.thunlp.thulac.data.POCGraph;
import org.thunlp.thulac.data.TaggedWord;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.List;
import java.util.Vector;

/**
 * A character-based tagging decoder: scores each character/label pair with an
 * n-gram feature model ({@link CBNGramFeature}) and finds the best label sequence
 * with a dynamic-programming decode ({@link AlphaBeta#dbDecode}), producing a list
 * of {@link TaggedWord}.
 */
public class CBTaggingDecoder {

	private int maxLength;
	private int len; // length of the sentence currently being decoded
	private String sequence; // the sentence currently being decoded
	// allowedLabelLists[i]: -1-terminated list of labels permitted at position i,
	// or null when unrestricted
	private int[][] allowedLabelLists;
	// pocsToTags[poc]: -1-terminated list of labels compatible with a POC bit mask
	private int[][] pocsToTags;

	private CBNGramFeature nGramFeature;
	private Dat dat;

	private CBModel model;

	private Node[] nodes;
	private int[] values;
	private AlphaBeta[] alphas;
	private int[] result;

	// labelInfo[i]: raw label line; charAt(0) is the segmentation position
	// ('0' begin, '1' middle, '2' end, '3' single word), the rest is the POS tag
	private String[] labelInfo;

	private int[][] labelTransPre;
	private int[][] labelTransPost;

	public int threshold;

	/** Creates an empty decoder; {@link #loadFiles} must be called before use. */
	public CBTaggingDecoder() {
		this.maxLength = 20000;
		this.len = 0;
		this.sequence = "";
		this.allowedLabelLists = new int[this.maxLength][];

		this.pocsToTags = null;
		this.nGramFeature = null;
		this.dat = null;
		this.nodes = new Node[this.maxLength];
		this.labelTransPre = null;
		this.labelTransPost = null;
		this.threshold = 0;

		this.model = null;
		this.alphas = null;
	}

	/**
	 * Loads the model, the DAT dictionary and the label file, and initializes all
	 * decoding buffers.
	 *
	 * @param modelFile name of the {@link CBModel} file
	 * @param datFile   name of the {@link Dat} file
	 * @param labelFile name of the label file, one label description per line
	 *
	 * @throws IOException if any of the files cannot be read
	 */
	public void loadFiles(String modelFile, String datFile, String labelFile)
			throws IOException {
		this.model = new CBModel(modelFile);

		this.values = new int[this.maxLength * this.model.l_size];
		this.alphas = new AlphaBeta[this.maxLength * this.model.l_size];
		this.result = new int[this.maxLength * this.model.l_size];

		// build a linear chain: node i is preceded by i - 1 and followed by i + 1,
		// each neighbor list being -1-terminated
		for (int i = 0; i < this.maxLength; i++) {
			this.nodes[i] = new Node();
			this.nodes[i].predecessors = new int[]{i - 1, -1};
			this.nodes[i].successors = new int[]{i + 1, -1};
		}

		this.dat = new Dat(datFile);
		this.nGramFeature = new CBNGramFeature(this.dat, this.model, this.values);

		this.labelInfo = new String[10000]; // generous upper bound on label count
		Vector<Vector<Integer>> pocTags = new Vector<>();
		for (int i = 0; i < 16; i++) pocTags.add(new Vector<>());
		// NOTE(review): reads with the platform default charset, as the original
		// did — confirm label files are ASCII-only before changing this
		try (BufferedReader in = new BufferedReader(
				new InputStreamReader(new FileInputStream(labelFile)))) {
			String line;
			int ind = 0;
			while ((line = in.readLine()) != null) {
				this.labelInfo[ind] = line;
				int segInd = line.charAt(0) - '0';
				// register this label under every POC bit mask that includes it
				for (int j = 0; j < 16; j++)
					if (((1 << segInd) & j) != 0) pocTags.get(j).add(ind);
				ind++;
			}
		}

		this.pocsToTags = new int[16][];
		for (int j = 1; j < 16; j++) {
			this.pocsToTags[j] = new int[pocTags.get(j).size() + 1];
			for (int k = 0; k < pocTags.get(j).size(); k++)
				this.pocsToTags[j][k] = pocTags.get(j).get(k);
			this.pocsToTags[j][pocTags.get(j).size()] = -1; // -1-terminated
		}

		// NOTE(review): labelLookingFor is computed but never read afterwards;
		// kept for parity with the original implementation
		int[][] labelLookingFor = new int[this.model.l_size][];
		for (int i = 0; i < this.model.l_size; i++) labelLookingFor[i] = null;
		for (int i = 0; i < this.model.l_size; i++) {
			// only '1' (middle) and '2' (end) labels are matched to their
			// '0' (begin) counterpart with the same tag
			if ("30".indexOf(this.labelInfo[i].charAt(0)) != -1) continue;
			for (int j = 0; j <= i; j++) {
				if ((this.labelInfo[i].substring(1).equals(
						this.labelInfo[j].substring(1))) && (this.labelInfo[j].charAt(
						0) == '0')) {
					if (labelLookingFor[j] == null) {
						labelLookingFor[j] = new int[]{-1, -1};
					}
					labelLookingFor[j][this.labelInfo[i].charAt(0) - '1'] = i;
					break;
				}
			}
		}

		for (int i = 0; i < this.maxLength; i++) this.allowedLabelLists[i] = null;
	}

	/**
	 * Runs the dynamic-programming decode over the current sentence, storing the
	 * best label sequence in {@code this.result}. The first and last positions are
	 * temporarily constrained to word-begin / word-end labels when unconstrained.
	 */
	public void dp() {
		if (this.allowedLabelLists[0] == null)
			this.allowedLabelLists[0] = this.pocsToTags[9]; // POC_B | POC_S
		if (this.allowedLabelLists[this.len - 1] == null)
			this.allowedLabelLists[this.len - 1] = this.pocsToTags[12]; // POC_E | POC_S
		AlphaBeta.dbDecode(this.model.l_size, this.model.ll_weights,
				this.len, this.nodes, this.values, this.alphas, this.result,
				this.labelTransPre, this.allowedLabelLists);
		// reset the temporary boundary constraints
		this.allowedLabelLists[0] = null;
		this.allowedLabelLists[this.len - 1] = null;
	}

	/**
	 * Precomputes, for every label, which labels may legally precede and follow it,
	 * storing -1-terminated lists in {@code labelTransPre} / {@code labelTransPost}.
	 */
	public void setLabelTrans() {
		int lSize = this.model.l_size;
		Vector<Vector<Integer>> preLabels = new Vector<>();
		Vector<Vector<Integer>> postLabels = new Vector<>();
		for (int i = 0; i < lSize; i++) {
			preLabels.add(new Vector<>());
			postLabels.add(new Vector<>());
		}
		for (int i = 0; i < lSize; i++) {
			for (int j = 0; j < lSize; j++) {
				int ni = this.labelInfo[i].charAt(0) - '0';
				int nj = this.labelInfo[j].charAt(0) - '0';
				boolean iIsEnd = ((ni == 2) || (ni == 3));
				boolean jIsBegin = ((nj == 0) || (nj == 3));
				boolean sameTag = this.labelInfo[i].substring(1)
						.equals(this.labelInfo[j].substring(1));
				if (sameTag) {
					// within one word the position must advance:
					// begin->middle/end, middle->middle/end, and a word end may
					// be followed by a new begin/single of the same tag
					if ((ni == 0 && nj == 1) ||
							(ni == 0 && nj == 2) ||
							(ni == 1 && nj == 2) ||
							(ni == 1 && nj == 1) ||
							(ni == 2 && nj == 0) ||
							(ni == 2 && nj == 3) ||
							(ni == 3 && nj == 3) ||
							(ni == 3 && nj == 0)) {
						preLabels.get(j).add(i);
						postLabels.get(i).add(j);
					}
				} else if (iIsEnd && jIsBegin) {
					// across words any tag change is allowed
					preLabels.get(j).add(i);
					postLabels.get(i).add(j);
				}
			}
		}
		this.labelTransPre = new int[lSize][];
		for (int i = 0; i < lSize; i++) {
			this.labelTransPre[i] = new int[preLabels.get(i).size() + 1];
			for (int j = 0; j < preLabels.get(i).size(); j++) {
				this.labelTransPre[i][j] = preLabels.get(i).get(j);
			}
			this.labelTransPre[i][preLabels.get(i).size()] = -1;
		}

		this.labelTransPost = new int[lSize][];
		for (int i = 0; i < lSize; i++) {
			this.labelTransPost[i] = new int[postLabels.get(i).size() + 1];
			for (int j = 0; j < postLabels.get(i).size(); j++)
				this.labelTransPost[i][j] = postLabels.get(i).get(j);
			this.labelTransPost[i][postLabels.get(i).size()] = -1;
		}
	}

	/** Computes the feature values of the current sentence into {@code values}. */
	public void putValues() {
		if (this.len == 0) return;
		for (int i = 0; i < this.len; i++) this.nodes[i].type = 0;
		this.nodes[0].type += 1; // starting node
		this.nodes[this.len - 1].type += 2; // ending node

		int size = this.len * this.model.l_size;
		for (int i = 0; i < size; i++) this.values[i] = 0;
		this.nGramFeature.putValues(this.sequence, this.len);
	}

	/**
	 * Segments (and tags) one line of input.
	 *
	 * @param raw   the raw input line
	 * @param graph the POC restrictions for each character of {@code raw}; a value
	 *              of 0 is treated as "all positions allowed" (mask 15)
	 * @param ts    output list, cleared and then filled with the tagged words
	 *
	 * @return {@code false} if {@code raw} is empty, {@code true} otherwise
	 */
	public boolean segment(String raw, POCGraph graph, List<TaggedWord> ts) {
		if (raw.length() == 0) return false;

		for (int i = 0; i < raw.length(); i++)
			this.allowedLabelLists[i] = this.pocsToTags[
					graph.get(i) == 0 ? 15 : graph.get(i)];
		// strings are immutable: assigning is equivalent to the original
		// character-by-character copy, without the O(n^2) concatenation
		this.sequence = raw;
		this.len = raw.length();
		this.putValues(); // calculate feature values and store them in values
		this.dp(); // DP search for the best answer and store it in result

		for (int i = 0; i < raw.length(); i++) this.allowedLabelLists[i] = null;
		int offset = 0;
		ts.clear();
		for (int i = 0; i < this.len; i++) {
			// a word ends at the last character, or on a label whose position
			// digit is '2' (word end) or '3' (single-character word)
			if ((i == this.len - 1) || (this.labelInfo[this.result[i]].charAt(
					0) == '2') || (this.labelInfo[this.result[i]].charAt(0) == '3')) {
				TaggedWord tagged = new TaggedWord();
				tagged.word = this.sequence.substring(offset, i + 1);
				offset = i + 1; // output tags
				tagged.endOffset = offset;
				tagged.startOffset = offset - tagged.word.length();
				tagged.tag = this.labelInfo[this.result[i]].substring(1);
				ts.add(tagged);
			}
		}
		return true;
	}
}
--------------------------------------------------------------------------------
/src/main/java/org/thunlp/thulac/cb/Node.java:
--------------------------------------------------------------------------------
package org.thunlp.thulac.cb;

/**
 * A class which contains topological information of a node.
 */
public class Node {
	/**
	 * Value:
	 * <ul>
	 * <li>1: If this {@link Node} is a starting node.</li>
	 * <li>2: If this {@link Node} is an ending node.</li>
	 * <li>0: otherwise.</li>
	 * </ul>
	 */
	public int type;

	public int[] predecessors; // last element should be -1
	public int[] successors; // last element should be -1
}
--------------------------------------------------------------------------------
/src/main/java/org/thunlp/thulac/data/Dat.java:
--------------------------------------------------------------------------------
package org.thunlp.thulac.data;

import org.thunlp.thulac.util.BufferUtils;
import org.thunlp.thulac.util.StringUtils;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.SeekableByteChannel;
import java.nio.file.Files;
import java.nio.file.Paths;

/**
 * A class which loads data files from disk and provides necessary operations.
 * Instances are created with the {@link #Dat(String)} constructor which reads from a
 * file, or with {@link DatMaker#readFromTxtFile(String)} which constructs a {@code Dat}
 * structure from a user-specified dictionary.<br>
 * Internally, {@code Dat} uses the two-array Trie Tree to store information that can
 * be searched through at high speed, (sometimes) even faster than using
 * {@link java.util.HashMap}.
 */
public class Dat {
	/**
	 * The two-array Trie Tree, use {@code dat[i << 1]} to access {@code base[i]} and
	 * {@code dat[(i << 1) + 1]} to access {@code check[i]}.
	 */
	public int[] dat;
	/**
	 * The size of the Trie Tree, should be {@code this.dat.length / 2}.
	 */
	public int datSize;

	protected Dat(int size) {
		this.dat = new int[size << 1];
		this.datSize = size;
	}

	/**
	 * Read a {@link Dat} from a given file.
	 *
	 * @param filename
	 *         The name of the {@link Dat} file.
	 *
	 * @throws IOException
	 *         If an I/O error occurred while reading the file.
	 */
	public Dat(String filename) throws IOException {
		// try-with-resources: the channel is closed even if reading throws
		try (SeekableByteChannel channel = Files.newByteChannel(Paths.get(filename))) {
			// DWORD base + DWORD check -> 8 bytes per record
			this.datSize = (int) (channel.size() >> 3);
			this.dat = new int[this.datSize << 1];
			// strange though, dat files are stored little endian
			ByteBuffer bb = ByteBuffer.allocateDirect(64 * 1024)
					.order(ByteOrder.LITTLE_ENDIAN);
			bb.clear();
			if (!BufferUtils.readInts(channel, bb, this.dat))
				throw new IOException("File does not contain enough data!");
		}
	}

	// if word in dat, return leaf element, otherwise return -1
	private int match(String word) {
		int ind = 0;
		int base = 0;
		int[] codePoints = StringUtils.toCodePoints(word);
		for (int c : codePoints) {
			ind = this.dat[ind << 1] + c;
			if (ind >= this.datSize || this.dat[(ind << 1) + 1] != base) return -1;
			base = ind;
		}
		ind = this.dat[base << 1];
		return ind < this.datSize && this.dat[(ind << 1) + 1] == base ? ind : -1;
	}

	// if prefix in dat, return -base; otherwise return the length of the longest
	// prefix of the query that is found in dat
	public int getInfo(String prefix) {
		int ind = 0;
		int base = 0;
		for (int i = 0; i < prefix.length(); i++) {
			ind = this.dat[ind << 1] + prefix.charAt(i);
			if (ind >= this.datSize || this.dat[(ind << 1) + 1] != base) return i;
			base = ind;
		}
		return -base;
	}

	/**
	 * Returns whether this {@link Dat} contains one or more words that begin with
	 * {@code prefix}.
	 *
	 * @param prefix
	 *         The query prefix.
	 *
	 * @return Whether this {@link Dat} contains one or more words that begin with
	 * {@code prefix}.
	 */
	public boolean containsPrefix(String prefix) {
		return this.getInfo(prefix) < 0;
	}

	/**
	 * Returns whether this {@link Dat} contains the given word.
	 *
	 * @param word
	 *         The query word.
	 *
	 * @return Whether this {@link Dat} contains {@code word}.
	 */
	public boolean contains(String word) {
		return this.match(word) != -1;
	}
}
--------------------------------------------------------------------------------
/src/main/java/org/thunlp/thulac/data/DatMaker.java:
--------------------------------------------------------------------------------
package org.thunlp.thulac.data;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Vector;

/**
 * A class used to construct instances of {@link Dat} from user-specified dictionary
 * files. It extends {@link Dat} to avoid unnecessary array copies and to increase
 * performance.<br>
 * A confusing algorithm is used to construct the two-array Trie Tree used by
 * {@link Dat}, see in-line comments for more information.
 */
public class DatMaker extends Dat {
	// a record of a word with a related integer (the word's input line number)
	private static class Record {
		public String word;
		public int num;

		public Record() {
			this("", 0);
		}

		public Record(String key, int value) {
			this.word = key;
			this.num = value;
		}
	}

	// Records are compared by comparing their words
	private static final Comparator<Record> RECORDS_COMPARATOR =
			Comparator.comparing(r -> r.word);

	/**
	 * Reads (or more precisely, constructs) an instance of {@link Dat} from the given
	 * {@link InputStream}. This is used to generate {@link Dat} from a user-specified
	 * dictionary, which consists of multiple lines, each one representing a word in
	 * the dictionary.
	 *
	 * @param in
	 *         The {@link InputStream} to read.
	 *
	 * @return The generated {@link Dat}.
	 *
	 * @throws IOException
	 *         If an I/O error happens.
	 */
	public static Dat readFromInputStream(InputStream in) throws IOException {
		List<String> words = new ArrayList<>();
		// try-with-resources: the reader (and thus in) is closed on all paths
		try (BufferedReader reader = new BufferedReader(new InputStreamReader(in))) {
			String str;
			while ((str = reader.readLine()) != null) words.add(str);
		}

		DatMaker dat = new DatMaker();
		dat.buildDat(words);
		return dat;
	}

	/**
	 * Reads (or more precisely, constructs) an instance of {@link Dat} from the given
	 * file. This is used to generate {@link Dat} from a user-specified dictionary,
	 * which consists of multiple lines, each one representing a word in the dictionary.
	 *
	 * @param filename
	 *         The name of the file.
	 *
	 * @return The generated {@link Dat}.
	 *
	 * @throws IOException
	 *         If the given file does not exist or is not readable.
	 */
	public static Dat readFromTxtFile(String filename) throws IOException {
		return readFromInputStream(new FileInputStream(filename));
	}

	// The main idea of this ingenious algorithm that generates a Dat instance from the
	// input string is that it makes use of the unused space of the original
	// double-array Trie Tree to store a double-linked list. This means that it is
	// fully compatible with the standard double-array Trie Tree data structure.
	// What's more, this algorithm achieves its goal without extra storage space,
	// except for the head and tail fields. But these only require O(1) space, so they
	// can be safely ignored.

	// this.dat, the only storage block used by this algorithm, is an array of
	// ELEMENTS. An ELEMENT contains two values, called BASE and CHECK, both integers.
	// this.dat is structured in this way:
	// ELEMENTS[0].BASE, ELEMENTS[0].CHECK, ELEMENTS[1].BASE, ELEMENTS[1].CHECK, ...
	// this.datSize is the total number of ELEMENTS, so
	// this.dat.length = 2 * this.datSize.
	// In the following parts, BASE and CHECK will be referred to as the FIELDS of an
	// ELEMENT, for example, "the BASE FIELD of ELEMENT[4]".

	// The program distinguishes the two different data structures stored in this.dat
	// by the sign of the ELEMENTS' FIELDS.
	// ELEMENTS whose CHECK and BASE FIELDS are positive belong to the double-array
	// Trie Tree, while those whose CHECK and BASE FIELDS are negative belong to the
	// double-linked list. When an ELEMENT belongs to the Trie Tree, we call it USED.
	// Otherwise, we call it UNUSED.

	// The data structure of the Trie Tree:
	// FIELDS of USED ELEMENTS strictly follow the definitions of the double-array
	// Trie Tree. For the current stage S and input character C, we have:
	// ELEMENTS[ELEMENTS[S].BASE + C].CHECK = S
	// ELEMENTS[S].BASE + C = T
	// where T is the next stage the DFA (Deterministic Finite Automaton) described by
	// the Trie Tree should jump to.

	// The data structure of the double-linked list:
	// In a double-linked list there are multiple NODES, each containing two pointers
	// PREV and NEXT. In accordance with the c-style arrow (->) operator, this list
	// conforms to the following equations:
	// NODE->NEXT->PREV = NODE
	// NODE->PREV->NEXT = NODE
	// In this implementation, pointers take the negative of the values of the indices
	// of the NODES they point to. The PREV pointer is stored in the BASE field, and
	// the NEXT pointer in the CHECK field. We have,
	// -ELEMENTS[ -ELEMENTS[i].CHECK ].BASE = i
	// -ELEMENTS[ -ELEMENTS[i].BASE ].CHECK = i
	// The negative signs appear because fields of ELEMENTs in the double-linked list
	// are negative.

	// The pointers to the HEAD NODE and the TAIL NODE are stored in this.head and
	// this.tail, respectively. -this.head is the index of the first NODE in the
	// double-linked list, and -this.tail is the index of the last NODE.

	// The buildDat() method takes a list of strings as input and sorts them in
	// alphabetical order. Afterwards, findChildren() breaks strings - char
	// sequences - into a tree of characters, as described in the Trie Tree.
	// Since the Trie Tree is a representation of a DFA, a stage has to be generated
	// for each node in the tree. Such a stage, stored as ELEMENTS, has the BASE and
	// CHECK FIELDS. The CHECK field of an ELEMENT is assigned when its parent stage
	// is generated. The assignment of the value in BASE FIELD is implemented in
	// allocate() and described below:

	// 1. Set variable BASE to this.head.
	// 2. Determine whether BASE is available. (If all ELEMENTS[BASE + C] are UNUSED
	//    for every C of the child nodes of the current one)
	// 3. If BASE is available, return BASE; otherwise, set BASE to the next UNUSED
	//    ELEMENT, using the double-linked list.
	// In this process, if no available BASE is found, the size of this.dat is doubled
	// through the expandDat() method, which also maintains the double-linked list in
	// the newly allocated ELEMENTS.

	// After an available BASE has been found for the current stage, markAsUsed() is
	// called with BASE and all BASE + C, updating the double-linked list.

	// Afterwards, populate() is called. It sets ELEMENTS[BASE + C].CHECK to S for
	// all C in the child nodes and sets ELEMENTS[S].BASE to BASE. ELEMENTS[S].CHECK
	// is set to S if stage BASE can be the end of a word; otherwise, it is set to
	// BASE. For each word in the lexicon, its corresponding leaf node in the Trie
	// Tree will have its BASE field set to the line number of the word. (Remember
	// that the user-specified dictionary consists of multiple lines, each one
	// representing a word in the dictionary.)

	// Finally, method packDat() is invoked to minimize the size of this.dat and
	// reduce memory usage.

	private int head, tail;

	private DatMaker() {
		super(1);

		// initialize the double-linked list: head = 0, next = 1
		this.dat[0] = this.dat[1] = -1;
		this.head = this.tail = 0;
	}

	// mark element as used by modifying the double-linked list
	private void markAsUsed(int index) {
		// -base -> the previous element, -check -> the next element
		int base = this.dat[index << 1], check = this.dat[(index << 1) + 1];

		// if the element is already USED, fail loudly
		if (check >= 0) throw new RuntimeException("Cell reused! Index: " + index);

		// maintain the double-linked list
		if (base == -1) this.head = check;
		else this.dat[((-base) << 1) + 1] = check;
		if (check == -this.datSize) this.tail = base;
		else this.dat[(-check) << 1] = base;

		this.dat[(index << 1) + 1] = index; // positive check: element used
	}

	// expand size of this.dat
	private void expandDat() {
		int oldSize = this.datSize;

		// alloc & copy
		this.datSize *= 2;
		int[] newDat = new int[this.dat.length << 1];
		System.arraycopy(this.dat, 0, newDat, 0, this.dat.length);
		this.dat = newDat;

		// expand the double-linked list into the newly allocated elements
		for (int i = 0; i < oldSize; i++) {
			int pos = (oldSize + i) << 1;
			newDat[pos] = -(oldSize + i - 1);
			newDat[pos + 1] = -(oldSize + i + 1);
		}
		this.dat[oldSize << 1] = this.tail;
		this.dat[((-this.tail) << 1) + 1] = -oldSize;
		this.tail = -(oldSize * 2 - 1); // set tail to the last element
	}

	// remove unused elements to save memory
	private void packDat() {
		// calculate minimum size: index of the last USED element, plus one
		int last = this.datSize - 1;
		for (; this.dat[(last << 1) + 1] < 0; --last) ;
		this.datSize = last + 1;

		// truncate this.dat
		int[] newDat = new int[this.datSize << 1];
		System.arraycopy(this.dat, 0, newDat, 0, this.datSize << 1);
		this.dat = newDat;
	}

	// allocate elements according to offsets and return BASE
	private int allocate(List<Integer> offsets) {
		int size = offsets.size();
		int base = -this.head; // initialized to the head of the double-linked list
		while (true) {
			// expand this.dat as needed
			if (base == this.datSize) this.expandDat();
			if (size != 0) {
				// sorted, offsets.get(size - 1) is the greatest
				int requiredSize = base + offsets.get(size - 1);
				while (requiredSize >= this.datSize) this.expandDat();
			}

			boolean available = true; // check availability
			if (this.dat[(base << 1) + 1] >= 0) available = false; // ELEMENTS[BASE] USED
			else {
				// if any ELEMENTS[BASE + C] is USED, available = false
				int i = 0;
				for (; i < size && this.dat[((base + offsets.get(i)) << 1) + 1] < 0; i++) ;
				if (i < size) available = false;
			}

			if (available) { // if BASE is available, update double-linked list
				this.markAsUsed(base);
				for (int offset : offsets) this.markAsUsed(base + offset);

				return base;
			}

			// find next BASE to check availability
			int newBase = -this.dat[(base << 1) + 1];
			if (newBase == this.datSize) this.expandDat(); // ensure capacity
			base = newBase;
		}
	}

	// find characters in lexicon which might follow the prefix; relies on lexicon
	// being sorted, so all words sharing the prefix are contiguous from start
	private List<Integer> findChildren(List<Record> lexicon, int start, String prefix) {
		List<Integer> children = new ArrayList<>();
		int length = prefix.length(), currentChild = -1;
		for (int i = start, size = lexicon.size(); i < size; ++i) {
			String word = lexicon.get(i).word;
			if (!word.startsWith(prefix)) return children;
			if (word.length() == length) continue;
			int nextCh = word.charAt(length);
			if (nextCh != currentChild) children.add(currentChild = nextCh);
		}
		return children;
	}

	// populate BASE and CHECK FIELDS of allocated BASE and BASE + C
	// @param isWord Whether the end of a word has been reached.
	private int populate(int check, List<Integer> offsets, boolean isWord) {
		int base = this.allocate(offsets);

		this.dat[base << 1] = 0;
		this.dat[(base << 1) + 1] = isWord ? check : base;

		for (int offset : offsets) { // update Trie Tree
			int pos = (base + offset) << 1;
			this.dat[pos] = 0;
			this.dat[pos + 1] = check; // ELEMENTS[ELEMENTS[S].BASE + C].CHECK = S
		}
		this.dat[check << 1] = base; // ELEMENTS[CHECK].BASE = BASE

		return base;
	}

	// build the Dat structure with a word list as input
	private void buildDat(List<String> words) {
		// construct lexicon: a dummy record plus one record per input word
		Vector<Record> lexicon = new Vector<>();
		lexicon.add(new Record());
		for (int i = 0, size = words.size(); i < size; ++i)
			lexicon.add(new Record(words.get(i), i));
		lexicon.sort(RECORDS_COMPARATOR); // sort input

		// root elements
		this.dat[0] = this.populate(0, this.findChildren(lexicon, 0, ""), true);

		for (int i = 0, size = lexicon.size(); i < size; i++) {
			String word = lexicon.get(i).word;

			int off = this.getInfo(word);
			if (off <= 0) off = word.length(); // if dat already contains word

			// iterate through characters after offset and add new entries
			for (int offset = off; offset <= word.length(); offset++) {
				String prefix = word.substring(0, offset);
				int pBase = -this.getInfo(prefix); // should always be positive
				this.populate(pBase, this.findChildren(lexicon, i, prefix),
						offset == word.length()); // on word end
			}

			off = -this.getInfo(word); // should always be positive
			this.dat[this.dat[off << 1] << 1] = lexicon.get(i).num; // leaf node value
		}

		this.packDat();
	}
}
-------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/data/POCGraph.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.data; 2 | 3 | import java.util.Vector; 4 | 5 | /** 6 | * POC means Position Of Character, representing the possible positions 7 | * of a character in the segmented words.
8 | * {@code POCGraph} is a list of integers, possesses a length of {@code l} when generated 9 | * by processing a string of length {@code l}, therefore we get:
10 | * Let {@code graph} be an instance of {@code POCGraph}, and {@code l} be the length of 11 | * the graph. (retrieved by calling {@code graph.size()})
12 | * {@code graph.get(i)} ({@code 0 <= i < length}) is an integer calculated by bitwise 13 | * or-ing zero or more of the following constants:
14 | *
    15 | *
  • POC_B = 0x01: included if the character can be the beginning of a word.
  • 16 | *
  • POC_M = 0x02: included if the character can be the middle of a word.
  • 17 | *
  • POC_E = 0x04: included if the character can be the end of a word.
  • 18 | *
  • POC_S = 0x08: included if the character can be exactly one single world.
  • 19 | *
20 | * As pseudo-code:
21 | *
22 |  * int i = <index>;
23 |  * boolean canBeBeginning = input.canBeBeginning(i);
24 |  * boolean canBeMiddle    = input.canBeMiddle(i);
25 |  * boolean canBeEnd       = input.canBeEnd(i);
26 |  * boolean canBeSingle    = input.canBeSingle(i);
27 |  * int positions = (canBeBeginning ? POC_B : 0) |
28 |  *                 (canBeMiddle    ? POC_M : 0) |
29 |  *                 (canBeEnd       ? POC_E : 0) |
30 |  *                 (canBeSingle    ? POC_S : 0);
31 |  * graph[i] = positions;
32 |  * 
33 | * Note that the {@code POC_M} flag does not conflict with the other flags, e.g., a 34 | * {@code position} of {@code POC_M | POC_B} means that the character can either be the 35 | * middle or the beginning of a word. This applies also for {@code POC_S}, which 36 | * indicates that the character can form a single-character word.
37 | * The generation of {@code POCGraph} is mainly based on punctuations and line breaks, 38 | * but in various implementations also on characters that would certainly not be a part 39 | * of a word, such as whitespaces or numbers.
40 | * This class is merely a alias for {@linkplain java.util.Vector Vector<Integer>}, 41 | * indicating that instances of this class are used as only as the list of {@code POCs}, 42 | * no more behaviour is added. 43 | */ 44 | public class POCGraph extends Vector { 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/data/TaggedWord.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.data; 2 | 3 | /** 4 | * A class which represent a tagged word, that is, a word with a tag. 5 | */ 6 | public class TaggedWord { 7 | public String word; 8 | public String tag; 9 | public int startOffset; 10 | public int endOffset; 11 | 12 | public TaggedWord() { 13 | this.word = ""; 14 | } 15 | 16 | public TaggedWord(String word, String tag, int startOffset, int endOffset) { 17 | this.word = word; 18 | this.tag = tag; 19 | this.startOffset = startOffset; 20 | this.endOffset = endOffset; 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/io/IInputProvider.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.io; 2 | 3 | import java.io.IOException; 4 | import java.util.List; 5 | 6 | /** 7 | * An interface used to provide input for {@link org.thunlp.thulac.Thulac}. Implementations of this 8 | * interface should contain its own context, since {@link #provideInput()} does not 9 | * pass any kind of parameter. It is recommended that implementations read input from a 10 | * stream, e.g., from a file of from the console ({@code System.in}). 11 | */ 12 | public interface IInputProvider extends IProgramStateListener { 13 | /** 14 | * Provide a {@link java.util.List} of {@link String} which contains the input for the 15 | * segmentation program to process. 
By contract, the return value of this method, 16 | * joined with whitespaces (U+0020) should logically represent a line from the input, 17 | * though this is not compulsory. A {@code null} return value will be regarded as 18 | * an EOF and the program will terminate. A {@link java.util.List} is used because it is 19 | * recommended to split an enormous line into separate line segments based on the 20 | * punctuations. 21 | * 22 | * @return The input to the segmentation program. 23 | */ 24 | List provideInput() throws IOException; 25 | } 26 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/io/IOutputHandler.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.io; 2 | 3 | import org.thunlp.thulac.data.TaggedWord; 4 | 5 | import java.io.IOException; 6 | import java.util.List; 7 | 8 | /** 9 | * An interface used to handle the output from the segmentation program. The whole 10 | * handling process is based on lines, though its extending the 11 | * {@link IProgramStateListener} allows it to listen the starting and termination 12 | * events of the program, therefore implementations should also concentrate on lines. 13 | */ 14 | public interface IOutputHandler extends IProgramStateListener { 15 | /** 16 | * Handles the {@link java.util.List} of {@link org.thunlp.thulac.data.TaggedWord} generated by the segmentation 17 | * program. Since one input line might be split into multiple line segments, 18 | * this method might be invoked several times between a pair of 19 | * {@link #handleLineStart()} and {@link #handleLineEnd()}. Traditionally, the 20 | * param {@code word} of all the invocations of this methods between a pair of 21 | * {@link #handleLineEnd()} and {@link #handleLineEnd()} come from the same line of 22 | * input, and the output handler should output to the same line as well, however 23 | * this is not compulsory. 
24 | * 25 | * @param words 26 | * The {@link java.util.List} of {@link org.thunlp.thulac.data.TaggedWord} generated processing one line segment. 27 | * @param segOnly 28 | * Whether to output without tags. 29 | * @param separator 30 | * The separator between output words and tags. 31 | */ 32 | void handleLineSegment(List words, boolean segOnly, char separator) 33 | throws IOException; 34 | 35 | /** 36 | * Called when an input line is obtained from {@link IInputProvider} and the 37 | * segmentation program is about to begin breaking the line into segments. This 38 | * method is basically for initializations, e.g., creating new line, etc.
39 | * This method is invoked before {@link #handleLineSegment(java.util.List, boolean, char)}. 40 | */ 41 | void handleLineStart() throws IOException; 42 | 43 | /** 44 | * Called when segmentation of an input line is finished and the segmentation 45 | * program is about to begin processing the next line. This method is basically for 46 | * finalisation, e.g., flushing input of this line, etc.
47 | * This method is invoked after {@link #handleLineSegment(java.util.List, boolean, char)}. 48 | */ 49 | void handleLineEnd() throws IOException; 50 | } 51 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/io/IProgramStateListener.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.io; 2 | 3 | /** 4 | * An interface used to listen to the starting and termination events of the 5 | * segmentation program. 6 | */ 7 | public interface IProgramStateListener { 8 | /** 9 | * Called when the segmentation program starts. 10 | */ 11 | void onProgramStart(); 12 | 13 | /** 14 | * Called when the segmentation program terminates. (in finally block) 15 | */ 16 | void onProgramEnd(); 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/io/ReaderInputProvider.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.io; 2 | 3 | import org.thunlp.thulac.util.IOUtils; 4 | 5 | import java.io.BufferedReader; 6 | import java.io.IOException; 7 | import java.util.List; 8 | 9 | /** 10 | * An implementation of {@link IInputProvider} which retrieves input from a 11 | * {@link java.io.BufferedReader}. 
12 | */ 13 | public class ReaderInputProvider implements IInputProvider { 14 | private BufferedReader reader; 15 | 16 | public ReaderInputProvider(BufferedReader reader) { 17 | // reader must be non-null 18 | if (reader == null) throw new IllegalArgumentException("reader == null!"); 19 | this.reader = reader; 20 | } 21 | 22 | @Override 23 | public List provideInput() throws IOException { 24 | String line = this.reader.readLine(); 25 | if (line == null) return null; 26 | return IOUtils.getLineSegments(line); 27 | } 28 | 29 | @Override 30 | public void onProgramStart() { 31 | } 32 | 33 | @Override 34 | public void onProgramEnd() { 35 | try { 36 | this.reader.close(); // release system resources 37 | } catch (IOException e) { 38 | e.printStackTrace(); 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/io/StringInputProvider.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.io; 2 | 3 | import org.thunlp.thulac.util.IOUtils; 4 | 5 | import java.io.IOException; 6 | import java.util.List; 7 | 8 | /** 9 | * An implementation of {@link IInputProvider} which retrieves input from a {@link 10 | * String}. 
11 | */ 12 | public class StringInputProvider implements IInputProvider { 13 | private String[] lines; 14 | private int pointer; 15 | 16 | public StringInputProvider(String input) { 17 | // input must be non-null 18 | if (input == null) throw new IllegalArgumentException("input == null!"); 19 | this.lines = input.split("\n"); // empty lines are discarded 20 | this.pointer = 0; 21 | } 22 | 23 | @Override 24 | public void onProgramStart() { 25 | } 26 | 27 | @Override 28 | public void onProgramEnd() { 29 | } 30 | 31 | @Override 32 | public List provideInput() throws IOException { 33 | if (this.pointer == this.lines.length) return null; 34 | return IOUtils.getLineSegments(this.lines[pointer++]); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/io/StringOutputHandler.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.io; 2 | 3 | import org.thunlp.thulac.data.TaggedWord; 4 | 5 | import java.io.IOException; 6 | import java.util.List; 7 | 8 | /** 9 | * An implementation of {@link IOutputHandler} to allow access to the output in form of 10 | * {@link String}. 
11 | */ 12 | public class StringOutputHandler implements IOutputHandler { 13 | private StringBuilder str; 14 | 15 | public StringOutputHandler() { 16 | this.str = new StringBuilder(); 17 | } 18 | 19 | @Override 20 | public void onProgramStart() { 21 | } 22 | 23 | @Override 24 | public void onProgramEnd() { 25 | } 26 | 27 | @Override 28 | public void handleLineSegment(List words, 29 | boolean segOnly, char separator) { 30 | if (segOnly) { 31 | for (TaggedWord word : words) { 32 | this.str.append(word.word); 33 | this.str.append(' '); 34 | } 35 | } else { 36 | for (TaggedWord word : words) { 37 | this.str.append(word.word); 38 | this.str.append(separator); 39 | this.str.append(word.tag); 40 | this.str.append(' '); 41 | } 42 | } 43 | } 44 | 45 | @Override 46 | public void handleLineStart() throws IOException { 47 | } 48 | 49 | @Override 50 | public void handleLineEnd() throws IOException { 51 | this.str.append("\n"); 52 | } 53 | 54 | public String getString() { 55 | return this.str.toString(); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/io/WriterOutputHandler.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.io; 2 | 3 | import org.thunlp.thulac.data.TaggedWord; 4 | 5 | import java.io.BufferedWriter; 6 | import java.io.IOException; 7 | import java.util.List; 8 | 9 | /** 10 | * An implementation of {@link IOutputHandler} which writes output to a {@link 11 | * java.io.BufferedWriter}. 
12 | */ 13 | public class WriterOutputHandler implements IOutputHandler { 14 | private BufferedWriter writer; 15 | private StringBuilder sb; 16 | 17 | public WriterOutputHandler(BufferedWriter writer) { 18 | // writer must be non-null 19 | if (writer == null) throw new IllegalArgumentException("writer == null!"); 20 | this.writer = writer; 21 | this.sb = new StringBuilder(); 22 | } 23 | 24 | @Override 25 | public void handleLineSegment(List words, boolean segOnly, char separator) 26 | throws IOException { 27 | if (segOnly) { 28 | for (TaggedWord word : words) { 29 | this.sb.append(word.word); 30 | this.sb.append(' '); 31 | } 32 | } else { 33 | for (TaggedWord word : words) { 34 | this.sb.append(word.word); 35 | this.sb.append(separator); 36 | this.sb.append(word.tag); 37 | this.sb.append(' '); 38 | } 39 | } 40 | } 41 | 42 | @Override 43 | public void handleLineStart() throws IOException { 44 | this.sb.setLength(0); 45 | } 46 | 47 | @Override 48 | public void handleLineEnd() throws IOException { 49 | this.sb.append("\n"); 50 | this.writer.write(this.sb.toString()); 51 | } 52 | 53 | @Override 54 | public void onProgramStart() { 55 | } 56 | 57 | @Override 58 | public void onProgramEnd() { 59 | try { 60 | this.writer.close(); // release system resources 61 | } catch (IOException e) { 62 | e.printStackTrace(); 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/main/Main.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.main; 2 | 3 | import org.thunlp.thulac.Thulac; 4 | import org.thunlp.thulac.io.IInputProvider; 5 | import org.thunlp.thulac.io.IOutputHandler; 6 | import org.thunlp.thulac.util.IOUtils; 7 | 8 | import java.io.IOException; 9 | 10 | /** 11 | * The program entrance which deals with command line arguments. 
12 | */ 13 | public class Main { 14 | public static void main(String[] args) throws IOException { 15 | String modelDir = "models/"; 16 | char separator = '_'; 17 | String userDict = null; 18 | boolean useT2S = false; 19 | boolean segOnly = false; 20 | boolean useFilter = false; 21 | IInputProvider input = null; 22 | IOutputHandler output = null; 23 | 24 | for (int c = 0; c < args.length; ++c) 25 | switch (args[c]) { 26 | case "-t2s": 27 | useT2S = true; 28 | break; 29 | case "-user": 30 | userDict = args[++c]; 31 | break; 32 | case "-deli": 33 | separator = args[++c].charAt(0); 34 | break; 35 | case "-seg_only": 36 | segOnly = true; 37 | break; 38 | case "-filter": 39 | useFilter = true; 40 | break; 41 | case "-model_dir": 42 | modelDir = args[++c]; 43 | if (modelDir.charAt(modelDir.length() - 1) != '/') 44 | modelDir += '/'; 45 | break; 46 | case "-input": 47 | input = IOUtils.inputFromFile(args[++c]); // use UTf-8 48 | break; 49 | case "-output": 50 | output = IOUtils.outputToFile(args[++c]); // use UTF-8 51 | break; 52 | } 53 | if (input == null) input = IOUtils.inputFromConsole(); 54 | if (output == null) output = IOUtils.outputToConsole(); 55 | 56 | Thulac.split(modelDir, separator, userDict, useT2S, segOnly, useFilter, 57 | input, output); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/postprocess/DictionaryPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.postprocess; 2 | 3 | import org.thunlp.thulac.data.Dat; 4 | import org.thunlp.thulac.data.DatMaker; 5 | import org.thunlp.thulac.data.TaggedWord; 6 | 7 | import java.io.IOException; 8 | import java.util.List; 9 | 10 | /** 11 | * A postprocess pass which scans the word list, extract words that are found in the 12 | * dictionary and tag them.
13 | * To show its behavior more clearly, we raise the following example:
14 | * Assume that the input {@code sentence} is {@code "A", "B", "C", "DE"}, and the word 15 | * list specified by {@link #dictionary} is {@code "AB", "ABC", "ABCD"}.
16 | * The {@link #process(java.util.List)} method tends to find the longest concatenation of words 17 | * in the word list which exists in the dictionary and combine these words into one 18 | * single {@link TaggedWord}.
19 | * So, as for this example, all concatenations of words in the list beginning from 20 | * index 0 would be: {@code "A", "AB", "ABC", "ABCDE"}, in which only {@code "AB"} and 21 | * {@code "ABC"} is present in {@link #dictionary}.
22 | * In this case, the longest concatenation would be {@code "ABC"} and therefore the 23 | * words {@code "A", "B", "C"} are removed and one single word {@code "ABC"} is added 24 | * to the word list, which makes the final output from {@link #process(java.util.List)} {@code 25 | * "ABC", "DE"}.
26 | * Please notice that although {@code "ABCD"} exists in {@link #dictionary}, the 27 | * {@link #process(java.util.List)} method will not attempt to split whole words apart. 28 | */ 29 | public class DictionaryPass implements IPostprocessPass { 30 | 31 | private Dat dictionary; 32 | private String tag; 33 | 34 | public DictionaryPass(String dictFile, String tag, boolean isTxt) 35 | throws IOException { 36 | this.tag = tag; 37 | if (isTxt) this.dictionary = DatMaker.readFromTxtFile(dictFile); 38 | else this.dictionary = new Dat(dictFile); 39 | } 40 | 41 | @Override 42 | public void process(List sentence) { 43 | if (this.dictionary == null || sentence.isEmpty()) return; 44 | 45 | for (int i = 0, size = sentence.size(); i < size; i++) { 46 | // search for longest concatenation which exists in dict 47 | StringBuilder sb = new StringBuilder(); 48 | String longest = null, current; 49 | int longestIndex = -1; 50 | for (int j = i; j < size; j++) { 51 | current = sb.append(sentence.get(j).word).toString(); 52 | if (!this.dictionary.containsPrefix(current)) break; 53 | if (this.dictionary.contains(current)) { 54 | longest = current; 55 | longestIndex = j; 56 | } 57 | } 58 | 59 | // if found, combine the words and update the sentence 60 | if (longest == null) continue; 61 | Integer startOffset = sentence.get(i).startOffset; 62 | sentence.set(i, new TaggedWord(longest,tag, startOffset,startOffset+longest.length())); 63 | for (int j = longestIndex; j > i; --j) sentence.remove(j); 64 | size = sentence.size(); 65 | } 66 | } 67 | 68 | } 69 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/postprocess/DoubleWordPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.postprocess; 2 | 3 | import org.thunlp.thulac.data.TaggedWord; 4 | import org.thunlp.thulac.util.StringUtils; 5 | 6 | import java.util.List; 7 | 8 | import static 
org.thunlp.thulac.util.CodePointUtils.SPECIAL_CHARS; 9 | 10 | /** 11 | * A postprocess pass combining adjacent words which can form a double word together. 12 | * 13 | * @see #canFormDoubleWord(String, String) 14 | */ 15 | public class DoubleWordPass implements IPostprocessPass { 16 | @Override 17 | public void process(List sentence) { 18 | if (sentence.size() <= 1) return; 19 | 20 | TaggedWord tagged, last = sentence.get(sentence.size() - 1); 21 | for (int i = sentence.size() - 2; i >= 0; --i, last = tagged) { 22 | tagged = sentence.get(i); 23 | if (this.canFormDoubleWord(tagged.word, last.word)) { 24 | tagged.word += last.word; 25 | sentence.remove(i + 1); 26 | } 27 | } 28 | } 29 | 30 | /** 31 | * Two words can form a double word if and only of:
32 | *
    33 | *
  • Both words contain only one code points and,
  • 34 | *
  • The only code points in both words are identical and,
  • 35 | *
  • This code point is not a {@linkplain org.thunlp.thulac.util.CodePointUtils#SPECIAL_CHARS 36 | * special character}.
  • 37 | *
38 | * 39 | * @param first 40 | * The first word. 41 | * @param second 42 | * The second word. 43 | * 44 | * @return If the two words can form a double word. 45 | */ 46 | private boolean canFormDoubleWord(String first, String second) { 47 | if (StringUtils.codePointCount(first) != 1 || 48 | StringUtils.codePointCount(second) != 1) return false; 49 | int firstCP = first.codePointAt(0); 50 | int secondCP = second.codePointAt(0); 51 | return firstCP == secondCP && SPECIAL_CHARS.indexOf(firstCP) == -1; 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/postprocess/FilterPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.postprocess; 2 | 3 | import org.thunlp.thulac.data.Dat; 4 | import org.thunlp.thulac.data.TaggedWord; 5 | import org.thunlp.thulac.util.StringUtils; 6 | 7 | import java.io.IOException; 8 | import java.util.Arrays; 9 | import java.util.HashSet; 10 | import java.util.List; 11 | import java.util.Set; 12 | 13 | import static org.thunlp.thulac.util.CodePointUtils.CHINESE_DIGITS; 14 | import static org.thunlp.thulac.util.CodePointUtils.DIGITS; 15 | 16 | /** 17 | * A postprocess pass which filters forbidden tags from the the word list. 18 | */ 19 | public class FilterPass implements IPostprocessPass { 20 | /** 21 | * Tags allowed to pass the filter. Words with tags out of this list will be 22 | * discarded. 23 | */ 24 | private static final Set ALLOWED_TAGS = new HashSet<>(Arrays.asList( 25 | "n", "np", "ns", "ni", "nz", "v", "a", "id", "t", "uw")); 26 | 27 | private Dat xuDat; 28 | private Dat timeDat; 29 | 30 | public FilterPass(String xuDatFile, String timeDatFile) throws IOException { 31 | this.xuDat = new Dat(xuDatFile); 32 | this.timeDat = new Dat(timeDatFile); 33 | } 34 | 35 | /** 36 | * Returns {@code true} is one of the following is true:
37 | *
    38 | *
  • Word contains one or more normal digits.
  • 39 | *
  • Word contains two or more Chinese digits.
  • 40 | *
  • Word is in dictionary specified by {@link #timeDat}.
  • 41 | *
42 | * 43 | * @param word 44 | * The word to check. 45 | * 46 | * @return Whether the word contains number digits. 47 | */ 48 | private boolean hasNumber(String word) { 49 | int count = 0; 50 | for (int c : StringUtils.toCodePoints(word)) 51 | if (DIGITS.indexOf(c) != -1) return true; 52 | else if (CHINESE_DIGITS.indexOf(c) != -1 && count++ != 0) return true; 53 | return this.timeDat.contains(word); 54 | } 55 | 56 | /** 57 | * Remove words in segmented word list if one of the following is true:
58 | *
    59 | *
  • Tag of word not in {@link #ALLOWED_TAGS}.
  • 60 | *
  • Word in dictionary specified by {@link #timeDat}.
  • 61 | *
  • Word has tag "t" and {@linkplain #hasNumber(String) hasNumber(word)} 62 | * returns {@code true}.
  • 63 | *
64 | * 65 | * @param sentence 66 | * The sentence to filter. 67 | */ 68 | @Override 69 | public void process(List sentence) { 70 | if (this.xuDat == null || this.timeDat == null || sentence.isEmpty()) return; 71 | 72 | for (int i = sentence.size() - 1; i >= 0; --i) { 73 | String word = sentence.get(i).word; 74 | String tag = sentence.get(i).tag; 75 | if (!ALLOWED_TAGS.contains(tag) || this.xuDat.contains(word) || 76 | ("t".equals(tag) && this.hasNumber(word))) sentence.remove(i); 77 | } 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/postprocess/IPostprocessPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.postprocess; 2 | 3 | import org.thunlp.thulac.data.TaggedWord; 4 | 5 | import java.util.List; 6 | 7 | /** 8 | * An interface which process the list of {@link TaggedWord} after segmentation. 9 | */ 10 | public interface IPostprocessPass { 11 | /** 12 | * Process the list of {@link TaggedWord}. 13 | * 14 | * @param sentence 15 | * The list of {@link TaggedWord}. 16 | */ 17 | void process(List sentence); 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/postprocess/NegWordPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.postprocess; 2 | 3 | import org.thunlp.thulac.data.Dat; 4 | import org.thunlp.thulac.data.TaggedWord; 5 | import org.thunlp.thulac.util.StringUtils; 6 | 7 | import java.io.IOException; 8 | import java.util.List; 9 | 10 | /** 11 | * A postprocess pass which recognises certain negative phrases (for example, "not good 12 | * enough" in English) and separate the negative word from the rest parts in the phrase 13 | * (in this example, "not good" is converted into "not" and "good enough") and give the 14 | * separated parts their respective tags. 
A {@link org.thunlp.thulac.data.Dat} file stores the list of negative 15 | * phrases to be separated by {@link #process(java.util.List)}. 16 | */ 17 | public class NegWordPass implements IPostprocessPass { 18 | private Dat negPhrases; 19 | 20 | public NegWordPass(String negDatFile) throws IOException { 21 | this.negPhrases = new Dat(negDatFile); 22 | } 23 | 24 | @Override 25 | public void process(List sentence) { 26 | if (this.negPhrases == null || sentence.isEmpty()) return; 27 | 28 | for (int i = sentence.size() - 1; i >= 0; --i) { 29 | TaggedWord tagged = sentence.get(i); 30 | if (this.negPhrases.contains(tagged.word)) { 31 | int[] codePoints = StringUtils.toCodePoints(tagged.word); 32 | String word = StringUtils.toString(codePoints, 1, codePoints.length - 1); 33 | sentence.add(i + 1, new TaggedWord(word, "v",tagged.startOffset+1,tagged.endOffset)); 34 | sentence.get(i).endOffset = sentence.get(i).startOffset +1; 35 | tagged.word = StringUtils.toString(codePoints[0]); 36 | tagged.tag = "d"; 37 | } 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/postprocess/SpecialPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.postprocess; 2 | 3 | import org.thunlp.thulac.data.TaggedWord; 4 | 5 | import java.util.List; 6 | 7 | /** 8 | * A postprocess path which deals with special cases. 9 | */ 10 | public class SpecialPass implements IPostprocessPass { 11 | @Override 12 | public void process(List sentence) { 13 | this.filterHTTPURLs(sentence); 14 | } 15 | 16 | /** 17 | * Tag "x" for HTTP URLs.
18 | * HTTP URLs are identified as is, if the word is longer than 4 characters and 19 | * starts with "http". (to conform with both {@code http} and {@code https} schemes) 20 | * 21 | * @param sentence 22 | * The input sentence. 23 | */ 24 | private void filterHTTPURLs(List sentence) { 25 | for (TaggedWord tagged : sentence) 26 | if (tagged.word.length() >= 5 && tagged.word.startsWith("http")) 27 | tagged.tag = "x"; 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/postprocess/TimeWordPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.postprocess; 2 | 3 | import org.thunlp.thulac.data.TaggedWord; 4 | import org.thunlp.thulac.util.StringUtils; 5 | 6 | import java.util.List; 7 | 8 | import static org.thunlp.thulac.util.CodePointUtils.DIGITS; 9 | import static org.thunlp.thulac.util.CodePointUtils.generate; 10 | 11 | /** 12 | * A postprocess pass which combine words which together represent a time period into 13 | * one word.
14 | * For example, for input word list {@code "A", "B", "C1", "2", "34" "year"} ("year" 15 | * here can by any Chinese time unit in {@link #TIME_UNITS}), the output should be: 16 | * {@code "A", "B", "C1", "234year"}.
17 | * It can be seen that {@code "C1"} is not concatenated to {@code "234year"}, since it 18 | * contains non-digit characters.
19 | * Please notice that this class is able to deal with full-width numbers like U+FF10 20 | * (full-width digit 1) yet not Chinese digits like U+3007 (Chinese for "one"). 21 | */ 22 | public class TimeWordPass implements IPostprocessPass { 23 | /** 24 | * Chinese characters which represent time units: (description in English)
25 | * YEAR: U+5E74, MONTH: U+6708, DAY: U+65E5 & U+53F7, HOUR: U+65F6 & U+70B9, 26 | * MINUTE: U+5206, SECOND: U+79D2. 27 | */ 28 | private static final String TIME_UNITS = generate('\u5E74', '\u6708', '\u65E5', 29 | '\u53F7', '\u65F6', '\u70B9', '\u5206', '\u79D2'); 30 | 31 | /** 32 | * {@code word} is a number if all the code points in {@code word} is a 33 | * {@linkplain org.thunlp.thulac.util.CodePointUtils#DIGITS digit}. 34 | * 35 | * @param word 36 | * The word to check. 37 | * 38 | * @return Whether this {@code word} is a number. 39 | */ 40 | private boolean isNumber(String word) { 41 | for (int codePoint : StringUtils.toCodePoints(word)) 42 | if (DIGITS.indexOf(codePoint) == -1) return false; 43 | return true; 44 | } 45 | 46 | /** 47 | * {@code word} is a time unit if and only if: {@code word} contains only ont code 48 | * point and this code point is a {@linkplain #TIME_UNITS time unit}. 49 | * 50 | * @param word 51 | * The word to check. 52 | * 53 | * @return Whether this {@code word} is a time unit. 
54 | */ 55 | private boolean isTimeUnit(String word) { 56 | return StringUtils.codePointCount(word) == 1 && 57 | TIME_UNITS.indexOf(word.codePointAt(0)) != -1; 58 | } 59 | 60 | @Override 61 | public void process(List sentence) { 62 | boolean isTimeWord = false; 63 | for (int i = sentence.size() - 1; i >= 0; i--) { 64 | TaggedWord tagged = sentence.get(i); 65 | if (this.isTimeUnit(tagged.word)) isTimeWord = true; 66 | else if (isTimeWord && this.isNumber(tagged.word)) { 67 | tagged.word += sentence.remove(i + 1).word; 68 | tagged.tag = "t"; 69 | } else isTimeWord = false; 70 | } 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/postprocess/VerbPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.postprocess; 2 | 3 | import org.thunlp.thulac.data.Dat; 4 | import org.thunlp.thulac.data.TaggedWord; 5 | 6 | import java.io.IOException; 7 | import java.util.List; 8 | 9 | /** 10 | * A postprocess pass which identifies Dictionary Verbs and Directional Verbs. 
11 | * 12 | * @see Dictionary Verb 13 | * @see 14 | * Dictionary Verb in Chinese 15 | * @see 16 | * Directional Verb 17 | * @see sentence) { 49 | if (this.vM == null || this.vD == null || sentence.isEmpty()) return; 50 | 51 | TaggedWord last = sentence.get(0), tagged; 52 | for (int i = 1, size = sentence.size(); i < size; i++, last = tagged) { 53 | tagged = sentence.get(i + 1); 54 | if (this.tag.equals(last.tag) && this.tag.equals(tagged.tag)) 55 | if (this.vM.contains(last.word)) tagged.tag = "vm"; 56 | else if (this.vD.contains(tagged.word)) tagged.tag = "vd"; 57 | } 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/preprocess/ConvertT2SPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.preprocess; 2 | 3 | import org.thunlp.thulac.data.POCGraph; 4 | import org.thunlp.thulac.util.StringUtils; 5 | 6 | import java.io.DataInputStream; 7 | import java.io.File; 8 | import java.io.FileInputStream; 9 | import java.io.IOException; 10 | import java.util.HashMap; 11 | 12 | /** 13 | * A preprocess pass which convert traditional Chinese characters to simplified ones, 14 | * used when switch {@code -t2s} exists in the command line. 
15 | */ 16 | public class ConvertT2SPass implements IPreprocessPass { 17 | private HashMap t2sMap; 18 | 19 | public ConvertT2SPass(String fileName) throws IOException { 20 | this.t2sMap = new HashMap<>(); 21 | this.loadT2SMap(fileName); 22 | } 23 | 24 | private void loadT2SMap(String filename) throws IOException { 25 | // TODO: adapt NIO 26 | 27 | File mapFile = new File(filename); 28 | // t2s map format: recordCount * DWORD traditional + 29 | // recordCount * DWORD simplified 30 | // -> 8 * recordCount bytes in total 31 | int recordCount = (int) (mapFile.length() >> 3); 32 | 33 | DataInputStream input = new DataInputStream(new FileInputStream(mapFile)); 34 | int[] traditional = new int[recordCount]; // cache 35 | for (int i = 0; i < recordCount; ++i) traditional[i] = input.readInt(); 36 | for (int i = 0; i < recordCount; ++i) { 37 | int simplified = input.readInt(); 38 | this.t2sMap.put(traditional[i], simplified); 39 | } 40 | input.close(); 41 | } 42 | 43 | private int getSimplifiedCodePoint(int c) { 44 | if (this.t2sMap.containsKey(c)) return this.t2sMap.get(c); 45 | return c; 46 | } 47 | 48 | private String convertT2S(String sentence) { 49 | int[] codePoints = StringUtils.toCodePoints(sentence); 50 | StringBuilder sb = new StringBuilder(); 51 | for (int codePoint : codePoints) 52 | sb.appendCodePoint(this.getSimplifiedCodePoint(codePoint)); 53 | return sb.toString(); 54 | } 55 | 56 | @Override 57 | public String process(String raw, POCGraph ignored) { 58 | return this.convertT2S(raw); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/preprocess/IPreprocessPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.preprocess; 2 | 3 | import org.thunlp.thulac.data.POCGraph; 4 | 5 | /** 6 | * An interface which process the raw {@link String} before segmentation. 
7 | */ 8 | public interface IPreprocessPass { 9 | /** 10 | * Process the raw {@link String}. 11 | * 12 | * @param raw 13 | * The raw {@link String} to process. 14 | * @param graph 15 | * The {@link org.thunlp.thulac.data.POCGraph} to write to. 16 | * 17 | * @return The processed {@link String}. 18 | */ 19 | String process(String raw, POCGraph graph); 20 | } 21 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/preprocess/PreProcessPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.preprocess; 2 | 3 | import org.thunlp.thulac.data.POCGraph; 4 | import org.thunlp.thulac.util.StringUtils; 5 | 6 | import static org.thunlp.thulac.util.CodePointUtils.SPECIAL_CHARS; 7 | import static org.thunlp.thulac.util.CodePointUtils.WHITESPACE_CHARS; 8 | 9 | /** 10 | * A preprocess pass which cleans raw input up. 11 | */ 12 | public class PreProcessPass implements IPreprocessPass { 13 | // TODO: add more documentation 14 | 15 | private static final String SINGLE_PUNCTUATION_CODE_POINTS = StringUtils.toString( 16 | 65292, 12290, 65311, 65281, 65306, 65307, 8216, 8217, 8220, 8221, 1230, 12304, 17 | 12305, 12289, 12298, 12299, 64, 35, 65288, 65289, 34, 91, 93, 126, 47, 44, 58, 18 | 63, 9700, 9734, 9733, 8230, 39, 33, 42, 43, 62, 40, 41, 59, 61); 19 | 20 | private boolean isSinglePunctuation(int c) { 21 | return SINGLE_PUNCTUATION_CODE_POINTS.indexOf(c) != -1; 22 | } 23 | 24 | private String cleanup(String sentence, POCGraph graph) { 25 | StringBuilder cleaned = new StringBuilder(); 26 | graph.clear(); 27 | boolean spaceFlag = false, otherFlag = false, 28 | singlePunctuationFlag = false, titleFlag = false; 29 | 30 | int titleStart = 0; 31 | int[] codePoints = StringUtils.toCodePoints(sentence); 32 | for (int c : codePoints) { 33 | if (WHITESPACE_CHARS.indexOf(c) != -1) { 34 | otherFlag = false; 35 | if (spaceFlag) continue; 36 | if (!graph.isEmpty()) 37 
| graph.setElementAt(graph.lastElement() & 12, graph.size() - 1); 38 | spaceFlag = true; 39 | continue; 40 | } 41 | 42 | cleaned.appendCodePoint(c); 43 | if (SPECIAL_CHARS.indexOf(c) != -1) { 44 | if (spaceFlag) { 45 | singlePunctuationFlag = this.isSinglePunctuation(c); 46 | graph.add(singlePunctuationFlag ? 8 : 9); 47 | spaceFlag = false; 48 | } else { 49 | if (otherFlag) { 50 | if (this.isSinglePunctuation(c)) { 51 | if (!graph.isEmpty()) graph.setElementAt( 52 | graph.lastElement() & 12, graph.size() - 1); 53 | graph.add(8); 54 | } else if (singlePunctuationFlag) graph.add(9); 55 | else { 56 | if (!graph.isEmpty() && graph.lastElement() == 0) 57 | graph.setElementAt(7, graph.size() - 1); 58 | graph.add(2); 59 | } 60 | } else graph.add(9); 61 | singlePunctuationFlag = this.isSinglePunctuation(c); 62 | } 63 | otherFlag = true; 64 | 65 | if (c == 12298) titleStart = graph.size(); 66 | else if (c == 12299 && titleFlag) { 67 | int titleEnd = graph.size() - 2; 68 | if (titleEnd <= titleStart + 9) 69 | if (titleStart == titleEnd) graph.setElementAt(9, titleStart); 70 | else { 71 | graph.setElementAt(1, titleStart); 72 | for (int i = titleStart + 1; i < titleEnd; ++i) 73 | graph.setElementAt(2, i); 74 | graph.setElementAt(4, titleEnd); 75 | } 76 | } 77 | titleFlag = c == 12298; 78 | } else { 79 | if (spaceFlag) graph.add(9); 80 | else if (otherFlag) { 81 | graph.setElementAt(graph.lastElement() & 12, graph.size() - 1); 82 | graph.add(9); 83 | singlePunctuationFlag = false; 84 | } else graph.add(15); 85 | spaceFlag = false; 86 | otherFlag = false; 87 | } 88 | } 89 | 90 | // deal with first & last character 91 | if (!graph.isEmpty()) { 92 | int first = graph.firstElement() & 9, last = graph.lastElement() & 12; 93 | graph.setElementAt(first == 0 ? 9 : first, 0); 94 | graph.setElementAt(last == 0 ? 
/**
 * An utility class which deals with buffers.
 *
 * @see java.nio.Buffer
 */
public class BufferUtils {
	/**
	 * Read ints from {@code channel} using {@code buf} as buffer, putting them
	 * sequentially into the arrays of {@code int[]} represented by {@code arrays}.
	 * {@code buf} is ready for writing (cleared) when this method returns {@code
	 * true}; {@code channel} is NOT closed after this method returns (since the EOF
	 * might not have been reached yet), therefore users should close it manually.
	 *
	 * <p>Fix over the previous version: when a read ends in the middle of a 4-byte
	 * int (possible whenever {@code buf}'s capacity is not a multiple of 4), the
	 * trailing partial bytes are now carried over to the next read instead of being
	 * silently discarded by {@code buf.clear()}, which corrupted every int read
	 * afterwards.
	 *
	 * @param channel
	 * 		The {@link SeekableByteChannel} to read from.
	 * @param buf
	 * 		The {@link ByteBuffer} to use as buffer; its capacity must be at least
	 * 		{@link Integer#BYTES} (4), otherwise no progress can ever be made.
	 * @param arrays
	 * 		The array of {@code int[]} to store the read ints.
	 *
	 * @return {@code true} if all the arrays were successfully filled with data read
	 * from {@code channel}; {@code false} if EOF was reached before all the arrays
	 * were filled. In the special case that all arrays are filled exactly when EOF
	 * is reached, {@code true} is returned.
	 *
	 * @throws IOException
	 * 		If an exception is thrown while reading from {@code channel}.
	 * @throws NullPointerException
	 * 		If {@code channel}, {@code buf}, or any element of {@code arrays} is null.
	 */
	public static boolean readInts(
			SeekableByteChannel channel, ByteBuffer buf, int[]... arrays)
			throws IOException {
		int position = 0, offset = 0;
		int[] current = arrays[position];
		int currentLeft = current.length;

		while (true) {
			// read buffer; if EOF is reached while arrays remain unfilled, fail
			if (channel.read(buf) == -1) return false;
			buf.flip();
			IntBuffer intBuf = buf.asIntBuffer();
			// number of whole ints available in the buffer right now
			int availableInts = intBuf.remaining();
			int consumedInts = availableInts;

			// copy buffer content to arrays
			while (availableInts > 0) {
				int getLen = Math.min(availableInts, currentLeft);
				intBuf.get(current, offset, getLen);
				offset += getLen;
				availableInts -= getLen;
				currentLeft -= getLen;

				if (currentLeft == 0) { // current array is filled, move to the next
					++position;
					if (position == arrays.length) { // all arrays have been filled
						buf.clear();
						return true;
					}
					current = arrays[position];
					offset = 0;
					currentLeft = current.length;
				}
			}

			// Preserve any trailing partial int (1-3 bytes): skip past the bytes
			// consumed as whole ints and compact the remainder to the buffer start,
			// so the next channel.read() appends right after it.
			buf.position(consumedInts * Integer.BYTES);
			buf.compact();
		}
	}
}
-------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/util/CodePointUtils.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.util; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | /** 7 | * An utility class providing definitions for many sets of code points. 8 | */ 9 | public class CodePointUtils { 10 | /** 11 | * ASCII and full-width digits. 12 | */ 13 | public static final String DIGITS = 14 | generate(range('0', '9'), range('\uFF10', '\uFF19')); 15 | 16 | /** 17 | * Chinese digits. 18 | */ 19 | public static final String CHINESE_DIGITS = generate('\u3007', '\u4E00', '\u4E8C', 20 | '\u4E09', '\u56DB', '\u4E94', '\u516D', '\u4E03', '\u516B', '\u4E5D'); 21 | 22 | /** 23 | * Special characters, containing:
24 | *
    25 | *
  • Chinese full-width punctuations:
    26 | * U+FF0C: Comma, U+3002: Full Stop, U+FF1F: Question Mark, U+FF01: Exclamation 27 | * Mark, U+FF1A: Colon, U+FF1B: Semicolon, U+3010 & U+3011: Brackets, U+3001: 28 | * Ideographic Comma, U+300A & U+300B: Guillemets, U+FF08 & U+FF09: Parentheses. 29 | *
  • 30 | *
  • Standard punctuations:
    31 | * U+2018 & U+2019: Single Quotation Marks,U+201C & U+201D: Double Quotation 32 | * Marks, U+00B7: Middle Point, U+2026: Horizontal Ellipsis, U+2014: Em Dash. 33 | *
  • 34 | *
  • Special characters: 35 | * U+FFE5: Full-width Yen Sign, U+25E4: Black Upper Left Triangle, U+2605: Black 36 | * Star, U+2606: White Star. 37 | *
  • 38 | *
  • ASCII characters: All printable ASCII characters (from U+0021 to 39 | * U+007E) except for U+0060: Grave Accent.
  • 40 | *
41 | * (All of above character names are referred from the Unicode Consortium.) 42 | */ 43 | public static final String SPECIAL_CHARS = generate('\uFF0C', '\u3002', '\uFF1F', 44 | '\uFF01', '\uFF1A', '\uFF1B', '\u3010', '\u3011', '\u3001', '\u300A', 45 | '\u300B', '\uFF08', '\uFF09', '\u2018', '\u2019', '\u201C', '\u201D', 46 | '\u00B7', '\u2026', '\u2014', '\uFFE5', '\u25E4', '\u2605', '\u2606', 47 | range('\u0021', '\u005F'), range('\u0061', '\u007E')); 48 | 49 | /** 50 | * Whitespaces: U+0020 & U+3000. 51 | */ 52 | public static final String WHITESPACE_CHARS = generate('\u0020', '\u3000'); 53 | 54 | /** 55 | * Generate a {@link String} containing a list of code points produced following 56 | * these steps:
57 | *
    58 | *
  1. Let {@code list} be the empty list of integers.
  2. 59 | *
  3. For each {@link Object} {@code param} in {@code params}, sequentially from 60 | * {@code params[0]} to {@code params[params.length - 1]}, switch on {@code 61 | * param}'s class:
    62 | *
      63 | *
    • {@link Integer}: Append {@code param} to {@code list}.
    • 64 | *
    • {@code int[]}: Append every integer in {@code param} to {@code 65 | * list}.
    • 66 | *
    • {@link Character}: Append {@code param}, converted to {@code char} 67 | * and then to {@code int} and then to {@link Integer}, to {@code list}.
    • 68 | *
    • {@link String}: Append every code point in the content of {@code 69 | * param} retrieved using {@link StringUtils#toCodePoints(String)} to {@code 70 | * list}.
    • 71 | *
    • Other: Do nothing.
    • 72 | *
    73 | *
  4. 74 | *
  5. Convert {@code list} to {@link String} using {@link StringUtils#toString(int...)}
  6. 75 | *
76 | * 77 | * @param params 78 | * The input parameters. 79 | * 80 | * @return The generated {@link String}. 81 | */ 82 | public static String generate(Object... params) { 83 | List codePoints = new ArrayList<>(); 84 | for (Object param : params) 85 | if (param instanceof Integer) codePoints.add((Integer) param); 86 | else if (param instanceof int[]) for (int codePoint : (int[]) param) 87 | codePoints.add(codePoint); 88 | else if (param instanceof String) 89 | for (int codePoint : StringUtils.toCodePoints((String) param)) 90 | codePoints.add(codePoint); 91 | else if (param instanceof Character) codePoints.add((int) (Character) param); 92 | 93 | int[] cps = new int[codePoints.size()]; 94 | for (int i = 0, size = codePoints.size(); i < size; ++i) 95 | cps[i] = codePoints.get(i); 96 | 97 | return StringUtils.toString(cps); 98 | } 99 | 100 | /** 101 | * Return an {@code int[]} containing code points ranging from start to end 102 | * (inclusive); 103 | */ 104 | public static int[] range(int start, int end) { 105 | if (end < start) return null; 106 | int[] range = new int[end - start + 1]; 107 | for (int i = start; i <= end; ++i) range[i - start] = i; 108 | return range; 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/util/IOUtils.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.util; 2 | 3 | import org.thunlp.thulac.io.IInputProvider; 4 | import org.thunlp.thulac.io.IOutputHandler; 5 | import org.thunlp.thulac.io.ReaderInputProvider; 6 | import org.thunlp.thulac.io.StringInputProvider; 7 | import org.thunlp.thulac.io.StringOutputHandler; 8 | import org.thunlp.thulac.io.WriterOutputHandler; 9 | 10 | import java.io.BufferedReader; 11 | import java.io.BufferedWriter; 12 | import java.io.File; 13 | import java.io.IOException; 14 | import java.io.InputStream; 15 | import java.io.InputStreamReader; 16 | import 
java.io.OutputStream; 17 | import java.io.OutputStreamWriter; 18 | import java.nio.charset.Charset; 19 | import java.nio.charset.StandardCharsets; 20 | import java.nio.charset.UnsupportedCharsetException; 21 | import java.nio.file.Files; 22 | import java.nio.file.Paths; 23 | import java.util.ArrayList; 24 | import java.util.List; 25 | import java.util.regex.Matcher; 26 | import java.util.regex.Pattern; 27 | 28 | /** 29 | * A class which provides static utility methods used dealing with {@link org.thunlp.thulac.io.IInputProvider} 30 | * and {@link IOutputHandler}. Some of them construct instances of {@link org.thunlp.thulac.io.IInputProvider} 31 | * and {@link IOutputHandler}, hiding the implementation details from the user. Others 32 | * can be used within implementations of {@link org.thunlp.thulac.io.IInputProvider} and 33 | * {@link IOutputHandler}, avoiding code duplicates. 34 | * 35 | * @see org.thunlp.thulac.io.IInputProvider 36 | * @see IOutputHandler 37 | */ 38 | public class IOUtils { 39 | /** 40 | * Creates an instance of {@link org.thunlp.thulac.io.IInputProvider} which retrieves input from 41 | * {@link System#in}, using the default charset as the input encoding. 42 | * 43 | * @return The {@link org.thunlp.thulac.io.IInputProvider} created. 44 | */ 45 | public static IInputProvider inputFromConsole() { 46 | return inputFromInputStream(System.in); // use default charset for System.in 47 | } 48 | 49 | /** 50 | * Creates an instance of {@link org.thunlp.thulac.io.IInputProvider} which retrieves input from a given 51 | * {@link java.io.InputStream} using UTF-8 as encoding.
52 | * It is recommended to use {@link #inputFromFile(java.io.File, java.nio.charset.Charset)} when reading 53 | * input from files, since it takes better advantage of Java NIO and have better 54 | * performances. 55 | * 56 | * @param in 57 | * The {@link java.io.InputStream} to retrieve input from. 58 | * 59 | * @return The {@link org.thunlp.thulac.io.IInputProvider} created. 60 | */ 61 | public static IInputProvider inputFromInputStream(InputStream in) { 62 | return inputFromInputStream(in, (Charset) null); 63 | } 64 | 65 | /** 66 | * Creates an instance of {@link org.thunlp.thulac.io.IInputProvider} which retrieves input from a given 67 | * {@link java.io.InputStream} using a given charset as encoding.
68 | * It is recommended to use {@link #inputFromFile(java.io.File, java.nio.charset.Charset)} when reading 69 | * input from files, since it takes better advantage of Java NIO and have better 70 | * performances. 71 | * 72 | * @param in 73 | * The {@link java.io.InputStream} to retrieve input from. 74 | * @param charsetName 75 | * The optional name of the charset to use, defaulted to "UTF-8". 76 | * 77 | * @return The {@link org.thunlp.thulac.io.IInputProvider} created. 78 | */ 79 | public static IInputProvider inputFromInputStream(InputStream in, String charsetName) 80 | throws UnsupportedCharsetException { 81 | return inputFromInputStream(in, forName(charsetName)); 82 | } 83 | 84 | /** 85 | * Creates an instance of {@link org.thunlp.thulac.io.IInputProvider} which retrieves input from a given 86 | * {@link java.io.InputStream} using a given charset as encoding.
87 | * It is recommended to use {@link #inputFromFile(java.io.File, java.nio.charset.Charset)} when reading 88 | * input from files, since it takes better advantage of Java NIO and have better 89 | * performances. 90 | * 91 | * @param in 92 | * The {@link java.io.InputStream} to retrieve input from. 93 | * @param charset 94 | * The optional charset to use, defaulted to UTF-8. 95 | * 96 | * @return The {@link org.thunlp.thulac.io.IInputProvider} created. 97 | */ 98 | public static IInputProvider inputFromInputStream(InputStream in, Charset charset) { 99 | return new ReaderInputProvider(new BufferedReader( 100 | new InputStreamReader(in, getOrDefault(charset)))); 101 | } 102 | 103 | /** 104 | * Creates an instance of {@link org.thunlp.thulac.io.IInputProvider} which retrieves input from the 105 | * given file using UTF-8 as file encoding. 106 | * 107 | * @param filename 108 | * The name of the file to retrieve input from. 109 | * 110 | * @return The {@link org.thunlp.thulac.io.IInputProvider} created. 111 | * 112 | * @throws java.io.IOException 113 | * If the file does not exist or is not readable. 114 | */ 115 | public static IInputProvider inputFromFile(String filename) throws IOException { 116 | return inputFromFile(filename, (Charset) null); 117 | } 118 | 119 | /** 120 | * Creates an instance of {@link org.thunlp.thulac.io.IInputProvider} which retrieves input from the 121 | * given file using UTF-8 as file encoding. 122 | * 123 | * @param file 124 | * The file to retrieve input from. 125 | * 126 | * @return The {@link org.thunlp.thulac.io.IInputProvider} created. 127 | * 128 | * @throws java.io.IOException 129 | * If the file does not exist or is not readable. 
130 | */ 131 | public static IInputProvider inputFromFile(File file) throws IOException { 132 | return inputFromFile(file, (Charset) null); 133 | } 134 | 135 | /** 136 | * Creates an instance of {@link org.thunlp.thulac.io.IInputProvider} which retrieves input from the 137 | * given file using a given charset as encoding. 138 | * 139 | * @param filename 140 | * The name of the file to retrieve input from. 141 | * @param charsetName 142 | * The optional name of the charset to use, defaulted to "UTF-8". 143 | * 144 | * @return The {@link org.thunlp.thulac.io.IInputProvider} created. 145 | * 146 | * @throws java.io.IOException 147 | * If the file does not exist or is not readable. 148 | * @throws java.nio.charset.UnsupportedCharsetException 149 | * If the charset referred to by the given name is not supported. 150 | */ 151 | public static IInputProvider inputFromFile(String filename, String charsetName) 152 | throws IOException, UnsupportedCharsetException { 153 | return inputFromFile(filename, forName(charsetName)); 154 | } 155 | 156 | /** 157 | * Creates an instance of {@link org.thunlp.thulac.io.IInputProvider} which retrieves input from the 158 | * given file using a given charset as encoding. 159 | * 160 | * @param filename 161 | * The file to retrieve input from. 162 | * @param charset 163 | * The optional file encoding to use, defaulted to UTF-8. 164 | * 165 | * @return The {@link org.thunlp.thulac.io.IInputProvider} created. 166 | * 167 | * @throws java.io.IOException 168 | * If the file does not exist or is not readable. 169 | */ 170 | public static IInputProvider inputFromFile(String filename, Charset charset) 171 | throws IOException { 172 | if (filename == null) return null; // new File(null) throws NPE 173 | return inputFromFile(new File(filename), charset); 174 | } 175 | 176 | /** 177 | * Creates an instance of {@link org.thunlp.thulac.io.IInputProvider} which retrieves input from the 178 | * given file using a given charset as encoding. 
179 | * 180 | * @param file 181 | * The name of the file to retrieve input from. 182 | * @param charsetName 183 | * The optional name of the file encoding to use, defaulted to UTF-8. 184 | * 185 | * @return The {@link org.thunlp.thulac.io.IInputProvider} created. 186 | * 187 | * @throws java.io.IOException 188 | * If the file does not exist or is not readable. 189 | * @throws java.nio.charset.UnsupportedCharsetException 190 | * If the charset referred to by the given name is not supported. 191 | */ 192 | public static IInputProvider inputFromFile(File file, String charsetName) 193 | throws IOException, UnsupportedCharsetException { 194 | return inputFromFile(file, forName(charsetName)); 195 | } 196 | 197 | /** 198 | * Creates an instance of {@link org.thunlp.thulac.io.IInputProvider} which retrieves input from the 199 | * given file using a given charset as encoding. 200 | * 201 | * @param file 202 | * The name of the file to retrieve input from. 203 | * @param charset 204 | * The optional file encoding to use, defaulted to UTF-8. 205 | * 206 | * @return The {@link org.thunlp.thulac.io.IInputProvider} created. 207 | * 208 | * @throws java.io.IOException 209 | * If the file does not exist or is not readable. 210 | */ 211 | public static IInputProvider inputFromFile(File file, Charset charset) 212 | throws IOException { 213 | if (file == null) return null; 214 | return new ReaderInputProvider( 215 | Files.newBufferedReader(Paths.get(file.toURI()), getOrDefault(charset))); 216 | } 217 | 218 | /** 219 | * Creates an instance of {@link org.thunlp.thulac.io.IInputProvider} which retrieves input from the 220 | * given {@link String}. 221 | * 222 | * @param input 223 | * The input string. 224 | * 225 | * @return The {@link org.thunlp.thulac.io.IInputProvider} created. 
226 | */ 227 | public static IInputProvider inputFromString(String input) { 228 | if (input == null) return null; 229 | return new StringInputProvider(input); 230 | } 231 | 232 | /** 233 | * Creates an instance of {@link IOutputHandler} which writes output to 234 | * {@link System#out}, using the default charset as the output encoding. 235 | * 236 | * @return The {@link IOutputHandler} created. 237 | */ 238 | public static IOutputHandler outputToConsole() { 239 | return new WriterOutputHandler(new BufferedWriter( 240 | new OutputStreamWriter(System.out))); 241 | } 242 | 243 | /** 244 | * Creates an instance of {@link IOutputHandler} which writes output to a given 245 | * {@link java.io.OutputStream} using UTF-8 as encoding.
246 | * It is recommended to use {@link #outputToFile(java.io.File, String)} when writing 247 | * output to files, since it takes better advantage of Java NIO and have better 248 | * performances. 249 | * 250 | * @param out 251 | * The {@link java.io.OutputStream} to write output to. 252 | * 253 | * @return The {@link IOutputHandler} created. 254 | */ 255 | public static IOutputHandler outputToOutputStream(OutputStream out) { 256 | return outputToOutputStream(out, (Charset) null); 257 | } 258 | 259 | /** 260 | * Creates an instance of {@link IOutputHandler} which writes output to a given 261 | * {@link java.io.OutputStream} using a given charset as encoding.
262 | * It is recommended to use {@link #outputToFile(java.io.File, String)} when writing 263 | * output to files, since it takes better advantage of Java NIO and have better 264 | * performances. 265 | * 266 | * @param out 267 | * The {@link java.io.OutputStream} to write output to. 268 | * @param charsetName 269 | * The optional name of the charset to use, defaulted to UTF-8. 270 | * 271 | * @return The {@link IOutputHandler} created. 272 | * 273 | * @throws java.nio.charset.UnsupportedCharsetException 274 | * If the charset referred to by the name is not supported. 275 | */ 276 | public static IOutputHandler outputToOutputStream( 277 | OutputStream out, String charsetName) throws UnsupportedCharsetException { 278 | return outputToOutputStream(out, forName(charsetName)); 279 | } 280 | 281 | /** 282 | * Creates an instance of {@link IOutputHandler} which writes output to a given 283 | * {@link java.io.OutputStream} using a given charset as encoding.
284 | * It is recommended to use {@link #outputToFile(java.io.File, String)} when writing 285 | * output to files, since it takes better advantage of Java NIO and have better 286 | * performances. 287 | * 288 | * @param out 289 | * The {@link java.io.OutputStream} to write output to. 290 | * @param charset 291 | * The optional charset to use, defaulted to UTF-8. 292 | * 293 | * @return The {@link IOutputHandler} created. 294 | */ 295 | public static IOutputHandler outputToOutputStream(OutputStream out, Charset charset) { 296 | return new WriterOutputHandler(new BufferedWriter( 297 | new OutputStreamWriter(out, getOrDefault(charset)))); 298 | } 299 | 300 | /** 301 | * Creates an instance of {@link IOutputHandler} which writes output to the 302 | * given file using UTF-8 as file encoding. 303 | * 304 | * @param filename 305 | * The name of the file to output to. 306 | * 307 | * @return The {@link IOutputHandler} created. 308 | * 309 | * @throws java.io.IOException 310 | * If the file cannot be created or is not writable. 311 | */ 312 | public static IOutputHandler outputToFile(String filename) throws IOException { 313 | return outputToFile(filename, (Charset) null); 314 | } 315 | 316 | /** 317 | * Creates an instance of {@link IOutputHandler} which writes output to the 318 | * given file using UTF-8 as file encoding. 319 | * 320 | * @param file 321 | * The file to output to. 322 | * 323 | * @return The {@link IOutputHandler} created. 324 | * 325 | * @throws java.io.IOException 326 | * If the file cannot be created or is not writable. 327 | */ 328 | public static IOutputHandler outputToFile(File file) throws IOException { 329 | return outputToFile(file, (Charset) null); 330 | } 331 | 332 | /** 333 | * Creates an instance of {@link IOutputHandler} which writes output to the 334 | * given file using a given charset as encoding. 335 | * 336 | * @param filename 337 | * The name of the file to output to. 
338 | * @param charsetName 339 | * The optional name of the charset to use, defaulted to "UTF-8". 340 | * 341 | * @return The {@link IOutputHandler} created. 342 | * 343 | * @throws java.io.IOException 344 | * If the file cannot be created or is not writable. 345 | * @throws java.nio.charset.UnsupportedCharsetException 346 | * If the charset referred to by the given name is not supported. 347 | */ 348 | public static IOutputHandler outputToFile(String filename, String charsetName) 349 | throws IOException, UnsupportedCharsetException { 350 | return outputToFile(filename, forName(charsetName)); 351 | } 352 | 353 | /** 354 | * Creates an instance of {@link IOutputHandler} which writes output to the 355 | * given file using a given charset as encoding. 356 | * 357 | * @param filename 358 | * The name of the file to output to. 359 | * @param charset 360 | * The optional file encoding to use, defaulted to UTF-8. 361 | * 362 | * @return The {@link IOutputHandler} created. 363 | * 364 | * @throws java.io.IOException 365 | * If the file cannot be created or is not writable. 366 | */ 367 | public static IOutputHandler outputToFile(String filename, Charset charset) 368 | throws IOException { 369 | if (filename == null) return null; // new File(null) throws NPE 370 | return outputToFile(new File(filename), charset); 371 | } 372 | 373 | /** 374 | * Creates an instance of {@link IOutputHandler} which writes output to the 375 | * given file using a given charset as encoding. 376 | * 377 | * @param file 378 | * The file to output to. 379 | * @param charsetName 380 | * The optional name of the file encoding to use, defaulted to "UTF-8". 381 | * 382 | * @return The {@link IOutputHandler} created. 383 | * 384 | * @throws java.io.IOException 385 | * If the file cannot be created or is not writable. 386 | * @throws java.nio.charset.UnsupportedCharsetException 387 | * If the charset referred to by the given name is not supported. 
388 | */ 389 | public static IOutputHandler outputToFile(File file, String charsetName) 390 | throws IOException, UnsupportedCharsetException { 391 | return outputToFile(file, forName(charsetName)); 392 | } 393 | 394 | /** 395 | * Creates an instance of {@link IOutputHandler} which writes output to the 396 | * given file using a given charset as encoding. 397 | * 398 | * @param file 399 | * The file to output to. 400 | * @param charset 401 | * The optional file encoding to use, defaulted to UTF-8. 402 | * 403 | * @return The {@link IOutputHandler} created. 404 | * 405 | * @throws java.io.IOException 406 | * If the file cannot be created or is not writable. 407 | */ 408 | public static IOutputHandler outputToFile(File file, Charset charset) 409 | throws IOException { 410 | if (file == null) return null; 411 | return new WriterOutputHandler( 412 | Files.newBufferedWriter(Paths.get(file.toURI()), getOrDefault(charset))); 413 | } 414 | 415 | /** 416 | * Creates an instance of {@link org.thunlp.thulac.io.StringOutputHandler} which writes output to an 417 | * {@link String} in memory.
418 | * It is typical to use this method like this: 419 | *

420 | 	 * StringOutputHandler output = IOUtils.outputToString();
421 | 	 * Thulac.split(input, output, segOnly); // or anything else
422 | 	 * String outputStr = output.getString();
423 | 	 * 
424 | * 425 | * @return The {@link org.thunlp.thulac.io.StringOutputHandler} created. 426 | */ 427 | public static StringOutputHandler outputToString() { 428 | return new StringOutputHandler(); 429 | } 430 | 431 | private static final int MAX_LENGTH = 20000; 432 | private static final Pattern SPLIT_PATTERN = 433 | Pattern.compile(".*([\u3002\uff1f\uff01\uff1b;!?]|$)"); 434 | 435 | /** 436 | * Split a given line into a list of line segments if the line is too long. It is 437 | * promised that each line segment either is the last one or ends with an 438 | * punctuation character. 439 | * 440 | * @param line 441 | * The line to split into line segments. 442 | * 443 | * @return The list of line segments split. 444 | */ 445 | public static List getLineSegments(String line) { 446 | List lineSegments = new ArrayList<>(); 447 | if (line.length() < MAX_LENGTH) lineSegments.add(line); 448 | else { // split the line into short line segments 449 | Matcher matcher = SPLIT_PATTERN.matcher(line); 450 | while (matcher.find()) lineSegments.add(matcher.group()); 451 | } 452 | return lineSegments; 453 | } 454 | 455 | /** 456 | * Returns a {@link java.nio.charset.Charset} wich name {@code charset}. This methods differs from 457 | * the {@link java.nio.charset.Charset#forName(String)} when {@code charset} is {@code null}, with 458 | * this method returning {@code null} while {@link java.nio.charset.Charset#forName(String)} throws 459 | * an NPE. 460 | * 461 | * @param charset 462 | * The name of the {@link java.nio.charset.Charset}. 463 | * 464 | * @return The {@link java.nio.charset.Charset} with name {@code charset}. 465 | * 466 | * @throws java.nio.charset.UnsupportedCharsetException 467 | * If the charset referred to by the given name is not supported. 
468 | */ 469 | private static Charset forName(String charset) throws UnsupportedCharsetException { 470 | if (charset == null) return null; 471 | return Charset.forName(charset); 472 | } 473 | 474 | /** 475 | * Returns the given {@link java.nio.charset.Charset} when non-null, or 476 | * {@link java.nio.charset.StandardCharsets#UTF_8} otherwise, since many applications using 477 | * {@link java.nio.charset.Charset} throws NPE if charset is {@code null}. 478 | * 479 | * @param charset 480 | * The given {@link java.nio.charset.Charset}. 481 | * 482 | * @return {@code charset} when non-null, {@link java.nio.charset.StandardCharsets#UTF_8} otherwise. 483 | */ 484 | private static Charset getOrDefault(Charset charset) { 485 | return charset == null ? StandardCharsets.UTF_8 : charset; 486 | } 487 | } 488 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/util/StringUtils.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.util; 2 | 3 | /** 4 | * An utility class which deals with string, converting array of code points to and from 5 | * strings. 6 | */ 7 | public class StringUtils { 8 | /** 9 | * Convert an array of code points to {@link String}. 10 | * 11 | * @param codePoints 12 | * The code points to convert. 13 | * 14 | * @return The converted {@link String}. 15 | */ 16 | public static String toString(int... codePoints) { 17 | return toString(codePoints, 0, codePoints.length); 18 | } 19 | 20 | /** 21 | * Convert an array of code points to {@link String}. 22 | * 23 | * @param codePoints 24 | * The code points to convert. 25 | * @param offset 26 | * The starting offset of {@code codePoints}. 27 | * @param len 28 | * The number of code points to convert. 29 | * 30 | * @return The converted {@link String}, indices which exceeds {@code 31 | * codePoints.length} are discarded. 
32 | */ 33 | public static String toString(int[] codePoints, int offset, int len) { 34 | StringBuilder sb = new StringBuilder(); 35 | for (int i = offset, max = Math.min(codePoints.length, offset + len); 36 | i < max; ++i) 37 | sb.appendCodePoint(codePoints[i]); 38 | return sb.toString(); 39 | } 40 | 41 | /** 42 | * Convert a {@link String} to an array of code points.
43 | * Internally, data in {@link String} is stored in {@code char[]}, however for 44 | * Unicode code points greater than U+FFFF, one {@code char} (that is, two bytes) 45 | * is not enough. Therefore, Java uses surrogates to divide code points 46 | * that cannot be represented by one {@code} into two. The problem is, 47 | * {@link String#length()} return the length of its internal {@code char[]}, while 48 | * the return value of {@link String#length()} is not necessarily (though in most 49 | * cases) equal to the number of code points stored in the {@link String}.
50 | * To solve this problem, the {@link String} class provides a set of methods to 51 | * retrieve the actual number of code points stored and to access a code points in 52 | * the {@link String} using the index by code points, as implemented in this method. 53 | * However, the iteration through a {@link String} by the actual code points is 54 | * fairly complicated, and it is much easier for applications to achieve this if 55 | * the string data is stored as {@code int[]}, each element representing a code point. 56 | * And this is exactly What this method does: take a {@link String} as input, 57 | * convert it into a {@code int[]} which contains exactly the same data as the 58 | * {@link String}.
59 | * It is recommended that all applications which iterate through the characters 60 | * stored in a {@link String} use
61 | *

 62 | 	 * int[] codePoints = StringUtils.toCodePoints(str);
 63 | 	 * for (int codePoint: codePoints) // do something ...
 64 | 	 * 
65 | * instead of the traditional
66 | *

 67 | 	 * for (int i = 0, length = str.length(); i < length; ++i) {
 68 | 	 *     char c = str.charAt(i);
 69 | 	 *     // do something ...
 70 | 	 * }
 71 | 	 * 
72 | * 73 | * @param str 74 | * The {@link String} to convert. 75 | * 76 | * @return The converted array of code points. 77 | */ 78 | public static int[] toCodePoints(String str) { 79 | if (str == null) return null; 80 | int codePointCount = str.codePointCount(0, str.length()); 81 | int[] codePoints = new int[codePointCount]; 82 | for (int i = 0; i < codePointCount; ++i) 83 | codePoints[i] = str.codePointAt(str.offsetByCodePoints(0, i)); 84 | return codePoints; 85 | } 86 | 87 | /** 88 | * Return the number of code points in the given {@link String}. 89 | * 90 | * @param str 91 | * The given {@link String}. 92 | * 93 | * @return The number of code points in {@code str}. 94 | */ 95 | public static int codePointCount(String str) { 96 | return str.codePointCount(0, str.length()); 97 | } 98 | 99 | /** 100 | * Return code point {@code index}-ith code point in the given {@link String}. 101 | * 102 | * @param str 103 | * The given {@link String}. 104 | * @param index 105 | * The index of the code point to return. 106 | * 107 | * @return The cde point at {@code index}. 108 | * 109 | * @throws IndexOutOfBoundsException 110 | * If index is negative or greater than or equal to the number of code points 111 | * of {@code str}. 112 | */ 113 | public static int codePointAt(String str, int index) { 114 | int codePointIndex = str.offsetByCodePoints(0, index); 115 | return str.codePointAt(codePointIndex); 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /src/main/resources/plugin-descriptor.properties: -------------------------------------------------------------------------------- 1 | # Elasticsearch plugin descriptor file 2 | # This file must exist as 'plugin-descriptor.properties' at 3 | # the root directory of all plugins. 4 | # 5 | # A plugin can be 'site', 'jvm', or both. 
6 | # 7 | ### example site plugin for "foo": 8 | # 9 | # foo.zip <-- zip file for the plugin, with this structure: 10 | # _site/ <-- the contents that will be served 11 | # plugin-descriptor.properties <-- example contents below: 12 | # 13 | # site=true 14 | # description=My cool plugin 15 | # version=1.0 16 | # 17 | ### example jvm plugin for "foo" 18 | # 19 | # foo.zip <-- zip file for the plugin, with this structure: 20 | # .jar <-- classes, resources, dependencies 21 | # .jar <-- any number of jars 22 | # plugin-descriptor.properties <-- example contents below: 23 | # 24 | # jvm=true 25 | # classname=foo.bar.BazPlugin 26 | # description=My cool plugin 27 | # version=2.0.0-rc1 28 | # elasticsearch.version=2.0 29 | # java.version=1.7 30 | # 31 | ### mandatory elements for all plugins: 32 | # 33 | # 'description': simple summary of the plugin 34 | description=A thulac analysis of plugins for Elasticsearch 35 | # 36 | # 'version': plugin's version 37 | version=7.9.1 38 | # 39 | # 'name': the plugin name 40 | name=analysis-thulac-plugin 41 | 42 | ### mandatory elements for site plugins: 43 | # 44 | # 'site': set to true to indicate contents of the _site/ 45 | # directory in the root of the plugin should be served. 46 | #site=${elasticsearch.plugin.site} 47 | 48 | 49 | ### mandatory elements for jvm plugins : 50 | # 51 | # 'jvm': true if the 'classname' class should be loaded 52 | # from jar files in the root directory of the plugin. 53 | # Note that only jar files in the root directory are 54 | # added to the classpath for the plugin! If you need 55 | # other resources, package them into a resources jar. 56 | # jvm=true 57 | # 'classname': the name of the class to load, fully-qualified. 
58 | classname=org.elasticsearch.plugin.analysis.ThulacAnalysisPlugin 59 | # 60 | # 'java.version' version of java the code is built against 61 | # use the system property java.specification.version 62 | # version string must be a sequence of nonnegative decimal integers 63 | # separated by "."'s and may have leading zeros 64 | java.version=1.8 65 | # 66 | # 'elasticsearch.version' version of elasticsearch compiled against 67 | # You will have to release a new version of the plugin for each new 68 | # elasticsearch release. This version is checked when the plugin 69 | # is loaded so Elasticsearch will refuse to start in the presence of 70 | # plugins with the incorrect elasticsearch.version. 71 | elasticsearch.version=7.9.1 72 | # 73 | ### deprecated elements for jvm plugins : 74 | # 75 | # 'isolated': true if the plugin should have its own classloader. 76 | # passing false is deprecated, and only intended to support plugins 77 | # that have hard dependencies against each other. If this is 78 | # not specified, then the plugin is isolated by default. 79 | #isolated=${elasticsearch.plugin.isolated} 80 | # -------------------------------------------------------------------------------- /src/main/resources/plugin-security.policy: -------------------------------------------------------------------------------- 1 | grant { 2 | // needed because of the hot reload functionality 3 | permission java.io.FilePermission "<<ALL FILES>>", "read"; 4 | }; -------------------------------------------------------------------------------- /src/test/java/TestThulac.java: -------------------------------------------------------------------------------- 1 | import org.elasticsearch.thulac.Configuration; 2 | import org.elasticsearch.thulac.ThulacLiteTokenizerScanner; 3 | import org.junit.Test; 4 | import org.thunlp.thulac.data.TaggedWord; 5 | 6 | import java.io.*; 7 | 8 | /** 9 | * Created by micro on 2017-12-17.
10 | */ 11 | public class TestThulac { 12 | 13 | @Test 14 | public void test2() throws IOException { 15 | ThulacLiteTokenizerScanner tokenizer = new ThulacLiteTokenizerScanner(new Configuration()); 16 | InputStreamReader isr = new InputStreamReader(getClass().getClassLoader().getResource("input").openStream()); 17 | tokenizer.reset(isr); 18 | while (tokenizer.hasNext()) { 19 | TaggedWord token = tokenizer.next(); 20 | System.out.println("word = " + token.word + ", tag= " + token.tag + ", start= " + token.startOffset + ", end=" + token.endOffset); 21 | } 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/test/resources/input: -------------------------------------------------------------------------------- 1 | 中国是世界上 人口最多的国家, 2008 年底中国大陆人口13.28亿,占全世界人口的20%,也就是说全世界平均每5个人当中就有一个是黄皮肤、黑头发的中国人。为什么中国人有这么多?中国怎么解决这个问题?请让我慢慢说起。古代的时候,中国人口并不多。清朝以前,中国人口都不超过一亿,有些朝代人口甚至只有一两千万,即使最强大的唐朝,人口也只有五千万。由于古代医学比较落后,生孩子有很大的危险,很多孩子一出生就死了;古代人的生活条件也不好,有些 家庭很穷,就算生了孩子也无法把他们养大;还有,古代人喜欢男孩儿,认为男孩儿可以劳动,而女孩儿却没有用,所以女孩儿常常被抛弃。这种重男轻女的错误思想也是造成中国古代人 少的一个原因。农业对古代中国非常重要,而人口就是劳动力。“谁生的孩子越多,谁就越光荣”,这种思想一直保持到新中国成立。为了加快发展,政府鼓励人们多生孩子,而且现代社会的医学水平比较发达,人们的生活条件也提高了,生孩子从很困难变成很容易,最后竟然难以控制。50年里,中国人口从5亿增加到11亿,翻了一番。这个速度太快了,如果不控制,后果不堪设想。于是70年代,中国政府制定法律,实行“计划生育”。“计划生育”就是有计划地生孩子,并且尽可能保证孩子的健康。具体地说,就是一个家庭只生一个孩子。如果多生了孩子,就要受到一些处罚。到现在为止,计划生育使中国人口的增长减少了至少4亿,人口的增长速度也越来越低。但是,计划生育也带来了一些问题。其中最重要的一个,就是我们这一代孩子比较孤独,因为我们都没有兄弟姐妹。这样的孩子叫做“独生子女”。中国现在的年轻人基本上都是独生子女,他们的性格和父母那一代人不一样。他们更加独立,但是也更加自我。其次,中国人口的出生率降低了,这也就意味着,几年后中国的人口达到顶峰,然后不再增长,变得越来越少。还有一个问题是人口分布不均匀,也就是有些地方人多,有些地方人少。中国东部人口多,西部人口少。中国人口最多的一个省是河南省,人口将近一亿,平均每平方千米有600多人;而中国人口最少的一个省是西藏自治区,在那里,平均每平方千米可能都找不到一个人。如果你认为这些问题还不算大的话,请继续听我说。现在中国老人多,年轻人少;男人多,女人少。老人多是因为年轻一代都是独生子女,占总人口的比例突然少了,于是老人就显得很多。在性别比例上,据说中国男女比例是118:100,也就是说,有18个男人一出生就注定了以后找不到老婆。为了让人口发展更加自然和科学,政府也对计划生育进行修改。现在,在中国大部分地区,如果父母都是独生子女的家庭允许生第二个孩子。对于“人口老龄化”,也就是老人越来越多的问题,政府的目标就是改善社会福利,提高人们的生活质量。清朝的时候,中国人的平均寿命是33岁,而现在,中国人的平均寿命已经达到73岁。对于13亿的人口来说,这的确是一个值得骄傲的成绩。 
--------------------------------------------------------------------------------