├── .gitignore ├── README.md ├── build.gradle ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat ├── models └── README.md ├── settings.gradle └── src ├── main ├── java │ └── org │ │ ├── elasticsearch │ │ ├── index │ │ │ ├── ThulacAnalyzer.java │ │ │ ├── ThulacAnalyzerProvider.java │ │ │ ├── ThulacTokenizer.java │ │ │ └── ThulacTokenizerFactory.java │ │ ├── plugin │ │ │ └── analysis │ │ │ │ └── ThulacAnalysisPlugin.java │ │ └── thulac │ │ │ ├── Configuration.java │ │ │ ├── ThulacLiteSegment.java │ │ │ ├── ThulacLiteTokenizerScanner.java │ │ │ ├── postprocess │ │ │ ├── DictionaryPassBuilder.java │ │ │ ├── DoubleWordPassBuilder.java │ │ │ ├── FilterPassBuilder.java │ │ │ ├── NegWordPassBuilder.java │ │ │ ├── SpecialPassBuilder.java │ │ │ ├── TimeWordPassBuilder.java │ │ │ └── VerbPassBuilder.java │ │ │ └── preprocess │ │ │ ├── ConvertT2SPassBuilder.java │ │ │ └── PreProcessPassBuilder.java │ │ └── thunlp │ │ └── thulac │ │ ├── Thulac.java │ │ ├── cb │ │ ├── AlphaBeta.java │ │ ├── CBModel.java │ │ ├── CBNGramFeature.java │ │ ├── CBTaggingDecoder.java │ │ └── Node.java │ │ ├── data │ │ ├── Dat.java │ │ ├── DatMaker.java │ │ ├── POCGraph.java │ │ └── TaggedWord.java │ │ ├── io │ │ ├── IInputProvider.java │ │ ├── IOutputHandler.java │ │ ├── IProgramStateListener.java │ │ ├── ReaderInputProvider.java │ │ ├── StringInputProvider.java │ │ ├── StringOutputHandler.java │ │ └── WriterOutputHandler.java │ │ ├── main │ │ └── Main.java │ │ ├── postprocess │ │ ├── DictionaryPass.java │ │ ├── DoubleWordPass.java │ │ ├── FilterPass.java │ │ ├── IPostprocessPass.java │ │ ├── NegWordPass.java │ │ ├── SpecialPass.java │ │ ├── TimeWordPass.java │ │ └── VerbPass.java │ │ ├── preprocess │ │ ├── ConvertT2SPass.java │ │ ├── IPreprocessPass.java │ │ └── PreProcessPass.java │ │ └── util │ │ ├── BufferUtils.java │ │ ├── CodePointUtils.java │ │ ├── IOUtils.java │ │ └── StringUtils.java └── resources │ ├── plugin-descriptor.properties │ └── 
plugin-security.policy └── test ├── java └── TestThulac.java └── resources └── input /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### macOS template 3 | # General 4 | .DS_Store 5 | .AppleDouble 6 | .LSOverride 7 | 8 | # Icon must end with two \r 9 | Icon 10 | 11 | # Thumbnails 12 | ._* 13 | 14 | # Files that might appear in the root of a volume 15 | .DocumentRevisions-V100 16 | .fseventsd 17 | .Spotlight-V100 18 | .TemporaryItems 19 | .Trashes 20 | .VolumeIcon.icns 21 | .com.apple.timemachine.donotpresent 22 | 23 | # Directories potentially created on remote AFP share 24 | .AppleDB 25 | .AppleDesktop 26 | Network Trash Folder 27 | Temporary Items 28 | .apdisk 29 | ### Eclipse template 30 | 31 | .metadata 32 | bin/ 33 | tmp/ 34 | *.tmp 35 | *.bak 36 | *.swp 37 | *~.nib 38 | local.properties 39 | .settings/ 40 | .loadpath 41 | .recommenders 42 | 43 | # External tool builders 44 | .externalToolBuilders/ 45 | 46 | # Locally stored "Eclipse launch configurations" 47 | *.launch 48 | 49 | # PyDev specific (Python IDE for Eclipse) 50 | *.pydevproject 51 | 52 | # CDT-specific (C/C++ Development Tooling) 53 | .cproject 54 | 55 | # Java annotation processor (APT) 56 | .factorypath 57 | 58 | # PDT-specific (PHP Development Tools) 59 | .buildpath 60 | 61 | # sbteclipse plugin 62 | .target 63 | 64 | # Tern plugin 65 | .tern-project 66 | 67 | # TeXlipse plugin 68 | .texlipse 69 | 70 | # STS (Spring Tool Suite) 71 | .springBeans 72 | 73 | # Code Recommenders 74 | .recommenders/ 75 | 76 | # Scala IDE specific (Scala & Java development for Eclipse) 77 | .cache-main 78 | .scala_dependencies 79 | .worksheet 80 | ### Java template 81 | # Compiled class file 82 | *.class 83 | 84 | # Log file 85 | *.log 86 | 87 | # BlueJ files 88 | *.ctxt 89 | 90 | # Mobile Tools for Java (J2ME) 91 | .mtj.tmp/ 92 | 93 | # Package Files # 94 | *.jar 95 | *.war 96 | *.ear 97 | *.zip 98 | *.tar.gz 99 | 
*.rar 100 | 101 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 102 | hs_err_pid* 103 | ### Gradle template 104 | .gradle 105 | /build/ 106 | 107 | # Ignore Gradle GUI config 108 | gradle-app.setting 109 | 110 | # Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored) 111 | !gradle-wrapper.jar 112 | 113 | # Cache of project 114 | .gradletasknamecache 115 | 116 | # # Work around https://youtrack.jetbrains.com/issue/IDEA-116898 117 | # gradle/wrapper/gradle-wrapper.properties 118 | ### JetBrains template 119 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 120 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 121 | 122 | .idea 123 | 124 | # User-specific stuff: 125 | .idea/**/workspace.xml 126 | .idea/**/tasks.xml 127 | .idea/dictionaries 128 | 129 | # Sensitive or high-churn files: 130 | .idea/**/dataSources/ 131 | .idea/**/dataSources.ids 132 | .idea/**/dataSources.xml 133 | .idea/**/dataSources.local.xml 134 | .idea/**/sqlDataSources.xml 135 | .idea/**/dynamic.xml 136 | .idea/**/uiDesigner.xml 137 | 138 | # Gradle: 139 | .idea/**/gradle.xml 140 | .idea/**/libraries 141 | 142 | # CMake 143 | cmake-build-debug/ 144 | 145 | # Mongo Explorer plugin: 146 | .idea/**/mongoSettings.xml 147 | 148 | ## File-based project format: 149 | *.iws 150 | 151 | ## Plugin-specific files: 152 | 153 | # IntelliJ 154 | out/ 155 | 156 | # mpeltonen/sbt-idea plugin 157 | .idea_modules/ 158 | 159 | # JIRA plugin 160 | atlassian-ide-plugin.xml 161 | 162 | # Cursive Clojure plugin 163 | .idea/replstate.xml 164 | 165 | # Crashlytics plugin (for Android Studio and IntelliJ) 166 | com_crashlytics_export_strings.xml 167 | crashlytics.properties 168 | crashlytics-build.properties 169 | fabric.properties 170 | 171 | .classpath 172 | .project 173 | models/*.bin 174 | models/*.txt 175 | models/model* 176 | models/*.dat 177 | *.iml 178 | .idea 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # THULAC Analysis for Elasticsearch 2 | 采用[THULAC](https://github.com/thunlp/THULAC-Java)实现的[Elasticsearch](https://www.elastic.co)中文分词插件。 3 | 4 | 版本 5 | -------- 6 | 7 | Plugin 版本 | ES 版本 | THULAC 版本 | Link 8 | -----------|-----------|----------|------------ 9 | master | 7.x -> master | lite | 10 | 7.9.1 | 7.9.1 | lite |[下载](https://github.com/microbun/elasticsearch-thulac-plugin/releases/download/7.9.1/elasticsearch-thulac-plugin-7.9.1.zip) 11 | 6.4.1-181027 | 6.4.1 | lite |[下载](https://github.com/microbun/elasticsearch-thulac-plugin/releases/download/6.4.1-181027/elasticsearch-thulac-plugin-6.4.1-181027.zip) 12 | 6.4.0-181027 | 6.4.0 | lite |[下载](https://github.com/microbun/elasticsearch-thulac-plugin/releases/download/6.4.0-181027/elasticsearch-thulac-plugin-6.4.0-181027.zip) 13 | 6.3.0-181027 | 6.3.0 | lite |[下载](https://github.com/microbun/elasticsearch-thulac-plugin/releases/download/6.3.0-181027/elasticsearch-thulac-plugin-6.3.0-181027.zip) 14 | 6.2.0-181027 | 6.2.0 | lite |[下载](https://github.com/microbun/elasticsearch-thulac-plugin/releases/download/6.2.0-181027/elasticsearch-thulac-plugin-6.2.0-181027.zip) 15 | 6.1.0-181027 | 6.1.0 | lite |[下载](https://github.com/microbun/elasticsearch-thulac-plugin/releases/download/6.1.0-181027/elasticsearch-thulac-plugin-6.1.0-181027.zip) 16 | 17 | 18 | 下载安装 19 | -------- 20 | 直接下载已经打包好的插件,解压到elasticsearch的plugins目录下即可。 21 | 22 | 编译安装 23 | -------- 24 | 1.编译打包 25 | 26 | ```bash 27 | git clone git@github.com:microbun/elasticsearch-thulac-plugin.git 28 | cd elasticsearch-thulac-plugin 29 | ./gradlew release 30 | ``` 31 | 32 | 2.安装到elasticsearch 33 | ``` 34 | cp build/distributions/elasticsearch-thulac-plugin-7.9.1.zip ${ES_HOME}/plugins 35 | cd ${ES_HOME}/plugins 36 | unzip elasticsearch-thulac-plugin-7.9.1.zip 37 | rm 
elasticsearch-thulac-plugin-7.9.1.zip 38 | ``` 39 | 解压后在plugins目录下会有一个thulac文件夹。 40 | ``` 41 | thulac 42 | |-elasticsearch-thulac-plugin-7.9.1.jar 43 | |-models #算法模型目录 44 | |-plugin-descriptor.properties 45 | |-plugin.xml 46 | ``` 47 | 48 | 3.由于THULAC的模型太大,插件中没有包含模型数据,可以在[THULAC](https://github.com/thunlp/THULAC-Java) 下载模型(lite),将模型拷贝到models中。 49 | 50 | 51 | 示例 52 | -------- 53 | #### 1.创建索引 54 | 55 | 1.1 使用默认分词方式 56 | ```bash 57 | curl -H "Content-Type:application/json" -XPUT http://localhost:9200/index -d' 58 | { 59 | "mappings": { 60 | "properties": { 61 | "text": { 62 | "type": "text", 63 | "analyzer": "thulac" 64 | } 65 | } 66 | } 67 | } 68 | ' 69 | ``` 70 | 71 | 1.2 自定义分词器 72 | ```bash 73 | curl -H "Content-Type:application/json" -XPUT http://localhost:9200/index -d' 74 | { 75 | "settings": { 76 | "analysis": { 77 | "tokenizer": { 78 | "custom_thulac_tokenizer": { 79 | "type": "thulac", 80 | "user_dict": "userdict.txt", 81 | "t2s": true, 82 | "filter": false 83 | } 84 | }, 85 | "analyzer": { 86 | "custom_thulac_analyzer": { 87 | "tokenizer": "custom_thulac_tokenizer", 88 | "filter": [ 89 | "lowercase" 90 | ] 91 | } 92 | } 93 | } 94 | }, 95 | "mappings": { 96 | "properties": { 97 | "text": { 98 | "type": "text", 99 | "analyzer": "custom_thulac_analyzer" 100 | } 101 | } 102 | } 103 | }' 104 | ``` 105 | 106 | | 参数名称 | 含义 | 值 | 107 | | --- | --- |---| 108 | | t2s | 将句子从繁体转化为简体。默认:true | false/true | 109 | | filter | 使用过滤器去除一些没有意义的词语,例如“可以”。默认:false | false/true | 110 | | user_dict | 自定义词典路径,每一个词一行,UTF8编码,相对路径和绝对路径.
相对路径:userdict.txt 会加载 ${ES_HOME}/plugins/thulac/models/userdict.txt文件
绝对路径:/home/elasticsearch/userdict.txt
默认:userdict.txt | | 111 | 112 | #### 2.查看索引 113 | ```bash 114 | curl http://localhost:9200/index 115 | ``` 116 | 117 | #### 3.测试分词效果 118 | ```bash 119 | curl -H "Content-Type:application/json" -XPOST http://localhost:9200/index/_analyze -d' 120 | { 121 | "analyzer":"thulac", 122 | "text":"我是中国人" 123 | } 124 | ' 125 | ``` 126 | 127 | #### 4.删除索引 128 | ``` 129 | curl -XDELETE http://localhost:9200/index 130 | ``` 131 | -------------------------------------------------------------------------------- /build.gradle: -------------------------------------------------------------------------------- 1 | group 'org.elasticsearch.thulac' 2 | version '7.9.1' 3 | 4 | apply plugin: 'java' 5 | 6 | sourceCompatibility = 1.8 7 | 8 | repositories { 9 | mavenCentral() 10 | } 11 | 12 | configurations { 13 | wagon 14 | distJars { 15 | extendsFrom runtime 16 | exclude group: 'org.elasticsearch' 17 | exclude group: 'lucene-core' 18 | exclude group: 'org.apache.logging.log4j' 19 | exclude group: 'lucene-analyzers-common' 20 | exclude group: 'org.apache.commons' 21 | } 22 | } 23 | 24 | sourceSets { 25 | main { 26 | java { 27 | srcDir "src/main/java" 28 | } 29 | resources { 30 | srcDir "src/main/resources" 31 | include "**/*" 32 | } 33 | } 34 | } 35 | 36 | dependencies { 37 | testCompile group: 'junit', name: 'junit', version: '4.11' 38 | compile 'org.elasticsearch:elasticsearch:7.9.1' 39 | } 40 | 41 | task release_full(type: Zip, dependsOn: [':jar']) { 42 | into('thulac') { 43 | from configurations.distJars 44 | from 'build/libs' 45 | from 'build/resources/main/plugin.xml' 46 | from 'build/resources/main/plugin-descriptor.properties' 47 | from 'build/resources/main/plugin-security.policy' 48 | } 49 | from('models') { 50 | include "**/*" 51 | into ('thulac/models') 52 | } 53 | } 54 | 55 | task release_lite(type: Zip, dependsOn: [':jar']) { 56 | into('thulac') { 57 | from configurations.distJars 58 | from 'build/libs' 59 | from 'build/resources/main/plugin.xml' 60 | from 
'build/resources/main/plugin-descriptor.properties' 61 | from 'build/resources/main/plugin-security.policy' 62 | } 63 | from('models') { 64 | include "README.md" 65 | include "userdict.txt" 66 | into ('thulac/models') 67 | } 68 | } -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microbun/elasticsearch-thulac-plugin/ddc29e6eb21fb08c80c7faa36ed85e55922d985a/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | #Sun Dec 17 15:22:11 CST 2017 2 | distributionBase=GRADLE_USER_HOME 3 | distributionPath=wrapper/dists 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | distributionUrl=https\://services.gradle.org/distributions/gradle-3.1-all.zip 7 | -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ############################################################################## 4 | ## 5 | ## Gradle start up script for UN*X 6 | ## 7 | ############################################################################## 8 | 9 | # Attempt to set APP_HOME 10 | # Resolve links: $0 may be a link 11 | PRG="$0" 12 | # Need this for relative symlinks. 
13 | while [ -h "$PRG" ] ; do 14 | ls=`ls -ld "$PRG"` 15 | link=`expr "$ls" : '.*-> \(.*\)$'` 16 | if expr "$link" : '/.*' > /dev/null; then 17 | PRG="$link" 18 | else 19 | PRG=`dirname "$PRG"`"/$link" 20 | fi 21 | done 22 | SAVED="`pwd`" 23 | cd "`dirname \"$PRG\"`/" >/dev/null 24 | APP_HOME="`pwd -P`" 25 | cd "$SAVED" >/dev/null 26 | 27 | APP_NAME="Gradle" 28 | APP_BASE_NAME=`basename "$0"` 29 | 30 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 31 | DEFAULT_JVM_OPTS="" 32 | 33 | # Use the maximum available, or set MAX_FD != -1 to use that value. 34 | MAX_FD="maximum" 35 | 36 | warn ( ) { 37 | echo "$*" 38 | } 39 | 40 | die ( ) { 41 | echo 42 | echo "$*" 43 | echo 44 | exit 1 45 | } 46 | 47 | # OS specific support (must be 'true' or 'false'). 48 | cygwin=false 49 | msys=false 50 | darwin=false 51 | nonstop=false 52 | case "`uname`" in 53 | CYGWIN* ) 54 | cygwin=true 55 | ;; 56 | Darwin* ) 57 | darwin=true 58 | ;; 59 | MINGW* ) 60 | msys=true 61 | ;; 62 | NONSTOP* ) 63 | nonstop=true 64 | ;; 65 | esac 66 | 67 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 68 | 69 | # Determine the Java command to use to start the JVM. 70 | if [ -n "$JAVA_HOME" ] ; then 71 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 72 | # IBM's JDK on AIX uses strange locations for the executables 73 | JAVACMD="$JAVA_HOME/jre/sh/java" 74 | else 75 | JAVACMD="$JAVA_HOME/bin/java" 76 | fi 77 | if [ ! -x "$JAVACMD" ] ; then 78 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 79 | 80 | Please set the JAVA_HOME variable in your environment to match the 81 | location of your Java installation." 82 | fi 83 | else 84 | JAVACMD="java" 85 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 86 | 87 | Please set the JAVA_HOME variable in your environment to match the 88 | location of your Java installation." 
89 | fi 90 | 91 | # Increase the maximum file descriptors if we can. 92 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then 93 | MAX_FD_LIMIT=`ulimit -H -n` 94 | if [ $? -eq 0 ] ; then 95 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then 96 | MAX_FD="$MAX_FD_LIMIT" 97 | fi 98 | ulimit -n $MAX_FD 99 | if [ $? -ne 0 ] ; then 100 | warn "Could not set maximum file descriptor limit: $MAX_FD" 101 | fi 102 | else 103 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" 104 | fi 105 | fi 106 | 107 | # For Darwin, add options to specify how the application appears in the dock 108 | if $darwin; then 109 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" 110 | fi 111 | 112 | # For Cygwin, switch paths to Windows format before running java 113 | if $cygwin ; then 114 | APP_HOME=`cygpath --path --mixed "$APP_HOME"` 115 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` 116 | JAVACMD=`cygpath --unix "$JAVACMD"` 117 | 118 | # We build the pattern for arguments to be converted via cygpath 119 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` 120 | SEP="" 121 | for dir in $ROOTDIRSRAW ; do 122 | ROOTDIRS="$ROOTDIRS$SEP$dir" 123 | SEP="|" 124 | done 125 | OURCYGPATTERN="(^($ROOTDIRS))" 126 | # Add a user-defined pattern to the cygpath arguments 127 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then 128 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" 129 | fi 130 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 131 | i=0 132 | for arg in "$@" ; do 133 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` 134 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option 135 | 136 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition 137 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` 138 | else 139 | eval `echo args$i`="\"$arg\"" 140 | fi 141 | i=$((i+1)) 142 | done 143 | case $i in 144 | (0) set -- ;; 145 | (1) 
set -- "$args0" ;; 146 | (2) set -- "$args0" "$args1" ;; 147 | (3) set -- "$args0" "$args1" "$args2" ;; 148 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;; 149 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; 150 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; 151 | (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; 152 | (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; 153 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 154 | esac 155 | fi 156 | 157 | # Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules 158 | function splitJvmOpts() { 159 | JVM_OPTS=("$@") 160 | } 161 | eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS 162 | JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME" 163 | 164 | # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong 165 | if [[ "$(uname)" == "Darwin" ]] && [[ "$HOME" == "$PWD" ]]; then 166 | cd "$(dirname "$0")" 167 | fi 168 | 169 | exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@" 170 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @if "%DEBUG%" == "" @echo off 2 | @rem ########################################################################## 3 | @rem 4 | @rem Gradle startup script for Windows 5 | @rem 6 | @rem ########################################################################## 7 | 8 | @rem Set local scope for the variables with windows NT shell 9 | if "%OS%"=="Windows_NT" setlocal 10 | 11 | set DIRNAME=%~dp0 12 | if "%DIRNAME%" == "" set DIRNAME=. 13 | set APP_BASE_NAME=%~n0 14 | set APP_HOME=%DIRNAME% 15 | 16 | @rem Add default JVM options here. 
You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 17 | set DEFAULT_JVM_OPTS= 18 | 19 | @rem Find java.exe 20 | if defined JAVA_HOME goto findJavaFromJavaHome 21 | 22 | set JAVA_EXE=java.exe 23 | %JAVA_EXE% -version >NUL 2>&1 24 | if "%ERRORLEVEL%" == "0" goto init 25 | 26 | echo. 27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 28 | echo. 29 | echo Please set the JAVA_HOME variable in your environment to match the 30 | echo location of your Java installation. 31 | 32 | goto fail 33 | 34 | :findJavaFromJavaHome 35 | set JAVA_HOME=%JAVA_HOME:"=% 36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 37 | 38 | if exist "%JAVA_EXE%" goto init 39 | 40 | echo. 41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 42 | echo. 43 | echo Please set the JAVA_HOME variable in your environment to match the 44 | echo location of your Java installation. 45 | 46 | goto fail 47 | 48 | :init 49 | @rem Get command-line arguments, handling Windows variants 50 | 51 | if not "%OS%" == "Windows_NT" goto win9xME_args 52 | 53 | :win9xME_args 54 | @rem Slurp the command line arguments. 55 | set CMD_LINE_ARGS= 56 | set _SKIP=2 57 | 58 | :win9xME_args_slurp 59 | if "x%~1" == "x" goto execute 60 | 61 | set CMD_LINE_ARGS=%* 62 | 63 | :execute 64 | @rem Setup the command line 65 | 66 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 67 | 68 | @rem Execute Gradle 69 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 70 | 71 | :end 72 | @rem End local scope for the variables with windows NT shell 73 | if "%ERRORLEVEL%"=="0" goto mainEnd 74 | 75 | :fail 76 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 77 | rem the _cmd.exe /c_ return code! 
78 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 79 | exit /b 1 80 | 81 | :mainEnd 82 | if "%OS%"=="Windows_NT" endlocal 83 | 84 | :omega 85 | -------------------------------------------------------------------------------- /models/README.md: -------------------------------------------------------------------------------- 1 | #算法模型 放倒当前目录下 2 | 模型列表 3 | cws_dat.bin 4 | cws_label.txt 5 | cws_model.bin 6 | idiom.dat 7 | model_c_dat.bin 8 | model_c_label.txt 9 | model_c_model.bin 10 | model_w 11 | neg.dat 12 | ns.dat 13 | singlepun.dat 14 | t2s.dat 15 | time.dat 16 | vD.dat 17 | vM.dat 18 | xu.dat -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'elasticsearch-thulac-plugin' 2 | 3 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/ThulacAnalyzer.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index; 2 | 3 | import org.apache.lucene.analysis.Analyzer; 4 | import org.elasticsearch.thulac.Configuration; 5 | 6 | /** 7 | * Created by micro on 2017-12-17. 
8 | */ 9 | public class ThulacAnalyzer extends Analyzer { 10 | 11 | private Configuration configuration; 12 | 13 | public ThulacAnalyzer(Configuration configuration) { 14 | this.configuration = configuration; 15 | } 16 | 17 | @Override 18 | protected TokenStreamComponents createComponents(String fieldName) { 19 | return new TokenStreamComponents(new ThulacTokenizer(configuration)); 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/ThulacAnalyzerProvider.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index; 2 | 3 | import org.elasticsearch.common.settings.Settings; 4 | import org.elasticsearch.env.Environment; 5 | import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider; 6 | import org.elasticsearch.thulac.Configuration; 7 | 8 | /** 9 | * Created by micro on 2017-12-17. 10 | */ 11 | public class ThulacAnalyzerProvider extends AbstractIndexAnalyzerProvider { 12 | 13 | private ThulacAnalyzer thulacAnalyzer; 14 | 15 | /** 16 | * Constructs a new analyzer component, with the index name and its settings and the analyzer name. 
17 | * 18 | * @param indexSettings the settings and the name of the index 19 | * @param name The analyzer name 20 | * @param settings 21 | */ 22 | public ThulacAnalyzerProvider(IndexSettings indexSettings, Environment environment, String name, Settings settings) { 23 | super(indexSettings, name, settings); 24 | Configuration configuration = new Configuration(environment,indexSettings, settings); 25 | thulacAnalyzer = new ThulacAnalyzer(configuration); 26 | } 27 | 28 | @Override 29 | public ThulacAnalyzer get() { 30 | return thulacAnalyzer; 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/ThulacTokenizer.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index; 2 | 3 | import org.apache.lucene.analysis.Tokenizer; 4 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 5 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 6 | import org.elasticsearch.thulac.Configuration; 7 | import org.elasticsearch.thulac.ThulacLiteTokenizerScanner; 8 | import org.thunlp.thulac.data.TaggedWord; 9 | 10 | import java.io.IOException; 11 | 12 | /** 13 | * Created by micro on 2017-12-17. 
14 | */ 15 | public class ThulacTokenizer extends Tokenizer { 16 | 17 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); 18 | private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); 19 | // private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); 20 | private ThulacLiteTokenizerScanner scanner; 21 | private int endPosition; 22 | 23 | 24 | public ThulacTokenizer(Configuration configuration) { 25 | try { 26 | scanner = new ThulacLiteTokenizerScanner(configuration); 27 | } catch (IOException e) { 28 | throw new IllegalArgumentException("thulac configuration error", e); 29 | } 30 | } 31 | 32 | @Override 33 | public boolean incrementToken() { 34 | clearAttributes(); 35 | if (scanner.hasNext()) { 36 | TaggedWord token = scanner.next(); 37 | termAtt.append(token.word); 38 | termAtt.setLength(token.word.length()); 39 | offsetAtt.setOffset(token.startOffset, token.endOffset); 40 | endPosition = token.endOffset; 41 | return true; 42 | } 43 | return false; 44 | } 45 | 46 | 47 | @Override 48 | public final void end() throws IOException { 49 | super.end(); 50 | int finalOffset = correctOffset(this.endPosition); 51 | offsetAtt.setOffset(finalOffset, finalOffset); 52 | } 53 | 54 | // @Override 55 | // public void close() throws IOException { 56 | // super.close(); 57 | // scanner.reset(input); 58 | // } 59 | 60 | @Override 61 | public void reset() throws IOException { 62 | super.reset(); 63 | scanner.reset(input); 64 | } 65 | 66 | 67 | } 68 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/ThulacTokenizerFactory.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index; 2 | 3 | import org.apache.lucene.analysis.Tokenizer; 4 | import org.elasticsearch.common.settings.Settings; 5 | import org.elasticsearch.env.Environment; 6 | import 
org.elasticsearch.index.analysis.AbstractTokenizerFactory; 7 | import org.elasticsearch.thulac.Configuration; 8 | 9 | /** 10 | * Created by micro on 2017-12-17. 11 | */ 12 | public class ThulacTokenizerFactory extends AbstractTokenizerFactory { 13 | 14 | private Configuration configuration; 15 | 16 | public ThulacTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { 17 | super(indexSettings, settings, name); 18 | configuration = new Configuration(environment, indexSettings, settings); 19 | } 20 | 21 | @Override 22 | public Tokenizer create() { 23 | return new ThulacTokenizer(configuration); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/plugin/analysis/ThulacAnalysisPlugin.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.plugin.analysis; 2 | 3 | import org.apache.lucene.analysis.Analyzer; 4 | import org.elasticsearch.index.ThulacAnalyzerProvider; 5 | import org.elasticsearch.index.ThulacTokenizerFactory; 6 | import org.elasticsearch.index.analysis.AnalyzerProvider; 7 | import org.elasticsearch.index.analysis.TokenizerFactory; 8 | import org.elasticsearch.indices.analysis.AnalysisModule; 9 | import org.elasticsearch.plugins.AnalysisPlugin; 10 | import org.elasticsearch.plugins.Plugin; 11 | 12 | import java.util.HashMap; 13 | import java.util.Map; 14 | 15 | /** 16 | * @author Microbun on 2017/12/17. 
17 | */ 18 | public class ThulacAnalysisPlugin extends Plugin implements AnalysisPlugin { 19 | 20 | public Map> getTokenizers() { 21 | Map> extra = new HashMap<>(); 22 | extra.put("thulac", ThulacTokenizerFactory::new); 23 | return extra; 24 | } 25 | 26 | public Map>> getAnalyzers() { 27 | Map>> extra = new HashMap<>(); 28 | extra.put("thulac", ThulacAnalyzerProvider::new); 29 | return extra; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/thulac/Configuration.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.thulac; 2 | 3 | import org.apache.logging.log4j.Logger; 4 | import org.elasticsearch.common.logging.Loggers; 5 | import org.elasticsearch.common.settings.Settings; 6 | import org.elasticsearch.env.Environment; 7 | import org.elasticsearch.index.IndexSettings; 8 | 9 | import java.nio.file.FileSystems; 10 | import java.nio.file.Path; 11 | 12 | /** 13 | * Created by micro on 2017-12-17. 
14 | */ 15 | public class Configuration { 16 | 17 | String userDict = "userdict.txt"; 18 | boolean t2s = false; 19 | boolean segOnly = true; 20 | boolean filter = false; 21 | Path modelPath = FileSystems.getDefault().getPath("models/"); 22 | private Environment environment; 23 | private IndexSettings indexSettings; 24 | private Settings settings; 25 | private Logger logger = Loggers.getLogger(getClass(),"thulac"); 26 | 27 | public Configuration() { 28 | } 29 | 30 | public Configuration(Environment environment, IndexSettings indexSettings, Settings settings) { 31 | this.environment = environment; 32 | this.indexSettings = indexSettings; 33 | this.settings = settings; 34 | userDict = settings.get("user_dict", "userdict.txt"); 35 | t2s = settings.getAsBoolean("t2s", true); 36 | // segOnly = settings.getAsBoolean("seg_only", true); 37 | filter = settings.getAsBoolean("filter", false); 38 | modelPath = environment.pluginsFile().resolve("thulac/models"); 39 | // logger.info("thulac settings: path={}", modelPath.toAbsolutePath().toString()); 40 | // logger.info("thulac settings: user_dict={} use_t2s={} seg_only={} use_filter={} ", userDict, t2s, segOnly, useFilter); 41 | } 42 | 43 | public Environment getEnvironment() { 44 | return environment; 45 | } 46 | 47 | public IndexSettings getIndexSettings() { 48 | return indexSettings; 49 | } 50 | 51 | public Settings getSettings() { 52 | return settings; 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/thulac/ThulacLiteSegment.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.thulac; 2 | 3 | import org.apache.logging.log4j.Logger; 4 | import org.elasticsearch.common.logging.Loggers; 5 | import org.elasticsearch.thulac.postprocess.*; 6 | import org.elasticsearch.thulac.preprocess.ConvertT2SPassBuilder; 7 | import org.elasticsearch.thulac.preprocess.PreProcessPassBuilder; 8 | 
import org.thunlp.thulac.cb.CBTaggingDecoder; 9 | import org.thunlp.thulac.data.POCGraph; 10 | import org.thunlp.thulac.data.TaggedWord; 11 | import org.thunlp.thulac.postprocess.IPostprocessPass; 12 | import org.thunlp.thulac.preprocess.IPreprocessPass; 13 | 14 | import java.io.IOException; 15 | import java.nio.file.Files; 16 | import java.nio.file.Path; 17 | import java.nio.file.Paths; 18 | import java.util.ArrayList; 19 | import java.util.HashMap; 20 | import java.util.List; 21 | import java.util.Map; 22 | import java.util.concurrent.ConcurrentHashMap; 23 | 24 | public class ThulacLiteSegment { 25 | 26 | private static final Map decoder = new HashMap<>(); 27 | 28 | private static final Map cache = new ConcurrentHashMap<>(); 29 | ; 30 | private CBTaggingDecoder taggingDecoder; 31 | // preprocess passes 32 | private List pre = new ArrayList<>(); 33 | // postprocess passes 34 | private List post = new ArrayList<>(); 35 | 36 | private ThulacLiteSegment(Configuration configuration) throws IOException { 37 | synchronized (decoder) { 38 | init(configuration); 39 | } 40 | } 41 | 42 | public static ThulacLiteSegment getInstance(Configuration configuration) throws IOException { 43 | ThulacLiteSegment segment; 44 | if (cache.containsKey(configuration)) { 45 | segment = cache.get(configuration); 46 | } else { 47 | segment = new ThulacLiteSegment(configuration); 48 | cache.put(configuration, segment); 49 | } 50 | return segment; 51 | } 52 | 53 | private void init(Configuration configuration) throws IOException { 54 | // segmentation 55 | // load model 56 | String prefix = configuration.segOnly ? "cws_" : "model_c_"; 57 | if (!decoder.containsKey(prefix)) { 58 | CBTaggingDecoder temp = new CBTaggingDecoder(); 59 | temp.threshold = configuration.segOnly ? 
0 : 10000; 60 | temp.loadFiles( 61 | join(configuration.modelPath, prefix + "model.bin"), 62 | join(configuration.modelPath, prefix + "dat.bin"), 63 | join(configuration.modelPath, prefix + "label.txt")); 64 | temp.setLabelTrans(); 65 | decoder.put(prefix, temp); 66 | } 67 | taggingDecoder = decoder.get(prefix); 68 | 69 | //pre pass 70 | pre.add(PreProcessPassBuilder.getInstance()); 71 | if (configuration.t2s) { 72 | pre.add(ConvertT2SPassBuilder.getInstance(join(configuration.modelPath, "t2s.dat"))); 73 | } 74 | 75 | //post pass 76 | post.add(DictionaryPassBuilder.getInstance(join(configuration.modelPath, "ns.dat"), "ns", false)); 77 | post.add(DictionaryPassBuilder.getInstance(join(configuration.modelPath, "idiom.dat"), "i", false)); 78 | post.add(DictionaryPassBuilder.getInstance(join(configuration.modelPath, "singlepun.dat"), "w", false)); 79 | post.add(TimeWordPassBuilder.getInstance()); 80 | post.add(DoubleWordPassBuilder.getInstance()); 81 | post.add(SpecialPassBuilder.getInstance()); 82 | post.add(NegWordPassBuilder.getInstance(join(configuration.modelPath, "neg.dat"))); 83 | if (configuration.userDict != null) { 84 | String path = configuration.userDict; 85 | if (!Paths.get(path).isAbsolute()) { 86 | path = join(configuration.modelPath, configuration.userDict); 87 | } 88 | if (Files.exists(Paths.get(path))) { 89 | post.add(DictionaryPassBuilder.getInstance(path, "uw", true)); 90 | } else { 91 | if (!configuration.userDict.equals("userdict.txt")) { 92 | throw new IllegalArgumentException("not exists user_dict[" + path + "]"); 93 | } 94 | } 95 | } 96 | } 97 | 98 | 99 | private String join(Path path, String... 
more) { 100 | return Paths.get(path.toAbsolutePath().toString(), more).toAbsolutePath().toString(); 101 | } 102 | 103 | public List segment(String raw) { 104 | List words = new ArrayList<>(); 105 | POCGraph graph = new POCGraph(); 106 | for (IPreprocessPass pass : pre) { 107 | raw = pass.process(raw, graph); 108 | } 109 | taggingDecoder.segment(raw, graph, words); 110 | for (IPostprocessPass pass : post) { 111 | pass.process(words); 112 | } 113 | return words; 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/thulac/ThulacLiteTokenizerScanner.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.thulac; 2 | 3 | 4 | import org.thunlp.thulac.data.TaggedWord; 5 | 6 | import java.io.IOException; 7 | import java.io.Reader; 8 | import java.util.Iterator; 9 | import java.util.List; 10 | 11 | /** 12 | * Created by micro on 2017-12-17. 13 | */ 14 | public class ThulacLiteTokenizerScanner implements Iterator { 15 | 16 | 17 | // private Logger logger; 18 | private ThulacLiteSegment segment; 19 | private Iterator tokens; 20 | 21 | public ThulacLiteTokenizerScanner(Configuration configuration) throws IOException { 22 | // logger = Loggers.getLogger(getClass(), configuration.getSettings()); 23 | segment = ThulacLiteSegment.getInstance(configuration); 24 | } 25 | 26 | @Override 27 | public boolean hasNext() { 28 | return tokens.hasNext(); 29 | } 30 | 31 | @Override 32 | public TaggedWord next() { 33 | return tokens.next(); 34 | } 35 | 36 | @Override 37 | public void remove() { 38 | tokens.remove(); 39 | } 40 | 41 | public void reset(Reader reader) { 42 | String raw; 43 | try { 44 | StringBuilder bdr = new StringBuilder(); 45 | int size = 1024; 46 | char[] buf = new char[size]; 47 | while ((size = reader.read(buf, 0, size)) != -1) { 48 | bdr.append(new String(buf, 0, size)); 49 | } 50 | raw = bdr.toString(); 51 | List words = 
segment.segment(raw);
            tokens = words.iterator();
        } catch (IOException e) {
            // Fail fast instead of printStackTrace(): swallowing the error left
            // 'tokens' null or stale and surfaced later as an NPE in hasNext().
            throw new java.io.UncheckedIOException(e);
        }
    }
}
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/thulac/postprocess/DictionaryPassBuilder.java:
--------------------------------------------------------------------------------
package org.elasticsearch.thulac.postprocess;

import org.thunlp.thulac.postprocess.DictionaryPass;

import java.io.IOException;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

/** Caches {@link DictionaryPass} instances keyed by (file, tag, isTxt). */
public class DictionaryPassBuilder {

    private static Map<String, DictionaryPass> cache = new ConcurrentHashMap<>();

    private DictionaryPassBuilder() {
    }

    public static DictionaryPass getInstance(String dictFile, String tag, boolean isTxt) throws IOException {
        String key = dictFile + "#" + tag + "#" + isTxt;
        DictionaryPass existing = cache.get(key);
        if (existing == null) {
            // putIfAbsent keeps exactly one cached instance if two threads race here
            DictionaryPass created = new DictionaryPass(dictFile, tag, isTxt);
            DictionaryPass raced = cache.putIfAbsent(key, created);
            existing = raced != null ? raced : created;
        }
        return existing;
    }
}
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/thulac/postprocess/DoubleWordPassBuilder.java:
--------------------------------------------------------------------------------
package org.elasticsearch.thulac.postprocess;

import org.thunlp.thulac.postprocess.DoubleWordPass;

/** Shares a single stateless {@link DoubleWordPass} instance. */
public class DoubleWordPassBuilder {

    private static DoubleWordPass instance = new DoubleWordPass();

    private DoubleWordPassBuilder() {
    }

    public static DoubleWordPass getInstance() {
        return instance;
    }
}
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/thulac/postprocess/FilterPassBuilder.java:
--------------------------------------------------------------------------------
package org.elasticsearch.thulac.postprocess;

import org.thunlp.thulac.postprocess.FilterPass;

import java.io.IOException;
import
java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

/** Caches {@link FilterPass} instances keyed by the pair of data files. */
public class FilterPassBuilder {

    private static Map<String, FilterPass> cache = new ConcurrentHashMap<>();

    private FilterPassBuilder() {
    }

    public static FilterPass getInstance(String xuDatFile, String timeDatFile) throws IOException {
        String key = xuDatFile + "#" + timeDatFile;
        FilterPass existing = cache.get(key);
        if (existing == null) {
            // putIfAbsent keeps exactly one cached instance if two threads race here
            FilterPass created = new FilterPass(xuDatFile, timeDatFile);
            FilterPass raced = cache.putIfAbsent(key, created);
            existing = raced != null ? raced : created;
        }
        return existing;
    }
}
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/thulac/postprocess/NegWordPassBuilder.java:
--------------------------------------------------------------------------------
package org.elasticsearch.thulac.postprocess;

import org.thunlp.thulac.postprocess.NegWordPass;

import java.io.IOException;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

/** Caches {@link NegWordPass} instances keyed by data file. */
public class NegWordPassBuilder {

    private static Map<String, NegWordPass> cache = new ConcurrentHashMap<>();

    private NegWordPassBuilder() {
    }

    public static NegWordPass getInstance(String negDatFile) throws IOException {
        NegWordPass existing = cache.get(negDatFile);
        if (existing == null) {
            // putIfAbsent keeps exactly one cached instance if two threads race here
            NegWordPass created = new NegWordPass(negDatFile);
            NegWordPass raced = cache.putIfAbsent(negDatFile, created);
            existing = raced != null ? raced : created;
        }
        return existing;
    }
}
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/thulac/postprocess/SpecialPassBuilder.java:
--------------------------------------------------------------------------------
package org.elasticsearch.thulac.postprocess;

import org.thunlp.thulac.postprocess.SpecialPass;

/** Shares a single stateless {@link SpecialPass} instance. */
public class SpecialPassBuilder {

    private static SpecialPass instance = new SpecialPass();

    private SpecialPassBuilder() {
    }

    public static SpecialPass getInstance() {
        return instance;
    }
}
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/thulac/postprocess/TimeWordPassBuilder.java:
-------------------------------------------------------------------------------- 1 | package org.elasticsearch.thulac.postprocess; 2 | 3 | import org.thunlp.thulac.postprocess.TimeWordPass; 4 | 5 | public class TimeWordPassBuilder { 6 | private static TimeWordPass instance = new TimeWordPass(); 7 | 8 | public static TimeWordPass getInstance() { 9 | return instance; 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/thulac/postprocess/VerbPassBuilder.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.thulac.postprocess; 2 | 3 | import org.thunlp.thulac.postprocess.VerbPass; 4 | 5 | import java.io.IOException; 6 | import java.util.Map; 7 | import java.util.concurrent.ConcurrentHashMap; 8 | 9 | public class VerbPassBuilder { 10 | private static Map cache = new ConcurrentHashMap<>(); 11 | 12 | public static VerbPass getInstance(String vMFile, String vDFile) throws IOException { 13 | String key = vMFile + "#" + vDFile; 14 | if (!cache.containsKey(key)) { 15 | cache.put(key, new VerbPass(vMFile, vDFile)); 16 | } 17 | return cache.get(key); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/thulac/preprocess/ConvertT2SPassBuilder.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.thulac.preprocess; 2 | 3 | import org.thunlp.thulac.preprocess.ConvertT2SPass; 4 | 5 | import java.io.IOException; 6 | import java.util.Map; 7 | import java.util.concurrent.ConcurrentHashMap; 8 | 9 | public class ConvertT2SPassBuilder { 10 | 11 | private static Map cache = new ConcurrentHashMap<>(); 12 | 13 | public static ConvertT2SPass getInstance(String file) throws IOException { 14 | String key = file; 15 | if (!cache.containsKey(key)) { 16 | cache.put(key, new ConvertT2SPass(file)); 17 | } 18 | return 
cache.get(key); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/thulac/preprocess/PreProcessPassBuilder.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.thulac.preprocess; 2 | 3 | import org.thunlp.thulac.preprocess.PreProcessPass; 4 | 5 | public class PreProcessPassBuilder { 6 | 7 | private static PreProcessPass instance = new PreProcessPass(); 8 | 9 | public static PreProcessPass getInstance() { 10 | return instance; 11 | } 12 | 13 | 14 | } 15 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/Thulac.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac; 2 | 3 | import org.thunlp.thulac.cb.CBTaggingDecoder; 4 | import org.thunlp.thulac.data.POCGraph; 5 | import org.thunlp.thulac.data.TaggedWord; 6 | import org.thunlp.thulac.io.IInputProvider; 7 | import org.thunlp.thulac.io.IOutputHandler; 8 | import org.thunlp.thulac.io.StringOutputHandler; 9 | import org.thunlp.thulac.postprocess.DictionaryPass; 10 | import org.thunlp.thulac.postprocess.DoubleWordPass; 11 | import org.thunlp.thulac.postprocess.FilterPass; 12 | import org.thunlp.thulac.postprocess.IPostprocessPass; 13 | import org.thunlp.thulac.postprocess.NegWordPass; 14 | import org.thunlp.thulac.postprocess.SpecialPass; 15 | import org.thunlp.thulac.postprocess.TimeWordPass; 16 | import org.thunlp.thulac.preprocess.ConvertT2SPass; 17 | import org.thunlp.thulac.preprocess.IPreprocessPass; 18 | import org.thunlp.thulac.preprocess.PreProcessPass; 19 | import org.thunlp.thulac.util.IOUtils; 20 | 21 | import java.io.File; 22 | import java.io.FileNotFoundException; 23 | import java.io.IOException; 24 | import java.util.ArrayList; 25 | import java.util.List; 26 | import java.util.Vector; 27 | 28 | /** 29 | * The central class which acts as 
core of the THULAC API. It provides several
 * convenient methods that make things easier for users.
 */
public class Thulac {
	/**
	 * Run the segmentation program with argument {@code segOnly}, taking input from the
	 * given {@link String} and return the segmented output as a {@link String}.
	 *
	 * @param input
	 * 		The input {@link String}.
	 * @param segOnly
	 * 		Whether to output only segments.
	 *
	 * @return The segmented output as a {@link String}.
	 *
	 * @throws java.io.IOException
	 * 		If one of the model files fails to load.
	 */
	public static String split(String input, boolean segOnly) throws IOException {
		StringOutputHandler outputProvider = IOUtils.outputToString();
		IInputProvider inputProvider = IOUtils.inputFromString(input);
		split(inputProvider, outputProvider, segOnly);
		return outputProvider.getString();
	}

	/**
	 * Run the segmentation program with argument {@code segOnly}, taking input from the
	 * file named {@code inputFile} and writing the segmented output to the file named
	 * {@code outputFile}. This method returns directly if either {@code inputFile} or
	 * {@code outputFile} is null.
	 *
	 * @param inputFile
	 * 		The name of the input file.
	 * @param outputFile
	 * 		The name of the output file.
	 * @param segOnly
	 * 		Whether to output only segments.
	 *
	 * @throws java.io.IOException
	 * 		If one of the model files fails to load or either the input file or the output
	 * 		file is {@code null}.
	 */
	public static void split(String inputFile, String outputFile, boolean segOnly)
			throws IOException {
		if (inputFile == null || outputFile == null) return;
		IInputProvider input = IOUtils.inputFromFile(inputFile);
		IOutputHandler output = IOUtils.outputToFile(outputFile);
		split(input, output, segOnly);
	}

	/**
	 * Run the segmentation program with argument {@code segOnly}, taking input from the
	 * given {@link java.io.File} and writing the segmented output to the given
	 * {@link java.io.File}.
	 *
	 * @param input
	 * 		The input {@link java.io.File}.
	 * @param output
	 * 		The output {@link java.io.File}.
	 * @param segOnly
	 * 		Whether to output only segments.
	 *
	 * @throws java.io.IOException
	 * 		If one of the model files fails to load or either the input file or the output
	 * 		file is {@code null}.
	 */
	public static void split(File input, File output, boolean segOnly)
			throws IOException {
		if (input == null) throw new FileNotFoundException("input == null!");
		if (output == null) throw new FileNotFoundException("output == null!");
		IInputProvider inputProvider = IOUtils.inputFromFile(input);
		IOutputHandler outputHandler = IOUtils.outputToFile(output);
		split(inputProvider, outputHandler, segOnly);
	}

	/**
	 * Run the segmentation program with argument {@code segOnly} and default values
	 * for all others.
	 *
	 * @param input
	 * 		The {@link IInputProvider} instance to provide input.
	 * @param output
	 * 		The {@link IOutputHandler} instance to handle output.
	 * @param segOnly
	 * 		Whether to output only segments.
	 *
	 * @throws java.io.IOException
	 * 		If I/O of either {@code input}, {@code output} or one of the model files
	 * 		resulted in an exception.
	 */
	public static void split(IInputProvider input, IOutputHandler output, boolean segOnly)
			throws IOException {
		split("models/", '_', null, false, segOnly, false, input, output);
	}

	/**
	 * Run the segmentation program with full arguments.
	 *
	 * @param modelDir
	 * 		The directory under which the model files are located.
	 * @param separator
	 * 		The separator to use to separate words and tags.
	 * @param userDict
	 * 		The optional file name of the user-specified dictionary.
	 * @param useT2S
	 * 		Whether to transfer traditional Chinese to simplified Chinese before
	 * 		segmentation.
	 * @param segOnly
	 * 		Whether to output only segments.
	 * @param useFilter
	 * 		Whether to use filters while processing.
	 * @param input
	 * 		The {@link IInputProvider} instance to provide input.
	 * @param output
	 * 		The {@link IOutputHandler} instance to handle output.
	 *
	 * @throws java.io.IOException
	 * 		If I/O of either {@code input}, {@code output} or one of the model files
	 * 		resulted in an exception.
	 */
	public static void split(
			String modelDir, char separator, String userDict,
			boolean useT2S, boolean segOnly, boolean useFilter,
			IInputProvider input, IOutputHandler output) throws IOException {
		try {
			input.onProgramStart();
			output.onProgramStart();

			// segmentation: "cws_*" models segment only, "model_c_*" also POS-tag
			CBTaggingDecoder taggingDecoder = new CBTaggingDecoder();
			taggingDecoder.threshold = segOnly ? 0 : 10000;
			String prefix = modelDir + (segOnly ? "cws_" : "model_c_");
			taggingDecoder.loadFiles(prefix + "model.bin",
					prefix + "dat.bin",
					prefix + "label.txt");
			taggingDecoder.setLabelTrans();

			// preprocess passes
			List<IPreprocessPass> pre = new ArrayList<>();
			pre.add(new PreProcessPass());
			if (useT2S) pre.add(new ConvertT2SPass(modelDir + "t2s.dat"));

			// postprocess passes
			List<IPostprocessPass> post = new ArrayList<>();
			post.add(new DictionaryPass(modelDir + "ns.dat", "ns", false));
			post.add(new DictionaryPass(modelDir + "idiom.dat", "i", false));
			post.add(new DictionaryPass(modelDir + "singlepun.dat", "w", false));
			post.add(new TimeWordPass());
			post.add(new DoubleWordPass());
			post.add(new SpecialPass());
			post.add(new NegWordPass(modelDir + "neg.dat"));
			if (userDict != null) post.add(new DictionaryPass(userDict, "uw", true));
			if (useFilter)
				post.add(new FilterPass(modelDir + "xu.dat", modelDir + "time.dat"));

			// main loop; ArrayList instead of the legacy synchronized Vector —
			// the list is confined to this thread
			List<TaggedWord> words = new ArrayList<>();
			POCGraph graph = new POCGraph();
			for (List<String> lineSegments = input.provideInput();
				 lineSegments != null;
				 lineSegments = input.provideInput()) {
				output.handleLineStart();
				for (String raw : lineSegments) {
					for (IPreprocessPass pass : pre) raw = pass.process(raw, graph);
					taggingDecoder.segment(raw, graph, words);
					for (IPostprocessPass pass : post) pass.process(words);

					output.handleLineSegment(words, segOnly, separator);
				}
				output.handleLineEnd();
			}
		} finally { // close resources even when program crashes
			input.onProgramEnd();
			output.onProgramEnd();
		}
	}
}
--------------------------------------------------------------------------------
/src/main/java/org/thunlp/thulac/cb/AlphaBeta.java:
--------------------------------------------------------------------------------
package
org.thunlp.thulac.cb;


// a structure for alphas and betas used by the dynamic-programming decoder
public class AlphaBeta {

	public int value;
	public int nodeId;
	public int labelId;

	public AlphaBeta() {
		super();
		this.value = 0;
		this.nodeId = -2; // -2 marks "not yet computed"
		this.labelId = 0;
	}

	public AlphaBeta(int value, int nodeId, int labelId) {
		super();
		this.value = value;
		this.nodeId = nodeId;
		this.labelId = labelId;
	}

	/**
	 * Viterbi-style decode over the node lattice: fills {@code alphas} with the best
	 * score per (node, label), writes the best label per node into {@code result},
	 * and returns the best final score.
	 *
	 * BUG FIX: the original "all labels allowed" branches used
	 * {@code while (k != l_size) { ... if (preAlpha.nodeId == -2) continue; ... k++; }},
	 * where the {@code continue} skipped the trailing {@code k++} and spun forever
	 * whenever an uninitialized alpha (-2) was encountered. Rewritten as {@code for}
	 * loops so {@code continue} still increments {@code k}.
	 */
	public static int dbDecode(
			int l_size, int[] llWeights, int nodeCount, Node[] nodes, int[] values,
			AlphaBeta[] alphas,
			int[] result, int[][] preLabels, int[][] allowedLabelLists) {
		int nodeId;
		int[] pNodeId;
		int[] pPreLabel;
		int[] pAllowedLabel;
		int k;
		int j;
		AlphaBeta tmp;
		AlphaBeta best = new AlphaBeta();
		best.nodeId = -1;
		AlphaBeta preAlpha;

		int score;
		int index = 0;
		int index2 = 0;
		int index3 = 0;

		// reset all alphas to "not computed"
		for (int i = 0; i < nodeCount * l_size; i++) {
			alphas[i] = new AlphaBeta();
			alphas[i].nodeId = -2;
		}
		for (int i = 0; i < nodeCount; i++) {
			pAllowedLabel = allowedLabelLists != null ? allowedLabelLists[i] : null;
			j = -1;
			int maxValue = 0;
			boolean hasMaxValue = false;
			if (pAllowedLabel != null) {
				// only the labels listed (terminated by -1) are considered for node i
				index = 0;
				while ((j = pAllowedLabel[index]) != -1) {
					index++;
					if (!hasMaxValue || (maxValue < values[i * l_size + j])) {
						hasMaxValue = true;
						maxValue = values[i * l_size + j];
					}
				}
				index = 0;
				j = -1;
				while ((j = pAllowedLabel[index]) != -1) {
					index++;
					tmp = alphas[i * l_size + j];
					tmp.value = 0;
					pNodeId = nodes[i].predecessors;
					pPreLabel = preLabels != null ? preLabels[j] : null;
					index2 = 0;
					while ((nodeId = pNodeId[index2]) >= 0) {
						index2++;
						if (pPreLabel != null) {
							index3 = 0;
							while ((k = pPreLabel[index3]) != -1) {
								index3++;
								preAlpha = alphas[nodeId * l_size + k];
								if (preAlpha.nodeId == -2) continue;
								score = preAlpha.value + llWeights[k * l_size + j];
								if ((tmp.nodeId < 0) || (score > tmp.value)) {
									tmp.value = score;
									tmp.nodeId = nodeId;
									tmp.labelId = k;
								}
							}
						} else {
							for (k = 0; k < l_size; k++) { // BUG FIX: was a while loop whose continue skipped k++
								preAlpha = alphas[nodeId * l_size + k];
								if (preAlpha.nodeId == -2) continue;
								score = preAlpha.value + llWeights[k * l_size + j];
								if ((tmp.nodeId < 0) || (score > tmp.value)) {
									tmp.value = score;
									tmp.nodeId = nodeId;
									tmp.labelId = k;
								}
							}
						}
					}
					tmp.value += values[i * l_size + j];
					if ((nodes[i].type == 1) || (nodes[i].type == 3)) {
						tmp.nodeId = -1; // path start marker
					}
					if (nodes[i].type >= 2) { // possible path end: track global best
						if ((best.nodeId == -1) || best.value < tmp.value) {
							best.value = tmp.value;
							best.nodeId = i;
							best.labelId = j;
						}
					}
				}

			} else {
				// all labels 0..l_size-1 are allowed for node i
				j++;
				while (j != l_size) {
					if (!hasMaxValue || (maxValue < values[i * l_size + j])) {
						hasMaxValue = true;
						maxValue = values[i * l_size + j];
					}
					j++;
				}
				j = 0;
				while (j != l_size) {
					tmp = alphas[i * l_size + j];
					tmp.value = 0;
					pNodeId = nodes[i].predecessors;
					pPreLabel = preLabels != null ? preLabels[j] : null;
					index2 = 0;
					while ((nodeId = pNodeId[index2]) >= 0) {
						index2++;
						if (pPreLabel != null) {
							index3 = 0;
							while ((k = pPreLabel[index3]) != -1) {
								index3++;
								preAlpha = alphas[nodeId * l_size + k];
								if (preAlpha.nodeId == -2) continue;
								score = preAlpha.value + llWeights[k * l_size + j];
								if ((tmp.nodeId < 0) || (score > tmp.value)) {
									tmp.value = score;
									tmp.nodeId = nodeId;
									tmp.labelId = k;
								}
							}
						} else {
							for (k = 0; k < l_size; k++) { // BUG FIX: was a while loop whose continue skipped k++
								preAlpha = alphas[nodeId * l_size + k];
								if (preAlpha.nodeId == -2) continue;
								score = preAlpha.value + llWeights[k * l_size + j];
								if ((tmp.nodeId < 0) || (score > tmp.value)) {
									tmp.value = score;
									tmp.nodeId = nodeId;
									tmp.labelId = k;
								}
							}
						}
					}
					tmp.value += values[i * l_size + j];
					if ((nodes[i].type == 1) || (nodes[i].type == 3)) {
						tmp.nodeId = -1;
					}
					if (nodes[i].type >= 2) {
						if ((best.nodeId == -1) || best.value < tmp.value) {
							best.value = tmp.value;
							best.nodeId = i;
							best.labelId = j;
						}
					}
					j++;
				}

			}
		}
		// back-trace the best path into result
		tmp = best;
		while (tmp.nodeId >= 0) {
			result[tmp.nodeId] = tmp.labelId;
			tmp = alphas[tmp.nodeId * l_size + tmp.labelId];
		}
		return best.value;
	}
}
--------------------------------------------------------------------------------
/src/main/java/org/thunlp/thulac/cb/CBModel.java:
--------------------------------------------------------------------------------
package org.thunlp.thulac.cb;

import org.thunlp.thulac.util.BufferUtils;

import java.io.FileInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.IntBuffer;
import
java.nio.channels.FileChannel;

/** Weight model loaded from a little-endian binary file: header (l_size, f_size) then weights. */
public class CBModel {

	public int l_size; // size of the labels
	public int f_size; // size of the features

	public int[] ll_weights; // weights of (label, label)
	public int[] fl_weights; // weights of (feature, label)

	public CBModel(String filename) throws IOException {
		// try-with-resources: the original leaked the stream/channel if a read threw
		try (FileInputStream in = new FileInputStream(filename);
			 FileChannel channel = in.getChannel()) {

			ByteBuffer header = ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN);
			header.clear();
			channel.read(header);
			header.flip();
			IntBuffer intHeader = header.asIntBuffer();
			this.l_size = intHeader.get();
			this.f_size = intHeader.get();

			int llSize = this.l_size * this.l_size, flSize = this.l_size * this.f_size;
			this.ll_weights = new int[llSize];
			this.fl_weights = new int[flSize];
			ByteBuffer buf = ByteBuffer.allocate(64 * 1024).order(ByteOrder.LITTLE_ENDIAN);
			buf.clear();
			BufferUtils.readInts(channel, buf, this.ll_weights, this.fl_weights);
		}
	}
}
--------------------------------------------------------------------------------
/src/main/java/org/thunlp/thulac/cb/CBNGramFeature.java:
--------------------------------------------------------------------------------
package org.thunlp.thulac.cb;

import org.thunlp.thulac.data.Dat;

import java.util.Vector;

/** Computes n-gram feature scores per character position into the shared values array. */
public class CBNGramFeature {

	private static final int SENTENCE_BOUNDARY = '#';

	private int separator;
	private int maxLength;
	private int[] uniBases;
	private int[] biBases;
	private int[] values;
	private int datSize;
	private int[] dat;
	private CBModel model;

	public CBNGramFeature(Dat myDat, CBModel model, int[] values) {
		this.separator = ' ';
		this.datSize = myDat.datSize;
		this.dat = myDat.dat;
		this.model = model;
		this.maxLength = 20000;
		this.uniBases = new int[this.maxLength + 2];
		this.biBases = new int[this.maxLength + 4];
		this.values = values;
	}

	// Adds the feature weights of (base + del) to values[valueOffset..]; the l_size == 4
	// case is unrolled as a fast path.
	private void addValues(int valueOffset, int base, int del) {
		int ind = this.dat[base << 1] + del;
		if (ind >= this.datSize || this.dat[(ind << 1) + 1] != base) return;
		int offset = this.dat[ind << 1];
		int weightOffset = offset * this.model.l_size;
		if (this.model.l_size == 4) {
			this.values[valueOffset] += this.model.fl_weights[weightOffset];
			this.values[valueOffset + 1] += this.model.fl_weights[weightOffset + 1];
			this.values[valueOffset + 2] += this.model.fl_weights[weightOffset + 2];
			this.values[valueOffset + 3] += this.model.fl_weights[weightOffset + 3];
		} else for (int i = 0; i < this.model.l_size; i++) {
			this.values[valueOffset + i] += this.model.fl_weights[weightOffset + i];
		}
	}

	// Returns [uniBase, biBase] for the pair (ch1, ch2); -1 when a base is absent in dat.
	// ASCII printable characters are mapped to their full-width forms first.
	private Vector<Integer> findBases(int datSize, int ch1, int ch2) {
		Vector<Integer> result = new Vector<>();
		int uniBase;
		int biBase;
		if (ch1 > 32 && ch1 < 128) ch1 += 65248;
		if (ch2 > 32 && ch2 < 128) ch2 += 65248;
		if (ch1 >= datSize || this.dat[(ch1 << 1) + 1] != 0) {
			uniBase = -1;
			biBase = -1;
			result.clear();
			result.add(uniBase);
			result.add(biBase);
			return result;
		}
		uniBase = this.dat[ch1 << 1] + this.separator;
		int ind = this.dat[ch1 << 1] + ch2;
		if (ind >= datSize || this.dat[(ind << 1) + 1] != ch1) {
			biBase = -1;
			result.clear();
			result.add(uniBase);
			result.add(biBase);
			return result;
		}
		biBase = this.dat[ind << 1] + this.separator;
		result.clear();
		result.add(uniBase);
		result.add(biBase);
		return result;
	}

	/** Fills values for the first {@code len} chars of {@code sequence}; returns 0 on success. */
	public int putValues(String sequence, int len) {
		if (len >= this.maxLength) {
			System.err.println("Length larger than maxLength.");
			return 1;
		}

		Vector<Integer> result = this.findBases(this.datSize, SENTENCE_BOUNDARY,
				SENTENCE_BOUNDARY);
		this.uniBases[0] = result.get(0);
		this.biBases[0] = result.get(1);

		result = this.findBases(this.datSize, SENTENCE_BOUNDARY, sequence.charAt(0));
		// NOTE(review): uniBases[0] is re-assigned here (same '#' base as above, so the
		// value is identical); confirm against upstream whether uniBases[1] was intended.
		this.uniBases[0] = result.get(0);
		this.biBases[1] = result.get(1);
		for (int i = 0; i + 1 < len; i++) {
			result = this.findBases(this.datSize, sequence.charAt(i),
					sequence.charAt(i + 1));
			this.uniBases[i + 1] = result.get(0);
			this.biBases[i + 2] = result.get(1);
		}

		result = this.findBases(this.datSize, (int) sequence.charAt(len - 1),
				SENTENCE_BOUNDARY);
		this.uniBases[len] = result.get(0);
		this.biBases[len + 1] = result.get(1);

		result = this.findBases(this.datSize, SENTENCE_BOUNDARY, SENTENCE_BOUNDARY);
		this.uniBases[len + 1] = result.get(0);
		this.biBases[len + 2] = result.get(1);

		int base;
		for (int i = 0; i < len; i++) {
			int valueOffset = i * this.model.l_size;
			if ((base = this.uniBases[i + 1]) != -1) {
				this.addValues(valueOffset, base, 49);
			}
			if ((base = this.uniBases[i]) != -1) {
				this.addValues(valueOffset, base, 50);
			}
			if ((base = this.uniBases[i + 2]) != -1) {
				this.addValues(valueOffset, base, 51);
			}
			if ((base = this.biBases[i + 1]) != -1) {
				this.addValues(valueOffset, base, 49);
			}
			if ((base = this.biBases[i + 2]) != -1) {
				this.addValues(valueOffset, base, 50);
			}
			if ((base = this.biBases[i]) != -1) {
				this.addValues(valueOffset, base, 51);
			}
			if ((base = this.biBases[i + 3]) != -1) {
				this.addValues(valueOffset, base, 52);
			}
		}
		return 0;
	}
}
--------------------------------------------------------------------------------
/src/main/java/org/thunlp/thulac/cb/CBTaggingDecoder.java:
--------------------------------------------------------------------------------
package org.thunlp.thulac.cb;

import org.thunlp.thulac.data.Dat;
import org.thunlp.thulac.data.POCGraph;
import org.thunlp.thulac.data.TaggedWord;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.List;
import java.util.Vector;

/**
 * A character-based tagging decoder: scores each character/label pair with an
 * n-gram feature model ({@link CBNGramFeature}) and finds the best label sequence
 * with a dynamic-programming decode ({@link AlphaBeta#dbDecode}), producing a list
 * of {@link TaggedWord}.
 */
public class CBTaggingDecoder {

	private int maxLength;
	private int len; // length of the sentence currently being decoded
	private String sequence; // the sentence currently being decoded
	// allowedLabelLists[i]: -1-terminated list of labels permitted at position i,
	// or null when unrestricted
	private int[][] allowedLabelLists;
	// pocsToTags[poc]: -1-terminated list of labels compatible with a POC bit mask
	private int[][] pocsToTags;

	private CBNGramFeature nGramFeature;
	private Dat dat;

	private CBModel model;

	private Node[] nodes;
	private int[] values;
	private AlphaBeta[] alphas;
	private int[] result;

	// labelInfo[i]: raw label line; charAt(0) is the segmentation position
	// ('0' begin, '1' middle, '2' end, '3' single word), the rest is the POS tag
	private String[] labelInfo;

	private int[][] labelTransPre;
	private int[][] labelTransPost;

	public int threshold;

	/** Creates an empty decoder; {@link #loadFiles} must be called before use. */
	public CBTaggingDecoder() {
		this.maxLength = 20000;
		this.len = 0;
		this.sequence = "";
		this.allowedLabelLists = new int[this.maxLength][];

		this.pocsToTags = null;
		this.nGramFeature = null;
		this.dat = null;
		this.nodes = new Node[this.maxLength];
		this.labelTransPre = null;
		this.labelTransPost = null;
		this.threshold = 0;

		this.model = null;
		this.alphas = null;
	}

	/**
	 * Loads the model, the DAT dictionary and the label file, and initializes all
	 * decoding buffers.
	 *
	 * @param modelFile name of the {@link CBModel} file
	 * @param datFile   name of the {@link Dat} file
	 * @param labelFile name of the label file, one label description per line
	 *
	 * @throws IOException if any of the files cannot be read
	 */
	public void loadFiles(String modelFile, String datFile, String labelFile)
			throws IOException {
		this.model = new CBModel(modelFile);

		this.values = new int[this.maxLength * this.model.l_size];
		this.alphas = new AlphaBeta[this.maxLength * this.model.l_size];
		this.result = new int[this.maxLength * this.model.l_size];

		// build a linear chain: node i is preceded by i - 1 and followed by i + 1,
		// each neighbor list being -1-terminated
		for (int i = 0; i < this.maxLength; i++) {
			this.nodes[i] = new Node();
			this.nodes[i].predecessors = new int[]{i - 1, -1};
			this.nodes[i].successors = new int[]{i + 1, -1};
		}

		this.dat = new Dat(datFile);
		this.nGramFeature = new CBNGramFeature(this.dat, this.model, this.values);

		this.labelInfo = new String[10000]; // generous upper bound on label count
		Vector<Vector<Integer>> pocTags = new Vector<>();
		for (int i = 0; i < 16; i++) pocTags.add(new Vector<>());
		// NOTE(review): reads with the platform default charset, as the original
		// did — confirm label files are ASCII-only before changing this
		try (BufferedReader in = new BufferedReader(
				new InputStreamReader(new FileInputStream(labelFile)))) {
			String line;
			int ind = 0;
			while ((line = in.readLine()) != null) {
				this.labelInfo[ind] = line;
				int segInd = line.charAt(0) - '0';
				// register this label under every POC bit mask that includes it
				for (int j = 0; j < 16; j++)
					if (((1 << segInd) & j) != 0) pocTags.get(j).add(ind);
				ind++;
			}
		}

		this.pocsToTags = new int[16][];
		for (int j = 1; j < 16; j++) {
			this.pocsToTags[j] = new int[pocTags.get(j).size() + 1];
			for (int k = 0; k < pocTags.get(j).size(); k++)
				this.pocsToTags[j][k] = pocTags.get(j).get(k);
			this.pocsToTags[j][pocTags.get(j).size()] = -1; // -1-terminated
		}

		// NOTE(review): labelLookingFor is computed but never read afterwards;
		// kept for parity with the original implementation
		int[][] labelLookingFor = new int[this.model.l_size][];
		for (int i = 0; i < this.model.l_size; i++) labelLookingFor[i] = null;
		for (int i = 0; i < this.model.l_size; i++) {
			// only '1' (middle) and '2' (end) labels are matched to their
			// '0' (begin) counterpart with the same tag
			if ("30".indexOf(this.labelInfo[i].charAt(0)) != -1) continue;
			for (int j = 0; j <= i; j++) {
				if ((this.labelInfo[i].substring(1).equals(
						this.labelInfo[j].substring(1))) && (this.labelInfo[j].charAt(
						0) == '0')) {
					if (labelLookingFor[j] == null) {
						labelLookingFor[j] = new int[]{-1, -1};
					}
					labelLookingFor[j][this.labelInfo[i].charAt(0) - '1'] = i;
					break;
				}
			}
		}

		for (int i = 0; i < this.maxLength; i++) this.allowedLabelLists[i] = null;
	}

	/**
	 * Runs the dynamic-programming decode over the current sentence, storing the
	 * best label sequence in {@code this.result}. The first and last positions are
	 * temporarily constrained to word-begin / word-end labels when unconstrained.
	 */
	public void dp() {
		if (this.allowedLabelLists[0] == null)
			this.allowedLabelLists[0] = this.pocsToTags[9]; // POC_B | POC_S
		if (this.allowedLabelLists[this.len - 1] == null)
			this.allowedLabelLists[this.len - 1] = this.pocsToTags[12]; // POC_E | POC_S
		AlphaBeta.dbDecode(this.model.l_size, this.model.ll_weights,
				this.len, this.nodes, this.values, this.alphas, this.result,
				this.labelTransPre, this.allowedLabelLists);
		// reset the temporary boundary constraints
		this.allowedLabelLists[0] = null;
		this.allowedLabelLists[this.len - 1] = null;
	}

	/**
	 * Precomputes, for every label, which labels may legally precede and follow it,
	 * storing -1-terminated lists in {@code labelTransPre} / {@code labelTransPost}.
	 */
	public void setLabelTrans() {
		int lSize = this.model.l_size;
		Vector<Vector<Integer>> preLabels = new Vector<>();
		Vector<Vector<Integer>> postLabels = new Vector<>();
		for (int i = 0; i < lSize; i++) {
			preLabels.add(new Vector<>());
			postLabels.add(new Vector<>());
		}
		for (int i = 0; i < lSize; i++) {
			for (int j = 0; j < lSize; j++) {
				int ni = this.labelInfo[i].charAt(0) - '0';
				int nj = this.labelInfo[j].charAt(0) - '0';
				boolean iIsEnd = ((ni == 2) || (ni == 3));
				boolean jIsBegin = ((nj == 0) || (nj == 3));
				boolean sameTag = this.labelInfo[i].substring(1)
						.equals(this.labelInfo[j].substring(1));
				if (sameTag) {
					// within one word the position must advance:
					// begin->middle/end, middle->middle/end, and a word end may
					// be followed by a new begin/single of the same tag
					if ((ni == 0 && nj == 1) ||
							(ni == 0 && nj == 2) ||
							(ni == 1 && nj == 2) ||
							(ni == 1 && nj == 1) ||
							(ni == 2 && nj == 0) ||
							(ni == 2 && nj == 3) ||
							(ni == 3 && nj == 3) ||
							(ni == 3 && nj == 0)) {
						preLabels.get(j).add(i);
						postLabels.get(i).add(j);
					}
				} else if (iIsEnd && jIsBegin) {
					// across words any tag change is allowed
					preLabels.get(j).add(i);
					postLabels.get(i).add(j);
				}
			}
		}
		this.labelTransPre = new int[lSize][];
		for (int i = 0; i < lSize; i++) {
			this.labelTransPre[i] = new int[preLabels.get(i).size() + 1];
			for (int j = 0; j < preLabels.get(i).size(); j++) {
				this.labelTransPre[i][j] = preLabels.get(i).get(j);
			}
			this.labelTransPre[i][preLabels.get(i).size()] = -1;
		}

		this.labelTransPost = new int[lSize][];
		for (int i = 0; i < lSize; i++) {
			this.labelTransPost[i] = new int[postLabels.get(i).size() + 1];
			for (int j = 0; j < postLabels.get(i).size(); j++)
				this.labelTransPost[i][j] = postLabels.get(i).get(j);
			this.labelTransPost[i][postLabels.get(i).size()] = -1;
		}
	}

	/** Computes the feature values of the current sentence into {@code values}. */
	public void putValues() {
		if (this.len == 0) return;
		for (int i = 0; i < this.len; i++) this.nodes[i].type = 0;
		this.nodes[0].type += 1; // starting node
		this.nodes[this.len - 1].type += 2; // ending node

		int size = this.len * this.model.l_size;
		for (int i = 0; i < size; i++) this.values[i] = 0;
		this.nGramFeature.putValues(this.sequence, this.len);
	}

	/**
	 * Segments (and tags) one line of input.
	 *
	 * @param raw   the raw input line
	 * @param graph the POC restrictions for each character of {@code raw}; a value
	 *              of 0 is treated as "all positions allowed" (mask 15)
	 * @param ts    output list, cleared and then filled with the tagged words
	 *
	 * @return {@code false} if {@code raw} is empty, {@code true} otherwise
	 */
	public boolean segment(String raw, POCGraph graph, List<TaggedWord> ts) {
		if (raw.length() == 0) return false;

		for (int i = 0; i < raw.length(); i++)
			this.allowedLabelLists[i] = this.pocsToTags[
					graph.get(i) == 0 ? 15 : graph.get(i)];
		// strings are immutable: assigning is equivalent to the original
		// character-by-character copy, without the O(n^2) concatenation
		this.sequence = raw;
		this.len = raw.length();
		this.putValues(); // calculate feature values and store them in values
		this.dp(); // DP search for the best answer and store it in result

		for (int i = 0; i < raw.length(); i++) this.allowedLabelLists[i] = null;
		int offset = 0;
		ts.clear();
		for (int i = 0; i < this.len; i++) {
			// a word ends at the last character, or on a label whose position
			// digit is '2' (word end) or '3' (single-character word)
			if ((i == this.len - 1) || (this.labelInfo[this.result[i]].charAt(
					0) == '2') || (this.labelInfo[this.result[i]].charAt(0) == '3')) {
				TaggedWord tagged = new TaggedWord();
				tagged.word = this.sequence.substring(offset, i + 1);
				offset = i + 1; // output tags
				tagged.endOffset = offset;
				tagged.startOffset = offset - tagged.word.length();
				tagged.tag = this.labelInfo[this.result[i]].substring(1);
				ts.add(tagged);
			}
		}
		return true;
	}
}
--------------------------------------------------------------------------------
/src/main/java/org/thunlp/thulac/cb/Node.java:
--------------------------------------------------------------------------------
package org.thunlp.thulac.cb;

/**
 * A class which contains topological information of a node.
 */
public class Node {
	/**
	 * Value:
	 * <ul>
	 * <li>1: If this {@link Node} is a starting node.</li>
	 * <li>2: If this {@link Node} is an ending node.</li>
	 * <li>0: otherwise.</li>
	 * </ul>
	 */
	public int type;

	public int[] predecessors; // last element should be -1
	public int[] successors; // last element should be -1
}
--------------------------------------------------------------------------------
/src/main/java/org/thunlp/thulac/data/Dat.java:
--------------------------------------------------------------------------------
package org.thunlp.thulac.data;

import org.thunlp.thulac.util.BufferUtils;
import org.thunlp.thulac.util.StringUtils;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.SeekableByteChannel;
import java.nio.file.Files;
import java.nio.file.Paths;

/**
 * A class which loads data files from disk and provides necessary operations.
 * Instances are created with the {@link #Dat(String)} constructor which reads from a
 * file, or with {@link DatMaker#readFromTxtFile(String)} which constructs a {@code Dat}
 * structure from a user-specified dictionary.<br>
 * Internally, {@code Dat} uses the two-array Trie Tree to store information that can
 * be searched through at high speed, (sometimes) even faster than using
 * {@link java.util.HashMap}.
 */
public class Dat {
	/**
	 * The two-array Trie Tree, use {@code dat[i << 1]} to access {@code base[i]} and
	 * {@code dat[(i << 1) + 1]} to access {@code check[i]}.
	 */
	public int[] dat;
	/**
	 * The size of the Trie Tree, should be {@code this.dat.length / 2}.
	 */
	public int datSize;

	protected Dat(int size) {
		this.dat = new int[size << 1];
		this.datSize = size;
	}

	/**
	 * Read a {@link Dat} from a given file.
	 *
	 * @param filename
	 *         The name of the {@link Dat} file.
	 *
	 * @throws IOException
	 *         If an I/O error occurred while reading the file.
	 */
	public Dat(String filename) throws IOException {
		// try-with-resources: the channel is closed even if reading throws
		try (SeekableByteChannel channel = Files.newByteChannel(Paths.get(filename))) {
			// DWORD base + DWORD check -> 8 bytes per record
			this.datSize = (int) (channel.size() >> 3);
			this.dat = new int[this.datSize << 1];
			// strange though, dat files are stored little endian
			ByteBuffer bb = ByteBuffer.allocateDirect(64 * 1024)
					.order(ByteOrder.LITTLE_ENDIAN);
			bb.clear();
			if (!BufferUtils.readInts(channel, bb, this.dat))
				throw new IOException("File does not contain enough data!");
		}
	}

	// if word in dat, return leaf element, otherwise return -1
	private int match(String word) {
		int ind = 0;
		int base = 0;
		int[] codePoints = StringUtils.toCodePoints(word);
		for (int c : codePoints) {
			ind = this.dat[ind << 1] + c;
			if (ind >= this.datSize || this.dat[(ind << 1) + 1] != base) return -1;
			base = ind;
		}
		ind = this.dat[base << 1];
		return ind < this.datSize && this.dat[(ind << 1) + 1] == base ? ind : -1;
	}

	// if prefix in dat, return -base; otherwise return the length of the longest
	// prefix of the query that is found in dat
	public int getInfo(String prefix) {
		int ind = 0;
		int base = 0;
		for (int i = 0; i < prefix.length(); i++) {
			ind = this.dat[ind << 1] + prefix.charAt(i);
			if (ind >= this.datSize || this.dat[(ind << 1) + 1] != base) return i;
			base = ind;
		}
		return -base;
	}

	/**
	 * Returns whether this {@link Dat} contains one or more words that begin with
	 * {@code prefix}.
	 *
	 * @param prefix
	 *         The query prefix.
	 *
	 * @return Whether this {@link Dat} contains one or more words that begin with
	 * {@code prefix}.
	 */
	public boolean containsPrefix(String prefix) {
		return this.getInfo(prefix) < 0;
	}

	/**
	 * Returns whether this {@link Dat} contains the given word.
	 *
	 * @param word
	 *         The query word.
	 *
	 * @return Whether this {@link Dat} contains {@code word}.
	 */
	public boolean contains(String word) {
		return this.match(word) != -1;
	}
}
--------------------------------------------------------------------------------
/src/main/java/org/thunlp/thulac/data/DatMaker.java:
--------------------------------------------------------------------------------
package org.thunlp.thulac.data;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Vector;

/**
 * A class used to construct instances of {@link Dat} from user-specified dictionary
 * files. It extends {@link Dat} to avoid unnecessary array copies and to increase
 * performance.<br>
 * A confusing algorithm is used to construct the two-array Trie Tree used by
 * {@link Dat}, see in-line comments for more information.
 */
public class DatMaker extends Dat {
	// a record of a word with a related integer (the word's input line number)
	private static class Record {
		public String word;
		public int num;

		public Record() {
			this("", 0);
		}

		public Record(String key, int value) {
			this.word = key;
			this.num = value;
		}
	}

	// Records are compared by comparing their words
	private static final Comparator<Record> RECORDS_COMPARATOR =
			Comparator.comparing(r -> r.word);

	/**
	 * Reads (or more precisely, constructs) an instance of {@link Dat} from the given
	 * {@link InputStream}. This is used to generate {@link Dat} from a user-specified
	 * dictionary, which consists of multiple lines, each one representing a word in
	 * the dictionary.
	 *
	 * @param in
	 *         The {@link InputStream} to read.
	 *
	 * @return The generated {@link Dat}.
	 *
	 * @throws IOException
	 *         If an I/O error happens.
	 */
	public static Dat readFromInputStream(InputStream in) throws IOException {
		List<String> words = new ArrayList<>();
		// try-with-resources: the reader (and thus in) is closed on all paths
		try (BufferedReader reader = new BufferedReader(new InputStreamReader(in))) {
			String str;
			while ((str = reader.readLine()) != null) words.add(str);
		}

		DatMaker dat = new DatMaker();
		dat.buildDat(words);
		return dat;
	}

	/**
	 * Reads (or more precisely, constructs) an instance of {@link Dat} from the given
	 * file. This is used to generate {@link Dat} from a user-specified dictionary,
	 * which consists of multiple lines, each one representing a word in the dictionary.
	 *
	 * @param filename
	 *         The name of the file.
	 *
	 * @return The generated {@link Dat}.
	 *
	 * @throws IOException
	 *         If the given file does not exist or is not readable.
	 */
	public static Dat readFromTxtFile(String filename) throws IOException {
		return readFromInputStream(new FileInputStream(filename));
	}

	// The main idea of this ingenious algorithm that generates a Dat instance from the
	// input string is that it makes use of the unused space of the original
	// double-array Trie Tree to store a double-linked list. This means that it is
	// fully compatible with the standard double-array Trie Tree data structure.
	// What's more, this algorithm achieves its goal without extra storage space,
	// except for the head and tail fields. But these only require O(1) space, so they
	// can be safely ignored.

	// this.dat, the only storage block used by this algorithm, is an array of
	// ELEMENTS. An ELEMENT contains two values, called BASE and CHECK, both integers.
	// this.dat is structured in this way:
	// ELEMENTS[0].BASE, ELEMENTS[0].CHECK, ELEMENTS[1].BASE, ELEMENTS[1].CHECK, ...
	// this.datSize is the total number of ELEMENTS, so
	// this.dat.length = 2 * this.datSize.
	// In the following parts, BASE and CHECK will be referred to as the FIELDS of an
	// ELEMENT, for example, "the BASE FIELD of ELEMENT[4]".

	// The program distinguishes the two different data structures stored in this.dat
	// by the sign of the ELEMENTS' FIELDS.
	// ELEMENTS whose CHECK and BASE FIELDS are positive belong to the double-array
	// Trie Tree, while those whose CHECK and BASE FIELDS are negative belong to the
	// double-linked list. When an ELEMENT belongs to the Trie Tree, we call it USED.
	// Otherwise, we call it UNUSED.

	// The data structure of the Trie Tree:
	// FIELDS of USED ELEMENTS strictly follow the definitions of the double-array
	// Trie Tree. For the current stage S and input character C, we have:
	// ELEMENTS[ELEMENTS[S].BASE + C].CHECK = S
	// ELEMENTS[S].BASE + C = T
	// where T is the next stage the DFA (Deterministic Finite Automaton) described by
	// the Trie Tree should jump to.

	// The data structure of the double-linked list:
	// In a double-linked list there are multiple NODES, each containing two pointers
	// PREV and NEXT. In accordance with the c-style arrow (->) operator, this list
	// conforms to the following equations:
	// NODE->NEXT->PREV = NODE
	// NODE->PREV->NEXT = NODE
	// In this implementation, pointers take the negative of the values of the indices
	// of the NODES they point to. The PREV pointer is stored in the BASE field, and
	// the NEXT pointer in the CHECK field. We have,
	// -ELEMENTS[ -ELEMENTS[i].CHECK ].BASE = i
	// -ELEMENTS[ -ELEMENTS[i].BASE ].CHECK = i
	// The negative signs appear because fields of ELEMENTs in the double-linked list
	// are negative.

	// The pointers to the HEAD NODE and the TAIL NODE are stored in this.head and
	// this.tail, respectively. -this.head is the index of the first NODE in the
	// double-linked list, and -this.tail is the index of the last NODE.

	// The buildDat() method takes a list of strings as input and sorts them in
	// alphabetical order. Afterwards, findChildren() breaks strings - char
	// sequences - into a tree of characters, as described in the Trie Tree.
	// Since the Trie Tree is a representation of a DFA, a stage has to be generated
	// for each node in the tree. Such a stage, stored as ELEMENTS, has the BASE and
	// CHECK FIELDS. The CHECK field of an ELEMENT is assigned when its parent stage
	// is generated. The assignment of the value in BASE FIELD is implemented in
	// allocate() and described below:

	// 1. Set variable BASE to this.head.
	// 2. Determine whether BASE is available. (If all ELEMENTS[BASE + C] are UNUSED
	//    for every C of the child nodes of the current one)
	// 3. If BASE is available, return BASE; otherwise, set BASE to the next UNUSED
	//    ELEMENT, using the double-linked list.
	// In this process, if no available BASE is found, the size of this.dat is doubled
	// through the expandDat() method, which also maintains the double-linked list in
	// the newly allocated ELEMENTS.

	// After an available BASE has been found for the current stage, markAsUsed() is
	// called with BASE and all BASE + C, updating the double-linked list.

	// Afterwards, populate() is called. It sets ELEMENTS[BASE + C].CHECK to S for
	// all C in the child nodes and sets ELEMENTS[S].BASE to BASE. ELEMENTS[S].CHECK
	// is set to S if stage BASE can be the end of a word; otherwise, it is set to
	// BASE. For each word in the lexicon, its corresponding leaf node in the Trie
	// Tree will have its BASE field set to the line number of the word. (Remember
	// that the user-specified dictionary consists of multiple lines, each one
	// representing a word in the dictionary.)

	// Finally, method packDat() is invoked to minimize the size of this.dat and
	// reduce memory usage.

	private int head, tail;

	private DatMaker() {
		super(1);

		// initialize the double-linked list: head = 0, next = 1
		this.dat[0] = this.dat[1] = -1;
		this.head = this.tail = 0;
	}

	// mark element as used by modifying the double-linked list
	private void markAsUsed(int index) {
		// -base -> the previous element, -check -> the next element
		int base = this.dat[index << 1], check = this.dat[(index << 1) + 1];

		// if the element is already USED, fail loudly
		if (check >= 0) throw new RuntimeException("Cell reused! Index: " + index);

		// maintain the double-linked list
		if (base == -1) this.head = check;
		else this.dat[((-base) << 1) + 1] = check;
		if (check == -this.datSize) this.tail = base;
		else this.dat[(-check) << 1] = base;

		this.dat[(index << 1) + 1] = index; // positive check: element used
	}

	// expand size of this.dat
	private void expandDat() {
		int oldSize = this.datSize;

		// alloc & copy
		this.datSize *= 2;
		int[] newDat = new int[this.dat.length << 1];
		System.arraycopy(this.dat, 0, newDat, 0, this.dat.length);
		this.dat = newDat;

		// expand the double-linked list into the newly allocated elements
		for (int i = 0; i < oldSize; i++) {
			int pos = (oldSize + i) << 1;
			newDat[pos] = -(oldSize + i - 1);
			newDat[pos + 1] = -(oldSize + i + 1);
		}
		this.dat[oldSize << 1] = this.tail;
		this.dat[((-this.tail) << 1) + 1] = -oldSize;
		this.tail = -(oldSize * 2 - 1); // set tail to the last element
	}

	// remove unused elements to save memory
	private void packDat() {
		// calculate minimum size: index of the last USED element, plus one
		int last = this.datSize - 1;
		for (; this.dat[(last << 1) + 1] < 0; --last) ;
		this.datSize = last + 1;

		// truncate this.dat
		int[] newDat = new int[this.datSize << 1];
		System.arraycopy(this.dat, 0, newDat, 0, this.datSize << 1);
		this.dat = newDat;
	}

	// allocate elements according to offsets and return BASE
	private int allocate(List<Integer> offsets) {
		int size = offsets.size();
		int base = -this.head; // initialized to the head of the double-linked list
		while (true) {
			// expand this.dat as needed
			if (base == this.datSize) this.expandDat();
			if (size != 0) {
				// sorted, offsets.get(size - 1) is the greatest
				int requiredSize = base + offsets.get(size - 1);
				while (requiredSize >= this.datSize) this.expandDat();
			}

			boolean available = true; // check availability
			if (this.dat[(base << 1) + 1] >= 0) available = false; // ELEMENTS[BASE] USED
			else {
				// if any ELEMENTS[BASE + C] is USED, available = false
				int i = 0;
				for (; i < size && this.dat[((base + offsets.get(i)) << 1) + 1] < 0; i++) ;
				if (i < size) available = false;
			}

			if (available) { // if BASE is available, update double-linked list
				this.markAsUsed(base);
				for (int offset : offsets) this.markAsUsed(base + offset);

				return base;
			}

			// find next BASE to check availability
			int newBase = -this.dat[(base << 1) + 1];
			if (newBase == this.datSize) this.expandDat(); // ensure capacity
			base = newBase;
		}
	}

	// find characters in lexicon which might follow the prefix; relies on lexicon
	// being sorted, so all words sharing the prefix are contiguous from start
	private List<Integer> findChildren(List<Record> lexicon, int start, String prefix) {
		List<Integer> children = new ArrayList<>();
		int length = prefix.length(), currentChild = -1;
		for (int i = start, size = lexicon.size(); i < size; ++i) {
			String word = lexicon.get(i).word;
			if (!word.startsWith(prefix)) return children;
			if (word.length() == length) continue;
			int nextCh = word.charAt(length);
			if (nextCh != currentChild) children.add(currentChild = nextCh);
		}
		return children;
	}

	// populate BASE and CHECK FIELDS of allocated BASE and BASE + C
	// @param isWord Whether the end of a word has been reached.
	private int populate(int check, List<Integer> offsets, boolean isWord) {
		int base = this.allocate(offsets);

		this.dat[base << 1] = 0;
		this.dat[(base << 1) + 1] = isWord ? check : base;

		for (int offset : offsets) { // update Trie Tree
			int pos = (base + offset) << 1;
			this.dat[pos] = 0;
			this.dat[pos + 1] = check; // ELEMENTS[ELEMENTS[S].BASE + C].CHECK = S
		}
		this.dat[check << 1] = base; // ELEMENTS[CHECK].BASE = BASE

		return base;
	}

	// build the Dat structure with a word list as input
	private void buildDat(List<String> words) {
		// construct lexicon: a dummy record plus one record per input word
		Vector<Record> lexicon = new Vector<>();
		lexicon.add(new Record());
		for (int i = 0, size = words.size(); i < size; ++i)
			lexicon.add(new Record(words.get(i), i));
		lexicon.sort(RECORDS_COMPARATOR); // sort input

		// root elements
		this.dat[0] = this.populate(0, this.findChildren(lexicon, 0, ""), true);

		for (int i = 0, size = lexicon.size(); i < size; i++) {
			String word = lexicon.get(i).word;

			int off = this.getInfo(word);
			if (off <= 0) off = word.length(); // if dat already contains word

			// iterate through characters after offset and add new entries
			for (int offset = off; offset <= word.length(); offset++) {
				String prefix = word.substring(0, offset);
				int pBase = -this.getInfo(prefix); // should always be positive
				this.populate(pBase, this.findChildren(lexicon, i, prefix),
						offset == word.length()); // on word end
			}

			off = -this.getInfo(word); // should always be positive
			this.dat[this.dat[off << 1] << 1] = lexicon.get(i).num; // leaf node value
		}

		this.packDat();
	}
}
-------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/data/POCGraph.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.data; 2 | 3 | import java.util.Vector; 4 | 5 | /** 6 | * POC means Position Of Character, representing the possible positions 7 | * of a character in the segmented words.
8 | * {@code POCGraph} is a list of integers, possesses a length of {@code l} when generated 9 | * by processing a string of length {@code l}, therefore we get:
10 | * Let {@code graph} be an instance of {@code POCGraph}, and {@code l} be the length of 11 | * the graph. (retrieved by calling {@code graph.size()})
12 | * {@code graph.get(i)} ({@code 0 <= i < length}) is an integer calculated by bitwise 13 | * or-ing zero or more of the following constants:
14 | *
    15 | *
  • POC_B = 0x01: included if the character can be the beginning of a word.
  • 16 | *
  • POC_M = 0x02: included if the character can be the middle of a word.
  • 17 | *
  • POC_E = 0x04: included if the character can be the end of a word.
  • 18 | *
  • POC_S = 0x08: included if the character can be exactly one single world.
  • 19 | *
20 | * As pseudo-code:
21 | *
22 |  * int i = <index>;
23 |  * boolean canBeBeginning = input.canBeBeginning(i);
24 |  * boolean canBeMiddle    = input.canBeMiddle(i);
25 |  * boolean canBeEnd       = input.canBeEnd(i);
26 |  * boolean canBeSingle    = input.canBeSingle(i);
27 |  * int positions = (canBeBeginning ? POC_B : 0) |
28 |  *                 (canBeMiddle    ? POC_M : 0) |
29 |  *                 (canBeEnd       ? POC_E : 0) |
30 |  *                 (canBeSingle    ? POC_S : 0);
31 |  * graph[i] = positions;
32 |  * 
33 | * Note that the {@code POC_M} flag does not conflict with the other flags, e.g., a 34 | * {@code position} of {@code POC_M | POC_B} means that the character can either be the 35 | * middle or the beginning of a word. This applies also for {@code POC_S}, which 36 | * indicates that the character can form a single-character word.
37 | * The generation of {@code POCGraph} is mainly based on punctuations and line breaks, 38 | * but in various implementations also on characters that would certainly not be a part 39 | * of a word, such as whitespaces or numbers.
40 | * This class is merely a alias for {@linkplain java.util.Vector Vector<Integer>}, 41 | * indicating that instances of this class are used as only as the list of {@code POCs}, 42 | * no more behaviour is added. 43 | */ 44 | public class POCGraph extends Vector { 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/data/TaggedWord.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.data; 2 | 3 | /** 4 | * A class which represent a tagged word, that is, a word with a tag. 5 | */ 6 | public class TaggedWord { 7 | public String word; 8 | public String tag; 9 | public int startOffset; 10 | public int endOffset; 11 | 12 | public TaggedWord() { 13 | this.word = ""; 14 | } 15 | 16 | public TaggedWord(String word, String tag, int startOffset, int endOffset) { 17 | this.word = word; 18 | this.tag = tag; 19 | this.startOffset = startOffset; 20 | this.endOffset = endOffset; 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/io/IInputProvider.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.io; 2 | 3 | import java.io.IOException; 4 | import java.util.List; 5 | 6 | /** 7 | * An interface used to provide input for {@link org.thunlp.thulac.Thulac}. Implementations of this 8 | * interface should contain its own context, since {@link #provideInput()} does not 9 | * pass any kind of parameter. It is recommended that implementations read input from a 10 | * stream, e.g., from a file of from the console ({@code System.in}). 11 | */ 12 | public interface IInputProvider extends IProgramStateListener { 13 | /** 14 | * Provide a {@link java.util.List} of {@link String} which contains the input for the 15 | * segmentation program to process. 
By contract, the return value of this method, 16 | * joined with whitespaces (U+0020) should logically represent a line from the input, 17 | * though this is not compulsory. A {@code null} return value will be regarded as 18 | * an EOF and the program will terminate. A {@link java.util.List} is used because it is 19 | * recommended to split an enormous line into separate line segments based on the 20 | * punctuations. 21 | * 22 | * @return The input to the segmentation program. 23 | */ 24 | List provideInput() throws IOException; 25 | } 26 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/io/IOutputHandler.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.io; 2 | 3 | import org.thunlp.thulac.data.TaggedWord; 4 | 5 | import java.io.IOException; 6 | import java.util.List; 7 | 8 | /** 9 | * An interface used to handle the output from the segmentation program. The whole 10 | * handling process is based on lines, though its extending the 11 | * {@link IProgramStateListener} allows it to listen the starting and termination 12 | * events of the program, therefore implementations should also concentrate on lines. 13 | */ 14 | public interface IOutputHandler extends IProgramStateListener { 15 | /** 16 | * Handles the {@link java.util.List} of {@link org.thunlp.thulac.data.TaggedWord} generated by the segmentation 17 | * program. Since one input line might be split into multiple line segments, 18 | * this method might be invoked several times between a pair of 19 | * {@link #handleLineStart()} and {@link #handleLineEnd()}. Traditionally, the 20 | * param {@code word} of all the invocations of this methods between a pair of 21 | * {@link #handleLineEnd()} and {@link #handleLineEnd()} come from the same line of 22 | * input, and the output handler should output to the same line as well, however 23 | * this is not compulsory. 
24 | * 25 | * @param words 26 | * The {@link java.util.List} of {@link org.thunlp.thulac.data.TaggedWord} generated processing one line segment. 27 | * @param segOnly 28 | * Whether to output without tags. 29 | * @param separator 30 | * The separator between output words and tags. 31 | */ 32 | void handleLineSegment(List words, boolean segOnly, char separator) 33 | throws IOException; 34 | 35 | /** 36 | * Called when an input line is obtained from {@link IInputProvider} and the 37 | * segmentation program is about to begin breaking the line into segments. This 38 | * method is basically for initializations, e.g., creating new line, etc.
39 | * This method is invoked before {@link #handleLineSegment(java.util.List, boolean, char)}. 40 | */ 41 | void handleLineStart() throws IOException; 42 | 43 | /** 44 | * Called when segmentation of an input line is finished and the segmentation 45 | * program is about to begin processing the next line. This method is basically for 46 | * finalisation, e.g., flushing input of this line, etc.
47 | * This method is invoked after {@link #handleLineSegment(java.util.List, boolean, char)}. 48 | */ 49 | void handleLineEnd() throws IOException; 50 | } 51 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/io/IProgramStateListener.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.io; 2 | 3 | /** 4 | * An interface used to listen to the starting and termination events of the 5 | * segmentation program. 6 | */ 7 | public interface IProgramStateListener { 8 | /** 9 | * Called when the segmentation program starts. 10 | */ 11 | void onProgramStart(); 12 | 13 | /** 14 | * Called when the segmentation program terminates. (in finally block) 15 | */ 16 | void onProgramEnd(); 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/io/ReaderInputProvider.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.io; 2 | 3 | import org.thunlp.thulac.util.IOUtils; 4 | 5 | import java.io.BufferedReader; 6 | import java.io.IOException; 7 | import java.util.List; 8 | 9 | /** 10 | * An implementation of {@link IInputProvider} which retrieves input from a 11 | * {@link java.io.BufferedReader}. 
12 | */ 13 | public class ReaderInputProvider implements IInputProvider { 14 | private BufferedReader reader; 15 | 16 | public ReaderInputProvider(BufferedReader reader) { 17 | // reader must be non-null 18 | if (reader == null) throw new IllegalArgumentException("reader == null!"); 19 | this.reader = reader; 20 | } 21 | 22 | @Override 23 | public List provideInput() throws IOException { 24 | String line = this.reader.readLine(); 25 | if (line == null) return null; 26 | return IOUtils.getLineSegments(line); 27 | } 28 | 29 | @Override 30 | public void onProgramStart() { 31 | } 32 | 33 | @Override 34 | public void onProgramEnd() { 35 | try { 36 | this.reader.close(); // release system resources 37 | } catch (IOException e) { 38 | e.printStackTrace(); 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/io/StringInputProvider.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.io; 2 | 3 | import org.thunlp.thulac.util.IOUtils; 4 | 5 | import java.io.IOException; 6 | import java.util.List; 7 | 8 | /** 9 | * An implementation of {@link IInputProvider} which retrieves input from a {@link 10 | * String}. 
11 | */ 12 | public class StringInputProvider implements IInputProvider { 13 | private String[] lines; 14 | private int pointer; 15 | 16 | public StringInputProvider(String input) { 17 | // input must be non-null 18 | if (input == null) throw new IllegalArgumentException("input == null!"); 19 | this.lines = input.split("\n"); // empty lines are discarded 20 | this.pointer = 0; 21 | } 22 | 23 | @Override 24 | public void onProgramStart() { 25 | } 26 | 27 | @Override 28 | public void onProgramEnd() { 29 | } 30 | 31 | @Override 32 | public List provideInput() throws IOException { 33 | if (this.pointer == this.lines.length) return null; 34 | return IOUtils.getLineSegments(this.lines[pointer++]); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/io/StringOutputHandler.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.io; 2 | 3 | import org.thunlp.thulac.data.TaggedWord; 4 | 5 | import java.io.IOException; 6 | import java.util.List; 7 | 8 | /** 9 | * An implementation of {@link IOutputHandler} to allow access to the output in form of 10 | * {@link String}. 
11 | */ 12 | public class StringOutputHandler implements IOutputHandler { 13 | private StringBuilder str; 14 | 15 | public StringOutputHandler() { 16 | this.str = new StringBuilder(); 17 | } 18 | 19 | @Override 20 | public void onProgramStart() { 21 | } 22 | 23 | @Override 24 | public void onProgramEnd() { 25 | } 26 | 27 | @Override 28 | public void handleLineSegment(List words, 29 | boolean segOnly, char separator) { 30 | if (segOnly) { 31 | for (TaggedWord word : words) { 32 | this.str.append(word.word); 33 | this.str.append(' '); 34 | } 35 | } else { 36 | for (TaggedWord word : words) { 37 | this.str.append(word.word); 38 | this.str.append(separator); 39 | this.str.append(word.tag); 40 | this.str.append(' '); 41 | } 42 | } 43 | } 44 | 45 | @Override 46 | public void handleLineStart() throws IOException { 47 | } 48 | 49 | @Override 50 | public void handleLineEnd() throws IOException { 51 | this.str.append("\n"); 52 | } 53 | 54 | public String getString() { 55 | return this.str.toString(); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/io/WriterOutputHandler.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.io; 2 | 3 | import org.thunlp.thulac.data.TaggedWord; 4 | 5 | import java.io.BufferedWriter; 6 | import java.io.IOException; 7 | import java.util.List; 8 | 9 | /** 10 | * An implementation of {@link IOutputHandler} which writes output to a {@link 11 | * java.io.BufferedWriter}. 
12 | */ 13 | public class WriterOutputHandler implements IOutputHandler { 14 | private BufferedWriter writer; 15 | private StringBuilder sb; 16 | 17 | public WriterOutputHandler(BufferedWriter writer) { 18 | // writer must be non-null 19 | if (writer == null) throw new IllegalArgumentException("writer == null!"); 20 | this.writer = writer; 21 | this.sb = new StringBuilder(); 22 | } 23 | 24 | @Override 25 | public void handleLineSegment(List words, boolean segOnly, char separator) 26 | throws IOException { 27 | if (segOnly) { 28 | for (TaggedWord word : words) { 29 | this.sb.append(word.word); 30 | this.sb.append(' '); 31 | } 32 | } else { 33 | for (TaggedWord word : words) { 34 | this.sb.append(word.word); 35 | this.sb.append(separator); 36 | this.sb.append(word.tag); 37 | this.sb.append(' '); 38 | } 39 | } 40 | } 41 | 42 | @Override 43 | public void handleLineStart() throws IOException { 44 | this.sb.setLength(0); 45 | } 46 | 47 | @Override 48 | public void handleLineEnd() throws IOException { 49 | this.sb.append("\n"); 50 | this.writer.write(this.sb.toString()); 51 | } 52 | 53 | @Override 54 | public void onProgramStart() { 55 | } 56 | 57 | @Override 58 | public void onProgramEnd() { 59 | try { 60 | this.writer.close(); // release system resources 61 | } catch (IOException e) { 62 | e.printStackTrace(); 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/main/Main.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.main; 2 | 3 | import org.thunlp.thulac.Thulac; 4 | import org.thunlp.thulac.io.IInputProvider; 5 | import org.thunlp.thulac.io.IOutputHandler; 6 | import org.thunlp.thulac.util.IOUtils; 7 | 8 | import java.io.IOException; 9 | 10 | /** 11 | * The program entrance which deals with command line arguments. 
12 | */ 13 | public class Main { 14 | public static void main(String[] args) throws IOException { 15 | String modelDir = "models/"; 16 | char separator = '_'; 17 | String userDict = null; 18 | boolean useT2S = false; 19 | boolean segOnly = false; 20 | boolean useFilter = false; 21 | IInputProvider input = null; 22 | IOutputHandler output = null; 23 | 24 | for (int c = 0; c < args.length; ++c) 25 | switch (args[c]) { 26 | case "-t2s": 27 | useT2S = true; 28 | break; 29 | case "-user": 30 | userDict = args[++c]; 31 | break; 32 | case "-deli": 33 | separator = args[++c].charAt(0); 34 | break; 35 | case "-seg_only": 36 | segOnly = true; 37 | break; 38 | case "-filter": 39 | useFilter = true; 40 | break; 41 | case "-model_dir": 42 | modelDir = args[++c]; 43 | if (modelDir.charAt(modelDir.length() - 1) != '/') 44 | modelDir += '/'; 45 | break; 46 | case "-input": 47 | input = IOUtils.inputFromFile(args[++c]); // use UTf-8 48 | break; 49 | case "-output": 50 | output = IOUtils.outputToFile(args[++c]); // use UTF-8 51 | break; 52 | } 53 | if (input == null) input = IOUtils.inputFromConsole(); 54 | if (output == null) output = IOUtils.outputToConsole(); 55 | 56 | Thulac.split(modelDir, separator, userDict, useT2S, segOnly, useFilter, 57 | input, output); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/postprocess/DictionaryPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.postprocess; 2 | 3 | import org.thunlp.thulac.data.Dat; 4 | import org.thunlp.thulac.data.DatMaker; 5 | import org.thunlp.thulac.data.TaggedWord; 6 | 7 | import java.io.IOException; 8 | import java.util.List; 9 | 10 | /** 11 | * A postprocess pass which scans the word list, extract words that are found in the 12 | * dictionary and tag them.
13 | * To show its behavior more clearly, we raise the following example:
14 | * Assume that the input {@code sentence} is {@code "A", "B", "C", "DE"}, and the word 15 | * list specified by {@link #dictionary} is {@code "AB", "ABC", "ABCD"}.
16 | * The {@link #process(java.util.List)} method tends to find the longest concatenation of words 17 | * in the word list which exists in the dictionary and combine these words into one 18 | * single {@link TaggedWord}.
19 | * So, as for this example, all concatenations of words in the list beginning from 20 | * index 0 would be: {@code "A", "AB", "ABC", "ABCDE"}, in which only {@code "AB"} and 21 | * {@code "ABC"} is present in {@link #dictionary}.
22 | * In this case, the longest concatenation would be {@code "ABC"} and therefore the 23 | * words {@code "A", "B", "C"} are removed and one single word {@code "ABC"} is added 24 | * to the word list, which makes the final output from {@link #process(java.util.List)} {@code 25 | * "ABC", "DE"}.
26 | * Please notice that although {@code "ABCD"} exists in {@link #dictionary}, the 27 | * {@link #process(java.util.List)} method will not attempt to split whole words apart. 28 | */ 29 | public class DictionaryPass implements IPostprocessPass { 30 | 31 | private Dat dictionary; 32 | private String tag; 33 | 34 | public DictionaryPass(String dictFile, String tag, boolean isTxt) 35 | throws IOException { 36 | this.tag = tag; 37 | if (isTxt) this.dictionary = DatMaker.readFromTxtFile(dictFile); 38 | else this.dictionary = new Dat(dictFile); 39 | } 40 | 41 | @Override 42 | public void process(List sentence) { 43 | if (this.dictionary == null || sentence.isEmpty()) return; 44 | 45 | for (int i = 0, size = sentence.size(); i < size; i++) { 46 | // search for longest concatenation which exists in dict 47 | StringBuilder sb = new StringBuilder(); 48 | String longest = null, current; 49 | int longestIndex = -1; 50 | for (int j = i; j < size; j++) { 51 | current = sb.append(sentence.get(j).word).toString(); 52 | if (!this.dictionary.containsPrefix(current)) break; 53 | if (this.dictionary.contains(current)) { 54 | longest = current; 55 | longestIndex = j; 56 | } 57 | } 58 | 59 | // if found, combine the words and update the sentence 60 | if (longest == null) continue; 61 | Integer startOffset = sentence.get(i).startOffset; 62 | sentence.set(i, new TaggedWord(longest,tag, startOffset,startOffset+longest.length())); 63 | for (int j = longestIndex; j > i; --j) sentence.remove(j); 64 | size = sentence.size(); 65 | } 66 | } 67 | 68 | } 69 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/postprocess/DoubleWordPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.postprocess; 2 | 3 | import org.thunlp.thulac.data.TaggedWord; 4 | import org.thunlp.thulac.util.StringUtils; 5 | 6 | import java.util.List; 7 | 8 | import static 
org.thunlp.thulac.util.CodePointUtils.SPECIAL_CHARS; 9 | 10 | /** 11 | * A postprocess pass combining adjacent words which can form a double word together. 12 | * 13 | * @see #canFormDoubleWord(String, String) 14 | */ 15 | public class DoubleWordPass implements IPostprocessPass { 16 | @Override 17 | public void process(List sentence) { 18 | if (sentence.size() <= 1) return; 19 | 20 | TaggedWord tagged, last = sentence.get(sentence.size() - 1); 21 | for (int i = sentence.size() - 2; i >= 0; --i, last = tagged) { 22 | tagged = sentence.get(i); 23 | if (this.canFormDoubleWord(tagged.word, last.word)) { 24 | tagged.word += last.word; 25 | sentence.remove(i + 1); 26 | } 27 | } 28 | } 29 | 30 | /** 31 | * Two words can form a double word if and only of:
32 | *
    33 | *
  • Both words contain only one code points and,
  • 34 | *
  • The only code points in both words are identical and,
  • 35 | *
  • This code point is not a {@linkplain org.thunlp.thulac.util.CodePointUtils#SPECIAL_CHARS 36 | * special character}.
  • 37 | *
38 | * 39 | * @param first 40 | * The first word. 41 | * @param second 42 | * The second word. 43 | * 44 | * @return If the two words can form a double word. 45 | */ 46 | private boolean canFormDoubleWord(String first, String second) { 47 | if (StringUtils.codePointCount(first) != 1 || 48 | StringUtils.codePointCount(second) != 1) return false; 49 | int firstCP = first.codePointAt(0); 50 | int secondCP = second.codePointAt(0); 51 | return firstCP == secondCP && SPECIAL_CHARS.indexOf(firstCP) == -1; 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/postprocess/FilterPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.postprocess; 2 | 3 | import org.thunlp.thulac.data.Dat; 4 | import org.thunlp.thulac.data.TaggedWord; 5 | import org.thunlp.thulac.util.StringUtils; 6 | 7 | import java.io.IOException; 8 | import java.util.Arrays; 9 | import java.util.HashSet; 10 | import java.util.List; 11 | import java.util.Set; 12 | 13 | import static org.thunlp.thulac.util.CodePointUtils.CHINESE_DIGITS; 14 | import static org.thunlp.thulac.util.CodePointUtils.DIGITS; 15 | 16 | /** 17 | * A postprocess pass which filters forbidden tags from the the word list. 18 | */ 19 | public class FilterPass implements IPostprocessPass { 20 | /** 21 | * Tags allowed to pass the filter. Words with tags out of this list will be 22 | * discarded. 23 | */ 24 | private static final Set ALLOWED_TAGS = new HashSet<>(Arrays.asList( 25 | "n", "np", "ns", "ni", "nz", "v", "a", "id", "t", "uw")); 26 | 27 | private Dat xuDat; 28 | private Dat timeDat; 29 | 30 | public FilterPass(String xuDatFile, String timeDatFile) throws IOException { 31 | this.xuDat = new Dat(xuDatFile); 32 | this.timeDat = new Dat(timeDatFile); 33 | } 34 | 35 | /** 36 | * Returns {@code true} is one of the following is true:
37 | *
    38 | *
  • Word contains one or more normal digits.
  • 39 | *
  • Word contains two or more Chinese digits.
  • 40 | *
  • Word is in dictionary specified by {@link #timeDat}.
  • 41 | *
42 | * 43 | * @param word 44 | * The word to check. 45 | * 46 | * @return Whether the word contains number digits. 47 | */ 48 | private boolean hasNumber(String word) { 49 | int count = 0; 50 | for (int c : StringUtils.toCodePoints(word)) 51 | if (DIGITS.indexOf(c) != -1) return true; 52 | else if (CHINESE_DIGITS.indexOf(c) != -1 && count++ != 0) return true; 53 | return this.timeDat.contains(word); 54 | } 55 | 56 | /** 57 | * Remove words in segmented word list if one of the following is true:
58 | *
    59 | *
  • Tag of word not in {@link #ALLOWED_TAGS}.
  • 60 | *
  • Word in dictionary specified by {@link #timeDat}.
  • 61 | *
  • Word has tag "t" and {@linkplain #hasNumber(String) hasNumber(word)} 62 | * returns {@code true}.
  • 63 | *
64 | * 65 | * @param sentence 66 | * The sentence to filter. 67 | */ 68 | @Override 69 | public void process(List sentence) { 70 | if (this.xuDat == null || this.timeDat == null || sentence.isEmpty()) return; 71 | 72 | for (int i = sentence.size() - 1; i >= 0; --i) { 73 | String word = sentence.get(i).word; 74 | String tag = sentence.get(i).tag; 75 | if (!ALLOWED_TAGS.contains(tag) || this.xuDat.contains(word) || 76 | ("t".equals(tag) && this.hasNumber(word))) sentence.remove(i); 77 | } 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/postprocess/IPostprocessPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.postprocess; 2 | 3 | import org.thunlp.thulac.data.TaggedWord; 4 | 5 | import java.util.List; 6 | 7 | /** 8 | * An interface which process the list of {@link TaggedWord} after segmentation. 9 | */ 10 | public interface IPostprocessPass { 11 | /** 12 | * Process the list of {@link TaggedWord}. 13 | * 14 | * @param sentence 15 | * The list of {@link TaggedWord}. 16 | */ 17 | void process(List sentence); 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/postprocess/NegWordPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.postprocess; 2 | 3 | import org.thunlp.thulac.data.Dat; 4 | import org.thunlp.thulac.data.TaggedWord; 5 | import org.thunlp.thulac.util.StringUtils; 6 | 7 | import java.io.IOException; 8 | import java.util.List; 9 | 10 | /** 11 | * A postprocess pass which recognises certain negative phrases (for example, "not good 12 | * enough" in English) and separate the negative word from the rest parts in the phrase 13 | * (in this example, "not good" is converted into "not" and "good enough") and give the 14 | * separated parts their respective tags. 
A {@link org.thunlp.thulac.data.Dat} file stores the list of negative 15 | * phrases to be separated by {@link #process(java.util.List)}. 16 | */ 17 | public class NegWordPass implements IPostprocessPass { 18 | private Dat negPhrases; 19 | 20 | public NegWordPass(String negDatFile) throws IOException { 21 | this.negPhrases = new Dat(negDatFile); 22 | } 23 | 24 | @Override 25 | public void process(List sentence) { 26 | if (this.negPhrases == null || sentence.isEmpty()) return; 27 | 28 | for (int i = sentence.size() - 1; i >= 0; --i) { 29 | TaggedWord tagged = sentence.get(i); 30 | if (this.negPhrases.contains(tagged.word)) { 31 | int[] codePoints = StringUtils.toCodePoints(tagged.word); 32 | String word = StringUtils.toString(codePoints, 1, codePoints.length - 1); 33 | sentence.add(i + 1, new TaggedWord(word, "v",tagged.startOffset+1,tagged.endOffset)); 34 | sentence.get(i).endOffset = sentence.get(i).startOffset +1; 35 | tagged.word = StringUtils.toString(codePoints[0]); 36 | tagged.tag = "d"; 37 | } 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/postprocess/SpecialPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.postprocess; 2 | 3 | import org.thunlp.thulac.data.TaggedWord; 4 | 5 | import java.util.List; 6 | 7 | /** 8 | * A postprocess path which deals with special cases. 9 | */ 10 | public class SpecialPass implements IPostprocessPass { 11 | @Override 12 | public void process(List sentence) { 13 | this.filterHTTPURLs(sentence); 14 | } 15 | 16 | /** 17 | * Tag "x" for HTTP URLs.
18 | * HTTP URLs are identified as is, if the word is longer than 4 characters and 19 | * starts with "http". (to conform with both {@code http} and {@code https} schemes) 20 | * 21 | * @param sentence 22 | * The input sentence. 23 | */ 24 | private void filterHTTPURLs(List sentence) { 25 | for (TaggedWord tagged : sentence) 26 | if (tagged.word.length() >= 5 && tagged.word.startsWith("http")) 27 | tagged.tag = "x"; 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/postprocess/TimeWordPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.postprocess; 2 | 3 | import org.thunlp.thulac.data.TaggedWord; 4 | import org.thunlp.thulac.util.StringUtils; 5 | 6 | import java.util.List; 7 | 8 | import static org.thunlp.thulac.util.CodePointUtils.DIGITS; 9 | import static org.thunlp.thulac.util.CodePointUtils.generate; 10 | 11 | /** 12 | * A postprocess pass which combine words which together represent a time period into 13 | * one word.
14 | * For example, for input word list {@code "A", "B", "C1", "2", "34" "year"} ("year" 15 | * here can by any Chinese time unit in {@link #TIME_UNITS}), the output should be: 16 | * {@code "A", "B", "C1", "234year"}.
17 | * It can be seen that {@code "C1"} is not concatenated to {@code "234year"}, since it 18 | * contains non-digit characters.
19 | * Please notice that this class is able to deal with full-width numbers like U+FF10 20 | * (full-width digit 1) yet not Chinese digits like U+3007 (Chinese for "one"). 21 | */ 22 | public class TimeWordPass implements IPostprocessPass { 23 | /** 24 | * Chinese characters which represent time units: (description in English)
25 | * YEAR: U+5E74, MONTH: U+6708, DAY: U+65E5 & U+53F7, HOUR: U+65F6 & U+70B9, 26 | * MINUTE: U+5206, SECOND: U+79D2. 27 | */ 28 | private static final String TIME_UNITS = generate('\u5E74', '\u6708', '\u65E5', 29 | '\u53F7', '\u65F6', '\u70B9', '\u5206', '\u79D2'); 30 | 31 | /** 32 | * {@code word} is a number if all the code points in {@code word} is a 33 | * {@linkplain org.thunlp.thulac.util.CodePointUtils#DIGITS digit}. 34 | * 35 | * @param word 36 | * The word to check. 37 | * 38 | * @return Whether this {@code word} is a number. 39 | */ 40 | private boolean isNumber(String word) { 41 | for (int codePoint : StringUtils.toCodePoints(word)) 42 | if (DIGITS.indexOf(codePoint) == -1) return false; 43 | return true; 44 | } 45 | 46 | /** 47 | * {@code word} is a time unit if and only if: {@code word} contains only ont code 48 | * point and this code point is a {@linkplain #TIME_UNITS time unit}. 49 | * 50 | * @param word 51 | * The word to check. 52 | * 53 | * @return Whether this {@code word} is a time unit. 
54 | */ 55 | private boolean isTimeUnit(String word) { 56 | return StringUtils.codePointCount(word) == 1 && 57 | TIME_UNITS.indexOf(word.codePointAt(0)) != -1; 58 | } 59 | 60 | @Override 61 | public void process(List sentence) { 62 | boolean isTimeWord = false; 63 | for (int i = sentence.size() - 1; i >= 0; i--) { 64 | TaggedWord tagged = sentence.get(i); 65 | if (this.isTimeUnit(tagged.word)) isTimeWord = true; 66 | else if (isTimeWord && this.isNumber(tagged.word)) { 67 | tagged.word += sentence.remove(i + 1).word; 68 | tagged.tag = "t"; 69 | } else isTimeWord = false; 70 | } 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/postprocess/VerbPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.postprocess; 2 | 3 | import org.thunlp.thulac.data.Dat; 4 | import org.thunlp.thulac.data.TaggedWord; 5 | 6 | import java.io.IOException; 7 | import java.util.List; 8 | 9 | /** 10 | * A postprocess pass which identifies Dictionary Verbs and Directional Verbs. 
11 | * 12 | * @see Dictionary Verb 13 | * @see 14 | * Dictionary Verb in Chinese 15 | * @see 16 | * Directional Verb 17 | * @see sentence) { 49 | if (this.vM == null || this.vD == null || sentence.isEmpty()) return; 50 | 51 | TaggedWord last = sentence.get(0), tagged; 52 | for (int i = 1, size = sentence.size(); i < size; i++, last = tagged) { 53 | tagged = sentence.get(i + 1); 54 | if (this.tag.equals(last.tag) && this.tag.equals(tagged.tag)) 55 | if (this.vM.contains(last.word)) tagged.tag = "vm"; 56 | else if (this.vD.contains(tagged.word)) tagged.tag = "vd"; 57 | } 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/preprocess/ConvertT2SPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.preprocess; 2 | 3 | import org.thunlp.thulac.data.POCGraph; 4 | import org.thunlp.thulac.util.StringUtils; 5 | 6 | import java.io.DataInputStream; 7 | import java.io.File; 8 | import java.io.FileInputStream; 9 | import java.io.IOException; 10 | import java.util.HashMap; 11 | 12 | /** 13 | * A preprocess pass which convert traditional Chinese characters to simplified ones, 14 | * used when switch {@code -t2s} exists in the command line. 
15 | */ 16 | public class ConvertT2SPass implements IPreprocessPass { 17 | private HashMap t2sMap; 18 | 19 | public ConvertT2SPass(String fileName) throws IOException { 20 | this.t2sMap = new HashMap<>(); 21 | this.loadT2SMap(fileName); 22 | } 23 | 24 | private void loadT2SMap(String filename) throws IOException { 25 | // TODO: adapt NIO 26 | 27 | File mapFile = new File(filename); 28 | // t2s map format: recordCount * DWORD traditional + 29 | // recordCount * DWORD simplified 30 | // -> 8 * recordCount bytes in total 31 | int recordCount = (int) (mapFile.length() >> 3); 32 | 33 | DataInputStream input = new DataInputStream(new FileInputStream(mapFile)); 34 | int[] traditional = new int[recordCount]; // cache 35 | for (int i = 0; i < recordCount; ++i) traditional[i] = input.readInt(); 36 | for (int i = 0; i < recordCount; ++i) { 37 | int simplified = input.readInt(); 38 | this.t2sMap.put(traditional[i], simplified); 39 | } 40 | input.close(); 41 | } 42 | 43 | private int getSimplifiedCodePoint(int c) { 44 | if (this.t2sMap.containsKey(c)) return this.t2sMap.get(c); 45 | return c; 46 | } 47 | 48 | private String convertT2S(String sentence) { 49 | int[] codePoints = StringUtils.toCodePoints(sentence); 50 | StringBuilder sb = new StringBuilder(); 51 | for (int codePoint : codePoints) 52 | sb.appendCodePoint(this.getSimplifiedCodePoint(codePoint)); 53 | return sb.toString(); 54 | } 55 | 56 | @Override 57 | public String process(String raw, POCGraph ignored) { 58 | return this.convertT2S(raw); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/preprocess/IPreprocessPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.preprocess; 2 | 3 | import org.thunlp.thulac.data.POCGraph; 4 | 5 | /** 6 | * An interface which process the raw {@link String} before segmentation. 
7 | */ 8 | public interface IPreprocessPass { 9 | /** 10 | * Process the raw {@link String}. 11 | * 12 | * @param raw 13 | * The raw {@link String} to process. 14 | * @param graph 15 | * The {@link org.thunlp.thulac.data.POCGraph} to write to. 16 | * 17 | * @return The processed {@link String}. 18 | */ 19 | String process(String raw, POCGraph graph); 20 | } 21 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/preprocess/PreProcessPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.preprocess; 2 | 3 | import org.thunlp.thulac.data.POCGraph; 4 | import org.thunlp.thulac.util.StringUtils; 5 | 6 | import static org.thunlp.thulac.util.CodePointUtils.SPECIAL_CHARS; 7 | import static org.thunlp.thulac.util.CodePointUtils.WHITESPACE_CHARS; 8 | 9 | /** 10 | * A preprocess pass which cleans raw input up. 11 | */ 12 | public class PreProcessPass implements IPreprocessPass { 13 | // TODO: add more documentation 14 | 15 | private static final String SINGLE_PUNCTUATION_CODE_POINTS = StringUtils.toString( 16 | 65292, 12290, 65311, 65281, 65306, 65307, 8216, 8217, 8220, 8221, 1230, 12304, 17 | 12305, 12289, 12298, 12299, 64, 35, 65288, 65289, 34, 91, 93, 126, 47, 44, 58, 18 | 63, 9700, 9734, 9733, 8230, 39, 33, 42, 43, 62, 40, 41, 59, 61); 19 | 20 | private boolean isSinglePunctuation(int c) { 21 | return SINGLE_PUNCTUATION_CODE_POINTS.indexOf(c) != -1; 22 | } 23 | 24 | private String cleanup(String sentence, POCGraph graph) { 25 | StringBuilder cleaned = new StringBuilder(); 26 | graph.clear(); 27 | boolean spaceFlag = false, otherFlag = false, 28 | singlePunctuationFlag = false, titleFlag = false; 29 | 30 | int titleStart = 0; 31 | int[] codePoints = StringUtils.toCodePoints(sentence); 32 | for (int c : codePoints) { 33 | if (WHITESPACE_CHARS.indexOf(c) != -1) { 34 | otherFlag = false; 35 | if (spaceFlag) continue; 36 | if (!graph.isEmpty()) 37 
| graph.setElementAt(graph.lastElement() & 12, graph.size() - 1); 38 | spaceFlag = true; 39 | continue; 40 | } 41 | 42 | cleaned.appendCodePoint(c); 43 | if (SPECIAL_CHARS.indexOf(c) != -1) { 44 | if (spaceFlag) { 45 | singlePunctuationFlag = this.isSinglePunctuation(c); 46 | graph.add(singlePunctuationFlag ? 8 : 9); 47 | spaceFlag = false; 48 | } else { 49 | if (otherFlag) { 50 | if (this.isSinglePunctuation(c)) { 51 | if (!graph.isEmpty()) graph.setElementAt( 52 | graph.lastElement() & 12, graph.size() - 1); 53 | graph.add(8); 54 | } else if (singlePunctuationFlag) graph.add(9); 55 | else { 56 | if (!graph.isEmpty() && graph.lastElement() == 0) 57 | graph.setElementAt(7, graph.size() - 1); 58 | graph.add(2); 59 | } 60 | } else graph.add(9); 61 | singlePunctuationFlag = this.isSinglePunctuation(c); 62 | } 63 | otherFlag = true; 64 | 65 | if (c == 12298) titleStart = graph.size(); 66 | else if (c == 12299 && titleFlag) { 67 | int titleEnd = graph.size() - 2; 68 | if (titleEnd <= titleStart + 9) 69 | if (titleStart == titleEnd) graph.setElementAt(9, titleStart); 70 | else { 71 | graph.setElementAt(1, titleStart); 72 | for (int i = titleStart + 1; i < titleEnd; ++i) 73 | graph.setElementAt(2, i); 74 | graph.setElementAt(4, titleEnd); 75 | } 76 | } 77 | titleFlag = c == 12298; 78 | } else { 79 | if (spaceFlag) graph.add(9); 80 | else if (otherFlag) { 81 | graph.setElementAt(graph.lastElement() & 12, graph.size() - 1); 82 | graph.add(9); 83 | singlePunctuationFlag = false; 84 | } else graph.add(15); 85 | spaceFlag = false; 86 | otherFlag = false; 87 | } 88 | } 89 | 90 | // deal with first & last character 91 | if (!graph.isEmpty()) { 92 | int first = graph.firstElement() & 9, last = graph.lastElement() & 12; 93 | graph.setElementAt(first == 0 ? 9 : first, 0); 94 | graph.setElementAt(last == 0 ? 
/**
 * An utility class which deals with buffers.
 *
 * @see java.nio.Buffer
 */
public class BufferUtils {
	/**
	 * Read ints from {@code channel} using {@code buf} as buffer, putting them
	 * sequentially into the arrays of {@code int[]} represented by {@code arrays}.
	 * {@code buf} is ready for writing (cleared) when this method returns {@code
	 * true}; {@code channel} is NOT closed after this method returns (since the EOF
	 * might not have been reached yet), therefore users should close it manually.
	 *
	 * <p>Fix over the previous version: when a read ends in the middle of a 4-byte
	 * int (possible whenever {@code buf}'s capacity is not a multiple of 4), the
	 * trailing partial bytes are now carried over to the next read instead of being
	 * silently discarded by {@code buf.clear()}, which corrupted every int read
	 * afterwards.
	 *
	 * @param channel
	 * 		The {@link SeekableByteChannel} to read from.
	 * @param buf
	 * 		The {@link ByteBuffer} to use as buffer; its capacity must be at least
	 * 		{@link Integer#BYTES} (4), otherwise no progress can ever be made.
	 * @param arrays
	 * 		The array of {@code int[]} to store the read ints.
	 *
	 * @return {@code true} if all the arrays were successfully filled with data read
	 * from {@code channel}; {@code false} if EOF was reached before all the arrays
	 * were filled. In the special case that all arrays are filled exactly when EOF
	 * is reached, {@code true} is returned.
	 *
	 * @throws IOException
	 * 		If an exception is thrown while reading from {@code channel}.
	 * @throws NullPointerException
	 * 		If {@code channel}, {@code buf}, or any element of {@code arrays} is null.
	 */
	public static boolean readInts(
			SeekableByteChannel channel, ByteBuffer buf, int[]... arrays)
			throws IOException {
		int position = 0, offset = 0;
		int[] current = arrays[position];
		int currentLeft = current.length;

		while (true) {
			// read buffer; if EOF is reached while arrays remain unfilled, fail
			if (channel.read(buf) == -1) return false;
			buf.flip();
			IntBuffer intBuf = buf.asIntBuffer();
			// number of whole ints available in the buffer right now
			int availableInts = intBuf.remaining();
			int consumedInts = availableInts;

			// copy buffer content to arrays
			while (availableInts > 0) {
				int getLen = Math.min(availableInts, currentLeft);
				intBuf.get(current, offset, getLen);
				offset += getLen;
				availableInts -= getLen;
				currentLeft -= getLen;

				if (currentLeft == 0) { // current array is filled, move to the next
					++position;
					if (position == arrays.length) { // all arrays have been filled
						buf.clear();
						return true;
					}
					current = arrays[position];
					offset = 0;
					currentLeft = current.length;
				}
			}

			// Preserve any trailing partial int (1-3 bytes): skip past the bytes
			// consumed as whole ints and compact the remainder to the buffer start,
			// so the next channel.read() appends right after it.
			buf.position(consumedInts * Integer.BYTES);
			buf.compact();
		}
	}
}
-------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/util/CodePointUtils.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.util; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | /** 7 | * An utility class providing definitions for many sets of code points. 8 | */ 9 | public class CodePointUtils { 10 | /** 11 | * ASCII and full-width digits. 12 | */ 13 | public static final String DIGITS = 14 | generate(range('0', '9'), range('\uFF10', '\uFF19')); 15 | 16 | /** 17 | * Chinese digits. 18 | */ 19 | public static final String CHINESE_DIGITS = generate('\u3007', '\u4E00', '\u4E8C', 20 | '\u4E09', '\u56DB', '\u4E94', '\u516D', '\u4E03', '\u516B', '\u4E5D'); 21 | 22 | /** 23 | * Special characters, containing:
24 | *
    25 | *
  • Chinese full-width punctuations:
    26 | * U+FF0C: Comma, U+3002: Full Stop, U+FF1F: Question Mark, U+FF01: Exclamation 27 | * Mark, U+FF1A: Colon, U+FF1B: Semicolon, U+3010 & U+3011: Brackets, U+3001: 28 | * Ideographic Comma, U+300A & U+300B: Guillemets, U+FF08 & U+FF09: Parentheses. 29 | *
  • 30 | *
  • Standard punctuations:
    31 | * U+2018 & U+2019: Single Quotation Marks,U+201C & U+201D: Double Quotation 32 | * Marks, U+00B7: Middle Point, U+2026: Horizontal Ellipsis, U+2014: Em Dash. 33 | *
  • 34 | *
  • Special characters: 35 | * U+FFE5: Full-width Yen Sign, U+25E4: Black Upper Left Triangle, U+2605: Black 36 | * Star, U+2606: White Star. 37 | *
  • 38 | *
  • ASCII characters: All printable ASCII characters (from U+0021 to 39 | * U+007E) except for U+0060: Grave Accent.
  • 40 | *
41 | * (All of above character names are referred from the Unicode Consortium.) 42 | */ 43 | public static final String SPECIAL_CHARS = generate('\uFF0C', '\u3002', '\uFF1F', 44 | '\uFF01', '\uFF1A', '\uFF1B', '\u3010', '\u3011', '\u3001', '\u300A', 45 | '\u300B', '\uFF08', '\uFF09', '\u2018', '\u2019', '\u201C', '\u201D', 46 | '\u00B7', '\u2026', '\u2014', '\uFFE5', '\u25E4', '\u2605', '\u2606', 47 | range('\u0021', '\u005F'), range('\u0061', '\u007E')); 48 | 49 | /** 50 | * Whitespaces: U+0020 & U+3000. 51 | */ 52 | public static final String WHITESPACE_CHARS = generate('\u0020', '\u3000'); 53 | 54 | /** 55 | * Generate a {@link String} containing a list of code points produced following 56 | * these steps:
57 | *
    58 | *
  1. Let {@code list} be the empty list of integers.
  2. 59 | *
  3. For each {@link Object} {@code param} in {@code params}, sequentially from 60 | * {@code params[0]} to {@code params[params.length - 1]}, switch on {@code 61 | * param}'s class:
    62 | *
      63 | *
    • {@link Integer}: Append {@code param} to {@code list}.
    • 64 | *
    • {@code int[]}: Append every integer in {@code param} to {@code 65 | * list}.
    • 66 | *
    • {@link Character}: Append {@code param}, converted to {@code char} 67 | * and then to {@code int} and then to {@link Integer}, to {@code list}.
    • 68 | *
    • {@link String}: Append every code point in the content of {@code 69 | * param} retrieved using {@link StringUtils#toCodePoints(String)} to {@code 70 | * list}.
    • 71 | *
    • Other: Do nothing.
    • 72 | *
    73 | *
  4. 74 | *
  5. Convert {@code list} to {@link String} using {@link StringUtils#toString(int...)}
  6. 75 | *
76 | * 77 | * @param params 78 | * The input parameters. 79 | * 80 | * @return The generated {@link String}. 81 | */ 82 | public static String generate(Object... params) { 83 | List codePoints = new ArrayList<>(); 84 | for (Object param : params) 85 | if (param instanceof Integer) codePoints.add((Integer) param); 86 | else if (param instanceof int[]) for (int codePoint : (int[]) param) 87 | codePoints.add(codePoint); 88 | else if (param instanceof String) 89 | for (int codePoint : StringUtils.toCodePoints((String) param)) 90 | codePoints.add(codePoint); 91 | else if (param instanceof Character) codePoints.add((int) (Character) param); 92 | 93 | int[] cps = new int[codePoints.size()]; 94 | for (int i = 0, size = codePoints.size(); i < size; ++i) 95 | cps[i] = codePoints.get(i); 96 | 97 | return StringUtils.toString(cps); 98 | } 99 | 100 | /** 101 | * Return an {@code int[]} containing code points ranging from start to end 102 | * (inclusive); 103 | */ 104 | public static int[] range(int start, int end) { 105 | if (end < start) return null; 106 | int[] range = new int[end - start + 1]; 107 | for (int i = start; i <= end; ++i) range[i - start] = i; 108 | return range; 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/util/IOUtils.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.util; 2 | 3 | import org.thunlp.thulac.io.IInputProvider; 4 | import org.thunlp.thulac.io.IOutputHandler; 5 | import org.thunlp.thulac.io.ReaderInputProvider; 6 | import org.thunlp.thulac.io.StringInputProvider; 7 | import org.thunlp.thulac.io.StringOutputHandler; 8 | import org.thunlp.thulac.io.WriterOutputHandler; 9 | 10 | import java.io.BufferedReader; 11 | import java.io.BufferedWriter; 12 | import java.io.File; 13 | import java.io.IOException; 14 | import java.io.InputStream; 15 | import java.io.InputStreamReader; 16 | import 
java.io.OutputStream; 17 | import java.io.OutputStreamWriter; 18 | import java.nio.charset.Charset; 19 | import java.nio.charset.StandardCharsets; 20 | import java.nio.charset.UnsupportedCharsetException; 21 | import java.nio.file.Files; 22 | import java.nio.file.Paths; 23 | import java.util.ArrayList; 24 | import java.util.List; 25 | import java.util.regex.Matcher; 26 | import java.util.regex.Pattern; 27 | 28 | /** 29 | * A class which provides static utility methods used dealing with {@link org.thunlp.thulac.io.IInputProvider} 30 | * and {@link IOutputHandler}. Some of them construct instances of {@link org.thunlp.thulac.io.IInputProvider} 31 | * and {@link IOutputHandler}, hiding the implementation details from the user. Others 32 | * can be used within implementations of {@link org.thunlp.thulac.io.IInputProvider} and 33 | * {@link IOutputHandler}, avoiding code duplicates. 34 | * 35 | * @see org.thunlp.thulac.io.IInputProvider 36 | * @see IOutputHandler 37 | */ 38 | public class IOUtils { 39 | /** 40 | * Creates an instance of {@link org.thunlp.thulac.io.IInputProvider} which retrieves input from 41 | * {@link System#in}, using the default charset as the input encoding. 42 | * 43 | * @return The {@link org.thunlp.thulac.io.IInputProvider} created. 44 | */ 45 | public static IInputProvider inputFromConsole() { 46 | return inputFromInputStream(System.in); // use default charset for System.in 47 | } 48 | 49 | /** 50 | * Creates an instance of {@link org.thunlp.thulac.io.IInputProvider} which retrieves input from a given 51 | * {@link java.io.InputStream} using UTF-8 as encoding.
52 | * It is recommended to use {@link #inputFromFile(java.io.File, java.nio.charset.Charset)} when reading 53 | * input from files, since it takes better advantage of Java NIO and have better 54 | * performances. 55 | * 56 | * @param in 57 | * The {@link java.io.InputStream} to retrieve input from. 58 | * 59 | * @return The {@link org.thunlp.thulac.io.IInputProvider} created. 60 | */ 61 | public static IInputProvider inputFromInputStream(InputStream in) { 62 | return inputFromInputStream(in, (Charset) null); 63 | } 64 | 65 | /** 66 | * Creates an instance of {@link org.thunlp.thulac.io.IInputProvider} which retrieves input from a given 67 | * {@link java.io.InputStream} using a given charset as encoding.
68 | * It is recommended to use {@link #inputFromFile(java.io.File, java.nio.charset.Charset)} when reading 69 | * input from files, since it takes better advantage of Java NIO and have better 70 | * performances. 71 | * 72 | * @param in 73 | * The {@link java.io.InputStream} to retrieve input from. 74 | * @param charsetName 75 | * The optional name of the charset to use, defaulted to "UTF-8". 76 | * 77 | * @return The {@link org.thunlp.thulac.io.IInputProvider} created. 78 | */ 79 | public static IInputProvider inputFromInputStream(InputStream in, String charsetName) 80 | throws UnsupportedCharsetException { 81 | return inputFromInputStream(in, forName(charsetName)); 82 | } 83 | 84 | /** 85 | * Creates an instance of {@link org.thunlp.thulac.io.IInputProvider} which retrieves input from a given 86 | * {@link java.io.InputStream} using a given charset as encoding.
87 | * It is recommended to use {@link #inputFromFile(java.io.File, java.nio.charset.Charset)} when reading 88 | * input from files, since it takes better advantage of Java NIO and have better 89 | * performances. 90 | * 91 | * @param in 92 | * The {@link java.io.InputStream} to retrieve input from. 93 | * @param charset 94 | * The optional charset to use, defaulted to UTF-8. 95 | * 96 | * @return The {@link org.thunlp.thulac.io.IInputProvider} created. 97 | */ 98 | public static IInputProvider inputFromInputStream(InputStream in, Charset charset) { 99 | return new ReaderInputProvider(new BufferedReader( 100 | new InputStreamReader(in, getOrDefault(charset)))); 101 | } 102 | 103 | /** 104 | * Creates an instance of {@link org.thunlp.thulac.io.IInputProvider} which retrieves input from the 105 | * given file using UTF-8 as file encoding. 106 | * 107 | * @param filename 108 | * The name of the file to retrieve input from. 109 | * 110 | * @return The {@link org.thunlp.thulac.io.IInputProvider} created. 111 | * 112 | * @throws java.io.IOException 113 | * If the file does not exist or is not readable. 114 | */ 115 | public static IInputProvider inputFromFile(String filename) throws IOException { 116 | return inputFromFile(filename, (Charset) null); 117 | } 118 | 119 | /** 120 | * Creates an instance of {@link org.thunlp.thulac.io.IInputProvider} which retrieves input from the 121 | * given file using UTF-8 as file encoding. 122 | * 123 | * @param file 124 | * The file to retrieve input from. 125 | * 126 | * @return The {@link org.thunlp.thulac.io.IInputProvider} created. 127 | * 128 | * @throws java.io.IOException 129 | * If the file does not exist or is not readable. 
130 | */ 131 | public static IInputProvider inputFromFile(File file) throws IOException { 132 | return inputFromFile(file, (Charset) null); 133 | } 134 | 135 | /** 136 | * Creates an instance of {@link org.thunlp.thulac.io.IInputProvider} which retrieves input from the 137 | * given file using a given charset as encoding. 138 | * 139 | * @param filename 140 | * The name of the file to retrieve input from. 141 | * @param charsetName 142 | * The optional name of the charset to use, defaulted to "UTF-8". 143 | * 144 | * @return The {@link org.thunlp.thulac.io.IInputProvider} created. 145 | * 146 | * @throws java.io.IOException 147 | * If the file does not exist or is not readable. 148 | * @throws java.nio.charset.UnsupportedCharsetException 149 | * If the charset referred to by the given name is not supported. 150 | */ 151 | public static IInputProvider inputFromFile(String filename, String charsetName) 152 | throws IOException, UnsupportedCharsetException { 153 | return inputFromFile(filename, forName(charsetName)); 154 | } 155 | 156 | /** 157 | * Creates an instance of {@link org.thunlp.thulac.io.IInputProvider} which retrieves input from the 158 | * given file using a given charset as encoding. 159 | * 160 | * @param filename 161 | * The file to retrieve input from. 162 | * @param charset 163 | * The optional file encoding to use, defaulted to UTF-8. 164 | * 165 | * @return The {@link org.thunlp.thulac.io.IInputProvider} created. 166 | * 167 | * @throws java.io.IOException 168 | * If the file does not exist or is not readable. 169 | */ 170 | public static IInputProvider inputFromFile(String filename, Charset charset) 171 | throws IOException { 172 | if (filename == null) return null; // new File(null) throws NPE 173 | return inputFromFile(new File(filename), charset); 174 | } 175 | 176 | /** 177 | * Creates an instance of {@link org.thunlp.thulac.io.IInputProvider} which retrieves input from the 178 | * given file using a given charset as encoding. 
179 | * 180 | * @param file 181 | * The name of the file to retrieve input from. 182 | * @param charsetName 183 | * The optional name of the file encoding to use, defaulted to UTF-8. 184 | * 185 | * @return The {@link org.thunlp.thulac.io.IInputProvider} created. 186 | * 187 | * @throws java.io.IOException 188 | * If the file does not exist or is not readable. 189 | * @throws java.nio.charset.UnsupportedCharsetException 190 | * If the charset referred to by the given name is not supported. 191 | */ 192 | public static IInputProvider inputFromFile(File file, String charsetName) 193 | throws IOException, UnsupportedCharsetException { 194 | return inputFromFile(file, forName(charsetName)); 195 | } 196 | 197 | /** 198 | * Creates an instance of {@link org.thunlp.thulac.io.IInputProvider} which retrieves input from the 199 | * given file using a given charset as encoding. 200 | * 201 | * @param file 202 | * The name of the file to retrieve input from. 203 | * @param charset 204 | * The optional file encoding to use, defaulted to UTF-8. 205 | * 206 | * @return The {@link org.thunlp.thulac.io.IInputProvider} created. 207 | * 208 | * @throws java.io.IOException 209 | * If the file does not exist or is not readable. 210 | */ 211 | public static IInputProvider inputFromFile(File file, Charset charset) 212 | throws IOException { 213 | if (file == null) return null; 214 | return new ReaderInputProvider( 215 | Files.newBufferedReader(Paths.get(file.toURI()), getOrDefault(charset))); 216 | } 217 | 218 | /** 219 | * Creates an instance of {@link org.thunlp.thulac.io.IInputProvider} which retrieves input from the 220 | * given {@link String}. 221 | * 222 | * @param input 223 | * The input string. 224 | * 225 | * @return The {@link org.thunlp.thulac.io.IInputProvider} created. 
226 | */ 227 | public static IInputProvider inputFromString(String input) { 228 | if (input == null) return null; 229 | return new StringInputProvider(input); 230 | } 231 | 232 | /** 233 | * Creates an instance of {@link IOutputHandler} which writes output to 234 | * {@link System#out}, using the default charset as the output encoding. 235 | * 236 | * @return The {@link IOutputHandler} created. 237 | */ 238 | public static IOutputHandler outputToConsole() { 239 | return new WriterOutputHandler(new BufferedWriter( 240 | new OutputStreamWriter(System.out))); 241 | } 242 | 243 | /** 244 | * Creates an instance of {@link IOutputHandler} which writes output to a given 245 | * {@link java.io.OutputStream} using UTF-8 as encoding.
246 | * It is recommended to use {@link #outputToFile(java.io.File, String)} when writing 247 | * output to files, since it takes better advantage of Java NIO and have better 248 | * performances. 249 | * 250 | * @param out 251 | * The {@link java.io.OutputStream} to write output to. 252 | * 253 | * @return The {@link IOutputHandler} created. 254 | */ 255 | public static IOutputHandler outputToOutputStream(OutputStream out) { 256 | return outputToOutputStream(out, (Charset) null); 257 | } 258 | 259 | /** 260 | * Creates an instance of {@link IOutputHandler} which writes output to a given 261 | * {@link java.io.OutputStream} using a given charset as encoding.
262 | * It is recommended to use {@link #outputToFile(java.io.File, String)} when writing 263 | * output to files, since it takes better advantage of Java NIO and have better 264 | * performances. 265 | * 266 | * @param out 267 | * The {@link java.io.OutputStream} to write output to. 268 | * @param charsetName 269 | * The optional name of the charset to use, defaulted to UTF-8. 270 | * 271 | * @return The {@link IOutputHandler} created. 272 | * 273 | * @throws java.nio.charset.UnsupportedCharsetException 274 | * If the charset referred to by the name is not supported. 275 | */ 276 | public static IOutputHandler outputToOutputStream( 277 | OutputStream out, String charsetName) throws UnsupportedCharsetException { 278 | return outputToOutputStream(out, forName(charsetName)); 279 | } 280 | 281 | /** 282 | * Creates an instance of {@link IOutputHandler} which writes output to a given 283 | * {@link java.io.OutputStream} using a given charset as encoding.
284 | * It is recommended to use {@link #outputToFile(java.io.File, String)} when writing 285 | * output to files, since it takes better advantage of Java NIO and have better 286 | * performances. 287 | * 288 | * @param out 289 | * The {@link java.io.OutputStream} to write output to. 290 | * @param charset 291 | * The optional charset to use, defaulted to UTF-8. 292 | * 293 | * @return The {@link IOutputHandler} created. 294 | */ 295 | public static IOutputHandler outputToOutputStream(OutputStream out, Charset charset) { 296 | return new WriterOutputHandler(new BufferedWriter( 297 | new OutputStreamWriter(out, getOrDefault(charset)))); 298 | } 299 | 300 | /** 301 | * Creates an instance of {@link IOutputHandler} which writes output to the 302 | * given file using UTF-8 as file encoding. 303 | * 304 | * @param filename 305 | * The name of the file to output to. 306 | * 307 | * @return The {@link IOutputHandler} created. 308 | * 309 | * @throws java.io.IOException 310 | * If the file cannot be created or is not writable. 311 | */ 312 | public static IOutputHandler outputToFile(String filename) throws IOException { 313 | return outputToFile(filename, (Charset) null); 314 | } 315 | 316 | /** 317 | * Creates an instance of {@link IOutputHandler} which writes output to the 318 | * given file using UTF-8 as file encoding. 319 | * 320 | * @param file 321 | * The file to output to. 322 | * 323 | * @return The {@link IOutputHandler} created. 324 | * 325 | * @throws java.io.IOException 326 | * If the file cannot be created or is not writable. 327 | */ 328 | public static IOutputHandler outputToFile(File file) throws IOException { 329 | return outputToFile(file, (Charset) null); 330 | } 331 | 332 | /** 333 | * Creates an instance of {@link IOutputHandler} which writes output to the 334 | * given file using a given charset as encoding. 335 | * 336 | * @param filename 337 | * The name of the file to output to. 
338 | * @param charsetName 339 | * The optional name of the charset to use, defaulted to "UTF-8". 340 | * 341 | * @return The {@link IOutputHandler} created. 342 | * 343 | * @throws java.io.IOException 344 | * If the file cannot be created or is not writable. 345 | * @throws java.nio.charset.UnsupportedCharsetException 346 | * If the charset referred to by the given name is not supported. 347 | */ 348 | public static IOutputHandler outputToFile(String filename, String charsetName) 349 | throws IOException, UnsupportedCharsetException { 350 | return outputToFile(filename, forName(charsetName)); 351 | } 352 | 353 | /** 354 | * Creates an instance of {@link IOutputHandler} which writes output to the 355 | * given file using a given charset as encoding. 356 | * 357 | * @param filename 358 | * The name of the file to output to. 359 | * @param charset 360 | * The optional file encoding to use, defaulted to UTF-8. 361 | * 362 | * @return The {@link IOutputHandler} created. 363 | * 364 | * @throws java.io.IOException 365 | * If the file cannot be created or is not writable. 366 | */ 367 | public static IOutputHandler outputToFile(String filename, Charset charset) 368 | throws IOException { 369 | if (filename == null) return null; // new File(null) throws NPE 370 | return outputToFile(new File(filename), charset); 371 | } 372 | 373 | /** 374 | * Creates an instance of {@link IOutputHandler} which writes output to the 375 | * given file using a given charset as encoding. 376 | * 377 | * @param file 378 | * The file to output to. 379 | * @param charsetName 380 | * The optional name of the file encoding to use, defaulted to "UTF-8". 381 | * 382 | * @return The {@link IOutputHandler} created. 383 | * 384 | * @throws java.io.IOException 385 | * If the file cannot be created or is not writable. 386 | * @throws java.nio.charset.UnsupportedCharsetException 387 | * If the charset referred to by the given name is not supported. 
388 | */ 389 | public static IOutputHandler outputToFile(File file, String charsetName) 390 | throws IOException, UnsupportedCharsetException { 391 | return outputToFile(file, forName(charsetName)); 392 | } 393 | 394 | /** 395 | * Creates an instance of {@link IOutputHandler} which writes output to the 396 | * given file using a given charset as encoding. 397 | * 398 | * @param file 399 | * The file to output to. 400 | * @param charset 401 | * The optional file encoding to use, defaulted to UTF-8. 402 | * 403 | * @return The {@link IOutputHandler} created. 404 | * 405 | * @throws java.io.IOException 406 | * If the file cannot be created or is not writable. 407 | */ 408 | public static IOutputHandler outputToFile(File file, Charset charset) 409 | throws IOException { 410 | if (file == null) return null; 411 | return new WriterOutputHandler( 412 | Files.newBufferedWriter(Paths.get(file.toURI()), getOrDefault(charset))); 413 | } 414 | 415 | /** 416 | * Creates an instance of {@link org.thunlp.thulac.io.StringOutputHandler} which writes output to an 417 | * {@link String} in memory.
418 | * It is typical to use this method like this: 419 | *

420 | 	 * StringOutputHandler output = IOUtils.outputToString();
421 | 	 * Thulac.split(input, output, segOnly); // or anything else
422 | 	 * String outputStr = output.getString();
423 | 	 * 
424 | * 425 | * @return The {@link org.thunlp.thulac.io.StringOutputHandler} created. 426 | */ 427 | public static StringOutputHandler outputToString() { 428 | return new StringOutputHandler(); 429 | } 430 | 431 | private static final int MAX_LENGTH = 20000; 432 | private static final Pattern SPLIT_PATTERN = 433 | Pattern.compile(".*([\u3002\uff1f\uff01\uff1b;!?]|$)"); 434 | 435 | /** 436 | * Split a given line into a list of line segments if the line is too long. It is 437 | * promised that each line segment either is the last one or ends with an 438 | * punctuation character. 439 | * 440 | * @param line 441 | * The line to split into line segments. 442 | * 443 | * @return The list of line segments split. 444 | */ 445 | public static List getLineSegments(String line) { 446 | List lineSegments = new ArrayList<>(); 447 | if (line.length() < MAX_LENGTH) lineSegments.add(line); 448 | else { // split the line into short line segments 449 | Matcher matcher = SPLIT_PATTERN.matcher(line); 450 | while (matcher.find()) lineSegments.add(matcher.group()); 451 | } 452 | return lineSegments; 453 | } 454 | 455 | /** 456 | * Returns a {@link java.nio.charset.Charset} wich name {@code charset}. This methods differs from 457 | * the {@link java.nio.charset.Charset#forName(String)} when {@code charset} is {@code null}, with 458 | * this method returning {@code null} while {@link java.nio.charset.Charset#forName(String)} throws 459 | * an NPE. 460 | * 461 | * @param charset 462 | * The name of the {@link java.nio.charset.Charset}. 463 | * 464 | * @return The {@link java.nio.charset.Charset} with name {@code charset}. 465 | * 466 | * @throws java.nio.charset.UnsupportedCharsetException 467 | * If the charset referred to by the given name is not supported. 
468 | */ 469 | private static Charset forName(String charset) throws UnsupportedCharsetException { 470 | if (charset == null) return null; 471 | return Charset.forName(charset); 472 | } 473 | 474 | /** 475 | * Returns the given {@link java.nio.charset.Charset} when non-null, or 476 | * {@link java.nio.charset.StandardCharsets#UTF_8} otherwise, since many applications using 477 | * {@link java.nio.charset.Charset} throws NPE if charset is {@code null}. 478 | * 479 | * @param charset 480 | * The given {@link java.nio.charset.Charset}. 481 | * 482 | * @return {@code charset} when non-null, {@link java.nio.charset.StandardCharsets#UTF_8} otherwise. 483 | */ 484 | private static Charset getOrDefault(Charset charset) { 485 | return charset == null ? StandardCharsets.UTF_8 : charset; 486 | } 487 | } 488 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/util/StringUtils.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.util; 2 | 3 | /** 4 | * An utility class which deals with string, converting array of code points to and from 5 | * strings. 6 | */ 7 | public class StringUtils { 8 | /** 9 | * Convert an array of code points to {@link String}. 10 | * 11 | * @param codePoints 12 | * The code points to convert. 13 | * 14 | * @return The converted {@link String}. 15 | */ 16 | public static String toString(int... codePoints) { 17 | return toString(codePoints, 0, codePoints.length); 18 | } 19 | 20 | /** 21 | * Convert an array of code points to {@link String}. 22 | * 23 | * @param codePoints 24 | * The code points to convert. 25 | * @param offset 26 | * The starting offset of {@code codePoints}. 27 | * @param len 28 | * The number of code points to convert. 29 | * 30 | * @return The converted {@link String}, indices which exceeds {@code 31 | * codePoints.length} are discarded. 
32 | */ 33 | public static String toString(int[] codePoints, int offset, int len) { 34 | StringBuilder sb = new StringBuilder(); 35 | for (int i = offset, max = Math.min(codePoints.length, offset + len); 36 | i < max; ++i) 37 | sb.appendCodePoint(codePoints[i]); 38 | return sb.toString(); 39 | } 40 | 41 | /** 42 | * Convert a {@link String} to an array of code points.
43 | * Internally, data in {@link String} is stored in {@code char[]}, however for 44 | * Unicode code points greater than U+FFFF, one {@code char} (that is, two bytes) 45 | * is not enough. Therefore, Java uses surrogates to divide code points 46 | * that cannot be represented by one {@code} into two. The problem is, 47 | * {@link String#length()} return the length of its internal {@code char[]}, while 48 | * the return value of {@link String#length()} is not necessarily (though in most 49 | * cases) equal to the number of code points stored in the {@link String}.
50 | * To solve this problem, the {@link String} class provides a set of methods to 51 | * retrieve the actual number of code points stored and to access a code points in 52 | * the {@link String} using the index by code points, as implemented in this method. 53 | * However, the iteration through a {@link String} by the actual code points is 54 | * fairly complicated, and it is much easier for applications to achieve this if 55 | * the string data is stored as {@code int[]}, each element representing a code point. 56 | * And this is exactly What this method does: take a {@link String} as input, 57 | * convert it into a {@code int[]} which contains exactly the same data as the 58 | * {@link String}.
59 | * It is recommended that all applications which iterate through the characters 60 | * stored in a {@link String} use
61 | *

 62 | 	 * int[] codePoints = StringUtils.toCodePoints(str);
 63 | 	 * for (int codePoint: codePoints) // do something ...
 64 | 	 * 
65 | * instead of the traditional
66 | *

 67 | 	 * for (int i = 0, length = str.length(); i < length; ++i) {
 68 | 	 *     char c = str.charAt(i);
 69 | 	 *     // do something ...
 70 | 	 * }
 71 | 	 * 
72 | * 73 | * @param str 74 | * The {@link String} to convert. 75 | * 76 | * @return The converted array of code points. 77 | */ 78 | public static int[] toCodePoints(String str) { 79 | if (str == null) return null; 80 | int codePointCount = str.codePointCount(0, str.length()); 81 | int[] codePoints = new int[codePointCount]; 82 | for (int i = 0; i < codePointCount; ++i) 83 | codePoints[i] = str.codePointAt(str.offsetByCodePoints(0, i)); 84 | return codePoints; 85 | } 86 | 87 | /** 88 | * Return the number of code points in the given {@link String}. 89 | * 90 | * @param str 91 | * The given {@link String}. 92 | * 93 | * @return The number of code points in {@code str}. 94 | */ 95 | public static int codePointCount(String str) { 96 | return str.codePointCount(0, str.length()); 97 | } 98 | 99 | /** 100 | * Return code point {@code index}-ith code point in the given {@link String}. 101 | * 102 | * @param str 103 | * The given {@link String}. 104 | * @param index 105 | * The index of the code point to return. 106 | * 107 | * @return The cde point at {@code index}. 108 | * 109 | * @throws IndexOutOfBoundsException 110 | * If index is negative or greater than or equal to the number of code points 111 | * of {@code str}. 112 | */ 113 | public static int codePointAt(String str, int index) { 114 | int codePointIndex = str.offsetByCodePoints(0, index); 115 | return str.codePointAt(codePointIndex); 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /src/main/resources/plugin-descriptor.properties: -------------------------------------------------------------------------------- 1 | # Elasticsearch plugin descriptor file 2 | # This file must exist as 'plugin-descriptor.properties' at 3 | # the root directory of all plugins. 4 | # 5 | # A plugin can be 'site', 'jvm', or both. 
6 | # 7 | ### example site plugin for "foo": 8 | # 9 | # foo.zip <-- zip file for the plugin, with this structure: 10 | # _site/ <-- the contents that will be served 11 | # plugin-descriptor.properties <-- example contents below: 12 | # 13 | # site=true 14 | # description=My cool plugin 15 | # version=1.0 16 | # 17 | ### example jvm plugin for "foo" 18 | # 19 | # foo.zip <-- zip file for the plugin, with this structure: 20 | # .jar <-- classes, resources, dependencies 21 | # .jar <-- any number of jars 22 | # plugin-descriptor.properties <-- example contents below: 23 | # 24 | # jvm=true 25 | # classname=foo.bar.BazPlugin 26 | # description=My cool plugin 27 | # version=2.0.0-rc1 28 | # elasticsearch.version=2.0 29 | # java.version=1.7 30 | # 31 | ### mandatory elements for all plugins: 32 | # 33 | # 'description': simple summary of the plugin 34 | description=A thulac analysis of plugins for Elasticsearch 35 | # 36 | # 'version': plugin's version 37 | version=7.9.1 38 | # 39 | # 'name': the plugin name 40 | name=analysis-thulac-plugin 41 | 42 | ### mandatory elements for site plugins: 43 | # 44 | # 'site': set to true to indicate contents of the _site/ 45 | # directory in the root of the plugin should be served. 46 | #site=${elasticsearch.plugin.site} 47 | 48 | 49 | ### mandatory elements for jvm plugins : 50 | # 51 | # 'jvm': true if the 'classname' class should be loaded 52 | # from jar files in the root directory of the plugin. 53 | # Note that only jar files in the root directory are 54 | # added to the classpath for the plugin! If you need 55 | # other resources, package them into a resources jar. 56 | # jvm=true 57 | # 'classname': the name of the class to load, fully-qualified. 
58 | classname=org.elasticsearch.plugin.analysis.ThulacAnalysisPlugin 59 | # 60 | # 'java.version' version of java the code is built against 61 | # use the system property java.specification.version 62 | # version string must be a sequence of nonnegative decimal integers 63 | # separated by "."'s and may have leading zeros 64 | java.version=1.8 65 | # 66 | # 'elasticsearch.version' version of elasticsearch compiled against 67 | # You will have to release a new version of the plugin for each new 68 | # elasticsearch release. This version is checked when the plugin 69 | # is loaded so Elasticsearch will refuse to start in the presence of 70 | # plugins with the incorrect elasticsearch.version. 71 | elasticsearch.version=7.9.1 72 | # 73 | ### deprecated elements for jvm plugins : 74 | # 75 | # 'isolated': true if the plugin should have its own classloader. 76 | # passing false is deprecated, and only intended to support plugins 77 | # that have hard dependencies against each other. If this is 78 | # not specified, then the plugin is isolated by default. 79 | #isolated=${elasticsearch.plugin.isolated} 80 | # -------------------------------------------------------------------------------- /src/main/resources/plugin-security.policy: -------------------------------------------------------------------------------- 1 | grant { 2 | // needed because of the hot reload functionality 3 | permission java.io.FilePermission "<<ALL FILES>>", "read"; 4 | }; -------------------------------------------------------------------------------- /src/test/java/TestThulac.java: -------------------------------------------------------------------------------- 1 | import org.elasticsearch.thulac.Configuration; 2 | import org.elasticsearch.thulac.ThulacLiteTokenizerScanner; 3 | import org.junit.Test; 4 | import org.thunlp.thulac.data.TaggedWord; 5 | 6 | import java.io.*; 7 | 8 | /** 9 | * Created by micro on 2017-12-17.
10 | */ 11 | public class TestThulac { 12 | 13 | @Test 14 | public void test2() throws IOException { 15 | ThulacLiteTokenizerScanner tokenizer = new ThulacLiteTokenizerScanner(new Configuration()); 16 | InputStreamReader isr = new InputStreamReader(getClass().getClassLoader().getResource("input").openStream()); 17 | tokenizer.reset(isr); 18 | while (tokenizer.hasNext()) { 19 | TaggedWord token = tokenizer.next(); 20 | System.out.println("word = " + token.word + ", tag= " + token.tag + ", start= " + token.startOffset + ", end=" + token.endOffset); 21 | } 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/test/resources/input: -------------------------------------------------------------------------------- 1 | 中国是世界上 人口最多的国家, 2008 年底中国大陆人口13.28亿,占全世界人口的20%,也就是说全世界平均每5个人当中就有一个是黄皮肤、黑头发的中国人。为什么中国人有这么多?中国怎么解决这个问题?请让我慢慢说起。古代的时候,中国人口并不多。清朝以前,中国人口都不超过一亿,有些朝代人口甚至只有一两千万,即使最强大的唐朝,人口也只有五千万。由于古代医学比较落后,生孩子有很大的危险,很多孩子一出生就死了;古代人的生活条件也不好,有些 家庭很穷,就算生了孩子也无法把他们养大;还有,古代人喜欢男孩儿,认为男孩儿可以劳动,而女孩儿却没有用,所以女孩儿常常被抛弃。这种重男轻女的错误思想也是造成中国古代人 少的一个原因。农业对古代中国非常重要,而人口就是劳动力。“谁生的孩子越多,谁就越光荣”,这种思想一直保持到新中国成立。为了加快发展,政府鼓励人们多生孩子,而且现代社会的医学水平比较发达,人们的生活条件也提高了,生孩子从很困难变成很容易,最后竟然难以控制。50年里,中国人口从5亿增加到11亿,翻了一番。这个速度太快了,如果不控制,后果不堪设想。于是70年代,中国政府制定法律,实行“计划生育”。“计划生育”就是有计划地生孩子,并且尽可能保证孩子的健康。具体地说,就是一个家庭只生一个孩子。如果多生了孩子,就要受到一些处罚。到现在为止,计划生育使中国人口的增长减少了至少4亿,人口的增长速度也越来越低。但是,计划生育也带来了一些问题。其中最重要的一个,就是我们这一代孩子比较孤独,因为我们都没有兄弟姐妹。这样的孩子叫做“独生子女”。中国现在的年轻人基本上都是独生子女,他们的性格和父母那一代人不一样。他们更加独立,但是也更加自我。其次,中国人口的出生率降低了,这也就意味着,几年后中国的人口达到顶峰,然后不再增长,变得越来越少。还有一个问题是人口分布不均匀,也就是有些地方人多,有些地方人少。中国东部人口多,西部人口少。中国人口最多的一个省是河南省,人口将近一亿,平均每平方千米有600多人;而中国人口最少的一个省是西藏自治区,在那里,平均每平方千米可能都找不到一个人。如果你认为这些问题还不算大的话,请继续听我说。现在中国老人多,年轻人少;男人多,女人少。老人多是因为年轻一代都是独生子女,占总人口的比例突然少了,于是老人就显得很多。在性别比例上,据说中国男女比例是118:100,也就是说,有18个男人一出生就注定了以后找不到老婆。为了让人口发展更加自然和科学,政府也对计划生育进行修改。现在,在中国大部分地区,如果父母都是独生子女的家庭允许生第二个孩子。对于“人口老龄化”,也就是老人越来越多的问题,政府的目标就是改善社会福利,提高人们的生活质量。清朝的时候,中国人的平均寿命是33岁,而现在,中国人的平均寿命已经达到73岁。对于13亿的人口来说,这的确是一个值得骄傲的成绩。 
--------------------------------------------------------------------------------