├── .gitignore
├── LICENSE
├── README.md
├── build.gradle
├── doc
│   └── 申请表.docx
├── settings.gradle
└── src
    ├── main
    │   └── java
    │       └── org
    │           └── thunlp
    │               └── thulac
    │                   ├── Thulac.java
    │                   ├── cb
    │                   │   ├── AlphaBeta.java
    │                   │   ├── CBModel.java
    │                   │   ├── CBNGramFeature.java
    │                   │   ├── CBTaggingDecoder.java
    │                   │   └── Node.java
    │                   ├── data
    │                   │   ├── Dat.java
    │                   │   ├── DatMaker.java
    │                   │   ├── POCGraph.java
    │                   │   └── TaggedWord.java
    │                   ├── io
    │                   │   ├── IInputProvider.java
    │                   │   ├── IOutputHandler.java
    │                   │   ├── IProgramStateListener.java
    │                   │   ├── ReaderInputProvider.java
    │                   │   ├── StringInputProvider.java
    │                   │   ├── StringOutputHandler.java
    │                   │   └── WriterOutputHandler.java
    │                   ├── main
    │                   │   └── Main.java
    │                   ├── postprocess
    │                   │   ├── DictionaryPass.java
    │                   │   ├── DoubleWordPass.java
    │                   │   ├── FilterPass.java
    │                   │   ├── IPostprocessPass.java
    │                   │   ├── NegWordPass.java
    │                   │   ├── SpecialPass.java
    │                   │   ├── TimeWordPass.java
    │                   │   └── VerbPass.java
    │                   ├── preprocess
    │                   │   ├── ConvertT2SPass.java
    │                   │   ├── IPreprocessPass.java
    │                   │   └── PreprocessPass.java
    │                   └── util
    │                       ├── BufferUtils.java
    │                       ├── CodePointUtils.java
    │                       ├── IOUtils.java
    │                       └── StringUtils.java
    └── test
        ├── java
        │   └── org
        │       └── thunlp
        │           └── thulac
        │               ├── IAccessible.java
        │               ├── MainAlt.java
        │               ├── ProfilerInputProvider.java
        │               ├── ProfilerOutputHandler.java
        │               ├── TestHelper.java
        │               ├── Tests.java
        │               ├── data
        │               │   ├── Dat2WordsConverter.java
        │               │   └── DatMakerTest.java
        │               └── util
        │                   └── CodePointUtilsTest.java
        └── resources
            └── dat_maker_test_1.txt
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
models
.idea
release
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 THUNLP

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy,
modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# THULAC: An Efficient Lexical Analyzer for Chinese

> This document covers only the Java version of THULAC; for other versions, please refer to the corresponding README files.

## Contents
* [About](#about)
* [Build and Install](#build-and-install)
* [Usage](#usage)
* [Performance Comparison of Segmentation Tools](#performance-comparison-of-segmentation-tools)
* [Part-of-Speech Tags](#part-of-speech-tags)
* [THULAC Models](#thulac-models)
* [Notes](#notes)
* [Implementations in Other Languages](#implementations-in-other-languages)
* [Changelog](#changelog)
* [License](#license)
* [Related Papers](#related-papers)
* [Authors](#authors)
* [Acknowledgements](#acknowledgements)

## About

THULAC (THU Lexical Analyzer for Chinese) is a Chinese lexical analysis toolkit developed by the Natural Language Processing and Computational Social Science Lab at Tsinghua University. It performs Chinese word segmentation and part-of-speech (POS) tagging, and has the following features:

1. Strong tagging ability. Its models are trained on the world's largest human-annotated corpus of Chinese segmentation and POS tagging (about 58 million characters).
1. High accuracy. On the standard Chinese Treebank (CTB5) dataset, the toolkit reaches an F1 score of 97.3% for segmentation and 92.9% for POS tagging, on par with the best methods on this dataset.
1. Relatively high speed. Simultaneous segmentation and POS tagging runs at 300 KB/s, about 150,000 characters per second; segmentation alone reaches 1.3 MB/s. (These figures were measured on the C++ version of this library; the Java version may be slightly slower.)

## Build and Install
* **Executable jar**

This library is still under active development; please build and run it from source as described below.
* **Building and running from source**

Downloading and building this library requires [java](https://www.java.com/), [git](https://git-scm.com/) and [gradle](https://gradle.org/) installed on your machine, as well as a stable network connection.
Then, run the command:
``` bat
git clone https://github.com/thunlp/THULAC-Java.git
```
When it finishes, run:
``` bat
gradle check
```
If the console prints `BUILD SUCCESSFUL`, the build has succeeded.

## Usage
### 1. Segmentation and POS Tagging
#### 1.1. Command Line
``` bat
Read from and write to the console:
java -jar THULAC_lite_java_run.jar [-t2s] [-seg_only] [-deli delimiter] [-user userdict.txt]
Read from and write to text files (UTF-8 encoded):
java -jar THULAC_lite_java_run.jar [-t2s] [-seg_only] [-deli delimiter] [-user userdict.txt] -input input_file -output output_file
```

#### 1.2. Command-Line Options
| Option | Meaning |
| --- | --- |
| -t2s | Convert traditional Chinese to simplified Chinese |
| -seg_only | Perform segmentation only, without POS tagging |
| -deli delimiter | Set the delimiter between a word and its tag to delimiter; defaults to an underscore `_` |
| -filter | Use filters to remove words with little meaning, such as "可以" ("can") |
| -user userdict.txt | Use userdict.txt as the user dictionary; words in it are tagged uw. One word per line, UTF-8 encoded |
| -model_dir dir | Set the directory containing the model files to dir; defaults to models/ |
| -input input_file | Set the input file to input_file; defaults to console input |
| -output output_file | Set the output file to output_file; defaults to console output |

### 2. Getting the Models
THULAC requires segmentation and POS tagging models to run. To download them, visit [thulac.thunlp.org](http://thulac.thunlp.org) and fill in your personal information, then place the downloaded models in the root directory of THULAC, or point to them with the `-model_dir dir` option.

## Performance Comparison of Segmentation Tools
We compared THULAC against LTP, ICTCLAS, jieba and other well-known Chinese segmentation tools. Tests were run on Windows, measuring speed and accuracy according to the international Chinese word segmentation evaluation standard published by the Second International Chinese Word Segmentation Bakeoff.

In the Second International Chinese Word Segmentation Bakeoff, test corpora were provided by four institutions (Academia Sinica, City University, Peking University, Microsoft Research). The evaluation resource [icwb2-data](http://sighan.cs.uchicago.edu/bakeoff2005/) contains the training and testing sets from these four institutions, together with gold-standard answers for the test sets according to each institution's segmentation standard (icwb2-data/scripts/gold). The icwb2-data/scripts directory contains a perl script, score, for automatic scoring of segmentation results.

In a single unified test environment, we evaluated several popular segmentation tools and THULAC, each with its bundled model. THULAC used the simple Model_1 distributed with the software. The test machine had an Intel Core i5 at 2.4 GHz. Results:

msr_test (560KB)

| Algorithm | Time | Precision | Recall |
|:------------|-------------:|------------:|-------:|
| LTP-3.2.0 | 3.21s | 0.867 | 0.896 |
| ICTCLAS (2015) | 0.55s | 0.869 | 0.914 |
| jieba | 0.26s | 0.814 | 0.809 |
| THULAC | 0.62s | 0.877 | 0.899 |

pku_test (510KB)

| Algorithm | Time | Precision | Recall |
|:------------|-------------:|------------:|-------:|
| LTP-3.2.0 | 3.83s | 0.960 | 0.947 |
| ICTCLAS (2015) | 0.53s | 0.939 | 0.944 |
| jieba | 0.23s | 0.850 | 0.784 |
| THULAC | 0.51s | 0.944 | 0.908 |

Besides the evaluations on standard test sets above, we also measured the speed of each tool on a large corpus:

CNKI_journal.txt (51 MB)

| Algorithm | Time | Speed |
|:------------|-------------:|------------:|
| LTP-3.2.0 | 348.624s | 149.80KB/s |
| ICTCLAS (2015) | 106.461s | 490.59KB/s |
| jieba | 22.5583s | 2314.89KB/s |
| THULAC | 42.625s | 1221.05KB/s |

## Part-of-Speech Tags
n/noun np/person name ns/place name ni/organization name nz/other proper noun
m/numeral q/classifier mq/numeral-classifier t/time word f/directional word s/locative word
v/verb vm/modal verb vd/directional verb a/adjective d/adverb
h/prefix k/suffix i/idiom j/abbreviation
r/pronoun c/conjunction p/preposition u/particle y/modal particle
e/interjection o/onomatopoeia g/morpheme w/punctuation x/other

## THULAC Models
1. A simple segmentation-only model, Model_1, ships with the THULAC source code. It is trained on the People's Daily segmentation corpus.
1. A joint segmentation and POS tagging model, Model_2, also ships with the source code. It is trained on the People's Daily segmentation and POS tagging corpus.
1. We also provide Model_3, a larger, more complete and more accurate joint segmentation and POS tagging model, together with a segmentation lexicon. It is jointly trained on multiple corpora (including annotated text of various genres as well as annotated People's Daily text). Because the model is large, institutions or individuals who need it should fill out "doc/资源申请表.doc" and send it to thunlp@gmail.com; once the application is approved, we will send the resources to the contact person.

## Notes
This tool currently handles only UTF-8 encoded Chinese text; support for other encodings will be added over time.

## Implementations in Other Languages
* C++ version
[https://github.com/thunlp/THULAC](https://github.com/thunlp/THULAC)
* Python version
[https://github.com/thunlp/THULAC-Python](https://github.com/thunlp/THULAC-Python)
* so version
[https://github.com/thunlp/THULAC.so](https://github.com/thunlp/THULAC.so)

## Changelog
| Date | Changes |
|:------------|:-------------:|
| 2016-09-29 | Added the so version of THULAC. |
| 2016-03-31 | Added the Python version of THULAC. |
| 2016-01-20 | Added the Java version of THULAC. |
| 2016-01-10 | Open-sourced the C++ version of THULAC. |

## License
1. The THULAC source code is freely available to universities, research institutes, companies and individuals at home and abroad for research purposes.
1. Institutions or individuals who intend to use THULAC for commercial purposes should contact thunlp@gmail.com to discuss a technical licensing agreement.
1. Comments and suggestions on this toolkit are welcome; please send them to thunlp@gmail.com.
1. If you publish papers or obtain research results based on THULAC, please state that "THULAC from Tsinghua University was used" and cite it as follows:

Chinese:
> **孙茂松, 陈新雄, 张开旭, 郭志芃, 刘知远. THULAC:一个高效的中文词法分析工具包. 2016.**

English:
> **Maosong Sun, Xinxiong Chen, Kaixu Zhang, Zhipeng Guo, Zhiyuan Liu. THULAC: An Efficient Lexical Analyzer for Chinese. 2016.**

## Related Papers
* Zhongguo Li, Maosong Sun. Punctuation as Implicit Annotations for Chinese Word Segmentation. Computational Linguistics, vol. 35, no. 4, pp. 505-512, 2009.

## Authors
Maosong Sun (孙茂松, advisor), Xinxiong Chen (陈新雄, PhD student), Kaixu Zhang (张开旭, MS student), Zhipeng Guo (郭志芃, undergraduate), Zhiyuan Liu (刘知远, assistant professor).
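As a concrete companion to the command-line documentation in section 1.2: the sketch below assembles output lines in the documented THULAC style, where each word is joined to its POS tag by the delimiter (underscore by default) unless only segmentation is requested, and words are separated by spaces. The class `OutputFormatDemo` and its sample data are hypothetical and written purely for illustration; the code does not call the library itself.

```java
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

public class OutputFormatDemo {
	// Joins (word, tag) pairs the way the -seg_only and -deli options describe:
	// "word" alone when only segmenting, "word<delimiter>tag" otherwise.
	static String format(List<String[]> taggedWords, boolean segOnly, char separator) {
		return taggedWords.stream()
				.map(w -> segOnly ? w[0] : w[0] + separator + w[1])
				.collect(Collectors.joining(" "));
	}

	public static void main(String[] args) {
		List<String[]> words = Arrays.asList(
				new String[]{"我", "r"}, new String[]{"爱", "v"}, new String[]{"北京", "ns"});
		System.out.println(format(words, false, '_')); // 我_r 爱_v 北京_ns
		System.out.println(format(words, true, '_'));  // 我 爱 北京
	}
}
```

Setting `-deli` to another character changes only the joining character; segmentation-only output drops the tags entirely.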
## Acknowledgements
[std4453](https://www.github.com/std4453): optimized the code of this Java version and added comments.
--------------------------------------------------------------------------------
/build.gradle:
--------------------------------------------------------------------------------
apply plugin: 'java'

repositories {
	mavenLocal()
	jcenter()
	mavenCentral()
}

dependencies {
	testCompile group: 'junit', name: 'junit', version: '4.+'
	testCompile group: 'net.sf.jopt-simple', name: 'jopt-simple', version: '5.+'
}

jar {
	version '1.0'
	manifest {
		attributes("Main-Class": "org.thunlp.thulac.main.Main")
	}
}

task release(dependsOn: 'jar') {
	copy {
		from 'build/libs'
		into 'release'
	}
	copy {
		from 'models'
		into 'release/models'
	}
}
--------------------------------------------------------------------------------
/doc/申请表.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/THULAC-Java/7b5e3cc408dee25e1bcd8f43ef13018ae94b4905/doc/申请表.docx
--------------------------------------------------------------------------------
/settings.gradle:
--------------------------------------------------------------------------------
rootProject.name = 'thulac'
--------------------------------------------------------------------------------
/src/main/java/org/thunlp/thulac/Thulac.java:
--------------------------------------------------------------------------------
package org.thunlp.thulac;

import org.thunlp.thulac.cb.CBTaggingDecoder;
import org.thunlp.thulac.data.POCGraph;
import org.thunlp.thulac.data.TaggedWord;
import org.thunlp.thulac.io.IInputProvider;
import org.thunlp.thulac.io.IOutputHandler;
import org.thunlp.thulac.io.StringOutputHandler;
import org.thunlp.thulac.postprocess.*;
import org.thunlp.thulac.preprocess.ConvertT2SPass;
import
org.thunlp.thulac.preprocess.IPreprocessPass;
import org.thunlp.thulac.preprocess.PreprocessPass;
import org.thunlp.thulac.util.IOUtils;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Vector;

/**
 * The central class which acts as the core of the THULAC API. It provides several
 * convenient methods that make things easier for users.
 */
public class Thulac {
	/**
	 * Run the segmentation program with argument {@code segOnly}, taking input from
	 * the given {@link String} and returning the segmented output as a {@link String}.
	 *
	 * @param input
	 * 		The input {@link String}.
	 * @param segOnly
	 * 		Whether to output only segments.
	 *
	 * @return The segmented output as a {@link String}.
	 *
	 * @throws IOException
	 * 		If one of the model files fails to load.
	 */
	public static String split(String input, boolean segOnly) throws IOException {
		StringOutputHandler outputProvider = IOUtils.outputToString();
		IInputProvider inputProvider = IOUtils.inputFromString(input);
		split(inputProvider, outputProvider, segOnly);
		return outputProvider.getString();
	}

	/**
	 * Run the segmentation program with argument {@code segOnly}, reading input from
	 * the file named {@code inputFile} and writing the segmented result to the file
	 * named {@code outputFile}.
	 * This method returns directly if either {@code inputFile} or {@code outputFile}
	 * is null.
	 *
	 * @param inputFile
	 * 		The name of the input file.
	 * @param outputFile
	 * 		The name of the output file.
	 * @param segOnly
	 * 		Whether to output only segments.
	 *
	 * @throws IOException
	 * 		If one of the model files fails to load.
	 */
	public static void split(String inputFile, String outputFile, boolean segOnly)
			throws IOException {
		if (inputFile == null || outputFile == null) return;
		IInputProvider input = IOUtils.inputFromFile(inputFile);
		IOutputHandler output = IOUtils.outputToFile(outputFile);
		split(input, output, segOnly);
	}

	/**
	 * Run the segmentation program with argument {@code segOnly}, taking input from
	 * the given {@link File} and writing the segmented result to the given
	 * {@link File}.
	 *
	 * @param input
	 * 		The input {@link File}.
	 * @param output
	 * 		The output {@link File}.
	 * @param segOnly
	 * 		Whether to output only segments.
	 *
	 * @throws IOException
	 * 		If one of the model files fails to load or either the input file or the
	 * 		output file is {@code null}.
	 */
	public static void split(File input, File output, boolean segOnly)
			throws IOException {
		if (input == null) throw new FileNotFoundException("input == null!");
		if (output == null) throw new FileNotFoundException("output == null!");
		IInputProvider inputProvider = IOUtils.inputFromFile(input);
		IOutputHandler outputHandler = IOUtils.outputToFile(output);
		split(inputProvider, outputHandler, segOnly);
	}

	/**
	 * Run the segmentation program with argument {@code segOnly} and default values
	 * for all others.
	 *
	 * @param input
	 * 		The {@link IInputProvider} instance to provide input.
103 | * @param output 104 | * The {@link IOutputHandler} instance to handle output. 105 | * @param segOnly 106 | * Whether to output only segments. 107 | * 108 | * @throws IOException 109 | * If I/O of either {@code input}, {@code output} or one of the model files 110 | * resulted in an exception. 111 | */ 112 | public static void split(IInputProvider input, IOutputHandler output, boolean segOnly) 113 | throws IOException { 114 | split("models/", '_', null, false, segOnly, false, input, output); 115 | } 116 | 117 | /** 118 | * Run the segmentation program with full arguments. 119 | * 120 | * @param modelDir 121 | * The directory under which the model files are located. 122 | * @param separator 123 | * The separator to use to separate words and tags. 124 | * @param userDict 125 | * The optional file name of the user-specified dictionary. 126 | * @param useT2S 127 | * Whether to transfer traditional Chinese to simplified Chinese before 128 | * segmentation. 129 | * @param segOnly 130 | * Whether to output only segments. 131 | * @param useFilter 132 | * Whether to use filters while processing. 133 | * @param input 134 | * The {@link IInputProvider} instance to provide input. 135 | * @param output 136 | * The {@link IOutputHandler} instance to handle output. 137 | * 138 | * @throws IOException 139 | * If I/O of either {@code input}, {@code output} or one of the model files 140 | * resulted in an exception. 141 | */ 142 | public static void split( 143 | String modelDir, char separator, String userDict, 144 | boolean useT2S, boolean segOnly, boolean useFilter, 145 | IInputProvider input, IOutputHandler output) throws IOException { 146 | try { 147 | input.onProgramStart(); 148 | output.onProgramStart(); 149 | 150 | // segmentation 151 | CBTaggingDecoder taggingDecoder = new CBTaggingDecoder(); 152 | taggingDecoder.threshold = segOnly ? 0 : 10000; 153 | String prefix = modelDir + (segOnly ? 
"cws_" : "model_c_");
			taggingDecoder.loadFiles(prefix + "model.bin",
					prefix + "dat.bin",
					prefix + "label.txt");
			taggingDecoder.setLabelTrans();

			// preprocess passes
			List<IPreprocessPass> pre = new ArrayList<>();
			pre.add(new PreprocessPass());
			if (useT2S) pre.add(new ConvertT2SPass(modelDir + "t2s.dat"));

			// postprocess passes
			List<IPostprocessPass> post = new ArrayList<>();
			post.add(new DictionaryPass(modelDir + "ns.dat", "ns", false));
			post.add(new DictionaryPass(modelDir + "idiom.dat", "i", false));
			post.add(new DictionaryPass(modelDir + "singlepun.dat", "w", false));
			post.add(new TimeWordPass());
			post.add(new DoubleWordPass());
			post.add(new SpecialPass());
			post.add(new NegWordPass(modelDir + "neg.dat"));
			if (userDict != null) post.add(new DictionaryPass(userDict, "uw", true));
			if (useFilter)
				post.add(new FilterPass(modelDir + "xu.dat", modelDir + "time.dat"));

			// main loop
			List<TaggedWord> words = new Vector<>();
			POCGraph graph = new POCGraph();
			for (List<String> lineSegments = input.provideInput();
					lineSegments != null;
					lineSegments = input.provideInput()) {
				output.handleLineStart();
				for (String raw : lineSegments) {
					for (IPreprocessPass pass : pre) raw = pass.process(raw, graph);
					taggingDecoder.segment(raw, graph, words);
					for (IPostprocessPass pass : post) pass.process(words);

					output.handleLineSegment(words, segOnly, separator);
				}
				output.handleLineEnd();
			}
		} finally { // close resources even when program crashes
			input.onProgramEnd();
			output.onProgramEnd();
		}
	}
}
--------------------------------------------------------------------------------
/src/main/java/org/thunlp/thulac/cb/AlphaBeta.java:
--------------------------------------------------------------------------------
package org.thunlp.thulac.cb;


// a structure for alphas
and betas 5 | public class AlphaBeta { 6 | // TODO: add documentation 7 | 8 | public int value; 9 | public int nodeId; 10 | public int labelId; 11 | 12 | public AlphaBeta() { 13 | super(); 14 | this.value = 0; 15 | this.nodeId = -2; 16 | this.labelId = 0; 17 | } 18 | 19 | public AlphaBeta(int value, int nodeId, int labelId) { 20 | super(); 21 | this.value = value; 22 | this.nodeId = nodeId; 23 | this.labelId = labelId; 24 | } 25 | 26 | 27 | public static int dbDecode( 28 | int l_size, int[] llWeights, int nodeCount, Node[] nodes, int[] values, 29 | AlphaBeta[] alphas, 30 | int[] result, int[][] preLabels, int[][] allowedLabelLists) { 31 | int nodeId; 32 | int[] pNodeId; 33 | int[] pPreLabel; 34 | int[] pAllowedLabel; 35 | int k; 36 | int j; 37 | AlphaBeta tmp; 38 | AlphaBeta best = new AlphaBeta(); 39 | best.nodeId = -1; 40 | AlphaBeta preAlpha; 41 | 42 | int score; 43 | int index = 0; 44 | int index2 = 0; 45 | int index3 = 0; 46 | 47 | for (int i = 0; i < nodeCount * l_size; i++) { 48 | alphas[i] = new AlphaBeta(); 49 | alphas[i].nodeId = -2; 50 | } 51 | for (int i = 0; i < nodeCount; i++) { 52 | pAllowedLabel = allowedLabelLists != null ? allowedLabelLists[i] : null; 53 | j = -1; 54 | int maxValue = 0; 55 | boolean hasMaxValue = false; 56 | if (pAllowedLabel != null) { 57 | index = 0; 58 | while ((j = pAllowedLabel[index]) != -1) { 59 | index++; 60 | if (!hasMaxValue || (maxValue < values[i * l_size + j])) { 61 | hasMaxValue = true; 62 | maxValue = values[i * l_size + j]; 63 | } 64 | } 65 | index = 0; 66 | j = -1; 67 | while ((j = pAllowedLabel[index]) != -1) { 68 | index++; 69 | tmp = alphas[i * l_size + j]; 70 | tmp.value = 0; 71 | pNodeId = nodes[i].predecessors; 72 | pPreLabel = preLabels != null ? 
preLabels[j] : null; 73 | index2 = 0; 74 | while ((nodeId = pNodeId[index2]) >= 0) { 75 | index2++; 76 | k = -1; 77 | if (pPreLabel != null) { 78 | index3 = 0; 79 | while ((k = pPreLabel[index3]) != -1) { 80 | index3++; 81 | preAlpha = alphas[nodeId * l_size + k]; 82 | if (preAlpha.nodeId == -2) continue; 83 | score = preAlpha.value + llWeights[k * l_size + j]; 84 | if ((tmp.nodeId < 0) || (score > tmp.value)) { 85 | tmp.value = score; 86 | tmp.nodeId = nodeId; 87 | tmp.labelId = k; 88 | } 89 | } 90 | } else { 91 | k++; 92 | while (k != l_size) { 93 | preAlpha = alphas[nodeId * l_size + k]; 94 | if (preAlpha.nodeId == -2) continue; 95 | score = preAlpha.value + llWeights[k * l_size + j]; 96 | if ((tmp.nodeId < 0) || (score > tmp.value)) { 97 | tmp.value = score; 98 | tmp.nodeId = nodeId; 99 | tmp.labelId = k; 100 | } 101 | k++; 102 | } 103 | } 104 | } 105 | tmp.value += values[i * l_size + j]; 106 | if ((nodes[i].type == 1) || (nodes[i].type == 3)) { 107 | tmp.nodeId = -1; 108 | } 109 | if (nodes[i].type >= 2) { 110 | if ((best.nodeId == -1) || best.value < tmp.value) { 111 | best.value = tmp.value; 112 | best.nodeId = i; 113 | best.labelId = j; 114 | } 115 | } 116 | } 117 | 118 | } else { 119 | j++; 120 | while (j != l_size) { 121 | if (!hasMaxValue || (maxValue < values[i * l_size + j])) { 122 | hasMaxValue = true; 123 | maxValue = values[i * l_size + j]; 124 | } 125 | j++; 126 | } 127 | j = 0; 128 | while (j != l_size) { 129 | tmp = alphas[i * l_size + j]; 130 | tmp.value = 0; 131 | pNodeId = nodes[i].predecessors; 132 | pPreLabel = preLabels != null ? 
preLabels[j] : null; 133 | index2 = 0; 134 | while ((nodeId = pNodeId[index2]) >= 0) { 135 | index2++; 136 | k = -1; 137 | if (pPreLabel != null) { 138 | index3 = 0; 139 | while ((k = pPreLabel[index3]) != -1) { 140 | index3++; 141 | preAlpha = alphas[nodeId * l_size + k]; 142 | if (preAlpha.nodeId == -2) continue; 143 | score = preAlpha.value + llWeights[k * l_size + j]; 144 | if ((tmp.nodeId < 0) || (score > tmp.value)) { 145 | tmp.value = score; 146 | tmp.nodeId = nodeId; 147 | tmp.labelId = k; 148 | } 149 | 150 | } 151 | } else { 152 | k++; 153 | while (k != l_size) { 154 | preAlpha = alphas[nodeId * l_size + k]; 155 | if (preAlpha.nodeId == -2) continue; 156 | score = preAlpha.value + llWeights[k * l_size + j]; 157 | if ((tmp.nodeId < 0) || (score > tmp.value)) { 158 | tmp.value = score; 159 | tmp.nodeId = nodeId; 160 | tmp.labelId = k; 161 | } 162 | k++; 163 | } 164 | } 165 | } 166 | tmp.value += values[i * l_size + j]; 167 | if ((nodes[i].type == 1) || (nodes[i].type == 3)) { 168 | tmp.nodeId = -1; 169 | } 170 | if (nodes[i].type >= 2) { 171 | if ((best.nodeId == -1) || best.value < tmp.value) { 172 | best.value = tmp.value; 173 | best.nodeId = i; 174 | best.labelId = j; 175 | } 176 | } 177 | // System.out.println(""+tmp.value+" "+tmp.nodeId+" "+tmp.labelId); 178 | j++; 179 | } 180 | 181 | } 182 | } 183 | tmp = best; 184 | while (tmp.nodeId >= 0) { 185 | result[tmp.nodeId] = tmp.labelId; 186 | tmp = alphas[tmp.nodeId * l_size + tmp.labelId]; 187 | } 188 | return best.value; 189 | } 190 | } -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/cb/CBModel.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.cb; 2 | 3 | import org.thunlp.thulac.util.BufferUtils; 4 | 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.nio.ByteBuffer; 8 | import java.nio.ByteOrder; 9 | import java.nio.IntBuffer; 10 | import 
java.nio.channels.FileChannel; 11 | 12 | public class CBModel { 13 | // TODO: add documentation 14 | 15 | public int l_size; // size of the labels 16 | public int f_size; // size of the features 17 | 18 | public int[] ll_weights; // weights of (label, label) 19 | public int[] fl_weights; // weights of (feature, label) 20 | 21 | public CBModel(String filename) throws IOException { 22 | FileInputStream in = new FileInputStream(filename); 23 | FileChannel channel = in.getChannel(); 24 | 25 | ByteBuffer header = ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN); 26 | header.clear(); 27 | channel.read(header); 28 | header.flip(); 29 | IntBuffer intHeader = header.asIntBuffer(); 30 | this.l_size = intHeader.get(); 31 | this.f_size = intHeader.get(); 32 | 33 | int llSize = this.l_size * this.l_size, flSize = this.l_size * this.f_size; 34 | this.ll_weights = new int[llSize]; 35 | this.fl_weights = new int[flSize]; 36 | ByteBuffer buf = ByteBuffer.allocate(64 * 1024).order(ByteOrder.LITTLE_ENDIAN); 37 | buf.clear(); 38 | BufferUtils.readInts(channel, buf, this.ll_weights, this.fl_weights); 39 | 40 | channel.close(); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/cb/CBNGramFeature.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.cb; 2 | 3 | import org.thunlp.thulac.data.Dat; 4 | 5 | import java.util.Vector; 6 | 7 | public class CBNGramFeature { 8 | // TODO: add documentation 9 | 10 | private static final int SENTENCE_BOUNDARY = '#'; 11 | 12 | private int separator; 13 | private int maxLength; 14 | private int[] uniBases; 15 | private int[] biBases; 16 | private int[] values; 17 | private int datSize; 18 | private int[] dat; 19 | private CBModel model; 20 | 21 | public CBNGramFeature(Dat myDat, CBModel model, int[] values) { 22 | this.separator = ' '; 23 | this.datSize = myDat.datSize; 24 | this.dat = myDat.dat; 25 | 
		this.model = model;
		this.maxLength = 20000;
		this.uniBases = new int[this.maxLength + 2];
		this.biBases = new int[this.maxLength + 4];
		this.values = values;
	}

	private void addValues(int valueOffset, int base, int del) {
		int ind = this.dat[base << 1] + del;
		if (ind >= this.datSize || this.dat[(ind << 1) + 1] != base) return;
		int offset = this.dat[ind << 1];
		int weightOffset = offset * this.model.l_size;
		if (this.model.l_size == 4) {
			this.values[valueOffset] += this.model.fl_weights[weightOffset];
			this.values[valueOffset + 1] += this.model.fl_weights[weightOffset + 1];
			this.values[valueOffset + 2] += this.model.fl_weights[weightOffset + 2];
			this.values[valueOffset + 3] += this.model.fl_weights[weightOffset + 3];
		} else for (int i = 0; i < this.model.l_size; i++) {
			this.values[valueOffset + i] += this.model.fl_weights[weightOffset + i];
		}
	}

	private Vector<Integer> findBases(int datSize, int ch1, int ch2) {
		Vector<Integer> result = new Vector<>();
		int uniBase;
		int biBase;
		if (ch1 > 32 && ch1 < 128) ch1 += 65248;
		if (ch2 > 32 && ch2 < 128) ch2 += 65248;
		if (ch1 >= datSize || this.dat[(ch1 << 1) + 1] != 0) {
			uniBase = -1;
			biBase = -1;
			result.clear();
			result.add(uniBase);
			result.add(biBase);
			return result;
		}
		uniBase = this.dat[ch1 << 1] + this.separator;
		int ind = this.dat[ch1 << 1] + ch2;
		if (ind >= datSize || this.dat[(ind << 1) + 1] != ch1) {
			biBase = -1;
			result.clear();
			result.add(uniBase);
			result.add(biBase);
			return result;
		}
		biBase = this.dat[ind << 1] + this.separator;
		result.clear();
		result.add(uniBase);
		result.add(biBase);
		return result;
	}

	public int putValues(String sequence, int len) {
		if (len >= this.maxLength) {
			System.err.println("Length larger than maxLength.");
			return 1;
		}

		Vector<Integer> result =
this.findBases(this.datSize, SENTENCE_BOUNDARY, 84 | SENTENCE_BOUNDARY); 85 | this.uniBases[0] = result.get(0); 86 | this.biBases[0] = result.get(1); 87 | 88 | result = this.findBases(this.datSize, SENTENCE_BOUNDARY, sequence.charAt(0)); 89 | this.uniBases[0] = result.get(0); 90 | this.biBases[1] = result.get(1); 91 | for (int i = 0; i + 1 < len; i++) { 92 | result = this.findBases(this.datSize, sequence.charAt(i), 93 | sequence.charAt(i + 1)); 94 | this.uniBases[i + 1] = result.get(0); 95 | this.biBases[i + 2] = result.get(1); 96 | } 97 | 98 | result = this.findBases(this.datSize, (int) sequence.charAt(len - 1), 99 | SENTENCE_BOUNDARY); 100 | this.uniBases[len] = result.get(0); 101 | this.biBases[len + 1] = result.get(1); 102 | 103 | result = this.findBases(this.datSize, SENTENCE_BOUNDARY, SENTENCE_BOUNDARY); 104 | this.uniBases[len + 1] = result.get(0); 105 | this.biBases[len + 2] = result.get(1); 106 | 107 | int base; 108 | for (int i = 0; i < len; i++) { 109 | int valueOffset = i * this.model.l_size; 110 | if ((base = this.uniBases[i + 1]) != -1) { 111 | this.addValues(valueOffset, base, 49); 112 | } 113 | if ((base = this.uniBases[i]) != -1) { 114 | this.addValues(valueOffset, base, 50); 115 | } 116 | if ((base = this.uniBases[i + 2]) != -1) { 117 | this.addValues(valueOffset, base, 51); 118 | } 119 | if ((base = this.biBases[i + 1]) != -1) { 120 | this.addValues(valueOffset, base, 49); 121 | } 122 | if ((base = this.biBases[i + 2]) != -1) { 123 | this.addValues(valueOffset, base, 50); 124 | } 125 | if ((base = this.biBases[i]) != -1) { 126 | this.addValues(valueOffset, base, 51); 127 | } 128 | if ((base = this.biBases[i + 3]) != -1) { 129 | this.addValues(valueOffset, base, 52); 130 | } 131 | } 132 | return 0; 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/cb/CBTaggingDecoder.java: -------------------------------------------------------------------------------- 1 | 
package org.thunlp.thulac.cb; 2 | 3 | import org.thunlp.thulac.data.Dat; 4 | import org.thunlp.thulac.data.POCGraph; 5 | import org.thunlp.thulac.data.TaggedWord; 6 | 7 | import java.io.BufferedReader; 8 | import java.io.FileInputStream; 9 | import java.io.IOException; 10 | import java.io.InputStreamReader; 11 | import java.util.List; 12 | import java.util.Vector; 13 | 14 | public class CBTaggingDecoder { 15 | // TODO: add documentation 16 | 17 | private int maxLength; 18 | private int len; 19 | private String sequence; 20 | private int[][] allowedLabelLists; 21 | private int[][] pocsToTags; 22 | 23 | private CBNGramFeature nGramFeature; 24 | private Dat dat; 25 | 26 | private CBModel model; 27 | 28 | private Node[] nodes; 29 | private int[] values; 30 | private AlphaBeta[] alphas; 31 | private int[] result; 32 | 33 | private String[] labelInfo; 34 | 35 | private int[][] labelTransPre; 36 | private int[][] labelTransPost; 37 | 38 | public int threshold; 39 | 40 | public CBTaggingDecoder() { 41 | this.maxLength = 20000; 42 | this.len = 0; 43 | this.sequence = ""; 44 | this.allowedLabelLists = new int[this.maxLength][]; 45 | 46 | this.pocsToTags = null; 47 | this.nGramFeature = null; 48 | this.dat = null; 49 | this.nodes = new Node[this.maxLength]; 50 | this.labelTransPre = null; 51 | this.labelTransPost = null; 52 | this.threshold = 0; 53 | 54 | this.model = null; 55 | this.alphas = null; 56 | } 57 | 58 | public void loadFiles(String modelFile, String datFile, String labelFile) throws 59 | IOException { 60 | this.model = new CBModel(modelFile); 61 | 62 | this.values = new int[this.maxLength * this.model.l_size]; 63 | this.alphas = new AlphaBeta[this.maxLength * this.model.l_size]; 64 | this.result = new int[this.maxLength * this.model.l_size]; 65 | 66 | for (int i = 0; i < this.maxLength; i++) { 67 | this.nodes[i] = new Node(); 68 | 69 | int[] pre = new int[2]; 70 | pre[0] = i - 1; 71 | pre[1] = -1; 72 | this.nodes[i].predecessors = pre; 73 | 74 | pre = new int[2]; 
			pre[0] = i + 1;
			pre[1] = -1;
			this.nodes[i].successors = pre;
		}

		this.dat = new Dat(datFile);
		this.nGramFeature = new CBNGramFeature(this.dat, this.model, this.values);

		this.labelInfo = new String[10000];
		Vector<Vector<Integer>> pocTags = new Vector<>();
		for (int i = 0; i < 16; i++) pocTags.add(new Vector<>());
		BufferedReader in = new BufferedReader(
				new InputStreamReader(new FileInputStream(labelFile)));
		String line;
		int ind = 0;
		while ((line = in.readLine()) != null) {
			this.labelInfo[ind] = line;
			int segInd = line.charAt(0) - '0';
			for (int j = 0; j < 16; j++)
				if (((1 << segInd) & j) != 0) pocTags.get(j).add(ind);
			ind++;
		}
		in.close();

		this.pocsToTags = new int[16][];
		for (int j = 1; j < 16; j++) {
			this.pocsToTags[j] = new int[pocTags.get(j).size() + 1];
			for (int k = 0; k < pocTags.get(j).size(); k++)
				this.pocsToTags[j][k] = pocTags.get(j).get(k);
			this.pocsToTags[j][pocTags.get(j).size()] = -1;
		}

		int[][] labelLookingFor = new int[this.model.l_size][];
		for (int i = 0; i < this.model.l_size; i++) labelLookingFor[i] = null;
		for (int i = 0; i < this.model.l_size; i++) {
			if ("30".indexOf(this.labelInfo[i].charAt(0)) != -1) continue;
			for (int j = 0; j <= i; j++) {
				if ((this.labelInfo[i].substring(1).equals(
						this.labelInfo[j].substring(1))) && (this.labelInfo[j].charAt(
						0) == '0')) {
					if (labelLookingFor[j] == null) {
						labelLookingFor[j] = new int[2];
						labelLookingFor[j][0] = -1;
						labelLookingFor[j][1] = -1;
					}
					labelLookingFor[j][this.labelInfo[i].charAt(0) - '1'] = i;
					break;
				}
			}
		}


		for (int i = 0; i < this.maxLength; i++) this.allowedLabelLists[i] = null;
	}

	public void dp() {
		if (this.allowedLabelLists[0] == null)
			this.allowedLabelLists[0] = this.pocsToTags[9];
		if
(this.allowedLabelLists[this.len - 1] == null) 134 | this.allowedLabelLists[this.len - 1] = this.pocsToTags[12]; 135 | AlphaBeta.dbDecode(this.model.l_size, this.model.ll_weights, 136 | this.len, this.nodes, this.values, this.alphas, this.result, 137 | this.labelTransPre, this.allowedLabelLists); 138 | this.allowedLabelLists[0] = null; 139 | this.allowedLabelLists[this.len - 1] = null; 140 | } 141 | 142 | public void setLabelTrans() { 143 | int lSize = this.model.l_size; 144 | Vector<Vector<Integer>> preLabels = new Vector<>(); 145 | Vector<Vector<Integer>> postLabels = new Vector<>(); 146 | for (int i = 0; i < lSize; i++) { 147 | preLabels.add(new Vector<>()); 148 | postLabels.add(new Vector<>()); 149 | } 150 | for (int i = 0; i < lSize; i++) { 151 | for (int j = 0; j < lSize; j++) { 152 | int ni = this.labelInfo[i].charAt(0) - '0'; 153 | int nj = this.labelInfo[j].charAt(0) - '0'; 154 | boolean iIsEnd = ((ni == 2) || (ni == 3)); 155 | boolean jIsBegin = ((nj == 0) || (nj == 3)); 156 | boolean sameTag = this.labelInfo[i].substring(1) 157 | .equals(this.labelInfo[j].substring(1)); 158 | if (sameTag) { 159 | if ((ni == 0 && nj == 1) || 160 | (ni == 0 && nj == 2) || 161 | (ni == 1 && nj == 2) || 162 | (ni == 1 && nj == 1) || 163 | (ni == 2 && nj == 0) || 164 | (ni == 2 && nj == 3) || 165 | (ni == 3 && nj == 3) || 166 | (ni == 3 && nj == 0)) { 167 | preLabels.get(j).add(i); 168 | postLabels.get(i).add(j); 169 | } 170 | } else if (iIsEnd && jIsBegin) { 171 | preLabels.get(j).add(i); 172 | postLabels.get(i).add(j); 173 | } 174 | } 175 | } 176 | this.labelTransPre = new int[lSize][]; 177 | for (int i = 0; i < lSize; i++) { 178 | this.labelTransPre[i] = new int[preLabels.get(i).size() + 1]; 179 | for (int j = 0; j < preLabels.get(i).size(); j++) { 180 | this.labelTransPre[i][j] = preLabels.get(i).get(j); 181 | } 182 | this.labelTransPre[i][preLabels.get(i).size()] = -1; 183 | } 184 | 185 | this.labelTransPost = new int[lSize][]; 186 | for (int i = 0; i < lSize; i++) { 187 | this.labelTransPost[i] = new
int[postLabels.get(i).size() + 1]; 188 | for (int j = 0; j < postLabels.get(i).size(); j++) 189 | this.labelTransPost[i][j] = postLabels.get(i).get(j); 190 | this.labelTransPost[i][postLabels.get(i).size()] = -1; 191 | } 192 | } 193 | 194 | public void putValues() { 195 | if (this.len == 0) return; 196 | for (int i = 0; i < this.len; i++) this.nodes[i].type = 0; 197 | this.nodes[0].type += 1; 198 | this.nodes[this.len - 1].type += 2; 199 | 200 | int size = this.len * this.model.l_size; 201 | for (int i = 0; i < size; i++) this.values[i] = 0; 202 | this.nGramFeature.putValues(this.sequence, this.len); 203 | } 204 | 205 | public boolean segment(String raw, POCGraph graph, List<TaggedWord> ts) { 206 | if (raw.length() == 0) return false; 207 | 208 | for (int i = 0; i < raw.length(); i++) 209 | this.allowedLabelLists[i] = this.pocsToTags[ 210 | graph.get(i) == 0 ? 15 : graph.get(i)]; 211 | this.sequence = ""; 212 | for (int i = 0; i < raw.length(); i++) this.sequence += raw.charAt(i); 213 | this.len = raw.length(); 214 | this.putValues(); // compute the feature values and store them in this.values 215 | this.dp(); // DP search for the best answer and store it in result 216 | 217 | for (int i = 0; i < raw.length(); i++) this.allowedLabelLists[i] = null; 218 | int offset = 0; 219 | ts.clear(); 220 | for (int i = 0; i < this.len; i++) { 221 | if ((i == this.len - 1) || (this.labelInfo[this.result[i]].charAt( 222 | 0) == '2') || (this.labelInfo[this.result[i]].charAt(0) == '3')) { 223 | ts.add(new TaggedWord()); 224 | for (int j = offset; j < i + 1; j++) { 225 | ts.get(ts.size() - 1).word += (this.sequence.charAt(j)); 226 | } 227 | offset = i + 1; // output tags 228 | ts.get(ts.size() - 1).tag = this.labelInfo[this.result[i]].substring( 229 | 1); 230 | } 231 | } 232 | return true; 233 | } 234 | } 235 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/cb/Node.java:
-------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.cb; 2 | 3 | /** 4 | * A class which contains topological information of a node. 5 | */ 6 | public class Node { 7 | // TODO: add more documentation 8 | 9 | /** 10 | * Value:
11 | * <ul>
12 | * <li>1: If this {@link Node} is a starting node.</li>
13 | * <li>2: If this {@link Node} is an ending node.</li>
14 | * <li>0: otherwise</li>
15 | * </ul>
16 | */ 17 | public int type; 18 | 19 | public int[] predecessors; // last element should be -1 20 | public int[] successors; // last element should be -1 21 | } 22 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/data/Dat.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.data; 2 | 3 | import org.thunlp.thulac.util.BufferUtils; 4 | import org.thunlp.thulac.util.StringUtils; 5 | 6 | import java.io.IOException; 7 | import java.nio.ByteBuffer; 8 | import java.nio.ByteOrder; 9 | import java.nio.channels.SeekableByteChannel; 10 | import java.nio.file.Files; 11 | import java.nio.file.Paths; 12 | 13 | /** 14 | * A class which loads data files from disk and provides necessary operations. Instances 15 | * are created with the {@link #Dat(String)} constructor, which reads from a file, or 16 | * with {@link DatMaker#readFromTxtFile(String)}, which constructs a {@code Dat} 17 | * structure with the user-specified dictionary.
18 | * Internally, {@code Dat} uses the two-array Trie Tree to store information that can 19 | * be searched through at high speed, (sometimes) even faster than using 20 | * {@link java.util.HashMap}. 21 | */ 22 | public class Dat { 23 | /** 24 | * The two-array Trie Tree, use {@code dat[i << 1]} to access {@code base[i]} and 25 | * {@code dat[(i << 1) + 1]} to access {@code check[i]}. 26 | */ 27 | public int[] dat; 28 | /** 29 | * The size of the Trie Tree, should be {@code this.dat.length / 2}. 30 | */ 31 | public int datSize; 32 | 33 | protected Dat(int size) { 34 | this.dat = new int[size << 1]; 35 | this.datSize = size; 36 | } 37 | 38 | /** 39 | * Reads a {@link Dat} from a given file. 40 | * 41 | * @param filename 42 | * The name of the {@link Dat} file. 43 | * 44 | * @throws IOException 45 | * If an I/O error occurred while reading the file. 46 | */ 47 | public Dat(String filename) throws IOException { 48 | SeekableByteChannel channel = Files.newByteChannel(Paths.get(filename)); 49 | // DWORD base + DWORD check -> 8 bytes per record 50 | this.datSize = (int) (channel.size() >> 3); 51 | this.dat = new int[this.datSize << 1]; 52 | // strangely enough, dat files are stored in little-endian order 53 | ByteBuffer bb = ByteBuffer.allocateDirect(64 * 1024) 54 | .order(ByteOrder.LITTLE_ENDIAN); 55 | bb.clear(); 56 | if (!BufferUtils.readInts(channel, bb, this.dat)) 57 | throw new IOException("File does not contain enough data!"); 58 | channel.close(); 59 | } 60 | 61 | // if word is in dat, return the leaf element, otherwise return -1 62 | private int match(String word) { 63 | int ind = 0; 64 | int base = 0; 65 | int[] codePoints = StringUtils.toCodePoints(word); 66 | for (int c : codePoints) { 67 | ind = this.dat[ind << 1] + c; 68 | if (ind >= this.datSize || this.dat[(ind << 1) + 1] != base) return -1; 69 | base = ind; 70 | } 71 | ind = this.dat[base << 1]; 72 | return ind < this.datSize && this.dat[(ind << 1) + 1] == base ?
ind : -1; 73 | } 74 | 75 | // if prefix is in dat, return -base; otherwise return the length of the longest prefix of the query that is in dat 76 | public int getInfo(String prefix) { 77 | int ind = 0; 78 | int base = 0; 79 | for (int i = 0; i < prefix.length(); i++) { 80 | ind = this.dat[ind << 1] + prefix.charAt(i); 81 | if (ind >= this.datSize || this.dat[(ind << 1) + 1] != base) return i; 82 | base = ind; 83 | } 84 | return -base; 85 | } 86 | 87 | /** 88 | * Returns whether this {@link Dat} contains one or more words that begin with 89 | * {@code prefix}. 90 | * 91 | * @param prefix 92 | * The query prefix. 93 | * 94 | * @return Whether this {@link Dat} contains one or more words that begin with 95 | * {@code prefix}. 96 | */ 97 | public boolean containsPrefix(String prefix) { 98 | return getInfo(prefix) < 0; 99 | } 100 | 101 | /** 102 | * Returns whether this {@link Dat} contains the given word. 103 | * 104 | * @param word 105 | * The query word. 106 | * 107 | * @return Whether this {@link Dat} contains {@code word}. 108 | */ 109 | public boolean contains(String word) { 110 | return this.match(word) != -1; 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/data/DatMaker.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.data; 2 | 3 | import java.io.*; 4 | import java.util.ArrayList; 5 | import java.util.Comparator; 6 | import java.util.List; 7 | import java.util.Vector; 8 | 9 | /** 10 | * A class used to construct instances of {@link Dat} from user-specified dictionary 11 | * files. It extends {@link Dat} to avoid unnecessary array copies and to increase 12 | * performance.
13 | * A confusing algorithm is used to construct the two-array Trie Tree used by 14 | * {@link Dat}, see in-line comments for more information. 15 | */ 16 | public class DatMaker extends Dat { 17 | // a record of a word with a related integer 18 | private static class Record { 19 | public String word; 20 | public int num; 21 | 22 | public Record() { 23 | this("", 0); 24 | } 25 | 26 | public Record(String key, int value) { 27 | this.word = key; 28 | this.num = value; 29 | } 30 | } 31 | 32 | // pairs of Records are compared by comparing their words 33 | private static Comparator<Record> RECORDS_COMPARATOR = 34 | new Comparator<Record>() { 35 | @Override 36 | public int compare(Record a, Record b) { 37 | return a.word.compareTo(b.word); 38 | } 39 | }; 40 | 41 | /** 42 | * Reads (or more precisely, constructs) an instance of {@link Dat} from the given 43 | * {@link InputStream}. This is used to generate {@link Dat} from a user-specified 44 | * dictionary, which consists of multiple lines, each one representing a word in the 45 | * dictionary. 46 | * 47 | * @param in 48 | * The {@link InputStream} to read. 49 | * 50 | * @return The generated {@link Dat}. 51 | * 52 | * @throws IOException 53 | * If an I/O error happens. 54 | */ 55 | public static Dat readFromInputStream(InputStream in) throws IOException { 56 | List<String> words = new ArrayList<>(); 57 | BufferedReader reader = new BufferedReader(new InputStreamReader(in)); 58 | String str; 59 | while ((str = reader.readLine()) != null) words.add(str); 60 | reader.close(); 61 | 62 | DatMaker dat = new DatMaker(); 63 | dat.buildDat(words); 64 | return dat; 65 | } 66 | 67 | /** 68 | * Reads (or more precisely, constructs) an instance of {@link Dat} from the given 69 | * file. This is used to generate {@link Dat} from a user-specified dictionary, 70 | * which consists of multiple lines, each one representing a word in the dictionary. 71 | * 72 | * @param filename 73 | * The name of the file. 74 | * 75 | * @return The generated {@link Dat}.
76 | * 77 | * @throws IOException 78 | * If the given file does not exist or is not readable. 79 | */ 80 | public static Dat readFromTxtFile(String filename) throws IOException { 81 | return readFromInputStream(new FileInputStream(filename)); 82 | } 83 | 84 | // The main idea of this ingenious algorithm that generates a Dat instance from the 85 | // input string is that it makes use of the unused space of the original double-array 86 | // Trie Tree to store a double-linked list. This means that it is fully 87 | // compatible with the standard double-array Trie Tree data structure. What's more, 88 | // this algorithm achieves its goal without extra storage space, except for the head 89 | // and tail fields. But these only require O(1) space, so they can be safely ignored. 90 | 91 | // this.dat, the only storage block used by this algorithm, is an 92 | // array of ELEMENTS. An ELEMENT contains two values, called BASE and CHECK, both 93 | // integers. this.dat is structured in this way: 94 | // ELEMENTS[0].BASE, ELEMENTS[0].CHECK, ELEMENTS[1].BASE, ELEMENTS[1].CHECK, ... 95 | // this.datSize is the total number of ELEMENTS, so 96 | // this.dat.length = 2 * this.datSize. 97 | // In the following parts, BASE and CHECK will be referred to as the 98 | // FIELDS of an ELEMENT, for example, "the BASE FIELD of ELEMENT[4]". 99 | 100 | // The program distinguishes the two different data structures stored in this.dat by 101 | // the sign of the ELEMENTS' FIELDS. 102 | // ELEMENTS whose CHECK and BASE FIELDS are positive belong to the double-array Trie 103 | // Tree, while those whose CHECK and BASE FIELDS are negative belong to the 104 | // double-linked list. When an ELEMENT belongs to the Trie Tree, we call it USED. 105 | // Otherwise, we call it UNUSED. 106 | 107 | // Here the specific data structures are explained. 108 | // The data structure of the Trie Tree: 109 | // FIELDS of USED ELEMENTS strictly follow the definitions of the double-array Trie 110 | // Tree.
(If unfamiliar, consult Google.) For the current stage S and input 111 | // character C, we have: 112 | // ELEMENTS[ELEMENTS[S].BASE + C].CHECK = S 113 | // ELEMENTS[S].BASE + C = T 114 | // where T is the next stage the DFA (Deterministic Finite Automaton) described by 115 | // the Trie Tree should jump to. 116 | 117 | // The data structure of the double-linked list: 118 | // In a double-linked list there are multiple NODES, each containing two 119 | // pointers PREV and NEXT. In accordance with the c-style arrow (->) operator, this 120 | // list conforms to the following equations: 121 | // NODE->NEXT->PREV = NODE 122 | // NODE->PREV->NEXT = NODE 123 | // In this implementation, pointers take the negative of the values of the indices of 124 | // the NODES they point to. The PREV pointer is stored in the BASE field, and the 125 | // NEXT pointer in the CHECK field. We have, 126 | // -ELEMENTS[ -ELEMENTS[i].CHECK ].BASE = i 127 | // -ELEMENTS[ -ELEMENTS[i].BASE ].CHECK = i 128 | // The negative signs appear because fields of ELEMENTs in the double-linked list 129 | // are negative. 130 | 131 | // The pointers to the HEAD NODE and the TAIL NODE are stored in this.head and 132 | // this.tail, respectively. -this.head is the index of the first NODE in the 133 | // double-linked list, and -this.tail is the index of the last NODE. 134 | 135 | // After so many explanations of the data structure, we finally come to the 136 | // actual behavior of this algorithm. 137 | // The buildDat() method takes a list of strings as input and sorts them in 138 | // alphabetical order. Afterwards, findChildren() breaks strings - char sequences - 139 | // into a tree of characters, as described in the Trie Tree. 140 | // Since the Trie Tree is a representation of a DFA (Deterministic Finite 141 | // Automaton), a stage has to be generated for each node in the tree. Such a stage, 142 | // stored as an ELEMENT, has the BASE and CHECK FIELDS.
The CHECK FIELD of an ELEMENT 143 | // is assigned when its parent stage is generated. The assignment of the value in the 144 | // BASE FIELD is implemented in allocate() and described below: 145 | 146 | // 1. Set variable BASE to -this.head. 147 | // 2. Determine whether BASE is available. (If all ELEMENTS[BASE + C] are UNUSED 148 | // for every C of the child nodes of the current one) 149 | // 3. If BASE is available, return BASE; otherwise, set BASE to the next UNUSED 150 | // ELEMENT, using the double-linked list. 151 | // In this process, if no available BASE is found, the size of this.dat is doubled 152 | // through the expandDat() method, which also maintains the double-linked list in 153 | // the newly allocated ELEMENTS. 154 | 155 | // After an available BASE has been found for the current stage, markAsUsed() 156 | // is called with BASE and all BASE + C, updating the double-linked list. 157 | 158 | // Afterwards, populate() is called. It sets ELEMENTS[BASE + C].CHECK to S 159 | // for all C in the child nodes and sets ELEMENTS[S].BASE to BASE. ELEMENTS[BASE] 160 | // .CHECK is set to S if stage BASE can be the end of a word; otherwise, it is set 161 | // to BASE. For each word in the lexicon, its corresponding leaf node in the 162 | // Trie Tree will have its BASE field set to the line number of the word. (Remember 163 | // that the user-specified dictionary consists of multiple lines, each one 164 | // representing a word in the dictionary.) 165 | 166 | // Finally, method packDat() is invoked to minimize the size of this.dat and reduce 167 | // memory usage.
168 | 169 | private int head, tail; 170 | 171 | private DatMaker() { 172 | super(1); 173 | 174 | // initialize the double-linked list: head = 0, next = 1 175 | this.dat[0] = this.dat[1] = -1; 176 | this.head = this.tail = 0; 177 | } 178 | 179 | // mark element as used by modifying the double-linked list 180 | private void markAsUsed(int index) { 181 | // -base -> the previous element, -check -> the next element 182 | int base = this.dat[index << 1], check = this.dat[(index << 1) + 1]; 183 | 184 | // if the element is already USED, throw an exception 185 | if (check >= 0) throw new RuntimeException("Cell reused! Index: " + index); 186 | 187 | // maintain the double-linked list 188 | if (base == -1) this.head = check; 189 | else this.dat[((-base) << 1) + 1] = check; 190 | if (check == -this.datSize) this.tail = base; 191 | else this.dat[(-check) << 1] = base; 192 | 193 | this.dat[(index << 1) + 1] = index; // positive check: element used 194 | } 195 | 196 | // expand size of this.dat 197 | private void expandDat() { 198 | int oldSize = this.datSize; 199 | 200 | // alloc & copy 201 | this.datSize *= 2; 202 | int[] newDat = new int[this.dat.length << 1]; 203 | System.arraycopy(this.dat, 0, newDat, 0, this.dat.length); 204 | this.dat = newDat; 205 | 206 | // expand the double-linked list 207 | for (int i = 0; i < oldSize; i++) { 208 | int pos = (oldSize + i) << 1; 209 | newDat[pos] = -(oldSize + i - 1); 210 | newDat[pos + 1] = -(oldSize + i + 1); 211 | } 212 | this.dat[oldSize << 1] = this.tail; 213 | this.dat[((-this.tail) << 1) + 1] = -oldSize; 214 | this.tail = -(oldSize * 2 - 1); // set tail to the last element 215 | } 216 | 217 | // remove unused elements to save memory 218 | private void packDat() { 219 | // calculate minimum size 220 | int last = this.datSize - 1; 221 | for (; this.dat[(last << 1) + 1] < 0; --last) ; 222 | this.datSize = last + 1; 223 | 224 | // truncate this.dat 225 | int[] newDat = new int[this.datSize << 1]; 226 |
System.arraycopy(this.dat, 0, newDat, 0, this.datSize << 1); 227 | this.dat = newDat; 228 | } 229 | 230 | // allocate elements according to offsets and return BASE 231 | private int allocate(List<Integer> offsets) { 232 | int size = offsets.size(); 233 | int base = -this.head; // initialized to the head of the double-linked list 234 | while (true) { 235 | // expand this.dat as needed 236 | if (base == this.datSize) this.expandDat(); 237 | if (size != 0) { 238 | // sorted, offsets.get(size - 1) is the greatest 239 | int requiredSize = base + offsets.get(size - 1); 240 | while (requiredSize >= this.datSize) this.expandDat(); 241 | } 242 | 243 | boolean available = true; // check availability 244 | if (this.dat[(base << 1) + 1] >= 0) available = false; // ELEMENTS[BASE] USED 245 | else { 246 | // if any ELEMENTS[BASE + C] is USED, available = false 247 | int i = 0; 248 | for (; i < size && this.dat[(base + offsets.get(i) << 1) + 1] < 0; i++) ; 249 | if (i < size) available = false; 250 | } 251 | 252 | if (available) { // if BASE is available, update double-linked list 253 | this.markAsUsed(base); 254 | for (int offset : offsets) this.markAsUsed(base + offset); 255 | 256 | return base; 257 | } 258 | 259 | // find next BASE to check availability 260 | int newBase = -this.dat[(base << 1) + 1]; 261 | if (newBase == this.datSize) this.expandDat(); // ensure capacity 262 | base = newBase; 263 | } 264 | } 265 | 266 | // find characters in the lexicon which might follow the prefix 267 | private List<Integer> findChildren(List<Record> lexicon, int start, String prefix) { 268 | List<Integer> children = new ArrayList<>(); 269 | int length = prefix.length(), currentChild = -1; 270 | for (int i = start, size = lexicon.size(); i < size; ++i) { 271 | String word = lexicon.get(i).word; 272 | if (!word.startsWith(prefix)) return children; 273 | if (word.length() == length) continue; 274 | int nextCh = word.charAt(length); 275 | if (nextCh != currentChild) children.add(currentChild = nextCh); 276 | } 277 | return children;
278 | } 279 | 280 | // populate BASE and CHECK FIELDS of allocated BASE and BASE + C 281 | // @param isWord Whether the end of a word has been reached. 282 | private int populate(int check, List<Integer> offsets, boolean isWord) { 283 | int base = this.allocate(offsets); 284 | 285 | this.dat[base << 1] = 0; 286 | this.dat[(base << 1) + 1] = isWord ? check : base; 287 | 288 | for (int offset : offsets) { // update Trie Tree 289 | int pos = base + offset << 1; 290 | this.dat[pos] = 0; 291 | this.dat[pos + 1] = check; // ELEMENTS[ELEMENTS[S].BASE + C].CHECK = S 292 | } 293 | this.dat[check << 1] = base; // ELEMENTS[CHECK].BASE = BASE 294 | 295 | return base; 296 | } 297 | 298 | // build the Dat structure with a word list as input 299 | private void buildDat(List<String> words) { 300 | // construct lexicon 301 | Vector<Record> lexicon = new Vector<>(); 302 | lexicon.add(new Record()); 303 | for (int i = 0, size = words.size(); i < size; ++i) 304 | lexicon.add(new Record(words.get(i), i)); 305 | lexicon.sort(RECORDS_COMPARATOR); // sort input 306 | 307 | // root elements 308 | this.dat[0] = this.populate(0, this.findChildren(lexicon, 0, ""), true); 309 | 310 | for (int i = 0, size = lexicon.size(); i < size; i++) { 311 | String word = lexicon.get(i).word; 312 | 313 | int off = this.getInfo(word); 314 | if (off <= 0) off = word.length(); // if dat already contains word 315 | 316 | // iterate through characters after offset and add new entries 317 | for (int offset = off; offset <= word.length(); offset++) { 318 | String prefix = word.substring(0, offset); 319 | int pBase = -this.getInfo(prefix); // should always be positive 320 | this.populate(pBase, this.findChildren(lexicon, i, prefix), 321 | offset == word.length()); // on word end 322 | } 323 | 324 | off = -this.getInfo(word); // should always be positive 325 | this.dat[this.dat[off << 1] << 1] = lexicon.get(i).num; // leaf node value 326 | } 327 | 328 | this.packDat(); 329 | } 330 | }
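The base/check conventions that Dat.match walks and that the DatMaker comments describe can be seen in miniature with a hand-built table. The sketch below is illustrative only: the class name TinyDat, its two-symbol toy alphabet ('a' mapped to code 1, 'b' to code 2), and the hand-encoded 8-state table are invented for this example and are not part of the repository. It mirrors the interleaved layout (dat[i << 1] is base[i], dat[(i << 1) + 1] is check[i]) and the lookup loops of Dat.match and Dat.getInfo:

```java
// TinyDat: a hand-built double-array trie encoding the two words
// "a" (value 1) and "ab" (value 2). Unused check cells hold -1.
public class TinyDat {
    static final int[] DAT = {
            2, -1,  // state 0 (root): base = 2
            0, -1,  // state 1: unused
            0, -1,  // state 2: unused
            4,  0,  // state 3 = "a": child of root (check = 0)
            1,  3,  // state 4: leaf of "a"; base holds the word value 1
            0, -1,  // state 5: unused
            7,  3,  // state 6 = "ab": child of state 3
            2,  6,  // state 7: leaf of "ab"; base holds the word value 2
    };
    static final int SIZE = DAT.length >> 1;

    // Mirrors Dat.match: step state' = base[state] + c and verify
    // check[state'] == state; at the end, follow the leaf transition
    // and return the stored word value, or -1 if the word is absent.
    public static int match(int[] word) {
        int state = 0;
        for (int c : word) {
            int next = DAT[state << 1] + c;
            if (next >= SIZE || DAT[(next << 1) + 1] != state) return -1;
            state = next;
        }
        int leaf = DAT[state << 1];
        if (leaf < SIZE && DAT[(leaf << 1) + 1] == state) return DAT[leaf << 1];
        return -1;
    }

    // Mirrors the traversal part of Dat.getInfo: true iff every
    // character of the prefix has a valid transition.
    public static boolean containsPrefix(int[] prefix) {
        int state = 0;
        for (int c : prefix) {
            int next = DAT[state << 1] + c;
            if (next >= SIZE || DAT[(next << 1) + 1] != state) return false;
            state = next;
        }
        return true;
    }

    public static void main(String[] args) {
        System.out.println(match(new int[]{1}));        // "a"  -> prints 1
        System.out.println(match(new int[]{1, 2}));     // "ab" -> prints 2
        System.out.println(match(new int[]{2}));        // "b" is not a word -> prints -1
        System.out.println(containsPrefix(new int[]{1})); // prints true
    }
}
```

Note how the leaf convention matches buildDat: a word-end state points via its base to an extra leaf element whose check points back at the state, and the leaf's base carries the word's line number, exactly what `this.dat[this.dat[off << 1] << 1] = lexicon.get(i).num;` writes.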
-------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/data/POCGraph.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.data; 2 | 3 | import java.util.Vector; 4 | 5 | /** 6 | * POC means Position Of Character, representing the possible positions 7 | * of a character in the segmented words.
8 | * {@code POCGraph} is a list of integers which possesses a length of {@code l} when 9 | * generated by processing a string of length {@code l}, therefore we get:
10 | * Let {@code graph} be an instance of {@code POCGraph}, and {@code l} be the length of 11 | * the graph. (retrieved by calling {@code graph.size()})
12 | * {@code graph.get(i)} ({@code 0 <= i < l}) is an integer calculated by bitwise 13 | * or-ing zero or more of the following constants:
14 | * <ul>
15 | * <li>POC_B = 0x01: included if the character can be the beginning of a word.</li>
16 | * <li>POC_M = 0x02: included if the character can be the middle of a word.</li>
17 | * <li>POC_E = 0x04: included if the character can be the end of a word.</li>
18 | * <li>POC_S = 0x08: included if the character can be exactly one single word.</li>
19 | * </ul>
20 | * As pseudo-code:
21 | * <pre>
22 |  * int i = <index>;
23 |  * boolean canBeBeginning = input.canBeBeginning(i);
24 |  * boolean canBeMiddle    = input.canBeMiddle(i);
25 |  * boolean canBeEnd       = input.canBeEnd(i);
26 |  * boolean canBeSingle    = input.canBeSingle(i);
27 |  * int positions = (canBeBeginning ? POC_B : 0) |
28 |  *                 (canBeMiddle    ? POC_M : 0) |
29 |  *                 (canBeEnd       ? POC_E : 0) |
30 |  *                 (canBeSingle    ? POC_S : 0);
31 |  * graph[i] = positions;
32 |  * </pre>
33 | * Note that the {@code POC_M} flag does not conflict with the other flags, e.g., a 34 | * {@code position} of {@code POC_M | POC_B} means that the character can either be the 35 | * middle or the beginning of a word. This also applies to {@code POC_S}, which 36 | * indicates that the character can form a single-character word.
37 | * The generation of {@code POCGraph} is mainly based on punctuations and line breaks, 38 | * but in various implementations also on characters that would certainly not be a part 39 | * of a word, such as whitespaces or numbers.
40 | * This class is merely an alias for {@linkplain Vector Vector&lt;Integer&gt;}, 41 | * indicating that instances of this class are used only as the list of {@code POCs}; 42 | * no more behaviour is added. 43 | */ 44 | public class POCGraph extends Vector<Integer> { 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/data/TaggedWord.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.data; 2 | 3 | /** 4 | * A class which represents a tagged word, that is, a word with a tag. 5 | */ 6 | public class TaggedWord { 7 | public String word; 8 | public String tag; 9 | 10 | public TaggedWord() { 11 | this.word = ""; 12 | } 13 | 14 | public TaggedWord(String word, String tag) { 15 | this.word = word; 16 | this.tag = tag; 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/io/IInputProvider.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.io; 2 | 3 | import org.thunlp.thulac.Thulac; 4 | 5 | import java.io.IOException; 6 | import java.util.List; 7 | 8 | /** 9 | * An interface used to provide input for {@link Thulac}. Implementations of this 10 | * interface should contain their own context, since {@link #provideInput()} does not 11 | * pass any kind of parameter. It is recommended that implementations read input from a 12 | * stream, e.g., from a file or from the console ({@code System.in}). 13 | */ 14 | public interface IInputProvider extends IProgramStateListener { 15 | /** 16 | * Provide a {@link List} of {@link String} which contains the input for the 17 | * segmentation program to process. By contract, the return value of this method, 18 | * joined with whitespaces (U+0020) should logically represent a line from the input, 19 | * though this is not compulsory.
A {@code null} return value will be regarded as 20 | * an EOF and the program will terminate. A {@link List} is used because it is 21 | * recommended to split an enormous line into separate line segments based on the 22 | * punctuations. 23 | * 24 | * @return The input to the segmentation program. 25 | */ 26 | List<String> provideInput() throws IOException; 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/io/IOutputHandler.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.io; 2 | 3 | import org.thunlp.thulac.data.TaggedWord; 4 | 5 | import java.io.IOException; 6 | import java.util.List; 7 | 8 | /** 9 | * An interface used to handle the output from the segmentation program. The whole 10 | * handling process is based on lines, though extending 11 | * {@link IProgramStateListener} allows it to listen to the starting and termination 12 | * events of the program; therefore implementations should also concentrate on lines. 13 | */ 14 | public interface IOutputHandler extends IProgramStateListener { 15 | /** 16 | * Handles the {@link List} of {@link TaggedWord} generated by the segmentation 17 | * program. Since one input line might be split into multiple line segments, 18 | * this method might be invoked several times between a pair of 19 | * {@link #handleLineStart()} and {@link #handleLineEnd()}. Traditionally, the 20 | * {@code words} arguments of all the invocations of this method between a pair of 21 | * {@link #handleLineStart()} and {@link #handleLineEnd()} come from the same line of 22 | * input, and the output handler should output to the same line as well; however 23 | * this is not compulsory. 24 | * 25 | * @param words 26 | * The {@link List} of {@link TaggedWord} generated by processing one line segment. 27 | * @param segOnly 28 | * Whether to output without tags.
29 | * @param separator 30 | * The separator between output words and tags. 31 | */ 32 | void handleLineSegment(List<TaggedWord> words, boolean segOnly, char separator) 33 | throws IOException; 34 | 35 | /** 36 | * Called when an input line is obtained from {@link IInputProvider} and the 37 | * segmentation program is about to begin breaking the line into segments. This 38 | * method is basically for initializations, e.g., creating a new line, etc.
39 | * This method is invoked before {@link #handleLineSegment(List, boolean, char)}. 40 | */ 41 | void handleLineStart() throws IOException; 42 | 43 | /** 44 | * Called when segmentation of an input line is finished and the segmentation 45 | * program is about to begin processing the next line. This method is basically for 46 | * finalisation, e.g., flushing input of this line, etc.
47 | * This method is invoked after {@link #handleLineSegment(List, boolean, char)}. 48 | */ 49 | void handleLineEnd() throws IOException; 50 | } 51 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/io/IProgramStateListener.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.io; 2 | 3 | /** 4 | * An interface used to listen to the starting and termination events of the 5 | * segmentation program. 6 | */ 7 | public interface IProgramStateListener { 8 | /** 9 | * Called when the segmentation program starts. 10 | */ 11 | void onProgramStart(); 12 | 13 | /** 14 | * Called when the segmentation program terminates. (in finally block) 15 | */ 16 | void onProgramEnd(); 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/io/ReaderInputProvider.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.io; 2 | 3 | import org.thunlp.thulac.util.IOUtils; 4 | 5 | import java.io.BufferedReader; 6 | import java.io.IOException; 7 | import java.util.List; 8 | 9 | /** 10 | * An implementation of {@link IInputProvider} which retrieves input from a 11 | * {@link BufferedReader}. 
12 | */ 13 | public class ReaderInputProvider implements IInputProvider { 14 | private BufferedReader reader; 15 | 16 | public ReaderInputProvider(BufferedReader reader) { 17 | // reader must be non-null 18 | if (reader == null) throw new IllegalArgumentException("reader == null!"); 19 | this.reader = reader; 20 | } 21 | 22 | @Override 23 | public List<String> provideInput() throws IOException { 24 | String line = this.reader.readLine(); 25 | if (line == null) return null; 26 | return IOUtils.getLineSegments(line); 27 | } 28 | 29 | @Override 30 | public void onProgramStart() { 31 | } 32 | 33 | @Override 34 | public void onProgramEnd() { 35 | try { 36 | this.reader.close(); // release system resources 37 | } catch (IOException e) { 38 | e.printStackTrace(); 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/io/StringInputProvider.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.io; 2 | 3 | import org.thunlp.thulac.util.IOUtils; 4 | 5 | import java.io.IOException; 6 | import java.util.List; 7 | 8 | /** 9 | * An implementation of {@link IInputProvider} which retrieves input from a {@link 10 | * String}.
11 | */ 12 | public class StringInputProvider implements IInputProvider { 13 | private String[] lines; 14 | private int pointer; 15 | 16 | public StringInputProvider(String input) { 17 | // input must be non-null 18 | if (input == null) throw new IllegalArgumentException("input == null!"); 19 | this.lines = input.split("\n"); // empty lines are discarded 20 | this.pointer = 0; 21 | } 22 | 23 | @Override 24 | public void onProgramStart() { 25 | } 26 | 27 | @Override 28 | public void onProgramEnd() { 29 | } 30 | 31 | @Override 32 | public List<String> provideInput() throws IOException { 33 | if (this.pointer == this.lines.length) return null; 34 | return IOUtils.getLineSegments(this.lines[pointer++]); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/io/StringOutputHandler.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.io; 2 | 3 | import org.thunlp.thulac.data.TaggedWord; 4 | 5 | import java.io.IOException; 6 | import java.util.List; 7 | 8 | /** 9 | * An implementation of {@link IOutputHandler} to allow access to the output in form of 10 | * {@link String}.
11 | */ 12 | public class StringOutputHandler implements IOutputHandler { 13 | private StringBuilder str; 14 | 15 | public StringOutputHandler() { 16 | this.str = new StringBuilder(); 17 | } 18 | 19 | @Override 20 | public void onProgramStart() { 21 | } 22 | 23 | @Override 24 | public void onProgramEnd() { 25 | } 26 | 27 | @Override 28 | public void handleLineSegment(List<TaggedWord> words, 29 | boolean segOnly, char separator) { 30 | if (segOnly) { 31 | for (TaggedWord word : words) { 32 | this.str.append(word.word); 33 | this.str.append(' '); 34 | } 35 | } else { 36 | for (TaggedWord word : words) { 37 | this.str.append(word.word); 38 | this.str.append(separator); 39 | this.str.append(word.tag); 40 | this.str.append(' '); 41 | } 42 | } 43 | } 44 | 45 | @Override 46 | public void handleLineStart() throws IOException { 47 | } 48 | 49 | @Override 50 | public void handleLineEnd() throws IOException { 51 | this.str.append("\n"); 52 | } 53 | 54 | public String getString() { 55 | return this.str.toString(); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/io/WriterOutputHandler.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.io; 2 | 3 | import org.thunlp.thulac.data.TaggedWord; 4 | 5 | import java.io.BufferedWriter; 6 | import java.io.IOException; 7 | import java.util.List; 8 | 9 | /** 10 | * An implementation of {@link IOutputHandler} which writes output to a {@link 11 | * BufferedWriter}.
12 | */ 13 | public class WriterOutputHandler implements IOutputHandler { 14 | private BufferedWriter writer; 15 | private StringBuilder sb; 16 | 17 | public WriterOutputHandler(BufferedWriter writer) { 18 | // writer must be non-null 19 | if (writer == null) throw new IllegalArgumentException("writer == null!"); 20 | this.writer = writer; 21 | this.sb = new StringBuilder(); 22 | } 23 | 24 | @Override 25 | public void handleLineSegment(List<TaggedWord> words, boolean segOnly, char separator) 26 | throws IOException { 27 | if (segOnly) { 28 | for (TaggedWord word : words) { 29 | this.sb.append(word.word); 30 | this.sb.append(' '); 31 | } 32 | } else { 33 | for (TaggedWord word : words) { 34 | this.sb.append(word.word); 35 | this.sb.append(separator); 36 | this.sb.append(word.tag); 37 | this.sb.append(' '); 38 | } 39 | } 40 | } 41 | 42 | @Override 43 | public void handleLineStart() throws IOException { 44 | this.sb.setLength(0); 45 | } 46 | 47 | @Override 48 | public void handleLineEnd() throws IOException { 49 | this.sb.append("\n"); 50 | this.writer.write(this.sb.toString()); 51 | } 52 | 53 | @Override 54 | public void onProgramStart() { 55 | } 56 | 57 | @Override 58 | public void onProgramEnd() { 59 | try { 60 | this.writer.close(); // release system resources 61 | } catch (IOException e) { 62 | e.printStackTrace(); 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/main/Main.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.main; 2 | 3 | import org.thunlp.thulac.Thulac; 4 | import org.thunlp.thulac.io.IInputProvider; 5 | import org.thunlp.thulac.io.IOutputHandler; 6 | import org.thunlp.thulac.util.IOUtils; 7 | 8 | import java.io.IOException; 9 | 10 | /** 11 | * The program entrance which deals with command line arguments.
12 | */ 13 | public class Main { 14 | public static void main(String[] args) throws IOException { 15 | String modelDir = "models/"; 16 | char separator = '_'; 17 | String userDict = null; 18 | boolean useT2S = false; 19 | boolean segOnly = false; 20 | boolean useFilter = false; 21 | IInputProvider input = null; 22 | IOutputHandler output = null; 23 | 24 | for (int c = 0; c < args.length; ++c) 25 | switch (args[c]) { 26 | case "-t2s": 27 | useT2S = true; 28 | break; 29 | case "-user": 30 | userDict = args[++c]; 31 | break; 32 | case "-deli": 33 | separator = args[++c].charAt(0); 34 | break; 35 | case "-seg_only": 36 | segOnly = true; 37 | break; 38 | case "-filter": 39 | useFilter = true; 40 | break; 41 | case "-model_dir": 42 | modelDir = args[++c]; 43 | if (modelDir.charAt(modelDir.length() - 1) != '/') 44 | modelDir += '/'; 45 | break; 46 | case "-input": 47 | input = IOUtils.inputFromFile(args[++c]); // use UTF-8 48 | break; 49 | case "-output": 50 | output = IOUtils.outputToFile(args[++c]); // use UTF-8 51 | break; 52 | } 53 | if (input == null) input = IOUtils.inputFromConsole(); 54 | if (output == null) output = IOUtils.outputToConsole(); 55 | 56 | Thulac.split(modelDir, separator, userDict, useT2S, segOnly, useFilter, 57 | input, output); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/postprocess/DictionaryPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.postprocess; 2 | 3 | import org.thunlp.thulac.data.Dat; 4 | import org.thunlp.thulac.data.DatMaker; 5 | import org.thunlp.thulac.data.TaggedWord; 6 | 7 | import java.io.IOException; 8 | import java.util.List; 9 | 10 | /** 11 | * A postprocess pass which scans the word list, extracts words that are found in the 12 | * dictionary and tags them.
13 | * To show its behavior more clearly, consider the following example:<br> 14 | * Assume that the input {@code sentence} is {@code "A", "B", "C", "DE"}, and the word 15 | * list specified by {@link #dictionary} is {@code "AB", "ABC", "ABCD"}.<br> 16 | * The {@link #process(List)} method finds the longest concatenation of words 17 | * in the word list which exists in the dictionary and combines these words into one 18 | * single {@link TaggedWord}.<br> 19 | * So, for this example, the concatenations of words in the list beginning from 20 | * index 0 would be: {@code "A", "AB", "ABC", "ABCDE"}, of which only {@code "AB"} and 21 | * {@code "ABC"} are present in {@link #dictionary}.
22 | * In this case, the longest concatenation would be {@code "ABC"} and therefore the 23 | * words {@code "A", "B", "C"} are removed and one single word {@code "ABC"} is added 24 | * to the word list, which makes the final output from {@link #process(List)} {@code 25 | * "ABC", "DE"}.
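The longest-match merge described above can be sketched without the Dat machinery. In this sketch a plain `java.util.Set` stands in for {@link #dictionary}, and the names `DictMergeSketch` / `mergeLongest` are illustrative only; the real pass additionally uses `Dat.containsPrefix` to cut the inner scan short and attaches a tag to the merged word:

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class DictMergeSketch {
    // Merges, at each position, the longest run of adjacent words whose
    // concatenation is found in dict (a Set stands in for the Dat here).
    static List<String> mergeLongest(List<String> words, Set<String> dict) {
        List<String> out = new ArrayList<>(words);
        for (int i = 0; i < out.size(); i++) {
            StringBuilder sb = new StringBuilder();
            int longestEnd = -1;
            for (int j = i; j < out.size(); j++) {
                sb.append(out.get(j));
                if (dict.contains(sb.toString())) longestEnd = j;
            }
            if (longestEnd < 0) continue;
            // collapse words i..longestEnd into a single entry
            StringBuilder merged = new StringBuilder();
            for (int j = longestEnd; j >= i; j--) merged.insert(0, out.remove(j));
            out.add(i, merged.toString());
        }
        return out;
    }

    public static void main(String[] args) {
        Set<String> dict = new HashSet<>(Arrays.asList("AB", "ABC", "ABCD"));
        // "A","B","C" collapse into "ABC"; "ABCD" never splits the whole word "DE"
        System.out.println(mergeLongest(Arrays.asList("A", "B", "C", "DE"), dict)); // [ABC, DE]
    }
}
```

Note the sketch mirrors the "never split whole words" behavior: `"ABCD"` is in the dictionary, but `"DE"` is one word, so the concatenation `"ABCDE"` is simply not a dictionary entry and no merge happens past `"ABC"`.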
26 | * Please notice that although {@code "ABCD"} exists in {@link #dictionary}, the 27 | * {@link #process(List)} method will not attempt to split whole words apart. 28 | */ 29 | public class DictionaryPass implements IPostprocessPass { 30 | private Dat dictionary; 31 | private String tag; 32 | 33 | public DictionaryPass(String dictFile, String tag, boolean isTxt) 34 | throws IOException { 35 | this.tag = tag; 36 | if (isTxt) this.dictionary = DatMaker.readFromTxtFile(dictFile); 37 | else this.dictionary = new Dat(dictFile); 38 | } 39 | 40 | @Override 41 | public void process(List<TaggedWord> sentence) { 42 | if (this.dictionary == null || sentence.isEmpty()) return; 43 | 44 | for (int i = 0, size = sentence.size(); i < size; i++) { 45 | // search for longest concatenation which exists in dict 46 | StringBuilder sb = new StringBuilder(); 47 | String longest = null, current; 48 | int longestIndex = -1; 49 | for (int j = i; j < size; j++) { 50 | current = sb.append(sentence.get(j).word).toString(); 51 | if (!this.dictionary.containsPrefix(current)) break; 52 | if (this.dictionary.contains(current)) { 53 | longest = current; 54 | longestIndex = j; 55 | } 56 | } 57 | 58 | // if found, combine the words and update the sentence 59 | if (longest == null) continue; 60 | sentence.set(i, new TaggedWord(longest, this.tag)); 61 | for (int j = longestIndex; j > i; --j) sentence.remove(j); 62 | size = sentence.size(); 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/postprocess/DoubleWordPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.postprocess; 2 | 3 | import org.thunlp.thulac.data.TaggedWord; 4 | import org.thunlp.thulac.util.StringUtils; 5 | 6 | import java.util.List; 7 | 8 | import static org.thunlp.thulac.util.CodePointUtils.SPECIAL_CHARS; 9 | 10 | /** 11 | * A postprocess pass combining adjacent words which can form a
 double word together. 12 | * 13 | * @see #canFormDoubleWord(String, String) 14 | */ 15 | public class DoubleWordPass implements IPostprocessPass { 16 | @Override 17 | public void process(List<TaggedWord> sentence) { 18 | if (sentence.size() <= 1) return; 19 | 20 | TaggedWord tagged, last = sentence.get(sentence.size() - 1); 21 | for (int i = sentence.size() - 2; i >= 0; --i, last = tagged) { 22 | tagged = sentence.get(i); 23 | if (this.canFormDoubleWord(tagged.word, last.word)) { 24 | tagged.word += last.word; 25 | sentence.remove(i + 1); 26 | } 27 | } 28 | } 29 | 30 | /** 31 | * Two words can form a double word if and only if:
32 | * <ul> 33 | * <li>Both words contain only one code point and,</li> 34 | * <li>The only code points in both words are identical and,</li> 35 | * <li>This code point is not a {@linkplain org.thunlp.thulac.util.CodePointUtils#SPECIAL_CHARS 36 | * special character}.</li> 37 | * </ul> 38 | * 39 | * @param first 40 | * The first word. 41 | * @param second 42 | * The second word. 43 | * 44 | * @return If the two words can form a double word. 45 | */ 46 | private boolean canFormDoubleWord(String first, String second) { 47 | if (StringUtils.codePointCount(first) != 1 || 48 | StringUtils.codePointCount(second) != 1) return false; 49 | int firstCP = first.codePointAt(0); 50 | int secondCP = second.codePointAt(0); 51 | return firstCP == secondCP && SPECIAL_CHARS.indexOf(firstCP) == -1; 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/postprocess/FilterPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.postprocess; 2 | 3 | import org.thunlp.thulac.data.Dat; 4 | import org.thunlp.thulac.data.TaggedWord; 5 | import org.thunlp.thulac.util.StringUtils; 6 | 7 | import java.io.IOException; 8 | import java.util.Arrays; 9 | import java.util.HashSet; 10 | import java.util.List; 11 | import java.util.Set; 12 | 13 | import static org.thunlp.thulac.util.CodePointUtils.CHINESE_DIGITS; 14 | import static org.thunlp.thulac.util.CodePointUtils.DIGITS; 15 | 16 | /** 17 | * A postprocess pass which filters forbidden tags from the word list. 18 | */ 19 | public class FilterPass implements IPostprocessPass { 20 | /** 21 | * Tags allowed to pass the filter. Words with tags out of this list will be 22 | * discarded. 23 | */ 24 | private static final Set<String> ALLOWED_TAGS = new HashSet<>(Arrays.asList( 25 | "n", "np", "ns", "ni", "nz", "v", "a", "id", "t", "uw")); 26 | 27 | private Dat xuDat; 28 | private Dat timeDat; 29 | 30 | public FilterPass(String xuDatFile, String timeDatFile) throws IOException { 31 | this.xuDat = new Dat(xuDatFile); 32 | this.timeDat = new Dat(timeDatFile); 33 | } 34 | 35 | /** 36 | * Returns {@code true} if one of the following is true:
37 | * <ul> 38 | * <li>Word contains one or more normal digits.</li> 39 | * <li>Word contains two or more Chinese digits.</li> 40 | * <li>Word is in the dictionary specified by {@link #timeDat}.</li> 41 | * </ul> 42 | * 43 | * @param word 44 | * The word to check. 45 | * 46 | * @return Whether the word contains number digits. 47 | */ 48 | private boolean hasNumber(String word) { 49 | int count = 0; 50 | for (int c : StringUtils.toCodePoints(word)) 51 | if (DIGITS.indexOf(c) != -1) return true; 52 | else if (CHINESE_DIGITS.indexOf(c) != -1 && count++ != 0) return true; 53 | return this.timeDat.contains(word); 54 | } 55 | 56 | /** 57 | * Remove words in the segmented word list if one of the following is true:
58 | * <ul> 59 | * <li>Tag of word not in {@link #ALLOWED_TAGS}.</li> 60 | * <li>Word in the dictionary specified by {@link #xuDat}.</li> 61 | * <li>Word has tag "t" and {@linkplain #hasNumber(String) hasNumber(word)} 62 | * returns {@code true}.</li> 63 | * </ul> 64 | * 65 | * @param sentence 66 | * The sentence to filter. 67 | */ 68 | @Override 69 | public void process(List<TaggedWord> sentence) { 70 | if (this.xuDat == null || this.timeDat == null || sentence.isEmpty()) return; 71 | 72 | for (int i = sentence.size() - 1; i >= 0; --i) { 73 | String word = sentence.get(i).word; 74 | String tag = sentence.get(i).tag; 75 | if (!ALLOWED_TAGS.contains(tag) || this.xuDat.contains(word) || 76 | ("t".equals(tag) && this.hasNumber(word))) sentence.remove(i); 77 | } 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/postprocess/IPostprocessPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.postprocess; 2 | 3 | import org.thunlp.thulac.data.TaggedWord; 4 | 5 | import java.util.List; 6 | 7 | /** 8 | * An interface which processes the list of {@link TaggedWord} after segmentation. 9 | */ 10 | public interface IPostprocessPass { 11 | /** 12 | * Process the list of {@link TaggedWord}. 13 | * 14 | * @param sentence 15 | * The list of {@link TaggedWord}. 16 | */ 17 | void process(List<TaggedWord> sentence); 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/postprocess/NegWordPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.postprocess; 2 | 3 | import org.thunlp.thulac.data.Dat; 4 | import org.thunlp.thulac.data.TaggedWord; 5 | import org.thunlp.thulac.util.StringUtils; 6 | 7 | import java.io.IOException; 8 | import java.util.List; 9 | 10 | /** 11 | * A postprocess pass which recognises certain negative phrases (for example, "not good 12 | * enough" in English), separates the leading negative word from the rest of the phrase 13 | * (in this example, "not good enough" is split into "not" and "good enough"), and gives the 14 | * separated parts their respective tags.
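The split performed by this pass can be sketched on a bare string. The name `splitNegPhrase` and the sample phrase are illustrative assumptions; the real pass reads candidate phrases from a Dat file and rewrites `TaggedWord` instances in place, tagging the first part "d" (adverb) and the remainder "v" (verb):

```java
public class NegSplitSketch {
    // Splits a negative phrase into its first code point (the negative word)
    // and the remainder, using code-point APIs so surrogate pairs stay intact.
    static String[] splitNegPhrase(String phrase) {
        int first = phrase.codePointAt(0);
        int rest = Character.charCount(first);
        return new String[] {
            new String(Character.toChars(first)), // would be tagged "d"
            phrase.substring(rest)                // would be tagged "v"
        };
    }

    public static void main(String[] args) {
        // e.g. a phrase like U+4E0D U+770B U+597D splits into its first
        // character and the remaining two-character verb
        String[] parts = splitNegPhrase("\u4E0D\u770B\u597D");
        System.out.println(parts[0] + " / " + parts[1]);
    }
}
```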
 A {@link Dat} file stores the list of negative 15 | * phrases to be separated by {@link #process(List)}. 16 | */ 17 | public class NegWordPass implements IPostprocessPass { 18 | private Dat negPhrases; 19 | 20 | public NegWordPass(String negDatFile) throws IOException { 21 | this.negPhrases = new Dat(negDatFile); 22 | } 23 | 24 | @Override 25 | public void process(List<TaggedWord> sentence) { 26 | if (this.negPhrases == null || sentence.isEmpty()) return; 27 | 28 | for (int i = sentence.size() - 1; i >= 0; --i) { 29 | TaggedWord tagged = sentence.get(i); 30 | if (this.negPhrases.contains(tagged.word)) { 31 | int[] codePoints = StringUtils.toCodePoints(tagged.word); 32 | String word = StringUtils.toString(codePoints, 1, codePoints.length - 1); 33 | sentence.add(i + 1, new TaggedWord(word, "v")); 34 | tagged.word = StringUtils.toString(codePoints[0]); 35 | tagged.tag = "d"; 36 | } 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/postprocess/SpecialPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.postprocess; 2 | 3 | import org.thunlp.thulac.data.TaggedWord; 4 | 5 | import java.util.List; 6 | 7 | /** 8 | * A postprocess pass which deals with special cases. 9 | */ 10 | public class SpecialPass implements IPostprocessPass { 11 | @Override 12 | public void process(List<TaggedWord> sentence) { 13 | this.filterHTTPURLs(sentence); 14 | } 15 | 16 | /** 17 | * Tag "x" for HTTP URLs.
 18 | * A word is identified as an HTTP URL if it is at least 5 characters long and 19 | * starts with "http" (so that both the {@code http} and {@code https} schemes match). 20 | * 21 | * @param sentence 22 | * The input sentence. 23 | */ 24 | private void filterHTTPURLs(List<TaggedWord> sentence) { 25 | for (TaggedWord tagged : sentence) 26 | if (tagged.word.length() >= 5 && tagged.word.startsWith("http")) 27 | tagged.tag = "x"; 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/postprocess/TimeWordPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.postprocess; 2 | 3 | import org.thunlp.thulac.data.TaggedWord; 4 | import org.thunlp.thulac.util.StringUtils; 5 | 6 | import java.util.List; 7 | 8 | import static org.thunlp.thulac.util.CodePointUtils.DIGITS; 9 | import static org.thunlp.thulac.util.CodePointUtils.generate; 10 | 11 | /** 12 | * A postprocess pass which combines words which together represent a time period into 13 | * one word.
 14 | * For example, for the input word list {@code "A", "B", "C1", "2", "34", "year"} ("year" 15 | * here can be any Chinese time unit in {@link #TIME_UNITS}), the output should be: 16 | * {@code "A", "B", "C1", "234year"}.
17 | * It can be seen that {@code "C1"} is not concatenated to {@code "234year"}, since it 18 | * contains non-digit characters.
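That backward merge can be sketched on plain strings as follows. The names `TimeMergeSketch` / `combineTime` are illustrative, and for brevity the digit test here covers ASCII digits only; the real pass also accepts full-width digits and rewrites the merged word's tag to "t":

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class TimeMergeSketch {
    // Time-unit characters: year, month, two day variants, two hour variants,
    // minute, second (same set as TIME_UNITS in the pass).
    static final String UNITS = "\u5E74\u6708\u65E5\u53F7\u65F6\u70B9\u5206\u79D2";

    static boolean isDigits(String w) { // ASCII digits only, for brevity
        for (char c : w.toCharArray()) if (c < '0' || c > '9') return false;
        return !w.isEmpty();
    }

    static boolean isTimeUnit(String w) {
        return w.length() == 1 && UNITS.indexOf(w.charAt(0)) != -1;
    }

    // Walks backwards through the list, gluing pure-digit words onto a
    // following time unit (or onto an already-merged time word).
    static List<String> combineTime(List<String> words) {
        List<String> out = new ArrayList<>(words);
        boolean timeWord = false;
        for (int i = out.size() - 1; i >= 0; i--) {
            if (isTimeUnit(out.get(i))) timeWord = true;
            else if (timeWord && isDigits(out.get(i)))
                out.set(i, out.get(i) + out.remove(i + 1));
            else timeWord = false;
        }
        return out;
    }

    public static void main(String[] args) {
        // "C1" stays apart because it contains a non-digit character
        System.out.println(combineTime(
                Arrays.asList("A", "B", "C1", "2", "34", "\u5E74")));
    }
}
```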
19 | * Please notice that this class is able to deal with full-width digits like U+FF10 20 | * (full-width digit zero) yet not Chinese digits like U+3007 (Chinese for "zero"). 21 | */ 22 | public class TimeWordPass implements IPostprocessPass { 23 | /** 24 | * Chinese characters which represent time units: (description in English)
25 | * YEAR: U+5E74, MONTH: U+6708, DAY: U+65E5 & U+53F7, HOUR: U+65F6 & U+70B9, 26 | * MINUTE: U+5206, SECOND: U+79D2. 27 | */ 28 | private static final String TIME_UNITS = generate('\u5E74', '\u6708', '\u65E5', 29 | '\u53F7', '\u65F6', '\u70B9', '\u5206', '\u79D2'); 30 | 31 | /** 32 | * {@code word} is a number if all the code points in {@code word} are 33 | * {@linkplain org.thunlp.thulac.util.CodePointUtils#DIGITS digits}. 34 | * 35 | * @param word 36 | * The word to check. 37 | * 38 | * @return Whether this {@code word} is a number. 39 | */ 40 | private boolean isNumber(String word) { 41 | for (int codePoint : StringUtils.toCodePoints(word)) 42 | if (DIGITS.indexOf(codePoint) == -1) return false; 43 | return true; 44 | } 45 | 46 | /** 47 | * {@code word} is a time unit if and only if: {@code word} contains only one code 48 | * point and this code point is a {@linkplain #TIME_UNITS time unit}. 49 | * 50 | * @param word 51 | * The word to check. 52 | * 53 | * @return Whether this {@code word} is a time unit.
54 | */ 55 | private boolean isTimeUnit(String word) { 56 | return StringUtils.codePointCount(word) == 1 && 57 | TIME_UNITS.indexOf(word.codePointAt(0)) != -1; 58 | } 59 | 60 | @Override 61 | public void process(List<TaggedWord> sentence) { 62 | boolean isTimeWord = false; 63 | for (int i = sentence.size() - 1; i >= 0; i--) { 64 | TaggedWord tagged = sentence.get(i); 65 | if (this.isTimeUnit(tagged.word)) isTimeWord = true; 66 | else if (isTimeWord && this.isNumber(tagged.word)) { 67 | tagged.word += sentence.remove(i + 1).word; 68 | tagged.tag = "t"; 69 | } else isTimeWord = false; 70 | } 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/postprocess/VerbPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.postprocess; 2 | 3 | import org.thunlp.thulac.data.Dat; 4 | import org.thunlp.thulac.data.TaggedWord; 5 | 6 | import java.io.IOException; 7 | import java.util.List; 8 | 9 | /** 10 | * A postprocess pass which identifies Model Verbs and Directional Verbs.
11 | */ 12 | public class VerbPass implements IPostprocessPass { 13 | private Dat vM; 14 | private Dat vD; 15 | private String tag = "v"; 16 | 17 | public VerbPass(String vMFile, String vDFile) throws IOException { 18 | this.vM = new Dat(vMFile); 19 | this.vD = new Dat(vDFile); 20 | } 21 | 22 | @Override 23 | public void process(List<TaggedWord> sentence) { 24 | if (this.vM == null || this.vD == null || sentence.isEmpty()) return; 25 | 26 | TaggedWord last = sentence.get(0), tagged; 27 | for (int i = 1, size = sentence.size(); i < size; i++, last = tagged) { 28 | tagged = sentence.get(i); 29 | if (this.tag.equals(last.tag) && this.tag.equals(tagged.tag)) 30 | if (this.vM.contains(last.word)) tagged.tag = "vm"; 31 | else if (this.vD.contains(tagged.word)) tagged.tag = "vd"; 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/preprocess/ConvertT2SPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.preprocess; 2 | 3 | import org.thunlp.thulac.data.POCGraph; 4 | import org.thunlp.thulac.util.StringUtils; 5 | 6 | import java.io.DataInputStream; 7 | import java.io.File; 8 | import java.io.FileInputStream; 9 | import java.io.IOException; 10 | import java.util.HashMap; 11 | 12 | /** 13 | * A preprocess pass which converts traditional Chinese characters to simplified ones, 14 | * used when the switch {@code -t2s} exists in the command line.
15 | */ 16 | public class ConvertT2SPass implements IPreprocessPass { 17 | private HashMap<Integer, Integer> t2sMap; 18 | 19 | public ConvertT2SPass(String fileName) throws IOException { 20 | this.t2sMap = new HashMap<>(); 21 | this.loadT2SMap(fileName); 22 | } 23 | 24 | private void loadT2SMap(String filename) throws IOException { 25 | // TODO: adapt NIO 26 | 27 | File mapFile = new File(filename); 28 | // t2s map format: recordCount * DWORD traditional + 29 | // recordCount * DWORD simplified 30 | // -> 8 * recordCount bytes in total 31 | int recordCount = (int) (mapFile.length() >> 3); 32 | 33 | DataInputStream input = new DataInputStream(new FileInputStream(mapFile)); 34 | int[] traditional = new int[recordCount]; // cache 35 | for (int i = 0; i < recordCount; ++i) traditional[i] = input.readInt(); 36 | for (int i = 0; i < recordCount; ++i) { 37 | int simplified = input.readInt(); 38 | this.t2sMap.put(traditional[i], simplified); 39 | } 40 | input.close(); 41 | } 42 | 43 | private int getSimplifiedCodePoint(int c) { 44 | if (this.t2sMap.containsKey(c)) return this.t2sMap.get(c); 45 | return c; 46 | } 47 | 48 | private String convertT2S(String sentence) { 49 | int[] codePoints = StringUtils.toCodePoints(sentence); 50 | StringBuilder sb = new StringBuilder(); 51 | for (int codePoint : codePoints) 52 | sb.appendCodePoint(this.getSimplifiedCodePoint(codePoint)); 53 | return sb.toString(); 54 | } 55 | 56 | @Override 57 | public String process(String raw, POCGraph ignored) { 58 | return this.convertT2S(raw); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/preprocess/IPreprocessPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.preprocess; 2 | 3 | import org.thunlp.thulac.data.POCGraph; 4 | 5 | /** 6 | * An interface which processes the raw {@link String} before segmentation.
7 | */ 8 | public interface IPreprocessPass { 9 | /** 10 | * Process the raw {@link String}. 11 | * 12 | * @param raw 13 | * The raw {@link String} to process. 14 | * @param graph 15 | * The {@link POCGraph} to write to. 16 | * 17 | * @return The processed {@link String}. 18 | */ 19 | String process(String raw, POCGraph graph); 20 | } 21 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/preprocess/PreprocessPass.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.preprocess; 2 | 3 | import org.thunlp.thulac.data.POCGraph; 4 | import org.thunlp.thulac.util.StringUtils; 5 | 6 | import static org.thunlp.thulac.util.CodePointUtils.SPECIAL_CHARS; 7 | import static org.thunlp.thulac.util.CodePointUtils.WHITESPACE_CHARS; 8 | 9 | /** 10 | * A preprocess pass which cleans raw input up. 11 | */ 12 | public class PreprocessPass implements IPreprocessPass { 13 | // TODO: add more documentation 14 | 15 | private static final String SINGLE_PUNCTUATION_CODE_POINTS = StringUtils.toString( 16 | 65292, 12290, 65311, 65281, 65306, 65307, 8216, 8217, 8220, 8221, 1230, 12304, 17 | 12305, 12289, 12298, 12299, 64, 35, 65288, 65289, 34, 91, 93, 126, 47, 44, 58, 18 | 63, 9700, 9734, 9733, 8230, 39, 33, 42, 43, 62, 40, 41, 59, 61); 19 | 20 | private boolean isSinglePunctuation(int c) { 21 | return SINGLE_PUNCTUATION_CODE_POINTS.indexOf(c) != -1; 22 | } 23 | 24 | private String cleanup(String sentence, POCGraph graph) { 25 | StringBuilder cleaned = new StringBuilder(); 26 | graph.clear(); 27 | boolean spaceFlag = false, otherFlag = false, 28 | singlePunctuationFlag = false, titleFlag = false; 29 | 30 | int titleStart = 0; 31 | int[] codePoints = StringUtils.toCodePoints(sentence); 32 | for (int c : codePoints) { 33 | if (WHITESPACE_CHARS.indexOf(c) != -1) { 34 | otherFlag = false; 35 | if (spaceFlag) continue; 36 | if (!graph.isEmpty()) 37 | 
graph.setElementAt(graph.lastElement() & 12, graph.size() - 1); 38 | spaceFlag = true; 39 | continue; 40 | } 41 | 42 | cleaned.appendCodePoint(c); 43 | if (SPECIAL_CHARS.indexOf(c) != -1) { 44 | if (spaceFlag) { 45 | singlePunctuationFlag = this.isSinglePunctuation(c); 46 | graph.add(singlePunctuationFlag ? 8 : 9); 47 | spaceFlag = false; 48 | } else { 49 | if (otherFlag) { 50 | if (this.isSinglePunctuation(c)) { 51 | if (!graph.isEmpty()) graph.setElementAt( 52 | graph.lastElement() & 12, graph.size() - 1); 53 | graph.add(8); 54 | } else if (singlePunctuationFlag) graph.add(9); 55 | else { 56 | if (!graph.isEmpty() && graph.lastElement() == 0) 57 | graph.setElementAt(7, graph.size() - 1); 58 | graph.add(2); 59 | } 60 | } else graph.add(9); 61 | singlePunctuationFlag = this.isSinglePunctuation(c); 62 | } 63 | otherFlag = true; 64 | 65 | if (c == 12298) titleStart = graph.size(); 66 | else if (c == 12299 && titleFlag) { 67 | int titleEnd = graph.size() - 2; 68 | if (titleEnd <= titleStart + 9) 69 | if (titleStart == titleEnd) graph.setElementAt(9, titleStart); 70 | else { 71 | graph.setElementAt(1, titleStart); 72 | for (int i = titleStart + 1; i < titleEnd; ++i) 73 | graph.setElementAt(2, i); 74 | graph.setElementAt(4, titleEnd); 75 | } 76 | } 77 | titleFlag = c == 12298; 78 | } else { 79 | if (spaceFlag) graph.add(9); 80 | else if (otherFlag) { 81 | graph.setElementAt(graph.lastElement() & 12, graph.size() - 1); 82 | graph.add(9); 83 | singlePunctuationFlag = false; 84 | } else graph.add(15); 85 | spaceFlag = false; 86 | otherFlag = false; 87 | } 88 | } 89 | 90 | // deal with first & last character 91 | if (!graph.isEmpty()) { 92 | int first = graph.firstElement() & 9, last = graph.lastElement() & 12; 93 | graph.setElementAt(first == 0 ? 9 : first, 0); 94 | graph.setElementAt(last == 0 ? 
 12 : last, graph.size() - 1); 95 | } 96 | 97 | return cleaned.toString(); 98 | } 99 | 100 | @Override 101 | public String process(String raw, POCGraph graph) { 102 | return this.cleanup(raw, graph); 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/util/BufferUtils.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.util; 2 | 3 | import java.io.IOException; 4 | import java.nio.ByteBuffer; 5 | import java.nio.IntBuffer; 6 | import java.nio.channels.FileChannel; 7 | import java.nio.channels.SeekableByteChannel; 8 | 9 | /** 10 | * A utility class which deals with buffers. 11 | * 12 | * @see java.nio.Buffer 13 | */ 14 | public class BufferUtils { 15 | /** 16 | * Read ints from {@code channel} using {@code buf} as buffer and putting them 17 | * sequentially into the array of {@code int[]} represented by {@code arrays}.
18 | * {@code buf} is always in read mode after this method returns, that is, users 19 | * have to call {@code buf.flip()} first if they wish to reuse it. {@code 20 | * channel} is NOT closed after this method returns (since the EOF might not have been 21 | * reached yet), therefore users should close it manually.
22 | * 23 | * @param channel 24 | * The {@link FileChannel} to read from. 25 | * @param buf 26 | * The {@link ByteBuffer} to use as buffer. 27 | * @param arrays 28 | * The array of {@code int[]} to store the read ints. 29 | * 30 | * @return A return value of {@code true} means that all the arrays are successfully 31 | * filled with data read from {@code channel}, while {@code false} means that the 32 | * EOF is reached before all the arrays are filled. In special case that all arrays 33 | * are filled and EOF is reached, {@code true} is returned. 34 | * 35 | * @throws IOException 36 | * If an exception is thrown while reading from {@code channel}. 37 | * @throws NullPointerException 38 | * If either channel is null, buf is null, or any element of {@code arrays} is 39 | * null. 40 | */ 41 | public static boolean readInts( 42 | SeekableByteChannel channel, ByteBuffer buf, int[]... arrays) 43 | throws IOException { 44 | int position = 0, offset = 0; 45 | int[] current = arrays[position]; 46 | int currentLeft = current.length, readBytes, readInts; 47 | 48 | while (true) { 49 | // read buffer 50 | readBytes = channel.read(buf); 51 | // if EOF is reached and there are still arrays left not filled 52 | if (readBytes == -1) return false; 53 | buf.flip(); 54 | IntBuffer intBuf = buf.asIntBuffer(); 55 | readInts = readBytes >> 2; 56 | 57 | // copy buffer content to arrays 58 | while (readInts > 0) { 59 | int getLen = Math.min(readInts, currentLeft); 60 | intBuf.get(current, offset, getLen); 61 | offset += getLen; 62 | readInts -= getLen; 63 | currentLeft -= getLen; 64 | 65 | if (currentLeft == 0) { // if current array is filled 66 | ++position; 67 | if (position == arrays.length) { // if all arrays have been filled 68 | buf.clear(); 69 | return true; 70 | } 71 | current = arrays[position]; 72 | offset = 0; 73 | currentLeft = current.length; 74 | } 75 | } 76 | 77 | buf.clear(); 78 | } 79 | } 80 | } 81 | 
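The contract above can be exercised with a short, self-contained sketch. `ReadIntsSketch` below is a condensed re-implementation of the same fill-multiple-arrays loop (the name is illustrative, not part of the repo); it shows the intended calling pattern: open a channel, hand in a scratch buffer and the arrays to fill, check the boolean result:

```java
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.IntBuffer;
import java.nio.channels.SeekableByteChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;

public class ReadIntsSketch {
    // Condensed version of BufferUtils.readInts: fills the given arrays
    // sequentially from the channel; returns false if EOF arrives before
    // all arrays are full. Assumes buf's capacity is a multiple of 4.
    static boolean readInts(SeekableByteChannel ch, ByteBuffer buf, int[]... arrays)
            throws IOException {
        int pos = 0, off = 0;
        while (pos < arrays.length) {
            if (ch.read(buf) == -1) return false; // EOF with arrays left over
            buf.flip();
            IntBuffer ints = buf.asIntBuffer();
            while (ints.hasRemaining() && pos < arrays.length) {
                int n = Math.min(ints.remaining(), arrays[pos].length - off);
                ints.get(arrays[pos], off, n);
                off += n;
                if (off == arrays[pos].length) { pos++; off = 0; } // array filled
            }
            buf.clear(); // recycle the buffer for the next read
        }
        return true;
    }

    public static void main(String[] args) throws IOException {
        // write five ints (1..5) to a temp file, then read them back
        Path tmp = Files.createTempFile("ints", ".bin");
        ByteBuffer out = ByteBuffer.allocate(5 * 4);
        for (int i = 1; i <= 5; i++) out.putInt(i);
        Files.write(tmp, out.array());

        int[] a = new int[2], b = new int[3];
        try (SeekableByteChannel ch = Files.newByteChannel(tmp, StandardOpenOption.READ)) {
            System.out.println(readInts(ch, ByteBuffer.allocate(8), a, b)); // true
        }
        System.out.println(a[0] + " " + b[2]); // 1 5
        Files.delete(tmp);
    }
}
```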
-------------------------------------------------------------------------------- /src/main/java/org/thunlp/thulac/util/CodePointUtils.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.util; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | /** 7 | * A utility class providing definitions for many sets of code points. 8 | */ 9 | public class CodePointUtils { 10 | /** 11 | * ASCII and full-width digits. 12 | */ 13 | public static final String DIGITS = 14 | generate(range('0', '9'), range('\uFF10', '\uFF19')); 15 | 16 | /** 17 | * Chinese digits. 18 | */ 19 | public static final String CHINESE_DIGITS = generate('\u3007', '\u4E00', '\u4E8C', 20 | '\u4E09', '\u56DB', '\u4E94', '\u516D', '\u4E03', '\u516B', '\u4E5D'); 21 | 22 | /** 23 | * Special characters, containing:
24 | * <ul>
25 | * <li>Chinese full-width punctuations:
26 | * U+FF0C: Comma, U+3002: Full Stop, U+FF1F: Question Mark, U+FF01: Exclamation
27 | * Mark, U+FF1A: Colon, U+FF1B: Semicolon, U+3010 & U+3011: Brackets, U+3001:
28 | * Ideographic Comma, U+300A & U+300B: Guillemets, U+FF08 & U+FF09: Parentheses.
29 | * </li>
30 | * <li>Standard punctuations:
31 | * U+2018 & U+2019: Single Quotation Marks, U+201C & U+201D: Double Quotation
32 | * Marks, U+00B7: Middle Point, U+2026: Horizontal Ellipsis, U+2014: Em Dash.
33 | * </li>
34 | * <li>Special characters:
35 | * U+FFE5: Full-width Yen Sign, U+25E4: Black Upper Left Triangle, U+2605: Black
36 | * Star, U+2606: White Star.
37 | * </li>
38 | * <li>ASCII characters: All printable ASCII characters (from U+0021 to
39 | * U+007E) except for U+0060: Grave Accent.</li>
40 | * </ul>
41 | * (All of the above character names are taken from the Unicode Consortium.)
42 | */
43 | public static final String SPECIAL_CHARS = generate('\uFF0C', '\u3002', '\uFF1F',
44 | '\uFF01', '\uFF1A', '\uFF1B', '\u3010', '\u3011', '\u3001', '\u300A',
45 | '\u300B', '\uFF08', '\uFF09', '\u2018', '\u2019', '\u201C', '\u201D',
46 | '\u00B7', '\u2026', '\u2014', '\uFFE5', '\u25E4', '\u2605', '\u2606',
47 | range('\u0021', '\u005F'), range('\u0061', '\u007E'));
48 |
49 | /**
50 | * Whitespaces: U+0020 & U+3000.
51 | */
52 | public static final String WHITESPACE_CHARS = generate('\u0020', '\u3000');
53 |
54 | /**
55 | * Generate a {@link String} containing a list of code points produced following
56 | * these steps:
57 | * <ol>
58 | * <li>Let {@code list} be the empty list of integers.</li>
59 | * <li>For each {@link Object} {@code param} in {@code params}, sequentially from
60 | * {@code params[0]} to {@code params[params.length - 1]}, switch on {@code
61 | * param}'s class:
62 | * <ul>
63 | * <li>{@link Integer}: Append {@code param} to {@code list}.</li>
64 | * <li>{@code int[]}: Append every integer in {@code param} to {@code
65 | * list}.</li>
66 | * <li>{@link Character}: Append {@code param}, converted to {@code char}
67 | * and then to {@code int} and then to {@link Integer}, to {@code list}.</li>
68 | * <li>{@link String}: Append every code point in the content of {@code
69 | * param} retrieved using {@link StringUtils#toCodePoints(String)} to {@code
70 | * list}.</li>
71 | * <li>Other: Do nothing.</li>
72 | * </ul>
73 | * </li>
74 | * <li>Convert {@code list} to {@link String} using {@link StringUtils#toString(int...)}</li>
75 | * </ol>
76 | *
77 | * @param params
78 | * The input parameters.
79 | *
80 | * @return The generated {@link String}.
81 | */
82 | public static String generate(Object... params) {
83 | List<Integer> codePoints = new ArrayList<>();
84 | for (Object param : params)
85 | if (param instanceof Integer) codePoints.add((Integer) param);
86 | else if (param instanceof int[]) for (int codePoint : (int[]) param)
87 | codePoints.add(codePoint);
88 | else if (param instanceof String)
89 | for (int codePoint : StringUtils.toCodePoints((String) param))
90 | codePoints.add(codePoint);
91 | else if (param instanceof Character) codePoints.add((int) (Character) param);
92 |
93 | int[] cps = new int[codePoints.size()];
94 | for (int i = 0, size = codePoints.size(); i < size; ++i)
95 | cps[i] = codePoints.get(i);
96 |
97 | return StringUtils.toString(cps);
98 | }
99 |
100 | /**
101 | * Return an {@code int[]} containing code points ranging from {@code start} to
102 | * {@code end} (inclusive).
103 | */
104 | public static int[] range(int start, int end) {
105 | if (end < start) return null;
106 | int[] range = new int[end - start + 1];
107 | for (int i = start; i <= end; ++i) range[i - start] = i;
108 | return range;
109 | }
110 | }
111 |
--------------------------------------------------------------------------------
/src/main/java/org/thunlp/thulac/util/IOUtils.java:
--------------------------------------------------------------------------------
1 | package org.thunlp.thulac.util;
2 |
3 | import org.thunlp.thulac.io.*;
4 |
5 | import java.io.*;
6 | import java.nio.charset.Charset;
7 | import java.nio.charset.StandardCharsets;
8 | import java.nio.charset.UnsupportedCharsetException;
9 | import java.nio.file.Files;
10 | import java.nio.file.Paths;
11 | import java.util.ArrayList;
12 | import java.util.List;
13 | import java.util.regex.Matcher;
14 | import java.util.regex.Pattern;
15 |
16 | /**
17 | * A class which provides static utility methods used when dealing with {@link IInputProvider}
18 | * and {@link
IOutputHandler}. Some of them construct instances of {@link IInputProvider} 19 | * and {@link IOutputHandler}, hiding the implementation details from the user. Others 20 | * can be used within implementations of {@link IInputProvider} and 21 | * {@link IOutputHandler}, avoiding code duplicates. 22 | * 23 | * @see IInputProvider 24 | * @see IOutputHandler 25 | */ 26 | public class IOUtils { 27 | /** 28 | * Creates an instance of {@link IInputProvider} which retrieves input from 29 | * {@link System#in}, using the default charset as the input encoding. 30 | * 31 | * @return The {@link IInputProvider} created. 32 | */ 33 | public static IInputProvider inputFromConsole() { 34 | return inputFromInputStream(System.in); // use default charset for System.in 35 | } 36 | 37 | /** 38 | * Creates an instance of {@link IInputProvider} which retrieves input from a given 39 | * {@link InputStream} using UTF-8 as encoding.
40 | * It is recommended to use {@link #inputFromFile(File, Charset)} when reading
41 | * input from files, since it takes better advantage of Java NIO and has better
42 | * performance.
43 | *
44 | * @param in
45 | * The {@link InputStream} to retrieve input from.
46 | *
47 | * @return The {@link IInputProvider} created.
48 | */
49 | public static IInputProvider inputFromInputStream(InputStream in) {
50 | return inputFromInputStream(in, (Charset) null);
51 | }
52 |
53 | /**
54 | * Creates an instance of {@link IInputProvider} which retrieves input from a given
55 | * {@link InputStream} using a given charset as encoding.
56 | * It is recommended to use {@link #inputFromFile(File, Charset)} when reading
57 | * input from files, since it takes better advantage of Java NIO and has better
58 | * performance.
59 | *
60 | * @param in
61 | * The {@link InputStream} to retrieve input from.
62 | * @param charsetName
63 | * The optional name of the charset to use, defaulted to "UTF-8".
64 | *
65 | * @return The {@link IInputProvider} created.
66 | */
67 | public static IInputProvider inputFromInputStream(InputStream in, String charsetName)
68 | throws UnsupportedCharsetException {
69 | return inputFromInputStream(in, forName(charsetName));
70 | }
71 |
72 | /**
73 | * Creates an instance of {@link IInputProvider} which retrieves input from a given
74 | * {@link InputStream} using a given charset as encoding.
75 | * It is recommended to use {@link #inputFromFile(File, Charset)} when reading 76 | * input from files, since it takes better advantage of Java NIO and have better 77 | * performances. 78 | * 79 | * @param in 80 | * The {@link InputStream} to retrieve input from. 81 | * @param charset 82 | * The optional charset to use, defaulted to UTF-8. 83 | * 84 | * @return The {@link IInputProvider} created. 85 | */ 86 | public static IInputProvider inputFromInputStream(InputStream in, Charset charset) { 87 | return new ReaderInputProvider(new BufferedReader( 88 | new InputStreamReader(in, getOrDefault(charset)))); 89 | } 90 | 91 | /** 92 | * Creates an instance of {@link IInputProvider} which retrieves input from the 93 | * given file using UTF-8 as file encoding. 94 | * 95 | * @param filename 96 | * The name of the file to retrieve input from. 97 | * 98 | * @return The {@link IInputProvider} created. 99 | * 100 | * @throws IOException 101 | * If the file does not exist or is not readable. 102 | */ 103 | public static IInputProvider inputFromFile(String filename) throws IOException { 104 | return inputFromFile(filename, (Charset) null); 105 | } 106 | 107 | /** 108 | * Creates an instance of {@link IInputProvider} which retrieves input from the 109 | * given file using UTF-8 as file encoding. 110 | * 111 | * @param file 112 | * The file to retrieve input from. 113 | * 114 | * @return The {@link IInputProvider} created. 115 | * 116 | * @throws IOException 117 | * If the file does not exist or is not readable. 118 | */ 119 | public static IInputProvider inputFromFile(File file) throws IOException { 120 | return inputFromFile(file, (Charset) null); 121 | } 122 | 123 | /** 124 | * Creates an instance of {@link IInputProvider} which retrieves input from the 125 | * given file using a given charset as encoding. 126 | * 127 | * @param filename 128 | * The name of the file to retrieve input from. 
129 | * @param charsetName 130 | * The optional name of the charset to use, defaulted to "UTF-8". 131 | * 132 | * @return The {@link IInputProvider} created. 133 | * 134 | * @throws IOException 135 | * If the file does not exist or is not readable. 136 | * @throws UnsupportedCharsetException 137 | * If the charset referred to by the given name is not supported. 138 | */ 139 | public static IInputProvider inputFromFile(String filename, String charsetName) 140 | throws IOException, UnsupportedCharsetException { 141 | return inputFromFile(filename, forName(charsetName)); 142 | } 143 | 144 | /** 145 | * Creates an instance of {@link IInputProvider} which retrieves input from the 146 | * given file using a given charset as encoding. 147 | * 148 | * @param filename 149 | * The file to retrieve input from. 150 | * @param charset 151 | * The optional file encoding to use, defaulted to UTF-8. 152 | * 153 | * @return The {@link IInputProvider} created. 154 | * 155 | * @throws IOException 156 | * If the file does not exist or is not readable. 157 | */ 158 | public static IInputProvider inputFromFile(String filename, Charset charset) 159 | throws IOException { 160 | if (filename == null) return null; // new File(null) throws NPE 161 | return inputFromFile(new File(filename), charset); 162 | } 163 | 164 | /** 165 | * Creates an instance of {@link IInputProvider} which retrieves input from the 166 | * given file using a given charset as encoding. 167 | * 168 | * @param file 169 | * The name of the file to retrieve input from. 170 | * @param charsetName 171 | * The optional name of the file encoding to use, defaulted to UTF-8. 172 | * 173 | * @return The {@link IInputProvider} created. 174 | * 175 | * @throws IOException 176 | * If the file does not exist or is not readable. 177 | * @throws UnsupportedCharsetException 178 | * If the charset referred to by the given name is not supported. 
179 | */ 180 | public static IInputProvider inputFromFile(File file, String charsetName) 181 | throws IOException, UnsupportedCharsetException { 182 | return inputFromFile(file, forName(charsetName)); 183 | } 184 | 185 | /** 186 | * Creates an instance of {@link IInputProvider} which retrieves input from the 187 | * given file using a given charset as encoding. 188 | * 189 | * @param file 190 | * The name of the file to retrieve input from. 191 | * @param charset 192 | * The optional file encoding to use, defaulted to UTF-8. 193 | * 194 | * @return The {@link IInputProvider} created. 195 | * 196 | * @throws IOException 197 | * If the file does not exist or is not readable. 198 | */ 199 | public static IInputProvider inputFromFile(File file, Charset charset) 200 | throws IOException { 201 | if (file == null) return null; 202 | return new ReaderInputProvider( 203 | Files.newBufferedReader(Paths.get(file.toURI()), getOrDefault(charset))); 204 | } 205 | 206 | /** 207 | * Creates an instance of {@link IInputProvider} which retrieves input from the 208 | * given {@link String}. 209 | * 210 | * @param input 211 | * The input string. 212 | * 213 | * @return The {@link IInputProvider} created. 214 | */ 215 | public static IInputProvider inputFromString(String input) { 216 | if (input == null) return null; 217 | return new StringInputProvider(input); 218 | } 219 | 220 | /** 221 | * Creates an instance of {@link IOutputHandler} which writes output to 222 | * {@link System#out}, using the default charset as the output encoding. 223 | * 224 | * @return The {@link IOutputHandler} created. 225 | */ 226 | public static IOutputHandler outputToConsole() { 227 | return new WriterOutputHandler(new BufferedWriter( 228 | new OutputStreamWriter(System.out))); 229 | } 230 | 231 | /** 232 | * Creates an instance of {@link IOutputHandler} which writes output to a given 233 | * {@link OutputStream} using UTF-8 as encoding.
234 | * It is recommended to use {@link #outputToFile(File, String)} when writing
235 | * output to files, since it takes better advantage of Java NIO and has better
236 | * performance.
237 | *
238 | * @param out
239 | * The {@link OutputStream} to write output to.
240 | *
241 | * @return The {@link IOutputHandler} created.
242 | */
243 | public static IOutputHandler outputToOutputStream(OutputStream out) {
244 | return outputToOutputStream(out, (Charset) null);
245 | }
246 |
247 | /**
248 | * Creates an instance of {@link IOutputHandler} which writes output to a given
249 | * {@link OutputStream} using a given charset as encoding.
250 | * It is recommended to use {@link #outputToFile(File, String)} when writing
251 | * output to files, since it takes better advantage of Java NIO and has better
252 | * performance.
253 | *
254 | * @param out
255 | * The {@link OutputStream} to write output to.
256 | * @param charsetName
257 | * The optional name of the charset to use, defaulted to UTF-8.
258 | *
259 | * @return The {@link IOutputHandler} created.
260 | *
261 | * @throws UnsupportedCharsetException
262 | * If the charset referred to by the name is not supported.
263 | */
264 | public static IOutputHandler outputToOutputStream(
265 | OutputStream out, String charsetName) throws UnsupportedCharsetException {
266 | return outputToOutputStream(out, forName(charsetName));
267 | }
268 |
269 | /**
270 | * Creates an instance of {@link IOutputHandler} which writes output to a given
271 | * {@link OutputStream} using a given charset as encoding.
272 | * It is recommended to use {@link #outputToFile(File, String)} when writing 273 | * output to files, since it takes better advantage of Java NIO and have better 274 | * performances. 275 | * 276 | * @param out 277 | * The {@link OutputStream} to write output to. 278 | * @param charset 279 | * The optional charset to use, defaulted to UTF-8. 280 | * 281 | * @return The {@link IOutputHandler} created. 282 | */ 283 | public static IOutputHandler outputToOutputStream(OutputStream out, Charset charset) { 284 | return new WriterOutputHandler(new BufferedWriter( 285 | new OutputStreamWriter(out, getOrDefault(charset)))); 286 | } 287 | 288 | /** 289 | * Creates an instance of {@link IOutputHandler} which writes output to the 290 | * given file using UTF-8 as file encoding. 291 | * 292 | * @param filename 293 | * The name of the file to output to. 294 | * 295 | * @return The {@link IOutputHandler} created. 296 | * 297 | * @throws IOException 298 | * If the file cannot be created or is not writable. 299 | */ 300 | public static IOutputHandler outputToFile(String filename) throws IOException { 301 | return outputToFile(filename, (Charset) null); 302 | } 303 | 304 | /** 305 | * Creates an instance of {@link IOutputHandler} which writes output to the 306 | * given file using UTF-8 as file encoding. 307 | * 308 | * @param file 309 | * The file to output to. 310 | * 311 | * @return The {@link IOutputHandler} created. 312 | * 313 | * @throws IOException 314 | * If the file cannot be created or is not writable. 315 | */ 316 | public static IOutputHandler outputToFile(File file) throws IOException { 317 | return outputToFile(file, (Charset) null); 318 | } 319 | 320 | /** 321 | * Creates an instance of {@link IOutputHandler} which writes output to the 322 | * given file using a given charset as encoding. 323 | * 324 | * @param filename 325 | * The name of the file to output to. 326 | * @param charsetName 327 | * The optional name of the charset to use, defaulted to "UTF-8". 
328 | * 329 | * @return The {@link IOutputHandler} created. 330 | * 331 | * @throws IOException 332 | * If the file cannot be created or is not writable. 333 | * @throws UnsupportedCharsetException 334 | * If the charset referred to by the given name is not supported. 335 | */ 336 | public static IOutputHandler outputToFile(String filename, String charsetName) 337 | throws IOException, UnsupportedCharsetException { 338 | return outputToFile(filename, forName(charsetName)); 339 | } 340 | 341 | /** 342 | * Creates an instance of {@link IOutputHandler} which writes output to the 343 | * given file using a given charset as encoding. 344 | * 345 | * @param filename 346 | * The name of the file to output to. 347 | * @param charset 348 | * The optional file encoding to use, defaulted to UTF-8. 349 | * 350 | * @return The {@link IOutputHandler} created. 351 | * 352 | * @throws IOException 353 | * If the file cannot be created or is not writable. 354 | */ 355 | public static IOutputHandler outputToFile(String filename, Charset charset) 356 | throws IOException { 357 | if (filename == null) return null; // new File(null) throws NPE 358 | return outputToFile(new File(filename), charset); 359 | } 360 | 361 | /** 362 | * Creates an instance of {@link IOutputHandler} which writes output to the 363 | * given file using a given charset as encoding. 364 | * 365 | * @param file 366 | * The file to output to. 367 | * @param charsetName 368 | * The optional name of the file encoding to use, defaulted to "UTF-8". 369 | * 370 | * @return The {@link IOutputHandler} created. 371 | * 372 | * @throws IOException 373 | * If the file cannot be created or is not writable. 374 | * @throws UnsupportedCharsetException 375 | * If the charset referred to by the given name is not supported. 
376 | */ 377 | public static IOutputHandler outputToFile(File file, String charsetName) 378 | throws IOException, UnsupportedCharsetException { 379 | return outputToFile(file, forName(charsetName)); 380 | } 381 | 382 | /** 383 | * Creates an instance of {@link IOutputHandler} which writes output to the 384 | * given file using a given charset as encoding. 385 | * 386 | * @param file 387 | * The file to output to. 388 | * @param charset 389 | * The optional file encoding to use, defaulted to UTF-8. 390 | * 391 | * @return The {@link IOutputHandler} created. 392 | * 393 | * @throws IOException 394 | * If the file cannot be created or is not writable. 395 | */ 396 | public static IOutputHandler outputToFile(File file, Charset charset) 397 | throws IOException { 398 | if (file == null) return null; 399 | return new WriterOutputHandler( 400 | Files.newBufferedWriter(Paths.get(file.toURI()), getOrDefault(charset))); 401 | } 402 | 403 | /** 404 | * Creates an instance of {@link StringOutputHandler} which writes output to an 405 | * {@link String} in memory.
406 | * It is typical to use this method like this:
407 | * <pre>{@code
408 | 	 * StringOutputHandler output = IOUtils.outputToString();
409 | 	 * Thulac.split(input, output, segOnly); // or anything else
410 | 	 * String outputStr = output.getString();
411 | 	 * }</pre>
412 | *
413 | * @return The {@link StringOutputHandler} created.
414 | */
415 | public static StringOutputHandler outputToString() {
416 | return new StringOutputHandler();
417 | }
418 |
419 | private static final int MAX_LENGTH = 20000;
420 | private static final Pattern SPLIT_PATTERN =
421 | Pattern.compile(".*([\u3002\uff1f\uff01\uff1b;!?]|$)");
422 |
423 | /**
424 | * Split a given line into a list of line segments if the line is too long. It is
425 | * promised that each line segment either is the last one or ends with a
426 | * punctuation character.
427 | *
428 | * @param line
429 | * The line to split into line segments.
430 | *
431 | * @return The list of line segments split.
432 | */
433 | public static List<String> getLineSegments(String line) {
434 | List<String> lineSegments = new ArrayList<>();
435 | if (line.length() < MAX_LENGTH) lineSegments.add(line);
436 | else { // split the line into short line segments
437 | Matcher matcher = SPLIT_PATTERN.matcher(line);
438 | while (matcher.find()) lineSegments.add(matcher.group());
439 | }
440 | return lineSegments;
441 | }
442 |
443 | /**
444 | * Returns a {@link Charset} with name {@code charset}. This method differs from
445 | * {@link Charset#forName(String)} when {@code charset} is {@code null}, with
446 | * this method returning {@code null} while {@link Charset#forName(String)} throws
447 | * an NPE.
448 | *
449 | * @param charset
450 | * The name of the {@link Charset}.
451 | *
452 | * @return The {@link Charset} with name {@code charset}.
453 | *
454 | * @throws UnsupportedCharsetException
455 | * If the charset referred to by the given name is not supported.
456 | */
457 | private static Charset forName(String charset) throws UnsupportedCharsetException {
458 | if (charset == null) return null;
459 | return Charset.forName(charset);
460 | }
461 |
462 | /**
463 | * Returns the given {@link Charset} when non-null, or
464 | * {@link StandardCharsets#UTF_8} otherwise, since many applications using
465 | * {@link Charset} throw an NPE if charset is {@code null}.
466 | *
467 | * @param charset
468 | * The given {@link Charset}.
469 | *
470 | * @return {@code charset} when non-null, {@link StandardCharsets#UTF_8} otherwise.
471 | */
472 | private static Charset getOrDefault(Charset charset) {
473 | return charset == null ? StandardCharsets.UTF_8 : charset;
474 | }
475 | }
476 |
--------------------------------------------------------------------------------
/src/main/java/org/thunlp/thulac/util/StringUtils.java:
--------------------------------------------------------------------------------
1 | package org.thunlp.thulac.util;
2 |
3 | /**
4 | * A utility class which deals with strings, converting arrays of code points to and from
5 | * strings.
6 | */
7 | public class StringUtils {
8 | /**
9 | * Convert an array of code points to a {@link String}.
10 | *
11 | * @param codePoints
12 | * The code points to convert.
13 | *
14 | * @return The converted {@link String}.
15 | */
16 | public static String toString(int... codePoints) {
17 | return toString(codePoints, 0, codePoints.length);
18 | }
19 |
20 | /**
21 | * Convert an array of code points to {@link String}.
22 | *
23 | * @param codePoints
24 | * The code points to convert.
25 | * @param offset
26 | * The starting offset of {@code codePoints}.
27 | * @param len
28 | * The number of code points to convert.
29 | *
30 | * @return The converted {@link String}; indices which exceed {@code
31 | * codePoints.length} are discarded.
32 | */ 33 | public static String toString(int[] codePoints, int offset, int len) { 34 | StringBuilder sb = new StringBuilder(); 35 | for (int i = offset, max = Math.min(codePoints.length, offset + len); 36 | i < max; ++i) 37 | sb.appendCodePoint(codePoints[i]); 38 | return sb.toString(); 39 | } 40 | 41 | /** 42 | * Convert a {@link String} to an array of code points.
43 | * Internally, the data in a {@link String} is stored as a {@code char[]}; however, for
44 | * Unicode code points greater than U+FFFF, one {@code char} (that is, two bytes)
45 | * is not enough. Therefore, Java uses surrogate pairs to divide code points
46 | * that cannot be represented by one {@code char} into two. The problem is that
47 | * {@link String#length()} returns the length of its internal {@code char[]}, which
48 | * is not necessarily (though in most cases it is) equal to the number of code
49 | * points stored in the {@link String}.
50 | * To solve this problem, the {@link String} class provides a set of methods to
51 | * retrieve the actual number of code points stored and to access a code point in
52 | * the {@link String} using its index by code points, as used by this method.
53 | * However, iterating through a {@link String} by the actual code points is
54 | * fairly complicated, and it is much easier for applications to achieve this if
55 | * the string data is stored as an {@code int[]}, each element representing a code point.
56 | * And this is exactly what this method does: take a {@link String} as input and
57 | * convert it into an {@code int[]} which contains exactly the same data as the
58 | * {@link String}.
59 | * It is recommended that all applications which iterate through the characters
60 | * stored in a {@link String} use
61 | * <pre>{@code
62 | 	 * int[] codePoints = StringUtils.toCodePoints(str);
63 | 	 * for (int codePoint : codePoints) // do something ...
64 | 	 * }</pre>
65 | * instead of the traditional
66 | * <pre>{@code
67 | 	 * for (int i = 0, length = str.length(); i < length; ++i) {
68 | 	 *     char c = str.charAt(i);
69 | 	 *     // do something ...
70 | 	 * }
71 | 	 * }</pre>
72 | *
73 | * @param str
74 | * The {@link String} to convert.
75 | *
76 | * @return The converted array of code points.
77 | */
78 | public static int[] toCodePoints(String str) {
79 | if (str == null) return null;
80 | int codePointCount = str.codePointCount(0, str.length());
81 | int[] codePoints = new int[codePointCount];
82 | for (int i = 0; i < codePointCount; ++i)
83 | codePoints[i] = str.codePointAt(str.offsetByCodePoints(0, i));
84 | return codePoints;
85 | }
86 |
87 | /**
88 | * Return the number of code points in the given {@link String}.
89 | *
90 | * @param str
91 | * The given {@link String}.
92 | *
93 | * @return The number of code points in {@code str}.
94 | */
95 | public static int codePointCount(String str) {
96 | return str.codePointCount(0, str.length());
97 | }
98 |
99 | /**
100 | * Return the {@code index}-th code point in the given {@link String}.
101 | *
102 | * @param str
103 | * The given {@link String}.
104 | * @param index
105 | * The index of the code point to return.
106 | *
107 | * @return The code point at {@code index}.
108 | *
109 | * @throws IndexOutOfBoundsException
110 | * If index is negative or greater than or equal to the number of code points
111 | * of {@code str}.
112 | */
113 | public static int codePointAt(String str, int index) {
114 | int codePointIndex = str.offsetByCodePoints(0, index);
115 | return str.codePointAt(codePointIndex);
116 | }
117 | }
118 |
--------------------------------------------------------------------------------
/src/test/java/org/thunlp/thulac/IAccessible.java:
--------------------------------------------------------------------------------
1 | package org.thunlp.thulac;
2 |
3 | import org.thunlp.thulac.io.IInputProvider;
4 | import org.thunlp.thulac.io.IOutputHandler;
5 | import org.thunlp.thulac.util.IOUtils;
6 |
7 | import java.io.FileInputStream;
8 | import java.io.IOException;
9 | import java.io.InputStream;
10 | import java.net.URI;
11 | import java.net.URISyntaxException;
12 | import java.net.URL;
13 | import java.nio.file.Files;
14 | import java.nio.file.Paths;
15 | import java.util.List;
16 | import java.util.stream.Collectors;
17 | import java.util.stream.Stream;
18 |
19 | /**
20 | * An interface which provides a set of common actions for resources and files used in
21 | * {@link TestHelper}. In practice, an {@code abstract class} is used instead of an
22 | * {@code interface} because interfaces do not allow private nested classes. Despite
23 | * this, this class can be used just like an interface.
24 | */
25 | public abstract class IAccessible {
26 | /**
27 | * Create an instance of {@link IAccessible} with the given resource name.
28 | *
29 | * @param name
30 | * The resource name.
31 | *
32 | * @return The {@link IAccessible} created.
33 | *
34 | * @see
Resources
36 | */
37 | public static IAccessible resourceAt(String name) {
38 | return new AccessibleResource(name);
39 | }
40 |
41 | /**
42 | * Create an instance of {@link IAccessible} with the given file name.
43 | *
44 | * @param name
45 | * The file name.
46 | *
47 | * @return The {@link IAccessible} created.
48 | */
49 | public static IAccessible fileAt(String name) {
50 | return new AccessibleFiles(name);
51 | }
52 |
53 | /**
54 | * Trim lines and remove empty ones.
55 | *
56 | * @param lines
57 | * The raw lines as a {@link Stream}.
58 | *
59 | * @return The trimmed and non-empty lines as a {@link List}.
60 | */
61 | private static List<String> getLines(Stream<String> lines) {
62 | return lines.map(String::trim)
63 | .filter(line -> !line.isEmpty())
64 | .collect(Collectors.toList());
65 | }
66 |
67 | /**
68 | * Implementation of {@link IAccessible} reading from resources.
69 | */
70 | private static class AccessibleResource extends IAccessible {
71 | private URI uri;
72 | private URL url;
73 |
74 | public AccessibleResource(String resourceName) {
75 | this.url = AccessibleResource.class.getResource(resourceName);
76 | try {
77 | this.uri = this.url.toURI();
78 | } catch (URISyntaxException ignored) { // should not happen
79 | }
80 | }
81 |
82 | @Override
83 | public List<String> getLines() throws IOException {
84 | return IAccessible.getLines(Files.lines(Paths.get(this.uri)));
85 | }
86 |
87 | @Override
88 | public IOutputHandler toOutputHandler() throws IOException {
89 | throw new UnsupportedOperationException("Output not supported on resources!");
90 | }
91 |
92 | @Override
93 | public InputStream toInputStream() throws IOException {
94 | return this.url.openStream();
95 | }
96 | }
97 |
98 | /**
99 | * Implementation of {@link IAccessible} reading from and writing to files.
100 | */
101 | private static class AccessibleFiles extends IAccessible {
102 | private String filename;
103 |
104 | public AccessibleFiles(String filename) {
105 | this.filename = filename;
106 | }
107 |
108 | @Override
109 | public List<String> getLines() throws IOException {
110 | return Files.readAllLines(Paths.get(this.filename));
111 | }
112 |
113 | @Override
114 | public IInputProvider toInputProvider() throws IOException {
115 | return IOUtils.inputFromFile(this.filename);
116 | }
117 |
118 | @Override
119 | public IOutputHandler toOutputHandler() throws IOException {
120 | return IOUtils.outputToFile(this.filename);
121 | }
122 |
123 | @Override
124 | public InputStream toInputStream() throws IOException {
125 | return new FileInputStream(this.filename);
126 | }
127 | }
128 |
129 | /**
130 | * Return the content of this resource / file separated into individual lines.
131 | *
132 | * @return Content of this resource / file as a list of strings.
133 | */
134 | public abstract List<String> getLines() throws IOException;
135 |
136 | /**
137 | * Create an {@link IInputProvider} with this resource / file.
138 | *
139 | * @return The {@link IInputProvider} created.
140 | */
141 | public IInputProvider toInputProvider() throws IOException {
142 | return IOUtils.inputFromInputStream(this.toInputStream());
143 | }
144 |
145 | /**
146 | * Create an {@link IOutputHandler} with this resource / file.
147 | *
148 | * @return The {@link IOutputHandler} created.
149 | */
150 | public abstract IOutputHandler toOutputHandler() throws IOException;
151 |
152 | /**
153 | * Create an {@link InputStream} with this resource / file.
154 | *
155 | * @return The {@link InputStream} created.
156 | */
157 | public abstract InputStream toInputStream() throws IOException;
158 | }
159 |
--------------------------------------------------------------------------------
/src/test/java/org/thunlp/thulac/MainAlt.java:
--------------------------------------------------------------------------------
1 | package org.thunlp.thulac;
2 |
3 | import joptsimple.OptionException;
4 | import joptsimple.OptionParser;
5 | import joptsimple.OptionSet;
6 | import joptsimple.OptionSpec;
7 | import org.thunlp.thulac.io.IInputProvider;
8 | import org.thunlp.thulac.io.IOutputHandler;
9 | import org.thunlp.thulac.util.IOUtils;
10 |
11 | import java.io.IOException;
12 |
13 | import static java.util.Arrays.asList;
14 |
15 | /**
16 | * A test class for the CLI (Command Line Interface), using
17 | * Jopt Simple to parse command
18 | * line input.
19 | */
20 | public class MainAlt {
21 | private static final String SEG_ONLY_DESC = "Output segments only";
22 | private static final String T2S_DESC = "Convert traditional to simplified Chinese";
23 | private static final String FILTER_DESC = "Use filter for output";
24 | private static final String INPUT_DESC = "Path to the input file";
25 | private static final String OUTPUT_DESC = "Path to the output file";
26 | private static final String USER_DICT_DESC = "The user-specified dictionary";
27 | private static final String DELIMITER_DESC = "The separator between words and tags";
28 | private static final String MODEL_DIR_DESC = "Path to the models directory";
29 | private static final String HELP_DESC = "Show help";
30 |
31 | public static void main(String[] args) throws IOException {
32 | OptionParser parser = new OptionParser();
33 |
34 | parser.accepts("seg_only", SEG_ONLY_DESC);
35 | parser.accepts("t2s", T2S_DESC);
36 | parser.accepts("filter", FILTER_DESC);
37 | OptionSpec<String> iOpt = parser.acceptsAll(
38 | asList("input", "i"), INPUT_DESC).withRequiredArg();
39 | OptionSpec<String> oOpt = parser.acceptsAll(
40 | asList("output", "o"),
OUTPUT_DESC).withRequiredArg(); 41 | OptionSpec<String> userDictOpt = parser.acceptsAll( 42 | asList("user_dict", "dict", "user"), USER_DICT_DESC).withRequiredArg(); 43 | OptionSpec<String> dOpt = parser.acceptsAll( 44 | asList("delimiter", "delim", "deli"), DELIMITER_DESC).withRequiredArg(); 45 | OptionSpec<String> modelDirOpt = parser.acceptsAll( 46 | asList("model_dir", "model"), MODEL_DIR_DESC).withRequiredArg(); 47 | parser.acceptsAll(asList("help", "?", "h"), HELP_DESC).forHelp(); 48 | 49 | OptionSet opts = parser.parse(args); 50 | 51 | if (opts.has("help")) parser.printHelpOn(System.out); 52 | else try { 53 | char separator = opts.has(dOpt) ? opts.valueOf(dOpt).charAt(0) : '_'; // '_' as fallback, avoids NPE when -delimiter is absent 54 | boolean segOnly = opts.has("seg_only"); 55 | boolean useT2S = opts.has("t2s"); 56 | boolean useFilter = opts.has("filter"); 57 | 58 | IInputProvider input; 59 | if (opts.has(iOpt)) input = IOUtils.inputFromFile(opts.valueOf(iOpt)); 60 | else input = IOUtils.inputFromConsole(); 61 | IOutputHandler output; 62 | if (opts.has(oOpt)) output = IOUtils.outputToFile(opts.valueOf(oOpt)); 63 | else output = IOUtils.outputToConsole(); 64 | 65 | String userDict = opts.valueOf(userDictOpt); 66 | String modelDir = opts.valueOf(modelDirOpt); 67 | 68 | Thulac.split(modelDir, separator, userDict, 69 | useT2S, segOnly, useFilter, input, output); 70 | } catch (OptionException e) { 71 | parser.printHelpOn(System.out); 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/test/java/org/thunlp/thulac/ProfilerInputProvider.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac; 2 | 3 | import org.thunlp.thulac.io.IInputProvider; 4 | 5 | import java.io.IOException; 6 | import java.util.ArrayList; 7 | import java.util.Iterator; 8 | import java.util.List; 9 | 10 | /** 11 | * An implementation of {@link IInputProvider}, used in the profiler to reduce time 12 | * consumed by IO operations, wrapping another {@link IInputProvider},
reading 13 | * the lines provided in advance and storing them in memory. Note that this might lead to 14 | high memory usage for large files. 15 | */ 16 | public class ProfilerInputProvider implements IInputProvider { 17 | private Iterator<List<String>> linesIterator; 18 | 19 | public ProfilerInputProvider(IInputProvider inputProvider) throws IOException { 20 | List<List<String>> lines = new ArrayList<>(); 21 | for (List<String> lineSegments = inputProvider.provideInput(); 22 | lineSegments != null; lineSegments = inputProvider.provideInput()) 23 | lines.add(lineSegments); 24 | this.linesIterator = lines.iterator(); 25 | } 26 | 27 | @Override 28 | public void onProgramStart() { 29 | } 30 | 31 | @Override 32 | public void onProgramEnd() { 33 | } 34 | 35 | @Override 36 | public List<String> provideInput() throws IOException { 37 | if (this.linesIterator.hasNext()) return this.linesIterator.next(); 38 | else return null; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/test/java/org/thunlp/thulac/ProfilerOutputHandler.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac; 2 | 3 | import org.thunlp.thulac.data.TaggedWord; 4 | import org.thunlp.thulac.io.IOutputHandler; 5 | 6 | import java.io.IOException; 7 | import java.util.List; 8 | 9 | /** 10 | * An empty {@link IOutputHandler}, used in the profiler to reduce time consumed by IO 11 | * operations.
12 | */ 13 | public class ProfilerOutputHandler implements IOutputHandler { 14 | @Override 15 | public void onProgramStart() { 16 | } 17 | 18 | @Override 19 | public void onProgramEnd() { 20 | } 21 | 22 | @Override 23 | public void handleLineSegment(List<TaggedWord> words, 24 | boolean segOnly, char separator) { 25 | } 26 | 27 | @Override 28 | public void handleLineStart() throws IOException { 29 | } 30 | 31 | @Override 32 | public void handleLineEnd() throws IOException { 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/test/java/org/thunlp/thulac/TestHelper.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac; 2 | 3 | import org.thunlp.thulac.io.IInputProvider; 4 | import org.thunlp.thulac.io.IOutputHandler; 5 | import org.thunlp.thulac.util.StringUtils; 6 | 7 | import java.io.IOException; 8 | import java.nio.file.Files; 9 | import java.nio.file.Paths; 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | 13 | import static org.junit.Assert.assertEquals; 14 | import static org.junit.Assert.assertTrue; 15 | 16 | /** 17 | * Helper class for THULAC tests. 18 | */ 19 | public class TestHelper { 20 | /** 21 | * Run the segmentation program, write the output to the given position and 22 | * calculate the accuracy of the program. 23 | * 24 | * @param inputFile 25 | * The {@link IAccessible} used as input. 26 | * @param compareFile 27 | * The {@link IAccessible} used as answer. 28 | * @param outputFile 29 | * The {@link IAccessible} used as output. 30 | * 31 | * @throws IOException 32 | * If an I/O error occurs.
33 | */ 34 | public static void testSuite( 35 | IAccessible inputFile, IAccessible compareFile, IAccessible outputFile) 36 | throws IOException { 37 | run(inputFile, outputFile, true); 38 | compare(inputFile, compareFile, outputFile); 39 | } 40 | 41 | /** 42 | * Runs the segmentation program with the given input and output and the {@code 43 | * segOnly} flag, and outputs the execution time. 44 | * 45 | * @param input 46 | * The {@link IAccessible} used as input. 47 | * @param output 48 | * The {@link IAccessible} used as output. 49 | * @param segOnly 50 | * Whether to output segments only. 51 | * 52 | * @throws IOException 53 | * If one of the model files failed to load. 54 | */ 55 | public static void run(IAccessible input, IAccessible output, boolean segOnly) 56 | throws IOException { 57 | IInputProvider inputProvider = input.toInputProvider(); 58 | IOutputHandler outputHandler = output.toOutputHandler(); 59 | run(inputProvider, outputHandler, segOnly); 60 | } 61 | 62 | /** 63 | * Runs the segmentation program with the given input and output and the {@code 64 | * segOnly} flag, and outputs the execution time. 65 | * 66 | * @param input 67 | * The {@link IInputProvider} used as input. 68 | * @param output 69 | * The {@link IOutputHandler} used as output. 70 | * @param segOnly 71 | * Whether to output segments only. 72 | * 73 | * @throws IOException 74 | * If one of the model files failed to load. 75 | */ 76 | public static void run(IInputProvider input, IOutputHandler output, boolean segOnly) 77 | throws IOException { 78 | long time = -System.currentTimeMillis(); 79 | Thulac.split(input, output, segOnly); 80 | time += System.currentTimeMillis(); 81 | System.out.printf("Time elapsed: %dms\n", time); 82 | } 83 | 84 | /** 85 | * Runs the segmentation program in profiler mode, that is, with the fastest possible input 86 | and output, to measure the actual time consumed by the program.
Note that this 87 | method does not output the result; use {@link #run(IInputProvider, IOutputHandler, 88 | boolean)} or {@link #run(IAccessible, IAccessible, boolean)} if the result must be 89 | used afterwards. 90 | * 91 | @param input 92 | The {@link IAccessible} used as input. 93 | @param segOnly 94 | Whether to output segments only. 95 | * 96 | @throws IOException 97 | If one of the model files failed to load. 98 | */ 99 | public static void runProfiler(IAccessible input, boolean segOnly) 100 | throws IOException { 101 | run(new ProfilerInputProvider(input.toInputProvider()), 102 | new ProfilerOutputHandler(), segOnly); 103 | } 104 | 105 | /** 106 | * Compare the output file and the answer file ({@code compareFile}) and calculate 107 | accuracy.
108 | * The comparison is done by extracting the split positions from both files and 109 | counting the split positions in the output file which also exist in 110 | the compare file.
111 | This method requires {@code outputFile} to be generated with the -seg_only flag. 112 | 113 | @param inputFile 114 | The {@link IAccessible} used as input. 115 | @param compareFile 116 | The {@link IAccessible} used as answer. 117 | @param outputFile 118 | The {@link IAccessible} used as output. 119 | 120 | @throws IOException 121 | If an exception was thrown while reading the lines from {@code inputFile}, 122 | {@code compareFile} or {@code outputFile}. 123 | */ 124 | public static void compare( 125 | IAccessible inputFile, IAccessible compareFile, IAccessible outputFile) 126 | throws IOException { 127 | // ADDITIONAL TO JAVADOC: ( *XXX* means XXX is a variable ) 128 | // In other words, set *matches* to 0 initially. If THULAC splits input at 129 | // point A and so would a human, increase *matches* by one. 130 | // *total* is the number of total split segments in the answer, while 131 | // *segments* is that of the output from THULAC. 132 | // Accuracy is computed by dividing *matches* by *total*, that is, 133 | // accuracy = matches / total * 100% 134 | // *segments* is greater than or equal to *matches*, therefore 135 | // segments - matches 136 | // represents the number of wrongly split segments.
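The accuracy arithmetic described in the comment block above can be sketched in miniature. Note this is an illustration only: the split positions and counts below are made up, not taken from a real THULAC run.

```java
import java.util.Arrays;
import java.util.List;

// Miniature of the matches/segments/total bookkeeping in TestHelper.compare():
// one line of text, with split positions produced by the program vs. the answer.
public class AccuracyExample {
    public static void main(String[] args) {
        List<Integer> outputLine = Arrays.asList(2, 4, 7);   // program's split positions (hypothetical)
        List<Integer> compareLine = Arrays.asList(2, 5, 7);  // gold answer's split positions (hypothetical)

        // a match is a split position that both the program and the answer agree on
        long matches = outputLine.stream().filter(compareLine::contains).count();
        int segments = outputLine.size(); // splits in the output
        int total = compareLine.size();   // splits in the answer

        // accuracy = matches / total * 100%
        System.out.printf("%d total, %d segments, %d matches, %.2f%% accuracy%n",
                total, segments, matches, 100f * matches / total);
    }
}
```

Here positions 2 and 7 match, so the sketch prints 2 matches out of 3, i.e. 66.67% accuracy; `segments - matches = 1` wrongly placed split.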
137 | 138 | List<String> input = inputFile.getLines(); 139 | List<String> output = outputFile.getLines(); 140 | List<String> compare = compareFile.getLines(); 141 | 142 | int lines = input.size(); 143 | List<List<Integer>> outputSeg = extractSegments(input, output); 144 | List<List<Integer>> compareSeg = extractSegments(input, compare); 145 | int matches = 0, segments = outputSeg.stream().mapToInt(List::size).sum(), 146 | total = compareSeg.stream().mapToInt(List::size).sum(); 147 | for (int i = 0; i < lines; ++i) { 148 | List<Integer> outputLine = outputSeg.get(i); 149 | List<Integer> compareLine = compareSeg.get(i); 150 | matches += outputLine.stream().filter(compareLine::contains).count(); 151 | } 152 | 153 | System.out.printf("Result: %d total, %d segments, %d matches, %.2f%% accuracy\n", 154 | total, segments, matches, 100f * matches / total); 155 | } 156 | 157 | private static List<List<Integer>> extractSegments( 158 | List<String> input, List<String> result) { 159 | List<List<Integer>> segments = new ArrayList<>(); 160 | assertEquals("Line count of input and result doesn't match", 161 | input.size(), result.size()); 162 | for (int i = 0, size = input.size(); i < size; ++i) 163 | segments.add(extractSegments(input.get(i), result.get(i))); 164 | return segments; 165 | } 166 | 167 | private static List<Integer> extractSegments( 168 | String input, String result) { 169 | // It is required that the result contains all the characters (code points) 170 | // that exist in the input. This also means that the input should not contain 171 | // whitespaces (ASCII space U+0020 and Chinese fullwidth space U+3000), 172 | // otherwise the behavior of the program is undefined. 173 | // If a character in the input is not found in the output, then an 174 | // AssertionError is thrown with a message which provides more details. 175 | 176 | // In addition, the result of splitting the input is represented by a list of 177 | // integers; each one, say N, means that the program finds it appropriate to 178 | // split the input after the Nth code point.
179 | // To make it easier to understand, if N and M are two adjacent integers in the 180 | // returned list, then the Nth (inclusive) to the Mth (exclusive) code points 181 | // of the input together make a Chinese word. 182 | 183 | List<Integer> segments = new ArrayList<>(); 184 | int[] cp1 = StringUtils.toCodePoints(input), 185 | cp2 = StringUtils.toCodePoints(result); 186 | int pointer = 0, len1 = cp1.length, len2 = cp2.length; 187 | assertTrue("Result shorter than input!", len1 <= len2); 188 | 189 | int i = 0; 190 | for (; i < len1 && pointer < len2; ++i, ++pointer) { 191 | int c = cp1[i]; 192 | if (cp2[pointer] == c) continue; 193 | segments.add(i); 194 | for (; pointer < len2 && cp2[pointer] != c; ++pointer) ; 195 | if (pointer == len2) throw new AssertionError( 196 | new StringBuilder("Character '").appendCodePoint(c) 197 | .append("' not found in result string!\n") 198 | .append("Input: ").append(input) 199 | .append("Result: ").append(result).toString()); 200 | } 201 | if (i != len1) throw new AssertionError( 202 | new StringBuilder("Character '").appendCodePoint(cp1[i]) 203 | .append("' not found in result string!\n") 204 | .append("Input: ").append(input) 205 | .append("Result: ").append(result).toString()); 206 | 207 | return segments; 208 | } 209 | 210 | private static final String RESOURCES_DIRECTORY = "/"; 211 | // the temp directory used to store output files 212 | private static final String TEMP_DIRECTORY = "build/tmp/tests/"; 213 | 214 | static { 215 | try { // create tmp directory, otherwise IOException would be thrown 216 | Files.createDirectories(Paths.get(TEMP_DIRECTORY)); 217 | } catch (IOException e) { 218 | throw new RuntimeException("Unable to create temp directory!", e); 219 | } 220 | } 221 | 222 | public static IAccessible fileAt(String name) { 223 | return IAccessible.fileAt(name); 224 | } 225 | 226 | public static IAccessible tempAt(String name) { 227 | return fileAt(TEMP_DIRECTORY + name); 228 | } 229 | 230 | public static IAccessible
resourceAt(String name) { 231 | return IAccessible.resourceAt(RESOURCES_DIRECTORY + name); 232 | } 233 | } -------------------------------------------------------------------------------- /src/test/java/org/thunlp/thulac/Tests.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac; 2 | 3 | import org.junit.Test; 4 | 5 | import java.io.IOException; 6 | 7 | /** 8 | * 9 | */ 10 | public class Tests { 11 | // test files excluded from git for copyright reasons, users may download them here: 12 | // http://rsarxiv.github.io/2016/11/29/%E4%B8%AD%E6%96%87%E5%88%86%E8%AF%8D%E5%B7%A5%E5%85%B7%E6%B5%8B%E8%AF%84/ 13 | 14 | // @Test 15 | // public void test1() throws IOException { 16 | // TestHelper.run(TestHelper.resourceAt("input_1.txt"), 17 | // TestHelper.tempAt("output_1.txt"), false); 18 | // } 19 | 20 | // @Test 21 | // public void test2() throws IOException { 22 | // TestHelper.testSuite(TestHelper.resourceAt("input_2.txt"), 23 | // TestHelper.resourceAt("compare_2.txt"), 24 | // TestHelper.tempAt("output_2.txt")); 25 | // } 26 | 27 | // @Test 28 | // public void test3() throws IOException { 29 | // // non-Chinese users may see the following line rendered strangely, 30 | // // nevertheless it is only a simple Chinese sentence. 
31 | // System.out.println(Thulac.split("今天,中国人民站起来了!", true)); 32 | // } 33 | 34 | // @Test 35 | // public void test4() throws IOException { 36 | // TestHelper.runProfiler(TestHelper.resourceAt("input_2.txt"), true); 37 | // } 38 | } 39 | -------------------------------------------------------------------------------- /src/test/java/org/thunlp/thulac/data/Dat2WordsConverter.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.data; 2 | 3 | import org.junit.Test; 4 | import org.thunlp.thulac.util.StringUtils; 5 | 6 | import java.io.IOException; 7 | import java.io.PrintWriter; 8 | import java.nio.file.Files; 9 | import java.nio.file.Path; 10 | import java.nio.file.Paths; 11 | import java.util.AbstractMap; 12 | import java.util.Comparator; 13 | import java.util.List; 14 | import java.util.Stack; 15 | import java.util.regex.Matcher; 16 | import java.util.regex.Pattern; 17 | import java.util.stream.Stream; 18 | 19 | /** 20 | * A class which converts {@link Dat} files generated by {@link DatMaker} back to a 21 | * list 22 | * of words. 23 | */ 24 | public class Dat2WordsConverter { 25 | /** 26 | * Converts the given {@link Dat} file generated by {@link DatMaker} to words plus 27 | * line numbers and outputs them through {@code writer}. 28 | * 29 | * @param dat 30 | * The {@link Dat} file to convert. 31 | * @param writer 32 | * The {@link PrintWriter} to output words plus line numbers to. 33 | * @param ln 34 | * Whether to output line numbers. 35 | */ 36 | private static void convert(Dat dat, PrintWriter writer, boolean ln) { 37 | traverseTrieTree(dat, writer, 0, new Stack<>(), ln); 38 | } 39 | 40 | /** 41 | * Traverses the Trie Tree specified by the {@link Dat} file. The file is 42 | * assumed to be generated correctly using {@link DatMaker}, otherwise the behavior 43 | * is undefined.
During the traversal, the words plus line numbers stored within this 44 | Trie Tree are output using {@linkplain PrintWriter#println(String) 45 | writer.println()}.
46 | * This method calls itself recursively. 47 | * 48 | @param dat 49 | The {@link Dat} file. 50 | @param writer 51 | The {@link PrintWriter} to output words to. 52 | @param index 53 | The index of the node to traverse. 54 | @param prefix 55 | The current prefix of this node, as a list of code points. 56 | @param ln 57 | Whether to output line numbers. 58 | */ 59 | private static void traverseTrieTree( 60 | Dat dat, PrintWriter writer, int index, Stack<Integer> prefix, boolean ln) { 61 | int[] d = dat.dat; 62 | int base = d[index << 1], length = dat.datSize; 63 | if (d[(base << 1) + 1] == index && !prefix.isEmpty()) { 64 | writer.print(toString(prefix)); 65 | if (ln) { 66 | writer.print(' '); 67 | writer.println(d[base << 1]); // line number 68 | } else writer.println(); 69 | } 70 | for (int i = base + 1; i < length; ++i) 71 | if (d[(i << 1) + 1] == index) { 72 | prefix.push(i - base); 73 | traverseTrieTree(dat, writer, i, prefix, ln); 74 | prefix.pop(); 75 | } 76 | } 77 | 78 | /** 79 | * Converts a list of code points to a {@link String}. 80 | * 81 | @param codePoints 82 | The list of code points. 83 | * 84 | @return The converted {@link String}. 85 | * 86 | @see StringUtils#toString(int...) 87 | */ 88 | private static String toString(List<Integer> codePoints) { 89 | StringBuilder sb = new StringBuilder(); 90 | for (int codePoint : codePoints) sb.appendCodePoint(codePoint); 91 | return sb.toString(); 92 | } 93 | 94 | /** 95 | * Converts the dat file at models/<name>.dat to words and saves the converted result 96 | * to build/tmp/tests/<name>_text.txt. 97 | * 98 | @param name 99 | The name of the DAT file. 100 | @param ln 101 | Whether to output line numbers. 102 | * 103 | @throws IOException 104 | If an I/O error occurs.
105 | */ 106 | private static void convertAndSave(String name, boolean ln) throws IOException { 107 | Dat dat = new Dat("models/" + name + ".dat"); 108 | PrintWriter writer = new PrintWriter(Files.newBufferedWriter( 109 | Paths.get("build/tmp/tests/" + name + "_text.txt"))); 110 | convert(dat, writer, ln); 111 | writer.close(); 112 | } 113 | 114 | private static final Pattern LINE_PATTERN = Pattern.compile("^(.*)\\s(\\d+)$"); 115 | 116 | /** 117 | * Reads the file generated by {@link #convertAndSave(String, boolean)} at 118 | * build/tmp/tests/<name>_text.txt and sorts the words by their 119 | * corresponding line numbers. Every line of the input file should match {@link 120 | * #LINE_PATTERN}, with the first group being the word and the second 121 | * group being the line number. The sorted result is output to 122 | * build/tmp/tests/<name>_sorted.txt with the line numbers removed, 123 | * containing the words only.
124 | Since the {@link Dat} file as input to {@link #convertAndSave(String, boolean)} is 125 | assumed to be generated using {@link DatMaker#readFromTxtFile(String)}, which 126 | reads from a text file containing a word on each line, the file generated by 127 | this method should be identical to the input file provided to {@link 128 | DatMaker#readFromTxtFile(String)}. 129 | 130 | @param name 131 | The name of the converted file. 132 | 133 | @throws IOException 134 | If an I/O error occurs. 135 | */ 136 | private static void sortAndSave(String name) throws IOException { 137 | // This method makes excessive use of the Java 8 Stream API; advanced knowledge 138 | // of streams is required to read the following code. 139 | 140 | Files.write(Paths.get("build/tmp/tests/" + name + "_sorted.txt"), 141 | (Iterable<String>) Files.lines( 142 | Paths.get("build/tmp/tests/" + name + "_text.txt")) 143 | .map(line -> { 144 | Matcher matcher = LINE_PATTERN.matcher(line); 145 | if (!matcher.find()) return null; 146 | return new AbstractMap.SimpleEntry<>( 147 | Integer.parseInt(matcher.group(2)), 148 | matcher.group(1)); 149 | }) 150 | .sorted(Comparator.comparingInt(AbstractMap.SimpleEntry::getKey)) 151 | .map(AbstractMap.SimpleEntry::getValue)::iterator); 152 | } 153 | 154 | /** 155 | * Converts a stream of {@link Dat} files specified by {@code datFiles} to words plus 156 | * line numbers using {@link #convertAndSave(String, boolean)} and then sorts the 157 | * lines using {@link #sortAndSave(String)}. This method outputs messages to {@link 158 | * System#out} while executing. 159 | * 160 | @param datFiles 161 | The stream of {@link Dat} files; for each {@link String} in {@code datFiles}, 162 | for example, {@code "example"}, the input {@link Dat} file is at {@code 163 | models/example.dat}, the converted file is at {@code 164 | build/tmp/tests/example_text.txt}, and the sorted file is at {@code 165 | build/tmp/tests/example_sorted.txt}.
166 | */ 167 | private void convertAndSort(Stream<String> datFiles) { 168 | datFiles.forEach(datFile -> { 169 | try { 170 | System.out.printf("Converting dat file %s.dat\n", datFile); 171 | convertAndSave(datFile, true); 172 | System.out.printf("Sorting dat file build/tmp/tests/%s_text.txt\n", 173 | datFile); 174 | sortAndSave(datFile); 175 | } catch (IOException e) { 176 | e.printStackTrace(); 177 | } 178 | }); 179 | } 180 | 181 | // @Test 182 | // public void test() throws IOException { 183 | // convertAndSort(Files.list(Paths.get("models/")) 184 | // .parallel() 185 | // .map(Path::getFileName) 186 | // .map(Path::toString) 187 | // .map(String::toLowerCase) 188 | // .filter(filename -> filename.endsWith(".dat")) 189 | // .map(filename -> filename.substring(0, filename.length() - 4)) 190 | // .filter(filename -> !"t2s".equals(filename)) // not Dat file 191 | // .filter(filename -> !"idiom".equals(filename))); // not DatMaker 192 | // // idiom.dat is correct Dat file however not generated by DatMaker 193 | // System.out.println("Converting dat file idiom.dat"); 194 | // convertAndSave("idiom", false); 195 | // } 196 | } 197 | -------------------------------------------------------------------------------- /src/test/java/org/thunlp/thulac/data/DatMakerTest.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.data; 2 | 3 | import org.junit.Test; 4 | import org.thunlp.thulac.IAccessible; 5 | import org.thunlp.thulac.TestHelper; 6 | 7 | import java.io.IOException; 8 | import java.util.List; 9 | 10 | import static org.junit.Assert.assertTrue; 11 | 12 | /** 13 | * Tests for {@link DatMaker}. 14 | */ 15 | public class DatMakerTest { 16 | @Test 17 | public void test() throws IOException { 18 | IAccessible file = TestHelper.resourceAt("dat_maker_test_1.txt"); 19 | Dat dat = DatMaker.readFromInputStream(file.toInputStream()); 20 | List<String> lines = file.getLines(); 21 | for (String line : lines) assertTrue(line, dat.contains(line)); 22 | } 23 | } 24
| -------------------------------------------------------------------------------- /src/test/java/org/thunlp/thulac/util/CodePointUtilsTest.java: -------------------------------------------------------------------------------- 1 | package org.thunlp.thulac.util; 2 | 3 | import org.junit.Test; 4 | 5 | import java.util.Arrays; 6 | 7 | import static org.junit.Assert.assertNotEquals; 8 | 9 | /** 10 | * 11 | */ 12 | public class CodePointUtilsTest { 13 | // the original one 14 | private static final String OTHER_CODE_POINTS = 15 | StringUtils.toString(65292, 12290, 65311, 65281, 65306, 65307, 8216, 8217, 16 | 8220, 8221, 12304, 12305, 12289, 12298, 12299, 126, 183, 64, 124, 35, 17 | 65509, 37, 8230, 38, 42, 65288, 65289, 8212, 45, 43, 61, 44, 46, 60, 18 | 62, 63, 47, 33, 59, 58, 39, 34, 123, 125, 91, 93, 92, 124, 35, 36, 37, 19 | 94, 38, 42, 40, 41, 95, 45, 43, 61, 9700, 9734, 9733, 65, 66, 67, 20 | 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 21 | 85, 86, 87, 88, 89, 90, 97, 98, 99, 100, 101, 102, 103, 104, 105, 22 | 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 23 | 120, 121, 122, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57); 24 | 25 | @Test 26 | public void test() { 27 | // equality test 28 | Arrays.stream(StringUtils.toCodePoints(OTHER_CODE_POINTS)) 29 | .forEach(ch -> assertNotEquals(String.valueOf(ch), 30 | -1, CodePointUtils.SPECIAL_CHARS.indexOf(ch))); 31 | Arrays.stream(StringUtils.toCodePoints(CodePointUtils.SPECIAL_CHARS)) 32 | .forEach(ch -> assertNotEquals(String.valueOf(ch), 33 | -1, OTHER_CODE_POINTS.indexOf(ch))); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/test/resources/dat_maker_test_1.txt: -------------------------------------------------------------------------------- 1 | A 2 | AB 3 | ABC 4 | AC 5 | AD 6 | AE 7 | B 8 | BC 9 | BCDEFG 10 | BCDEGF 11 | BBCCDD 12 | BE 13 | BF 14 | BFF 15 | C 16 | D 17 | E 18 | F 19 | G 
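DatMakerTest above builds a Dat trie from this word list and asserts that the trie contains every word that went in. The invariant it checks can be sketched with a plain HashSet standing in for the Dat trie; this is only an illustration of the test's logic, the real test exercises the project's own DatMaker.readFromInputStream and Dat.contains.

```java
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

// Stand-in for DatMakerTest: build a dictionary from the word list above and
// verify that a lookup succeeds for every word that was inserted.
public class DatMakerSketch {
    public static void main(String[] args) {
        List<String> words = Arrays.asList("A", "AB", "ABC", "AC", "AD", "AE",
                "B", "BC", "BCDEFG", "BCDEGF", "BBCCDD", "BE", "BF", "BFF",
                "C", "D", "E", "F", "G");
        Set<String> dictionary = new HashSet<>(words); // the real test uses DatMaker/Dat here
        for (String word : words)
            if (!dictionary.contains(word))
                throw new AssertionError(word + " missing from dictionary");
        System.out.println("all " + words.size() + " words found");
    }
}
```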
--------------------------------------------------------------------------------