├── HISTORY.md
├── README.md
├── REVISION.md
├── TODO.md
├── dict
    ├── sentiment
    │   ├── 主张词语（中文）.txt
    │   ├── 主张词语（英文）.txt
    │   ├── 正面情感词语（中文）.txt
    │   ├── 正面情感词语（英文）.txt
    │   ├── 正面评价词语（中文）.txt
    │   ├── 正面评价词语（英文）.txt
    │   ├── 程度级别词语（中文）.txt
    │   ├── 程度级别词语（英文）.txt
    │   ├── 统计结果.txt
    │   ├── 负面情感词语（中文）.txt
    │   ├── 负面情感词语（英文）.txt
    │   ├── 负面评价词语（中文）.txt
    │   └── 负面评价词语（英文）.txt
    ├── tendency
    │   └── tendency.xml
    └── user-concept.xml
├── docs
    ├── LCMC.zip
    └── 中文信息相似度计算理论与方法图书目录.pdf
├── pom.xml
├── src
    ├── main
    │   ├── java
    │   │   └── ruc
    │   │   │   └── irm
    │   │   │       ├── classification
    │   │   │           ├── Feature.java
    │   │   │           ├── Instance.java
    │   │   │           ├── NaiveBayesClassifier.java
    │   │   │           └── Variable.java
    │   │   │       ├── similarity
    │   │   │           ├── Similaritable.java
    │   │   │           ├── SimilarityFactory.java
    │   │   │           ├── phrase
    │   │   │           │   └── PhraseSimilarity.java
    │   │   │           ├── sentence
    │   │   │           │   ├── SegmentProxy.java
    │   │   │           │   ├── SentenceSimilarity.java
    │   │   │           │   ├── editdistance
    │   │   │           │   │   ├── Block.java
    │   │   │           │   │   ├── CharEditUnit.java
    │   │   │           │   │   ├── ChunkEditUnit.java
    │   │   │           │   │   ├── EditDistance.java
    │   │   │           │   │   ├── EditUnit.java
    │   │   │           │   │   ├── GregorEditDistance.java
    │   │   │           │   │   ├── Split.java
    │   │   │           │   │   ├── StandardEditDistance.java
    │   │   │           │   │   ├── SuperString.java
    │   │   │           │   │   ├── WordEditUnit.java
    │   │   │           │   │   ├── XiatianEditDistance.java
    │   │   │           │   │   └── XiatianEditDistance2.java
    │   │   │           │   └── morphology
    │   │   │           │   │   ├── MorphoSimilarity.java
    │   │   │           │   │   └── SemanticSimilarity.java
    │   │   │           ├── statistic
    │   │   │           │   ├── DictStatistic.java
    │   │   │           │   └── LCMC.java
    │   │   │           ├── text
    │   │   │           │   └── DiceSimilarity.java
    │   │   │           ├── util
    │   │   │           │   ├── About.java
    │   │   │           │   ├── BlankUtils.java
    │   │   │           │   ├── EditDistance.java
    │   │   │           │   ├── FileUtils.java
    │   │   │           │   ├── MathUtils.java
    │   │   │           │   ├── PinyinUtils.java
    │   │   │           │   ├── TraverseEvent.java
    │   │   │           │   ├── XmlException.java
    │   │   │           │   └── XmlUtils.java
    │   │   │           └── word
    │   │   │           │   ├── CharBasedSimilarity.java
    │   │   │           │   ├── WordSimilarity.java
    │   │   │           │   ├── cilin
    │   │   │           │       ├── Cilin.java
    │   │   │           │       ├── CilinCoding.java
    │   │   │           │       └── CilinDb.java
    │   │   │           │   ├── hownet
    │   │   │           │       ├── Hownet.java
    │   │   │           │       ├── HownetMeta.java
    │   │   │           │       ├── concept
    │   │   │           │       │   ├── Concept.java
    │   │   │           │       │   ├── ConceptDictTraverseEvent.java
    │   │   │           │       │   ├── ConceptLinkedList.java
    │   │   │           │       │   ├── ConceptParser.java
    │   │   │           │       │   ├── LiuConceptParser.java
    │   │   │           │       │   ├── MyConceptParser.java
    │   │   │           │       │   └── concept.dat
    │   │   │           │       └── sememe
    │   │   │           │       │   ├── FastSimpleMap.java
    │   │   │           │       │   ├── LiuqunSememeParser.java
    │   │   │           │       │   ├── MySememeParser.java
    │   │   │           │       │   ├── Sememe.java
    │   │   │           │       │   ├── SememeDictTraverseEvent.java
    │   │   │           │       │   ├── SememeParser.java
    │   │   │           │       │   ├── SememeType.java
    │   │   │           │       │   └── sememe.dat
    │   │   │           │   ├── hownet2
    │   │   │           │       ├── concept
    │   │   │           │       │   ├── BaseConceptParser.java
    │   │   │           │       │   ├── Concept.java
    │   │   │           │       │   ├── ConceptDictTraverseEvent.java
    │   │   │           │       │   ├── ConceptLinkedList.java
    │   │   │           │       │   ├── LiuConceptParser.java
    │   │   │           │       │   └── XiaConceptParser.java
    │   │   │           │       └── sememe
    │   │   │           │       │   ├── BaseSememeParser.java
    │   │   │           │       │   ├── LiuqunSememeParser.java
    │   │   │           │       │   ├── Sememe.java
    │   │   │           │       │   ├── SememeType.java
    │   │   │           │       │   └── XiaSememeParser.java
    │   │   │           │   └── pinyin
    │   │   │           │       └── PinyinSimilarity.java
    │   │   │       ├── tendency
    │   │   │           └── word
    │   │   │           │   ├── HownetWordTendency.java
    │   │   │           │   ├── Training.java
    │   │   │           │   └── WordTendency.java
    │   │   │       └── ui
    │   │   │           ├── PhraseSimilarityUI.java
    │   │   │           ├── SememeTreeUI.java
    │   │   │           ├── SentenceSimilarityUI.java
    │   │   │           ├── Start.java
    │   │   │           ├── TendencyUI.java
    │   │   │           └── WordSimlarityUI.java
    │   └── resources
    │   │   ├── about.html
    │   │   ├── data
    │   │       ├── F02-GB2312-to-PuTongHua-PinYin.txt
    │   │       ├── cilin.db.gz
    │   │       ├── concept.xml.gz
    │   │       └── sememe.xml.gz
    │   │   ├── log4j.dtd
    │   │   └── log4j.xml
    └── test
    │   └── java
    │       └── ruc
    │           └── irm
    │               └── similarity
    │                   ├── sentence
    │                       ├── MorphoSimilarityTest.java
    │                       └── SemanticSimilarityTest.java
    │                   ├── statistic
    │                       └── DictStatisticTest.java
    │                   └── word
    │                       ├── CharBasedSimilarityTest.java
    │                       ├── hownet
    │                           ├── ConceptTest.java
    │                           └── SememeTest.java
    │                       └── hownet2
    │                           └── HownetSimilarityTest.java
└── 中文信息相似度计算理论与方法图书目录.pdf


/HISTORY.md:
--------------------------------------------------------------------------------
1 | 变更历史
2 | ================
3 | 
4 | 2014-04： 把中文分词由原先的ictclas4j替换为ansj，在此对原作者表示感谢！把工程更改为maven工程，方便管理。
5 | 2014-08： 修正了SemanticSimilarity中的数组循环错误


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | 说明
 3 | =====================
 4 | 汉语词语、组块、句子以及文本篇章等各个层面的相似度计算是中文信息处理领域的一项基础而又核心的工作，它直接决定着相关领域的研究发展状况，例如，在知识工程、基于实例的机器翻译、信息检索、自动问答以及拼写检查等方面，相似度计算都是一个非常关键的问题，长期以来一直是人们研究的一个热点和难点。相似度的研究涉及词语、组块、句子以及篇章等多个层面，目前的研究主要侧重于词语方面，提出了一些比较有代表性的理论与方法，如字面相似度算法、词素相似度算法，以及基于同义词词林、知网等语义词典的方法，国外的方法则主要包括基于构成字符的相似度计算方法、基于WORDNET的计算方法、基于词典注释的方法、基于大规模语料库统计的方法和基于搜索引擎的方法；有关组块、短语级别的相似度的研究现在还比较少，常用的方法是在词语相似度计算的基础上，借用句子相似度的计算方法计算组块之间的相似度。在句子层面的相似度计算方面，国外研究主要集中在字符串的相似度计算，国内则主要以词语为基本处理单元，通过计算相同词语所占的比重确定句子之间的相似度；文本层面的则集中于利用统计方法实现相似度计算。
 5 | 
 6 | xsimilarity项目为我们在相似度计算领域所取得的部分成果的Java代码实现，部分凌乱的代码已被去除，待重构之后再加入到工程之中。在相似度计算的研究过程中，许多研究学者的成果公布和无私帮助让我们受益匪浅，我们把代码开源出来，既是对前辈们表达我们的尊重之情，也希望能对大家共同的研究社区有点滴贡献，避免一些重复工作。
 7 | 
 8 | xsimilarity项目中所体现的思想或许还比较幼稚，希望高手们能用宽容的胸襟对待，并不吝赐教，我们也将根据研究进展情况和大家的实际需求，不断改进，同时也欢迎大家加入到这个项目的开发过程中来，共同推进相似度计算在中国的研究。
 9 | 
10 | xsimilarity项目中的理论知识大家可以参考docs目录下的文章，以及《中文信息相似度计算理论与方法》一书，重要的参考资料、程序资源在书中已经提到，如有需要，我们在今后将单独整理成列表，供大家参考。
11 | 
12 | 大家可以通过Eclipse导入项目，并运行ruc.irm.ui.Start进行快速测试。
13 | 
14 | 联系方式：xiat(at)ruc.edu.cn
15 | 
16 | 
17 | 编译运行
18 | =======================
19 | 首先确保系统中安装maven.
20 | 
21 | 如果要生成Intellij IDEA的工程文件，请进入命令行，在项目主目录下执行：
22 | 
23 | ```mvn idea:idea```
24 | 
25 | 如要生成eclipse的工程文件，则执行：
26 | 
27 | ```mvn eclipse:eclipse```
28 | 
29 | 要编译代码并在命令行运行测试：
30 | 
31 | ```mvn compile```
32 | 
33 | ```mvn dependency:copy-dependencies```
34 | 
35 | ```./run.py Start```
36 | 
37 | 即可打开主界面，进行测试
38 | 
39 | （注：开发测试所用的操作系统为Ubuntu，如为Windows，请自行修改run.py脚本）
40 | 
41 | 
42 | 
43 | 设想
44 | ========================
45 | 尝试把潜在和显性语义分析技术加入到xsimilarity中，并简化使用方式，方便初学者使用，但因个人精力受限，目前尚未开始集成处理。
46 | 
47 | 定个时间点：如果star数量超过500，再开始更新并把最近几年的相关研究成果集成进去。
48 | 
49 | 
50 | 欢迎有兴趣的人员与我联系，一起扩展xsimilarity的功能和实用性。
51 | 
52 | 
53 | 
54 | 致谢
55 | ========================
56 | ansj中文分词
57 | 
58 | 


--------------------------------------------------------------------------------
/REVISION.md:
--------------------------------------------------------------------------------
 1 | 错误修订
 2 | =====================
 3 | 
 4 | 1. 第三章概念词语的相似度计算部分的公式：
 5 | $Sim(C_1, C_2) = \beta_1 Sim_1(C_1, C_2) + \sum_{i=2}^{4} \beta_1 \beta_i Sim_i(C_1, C_2)$
 6 | 应为： $Sim(C_1, C_2) = \beta_1 Sim_1(C_1, C_2) + \sum_{i=2}^{4} Sim_1(C_1, C_2)\, \beta_i Sim_i(C_1, C_2)$
 7 | 可参考以下代码实现：
 8 |     @Override
 9 |     protected double calculate(double sim_v1, double sim_v2, double sim_v3, double sim_v4) {
10 |         return beta1 * sim_v1 + beta2 * sim_v1 * sim_v2 + beta3 * sim_v1 * sim_v3 + beta4 * sim_v1 * sim_v4;
11 |     }
12 | 


--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
1 | PLAN
2 | =============================
3 | 
4 | * 加入ESA和LSA处理，这两部分已经单独实现，但都比较复杂，如有精力和时间，考虑把ESA的某个快照结果打包，加入xsimilarity


--------------------------------------------------------------------------------
/dict/sentiment/主张词语（中文）.txt:
--------------------------------------------------------------------------------
 1 | 中文主张词语    38
 2 | 
 3 | 1. {perception|感知}  22  
 4 | 察觉
 5 | 触目
 6 | 耳闻
 7 | 发
 8 | 发觉
 9 | 发现
10 | 风闻
11 | 感
12 | 感觉
13 | 感觉到
14 | 感受到
15 | 见到
16 | 见得
17 | 觉
18 | 觉得
19 | 看得出来
20 | 窥见
21 | 领教
22 | 听说
23 | 痛感
24 | 预感
25 | 自觉
26 | 
27 | 2. {regard|认为}  16
28 | 抱定
29 | 当
30 | 道
31 | 感到
32 | 感觉
33 | 觉得
34 | 看
35 | 看待
36 | 论
37 | 认定
38 | 认为
39 | 认准
40 | 想
41 | 相信
42 | 以为
43 | 主张
44 | 


--------------------------------------------------------------------------------
/dict/sentiment/主张词语（英文）.txt:
--------------------------------------------------------------------------------
 1 | 英文主张词语    35
 2 | 
 3 | 1. {perception|感知}  21
 4 | be aware of
 5 | be conscious
 6 | be conscious of
 7 | be told
 8 | become aware of
 9 | detect
10 | discern
11 | discover
12 | feel
13 | find
14 | get a glimpse of
15 | get wind of
16 | have a premonition
17 | hear of
18 | keenly feel
19 | learn through hearsay
20 | meet the eye
21 | notice
22 | perceive
23 | see
24 | sense
25 | 
26 | {regard|认为} 14
27 | advocate
28 | believe
29 | consider
30 | feel
31 | firmly believe
32 | hold
33 | look upon
34 | maintain
35 | regard
36 | sense
37 | set one's mind on
38 | stand for
39 | suppose
40 | think


--------------------------------------------------------------------------------
/dict/sentiment/程度级别词语（中文）.txt:
--------------------------------------------------------------------------------
  1 | 中文程度级别词语    219
  2 | 
  3 | 1. “极其|extreme / 最|most”  69
  4 | 百分之百
  5 | 倍加
  6 | 备至
  7 | 不得了
  8 | 不堪
  9 | 不可开交
 10 | 不亦乐乎
 11 | 不折不扣
 12 | 彻头彻尾
 13 | 充分
 14 | 到头
 15 | 地地道道
 16 | 非常
 17 | 极
 18 | 极度
 19 | 极端
 20 | 极其
 21 | 极为
 22 | 截然
 23 | 尽
 24 | 惊人地
 25 | 绝
 26 | 绝顶
 27 | 绝对
 28 | 绝对化
 29 | 刻骨
 30 | 酷
 31 | 满
 32 | 满贯
 33 | 满心
 34 | 莫大
 35 | 奇
 36 | 入骨
 37 | 甚为
 38 | 十二分
 39 | 十分
 40 | 十足
 41 | 死
 42 | 滔天
 43 | 痛
 44 | 透
 45 | 完全
 46 | 完完全全
 47 | 万
 48 | 万般
 49 | 万分
 50 | 万万
 51 | 无比
 52 | 无度
 53 | 无可估量
 54 | 无以复加
 55 | 无以伦比
 56 | 要命
 57 | 要死
 58 | 已极
 59 | 已甚
 60 | 异常
 61 | 逾常
 62 | 贼
 63 | 之极
 64 | 之至
 65 | 至极
 66 | 卓绝
 67 | 最为
 68 | 佼佼
 69 | 郅
 70 | 綦
 71 | 齁
 72 | 最
 73 | 
 74 | 2. “很|very” 42
 75 | 不过
 76 | 不少
 77 | 不胜
 78 | 惨
 79 | 沉
 80 | 沉沉
 81 | 出奇
 82 | 大为
 83 | 多
 84 | 多多
 85 | 多加
 86 | 多么
 87 | 分外
 88 | 格外
 89 | 够瞧的
 90 | 够戗
 91 | 好
 92 | 好不
 93 | 何等
 94 | 很
 95 | 很是
 96 | 坏
 97 | 可
 98 | 老
 99 | 老大
100 | 良
101 | 颇
102 | 颇为
103 | 甚
104 | 实在
105 | 太
106 | 太甚
107 | 特
108 | 特别
109 | 尤
110 | 尤其
111 | 尤为
112 | 尤以
113 | 远
114 | 着实
115 | 曷
116 | 碜
117 | 
118 | 3. “较|more” 37
119 | 大不了
120 | 多
121 | 更
122 | 更加
123 | 更进一步
124 | 更为
125 | 还
126 | 还要
127 | 较
128 | 较比
129 | 较为
130 | 进一步
131 | 那般
132 | 那么
133 | 那样
134 | 强
135 | 如斯
136 | 益
137 | 益发
138 | 尤甚
139 | 逾
140 | 愈
141 | 愈 ... 愈
142 | 愈发
143 | 愈加
144 | 愈来愈
145 | 愈益
146 | 远远
147 | 越 ... 越
148 | 越发
149 | 越加
150 | 越来越
151 | 越是
152 | 这般
153 | 这样
154 | 足
155 | 足足
156 | 
157 | 4. “稍|-ish” 29
158 | 点点滴滴
159 | 多多少少
160 | 怪
161 | 好生
162 | 还
163 | 或多或少
164 | 略
165 | 略加
166 | 略略
167 | 略微
168 | 略为
169 | 蛮
170 | 稍
171 | 稍稍
172 | 稍微
173 | 稍为
174 | 稍许
175 | 挺
176 | 未免
177 | 相当
178 | 些
179 | 些微
180 | 些小
181 | 一点
182 | 一点儿
183 | 一些
184 | 有点
185 | 有点儿
186 | 有些
187 | 
188 | 5. “欠|insufficiently” 12
189 | 半点
190 | 不大
191 | 不丁点儿
192 | 不甚
193 | 不怎么
194 | 聊
195 | 没怎么
196 | 轻度
197 | 弱
198 | 丝毫
199 | 微
200 | 相对
201 | 
202 | 6. “超|over” 30
203 | 不为过
204 | 超
205 | 超额
206 | 超外差
207 | 超微结构
208 | 超物质
209 | 出头
210 | 多
211 | 浮
212 | 过
213 | 过度
214 | 过分
215 | 过火
216 | 过劲
217 | 过了头
218 | 过猛
219 | 过热
220 | 过甚
221 | 过头
222 | 过于
223 | 过逾
224 | 何止
225 | 何啻
226 | 开外
227 | 苦
228 | 老
229 | 偏
230 | 强
231 | 溢
232 | 忒
233 | 
234 | 
235 | 
236 | 


--------------------------------------------------------------------------------
/dict/sentiment/程度级别词语（英文）.txt:
--------------------------------------------------------------------------------
  1 | 英文程度级别词语    170
  2 | 
  3 | 1. “极其|extreme / 最|most”  64
  4 | 100 percent
  5 | absolute
  6 | absolutely
  7 | alarmingly
  8 | amazingly
  9 | as fully as possible
 10 | astonishingly
 11 | awfully
 12 | beyond challenge
 13 | beyond compare
 14 | beyond comparison
 15 | beyond measure
 16 | bitterly
 17 | by all means
 18 | completely
 19 | deep-rooted
 20 | deep-seated
 21 | deeply
 22 | definitely
 23 | disastrously
 24 | downright
 25 | entirely
 26 | exceedingly
 27 | excessively
 28 | extreme
 29 | extremely
 30 | fully
 31 | greatest
 32 | greatly
 33 | heinous
 34 | hundred-percent
 35 | immensely
 36 | immoderate
 37 | in a penetrating way
 38 | in every possible way
 39 | in the extreme
 40 | incomparably
 41 | ingrained
 42 | matchlessly
 43 | monstrous
 44 | most
 45 | of the highest degree
 46 | out-and-out
 47 | outstanding
 48 | outstandingly
 49 | reach the limit
 50 | right-down
 51 | sharply
 52 | sheer
 53 | superb
 54 | terribly
 55 | to death
 56 | to the full
 57 | to the letter
 58 | to the limit
 59 | to the marrow
 60 | to the utmost
 61 | totally
 62 | towering
 63 | unusually
 64 | utmost
 65 | utterly
 66 | very much
 67 | most
 68 | 
 69 | 2. “很|very” 25
 70 | a lot
 71 | awfully
 72 | badly
 73 | better
 74 | by far
 75 | considerably
 76 | deep
 77 | disastrously
 78 | especially
 79 | extraordinarily
 80 | extremely
 81 | greatly
 82 | how
 83 | however
 84 | indeed
 85 | much
 86 | particularly
 87 | really
 88 | terribly
 89 | to a serious degree
 90 | too far
 91 | too much
 92 | unusually
 93 | very
 94 | what a
 95 | 
 96 | 3. “较|more” 22
 97 | all the more
 98 | as much as
 99 | at the worst
100 | by far
101 | comparatively
102 | even more
103 | further
104 | further more
105 | in that way
106 | increasingly
107 | like that
108 | more
109 | more and more
110 | more so
111 | much more
112 | plus
113 | relatively
114 | slightly more
115 | so
116 | still more
117 | such
118 | the more ... the more
119 | 
120 | 4. “稍|-ish” 15
121 | a bit
122 | a bit too
123 | a little
124 | a little bit
125 | a little more
126 | fairly
127 | more or less
128 | passably
129 | pretty
130 | quite
131 | rather
132 | slightly
133 | some
134 | somewhat
135 | to some extent
136 | 
137 | 5. “欠|insufficiently” 11
138 | a little less
139 | just
140 | light
141 | merely
142 | not particularly
143 | not too
144 | not very
145 | relative
146 | slight
147 | slightest degree of
148 | slightly
149 | 
150 | 6. “超|over” 33
151 | a little over
152 | above
153 | above measure
154 | above quota
155 | and more
156 | excessive
157 | excessively
158 | exorbitance
159 | extra
160 | far more than
161 | hyperphysical
162 | inflated
163 | inordinate
164 | not too much
165 | odd
166 | outrageousness
167 | over
168 | over-
169 | overdone
170 | overheated
171 | plus
172 | slightly more
173 | super
174 | superheated
175 | superheterodyne
176 | surplus
177 | to a fault
178 | too
179 | too much
180 | ultra
181 | ultrastructural
182 | undue
183 | unduly
184 | 
185 | 
186 | 


--------------------------------------------------------------------------------
/dict/tendency/tendency.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0"?>
 2 | 
 3 | <!-- 
 4 | 情感规则描述文件
 5 | -->
 6 | <tendency>
 7 | 	<positive>
 8 | 		<!-- 逗号隔开的义原列表 -->
 9 | 	   <sememe-list>
10 | 	   	   <sememe name="良" weight="1"/>
11 | 		   <sememe name="喜悦" weight="1"/>
12 | 	   </sememe-list>
13 | 	   <words>
14 | 	   	<word name="NIUBI" type="拼音" sentiment="1.0"/>
15 | 		<word name="NB" sentiment="1.0"/>
16 | 	   </words>
17 | 	</positive>
18 |     
19 | </tendency>
20 | 


--------------------------------------------------------------------------------
/dict/user-concept.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0"?>
 2 | <concepts>
 3 | 	<!--
 4 | 	example:
 5 | 	<c w="汉语词语" p="词性，取值为：V|N|ADJ|NUM|PREP等" d="对应的义原形式的定义"/>
 6 | 	-->
 7 |   	<c w="三聚氰胺" p="N" d="material|材料,#drinks|饮品"/>
 8 | 	<c w="山寨" p="V" d="produce|制造,means=imitate|模仿,pretend|假装,content=RegardAs|当作"/> 
 9 | </concepts>
10 | 


--------------------------------------------------------------------------------
/docs/LCMC.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamxiatian/xsimilarity/812ec85d3341731e615c3cd91af17c1df4840853/docs/LCMC.zip


--------------------------------------------------------------------------------
/docs/中文信息相似度计算理论与方法图书目录.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamxiatian/xsimilarity/812ec85d3341731e615c3cd91af17c1df4840853/docs/中文信息相似度计算理论与方法图书目录.pdf


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
 1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 2 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 3 |     <modelVersion>4.0.0</modelVersion>
 4 |     <groupId>ruc.irm</groupId>
 5 |     <artifactId>xsimilarity</artifactId>
 6 |     <packaging>jar</packaging>
 7 |     <name>xsimilarity</name>
 8 |     <version>0.1</version>
 9 |     <description>xsimilarity</description>
10 |     <url>https://github.com/iamxiatian/xsimilarity</url>
11 |     <licenses>
12 |         <license>
13 |             <name>The Apache Software License, Version 2.0</name>
14 |             <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
15 |             <distribution>repo</distribution>
16 |         </license>
17 |     </licenses>
18 | 
19 | 
20 |     <repositories>
21 |         <repository>
22 |             <id>cengtral</id>
23 |             <url>https://repo1.maven.org/maven2/</url>
24 |         </repository>
25 |     </repositories>
26 | 
27 |     <developers>
28 |         <developer>
29 |             <id>summer</id>
30 |             <name>summer</name>
31 |             <email>xiat(at)ruc.edu.cn</email>
32 |         </developer>
33 |     </developers>
34 | 
35 |     <properties>
36 |         <maven.compiler.source>1.8</maven.compiler.source>
37 |         <maven.compiler.target>1.8</maven.compiler.target>
38 |         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
39 |         <commons.lang3.version>3.3.1</commons.lang3.version>
40 |         <slf4j.version>1.7.1</slf4j.version>
41 |         <logback.version>1.2.3</logback.version>
42 |     </properties>
43 | 
44 |     <dependencies>
45 |         <dependency>
46 |             <groupId>org.slf4j</groupId>
47 |             <artifactId>slf4j-api</artifactId>
48 |             <version>${slf4j.version}</version>
49 |         </dependency>
50 | 
51 |         <dependency>
52 |             <groupId>ch.qos.logback</groupId>
53 |             <artifactId>logback-core</artifactId>
54 |             <version>${logback.version}</version>
55 |         </dependency>
56 | 
57 |         <dependency>
58 |             <groupId>ch.qos.logback</groupId>
59 |             <artifactId>logback-classic</artifactId>
60 |             <version>${logback.version}</version>
61 |         </dependency>
62 | 
63 |         <dependency>
64 |             <groupId>org.apache.commons</groupId>
65 |             <artifactId>commons-lang3</artifactId>
66 |             <version>${commons.lang3.version}</version>
67 |         </dependency>
68 | 
69 | 
70 |         <dependency>
71 |             <groupId>com.google.guava</groupId>
72 |             <artifactId>guava</artifactId>
73 |             <version>23.5-jre</version>
74 |         </dependency>
75 | 
76 |         <dependency>
77 |             <groupId>org.ansj</groupId>
78 |             <artifactId>ansj_seg</artifactId>
79 |             <version>5.1.1</version>
80 |         </dependency>
81 | 
82 |         <dependency>
83 |             <groupId>junit</groupId>
84 |             <artifactId>junit</artifactId>
85 |             <version>4.12</version>
86 |             <scope>test</scope>
87 |         </dependency>
88 |     </dependencies>
89 | 
90 | </project>
91 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/classification/Feature.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.classification;
 2 | 
 3 | import java.io.DataInput;
 4 | import java.io.DataOutput;
 5 | import java.io.IOException;
 6 | import java.util.HashMap;
 7 | import java.util.Map;
 8 | 
 9 | /**
10 |  * 文档的特征
11 |  * 
12 |  * @author xiatian
13 |  *
14 |  */
15 | public class Feature {
16 | 	/** 每个关键词在不同类别中出现的文档数量 */
17 | 	private Map<String, Integer> docCountMap = new HashMap<String, Integer>();
18 | 	/** 特征名称 */
19 | 	private String name;
20 | 	
21 | 	public String getName() {
22 | 		return name;
23 | 	}
24 | 	public void setName(String name) {
25 | 		this.name = name;
26 | 	}
27 | 	public void incDocCount(String category){
28 | 		if(docCountMap.containsKey(category)){
29 | 			docCountMap.put(category, docCountMap.get(category)+1);
30 | 		}else{
31 | 			docCountMap.put(category, 1);
32 | 		}
33 | 	}
34 | 	public int getDocCount(String category){
35 | 		if(docCountMap.containsKey(category)){
36 | 			return docCountMap.get(category);
37 | 		}else{
38 | 			return 0;
39 | 		}
40 | 	}
41 | 	
42 | 	public void write(DataOutput out) throws IOException{
43 | 		out.writeUTF(name==null?"":name);
44 | 		
45 | 		out.writeInt(docCountMap.size());
46 | 		for(String category:docCountMap.keySet()){
47 | 			out.writeUTF(category);
48 | 			out.writeInt(docCountMap.get(category));
49 | 		}
50 | 	}
51 | 	
52 | 	public void readFields(DataInput in) throws IOException {
53 | 		this.name = in.readUTF();
54 | 		
55 | 		docCountMap = new HashMap<String, Integer>();
56 | 		int size = in.readInt();
57 | 		for(int i=0; i<size; i++){
58 | 			String category = in.readUTF();
59 | 			int docCount = in.readInt();
60 | 			docCountMap.put(category, docCount);
61 | 		}
62 | 	}
63 | 	
64 | 	public static Feature read(DataInput in) throws IOException{
65 | 		Feature f = new Feature();
66 | 		f.readFields(in);
67 | 		return f;
68 | 	}
69 | 
70 | }
71 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/classification/Instance.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.classification;
 2 | 
 3 | import java.io.BufferedReader;
 4 | import java.io.File;
 5 | import java.io.FileInputStream;
 6 | import java.io.IOException;
 7 | import java.io.InputStreamReader;
 8 | import java.util.HashSet;
 9 | import java.util.List;
10 | import java.util.Set;
11 | 
12 | import ruc.irm.similarity.sentence.SegmentProxy;
13 | import ruc.irm.similarity.sentence.SegmentProxy.Word;
14 | 
15 | /**
16 |  * 代表一个文档实例
17 |  * 
18 |  * @author xiatian
19 |  * 
20 |  */
21 | public class Instance {
22 | 	/** 文档类别 */
23 | 	private String category;
24 | 	/** 文档内容 */
25 | 	private Set<String> bag = new HashSet<String>();
26 | 
27 | 	public Instance() {
28 | 	}
29 | 
30 | 	public Instance(String category, File f, String encoding) {
31 | 		this.category = category;
32 | 		String line = null;
33 | 		
34 | 		try {
35 | 			BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(f), encoding));
36 | 		
37 | 			while ((line = in.readLine()) != null) {
38 | 				System.out.println(line);
39 | 				List<Word> words = SegmentProxy.segment(line);
40 | 				for(Word w:words) {
41 |                     if (w.getPos().endsWith("adj")
42 |                             || w.getPos().startsWith("n")
43 |                             || w.getPos().startsWith("v")) {
44 |                         bag.add(w.getWord());
45 |                     }
46 |                 }
47 |             }
48 | 		} catch (IOException e) {
49 | 			System.out.println("current file:" + f.getAbsolutePath());
50 | 			System.out.println("current line:" + line);
51 | 			e.printStackTrace();
52 | 		}
53 | 	}
54 | 
55 | 	public String getCategory() {
56 | 		return category;
57 | 	}
58 | 
59 | 	public void setCategory(String category) {
60 | 		this.category = category;
61 | 	}
62 | 
63 | 	public Set<String> getWords() {
64 | 		return bag;
65 | 	}
66 | 
67 | }
68 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/classification/NaiveBayesClassifier.java:
--------------------------------------------------------------------------------
  1 | package ruc.irm.classification;
  2 | 
  3 | import java.io.DataInputStream;
  4 | import java.io.DataOutput;
  5 | import java.io.DataOutputStream;
  6 | import java.io.File;
  7 | import java.io.FileInputStream;
  8 | import java.io.FileOutputStream;
  9 | import java.io.IOException;
 10 | import java.util.Collection;
 11 | import java.util.HashMap;
 12 | import java.util.Map;
 13 | 
 14 | public class NaiveBayesClassifier {
 15 | 	/**
 16 | 	 * 记录每个类别下出现的文档数量, 用于计算P(C)使用
 17 | 	 */
 18 | 	Variable VARIABLE = new Variable();
 19 | 
 20 | 	/**
 21 | 	 * 词语在所有类别中的总数量
 22 | 	 */
 23 | 	Map<String, Integer> TERM_TOTAL_COUNT = new HashMap<String, Integer>();
 24 | 
 25 | 	/**
 26 | 	 * 训练一篇文档
 27 | 	 * @param doc
 28 | 	 */
 29 | 	public void training(Instance doc) {
 30 | 		VARIABLE.addInstance(doc);		
 31 | 	}
 32 | 	
 33 | 	/**
 34 | 	 * 保存训练结果
 35 | 	 * @throws IOException 
 36 | 	 */
 37 | 	void save(File file) throws IOException{		
 38 | 		DataOutput out = new DataOutputStream(new FileOutputStream(file));
 39 | 		VARIABLE.write(out);
 40 | 	}
 41 | 	
 42 | 	public void load(File file) throws IOException{
 43 | 		DataInputStream in = new DataInputStream(new FileInputStream(file));
 44 | 		VARIABLE = Variable.read(in);
 45 | 	}
 46 | 
 47 | 	/**
 48 | 	 * 计算P（C)
 49 | 	 * @param category
 50 | 	 * @return
 51 | 	 */
 52 | 	public double getCategoryProbability(String category){
 53 | 		return Math.log(VARIABLE.getDocCount(category)*1.0f/VARIABLE.getDocCount());
 54 | 	}
 55 | 	
 56 | 	/**
 57 | 	 * 计算P(feature|cateogry),返回的是取对数后的数值
 58 | 	 * @param feature
 59 | 	 * @param category
 60 | 	 * @return
 61 | 	 */
 62 | 	public double getFeatureProbability(String feature, String category){
 63 | 		int m = VARIABLE.getFeatureCount();
 64 | 		return Math.log((VARIABLE.getDocCount(feature, category)+1.0)/(VARIABLE.getDocCount(category)+m));
 65 | 	}
 66 | 	
 67 | 	/**
 68 | 	 * 计算给定实例文档属于指定类别的概率，返回的是取对数后的数值
 69 | 	 * @param category
 70 | 	 * @param doc
 71 | 	 * @return
 72 | 	 */
 73 | 	public double getProbability(String category, Instance doc) {
 74 | 		double result = getCategoryProbability(category);
 75 | 		for(String feature:doc.getWords()){
 76 | 			if(VARIABLE.containFeature(feature)){
 77 | 				result += getFeatureProbability(feature, category);
 78 | 			}			
 79 | 		}
 80 | 		return result;
 81 | 	}
 82 | 	
 83 | 	public String getCategory(Instance doc){
 84 | 		Collection<String> categories = VARIABLE.getCategories();
 85 | 		double best = Double.NEGATIVE_INFINITY;
 86 | 		String bestName = null;
 87 | 		for(String c:categories){
 88 | 			double current = getProbability(c, doc);
 89 | //			System.out.println(c + ":" + current);
 90 | 			if(best<current){
 91 | 				best = current;
 92 | 				bestName = c;
 93 | 			}
 94 | 		}
 95 | 		return bestName;
 96 | 	}
 97 | 	
 98 | 	public static void main(String[] args) throws IOException {
 99 | 		NaiveBayesClassifier classifier = new NaiveBayesClassifier();
100 | 		
101 | //		File samplePath = new File("./corpus/Sample");
102 | //		for(File categoryPath:samplePath.listFiles()){
103 | //			String category = categoryPath.getName();
104 | //			for(File f:categoryPath.listFiles()){
105 | //				classifier.training(new Instance(category, f, "GBK"));
106 | //			}
107 | //		}
108 | //		classifier.save(new File("result.dat"));
109 | //		System.out.println("Finished!");
110 | 		
111 | 		classifier.load(new File("result.dat"));
112 | 		
113 | 		Instance doc = new Instance(null, new File("/tmp/10.txt"), "GBK");
114 | 		System.out.println(classifier.getCategory(doc));
115 | 		
116 | 	}
117 | }
118 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/classification/Variable.java:
--------------------------------------------------------------------------------
  1 | package ruc.irm.classification;
  2 | 
  3 | import java.io.DataInput;
  4 | import java.io.DataOutput;
  5 | import java.io.IOException;
  6 | import java.util.Collection;
  7 | import java.util.HashMap;
  8 | import java.util.Map;
  9 | 
 10 | /**
 11 |  * 分类的类别
 12 |  * 
 13 |  * @author xiatian
 14 |  *
 15 |  */
 16 | public class Variable {
 17 | 	/** 类别信息 */
 18 | 	Map<String, CategoryInfo> categoryMap = new HashMap<String, CategoryInfo>();
 19 | 	
 20 | 	Map<String, Feature> features = new HashMap<String, Feature>(); 
 21 | 	
 22 | 	/** 所有文档的数量 */
 23 | 	private int docCount = 0;
 24 | 	
 25 | 	public void write(DataOutput out) throws IOException{
 26 | 		//保存文档总数
 27 | 		out.writeInt(docCount);
 28 | 		
 29 | 		//写入类别总数
 30 | 		out.writeInt(categoryMap.size());
 31 | 		for(String category:categoryMap.keySet()){
 32 | 			out.writeUTF(category);
 33 | 			categoryMap.get(category).write(out);
 34 | 		}
 35 | 		
 36 | 		//写入Feature总数
 37 | 		out.writeInt(features.size());
 38 | 		for(String key:features.keySet()){
 39 | 			out.writeUTF(key);
 40 | 			features.get(key).write(out);
 41 | 		}
 42 | 	}
 43 | 	
 44 | 	public void readFields(DataInput in) throws IOException {
 45 | 		this.docCount = in.readInt();
 46 | 		
 47 | 		int size = in.readInt();
 48 | 		categoryMap = new HashMap<String, CategoryInfo>();
 49 | 		for(int i=0; i<size; i++){
 50 | 			String category = in.readUTF();
 51 | 			CategoryInfo info = CategoryInfo.read(in);
 52 | 			categoryMap.put(category, info);
 53 | 		}
 54 | 		
 55 | 		size = in.readInt();
 56 | 		features = new HashMap<String, Feature>();
 57 | 		for(int i=0; i<size; i++){
 58 | 			String word = in.readUTF();
 59 | 			Feature feature = Feature.read(in);
 60 | 			features.put(word, feature);
 61 | 		}
 62 | 	}
 63 | 	
 64 | 	public static Variable read(DataInput in) throws IOException{
 65 | 		Variable v = new Variable();
 66 | 		v.readFields(in);
 67 | 		return v;
 68 | 	}
 69 | 	
 70 | 	public Collection<String> getCategories(){
 71 | 		return categoryMap.keySet();
 72 | 	}
 73 | 	
 74 | 	public int getFeatureCount(){
 75 | 		return features.size();
 76 | 	}
 77 | 	
 78 | 	public boolean containFeature(String feature){
 79 | 		return features.containsKey(feature);
 80 | 	}
 81 | 	
 82 | 	public void incDocCount(){
 83 | 		this.docCount++;
 84 | 	}
 85 | 	
 86 | 	public int getDocCount(){
 87 | 		return this.docCount;
 88 | 	}
 89 | 	
 90 | 	/**
 91 | 	 * 获取置顶类别下的文档数量
 92 | 	 * @param category
 93 | 	 * @return
 94 | 	 */
 95 | 	public int getDocCount(String category){
 96 | 		return categoryMap.get(category).getDocCount();
 97 | 	}
 98 | 	
 99 | 	/**
100 | 	 * 获取feature在指定类别下的文档出现数量
101 | 	 * @param feature
102 | 	 * @param category
103 | 	 * @return
104 | 	 */
105 | 	public int getDocCount(String feature, String category){
106 | 		Feature f = features.get(feature);
107 | 		if(f!=null){
108 | 			return f.getDocCount(category);
109 | 		}
110 | 		return 0;
111 | 	}
112 | 	
113 | 	public void addInstance(Instance instance){
114 | 		incDocCount();
115 | 		CategoryInfo info = null;
116 | 		if(categoryMap.containsKey(instance.getCategory())){
117 | 			info = categoryMap.get(instance.getCategory());
118 | 		}else{
119 | 			info = new CategoryInfo();
120 | 		}
121 | 		info.incDocCount();
122 | 		categoryMap.put(instance.getCategory(), info);
123 | 		
124 | 		for(String word:instance.getWords()){
125 | 			Feature feature = features.get(word);
126 | 			
127 | 			if(feature==null) feature = new Feature();
128 | 			
129 | 			feature.setName(word);
130 | 			feature.incDocCount(instance.getCategory());
131 | 			
132 | 			features.put(word, feature);
133 | 		}
134 | 	}
135 | 	
136 | 	public static class CategoryInfo {
137 | 		private int docCount;
138 | 		
139 | 		public int getDocCount() {
140 | 			return docCount;
141 | 		}
142 | 		public void incDocCount(){
143 | 			this.docCount++;
144 | 		}
145 | 		public void setDocCount(int docCount) {
146 | 			this.docCount = docCount;
147 | 		}
148 | 			
149 | 		public void write(DataOutput out) throws IOException{
150 | 			out.writeInt(docCount);
151 | 		}
152 | 		
153 | 		public void readFields(DataInput in) throws IOException {
154 | 			this.docCount = in.readInt();
155 | 		}
156 | 		
157 | 		public static CategoryInfo read(DataInput in) throws IOException{
158 | 			CategoryInfo c = new CategoryInfo();
159 | 			c.readFields(in);
160 | 			return c;
161 | 		}
162 | 	}
163 | }
164 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/Similaritable.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity;
 2 | 
 3 | /**
 4 |  * 可以计算相似度的接口
 5 |  * 
 6 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
 7 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
 8 |  */
 9 | public interface Similaritable {
10 | 	/**
11 | 	 * 计算两个字符串的相似度，对于句子来说，计算的是句子相似度，对于词语则计算词语的相似度
12 | 	 * @param item1 参与相似度计算的第一个字符串
13 | 	 * @param item2 参与相似度计算的第二个字符串
14 | 	 * @return
15 | 	 */
16 | 	public double getSimilarity(String item1, String item2); 
17 | }
18 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/SimilarityFactory.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity;
 2 | 
 3 | import ruc.irm.similarity.sentence.SentenceSimilarity;
 4 | import ruc.irm.similarity.sentence.morphology.MorphoSimilarity;
 5 | import ruc.irm.similarity.word.WordSimilarity;
 6 | import ruc.irm.similarity.word.hownet2.concept.XiaConceptParser;
 7 | 
 8 | public class SimilarityFactory {
 9 |     private static WordSimilarity wordSimilarity = XiaConceptParser.getInstance();
10 |     private static SentenceSimilarity sentenceSimilarity = MorphoSimilarity.getInstance();
11 |     
12 |     private SimilarityFactory(){}
13 |     
14 |     public static WordSimilarity getWordSimilarity(){
15 |         return wordSimilarity;
16 |     }
17 |     
18 |     public static SentenceSimilarity getSentenceSimilarity(){
19 |         return sentenceSimilarity;
20 |     }
21 | }
22 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/phrase/PhraseSimilarity.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.phrase;
 2 | 
 3 | import java.util.ArrayList;
 4 | import java.util.List;
 5 | 
 6 | import ruc.irm.similarity.Similaritable;
 7 | 
 8 | /**
 9 |  * 一种简单的短语相似度计算方法，算法原理请参考《中文信息相似度计算理论与方法》一书P69.
10 |  * 
11 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
12 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
13 |  */
14 | public class PhraseSimilarity implements Similaritable {
15 | 
16 | 	@Override
17 | 	public double getSimilarity(String item1, String item2) {
18 | 		return (getSC(item1, item2) + getSC(item2, item1)) / 2.0;
19 | 	}
20 | 
21 | 	public List<Integer> getC(String first, String second, int pos) {
22 | 		List<Integer> results = new ArrayList<Integer>();
23 | 		char ch = first.charAt(pos);
24 | 		for (int i = 0; i < second.length(); i++) {
25 | 			if (ch == second.charAt(i)) {
26 | 				results.add(i);
27 | 			}
28 | 		}
29 | 		return results;
30 | 	}
31 | 
32 | 	public int getDistance(String first, String second, int pos) {
33 | 		int d = second.length();
34 | 		for (int k : getC(first, second, pos)) {
35 | 			int value = Math.abs(k - pos);
36 | 			if (d > value) {
37 | 				d = value;
38 | 			}
39 | 		}
40 | 
41 | 		return d;
42 | 	}
43 | 
44 | 	public double getCC(String first, String second, int pos) {
45 | 		return (second.length() - getDistance(first, second, pos)) * 1.0 / second.length();
46 | 	}
47 | 
48 | 	public double getSC(String first, String second) {
49 | 		double total = 0.0;
50 | 		for (int i = 0; i < first.length(); i++) {
51 | 			total = total + getCC(first, second, i);
52 | 		}
53 | 		return total / first.length();
54 | 	}
55 | 
56 | }
57 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/sentence/SegmentProxy.java:
--------------------------------------------------------------------------------
  1 | package ruc.irm.similarity.sentence;
  2 | 
  3 | import java.awt.BorderLayout;
  4 | import java.awt.GridLayout;
  5 | import java.awt.event.ActionEvent;
  6 | import java.awt.event.ActionListener;
  7 | import java.util.ArrayList;
  8 | import java.util.List;
  9 | 
 10 | import javax.swing.BorderFactory;
 11 | import javax.swing.JButton;
 12 | import javax.swing.JLabel;
 13 | import javax.swing.JPanel;
 14 | import javax.swing.JScrollPane;
 15 | import javax.swing.JTextArea;
 16 | import javax.swing.JTextField;
 17 | 
 18 | import org.ansj.domain.Result;
 19 | import org.ansj.domain.Term;
 20 | import org.ansj.splitWord.analysis.ToAnalysis;
 21 | 
/**
 * Proxy that wraps the lexical analyser (word segmentation plus part-of-speech
 * tagging) behind static helper methods, and offers a small Swing demo panel.<br/>
 * According to the original notes this class first wrapped Ictclas4j (Xia
 * Tian's improved version) and used a lazily loaded singleton so that the demo
 * program starts quickly.
 * NOTE(review): no singleton or lazy loading is visible in this class any
 * more; that remark presumably describes the older Ictclas4j integration.
 *
 * @CHANGE 2014/04/04 Ictclas4j (summer version) was replaced by the Ansj analyser
 *
 * @author <a href="mailto:iamxiatian@gmail.com">Xia Tian</a>
 * @organization Knowledge Engineering Lab, School of Information Resource Management, Renmin University of China
 */
public class SegmentProxy {

    /** A segmented word together with its part-of-speech code. */
    public static class Word {
        /**
         * Text of the word.
         */
        private String word;
        /**
         * Part-of-speech code of the word.
         */
        private String pos;

        public Word(String word, String pos) {
            this.word = word;
            this.pos = pos;
        }

        public String getWord() {
            return word;
        }

        public void setWord(String word) {
            this.word = word;
        }

        public String getPos() {
            return pos;
        }

        public void setPos(String pos) {
            this.pos = pos;
        }
    }

    /**
     * Segments a sentence with {@code ToAnalysis.parse} and converts every
     * resulting term into a {@link Word}.
     *
     * @param sentence the text to analyse
     * @return one {@link Word} per term, in the order the analyser yields them
     */
    public static List<Word> segment(String sentence) {
        List<Word> results = new ArrayList<Word>();
        Result terms = ToAnalysis.parse(sentence);

        for (Term term : terms) {
            // "natrue" (sic) is the method name as spelled in this call.
            results.add(new Word(term.getName(), term.natrue().natureStr));
        }

        return results;
    }

    /**
     * Renders the segmentation of a sentence as {@code word/pos} tokens, each
     * followed by a single space (so the result ends with a trailing space
     * whenever at least one word was produced).
     *
     * @param sentence the text to analyse
     * @return the formatted segmentation
     */
    public static String getSegmentedString(String sentence) {
        List<Word> words = segment(sentence);
        StringBuilder sb = new StringBuilder();
        for (Word word : words) {
            sb.append(word.getWord() + "/" + word.getPos()).append(" ");
        }
        return sb.toString();
    }

    /**
     * Builds the demo panel: a text field holding the sentence and a button
     * in the north area, and a scrollable, line-wrapping text area in the
     * centre. Each button press prepends the new analysis result to the text
     * already shown in the text area.
     *
     * @return the assembled panel
     */
    public static JPanel createPanel() {
        // Outer panel: fullPanel holds a north panel and a centre panel.
        JPanel fullPanel = new JPanel();
        fullPanel.setLayout(new BorderLayout());

        JPanel northPanel = new JPanel();
        fullPanel.add(northPanel, "North");

        // The centre panel holds the result text area.
        JPanel centerPanel = new JPanel();
        fullPanel.add(centerPanel, "Center");
        centerPanel.setLayout(new BorderLayout());
        final JTextArea result = new JTextArea();
        //result.setFont(new Font("宋体", Font.PLAIN, 16));
        result.setLineWrap(true);
        JScrollPane centerScrollPane = new JScrollPane(result);
        centerPanel.add(centerScrollPane, "Center");

        northPanel.setLayout(new GridLayout(1, 1));

        // The first (and only) panel placed inside northPanel is built below.
        final JTextField senField = new JTextField("什么是计算机病毒");
        senField.setColumns(50);

        JPanel mainPanel = new JPanel();
        mainPanel.setLayout(new GridLayout(2, 1));

        // Row 1: label plus sentence input field.
        JPanel linePanel = new JPanel();
        linePanel.add(new JLabel("句子:"));
        linePanel.add(senField);
        mainPanel.add(linePanel);

        // Row 2: the button that triggers the analysis.
        linePanel = new JPanel();
        JButton goButton = new JButton("词法分析");
        linePanel.add(goButton);
        mainPanel.add(linePanel);
        goButton.addActionListener(new ActionListener() {

            @Override
            public void actionPerformed(ActionEvent e) {
                String sentence = senField.getText();
                String text = "[" + sentence + "]的词法分析结果为:";

                // Newest result first, then a separator line, then the earlier output.
                text = text + "\n" + getSegmentedString(sentence);
                text = text + "\n________________________________\n" + result.getText();
                result.setText(text);
            }

        });
        mainPanel.setBorder(BorderFactory.createEtchedBorder());
        northPanel.add(mainPanel);

        return fullPanel;
    }
}
140 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/sentence/SentenceSimilarity.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.sentence;
2 | 
3 | import ruc.irm.similarity.Similaritable;
4 | 
/**
 * Similarity contract for sentences. It declares no members of its own and
 * only gives sentence-level implementations of {@link Similaritable} a
 * distinct type.
 */
public interface SentenceSimilarity extends Similaritable {

}
8 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/sentence/editdistance/Block.java:
--------------------------------------------------------------------------------
  1 | package ruc.irm.similarity.sentence.editdistance;
  2 | 
  3 | 
  4 | 
  5 | public class Block<T> {
  6 | 
  7 | 	private int globalPosition;
  8 | 	/** 块的内容 */
  9 | 	private SuperString<T> data;
 10 | 	/** 前后指针 */
 11 | 	private Block<T> prev, next;
 12 | 	/** 是否已经进行划分 */
 13 | 	private boolean divideFlag = false;
 14 | 
 15 | 	public Block(SuperString<T> string){
 16 | 		this.data = string;
 17 | 		this.globalPosition = 0;		
 18 | 	}
 19 | 	
 20 | 	public Block(SuperString<T> string, int globalBegin){
 21 | 		this.data = string;
 22 | 		this.globalPosition = globalBegin;
 23 | 	}	
 24 | 
 25 | 	public int getGlobalPosition() {
 26 | 		return globalPosition;
 27 | 	}
 28 | 
 29 | 	public void setGlobalPosition(int globalPosition) {
 30 | 		this.globalPosition = globalPosition;
 31 | 	}
 32 | 
 33 | 	public SuperString<T> getData() {
 34 | 		return data;
 35 | 	}
 36 | 
 37 | 	public void setData(SuperString<T> data) {
 38 | 		this.data = data;
 39 | 	}
 40 | 
 41 | 	public Block<T> getPrev() {
 42 | 		return prev;
 43 | 	}
 44 | 
 45 | 	public void setPrev(Block<T> prev) {
 46 | 		this.prev = prev;
 47 | 	}
 48 | 
 49 | 	public Block<T> getNext() {
 50 | 		return next;
 51 | 	}
 52 | 
 53 | 	public void setNext(Block<T> next) {
 54 | 		this.next = next;
 55 | 	}
 56 | 
 57 | 	public boolean isDivideFlag() {
 58 | 		return divideFlag;
 59 | 	}
 60 | 
 61 | 	public void setDivideFlag(boolean divideFlag) {
 62 | 		this.divideFlag = divideFlag;
 63 | 	}
 64 | 
 65 | 	public void divide(int start, int length){
 66 | 		if(start==0 && length==data.length()){
 67 | 			this.divideFlag = true;
 68 | 			return;
 69 | 		}else if(start==0){
 70 | 			//前面为已经分割的标记，后面应该为未分割的标记
 71 | 			Block<T> tail = new Block<T>(data.substring(length), globalPosition + start);
 72 | 			this.setDivideFlag(true);
 73 | 			this.setData(data.substring(0, length));
 74 | 			tail.next = this.next;
 75 | 			if(tail.next!=null)	tail.next.prev = tail;
 76 | 			this.next = tail;
 77 | 			tail.prev = this;
 78 | 		}else if(start+length == data.length()){
 79 | 			//后面为已经分割的标记，前面应该为未分割的标记
 80 | 			Block<T> head = new Block<T>(data.substring(0, start), globalPosition);
 81 | 			
 82 | 			this.setDivideFlag(true);
 83 | 			this.setData(data.substring(start));
 84 | 			
 85 | 			head.prev = this.prev;
 86 | 			if(head.prev!=null)	head.prev.next = head;
 87 | 			head.next = this;
 88 | 			this.prev = head;
 89 | 		}else{
 90 | 			//中间为已经分割的标记，前面和后面应该为未分割的标记
 91 | 			Block<T> head = new Block<T>(data.substring(0, start), globalPosition);
 92 | 			Block<T> tail = new Block<T>(data.substring(start+length), globalPosition + start+length);
 93 | 			
 94 | 			this.setDivideFlag(true);
 95 | 			this.setData(data.substring(start, start+length));
 96 | 			this.setGlobalPosition(globalPosition + start);
 97 | 			
 98 | 			head.prev = this.prev;
 99 | 			if(head.prev!=null)	head.prev.next = head;
100 | 			head.next = this;
101 | 			this.prev = head;
102 | 			
103 | 			tail.next = this.next;
104 | 			if(tail.next!=null)	tail.next.prev = tail;
105 | 			this.next = tail;
106 | 			tail.prev = this;
107 | 		}
108 | 
109 | 	}
110 | }
111 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/sentence/editdistance/CharEditUnit.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.sentence.editdistance;
 2 | 
 3 | public class CharEditUnit extends EditUnit {
 4 | 	private String content = "";
 5 | 	
 6 | 	public CharEditUnit(Character ch){
 7 | 		content = ch.toString();
 8 | 	}
 9 | 	
10 | 	@Override
11 | 	public String getUnitString() {
12 | 		return content;
13 | 	}
14 | 
15 | }
16 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/sentence/editdistance/ChunkEditUnit.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.sentence.editdistance;
 2 | 
 3 | 
 4 | public class ChunkEditUnit extends EditUnit {
 5 | 	private SuperString<? extends EditUnit> chunk = null;
 6 | 	
 7 | 	public ChunkEditUnit(SuperString<? extends EditUnit> chunk){
 8 | 		this.chunk = chunk;
 9 | 	}
10 | 		
11 | 	public String getUnitString() {
12 | 		return chunk.toString();
13 | 	}
14 | 	
15 | 	/**
16 | 	 * 根据此语的相似度获取替换代价
17 | 	 */
18 | 	@Override
19 | 	public double getSubstitutionCost(EditUnit otherUnit){
20 | 		if(!(otherUnit instanceof ChunkEditUnit)) return chunk.length();
21 | 		if(equals(otherUnit)) return 0.0;
22 | 		
23 | 		ChunkEditUnit other = (ChunkEditUnit)otherUnit;
24 | 		return new StandardEditDistance().getEditDistance(chunk, other.chunk);
25 | 	}
26 | 	
27 | 	/**
28 |      * 获取删除代价,标准算法的默认值为1.0, 此处也设为1.0
29 |      * 具体的编辑单元可以通过覆盖该方法设置不同的删除代价
30 |      * @return 删除代价
31 |      */
32 |     public double getDeletionCost(){
33 |         return chunk.length();
34 |     }    
35 |     
36 |     /**
37 |      * 获取插入代价,标准算法的默认值为1.0.
38 |      * 具体的编辑单元可以通过覆盖该方法设置不同的插入代价
39 |      */
40 |     public double getInsertionCost(){
41 |         return chunk.length();
42 |     }
43 | 	
44 | }
45 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/sentence/editdistance/EditDistance.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.sentence.editdistance;
 2 | 
 3 | import ruc.irm.similarity.Similaritable;
 4 | 
 5 | 
 6 | /**
 7 |  * 编辑距离的父类，定义了其中的主要行为
 8 |  * 
 9 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
10 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
11 |  */
12 | public abstract class EditDistance implements Similaritable {
13 |         
14 |     public abstract double getEditDistance(SuperString<? extends EditUnit> S, SuperString<? extends EditUnit> T);    
15 |  
16 |     public double getSimilarity(String s1, String s2){
17 |     	SuperString<WordEditUnit> S = SuperString.createWordSuperString(s1);
18 |     	SuperString<WordEditUnit> T = SuperString.createWordSuperString(s2);
19 |     	
20 |     	return 1-(getEditDistance(S, T))/(Math.max(S.length(), T.length()));
21 |     }
22 | }
23 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/sentence/editdistance/EditUnit.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.sentence.editdistance;
 2 | 
 3 | /**
 4 |  * 编辑单元
 5 |  * 
 6 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
 7 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
 8 |  */
 9 | public abstract class EditUnit {
10 | 	/**
11 | 	 * 获取编辑单元的内部字符串
12 | 	 * @return
13 | 	 */
14 | 	public abstract String getUnitString();
15 | 	
16 | 	/**
17 | 	 * 获取替换代价，默认替换代价当替换单元的内容相同时为0，
18 | 	 * 不同时为1
19 | 	 */
20 | 	public double getSubstitutionCost(EditUnit other){
21 | 		return this.equals(other)?0:1;
22 | 	}
23 | 	
24 | 	/**
25 |      * 获取删除代价,标准算法的默认值为1.0, 此处也设为1.0
26 |      * 具体的编辑单元可以通过覆盖该方法设置不同的删除代价
27 |      * @return 删除代价
28 |      */
29 |     public double getDeletionCost(){
30 |         return 1.0;
31 |     }    
32 |     
33 |     /**
34 |      * 获取插入代价,标准算法的默认值为1.0.
35 |      * 具体的编辑单元可以通过覆盖该方法设置不同的插入代价
36 |      */
37 |     public double getInsertionCost(){
38 |         return 1.0;
39 |     }
40 |     	
41 |     @Override
42 | 	public boolean equals(Object other){
43 |     	if(!(other instanceof EditUnit)) return false;
44 | 		return getUnitString().equals(((EditUnit)other).getUnitString());
45 | 	}
46 | 	
47 | 	@Override
48 | 	public String toString(){
49 | 		return getUnitString();
50 | 	}
51 | }
52 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/sentence/editdistance/GregorEditDistance.java:
--------------------------------------------------------------------------------
  1 | package ruc.irm.similarity.sentence.editdistance;
  2 | 
/**
 * Edit distance variant that additionally allows block transposition
 * (swapping two adjacent blocks), attributed by the original author to Gregor
 * Leusch. The stated time complexity is O(m^3 n^3).
 * For the algorithm see Gregor Leusch, Nicola Ueffing: "A Novel
 * String-to-String Distance Measure With Application to Machine Translation
 * Evaluation".
 * <p>
 * Known issue noted by the original author:<br/>
 * the way substitution costs are computed can make the result differ from
 * intuition. For the sentences "what is a computer virus" and "computer virus
 * is what" one would expect a distance of 2 (swap "what is" behind "computer
 * virus", then swap "what" and "is"). In practice, after the first swap the
 * substitution cost between "what is" and "is what" is only 0.2, so no second
 * swap happens and the total distance becomes 1.2.
 * <p>
 * The operands and the memo table are kept in instance fields that are
 * overwritten by every call of {@link #getEditDistance}, so one instance
 * should not be used by several callers at the same time.
 * 
 * @author <a href="mailto:iamxiatian@gmail.com">Xia Tian</a>
 * @organization Knowledge Engineering Lab, School of Information Resource Management, Renmin University of China
 */
public class GregorEditDistance extends EditDistance {        
    /** Cost of one block swap. */
    public static double swapCost = 0.5;
    
    /** Operands of the computation currently in progress. */
    private SuperString<? extends EditUnit> S,T;
    /**
     * Memo table: QArray[i0][i1][j0][j1] holds the cost of turning S(i0..i1)
     * into T(j0..j1) so that it is not computed repeatedly.
     * Double.MAX_VALUE marks "not computed yet".
     */
    private double[][][][] QArray;
    
    /**
     * Computes the distance between S and T over their full index ranges.
     *
     * @param S source sequence
     * @param T target sequence
     * @return the edit distance, block swaps included
     */
    public double getEditDistance(SuperString<? extends EditUnit> S,SuperString<? extends EditUnit> T){    
    	this.S = S;
    	this.T = T;
        QArray = new double[S.length()][S.length()][T.length()][T.length()];
        // Mark every entry of the memo table as "not computed yet".
        for(int i=0;i<S.length();i++){
            for(int i2=0;i2<S.length();i2++)
                for(int j=0;j<T.length();j++)
                    for(int j2=0;j2<T.length();j2++){
                        QArray[i][i2][j][j2] = Double.MAX_VALUE;
                    }
        }
        
        return Q(0,S.length()-1,0,T.length()-1);
    }  
    
    /**
     * Cost of turning S(i0..i1) into T(j0..j1); both ranges are inclusive and
     * a range whose end lies before its start is empty.
     */
    private double Q(int i0,int i1,int j0,int j1){
        double cost = 0;
        
        if(i1<i0){
        	// Empty source range: insert every target element (result is not cached).
        	for(int j = j0; j<=j1; j++){
        		cost += T.elementAt(j).getInsertionCost();
        	}
        	return cost;
        }else if(j1<j0){
        	// Empty target range: delete every source element (result is not cached).
        	for(int i=i0; i<=i1; i++){
        		cost += S.elementAt(i).getDeletionCost();
        	}
        	return cost;
        }else if(i1==i0 && j1==j0){
        	// One element on each side: plain substitution.
        	cost = S.elementAt(i0).getSubstitutionCost(T.elementAt(j0));        	
        	QArray[i0][i1][j0][j1] = cost;
        	return cost;
        } else if(i1==i0){            
            // One source element against several target elements: substitute it
            // for the target element with the lowest substitution cost below 1.0
            // (position j0 at a charge of 1.0 if there is none) and insert the rest.
            double minSubstituteValue = 1.0;
            int minPosJ = j0;
            for(int j=j0;j<=j1;j++){
            	double subsitituteValue = S.elementAt(i0).getSubstitutionCost(T.elementAt(j));
            	if(minSubstituteValue > subsitituteValue){
            		minSubstituteValue = subsitituteValue;
            		minPosJ = j;
            	}                	
            }
            for(int j=j0;j<=j1;j++){
            	if(j == minPosJ){
            		cost += minSubstituteValue;             	
            	}else{
            		cost += T.elementAt(j).getInsertionCost();
            	}
            }                 
        }else if(j1==j0){            
        	// Several source elements against one target element: mirror image of
        	// the previous case, deleting the source elements that are not substituted.
        	double minSubstituteValue = 1.0;
            int minPosI = i0;
            for(int i=i0;i<=i1;i++){
            	double subsitituteValue = S.elementAt(i).getSubstitutionCost(T.elementAt(j0));
            	if(minSubstituteValue > subsitituteValue){
            		minSubstituteValue = subsitituteValue;
            		minPosI = i;
            	}                	
            }
            for(int i=i0;i<=i1;i++){
            	if(i == minPosI){
            		cost += minSubstituteValue;             	
            	}else{
            		cost += S.elementAt(i).getDeletionCost();
            	}
            }            	             
        }else{
        	// General case: reuse the memoised value when there is one.
        	if(QArray[i0][i1][j0][j1]<Double.MAX_VALUE){
        		return QArray[i0][i1][j0][j1];
        	}
            // Try every split of both ranges into a left and a right part; the
            // parts are matched either in order, or crosswise plus one swap cost.
            for(int i=i0;i<i1;i++){
                for(int j=j0;j<j1;j++){
                    double c = Math.min(Q(i0,i,j0,j)+Q(i+1,i1,j+1,j1),
                            Q(i0,i,j+1,j1)+Q(i+1,i1,j0,j)+swapCost);
                    if(c<QArray[i0][i1][j0][j1]){
                    	QArray[i0][i1][j0][j1] = c;
                    }
                }
            }
            return QArray[i0][i1][j0][j1];
        }
        // Reached only from the two one-against-many cases above.
        QArray[i0][i1][j0][j1] = cost;        
        return cost;
    }
    
    /** Small demo: prints the distance between two character strings. */
    public static void main(String[] argv) {
        String s1 = "abcxdef";
        String s2 = "defxabc";
        //String s2 = "我的密码我忘记了,我该怎样做呢?";
        GregorEditDistance ed = new GregorEditDistance();
        System.out.println(ed.getEditDistance(SuperString.createCharSuperString(s1), SuperString.createCharSuperString(s2)));
    }

	
}
120 | 
121 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/sentence/editdistance/Split.java:
--------------------------------------------------------------------------------
  1 | package ruc.irm.similarity.sentence.editdistance;
  2 | 
  3 | import java.util.ArrayList;
  4 | import java.util.List;
  5 | 
  6 | public class Split {
  7 | 	public static boolean MERGE_FLAG = true;
  8 | 	@SuppressWarnings("unchecked")
  9 | 	public static Object[] split(SuperString<? extends EditUnit> X, SuperString<? extends EditUnit> Y){
 10 | 		Block<? extends EditUnit> LX = new Block(X);
 11 | 		Block<? extends EditUnit> LY = new Block(Y);
 12 | 		split(LX,LY);
 13 | 		while(LY.getPrev()!=null){
 14 | 			LY = LY.getPrev();
 15 | 		}
 16 | 		while(LX.getPrev()!=null){
 17 | 			LX = LX.getPrev();
 18 | 		}
 19 | 		List<ChunkEditUnit> first = new ArrayList<ChunkEditUnit>();
 20 | 		List<ChunkEditUnit> second = new ArrayList<ChunkEditUnit>();
 21 | 		while(LX!=null){
 22 | 			first.add(new ChunkEditUnit(LX.getData()));
 23 | 			LX = LX.getNext();
 24 | 		}
 25 | 
 26 | 		while(LY!=null){
 27 | 			second.add(new ChunkEditUnit(LY.getData()));
 28 | 			LY = LY.getNext();
 29 | 		}
 30 | 		SuperString<ChunkEditUnit> s1 = new SuperString<ChunkEditUnit>(first);
 31 | 		SuperString<ChunkEditUnit> s2 = new SuperString<ChunkEditUnit>(second);
 32 | 		Object[] obj = new Object[]{s1, s2};
 33 | 		return obj;
 34 | 	}
 35 | 	
 36 | 	private static void split(Block<?> bx, Block<?> LY){
 37 | 		LCS maxLCS = null;
 38 | 		Block<?> by = LY;
 39 | 		while(by.getPrev()!=null){
 40 | 			by = by.getPrev();
 41 | 		}
 42 | 		Block<?> maxMatchedBy = by;
 43 | 		while(by!=null){
 44 | 			if(by.isDivideFlag()){
 45 | 				by = by.getNext();
 46 | 				continue;
 47 | 			}
 48 | 			
 49 | 			LCS lcs = LCS.parse(bx.getData(), by.getData());
 50 | 			if(maxLCS==null || maxLCS.length<lcs.length){
 51 | 				maxLCS = lcs;
 52 | 				maxMatchedBy = by;
 53 | 			}
 54 | 			
 55 | 			by = by.getNext();
 56 | 		}
 57 | 		
 58 | 		if(maxLCS!=null && maxLCS.length>0){
 59 | 			bx.divide(maxLCS.x_pos, maxLCS.length);
 60 | 			maxMatchedBy.divide(maxLCS.y_pos, maxLCS.length);
 61 | 		}		
 62 | 		
 63 | 		if(bx.getPrev()!=null && !bx.isDivideFlag()){
 64 | 			split(bx.getPrev(), LY);
 65 | 		}
 66 | 		
 67 | 		if(bx.getNext()!=null &&!bx.getNext().isDivideFlag()){
 68 | 			split(bx.getNext(), LY);
 69 | 		}
 70 | 		
 71 | 	}
 72 | 	
 73 | 	/** 
 74 | 	 * longest common string
 75 | 	 * @author Gavin
 76 | 	 *
 77 | 	 */
 78 | 	public static class LCS {
 79 | 		public int length = 0;  //LCS匹配的最长结果
 80 | 		public int x_pos = 0;	//LCS匹配的X的位置
 81 | 		public int y_pos = 0;	//LCS匹配的Y的位置
 82 | 		
 83 | 		public static LCS parse(SuperString<?> X, SuperString<?> Y){
 84 | 			LCS lcs = new LCS();
 85 | 			for(int start=0; start<X.length(); start++){
 86 | 				for(int end=start+1; end<=X.length(); end++){
 87 | 					SuperString<?> tempX = X.substring(start, end);
 88 | 					
 89 | 					int pos = Y.indexOf(tempX);					
 90 | 					if(pos>=0 && tempX.length()>lcs.length){
 91 | 						lcs.length = tempX.length();
 92 | 						lcs.x_pos = start;
 93 | 						lcs.y_pos = pos;
 94 | 					}					
 95 | 				}
 96 | 			}
 97 | 			return lcs;
 98 | 		}
 99 | 		
100 | 		public String toString(){
101 | 			return "length=" + length + ", x_pos=" + x_pos + ", y_pos=" + y_pos;
102 | 		}
103 | 	}
104 | 	
105 | 	public static void main(String[] args) {
106 | 		String s1 = "abcdefghijkabc";
107 | 		String s2 = "cdefghijklabccc";
108 | //		s2 = "fgabcdehijklkdslfkasdflak";
109 | //		s1 = "abcdefgxyzoxyjasdkfjjjaldsfa";
110 | //		s1 = "I like the book";
111 | //		s2 = "the book I like";
112 | 		s1 = "什么是计算机病毒";
113 | 		s2 = "电脑病毒是什么";
114 | 
115 | //		SuperString<CharEditUnit> ss1 = SuperString.createCharSuperString(s1);
116 | //		SuperString<CharEditUnit> ss2 = SuperString.createCharSuperString(s2);
117 | 		
118 | 		SuperString<WordEditUnit> ss1 = SuperString.createWordSuperString(s1);
119 | 		SuperString<WordEditUnit> ss2 = SuperString.createWordSuperString(s2);
120 | 		Split.split(ss1, ss2);
121 | //		LCS lcs = LCS.parse(ss1, ss2);
122 | //		System.out.println(lcs);
123 | 	}
124 | }
125 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/sentence/editdistance/StandardEditDistance.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.sentence.editdistance;
 2 | 
 3 | 
 4 | /**
 5 |  * 基于编辑距离的汉语句子相似度计算
 6 |  * 
 7 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
 8 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
 9 |  */
10 | public class StandardEditDistance extends EditDistance {       
11 |     /**
12 |      * 获取两个串的编辑距离
13 |      * @param S 字符串1
14 |      * @param T 字符串2
15 |      * @return 两个串的编辑距离
16 |      */
17 |     public double getEditDistance(SuperString<? extends EditUnit> X, SuperString<? extends EditUnit> Y){
18 |     	double[][] D; //编辑矩阵
19 |         
20 |         int m = X.length(); //字符串X的长度
21 |         int n = Y.length(); //字符串Y的长度
22 |         //char ch_x_i;       //字符串X的第i个词
23 |         //char ch_y_j;       //字符串Y的第j个词
24 |         
25 |         if(m == 0){
26 |         	double distance = 0.0;
27 |         	for(int j=0; j<n; j++){
28 |         		distance += Y.elementAt(j).getInsertionCost();
29 |         	}
30 |             return distance;
31 |         }else if(n == 0){
32 |         	double distance = 0.0;
33 |         	for(int i=0; i<m; i++){
34 |         		distance += X.elementAt(i).getDeletionCost();
35 |         	}
36 |             return distance;
37 |         }
38 |                       
39 |         D = new double[n+1][m+1];
40 |         D[0][0] = 0.0; //第一个初始化为0
41 |         
42 |         /** 初始化D[0][j] */
43 |         for(int j = 1; j<=m; j++){
44 |             D[0][j] = D[0][j-1]+X.elementAt(j-1).getDeletionCost();
45 |         }
46 |         
47 |         /** 初始化D[i][0] */
48 |         for(int i = 1;i<=n; i++){
49 |             D[i][0] = D[i-1][0]+ Y.elementAt(i-1).getInsertionCost();
50 |         }        
51 |         
52 |         for(int i=1; i<=m; i++){
53 |         	EditUnit unit_x_i = X.elementAt(i-1);
54 |             for(int j=1; j<=n; j++){
55 |             	EditUnit unit_y_j = Y.elementAt(j-1);
56 |                 double cost = unit_x_i.getSubstitutionCost(unit_y_j);
57 |                 D[j][i] = Math.min(D[j-1][i]+Y.elementAt(j-1).getInsertionCost(),D[j][i-1]+X.elementAt(i-1).getDeletionCost());
58 |                 D[j][i] = Math.min(D[j][i], D[j-1][i-1]+cost);
59 |             }
60 |         }
61 |         
62 |         return D[n][m];
63 |     }
64 | 	
65 |     public static void main(String[] args) {
66 |         String s1 = "abcdefg";
67 |         String s2 = "gcdefab";
68 |         
69 |         StandardEditDistance ed = new StandardEditDistance();        
70 |         s1 = "什么是计算机病毒";
71 |         s2 = "什么是电脑病毒";
72 |         System.out.println(ed.getEditDistance(SuperString.createCharSuperString(s1), SuperString.createCharSuperString(s2)));        
73 |         System.out.println(ed.getEditDistance(SuperString.createWordSuperString(s1), SuperString.createWordSuperString(s2)));
74 |      }
75 | 
76 | 	
77 | 
78 | }
79 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/sentence/editdistance/SuperString.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.sentence.editdistance;
 2 | 
 3 | import java.util.ArrayList;
 4 | import java.util.List;
 5 | 
 6 | import ruc.irm.similarity.sentence.SegmentProxy;
 7 | import ruc.irm.similarity.sentence.SegmentProxy.Word;
 8 | 
 9 | 
10 | /**
11 |  * 超级字符串，可以存放指定的数据类型
12 |  * 
13 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
14 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
15 |  *
16 |  * @param <T>
17 |  */
18 | public class SuperString<T> {
19 | 	private List<T> contents = new ArrayList<T>();
20 | 	
21 | 	public SuperString(List<T> contents){
22 | 		this.contents = contents;
23 | 	}
24 | 	
25 | 	public static SuperString<CharEditUnit> createCharSuperString(String str){
26 | 		List<CharEditUnit> list = new ArrayList<CharEditUnit>(str.length());
27 | 		for(int i=0; i<str.length(); i++){
28 | 			list.add(new CharEditUnit(str.charAt(i)));
29 | 		}
30 | 		SuperString<CharEditUnit> s = new SuperString<CharEditUnit>(list);
31 | 		return s;
32 | 	}
33 | 	
34 | 	public static SuperString<WordEditUnit> createWordSuperString(String sentence){
35 | 		List<Word> wordList = SegmentProxy.segment(sentence);
36 | 		List<WordEditUnit> unitList = new ArrayList<WordEditUnit>(wordList.size());
37 | 		for(int i=0; i<wordList.size(); i++){
38 | 			unitList.add(new WordEditUnit(wordList.get(i)));
39 | 		}
40 | 		SuperString<WordEditUnit> s = new SuperString<WordEditUnit>(unitList);
41 | 		return s;
42 | 	}
43 | 	
44 | 	
45 | 	public T elementAt(int pos){
46 | 		if(pos<0 || pos>=contents.size()){
47 | 			throw new ArrayIndexOutOfBoundsException("下标越界");
48 | 		}
49 | 		return contents.get(pos);
50 | 	}
51 | 	
52 | 	public int indexOf(SuperString<?> substring){
53 | 		int result = -1;
54 | 		for(int i=0; i<length(); i++){
55 | 			int j=0;
56 | 			if(i+substring.length()>length()) return -1;
57 | 			
58 | 			for(;j<substring.length();j++){				
59 | 				if(elementAt(i+j).equals(substring.elementAt(j))){
60 | 					continue;					
61 | 				}else{
62 | 					break;
63 | 				}
64 | 			}
65 | 			if(j==substring.length()){
66 | 				return i;
67 | 			}
68 | 		}
69 | 		return result;
70 | 	}
71 | 	
72 | 	public SuperString<T> substring(int fromIndex, int toIndex){
73 | 		return new SuperString<T>(contents.subList(fromIndex, toIndex));
74 | 	}
75 | 	
76 | 	public SuperString<T> substring(int fromIndex){
77 | 		return new SuperString<T>(contents.subList(fromIndex, contents.size()));
78 | 	}
79 | 	
80 | 	public int length(){
81 | 		return contents.size();
82 | 	}
83 | 	
84 | 	@Override
85 | 	public String toString(){
86 | 		StringBuilder sb = new StringBuilder();
87 | 		for(int i=0; i<length(); i++){
88 | 			sb.append(elementAt(i));
89 | 		}
90 | 		return sb.toString();
91 | 	}
92 | 	
93 | 	@Override
94 | 	public boolean equals(Object other){
95 |     	return toString().equals(other.toString());
96 | 	}
97 | }
98 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/sentence/editdistance/WordEditUnit.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.sentence.editdistance;
 2 | 
 3 | import ruc.irm.similarity.sentence.SegmentProxy.Word;
 4 | import ruc.irm.similarity.word.hownet2.concept.XiaConceptParser;
 5 | 
 6 | public class WordEditUnit extends EditUnit {
 7 | 	private Word word = null;
 8 | 	
 9 | 	public WordEditUnit(Word word){
10 | 		this.word = word;
11 | 	}
12 | 		
13 | 	public String getUnitString() {
14 | 		return word.getWord();
15 | 	}
16 | 	
17 | 	/**
18 | 	 * 根据此语的相似度获取替换代价
19 | 	 */
20 | 	@Override
21 | 	public double getSubstitutionCost(EditUnit otherUnit){
22 | 		if(!(otherUnit instanceof WordEditUnit)) return 1.0;
23 | 		if(equals(otherUnit)) return 0.0;
24 | 		
25 | 		WordEditUnit other = (WordEditUnit)otherUnit;
26 | 		//词性不同，直接返回1.0
27 | 		if(word.getPos()!=other.word.getPos()){
28 | 			return 1.0;
29 | 		}
30 | 		return 1 - XiaConceptParser.getInstance().getSimilarity(getUnitString(), other.getUnitString());
31 | 	}
32 | 	
33 | 	@Override
34 | 	public boolean equals(Object other){
35 |     	if(!(other instanceof WordEditUnit)) return false;
36 |     	WordEditUnit otherUnit = (WordEditUnit)other;
37 |     	Word otherWord = otherUnit.word;
38 |     	//词性不同，直接返回1.0
39 | 		if(word.getPos()!=otherWord.getPos()){
40 | 			return false;
41 | 		}
42 | 		double sim = XiaConceptParser.getInstance().getSimilarity(getUnitString(), otherUnit.getUnitString());
43 | 		return sim>0.85;
44 | 	}
45 | }
46 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/sentence/editdistance/XiatianEditDistance.java:
--------------------------------------------------------------------------------
  1 | package ruc.irm.similarity.sentence.editdistance;
  2 | 
  3 | 
  4 | /**
  5 |  * 夏天提出的新的支持非相邻块交互的编辑距离算法
  6 |  * 
  7 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
  8 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
  9 |  */
 10 | public class XiatianEditDistance extends EditDistance {
 11 |     /** 块交换代价 */
 12 | 	public static double swapCost = 0.5;
 13 |     
 14 |     private SuperString<? extends EditUnit> S,T;
 15 |     private double[][][][] QArray;
 16 |     
 17 |     public double getEditDistance(SuperString<? extends EditUnit> S, SuperString<? extends EditUnit> T){
 18 |     	this.S = S;
 19 |     	this.T = T;
 20 |     	QArray = new double[S.length()+1][S.length()+1][T.length()+1][T.length()+1];
 21 |         for(int i=0;i<=S.length();i++){
 22 |             for(int i2=0;i2<=S.length();i2++)
 23 |                 for(int j=0;j<=T.length();j++)
 24 |                     for(int j2=0;j2<=T.length();j2++){
 25 |                         QArray[i][i2][j][j2]=Double.MAX_VALUE;
 26 |                     }
 27 |         }        
 28 |         return Q(0,S.length()-1,0,T.length()-1);
 29 |     }
 30 |     
 31 |     private double Q(int i1,int im,int j1,int jn){
 32 |     	if(QArray[i1][im][j1][jn]<Double.MAX_VALUE){
 33 |     		return QArray[i1][im][j1][jn];
 34 |     	}
 35 |         double cost = 0;                
 36 |         if(im<i1){
 37 |         	for(int j = j1; j<=jn; j++){
 38 |         		cost += T.elementAt(j).getInsertionCost();
 39 |         	}        	
 40 |         }else if(jn<j1){
 41 |         	for(int i=i1; i<=im; i++){
 42 |         		cost += S.elementAt(i).getDeletionCost();
 43 |         	}
 44 |         }else if(im==i1 && jn==j1){
 45 |         	cost = S.elementAt(i1).getSubstitutionCost(T.elementAt(j1));        	
 46 |         } else if(i1==im){            
 47 |             double minSubValue = S.elementAt(i1).getSubstitutionCost(T.elementAt(j1));
 48 |             int minPosJ = j1;
 49 |             for(int j=j1+1;j<=jn;j++){
 50 |             	double subValue = S.elementAt(i1).getSubstitutionCost(T.elementAt(j));
 51 |             	if(minSubValue > subValue){
 52 |             		minSubValue = subValue;
 53 |             		minPosJ = j;
 54 |             	}                	
 55 |             }
 56 |             for(int j=j1;j<=jn;j++){
 57 |             	if(j == minPosJ){
 58 |             		cost += minSubValue;             	
 59 |             	}else{
 60 |             		cost += T.elementAt(j).getInsertionCost();
 61 |             	}
 62 |             }              
 63 |         }else if(j1==jn){    
 64 |         	int minPosI = i1;
 65 |         	double minSubValue = S.elementAt(i1).getSubstitutionCost(T.elementAt(j1));            
 66 |             for(int i=i1+1;i<=im;i++){
 67 |             	double subValue = S.elementAt(i).getSubstitutionCost(T.elementAt(j1));
 68 |             	if(minSubValue > subValue){
 69 |             		minSubValue = subValue;
 70 |             		minPosI = i;
 71 |             	}                	
 72 |             }
 73 |             for(int i=i1;i<=im;i++){
 74 |             	if(i == minPosI){
 75 |             		cost += minSubValue;             	
 76 |             	}else{
 77 |             		cost += S.elementAt(i).getDeletionCost();
 78 |             	}
 79 |             }            
 80 |         }else{        	
 81 |         	cost = QArray[i1][im][j1][jn];
 82 |             loop:for(int i=i1;i<im;i++){
 83 |                 //block X divide to 3 parts.
 84 |                 for(int LX=0;LX<=im-i;LX++){                    
 85 |                     //process Y sentence
 86 |                     for(int j=j1;j<jn;j++){
 87 |                     	//if(cost<=swapCost)break;
 88 |                         for(int LY=0;LY<=jn-j;LY++){                                 	
 89 |                         	//不交换的代价
 90 |                             double cost1 = Q(i1,i,j1,j)+Q(i+1,i+LX,j+1,j+LY)+Q(i+LX+1,im,j+LY+1,jn);
 91 |                             //交互代价
 92 |                             double cost2 = Q(i1,i,j+LY+1,jn)+Q(i+1,i+LX,j+1,j+LY)+Q(i+LX+1,im,j1,j)+swapCost;
 93 |                             cost = Math.min(Math.min(cost1, cost2),cost);
 94 |                             if(cost == 0) break loop;
 95 |                         }
 96 |                     }                    
 97 |                 }             
 98 |             }            
 99 |         }        
100 |         
101 |         QArray[i1][im][j1][jn] = cost;
102 |         return cost;
103 |     }
104 |     
105 |     public static void main(String[] argv) {    	
106 |         EditDistance ed = new XiatianEditDistance();
107 |         String s1 = "abcxdef";
108 |         String s2 = "def";
109 |         //String s2 = "我的密码我忘记了,我该怎样做呢?";
110 |         System.out.println(ed.getEditDistance(SuperString.createCharSuperString(s1), SuperString.createCharSuperString(s2)));
111 | 
112 |     }
113 | }
114 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/sentence/editdistance/XiatianEditDistance2.java:
--------------------------------------------------------------------------------
  1 | package ruc.irm.similarity.sentence.editdistance;
  2 | 
  3 | 
  4 | /**
  5 |  * 夏天提出的新的支持非相邻块交互的编辑距离算法
  6 |  * 
  7 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
  8 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
  9 |  */
 10 | public class XiatianEditDistance2 extends EditDistance {
 11 |     /** 块交换代价 */
 12 |     private double swapCost = 1.0;
 13 |     
 14 |     private SuperString<? extends EditUnit> S,T;
 15 |     private double[][][][] QArray;
 16 |     
 17 |     @SuppressWarnings("unchecked")
 18 |     public double getEditDistance(SuperString<? extends EditUnit> S1, SuperString<? extends EditUnit> T1){
 19 |     	Object[] array = Split.split(S1, T1);
 20 |     	this.S = (SuperString<? extends EditUnit>)array[0];
 21 |     	this.T = (SuperString<? extends EditUnit>)array[1];
 22 |     	QArray = new double[S.length()+1][S.length()+1][T.length()+1][T.length()+1];
 23 |         for(int i=0;i<=S.length();i++){
 24 |             for(int i2=0;i2<=S.length();i2++)
 25 |                 for(int j=0;j<=T.length();j++)
 26 |                     for(int j2=0;j2<=T.length();j2++){
 27 |                         QArray[i][i2][j][j2]=Double.MAX_VALUE;
 28 |                     }
 29 |         }        
 30 |         return Q(0,S.length()-1,0,T.length()-1);
 31 |     }
 32 |     
 33 |     private double Q(int i1,int im,int j1,int jn){
 34 |     	if(QArray[i1][im][j1][jn]<Double.MAX_VALUE){
 35 |     		return QArray[i1][im][j1][jn];
 36 |     	}
 37 |         double cost = 0;                
 38 |         if(im<i1){
 39 |         	for(int j = j1; j<=jn; j++){
 40 |         		cost += T.elementAt(j).getInsertionCost();
 41 |         	}        	
 42 |         }else if(jn<j1){
 43 |         	for(int i=i1; i<=im; i++){
 44 |         		cost += S.elementAt(i).getDeletionCost();
 45 |         	}
 46 |         }else if(im==i1 && jn==j1){
 47 |         	cost = S.elementAt(i1).getSubstitutionCost(T.elementAt(j1));        	
 48 |         } else if(i1==im){            
 49 |             double minSubValue = S.elementAt(i1).getSubstitutionCost(T.elementAt(j1));
 50 |             int minPosJ = j1;
 51 |             for(int j=j1+1;j<=jn;j++){
 52 |             	double subValue = S.elementAt(i1).getSubstitutionCost(T.elementAt(j));
 53 |             	if(minSubValue > subValue){
 54 |             		minSubValue = subValue;
 55 |             		minPosJ = j;
 56 |             	}                	
 57 |             }
 58 |             for(int j=j1;j<=jn;j++){
 59 |             	if(j == minPosJ){
 60 |             		cost += minSubValue;             	
 61 |             	}else{
 62 |             		cost += T.elementAt(j).getInsertionCost();
 63 |             	}
 64 |             }              
 65 |         }else if(j1==jn){    
 66 |         	int minPosI = i1;
 67 |         	double minSubValue = S.elementAt(i1).getSubstitutionCost(T.elementAt(j1));            
 68 |             for(int i=i1+1;i<=im;i++){
 69 |             	double subValue = S.elementAt(i).getSubstitutionCost(T.elementAt(j1));
 70 |             	if(minSubValue > subValue){
 71 |             		minSubValue = subValue;
 72 |             		minPosI = i;
 73 |             	}                	
 74 |             }
 75 |             for(int i=i1;i<=im;i++){
 76 |             	if(i == minPosI){
 77 |             		cost += minSubValue;             	
 78 |             	}else{
 79 |             		cost += S.elementAt(i).getDeletionCost();
 80 |             	}
 81 |             }            
 82 |         }else{        	
 83 |         	cost = QArray[i1][im][j1][jn];
 84 |             loop:for(int i=i1;i<im;i++){
 85 |                 //block X divide to 3 parts.
 86 |                 for(int LX=0;LX<=im-i;LX++){                    
 87 |                     //process Y sentence
 88 |                     for(int j=j1;j<jn;j++){
 89 |                     	//if(cost<=swapCost)break;
 90 |                         for(int LY=0;LY<=jn-j;LY++){                                 	
 91 |                         	//不交换的代价
 92 |                             double cost1 = Q(i1,i,j1,j)+Q(i+1,i+LX,j+1,j+LY)+Q(i+LX+1,im,j+LY+1,jn);
 93 |                             //交互代价
 94 |                             double cost2 = Q(i1,i,j+LY+1,jn)+Q(i+1,i+LX,j+1,j+LY)+Q(i+LX+1,im,j1,j)+swapCost;
 95 |                             cost = Math.min(Math.min(cost1, cost2),cost);
 96 |                             if(cost == 0) break loop;
 97 |                         }
 98 |                     }                    
 99 |                 }             
100 |             }            
101 |         }        
102 |         
103 |         QArray[i1][im][j1][jn] = cost;
104 |         return cost;
105 |     }
106 |     
107 |     public static void main(String[] argv) {    	
108 |         EditDistance ed = new XiatianEditDistance2();
109 |         String s1 = "abcxdef";
110 |         String s2 = "def";
111 |         //String s2 = "我的密码我忘记了,我该怎样做呢?";
112 |         System.out.println(ed.getEditDistance(SuperString.createCharSuperString(s1), SuperString.createCharSuperString(s2)));
113 | 
114 |     }
115 | }
116 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/sentence/morphology/MorphoSimilarity.java:
--------------------------------------------------------------------------------
  1 | package ruc.irm.similarity.sentence.morphology;
  2 | 
  3 | import java.util.ArrayList;
  4 | import java.util.List;
  5 | 
  6 | import org.slf4j.Logger;
  7 | import org.slf4j.LoggerFactory;
  8 | import ruc.irm.similarity.sentence.SegmentProxy;
  9 | import ruc.irm.similarity.sentence.SegmentProxy.Word;
 10 | import ruc.irm.similarity.sentence.SentenceSimilarity;
 11 | import ruc.irm.similarity.word.WordSimilarity;
 12 | import ruc.irm.similarity.word.hownet2.concept.XiaConceptParser;
 13 | 
 14 | /**
 15 |  * 基于词形和词序的句子相似度计算算法，考虑了语义因素<br/>
 16 |  * 《中文信息相似度计算理论与方法》5.4.3小节所介绍的方法，在考虑语义时，
 17 |  * 无法直接获取OnceWS(A, B)，因此，采用了两两匹配取最大值的方式。
 18 |  * 新的改进算法请参考{@code SemanticSimilarity}
 19 |  * 
 20 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
 21 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
 22 |  * 
 23 |  */
 24 | public class MorphoSimilarity implements SentenceSimilarity {
 25 |     private static Logger LOG = LoggerFactory.getLogger(MorphoSimilarity.class);
 26 |     
 27 |     /** 词形相似度占总相似度的比重 */
 28 |     private final double LAMBDA1 = 1.0;
 29 |     /** 词序相似度占总相似度的比重 */
 30 |     private final double LAMBDA2 = 0.0;   
 31 |     /** 词语相似度的计算 */
 32 |     private WordSimilarity wordSimilarity = null;
 33 |     
 34 |     private static String FILTER_CHARS = " 　，。；？《》()｜！,.;?<>|_^…!";
 35 |     
 36 |     private static MorphoSimilarity instance = null;
 37 |     
 38 |     public static MorphoSimilarity getInstance(){
 39 |     	if(instance == null){
 40 |     		instance = new MorphoSimilarity();
 41 |     	}
 42 |     	return instance;
 43 |     }
 44 |     
 45 |     private MorphoSimilarity(){
 46 |     	LOG.debug("used hownet wordsimilarity.");
 47 |     	this.wordSimilarity = XiaConceptParser.getInstance();
 48 |     	//this.segmenter = SegmentFactory.getInstance().getParser();
 49 |     }
 50 |     
 51 |     /**
 52 |      * 滤掉词串中的空格、标点符号
 53 |      * @param word_list
 54 |      * @return
 55 |      */
 56 |     private String[] filter(String[] word_list){
 57 |     	List<String> results = new ArrayList<String>();
 58 |     	for(String w:word_list){
 59 |     		if(!FILTER_CHARS.contains(w)){
 60 |     			results.add(w.toLowerCase());
 61 |     		}
 62 |     	}
 63 |     	
 64 |     	return results.toArray(new String[results.size()]);
 65 |     }
 66 |     
 67 |     /**
 68 |      * 计算两个句子的相似度
 69 |      * @see ruc.irm.similarity.Similaritable
 70 |      */
 71 |     public double getSimilarity(String firstSen,String secondSen){
 72 |     	//LOG.debug(segmenter.segmentToString(firstSen));
 73 |     	//LOG.debug(segmenter.segmentToString(secondSen));
 74 |         String[] firstList = filter(segment(firstSen));
 75 |         String[] secondList = filter(segment(secondSen));
 76 |         
 77 |         double wordSim = getOccurrenceSimilarity(firstList,secondList);
 78 |         //LOG.debug("词形相似度="+wordSim);
 79 |         
 80 |         double orderSim = getOrderSimilarity(firstList,secondList);
 81 |         //LOG.debug("词序相似度="+orderSim);
 82 |         
 83 |         return LAMBDA1*wordSim+LAMBDA2*orderSim;
 84 |     }
 85 |        
 86 |     /**
 87 |      * 获取两个集合的词形相似度, 同时获取相对于第一个句子中的词语顺序，第二个句子词语的顺序变化次数
 88 |      * @param firstList
 89 |      * @param secondList
 90 |      * @return
 91 |      */
 92 |     public double getOccurrenceSimilarity(String[] firstList, String[] secondList){    	
 93 |     	int max = firstList.length>secondList.length?firstList.length:secondList.length;
 94 |     	if(max==0){
 95 |     		return 0;
 96 |     	}
 97 |     	
 98 |     	//首先计算出所有可能的组合
 99 |     	double[][] scores = new double[max][max];
100 |     	for(int i=0; i<firstList.length; i++){
101 |     		for(int j=0; j<secondList.length; j++){
102 |     			scores[i][j] = wordSimilarity.getSimilarity(firstList[i], secondList[j]);
103 |     		}
104 |     	}
105 | 
106 |     	double total_score = 0;
107 |     	
108 |     	//从scores[][]中挑选出最大的一个相似度，然后减去该元素，进一步求剩余元素中的最大相似度    	    	
109 |     	while(scores.length > 0){
110 |     		double max_score = 0;
111 |     		int max_row = 0;
112 |     		int max_col = 0;
113 |     		
114 |     		//先挑出相似度最大的一对：<row, column, max_score> 
115 |     		for(int i=0; i<scores.length; i++){
116 |     			for(int j=0; j<scores.length; j++){
117 |     				if(max_score<scores[i][j]){
118 |     					max_row = i;
119 |     					max_col = j;
120 |     					max_score = scores[i][j];
121 |     				}
122 |     			}
123 |     		}
124 |     		
125 |     		//从数组中去除最大的相似度，继续挑选
126 |         	double[][] tmp_scores = new double[scores.length-1][scores.length-1];
127 |     		for(int i=0; i<scores.length; i++){
128 |     			if(i == max_row) continue;
129 |     			for(int j=0; j<scores.length; j++){
130 |     				if(j == max_col) continue;
131 |     				int tmp_i = max_row>i?i:i-1;
132 |     				int tmp_j = max_col>j?j:j-1;
133 |     				tmp_scores[tmp_i][tmp_j] = scores[i][j];
134 |     			}
135 |     		}
136 |     		total_score += max_score;
137 |     		scores = tmp_scores;    		
138 |     	}
139 |     	
140 |     	return (2*total_score) / (firstList.length + secondList.length);
141 |     }
142 |     
143 |     /**
144 |      * 获取两个集合的词序相似度
145 |      * @param firstList
146 |      * @param secondList
147 |      * @return
148 |      */
149 |     public double getOrderSimilarity(String[] firstList, String[] secondList){
150 |     	double similarity = 0.0;
151 |     	
152 |     	return similarity;
153 |     }    
154 |     
155 | //    @SuppressWarnings("unchecked")
156 | //	public String[] segment(String sentence){
157 | //    	MPWordSegment ws = new MPWordSegment();
158 | //    	ws.parseReader(new StringReader(sentence));    	
159 | //    	Vector tokens = ws.getTokens();
160 | //    	String[] results = new String[tokens.size()];
161 | //    	for(int i=0; i<tokens.size(); i++){
162 | //    		Token token = (Token)tokens.get(i);
163 | //    		results[i] = token.termText();    		
164 | //    	}
165 | //    	
166 | //    	return results;
167 | //    }
168 |     
169 |     public String[] segment(String sentence){
170 |     	List<Word> list = SegmentProxy.segment(sentence);
171 |     	String[] results = new String[list.size()];
172 |     	for(int i=0; i<list.size(); i++){
173 |     		results[i] = list.get(i).getWord();
174 |     	}
175 |     	return results;
176 |     }
177 |     
178 | }
179 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/sentence/morphology/SemanticSimilarity.java:
--------------------------------------------------------------------------------
  1 | package ruc.irm.similarity.sentence.morphology;
  2 | 
  3 | import java.util.ArrayList;
  4 | import java.util.List;
  5 | 
  6 | import org.slf4j.Logger;
  7 | import org.slf4j.LoggerFactory;
  8 | import ruc.irm.similarity.sentence.SegmentProxy;
  9 | import ruc.irm.similarity.sentence.SegmentProxy.Word;
 10 | import ruc.irm.similarity.sentence.SentenceSimilarity;
 11 | import ruc.irm.similarity.word.WordSimilarity;
 12 | import ruc.irm.similarity.word.hownet2.concept.XiaConceptParser;
 13 | 
 14 | /**
 15 |  * 基于语义的词形和词序句子相似度计算
 16 |  *
 17 |  * 《中文信息相似度计算理论与方法》5.4.3小节所介绍的基于词形和词序的句子相似度计算算法
 18 |  * 在考虑语义时，无法直接获取OnceWS(A, B)，为此，通过记录两个句子的词语匹配对中相似度
 19 |  * 大于某一阈值的词语对最为相同词语，计算次序相似度。
 20 |  * 
 21 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
 22 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
 23 |  * 
 24 |  */
 25 | public class SemanticSimilarity implements SentenceSimilarity {
 26 |     private static Logger LOG = LoggerFactory.getLogger(SemanticSimilarity.class);
 27 |     
 28 |     /** 词形相似度占总相似度的比重 */
 29 |     private final double LAMBDA1 = 0.8;
 30 |     /** 词序相似度占总相似度的比重 */
 31 |     private final double LAMBDA2 = 0.2;   
 32 |     
 33 |     /** 如果两个词语的相似度大于了该阈值， 则作为相同词语，计算词序相似度 */
 34 |     private final double GAMMA = 0.6;
 35 |     
 36 |     /** 词语相似度的计算 */
 37 |     private WordSimilarity wordSimilarity = null;
 38 |     
 39 |     private static String FILTER_CHARS = " 　，。；？《》()｜！,.;?<>|_^…!";
 40 |     
 41 |     private static SemanticSimilarity instance = null;
 42 |     
 43 |     public static SemanticSimilarity getInstance(){
 44 |     	if(instance == null){
 45 |     		instance = new SemanticSimilarity();
 46 |     	}
 47 |     	return instance;
 48 |     }
 49 |     
 50 |     private SemanticSimilarity(){
 51 |     	LOG.debug("used hownet wordsimilarity.");
 52 |     	this.wordSimilarity = XiaConceptParser.getInstance();
 53 |     	//this.segmenter = SegmentFactory.getInstance().getParser();
 54 |     }
 55 |     
 56 |     /**
 57 |      * 滤掉词串中的空格、标点符号
 58 |      * @param word_list
 59 |      * @return
 60 |      */
 61 |     private String[] filter(String[] word_list){
 62 |     	List<String> results = new ArrayList<String>();
 63 |     	for(String w:word_list){
 64 |     		if(!FILTER_CHARS.contains(w)){
 65 |     			results.add(w.toLowerCase());
 66 |     		}
 67 |     	}
 68 |     	
 69 |     	return results.toArray(new String[results.size()]);
 70 |     }
 71 |     
 72 |     /**
 73 |      * 计算两个句子的相似度
 74 |      * @see ruc.irm.similarity.Similaritable
 75 |      */
 76 |     public double getSimilarity(String firstSen,String secondSen){
 77 |     	//LOG.debug(segmenter.segmentToString(firstSen));
 78 |     	//LOG.debug(segmenter.segmentToString(secondSen));
 79 |         String[] firstList = filter(segment(firstSen));
 80 |         String[] secondList = filter(segment(secondSen));
 81 |         
 82 |         return calculate(firstList,secondList);
 83 |     }
 84 |        
 85 |     /**
 86 |      * 获取两个集合的词形相似度, 同时获取相对于第一个句子中的词语顺序，第二个句子词语的顺序变化次数
 87 |      * @param firstList
 88 |      * @param secondList
 89 |      * @return
 90 |      */
 91 |     public double calculate(String[] firstList, String[] secondList){    	
 92 |     	if(firstList.length == 0 || secondList.length == 0){
 93 |     		return 0;
 94 |     	}
 95 |     	
 96 |     	//首先计算出所有可能的组合
 97 |     	double[][] scores = new double[firstList.length][secondList.length];
 98 |     	
 99 |     	//代表第1个句子对应位置是否已经被使用, 默认为未使用，即false
100 |     	boolean[] firstFlags = new boolean[firstList.length];
101 |     	
102 |     	//代表第2个句子对应位置是否已经被使用, 默认为未使用，即false
103 |         boolean[] secondFlags = new boolean[secondList.length];
104 |         
105 |         //PSecond的定义参见书中5.4.3节， 为避免无必要的初始化数组，
106 |         //数组中0值表示在第一个句子中没有对应的相似词语，大于0的值
107 |         //则表示在第一个句子中的位置（从1开始编号了）
108 |         int[] PSecond = new int[secondList.length];
109 |         
110 |     	for(int i=0; i<firstList.length; i++){
111 |     	    //firstFlags[i] = false;
112 |     		for(int j=0; j<secondList.length; j++){
113 |     			scores[i][j] = wordSimilarity.getSimilarity(firstList[i], secondList[j]);
114 |     		}
115 |     	}
116 | 
117 |     	double total_score = 0;
118 |     	
119 |     	//从scores[][]中挑选出最大的一个相似度，然后减去该元素(通过Flags数组表示)，进一步求剩余元素中的最大相似度    	    	
120 |     	while(true){
121 |     		double max_score = 0;
122 |     		int max_row = -1;
123 |     		int max_col = -1;
124 |     		
125 |     		//先挑出相似度最大的一对：<row, column, max_score> 
126 |     		for(int i=0; i<scores.length; i++){
127 |     		    if(firstFlags[i]) continue;
128 |     			for(int j=0; j<scores[i].length; j++){
129 |     			    if(secondFlags[j]) continue;
130 |     			    
131 |     				if(max_score<scores[i][j]){
132 |     					max_row = i;
133 |     					max_col = j;
134 |     					max_score = scores[i][j];
135 |     				}
136 |     			}
137 |     		}
138 |     		
139 |     		if(max_row>=0) {
140 |     		    total_score += max_score;
141 |     		    firstFlags[max_row] = true;
142 |     		    secondFlags[max_col] = true;
143 |     		    if(max_score>=GAMMA) {
144 |     		        PSecond[max_col] = max_row+1;
145 |     		    }
146 |     		} else {
147 |     		    break;
148 |     		}
149 |     	}
150 |     	
151 |     	double wordSim = (2*total_score) / (firstList.length + secondList.length);
152 |     	
153 |     	int previous = 0;
154 |     	int revOrdCount = 0;
155 |     	int onceWSSize = 0;
156 |     	for(int i=0; i<PSecond.length; i++) {
157 |     	    if(PSecond[i]>0) {
158 |     	        onceWSSize++;
159 |     	        if(previous>0 && (previous>PSecond[i])) {
160 |     	            revOrdCount++;
161 |     	        } 
162 |     	        previous = PSecond[i];
163 |     	    }
164 |     	}
165 |     	
166 |     	double ordSim = 0;
167 |     	if(onceWSSize==1) {
168 |     	    ordSim = 1;
169 |     	} else if(onceWSSize == 0) {
170 |     	    ordSim = 0;
171 |     	} else {
172 |     	    ordSim = 1.0 - revOrdCount*1.0/(onceWSSize-1);
173 |     	}
174 |     	
175 |     	System.out.println("wordSim ==> " + wordSim + ", ordSim ==> " + ordSim);
176 |     	
177 |     	return LAMBDA1*wordSim+LAMBDA2*ordSim;
178 |     }
179 |     
180 |     public String[] segment(String sentence){
181 |     	List<Word> list = SegmentProxy.segment(sentence);
182 |     	String[] results = new String[list.size()];
183 |     	for(int i=0; i<list.size(); i++){
184 |     		results[i] = list.get(i).getWord();
185 |     	}
186 |     	return results;
187 |     }
188 |     
189 | }
190 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/statistic/DictStatistic.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.statistic;
 2 | 
 3 | import java.io.File;
 4 | import java.io.FileInputStream;
 5 | import java.io.InputStream;
 6 | import java.util.zip.GZIPInputStream;
 7 | 
 8 | import javax.xml.namespace.QName;
 9 | import javax.xml.stream.XMLEventReader;
10 | import javax.xml.stream.XMLInputFactory;
11 | import javax.xml.stream.events.StartElement;
12 | import javax.xml.stream.events.XMLEvent;
13 | 
14 | import ruc.irm.similarity.word.hownet2.concept.XiaConceptParser;
15 | 
16 | /**
17 |  * 用于统计分词词典文件中的概念出现数量
18 |  * 
19 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
20 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
21 |  */
22 | public class DictStatistic {
23 | 	/**
24 | 	 * 从指定的xml文件加载词典文件
25 | 	 * @param xmlFile
26 | 	 * @param gzCompressed 是否再用gz格式对词典进行了压缩
27 | 	 * @return
28 | 	 */
29 | 	public void testFromXml(String xmlFile, boolean gzCompressed) {
30 | 		File file = new File(xmlFile);
31 | 		if (!file.canRead()){
32 | 			System.out.println("无法读取文件:" + xmlFile);
33 | 			return;// fail while opening the file
34 | 		}
35 | 		int count = 0, conceptCount=0;
36 | 		XMLInputFactory inputFactory = XMLInputFactory.newInstance();
37 | 		InputStream input = null;
38 | 		try {			
39 | 			if(gzCompressed){
40 | 				input = new GZIPInputStream(new FileInputStream(file));
41 | 			}else{
42 | 				input = new FileInputStream(file);
43 | 			}			
44 | 			XMLEventReader xmlEventReader = inputFactory.createXMLEventReader(input);
45 | 			while (xmlEventReader.hasNext()) {
46 | 				XMLEvent event = xmlEventReader.nextEvent();
47 | 				
48 | 				if (event.isStartElement()) {					
49 | 					StartElement startElement = event.asStartElement();					
50 | 					if(startElement.getName().toString().equals("table")){
51 | 						String head = startElement.getAttributeByName(QName.valueOf("head")).getValue();						
52 | 						while (xmlEventReader.hasNext()) {
53 | 							XMLEvent itemEvent = xmlEventReader.nextEvent();
54 | 							if(itemEvent.isStartElement()){
55 | 								StartElement itemStartElement = itemEvent.asStartElement();
56 | 								if(!itemStartElement.getName().toString().equals("item")) continue;
57 | 								String word = itemStartElement.getAttributeByName(QName.valueOf("word")).getValue();
58 | 								word = head + word;
59 | 								if(XiaConceptParser.getInstance().isConcept(word)){
60 | 									conceptCount++;
61 | 								}
62 | 								count++;
63 | 								if(count%1000==0){
64 | 									System.out.println("process words " + count + "...");
65 | 								}
66 | 							}
67 | 						}
68 | 					}					
69 | 				}
70 | 			}
71 | 			input.close();
72 | 			System.out.println(count + "\t" + conceptCount);
73 | 			return;
74 | 		} catch (Exception e) {
75 | 			e.printStackTrace();
76 | 		}
77 | 	}
78 | 	
79 | }
80 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/statistic/LCMC.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.statistic;
 2 | 
 3 | import java.io.File;
 4 | import java.io.FileInputStream;
 5 | import java.io.InputStream;
 6 | 
 7 | import javax.xml.stream.XMLEventReader;
 8 | import javax.xml.stream.XMLInputFactory;
 9 | import javax.xml.stream.events.StartElement;
10 | import javax.xml.stream.events.XMLEvent;
11 | 
12 | import ruc.irm.similarity.word.hownet2.concept.XiaConceptParser;
13 | 
14 | 
15 | 
16 | public class LCMC {
17 | 	
18 | 	public void countUnConceptWords(File xmlFile) throws Exception{
19 | 		int totalCount = 0, conceptCount = 0;
20 | 		XMLInputFactory inputFactory = XMLInputFactory.newInstance();
21 | 		InputStream input = null;
22 | 		input = new FileInputStream(xmlFile);
23 | 		XMLEventReader xmlEventReader = inputFactory.createXMLEventReader(input);
24 | 		while (xmlEventReader.hasNext()) {
25 | 			XMLEvent event = xmlEventReader.nextEvent();
26 | 			
27 | 			if (event.isStartElement()) {					
28 | 				StartElement startElement = event.asStartElement();
29 | 				//如果是word开始
30 | 				if(startElement.getName().toString().equals("w")){
31 | 					String word = xmlEventReader.getElementText();
32 | 					totalCount++;
33 | 					if(XiaConceptParser.getInstance().isConcept(word)){
34 | 						conceptCount++;
35 | 					}
36 | 				}					
37 | 			}
38 | 		}//
39 | 		input.close();
40 | 		System.out.println(totalCount + "\t" + conceptCount);
41 | 	}
42 | 	
43 | 	public static void main(String[] args) throws Exception {
44 | 		LCMC lcmc = new LCMC();
45 | 		lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_A.XML"));
46 | 		lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_B.XML"));
47 | 		lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_C.XML"));
48 | 		lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_D.XML"));
49 | 		lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_E.XML"));
50 | 		lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_F.XML"));
51 | 		lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_G.XML"));
52 | 		lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_H.XML"));
53 | 		lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_J.XML"));
54 | 		lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_K.XML"));
55 | 		lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_L.XML"));
56 | 		lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_M.XML"));
57 | 		lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_N.XML"));
58 | 		lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_P.XML"));
59 | 		lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_R.XML"));
60 | 	}
61 | }
62 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/text/DiceSimilarity.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.text;
 2 | 
 3 | import ruc.irm.similarity.Similaritable;
 4 | 
 5 | public class DiceSimilarity implements Similaritable {
 6 | 
 7 | 	@Override
 8 | 	public double getSimilarity(String item1, String item2) {
 9 | 		// TODO Auto-generated method stub
10 | 		return 0;
11 | 	}
12 | 
13 | }
14 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/util/About.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.util;
 2 | 
 3 | import com.google.common.io.Resources;
 4 | 
 5 | import javax.swing.*;
 6 | import javax.swing.text.StyledEditorKit;
 7 | import java.awt.*;
 8 | import java.io.IOException;
 9 | import java.net.URL;
10 | import java.net.URLClassLoader;
11 | 
12 | /**
13 |  * 关于xsimilarity项目的说明信息
14 |  *
15 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
16 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
17 |  */
18 | public class About extends JFrame {
19 |     private static final long serialVersionUID = -2307582155443587993L;
20 | 
21 |     public static JPanel createPanel() {
22 |         JPanel mainPanel = new JPanel();
23 |         mainPanel.setLayout(new BorderLayout());
24 |         JTextPane editorPane = new JTextPane();
25 |         editorPane.setEditable(false);
26 |         //让长文本自动换行
27 |         editorPane.setEditorKit(new StyledEditorKit());
28 |         editorPane.setContentType("text/html");
29 |         try {
30 |             URL url = Resources.getResource("about.html");//可以用html格式文件做你的帮助系统了
31 |             editorPane.setPage(url);
32 |         } catch (IOException e1) {
33 |             editorPane.setText(e1.getMessage());
34 |         }
35 |         //editorPane.setText("<html><body>个人主页：<a href='xiatian.irm.cn'>http://xiatian.irm.cn/</a></body></html>");
36 | 
37 | 
38 |         mainPanel.add(new JScrollPane(editorPane), BorderLayout.CENTER);
39 |         return mainPanel;
40 |     }
41 | 
42 |     public About() {
43 |         this.setTitle("关于XSimilarity");
44 | 
45 |         this.setDefaultCloseOperation(EXIT_ON_CLOSE);
46 |         this.setPreferredSize(new Dimension(600, 400));
47 |         this.getContentPane().add(createPanel());
48 |         this.pack();
49 |     }
50 | 
51 |     public static void main(String[] args) {
52 |         new About().setVisible(true);
53 |     }
54 | }
55 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/util/BlankUtils.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.util;
 2 | 
 3 | import java.util.Collection;
 4 | 
 5 | /**
 6 |  * 判断是否为空的工具类
 7 |  * 
 8 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
 9 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
10 |  */
11 | public class BlankUtils {
12 | 	/**
13 | 	 * 判断字符串s是否是空串
14 | 	 * @param s
15 | 	 * @return
16 | 	 */
17 | 	public static boolean isBlank(String string){
18 | 		return string==null || string.trim().equals("");
19 | 	}	
20 | 	
21 | 	/**
22 | 	 * 判断数组是否是空
23 | 	 * @param array
24 | 	 * @return
25 | 	 */
26 | 	public static boolean isBlank(Object[] array){
27 | 		return array==null || array.length==0;
28 | 	}
29 | 	
30 | 	/**
31 | 	 * 判断集合是否是空
32 | 	 * @param array
33 | 	 * @return
34 | 	 */
35 | 	public static boolean isBlank(Collection<? extends Object> array){
36 | 		return array==null || array.size()==0;
37 | 	}
38 | 	
39 | 	/**
40 | 	 * 判断所有的集合是否都为空
41 | 	 * @param collections 
42 | 	 * @return
43 | 	 */
44 | 	public static boolean isBlankAll(Collection<?>...collections){
45 | 		for(Collection<?> c:collections){
46 | 			if(!isBlank(c)){
47 | 				return false;
48 | 			}
49 | 		}
50 | 
51 | 		return true;	
52 | 	}
53 | 	
54 | 	/**
55 | 	 * 判断字符串strings中是否都是空串
56 | 	 * @param strings
57 | 	 * @return
58 | 	 */
59 | 	public static boolean isBlankAll(String... strings){
60 | 		for(String s:strings){
61 | 			if(!isBlank(s)){
62 | 				return false;
63 | 			}
64 | 		}
65 | 		
66 | 		return true;
67 | 	}
68 | 	
69 | 	/**
70 | 	 * 判断collections集合中是否至少有一个为空
71 | 	 * @param collections
72 | 	 * @return
73 | 	 */
74 | 	public static boolean isBlankAtLeastOne(Collection<?>...collections){
75 | 		for(Collection<?> c:collections){
76 | 			if(isBlank(c)){
77 | 				return true;
78 | 			}
79 | 		}
80 | 
81 | 		return false;	
82 | 	}
83 | 	
84 | 	/**
85 | 	 * 判断字符串strings中是否之首有一个为空
86 | 	 * @param strings
87 | 	 * @return
88 | 	 */
89 | 	public static boolean isBlankAtLeastOne(String... strings){
90 | 		for(String s:strings){
91 | 			if(isBlank(s)){
92 | 				return true;
93 | 			}
94 | 		}
95 | 		
96 | 		return false;
97 | 	}
98 | }
99 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/util/EditDistance.java:
--------------------------------------------------------------------------------
  1 | package ruc.irm.similarity.util;
  2 | 
  3 | /**
  4 |  * 
  5 |  * This class computes the edit distance between two strings using dynamic
  6 |  * programming. The dynamic programming part is in the method
  7 |  * printEditDistance().
  8 |  * 
  9 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
 10 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
 11 |  */
 12 | public class EditDistance {
 13 | 	/**
 14 | 	 * 获取删除代价
 15 | 	 * 
 16 | 	 * @return
 17 | 	 */
 18 | 	public int getDeletionCost() {
 19 | 		return 1;
 20 | 	}
 21 | 
 22 | 	/**
 23 | 	 * 获取插入代价
 24 | 	 * 
 25 | 	 * @return
 26 | 	 */
 27 | 	public int getInsertionCost() {
 28 | 		return 1;
 29 | 	}
 30 | 
 31 | 	/**
 32 | 	 * 获取替换代价
 33 | 	 * 
 34 | 	 * @return
 35 | 	 */
 36 | 	public int getSubstitutionCost(char a, char b) {
 37 | 		return (a == b) ? 0 : 1;
 38 | 	}
 39 | 
 40 | 	public int getEditDistance(String S, String T) {
 41 | 		int[][] D = null;
 42 | 		if (S == null)
 43 | 			S = "";
 44 | 		if (T == null)
 45 | 			T = "";
 46 | 
 47 | 		char[] a = S.toCharArray();
 48 | 		char[] b = T.toCharArray();
 49 | 
 50 | 		int n = a.length; // 字符串S的长度
 51 | 		int m = b.length; // 字符串T的长度
 52 | 
 53 | 		if (a.length == 0) {
 54 | 			return b.length;
 55 | 		} else if (b.length == 0) {
 56 | 			return a.length;
 57 | 		}
 58 | 
 59 | 		D = new int[a.length + 1][b.length + 1];
 60 | 		
 61 | 		/** 初始化D[i][0] */
 62 | 		for (int i = 1; i <= n; i++) {
 63 | 			D[i][0] = D[i - 1][0] + getDeletionCost();
 64 | 		}
 65 | 
 66 | 		/** 初始化D[0][j] */
 67 | 		for (int j = 1; j <= m; j++) {
 68 | 			D[0][j] = D[0][j - 1] + getInsertionCost();
 69 | 		}
 70 | 
 71 | 		for (int i = 1; i <= n; i++) {
 72 | 			for (int j = 1; j <= m; j++) {
 73 | 				D[i][j] = MathUtils.min(D[i - 1][j] + getDeletionCost(),
 74 | 						D[i][j - 1] + getInsertionCost(), D[i - 1][j - 1]
 75 | 								+ getSubstitutionCost(a[i - 1], b[j - 1]));
 76 | 			}
 77 | 		}
 78 | 
 79 | 		return D[n][m];
 80 | 	}
 81 | 
 82 | 	/**
 83 | 	 * 应与getEditDistance(S, T)等同
 84 | 	 * @param s
 85 | 	 * @param t
 86 | 	 * @return
 87 | 	 */
 88 | 	public static int getLevenshteinDistance(String s, String t) {
 89 | 		if (s == null || t == null) {
 90 | 			throw new IllegalArgumentException("Strings must not be null");
 91 | 		}
 92 | 		int d[][]; // matrix
 93 | 		int n; // length of s
 94 | 		int m; // length of t
 95 | 		int i; // iterates through s
 96 | 		int j; // iterates through t
 97 | 		char s_i; // ith character of s
 98 | 		char t_j; // jth character of t
 99 | 		int cost; // cost
100 | 
101 | 		// Step 1
102 | 		n = s.length();
103 | 		m = t.length();
104 | 		if (n == 0) {
105 | 			return m;
106 | 		}
107 | 		if (m == 0) {
108 | 			return n;
109 | 		}
110 | 		d = new int[n + 1][m + 1];
111 | 
112 | 		// Step 2
113 | 		for (i = 0; i <= n; i++) {
114 | 			d[i][0] = i;
115 | 		}
116 | 		for (j = 0; j <= m; j++) {
117 | 			d[0][j] = j;
118 | 		}
119 | 
120 | 		// Step 3
121 | 		for (i = 1; i <= n; i++) {
122 | 			s_i = s.charAt(i - 1);
123 | 
124 | 			// Step 4
125 | 			for (j = 1; j <= m; j++) {
126 | 				t_j = t.charAt(j - 1);
127 | 
128 | 				// Step 5
129 | 				if (s_i == t_j) {
130 | 					cost = 0;
131 | 				} else {
132 | 					cost = 1;
133 | 				}
134 | 
135 | 				// Step 6
136 | 				d[i][j] = MathUtils.min(d[i - 1][j] + 1, d[i][j - 1] + 1,
137 | 						d[i - 1][j - 1] + cost);
138 | 			}
139 | 		}
140 | 
141 | 		// Step 7
142 | 		return d[n][m];
143 | 	}
144 | 	
145 | }
146 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/util/FileUtils.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.util;
 2 | 
 3 | import java.io.BufferedOutputStream;
 4 | import java.io.BufferedReader;
 5 | import java.io.File;
 6 | import java.io.FileOutputStream;
 7 | import java.io.IOException;
 8 | import java.io.InputStream;
 9 | import java.io.InputStreamReader;
10 | 
11 | /**
12 |  * 与文件相关的工具类
13 |  * 
14 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
15 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
16 |  */
17 | public class FileUtils {
18 | 	/**
19 | 	 * 根据指定编码从输入流中依次遍历每一行文字
20 | 	 * 
21 | 	 * @param input
22 | 	 *            输入流
23 | 	 * @param encoding
24 | 	 *            输入流所用的文字编码
25 | 	 * @param event
26 | 	 *            遍历每一行时触发的事件处理
27 | 	 * @throws IOException
28 | 	 */
29 | 	public static void traverseLines(InputStream input, String encoding, TraverseEvent<String> event) throws IOException {
30 | 		BufferedReader in = new BufferedReader(new InputStreamReader(input,	encoding));
31 | 		String line = null;
32 | 
33 | 		while ((line = in.readLine()) != null) {
34 | 			event.visit(line);
35 | 		}
36 | 
37 | 		input.close();
38 | 		in.close();
39 | 	}
40 | 
41 | 	/**
42 | 	 * 保存字符串到文件中
43 | 	 * @param content
44 | 	 * @param fileName
45 | 	 * @return
46 | 	 */
47 | 	public static boolean saveStringToFile(String content, String fileName) {
48 | 		boolean rtn = false;
49 | 		BufferedOutputStream out = null;
50 | 		try {			
51 | 			File file = new File(fileName);
52 | 			file.getParentFile().mkdirs();
53 | 			
54 | 			out = new BufferedOutputStream(new FileOutputStream(file));
55 | 			out.write(content.getBytes("GBK"));
56 | 			out.close();
57 | 			rtn = true;
58 | 		} catch (Exception e) {
59 | 			System.out.println("saveStringToFile error:" + e.getMessage());
60 | 		} finally {
61 | 			try {
62 | 				out.close();
63 | 			} catch (Exception e) {
64 | 			}
65 | 		}
66 | 		return rtn;
67 | 	}
68 | 	
69 | 	public static void main(String[] args) {
70 | 		int count = 0;
71 | 		File dir = new File("G:/juanjuantx");
72 | 		for(File a:dir.listFiles()){
73 | 			if(a.isDirectory()){
74 | 				for(File zy: a.listFiles()){
75 | 					if(zy.listFiles()!=null)
76 | 					for(File rar:zy.listFiles()){
77 | 						if(rar.isFile() && rar.getName().endsWith(".rar")){
78 | 							count++;
79 | 						}
80 | 					}
81 | 				}
82 | 			}
83 | 		}
84 | 		System.out.println(count);
85 | 	}
86 | }
87 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/util/MathUtils.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.util;
 2 | 
 3 | public class MathUtils {
 4 | 	public static int min(int... values){
 5 | 		int min = Integer.MAX_VALUE;
 6 | 		for(int v:values){
 7 | 			min = (v<min)?v:min;
 8 | 		}
 9 | 		return min;
10 | 	}
11 | }
12 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/util/PinyinUtils.java:
--------------------------------------------------------------------------------
  1 | package ruc.irm.similarity.util;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.IOException;
  5 | import java.io.InputStream;
  6 | import java.io.InputStreamReader;
  7 | import java.util.HashMap;
  8 | import java.util.HashSet;
  9 | import java.util.Map;
 10 | import java.util.Set;
 11 | 
 12 | /**
 13 |  * 拼音处理的工具，负责从拼音词典加载内容，根据汉字词语或汉字查找拼音
 14 |  * 
 15 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
 16 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
 17 |  */
 18 | public class PinyinUtils {
 19 | 	/** 拼音的Map词典, 一个汉字可能对应多个拼音, 它所有的拼音放到一个集合中 */
 20 | 	private Map<Character, Set<String>> pinyinDict = null;
 21 | 	
 22 | 	/** 单例 */
 23 | 	private static PinyinUtils instance = null;
 24 | 	
 25 | 	private PinyinUtils() throws IOException{
 26 | 		//从classpath中加载拼音词典文件
 27 | 		InputStream input = this.getClass().getResourceAsStream("/data/F02-GB2312-to-PuTongHua-PinYin.txt");
 28 | 		
 29 | 		BufferedReader in = new BufferedReader(new InputStreamReader(input,	"UTF-8"));
 30 | 		String line = null;
 31 | 
 32 | 		MyTraverseEvent event = new MyTraverseEvent();
 33 | 		while ((line = in.readLine()) != null) {
 34 | 			event.visit(line);
 35 | 		}
 36 | 
 37 | 		input.close();
 38 | 		in.close();
 39 | 		
 40 | 		this.pinyinDict = event.getPinyins();
 41 | 	}
 42 | 	
 43 | 	public static PinyinUtils getInstance(){
 44 | 		if(instance == null){
 45 | 			try {
 46 | 				instance = new PinyinUtils();
 47 | 			} catch (IOException e) {				
 48 | 				e.printStackTrace();
 49 | 			}
 50 | 		}
 51 | 		
 52 | 		return instance;
 53 | 	}
 54 | 	
 55 | 	/**
 56 | 	 * 获取汉字的拼音, 由于汉字具有多音字，故返回一个集合
 57 | 	 * @param hanzi
 58 | 	 * @return
 59 | 	 */
 60 | 	public Set<String> getPinyin(Character hanzi){
 61 | 		Set<String> set = pinyinDict.get(hanzi);
 62 | 		if(set==null || set.size()==0){
 63 | 			set = new HashSet<String>();
 64 | 			set.add(hanzi.toString());
 65 | 		}
 66 | 		return set;
 67 | 	}
 68 | 	
 69 | 	/**
 70 | 	 * 获取词语的拼音, 一个词语可能对应多个拼音，把所有可能的组合放到集合中返回
 71 | 	 * @param word
 72 | 	 * @return
 73 | 	 */
 74 | 	public Set<String> getPinyin(String word){
 75 | 		Set<String> word_set = new HashSet<String>();
 76 | 		for(int i=0; i<word.length(); i++){
 77 | 			Set<String> hanzi_set = getPinyin(word.charAt(i));
 78 | 			if(word_set==null || word_set.size()==0){
 79 | 				word_set.addAll(hanzi_set);
 80 | 				continue;
 81 | 			}
 82 | 			
 83 | 			Set<String> tmp_set = new HashSet<String>();
 84 | 			for(String w:word_set){
 85 | 				for(String h:hanzi_set){
 86 | 					tmp_set.add(w + h);
 87 | 				}
 88 | 			}
 89 | 			
 90 | 			word_set = tmp_set;		
 91 | 		}
 92 | 
 93 | 		return word_set;
 94 | 	}
 95 | 	
 96 | 	/**
 97 | 	 * 获取拼音字符串，多音字只取一个
 98 | 	 * @param word
 99 | 	 * @return
100 | 	 */
101 | 	public String getPinyinSingle(String word){
102 | 		StringBuffer sb = new StringBuffer();
103 | 		for(int i=0; i<word.length(); i++){
104 | 			sb.append(getPinyin(word.charAt(i)).iterator().next());
105 | 		}
106 | 		return sb.toString();
107 | 	}
108 | 	
109 | 	/**
110 | 	 * 获取拼音串，对于多音字，给出所有拼音
111 | 	 * @param word
112 | 	 * @return
113 | 	 */
114 | 	public String getPinyinString(String word){
115 | 		StringBuffer sb = new StringBuffer();
116 | 		for(int i=0; i<word.length(); i++){
117 | 			Set<String> pinyin = getPinyin(word.charAt(i));
118 | 			sb.append(pinyin.toString());
119 | 		}
120 | 		return sb.toString();
121 | 	}
122 | 	
123 | 	/**
124 | 	 * 获取拼音首字母
125 | 	 * @param word
126 | 	 * @return
127 | 	 */
128 | 	public String getPinyinHead(String word){
129 | 		StringBuffer sb = new StringBuffer();
130 | 		for(int i=0; i<word.length(); i++){
131 | 			sb.append(getPinyin(word.charAt(i)).iterator().next().charAt(0));
132 | 		}
133 | 		return sb.toString();
134 | 	}
135 | 	
136 | 	private static class MyTraverseEvent {
137 | 		/** 一个汉字对应多个拼音, 多个拼音放到集合中 */
138 | 		private Map<Character, Set<String>> pinyins = null;
139 | 		
140 | 		public MyTraverseEvent(){
141 | 			this.pinyins = new HashMap<Character, Set<String>>();
142 | 		}
143 | 		
144 | 		public Map<Character, Set<String>> getPinyins(){
145 | 			return pinyins;
146 | 		}
147 | 		
148 | 		public boolean visit(String item) {
149 | 			if(item.startsWith("//")){
150 | 				return true;
151 | 			}
152 | 			
153 | 			char hanzi = item.charAt(0);
154 | 			//String pinyin = item.substring(2, item.length()-1);
155 | 			String pinyin = item.substring(2, item.length());
156 | 			Set<String> set = pinyins.get(hanzi);
157 | 			if(set==null){
158 | 				set = new HashSet<String>();
159 | 			}
160 | 			set.add(pinyin);
161 | 			
162 | 			pinyins.put(hanzi, set);
163 | 			return true;
164 | 		}		
165 | 	}
166 | 		
167 | }
168 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/util/TraverseEvent.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.util;
 2 | 
 3 | /**
 4 |  * 遍历接口, 对于需要遍历的东西，通过传入该接口，可以实现实际的访问处理
 5 |  * 
 6 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
 7 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
 8 |  * 
 9 |  * @param <T>
10 |  */
11 | public interface TraverseEvent<T> {
12 | 	
13 | 	/** 
14 | 	 * 遍历时访问其中的一个条目
15 | 	 * @param item
16 | 	 * @return
17 | 	 */
18 | 	public boolean visit(T item);
19 | }
20 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/util/XmlException.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.util;
 2 | 
 3 | /**
 4 |  * Runtime exception for XML handling.
 5 |  * 
 6 |  * @author carver
 7 |  */
 8 | public class XmlException extends RuntimeException {
 9 | 
10 | 	private static final long serialVersionUID = 381260478228427716L;
11 | 
12 | 	public static final String XML_PAYLOAD_EMPTY = "xml.payload.empty";
13 | 	public static final String XML_ENCODE_ERROR = "xml.encoding.invalid";
14 | 	public static final String FILE_NOT_FOUND = "xml.file.not.found";
15 | 	public static final String XML_PARSE_ERROR = "xml.parse.error";
16 | 	public static final String XML_READ_ERROR = "xml.read.error";
17 | 	public static final String XML_VALIDATE_ERROR = "xml.validate.error";
18 | 	public static final String XML_TRANSFORM_ERROR = "xml.transform.error";
19 | 
20 | 	public XmlException() {
21 | 		super();
22 | 	}
23 | 
24 | 	public XmlException(String key, Throwable cause) {
25 | 		super(key, cause);
26 | 	}
27 | 
28 | 	public XmlException(String key) {
29 | 		super(key);
30 | 	}
31 | 
32 | 	public XmlException(Throwable cause) {
33 | 		super(cause);
34 | 	}
35 | 
36 | }


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/CharBasedSimilarity.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.word;
 2 | 
 3 | import java.util.ArrayList;
 4 | import java.util.List;
 5 | 
 6 | import ruc.irm.similarity.Similaritable;
 7 | 
 8 | 
 9 | /**
10 |  * 字面相似度计算方法
11 |  * 
12 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
13 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
14 |  */
15 | public class CharBasedSimilarity implements Similaritable {
16 | 
17 | 	private double alpha = 0.6;
18 | 	private double beta = 0.4;
19 | 	
20 | 	@Override
21 | 	public double getSimilarity(String word1, String word2) {
22 | 		if(isBlank(word1)&& isBlank(word2)){
23 | 			return 1.0;
24 | 		}
25 | 		if(isBlank(word1)|| isBlank(word2)){
26 | 			return 0.0;
27 | 		}
28 | 		
29 | 		List<Character> sameHZ = new ArrayList<Character>();
30 | 		
31 | 		String longString = word1.length()>=word2.length()?word1:word2;
32 | 		String shortString = word1.length()<word2.length()?word1:word2;
33 | 		for(int i=0; i<longString.length(); i++){
34 | 			Character ch = longString.charAt(i);
35 | 			if(shortString.contains(ch.toString())){
36 | 				sameHZ.add(ch);				
37 | 			}
38 | 		}
39 | 		
40 | 		double dp = Math.min(1.0*word1.length()/word2.length(), 1.0*word2.length()/word1.length());
41 | 		double part1 = alpha*(1.0*sameHZ.size()/word1.length() + 1.0*sameHZ.size()/word2.length())/2.0;				
42 | 		double part2 = beta*dp*(getWeightedResult(word1, sameHZ) + getWeightedResult(word2, sameHZ))/2.0;
43 | 
44 | 		return part1+part2;
45 | 	}
46 | 
47 | 	private double getWeightedResult(String word1, List<Character> sameHZ){
48 | 		double top = 0;
49 | 		double bottom = 0;
50 | 		for(int i=0; i<word1.length(); i++){
51 | 			if(sameHZ.contains(word1.charAt(i))){
52 | 				top+=(i+1);
53 | 			}
54 | 			bottom += (i+1);
55 | 		}
56 | 		return 1.0*top/bottom;
57 | 	}
58 | 	
59 | 	private boolean isBlank(String str){
60 | 		return str == null || str.trim().equals("");
61 | 	}
62 | 	
63 | }
64 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/WordSimilarity.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.word;
2 | 
3 | import ruc.irm.similarity.Similaritable;
4 | 
5 | public interface WordSimilarity extends Similaritable {
6 | 
7 | }
8 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/cilin/Cilin.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.word.cilin;
 2 | 
 3 | import java.util.Set;
 4 | 
 5 | import ruc.irm.similarity.Similaritable;
 6 | 
 7 | 
 8 | public class Cilin implements Similaritable {
 9 | 	private static Cilin instance = null;
10 | 	
11 | 	public static Cilin getInstance(){
12 | 		if(instance == null){
13 | 			instance = new Cilin();
14 | 		}
15 | 		return instance;
16 | 	}
17 | 	
18 | 	private Cilin(){
19 | 		
20 | 	}
21 | 	
22 | 	@Override
23 | 	public double getSimilarity(String item1, String item2) {
24 | 		double sim = 0.0;
25 | 		
26 | 		if(item1==null && item2==null){
27 | 			return 1.0;
28 | 		}else if(item1==null || item2==null){
29 | 			return 0.0;
30 | 		}else if(item1.equalsIgnoreCase(item2)){
31 | 			return 1.0;
32 | 		}
33 | 		
34 | 		Set<String> codeSet1 = CilinDb.getInstance().getCilinCoding(item1);
35 | 		Set<String> codeSet2 = CilinDb.getInstance().getCilinCoding(item2);
36 | 		if(codeSet1==null || codeSet2==null){
37 | 			return 0.0;
38 | 		}
39 | 		for(String code1:codeSet1){
40 | 			for(String code2:codeSet2){
41 | 				double s = getSimilarityByCode(code1, code2);
42 | 				System.out.println(code1 + "-" + code2 + "-" +CilinCoding.calculateCommonWeight(code1, code2));
43 | 				if(sim<s) sim = s;
44 | 			}
45 | 		}
46 | 		return sim;
47 | 	}
48 | 	
49 | 	public double getSimilarityByCode(String code1, String code2){
50 | 		return CilinCoding.calculateCommonWeight(code1, code2)/CilinCoding.TOTAL_WEIGHT;
51 | 	}
52 | 
53 | }
54 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/cilin/CilinCoding.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.word.cilin;
 2 | 
 3 | /**
 4 |  * 表2-3 哈工大词林扩展版规则编码表<br/>
 5 |  * <table border="1" style="color:red;">
 6 |  * <tr>
 7 |  * <td>编码位</td><td>1</td><td>	2</td><td>3</td><td>4</td><td>5</td><td>6</td><td>7</td><td>8</td>
 8 |  * </tr>
 9 |  * <tr>
10 |  * <td>编码示例</td><td>C</td><td>b</td><td>0</td><td>7</td><td>A</td><td>0</td><td>3</td><td>=</td>
11 |  * </tr><tr>
12 |  * <td>类别级别</td><td>第一级</td><td>第二级</td><td colspan="2">第三级</td><td>第四级</td><td colspan="2">第五级</td><td>标记位</td><td>
13 |  * </tr><tr>
14 |  * <td>类别含义</td><td>大类</td><td>中类</td><td colspan="2">小类</td><td>词群</td><td colspan="2">原子词群</td><td>词语关系</td>
15 |  * </tr>
16 |  * </table>
17 |  * <br/>
18 |  * 表中编码位从左到右顺序排列，其中，第8位对应的标记位为“=”、“#”和“@”三种符号之一。其中“=”代表常见的“同义”关系，“#”代表词语之间的相关关系，“@”则代表词语自我封闭的独立性质，它在词典中既没有同义词，也没有相关词。
19 |  * 
20 |  * 
21 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
22 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
23 |  */
24 | public class CilinCoding {
25 | 	public static double[] WEIGHT = new double[]{1.2, 1.2, 1.0, 1.0, 0.8, 0.4};
26 | 	public static double TOTAL_WEIGHT = 5.6;
27 | 	
28 | 	public static String getCodeLevel(String code,int level){
29 | 		switch(level){
30 | 		case 1:
31 | 			return code.substring(0, 1);
32 | 		case 2:
33 | 			return code.substring(1, 2);
34 | 		case 3:
35 | 			return code.substring(2, 4);
36 | 		case 4:
37 | 			return code.substring(4, 5);
38 | 		case 5:
39 | 			return code.substring(5, 7);
40 | 		case 6:
41 | 			return code.substring(7);
42 | 		}
43 | 
44 | 		return "";
45 | 	}
46 | 	
47 | 	/**
48 | 	 * 获取共同部分编码的权重
49 | 	 * @param code1
50 | 	 * @param code2
51 | 	 * @return
52 | 	 */
53 | 	public static double calculateCommonWeight(String code1, String code2){
54 | 		double weight = 0.0;
55 | 		for(int i=1; i<=6; i++){
56 | 			String c1 = getCodeLevel(code1,i);
57 | 			String c2 = getCodeLevel(code2,i);
58 | 			if(c1.equals(c2)){
59 | 				weight += WEIGHT[i-1];
60 | 			}else{
61 | 				break;
62 | 			}
63 | 		}
64 | 		return weight;
65 | 	}
66 | 	
67 | 	public static String printCoding(String code){
68 | 		StringBuilder sb = new StringBuilder();
69 | 		for(int i=1; i<=6; i++){
70 | 			if(i==1){
71 | 				sb.append("[LEVEL_" + i);
72 | 			}else{
73 | 				sb.append(", LEVEL_" + i);
74 | 			}
75 | 			sb.append(": ");
76 | 			sb.append(getCodeLevel(code, i));
77 | 		}
78 | 		sb.append("]");
79 | 		
80 | 		return sb.toString();
81 | 	}
82 | }
83 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/cilin/CilinDb.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.word.cilin;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.InputStream;
 5 | import java.util.HashMap;
 6 | import java.util.HashSet;
 7 | import java.util.Map;
 8 | import java.util.Set;
 9 | import java.util.zip.GZIPInputStream;
10 | 
11 | import org.slf4j.Logger;
12 | import org.slf4j.LoggerFactory;
13 | import ruc.irm.similarity.util.FileUtils;
14 | import ruc.irm.similarity.util.TraverseEvent;
15 | 
16 | /**
17 |  * 词林数据库
18 |  * 
19 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
20 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
21 |  */
22 | public class CilinDb {
23 | 	/** the logger */
24 | 	protected static Logger LOG = LoggerFactory.getLogger(CilinDb.class);
25 | 	/** 以词语为主键的索引表 */
26 | 	private Map<String, Set<String>> wordIndex = new HashMap<String, Set<String>>();
27 | 	/** 以编码为主键的索引表 */
28 | 	private Map<String, Set<String>> codeIndex = new HashMap<String, Set<String>>();
29 | 	
30 | 	private static CilinDb instance = null;
31 | 	
32 | 	public static CilinDb getInstance(){
33 | 		if(instance == null){
34 | 			try {
35 | 				instance = new CilinDb();
36 | 			} catch (IOException e) {
37 | 				LOG.error(e.toString());
38 | 			}
39 | 		}
40 | 		return instance;
41 | 	}
42 | 	
43 | 	private CilinDb() throws IOException{
44 | 		InputStream input = new GZIPInputStream(this.getClass().getResourceAsStream("/data/cilin.db.gz"));
45 | 		
46 | 		TraverseEvent<String> event = new TraverseEvent<String>(){
47 | 			@Override
48 | 			public boolean visit(String line) {
49 | 				String[] items = line.split(" ");
50 | 				Set<String> set = new HashSet<String>();
51 | 				for(int i=2; i<items.length; i++){
52 | 					String code = items[i].trim();
53 | 					if(!code.equals("")){
54 | 						set.add(code);
55 | 						
56 | 						//加入codeIndex编码
57 | 						Set<String> codeWords = codeIndex.get(code);
58 | 						if(codeWords==null){
59 | 							codeWords = new HashSet<String>();
60 | 						}
61 | 						codeWords.add(items[0]);
62 | 						codeIndex.put(code, codeWords);
63 | 					}
64 | 				}
65 | 				wordIndex.put(items[0], set);
66 | 				items = null;
67 | 				return false;
68 | 			}};
69 | 		LOG.info("loading cilin dictionary...");
70 | 		long time = System.currentTimeMillis();
71 | 		
72 | 		FileUtils.traverseLines(input, "UTF8", event);
73 | 		
74 | 		time = System.currentTimeMillis() - time;
75 | 		LOG.info("loading cilin dictionary completely. time elapsed: " + time);
76 | 		
77 | 	}
78 | 	
79 | 	/**
80 | 	 * 获取某个词语的词林编码，一个词语可以有多个编码，通过Set给出
81 | 	 * @param word
82 | 	 * @return
83 | 	 */
84 | 	public Set<String> getCilinCoding(String word){
85 | 		return wordIndex.get(word);
86 | 	}
87 | 	
88 | 	public Set<String> getCilinWords(String code){
89 | 		return codeIndex.get(code);
90 | 	}
91 | 	
92 | 	public static void main(String[] args) {
93 | 		CilinDb db = CilinDb.getInstance();
94 | 		String code = db.getCilinCoding("中国").iterator().next();
95 | 		System.out.println(CilinCoding.printCoding(code));
96 | 		System.out.println(db.getCilinWords(code));
97 | 	}
98 | }
99 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet/Hownet.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.word.hownet;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | import org.slf4j.Logger;
 6 | import org.slf4j.LoggerFactory;
 7 | import ruc.irm.similarity.Similaritable;
 8 | import ruc.irm.similarity.word.hownet2.concept.BaseConceptParser;
 9 | import ruc.irm.similarity.word.hownet2.concept.XiaConceptParser;
10 | import ruc.irm.similarity.word.hownet2.sememe.XiaSememeParser;
11 | import ruc.irm.similarity.word.hownet2.sememe.BaseSememeParser;
12 | 
13 | /**
14 |  * Hownet的主控制类, 通过知网的概念和义原及其关系计算汉语词语之间的相似度. 
15 |  * 相似度的计算理论参考论文《汉语词语语义相似度计算研究》
16 |  * 
17 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
18 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
19 |  * 
20 |  * @see ruc.irm.similarity.Similaritable
21 |  */
22 | public class Hownet implements Similaritable{	
23 | 	/** the logger */
24 | 	private static final Logger LOG = LoggerFactory.getLogger(Hownet.class);
25 | 	/** 知网的单例 */
26 | 	private static Hownet instance = null;
27 | 	
28 | 	private BaseConceptParser conceptParser = null;
29 | 	
30 | 	private Hownet(){
31 | 		try {
32 | 			BaseSememeParser sememeParser = new XiaSememeParser();
33 | 			conceptParser = new XiaConceptParser(sememeParser);
34 | 		} catch (IOException e) {			
35 | 			e.printStackTrace();
36 | 			LOG.error(e.toString());
37 | 		}
38 | 	}
39 | 	
40 | 	/**
41 | 	 * 单例获取知网对象
42 | 	 * @return
43 | 	 */
44 | 	public static Hownet instance(){
45 | 		if(null == instance){
46 | 			instance = new Hownet();
47 | 		}
48 | 		
49 | 		return instance;
50 | 	}
51 | 	
52 | 	/**
53 | 	 * 获取概念解析器
54 | 	 * @return
55 | 	 */
56 | 	public BaseConceptParser getConceptParser(){
57 | 		return conceptParser;
58 | 	}
59 | 		
60 | 	public double getSimilarity(String item1, String item2) {		
61 | 		return conceptParser.getSimilarity(item1, item2);
62 | 	}
63 | 		
64 | }
65 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet/HownetMeta.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.word.hownet;
 2 | 
 3 | /**
 4 |  * Metadata for Hownet
 5 |  * 
 6 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
 7 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
 8 |  */
 9 | public interface HownetMeta {	
10 | 	/** Algorithm of XIA Tian */
11 | 	public static final int ALGORITHM_XIA = 1;
12 | 	
13 | 	/** Algorithm of LIU Qun */
14 | 	public static final int ALGORITHM_LIU = 2;
15 | 	
16 | 	/**
17 | 	 * Hownet symbol descriptions
18 | 	 */
19 | 	public static final String Symbol_Descriptions[][] = {
20 | 	      {
21 | 	      "#", "表示与其相关"}
22 | 	      , {
23 | 	      "%", "是其部分"}
24 | 	      , {
25 | 	      "$", "可以被该V处置，或是该V的受事、对象、领有物，或内容"}
26 | 	      , {
27 | 	      "*", "施事或工具"}
28 | 	      , {
29 | 	      "+", "所标记的角色是隐性的，几乎在实际语言中不会出现"}
30 | 	      , {
31 | 	      "&", "指向"}
32 | 	      , {
33 | 	      "~", "多半是，多半有，很可能"}
34 | 	      , {
35 | 	      "@", "可以做V的空间或时间"}
36 | 	      , {
37 | 	      "?", "可以使N的材料"}
38 | 	      , {
39 | 	      "(", "至于其中的应该是一个词标记"}
40 | 	      , {
41 | 	      "^", "不存在，或没有，或不能"}
42 | 	      , {
43 | 	      "!", "表示某一属性为一敏感的属性，如味道之与食物"}
44 | 	      , {
45 | 	      "[", "标示概念的共性属性"}
46 | 	  };
47 | 	
48 | 	/** γ：具体词与义元的相似度一律为一个较小的常数 */
49 | 	public static final double gamma = 0.2; 
50 | 	
51 | 	/** δ:任一个非空值与空值的相似度为一个较小的常数，此处为0.2 */
52 | 	public static final double delta = 0.2;
53 | 
54 | 	/** β1实词概念第一基本义原描述式的权重 */
55 | 	public static final double beta1 = 0.5;
56 | 	/** β2实词概念其他基本义原描述式的权重 */
57 | 	public static final double beta2 = 0.2;
58 | 	/** β3实词概念关系义原描述式的权重 */
59 | 	public static final double beta3 = 0.17;
60 | 	/** β4实词概念符号义原描述式的权重 */
61 | 	public static final double beta4 = 0.13;
62 | 
63 | 	/** 
64 | 	 * Θ 计算后面概念的义原与参照概念所有义原的最大相似度, 并乘以两个概念主义原相似度的积(主义原通过该方式起约束作用),
65 | 	 * 如果数值大于该值时才会起参照作用, 去掉冗余的不重要义原 
66 | 	 */
67 | 	public static final double PARAM_THETA = 0.5;
68 | 	/** 
69 | 	 * Ω 计算前面概念的义原与参照概念所有义原的最大相似度，并乘以两个概念主义原相似度的积(主义原通过该方式起约束作用),
70 | 	 * 如果数值大于该值时才会调整前面概念的义原符号, 以起修正作用
71 | 	 */
72 | 	public static final double PARAM_OMEGA = 0.8;
73 | 	/** */
74 | 	public static final double PARAM_XI = 0.6;	
75 | 
76 | }
77 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet/concept/Concept.java:
--------------------------------------------------------------------------------
  1 | package ruc.irm.similarity.word.hownet.concept;
  2 | 
  3 | import java.util.ArrayList;
  4 | import java.util.List;
  5 | import java.util.StringTokenizer;
  6 | 
  7 | import ruc.irm.similarity.word.hownet.HownetMeta;
  8 | 
  9 | 
 10 | /**
 11 |  * 知网的概念表示类 <br/>example和英文部分对于相似度的计算不起作用，考虑到内存开销， 在概念的表示中去掉了这部分数据的对应定义
 12 |  * 
 13 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
 14 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
 15 |  * @deprecated
 16 |  */
 17 | public class Concept implements HownetMeta, Comparable<Concept> {
 18 | 	/** 中文概念名称 */
 19 | 	protected String word;
 20 | 	/** 词性: Part of Speech */
 21 | 	protected String pos;
 22 | 	/** 定义 */
 23 | 	protected String define;
 24 | 
 25 | 	/** 是否是实词，false表示为虚词, 一般为实词 */
 26 | 	protected boolean bSubstantive;
 27 | 	/** 第一基本义原 */
 28 | 	protected String mainSememe;
 29 | 	/** 其他基本义原 */
 30 | 	protected String[] secondSememes;
 31 | 	/** 关系义元原 */
 32 | 	protected String[] relationSememes;
 33 | 	/** 关系符号描述 */
 34 | 	protected String[] symbolSememes;
 35 | 
 36 | 	static String[][] Concept_Type = { { "=", "事件" },
 37 | 			{ "aValue|属性值", "属性值" }, { "qValue|数量值", "数量值" },
 38 | 			{ "attribute|属性", "属性" }, { "quantity|数量", "数量" },
 39 | 			{ "unit|", "单位" }, { "%", "部件" } };	
 40 | 
 41 | 	public Concept(String word, String pos, String def) {		
 42 | 		this.word = word;
 43 | 		this.pos = pos;
 44 | 		this.define = (def == null) ? "" : def.trim();
 45 | 		
 46 | 		// 虚词用{***}表示
 47 | 		if (define.length() > 0 
 48 | 				&& define.charAt(0) == '{'
 49 | 				&& define.charAt(define.length() - 1) == '}'){
 50 | 			this.bSubstantive = false;
 51 | 		} else {
 52 | 			this.bSubstantive = true;
 53 | 		}
 54 | 
 55 | 		parseDefine();
 56 | 	}
 57 | 
 58 | 	/**
 59 | 	 * 处理定义，把定义分为第一基本义元、其他基本义元、关系义元和符号义元四类
 60 | 	 */
 61 | 	private void parseDefine() {
 62 | 		List<String> secondList = new ArrayList<String>(); 		//其他基本义原
 63 | 		List<String> relationList = new ArrayList<String>(); 	//关系义原
 64 | 		List<String> symbolList = new ArrayList<String>(); 		//符号义原
 65 | 		
 66 | 		String tokenString = this.define;
 67 | 
 68 | 		//如果不是实词，则处理“{}”中的内容
 69 | 		if (!this.bSubstantive) {			
 70 | 			tokenString = define.substring(1, define.length() - 1);
 71 | 		}
 72 | 		
 73 | 		StringTokenizer token = new StringTokenizer(tokenString, ",", false);
 74 | 
 75 | 		// 第一个为第一基本义元
 76 | 		if (token.hasMoreTokens()) {
 77 | 			this.mainSememe = token.nextToken();
 78 | 		}
 79 | 		
 80 | 		main_loop: while (token.hasMoreTokens()) {
 81 | 			String item = token.nextToken();
 82 | 			if (item.equals("")) continue;
 83 | 			
 84 | 			// 先判断是否为符号义元
 85 | 			String symbol = item.substring(0, 1);		
 86 | 			for(int i=0;i< Symbol_Descriptions.length;i++){
 87 | 		    	if(symbol.equals( Symbol_Descriptions[i][0])){
 88 | 		            symbolList.add(item);		            
 89 | 		            continue main_loop;
 90 | 		    	}
 91 | 			}
 92 | 			
 93 | 			//如果不是符号义元，则进一步判断是关系义元还是第二基本义元, 带有“=”表示关系义原
 94 | 			if (item.indexOf('=') > 0){
 95 | 				relationList.add(item);
 96 | 			} else {
 97 | 				secondList.add(item);
 98 | 			}			
 99 | 		}
100 | 		
101 | 		this.secondSememes = secondList.toArray(new String[secondList.size()]);
102 | 		this.relationSememes = relationList.toArray(new String[relationList.size()]);
103 | 		this.symbolSememes = symbolList.toArray(new String[symbolList.size()]);
104 | 	}
105 | 	
106 | 	/**
107 | 	 * 获取第一义元
108 | 	 * 
109 | 	 * @return
110 | 	 */
111 | 	public String getMainSememe() {
112 | 		return mainSememe;
113 | 	}
114 | 	
115 | 	/**
116 | 	 * 获取其他基本义元描述
117 | 	 * 
118 | 	 * @return
119 | 	 */
120 | 	public String[] getSecondSememes() {
121 | 		return secondSememes;
122 | 	}
123 | 
124 | 	/**
125 | 	 * 获取关系义元描述
126 | 	 * 
127 | 	 * @return
128 | 	 */
129 | 	public String[] getRelationSememes() {
130 | 		return relationSememes;
131 | 	}
132 | 
133 | 	/**
134 | 	 * 获取符号义元描述
135 | 	 * 
136 | 	 * @return
137 | 	 */
138 | 	public String[] getSymbolSememes() {
139 | 		return symbolSememes;
140 | 	}
141 | 
142 | 	@Override
143 | 	public String toString() {
144 | 		StringBuilder sb = new StringBuilder();
145 | 		sb.append("name=");
146 | 		sb.append(this.word);
147 | 		sb.append("; pos=");
148 | 		sb.append(this.pos);
149 | 		sb.append("; define=");
150 | 		sb.append(this.define);
151 | 		sb.append("; 第一基本义元:[" + mainSememe);
152 | 		
153 | 		sb.append("]; 其他基本义元描述:[");
154 | 		for(String sem: secondSememes){
155 | 			sb.append(sem);
156 | 			sb.append(";");
157 | 		}
158 | 
159 | 		sb.append("]; [关系义元描述:");
160 | 		for(String sem: relationSememes){
161 | 			sb.append(sem);
162 | 			sb.append(";");
163 | 		}
164 | 
165 | 		sb.append("]; [关系符号描述:");
166 | 		for(String sem: symbolSememes){
167 | 			sb.append(sem);
168 | 			sb.append(";");
169 | 		}
170 | 		sb.append("]");
171 | 		return sb.toString();
172 | 	}
173 | 
174 | 	/**
175 | 	 * 是实词还是虚词
176 | 	 * 
177 | 	 * @return true:实词；false:虚词
178 | 	 */
179 | 	public boolean isSubstantive() {
180 | 		return this.bSubstantive;
181 | 	}
182 | 
183 | 	public String getWord() {
184 | 		return word;
185 | 	}
186 | 
187 | 	public void setWord(String word) {
188 | 		this.word = word;
189 | 	}
190 | 
191 | 	public String getPos() {
192 | 		return pos;
193 | 	}
194 | 
195 | 	public void setPos(String pos) {
196 | 		this.pos = pos;
197 | 	}
198 | 
199 | 	public String getDefine() {
200 | 		return define;
201 | 	}
202 | 
203 | 	public void setDefine(String define) {
204 | 		this.define = define;
205 | 	}
206 | 
207 | 	/**
208 | 	 * 获取该概念的类型
209 | 	 * 
210 | 	 * @return
211 | 	 */
212 | 	public String getType() {
213 | 		for (int i = 0; i < Concept_Type.length; i++) {
214 | 			if (define.toUpperCase().indexOf(Concept_Type[i][0].toUpperCase()) >= 0) {
215 | 				return Concept_Type[i][1];
216 | 			}
217 | 		}
218 | 		return "普通概念";
219 | 	}	
220 | 	
221 | 	/**
222 | 	 * 按照概念的名称进行比较
223 | 	 */
224 | 	public int compareTo(Concept o) {
225 | 		return word.compareTo(o.word);
226 | 	}
227 | 
228 | 	//////////////////////////////////////////////
229 | 	/**
230 | 	 * 方便在parse中比较概念词语加入的方法
231 | 	 * @param another
232 | 	 * @return
233 | 	 */
234 | 	public int compareTo(String another){
235 | 		return word.compareTo(another);
236 | 	}
237 | 	
238 | 	public boolean equals(String another){
239 | 		return word.equals(another);
240 | 	}
241 | }


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet/concept/ConceptDictTraverseEvent.java:
--------------------------------------------------------------------------------
  1 | package ruc.irm.similarity.word.hownet.concept;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.File;
  5 | import java.io.FileOutputStream;
  6 | import java.io.InputStream;
  7 | import java.io.InputStreamReader;
  8 | import java.io.PrintWriter;
  9 | import java.util.ArrayList;
 10 | import java.util.Arrays;
 11 | import java.util.List;
 12 | 
 13 | import javax.xml.parsers.DocumentBuilder;
 14 | import javax.xml.parsers.DocumentBuilderFactory;
 15 | import javax.xml.transform.OutputKeys;
 16 | import javax.xml.transform.Transformer;
 17 | import javax.xml.transform.TransformerFactory;
 18 | import javax.xml.transform.dom.DOMSource;
 19 | import javax.xml.transform.stream.StreamResult;
 20 | 
 21 | import org.w3c.dom.Document;
 22 | import org.w3c.dom.Element;
 23 | 
 24 | import ruc.irm.similarity.util.TraverseEvent;
 25 | 
 26 | /**
 27 |  * 实现遍历加载概念信息到概念表中, 概念词典的组织以知网导出的格式为标准，格式如下：<br/>
 28 |  * 阿斗                	N    	human|人,ProperName|专,past|昔<br/>
 29 |  * 阿爸                	N    	human|人,family|家,male|男<br/>
 30 |  * 即： &lt;概念&gt; &lt;空格或者跳格&gt; &lt;词性&gt; &lt;空格或者跳格&gt; &lt;定义&gt;"
 31 |  * <br/>
 32 |  * 概念保存到数组中，没有保存到Map中，可以降低对内存空间的使用
 33 |  * 
 34 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
 35 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
 36 |  * @deprecated
 37 |  */
 38 | public class ConceptDictTraverseEvent implements TraverseEvent<String> {
 39 | 	private List<Concept> conceptList = null;
 40 | 	
 41 | 	public ConceptDictTraverseEvent(){
 42 | 		conceptList = new ArrayList<Concept>();
 43 | 	}
 44 | 	
 45 | 	public Concept[] getConcepts(){
 46 | 		Concept[] concepts = conceptList.toArray(new Concept[conceptList.size()]);
 47 | 		Arrays.sort(concepts);
 48 | 		return concepts;
 49 | 	}
 50 | 	
 51 | 	/**
 52 | 	 * 读取概念词典中的一行，并进行解析处理
 53 | 	 */
 54 | 	public boolean visit(String line) {
 55 | 		String word = null;
 56 | 		String pos = null;
 57 | 		String define = "";
 58 | 		char ch;
 59 | 		
 60 | 		//以符号//开始的是注释行
 61 | 		if(line.startsWith("//")){
 62 | 			return true;
 63 | 		}
 64 | 		
 65 | 		int lastPosition = 0;	//最近一次处理内容的有意义的开始位置
 66 | 		int processFlag = 0;	//当前处理部分的标志 0：处理word； 1：词性；2：定义
 67 | 		//解析出一行中的概念各项数据		
 68 | 		loop: for (int position = 0; position < line.length(); position++) {
 69 | 			ch = line.charAt(position);
 70 | 			
 71 | 			if ((ch == ' ') || (ch == '\t') || (position==(line.length()-1))) {
 72 | 				String item = line.substring(lastPosition, (position==(line.length()-1))?(position+1):position);
 73 | 				switch(processFlag){				
 74 | 				case 0:
 75 | 					word = item;
 76 | 					processFlag++;
 77 | 					break;
 78 | 				case 1:
 79 | 					pos = item;
 80 | 					processFlag++;
 81 | 					break;
 82 | 				case 2:					
 83 | 					//define = item;
 84 | 					//processFlag++;
 85 | 					define = line.substring(lastPosition).trim();					
 86 | 					break loop;
 87 | 				case 3:
 88 | 					System.out.println(line);
 89 | 					break;
 90 | 				}				
 91 | 				
 92 | 				for( ;(position < line.length()); position++){
 93 | 					ch = line.charAt(position);
 94 | 					if ((ch != ' ') && (ch != '\t')) {
 95 | 						lastPosition = position;
 96 | 						break;
 97 | 					}
 98 | 				}
 99 | 					
100 | 			}
101 | 		}
102 | 		conceptList.add(new Concept(word, pos, define));
103 | 		return true;
104 | 	}
105 | 	
106 | 	public void saveToXML(File xmlFile) throws Exception{
107 | 		String conceptFile = getClass().getPackage().getName().replaceAll("\\.", "/") + "/concept.dat";
108 | 		InputStream input = this.getClass().getClassLoader().getResourceAsStream(conceptFile);
109 | 		BufferedReader in = new BufferedReader(new InputStreamReader(input,	"utf8"));
110 | 		
111 | 		DocumentBuilderFactory factory=DocumentBuilderFactory.newInstance(); 
112 | 		DocumentBuilder builder=factory.newDocumentBuilder(); 
113 | 		Document document=builder.newDocument();
114 | 		Element root=document.createElement("concepts"); 
115 | 		document.appendChild(root); 
116 | 		
117 | 		String line = null;
118 | 
119 | 		while ((line = in.readLine()) != null) {
120 | 			saveLineToXML(document, root, line);
121 | 		}
122 | 
123 | 		input.close();
124 | 		in.close();
125 | 		
126 | 		TransformerFactory tf=TransformerFactory.newInstance(); 
127 | 		Transformer transformer=tf.newTransformer(); 
128 | 		DOMSource source=new DOMSource(document); 
129 | 		transformer.setOutputProperty(OutputKeys.ENCODING,"utf8"); 
130 | 		transformer.setOutputProperty(OutputKeys.INDENT,"yes"); 
131 | 		PrintWriter pw=new PrintWriter(new FileOutputStream(xmlFile)); 
132 | 		StreamResult result=new StreamResult(pw); 
133 | 		transformer.transform(source,result); 
134 | 	}
135 | 	
136 | 	
137 | 	/**
138 | 	 * 读取概念词典中的一行，并进行解析处理
139 | 	 */
140 | 	private boolean saveLineToXML(Document document, Element root, String line) {
141 | 		String word = null;
142 | 		String pos = null;
143 | 		String define = "";
144 | 		char ch;
145 | 		
146 | 		//以符号//开始的是注释行
147 | 		if(line.startsWith("//")){
148 | 			return true;
149 | 		}
150 | 		
151 | 		int lastPosition = 0;	//最近一次处理内容的有意义的开始位置
152 | 		int processFlag = 0;	//当前处理部分的标志 0：处理word； 1：词性；2：定义
153 | 		//解析出一行中的概念各项数据		
154 | 		loop: for (int position = 0; position < line.length(); position++) {
155 | 			ch = line.charAt(position);
156 | 			
157 | 			if ((ch == ' ') || (ch == '\t') || (position==(line.length()-1))) {
158 | 				String item = line.substring(lastPosition, (position==(line.length()-1))?(position+1):position);
159 | 				switch(processFlag){				
160 | 				case 0:
161 | 					word = item;
162 | 					processFlag++;
163 | 					break;
164 | 				case 1:
165 | 					pos = item;
166 | 					processFlag++;
167 | 					break;
168 | 				case 2:					
169 | 					//define = item;
170 | 					//processFlag++;
171 | 					define = line.substring(lastPosition).trim();					
172 | 					break loop;
173 | 				case 3:
174 | 					System.out.println(line);
175 | 					break;
176 | 				}				
177 | 				
178 | 				for( ;(position < line.length()); position++){
179 | 					ch = line.charAt(position);
180 | 					if ((ch != ' ') && (ch != '\t')) {
181 | 						lastPosition = position;
182 | 						break;
183 | 					}
184 | 				}
185 | 					
186 | 			}
187 | 		}
188 | 		
189 | 		Element e = document.createElement("c");
190 | 		e.setAttribute("w", word);
191 | 		e.setAttribute("p", pos);
192 | 		e.setAttribute("d", define);
193 | 		root.appendChild(e);
194 | 		return true;
195 | 	}
196 | 	
197 | 	public static void main(String[] args) throws Exception {
198 | 	  new ConceptDictTraverseEvent().saveToXML(new File("/home/xiatian/Desktop/concept.xml"));
199 |   }
200 | 		
201 | }
202 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet/concept/ConceptLinkedList.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.word.hownet.concept;
 2 | 
 3 | import java.util.LinkedList;
 4 | 
 5 | /**
 6 |  * 用于概念处理的LinkedList
 7 |  * 
 8 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
 9 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
10 |  *
11 |  * @param <T>
12 |  * @deprecated
13 |  */
14 | @SuppressWarnings("serial")
15 | public class ConceptLinkedList extends LinkedList<Concept> {
16 | 	
17 | 	/**
18 | 	 * 删除链表中最后面的size个元素
19 | 	 * @param size
20 | 	 */
21 | 	public void removeLast(int size){
22 | 		for(int i=0;i<size;i++){
23 | 			this.removeLast();
24 | 		}
25 | 	}
26 | 	
27 | 	/**
28 | 	 * 根据概念的定义判断是否已经加入到链表中
29 | 	 * @param concept
30 | 	 */
31 | 	public void addByDefine(Concept concept){
32 | 		for(Concept c:this){
33 | 			if(c.getDefine().equals(concept.getDefine())){
34 | 				return;
35 | 			}
36 | 		}
37 | 		
38 | 		this.add(concept);
39 | 	}
40 | }
41 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet/concept/LiuConceptParser.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.word.hownet.concept;
 2 | 
 3 | import java.io.IOException;
 4 | import java.util.Collection;
 5 | 
 6 | import ruc.irm.similarity.util.BlankUtils;
 7 | import ruc.irm.similarity.word.hownet.sememe.LiuqunSememeParser;
 8 | import ruc.irm.similarity.word.hownet.sememe.SememeParser;
 9 | 
10 | 
11 | /**
12 |  * 刘群老师的相似度计算方式，对概念解析的处理方式
13 |  * 
14 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
15 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
16 |  * @deprecated
17 |  */
18 | public class LiuConceptParser extends ConceptParser{
19 | 	
20 | 	private static LiuConceptParser instance = null;
21 | 	
22 | 	public static LiuConceptParser getInstance(){
23 | 		if(instance == null){
24 | 			try {
25 | 				instance = new LiuConceptParser();
26 | 			} catch (IOException e) {
27 | 				e.printStackTrace();
28 | 			}
29 | 		}
30 | 		
31 | 		return instance;
32 | 	}
33 | 	
34 | 	private LiuConceptParser(SememeParser sememeParser) throws IOException {
35 | 		super(sememeParser);
36 | 	}
37 | 	
38 | 	private LiuConceptParser() throws IOException{
39 | 		super(new LiuqunSememeParser());
40 | 	}
41 | 
42 | 	@Override
43 | 	protected double calculate(double sim_v1, double sim_v2, double sim_v3, double sim_v4){
44 | 		return beta1 * sim_v1 
45 |         + beta2 * sim_v1 * sim_v2
46 |         + beta3 * sim_v1 * sim_v2 * sim_v3 
47 |         + beta4 * sim_v1 * sim_v2 * sim_v3 * sim_v4;		
48 | 	}
49 | 
50 | 	@Override
51 | 	public double getSimilarity(String word1, String word2) {
52 | 		double similarity = 0.0;
53 | 
54 | 		// 如果两个句子相同,则直接返回1.0
55 | 		if (word1.equals(word2)) {
56 | 			return 1.0;
57 | 		}
58 | 
59 | 		Collection<Concept> concepts1 = getConcepts(word1);
60 | 		Collection<Concept> concepts2 = getConcepts(word2);
61 | 		
62 | 		//如果是blank，则说明是未登录词, 需要计算组合概念
63 | 		if(BlankUtils.isBlank(concepts1)  || BlankUtils.isBlank(concepts2)){
64 | 			return 0.0;
65 | 		}
66 | 		
67 | 		//两个for循环分别计算词语所有可能的概念的相似度
68 | 		for(Concept c1:concepts1){
69 | 			for(Concept c2:concepts2){				
70 | 				double v = getSimilarity(c1, c2);
71 | 
72 | 				if(v>similarity){
73 | 					similarity = v;
74 | 				}
75 | 				
76 | 				if(similarity == 1.0){
77 | 					break;
78 | 				}
79 | 			}
80 | 		}		
81 | 
82 | 		return similarity;
83 | 	}
84 | 	
85 | }
86 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet/sememe/FastSimpleMap.java:
--------------------------------------------------------------------------------
  1 | package ruc.irm.similarity.word.hownet.sememe;
  2 | 
  3 | import java.io.IOException;
  4 | import java.util.ArrayList;
  5 | import java.util.Collection;
  6 | 
  7 | /**
  8 |  * 一种新的Map，跟标准的Map不同，它的的Key可以有重复, 内部采用快速排序和二分查找,
  9 |  * 保持较少的变量，结构简单，可根据主键查找返回的结果是一个数组
 10 |  * 
 11 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
 12 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
 13 |  * 
 14 |  * @param <T>
 15 |  * @param <V>
 16 |  * @deprecated
 17 |  */
 18 | public class FastSimpleMap<K extends Comparable<K>, V> {
 19 | 	private K[] keys;
 20 | 	private V[] values;
 21 | 	
 22 | 	public FastSimpleMap(K[] keys, V[] values) throws IOException{
 23 | 		if(keys.length!=values.length){
 24 | 			throw new IOException("keys length must be equals values");
 25 | 		}
 26 | 		this.keys = keys;
 27 | 		this.values = values;
 28 | 		
 29 | 		// 根据keys进行排序
 30 | 		quicksort(0, keys.length-1);
 31 | 	}
 32 | 	
 33 | 	/**
 34 | 	 * 查找键对应的值集合
 35 | 	 * @param key
 36 | 	 * @return
 37 | 	 */
 38 | 	public Collection<V> get(K key) {
 39 | 		int low = 0;
 40 | 		int high = keys.length - 1;
 41 | 		
 42 | 		Collection<V> results = new ArrayList<V>();
 43 | 
 44 | 		while (low <= high) {
 45 | 			int mid = (low + high) >> 1;
 46 | 			K item = keys[mid];
 47 | 			int cmp = key.compareTo(item);
 48 | 			
 49 | 			if (cmp > 0) {
 50 | 				low = mid + 1;
 51 | 			} else if (cmp < 0) {
 52 | 				high = mid - 1;
 53 | 			} else {				
 54 | 				// 找到起始位置，该位置前后相同的都是该主键对应的值
 55 | 				for(int i=mid;i>=0 && keys[i].equals(key); i--){
 56 | 					results.add(values[i]);
 57 | 				}				
 58 | 				for(int i=mid+1; i<keys.length && keys[i].equals(key); i++){
 59 | 					results.add(values[i]);
 60 | 				}
 61 | 				
 62 | 				break; // break while
 63 | 			}
 64 | 		}
 65 | 		
 66 | 		return results;
 67 | 	}
 68 | 	
 69 | 	/**
 70 | 	 * 根据keys快速排序，排序的同时交换values
 71 | 	 * 
 72 | 	 * @param a
 73 | 	 * @param low
 74 | 	 * @param high
 75 | 	 */
 76 | 	private void quicksort (int low, int high)
 77 | 	{
 78 | 		//low is the lower index, high is the upper index
 79 | 		//of the region of array a that is to be sorted
 80 | 	    int i=low, j=high;
 81 | 	    K h;
 82 | 	    V v;
 83 | 	    K x=keys[(low+high)>>1];
 84 | 
 85 | 	    //partition
 86 | 	    do {    
 87 | 	        while (keys[i].compareTo(x)<0) i++; 
 88 | 	        while (keys[j].compareTo(x)>0) j--;
 89 | 	        
 90 | 	        if (i<=j)
 91 | 	        {
 92 | 	            h=keys[i]; keys[i]=keys[j]; keys[j]=h;
 93 | 	            v=values[i]; values[i]=values[j]; values[j]=v;
 94 | 	            i++; j--;
 95 | 	        }
 96 | 	    } while (i<=j);
 97 | 
 98 | 	    //  recursion
 99 | 	    if (low<j) quicksort(low, j);
100 | 	    if (i<high) quicksort(i, high);
101 | 	}
102 | }
103 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet/sememe/LiuqunSememeParser.java:
--------------------------------------------------------------------------------
  1 | package ruc.irm.similarity.word.hownet.sememe;
  2 | 
  3 | import java.io.IOException;
  4 | 
  5 | /**
  6 |  * 刘群老师计算义原相似度的方法, 实现了SememeParser中定义的抽象方法
  7 |  * 
  8 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
  9 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
 10 |  * 
 11 |  * @author <a href="xiat@ruc.edu.cn">xiatian</a>
 12 |  * @version 1.0
 13 |  * @deprecated
 14 |  */
 15 | public class LiuqunSememeParser extends SememeParser {
 16 | 		
 17 | 	/** 计算义元相似度的可调节的参数，默认为1.6 */
 18 | 	private final float alpha = 1.6f;	
 19 | 	
 20 | 	public LiuqunSememeParser() throws IOException {
 21 | 		super();		
 22 | 	}
 23 | 
 24 | 	/**
 25 | 	 * 计算两个义元之间的相似度，由于义元可能相同，计算结果为其中相似度最大者 
 26 | 	 * <br/>similarity = alpha/(distance+alpha)
 27 | 	 * 
 28 | 	 * @param key1
 29 | 	 * @param key2
 30 | 	 * @return
 31 | 	 */
 32 | 	@Override
 33 | 	public double getSimilarity(String item1, String item2) {
 34 | 		int pos;
 35 | 
 36 | 		// 如果为空串，直接返回0
 37 | 		if (item1 == null || item2 == null || item1.equals("")
 38 | 				|| item2.equals(""))
 39 | 			return 0.0;
 40 | 
 41 | 		String key1 = item1.trim();
 42 | 		String key2 = item2.trim();
 43 | 
 44 | 		// 去掉()符号
 45 | 		if ((key1.charAt(0) == '(') && (key1.charAt(key1.length() - 1) == ')')) {
 46 | 			if (key2.charAt(0) == '(' && key2.charAt(key2.length() - 1) == ')') {
 47 | 				key1 = key1.substring(1, key1.length() - 1);
 48 | 				key2 = key2.substring(1, key2.length() - 1);
 49 | 			} else {
 50 | 				return 0.0;
 51 | 			}
 52 | 		}
 53 | 
 54 | 		// 处理关系义元,即x=y的情况
 55 | 		if ((pos = key1.indexOf('=')) > 0) {
 56 | 			int pos2 = key2.indexOf('=');
 57 | 			// 如果是关系义元，则判断前面部分是否相同，如果相同，则转为计算后面部分的相似度，否则为0
 58 | 			if ((pos == pos2)
 59 | 					&& key1.substring(0, pos).equals(key2.substring(0, pos2))) {
 60 | 				key1 = key1.substring(pos + 1);
 61 | 				key2 = key2.substring(pos2 + 1);
 62 | 			} else {
 63 | 				return 0.0;
 64 | 			}
 65 | 		}
 66 | 
 67 | 		// 处理符号义元,即前面有特殊符号的义元
 68 | 		String symbol1 = key1.substring(0, 1);
 69 | 		String symbol2 = key2.substring(0, 1);
 70 | 
 71 | 		for (int i = 0; i < Symbol_Descriptions.length; i++) {
 72 | 			if (symbol1.equals(Symbol_Descriptions[i][0])) {
 73 | 				if (symbol1.equals(symbol2)) {
 74 | 					key1 = item1.substring(1);
 75 | 					key2 = item2.substring(1);
 76 | 					break;
 77 | 				} else {
 78 | 					return 0.0; // 如果不是同一关系符号，则相似度直接返回0
 79 | 				}
 80 | 			}
 81 | 		}
 82 | 
 83 | 		if ((pos = key1.indexOf("|")) >= 0) {
 84 | 			key1 = key1.substring(pos + 1);
 85 | 		}
 86 | 		if ((pos = key2.indexOf("|")) >= 0) {
 87 | 			key2 = key2.substring(pos + 1);
 88 | 		}
 89 | 
 90 | 		int distance = getDistance(key1, key2);
 91 | 		if (distance < 0)
 92 | 			return 0.0;
 93 | 		else
 94 | 			return alpha / (distance + alpha);
 95 | 	}
 96 | 
 97 | 	@Override
 98 | 	public double getSimilarity(Sememe sem1, Sememe sem2) {
 99 | 		int distance = getDistance(sem1, sem2);
100 | 		if (distance <= 0)
101 | 			return 0.0f;
102 | 		else
103 | 			return alpha / (distance + alpha);
104 | 	}
105 | }
106 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet/sememe/MySememeParser.java:
--------------------------------------------------------------------------------
  1 | package ruc.irm.similarity.word.hownet.sememe;
  2 | 
  3 | import java.io.IOException;
  4 | 
  5 | import ruc.irm.similarity.util.BlankUtils;
  6 | 
  7 | 
  8 | /**
  9 |  * 义原相似度计算, 实现了SememeParser中定义的抽象方法
 10 |  * 
 11 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
 12 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
 13 |  * @deprecated 
 14 |  */
 15 | public class MySememeParser extends SememeParser {
 16 | 	
 17 | 	public MySememeParser() throws IOException{
 18 | 		super();
 19 | 	}
 20 | 	
 21 | 	/**
 22 | 	 * 计算两个义原的相似度	 
 23 | 	 */
 24 | 	@Override
 25 | 	public double getSimilarity(final Sememe sememe1, final Sememe sememe2) {		
 26 | 		Sememe sem1 = sememe1;
 27 | 		Sememe sem2 = sememe2;		
 28 | 
 29 | 		if (sememe1 == null || sememe2 == null){
 30 | 			return 0.0f;
 31 | 		}else if(sememe1.getId() == sememe2.getId()){
 32 | 			return 1.0f;
 33 | 		}
 34 | 		
 35 | 		//变为深度相同，然后一次上找共同的父节点
 36 | 		int level = sememe1.getDepth() - sememe2.getDepth();		
 37 | 		for (int i = 0; i < ((level < 0) ? level * -1 : level); i++) {
 38 | 			if (level > 0){
 39 | 				sem1 = SEMEMES[sem1.getParentId()];
 40 | 			}else{
 41 | 				sem2 = SEMEMES[sem2.getParentId()];
 42 | 			}
 43 | 		}
 44 | 		
 45 | 		while(sem1.getId() != sem2.getId()){
 46 | 			// 如果有一个已经到达根节点，仍然不同，则返回0
 47 | 			if (sem1.getId() == sem1.getParentId()
 48 | 					|| sem2.getId() == sem2.getParentId()) {
 49 | 				return 0.0f;
 50 | 			}
 51 | 			
 52 | 			sem1 = SEMEMES[sem1.getParentId()];
 53 | 			sem2 = SEMEMES[sem2.getParentId()];
 54 | 		}
 55 | 		
 56 | 		return sem1.getDepth()*2.0f/(sememe1.getDepth() + sememe2.getDepth());
 57 | 	}
 58 | 
 59 | 	/**
 60 | 	 * 计算两个义元之间的相似度，由于义元可能相同，计算结果为其中相似度最大者 similarity = alpha/(distance+alpha),
 61 | 	 * 如果两个字符串相同或都为空，直接返回1.0
 62 | 	 * 
 63 | 	 * @param key1 第一个义原字符串
 64 | 	 * @param key2 第二个义原字符串
 65 | 	 * @return
 66 | 	 */
 67 | 	@Override
 68 | 	public double getSimilarity(String item1, String item2) {	
 69 | 		if(BlankUtils.isBlankAll(item2, item2)){
 70 | 			return 1.0;
 71 | 		} else if(BlankUtils.isBlankAtLeastOne(item1, item2)){
 72 | 			return 0.0;
 73 | 		} else if(item1.equals(item2)){
 74 | 			return 1.0;
 75 | 		}		
 76 | 
 77 | 		String key1 = item1.trim();
 78 | 		String key2 = item2.trim();
 79 | 
 80 | 		// 去掉()符号
 81 | 		if ((key1.charAt(0) == '(') && (key1.charAt(key1.length() - 1) == ')')) {
 82 | 			
 83 | 			if (key2.charAt(0) == '(' && key2.charAt(key2.length() - 1) == ')') {
 84 | 				key1 = key1.substring(1, key1.length() - 1);
 85 | 				key2 = key2.substring(1, key2.length() - 1);
 86 | 			} else {
 87 | 				return 0.0;
 88 | 			}
 89 | 			
 90 | 		}
 91 | 
 92 | 		// 处理关系义元,即x=y的情况
 93 | 		int pos = key1.indexOf('=');
 94 | 		if (pos > 0) {
 95 | 			int pos2 = key2.indexOf('=');
 96 | 			// 如果是关系义元，则判断前面部分是否相同，如果相同，则转为计算后面部分的相似度，否则为0
 97 | 			if ((pos == pos2)
 98 | 					&& key1.substring(0, pos).equals(key2.substring(0, pos2))) {
 99 | 				key1 = key1.substring(pos + 1);
100 | 				key2 = key2.substring(pos2 + 1);
101 | 			} else {
102 | 				return 0.0;
103 | 			}
104 | 		}
105 | 
106 | 		// 处理符号义元,即前面有特殊符号的义元
107 | 		String symbol1 = key1.substring(0, 1);
108 | 		String symbol2 = key2.substring(0, 1);
109 | 
110 | 		for (int i = 0; i < Symbol_Descriptions.length; i++) {
111 | 			if (symbol1.equals(Symbol_Descriptions[i][0])) {
112 | 				if (symbol1.equals(symbol2)) {
113 | 					key1 = item1.substring(1);
114 | 					key2 = item2.substring(1);
115 | 					break;
116 | 				} else {
117 | 					return 0.0; // 如果不是同一关系符号，则相似度直接返回0
118 | 				}
119 | 			}
120 | 		}
121 | 
122 | 		if ((pos = key1.indexOf("|")) >= 0) {
123 | 			key1 = key1.substring(pos + 1);
124 | 		}
125 | 		if ((pos = key2.indexOf("|")) >= 0) {
126 | 			key2 = key2.substring(pos + 1);
127 | 		}
128 | 
129 | 		// 如果两个字符串相等，直接返回距离为0
130 | 		if (key1.equals(key2)) {
131 | 			return 1.0;
132 | 		}
133 | 		
134 | 		Integer[] myset1 = getSememes(key1);
135 | 		Integer[] myset2 = getSememes(key2);
136 | 		
137 | 		double similarity = 0.0;
138 | 		for(int id1:myset1){
139 | 			for(int id2:myset2){
140 | 				double s = getSimilarity(SEMEMES[id1], SEMEMES[id2]);
141 | 				if(s>similarity){
142 | 					similarity = s;
143 | 				}
144 | 			}
145 | 		}
146 | 		
147 | 		return similarity;
148 | 	}
149 | 
150 | 	
151 | }


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet/sememe/Sememe.java:
--------------------------------------------------------------------------------
  1 | package ruc.irm.similarity.word.hownet.sememe;
  2 | 
  3 | /**
  4 |  * 描述知网义原的基本对象, 出于性能考虑，把未用到的英文名称、定义等在加载时忽略, 更准确的做法是以[英文定义|中文定义]
  5 |  * 作为一个整理进行处理，不过绝大多数只根据中文定义就可以标识出来，因此忽略不计。
  6 |  * 
  7 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
  8 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
  9 |  * @deprecated
 10 |  */
 11 | public class Sememe {
 12 | 	/** 义原编号 */
 13 | 	private int id;
 14 | 	/** 指向上位义元号 */
 15 | 	private int parentId;
 16 | 	/** 义原在义原树中的深度 */
 17 | 	private int depth;
 18 | 	/** 义原的中文名称*/
 19 | 	private String cnWord;
 20 | 	/** 义原的英文名称 */
 21 | 	private String enWord;
 22 | 	/** 义原的定义，如果没有(例如数量)，则为空串 */
 23 | 	private String define;
 24 | 	/** 义原的类型 */
 25 | 	private int type;	
 26 | 
 27 | 	/**
 28 | 	 * 每一行的形式为：be|是 {relevant,isa}/{relevant,descriptive} 
 29 | 	 * <br/>或者 official|官 [#organization|组织,#employee|员] 
 30 | 	 * <br/>或者 amount|多少 
 31 | 	 * <br/>把相应的部分赋予不同的属性
 32 | 	 * 出于性能考虑，把未用到的英文名称、定义等忽略
 33 | 	 * @param id
 34 | 	 * @param parentId
 35 | 	 * @param item 读取文件中的一行
 36 | 	 */
 37 | 	public Sememe(int id, int parentId, int depth, String item) {
 38 | 		this.id = id;
 39 | 		this.parentId = parentId;
 40 | 		this.depth = depth;
 41 | 		
 42 | 		int pos = item.indexOf('|');
 43 | 		if (pos < 0) {
 44 | 			this.cnWord = item;
 45 | 			this.enWord = item;
 46 | 		} else {
 47 | 			this.enWord = item.substring(0, pos);
 48 | 
 49 | 			// 去掉"|"符号
 50 | 			String nextPart = item.substring(pos + 1);
 51 | 			pos = nextPart.indexOf(' ');
 52 | 			if (pos <= 0) {
 53 | 				this.cnWord = nextPart;
 54 | 			} else {
 55 | 				this.cnWord = nextPart.substring(0, pos);
 56 | 				this.define = nextPart.substring(pos).trim();
 57 | 			}
 58 | 		}
 59 | 	}
 60 | 
 61 | 	public int getId() {
 62 | 		return id;
 63 | 	}
 64 | 
 65 | 	public void setId(int id) {
 66 | 		this.id = id;
 67 | 	}
 68 | 
 69 | 	public int getParentId() {
 70 | 		return parentId;
 71 | 	}
 72 | 
 73 | 	public void setParentId(int parentId) {
 74 | 		this.parentId = parentId;
 75 | 	}
 76 | 	
 77 | 	public int getDepth() {
 78 | 		return depth;
 79 | 	}
 80 | 
 81 | 	public void setDepth(int depth) {
 82 | 		this.depth = depth;
 83 | 	}
 84 | 
 85 | 	public String getCnWord() {
 86 | 		return cnWord;
 87 | 	}
 88 | 
 89 | 	public void setCnWord(String cnWord) {
 90 | 		this.cnWord = cnWord;
 91 | 	}
 92 | 
 93 | 	public String getEnWord() {
 94 | 		return enWord;
 95 | 	}
 96 | 
 97 | 	public void setEnWord(String enWord) {
 98 | 		this.enWord = enWord;
 99 | 	}
100 | 
101 | 	public String getDefine() {
102 | 		return define;
103 | 	}
104 | 
105 | 	public void setDefine(String define) {
106 | 		this.define = define;
107 | 	}
108 | 
109 | 	public int getType() {
110 | 		return type;
111 | 	}
112 | 
113 | 	public void setType(int type) {
114 | 		this.type = type;
115 | 	}
116 | 	
117 | 	@Override
118 | 	public String toString(){
119 | 		StringBuilder sb = new StringBuilder();
120 | 		sb.append("id=");
121 | 		sb.append(id);
122 | 		sb.append("; parentId=");
123 | 		sb.append(parentId);
124 | 		sb.append("; depth=");
125 | 		sb.append(depth);
126 | 		sb.append("; cnWord=");
127 | 		sb.append(cnWord);
128 | 		sb.append("; enWord=");
129 | 		sb.append(enWord);
130 | 		sb.append("; define=");
131 | 		sb.append(define);
132 | 		return sb.toString();
133 | 	}
134 | 
135 | }
136 | 
137 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet/sememe/SememeParser.java:
--------------------------------------------------------------------------------
  1 | package ruc.irm.similarity.word.hownet.sememe;
  2 | 
  3 | import java.io.IOException;
  4 | import java.io.InputStream;
  5 | import java.util.Collection;
  6 | 
  7 | import org.slf4j.Logger;
  8 | import org.slf4j.LoggerFactory;
  9 | import ruc.irm.similarity.Similaritable;
 10 | import ruc.irm.similarity.util.BlankUtils;
 11 | import ruc.irm.similarity.util.FileUtils;
 12 | import ruc.irm.similarity.word.hownet.HownetMeta;
 13 | 
 14 | /**
 15 |  * 义原解析器, 包括义元数据的加载，义元的组织、索引、查询 以及义元的距离计算和相似度计算等.
 16 |  * 算法的核心思想请参看论文《汉语词语语义相似度计算研究》
 17 |  * 
 18 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
 19 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
 20 |  * 
 21 |  * @see ruc.irm.similarity.Similaritable
 22 |  * @deprecated
 23 |  */
 24 | public abstract class SememeParser implements HownetMeta, Similaritable {
 25 | 	protected Logger LOG = LoggerFactory.getLogger(this.getClass());
 26 | 	
 27 | 	/** 所有的义原都存放到一个数组之中，并且义元的ID号与数组的下标相同 */
 28 | 	protected Sememe[] SEMEMES;
 29 | 
 30 | 	/** 通过对义原的汉语词义进行索引，根据该索引快速定位义原，找出义原的id，再到sememes中查找 */
 31 | 	private FastSimpleMap<String, Integer> sememeMap = null;		
 32 | 	
 33 | 	public SememeParser() throws IOException{
 34 | 		String sememeFile = getClass().getPackage().getName().replaceAll("\\.", "/") + "/sememe.dat";
 35 | 		
 36 | 		InputStream input = this.getClass().getClassLoader().getResourceAsStream(sememeFile);
 37 | 		load(input, "UTF-8");
 38 | 	}
 39 | 	
 40 | 	/**
 41 | 	 * 获取两个义原描述串的相似度
 42 | 	 * @param sememeName1
 43 | 	 * @param sememeName2
 44 | 	 * @see ke.commons.similarity.Similariable
 45 | 	 * @return
 46 | 	 */
 47 | 	public abstract double getSimilarity(String sememeName1, String sememeName2);
 48 | 	
 49 | 	/**
 50 | 	 * 获取两个确定义原的相似度
 51 | 	 * @param sememe1
 52 | 	 * @param sememe2
 53 | 	 * @return
 54 | 	 */
 55 | 	public abstract double getSimilarity(Sememe sememe1, Sememe sememe2);
 56 | 	
 57 | 	/**
 58 | 	 * 从文件中加载义元知识
 59 | 	 * 
 60 | 	 * @throws IOException
 61 | 	 */
 62 | 	public void load(InputStream input, String encoding) throws IOException {	
 63 | 		SememeDictTraverseEvent event = new SememeDictTraverseEvent();
 64 | 		LOG.info("loading sememe dictionary...");
 65 | 		long time = System.currentTimeMillis();
 66 | 		FileUtils.traverseLines(input, encoding, event);
 67 | 		this.SEMEMES = event.getSememes();	
 68 | 		
 69 | 		String[] keys = new String[SEMEMES.length];
 70 | 		Integer[] values = new Integer[SEMEMES.length];
 71 | 
 72 | 	    //设置索引
 73 | 	    for(int i=0; i<SEMEMES.length; i++){
 74 | 	    	keys[i] = SEMEMES[i].getCnWord();
 75 | 	    	values[i] = SEMEMES[i].getId();
 76 | 	    }
 77 | 	    sememeMap = new FastSimpleMap<String, Integer>(keys, values);
 78 | 	    
 79 | 	    time = System.currentTimeMillis() - time;
 80 | 	    LOG.info("sememe dictionary load completely. time elapsed: " + time);
 81 | 	}
 82 | 
 83 | 	/**
 84 | 	 * 根据汉语定义计算义元之间的距离,Integer.MAX_VALUE代表两个义元之间的距离为无穷大， 
 85 | 	 * <br/>由于可能多个义元有相同的汉语词语，故计算结果为其中距离最小者
 86 | 	 * 
 87 | 	 * @param key1
 88 | 	 * @param key2
 89 | 	 * @return
 90 | 	 */
 91 | 	public int getDistance(String key1, String key2) {
 92 | 		int distance = Integer.MAX_VALUE;
 93 | 
 94 | 		// 如果两个字符串相等，直接返回距离为0
 95 | 		if (key1.equals(key2)) {
 96 | 			return 0;
 97 | 		}
 98 | 
 99 | 		Integer[] semArray1 = getSememes(key1);
100 | 		Integer[] semArray2 = getSememes(key2);
101 | 		
102 | 		// 如果key1或者key2不是义元，并且key1<>key2,则返回无穷大
103 | 		if (semArray1.length == 0 || semArray2.length == 0) {
104 | 			return Integer.MAX_VALUE;
105 | 		}
106 | 
107 | 		for(int i:semArray1){
108 | 			for(int j:semArray2){
109 | 				int d = getDistance(SEMEMES[i], SEMEMES[j]);
110 | 				if(d<distance){
111 | 					distance = d;
112 | 				}
113 | 			}
114 | 		}
115 | 		
116 | 		return distance;
117 | 	}
118 | 
119 | 	/**
120 | 	 * 获取两个义元在义原树中的距离
121 | 	 * 
122 | 	 * @param sem1
123 | 	 *            第一个义原
124 | 	 * @param sem2
125 | 	 *            第二个义原
126 | 	 * @return 两个义原的距离
127 | 	 */
128 | 	public int getDistance(Sememe sem1, Sememe sem2) {
129 | 		Sememe mysem1 = sem1;
130 | 		Sememe mysem2 = sem2;
131 | 		int distance = 0;
132 | 
133 | 		if (mysem1 == null || mysem2 == null)
134 | 			return Integer.MAX_VALUE;
135 | 		
136 | 		//变为深度相同，然后一次上找共同的父节点
137 | 		int level = mysem1.getDepth() - mysem2.getDepth();
138 | 		for (int i = 0; i < ((level < 0) ? level * -1 : level); i++) {
139 | 			if (level > 0)
140 | 				mysem1 = SEMEMES[mysem1.getParentId()];
141 | 			else
142 | 				mysem2 = SEMEMES[mysem2.getParentId()];
143 | 			distance++;
144 | 		}
145 | 
146 | 		//从不同的分支（深度相同）同时向上寻找共同的祖先节点
147 | 		while (mysem1.getId() != mysem2.getId()) {
148 | 			// 如果已经到达根节点，仍然不同，则返回无穷大(-1)
149 | 			if (mysem1.getId() == mysem1.getParentId()
150 | 					|| mysem2.getId() == mysem2.getParentId()) {
151 | 				distance = Integer.MAX_VALUE;
152 | 				break;
153 | 			}
154 | 
155 | 			mysem1 = SEMEMES[mysem1.getParentId()];
156 | 			mysem2 = SEMEMES[mysem2.getParentId()];
157 | 			distance += 2;
158 | 		}
159 | 
160 | 		return distance;
161 | 	}
162 | 
163 | 	/**
164 | 	 * 获取从该义元到根节点的路径表示字符串
165 | 	 * 
166 | 	 * @param key
167 | 	 * @return
168 | 	 */
169 | 	public String getPath(String key) {
170 | 		StringBuilder path = new StringBuilder();
171 | 		
172 | 		Sememe sem = getSememe(key);
173 | 		while (sem != null && sem.getId() != sem.getParentId()) {
174 | 			path.insert(0, "->" + sem.getCnWord());
175 | 			sem = SEMEMES[sem.getParentId()];
176 | 		}
177 | 		
178 | 		if (sem != null){
179 | 			path.insert(0, "->" + sem.getCnWord());
180 | 		}			
181 | 		path.insert(0, "START");
182 | 		return path.toString();
183 | 	}
184 | 
185 | 	/**
186 | 	 * 根据义原的名字，获取该义原的位置信息，义原体系中有时会有一个名字对应多个义原，一并返回到
187 | 	 * 义原数组中
188 | 	 * @param sememeName
189 | 	 * @return
190 | 	 */
191 | 	public Integer[] getSememes(String sememeName) {
192 | 		Collection<Integer> ids = sememeMap.get(sememeName);
193 | 
194 | 		return ids.toArray(new Integer[ids.size()]);
195 | 	}	
196 | 	
197 | 	/**
198 | 	 * 获取其中的一个义原，大部分义原就只有一个
199 | 	 * @param sememeName
200 | 	 * @return
201 | 	 */
202 | 	public Sememe getSememe(String sememeName){
203 | 		Integer[] ids = getSememes(sememeName);
204 | 		
205 | 		if(BlankUtils.isBlank(ids)){
206 | 			return null;
207 | 		}else{
208 | 			return SEMEMES[ids[0]];
209 | 		}
210 | 	}
211 | 	
212 | 	/**
213 | 	 * 过滤义原字符串，去掉其中的英文部分
214 | 	 * @param sememeString
215 | 	 * @return
216 | 	 */
217 | 	protected String filterSememeString(String sememeString){
218 | 		int pos = sememeString.indexOf("|");
219 | 		if (pos >= 0) {
220 | 			sememeString = sememeString.substring(pos + 1);
221 | 		}
222 | 		return sememeString;
223 | 	}
224 | 	
225 | }
226 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet/sememe/SememeType.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.word.hownet.sememe;
 2 | 
 3 | /**
 4 |  * 义原的类型定义<br/>
 5 |  * <ul>
 6 |  * <li>1：Event|事件</li>
 7 |  * <li>2：Entity|实体 </li>
 8 |  * <li>3:Attribute|属性 </li>
 9 |  * <li>4：Quantity|数量</li>
10 |  * <li>5：aValue|属性值</li>
11 |  * <li>6：qValue|数量值</li>
12 |  * <li>7: Secondary Feature|第二特征</li>
13 |  * <li>8: Syntax|语法</li>
14 |  * <li>9: EventRole|动态角色</li>
15 |  * <li>10:EventFeatures|动态属性</li>
16 |  * <li>0：未知</li>
17 |  * </ul>
18 |  * 
19 |  * 其中1~7为基本义元，8为语法义元，9、10为关系义元<br/>
20 |  * 
21 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
22 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
23 |  * @deprecated
24 |  */
25 | public interface SememeType {	 
26 | 	  /** Event|事件类型定义 */
27 | 	  public static final int Event = 1;
28 | 	  
29 | 	  /** Entity|实体类型定义*/
30 | 	  public static final int Entity = 2;
31 | 	  
32 | 	  /** Attribute|属性类型定义*/
33 | 	  public static final int Attribute = 3;
34 | 
35 | 	  /** Quantity|数量类型定义*/
36 | 	  public static final int Quantity = 4;
37 | 
38 | 	  /** aValue|属性值类型定义*/
39 | 	  public static final int AValue = 5;
40 | 
41 | 	  /** qValue|数量值类型定义*/
42 | 	  public static final int QValue = 6;
43 | 	  
44 | 	  /** Secondary Feature|第二特征类型定义*/
45 | 	  public static final int SecondaryFeature = 7;
46 | 	  
47 | 	  /** Syntax|语法类型定义*/
48 | 	  public static final int Syntax = 8;
49 | 	  
50 | 	  /** EventRole|动态角色类型定义*/
51 | 	  public static final int EventRoleAndFeature = 9;
52 | 	  
53 | 	  /** EventFeatures|动态属性类型定义*/
54 | 	  public static final int EventFeature = 10;
55 | 	  
56 | 	  /** 未知类型定义*/
57 | 	  public static final int Unknown = 0;
58 | 	  
59 | }
60 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet2/concept/ConceptDictTraverseEvent.java:
--------------------------------------------------------------------------------
  1 | package ruc.irm.similarity.word.hownet2.concept;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.File;
  5 | import java.io.FileOutputStream;
  6 | import java.io.InputStream;
  7 | import java.io.InputStreamReader;
  8 | import java.io.PrintWriter;
  9 | import java.util.ArrayList;
 10 | import java.util.Arrays;
 11 | import java.util.List;
 12 | 
 13 | import javax.xml.parsers.DocumentBuilder;
 14 | import javax.xml.parsers.DocumentBuilderFactory;
 15 | import javax.xml.transform.OutputKeys;
 16 | import javax.xml.transform.Transformer;
 17 | import javax.xml.transform.TransformerFactory;
 18 | import javax.xml.transform.dom.DOMSource;
 19 | import javax.xml.transform.stream.StreamResult;
 20 | 
 21 | import org.w3c.dom.Document;
 22 | import org.w3c.dom.Element;
 23 | 
 24 | import ruc.irm.similarity.util.TraverseEvent;
 25 | import ruc.irm.similarity.word.hownet2.concept.Concept;
 26 | 
 27 | 
 28 | /**
 29 |  * 实现遍历加载概念信息到概念表中, 概念词典的组织以知网导出的格式为标准，格式如下：<br/>
 30 |  * 阿斗                	N    	human|人,ProperName|专,past|昔<br/>
 31 |  * 阿爸                	N    	human|人,family|家,male|男<br/>
 32 |  * 即： &lt;概念&gt; &lt;空格或者跳格&gt; &lt;词性&gt; &lt;空格或者跳格&gt; &lt;定义&gt;"
 33 |  * <br/>
 34 |  * 概念保存到数组中，没有保存到Map中，可以降低对内存空间的使用
 35 |  * 
 36 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
 37 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
 38 |  */
 39 | public class ConceptDictTraverseEvent implements TraverseEvent<String> {
 40 | 	private List<Concept> conceptList = null;
 41 | 	
 42 | 	public ConceptDictTraverseEvent(){
 43 | 		conceptList = new ArrayList<Concept>();
 44 | 	}
 45 | 	
 46 | 	public Concept[] getConcepts(){
 47 | 		Concept[] concepts = conceptList.toArray(new Concept[conceptList.size()]);
 48 | 		Arrays.sort(concepts);
 49 | 		return concepts;
 50 | 	}
 51 | 	
 52 | 	/**
 53 | 	 * 读取概念词典中的一行，并进行解析处理
 54 | 	 */
 55 | 	public boolean visit(String line) {
 56 | 		String word = null;
 57 | 		String pos = null;
 58 | 		String define = "";
 59 | 		char ch;
 60 | 		
 61 | 		//以符号//开始的是注释行
 62 | 		if(line.startsWith("//")){
 63 | 			return true;
 64 | 		}
 65 | 		
 66 | 		int lastPosition = 0;	//最近一次处理内容的有意义的开始位置
 67 | 		int processFlag = 0;	//当前处理部分的标志 0：处理word； 1：词性；2：定义
 68 | 		//解析出一行中的概念各项数据		
 69 | 		loop: for (int position = 0; position < line.length(); position++) {
 70 | 			ch = line.charAt(position);
 71 | 			
 72 | 			if ((ch == ' ') || (ch == '\t') || (position==(line.length()-1))) {
 73 | 				String item = line.substring(lastPosition, (position==(line.length()-1))?(position+1):position);
 74 | 				switch(processFlag){				
 75 | 				case 0:
 76 | 					word = item;
 77 | 					processFlag++;
 78 | 					break;
 79 | 				case 1:
 80 | 					pos = item;
 81 | 					processFlag++;
 82 | 					break;
 83 | 				case 2:					
 84 | 					//define = item;
 85 | 					//processFlag++;
 86 | 					define = line.substring(lastPosition).trim();					
 87 | 					break loop;
 88 | 				case 3:
 89 | 					System.out.println(line);
 90 | 					break;
 91 | 				}				
 92 | 				
 93 | 				for( ;(position < line.length()); position++){
 94 | 					ch = line.charAt(position);
 95 | 					if ((ch != ' ') && (ch != '\t')) {
 96 | 						lastPosition = position;
 97 | 						break;
 98 | 					}
 99 | 				}
100 | 					
101 | 			}
102 | 		}
103 | 		conceptList.add(new Concept(word, pos, define));
104 | 		return true;
105 | 	}
106 | 	
107 | 	public void saveToXML(File xmlFile) throws Exception{
108 | 		String conceptFile = getClass().getPackage().getName().replaceAll("\\.", "/") + "/concept.dat";
109 | 		InputStream input = this.getClass().getClassLoader().getResourceAsStream(conceptFile);
110 | 		BufferedReader in = new BufferedReader(new InputStreamReader(input,	"utf8"));
111 | 		
112 | 		DocumentBuilderFactory factory=DocumentBuilderFactory.newInstance(); 
113 | 		DocumentBuilder builder=factory.newDocumentBuilder(); 
114 | 		Document document=builder.newDocument();
115 | 		Element root=document.createElement("concepts"); 
116 | 		document.appendChild(root); 
117 | 		
118 | 		String line = null;
119 | 
120 | 		while ((line = in.readLine()) != null) {
121 | 			saveLineToXML(document, root, line);
122 | 		}
123 | 
124 | 		input.close();
125 | 		in.close();
126 | 		
127 | 		TransformerFactory tf=TransformerFactory.newInstance(); 
128 | 		Transformer transformer=tf.newTransformer(); 
129 | 		DOMSource source=new DOMSource(document); 
130 | 		transformer.setOutputProperty(OutputKeys.ENCODING,"utf8"); 
131 | 		transformer.setOutputProperty(OutputKeys.INDENT,"yes"); 
132 | 		PrintWriter pw=new PrintWriter(new FileOutputStream(xmlFile)); 
133 | 		StreamResult result=new StreamResult(pw); 
134 | 		transformer.transform(source,result); 
135 | 	}
136 | 	
137 | 	
138 | 	/**
139 | 	 * 读取概念词典中的一行，并进行解析处理
140 | 	 */
141 | 	private boolean saveLineToXML(Document document, Element root, String line) {
142 | 		String word = null;
143 | 		String pos = null;
144 | 		String define = "";
145 | 		char ch;
146 | 		
147 | 		//以符号//开始的是注释行
148 | 		if(line.startsWith("//")){
149 | 			return true;
150 | 		}
151 | 		
152 | 		int lastPosition = 0;	//最近一次处理内容的有意义的开始位置
153 | 		int processFlag = 0;	//当前处理部分的标志 0：处理word； 1：词性；2：定义
154 | 		//解析出一行中的概念各项数据		
155 | 		loop: for (int position = 0; position < line.length(); position++) {
156 | 			ch = line.charAt(position);
157 | 			
158 | 			if ((ch == ' ') || (ch == '\t') || (position==(line.length()-1))) {
159 | 				String item = line.substring(lastPosition, (position==(line.length()-1))?(position+1):position);
160 | 				switch(processFlag){				
161 | 				case 0:
162 | 					word = item;
163 | 					processFlag++;
164 | 					break;
165 | 				case 1:
166 | 					pos = item;
167 | 					processFlag++;
168 | 					break;
169 | 				case 2:					
170 | 					//define = item;
171 | 					//processFlag++;
172 | 					define = line.substring(lastPosition).trim();					
173 | 					break loop;
174 | 				case 3:
175 | 					System.out.println(line);
176 | 					break;
177 | 				}				
178 | 				
179 | 				for( ;(position < line.length()); position++){
180 | 					ch = line.charAt(position);
181 | 					if ((ch != ' ') && (ch != '\t')) {
182 | 						lastPosition = position;
183 | 						break;
184 | 					}
185 | 				}
186 | 					
187 | 			}
188 | 		}
189 | 		
190 | 		Element e = document.createElement("c");
191 | 		e.setAttribute("w", word);
192 | 		e.setAttribute("p", pos);
193 | 		e.setAttribute("d", define);
194 | 		root.appendChild(e);
195 | 		return true;
196 | 	}
197 | 	
198 | 	public static void main(String[] args) throws Exception {
199 | 	  new ConceptDictTraverseEvent().saveToXML(new File("/home/xiatian/Desktop/concept.xml"));
200 |   }
201 | 		
202 | }
203 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet2/concept/ConceptLinkedList.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.word.hownet2.concept;
 2 | 
 3 | import java.util.LinkedList;
 4 | 
 5 | /**
 6 |  * 用于概念处理的LinkedList
 7 |  * 
 8 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
 9 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
10 |  *
11 |  * @param <T>
12 |  */
13 | @SuppressWarnings("serial")
14 | public class ConceptLinkedList extends LinkedList<Concept> {
15 | 	
16 | 	/**
17 | 	 * 删除链表中最后面的size个元素
18 | 	 * @param size
19 | 	 */
20 | 	public void removeLast(int size){
21 | 		for(int i=0;i<size;i++){
22 | 			this.removeLast();
23 | 		}
24 | 	}
25 | 	
26 | 	/**
27 | 	 * 根据概念的定义判断是否已经加入到链表中
28 | 	 * @param concept
29 | 	 */
30 | 	public void addByDefine(Concept concept){
31 | 		for(Concept c:this){
32 | 			if(c.getDefine().equals(concept.getDefine())){
33 | 				return;
34 | 			}
35 | 		}
36 | 		
37 | 		this.add(concept);
38 | 	}
39 | }
40 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet2/concept/LiuConceptParser.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.word.hownet2.concept;
 2 | 
 3 | import java.io.IOException;
 4 | import java.util.Collection;
 5 | 
 6 | import ruc.irm.similarity.util.BlankUtils;
 7 | import ruc.irm.similarity.word.hownet2.sememe.LiuqunSememeParser;
 8 | import ruc.irm.similarity.word.hownet2.sememe.BaseSememeParser;
 9 | 
10 | 
11 | /**
12 |  * 刘群老师的相似度计算方式，对概念解析的处理方式
13 |  * 
14 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
15 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
16 |  */
17 | public class LiuConceptParser extends BaseConceptParser{
18 | 	
19 | 	private static LiuConceptParser instance = null;
20 | 	
21 | 	public static LiuConceptParser getInstance(){
22 | 		if(instance == null){
23 | 			try {
24 | 				instance = new LiuConceptParser();
25 | 			} catch (IOException e) {
26 | 				e.printStackTrace();
27 | 			}
28 | 		}
29 | 		
30 | 		return instance;
31 | 	}
32 | 	
33 | 	private LiuConceptParser(BaseSememeParser sememeParser) throws IOException {
34 | 		super(sememeParser);
35 | 	}
36 | 	
37 | 	private LiuConceptParser() throws IOException{
38 | 		super(new LiuqunSememeParser());
39 | 	}
40 | 
41 | 	@Override
42 | 	protected double calculate(double sim_v1, double sim_v2, double sim_v3, double sim_v4){
43 | 		return beta1 * sim_v1 
44 |         + beta2 * sim_v1 * sim_v2
45 |         + beta3 * sim_v1 * sim_v2 * sim_v3 
46 |         + beta4 * sim_v1 * sim_v2 * sim_v3 * sim_v4;		
47 | 	}
48 | 
49 | 	@Override
50 | 	public double getSimilarity(String word1, String word2) {
51 | 		double similarity = 0.0;
52 | 
53 | 		// 如果两个句子相同,则直接返回1.0
54 | 		if (word1.equals(word2)) {
55 | 			return 1.0;
56 | 		}
57 | 
58 | 		Collection<Concept> concepts1 = getConcepts(word1);
59 | 		Collection<Concept> concepts2 = getConcepts(word2);
60 | 		
61 | 		//如果是blank，则说明是未登录词, 需要计算组合概念
62 | 		if(BlankUtils.isBlank(concepts1)  || BlankUtils.isBlank(concepts2)){
63 | 			return 0.0;
64 | 		}
65 | 		
66 | 		//两个for循环分别计算词语所有可能的概念的相似度
67 | 		for(Concept c1:concepts1){
68 | 			for(Concept c2:concepts2){				
69 | 				double v = getSimilarity(c1, c2);
70 | 
71 | 				if(v>similarity){
72 | 					similarity = v;
73 | 				}
74 | 				
75 | 				if(similarity == 1.0){
76 | 					break;
77 | 				}
78 | 			}
79 | 		}		
80 | 
81 | 		return similarity;
82 | 	}
83 | 	
84 | }
85 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet2/sememe/BaseSememeParser.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.word.hownet2.sememe;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.InputStream;
 5 | import java.util.zip.GZIPInputStream;
 6 | 
 7 | import javax.xml.namespace.QName;
 8 | import javax.xml.stream.XMLEventReader;
 9 | import javax.xml.stream.XMLInputFactory;
10 | import javax.xml.stream.events.StartElement;
11 | import javax.xml.stream.events.XMLEvent;
12 | 
13 | import org.slf4j.Logger;
14 | import org.slf4j.LoggerFactory;
15 | import ruc.irm.similarity.Similaritable;
16 | import ruc.irm.similarity.word.hownet.HownetMeta;
17 | 
18 | import com.google.common.collect.HashMultimap;
19 | import com.google.common.collect.Multimap;
20 | 
21 | /**
22 |  * 义原解析器基类，所有义原存储在xml文件中（当前package中的sememe.xml.tar.gz文件）。<br/>
23 |  * 算法的核心思想请参看论文《汉语词语语义相似度计算研究》或《中文信息相似度计算理论与方法》一书第三章<br/>
24 |  * 
25 |  * 为提高运算速度，义原的加载方式做了调整，只把义原的汉语定义和对应的Id加入到MultiMap对象中，并通过义原的层次化Id计算义原之间的相似度。<br/>
26 |  * 
27 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
28 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
29 |  * 
30 |  * @see {@link ruc.irm.similarity.Similaritable}
31 |  */
32 | public abstract class BaseSememeParser implements HownetMeta, Similaritable {
33 | 	protected Logger LOG = LoggerFactory.getLogger(this.getClass());
34 | 
35 | 	/** 所有的义原都存放到一个MultiMap, Key为Sememe的中文定义, Value为义原的Id */
36 | 	protected static Multimap<String, String> SEMEMES = null;
37 | 
38 | 	public BaseSememeParser() throws IOException {
39 | 		if (SEMEMES != null) {
40 | 			return;
41 | 		}
42 | 
43 | 		SEMEMES = HashMultimap.create();
44 | 
45 | 		InputStream input = this.getClass().getResourceAsStream("/data/sememe.xml.gz");
46 | 		input = new GZIPInputStream(input);
47 | 		load(input);
48 | 	}
49 | 
50 | 	/**
51 | 	 * 从文件中加载义元知识
52 | 	 * 
53 | 	 * @throws IOException
54 | 	 */
55 | 	public void load(InputStream input) throws IOException {
56 | 		System.out.print("loading sememes...");
57 | 		long time = System.currentTimeMillis();
58 | 		try {
59 | 			XMLInputFactory inputFactory = XMLInputFactory.newInstance();
60 | 			XMLEventReader xmlEventReader = inputFactory.createXMLEventReader(input);
61 | 
62 | 			int count = 0;
63 | 			while (xmlEventReader.hasNext()) {
64 | 				XMLEvent event = xmlEventReader.nextEvent();
65 | 
66 | 				if (event.isStartElement()) {
67 | 					StartElement startElement = event.asStartElement();
68 | 					if (startElement.getName().toString().equals("sememe")) {
69 | 						String cnWord = startElement.getAttributeByName(QName.valueOf("cn")).getValue();
70 | 						String id = startElement.getAttributeByName(QName.valueOf("id")).getValue();
71 | 						SEMEMES.put(cnWord, id);
72 | 						count++;
73 | 						if (count % 100 == 0) {
74 | 							System.out.print(".");
75 | 						}
76 | 					}
77 | 				}
78 | 			}
79 | 			input.close();
80 | 		} catch (Exception e) {
81 | 			throw new IOException(e);
82 | 		}
83 | 		time = System.currentTimeMillis() - time;
84 | 		System.out.println("\ncomplete!. time elapsed: " + (time / 1000) + "s");
85 | 	}
86 | 
87 | 	/**
88 | 	 * 计算两个义原之间的关联度
89 | 	 * 
90 | 	 * @param sememeName1
91 | 	 * @param sememeName2
92 | 	 * @return
93 | 	 */
94 | 	public double getAssociation(String sememeName1, String sememeName2) {
95 | 		return 0.0;
96 | 	}
97 | }
98 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet2/sememe/LiuqunSememeParser.java:
--------------------------------------------------------------------------------
  1 | package ruc.irm.similarity.word.hownet2.sememe;
  2 | 
  3 | import java.io.IOException;
  4 | import java.util.Collection;
  5 | 
  6 | /**
  7 |  * 刘群老师计算义原相似度的方法, 实现了SememeParser中定义的抽象方法
  8 |  * 
  9 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
 10 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
 11 |  * 
 12 |  * @author <a href="xiat@ruc.edu.cn">xiatian</a>
 13 |  * @version 1.0
 14 |  */
 15 | public class LiuqunSememeParser extends BaseSememeParser {
 16 | 		
 17 | 	/** 计算义元相似度的可调节的参数，默认为1.6 */
 18 | 	private final float alpha = 1.6f;	
 19 | 	
 20 | 	public LiuqunSememeParser() throws IOException {
 21 | 		super();		
 22 | 	}
 23 | 
 24 | 	/**
 25 | 	 * 计算两个义元之间的相似度，由于义元可能相同，计算结果为其中相似度最大者 
 26 | 	 * <br/>similarity = alpha/(distance+alpha)
 27 | 	 * 
 28 | 	 * @param key1
 29 | 	 * @param key2
 30 | 	 * @return
 31 | 	 */
 32 | 	@Override
 33 | 	public double getSimilarity(String item1, String item2) {
 34 | 		int pos;
 35 | 
 36 | 		// 如果为空串，直接返回0
 37 | 		if (item1 == null || item2 == null || item1.equals("")
 38 | 				|| item2.equals(""))
 39 | 			return 0.0;
 40 | 
 41 | 		String key1 = item1.trim();
 42 | 		String key2 = item2.trim();
 43 | 
 44 | 		// 去掉()符号
 45 | 		if ((key1.charAt(0) == '(') && (key1.charAt(key1.length() - 1) == ')')) {
 46 | 			if (key2.charAt(0) == '(' && key2.charAt(key2.length() - 1) == ')') {
 47 | 				key1 = key1.substring(1, key1.length() - 1);
 48 | 				key2 = key2.substring(1, key2.length() - 1);
 49 | 			} else {
 50 | 				return 0.0;
 51 | 			}
 52 | 		}
 53 | 
 54 | 		// 处理关系义元,即x=y的情况
 55 | 		if ((pos = key1.indexOf('=')) > 0) {
 56 | 			int pos2 = key2.indexOf('=');
 57 | 			// 如果是关系义元，则判断前面部分是否相同，如果相同，则转为计算后面部分的相似度，否则为0
 58 | 			if ((pos == pos2)
 59 | 					&& key1.substring(0, pos).equals(key2.substring(0, pos2))) {
 60 | 				key1 = key1.substring(pos + 1);
 61 | 				key2 = key2.substring(pos2 + 1);
 62 | 			} else {
 63 | 				return 0.0;
 64 | 			}
 65 | 		}
 66 | 
 67 | 		// 处理符号义元,即前面有特殊符号的义元
 68 | 		String symbol1 = key1.substring(0, 1);
 69 | 		String symbol2 = key2.substring(0, 1);
 70 | 
 71 | 		for (int i = 0; i < Symbol_Descriptions.length; i++) {
 72 | 			if (symbol1.equals(Symbol_Descriptions[i][0])) {
 73 | 				if (symbol1.equals(symbol2)) {
 74 | 					key1 = item1.substring(1);
 75 | 					key2 = item2.substring(1);
 76 | 					break;
 77 | 				} else {
 78 | 					return 0.0; // 如果不是同一关系符号，则相似度直接返回0
 79 | 				}
 80 | 			}
 81 | 		}
 82 | 
 83 | 		if ((pos = key1.indexOf("|")) >= 0) {
 84 | 			key1 = key1.substring(pos + 1);
 85 | 		}
 86 | 		if ((pos = key2.indexOf("|")) >= 0) {
 87 | 			key2 = key2.substring(pos + 1);
 88 | 		}
 89 | 
 90 | 		int distance = getMinDistance(key1, key2);
 91 | 		return alpha / (distance + alpha);
 92 | 	}
 93 | 
 94 | 	/**
 95 | 	 * 根据汉语定义计算义原之间的距离,Integer.MAX_VALUE代表两个义元之间的距离为无穷大，由于可能多个义元有相同的汉语词语，
 96 | 	 * 故计算结果为其中距离最小者
 97 | 	 * 
 98 | 	 * @param key1
 99 | 	 * @param key2
100 | 	 * @return
101 | 	 */
102 | 	public int getMinDistance(String sememe1, String sememe2) {
103 | 		int distance = Integer.MAX_VALUE;
104 | 
105 | 		// 如果两个字符串相等，直接返回距离为0
106 | 		if (sememe1.equals(sememe2)) {
107 | 			return 0;
108 | 		}
109 | 
110 | 		Collection<String> sememeIds1 = SEMEMES.get(sememe1);
111 | 		Collection<String> sememeIds2 = SEMEMES.get(sememe2);
112 | 		
113 | 		// 如果sememe1或者sememe2不是义元,则返回无穷大
114 | 		if (sememeIds1.size() == 0 || sememeIds1.size() == 0) {
115 | 			return Integer.MAX_VALUE;
116 | 		}
117 | 
118 | 		for(String id1:sememeIds1){
119 | 			for(String id2:sememeIds2){
120 | 				int d = getDistance(id1, id2);
121 | 				if(d<distance){
122 | 					distance = d;
123 | 				}
124 | 			}
125 | 		}
126 | 		
127 | 		return distance;
128 | 	}
129 | 
130 | 	/**
131 | 	 * 根据义原的具有层次的Id获取两个义原之间的语义距离
132 | 	 * @param id1
133 | 	 * @param id2
134 | 	 * @return
135 | 	 */
136 | 	int getDistance(String id1, String id2) {
137 | 		// 两个Id相同的位置终止地方
138 | 		int position = 0;
139 | 		String[] array1 = id1.split("-");
140 | 		String[] array2 = id2.split("-");
141 | 		for (position = 0; position < array1.length && position < array2.length; position++) {
142 | 			if (!array1[position].equals(array2[position])) {
143 | 				return array1.length + array2.length - position - position;
144 | 			}
145 | 		}
146 | 
147 | 		if (array1.length == array2.length) {
148 | 			return 0;
149 | 		} else if (array1.length == position) {
150 | 			return array2.length - position;
151 | 		} else {
152 | 			return array1.length - position;
153 | 		}
154 | 	}
155 | }
156 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet2/sememe/Sememe.java:
--------------------------------------------------------------------------------
  1 | package ruc.irm.similarity.word.hownet2.sememe;
  2 | 
  3 | /**
  4 |  * 描述知网义原的基本对象, 出于性能考虑，把未用到的英文名称、定义等在加载时忽略, 更准确的做法是以[英文定义|中文定义]
  5 |  * 作为一个整理进行处理，不过绝大多数只根据中文定义就可以标识出来，因此忽略不计。<br/>
  6 |  * 义原编号采用父节点Id-子节点Id编码方式，如:
  7 |  * &lt;sememe cn="成功" define="{experiencer,scope}" en="succeed" id="1-1-2-1-4-5"/>
  8 |  * 义原的id表明了义原之间的上下位关系和义原的深度。
  9 |  * 
 10 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
 11 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
 12 |  */
 13 | public class Sememe {
 14 | 	/** 
 15 | 	 * 义原编号,采用父节点Id-子节点Id编码方式，如&lt;sememe cn="成功" define="{experiencer,scope}" en="succeed" id="1-1-2-1-4-5"/>
 16 | 	 * id表明了义原之间的上下位关系  
 17 | 	 */
 18 | 	private String id;
 19 | 	/** 义原的中文名称*/
 20 | 	private String cnWord;
 21 | 	/** 义原的英文名称 */
 22 | 	private String enWord;
 23 | 	/** 义原的定义，如果没有(例如数量)，则为空串 */
 24 | 	private String define;
 25 | 	
 26 | 	/**
 27 | 	 * 每一行的形式为：be|是 {relevant,isa}/{relevant,descriptive} 
 28 | 	 * <br/>或者 official|官 [#organization|组织,#employee|员] 
 29 | 	 * <br/>或者 amount|多少 
 30 | 	 * <br/>把相应的部分赋予不同的属性
 31 | 	 * 出于性能考虑，把未用到的英文名称、定义等忽略
 32 | 	 * @param id
 33 | 	 */
 34 | 	public Sememe(String id, String en, String cn, String define) {
 35 | 		this.id = id;
 36 | 		this.cnWord = cn;
 37 | 		//为提高效率，减少内存空间利用，可去掉以下两行
 38 | 		this.enWord = en;
 39 | 		this.define = define;
 40 | 	}
 41 | 
 42 | 	public String getId() {
 43 | 		return id;
 44 | 	}
 45 | 
 46 | 	public void setId(String id) {
 47 | 		this.id = id;
 48 | 	}
 49 | 
 50 | 	public String getCnWord() {
 51 | 		return cnWord;
 52 | 	}
 53 | 
 54 | 	public void setCnWord(String cnWord) {
 55 | 		this.cnWord = cnWord;
 56 | 	}
 57 | 
 58 | 	public String getEnWord() {
 59 | 		return enWord;
 60 | 	}
 61 | 
 62 | 	public void setEnWord(String enWord) {
 63 | 		this.enWord = enWord;
 64 | 	}
 65 | 
 66 | 	public String getDefine() {
 67 | 		return define;
 68 | 	}
 69 | 
 70 | 	public void setDefine(String define) {
 71 | 		this.define = define;
 72 | 	}
 73 | 
 74 | 	public int getType() {
 75 | 		char ch = id.charAt(0);
 76 | 		switch (ch) {
 77 | 		case '1':
 78 | 			return SememeType.Event;
 79 | 		case '2':
 80 | 			return SememeType.Entity;
 81 | 		case '3':
 82 | 			return SememeType.Attribute;
 83 | 		case '4':
 84 | 			return SememeType.Quantity;
 85 | 		case '5':
 86 | 			return SememeType.AValue;
 87 | 		case '6':
 88 | 			return SememeType.QValue;
 89 | 		case '7':
 90 | 			return SememeType.SecondaryFeature;
 91 | 		case '8':
 92 | 			return SememeType.Syntax;
 93 | 		case '9':
 94 | 			return SememeType.EventRoleAndFeature;
 95 | 		default:
 96 | 			return 0;
 97 | 		}
 98 | 	}
 99 | 		
100 | 	@Override
101 | 	public String toString(){
102 | 		StringBuilder sb = new StringBuilder();
103 | 		sb.append("id=");
104 | 		sb.append(id);
105 | 		sb.append("; cnWord=");
106 | 		sb.append(cnWord);
107 | 		sb.append("; enWord=");
108 | 		sb.append(enWord);
109 | 		sb.append("; define=");
110 | 		sb.append(define);
111 | 		return sb.toString();
112 | 	}
113 | 
114 | }
115 | 
116 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet2/sememe/SememeType.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.word.hownet2.sememe;
 2 | 
 3 | /**
 4 |  * 义原的类型定义<br/>
 5 |  * <ul>
 6 |  * <li>1：Event|事件</li>
 7 |  * <li>2：Entity|实体 </li>
 8 |  * <li>3:Attribute|属性 </li>
 9 |  * <li>4：Quantity|数量</li>
10 |  * <li>5：aValue|属性值</li>
11 |  * <li>6：qValue|数量值</li>
12 |  * <li>7: Secondary Feature|第二特征</li>
13 |  * <li>8: Syntax|语法</li>
14 |  * <li>9: EventRole|动态角色</li>
15 |  * <li>10:EventFeatures|动态属性</li>
16 |  * <li>0：未知</li>
17 |  * </ul>
18 |  * 
19 |  * 其中1~7为基本义元，8为语法义元，9、10为关系义元<br/>
20 |  * 
21 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
22 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
23 |  */
24 | public interface SememeType {	 
25 | 	  /** Event|事件类型定义 */
26 | 	  public static final int Event = 1;
27 | 	  
28 | 	  /** Entity|实体类型定义*/
29 | 	  public static final int Entity = 2;
30 | 	  
31 | 	  /** Attribute|属性类型定义*/
32 | 	  public static final int Attribute = 3;
33 | 
34 | 	  /** Quantity|数量类型定义*/
35 | 	  public static final int Quantity = 4;
36 | 
37 | 	  /** aValue|属性值类型定义*/
38 | 	  public static final int AValue = 5;
39 | 
40 | 	  /** qValue|数量值类型定义*/
41 | 	  public static final int QValue = 6;
42 | 	  
43 | 	  /** Secondary Feature|第二特征类型定义*/
44 | 	  public static final int SecondaryFeature = 7;
45 | 	  
46 | 	  /** Syntax|语法类型定义*/
47 | 	  public static final int Syntax = 8;
48 | 	  
49 | 	  /** EventRole|动态角色类型定义*/
50 | 	  public static final int EventRoleAndFeature = 9;
51 | 	  
52 | 	  /** 未知类型定义*/
53 | 	  public static final int Unknown = 0;
54 | 	  
55 | }
56 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet2/sememe/XiaSememeParser.java:
--------------------------------------------------------------------------------
  1 | package ruc.irm.similarity.word.hownet2.sememe;
  2 | 
  3 | import java.io.IOException;
  4 | import java.util.Collection;
  5 | 
  6 | import ruc.irm.similarity.util.BlankUtils;
  7 | 
  8 | 
  9 | /**
 10 |  * 义原相似度计算, 实现了SememeParser中定义的抽象方法
 11 |  * 
 12 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
 13 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室 
 14 |  */
 15 | public class XiaSememeParser extends BaseSememeParser {
 16 | 	
 17 | 	public XiaSememeParser() throws IOException{
 18 | 		super();
 19 | 	}
 20 | 	
 21 | 	/**
 22 | 	 * 计算两个义原的相似度	 
 23 | 	 */
 24 | 	double getSimilarityBySememeId(final String id1, final String id2) {		
 25 | 		
 26 | 		int position = 0;
 27 | 		String[] array1 = id1.split("-");
 28 | 		String[] array2 = id2.split("-");
 29 | 		for (position = 0; position < array1.length && position < array2.length; position++) {
 30 | 			if (!array1[position].equals(array2[position])) {
 31 | 				break;
 32 | 			}
 33 | 		}
 34 | 		
 35 | 		return 2.0*position/(array1.length + array2.length);
 36 | 	}
 37 | 	
 38 | 	/**
 39 | 	 * 根据汉语定义计算义原之间的相似度，由于可能多个义元有相同的汉语词语，故计算结果为其中相似度最大者
 40 | 	 * 
 41 | 	 * @param key1
 42 | 	 * @param key2
 43 | 	 * @return
 44 | 	 */
 45 | 	public double getMaxSimilarity(String sememeName1, String sememeName2) {
 46 | 		double maxValue = 0.0;
 47 | 
 48 | 		// 如果两个字符串相等，直接返回距离为0
 49 | 		if (sememeName1.equals(sememeName2)) {
 50 | 			return 1.0;
 51 | 		}
 52 | 
 53 | 		Collection<String> sememeIds1 = SEMEMES.get(sememeName1);
 54 | 		Collection<String> sememeIds2 = SEMEMES.get(sememeName2);
 55 | 		
 56 | 		// 如果sememe1或者sememe2不是义元,则返回0
 57 | 		if (sememeIds1.size() == 0 || sememeIds1.size() == 0) {
 58 | 			return 0.0;
 59 | 		}
 60 | 
 61 | 		for(String id1:sememeIds1){
 62 | 			for(String id2:sememeIds2){
 63 | 				double value = getSimilarityBySememeId(id1, id2);
 64 | 				if(value > maxValue){
 65 | 					maxValue = value;
 66 | 				}
 67 | 			}
 68 | 		}
 69 | 		
 70 | 		return maxValue;
 71 | 	}
 72 | 
 73 | 	/**
 74 | 	 * 计算两个义元之间的相似度，由于义元可能相同，计算结果为其中相似度最大者 similarity = alpha/(distance+alpha),
 75 | 	 * 如果两个字符串相同或都为空，直接返回1.0
 76 | 	 * 
 77 | 	 * @param key1 第一个义原字符串
 78 | 	 * @param key2 第二个义原字符串
 79 | 	 * @return
 80 | 	 */
 81 | 	@Override
 82 | 	public double getSimilarity(String item1, String item2) {	
 83 | 		if(BlankUtils.isBlankAll(item2, item2)){
 84 | 			return 1.0;
 85 | 		} else if(BlankUtils.isBlankAtLeastOne(item1, item2)){
 86 | 			return 0.0;
 87 | 		} else if(item1.equals(item2)){
 88 | 			return 1.0;
 89 | 		}		
 90 | 
 91 | 		String key1 = item1.trim();
 92 | 		String key2 = item2.trim();
 93 | 
 94 | 		// 去掉()符号
 95 | 		if ((key1.charAt(0) == '(') && (key1.charAt(key1.length() - 1) == ')')) {
 96 | 			
 97 | 			if (key2.charAt(0) == '(' && key2.charAt(key2.length() - 1) == ')') {
 98 | 				key1 = key1.substring(1, key1.length() - 1);
 99 | 				key2 = key2.substring(1, key2.length() - 1);
100 | 			} else {
101 | 				return 0.0;
102 | 			}
103 | 			
104 | 		}
105 | 
106 | 		// 处理关系义元,即x=y的情况
107 | 		int pos = key1.indexOf('=');
108 | 		if (pos > 0) {
109 | 			int pos2 = key2.indexOf('=');
110 | 			// 如果是关系义元，则判断前面部分是否相同，如果相同，则转为计算后面部分的相似度，否则为0
111 | 			if ((pos == pos2)
112 | 					&& key1.substring(0, pos).equals(key2.substring(0, pos2))) {
113 | 				key1 = key1.substring(pos + 1);
114 | 				key2 = key2.substring(pos2 + 1);
115 | 			} else {
116 | 				return 0.0;
117 | 			}
118 | 		}
119 | 
120 | 		// 处理符号义元,即前面有特殊符号的义元
121 | 		String symbol1 = key1.substring(0, 1);
122 | 		String symbol2 = key2.substring(0, 1);
123 | 
124 | 		for (int i = 0; i < Symbol_Descriptions.length; i++) {
125 | 			if (symbol1.equals(Symbol_Descriptions[i][0])) {
126 | 				if (symbol1.equals(symbol2)) {
127 | 					key1 = item1.substring(1);
128 | 					key2 = item2.substring(1);
129 | 					break;
130 | 				} else {
131 | 					return 0.0; // 如果不是同一关系符号，则相似度直接返回0
132 | 				}
133 | 			}
134 | 		}
135 | 
136 | 		if ((pos = key1.indexOf("|")) >= 0) {
137 | 			key1 = key1.substring(pos + 1);
138 | 		}
139 | 		if ((pos = key2.indexOf("|")) >= 0) {
140 | 			key2 = key2.substring(pos + 1);
141 | 		}
142 | 
143 | 		// 如果两个字符串相等，直接返回距离为0
144 | 		if (key1.equals(key2)) {
145 | 			return 1.0;
146 | 		}
147 | 		
148 | 		return getMaxSimilarity(key1, key2);
149 | 	}
150 | 
151 | 	
152 | }


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/pinyin/PinyinSimilarity.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.word.pinyin;
 2 | 
 3 | import java.util.Set;
 4 | 
 5 | import ruc.irm.similarity.Similaritable;
 6 | import ruc.irm.similarity.util.EditDistance;
 7 | import ruc.irm.similarity.util.PinyinUtils;
 8 | 
 9 | 
10 | /**
11 |  * 通过拼音计算两个词语是否相似，拼音的相似程度采用编辑距离算法，并进行归一化衡量
12 |  * 
13 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
14 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
15 |  */
16 | public class PinyinSimilarity implements Similaritable {
17 | 	
18 | 	public double getSimilarity(String item1, String item2) {
19 | 		Set<String> pinyinSet1 = PinyinUtils.getInstance().getPinyin(item1);
20 | 		Set<String> pinyinSet2 = PinyinUtils.getInstance().getPinyin(item2);
21 | 
22 | 		double max = 0.0;
23 | 		for(String pinyin1:pinyinSet1){
24 | 			for(String pinyin2:pinyinSet2){
25 | 				double distance = new EditDistance().getEditDistance(pinyin1, pinyin2);		
26 | 				double similarity = 1 - distance/( (pinyin1.length()>pinyin2.length())?pinyin1.length():pinyin2.length());
27 | 				max = (max>similarity)?max:similarity;
28 | 				if(max==1.0){
29 | 					return max;
30 | 				}
31 | 			}
32 | 		}
33 | 		return max;
34 | 	}
35 | 	
36 | }
37 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/tendency/word/HownetWordTendency.java:
--------------------------------------------------------------------------------
  1 | package ruc.irm.tendency.word;
  2 | 
  3 | import java.io.IOException;
  4 | import java.util.Collection;
  5 | import java.util.HashSet;
  6 | import java.util.Set;
  7 | 
  8 | import ruc.irm.similarity.word.hownet2.concept.BaseConceptParser;
  9 | import ruc.irm.similarity.word.hownet2.concept.Concept;
 10 | import ruc.irm.similarity.word.hownet2.concept.XiaConceptParser;
 11 | import ruc.irm.similarity.word.hownet2.sememe.BaseSememeParser;
 12 | import ruc.irm.similarity.word.hownet2.sememe.XiaSememeParser;
 13 | 
 14 | /**
 15 |  * 基于知网实现的词语倾向性判别
 16 |  * 
 17 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
 18 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
 19 |  */
 20 | public class HownetWordTendency implements WordTendency {
 21 |     public static String[] POSITIVE_SEMEMES = new String[]{
 22 |         "良",
 23 |         "喜悦",
 24 |         "夸奖",
 25 |         "满意",
 26 |         "期望",
 27 |         "注意",
 28 |         "致敬",
 29 |         "喜欢",
 30 |         "专",
 31 |         "敬佩",
 32 |         "同意",
 33 |         "爱惜",
 34 |         "愿意",
 35 |         "思念",
 36 |         "拥护",
 37 |         "祝贺",
 38 |         "福",
 39 |         "需求",
 40 |         "奖励",
 41 |         "致谢",
 42 |         "欢迎",
 43 |         "羡慕",
 44 |         "感激",
 45 |         "爱恋"
 46 |     };
 47 |     
 48 |     public static String[] NEGATIVE_SEMEMES = new String[]{
 49 |         "莠",
 50 |         "谴责",
 51 |         "害怕",
 52 |         "生气",
 53 |         "悲哀",
 54 |         "着急",
 55 |         "轻视",
 56 |         "羞愧",
 57 |         "烦恼",
 58 |         "灰心",
 59 |         "犹豫",
 60 |         "为难",
 61 |         "懊悔",
 62 |         "厌恶",
 63 |         "怀疑",
 64 |         "怜悯",
 65 |         "忧愁",
 66 |         "示怒",
 67 |         "不满",
 68 |         "仇恨",
 69 |         "埋怨",
 70 |         "失望",
 71 |         "坏"
 72 |     };
 73 |     private BaseConceptParser conceptParser = null;
 74 |     private BaseSememeParser sememeParser = null;
 75 |     
 76 |     public HownetWordTendency(){
 77 |         this.conceptParser =XiaConceptParser.getInstance();
 78 |         try {
 79 |             this.sememeParser = new XiaSememeParser();
 80 |         } catch (IOException e) {            
 81 |             e.printStackTrace();
 82 |         }
 83 |     }
 84 |     
 85 |     @Override
 86 |     public double getTendency(String word) {
 87 |         double positive = getSentiment(word, POSITIVE_SEMEMES);
 88 |         double negative = getSentiment(word, NEGATIVE_SEMEMES);;
 89 |         return positive - negative;
 90 |     }
 91 |     
 92 |     public double getSentiment(String word, String[] candidateSememes) {
 93 |         Collection<Concept> concepts = conceptParser.getConcepts(word);
 94 |         Set<String> sememes = new HashSet<String>();
 95 |         for (Concept c : concepts) {
 96 |             sememes.addAll(c.getAllSememeNames());
 97 |         }
 98 | 
 99 |         double max = 0.0;
100 |         for(String item:sememes){
101 |             double total = 0.0;
102 |             for(String positiveSememe:candidateSememes){
103 |                 //如果有特别接近的义原，直接返回该相似值，避免其他干扰
104 |                 double value = sememeParser.getSimilarity(item, positiveSememe);
105 |                 if(value>0.9){
106 |                     return value;
107 |                 }
108 |                 total += value;
109 |             }
110 |             double sim = total / candidateSememes.length;
111 |             if(sim>max){
112 |                 max = sim;
113 |             }
114 |         }
115 |         return max;
116 |     }    
117 |     
118 | }
119 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/tendency/word/Training.java:
--------------------------------------------------------------------------------
  1 | package ruc.irm.tendency.word;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.File;
  5 | import java.io.FileInputStream;
  6 | import java.io.IOException;
  7 | import java.io.InputStreamReader;
  8 | import java.util.ArrayList;
  9 | import java.util.Collection;
 10 | import java.util.Collections;
 11 | import java.util.HashMap;
 12 | import java.util.List;
 13 | import java.util.Map;
 14 | 
 15 | import ruc.irm.similarity.util.BlankUtils;
 16 | import ruc.irm.similarity.word.hownet2.concept.Concept;
 17 | import ruc.irm.similarity.word.hownet2.concept.XiaConceptParser;
 18 | import ruc.irm.similarity.word.hownet2.sememe.XiaSememeParser;
 19 | 
 20 | import com.google.common.collect.HashMultimap;
 21 | import com.google.common.collect.Multimap;
 22 | 
 23 | /**
 24 |  * 临时训练及测试类
 25 |  * 
 26 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
 27 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
 28 |  */
 29 | public class Training {
 30 |     
 31 |     void test(boolean testPositive) throws IOException{
 32 |         WordTendency tendency = new HownetWordTendency();
 33 |         File f = new File("./dict/sentiment/负面情感词语（中文）.txt");
 34 |         if(testPositive){
 35 |             //f = new File("./dict/sentiment/正面情感词语（中文）.txt");
 36 |             f = new File("./dict/sentiment/正面评价词语（中文）.txt");
 37 |         }
 38 |         String encoding = "utf-8";
 39 |         BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(f),    encoding));
 40 |         String line;
 41 |         int wordCount = 0;
 42 |         int correctCount = 0;
 43 |         while ((line = in.readLine()) != null) {
 44 |             if(line.length()>5) continue;
 45 |             wordCount++;
 46 |             
 47 |             double value =tendency.getTendency(line.trim());
 48 |             if(value>0 && testPositive){
 49 |                  correctCount++;                
 50 |             }else if(value<0 && !testPositive){
 51 |                 correctCount++;                
 52 |             }else{
 53 |                 System.out.println("error:" + line + "\t value:" + value);
 54 |             }
 55 |         }
 56 |         System.out.println("correct:" + correctCount);
 57 |         System.out.println("total:" + wordCount);
 58 |         System.out.println("ratio:" + correctCount*1.0/wordCount);
 59 |     }
 60 | 	
 61 | 	/**
 62 | 	 * 该方法用于统计知网提供的情感词集合所涉及的义原以及出现频度
 63 | 	 * @throws IOException 
 64 | 	 */
 65 | 	/**
 66 | 	 * @throws IOException
 67 | 	 */
 68 | 	void countSentimentDistribution() throws IOException{	    
 69 | 		Map<String, Integer> sememeMap = new HashMap<String, Integer>();
 70 | 		File f = new File("./dict/sentiment/负面情感词语（中文）.txt");
 71 | 		String encoding = "utf-8";
 72 | 		boolean autoCombineConcept = false;
 73 | 		BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(f),	encoding));
 74 | 		
 75 | 		XiaConceptParser parser = new XiaConceptParser(new XiaSememeParser());
 76 | 		
 77 | 		String line = null;
 78 | 
 79 | 		int conceptCount = 0;
 80 | 		int wordCount = 0;
 81 | 		while ((line = in.readLine()) != null) {
 82 | 			if(line.length()>5) continue;
 83 | 			wordCount++;
 84 | 			String word = line.trim();
 85 | 			Collection<Concept> concepts = parser.getInnerConcepts(word);
 86 | 			//由于目前的词典为知网2000版本，所以默认情况下仅对词典中出现的概念进行统计
 87 | 			if(BlankUtils.isBlank(concepts) && autoCombineConcept ){
 88 | 				concepts = parser.autoCombineConcepts(word, null);
 89 | 			}
 90 | 			for(Concept c: concepts){
 91 | 			    conceptCount++;
 92 | 				List<String> names = new ArrayList<String>();
 93 | 				
 94 | 				//加入主义原
 95 | 				names.add(c.getMainSememe());
 96 | 				
 97 | 				//加入关系义原
 98 | 				for(String item:c.getRelationSememes()){
 99 | 					names.add(item.substring(item.indexOf("=") + 1));
100 | 				}				
101 | 
102 | 				//加入符号义原
103 | 				for(String item:c.getSymbolSememes()){
104 | 					names.add(item.substring(1));
105 | 				}
106 | 				
107 |                 //加入其他义原集合
108 |                 for(String item:c.getSecondSememes()){
109 |                     names.add(item);
110 |                 }
111 |                 
112 | 				for(String item:names){
113 | 					Integer count = sememeMap.get(item);
114 | 					if(count==null){
115 | 						sememeMap.put(item, 1);
116 | 					}else{
117 | 						sememeMap.put(item, count+1);
118 | 					}
119 | 				}
120 | 			}			
121 | 		}
122 | 		in.close();
123 | 		
124 | 		//以下是为了按照义原出现的数量进行排序的代码
125 | 		Multimap<Integer, String> map2 = HashMultimap.create();
126 | 		for(String key:sememeMap.keySet()){
127 | 		    map2.put(sememeMap.get(key), key);
128 | 		}
129 | 		List<Integer> keys = new ArrayList<Integer>();
130 | 		for(Integer key: map2.keySet()){
131 | 		    keys.add(key);
132 | 		}
133 | 		Collections.sort(keys);
134 | 		
135 | 		int smallSememeCount = 0; //较少出现的不同义原数量
136 | 		int smallAppearTotal = 0;    //较少出现的义原在概念众出现的次数总和
137 | 		for(int index=(keys.size()-1); index>=0; index--){
138 | 		    Integer key = keys.get(index);
139 | 		    Collection<String> values = map2.get(key);
140 | 		    double ratio =  (key*100.0/conceptCount);
141 | 		    System.out.print(key + "(" + ratio + "%): ");
142 | 		    for(String v:values){
143 | 		        System.out.print(v+ "\t");
144 | 		    }
145 | 		    System.out.println();
146 | 		    if(ratio<0.7){
147 | 		        smallSememeCount += values.size();
148 | 		        smallAppearTotal += key*values.size();
149 | 		    }
150 | 		}		
151 | 		
152 | 		System.out.println("small info: ");
153 | 		System.out.println("\tdifferent sememes:" + smallSememeCount);
154 | 		System.out.println("\tappear count:" + smallAppearTotal);
155 |         System.out.println("\tratio:" + smallAppearTotal*100.0/conceptCount);
156 | 		System.out.println("wordCount:" + wordCount);
157 | 		System.out.println("conceptCount:" + conceptCount);
158 | 	}
159 | 
160 |     public static void main(String[] args) throws IOException {
161 |         Training training = new Training();
162 |         training.countSentimentDistribution();
163 | //        System.out.println("test positive:");
164 | //        training.test(true);
165 | //        
166 | //        System.out.println("test negative:");
167 |         //training.test(false);
168 |     }
169 | }
170 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/tendency/word/WordTendency.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.tendency.word;
 2 | 
 3 | /**
 4 |  * 计算词语的语义倾向性，词语的语义倾向性为一个介于[-1, 1]之间的实数，数值越大，褒义性越强，否则，贬义性越强
 5 |  * 
 6 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
 7 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
 8 |  */
 9 | public interface WordTendency {
10 | 	/**
11 | 	 * 获取词语的语义倾向性，词语的语义倾向性为一个介于[-1, 1]之间的实数，数值越大，褒义性越强，否则，贬义性越强
12 | 	 * @param word
13 | 	 * @return
14 | 	 */
15 | 	public double getTendency(String word);
16 | }
17 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/ui/PhraseSimilarityUI.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.ui;
 2 | 
 3 | import java.awt.BorderLayout;
 4 | import java.awt.GridLayout;
 5 | import java.awt.event.ActionEvent;
 6 | import java.awt.event.ActionListener;
 7 | 
 8 | import javax.swing.BorderFactory;
 9 | import javax.swing.JButton;
10 | import javax.swing.JLabel;
11 | import javax.swing.JPanel;
12 | import javax.swing.JScrollPane;
13 | import javax.swing.JTextArea;
14 | import javax.swing.JTextField;
15 | 
16 | import ruc.irm.similarity.phrase.PhraseSimilarity;
17 | 
18 | /**
19 |  * 短语相似度的调用演示界面
20 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
21 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
22 |  */
23 | public class PhraseSimilarityUI {
24 | 
25 | 	/**
26 | 	 * 短语相似度的演示面板
27 | 	 * 
28 | 	 * @return
29 | 	 */
30 | 	public static JPanel createPanel() {
31 | 		// 声明总的大面板, fullPanel包括一个NorthPanel和一个centerPanel
32 | 		JPanel fullPanel = new JPanel();
33 | 		fullPanel.setLayout(new BorderLayout());
34 | 
35 | 		JPanel northPanel = new JPanel();
36 | 		fullPanel.add(northPanel, "North");
37 | 
38 | 		// centerPanel包括了一个文本框
39 | 		JPanel centerPanel = new JPanel();
40 | 		fullPanel.add(centerPanel, "Center");
41 | 
42 | 		centerPanel.setLayout(new BorderLayout());
43 | 		final JTextArea result = new JTextArea();
44 | 		// result.setFont(new Font("宋体", Font.PLAIN, 16));
45 | 		result.setLineWrap(true);
46 | 		JScrollPane centerScrollPane = new JScrollPane(result);
47 | 		centerPanel.add(centerScrollPane, "Center");
48 | 
49 | 		northPanel.setLayout(new GridLayout(1, 1));
50 | 		// northPanel.add(createWordPanel());
51 | 		// northPanel.add(createCilinPanel());
52 | 
53 | 		// 以下加入northPanel中的第一个面板
54 | 		final JTextField field1 = new JTextField("");
55 | 		final JTextField field2 = new JTextField("");
56 | 		field1.setColumns(50);
57 | 		field2.setColumns(50);
58 | 
59 | 		JPanel mainPanel = new JPanel();
60 | 		mainPanel.setLayout(new GridLayout(3, 1));
61 | 
62 | 		JPanel linePanel = new JPanel();
63 | 		linePanel.add(new JLabel("短语1:"));
64 | 		linePanel.add(field1);
65 | 		mainPanel.add(linePanel);
66 | 
67 | 		linePanel = new JPanel();
68 | 		linePanel.add(new JLabel("短语2:"));
69 | 		linePanel.add(field2);
70 | 		mainPanel.add(linePanel);
71 | 
72 | 		linePanel = new JPanel();
73 | 		JButton goButton = new JButton("计算相似度");
74 | 		linePanel.add(goButton);
75 | 		mainPanel.add(linePanel);
76 | 		goButton.addActionListener(new ActionListener() {
77 | 
78 | 			@Override
79 | 			public void actionPerformed(ActionEvent e) {
80 | 				String phrase1 = field1.getText();
81 | 				String phrase2 = field2.getText();
82 | 				String text = "[" + phrase1 + "]与[" + phrase2 + "]的相似度为:";
83 | 				text = text + new PhraseSimilarity().getSimilarity(phrase1, phrase2);
84 | 				// text = text + "\n\n" + result.getText();
85 | 				result.setText(text);
86 | 			}
87 | 
88 | 		});
89 | 		mainPanel.setBorder(BorderFactory.createEtchedBorder());
90 | 		northPanel.add(mainPanel);
91 | 
92 | 		return fullPanel;
93 | 	}
94 | }
95 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/ui/Start.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.ui;
 2 | 
 3 | import java.awt.Container;
 4 | import java.awt.Font;
 5 | import java.util.Enumeration;
 6 | 
 7 | import javax.swing.JFrame;
 8 | import javax.swing.JMenu;
 9 | import javax.swing.JMenuBar;
10 | import javax.swing.JMenuItem;
11 | import javax.swing.JScrollPane;
12 | import javax.swing.JTabbedPane;
13 | import javax.swing.SwingUtilities;
14 | import javax.swing.UIManager;
15 | import javax.swing.plaf.FontUIResource;
16 | 
17 | import ruc.irm.similarity.sentence.SegmentProxy;
18 | import ruc.irm.similarity.util.About;
19 | 
20 | /**
21 |  * 相似度计算软件包演示启动类
22 |  * 
23 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
24 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
25 |  */
26 | public class Start extends JFrame {
27 | 
28 | 	private static final long serialVersionUID = 85744461208L;
29 | 
30 | 	public Start() {
31 | 		this.setTitle("相似度计算演示程序");
32 | 		this.setSize(420, 700);
33 | 		this.setLocationRelativeTo(null);
34 | 		this.setDefaultCloseOperation(EXIT_ON_CLOSE);
35 | 
36 | 		// //////////////////////////////////
37 | 		// add menu
38 | 		JMenuBar menuBar = new JMenuBar();
39 | 		this.setJMenuBar(menuBar);
40 | 
41 | 		JMenu fileMenu = new JMenu("File");
42 | 		menuBar.add(fileMenu);
43 | 		fileMenu.add(new JMenuItem("Exit"));
44 | 
45 | 		JMenu helpMenu = new JMenu("Help");
46 | 		menuBar.add(helpMenu);
47 | 		helpMenu.add(new JMenuItem("Help"));
48 | 
49 | 		Container contentPane = this.getContentPane();
50 | 		JTabbedPane tabbedPane = new JTabbedPane();
51 | 		tabbedPane.add("词语", WordSimlarityUI.createPanel());
52 | 		tabbedPane.add("短语", PhraseSimilarityUI.createPanel());
53 | 		tabbedPane.add("句子", SentenceSimilarityUI.createPanel());
54 | 		// tabbedPane.add("文本", WordSimlarityUI.createPanel());
55 | 		tabbedPane.add("词法分析", SegmentProxy.createPanel());
56 | 		tabbedPane.add("义原树", SememeTreeUI.createPanel());
57 | 		tabbedPane.add("情感分析", TendencyUI.createPanel());
58 | 		tabbedPane.add("关于", About.createPanel());
59 | 		JScrollPane scrollPane = new JScrollPane(tabbedPane);
60 | 		contentPane.add(scrollPane);
61 | 		
62 | 		this.pack();
63 | 		setExtendedState(MAXIMIZED_BOTH);
64 | 	}
65 | 
66 | 	public static void InitGlobalFont(Font font) {
67 | 		FontUIResource fontRes = new FontUIResource(font);
68 | 		for (Enumeration<Object> keys = UIManager.getDefaults().keys(); keys.hasMoreElements();) {
69 | 			Object key = keys.nextElement();
70 | 			Object value = UIManager.get(key);
71 | 			if (value instanceof FontUIResource) {
72 | 				UIManager.put(key, fontRes);
73 | 			}
74 | 		}
75 | 	}
76 | 
77 | 	public static void main(String[] args) {
78 | 		//JFrame.setDefaultLookAndFeelDecorated(true);
79 | 		//解决字体在Ubuntu中显示有乱码的问题
80 | 		InitGlobalFont(new Font("Microsoft YaHei", Font.TRUETYPE_FONT, 12));
81 | 		SwingUtilities.invokeLater(new Runnable() {
82 | 
83 | 			public void run() {
84 | 				Start w = new Start();
85 | 				w.setVisible(true);
86 | 			}
87 | 		});
88 | 	}
89 | 
90 | }
91 | 


--------------------------------------------------------------------------------
/src/main/java/ruc/irm/ui/TendencyUI.java:
--------------------------------------------------------------------------------
  1 | package ruc.irm.ui;
  2 | 
  3 | import java.awt.BorderLayout;
  4 | import java.awt.GridLayout;
  5 | import java.awt.event.ActionEvent;
  6 | import java.awt.event.ActionListener;
  7 | 
  8 | import javax.swing.BorderFactory;
  9 | import javax.swing.JButton;
 10 | import javax.swing.JFrame;
 11 | import javax.swing.JLabel;
 12 | import javax.swing.JPanel;
 13 | import javax.swing.JScrollPane;
 14 | import javax.swing.JTextArea;
 15 | import javax.swing.JTextField;
 16 | 
 17 | import ruc.irm.tendency.word.HownetWordTendency;
 18 | 
 19 | /**
 20 |  * 测试词语倾向性的用户调用演示界面
 21 |  * 
 22 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
 23 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室
 24 |  */
 25 | public class TendencyUI extends JFrame {
 26 |     private static final long serialVersionUID = -3976827963973640651L;
 27 | 
 28 |     public static JPanel createPanel(){
 29 |         //声明总的大面板, fullPanel包括一个NorthPanel和一个centerPanel
 30 |         JPanel fullPanel = new JPanel();
 31 |         fullPanel.setLayout(new BorderLayout());
 32 | 
 33 |         JPanel northPanel = new JPanel();
 34 |         fullPanel.add(northPanel, "North");
 35 |         
 36 |         //centerPanel包括了一个文本框
 37 |         JPanel centerPanel = new JPanel();
 38 |         fullPanel.add(centerPanel, "Center");        
 39 |         centerPanel.setLayout(new BorderLayout());        
 40 |         final JTextArea result = new JTextArea();
 41 |         //result.setFont(new Font("宋体", Font.PLAIN, 16));
 42 |         result.setLineWrap(true);
 43 |         JScrollPane centerScrollPane = new JScrollPane(result);
 44 |         centerPanel.add(centerScrollPane, "Center");
 45 |         
 46 |         northPanel.setLayout(new GridLayout(1, 1));        
 47 |         
 48 |         //以下加入northPanel中的第一个面板
 49 |         final JTextField wordField = new JTextField("恶心");    
 50 |         wordField.setColumns(40);
 51 |         
 52 |         JPanel mainPanel = new JPanel();        
 53 |         mainPanel.setLayout(new GridLayout(2, 1));        
 54 |                 
 55 |         JPanel linePanel = new JPanel();
 56 |         linePanel.add(new JLabel("输入词语:"));        
 57 |         linePanel.add(wordField); 
 58 |         mainPanel.add(linePanel);
 59 |         
 60 |         linePanel = new JPanel();
 61 |         JButton goButton = new JButton("计算词语倾向");
 62 |         linePanel.add(goButton);
 63 |         mainPanel.add(linePanel);
 64 |         goButton.addActionListener(new ActionListener(){
 65 |             HownetWordTendency tendency = new HownetWordTendency();
 66 |             
 67 |             @Override
 68 |             public void actionPerformed(ActionEvent e) {
 69 |                 String word = wordField.getText();
 70 |                 double positive = tendency.getSentiment(word, HownetWordTendency.POSITIVE_SEMEMES);
 71 |                 double negative = tendency.getSentiment(word, HownetWordTendency.NEGATIVE_SEMEMES);
 72 |                 String text = "[" + word + "]的倾向分析结果为:" ;
 73 |                 
 74 |                 text = text + "\n正面接近程度=" + positive;
 75 |                 text = text + "\n负面接近程度=" + negative;
 76 |                 text = text + "\n倾向性=" + (positive - negative);                
 77 |                 text = text + "\n________________________________\n" + result.getText();
 78 |                 result.setText(text);
 79 |                 result.setCaretPosition(0);
 80 |             }
 81 |             
 82 |         });
 83 |         mainPanel.setBorder(BorderFactory.createEtchedBorder());
 84 |         northPanel.add(mainPanel);        
 85 |         
 86 |         return fullPanel;
 87 |     }
 88 | 
 89 |     public TendencyUI(){
 90 |         this.setTitle("词语倾向性演示");
 91 |         this.setSize(420, 700);
 92 |         this.setLocationRelativeTo(null);
 93 |         this.setDefaultCloseOperation(EXIT_ON_CLOSE);
 94 |         this.getContentPane().setLayout(new BorderLayout());
 95 |         this.getContentPane().add(createPanel());
 96 |     }
 97 |     
 98 |     public static void main(String[] args) {
 99 |         new TendencyUI().setVisible(true);
100 |     }
101 | }
102 | 


--------------------------------------------------------------------------------
/src/main/resources/about.html:
--------------------------------------------------------------------------------
 1 | <html>
 2 | 	<head><title>xsimilarity</title></head>
 3 | 	<body>
 4 | 		<p>
 5 | 			<h2>XSimilarity</h2>
 6 | 		  项目地址：<a href="https://github.com/iamxiatian/xsimilarity/">https://github.com/iamxiatian/xsimilarity/</a>
 7 | 		</p>
 8 | 		<p>
 9 | 			有任何问题或建议请与我们联系，您的反馈将有助于该项目的进一步完善。
10 | 		</p>
11 | 		<p>
12 |       <h2>致谢</h2>
13 |             本项目在研究过程中，得到了恩师樊孝忠教授的悉心指导，师恩如海，难以言谢！<br/>
14 | 			中国人民大学为本项目的持续研究提供了资金和计算机软硬件的支持，北京理工大学为本项目的早期研究提供了重要的基础设施，<br/>
15 | 			这些支持与国家的投入密不可分，
16 | 			本项目的开源和不断完善也算是对国家的点滴回报！<br/>
17 | 			代码中许多算法的核心思想来源于我们的研究同行和先辈们的已公开成果，另外，许多使用xsimilarity的人员对xsimilarity<br/>
18 | 			提出了宝贵的建议，在此一并表示深深的谢意！     <br/>
19 |         本工程使用了如下开源组件，对原作者致以谢意！
20 |         <ul>
21 |             <li>ANSJ： </li>
22 |         </ul>
23 |     </p>
24 | 		<p>
25 | 			<h2>联系方式</h2>
26 | 			夏天<br/>
27 | 			数据工程与知识工程教育部重点实验室（中国人民大学）<br/>
28 |       中国人民大学信息资源管理学院<br/>
29 |       电话: 86-10-82500675<br/>
30 |       Email: xiat(at)ruc.edu.cn<br/>
31 | 		</p>
32 | 		
33 | 	</body>
34 | </html>


--------------------------------------------------------------------------------
/src/main/resources/data/cilin.db.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamxiatian/xsimilarity/812ec85d3341731e615c3cd91af17c1df4840853/src/main/resources/data/cilin.db.gz


--------------------------------------------------------------------------------
/src/main/resources/data/concept.xml.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamxiatian/xsimilarity/812ec85d3341731e615c3cd91af17c1df4840853/src/main/resources/data/concept.xml.gz


--------------------------------------------------------------------------------
/src/main/resources/data/sememe.xml.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamxiatian/xsimilarity/812ec85d3341731e615c3cd91af17c1df4840853/src/main/resources/data/sememe.xml.gz


--------------------------------------------------------------------------------
/src/main/resources/log4j.dtd:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8" ?>
  2 | 
  3 | <!-- Authors: Chris Taylor, Ceki Gulcu. -->
  4 | 
  5 | <!-- Version: 1.2 -->
  6 | 
  7 | <!-- A configuration element consists of optional renderer
  8 | elements,appender elements, categories and an optional root
  9 | element. -->
 10 | 
 11 | <!ELEMENT log4j:configuration (renderer*, appender*,(category|logger)*,root?,
 12 |                                categoryFactory?)>
 13 | 
 14 | <!-- The "threshold" attribute takes a level value such that all -->
 15 | <!-- logging statements with a level equal or below this value are -->
 16 | <!-- disabled. -->
 17 | 
 18 | <!-- Setting the "debug" enable the printing of internal log4j logging   -->
 19 | <!-- statements.                                                         -->
 20 | 
 21 | <!-- By default, debug attribute is "null", meaning that we not do touch -->
 22 | <!-- internal log4j logging settings. The "null" value for the threshold -->
 23 | <!-- attribute can be misleading. The threshold field of a repository	 -->
 24 | <!-- cannot be set to null. The "null" value for the threshold attribute -->
 25 | <!-- simply means don't touch the threshold field, the threshold field   --> 
 26 | <!-- keeps its old value.                                                -->
 27 |      
 28 | <!ATTLIST log4j:configuration
 29 |   xmlns:log4j              CDATA #FIXED "http://jakarta.apache.org/log4j/" 
 30 |   threshold                (all|debug|info|warn|error|fatal|off|null) "null"
 31 |   debug                    (true|false|null)  "null"
 32 | >
 33 | 
 34 | <!-- renderer elements allow the user to customize the conversion of  -->
 35 | <!-- message objects to String.                                       -->
 36 | 
 37 | <!ELEMENT renderer EMPTY>
 38 | <!ATTLIST renderer
 39 |   renderedClass  CDATA #REQUIRED
 40 |   renderingClass CDATA #REQUIRED
 41 | >
 42 | 
 43 | <!-- Appenders must have a name and a class. -->
 44 | <!-- Appenders may contain an error handler, a layout, optional parameters -->
 45 | <!-- and filters. They may also reference (or include) other appenders. -->
 46 | <!ELEMENT appender (errorHandler?, param*, layout?, filter*, appender-ref*)>
 47 | <!ATTLIST appender
 48 |   name 		ID 	#REQUIRED
 49 |   class 	CDATA	#REQUIRED
 50 | >
 51 | 
 52 | <!ELEMENT layout (param*)>
 53 | <!ATTLIST layout
 54 |   class		CDATA	#REQUIRED
 55 | >
 56 | 
 57 | <!ELEMENT filter (param*)>
 58 | <!ATTLIST filter
 59 |   class		CDATA	#REQUIRED
 60 | >
 61 | 
 62 | <!-- ErrorHandlers can be of any class. They can admit any number of -->
 63 | <!-- parameters. -->
 64 | 
 65 | <!ELEMENT errorHandler (param*, root-ref?, logger-ref*,  appender-ref?)> 
 66 | <!ATTLIST errorHandler
 67 |    class        CDATA   #REQUIRED 
 68 | >
 69 | 
 70 | <!ELEMENT root-ref EMPTY>
 71 | 
 72 | <!ELEMENT logger-ref EMPTY>
 73 | <!ATTLIST logger-ref
 74 |   ref IDREF #REQUIRED
 75 | >
 76 | 
 77 | <!ELEMENT param EMPTY>
 78 | <!ATTLIST param
 79 |   name		CDATA   #REQUIRED
 80 |   value		CDATA	#REQUIRED
 81 | >
 82 | 
 83 | 
 84 | <!-- The priority class is org.apache.log4j.Level by default -->
 85 | <!ELEMENT priority (param*)>
 86 | <!ATTLIST priority
 87 |   class   CDATA	#IMPLIED
 88 |   value	  CDATA #REQUIRED
 89 | >
 90 | 
 91 | <!-- The level class is org.apache.log4j.Level by default -->
 92 | <!ELEMENT level (param*)>
 93 | <!ATTLIST level
 94 |   class   CDATA	#IMPLIED
 95 |   value	  CDATA #REQUIRED
 96 | >
 97 | 
 98 | 
 99 | <!-- If no level element is specified, then the configurator MUST not -->
100 | <!-- touch the level of the named category. -->
101 | <!ELEMENT category (param*,(priority|level)?,appender-ref*)>
102 | <!ATTLIST category
103 |   class         CDATA   #IMPLIED
104 |   name		CDATA	#REQUIRED
105 |   additivity	(true|false) "true"  
106 | >
107 | 
108 | <!-- If no level element is specified, then the configurator MUST not -->
109 | <!-- touch the level of the named logger. -->
110 | <!ELEMENT logger (level?,appender-ref*)>
111 | <!ATTLIST logger
112 |   name		ID	#REQUIRED
113 |   additivity	(true|false) "true"  
114 | >
115 | 
116 | 
117 | <!ELEMENT categoryFactory (param*)>
118 | <!ATTLIST categoryFactory 
119 |    class        CDATA #REQUIRED>
120 | 
121 | <!ELEMENT appender-ref EMPTY>
122 | <!ATTLIST appender-ref
123 |   ref IDREF #REQUIRED
124 | >
125 | 
126 | <!-- If no priority element is specified, then the configurator MUST not -->
127 | <!-- touch the priority of root. -->
128 | <!-- The root category always exists and cannot be subclassed. -->
129 | <!ELEMENT root (param*, (priority|level)?, appender-ref*)>
130 | 
131 | 
132 | <!-- ==================================================================== -->
133 | <!--                       A logging event                                -->
134 | <!-- ==================================================================== -->
135 | <!ELEMENT log4j:eventSet (log4j:event*)>
136 | <!ATTLIST log4j:eventSet
137 |   xmlns:log4j             CDATA #FIXED "http://jakarta.apache.org/log4j/" 
138 |   version                (1.1|1.2) "1.2" 
139 |   includesLocationInfo   (true|false) "true"
140 | >
141 | 
142 | 
143 | 
144 | <!ELEMENT log4j:event (log4j:message, log4j:NDC?, log4j:throwable?, 
145 |                        log4j:locationInfo?) >
146 | 
147 | <!-- The timestamp format is application dependent. -->
148 | <!ATTLIST log4j:event
149 |     logger     CDATA #REQUIRED
150 |     level      CDATA #REQUIRED
151 |     thread     CDATA #REQUIRED
152 |     timestamp  CDATA #REQUIRED
153 | >
154 | 
155 | <!ELEMENT log4j:message (#PCDATA)>
156 | <!ELEMENT log4j:NDC (#PCDATA)>
157 | 
158 | <!ELEMENT log4j:throwable (#PCDATA)>
159 | 
160 | <!ELEMENT log4j:locationInfo EMPTY>
161 | <!ATTLIST log4j:locationInfo
162 |   class  CDATA	#REQUIRED
163 |   method CDATA	#REQUIRED
164 |   file   CDATA	#REQUIRED
165 |   line   CDATA	#REQUIRED
166 | >
167 | 


--------------------------------------------------------------------------------
/src/main/resources/log4j.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8" ?>
 2 | <!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
 3 | <log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
 4 | 
 5 |   <appender name="console-appender" class="org.apache.log4j.ConsoleAppender">
 6 |     <layout class="org.apache.log4j.PatternLayout">
 7 |       <!--<param name="ConversionPattern" value="%d [%t] %c [%-5p] - %m%n"/>-->
 8 |       <param name="ConversionPattern" value="%d %-5p %c.%M(line %L) %x: %m%n"/>
 9 |     </layout>
10 |   </appender>
11 | 
12 |   <appender name="file-appender" class="org.apache.log4j.FileAppender">
13 |     <param name="File" value="xsimilarity.log"/>
14 |     <param name="Append" value="true"/>
15 |     <layout class="org.apache.log4j.PatternLayout">
16 |       <param name="ConversionPattern" value="%d [%t] %c [%-5p] - %m%n"/>
17 |     </layout>
18 |   </appender>
19 | 
20 |   <logger name="ke">
21 |     <level value="debug"/>
22 |   </logger>
23 |   
24 |   <logger name="org">
25 |     <level value="info"/>
26 |   </logger>
27 | 
28 |   <logger name="org.apache">
29 |     <level value="error"/>
30 |   </logger>
31 |   
32 |   <root>
33 |     <priority value="error"/>
34 |     <appender-ref ref="console-appender"/>
35 |     <appender-ref ref="file-appender"/>
36 |   </root>
37 | 
38 | </log4j:configuration>
39 | 


--------------------------------------------------------------------------------
/src/test/java/ruc/irm/similarity/sentence/MorphoSimilarityTest.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.sentence;
 2 | 
 3 | import org.junit.Test;
 4 | import ruc.irm.similarity.sentence.morphology.MorphoSimilarity;
 5 | import ruc.irm.similarity.sentence.morphology.SemanticSimilarity;
 6 | 
 7 | public class MorphoSimilarityTest {
 8 | 
 9 |     @Test
10 |     public void test() {
11 |         String s1 = "一个伟大的国家，中国";
12 |         String s2 = "中国是一个伟大的国家";
13 | 
14 |         s1="修改下密码";
15 |         s2="密码修改";
16 |         MorphoSimilarity similarity = MorphoSimilarity.getInstance();
17 |         double sim = similarity.getSimilarity(s1, s2);
18 |         System.out.println("sim ==> " + sim);
19 |     }
20 | 
21 | }
22 | 


--------------------------------------------------------------------------------
/src/test/java/ruc/irm/similarity/sentence/SemanticSimilarityTest.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.sentence;
 2 | 
 3 | import org.junit.Test;
 4 | 
 5 | import ruc.irm.similarity.sentence.morphology.SemanticSimilarity;
 6 | 
 7 | public class SemanticSimilarityTest {
 8 | 
 9 |     @Test
10 |     public void test() {
11 |         String s1 = "一个伟大的国家，中国";
12 |         String s2 = "中国是一个伟大的国家";
13 | 
14 | //        s1="修改下密码";
15 | //        s2="密码修改";
16 |         SemanticSimilarity similarity = SemanticSimilarity.getInstance();
17 |         double sim = similarity.getSimilarity(s1, s2);
18 |         System.out.println("sim ==> " + sim);
19 |         
20 |         
21 |     }
22 | 
23 | }
24 | 


--------------------------------------------------------------------------------
/src/test/java/ruc/irm/similarity/statistic/DictStatisticTest.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.statistic;
 2 | 
 3 | import junit.framework.TestCase;
 4 | 
 5 | /**
 6 |  * ./db/coredict.xml.gz是利用的ictclas4j的词典文件，这个文件可以从lib/ictclas4j.jar文件中得到。
 7 |  * 即：把ictclas4j.jar文件解压开，里面的dictionary目录下有coredict.xml.gz文件。
 8 |  * 
 9 | * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
10 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
11 | */
12 | public class DictStatisticTest extends TestCase {
13 |     public void testCount(){
14 |         DictStatistic ds = new DictStatistic();
15 |         ds.testFromXml("./db/coredict.xml.gz", true);
16 |     }
17 | }
18 | 


--------------------------------------------------------------------------------
/src/test/java/ruc/irm/similarity/word/CharBasedSimilarityTest.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.word;
 2 | 
 3 | import junit.framework.TestCase;
 4 | 
 5 | public class CharBasedSimilarityTest extends TestCase {
 6 |     public void test() {
 7 |         CharBasedSimilarity sim = new CharBasedSimilarity();
 8 |         String s1 = "手机";
 9 |         String s2 = "飞机";
10 | 
11 |         assertTrue(sim.getSimilarity(s1, s2) > 0);
12 |     }
13 | }
14 | 


--------------------------------------------------------------------------------
/src/test/java/ruc/irm/similarity/word/hownet/ConceptTest.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.word.hownet;
 2 | 
 3 | import java.util.Collection;
 4 | 
 5 | import ruc.irm.similarity.word.hownet2.concept.Concept;
 6 | 
 7 | import com.google.common.collect.ArrayListMultimap;
 8 | import com.google.common.collect.HashMultimap;
 9 | import com.google.common.collect.Multimap;
10 | 
11 | public class ConceptTest {
12 | 	public static void main(String[] args) {
13 | 		Multimap<String, Concept> CONCEPTS = HashMultimap.create();
14 | //		CONCEPTS = ArrayListMultimap.create();
15 | 		
16 | 		CONCEPTS.put("打", new Concept("打", "V", "TakeOutOfWater|捞起"));
17 | 		CONCEPTS.put("打", new Concept("打", "V", "TakeOutOfWater|捞起"));
18 | 		CONCEPTS.put("打", new Concept("打", "V", "TakeOutOfWater|捞起"));
19 | 		CONCEPTS.put("打", new Concept("打", "V", "TakeOutOfWater|捞起"));
20 | 		
21 | 		Collection<Concept> collection = CONCEPTS.get("打");
22 | 		for(Concept c:collection){
23 | 			System.out.println(c);
24 | 		}
25 | 		
26 | 		Multimap<String, Integer> map = HashMultimap.create();
27 | //	map = ArrayListMultimap.create();
28 | 	
29 | 	map.put("打", 1);
30 | 	map.put("打", 1);
31 | 	map.put("打", 1);
32 | 	map.put("打", 2);
33 | 	
34 | 	Collection<Integer> cc = map.get("打");
35 | 	for(Integer i:cc){
36 | 		System.out.println(i);
37 | 	}
38 |   }
39 | }
40 | 


--------------------------------------------------------------------------------
/src/test/java/ruc/irm/similarity/word/hownet/SememeTest.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.word.hownet;
 2 | 
 3 | import java.io.InputStream;
 4 | 
 5 | import ruc.irm.similarity.util.FileUtils;
 6 | import ruc.irm.similarity.word.hownet.sememe.Sememe;
 7 | import ruc.irm.similarity.word.hownet.sememe.SememeDictTraverseEvent;
 8 | import ruc.irm.similarity.word.hownet2.sememe.XiaSememeParser;
 9 | 
10 | 
11 | /**
12 |  * 针对义原的测试
13 |  * 
14 |  * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
15 |  * @organization 中国人民大学信息资源管理学院 知识工程实验室 
16 |  */
17 | public class SememeTest {
18 | 	public static void main(String[] args) throws Exception{
19 | 		String id1 = "2-1-3-4";
20 | //		String id2 = "2-1-2";
21 | //		System.out.println(getDistance(id1, id2));
22 | //		System.out.println(getSimilarityBySememeId(id1, id2));
23 | 		
24 | 		int pos = id1.lastIndexOf("-");
25 | 		String parentId = "root";
26 | 		if(pos>0){
27 | 			parentId = id1.substring(0, pos);
28 | 		}
29 | 		System.out.println(parentId);
30 | 		new XiaSememeParser().getSimilarity("test", "hello");
31 | 	}
32 | 	
33 | 	static void saveXML() throws Exception{
34 | 		String sememeFile = Sememe.class.getPackage().getName().replaceAll("\\.", "/") + "/sememe.dat";		
35 | 		InputStream input = Sememe.class.getClassLoader().getResourceAsStream(sememeFile);
36 | 		SememeDictTraverseEvent event = new SememeDictTraverseEvent();		
37 | 		
38 | 		FileUtils.traverseLines(input, "utf8", event);
39 | 		event.saveToXML("/home/xiatian/Desktop/sememe.xml");
40 | 	}
41 | 	
42 | 	static double getSimilarityBySememeId(final String id1, final String id2) {		
43 | 		
44 | 		int position = 0;
45 | 		String[] array1 = id1.split("-");
46 | 		String[] array2 = id2.split("-");
47 | 		for (position = 0; position < array1.length && position < array2.length; position++) {
48 | 			if (!array1[position].equals(array2[position])) {
49 | 				break;
50 | 			}
51 | 		}
52 | 		
53 | 		return 2.0*position/(array1.length + array2.length);
54 | 	}
55 | 
56 | 	static int getDistance(String id1, String id2) {
57 | 		// 两个Id相同的位置终止地方
58 | 		int position = 0;
59 | 		String[] array1 = id1.split("-");
60 | 		String[] array2 = id2.split("-");
61 | 		for (position = 0; position < array1.length && position < array2.length; position++) {
62 | 			if (!array1[position].equals(array2[position])) {
63 | 				return array1.length + array2.length - position - position;
64 | 			}
65 | 		}
66 | 
67 | 		if (array1.length == array2.length) {
68 | 			return 0;
69 | 		} else if (array1.length == position) {
70 | 			return array2.length - position;
71 | 		} else {
72 | 			return array1.length - position;
73 | 		}
74 | 	}
75 | }
76 | 


--------------------------------------------------------------------------------
/src/test/java/ruc/irm/similarity/word/hownet2/HownetSimilarityTest.java:
--------------------------------------------------------------------------------
 1 | package ruc.irm.similarity.word.hownet2;
 2 | 
 3 | import junit.framework.TestCase;
 4 | 
 5 | import org.junit.Before;
 6 | import org.junit.Test;
 7 | 
 8 | import ruc.irm.similarity.word.hownet2.concept.LiuConceptParser;
 9 | import ruc.irm.similarity.word.hownet2.concept.XiaConceptParser;
10 | 
11 | public class HownetSimilarityTest  extends TestCase {
12 |     XiaConceptParser xParser = null;
13 |     LiuConceptParser lParser = null;
14 |     
15 |     @Before
16 |     public void setUp(){
17 |         xParser = XiaConceptParser.getInstance();
18 |         lParser = LiuConceptParser.getInstance();
19 |     }
20 |     
21 |     @Test
22 |     public void testWordSimiarltiy(){
23 |         String word1 = "电动车";
24 |         String word2 = "自行车";
25 |         double x_sim = xParser.getSimilarity(word1, word2);
26 |         double l_sim = lParser.getSimilarity(word1, word2);
27 |         assertTrue(x_sim>l_sim);
28 |         assertTrue(x_sim>0.2);
29 |     }
30 |     
31 |     /**
32 |      * 该词语计算相似度时出现死循环，bug由北京大学计算语言学研究所万富强提供，fqw0000@gmail.com
33 |      */
34 |     @Test
35 |     public void testWordSimiarltiy2(){
36 |         String word1 = "算法";
37 |         String word2 = "安提瓜和巴布达";
38 |         double x_sim = xParser.getSimilarity(word1, word2);
39 |         double l_sim = lParser.getSimilarity(word1, word2);
40 |         assertTrue(x_sim>=l_sim);
41 |         System.out.println("x_sim:" + x_sim);
42 |         System.out.println("l_sim:" + l_sim);
43 |         
44 |     }
45 | }
46 | 
47 | 
48 | 


--------------------------------------------------------------------------------
/中文信息相似度计算理论与方法图书目录.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamxiatian/xsimilarity/812ec85d3341731e615c3cd91af17c1df4840853/中文信息相似度计算理论与方法图书目录.pdf


--------------------------------------------------------------------------------