├── HISTORY.md
├── README.md
├── REVISION.md
├── TODO.md
├── dict
├── sentiment
│ ├── 主张词语(中文).txt
│ ├── 主张词语(英文).txt
│ ├── 正面情感词语(中文).txt
│ ├── 正面情感词语(英文).txt
│ ├── 正面评价词语(中文).txt
│ ├── 正面评价词语(英文).txt
│ ├── 程度级别词语(中文).txt
│ ├── 程度级别词语(英文).txt
│ ├── 统计结果.txt
│ ├── 负面情感词语(中文).txt
│ ├── 负面情感词语(英文).txt
│ ├── 负面评价词语(中文).txt
│ └── 负面评价词语(英文).txt
├── tendency
│ └── tendency.xml
└── user-concept.xml
├── docs
├── LCMC.zip
└── 中文信息相似度计算理论与方法图书目录.pdf
├── pom.xml
├── src
├── main
│ ├── java
│ │ └── ruc
│ │ │ └── irm
│ │ │ ├── classification
│ │ │ ├── Feature.java
│ │ │ ├── Instance.java
│ │ │ ├── NaiveBayesClassifier.java
│ │ │ └── Variable.java
│ │ │ ├── similarity
│ │ │ ├── Similaritable.java
│ │ │ ├── SimilarityFactory.java
│ │ │ ├── phrase
│ │ │ │ └── PhraseSimilarity.java
│ │ │ ├── sentence
│ │ │ │ ├── SegmentProxy.java
│ │ │ │ ├── SentenceSimilarity.java
│ │ │ │ ├── editdistance
│ │ │ │ │ ├── Block.java
│ │ │ │ │ ├── CharEditUnit.java
│ │ │ │ │ ├── ChunkEditUnit.java
│ │ │ │ │ ├── EditDistance.java
│ │ │ │ │ ├── EditUnit.java
│ │ │ │ │ ├── GregorEditDistance.java
│ │ │ │ │ ├── Split.java
│ │ │ │ │ ├── StandardEditDistance.java
│ │ │ │ │ ├── SuperString.java
│ │ │ │ │ ├── WordEditUnit.java
│ │ │ │ │ ├── XiatianEditDistance.java
│ │ │ │ │ └── XiatianEditDistance2.java
│ │ │ │ └── morphology
│ │ │ │ │ ├── MorphoSimilarity.java
│ │ │ │ │ └── SemanticSimilarity.java
│ │ │ ├── statistic
│ │ │ │ ├── DictStatistic.java
│ │ │ │ └── LCMC.java
│ │ │ ├── text
│ │ │ │ └── DiceSimilarity.java
│ │ │ ├── util
│ │ │ │ ├── About.java
│ │ │ │ ├── BlankUtils.java
│ │ │ │ ├── EditDistance.java
│ │ │ │ ├── FileUtils.java
│ │ │ │ ├── MathUtils.java
│ │ │ │ ├── PinyinUtils.java
│ │ │ │ ├── TraverseEvent.java
│ │ │ │ ├── XmlException.java
│ │ │ │ └── XmlUtils.java
│ │ │ └── word
│ │ │ │ ├── CharBasedSimilarity.java
│ │ │ │ ├── WordSimilarity.java
│ │ │ │ ├── cilin
│ │ │ │ ├── Cilin.java
│ │ │ │ ├── CilinCoding.java
│ │ │ │ └── CilinDb.java
│ │ │ │ ├── hownet
│ │ │ │ ├── Hownet.java
│ │ │ │ ├── HownetMeta.java
│ │ │ │ ├── concept
│ │ │ │ │ ├── Concept.java
│ │ │ │ │ ├── ConceptDictTraverseEvent.java
│ │ │ │ │ ├── ConceptLinkedList.java
│ │ │ │ │ ├── ConceptParser.java
│ │ │ │ │ ├── LiuConceptParser.java
│ │ │ │ │ ├── MyConceptParser.java
│ │ │ │ │ └── concept.dat
│ │ │ │ └── sememe
│ │ │ │ │ ├── FastSimpleMap.java
│ │ │ │ │ ├── LiuqunSememeParser.java
│ │ │ │ │ ├── MySememeParser.java
│ │ │ │ │ ├── Sememe.java
│ │ │ │ │ ├── SememeDictTraverseEvent.java
│ │ │ │ │ ├── SememeParser.java
│ │ │ │ │ ├── SememeType.java
│ │ │ │ │ └── sememe.dat
│ │ │ │ ├── hownet2
│ │ │ │ ├── concept
│ │ │ │ │ ├── BaseConceptParser.java
│ │ │ │ │ ├── Concept.java
│ │ │ │ │ ├── ConceptDictTraverseEvent.java
│ │ │ │ │ ├── ConceptLinkedList.java
│ │ │ │ │ ├── LiuConceptParser.java
│ │ │ │ │ └── XiaConceptParser.java
│ │ │ │ └── sememe
│ │ │ │ │ ├── BaseSememeParser.java
│ │ │ │ │ ├── LiuqunSememeParser.java
│ │ │ │ │ ├── Sememe.java
│ │ │ │ │ ├── SememeType.java
│ │ │ │ │ └── XiaSememeParser.java
│ │ │ │ └── pinyin
│ │ │ │ └── PinyinSimilarity.java
│ │ │ ├── tendency
│ │ │ └── word
│ │ │ │ ├── HownetWordTendency.java
│ │ │ │ ├── Training.java
│ │ │ │ └── WordTendency.java
│ │ │ └── ui
│ │ │ ├── PhraseSimilarityUI.java
│ │ │ ├── SememeTreeUI.java
│ │ │ ├── SentenceSimilarityUI.java
│ │ │ ├── Start.java
│ │ │ ├── TendencyUI.java
│ │ │ └── WordSimlarityUI.java
│ └── resources
│ │ ├── about.html
│ │ ├── data
│ │ ├── F02-GB2312-to-PuTongHua-PinYin.txt
│ │ ├── cilin.db.gz
│ │ ├── concept.xml.gz
│ │ └── sememe.xml.gz
│ │ ├── log4j.dtd
│ │ └── log4j.xml
└── test
│ └── java
│ └── ruc
│ └── irm
│ └── similarity
│ ├── sentence
│ ├── MorphoSimilarityTest.java
│ └── SemanticSimilarityTest.java
│ ├── statistic
│ └── DictStatisticTest.java
│ └── word
│ ├── CharBasedSimilarityTest.java
│ ├── hownet
│ ├── ConceptTest.java
│ └── SememeTest.java
│ └── hownet2
│ └── HownetSimilarityTest.java
└── 中文信息相似度计算理论与方法图书目录.pdf
/HISTORY.md:
--------------------------------------------------------------------------------
1 | 变更历史
2 | ================
3 |
4 | 2014-04: 把中文分词由原先的ictclas4j替换为ansj,在此对原作者表示感谢!把工程更改为maven工程,方便管理。
5 | 2014-08: 修正了SemanticSimilarity中的数组循环错误
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | 说明
3 | =====================
4 | 汉语词语、组块、句子以及文本篇章等各个层面的相似度计算是中文信息处理领域的一项基础而又核心的工作,它直接决定着相关领域的研究发展状况,例如,在知识工程、基于实例的机器翻译、信息检索、自动问答以及拼写检查等方面,相似度计算都是一个非常关键的问题,长期以来一直是人们研究的一个热点和难点。相似度的研究涉及词语、组块、句子以及篇章等多个层面,目前的研究主要侧重于词语方面,提出了一些比较有代表性的理论与方法,如字面相似度算法、词素相似度算法,以及基于同义词词林、知网等语义词典的方法,国外的方法则主要包括基于构成字符的相似度计算方法、基于WORDNET的计算方法、基于词典注释的方法、基于大规模语料库统计的方法和基于搜索引擎的方法;有关组块、短语级别的相似度的研究现在还比较少,常用的方法是在词语相似度计算的基础上,借用句子相似度的计算方法计算组块之间的相似度。在句子层面的相似度计算方面,国外研究主要集中在字符串的相似度计算,国内则主要以词语为基本处理单元,通过计算相同词语所占的比重确定句子之间的相似度;文本层面的则集中于利用统计方法实现相似度计算。
5 |
6 | xsimilarity项目为我们在相似度计算领域所取得的部分成果的Java代码实现,部分凌乱的代码已被去除,待重构之后再加入到工程之中。在相似度计算的研究过程中,许多研究学者的成果公布和无私帮助让我们受益匪浅,我们把代码开源出来,既是对前辈们表达我们的尊重之情,也希望能对大家共同的研究社区能有点滴贡献,能避免一些重复工作。
7 |
8 | xsimilarity项目中所体现的思想或许还比较幼稚,希望高手们能用宽容的胸襟对待,并不吝赐教,我们也将根据研究进展情况和大家的实际需求,不断改进,同时也欢迎大家加入到这个项目的开发过程中来,共同推进相似度计算在中国的研究。
9 |
10 | xsimilarity项目中的理论知识大家可以参考docs目录下的文章,以及《中文信息相似度计算理论与方法》一书,重要的参考资料、程序资源在书中已经提到,如有需要,我们在今后将单独整理成列表,供大家参考。
11 |
12 | 大家可以通过Eclipse导入项目,并运行ruc.irm.ui.Start进行快速测试。
13 |
14 | 联系方式:xiat(at)ruc.edu.cn
15 |
16 |
17 | 编译运行
18 | =======================
19 | 首先确保系统中安装maven.
20 |
21 | 如果要生成Intellij IDEA的工程文件,请进入命令行,在项目主目录下执行:
22 |
23 | ```mvn idea:idea```
24 |
25 | 如要生成eclipse的工程文件,则执行:
26 |
27 | ```mvn eclipse:eclipse```
28 |
29 | 要编译代码并在命令行运行测试:
30 |
31 | ```mvn compile```
32 |
33 | ```mvn dependency:copy-dependencies```
34 |
35 | ```./run.py Start```
36 |
37 | 即可打开主界面,进行测试
38 |
39 | (注:开发测试所用的操作系统为Ubuntu,如为Windows,请自行修改run.py脚本)
40 |
41 |
42 |
43 | 设想
44 | ========================
45 | 尝试把潜在和显性语义分析技术加入到xsimilarity中,并简化使用方式,方便初学者使用,但因个人精力受限,目前尚未开始集成处理。
46 |
47 | 定个时间点:如果star数量超过500,再开始更新并把最近几年的相关研究成果集成进去。
48 |
49 |
50 | 欢迎有兴趣的人员与我联系,一起扩展xsimilarity的功能和实用性。
51 |
52 |
53 |
54 | 致谢
55 | ========================
56 | ansj中文分词
57 |
58 |
--------------------------------------------------------------------------------
/REVISION.md:
--------------------------------------------------------------------------------
1 | 错误修订
2 | =====================
3 |
4 | 1. 第三章概念词语的相似度计算部分的公式:
5 | $Sim(C_1, C_2) = \beta_1 Sim_1(C_1, C_2) + \sum_{i=2}^{4} \beta_1 \beta_i Sim_i(C_1, C_2)$
6 | 应为: $Sim(C_1, C_2) = \beta_1 Sim_1(C_1, C_2) + \sum_{i=2}^{4} Sim_1(C_1, C_2) \beta_i Sim_i(C_1, C_2)$
7 | 可参考以下代码实现:
8 | @Override
9 | protected double calculate(double sim_v1, double sim_v2, double sim_v3, double sim_v4) {
10 | return beta1 * sim_v1 + beta2 * sim_v1 * sim_v2 + beta3 * sim_v1 * sim_v3 + beta4 * sim_v1 * sim_v4;
11 | }
12 |
--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
1 | PLAN
2 | =============================
3 |
4 | * 加入ESA和LSA处理,这两部分已经单独实现,但都比较复杂,如有精力和时间,考虑把ESA的某个快照结果打包,加入xsimilarity
--------------------------------------------------------------------------------
/dict/sentiment/主张词语(中文).txt:
--------------------------------------------------------------------------------
1 | 中文主张词语 38
2 |
3 | 1. {perception|感知} 22
4 | 察觉
5 | 触目
6 | 耳闻
7 | 发
8 | 发觉
9 | 发现
10 | 风闻
11 | 感
12 | 感觉
13 | 感觉到
14 | 感受到
15 | 见到
16 | 见得
17 | 觉
18 | 觉得
19 | 看得出来
20 | 窥见
21 | 领教
22 | 听说
23 | 痛感
24 | 预感
25 | 自觉
26 |
27 | 2. {regard|认为} 16
28 | 抱定
29 | 当
30 | 道
31 | 感到
32 | 感觉
33 | 觉得
34 | 看
35 | 看待
36 | 论
37 | 认定
38 | 认为
39 | 认准
40 | 想
41 | 相信
42 | 以为
43 | 主张
44 |
--------------------------------------------------------------------------------
/dict/sentiment/主张词语(英文).txt:
--------------------------------------------------------------------------------
1 | 英文主张词语 35
2 |
3 | 1. {perception|感知} 21
4 | be aware of
5 | be conscious
6 | be conscious of
7 | be told
8 | become aware of
9 | detect
10 | discern
11 | discover
12 | feel
13 | find
14 | get a glimpse of
15 | get wind of
16 | have a premonition
17 | hear of
18 | keenly feel
19 | learn through hearsay
20 | meet the eye
21 | notice
22 | perceive
23 | see
24 | sense
25 |
26 | 2. {regard|认为} 14
27 | advocate
28 | believe
29 | consider
30 | feel
31 | firmly believe
32 | hold
33 | look upon
34 | maintain
35 | regard
36 | sense
37 | set one's mind on
38 | stand for
39 | suppose
40 | think
--------------------------------------------------------------------------------
/dict/sentiment/程度级别词语(中文).txt:
--------------------------------------------------------------------------------
1 | 中文程度级别词语 219
2 |
3 | 1. “极其|extreme / 最|most” 69
4 | 百分之百
5 | 倍加
6 | 备至
7 | 不得了
8 | 不堪
9 | 不可开交
10 | 不亦乐乎
11 | 不折不扣
12 | 彻头彻尾
13 | 充分
14 | 到头
15 | 地地道道
16 | 非常
17 | 极
18 | 极度
19 | 极端
20 | 极其
21 | 极为
22 | 截然
23 | 尽
24 | 惊人地
25 | 绝
26 | 绝顶
27 | 绝对
28 | 绝对化
29 | 刻骨
30 | 酷
31 | 满
32 | 满贯
33 | 满心
34 | 莫大
35 | 奇
36 | 入骨
37 | 甚为
38 | 十二分
39 | 十分
40 | 十足
41 | 死
42 | 滔天
43 | 痛
44 | 透
45 | 完全
46 | 完完全全
47 | 万
48 | 万般
49 | 万分
50 | 万万
51 | 无比
52 | 无度
53 | 无可估量
54 | 无以复加
55 | 无以伦比
56 | 要命
57 | 要死
58 | 已极
59 | 已甚
60 | 异常
61 | 逾常
62 | 贼
63 | 之极
64 | 之至
65 | 至极
66 | 卓绝
67 | 最为
68 | 佼佼
69 | 郅
70 | 綦
71 | 齁
72 | 最
73 |
74 | 2. “很|very” 42
75 | 不过
76 | 不少
77 | 不胜
78 | 惨
79 | 沉
80 | 沉沉
81 | 出奇
82 | 大为
83 | 多
84 | 多多
85 | 多加
86 | 多么
87 | 分外
88 | 格外
89 | 够瞧的
90 | 够戗
91 | 好
92 | 好不
93 | 何等
94 | 很
95 | 很是
96 | 坏
97 | 可
98 | 老
99 | 老大
100 | 良
101 | 颇
102 | 颇为
103 | 甚
104 | 实在
105 | 太
106 | 太甚
107 | 特
108 | 特别
109 | 尤
110 | 尤其
111 | 尤为
112 | 尤以
113 | 远
114 | 着实
115 | 曷
116 | 碜
117 |
118 | 3. “较|more” 37
119 | 大不了
120 | 多
121 | 更
122 | 更加
123 | 更进一步
124 | 更为
125 | 还
126 | 还要
127 | 较
128 | 较比
129 | 较为
130 | 进一步
131 | 那般
132 | 那么
133 | 那样
134 | 强
135 | 如斯
136 | 益
137 | 益发
138 | 尤甚
139 | 逾
140 | 愈
141 | 愈 ... 愈
142 | 愈发
143 | 愈加
144 | 愈来愈
145 | 愈益
146 | 远远
147 | 越 ... 越
148 | 越发
149 | 越加
150 | 越来越
151 | 越是
152 | 这般
153 | 这样
154 | 足
155 | 足足
156 |
157 | 4. “稍|-ish” 29
158 | 点点滴滴
159 | 多多少少
160 | 怪
161 | 好生
162 | 还
163 | 或多或少
164 | 略
165 | 略加
166 | 略略
167 | 略微
168 | 略为
169 | 蛮
170 | 稍
171 | 稍稍
172 | 稍微
173 | 稍为
174 | 稍许
175 | 挺
176 | 未免
177 | 相当
178 | 些
179 | 些微
180 | 些小
181 | 一点
182 | 一点儿
183 | 一些
184 | 有点
185 | 有点儿
186 | 有些
187 |
188 | 5. “欠|insufficiently” 12
189 | 半点
190 | 不大
191 | 不丁点儿
192 | 不甚
193 | 不怎么
194 | 聊
195 | 没怎么
196 | 轻度
197 | 弱
198 | 丝毫
199 | 微
200 | 相对
201 |
202 | 6. “超|over” 30
203 | 不为过
204 | 超
205 | 超额
206 | 超外差
207 | 超微结构
208 | 超物质
209 | 出头
210 | 多
211 | 浮
212 | 过
213 | 过度
214 | 过分
215 | 过火
216 | 过劲
217 | 过了头
218 | 过猛
219 | 过热
220 | 过甚
221 | 过头
222 | 过于
223 | 过逾
224 | 何止
225 | 何啻
226 | 开外
227 | 苦
228 | 老
229 | 偏
230 | 强
231 | 溢
232 | 忒
233 |
234 |
235 |
236 |
--------------------------------------------------------------------------------
/dict/sentiment/程度级别词语(英文).txt:
--------------------------------------------------------------------------------
1 | 英文程度级别词语 170
2 |
3 | 1. “极其|extreme / 最|most” 64
4 | 100 percent
5 | absolute
6 | absolutely
7 | alarmingly
8 | amazingly
9 | as fully as possible
10 | astonishingly
11 | awfully
12 | beyond challenge
13 | beyond compare
14 | beyond comparison
15 | beyond measure
16 | bitterly
17 | by all means
18 | completely
19 | deep-rooted
20 | deep-seated
21 | deeply
22 | definitely
23 | disastrously
24 | downright
25 | entirely
26 | exceedingly
27 | excessively
28 | extreme
29 | extremely
30 | fully
31 | greatest
32 | greatly
33 | heinous
34 | hundred-percent
35 | immensely
36 | immoderate
37 | in a penetrating way
38 | in every possible way
39 | in the extreme
40 | incomparably
41 | ingrained
42 | matchlessly
43 | monstrous
44 | most
45 | of the highest degree
46 | out-and-out
47 | outstanding
48 | outstandingly
49 | reach the limit
50 | right-down
51 | sharply
52 | sheer
53 | superb
54 | terribly
55 | to death
56 | to the full
57 | to the letter
58 | to the limit
59 | to the marrow
60 | to the utmost
61 | totally
62 | towering
63 | unusually
64 | utmost
65 | utterly
66 | very much
67 | most
68 |
69 | 2. “很|very” 25
70 | a lot
71 | awfully
72 | badly
73 | better
74 | by far
75 | considerably
76 | deep
77 | disastrously
78 | especially
79 | extraordinarily
80 | extremely
81 | greatly
82 | how
83 | however
84 | indeed
85 | much
86 | particularly
87 | really
88 | terribly
89 | to a serious degree
90 | too far
91 | too much
92 | unusually
93 | very
94 | what a
95 |
96 | 3. “较|more” 22
97 | all the more
98 | as much as
99 | at the worst
100 | by far
101 | comparatively
102 | even more
103 | further
104 | further more
105 | in that way
106 | increasingly
107 | like that
108 | more
109 | more and more
110 | more so
111 | much more
112 | plus
113 | relatively
114 | slightly more
115 | so
116 | still more
117 | such
118 | the more ... the more
119 |
120 | 4. “稍|-ish” 15
121 | a bit
122 | a bit too
123 | a little
124 | a little bit
125 | a little more
126 | fairly
127 | more or less
128 | passably
129 | pretty
130 | quite
131 | rather
132 | slightly
133 | some
134 | somewhat
135 | to some extent
136 |
137 | 5. “欠|insufficiently” 11
138 | a little less
139 | just
140 | light
141 | merely
142 | not particularly
143 | not too
144 | not very
145 | relative
146 | slight
147 | slightest degree of
148 | slightly
149 |
150 | 6. “超|over” 33
151 | a little over
152 | above
153 | above measure
154 | above quota
155 | and more
156 | excessive
157 | excessively
158 | exorbitance
159 | extra
160 | far more than
161 | hyperphysical
162 | inflated
163 | inordinate
164 | not too much
165 | odd
166 | outrageousness
167 | over
168 | over-
169 | overdone
170 | overheated
171 | plus
172 | slightly more
173 | super
174 | superheated
175 | superheterodyne
176 | surplus
177 | to a fault
178 | too
179 | too much
180 | ultra
181 | ultrastructural
182 | undue
183 | unduly
184 |
185 |
186 |
--------------------------------------------------------------------------------
/dict/tendency/tendency.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
--------------------------------------------------------------------------------
/dict/user-concept.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/docs/LCMC.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamxiatian/xsimilarity/812ec85d3341731e615c3cd91af17c1df4840853/docs/LCMC.zip
--------------------------------------------------------------------------------
/docs/中文信息相似度计算理论与方法图书目录.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamxiatian/xsimilarity/812ec85d3341731e615c3cd91af17c1df4840853/docs/中文信息相似度计算理论与方法图书目录.pdf
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 | ruc.irm
5 | xsimilarity
6 | jar
7 | xsimilarity
8 | 0.1
9 | xsimilarity
10 | https://github.com/iamxiatian/xsimilarity
11 |
12 |
13 | The Apache Software License, Version 2.0
14 | http://www.apache.org/licenses/LICENSE-2.0.txt
15 | repo
16 |
17 |
18 |
19 |
20 |
21 |
22 | cengtral
23 | http://repo1.maven.org/maven2/
24 |
25 |
26 |
27 |
28 |
29 | summer
30 | summer
31 | xiat(at)ruc.edu.cn
32 |
33 |
34 |
35 |
36 | 1.8
37 | 1.8
38 | UTF-8
39 | 3.3.1
40 | 1.7.1
41 | 1.2.3
42 |
43 |
44 |
45 |
46 | org.slf4j
47 | slf4j-api
48 | ${slf4j.version}
49 |
50 |
51 |
52 | ch.qos.logback
53 | logback-core
54 | ${logback.version}
55 |
56 |
57 |
58 | ch.qos.logback
59 | logback-classic
60 | ${logback.version}
61 |
62 |
63 |
64 | org.apache.commons
65 | commons-lang3
66 | ${commons.lang3.version}
67 |
68 |
69 |
70 |
71 | com.google.guava
72 | guava
73 | 23.5-jre
74 |
75 |
76 |
77 | org.ansj
78 | ansj_seg
79 | 5.1.1
80 |
81 |
82 |
83 | junit
84 | junit
85 | 4.12
86 | test
87 |
88 |
89 |
90 |
91 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/classification/Feature.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.classification;
2 |
3 | import java.io.DataInput;
4 | import java.io.DataOutput;
5 | import java.io.IOException;
6 | import java.util.HashMap;
7 | import java.util.Map;
8 |
9 | /**
10 | * 文档的特征
11 | *
12 | * @author xiatian
13 | *
14 | */
15 | public class Feature {
16 | /** 每个关键词在不同类别中出现的文档数量 */
17 | private Map docCountMap = new HashMap();
18 | /** 特征名称 */
19 | private String name;
20 |
21 | public String getName() {
22 | return name;
23 | }
24 | public void setName(String name) {
25 | this.name = name;
26 | }
27 | public void incDocCount(String category){
28 | if(docCountMap.containsKey(category)){
29 | docCountMap.put(category, docCountMap.get(category)+1);
30 | }else{
31 | docCountMap.put(category, 1);
32 | }
33 | }
34 | public int getDocCount(String category){
35 | if(docCountMap.containsKey(category)){
36 | return docCountMap.get(category);
37 | }else{
38 | return 0;
39 | }
40 | }
41 |
42 | public void write(DataOutput out) throws IOException{
43 | out.writeUTF(name==null?"":name);
44 |
45 | out.writeInt(docCountMap.size());
46 | for(String category:docCountMap.keySet()){
47 | out.writeUTF(category);
48 | out.writeInt(docCountMap.get(category));
49 | }
50 | }
51 |
52 | public void readFields(DataInput in) throws IOException {
53 | this.name = in.readUTF();
54 |
55 | docCountMap = new HashMap();
56 | int size = in.readInt();
57 | for(int i=0; i bag = new HashSet();
26 |
27 | public Instance() {
28 | }
29 |
30 | public Instance(String category, File f, String encoding) {
31 | this.category = category;
32 | String line = null;
33 |
34 | try {
35 | BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(f), encoding));
36 |
37 | while ((line = in.readLine()) != null) {
38 | System.out.println(line);
39 | List words = SegmentProxy.segment(line);
40 | for(Word w:words) {
41 | if (w.getPos().endsWith("adj")
42 | || w.getPos().startsWith("n")
43 | || w.getPos().startsWith("v")) {
44 | bag.add(w.getWord());
45 | }
46 | }
47 | }
48 | } catch (IOException e) {
49 | System.out.println("current file:" + f.getAbsolutePath());
50 | System.out.println("current line:" + line);
51 | e.printStackTrace();
52 | }
53 | }
54 |
55 | public String getCategory() {
56 | return category;
57 | }
58 |
59 | public void setCategory(String category) {
60 | this.category = category;
61 | }
62 |
63 | public Set getWords() {
64 | return bag;
65 | }
66 |
67 | }
68 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/classification/NaiveBayesClassifier.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.classification;
2 |
3 | import java.io.DataInputStream;
4 | import java.io.DataOutput;
5 | import java.io.DataOutputStream;
6 | import java.io.File;
7 | import java.io.FileInputStream;
8 | import java.io.FileOutputStream;
9 | import java.io.IOException;
10 | import java.util.Collection;
11 | import java.util.HashMap;
12 | import java.util.Map;
13 |
14 | public class NaiveBayesClassifier {
15 | /**
16 | * 记录每个类别下出现的文档数量, 用于计算P(C)使用
17 | */
18 | Variable VARIABLE = new Variable();
19 |
20 | /**
21 | * 词语在所有类别中的总数量
22 | */
23 | Map TERM_TOTAL_COUNT = new HashMap();
24 |
25 | /**
26 | * 训练一篇文档
27 | * @param doc
28 | */
29 | public void training(Instance doc) {
30 | VARIABLE.addInstance(doc);
31 | }
32 |
33 | /**
34 | * 保存训练结果
35 | * @throws IOException
36 | */
37 | void save(File file) throws IOException{
38 | DataOutput out = new DataOutputStream(new FileOutputStream(file));
39 | VARIABLE.write(out);
40 | }
41 |
42 | public void load(File file) throws IOException{
43 | DataInputStream in = new DataInputStream(new FileInputStream(file));
44 | VARIABLE = Variable.read(in);
45 | }
46 |
47 | /**
48 | * 计算P(C)
49 | * @param category
50 | * @return
51 | */
52 | public double getCategoryProbability(String category){
53 | return Math.log(VARIABLE.getDocCount(category)*1.0f/VARIABLE.getDocCount());
54 | }
55 |
56 | /**
57 | * 计算P(feature|cateogry),返回的是取对数后的数值
58 | * @param feature
59 | * @param category
60 | * @return
61 | */
62 | public double getFeatureProbability(String feature, String category){
63 | int m = VARIABLE.getFeatureCount();
64 | return Math.log((VARIABLE.getDocCount(feature, category)+1.0)/(VARIABLE.getDocCount(category)+m));
65 | }
66 |
67 | /**
68 | * 计算给定实例文档属于指定类别的概率,返回的是取对数后的数值
69 | * @param category
70 | * @param doc
71 | * @return
72 | */
73 | public double getProbability(String category, Instance doc) {
74 | double result = getCategoryProbability(category);
75 | for(String feature:doc.getWords()){
76 | if(VARIABLE.containFeature(feature)){
77 | result += getFeatureProbability(feature, category);
78 | }
79 | }
80 | return result;
81 | }
82 |
83 | public String getCategory(Instance doc){
84 | Collection categories = VARIABLE.getCategories();
85 | double best = Double.NEGATIVE_INFINITY;
86 | String bestName = null;
87 | for(String c:categories){
88 | double current = getProbability(c, doc);
89 | // System.out.println(c + ":" + current);
90 | if(best categoryMap = new HashMap();
19 |
20 | Map features = new HashMap();
21 |
22 | /** 所有文档的数量 */
23 | private int docCount = 0;
24 |
25 | public void write(DataOutput out) throws IOException{
26 | //保存文档总数
27 | out.writeInt(docCount);
28 |
29 | //写入类别总数
30 | out.writeInt(categoryMap.size());
31 | for(String category:categoryMap.keySet()){
32 | out.writeUTF(category);
33 | categoryMap.get(category).write(out);
34 | }
35 |
36 | //写入Feature总数
37 | out.writeInt(features.size());
38 | for(String key:features.keySet()){
39 | out.writeUTF(key);
40 | features.get(key).write(out);
41 | }
42 | }
43 |
44 | public void readFields(DataInput in) throws IOException {
45 | this.docCount = in.readInt();
46 |
47 | int size = in.readInt();
48 | categoryMap = new HashMap();
49 | for(int i=0; i();
57 | for(int i=0; i getCategories(){
71 | return categoryMap.keySet();
72 | }
73 |
74 | public int getFeatureCount(){
75 | return features.size();
76 | }
77 |
78 | public boolean containFeature(String feature){
79 | return features.containsKey(feature);
80 | }
81 |
82 | public void incDocCount(){
83 | this.docCount++;
84 | }
85 |
86 | public int getDocCount(){
87 | return this.docCount;
88 | }
89 |
90 | /**
91 | * 获取置顶类别下的文档数量
92 | * @param category
93 | * @return
94 | */
95 | public int getDocCount(String category){
96 | return categoryMap.get(category).getDocCount();
97 | }
98 |
99 | /**
100 | * 获取feature在指定类别下的文档出现数量
101 | * @param feature
102 | * @param category
103 | * @return
104 | */
105 | public int getDocCount(String feature, String category){
106 | Feature f = features.get(feature);
107 | if(f!=null){
108 | return f.getDocCount(category);
109 | }
110 | return 0;
111 | }
112 |
113 | public void addInstance(Instance instance){
114 | incDocCount();
115 | CategoryInfo info = null;
116 | if(categoryMap.containsKey(instance.getCategory())){
117 | info = categoryMap.get(instance.getCategory());
118 | }else{
119 | info = new CategoryInfo();
120 | }
121 | info.incDocCount();
122 | categoryMap.put(instance.getCategory(), info);
123 |
124 | for(String word:instance.getWords()){
125 | Feature feature = features.get(word);
126 |
127 | if(feature==null) feature = new Feature();
128 |
129 | feature.setName(word);
130 | feature.incDocCount(instance.getCategory());
131 |
132 | features.put(word, feature);
133 | }
134 | }
135 |
136 | public static class CategoryInfo {
137 | private int docCount;
138 |
139 | public int getDocCount() {
140 | return docCount;
141 | }
142 | public void incDocCount(){
143 | this.docCount++;
144 | }
145 | public void setDocCount(int docCount) {
146 | this.docCount = docCount;
147 | }
148 |
149 | public void write(DataOutput out) throws IOException{
150 | out.writeInt(docCount);
151 | }
152 |
153 | public void readFields(DataInput in) throws IOException {
154 | this.docCount = in.readInt();
155 | }
156 |
157 | public static CategoryInfo read(DataInput in) throws IOException{
158 | CategoryInfo c = new CategoryInfo();
159 | c.readFields(in);
160 | return c;
161 | }
162 | }
163 | }
164 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/Similaritable.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity;
2 |
3 | /**
4 | * 可以计算相似度的接口
5 | *
6 | * @author 夏天
7 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
8 | */
9 | public interface Similaritable {
10 | /**
11 | * 计算两个字符串的相似度,对于句子来说,计算的是句子相似度,对于词语则计算词语的相似度
12 | * @param item1 参与相似度计算的第一个字符串
13 | * @param item2 参与相似度计算的第二个字符串
14 | * @return
15 | */
16 | public double getSimilarity(String item1, String item2);
17 | }
18 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/SimilarityFactory.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity;
2 |
3 | import ruc.irm.similarity.sentence.SentenceSimilarity;
4 | import ruc.irm.similarity.sentence.morphology.MorphoSimilarity;
5 | import ruc.irm.similarity.word.WordSimilarity;
6 | import ruc.irm.similarity.word.hownet2.concept.XiaConceptParser;
7 |
8 | public class SimilarityFactory {
9 | private static WordSimilarity wordSimilarity = XiaConceptParser.getInstance();
10 | private static SentenceSimilarity sentenceSimilarity = MorphoSimilarity.getInstance();
11 |
12 | private SimilarityFactory(){}
13 |
14 | public static WordSimilarity getWordSimilarity(){
15 | return wordSimilarity;
16 | }
17 |
18 | public static SentenceSimilarity getSentenceSimilarity(){
19 | return sentenceSimilarity;
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/phrase/PhraseSimilarity.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.phrase;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 |
6 | import ruc.irm.similarity.Similaritable;
7 |
8 | /**
9 | * 一种简单的短语相似度计算方法,算法原理请参考《中文信息相似度计算理论与方法》一书P69.
10 | *
11 | * @author 夏天
12 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
13 | */
14 | public class PhraseSimilarity implements Similaritable {
15 |
16 | @Override
17 | public double getSimilarity(String item1, String item2) {
18 | return (getSC(item1, item2) + getSC(item2, item1)) / 2.0;
19 | }
20 |
21 | public List getC(String first, String second, int pos) {
22 | List results = new ArrayList();
23 | char ch = first.charAt(pos);
24 | for (int i = 0; i < second.length(); i++) {
25 | if (ch == second.charAt(i)) {
26 | results.add(i);
27 | }
28 | }
29 | return results;
30 | }
31 |
32 | public int getDistance(String first, String second, int pos) {
33 | int d = second.length();
34 | for (int k : getC(first, second, pos)) {
35 | int value = Math.abs(k - pos);
36 | if (d > value) {
37 | d = value;
38 | }
39 | }
40 |
41 | return d;
42 | }
43 |
44 | public double getCC(String first, String second, int pos) {
45 | return (second.length() - getDistance(first, second, pos)) * 1.0 / second.length();
46 | }
47 |
48 | public double getSC(String first, String second) {
49 | double total = 0.0;
50 | for (int i = 0; i < first.length(); i++) {
51 | total = total + getCC(first, second, i);
52 | }
53 | return total / first.length();
54 | }
55 |
56 | }
57 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/sentence/SegmentProxy.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.sentence;
2 |
3 | import java.awt.BorderLayout;
4 | import java.awt.GridLayout;
5 | import java.awt.event.ActionEvent;
6 | import java.awt.event.ActionListener;
7 | import java.util.ArrayList;
8 | import java.util.List;
9 |
10 | import javax.swing.BorderFactory;
11 | import javax.swing.JButton;
12 | import javax.swing.JLabel;
13 | import javax.swing.JPanel;
14 | import javax.swing.JScrollPane;
15 | import javax.swing.JTextArea;
16 | import javax.swing.JTextField;
17 |
18 | import org.ansj.domain.Result;
19 | import org.ansj.domain.Term;
20 | import org.ansj.splitWord.analysis.ToAnalysis;
21 |
22 | /**
23 | * 对词法分析程序的封装代理,目前内部封装了对Ictclas4j(夏天改进版)的调用
24 | * 为方便演示程序快速启动,对Segment的调用采用了单例模式,实现需要时的延迟加载。
25 | *
26 | * @CHANGE 2014/04/04 采用Ansj词法分析器取代Ictclas4j-summer version
27 | *
28 | * @author 夏天
29 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
30 | */
31 | public class SegmentProxy {
32 |
33 | public static class Word {
34 | /**
35 | * 词语内容
36 | */
37 | private String word;
38 | /**
39 | * 词语词性代号
40 | */
41 | private String pos;
42 |
43 | public Word(String word, String pos) {
44 | this.word = word;
45 | this.pos = pos;
46 | }
47 |
48 | public String getWord() {
49 | return word;
50 | }
51 |
52 | public void setWord(String word) {
53 | this.word = word;
54 | }
55 |
56 | public String getPos() {
57 | return pos;
58 | }
59 |
60 | public void setPos(String pos) {
61 | this.pos = pos;
62 | }
63 | }
64 |
65 | public static List segment(String sentence) {
66 | List results = new ArrayList();
67 | Result terms = ToAnalysis.parse(sentence);
68 |
69 | for (Term term : terms) {
70 | results.add(new Word(term.getName(), term.natrue().natureStr));
71 | }
72 |
73 | return results;
74 | }
75 |
76 | public static String getSegmentedString(String sentence) {
77 | List words = segment(sentence);
78 | StringBuilder sb = new StringBuilder();
79 | for (Word word : words) {
80 | sb.append(word.getWord() + "/" + word.getPos()).append(" ");
81 | }
82 | return sb.toString();
83 | }
84 |
85 | public static JPanel createPanel() {
86 | //声明总的大面板, fullPanel包括一个NorthPanel和一个centerPanel
87 | JPanel fullPanel = new JPanel();
88 | fullPanel.setLayout(new BorderLayout());
89 |
90 | JPanel northPanel = new JPanel();
91 | fullPanel.add(northPanel, "North");
92 |
93 | //centerPanel包括了一个文本框
94 | JPanel centerPanel = new JPanel();
95 | fullPanel.add(centerPanel, "Center");
96 | centerPanel.setLayout(new BorderLayout());
97 | final JTextArea result = new JTextArea();
98 | //result.setFont(new Font("宋体", Font.PLAIN, 16));
99 | result.setLineWrap(true);
100 | JScrollPane centerScrollPane = new JScrollPane(result);
101 | centerPanel.add(centerScrollPane, "Center");
102 |
103 | northPanel.setLayout(new GridLayout(1, 1));
104 |
105 | //以下加入northPanel中的第一个面板
106 | final JTextField senField = new JTextField("什么是计算机病毒");
107 | senField.setColumns(50);
108 |
109 | JPanel mainPanel = new JPanel();
110 | mainPanel.setLayout(new GridLayout(2, 1));
111 |
112 | JPanel linePanel = new JPanel();
113 | linePanel.add(new JLabel("句子:"));
114 | linePanel.add(senField);
115 | mainPanel.add(linePanel);
116 |
117 | linePanel = new JPanel();
118 | JButton goButton = new JButton("词法分析");
119 | linePanel.add(goButton);
120 | mainPanel.add(linePanel);
121 | goButton.addActionListener(new ActionListener() {
122 |
123 | @Override
124 | public void actionPerformed(ActionEvent e) {
125 | String sentence = senField.getText();
126 | String text = "[" + sentence + "]的词法分析结果为:";
127 |
128 | text = text + "\n" + getSegmentedString(sentence);
129 | text = text + "\n________________________________\n" + result.getText();
130 | result.setText(text);
131 | }
132 |
133 | });
134 | mainPanel.setBorder(BorderFactory.createEtchedBorder());
135 | northPanel.add(mainPanel);
136 |
137 | return fullPanel;
138 | }
139 | }
140 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/sentence/SentenceSimilarity.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.sentence;
2 |
3 | import ruc.irm.similarity.Similaritable;
4 |
5 | public interface SentenceSimilarity extends Similaritable { // Marker type for sentence-level similarity measures; it declares no members of its own beyond what Similaritable provides.
6 |
7 | }
8 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/sentence/editdistance/Block.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.sentence.editdistance;
2 |
3 |
4 |
5 | /**
6 |  * One node of a doubly linked list of consecutive pieces of a {@code SuperString}.
7 |  * {@link #divide(int, int)} cuts a range out of a node, flags that range as divided and
8 |  * links whatever is left over in as new, unflagged neighbours.
9 |  */
10 | public class Block {
11 |
12 |     /** Offset of this block's first element, counted from the start of the undivided string. */
13 |     private int globalPosition;
14 |     /** Content of this block. */
15 |     private SuperString data;
16 |     /** Previous and next block in the list; {@code null} at either end. */
17 |     private Block prev, next;
18 |     /** Whether this block has already been divided. */
19 |     private boolean divideFlag = false;
20 |
21 |     /** Creates a block holding {@code string} at global position 0. */
22 |     public Block(SuperString string){
23 |         this.data = string;
24 |         this.globalPosition = 0;
25 |     }
26 |
27 |     /** Creates a block holding {@code string} whose first element sits at {@code globalBegin}. */
28 |     public Block(SuperString string, int globalBegin){
29 |         this.data = string;
30 |         this.globalPosition = globalBegin;
31 |     }
32 |
33 |     public int getGlobalPosition() {
34 |         return globalPosition;
35 |     }
36 |
37 |     public void setGlobalPosition(int globalPosition) {
38 |         this.globalPosition = globalPosition;
39 |     }
40 |
41 |     public SuperString getData() {
42 |         return data;
43 |     }
44 |
45 |     public void setData(SuperString data) {
46 |         this.data = data;
47 |     }
48 |
49 |     public Block getPrev() {
50 |         return prev;
51 |     }
52 |
53 |     public void setPrev(Block prev) {
54 |         this.prev = prev;
55 |     }
56 |
57 |     public Block getNext() {
58 |         return next;
59 |     }
60 |
61 |     public void setNext(Block next) {
62 |         this.next = next;
63 |     }
64 |
65 |     public boolean isDivideFlag() {
66 |         return divideFlag;
67 |     }
68 |
69 |     public void setDivideFlag(boolean divideFlag) {
70 |         this.divideFlag = divideFlag;
71 |     }
72 |
73 |     /**
74 |      * Flags the range {@code [start, start + length)} of this block as divided.
75 |      * <p>
76 |      * This block shrinks to exactly that range. Content before the range becomes a new block
77 |      * linked in front of this one, content after it a new block linked behind; both new
78 |      * blocks stay unflagged, and every block carries the global position of its own first element.
79 |      *
80 |      * @param start  index within this block's data where the divided range begins
81 |      * @param length number of elements in the divided range
82 |      */
83 |     public void divide(int start, int length){
84 |         if(start==0 && length==data.length()){
85 |             // The range is the whole block: nothing to split off.
86 |             this.divideFlag = true;
87 |             return;
88 |         }else if(start==0){
89 |             // The leading part is divided; the remainder becomes an undivided tail.
90 |             // The tail begins `length` elements after this block's first element.
91 |             Block tail = new Block(data.substring(length), globalPosition + length);
92 |             this.setDivideFlag(true);
93 |             this.setData(data.substring(0, length));
94 |             tail.next = this.next;
95 |             if(tail.next!=null) tail.next.prev = tail;
96 |             this.next = tail;
97 |             tail.prev = this;
98 |         }else if(start+length == data.length()){
99 |             // The trailing part is divided; the leading remainder becomes an undivided head.
100 |             Block head = new Block(data.substring(0, start), globalPosition);
101 |
102 |             this.setDivideFlag(true);
103 |             this.setData(data.substring(start));
104 |             // This block now begins where the divided range begins.
105 |             this.setGlobalPosition(globalPosition + start);
106 |
107 |             head.prev = this.prev;
108 |             if(head.prev!=null) head.prev.next = head;
109 |             head.next = this;
110 |             this.prev = head;
111 |         }else{
112 |             // The middle part is divided; undivided remainders are left on both sides.
113 |             Block head = new Block(data.substring(0, start), globalPosition);
114 |             Block tail = new Block(data.substring(start+length), globalPosition + start+length);
115 |
116 |             this.setDivideFlag(true);
117 |             this.setData(data.substring(start, start+length));
118 |             this.setGlobalPosition(globalPosition + start);
119 |
120 |             head.prev = this.prev;
121 |             if(head.prev!=null) head.prev.next = head;
122 |             head.next = this;
123 |             this.prev = head;
124 |
125 |             tail.next = this.next;
126 |             if(tail.next!=null) tail.next.prev = tail;
127 |             this.next = tail;
128 |             tail.prev = this;
129 |         }
130 |     }
131 | }
132 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/sentence/editdistance/CharEditUnit.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.sentence.editdistance;
2 |
3 | public class CharEditUnit extends EditUnit {
4 |     /** Text of the wrapped character; exactly one char long once constructed. */
5 |     private final String text;
6 |     /** Wraps {@code ch} as an edit unit; a {@code null} argument raises NullPointerException. */
7 |     public CharEditUnit(Character ch){
8 |         this.text = String.valueOf(ch.charValue());
9 |     }
10 |     /** Returns the wrapped character as a string. */
11 |     @Override
12 |     public String getUnitString() {
13 |         return this.text;
14 |     }
15 | }
16 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/sentence/editdistance/ChunkEditUnit.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.sentence.editdistance;
2 |
3 |
4 | public class ChunkEditUnit extends EditUnit { // Edit unit that wraps a whole chunk (a SuperString) rather than a single element.
5 | private SuperString extends EditUnit> chunk = null; // the wrapped chunk; NOTE(review): the generic type on this line looks damaged in this listing
6 |
7 | public ChunkEditUnit(SuperString extends EditUnit> chunk){
8 | this.chunk = chunk;
9 | }
10 | // The unit string is the chunk's own string form.
11 | public String getUnitString() {
12 | return chunk.toString();
13 | }
14 |
15 | /**
16 | * Substitution cost against another unit: the chunk length when the other unit is not a ChunkEditUnit, 0 when the two units are equal, otherwise the standard edit distance between the two chunks.
17 | */
18 | @Override
19 | public double getSubstitutionCost(EditUnit otherUnit){
20 | if(!(otherUnit instanceof ChunkEditUnit)) return chunk.length();
21 | if(equals(otherUnit)) return 0.0;
22 |
23 | ChunkEditUnit other = (ChunkEditUnit)otherUnit;
24 | return new StandardEditDistance().getEditDistance(chunk, other.chunk);
25 | }
26 |
27 | /**
28 | * Deletion cost of this chunk. Unlike the base-class default of 1.0,
29 | * the cost here is the number of elements in the chunk.
30 | * @return the deletion cost
31 | */
32 | public double getDeletionCost(){
33 | return chunk.length();
34 | }
35 |
36 | /**
37 | * Insertion cost of this chunk: the number of elements in the chunk,
38 | * replacing the base-class default of 1.0.
39 | */
40 | public double getInsertionCost(){
41 | return chunk.length();
42 | }
43 |
44 | }
45 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/sentence/editdistance/EditDistance.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.sentence.editdistance;
2 |
3 | import ruc.irm.similarity.Similaritable;
4 |
5 |
6 | /**
7 | * Base class of the edit-distance measures; it defines their main behaviour.
8 | *
9 | * @author Xia Tian
10 | * @organization Knowledge Engineering Lab, School of Information Resource Management, Renmin University of China
11 | */
12 | public abstract class EditDistance implements Similaritable {
13 | // Edit distance between S and T; each concrete algorithm supplies its own implementation.
14 | public abstract double getEditDistance(SuperString extends EditUnit> S, SuperString extends EditUnit> T);
15 | // Similarity of two sentences: both are turned into word-level SuperStrings, then 1 - distance / max(length) is returned.
16 | public double getSimilarity(String s1, String s2){
17 | SuperString S = SuperString.createWordSuperString(s1);
18 | SuperString T = SuperString.createWordSuperString(s2);
19 | // NOTE(review): when both inputs yield zero units the divisor below is 0, so the result is not a finite number.
20 | return 1-(getEditDistance(S, T))/(Math.max(S.length(), T.length()));
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/sentence/editdistance/EditUnit.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.sentence.editdistance;
2 |
3 | /**
4 |  * A single unit that an edit-distance computation inserts, deletes or substitutes.
5 |  *
6 |  * @author Xia Tian
7 |  * @organization Knowledge Engineering Lab, School of Information Resource Management, Renmin University of China
8 |  */
9 | public abstract class EditUnit {
10 |     /**
11 |      * Returns the text this unit stands for.
12 |      * @return the unit's text
13 |      */
14 |     public abstract String getUnitString();
15 |
16 |     /**
17 |      * Cost of substituting this unit by {@code other}: 0 when the two units are equal
18 |      * according to {@link #equals(Object)}, 1 otherwise.
19 |      */
20 |     public double getSubstitutionCost(EditUnit other){
21 |         return this.equals(other)?0:1;
22 |     }
23 |
24 |     /**
25 |      * Cost of deleting this unit. The default is 1.0, as in the standard algorithm;
26 |      * subclasses may override it to charge a different deletion cost.
27 |      * @return the deletion cost
28 |      */
29 |     public double getDeletionCost(){
30 |         return 1.0;
31 |     }
32 |
33 |     /**
34 |      * Cost of inserting this unit. The default is 1.0, as in the standard algorithm;
35 |      * subclasses may override it to charge a different insertion cost.
36 |      * @return the insertion cost
37 |      */
38 |     public double getInsertionCost(){
39 |         return 1.0;
40 |     }
41 |
42 |     /** Two units are equal when their unit strings are equal. */
43 |     @Override
44 |     public boolean equals(Object other){
45 |         if(!(other instanceof EditUnit)) return false;
46 |         return getUnitString().equals(((EditUnit)other).getUnitString());
47 |     }
48 |
49 |     /**
50 |      * Hash code derived from the unit string, so that units that are equal under
51 |      * {@link #equals(Object)} also share a hash code. A subclass that redefines
52 |      * {@code equals} is responsible for redefining this method to match.
53 |      */
54 |     @Override
55 |     public int hashCode(){
56 |         return getUnitString().hashCode();
57 |     }
58 |
59 |     @Override
60 |     public String toString(){
61 |         return getUnitString();
62 |     }
63 | }
64 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/sentence/editdistance/GregorEditDistance.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.sentence.editdistance;
2 |
3 | /**
4 | * 由Gregor提出的考虑块交换(Block Transposition)的编辑距离改进算法
5 | * 时间复杂度为O(m^3 n^3)
6 | * 具体实现请参考Gregor Leusch, Nicola Ueffing的文章《A Novel String-to-String Distance Measure With
7 | * Application to Machine Translation Evaluation》
8 | * 问题:
9 | * 相似度计算的问题会影响句子相似度计算的直观结果,例如“什么是计算机病毒”,“电脑病毒是什么”
10 | * 直觉应该是2,即“什么是计算机病毒”首先变为“计算机病毒什么是”,再变为“计算机病毒是什么”,
11 | * 编辑代价为2,但实际上,当由“什么是计算机病毒”变为“计算机病毒什么是”后,由于"什么是"与“是什么”的替换代价只有0.2,
12 | * 因而不再进行交互,故总的编辑距离为1.2
13 | *
14 | * @author 夏天
15 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
16 | */
17 | public class GregorEditDistance extends EditDistance {
18 | /** 块交换代价 */
19 | public static double swapCost = 0.5;
20 |
21 | private SuperString extends EditUnit> S,T;
22 | /** 存放字符串从S(i0-i1)到T(j0-j1)的中间运算结果,避免多次运算,提高运算效率*/
23 | private double[][][][] QArray;
24 |
25 | public double getEditDistance(SuperString extends EditUnit> S,SuperString extends EditUnit> T){
26 | this.S = S;
27 | this.T = T;
28 | QArray = new double[S.length()][S.length()][T.length()][T.length()];
29 | for(int i=0;i subsitituteValue){
63 | minSubstituteValue = subsitituteValue;
64 | minPosJ = j;
65 | }
66 | }
67 | for(int j=j0;j<=j1;j++){
68 | if(j == minPosJ){
69 | cost += minSubstituteValue;
70 | }else{
71 | cost += T.elementAt(j).getInsertionCost();
72 | }
73 | }
74 | }else if(j1==j0){
75 | double minSubstituteValue = 1.0;
76 | int minPosI = i0;
77 | for(int i=i0;i<=i1;i++){
78 | double subsitituteValue = S.elementAt(i).getSubstitutionCost(T.elementAt(j0));
79 | if(minSubstituteValue > subsitituteValue){
80 | minSubstituteValue = subsitituteValue;
81 | minPosI = i;
82 | }
83 | }
84 | for(int i=i0;i<=i1;i++){
85 | if(i == minPosI){
86 | cost += minSubstituteValue;
87 | }else{
88 | cost += S.elementAt(i).getDeletionCost();
89 | }
90 | }
91 | }else{
92 | if(QArray[i0][i1][j0][j1] X, SuperString extends EditUnit> Y){
10 | Block extends EditUnit> LX = new Block(X);
11 | Block extends EditUnit> LY = new Block(Y);
12 | split(LX,LY);
13 | while(LY.getPrev()!=null){
14 | LY = LY.getPrev();
15 | }
16 | while(LX.getPrev()!=null){
17 | LX = LX.getPrev();
18 | }
19 | List first = new ArrayList();
20 | List second = new ArrayList();
21 | while(LX!=null){
22 | first.add(new ChunkEditUnit(LX.getData()));
23 | LX = LX.getNext();
24 | }
25 |
26 | while(LY!=null){
27 | second.add(new ChunkEditUnit(LY.getData()));
28 | LY = LY.getNext();
29 | }
30 | SuperString s1 = new SuperString(first);
31 | SuperString s2 = new SuperString(second);
32 | Object[] obj = new Object[]{s1, s2};
33 | return obj;
34 | }
35 |
36 | private static void split(Block> bx, Block> LY){
37 | LCS maxLCS = null;
38 | Block> by = LY;
39 | while(by.getPrev()!=null){
40 | by = by.getPrev();
41 | }
42 | Block> maxMatchedBy = by;
43 | while(by!=null){
44 | if(by.isDivideFlag()){
45 | by = by.getNext();
46 | continue;
47 | }
48 |
49 | LCS lcs = LCS.parse(bx.getData(), by.getData());
50 | if(maxLCS==null || maxLCS.length0){
59 | bx.divide(maxLCS.x_pos, maxLCS.length);
60 | maxMatchedBy.divide(maxLCS.y_pos, maxLCS.length);
61 | }
62 |
63 | if(bx.getPrev()!=null && !bx.isDivideFlag()){
64 | split(bx.getPrev(), LY);
65 | }
66 |
67 | if(bx.getNext()!=null &&!bx.getNext().isDivideFlag()){
68 | split(bx.getNext(), LY);
69 | }
70 |
71 | }
72 |
73 | /**
74 | * longest common string
75 | * @author Gavin
76 | *
77 | */
78 | public static class LCS {
79 | public int length = 0; //LCS匹配的最长结果
80 | public int x_pos = 0; //LCS匹配的X的位置
81 | public int y_pos = 0; //LCS匹配的Y的位置
82 |
83 | public static LCS parse(SuperString> X, SuperString> Y){
84 | LCS lcs = new LCS();
85 | for(int start=0; start tempX = X.substring(start, end);
88 |
89 | int pos = Y.indexOf(tempX);
90 | if(pos>=0 && tempX.length()>lcs.length){
91 | lcs.length = tempX.length();
92 | lcs.x_pos = start;
93 | lcs.y_pos = pos;
94 | }
95 | }
96 | }
97 | return lcs;
98 | }
99 |
100 | public String toString(){ // Renders the three match fields as "length=…, x_pos=…, y_pos=…".
101 |     return new StringBuilder("length=").append(length).append(", x_pos=").append(x_pos).append(", y_pos=").append(y_pos).toString();
102 | }
103 | }
104 |
105 | public static void main(String[] args) {
106 |     // Ad-hoc manual run: word-segment two sample sentences and pass them to split.
107 |     // Earlier sample pairs, kept for reference:
108 |     //   "abcdefghijkabc" / "cdefghijklabccc"
109 |     //   "abcdefgxyzoxyjasdkfjjjaldsfa" / "fgabcdehijklkdslfkasdflak"
110 |     //   "I like the book" / "the book I like"
111 |     final String firstSentence = "什么是计算机病毒";
112 |     final String secondSentence = "电脑病毒是什么";
113 |
114 |     // Character-level variant: build both inputs with SuperString.createCharSuperString instead.
115 |     SuperString firstUnits = SuperString.createWordSuperString(firstSentence);
116 |     SuperString secondUnits = SuperString.createWordSuperString(secondSentence);
117 |
118 |     // The value returned by split is not used here.
119 |     Split.split(firstUnits, secondUnits);
120 |
121 |     // To inspect the longest common run instead:
122 |     //   System.out.println(LCS.parse(firstUnits, secondUnits));
123 | }
124 | }
125 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/sentence/editdistance/StandardEditDistance.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.sentence.editdistance;
2 |
3 |
4 | /**
5 | * 基于编辑距离的汉语句子相似度计算
6 | *
7 | * @author 夏天
8 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
9 | */
10 | public class StandardEditDistance extends EditDistance {
11 | /**
12 | * 获取两个串的编辑距离
13 | * @param X 字符串1
14 | * @param Y 字符串2
15 | * @return 两个串的编辑距离
16 | */
17 | public double getEditDistance(SuperString extends EditUnit> X, SuperString extends EditUnit> Y){
18 | double[][] D; //编辑矩阵
19 |
20 | int m = X.length(); //字符串X的长度
21 | int n = Y.length(); //字符串Y的长度
22 | //char ch_x_i; //字符串X的第i个词
23 | //char ch_y_j; //字符串Y的第j个词
24 |
25 | if(m == 0){
26 | double distance = 0.0;
27 | for(int j=0; j夏天
14 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
15 | *
16 | * @param
17 | */
18 | public class SuperString {
19 | private List contents = new ArrayList();
20 |
21 | public SuperString(List contents){
22 | this.contents = contents;
23 | }
24 |
25 | public static SuperString createCharSuperString(String str){
26 | List list = new ArrayList(str.length());
27 | for(int i=0; i s = new SuperString(list);
31 | return s;
32 | }
33 |
34 | public static SuperString createWordSuperString(String sentence){
35 | List wordList = SegmentProxy.segment(sentence);
36 | List unitList = new ArrayList(wordList.size());
37 | for(int i=0; i s = new SuperString(unitList);
41 | return s;
42 | }
43 |
44 |
45 | public T elementAt(int pos){ // Unit stored at index pos; an out-of-range index raises ArrayIndexOutOfBoundsException.
46 |     final boolean outOfRange = !(pos>=0 && pos<contents.size());
47 |     if(outOfRange) throw new ArrayIndexOutOfBoundsException("下标越界");
48 |     // in range: delegate to the backing list
49 |     return contents.get(pos);
50 | }
51 |
52 | public int indexOf(SuperString> substring){
53 | int result = -1;
54 | for(int i=0; ilength()) return -1;
57 |
58 | for(;j substring(int fromIndex, int toIndex){
73 | return new SuperString(contents.subList(fromIndex, toIndex));
74 | }
75 |
76 | public SuperString substring(int fromIndex){ // Suffix of this string starting at fromIndex.
77 | return new SuperString(contents.subList(fromIndex, contents.size())); // wraps a subList view of the backing list, not a copy
78 | }
79 |
80 | public int length(){ // Number of elements held in the backing list.
81 | return contents.size();
82 | }
83 |
84 | @Override
85 | public String toString(){
86 | StringBuilder sb = new StringBuilder();
87 | for(int i=0; i0.85;
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/sentence/editdistance/XiatianEditDistance.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.sentence.editdistance;
2 |
3 |
4 | /**
5 | * 夏天提出的新的支持非相邻块交互的编辑距离算法
6 | *
7 | * @author 夏天
8 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
9 | */
10 | public class XiatianEditDistance extends EditDistance {
11 | /** 块交换代价 */
12 | public static double swapCost = 0.5;
13 |
14 | private SuperString extends EditUnit> S,T;
15 | private double[][][][] QArray;
16 |
17 | public double getEditDistance(SuperString extends EditUnit> S, SuperString extends EditUnit> T){ // Edit distance between S and T, obtained from the recursive helper Q over the full index ranges.
18 | this.S = S; // inputs are kept in instance fields, where Q reads them
19 | this.T = T;
20 | QArray = new double[S.length()+1][S.length()+1][T.length()+1][T.length()+1]; // one cell per (i1, im, j1, jn) combination, length+1 entries per axis
21 | for(int i=0;i<=S.length();i++){ // set every cell to Double.MAX_VALUE before Q runs — presumably the "not computed yet" marker; confirm in Q
22 | for(int i2=0;i2<=S.length();i2++)
23 | for(int j=0;j<=T.length();j++)
24 | for(int j2=0;j2<=T.length();j2++){
25 | QArray[i][i2][j][j2]=Double.MAX_VALUE;
26 | }
27 | }
28 | return Q(0,S.length()-1,0,T.length()-1); // NOTE(review): an empty S or T makes the end index -1 here — confirm Q tolerates that
29 | }
30 |
31 | private double Q(int i1,int im,int j1,int jn){
32 | if(QArray[i1][im][j1][jn] subValue){
52 | minSubValue = subValue;
53 | minPosJ = j;
54 | }
55 | }
56 | for(int j=j1;j<=jn;j++){
57 | if(j == minPosJ){
58 | cost += minSubValue;
59 | }else{
60 | cost += T.elementAt(j).getInsertionCost();
61 | }
62 | }
63 | }else if(j1==jn){
64 | int minPosI = i1;
65 | double minSubValue = S.elementAt(i1).getSubstitutionCost(T.elementAt(j1));
66 | for(int i=i1+1;i<=im;i++){
67 | double subValue = S.elementAt(i).getSubstitutionCost(T.elementAt(j1));
68 | if(minSubValue > subValue){
69 | minSubValue = subValue;
70 | minPosI = i;
71 | }
72 | }
73 | for(int i=i1;i<=im;i++){
74 | if(i == minPosI){
75 | cost += minSubValue;
76 | }else{
77 | cost += S.elementAt(i).getDeletionCost();
78 | }
79 | }
80 | }else{
81 | cost = QArray[i1][im][j1][jn];
82 | loop:for(int i=i1;i夏天
8 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
9 | */
10 | public class XiatianEditDistance2 extends EditDistance {
11 | /** 块交换代价 */
12 | private double swapCost = 1.0;
13 |
14 | private SuperString extends EditUnit> S,T;
15 | private double[][][][] QArray;
16 |
17 | @SuppressWarnings("unchecked")
18 | public double getEditDistance(SuperString extends EditUnit> S1, SuperString extends EditUnit> T1){ // Edit distance between S1 and T1, computed on the chunked forms produced by Split.split.
19 | Object[] array = Split.split(S1, T1); // two-element array: chunked form of S1, then of T1
20 | this.S = (SuperString extends EditUnit>)array[0]; // kept in instance fields, where Q reads them
21 | this.T = (SuperString extends EditUnit>)array[1];
22 | QArray = new double[S.length()+1][S.length()+1][T.length()+1][T.length()+1]; // one cell per (i1, im, j1, jn) combination, length+1 entries per axis
23 | for(int i=0;i<=S.length();i++){ // set every cell to Double.MAX_VALUE before Q runs — presumably the "not computed yet" marker; confirm in Q
24 | for(int i2=0;i2<=S.length();i2++)
25 | for(int j=0;j<=T.length();j++)
26 | for(int j2=0;j2<=T.length();j2++){
27 | QArray[i][i2][j][j2]=Double.MAX_VALUE;
28 | }
29 | }
30 | return Q(0,S.length()-1,0,T.length()-1); // NOTE(review): an empty chunk list makes the end index -1 here — confirm Q tolerates that
31 | }
32 |
33 | private double Q(int i1,int im,int j1,int jn){
34 | if(QArray[i1][im][j1][jn] subValue){
54 | minSubValue = subValue;
55 | minPosJ = j;
56 | }
57 | }
58 | for(int j=j1;j<=jn;j++){
59 | if(j == minPosJ){
60 | cost += minSubValue;
61 | }else{
62 | cost += T.elementAt(j).getInsertionCost();
63 | }
64 | }
65 | }else if(j1==jn){
66 | int minPosI = i1;
67 | double minSubValue = S.elementAt(i1).getSubstitutionCost(T.elementAt(j1));
68 | for(int i=i1+1;i<=im;i++){
69 | double subValue = S.elementAt(i).getSubstitutionCost(T.elementAt(j1));
70 | if(minSubValue > subValue){
71 | minSubValue = subValue;
72 | minPosI = i;
73 | }
74 | }
75 | for(int i=i1;i<=im;i++){
76 | if(i == minPosI){
77 | cost += minSubValue;
78 | }else{
79 | cost += S.elementAt(i).getDeletionCost();
80 | }
81 | }
82 | }else{
83 | cost = QArray[i1][im][j1][jn];
84 | loop:for(int i=i1;i
16 | * 《中文信息相似度计算理论与方法》5.4.3小节所介绍的方法,在考虑语义时,
17 | * 无法直接获取OnceWS(A, B),因此,采用了两两匹配取最大值的方式。
18 | * 新的改进算法请参考{@code SemanticSimilarity}
19 | *
20 | * @author 夏天
21 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
22 | *
23 | */
24 | public class MorphoSimilarity implements SentenceSimilarity {
25 | private static Logger LOG = LoggerFactory.getLogger(MorphoSimilarity.class);
26 |
27 | /** 词形相似度占总相似度的比重 */
28 | private final double LAMBDA1 = 1.0;
29 | /** 词序相似度占总相似度的比重 */
30 | private final double LAMBDA2 = 0.0;
31 | /** 词语相似度的计算 */
32 | private WordSimilarity wordSimilarity = null;
33 |
34 | private static String FILTER_CHARS = " ,。;?《》()|!,.;?<>|_^…!";
35 |
36 | private static MorphoSimilarity instance = null;
37 |
38 | public static MorphoSimilarity getInstance(){
39 | if(instance == null){
40 | instance = new MorphoSimilarity();
41 | }
42 | return instance;
43 | }
44 |
45 | private MorphoSimilarity(){
46 | LOG.debug("used hownet wordsimilarity.");
47 | this.wordSimilarity = XiaConceptParser.getInstance();
48 | //this.segmenter = SegmentFactory.getInstance().getParser();
49 | }
50 |
51 | /**
52 | * Drops blank and punctuation tokens from a word list and lower-cases the rest.
53 | * @param word_list tokens to filter
54 | * @return the remaining tokens, lower-cased, in their original order
55 | */
56 | private String[] filter(String[] word_list){
57 | List results = new ArrayList();
58 | for(String w:word_list){
59 | if(!FILTER_CHARS.contains(w)){ // a token is dropped when it occurs as a substring of FILTER_CHARS (this includes the empty string)
60 | results.add(w.toLowerCase());
61 | }
62 | }
63 |
64 | return results.toArray(new String[results.size()]);
65 | }
66 |
67 | /**
68 |  * Scores two sentences: both are segmented and filtered, then the word-form score and the
69 |  * word-order score are combined as LAMBDA1*wordForm + LAMBDA2*wordOrder.
70 |  * @see ruc.irm.similarity.Similaritable
71 |  */
72 | public double getSimilarity(String firstSen,String secondSen){
73 |     final String[] left = filter(segment(firstSen));
74 |     final String[] right = filter(segment(secondSen));
75 |
76 |     // word-form (occurrence) component
77 |     final double formScore = getOccurrenceSimilarity(left, right);
78 |     // word-order component
79 |     final double orderScore = getOrderSimilarity(left, right);
80 |
81 |     double weighted = LAMBDA1*formScore;
82 |     weighted += LAMBDA2*orderScore;
83 |     return weighted;
84 | }
85 |
86 | /**
87 | * 获取两个集合的词形相似度, 同时获取相对于第一个句子中的词语顺序,第二个句子词语的顺序变化次数
88 | * @param firstList
89 | * @param secondList
90 | * @return
91 | */
92 | public double getOccurrenceSimilarity(String[] firstList, String[] secondList){
93 | int max = firstList.length>secondList.length?firstList.length:secondList.length;
94 | if(max==0){
95 | return 0;
96 | }
97 |
98 | //首先计算出所有可能的组合
99 | double[][] scores = new double[max][max];
100 | for(int i=0; i 0){
110 | double max_score = 0;
111 | int max_row = 0;
112 | int max_col = 0;
113 |
114 | //先挑出相似度最大的一对:
115 | for(int i=0; ii?i:i-1;
132 | int tmp_j = max_col>j?j:j-1;
133 | tmp_scores[tmp_i][tmp_j] = scores[i][j];
134 | }
135 | }
136 | total_score += max_score;
137 | scores = tmp_scores;
138 | }
139 |
140 | return (2*total_score) / (firstList.length + secondList.length);
141 | }
142 |
143 | /**
144 |  * Word-order similarity of two word lists.
145 |  * The current body ignores both arguments and always yields 0.0.
146 |  * @param firstList words of the first sentence
147 |  * @param secondList words of the second sentence
148 |  * @return always 0.0
149 |  */
150 | public double getOrderSimilarity(String[] firstList, String[] secondList){
151 |     // Placeholder result; no ordering information is computed here.
152 |     return 0.0;
153 | }
154 |
155 | // @SuppressWarnings("unchecked")
156 | // public String[] segment(String sentence){
157 | // MPWordSegment ws = new MPWordSegment();
158 | // ws.parseReader(new StringReader(sentence));
159 | // Vector tokens = ws.getTokens();
160 | // String[] results = new String[tokens.size()];
161 | // for(int i=0; i list = SegmentProxy.segment(sentence);
171 | String[] results = new String[list.size()];
172 | for(int i=0; i夏天
22 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
23 | *
24 | */
25 | public class SemanticSimilarity implements SentenceSimilarity {
26 | private static Logger LOG = LoggerFactory.getLogger(SemanticSimilarity.class);
27 |
28 | /** 词形相似度占总相似度的比重 */
29 | private final double LAMBDA1 = 0.8;
30 | /** 词序相似度占总相似度的比重 */
31 | private final double LAMBDA2 = 0.2;
32 |
33 | /** 如果两个词语的相似度大于了该阈值, 则作为相同词语,计算词序相似度 */
34 | private final double GAMMA = 0.6;
35 |
36 | /** 词语相似度的计算 */
37 | private WordSimilarity wordSimilarity = null;
38 |
39 | private static String FILTER_CHARS = " ,。;?《》()|!,.;?<>|_^…!";
40 |
41 | private static SemanticSimilarity instance = null;
42 |
43 | public static SemanticSimilarity getInstance(){
44 | if(instance == null){
45 | instance = new SemanticSimilarity();
46 | }
47 | return instance;
48 | }
49 |
50 | private SemanticSimilarity(){
51 | LOG.debug("used hownet wordsimilarity.");
52 | this.wordSimilarity = XiaConceptParser.getInstance();
53 | //this.segmenter = SegmentFactory.getInstance().getParser();
54 | }
55 |
56 | /**
57 | * Drops blank and punctuation tokens from a word list and lower-cases the rest.
58 | * @param word_list tokens to filter
59 | * @return the remaining tokens, lower-cased, in their original order
60 | */
61 | private String[] filter(String[] word_list){
62 | List results = new ArrayList();
63 | for(String w:word_list){
64 | if(!FILTER_CHARS.contains(w)){ // a token is dropped when it occurs as a substring of FILTER_CHARS (this includes the empty string)
65 | results.add(w.toLowerCase());
66 | }
67 | }
68 |
69 | return results.toArray(new String[results.size()]);
70 | }
71 |
72 | /**
73 |  * Scores two sentences.
74 |  * Each sentence is segmented and filtered, and the two resulting word lists are
75 |  * handed to {@link #calculate(String[], String[])}.
76 |  * @see ruc.irm.similarity.Similaritable
77 |  */
78 | public double getSimilarity(String firstSen,String secondSen){
79 |     // the first sentence is processed before the second, following argument order
80 |     return calculate(
81 |             filter(segment(firstSen)),
82 |             filter(segment(secondSen)));
83 | }
84 |
85 | /**
86 | * 获取两个集合的词形相似度, 同时获取相对于第一个句子中的词语顺序,第二个句子词语的顺序变化次数
87 | * @param firstList
88 | * @param secondList
89 | * @return
90 | */
91 | public double calculate(String[] firstList, String[] secondList){
92 | if(firstList.length == 0 || secondList.length == 0){
93 | return 0;
94 | }
95 |
96 | //首先计算出所有可能的组合
97 | double[][] scores = new double[firstList.length][secondList.length];
98 |
99 | //代表第1个句子对应位置是否已经被使用, 默认为未使用,即false
100 | boolean[] firstFlags = new boolean[firstList.length];
101 |
102 | //代表第2个句子对应位置是否已经被使用, 默认为未使用,即false
103 | boolean[] secondFlags = new boolean[secondList.length];
104 |
105 | //PSecond的定义参见书中5.4.3节, 为避免无必要的初始化数组,
106 | //数组中0值表示在第一个句子中没有对应的相似词语,大于0的值
107 | //则表示在第一个句子中的位置(从1开始编号了)
108 | int[] PSecond = new int[secondList.length];
109 |
110 | for(int i=0; i
126 | for(int i=0; i=0) {
140 | total_score += max_score;
141 | firstFlags[max_row] = true;
142 | secondFlags[max_col] = true;
143 | if(max_score>=GAMMA) {
144 | PSecond[max_col] = max_row+1;
145 | }
146 | } else {
147 | break;
148 | }
149 | }
150 |
151 | double wordSim = (2*total_score) / (firstList.length + secondList.length);
152 |
153 | int previous = 0;
154 | int revOrdCount = 0;
155 | int onceWSSize = 0;
156 | for(int i=0; i0) {
158 | onceWSSize++;
159 | if(previous>0 && (previous>PSecond[i])) {
160 | revOrdCount++;
161 | }
162 | previous = PSecond[i];
163 | }
164 | }
165 |
166 | double ordSim = 0;
167 | if(onceWSSize==1) {
168 | ordSim = 1;
169 | } else if(onceWSSize == 0) {
170 | ordSim = 0;
171 | } else {
172 | ordSim = 1.0 - revOrdCount*1.0/(onceWSSize-1);
173 | }
174 |
175 | System.out.println("wordSim ==> " + wordSim + ", ordSim ==> " + ordSim);
176 |
177 | return LAMBDA1*wordSim+LAMBDA2*ordSim;
178 | }
179 |
180 | public String[] segment(String sentence){
181 | List list = SegmentProxy.segment(sentence);
182 | String[] results = new String[list.size()];
183 | for(int i=0; i夏天
20 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
21 | */
22 | public class DictStatistic {
23 | /**
24 | * 从指定的xml文件加载词典文件
25 | * @param xmlFile
26 | * @param gzCompressed 是否再用gz格式对词典进行了压缩
27 | * @return
28 | */
29 | public void testFromXml(String xmlFile, boolean gzCompressed) {
30 | File file = new File(xmlFile);
31 | if (!file.canRead()){
32 | System.out.println("无法读取文件:" + xmlFile);
33 | return;// fail while opening the file
34 | }
35 | int count = 0, conceptCount=0;
36 | XMLInputFactory inputFactory = XMLInputFactory.newInstance();
37 | InputStream input = null;
38 | try {
39 | if(gzCompressed){
40 | input = new GZIPInputStream(new FileInputStream(file));
41 | }else{
42 | input = new FileInputStream(file);
43 | }
44 | XMLEventReader xmlEventReader = inputFactory.createXMLEventReader(input);
45 | while (xmlEventReader.hasNext()) {
46 | XMLEvent event = xmlEventReader.nextEvent();
47 |
48 | if (event.isStartElement()) {
49 | StartElement startElement = event.asStartElement();
50 | if(startElement.getName().toString().equals("table")){
51 | String head = startElement.getAttributeByName(QName.valueOf("head")).getValue();
52 | while (xmlEventReader.hasNext()) {
53 | XMLEvent itemEvent = xmlEventReader.nextEvent();
54 | if(itemEvent.isStartElement()){
55 | StartElement itemStartElement = itemEvent.asStartElement();
56 | if(!itemStartElement.getName().toString().equals("item")) continue;
57 | String word = itemStartElement.getAttributeByName(QName.valueOf("word")).getValue();
58 | word = head + word;
59 | if(XiaConceptParser.getInstance().isConcept(word)){
60 | conceptCount++;
61 | }
62 | count++;
63 | if(count%1000==0){
64 | System.out.println("process words " + count + "...");
65 | }
66 | }
67 | }
68 | }
69 | }
70 | }
71 | input.close();
72 | System.out.println(count + "\t" + conceptCount);
73 | return;
74 | } catch (Exception e) {
75 | e.printStackTrace();
76 | }
77 | }
78 |
79 | }
80 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/statistic/LCMC.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.statistic;
2 |
3 | import java.io.File;
4 | import java.io.FileInputStream;
5 | import java.io.InputStream;
6 |
7 | import javax.xml.stream.XMLEventReader;
8 | import javax.xml.stream.XMLInputFactory;
9 | import javax.xml.stream.events.StartElement;
10 | import javax.xml.stream.events.XMLEvent;
11 |
12 | import ruc.irm.similarity.word.hownet2.concept.XiaConceptParser;
13 |
14 |
15 |
16 | /**
17 |  * Counts, for LCMC corpus XML files, how many {@code w} elements occur and how many of
18 |  * their texts are recognised as concepts by {@code XiaConceptParser}.
19 |  */
20 | public class LCMC {
21 |
22 |     /** Section letters of the corpus files processed by {@link #main(String[])}. */
23 |     private static final String SECTIONS = "ABCDEFGHJKLMNPR";
24 |
25 |     /**
26 |      * Reads one XML file and prints the total number of {@code w} elements, a tab, and the
27 |      * number of those whose text is a concept.
28 |      *
29 |      * @param xmlFile the XML file to scan
30 |      * @throws Exception if the file cannot be opened, parsed or closed
31 |      */
32 |     public void countUnConceptWords(File xmlFile) throws Exception{
33 |         int totalCount = 0, conceptCount = 0;
34 |         XMLInputFactory inputFactory = XMLInputFactory.newInstance();
35 |         InputStream input = new FileInputStream(xmlFile);
36 |         try {
37 |             XMLEventReader xmlEventReader = inputFactory.createXMLEventReader(input);
38 |             while (xmlEventReader.hasNext()) {
39 |                 XMLEvent event = xmlEventReader.nextEvent();
40 |                 if (!event.isStartElement()) {
41 |                     continue;
42 |                 }
43 |                 StartElement startElement = event.asStartElement();
44 |                 // only <w> (word) elements are counted
45 |                 if(startElement.getName().toString().equals("w")){
46 |                     String word = xmlEventReader.getElementText();
47 |                     totalCount++;
48 |                     if(XiaConceptParser.getInstance().isConcept(word)){
49 |                         conceptCount++;
50 |                     }
51 |                 }
52 |             }
53 |         } finally {
54 |             // release the file even when parsing throws
55 |             input.close();
56 |         }
57 |         System.out.println(totalCount + "\t" + conceptCount);
58 |     }
59 |
60 |     /** Runs the count over every section file under {@code ./db/lcmc}, in section order. */
61 |     public static void main(String[] args) throws Exception {
62 |         LCMC lcmc = new LCMC();
63 |         for (int i = 0; i < SECTIONS.length(); i++) {
64 |             lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_" + SECTIONS.charAt(i) + ".XML"));
65 |         }
66 |     }
67 | }
68 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/text/DiceSimilarity.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.text;
2 |
3 | import ruc.irm.similarity.Similaritable;
4 |
5 | public class DiceSimilarity implements Similaritable {
6 | // Stub: presumably meant to compute a Dice-coefficient similarity (going by the class name) — not implemented.
7 | @Override
8 | public double getSimilarity(String item1, String item2) {
9 | // TODO: not implemented yet; both arguments are ignored and 0 is returned for every input.
10 | return 0;
11 | }
12 |
13 | }
14 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/util/About.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.util;
2 |
3 | import com.google.common.io.Resources;
4 |
5 | import javax.swing.*;
6 | import javax.swing.text.StyledEditorKit;
7 | import java.awt.*;
8 | import java.io.IOException;
9 | import java.net.URL;
10 | import java.net.URLClassLoader;
11 |
12 | /**
13 |  * "About" window and panel describing the xsimilarity project.
14 |  *
15 |  * @author Xia Tian
16 |  * @organization Knowledge Engineering Lab, School of Information Resource Management, Renmin University of China
17 |  */
18 | public class About extends JFrame {
19 |     private static final long serialVersionUID = -2307582155443587993L;
20 |
21 |     /**
22 |      * Builds a panel whose centre is a scrollable, read-only text pane showing the
23 |      * {@code about.html} resource. If loading that page raises an IOException, the pane
24 |      * shows the exception message instead.
25 |      *
26 |      * @return the assembled panel
27 |      */
28 |     public static JPanel createPanel() {
29 |         JPanel mainPanel = new JPanel(new BorderLayout());
30 |         JTextPane editorPane = new JTextPane();
31 |         editorPane.setEditable(false);
32 |         // lets long text wrap automatically
33 |         editorPane.setEditorKit(new StyledEditorKit());
34 |         editorPane.setContentType("text/html");
35 |         try {
36 |             // an HTML file serves as the help content
37 |             URL url = Resources.getResource("about.html");
38 |             editorPane.setPage(url);
39 |         } catch (IOException e1) {
40 |             editorPane.setText(e1.getMessage());
41 |         }
42 |
43 |         mainPanel.add(new JScrollPane(editorPane), BorderLayout.CENTER);
44 |         return mainPanel;
45 |     }
46 |
47 |     /** Creates the 600x400 frame, which exits the application when closed. */
48 |     public About() {
49 |         this.setTitle("关于XSimilarity");
50 |
51 |         this.setDefaultCloseOperation(EXIT_ON_CLOSE);
52 |         this.setPreferredSize(new Dimension(600, 400));
53 |         this.getContentPane().add(createPanel());
54 |         this.pack();
55 |     }
56 |
57 |     /**
58 |      * Opens the window. The frame is created and shown on the event dispatch thread,
59 |      * which is where Swing components are expected to be built and modified.
60 |      */
61 |     public static void main(String[] args) {
62 |         SwingUtilities.invokeLater(new Runnable() {
63 |             @Override
64 |             public void run() {
65 |                 new About().setVisible(true);
66 |             }
67 |         });
68 |     }
69 | }
70 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/util/BlankUtils.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.util;
2 |
3 | import java.util.Collection;
4 |
/**
 * Helpers for testing whether strings, arrays and collections are "blank"
 * (null or without content).
 *
 * @author Xia Tian (夏天)
 * @organization Knowledge Engineering Lab, School of Information Resource
 *               Management, Renmin University of China
 */
public class BlankUtils {
    /**
     * Tells whether a string is blank.
     *
     * @param string the string to test, may be {@code null}
     * @return {@code true} if the string is {@code null} or is empty after trimming
     */
    public static boolean isBlank(String string) {
        return string == null || string.trim().isEmpty();
    }

    /**
     * Tells whether an array is blank.
     *
     * @param array the array to test, may be {@code null}
     * @return {@code true} if the array is {@code null} or has no elements
     */
    public static boolean isBlank(Object[] array) {
        return array == null || array.length == 0;
    }

    /**
     * Tells whether a collection is blank.
     *
     * @param array the collection to test, may be {@code null}
     * @return {@code true} if the collection is {@code null} or has no elements
     */
    public static boolean isBlank(Collection<? extends Object> array) {
        return array == null || array.isEmpty();
    }

    /**
     * Tells whether every given collection is blank.
     *
     * @param collections the collections to test
     * @return {@code true} if none of the collections has any element
     */
    public static boolean isBlankAll(Collection<?>... collections) {
        for (Collection<?> c : collections) {
            if (!isBlank(c)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Tells whether every given string is blank.
     *
     * @param strings the strings to test
     * @return {@code true} if all strings are blank
     */
    public static boolean isBlankAll(String... strings) {
        for (String s : strings) {
            if (!isBlank(s)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Tells whether at least one of the given collections is blank.
     *
     * @param collections the collections to test
     * @return {@code true} as soon as one blank collection is found
     */
    public static boolean isBlankAtLeastOne(Collection<?>... collections) {
        for (Collection<?> c : collections) {
            if (isBlank(c)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether at least one of the given strings is blank.
     *
     * @param strings the strings to test
     * @return {@code true} as soon as one blank string is found
     */
    public static boolean isBlankAtLeastOne(String... strings) {
        for (String s : strings) {
            if (isBlank(s)) {
                return true;
            }
        }
        return false;
    }
}
99 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/util/EditDistance.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.util;
2 |
/**
 * Computes the edit distance between two strings using dynamic programming.
 *
 * <p>{@link #getEditDistance(String, String)} uses the overridable cost hooks
 * ({@link #getDeletionCost()}, {@link #getInsertionCost()},
 * {@link #getSubstitutionCost(char, char)}), so subclasses can plug in other
 * weights. {@link #getLevenshteinDistance(String, String)} is a static variant
 * with fixed unit costs.
 *
 * @author Xia Tian (夏天)
 * @organization Knowledge Engineering Lab, School of Information Resource
 *               Management, Renmin University of China
 */
public class EditDistance {
    /**
     * Cost of deleting one character.
     *
     * @return the deletion cost, {@code 1} in this implementation
     */
    public int getDeletionCost() {
        return 1;
    }

    /**
     * Cost of inserting one character.
     *
     * @return the insertion cost, {@code 1} in this implementation
     */
    public int getInsertionCost() {
        return 1;
    }

    /**
     * Cost of replacing character {@code a} by character {@code b}.
     *
     * @param a character of the source string
     * @param b character of the target string
     * @return {@code 0} if both characters are equal, otherwise {@code 1}
     */
    public int getSubstitutionCost(char a, char b) {
        return (a == b) ? 0 : 1;
    }

    /**
     * Computes the cost of turning {@code S} into {@code T}.
     *
     * <p>A {@code null} argument is treated as the empty string. The empty
     * cases go through the same cost hooks as everything else, so a subclass
     * with non-unit insertion/deletion costs gets a consistent result.
     *
     * @param S source string, may be {@code null}
     * @param T target string, may be {@code null}
     * @return the minimal total cost of the edit operations
     */
    public int getEditDistance(String S, String T) {
        char[] a = (S == null) ? new char[0] : S.toCharArray();
        char[] b = (T == null) ? new char[0] : T.toCharArray();

        int n = a.length; // length of S
        int m = b.length; // length of T

        int[][] D = new int[n + 1][m + 1];

        // First column: delete the first i characters of S.
        for (int i = 1; i <= n; i++) {
            D[i][0] = D[i - 1][0] + getDeletionCost();
        }

        // First row: insert the first j characters of T.
        for (int j = 1; j <= m; j++) {
            D[0][j] = D[0][j - 1] + getInsertionCost();
        }

        for (int i = 1; i <= n; i++) {
            for (int j = 1; j <= m; j++) {
                int viaDeletion = D[i - 1][j] + getDeletionCost();
                int viaInsertion = D[i][j - 1] + getInsertionCost();
                int viaSubstitution = D[i - 1][j - 1] + getSubstitutionCost(a[i - 1], b[j - 1]);
                D[i][j] = Math.min(Math.min(viaDeletion, viaInsertion), viaSubstitution);
            }
        }

        return D[n][m];
    }

    /**
     * Classic Levenshtein distance with unit costs. For non-null inputs it
     * yields the same value as {@link #getEditDistance(String, String)} with
     * the default costs.
     *
     * @param s first string, must not be {@code null}
     * @param t second string, must not be {@code null}
     * @return the number of single-character edits needed to turn {@code s} into {@code t}
     * @throws IllegalArgumentException if either argument is {@code null}
     */
    public static int getLevenshteinDistance(String s, String t) {
        if (s == null || t == null) {
            throw new IllegalArgumentException("Strings must not be null");
        }

        int n = s.length();
        int m = t.length();
        if (n == 0) {
            return m;
        }
        if (m == 0) {
            return n;
        }

        int[][] d = new int[n + 1][m + 1];
        for (int i = 0; i <= n; i++) {
            d[i][0] = i;
        }
        for (int j = 0; j <= m; j++) {
            d[0][j] = j;
        }

        for (int i = 1; i <= n; i++) {
            char sChar = s.charAt(i - 1);
            for (int j = 1; j <= m; j++) {
                int cost = (sChar == t.charAt(j - 1)) ? 0 : 1;
                d[i][j] = Math.min(Math.min(d[i - 1][j] + 1, d[i][j - 1] + 1),
                        d[i - 1][j - 1] + cost);
            }
        }

        return d[n][m];
    }

}
146 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/util/FileUtils.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.util;
2 |
3 | import java.io.BufferedOutputStream;
4 | import java.io.BufferedReader;
5 | import java.io.File;
6 | import java.io.FileOutputStream;
7 | import java.io.IOException;
8 | import java.io.InputStream;
9 | import java.io.InputStreamReader;
10 |
11 | /**
12 | * 与文件相关的工具类
13 | *
14 | * @author 夏天
15 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
16 | */
17 | public class FileUtils {
18 | /**
19 | * 根据指定编码从输入流中依次遍历每一行文字
20 | *
21 | * @param input
22 | * 输入流
23 | * @param encoding
24 | * 输入流所用的文字编码
25 | * @param event
26 | * 遍历每一行时触发的事件处理
27 | * @throws IOException
28 | */
29 | public static void traverseLines(InputStream input, String encoding, TraverseEvent event) throws IOException {
30 | BufferedReader in = new BufferedReader(new InputStreamReader(input, encoding));
31 | String line = null;
32 |
33 | while ((line = in.readLine()) != null) {
34 | event.visit(line);
35 | }
36 |
37 | input.close();
38 | in.close();
39 | }
40 |
41 | /**
42 | * 保存字符串到文件中
43 | * @param content
44 | * @param fileName
45 | * @return
46 | */
47 | public static boolean saveStringToFile(String content, String fileName) {
48 | boolean rtn = false;
49 | BufferedOutputStream out = null;
50 | try {
51 | File file = new File(fileName);
52 | file.getParentFile().mkdirs();
53 |
54 | out = new BufferedOutputStream(new FileOutputStream(file));
55 | out.write(content.getBytes("GBK"));
56 | out.close();
57 | rtn = true;
58 | } catch (Exception e) {
59 | System.out.println("saveStringToFile error:" + e.getMessage());
60 | } finally {
61 | try {
62 | out.close();
63 | } catch (Exception e) {
64 | }
65 | }
66 | return rtn;
67 | }
68 |
69 | public static void main(String[] args) {
70 | int count = 0;
71 | File dir = new File("G:/juanjuantx");
72 | for(File a:dir.listFiles()){
73 | if(a.isDirectory()){
74 | for(File zy: a.listFiles()){
75 | if(zy.listFiles()!=null)
76 | for(File rar:zy.listFiles()){
77 | if(rar.isFile() && rar.getName().endsWith(".rar")){
78 | count++;
79 | }
80 | }
81 | }
82 | }
83 | }
84 | System.out.println(count);
85 | }
86 | }
87 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/util/MathUtils.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.util;
2 |
3 | public class MathUtils {
4 | public static int min(int... values){
5 | int min = Integer.MAX_VALUE;
6 | for(int v:values){
7 | min = (v夏天
16 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
17 | */
18 | public class PinyinUtils {
19 | /** 拼音的Map词典, 一个汉字可能对应多个拼音, 它所有的拼音放到一个集合中 */
20 | private Map> pinyinDict = null;
21 |
22 | /** 单例 */
23 | private static PinyinUtils instance = null;
24 |
25 | private PinyinUtils() throws IOException{
26 | //从classpath中加载拼音词典文件
27 | InputStream input = this.getClass().getResourceAsStream("/data/F02-GB2312-to-PuTongHua-PinYin.txt");
28 |
29 | BufferedReader in = new BufferedReader(new InputStreamReader(input, "UTF-8"));
30 | String line = null;
31 |
32 | MyTraverseEvent event = new MyTraverseEvent();
33 | while ((line = in.readLine()) != null) {
34 | event.visit(line);
35 | }
36 |
37 | input.close();
38 | in.close();
39 |
40 | this.pinyinDict = event.getPinyins();
41 | }
42 |
43 | public static PinyinUtils getInstance(){
44 | if(instance == null){
45 | try {
46 | instance = new PinyinUtils();
47 | } catch (IOException e) {
48 | e.printStackTrace();
49 | }
50 | }
51 |
52 | return instance;
53 | }
54 |
55 | /**
56 | * 获取汉字的拼音, 由于汉字具有多音字,故返回一个集合
57 | * @param hanzi
58 | * @return
59 | */
60 | public Set getPinyin(Character hanzi){
61 | Set set = pinyinDict.get(hanzi);
62 | if(set==null || set.size()==0){
63 | set = new HashSet();
64 | set.add(hanzi.toString());
65 | }
66 | return set;
67 | }
68 |
69 | /**
70 | * 获取词语的拼音, 一个词语可能对应多个拼音,把所有可能的组合放到集合中返回
71 | * @param word
72 | * @return
73 | */
74 | public Set getPinyin(String word){
75 | Set word_set = new HashSet();
76 | for(int i=0; i hanzi_set = getPinyin(word.charAt(i));
78 | if(word_set==null || word_set.size()==0){
79 | word_set.addAll(hanzi_set);
80 | continue;
81 | }
82 |
83 | Set tmp_set = new HashSet();
84 | for(String w:word_set){
85 | for(String h:hanzi_set){
86 | tmp_set.add(w + h);
87 | }
88 | }
89 |
90 | word_set = tmp_set;
91 | }
92 |
93 | return word_set;
94 | }
95 |
96 | /**
97 | * 获取拼音字符串,多音字只取一个
98 | * @param word
99 | * @return
100 | */
101 | public String getPinyinSingle(String word){
102 | StringBuffer sb = new StringBuffer();
103 | for(int i=0; i pinyin = getPinyin(word.charAt(i));
118 | sb.append(pinyin.toString());
119 | }
120 | return sb.toString();
121 | }
122 |
123 | /**
124 | * 获取拼音首字母
125 | * @param word
126 | * @return
127 | */
128 | public String getPinyinHead(String word){
129 | StringBuffer sb = new StringBuffer();
130 | for(int i=0; i> pinyins = null;
139 |
140 | public MyTraverseEvent(){
141 | this.pinyins = new HashMap>();
142 | }
143 |
144 | public Map> getPinyins(){
145 | return pinyins;
146 | }
147 |
148 | public boolean visit(String item) {
149 | if(item.startsWith("//")){
150 | return true;
151 | }
152 |
153 | char hanzi = item.charAt(0);
154 | //String pinyin = item.substring(2, item.length()-1);
155 | String pinyin = item.substring(2, item.length());
156 | Set set = pinyins.get(hanzi);
157 | if(set==null){
158 | set = new HashSet();
159 | }
160 | set.add(pinyin);
161 |
162 | pinyins.put(hanzi, set);
163 | return true;
164 | }
165 | }
166 |
167 | }
168 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/util/TraverseEvent.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.util;
2 |
/**
 * Traversal callback: code that walks over a sequence of items accepts an
 * implementation of this interface and calls it once per item, so the actual
 * processing can be supplied by the caller.
 *
 * @author Xia Tian (夏天)
 * @organization Knowledge Engineering Lab, School of Information Resource
 *               Management, Renmin University of China
 *
 * @param <T> type of the items being visited
 */
public interface TraverseEvent<T> {

    /**
     * Visits one item during the traversal.
     *
     * @param item the item currently being traversed
     * @return an implementation-defined flag; its meaning is not specified
     *         by this interface
     */
    public boolean visit(T item);
}
20 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/util/XmlException.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.util;
2 |
/**
 * Unchecked exception raised by the XML helpers. The {@code String} constants
 * are message keys that can be passed as the {@code key} of an instance.
 *
 * @author carver
 */
public class XmlException extends RuntimeException {

    private static final long serialVersionUID = 381260478228427716L;

    // Message keys.
    public static final String XML_PAYLOAD_EMPTY = "xml.payload.empty";
    public static final String XML_ENCODE_ERROR = "xml.encoding.invalid";
    public static final String FILE_NOT_FOUND = "xml.file.not.found";
    public static final String XML_PARSE_ERROR = "xml.parse.error";
    public static final String XML_READ_ERROR = "xml.read.error";
    public static final String XML_VALIDATE_ERROR = "xml.validate.error";
    public static final String XML_TRANSFORM_ERROR = "xml.transform.error";

    /** Creates an exception without message or cause. */
    public XmlException() {
        super();
    }

    /**
     * Creates an exception carrying a message key.
     *
     * @param key the message (typically one of the constants of this class)
     */
    public XmlException(String key) {
        super(key);
    }

    /**
     * Creates an exception wrapping another throwable.
     *
     * @param cause the underlying failure
     */
    public XmlException(Throwable cause) {
        super(cause);
    }

    /**
     * Creates an exception carrying a message key and the underlying failure.
     *
     * @param key   the message (typically one of the constants of this class)
     * @param cause the underlying failure
     */
    public XmlException(String key, Throwable cause) {
        super(key, cause);
    }

}
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/CharBasedSimilarity.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.word;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 |
6 | import ruc.irm.similarity.Similaritable;
7 |
8 |
9 | /**
10 | * 字面相似度计算方法
11 | *
12 | * @author 夏天
13 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
14 | */
15 | public class CharBasedSimilarity implements Similaritable {
16 |
17 | private double alpha = 0.6;
18 | private double beta = 0.4;
19 |
20 | @Override
21 | public double getSimilarity(String word1, String word2) {
22 | if(isBlank(word1)&& isBlank(word2)){
23 | return 1.0;
24 | }
25 | if(isBlank(word1)|| isBlank(word2)){
26 | return 0.0;
27 | }
28 |
29 | List sameHZ = new ArrayList();
30 |
31 | String longString = word1.length()>=word2.length()?word1:word2;
32 | String shortString = word1.length() sameHZ){
48 | double top = 0;
49 | double bottom = 0;
50 | for(int i=0; i codeSet1 = CilinDb.getInstance().getCilinCoding(item1);
35 | Set codeSet2 = CilinDb.getInstance().getCilinCoding(item2);
36 | if(codeSet1==null || codeSet2==null){
37 | return 0.0;
38 | }
39 | for(String code1:codeSet1){
40 | for(String code2:codeSet2){
41 | double s = getSimilarityByCode(code1, code2);
42 | System.out.println(code1 + "-" + code2 + "-" +CilinCoding.calculateCommonWeight(code1, code2));
43 | if(sim
5 | *
6 | *
7 | * 编码位 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
8 | *
9 | *
10 | * 编码示例 | C | b | 0 | 7 | A | 0 | 3 | = |
11 | *
12 | * 类别级别 | 第一级 | 第二级 | 第三级 | 第四级 | 第五级 | 标记位 |
13 | * |
14 | * 类别含义 | 大类 | 中类 | 小类 | 词群 | 原子词群 | 词语关系 |
15 | *
16 | *
17 | *
18 | * 表中编码位从左到右顺序排列,其中,第8位对应的标记位为“=”、“#”和“@”三种符号之一。其中“=”代表常见的“同义”关系,“#”代表词语之间的相关关系,“@”则代表词语自我封闭的独立性质,它在词典中既没有同义词,也没有相关词。
19 | *
20 | *
21 | * @author 夏天
22 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
23 | */
/**
 * Helpers for working with 8-character Cilin codes such as {@code Cb07A03=}.
 *
 * <p>A code is cut into six levels by character position: level 1 = char 1,
 * level 2 = char 2, level 3 = chars 3-4, level 4 = char 5, level 5 = chars
 * 6-7, level 6 = everything from char 8 on (the flag position).
 */
public class CilinCoding {
    /** Weight of each level, indexed by {@code level - 1}. */
    public static double[] WEIGHT = new double[]{1.2, 1.2, 1.0, 1.0, 0.8, 0.4};
    /** Sum of all level weights. */
    public static double TOTAL_WEIGHT = 5.6;

    /** Start offset (inclusive) of each level inside a code. */
    private static final int[] LEVEL_START = {0, 1, 2, 4, 5, 7};
    /** End offset (exclusive) of each level; -1 means "to the end of the code". */
    private static final int[] LEVEL_END = {1, 2, 4, 5, 7, -1};

    /**
     * Extracts the part of a code that belongs to one level.
     *
     * @param code  the Cilin code
     * @param level the level, 1 to 6
     * @return the substring for that level, or the empty string for any other level value
     */
    public static String getCodeLevel(String code,int level){
        if (level < 1 || level > 6) {
            return "";
        }

        int from = LEVEL_START[level - 1];
        int to = LEVEL_END[level - 1];
        return (to < 0) ? code.substring(from) : code.substring(from, to);
    }

    /**
     * Adds up the weights of the leading levels on which two codes agree,
     * stopping at the first level that differs.
     *
     * @param code1 first code
     * @param code2 second code
     * @return the accumulated weight of the common prefix levels
     */
    public static double calculateCommonWeight(String code1, String code2){
        double weight = 0.0;
        int level = 1;
        while (level <= 6 && getCodeLevel(code1, level).equals(getCodeLevel(code2, level))) {
            weight += WEIGHT[level - 1];
            level++;
        }
        return weight;
    }

    /**
     * Renders a code as {@code [LEVEL_1: .., LEVEL_2: .., ... LEVEL_6: ..]}.
     *
     * @param code the Cilin code
     * @return the textual breakdown of the code by level
     */
    public static String printCoding(String code){
        StringBuilder text = new StringBuilder("[");
        for (int level = 1; level <= 6; level++) {
            if (level > 1) {
                text.append(", ");
            }
            text.append("LEVEL_").append(level).append(": ").append(getCodeLevel(code, level));
        }
        return text.append("]").toString();
    }
}
83 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/cilin/CilinDb.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.word.cilin;
2 |
3 | import java.io.IOException;
4 | import java.io.InputStream;
5 | import java.util.HashMap;
6 | import java.util.HashSet;
7 | import java.util.Map;
8 | import java.util.Set;
9 | import java.util.zip.GZIPInputStream;
10 |
11 | import org.slf4j.Logger;
12 | import org.slf4j.LoggerFactory;
13 | import ruc.irm.similarity.util.FileUtils;
14 | import ruc.irm.similarity.util.TraverseEvent;
15 |
16 | /**
17 | * 词林数据库
18 | *
19 | * @author 夏天
20 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
21 | */
22 | public class CilinDb {
23 | /** the logger */
24 | protected static Logger LOG = LoggerFactory.getLogger(CilinDb.class);
25 | /** 以词语为主键的索引表 */
26 | private Map> wordIndex = new HashMap>();
27 | /** 以编码为主键的索引表 */
28 | private Map> codeIndex = new HashMap>();
29 |
30 | private static CilinDb instance = null;
31 |
32 | public static CilinDb getInstance(){
33 | if(instance == null){
34 | try {
35 | instance = new CilinDb();
36 | } catch (IOException e) {
37 | LOG.error(e.toString());
38 | }
39 | }
40 | return instance;
41 | }
42 |
43 | private CilinDb() throws IOException{
44 | InputStream input = new GZIPInputStream(this.getClass().getResourceAsStream("/data/cilin.db.gz"));
45 |
46 | TraverseEvent event = new TraverseEvent(){
47 | @Override
48 | public boolean visit(String line) {
49 | String[] items = line.split(" ");
50 | Set set = new HashSet();
51 | for(int i=2; i codeWords = codeIndex.get(code);
58 | if(codeWords==null){
59 | codeWords = new HashSet();
60 | }
61 | codeWords.add(items[0]);
62 | codeIndex.put(code, codeWords);
63 | }
64 | }
65 | wordIndex.put(items[0], set);
66 | items = null;
67 | return false;
68 | }};
69 | LOG.info("loading cilin dictionary...");
70 | long time = System.currentTimeMillis();
71 |
72 | FileUtils.traverseLines(input, "UTF8", event);
73 |
74 | time = System.currentTimeMillis() - time;
75 | LOG.info("loading cilin dictionary completely. time elapsed: " + time);
76 |
77 | }
78 |
79 | /**
80 | * 获取某个词语的词林编码,一个词语可以有多个编码,通过Set给出
81 | * @param word
82 | * @return
83 | */
84 | public Set getCilinCoding(String word){
85 | return wordIndex.get(word);
86 | }
87 |
88 | public Set getCilinWords(String code){
89 | return codeIndex.get(code);
90 | }
91 |
92 | public static void main(String[] args) {
93 | CilinDb db = CilinDb.getInstance();
94 | String code = db.getCilinCoding("中国").iterator().next();
95 | System.out.println(CilinCoding.printCoding(code));
96 | System.out.println(db.getCilinWords(code));
97 | }
98 | }
99 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet/Hownet.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.word.hownet;
2 |
3 | import java.io.IOException;
4 |
5 | import org.slf4j.Logger;
6 | import org.slf4j.LoggerFactory;
7 | import ruc.irm.similarity.Similaritable;
8 | import ruc.irm.similarity.word.hownet2.concept.BaseConceptParser;
9 | import ruc.irm.similarity.word.hownet2.concept.XiaConceptParser;
10 | import ruc.irm.similarity.word.hownet2.sememe.XiaSememeParser;
11 | import ruc.irm.similarity.word.hownet2.sememe.BaseSememeParser;
12 |
13 | /**
14 | * Hownet的主控制类, 通过知网的概念和义原及其关系计算汉语词语之间的相似度.
15 | * 相似度的计算理论参考论文《汉语词语语义相似度计算研究》
16 | *
17 | * @author 夏天
18 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
19 | *
20 | * @see ruc.irm.similarity.Similaritable
21 | */
22 | public class Hownet implements Similaritable{
23 | /** the logger */
24 | private static final Logger LOG = LoggerFactory.getLogger(Hownet.class);
25 | /** 知网的单例 */
26 | private static Hownet instance = null;
27 |
28 | private BaseConceptParser conceptParser = null;
29 |
30 | private Hownet(){
31 | try {
32 | BaseSememeParser sememeParser = new XiaSememeParser();
33 | conceptParser = new XiaConceptParser(sememeParser);
34 | } catch (IOException e) {
35 | e.printStackTrace();
36 | LOG.error(e.toString());
37 | }
38 | }
39 |
40 | /**
41 | * 单例获取知网对象
42 | * @return
43 | */
44 | public static Hownet instance(){
45 | if(null == instance){
46 | instance = new Hownet();
47 | }
48 |
49 | return instance;
50 | }
51 |
52 | /**
53 | * 获取概念解析器
54 | * @return
55 | */
56 | public BaseConceptParser getConceptParser(){
57 | return conceptParser;
58 | }
59 |
60 | public double getSimilarity(String item1, String item2) {
61 | return conceptParser.getSimilarity(item1, item2);
62 | }
63 |
64 | }
65 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet/HownetMeta.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.word.hownet;
2 |
/**
 * Metadata for Hownet: algorithm identifiers, the symbol table and the
 * numeric parameters used by the similarity computation.
 *
 * <p>All members are constants (interface fields are implicitly
 * {@code public static final}).
 *
 * @author Xia Tian (夏天)
 * @organization Knowledge Engineering Lab, School of Information Resource
 *               Management, Renmin University of China
 */
public interface HownetMeta {
    /** Algorithm of XIA Tian */
    int ALGORITHM_XIA = 1;

    /** Algorithm of LIU Qun */
    int ALGORITHM_LIU = 2;

    /**
     * Hownet symbol descriptions: each row is {symbol, description}.
     */
    String[][] Symbol_Descriptions = {
        {"#", "表示与其相关"},
        {"%", "是其部分"},
        {"$", "可以被该V处置,或是该V的受事、对象、领有物,或内容"},
        {"*", "施事或工具"},
        {"+", "所标记的角色是隐性的,几乎在实际语言中不会出现"},
        {"&", "指向"},
        {"~", "多半是,多半有,很可能"},
        {"@", "可以做V的空间或时间"},
        {"?", "可以使N的材料"},
        {"(", "至于其中的应该是一个词标记"},
        {"^", "不存在,或没有,或不能"},
        {"!", "表示某一属性为一敏感的属性,如味道之与食物"},
        {"[", "标示概念的共性属性"},
    };

    /** gamma: the similarity between a concrete word and a sememe is always a small constant. */
    double gamma = 0.2;

    /** delta: the similarity between any non-empty value and an empty value is a small constant, here 0.2. */
    double delta = 0.2;

    /** beta1: weight of the first basic sememe description of a content-word concept. */
    double beta1 = 0.5;
    /** beta2: weight of the other basic sememe descriptions of a content-word concept. */
    double beta2 = 0.2;
    /** beta3: weight of the relation sememe descriptions of a content-word concept. */
    double beta3 = 0.17;
    /** beta4: weight of the symbol sememe descriptions of a content-word concept. */
    double beta4 = 0.13;

    /**
     * Theta: the maximum similarity between a sememe of the later concept and
     * all sememes of the reference concept is multiplied by the similarity of
     * the two concepts' main sememes (the main sememes act as a constraint
     * this way). Only when the result exceeds this value does the sememe
     * serve as a reference; redundant, unimportant sememes are dropped.
     */
    double PARAM_THETA = 0.5;
    /**
     * Omega: the maximum similarity between a sememe of the earlier concept
     * and all sememes of the reference concept is multiplied by the
     * similarity of the two concepts' main sememes (the main sememes act as a
     * constraint this way). Only when the result exceeds this value is the
     * sememe symbol of the earlier concept adjusted, as a correction.
     */
    double PARAM_OMEGA = 0.8;
    /** Xi: purpose not documented in the original source. */
    double PARAM_XI = 0.6;

}
77 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet/concept/Concept.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.word.hownet.concept;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 | import java.util.StringTokenizer;
6 |
7 | import ruc.irm.similarity.word.hownet.HownetMeta;
8 |
9 |
10 | /**
11 | * 知网的概念表示类
example和英文部分对于相似度的计算不起作用,考虑到内存开销, 在概念的表示中去掉了这部分数据的对应定义
12 | *
13 | * @author 夏天
14 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
15 | * @deprecated
16 | */
17 | public class Concept implements HownetMeta, Comparable {
18 | /** 中文概念名称 */
19 | protected String word;
20 | /** 词性: Part of Speech */
21 | protected String pos;
22 | /** 定义 */
23 | protected String define;
24 |
25 | /** 是否是实词,false表示为虚词, 一般为实词 */
26 | protected boolean bSubstantive;
27 | /** 第一基本义原 */
28 | protected String mainSememe;
29 | /** 其他基本义原 */
30 | protected String[] secondSememes;
31 | /** 关系义元原 */
32 | protected String[] relationSememes;
33 | /** 关系符号描述 */
34 | protected String[] symbolSememes;
35 |
36 | static String[][] Concept_Type = { { "=", "事件" },
37 | { "aValue|属性值", "属性值" }, { "qValue|数量值", "数量值" },
38 | { "attribute|属性", "属性" }, { "quantity|数量", "数量" },
39 | { "unit|", "单位" }, { "%", "部件" } };
40 |
41 | public Concept(String word, String pos, String def) {
42 | this.word = word;
43 | this.pos = pos;
44 | this.define = (def == null) ? "" : def.trim();
45 |
46 | // 虚词用{***}表示
47 | if (define.length() > 0
48 | && define.charAt(0) == '{'
49 | && define.charAt(define.length() - 1) == '}'){
50 | this.bSubstantive = false;
51 | } else {
52 | this.bSubstantive = true;
53 | }
54 |
55 | parseDefine();
56 | }
57 |
58 | /**
59 | * 处理定义,把定义分为第一基本义元、其他基本义元、关系义元和符号义元四类
60 | */
61 | private void parseDefine() {
62 | List secondList = new ArrayList(); //其他基本义原
63 | List relationList = new ArrayList(); //关系义原
64 | List symbolList = new ArrayList(); //符号义原
65 |
66 | String tokenString = this.define;
67 |
68 | //如果不是实词,则处理“{}”中的内容
69 | if (!this.bSubstantive) {
70 | tokenString = define.substring(1, define.length() - 1);
71 | }
72 |
73 | StringTokenizer token = new StringTokenizer(tokenString, ",", false);
74 |
75 | // 第一个为第一基本义元
76 | if (token.hasMoreTokens()) {
77 | this.mainSememe = token.nextToken();
78 | }
79 |
80 | main_loop: while (token.hasMoreTokens()) {
81 | String item = token.nextToken();
82 | if (item.equals("")) continue;
83 |
84 | // 先判断是否为符号义元
85 | String symbol = item.substring(0, 1);
86 | for(int i=0;i< Symbol_Descriptions.length;i++){
87 | if(symbol.equals( Symbol_Descriptions[i][0])){
88 | symbolList.add(item);
89 | continue main_loop;
90 | }
91 | }
92 |
93 | //如果不是符号义元,则进一步判断是关系义元还是第二基本义元, 带有“=”表示关系义原
94 | if (item.indexOf('=') > 0){
95 | relationList.add(item);
96 | } else {
97 | secondList.add(item);
98 | }
99 | }
100 |
101 | this.secondSememes = secondList.toArray(new String[secondList.size()]);
102 | this.relationSememes = relationList.toArray(new String[relationList.size()]);
103 | this.symbolSememes = symbolList.toArray(new String[symbolList.size()]);
104 | }
105 |
106 | /**
107 | * 获取第一义元
108 | *
109 | * @return
110 | */
111 | public String getMainSememe() {
112 | return mainSememe;
113 | }
114 |
115 | /**
116 | * 获取其他基本义元描述
117 | *
118 | * @return
119 | */
120 | public String[] getSecondSememes() {
121 | return secondSememes;
122 | }
123 |
124 | /**
125 | * 获取关系义元描述
126 | *
127 | * @return
128 | */
129 | public String[] getRelationSememes() {
130 | return relationSememes;
131 | }
132 |
133 | /**
134 | * 获取符号义元描述
135 | *
136 | * @return
137 | */
138 | public String[] getSymbolSememes() {
139 | return symbolSememes;
140 | }
141 |
142 | @Override
143 | public String toString() {
144 | StringBuilder sb = new StringBuilder();
145 | sb.append("name=");
146 | sb.append(this.word);
147 | sb.append("; pos=");
148 | sb.append(this.pos);
149 | sb.append("; define=");
150 | sb.append(this.define);
151 | sb.append("; 第一基本义元:[" + mainSememe);
152 |
153 | sb.append("]; 其他基本义元描述:[");
154 | for(String sem: secondSememes){
155 | sb.append(sem);
156 | sb.append(";");
157 | }
158 |
159 | sb.append("]; [关系义元描述:");
160 | for(String sem: relationSememes){
161 | sb.append(sem);
162 | sb.append(";");
163 | }
164 |
165 | sb.append("]; [关系符号描述:");
166 | for(String sem: symbolSememes){
167 | sb.append(sem);
168 | sb.append(";");
169 | }
170 | sb.append("]");
171 | return sb.toString();
172 | }
173 |
174 | /**
175 | * 是实词还是虚词
176 | *
177 | * @return true:实词;false:虚词
178 | */
179 | public boolean isSubstantive() {
180 | return this.bSubstantive;
181 | }
182 |
183 | public String getWord() {
184 | return word;
185 | }
186 |
187 | public void setWord(String word) {
188 | this.word = word;
189 | }
190 |
191 | public String getPos() {
192 | return pos;
193 | }
194 |
195 | public void setPos(String pos) {
196 | this.pos = pos;
197 | }
198 |
199 | public String getDefine() {
200 | return define;
201 | }
202 |
203 | public void setDefine(String define) {
204 | this.define = define;
205 | }
206 |
207 | /**
208 | * 获取该概念的类型
209 | *
210 | * @return
211 | */
212 | public String getType() {
213 | for (int i = 0; i < Concept_Type.length; i++) {
214 | if (define.toUpperCase().indexOf(Concept_Type[i][0].toUpperCase()) >= 0) {
215 | return Concept_Type[i][1];
216 | }
217 | }
218 | return "普通概念";
219 | }
220 |
221 | /**
222 | * 按照概念的名称进行比较
223 | */
224 | public int compareTo(Concept o) {
225 | return word.compareTo(o.word);
226 | }
227 |
228 | //////////////////////////////////////////////
229 | /**
230 | * 方便在parse中比较概念词语加入的方法
231 | * @param another
232 | * @return
233 | */
234 | public int compareTo(String another){
235 | return word.compareTo(another);
236 | }
237 |
238 | public boolean equals(String another){
239 | return word.equals(another);
240 | }
241 | }
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet/concept/ConceptDictTraverseEvent.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.word.hownet.concept;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.File;
5 | import java.io.FileOutputStream;
6 | import java.io.InputStream;
7 | import java.io.InputStreamReader;
8 | import java.io.PrintWriter;
9 | import java.util.ArrayList;
10 | import java.util.Arrays;
11 | import java.util.List;
12 |
13 | import javax.xml.parsers.DocumentBuilder;
14 | import javax.xml.parsers.DocumentBuilderFactory;
15 | import javax.xml.transform.OutputKeys;
16 | import javax.xml.transform.Transformer;
17 | import javax.xml.transform.TransformerFactory;
18 | import javax.xml.transform.dom.DOMSource;
19 | import javax.xml.transform.stream.StreamResult;
20 |
21 | import org.w3c.dom.Document;
22 | import org.w3c.dom.Element;
23 |
24 | import ruc.irm.similarity.util.TraverseEvent;
25 |
26 | /**
27 | * 实现遍历加载概念信息到概念表中, 概念词典的组织以知网导出的格式为标准,格式如下:
28 | * 阿斗 N human|人,ProperName|专,past|昔
29 | * 阿爸 N human|人,family|家,male|男
30 | * 即: <概念> <空格或者跳格> <词性> <空格或者跳格> <定义>"
31 | *
32 | * 概念保存到数组中,没有保存到Map中,可以降低对内存空间的使用
33 | *
34 | * @author 夏天
35 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
36 | * @deprecated
37 | */
38 | public class ConceptDictTraverseEvent implements TraverseEvent {
39 | private List conceptList = null;
40 |
/** Creates an event handler with an empty concept list. */
public ConceptDictTraverseEvent(){
    this.conceptList = new ArrayList<Concept>();
}
44 |
45 | public Concept[] getConcepts(){
46 | Concept[] concepts = conceptList.toArray(new Concept[conceptList.size()]);
47 | Arrays.sort(concepts);
48 | return concepts;
49 | }
50 |
51 | /**
52 | * 读取概念词典中的一行,并进行解析处理
53 | */
54 | public boolean visit(String line) {
55 | String word = null;
56 | String pos = null;
57 | String define = "";
58 | char ch;
59 |
60 | //以符号//开始的是注释行
61 | if(line.startsWith("//")){
62 | return true;
63 | }
64 |
65 | int lastPosition = 0; //最近一次处理内容的有意义的开始位置
66 | int processFlag = 0; //当前处理部分的标志 0:处理word; 1:词性;2:定义
67 | //解析出一行中的概念各项数据
68 | loop: for (int position = 0; position < line.length(); position++) {
69 | ch = line.charAt(position);
70 |
71 | if ((ch == ' ') || (ch == '\t') || (position==(line.length()-1))) {
72 | String item = line.substring(lastPosition, (position==(line.length()-1))?(position+1):position);
73 | switch(processFlag){
74 | case 0:
75 | word = item;
76 | processFlag++;
77 | break;
78 | case 1:
79 | pos = item;
80 | processFlag++;
81 | break;
82 | case 2:
83 | //define = item;
84 | //processFlag++;
85 | define = line.substring(lastPosition).trim();
86 | break loop;
87 | case 3:
88 | System.out.println(line);
89 | break;
90 | }
91 |
92 | for( ;(position < line.length()); position++){
93 | ch = line.charAt(position);
94 | if ((ch != ' ') && (ch != '\t')) {
95 | lastPosition = position;
96 | break;
97 | }
98 | }
99 |
100 | }
101 | }
102 | conceptList.add(new Concept(word, pos, define));
103 | return true;
104 | }
105 |
106 | public void saveToXML(File xmlFile) throws Exception{
107 | String conceptFile = getClass().getPackage().getName().replaceAll("\\.", "/") + "/concept.dat";
108 | InputStream input = this.getClass().getClassLoader().getResourceAsStream(conceptFile);
109 | BufferedReader in = new BufferedReader(new InputStreamReader(input, "utf8"));
110 |
111 | DocumentBuilderFactory factory=DocumentBuilderFactory.newInstance();
112 | DocumentBuilder builder=factory.newDocumentBuilder();
113 | Document document=builder.newDocument();
114 | Element root=document.createElement("concepts");
115 | document.appendChild(root);
116 |
117 | String line = null;
118 |
119 | while ((line = in.readLine()) != null) {
120 | saveLineToXML(document, root, line);
121 | }
122 |
123 | input.close();
124 | in.close();
125 |
126 | TransformerFactory tf=TransformerFactory.newInstance();
127 | Transformer transformer=tf.newTransformer();
128 | DOMSource source=new DOMSource(document);
129 | transformer.setOutputProperty(OutputKeys.ENCODING,"utf8");
130 | transformer.setOutputProperty(OutputKeys.INDENT,"yes");
131 | PrintWriter pw=new PrintWriter(new FileOutputStream(xmlFile));
132 | StreamResult result=new StreamResult(pw);
133 | transformer.transform(source,result);
134 | }
135 |
136 |
137 | /**
138 | * 读取概念词典中的一行,并进行解析处理
139 | */
140 | private boolean saveLineToXML(Document document, Element root, String line) {
141 | String word = null;
142 | String pos = null;
143 | String define = "";
144 | char ch;
145 |
146 | //以符号//开始的是注释行
147 | if(line.startsWith("//")){
148 | return true;
149 | }
150 |
151 | int lastPosition = 0; //最近一次处理内容的有意义的开始位置
152 | int processFlag = 0; //当前处理部分的标志 0:处理word; 1:词性;2:定义
153 | //解析出一行中的概念各项数据
154 | loop: for (int position = 0; position < line.length(); position++) {
155 | ch = line.charAt(position);
156 |
157 | if ((ch == ' ') || (ch == '\t') || (position==(line.length()-1))) {
158 | String item = line.substring(lastPosition, (position==(line.length()-1))?(position+1):position);
159 | switch(processFlag){
160 | case 0:
161 | word = item;
162 | processFlag++;
163 | break;
164 | case 1:
165 | pos = item;
166 | processFlag++;
167 | break;
168 | case 2:
169 | //define = item;
170 | //processFlag++;
171 | define = line.substring(lastPosition).trim();
172 | break loop;
173 | case 3:
174 | System.out.println(line);
175 | break;
176 | }
177 |
178 | for( ;(position < line.length()); position++){
179 | ch = line.charAt(position);
180 | if ((ch != ' ') && (ch != '\t')) {
181 | lastPosition = position;
182 | break;
183 | }
184 | }
185 |
186 | }
187 | }
188 |
189 | Element e = document.createElement("c");
190 | e.setAttribute("w", word);
191 | e.setAttribute("p", pos);
192 | e.setAttribute("d", define);
193 | root.appendChild(e);
194 | return true;
195 | }
196 |
197 | public static void main(String[] args) throws Exception {
198 | new ConceptDictTraverseEvent().saveToXML(new File("/home/xiatian/Desktop/concept.xml"));
199 | }
200 |
201 | }
202 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet/concept/ConceptLinkedList.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.word.hownet.concept;
2 |
3 | import java.util.LinkedList;
4 |
5 | /**
6 | * 用于概念处理的LinkedList
7 | *
8 | * @author 夏天
9 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
10 | *
11 | * @param
12 | * @deprecated
13 | */
14 | @SuppressWarnings("serial")
15 | public class ConceptLinkedList extends LinkedList {
16 |
17 | /**
18 | * 删除链表中最后面的size个元素
19 | * @param size
20 | */
21 | public void removeLast(int size){
22 | for(int i=0;i夏天
15 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
16 | * @deprecated
17 | */
18 | public class LiuConceptParser extends ConceptParser{
19 |
20 | private static LiuConceptParser instance = null;
21 |
22 | public static LiuConceptParser getInstance(){
23 | if(instance == null){
24 | try {
25 | instance = new LiuConceptParser();
26 | } catch (IOException e) {
27 | e.printStackTrace();
28 | }
29 | }
30 |
31 | return instance;
32 | }
33 |
34 | private LiuConceptParser(SememeParser sememeParser) throws IOException {
35 | super(sememeParser);
36 | }
37 |
38 | private LiuConceptParser() throws IOException{
39 | super(new LiuqunSememeParser());
40 | }
41 |
42 | @Override
43 | protected double calculate(double sim_v1, double sim_v2, double sim_v3, double sim_v4){
44 | return beta1 * sim_v1
45 | + beta2 * sim_v1 * sim_v2
46 | + beta3 * sim_v1 * sim_v2 * sim_v3
47 | + beta4 * sim_v1 * sim_v2 * sim_v3 * sim_v4;
48 | }
49 |
50 | @Override
51 | public double getSimilarity(String word1, String word2) {
52 | double similarity = 0.0;
53 |
54 | // 如果两个句子相同,则直接返回1.0
55 | if (word1.equals(word2)) {
56 | return 1.0;
57 | }
58 |
59 | Collection concepts1 = getConcepts(word1);
60 | Collection concepts2 = getConcepts(word2);
61 |
62 | //如果是blank,则说明是未登录词, 需要计算组合概念
63 | if(BlankUtils.isBlank(concepts1) || BlankUtils.isBlank(concepts2)){
64 | return 0.0;
65 | }
66 |
67 | //两个for循环分别计算词语所有可能的概念的相似度
68 | for(Concept c1:concepts1){
69 | for(Concept c2:concepts2){
70 | double v = getSimilarity(c1, c2);
71 |
72 | if(v>similarity){
73 | similarity = v;
74 | }
75 |
76 | if(similarity == 1.0){
77 | break;
78 | }
79 | }
80 | }
81 |
82 | return similarity;
83 | }
84 |
85 | }
86 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet/sememe/FastSimpleMap.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.word.hownet.sememe;
2 |
3 | import java.io.IOException;
4 | import java.util.ArrayList;
5 | import java.util.Collection;
6 |
7 | /**
8 | * 一种新的Map,跟标准的Map不同,它的的Key可以有重复, 内部采用快速排序和二分查找,
9 | * 保持较少的变量,结构简单,可根据主键查找返回的结果是一个数组
10 | *
11 | * @author 夏天
12 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
13 | *
14 | * @param
15 | * @param
16 | * @deprecated
17 | */
18 | public class FastSimpleMap, V> {
19 | private K[] keys;
20 | private V[] values;
21 |
22 | public FastSimpleMap(K[] keys, V[] values) throws IOException{
23 | if(keys.length!=values.length){
24 | throw new IOException("keys length must be equals values");
25 | }
26 | this.keys = keys;
27 | this.values = values;
28 |
29 | // 根据keys进行排序
30 | quicksort(0, keys.length-1);
31 | }
32 |
33 | /**
34 | * 查找键对应的值集合
35 | * @param key
36 | * @return
37 | */
38 | public Collection get(K key) {
39 | int low = 0;
40 | int high = keys.length - 1;
41 |
42 | Collection results = new ArrayList();
43 |
44 | while (low <= high) {
45 | int mid = (low + high) >> 1;
46 | K item = keys[mid];
47 | int cmp = key.compareTo(item);
48 |
49 | if (cmp > 0) {
50 | low = mid + 1;
51 | } else if (cmp < 0) {
52 | high = mid - 1;
53 | } else {
54 | // 找到起始位置,该位置前后相同的都是该主键对应的值
55 | for(int i=mid;i>=0 && keys[i].equals(key); i--){
56 | results.add(values[i]);
57 | }
58 | for(int i=mid+1; i>1];
84 |
85 | //partition
86 | do {
87 | while (keys[i].compareTo(x)<0) i++;
88 | while (keys[j].compareTo(x)>0) j--;
89 |
90 | if (i<=j)
91 | {
92 | h=keys[i]; keys[i]=keys[j]; keys[j]=h;
93 | v=values[i]; values[i]=values[j]; values[j]=v;
94 | i++; j--;
95 | }
96 | } while (i<=j);
97 |
98 | // recursion
99 | if (low夏天
9 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
10 | *
11 | * @author xiatian
12 | * @version 1.0
13 | * @deprecated
14 | */
15 | public class LiuqunSememeParser extends SememeParser {
16 |
17 | /** 计算义元相似度的可调节的参数,默认为1.6 */
18 | private final float alpha = 1.6f;
19 |
20 | public LiuqunSememeParser() throws IOException {
21 | super();
22 | }
23 |
24 | /**
25 | * 计算两个义元之间的相似度,由于义元可能相同,计算结果为其中相似度最大者
26 | *
similarity = alpha/(distance+alpha)
27 | *
28 | * @param key1
29 | * @param key2
30 | * @return
31 | */
32 | @Override
33 | public double getSimilarity(String item1, String item2) {
34 | int pos;
35 |
36 | // 如果为空串,直接返回0
37 | if (item1 == null || item2 == null || item1.equals("")
38 | || item2.equals(""))
39 | return 0.0;
40 |
41 | String key1 = item1.trim();
42 | String key2 = item2.trim();
43 |
44 | // 去掉()符号
45 | if ((key1.charAt(0) == '(') && (key1.charAt(key1.length() - 1) == ')')) {
46 | if (key2.charAt(0) == '(' && key2.charAt(key2.length() - 1) == ')') {
47 | key1 = key1.substring(1, key1.length() - 1);
48 | key2 = key2.substring(1, key2.length() - 1);
49 | } else {
50 | return 0.0;
51 | }
52 | }
53 |
54 | // 处理关系义元,即x=y的情况
55 | if ((pos = key1.indexOf('=')) > 0) {
56 | int pos2 = key2.indexOf('=');
57 | // 如果是关系义元,则判断前面部分是否相同,如果相同,则转为计算后面部分的相似度,否则为0
58 | if ((pos == pos2)
59 | && key1.substring(0, pos).equals(key2.substring(0, pos2))) {
60 | key1 = key1.substring(pos + 1);
61 | key2 = key2.substring(pos2 + 1);
62 | } else {
63 | return 0.0;
64 | }
65 | }
66 |
67 | // 处理符号义元,即前面有特殊符号的义元
68 | String symbol1 = key1.substring(0, 1);
69 | String symbol2 = key2.substring(0, 1);
70 |
71 | for (int i = 0; i < Symbol_Descriptions.length; i++) {
72 | if (symbol1.equals(Symbol_Descriptions[i][0])) {
73 | if (symbol1.equals(symbol2)) {
74 | key1 = item1.substring(1);
75 | key2 = item2.substring(1);
76 | break;
77 | } else {
78 | return 0.0; // 如果不是同一关系符号,则相似度直接返回0
79 | }
80 | }
81 | }
82 |
83 | if ((pos = key1.indexOf("|")) >= 0) {
84 | key1 = key1.substring(pos + 1);
85 | }
86 | if ((pos = key2.indexOf("|")) >= 0) {
87 | key2 = key2.substring(pos + 1);
88 | }
89 |
90 | int distance = getDistance(key1, key2);
91 | if (distance < 0)
92 | return 0.0;
93 | else
94 | return alpha / (distance + alpha);
95 | }
96 |
97 | @Override
98 | public double getSimilarity(Sememe sem1, Sememe sem2) {
99 | int distance = getDistance(sem1, sem2);
100 | if (distance <= 0)
101 | return 0.0f;
102 | else
103 | return alpha / (distance + alpha);
104 | }
105 | }
106 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet/sememe/MySememeParser.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.word.hownet.sememe;
2 |
3 | import java.io.IOException;
4 |
5 | import ruc.irm.similarity.util.BlankUtils;
6 |
7 |
8 | /**
9 | * 义原相似度计算, 实现了SememeParser中定义的抽象方法
10 | *
11 | * @author 夏天
12 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
13 | * @deprecated
14 | */
15 | public class MySememeParser extends SememeParser {
16 |
17 | public MySememeParser() throws IOException{
18 | super();
19 | }
20 |
21 | /**
22 | * 计算两个义原的相似度
23 | */
24 | @Override
25 | public double getSimilarity(final Sememe sememe1, final Sememe sememe2) {
26 | Sememe sem1 = sememe1;
27 | Sememe sem2 = sememe2;
28 |
29 | if (sememe1 == null || sememe2 == null){
30 | return 0.0f;
31 | }else if(sememe1.getId() == sememe2.getId()){
32 | return 1.0f;
33 | }
34 |
35 | //变为深度相同,然后一次上找共同的父节点
36 | int level = sememe1.getDepth() - sememe2.getDepth();
37 | for (int i = 0; i < ((level < 0) ? level * -1 : level); i++) {
38 | if (level > 0){
39 | sem1 = SEMEMES[sem1.getParentId()];
40 | }else{
41 | sem2 = SEMEMES[sem2.getParentId()];
42 | }
43 | }
44 |
45 | while(sem1.getId() != sem2.getId()){
46 | // 如果有一个已经到达根节点,仍然不同,则返回0
47 | if (sem1.getId() == sem1.getParentId()
48 | || sem2.getId() == sem2.getParentId()) {
49 | return 0.0f;
50 | }
51 |
52 | sem1 = SEMEMES[sem1.getParentId()];
53 | sem2 = SEMEMES[sem2.getParentId()];
54 | }
55 |
56 | return sem1.getDepth()*2.0f/(sememe1.getDepth() + sememe2.getDepth());
57 | }
58 |
59 | /**
60 | * 计算两个义元之间的相似度,由于义元可能相同,计算结果为其中相似度最大者 similarity = alpha/(distance+alpha),
61 | * 如果两个字符串相同或都为空,直接返回1.0
62 | *
63 | * @param key1 第一个义原字符串
64 | * @param key2 第二个义原字符串
65 | * @return
66 | */
67 | @Override
68 | public double getSimilarity(String item1, String item2) {
69 | if(BlankUtils.isBlankAll(item2, item2)){
70 | return 1.0;
71 | } else if(BlankUtils.isBlankAtLeastOne(item1, item2)){
72 | return 0.0;
73 | } else if(item1.equals(item2)){
74 | return 1.0;
75 | }
76 |
77 | String key1 = item1.trim();
78 | String key2 = item2.trim();
79 |
80 | // 去掉()符号
81 | if ((key1.charAt(0) == '(') && (key1.charAt(key1.length() - 1) == ')')) {
82 |
83 | if (key2.charAt(0) == '(' && key2.charAt(key2.length() - 1) == ')') {
84 | key1 = key1.substring(1, key1.length() - 1);
85 | key2 = key2.substring(1, key2.length() - 1);
86 | } else {
87 | return 0.0;
88 | }
89 |
90 | }
91 |
92 | // 处理关系义元,即x=y的情况
93 | int pos = key1.indexOf('=');
94 | if (pos > 0) {
95 | int pos2 = key2.indexOf('=');
96 | // 如果是关系义元,则判断前面部分是否相同,如果相同,则转为计算后面部分的相似度,否则为0
97 | if ((pos == pos2)
98 | && key1.substring(0, pos).equals(key2.substring(0, pos2))) {
99 | key1 = key1.substring(pos + 1);
100 | key2 = key2.substring(pos2 + 1);
101 | } else {
102 | return 0.0;
103 | }
104 | }
105 |
106 | // 处理符号义元,即前面有特殊符号的义元
107 | String symbol1 = key1.substring(0, 1);
108 | String symbol2 = key2.substring(0, 1);
109 |
110 | for (int i = 0; i < Symbol_Descriptions.length; i++) {
111 | if (symbol1.equals(Symbol_Descriptions[i][0])) {
112 | if (symbol1.equals(symbol2)) {
113 | key1 = item1.substring(1);
114 | key2 = item2.substring(1);
115 | break;
116 | } else {
117 | return 0.0; // 如果不是同一关系符号,则相似度直接返回0
118 | }
119 | }
120 | }
121 |
122 | if ((pos = key1.indexOf("|")) >= 0) {
123 | key1 = key1.substring(pos + 1);
124 | }
125 | if ((pos = key2.indexOf("|")) >= 0) {
126 | key2 = key2.substring(pos + 1);
127 | }
128 |
129 | // 如果两个字符串相等,直接返回距离为0
130 | if (key1.equals(key2)) {
131 | return 1.0;
132 | }
133 |
134 | Integer[] myset1 = getSememes(key1);
135 | Integer[] myset2 = getSememes(key2);
136 |
137 | double similarity = 0.0;
138 | for(int id1:myset1){
139 | for(int id2:myset2){
140 | double s = getSimilarity(SEMEMES[id1], SEMEMES[id2]);
141 | if(s>similarity){
142 | similarity = s;
143 | }
144 | }
145 | }
146 |
147 | return similarity;
148 | }
149 |
150 |
151 | }
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet/sememe/Sememe.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.word.hownet.sememe;
2 |
3 | /**
4 | * 描述知网义原的基本对象, 出于性能考虑,把未用到的英文名称、定义等在加载时忽略, 更准确的做法是以[英文定义|中文定义]
5 | * 作为一个整理进行处理,不过绝大多数只根据中文定义就可以标识出来,因此忽略不计。
6 | *
7 | * @author 夏天
8 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
9 | * @deprecated
10 | */
11 | public class Sememe {
12 | /** 义原编号 */
13 | private int id;
14 | /** 指向上位义元号 */
15 | private int parentId;
16 | /** 义原在义原树中的深度 */
17 | private int depth;
18 | /** 义原的中文名称*/
19 | private String cnWord;
20 | /** 义原的英文名称 */
21 | private String enWord;
22 | /** 义原的定义,如果没有(例如数量),则为空串 */
23 | private String define;
24 | /** 义原的类型 */
25 | private int type;
26 |
27 | /**
28 | * 每一行的形式为:be|是 {relevant,isa}/{relevant,descriptive}
29 | *
或者 official|官 [#organization|组织,#employee|员]
30 | *
或者 amount|多少
31 | *
把相应的部分赋予不同的属性
32 | * 出于性能考虑,把未用到的英文名称、定义等忽略
33 | * @param id
34 | * @param parentId
35 | * @param item 读取文件中的一行
36 | */
37 | public Sememe(int id, int parentId, int depth, String item) {
38 | this.id = id;
39 | this.parentId = parentId;
40 | this.depth = depth;
41 |
42 | int pos = item.indexOf('|');
43 | if (pos < 0) {
44 | this.cnWord = item;
45 | this.enWord = item;
46 | } else {
47 | this.enWord = item.substring(0, pos);
48 |
49 | // 去掉"|"符号
50 | String nextPart = item.substring(pos + 1);
51 | pos = nextPart.indexOf(' ');
52 | if (pos <= 0) {
53 | this.cnWord = nextPart;
54 | } else {
55 | this.cnWord = nextPart.substring(0, pos);
56 | this.define = nextPart.substring(pos).trim();
57 | }
58 | }
59 | }
60 |
61 | public int getId() {
62 | return id;
63 | }
64 |
65 | public void setId(int id) {
66 | this.id = id;
67 | }
68 |
69 | public int getParentId() {
70 | return parentId;
71 | }
72 |
73 | public void setParentId(int parentId) {
74 | this.parentId = parentId;
75 | }
76 |
77 | public int getDepth() {
78 | return depth;
79 | }
80 |
81 | public void setDepth(int depth) {
82 | this.depth = depth;
83 | }
84 |
85 | public String getCnWord() {
86 | return cnWord;
87 | }
88 |
89 | public void setCnWord(String cnWord) {
90 | this.cnWord = cnWord;
91 | }
92 |
93 | public String getEnWord() {
94 | return enWord;
95 | }
96 |
97 | public void setEnWord(String enWord) {
98 | this.enWord = enWord;
99 | }
100 |
101 | public String getDefine() {
102 | return define;
103 | }
104 |
105 | public void setDefine(String define) {
106 | this.define = define;
107 | }
108 |
109 | public int getType() {
110 | return type;
111 | }
112 |
113 | public void setType(int type) {
114 | this.type = type;
115 | }
116 |
117 | @Override
118 | public String toString(){
119 | StringBuilder sb = new StringBuilder();
120 | sb.append("id=");
121 | sb.append(id);
122 | sb.append("; parentId=");
123 | sb.append(parentId);
124 | sb.append("; depth=");
125 | sb.append(depth);
126 | sb.append("; cnWord=");
127 | sb.append(cnWord);
128 | sb.append("; enWord=");
129 | sb.append(enWord);
130 | sb.append("; define=");
131 | sb.append(define);
132 | return sb.toString();
133 | }
134 |
135 | }
136 |
137 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet/sememe/SememeParser.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.word.hownet.sememe;
2 |
3 | import java.io.IOException;
4 | import java.io.InputStream;
5 | import java.util.Collection;
6 |
7 | import org.slf4j.Logger;
8 | import org.slf4j.LoggerFactory;
9 | import ruc.irm.similarity.Similaritable;
10 | import ruc.irm.similarity.util.BlankUtils;
11 | import ruc.irm.similarity.util.FileUtils;
12 | import ruc.irm.similarity.word.hownet.HownetMeta;
13 |
14 | /**
15 | * 义原解析器, 包括义元数据的加载,义元的组织、索引、查询 以及义元的距离计算和相似度计算等.
16 | * 算法的核心思想请参看论文《汉语词语语义相似度计算研究》
17 | *
18 | * @author 夏天
19 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
20 | *
21 | * @see ruc.irm.similarity.Similaritable
22 | * @deprecated
23 | */
24 | public abstract class SememeParser implements HownetMeta, Similaritable {
25 | protected Logger LOG = LoggerFactory.getLogger(this.getClass());
26 |
27 | /** 所有的义原都存放到一个数组之中,并且义元的ID号与数组的下标相同 */
28 | protected Sememe[] SEMEMES;
29 |
30 | /** 通过对义原的汉语词义进行索引,根据该索引快速定位义原,找出义原的id,再到sememes中查找 */
31 | private FastSimpleMap sememeMap = null;
32 |
33 | public SememeParser() throws IOException{
34 | String sememeFile = getClass().getPackage().getName().replaceAll("\\.", "/") + "/sememe.dat";
35 |
36 | InputStream input = this.getClass().getClassLoader().getResourceAsStream(sememeFile);
37 | load(input, "UTF-8");
38 | }
39 |
40 | /**
41 | * 获取两个义原描述串的相似度
42 | * @param sememeName1
43 | * @param sememeName2
44 | * @see ke.commons.similarity.Similariable
45 | * @return
46 | */
47 | public abstract double getSimilarity(String sememeName1, String sememeName2);
48 |
49 | /**
50 | * 获取两个确定义原的相似度
51 | * @param sememe1
52 | * @param sememe2
53 | * @return
54 | */
55 | public abstract double getSimilarity(Sememe sememe1, Sememe sememe2);
56 |
57 | /**
58 | * 从文件中加载义元知识
59 | *
60 | * @throws IOException
61 | */
62 | public void load(InputStream input, String encoding) throws IOException {
63 | SememeDictTraverseEvent event = new SememeDictTraverseEvent();
64 | LOG.info("loading sememe dictionary...");
65 | long time = System.currentTimeMillis();
66 | FileUtils.traverseLines(input, encoding, event);
67 | this.SEMEMES = event.getSememes();
68 |
69 | String[] keys = new String[SEMEMES.length];
70 | Integer[] values = new Integer[SEMEMES.length];
71 |
72 | //设置索引
73 | for(int i=0; i(keys, values);
78 |
79 | time = System.currentTimeMillis() - time;
80 | LOG.info("sememe dictionary load completely. time elapsed: " + time);
81 | }
82 |
83 | /**
84 | * 根据汉语定义计算义元之间的距离,Integer.MAX_VALUE代表两个义元之间的距离为无穷大,
85 | *
由于可能多个义元有相同的汉语词语,故计算结果为其中距离最小者
86 | *
87 | * @param key1
88 | * @param key2
89 | * @return
90 | */
91 | public int getDistance(String key1, String key2) {
92 | int distance = Integer.MAX_VALUE;
93 |
94 | // 如果两个字符串相等,直接返回距离为0
95 | if (key1.equals(key2)) {
96 | return 0;
97 | }
98 |
99 | Integer[] semArray1 = getSememes(key1);
100 | Integer[] semArray2 = getSememes(key2);
101 |
102 | // 如果key1或者key2不是义元,并且key1<>key2,则返回无穷大
103 | if (semArray1.length == 0 || semArray2.length == 0) {
104 | return Integer.MAX_VALUE;
105 | }
106 |
107 | for(int i:semArray1){
108 | for(int j:semArray2){
109 | int d = getDistance(SEMEMES[i], SEMEMES[j]);
110 | if(d 0)
140 | mysem1 = SEMEMES[mysem1.getParentId()];
141 | else
142 | mysem2 = SEMEMES[mysem2.getParentId()];
143 | distance++;
144 | }
145 |
146 | //从不同的分支(深度相同)同时向上寻找共同的祖先节点
147 | while (mysem1.getId() != mysem2.getId()) {
148 | // 如果已经到达根节点,仍然不同,则返回无穷大(-1)
149 | if (mysem1.getId() == mysem1.getParentId()
150 | || mysem2.getId() == mysem2.getParentId()) {
151 | distance = Integer.MAX_VALUE;
152 | break;
153 | }
154 |
155 | mysem1 = SEMEMES[mysem1.getParentId()];
156 | mysem2 = SEMEMES[mysem2.getParentId()];
157 | distance += 2;
158 | }
159 |
160 | return distance;
161 | }
162 |
163 | /**
164 | * 获取从该义元到根节点的路径表示字符串
165 | *
166 | * @param key
167 | * @return
168 | */
169 | public String getPath(String key) {
170 | StringBuilder path = new StringBuilder();
171 |
172 | Sememe sem = getSememe(key);
173 | while (sem != null && sem.getId() != sem.getParentId()) {
174 | path.insert(0, "->" + sem.getCnWord());
175 | sem = SEMEMES[sem.getParentId()];
176 | }
177 |
178 | if (sem != null){
179 | path.insert(0, "->" + sem.getCnWord());
180 | }
181 | path.insert(0, "START");
182 | return path.toString();
183 | }
184 |
185 | /**
186 | * 根据义原的名字,获取该义原的位置信息,义原体系中有时会有一个名字对应多个义原,一并返回到
187 | * 义原数组中
188 | * @param sememeName
189 | * @return
190 | */
191 | public Integer[] getSememes(String sememeName) {
192 | Collection ids = sememeMap.get(sememeName);
193 |
194 | return ids.toArray(new Integer[ids.size()]);
195 | }
196 |
197 | /**
198 | * 获取其中的一个义原,大部分义原就只有一个
199 | * @param sememeName
200 | * @return
201 | */
202 | public Sememe getSememe(String sememeName){
203 | Integer[] ids = getSememes(sememeName);
204 |
205 | if(BlankUtils.isBlank(ids)){
206 | return null;
207 | }else{
208 | return SEMEMES[ids[0]];
209 | }
210 | }
211 |
212 | /**
213 | * 过滤义原字符串,去掉其中的英文部分
214 | * @param sememeString
215 | * @return
216 | */
217 | protected String filterSememeString(String sememeString){
218 | int pos = sememeString.indexOf("|");
219 | if (pos >= 0) {
220 | sememeString = sememeString.substring(pos + 1);
221 | }
222 | return sememeString;
223 | }
224 |
225 | }
226 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet/sememe/SememeType.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.word.hownet.sememe;
2 |
/**
 * Integer codes for the sememe types.
 *
 * <ul>
 * <li>0: unknown</li>
 * <li>1: Event</li>
 * <li>2: Entity</li>
 * <li>3: Attribute</li>
 * <li>4: Quantity</li>
 * <li>5: aValue (attribute value)</li>
 * <li>6: qValue (quantity value)</li>
 * <li>7: Secondary Feature</li>
 * <li>8: Syntax</li>
 * <li>9: EventRole (dynamic role)</li>
 * <li>10: EventFeatures (dynamic feature)</li>
 * </ul>
 *
 * Codes 1-7 are basic sememes, 8 is the syntax sememe, 9 and 10 are relation sememes.
 *
 * @author Xia Tian
 * @organization Knowledge Engineering Lab, School of Information Resource Management, Renmin University of China
 * @deprecated
 */
public interface SememeType {
    // Interface fields are implicitly public, static and final.

    /** Unknown type. */
    int Unknown = 0;

    /** Event type. */
    int Event = 1;

    /** Entity type. */
    int Entity = 2;

    /** Attribute type. */
    int Attribute = 3;

    /** Quantity type. */
    int Quantity = 4;

    /** aValue (attribute value) type. */
    int AValue = 5;

    /** qValue (quantity value) type. */
    int QValue = 6;

    /** Secondary Feature type. */
    int SecondaryFeature = 7;

    /** Syntax type. */
    int Syntax = 8;

    /** EventRole (dynamic role) type. */
    int EventRoleAndFeature = 9;

    /** EventFeatures (dynamic feature) type. */
    int EventFeature = 10;

}
60 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet2/concept/ConceptDictTraverseEvent.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.word.hownet2.concept;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.File;
5 | import java.io.FileOutputStream;
6 | import java.io.InputStream;
7 | import java.io.InputStreamReader;
8 | import java.io.PrintWriter;
9 | import java.util.ArrayList;
10 | import java.util.Arrays;
11 | import java.util.List;
12 |
13 | import javax.xml.parsers.DocumentBuilder;
14 | import javax.xml.parsers.DocumentBuilderFactory;
15 | import javax.xml.transform.OutputKeys;
16 | import javax.xml.transform.Transformer;
17 | import javax.xml.transform.TransformerFactory;
18 | import javax.xml.transform.dom.DOMSource;
19 | import javax.xml.transform.stream.StreamResult;
20 |
21 | import org.w3c.dom.Document;
22 | import org.w3c.dom.Element;
23 |
24 | import ruc.irm.similarity.util.TraverseEvent;
25 | import ruc.irm.similarity.word.hownet2.concept.Concept;
26 |
27 |
28 | /**
29 | * 实现遍历加载概念信息到概念表中, 概念词典的组织以知网导出的格式为标准,格式如下:
30 | * 阿斗 N human|人,ProperName|专,past|昔
31 | * 阿爸 N human|人,family|家,male|男
32 | * 即: <概念> <空格或者跳格> <词性> <空格或者跳格> <定义>"
33 | *
34 | * 概念保存到数组中,没有保存到Map中,可以降低对内存空间的使用
35 | *
36 | * @author 夏天
37 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
38 | */
39 | public class ConceptDictTraverseEvent implements TraverseEvent {
40 | private List conceptList = null;
41 |
42 | public ConceptDictTraverseEvent(){
43 | conceptList = new ArrayList();
44 | }
45 |
46 | public Concept[] getConcepts(){
47 | Concept[] concepts = conceptList.toArray(new Concept[conceptList.size()]);
48 | Arrays.sort(concepts);
49 | return concepts;
50 | }
51 |
52 | /**
53 | * 读取概念词典中的一行,并进行解析处理
54 | */
55 | public boolean visit(String line) {
56 | String word = null;
57 | String pos = null;
58 | String define = "";
59 | char ch;
60 |
61 | //以符号//开始的是注释行
62 | if(line.startsWith("//")){
63 | return true;
64 | }
65 |
66 | int lastPosition = 0; //最近一次处理内容的有意义的开始位置
67 | int processFlag = 0; //当前处理部分的标志 0:处理word; 1:词性;2:定义
68 | //解析出一行中的概念各项数据
69 | loop: for (int position = 0; position < line.length(); position++) {
70 | ch = line.charAt(position);
71 |
72 | if ((ch == ' ') || (ch == '\t') || (position==(line.length()-1))) {
73 | String item = line.substring(lastPosition, (position==(line.length()-1))?(position+1):position);
74 | switch(processFlag){
75 | case 0:
76 | word = item;
77 | processFlag++;
78 | break;
79 | case 1:
80 | pos = item;
81 | processFlag++;
82 | break;
83 | case 2:
84 | //define = item;
85 | //processFlag++;
86 | define = line.substring(lastPosition).trim();
87 | break loop;
88 | case 3:
89 | System.out.println(line);
90 | break;
91 | }
92 |
93 | for( ;(position < line.length()); position++){
94 | ch = line.charAt(position);
95 | if ((ch != ' ') && (ch != '\t')) {
96 | lastPosition = position;
97 | break;
98 | }
99 | }
100 |
101 | }
102 | }
103 | conceptList.add(new Concept(word, pos, define));
104 | return true;
105 | }
106 |
107 | public void saveToXML(File xmlFile) throws Exception{
108 | String conceptFile = getClass().getPackage().getName().replaceAll("\\.", "/") + "/concept.dat";
109 | InputStream input = this.getClass().getClassLoader().getResourceAsStream(conceptFile);
110 | BufferedReader in = new BufferedReader(new InputStreamReader(input, "utf8"));
111 |
112 | DocumentBuilderFactory factory=DocumentBuilderFactory.newInstance();
113 | DocumentBuilder builder=factory.newDocumentBuilder();
114 | Document document=builder.newDocument();
115 | Element root=document.createElement("concepts");
116 | document.appendChild(root);
117 |
118 | String line = null;
119 |
120 | while ((line = in.readLine()) != null) {
121 | saveLineToXML(document, root, line);
122 | }
123 |
124 | input.close();
125 | in.close();
126 |
127 | TransformerFactory tf=TransformerFactory.newInstance();
128 | Transformer transformer=tf.newTransformer();
129 | DOMSource source=new DOMSource(document);
130 | transformer.setOutputProperty(OutputKeys.ENCODING,"utf8");
131 | transformer.setOutputProperty(OutputKeys.INDENT,"yes");
132 | PrintWriter pw=new PrintWriter(new FileOutputStream(xmlFile));
133 | StreamResult result=new StreamResult(pw);
134 | transformer.transform(source,result);
135 | }
136 |
137 |
138 | /**
139 | * 读取概念词典中的一行,并进行解析处理
140 | */
141 | private boolean saveLineToXML(Document document, Element root, String line) {
142 | String word = null;
143 | String pos = null;
144 | String define = "";
145 | char ch;
146 |
147 | //以符号//开始的是注释行
148 | if(line.startsWith("//")){
149 | return true;
150 | }
151 |
152 | int lastPosition = 0; //最近一次处理内容的有意义的开始位置
153 | int processFlag = 0; //当前处理部分的标志 0:处理word; 1:词性;2:定义
154 | //解析出一行中的概念各项数据
155 | loop: for (int position = 0; position < line.length(); position++) {
156 | ch = line.charAt(position);
157 |
158 | if ((ch == ' ') || (ch == '\t') || (position==(line.length()-1))) {
159 | String item = line.substring(lastPosition, (position==(line.length()-1))?(position+1):position);
160 | switch(processFlag){
161 | case 0:
162 | word = item;
163 | processFlag++;
164 | break;
165 | case 1:
166 | pos = item;
167 | processFlag++;
168 | break;
169 | case 2:
170 | //define = item;
171 | //processFlag++;
172 | define = line.substring(lastPosition).trim();
173 | break loop;
174 | case 3:
175 | System.out.println(line);
176 | break;
177 | }
178 |
179 | for( ;(position < line.length()); position++){
180 | ch = line.charAt(position);
181 | if ((ch != ' ') && (ch != '\t')) {
182 | lastPosition = position;
183 | break;
184 | }
185 | }
186 |
187 | }
188 | }
189 |
190 | Element e = document.createElement("c");
191 | e.setAttribute("w", word);
192 | e.setAttribute("p", pos);
193 | e.setAttribute("d", define);
194 | root.appendChild(e);
195 | return true;
196 | }
197 |
198 | public static void main(String[] args) throws Exception {
199 | new ConceptDictTraverseEvent().saveToXML(new File("/home/xiatian/Desktop/concept.xml"));
200 | }
201 |
202 | }
203 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet2/concept/ConceptLinkedList.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.word.hownet2.concept;
2 |
3 | import java.util.LinkedList;
4 |
5 | /**
6 | * 用于概念处理的LinkedList
7 | *
8 | * @author 夏天
9 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
10 | *
11 | * @param
12 | */
13 | @SuppressWarnings("serial")
14 | public class ConceptLinkedList extends LinkedList {
15 |
16 | /**
17 | * 删除链表中最后面的size个元素
18 | * @param size
19 | */
20 | public void removeLast(int size){
21 | for(int i=0;i夏天
15 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
16 | */
17 | public class LiuConceptParser extends BaseConceptParser{
18 |
19 | private static LiuConceptParser instance = null;
20 |
21 | public static LiuConceptParser getInstance(){
22 | if(instance == null){
23 | try {
24 | instance = new LiuConceptParser();
25 | } catch (IOException e) {
26 | e.printStackTrace();
27 | }
28 | }
29 |
30 | return instance;
31 | }
32 |
33 | private LiuConceptParser(BaseSememeParser sememeParser) throws IOException {
34 | super(sememeParser);
35 | }
36 |
37 | private LiuConceptParser() throws IOException{
38 | super(new LiuqunSememeParser());
39 | }
40 |
41 | @Override
42 | protected double calculate(double sim_v1, double sim_v2, double sim_v3, double sim_v4){
43 | return beta1 * sim_v1
44 | + beta2 * sim_v1 * sim_v2
45 | + beta3 * sim_v1 * sim_v2 * sim_v3
46 | + beta4 * sim_v1 * sim_v2 * sim_v3 * sim_v4;
47 | }
48 |
49 | @Override
50 | public double getSimilarity(String word1, String word2) {
51 | double similarity = 0.0;
52 |
53 | // 如果两个句子相同,则直接返回1.0
54 | if (word1.equals(word2)) {
55 | return 1.0;
56 | }
57 |
58 | Collection concepts1 = getConcepts(word1);
59 | Collection concepts2 = getConcepts(word2);
60 |
61 | //如果是blank,则说明是未登录词, 需要计算组合概念
62 | if(BlankUtils.isBlank(concepts1) || BlankUtils.isBlank(concepts2)){
63 | return 0.0;
64 | }
65 |
66 | //两个for循环分别计算词语所有可能的概念的相似度
67 | for(Concept c1:concepts1){
68 | for(Concept c2:concepts2){
69 | double v = getSimilarity(c1, c2);
70 |
71 | if(v>similarity){
72 | similarity = v;
73 | }
74 |
75 | if(similarity == 1.0){
76 | break;
77 | }
78 | }
79 | }
80 |
81 | return similarity;
82 | }
83 |
84 | }
85 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet2/sememe/BaseSememeParser.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.word.hownet2.sememe;
2 |
3 | import java.io.IOException;
4 | import java.io.InputStream;
5 | import java.util.zip.GZIPInputStream;
6 |
7 | import javax.xml.namespace.QName;
8 | import javax.xml.stream.XMLEventReader;
9 | import javax.xml.stream.XMLInputFactory;
10 | import javax.xml.stream.events.StartElement;
11 | import javax.xml.stream.events.XMLEvent;
12 |
13 | import org.slf4j.Logger;
14 | import org.slf4j.LoggerFactory;
15 | import ruc.irm.similarity.Similaritable;
16 | import ruc.irm.similarity.word.hownet.HownetMeta;
17 |
18 | import com.google.common.collect.HashMultimap;
19 | import com.google.common.collect.Multimap;
20 |
21 | /**
22 | * 义原解析器基类,所有义原存储在xml文件中(当前package中的sememe.xml.tar.gz文件)。
23 | * 算法的核心思想请参看论文《汉语词语语义相似度计算研究》或《中文信息相似度计算理论与方法》一书第三章
24 | *
25 | * 为提高运算速度,义原的加载方式做了调整,只把义原的汉语定义和对应的Id加入到MultiMap对象中,并通过义原的层次化Id计算义原之间的相似度。
26 | *
27 | * @author 夏天
28 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
29 | *
30 | * @see {@link ruc.irm.similarity.Similaritable}
31 | */
32 | public abstract class BaseSememeParser implements HownetMeta, Similaritable {
33 | protected Logger LOG = LoggerFactory.getLogger(this.getClass());
34 |
35 | /** 所有的义原都存放到一个MultiMap, Key为Sememe的中文定义, Value为义原的Id */
36 | protected static Multimap SEMEMES = null;
37 |
38 | public BaseSememeParser() throws IOException {
39 | if (SEMEMES != null) {
40 | return;
41 | }
42 |
43 | SEMEMES = HashMultimap.create();
44 |
45 | InputStream input = this.getClass().getResourceAsStream("/data/sememe.xml.gz");
46 | input = new GZIPInputStream(input);
47 | load(input);
48 | }
49 |
50 | /**
51 | * 从文件中加载义元知识
52 | *
53 | * @throws IOException
54 | */
55 | public void load(InputStream input) throws IOException {
56 | System.out.print("loading sememes...");
57 | long time = System.currentTimeMillis();
58 | try {
59 | XMLInputFactory inputFactory = XMLInputFactory.newInstance();
60 | XMLEventReader xmlEventReader = inputFactory.createXMLEventReader(input);
61 |
62 | int count = 0;
63 | while (xmlEventReader.hasNext()) {
64 | XMLEvent event = xmlEventReader.nextEvent();
65 |
66 | if (event.isStartElement()) {
67 | StartElement startElement = event.asStartElement();
68 | if (startElement.getName().toString().equals("sememe")) {
69 | String cnWord = startElement.getAttributeByName(QName.valueOf("cn")).getValue();
70 | String id = startElement.getAttributeByName(QName.valueOf("id")).getValue();
71 | SEMEMES.put(cnWord, id);
72 | count++;
73 | if (count % 100 == 0) {
74 | System.out.print(".");
75 | }
76 | }
77 | }
78 | }
79 | input.close();
80 | } catch (Exception e) {
81 | throw new IOException(e);
82 | }
83 | time = System.currentTimeMillis() - time;
84 | System.out.println("\ncomplete!. time elapsed: " + (time / 1000) + "s");
85 | }
86 |
87 | /**
88 | * 计算两个义原之间的关联度
89 | *
90 | * @param sememeName1
91 | * @param sememeName2
92 | * @return
93 | */
94 | public double getAssociation(String sememeName1, String sememeName2) {
95 | return 0.0;
96 | }
97 | }
98 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet2/sememe/LiuqunSememeParser.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.word.hownet2.sememe;
2 |
3 | import java.io.IOException;
4 | import java.util.Collection;
5 |
6 | /**
7 | * 刘群老师计算义原相似度的方法, 实现了SememeParser中定义的抽象方法
8 | *
9 | * @author 夏天
10 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
11 | *
12 | * @author xiatian
13 | * @version 1.0
14 | */
15 | public class LiuqunSememeParser extends BaseSememeParser {
16 |
17 | /** 计算义元相似度的可调节的参数,默认为1.6 */
18 | private final float alpha = 1.6f;
19 |
20 | public LiuqunSememeParser() throws IOException {
21 | super();
22 | }
23 |
24 | /**
25 | * 计算两个义元之间的相似度,由于义元可能相同,计算结果为其中相似度最大者
26 | *
similarity = alpha/(distance+alpha)
27 | *
28 | * @param key1
29 | * @param key2
30 | * @return
31 | */
32 | @Override
33 | public double getSimilarity(String item1, String item2) {
34 | int pos;
35 |
36 | // 如果为空串,直接返回0
37 | if (item1 == null || item2 == null || item1.equals("")
38 | || item2.equals(""))
39 | return 0.0;
40 |
41 | String key1 = item1.trim();
42 | String key2 = item2.trim();
43 |
44 | // 去掉()符号
45 | if ((key1.charAt(0) == '(') && (key1.charAt(key1.length() - 1) == ')')) {
46 | if (key2.charAt(0) == '(' && key2.charAt(key2.length() - 1) == ')') {
47 | key1 = key1.substring(1, key1.length() - 1);
48 | key2 = key2.substring(1, key2.length() - 1);
49 | } else {
50 | return 0.0;
51 | }
52 | }
53 |
54 | // 处理关系义元,即x=y的情况
55 | if ((pos = key1.indexOf('=')) > 0) {
56 | int pos2 = key2.indexOf('=');
57 | // 如果是关系义元,则判断前面部分是否相同,如果相同,则转为计算后面部分的相似度,否则为0
58 | if ((pos == pos2)
59 | && key1.substring(0, pos).equals(key2.substring(0, pos2))) {
60 | key1 = key1.substring(pos + 1);
61 | key2 = key2.substring(pos2 + 1);
62 | } else {
63 | return 0.0;
64 | }
65 | }
66 |
67 | // 处理符号义元,即前面有特殊符号的义元
68 | String symbol1 = key1.substring(0, 1);
69 | String symbol2 = key2.substring(0, 1);
70 |
71 | for (int i = 0; i < Symbol_Descriptions.length; i++) {
72 | if (symbol1.equals(Symbol_Descriptions[i][0])) {
73 | if (symbol1.equals(symbol2)) {
74 | key1 = item1.substring(1);
75 | key2 = item2.substring(1);
76 | break;
77 | } else {
78 | return 0.0; // 如果不是同一关系符号,则相似度直接返回0
79 | }
80 | }
81 | }
82 |
83 | if ((pos = key1.indexOf("|")) >= 0) {
84 | key1 = key1.substring(pos + 1);
85 | }
86 | if ((pos = key2.indexOf("|")) >= 0) {
87 | key2 = key2.substring(pos + 1);
88 | }
89 |
90 | int distance = getMinDistance(key1, key2);
91 | return alpha / (distance + alpha);
92 | }
93 |
94 | /**
95 | * 根据汉语定义计算义原之间的距离,Integer.MAX_VALUE代表两个义元之间的距离为无穷大,由于可能多个义元有相同的汉语词语,
96 | * 故计算结果为其中距离最小者
97 | *
98 | * @param key1
99 | * @param key2
100 | * @return
101 | */
102 | public int getMinDistance(String sememe1, String sememe2) {
103 | int distance = Integer.MAX_VALUE;
104 |
105 | // 如果两个字符串相等,直接返回距离为0
106 | if (sememe1.equals(sememe2)) {
107 | return 0;
108 | }
109 |
110 | Collection sememeIds1 = SEMEMES.get(sememe1);
111 | Collection sememeIds2 = SEMEMES.get(sememe2);
112 |
113 | // 如果sememe1或者sememe2不是义元,则返回无穷大
114 | if (sememeIds1.size() == 0 || sememeIds1.size() == 0) {
115 | return Integer.MAX_VALUE;
116 | }
117 |
118 | for(String id1:sememeIds1){
119 | for(String id2:sememeIds2){
120 | int d = getDistance(id1, id2);
121 | if(d
6 | * 义原编号采用父节点Id-子节点Id编码方式,如:
7 | * <sememe cn="成功" define="{experiencer,scope}" en="succeed" id="1-1-2-1-4-5"/>
8 | * 义原的id表明了义原之间的上下位关系和义原的深度。
9 | *
10 | * @author 夏天
11 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
12 | */
13 | public class Sememe {
14 | /**
15 | * 义原编号,采用父节点Id-子节点Id编码方式,如<sememe cn="成功" define="{experiencer,scope}" en="succeed" id="1-1-2-1-4-5"/>
16 | * id表明了义原之间的上下位关系
17 | */
18 | private String id;
19 | /** 义原的中文名称*/
20 | private String cnWord;
21 | /** 义原的英文名称 */
22 | private String enWord;
23 | /** 义原的定义,如果没有(例如数量),则为空串 */
24 | private String define;
25 |
26 | /**
27 | * 每一行的形式为:be|是 {relevant,isa}/{relevant,descriptive}
28 | *
或者 official|官 [#organization|组织,#employee|员]
29 | *
或者 amount|多少
30 | *
把相应的部分赋予不同的属性
31 | * 出于性能考虑,把未用到的英文名称、定义等忽略
32 | * @param id
33 | */
34 | public Sememe(String id, String en, String cn, String define) {
35 | this.id = id;
36 | this.cnWord = cn;
37 | //为提高效率,减少内存空间利用,可去掉以下两行
38 | this.enWord = en;
39 | this.define = define;
40 | }
41 |
42 | public String getId() {
43 | return id;
44 | }
45 |
46 | public void setId(String id) {
47 | this.id = id;
48 | }
49 |
50 | public String getCnWord() {
51 | return cnWord;
52 | }
53 |
54 | public void setCnWord(String cnWord) {
55 | this.cnWord = cnWord;
56 | }
57 |
58 | public String getEnWord() {
59 | return enWord;
60 | }
61 |
62 | public void setEnWord(String enWord) {
63 | this.enWord = enWord;
64 | }
65 |
66 | public String getDefine() {
67 | return define;
68 | }
69 |
70 | public void setDefine(String define) {
71 | this.define = define;
72 | }
73 |
74 | public int getType() {
75 | char ch = id.charAt(0);
76 | switch (ch) {
77 | case '1':
78 | return SememeType.Event;
79 | case '2':
80 | return SememeType.Entity;
81 | case '3':
82 | return SememeType.Attribute;
83 | case '4':
84 | return SememeType.Quantity;
85 | case '5':
86 | return SememeType.AValue;
87 | case '6':
88 | return SememeType.QValue;
89 | case '7':
90 | return SememeType.SecondaryFeature;
91 | case '8':
92 | return SememeType.Syntax;
93 | case '9':
94 | return SememeType.EventRoleAndFeature;
95 | default:
96 | return 0;
97 | }
98 | }
99 |
100 | @Override
101 | public String toString(){
102 | StringBuilder sb = new StringBuilder();
103 | sb.append("id=");
104 | sb.append(id);
105 | sb.append("; cnWord=");
106 | sb.append(cnWord);
107 | sb.append("; enWord=");
108 | sb.append(enWord);
109 | sb.append("; define=");
110 | sb.append(define);
111 | return sb.toString();
112 | }
113 |
114 | }
115 |
116 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet2/sememe/SememeType.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.word.hownet2.sememe;
2 |
/**
 * Integer codes for the sememe categories.
 *
 * <ul>
 *   <li>1: Event</li>
 *   <li>2: Entity</li>
 *   <li>3: Attribute</li>
 *   <li>4: Quantity</li>
 *   <li>5: aValue (attribute value)</li>
 *   <li>6: qValue (quantity value)</li>
 *   <li>7: Secondary Feature</li>
 *   <li>8: Syntax</li>
 *   <li>9: Event role and event features (a single shared code)</li>
 *   <li>0: unknown</li>
 * </ul>
 *
 * @author 夏天
 * @organization 中国人民大学信息资源管理学院 知识工程实验室
 */
public interface SememeType {
    // Interface fields are implicitly public, static and final.

    /** Event type. */
    int Event = 1;

    /** Entity type. */
    int Entity = 2;

    /** Attribute type. */
    int Attribute = 3;

    /** Quantity type. */
    int Quantity = 4;

    /** Attribute-value type. */
    int AValue = 5;

    /** Quantity-value type. */
    int QValue = 6;

    /** Secondary-feature type. */
    int SecondaryFeature = 7;

    /** Syntax type. */
    int Syntax = 8;

    /** Event-role (and event-feature) type. */
    int EventRoleAndFeature = 9;

    /** Unknown type. */
    int Unknown = 0;

}
56 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/hownet2/sememe/XiaSememeParser.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.word.hownet2.sememe;
2 |
3 | import java.io.IOException;
4 | import java.util.Collection;
5 |
6 | import ruc.irm.similarity.util.BlankUtils;
7 |
8 |
9 | /**
10 | * 义原相似度计算, 实现了SememeParser中定义的抽象方法
11 | *
12 | * @author 夏天
13 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
14 | */
15 | public class XiaSememeParser extends BaseSememeParser {
16 |
17 | public XiaSememeParser() throws IOException{
18 | super();
19 | }
20 |
21 | /**
22 | * 计算两个义原的相似度
23 | */
24 | double getSimilarityBySememeId(final String id1, final String id2) {
25 |
26 | int position = 0;
27 | String[] array1 = id1.split("-");
28 | String[] array2 = id2.split("-");
29 | for (position = 0; position < array1.length && position < array2.length; position++) {
30 | if (!array1[position].equals(array2[position])) {
31 | break;
32 | }
33 | }
34 |
35 | return 2.0*position/(array1.length + array2.length);
36 | }
37 |
38 | /**
39 | * 根据汉语定义计算义原之间的相似度,由于可能多个义元有相同的汉语词语,故计算结果为其中相似度最大者
40 | *
41 | * @param key1
42 | * @param key2
43 | * @return
44 | */
45 | public double getMaxSimilarity(String sememeName1, String sememeName2) {
46 | double maxValue = 0.0;
47 |
48 | // 如果两个字符串相等,直接返回距离为0
49 | if (sememeName1.equals(sememeName2)) {
50 | return 1.0;
51 | }
52 |
53 | Collection sememeIds1 = SEMEMES.get(sememeName1);
54 | Collection sememeIds2 = SEMEMES.get(sememeName2);
55 |
56 | // 如果sememe1或者sememe2不是义元,则返回0
57 | if (sememeIds1.size() == 0 || sememeIds1.size() == 0) {
58 | return 0.0;
59 | }
60 |
61 | for(String id1:sememeIds1){
62 | for(String id2:sememeIds2){
63 | double value = getSimilarityBySememeId(id1, id2);
64 | if(value > maxValue){
65 | maxValue = value;
66 | }
67 | }
68 | }
69 |
70 | return maxValue;
71 | }
72 |
73 | /**
74 | * 计算两个义元之间的相似度,由于义元可能相同,计算结果为其中相似度最大者 similarity = alpha/(distance+alpha),
75 | * 如果两个字符串相同或都为空,直接返回1.0
76 | *
77 | * @param key1 第一个义原字符串
78 | * @param key2 第二个义原字符串
79 | * @return
80 | */
81 | @Override
82 | public double getSimilarity(String item1, String item2) {
83 | if(BlankUtils.isBlankAll(item2, item2)){
84 | return 1.0;
85 | } else if(BlankUtils.isBlankAtLeastOne(item1, item2)){
86 | return 0.0;
87 | } else if(item1.equals(item2)){
88 | return 1.0;
89 | }
90 |
91 | String key1 = item1.trim();
92 | String key2 = item2.trim();
93 |
94 | // 去掉()符号
95 | if ((key1.charAt(0) == '(') && (key1.charAt(key1.length() - 1) == ')')) {
96 |
97 | if (key2.charAt(0) == '(' && key2.charAt(key2.length() - 1) == ')') {
98 | key1 = key1.substring(1, key1.length() - 1);
99 | key2 = key2.substring(1, key2.length() - 1);
100 | } else {
101 | return 0.0;
102 | }
103 |
104 | }
105 |
106 | // 处理关系义元,即x=y的情况
107 | int pos = key1.indexOf('=');
108 | if (pos > 0) {
109 | int pos2 = key2.indexOf('=');
110 | // 如果是关系义元,则判断前面部分是否相同,如果相同,则转为计算后面部分的相似度,否则为0
111 | if ((pos == pos2)
112 | && key1.substring(0, pos).equals(key2.substring(0, pos2))) {
113 | key1 = key1.substring(pos + 1);
114 | key2 = key2.substring(pos2 + 1);
115 | } else {
116 | return 0.0;
117 | }
118 | }
119 |
120 | // 处理符号义元,即前面有特殊符号的义元
121 | String symbol1 = key1.substring(0, 1);
122 | String symbol2 = key2.substring(0, 1);
123 |
124 | for (int i = 0; i < Symbol_Descriptions.length; i++) {
125 | if (symbol1.equals(Symbol_Descriptions[i][0])) {
126 | if (symbol1.equals(symbol2)) {
127 | key1 = item1.substring(1);
128 | key2 = item2.substring(1);
129 | break;
130 | } else {
131 | return 0.0; // 如果不是同一关系符号,则相似度直接返回0
132 | }
133 | }
134 | }
135 |
136 | if ((pos = key1.indexOf("|")) >= 0) {
137 | key1 = key1.substring(pos + 1);
138 | }
139 | if ((pos = key2.indexOf("|")) >= 0) {
140 | key2 = key2.substring(pos + 1);
141 | }
142 |
143 | // 如果两个字符串相等,直接返回距离为0
144 | if (key1.equals(key2)) {
145 | return 1.0;
146 | }
147 |
148 | return getMaxSimilarity(key1, key2);
149 | }
150 |
151 |
152 | }
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/similarity/word/pinyin/PinyinSimilarity.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.word.pinyin;
2 |
3 | import java.util.Set;
4 |
5 | import ruc.irm.similarity.Similaritable;
6 | import ruc.irm.similarity.util.EditDistance;
7 | import ruc.irm.similarity.util.PinyinUtils;
8 |
9 |
10 | /**
11 | * 通过拼音计算两个词语是否相似,拼音的相似程度采用编辑距离算法,并进行归一化衡量
12 | *
13 | * @author 夏天
14 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
15 | */
16 | public class PinyinSimilarity implements Similaritable {
17 |
18 | public double getSimilarity(String item1, String item2) {
19 | Set pinyinSet1 = PinyinUtils.getInstance().getPinyin(item1);
20 | Set pinyinSet2 = PinyinUtils.getInstance().getPinyin(item2);
21 |
22 | double max = 0.0;
23 | for(String pinyin1:pinyinSet1){
24 | for(String pinyin2:pinyinSet2){
25 | double distance = new EditDistance().getEditDistance(pinyin1, pinyin2);
26 | double similarity = 1 - distance/( (pinyin1.length()>pinyin2.length())?pinyin1.length():pinyin2.length());
27 | max = (max>similarity)?max:similarity;
28 | if(max==1.0){
29 | return max;
30 | }
31 | }
32 | }
33 | return max;
34 | }
35 |
36 | }
37 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/tendency/word/HownetWordTendency.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.tendency.word;
2 |
3 | import java.io.IOException;
4 | import java.util.Collection;
5 | import java.util.HashSet;
6 | import java.util.Set;
7 |
8 | import ruc.irm.similarity.word.hownet2.concept.BaseConceptParser;
9 | import ruc.irm.similarity.word.hownet2.concept.Concept;
10 | import ruc.irm.similarity.word.hownet2.concept.XiaConceptParser;
11 | import ruc.irm.similarity.word.hownet2.sememe.BaseSememeParser;
12 | import ruc.irm.similarity.word.hownet2.sememe.XiaSememeParser;
13 |
14 | /**
15 | * 基于知网实现的词语倾向性判别
16 | *
17 | * @author 夏天
18 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
19 | */
20 | public class HownetWordTendency implements WordTendency {
21 | public static String[] POSITIVE_SEMEMES = new String[]{
22 | "良",
23 | "喜悦",
24 | "夸奖",
25 | "满意",
26 | "期望",
27 | "注意",
28 | "致敬",
29 | "喜欢",
30 | "专",
31 | "敬佩",
32 | "同意",
33 | "爱惜",
34 | "愿意",
35 | "思念",
36 | "拥护",
37 | "祝贺",
38 | "福",
39 | "需求",
40 | "奖励",
41 | "致谢",
42 | "欢迎",
43 | "羡慕",
44 | "感激",
45 | "爱恋"
46 | };
47 |
48 | public static String[] NEGATIVE_SEMEMES = new String[]{
49 | "莠",
50 | "谴责",
51 | "害怕",
52 | "生气",
53 | "悲哀",
54 | "着急",
55 | "轻视",
56 | "羞愧",
57 | "烦恼",
58 | "灰心",
59 | "犹豫",
60 | "为难",
61 | "懊悔",
62 | "厌恶",
63 | "怀疑",
64 | "怜悯",
65 | "忧愁",
66 | "示怒",
67 | "不满",
68 | "仇恨",
69 | "埋怨",
70 | "失望",
71 | "坏"
72 | };
73 | private BaseConceptParser conceptParser = null;
74 | private BaseSememeParser sememeParser = null;
75 |
76 | public HownetWordTendency(){
77 | this.conceptParser =XiaConceptParser.getInstance();
78 | try {
79 | this.sememeParser = new XiaSememeParser();
80 | } catch (IOException e) {
81 | e.printStackTrace();
82 | }
83 | }
84 |
85 | @Override
86 | public double getTendency(String word) {
87 | double positive = getSentiment(word, POSITIVE_SEMEMES);
88 | double negative = getSentiment(word, NEGATIVE_SEMEMES);;
89 | return positive - negative;
90 | }
91 |
92 | public double getSentiment(String word, String[] candidateSememes) {
93 | Collection concepts = conceptParser.getConcepts(word);
94 | Set sememes = new HashSet();
95 | for (Concept c : concepts) {
96 | sememes.addAll(c.getAllSememeNames());
97 | }
98 |
99 | double max = 0.0;
100 | for(String item:sememes){
101 | double total = 0.0;
102 | for(String positiveSememe:candidateSememes){
103 | //如果有特别接近的义原,直接返回该相似值,避免其他干扰
104 | double value = sememeParser.getSimilarity(item, positiveSememe);
105 | if(value>0.9){
106 | return value;
107 | }
108 | total += value;
109 | }
110 | double sim = total / candidateSememes.length;
111 | if(sim>max){
112 | max = sim;
113 | }
114 | }
115 | return max;
116 | }
117 |
118 | }
119 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/tendency/word/Training.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.tendency.word;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.File;
5 | import java.io.FileInputStream;
6 | import java.io.IOException;
7 | import java.io.InputStreamReader;
8 | import java.util.ArrayList;
9 | import java.util.Collection;
10 | import java.util.Collections;
11 | import java.util.HashMap;
12 | import java.util.List;
13 | import java.util.Map;
14 |
15 | import ruc.irm.similarity.util.BlankUtils;
16 | import ruc.irm.similarity.word.hownet2.concept.Concept;
17 | import ruc.irm.similarity.word.hownet2.concept.XiaConceptParser;
18 | import ruc.irm.similarity.word.hownet2.sememe.XiaSememeParser;
19 |
20 | import com.google.common.collect.HashMultimap;
21 | import com.google.common.collect.Multimap;
22 |
23 | /**
24 | * 临时训练及测试类
25 | *
26 | * @author 夏天
27 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
28 | */
29 | public class Training {
30 |
31 | void test(boolean testPositive) throws IOException{
32 | WordTendency tendency = new HownetWordTendency();
33 | File f = new File("./dict/sentiment/负面情感词语(中文).txt");
34 | if(testPositive){
35 | //f = new File("./dict/sentiment/正面情感词语(中文).txt");
36 | f = new File("./dict/sentiment/正面评价词语(中文).txt");
37 | }
38 | String encoding = "utf-8";
39 | BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(f), encoding));
40 | String line;
41 | int wordCount = 0;
42 | int correctCount = 0;
43 | while ((line = in.readLine()) != null) {
44 | if(line.length()>5) continue;
45 | wordCount++;
46 |
47 | double value =tendency.getTendency(line.trim());
48 | if(value>0 && testPositive){
49 | correctCount++;
50 | }else if(value<0 && !testPositive){
51 | correctCount++;
52 | }else{
53 | System.out.println("error:" + line + "\t value:" + value);
54 | }
55 | }
56 | System.out.println("correct:" + correctCount);
57 | System.out.println("total:" + wordCount);
58 | System.out.println("ratio:" + correctCount*1.0/wordCount);
59 | }
60 |
61 | /**
62 | * 该方法用于统计知网提供的情感词集合所涉及的义原以及出现频度
63 | * @throws IOException
64 | */
65 | /**
66 | * @throws IOException
67 | */
68 | void countSentimentDistribution() throws IOException{
69 | Map sememeMap = new HashMap();
70 | File f = new File("./dict/sentiment/负面情感词语(中文).txt");
71 | String encoding = "utf-8";
72 | boolean autoCombineConcept = false;
73 | BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(f), encoding));
74 |
75 | XiaConceptParser parser = new XiaConceptParser(new XiaSememeParser());
76 |
77 | String line = null;
78 |
79 | int conceptCount = 0;
80 | int wordCount = 0;
81 | while ((line = in.readLine()) != null) {
82 | if(line.length()>5) continue;
83 | wordCount++;
84 | String word = line.trim();
85 | Collection concepts = parser.getInnerConcepts(word);
86 | //由于目前的词典为知网2000版本,所以默认情况下仅对词典中出现的概念进行统计
87 | if(BlankUtils.isBlank(concepts) && autoCombineConcept ){
88 | concepts = parser.autoCombineConcepts(word, null);
89 | }
90 | for(Concept c: concepts){
91 | conceptCount++;
92 | List names = new ArrayList();
93 |
94 | //加入主义原
95 | names.add(c.getMainSememe());
96 |
97 | //加入关系义原
98 | for(String item:c.getRelationSememes()){
99 | names.add(item.substring(item.indexOf("=") + 1));
100 | }
101 |
102 | //加入符号义原
103 | for(String item:c.getSymbolSememes()){
104 | names.add(item.substring(1));
105 | }
106 |
107 | //加入其他义原集合
108 | for(String item:c.getSecondSememes()){
109 | names.add(item);
110 | }
111 |
112 | for(String item:names){
113 | Integer count = sememeMap.get(item);
114 | if(count==null){
115 | sememeMap.put(item, 1);
116 | }else{
117 | sememeMap.put(item, count+1);
118 | }
119 | }
120 | }
121 | }
122 | in.close();
123 |
124 | //以下是为了按照义原出现的数量进行排序的代码
125 | Multimap map2 = HashMultimap.create();
126 | for(String key:sememeMap.keySet()){
127 | map2.put(sememeMap.get(key), key);
128 | }
129 | List keys = new ArrayList();
130 | for(Integer key: map2.keySet()){
131 | keys.add(key);
132 | }
133 | Collections.sort(keys);
134 |
135 | int smallSememeCount = 0; //较少出现的不同义原数量
136 | int smallAppearTotal = 0; //较少出现的义原在概念众出现的次数总和
137 | for(int index=(keys.size()-1); index>=0; index--){
138 | Integer key = keys.get(index);
139 | Collection values = map2.get(key);
140 | double ratio = (key*100.0/conceptCount);
141 | System.out.print(key + "(" + ratio + "%): ");
142 | for(String v:values){
143 | System.out.print(v+ "\t");
144 | }
145 | System.out.println();
146 | if(ratio<0.7){
147 | smallSememeCount += values.size();
148 | smallAppearTotal += key*values.size();
149 | }
150 | }
151 |
152 | System.out.println("small info: ");
153 | System.out.println("\tdifferent sememes:" + smallSememeCount);
154 | System.out.println("\tappear count:" + smallAppearTotal);
155 | System.out.println("\tratio:" + smallAppearTotal*100.0/conceptCount);
156 | System.out.println("wordCount:" + wordCount);
157 | System.out.println("conceptCount:" + conceptCount);
158 | }
159 |
160 | public static void main(String[] args) throws IOException {
161 | Training training = new Training();
162 | training.countSentimentDistribution();
163 | // System.out.println("test positive:");
164 | // training.test(true);
165 | //
166 | // System.out.println("test negative:");
167 | //training.test(false);
168 | }
169 | }
170 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/tendency/word/WordTendency.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.tendency.word;
2 |
/**
 * Computes the semantic polarity of a word as a real number in [-1, 1]: the larger
 * the value the more commendatory the word, the smaller the more derogatory.
 *
 * @author 夏天
 * @organization 中国人民大学信息资源管理学院 知识工程实验室
 */
public interface WordTendency {

    /**
     * Returns the semantic polarity of a word.
     *
     * @param word the word to rate
     * @return a value intended to lie in [-1, 1]; larger means more positive
     */
    double getTendency(String word);
}
17 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/ui/PhraseSimilarityUI.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.ui;
2 |
3 | import java.awt.BorderLayout;
4 | import java.awt.GridLayout;
5 | import java.awt.event.ActionEvent;
6 | import java.awt.event.ActionListener;
7 |
8 | import javax.swing.BorderFactory;
9 | import javax.swing.JButton;
10 | import javax.swing.JLabel;
11 | import javax.swing.JPanel;
12 | import javax.swing.JScrollPane;
13 | import javax.swing.JTextArea;
14 | import javax.swing.JTextField;
15 |
16 | import ruc.irm.similarity.phrase.PhraseSimilarity;
17 |
18 | /**
19 | * 短语相似度的调用演示界面
20 | * @author 夏天
21 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
22 | */
23 | public class PhraseSimilarityUI {
24 |
25 | /**
26 | * 短语相似度的演示面板
27 | *
28 | * @return
29 | */
30 | public static JPanel createPanel() {
31 | // 声明总的大面板, fullPanel包括一个NorthPanel和一个centerPanel
32 | JPanel fullPanel = new JPanel();
33 | fullPanel.setLayout(new BorderLayout());
34 |
35 | JPanel northPanel = new JPanel();
36 | fullPanel.add(northPanel, "North");
37 |
38 | // centerPanel包括了一个文本框
39 | JPanel centerPanel = new JPanel();
40 | fullPanel.add(centerPanel, "Center");
41 |
42 | centerPanel.setLayout(new BorderLayout());
43 | final JTextArea result = new JTextArea();
44 | // result.setFont(new Font("宋体", Font.PLAIN, 16));
45 | result.setLineWrap(true);
46 | JScrollPane centerScrollPane = new JScrollPane(result);
47 | centerPanel.add(centerScrollPane, "Center");
48 |
49 | northPanel.setLayout(new GridLayout(1, 1));
50 | // northPanel.add(createWordPanel());
51 | // northPanel.add(createCilinPanel());
52 |
53 | // 以下加入northPanel中的第一个面板
54 | final JTextField field1 = new JTextField("");
55 | final JTextField field2 = new JTextField("");
56 | field1.setColumns(50);
57 | field2.setColumns(50);
58 |
59 | JPanel mainPanel = new JPanel();
60 | mainPanel.setLayout(new GridLayout(3, 1));
61 |
62 | JPanel linePanel = new JPanel();
63 | linePanel.add(new JLabel("短语1:"));
64 | linePanel.add(field1);
65 | mainPanel.add(linePanel);
66 |
67 | linePanel = new JPanel();
68 | linePanel.add(new JLabel("短语2:"));
69 | linePanel.add(field2);
70 | mainPanel.add(linePanel);
71 |
72 | linePanel = new JPanel();
73 | JButton goButton = new JButton("计算相似度");
74 | linePanel.add(goButton);
75 | mainPanel.add(linePanel);
76 | goButton.addActionListener(new ActionListener() {
77 |
78 | @Override
79 | public void actionPerformed(ActionEvent e) {
80 | String phrase1 = field1.getText();
81 | String phrase2 = field2.getText();
82 | String text = "[" + phrase1 + "]与[" + phrase2 + "]的相似度为:";
83 | text = text + new PhraseSimilarity().getSimilarity(phrase1, phrase2);
84 | // text = text + "\n\n" + result.getText();
85 | result.setText(text);
86 | }
87 |
88 | });
89 | mainPanel.setBorder(BorderFactory.createEtchedBorder());
90 | northPanel.add(mainPanel);
91 |
92 | return fullPanel;
93 | }
94 | }
95 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/ui/Start.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.ui;
2 |
3 | import java.awt.Container;
4 | import java.awt.Font;
5 | import java.util.Enumeration;
6 |
7 | import javax.swing.JFrame;
8 | import javax.swing.JMenu;
9 | import javax.swing.JMenuBar;
10 | import javax.swing.JMenuItem;
11 | import javax.swing.JScrollPane;
12 | import javax.swing.JTabbedPane;
13 | import javax.swing.SwingUtilities;
14 | import javax.swing.UIManager;
15 | import javax.swing.plaf.FontUIResource;
16 |
17 | import ruc.irm.similarity.sentence.SegmentProxy;
18 | import ruc.irm.similarity.util.About;
19 |
20 | /**
21 | * 相似度计算软件包演示启动类
22 | *
23 | * @author 夏天
24 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
25 | */
26 | public class Start extends JFrame {
27 |
28 | private static final long serialVersionUID = 85744461208L;
29 |
30 | public Start() {
31 | this.setTitle("相似度计算演示程序");
32 | this.setSize(420, 700);
33 | this.setLocationRelativeTo(null);
34 | this.setDefaultCloseOperation(EXIT_ON_CLOSE);
35 |
36 | // //////////////////////////////////
37 | // add menu
38 | JMenuBar menuBar = new JMenuBar();
39 | this.setJMenuBar(menuBar);
40 |
41 | JMenu fileMenu = new JMenu("File");
42 | menuBar.add(fileMenu);
43 | fileMenu.add(new JMenuItem("Exit"));
44 |
45 | JMenu helpMenu = new JMenu("Help");
46 | menuBar.add(helpMenu);
47 | helpMenu.add(new JMenuItem("Help"));
48 |
49 | Container contentPane = this.getContentPane();
50 | JTabbedPane tabbedPane = new JTabbedPane();
51 | tabbedPane.add("词语", WordSimlarityUI.createPanel());
52 | tabbedPane.add("短语", PhraseSimilarityUI.createPanel());
53 | tabbedPane.add("句子", SentenceSimilarityUI.createPanel());
54 | // tabbedPane.add("文本", WordSimlarityUI.createPanel());
55 | tabbedPane.add("词法分析", SegmentProxy.createPanel());
56 | tabbedPane.add("义原树", SememeTreeUI.createPanel());
57 | tabbedPane.add("情感分析", TendencyUI.createPanel());
58 | tabbedPane.add("关于", About.createPanel());
59 | JScrollPane scrollPane = new JScrollPane(tabbedPane);
60 | contentPane.add(scrollPane);
61 |
62 | this.pack();
63 | setExtendedState(MAXIMIZED_BOTH);
64 | }
65 |
66 | public static void InitGlobalFont(Font font) {
67 | FontUIResource fontRes = new FontUIResource(font);
68 | for (Enumeration
8 |
9 | 有任何问题或建议请与我们联系,您的反馈将有助于该项目的进一步完善。
10 |
11 |
12 |
致谢
13 | 本项目在研究过程中,得到了恩师樊孝忠教授的悉心指导,师恩如海,难以言谢!
14 | 中国人民大学为本项目的持续研究提供了资金和计算机软硬件的支持,北京理工大学为本项目的早期研究提供了重要的基础设施,
15 | 这些支持与国家的投入密不可分,
16 | 本项目的开源和不断完善也算是对国家的点滴回报!
17 | 代码中许多算法的核心思想来源于我们的研究同行和先辈们的已公开成果,另外,许多使用xsimilarity的人员对xsimilarity
18 | 提出了宝贵的建议,在此一并表示深深的谢意!
19 | 本工程使用了如下开源组件,对原作者致以谢意!
20 |
23 |
24 |
25 |
联系方式
26 | 夏天
27 | 数据工程与知识工程教育部重点实验室(中国人民大学)
28 | 中国人民大学信息资源管理学院
29 | 电话: 86-10-82500675
30 | Email: xiat(at)ruc.edu.cn
31 |
32 |
33 |
34 |
--------------------------------------------------------------------------------
/src/main/resources/data/cilin.db.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamxiatian/xsimilarity/812ec85d3341731e615c3cd91af17c1df4840853/src/main/resources/data/cilin.db.gz
--------------------------------------------------------------------------------
/src/main/resources/data/concept.xml.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamxiatian/xsimilarity/812ec85d3341731e615c3cd91af17c1df4840853/src/main/resources/data/concept.xml.gz
--------------------------------------------------------------------------------
/src/main/resources/data/sememe.xml.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamxiatian/xsimilarity/812ec85d3341731e615c3cd91af17c1df4840853/src/main/resources/data/sememe.xml.gz
--------------------------------------------------------------------------------
/src/main/resources/log4j.dtd:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
10 |
11 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
33 |
34 |
35 |
36 |
37 |
38 |
42 |
43 |
44 |
45 |
46 |
47 |
51 |
52 |
53 |
56 |
57 |
58 |
61 |
62 |
63 |
64 |
65 |
66 |
69 |
70 |
71 |
72 |
73 |
76 |
77 |
78 |
82 |
83 |
84 |
85 |
86 |
90 |
91 |
92 |
93 |
97 |
98 |
99 |
100 |
101 |
102 |
107 |
108 |
109 |
110 |
111 |
115 |
116 |
117 |
118 |
120 |
121 |
122 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
141 |
142 |
143 |
144 |
146 |
147 |
148 |
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 |
167 |
--------------------------------------------------------------------------------
/src/main/resources/log4j.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
--------------------------------------------------------------------------------
/src/test/java/ruc/irm/similarity/sentence/MorphoSimilarityTest.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.sentence;
2 |
3 | import org.junit.Test;
4 | import ruc.irm.similarity.sentence.morphology.MorphoSimilarity;
5 | import ruc.irm.similarity.sentence.morphology.SemanticSimilarity;
6 |
7 | public class MorphoSimilarityTest {
8 |
9 | @Test
10 | public void test() {
11 | String s1 = "一个伟大的国家,中国";
12 | String s2 = "中国是一个伟大的国家";
13 |
14 | s1="修改下密码";
15 | s2="密码修改";
16 | MorphoSimilarity similarity = MorphoSimilarity.getInstance();
17 | double sim = similarity.getSimilarity(s1, s2);
18 | System.out.println("sim ==> " + sim);
19 | }
20 |
21 | }
22 |
--------------------------------------------------------------------------------
/src/test/java/ruc/irm/similarity/sentence/SemanticSimilarityTest.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.sentence;
2 |
3 | import org.junit.Test;
4 |
5 | import ruc.irm.similarity.sentence.morphology.SemanticSimilarity;
6 |
7 | public class SemanticSimilarityTest {
8 |
9 | @Test
10 | public void test() {
11 | String s1 = "一个伟大的国家,中国";
12 | String s2 = "中国是一个伟大的国家";
13 |
14 | // s1="修改下密码";
15 | // s2="密码修改";
16 | SemanticSimilarity similarity = SemanticSimilarity.getInstance();
17 | double sim = similarity.getSimilarity(s1, s2);
18 | System.out.println("sim ==> " + sim);
19 |
20 |
21 | }
22 |
23 | }
24 |
--------------------------------------------------------------------------------
/src/test/java/ruc/irm/similarity/statistic/DictStatisticTest.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.statistic;
2 |
3 | import junit.framework.TestCase;
4 |
5 | /**
6 | * ./db/coredict.xml.gz是利用的ictclas4j的词典文件,这个文件可以从lib/ictclas4j.jar文件中得到。
7 | * 即:把ictclas4j.jar文件解压开,里面的dictionary目录下有coredict.xml.gz文件。
8 | *
9 | * @author 夏天
10 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
11 | */
12 | public class DictStatisticTest extends TestCase {
13 | public void testCount(){
14 | DictStatistic ds = new DictStatistic();
15 | ds.testFromXml("./db/coredict.xml.gz", true);
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/src/test/java/ruc/irm/similarity/word/CharBasedSimilarityTest.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.word;
2 |
3 | import junit.framework.TestCase;
4 |
5 | public class CharBasedSimilarityTest extends TestCase {
6 | public void test() {
7 | CharBasedSimilarity sim = new CharBasedSimilarity();
8 | String s1 = "手机";
9 | String s2 = "飞机";
10 |
11 | assertTrue(sim.getSimilarity(s1, s2) > 0);
12 | }
13 | }
14 |
--------------------------------------------------------------------------------
/src/test/java/ruc/irm/similarity/word/hownet/ConceptTest.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.word.hownet;
2 |
3 | import java.util.Collection;
4 |
5 | import ruc.irm.similarity.word.hownet2.concept.Concept;
6 |
7 | import com.google.common.collect.ArrayListMultimap;
8 | import com.google.common.collect.HashMultimap;
9 | import com.google.common.collect.Multimap;
10 |
11 | public class ConceptTest {
12 | public static void main(String[] args) {
13 | Multimap CONCEPTS = HashMultimap.create();
14 | // CONCEPTS = ArrayListMultimap.create();
15 |
16 | CONCEPTS.put("打", new Concept("打", "V", "TakeOutOfWater|捞起"));
17 | CONCEPTS.put("打", new Concept("打", "V", "TakeOutOfWater|捞起"));
18 | CONCEPTS.put("打", new Concept("打", "V", "TakeOutOfWater|捞起"));
19 | CONCEPTS.put("打", new Concept("打", "V", "TakeOutOfWater|捞起"));
20 |
21 | Collection collection = CONCEPTS.get("打");
22 | for(Concept c:collection){
23 | System.out.println(c);
24 | }
25 |
26 | Multimap map = HashMultimap.create();
27 | // map = ArrayListMultimap.create();
28 |
29 | map.put("打", 1);
30 | map.put("打", 1);
31 | map.put("打", 1);
32 | map.put("打", 2);
33 |
34 | Collection cc = map.get("打");
35 | for(Integer i:cc){
36 | System.out.println(i);
37 | }
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/src/test/java/ruc/irm/similarity/word/hownet/SememeTest.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.word.hownet;
2 |
3 | import java.io.InputStream;
4 |
5 | import ruc.irm.similarity.util.FileUtils;
6 | import ruc.irm.similarity.word.hownet.sememe.Sememe;
7 | import ruc.irm.similarity.word.hownet.sememe.SememeDictTraverseEvent;
8 | import ruc.irm.similarity.word.hownet2.sememe.XiaSememeParser;
9 |
10 |
11 | /**
12 | * 针对义原的测试
13 | *
14 | * @author 夏天
15 | * @organization 中国人民大学信息资源管理学院 知识工程实验室
16 | */
17 | public class SememeTest {
18 | public static void main(String[] args) throws Exception{
19 | String id1 = "2-1-3-4";
20 | // String id2 = "2-1-2";
21 | // System.out.println(getDistance(id1, id2));
22 | // System.out.println(getSimilarityBySememeId(id1, id2));
23 |
24 | int pos = id1.lastIndexOf("-");
25 | String parentId = "root";
26 | if(pos>0){
27 | parentId = id1.substring(0, pos);
28 | }
29 | System.out.println(parentId);
30 | new XiaSememeParser().getSimilarity("test", "hello");
31 | }
32 |
33 | static void saveXML() throws Exception{
34 | String sememeFile = Sememe.class.getPackage().getName().replaceAll("\\.", "/") + "/sememe.dat";
35 | InputStream input = Sememe.class.getClassLoader().getResourceAsStream(sememeFile);
36 | SememeDictTraverseEvent event = new SememeDictTraverseEvent();
37 |
38 | FileUtils.traverseLines(input, "utf8", event);
39 | event.saveToXML("/home/xiatian/Desktop/sememe.xml");
40 | }
41 |
42 | static double getSimilarityBySememeId(final String id1, final String id2) {
43 |
44 | int position = 0;
45 | String[] array1 = id1.split("-");
46 | String[] array2 = id2.split("-");
47 | for (position = 0; position < array1.length && position < array2.length; position++) {
48 | if (!array1[position].equals(array2[position])) {
49 | break;
50 | }
51 | }
52 |
53 | return 2.0*position/(array1.length + array2.length);
54 | }
55 |
56 | static int getDistance(String id1, String id2) {
57 | // 两个Id相同的位置终止地方
58 | int position = 0;
59 | String[] array1 = id1.split("-");
60 | String[] array2 = id2.split("-");
61 | for (position = 0; position < array1.length && position < array2.length; position++) {
62 | if (!array1[position].equals(array2[position])) {
63 | return array1.length + array2.length - position - position;
64 | }
65 | }
66 |
67 | if (array1.length == array2.length) {
68 | return 0;
69 | } else if (array1.length == position) {
70 | return array2.length - position;
71 | } else {
72 | return array1.length - position;
73 | }
74 | }
75 | }
76 |
--------------------------------------------------------------------------------
/src/test/java/ruc/irm/similarity/word/hownet2/HownetSimilarityTest.java:
--------------------------------------------------------------------------------
1 | package ruc.irm.similarity.word.hownet2;
2 |
3 | import junit.framework.TestCase;
4 |
5 | import org.junit.Before;
6 | import org.junit.Test;
7 |
8 | import ruc.irm.similarity.word.hownet2.concept.LiuConceptParser;
9 | import ruc.irm.similarity.word.hownet2.concept.XiaConceptParser;
10 |
11 | public class HownetSimilarityTest extends TestCase {
12 | XiaConceptParser xParser = null;
13 | LiuConceptParser lParser = null;
14 |
15 | @Before
16 | public void setUp(){
17 | xParser = XiaConceptParser.getInstance();
18 | lParser = LiuConceptParser.getInstance();
19 | }
20 |
21 | @Test
22 | public void testWordSimiarltiy(){
23 | String word1 = "电动车";
24 | String word2 = "自行车";
25 | double x_sim = xParser.getSimilarity(word1, word2);
26 | double l_sim = lParser.getSimilarity(word1, word2);
27 | assertTrue(x_sim>l_sim);
28 | assertTrue(x_sim>0.2);
29 | }
30 |
31 | /**
32 | * 该词语计算相似度时出现死循环,bug由北京大学计算语言学研究所万富强提供,fqw0000@gmail.com
33 | */
34 | @Test
35 | public void testWordSimiarltiy2(){
36 | String word1 = "算法";
37 | String word2 = "安提瓜和巴布达";
38 | double x_sim = xParser.getSimilarity(word1, word2);
39 | double l_sim = lParser.getSimilarity(word1, word2);
40 | assertTrue(x_sim>=l_sim);
41 | System.out.println("x_sim:" + x_sim);
42 | System.out.println("l_sim:" + l_sim);
43 |
44 | }
45 | }
46 |
47 |
48 |
--------------------------------------------------------------------------------
/中文信息相似度计算理论与方法图书目录.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamxiatian/xsimilarity/812ec85d3341731e615c3cd91af17c1df4840853/中文信息相似度计算理论与方法图书目录.pdf
--------------------------------------------------------------------------------