├── .gitignore
├── README.md
├── bin
├── start
└── start.py
├── dict
├── sentiment
│ ├── 主张词语(中文).txt
│ ├── 主张词语(英文).txt
│ ├── 正面情感词语(中文).txt
│ ├── 正面情感词语(英文).txt
│ ├── 正面评价词语(中文).txt
│ ├── 正面评价词语(英文).txt
│ ├── 程度级别词语(中文).txt
│ ├── 程度级别词语(英文).txt
│ ├── 统计结果.txt
│ ├── 负面情感词语(中文).txt
│ ├── 负面情感词语(英文).txt
│ ├── 负面评价词语(中文).txt
│ └── 负面评价词语(英文).txt
├── tendency
│ └── tendency.xml
└── user-concept.xml
├── doc
├── HISTORY.md
├── LCMC.zip
├── REVISION.md
└── 中文信息相似度计算理论与方法图书目录.pdf
├── pom.xml
└── src
├── main
├── java
│ └── zx
│ │ └── soft
│ │ ├── classification
│ │ ├── Feature.java
│ │ ├── Instance.java
│ │ ├── NaiveBayesClassifier.java
│ │ └── Variable.java
│ │ ├── similarity
│ │ ├── Similaritable.java
│ │ ├── SimilarityFactory.java
│ │ ├── phrase
│ │ │ └── PhraseSimilarity.java
│ │ ├── sentence
│ │ │ ├── SegmentProxy.java
│ │ │ ├── SentenceSimilarity.java
│ │ │ ├── editdistance
│ │ │ │ ├── Block.java
│ │ │ │ ├── CharEditUnit.java
│ │ │ │ ├── ChunkEditUnit.java
│ │ │ │ ├── EditDistance.java
│ │ │ │ ├── EditUnit.java
│ │ │ │ ├── GregorEditDistance.java
│ │ │ │ ├── Split.java
│ │ │ │ ├── StandardEditDistance.java
│ │ │ │ ├── SuperString.java
│ │ │ │ ├── WordEditUnit.java
│ │ │ │ ├── XiatianEditDistance.java
│ │ │ │ └── XiatianEditDistance2.java
│ │ │ └── morphology
│ │ │ │ ├── MorphoSimilarity.java
│ │ │ │ └── SemanticSimilarity.java
│ │ ├── statistic
│ │ │ ├── DictStatistic.java
│ │ │ └── LCMC.java
│ │ ├── text
│ │ │ └── DiceSimilarity.java
│ │ ├── util
│ │ │ ├── About.java
│ │ │ ├── BlankUtils.java
│ │ │ ├── EditDistance.java
│ │ │ ├── F02-GB2312-to-PuTongHua-PinYin.txt
│ │ │ ├── FileUtils.java
│ │ │ ├── MathUtils.java
│ │ │ ├── PinyinUtils.java
│ │ │ ├── TraverseEvent.java
│ │ │ ├── XmlException.java
│ │ │ ├── XmlUtils.java
│ │ │ └── about.html
│ │ └── word
│ │ │ ├── CharBasedSimilarity.java
│ │ │ ├── WordSimilarity.java
│ │ │ ├── cilin
│ │ │ ├── Cilin.java
│ │ │ ├── CilinCoding.java
│ │ │ ├── CilinDb.java
│ │ │ └── cilin.db.gz
│ │ │ ├── hownet
│ │ │ ├── Hownet.java
│ │ │ ├── HownetMeta.java
│ │ │ ├── concept
│ │ │ │ ├── Concept.java
│ │ │ │ ├── ConceptDictTraverseEvent.java
│ │ │ │ ├── ConceptLinkedList.java
│ │ │ │ ├── ConceptParser.java
│ │ │ │ ├── LiuConceptParser.java
│ │ │ │ ├── MyConceptParser.java
│ │ │ │ └── concept.dat
│ │ │ └── sememe
│ │ │ │ ├── FastSimpleMap.java
│ │ │ │ ├── LiuqunSememeParser.java
│ │ │ │ ├── MySememeParser.java
│ │ │ │ ├── Sememe.java
│ │ │ │ ├── SememeDictTraverseEvent.java
│ │ │ │ ├── SememeParser.java
│ │ │ │ ├── SememeType.java
│ │ │ │ └── sememe.dat
│ │ │ ├── hownet2
│ │ │ ├── concept
│ │ │ │ ├── BaseConceptParser.java
│ │ │ │ ├── Concept.java
│ │ │ │ ├── ConceptDictTraverseEvent.java
│ │ │ │ ├── ConceptLinkedList.java
│ │ │ │ ├── LiuConceptParser.java
│ │ │ │ ├── XiaConceptParser.java
│ │ │ │ └── concept.xml.gz
│ │ │ └── sememe
│ │ │ │ ├── BaseSememeParser.java
│ │ │ │ ├── LiuqunSememeParser.java
│ │ │ │ ├── Sememe.java
│ │ │ │ ├── SememeType.java
│ │ │ │ ├── XiaSememeParser.java
│ │ │ │ └── sememe.xml.gz
│ │ │ └── pinyin
│ │ │ └── PinyinSimilarity.java
│ │ ├── tendency
│ │ └── word
│ │ │ ├── HownetWordTendency.java
│ │ │ ├── Training.java
│ │ │ └── WordTendency.java
│ │ └── ui
│ │ ├── PhraseSimilarityUI.java
│ │ ├── SememeTreeUI.java
│ │ ├── SentenceSimilarityUI.java
│ │ ├── Start.java
│ │ ├── TendencyUI.java
│ │ └── WordSimlarityUI.java
└── resources
│ ├── data
│ ├── F02-GB2312-to-PuTongHua-PinYin.txt
│ ├── about.html
│ ├── cilin.db.gz
│ ├── concept.dat
│ ├── concept.xml.gz
│ ├── sememe.dat
│ └── sememe.xml.gz
│ └── logback.xml
└── test
├── java
└── zx
│ └── soft
│ └── similarity
│ ├── sentence
│ └── SemanticSimilarityTest.java
│ ├── statistic
│ └── DictStatisticTest.java
│ └── word
│ ├── CharBasedSimilarityTest.java
│ ├── hownet
│ ├── ConceptTest.java
│ └── SememeTest.java
│ └── hownet2
│ └── HownetSimilarityTest.java
└── resources
├── data
├── F02-GB2312-to-PuTongHua-PinYin.txt
├── about.html
├── cilin.db.gz
├── concept.dat
├── concept.xml.gz
├── sememe.dat
└── sememe.xml.gz
└── logback-test.xml
/.gitignore:
--------------------------------------------------------------------------------
1 | .classpath
2 | .project
3 | .settings/
4 | target/
5 | logs/
6 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # 中文语义相似度计算框架
3 |
4 | > 汉语词语、组块、句子以及文本篇章等各个层面的相似度计算是中文信息处理领域的一项基础而又核心的工作,它直接决定着相关领域的研究发展状况,例如,在知识工程、基于实例的机器翻译、信息检索、自动问答以及拼写检查等方面,相似度计算都是一个非常关键的问题,长期以来一直是人们研究的一个热点和难点。相似度的研究涉及词语、组块、句子以及篇章等多个层面,目前的研究主要侧重于词语方面,提出了一些比较有代表性的理论与方法,如字面相似度算法、词素相似度算法,以及基于同义词词林、知网等语义词典的方法,国外的方法则主要包括基于构成字符的相似度计算方法、基于WORDNET的计算方法、基于词典注释的方法、基于大规模语料库统计的方法和基于搜索引擎的方法;有关组块、短语级别的相似度的研究现在还比较少,常用的方法是在词语相似度计算的基础上,借用句子相似度的计算方法计算组块之间的相似度。在句子层面的相似度计算方面,国外研究主要集中在字符串的相似度计算,国内则主要以词语为基本处理单元,通过计算相同词语所占的比重确定句子之间的相似度;文本层面的则集中于利用统计方法实现相似度计算。
5 |
6 | > 在相似度计算的研究过程中,许多研究学者的成果公布和无私帮助让我们受益匪浅。我们把代码开源出来,既是对前辈们表达我们的尊重之情,也希望能对大家共同的研究社区有点滴贡献,避免一些重复工作。
7 |
8 | 运行zx.soft.ui.Start进行快速测试。
9 |
10 |
11 |
--------------------------------------------------------------------------------
/bin/start:
--------------------------------------------------------------------------------
#!/bin/sh
# Launch the similarity demo UI with the legacy (pre-Maven) jar layout.
#
# X_HOME is the project root. It may be preset in the environment; when it is
# not, it defaults to the parent of the directory that contains this script,
# so the launcher no longer depends on one developer's home directory.
X_HOME=${X_HOME:-$(cd "$(dirname "$0")/.." && pwd)}

# NOTE(review): the jar names and the main class below are unchanged from the
# original script; the Maven build declares zx.soft.ui.Start as its main
# class, so this entry point may be stale -- confirm before relying on it.
CLASSPATH_ENTRIES="$X_HOME/lib/commons-logging-1.0.4.jar:$X_HOME/lib/log4j-1.2.8.jar:$X_HOME/lib/google-collect-1.0.jar:$X_HOME/lib/ictclas4j.jar:$X_HOME/bin"

java -Dswing.systemlaf=javax.swing.plaf.metal.MetalLookAndFeel -cp "$CLASSPATH_ENTRIES" ruc.irm.similarity.MainUI
--------------------------------------------------------------------------------
/bin/start.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | import os, sys, re
3 |
# Default project home. The launcher code further down computes the real
# project root itself; this constant is kept for backward compatibility.
HOME = './'


def getlibraries(HOME):
    """Build the Java classpath string for the project rooted at ``HOME``.

    The result contains, in order: a leading entry (``"."`` on Windows, an
    empty entry elsewhere), the project's class/resource/source directories,
    and everything found under ``HOME/lib``. Direct children of ``lib`` are
    added as-is (except ``.svn``); for sub-directories only their ``*.jar``
    files are added. Entries are joined with ``;`` on Windows and ``:``
    elsewhere.

    A missing ``lib`` directory is not an error: it simply contributes no
    entries.
    """
    # The parameter used to be ignored in favour of a module-level ``home``
    # global; bind it locally so the function works on its own.
    home = HOME
    is_windows = os.name == 'nt'
    split_char = ';' if is_windows else ':'

    entries = ['.' if is_windows else '']
    for suffix in ('/target/classes/main',
                   '/target/classes/test',
                   '/src/main/resources',
                   '/src/main/java'):
        entries.append(home + suffix)

    libdir = home + "/lib"
    if os.path.isdir(libdir):
        # sorted() keeps the classpath order deterministic across platforms.
        for jar in sorted(os.listdir(libdir)):
            if jar == ".svn":
                continue
            fullname = os.path.join(libdir, jar)
            if os.path.isdir(fullname):
                for subjar in sorted(os.listdir(fullname)):
                    if subjar.endswith('.jar'):
                        entries.append(os.path.join(fullname, subjar))
            else:
                entries.append(fullname)

    return split_char.join(entries)
32 |
import subprocess

# Resolve the project root: when the script is started from inside bin/,
# the root is one level up; otherwise the current directory is used.
# ``home`` stays a module-level name because getlibraries() may read it.
home = os.getcwd()
if os.path.basename(home) == 'bin':
    home = os.path.join(home, '..')

libpath = getlibraries(home)

# The JVM is started from an argument list rather than a shell string, so
# class names / arguments containing spaces or quotes are passed through
# unchanged and are never interpreted by a shell.
java_args = ['java', '-Xmx256M', '-cp', libpath]

if len(sys.argv) == 1:
    # No class given: show the usage hint and fall back to the default UI.
    print("useage:./start.py runclass")
    java_args.append('ruc.irm.similarity.MainUI')
else:
    java_args.extend(sys.argv[1:])

# Human-readable form of the command, for the log line only.
command = 'java -Xmx256M -cp "' + libpath + '" ' + ' '.join(java_args[4:])

# Single-argument print(...) calls behave the same on Python 2 and Python 3.
print("execute  " + command)
print("xiatian@ruc.")
print(os.getcwd())

# Propagate the JVM's exit status to whoever invoked this launcher.
sys.exit(subprocess.call(java_args))
52 |
--------------------------------------------------------------------------------
/dict/sentiment/主张词语(中文).txt:
--------------------------------------------------------------------------------
1 | 中文主张词语 38
2 |
3 | 1. {perception|感知} 22
4 | 察觉
5 | 触目
6 | 耳闻
7 | 发
8 | 发觉
9 | 发现
10 | 风闻
11 | 感
12 | 感觉
13 | 感觉到
14 | 感受到
15 | 见到
16 | 见得
17 | 觉
18 | 觉得
19 | 看得出来
20 | 窥见
21 | 领教
22 | 听说
23 | 痛感
24 | 预感
25 | 自觉
26 |
27 | 2. {regard|认为} 16
28 | 抱定
29 | 当
30 | 道
31 | 感到
32 | 感觉
33 | 觉得
34 | 看
35 | 看待
36 | 论
37 | 认定
38 | 认为
39 | 认准
40 | 想
41 | 相信
42 | 以为
43 | 主张
44 |
--------------------------------------------------------------------------------
/dict/sentiment/主张词语(英文).txt:
--------------------------------------------------------------------------------
1 | 英文主张词语 35
2 |
3 | 1. {perception|感知} 21
4 | be aware of
5 | be conscious
6 | be conscious of
7 | be told
8 | become aware of
9 | detect
10 | discern
11 | discover
12 | feel
13 | find
14 | get a glimpse of
15 | get wind of
16 | have a premonition
17 | hear of
18 | keenly feel
19 | learn through hearsay
20 | meet the eye
21 | notice
22 | perceive
23 | see
24 | sense
25 |
26 | 2. {regard|认为} 14
27 | advocate
28 | believe
29 | consider
30 | feel
31 | firmly believe
32 | hold
33 | look upon
34 | maintain
35 | regard
36 | sense
37 | set one's mind on
38 | stand for
39 | suppose
40 | think
--------------------------------------------------------------------------------
/dict/sentiment/程度级别词语(中文).txt:
--------------------------------------------------------------------------------
1 | 中文程度级别词语 219
2 |
3 | 1. “极其|extreme / 最|most” 69
4 | 百分之百
5 | 倍加
6 | 备至
7 | 不得了
8 | 不堪
9 | 不可开交
10 | 不亦乐乎
11 | 不折不扣
12 | 彻头彻尾
13 | 充分
14 | 到头
15 | 地地道道
16 | 非常
17 | 极
18 | 极度
19 | 极端
20 | 极其
21 | 极为
22 | 截然
23 | 尽
24 | 惊人地
25 | 绝
26 | 绝顶
27 | 绝对
28 | 绝对化
29 | 刻骨
30 | 酷
31 | 满
32 | 满贯
33 | 满心
34 | 莫大
35 | 奇
36 | 入骨
37 | 甚为
38 | 十二分
39 | 十分
40 | 十足
41 | 死
42 | 滔天
43 | 痛
44 | 透
45 | 完全
46 | 完完全全
47 | 万
48 | 万般
49 | 万分
50 | 万万
51 | 无比
52 | 无度
53 | 无可估量
54 | 无以复加
55 | 无以伦比
56 | 要命
57 | 要死
58 | 已极
59 | 已甚
60 | 异常
61 | 逾常
62 | 贼
63 | 之极
64 | 之至
65 | 至极
66 | 卓绝
67 | 最为
68 | 佼佼
69 | 郅
70 | 綦
71 | 齁
72 | 最
73 |
74 | 2. “很|very” 42
75 | 不过
76 | 不少
77 | 不胜
78 | 惨
79 | 沉
80 | 沉沉
81 | 出奇
82 | 大为
83 | 多
84 | 多多
85 | 多加
86 | 多么
87 | 分外
88 | 格外
89 | 够瞧的
90 | 够戗
91 | 好
92 | 好不
93 | 何等
94 | 很
95 | 很是
96 | 坏
97 | 可
98 | 老
99 | 老大
100 | 良
101 | 颇
102 | 颇为
103 | 甚
104 | 实在
105 | 太
106 | 太甚
107 | 特
108 | 特别
109 | 尤
110 | 尤其
111 | 尤为
112 | 尤以
113 | 远
114 | 着实
115 | 曷
116 | 碜
117 |
118 | 3. “较|more” 37
119 | 大不了
120 | 多
121 | 更
122 | 更加
123 | 更进一步
124 | 更为
125 | 还
126 | 还要
127 | 较
128 | 较比
129 | 较为
130 | 进一步
131 | 那般
132 | 那么
133 | 那样
134 | 强
135 | 如斯
136 | 益
137 | 益发
138 | 尤甚
139 | 逾
140 | 愈
141 | 愈 ... 愈
142 | 愈发
143 | 愈加
144 | 愈来愈
145 | 愈益
146 | 远远
147 | 越 ... 越
148 | 越发
149 | 越加
150 | 越来越
151 | 越是
152 | 这般
153 | 这样
154 | 足
155 | 足足
156 |
157 | 4. “稍|-ish” 29
158 | 点点滴滴
159 | 多多少少
160 | 怪
161 | 好生
162 | 还
163 | 或多或少
164 | 略
165 | 略加
166 | 略略
167 | 略微
168 | 略为
169 | 蛮
170 | 稍
171 | 稍稍
172 | 稍微
173 | 稍为
174 | 稍许
175 | 挺
176 | 未免
177 | 相当
178 | 些
179 | 些微
180 | 些小
181 | 一点
182 | 一点儿
183 | 一些
184 | 有点
185 | 有点儿
186 | 有些
187 |
188 | 5. “欠|insufficiently” 12
189 | 半点
190 | 不大
191 | 不丁点儿
192 | 不甚
193 | 不怎么
194 | 聊
195 | 没怎么
196 | 轻度
197 | 弱
198 | 丝毫
199 | 微
200 | 相对
201 |
202 | 6. “超|over” 30
203 | 不为过
204 | 超
205 | 超额
206 | 超外差
207 | 超微结构
208 | 超物质
209 | 出头
210 | 多
211 | 浮
212 | 过
213 | 过度
214 | 过分
215 | 过火
216 | 过劲
217 | 过了头
218 | 过猛
219 | 过热
220 | 过甚
221 | 过头
222 | 过于
223 | 过逾
224 | 何止
225 | 何啻
226 | 开外
227 | 苦
228 | 老
229 | 偏
230 | 强
231 | 溢
232 | 忒
233 |
234 |
235 |
236 |
--------------------------------------------------------------------------------
/dict/sentiment/程度级别词语(英文).txt:
--------------------------------------------------------------------------------
1 | 英文程度级别词语 170
2 |
3 | 1. “极其|extreme / 最|most” 64
4 | 100 percent
5 | absolute
6 | absolutely
7 | alarmingly
8 | amazingly
9 | as fully as possible
10 | astonishingly
11 | awfully
12 | beyond challenge
13 | beyond compare
14 | beyond comparison
15 | beyond measure
16 | bitterly
17 | by all means
18 | completely
19 | deep-rooted
20 | deep-seated
21 | deeply
22 | definitely
23 | disastrously
24 | downright
25 | entirely
26 | exceedingly
27 | excessively
28 | extreme
29 | extremely
30 | fully
31 | greatest
32 | greatly
33 | heinous
34 | hundred-percent
35 | immensely
36 | immoderate
37 | in a penetrating way
38 | in every possible way
39 | in the extreme
40 | incomparably
41 | ingrained
42 | matchlessly
43 | monstrous
44 | most
45 | of the highest degree
46 | out-and-out
47 | outstanding
48 | outstandingly
49 | reach the limit
50 | right-down
51 | sharply
52 | sheer
53 | superb
54 | terribly
55 | to death
56 | to the full
57 | to the letter
58 | to the limit
59 | to the marrow
60 | to the utmost
61 | totally
62 | towering
63 | unusually
64 | utmost
65 | utterly
66 | very much
67 | most
68 |
69 | 2. “很|very” 25
70 | a lot
71 | awfully
72 | badly
73 | better
74 | by far
75 | considerably
76 | deep
77 | disastrously
78 | especially
79 | extraordinarily
80 | extremely
81 | greatly
82 | how
83 | however
84 | indeed
85 | much
86 | particularly
87 | really
88 | terribly
89 | to a serious degree
90 | too far
91 | too much
92 | unusually
93 | very
94 | what a
95 |
96 | 3. “较|more” 22
97 | all the more
98 | as much as
99 | at the worst
100 | by far
101 | comparatively
102 | even more
103 | further
104 | further more
105 | in that way
106 | increasingly
107 | like that
108 | more
109 | more and more
110 | more so
111 | much more
112 | plus
113 | relatively
114 | slightly more
115 | so
116 | still more
117 | such
118 | the more ... the more
119 |
120 | 4. “稍|-ish” 15
121 | a bit
122 | a bit too
123 | a little
124 | a little bit
125 | a little more
126 | fairly
127 | more or less
128 | passably
129 | pretty
130 | quite
131 | rather
132 | slightly
133 | some
134 | somewhat
135 | to some extent
136 |
137 | 5. “欠|insufficiently” 11
138 | a little less
139 | just
140 | light
141 | merely
142 | not particularly
143 | not too
144 | not very
145 | relative
146 | slight
147 | slightest degree of
148 | slightly
149 |
150 | 6. “超|over” 33
151 | a little over
152 | above
153 | above measure
154 | above quota
155 | and more
156 | excessive
157 | excessively
158 | exorbitance
159 | extra
160 | far more than
161 | hyperphysical
162 | inflated
163 | inordinate
164 | not too much
165 | odd
166 | outrageousness
167 | over
168 | over-
169 | overdone
170 | overheated
171 | plus
172 | slightly more
173 | super
174 | superheated
175 | superheterodyne
176 | surplus
177 | to a fault
178 | too
179 | too much
180 | ultra
181 | ultrastructural
182 | undue
183 | unduly
184 |
185 |
186 |
--------------------------------------------------------------------------------
/dict/tendency/tendency.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
--------------------------------------------------------------------------------
/dict/user-concept.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/doc/HISTORY.md:
--------------------------------------------------------------------------------
1 | 变更历史
2 | ================
3 |
4 | 2014-04: 把中文分词由原先的ictclas4j替换为ansj,在此对原作者表示感谢!同时把工程更改为maven工程,方便管理。
--------------------------------------------------------------------------------
/doc/LCMC.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ml-distribution/semantic-similarity/8deb4382a41ad94a82db15ea945c6bfa8ac103f0/doc/LCMC.zip
--------------------------------------------------------------------------------
/doc/REVISION.md:
--------------------------------------------------------------------------------
1 | 错误修订
2 | =====================
3 |
4 | 1. 第三章概念词语的相似度计算部分的公式:
5 | $Sim(C_1, C_2) = \beta_1 Sim_1(C_1, C_2) + \sum_{i=2}^{4} \beta_1 \beta_i Sim_i(C_1, C_2)$
6 | 应为: $Sim(C_1, C_2) = \beta_1 Sim_1(C_1, C_2) + \sum_{i=2}^{4} Sim_1(C_1, C_2) \beta_i Sim_i(C_1, C_2)$
7 | 可参考以下代码实现:
8 | @Override
9 | protected double calculate(double sim_v1, double sim_v2, double sim_v3, double sim_v4) {
10 | return beta1 * sim_v1 + beta2 * sim_v1 * sim_v2 + beta3 * sim_v1 * sim_v3 + beta4 * sim_v1 * sim_v4;
11 | }
12 |
--------------------------------------------------------------------------------
/doc/中文信息相似度计算理论与方法图书目录.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ml-distribution/semantic-similarity/8deb4382a41ad94a82db15ea945c6bfa8ac103f0/doc/中文信息相似度计算理论与方法图书目录.pdf
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 |
5 | zx.soft
6 | semantic-similarity
7 | 1.0.0
8 | Semantic Similarity
9 |
10 |
11 | UTF-8
12 | 4.11
13 | 1.7.7
14 | 1.1.2
15 | 3.3.1
16 |
17 |
18 |
19 |
20 |
21 | ch.qos.logback
22 | logback-classic
23 | ${logback.version}
24 |
25 |
26 | ch.qos.logback
27 | logback-core
28 | ${logback.version}
29 |
30 |
31 | ch.qos.logback
32 | logback-access
33 | ${logback.version}
34 |
35 |
36 | org.slf4j
37 | slf4j-api
38 | ${slf4j.version}
39 |
40 |
41 |
42 | org.ansj
43 | tree_split
44 | 1.2
45 |
46 |
47 | org.ansj
48 | ansj_seg
49 | 1.3
50 |
51 |
52 | org.mockito
53 | mockito-all
54 | 1.9.5
55 |
56 |
57 | org.hamcrest
58 | hamcrest-all
59 | 1.3
60 |
61 |
62 | args4j
63 | args4j
64 | 2.0.16
65 |
66 |
67 | com.google.guava
68 | guava
69 | 13.0.1
70 |
71 |
72 | org.apache.commons
73 | commons-lang3
74 | ${commons.lang3.version}
75 |
76 |
77 | com.google.collections
78 | google-collections
79 | 1.0
80 |
81 |
82 |
83 | junit
84 | junit
85 | ${junit.version}
86 | test
87 |
88 |
89 |
90 |
91 |
92 | cengtral
93 | http://repo1.maven.org/maven2/
94 |
95 |
96 |
97 | Java.Net
98 | http://download.java.net/maven/2/
99 |
100 |
101 | ansj-repo
102 | http://maven.ansj.org/
103 |
104 |
105 | info-bliki-repository
106 | http://gwtwiki.googlecode.com/svn/maven-repository/
107 |
108 | true
109 |
110 |
111 | false
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 | org.apache.maven.plugins
120 | maven-compiler-plugin
121 | 3.1
122 |
123 | 1.7
124 | 1.7
125 | UTF-8
126 |
127 |
128 |
129 | org.apache.maven.plugins
130 | maven-source-plugin
131 | 2.2.1
132 |
133 |
134 | attach-sources
135 | verify
136 |
137 | jar-no-fork
138 |
139 |
140 |
141 |
142 |
143 | org.apache.maven.plugins
144 | maven-resources-plugin
145 | 2.6
146 |
147 | UTF-8
148 |
149 |
150 |
151 | org.apache.maven.plugins
152 | maven-surefire-plugin
153 | 2.16
154 |
155 |
156 | maven-assembly-plugin
157 | 2.4
158 |
159 |
160 |
161 | zx.soft.ui.Start
162 |
163 |
164 |
165 | jar-with-dependencies
166 |
167 |
168 |
169 |
170 | make-assembly
171 | package
172 |
173 | single
174 |
175 |
176 |
177 |
178 |
179 | org.codehaus.mojo
180 | exec-maven-plugin
181 |
182 |
183 |
184 | ${project.artifactId}-${project.version}
185 |
186 |
187 |
188 |
189 |
190 |
191 | zxsoft-snapshots
192 | Nexus Snapshot Repository
193 | http://192.168.3.23:18081/nexus/content/repositories/snapshots/
194 |
195 |
196 | sentiment
197 | http://192.168.3.23:18081/nexus/content/repositories/sentiment
198 |
199 |
200 |
201 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/classification/Feature.java:
--------------------------------------------------------------------------------
1 | package zx.soft.classification;
2 |
3 | import java.io.DataInput;
4 | import java.io.DataOutput;
5 | import java.io.IOException;
6 | import java.util.HashMap;
7 | import java.util.Map;
8 |
9 | /**
10 | * 文档的特征
11 | *
12 | */
13 | public class Feature {
14 |
15 | /** 每个关键词在不同类别中出现的文档数量 */
16 | private Map docCountMap = new HashMap<>();
17 | /** 特征名称 */
18 | private String name;
19 |
20 | public String getName() {
21 | return name;
22 | }
23 |
24 | public void setName(String name) {
25 | this.name = name;
26 | }
27 |
28 | public void incDocCount(String category) {
29 | if (docCountMap.containsKey(category)) {
30 | docCountMap.put(category, docCountMap.get(category) + 1);
31 | } else {
32 | docCountMap.put(category, 1);
33 | }
34 | }
35 |
36 | public int getDocCount(String category) {
37 | if (docCountMap.containsKey(category)) {
38 | return docCountMap.get(category);
39 | } else {
40 | return 0;
41 | }
42 | }
43 |
44 | public void write(DataOutput out) throws IOException {
45 | out.writeUTF(name == null ? "" : name);
46 |
47 | out.writeInt(docCountMap.size());
48 | for (String category : docCountMap.keySet()) {
49 | out.writeUTF(category);
50 | out.writeInt(docCountMap.get(category));
51 | }
52 | }
53 |
54 | public void readFields(DataInput in) throws IOException {
55 | this.name = in.readUTF();
56 |
57 | docCountMap = new HashMap<>();
58 | int size = in.readInt();
59 | for (int i = 0; i < size; i++) {
60 | String category = in.readUTF();
61 | int docCount = in.readInt();
62 | docCountMap.put(category, docCount);
63 | }
64 | }
65 |
66 | public static Feature read(DataInput in) throws IOException {
67 | Feature f = new Feature();
68 | f.readFields(in);
69 | return f;
70 | }
71 |
72 | }
73 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/classification/Instance.java:
--------------------------------------------------------------------------------
1 | package zx.soft.classification;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.File;
5 | import java.io.FileInputStream;
6 | import java.io.IOException;
7 | import java.io.InputStreamReader;
8 | import java.util.HashSet;
9 | import java.util.List;
10 | import java.util.Set;
11 |
12 | import org.slf4j.Logger;
13 | import org.slf4j.LoggerFactory;
14 |
15 | import zx.soft.similarity.sentence.SegmentProxy;
16 | import zx.soft.similarity.sentence.SegmentProxy.Word;
17 |
18 | /**
19 | * 代表一个文档实例
20 | *
21 | */
22 | public class Instance {
23 |
24 | private static Logger logger = LoggerFactory.getLogger(Instance.class);
25 |
26 | /** 文档类别 */
27 | private String category;
28 | /** 文档内容 */
29 | private final Set bag = new HashSet<>();
30 |
31 | public Instance() {
32 | //
33 | }
34 |
35 | public Instance(String category, File f, String encoding) {
36 | this.category = category;
37 | String line = null;
38 |
39 | try (BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(f), encoding));) {
40 | while ((line = in.readLine()) != null) {
41 | // System.out.println(line);
42 | List words = SegmentProxy.segment(line);
43 | for (Word w : words) {
44 | if (w.getPos().endsWith("adj") || w.getPos().startsWith("n") || w.getPos().startsWith("v")) {
45 | bag.add(w.getWord());
46 | }
47 | }
48 | }
49 | } catch (IOException e) {
50 | logger.error("current file:{},current line:{}", f.getAbsolutePath(), line);
51 | e.printStackTrace();
52 | }
53 | }
54 |
55 | public String getCategory() {
56 | return category;
57 | }
58 |
59 | public void setCategory(String category) {
60 | this.category = category;
61 | }
62 |
63 | public Set getWords() {
64 | return bag;
65 | }
66 |
67 | }
68 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/classification/NaiveBayesClassifier.java:
--------------------------------------------------------------------------------
1 | package zx.soft.classification;
2 |
3 | import java.io.DataInputStream;
4 | import java.io.DataOutput;
5 | import java.io.DataOutputStream;
6 | import java.io.File;
7 | import java.io.FileInputStream;
8 | import java.io.FileOutputStream;
9 | import java.io.IOException;
10 | import java.util.Collection;
11 | import java.util.HashMap;
12 | import java.util.Map;
13 |
14 | public class NaiveBayesClassifier {
15 |
16 | /**
17 | * 记录每个类别下出现的文档数量, 用于计算P(C)使用
18 | */
19 | Variable VARIABLE = new Variable();
20 |
21 | /**
22 | * 词语在所有类别中的总数量
23 | */
24 | Map TERM_TOTAL_COUNT = new HashMap<>();
25 |
26 | /**
27 | * 训练一篇文档
28 | * @param doc
29 | */
30 | public void training(Instance doc) {
31 | VARIABLE.addInstance(doc);
32 | }
33 |
34 | /**
35 | * 保存训练结果
36 | * @throws IOException
37 | */
38 | void save(File file) throws IOException {
39 | DataOutput out = new DataOutputStream(new FileOutputStream(file));
40 | VARIABLE.write(out);
41 | }
42 |
43 | public void load(File file) throws IOException {
44 | DataInputStream in = new DataInputStream(new FileInputStream(file));
45 | VARIABLE = Variable.read(in);
46 | }
47 |
48 | /**
49 | * 计算P(C)
50 | * @param category
51 | * @return
52 | */
53 | public double getCategoryProbability(String category) {
54 | return Math.log(VARIABLE.getDocCount(category) * 1.0f / VARIABLE.getDocCount());
55 | }
56 |
57 | /**
58 | * 计算P(feature|cateogry),返回的是取对数后的数值
59 | * @param feature
60 | * @param category
61 | * @return
62 | */
63 | public double getFeatureProbability(String feature, String category) {
64 | int m = VARIABLE.getFeatureCount();
65 | return Math.log((VARIABLE.getDocCount(feature, category) + 1.0) / (VARIABLE.getDocCount(category) + m));
66 | }
67 |
68 | /**
69 | * 计算给定实例文档属于指定类别的概率,返回的是取对数后的数值
70 | * @param category
71 | * @param doc
72 | * @return
73 | */
74 | public double getProbability(String category, Instance doc) {
75 | double result = getCategoryProbability(category);
76 | for (String feature : doc.getWords()) {
77 | if (VARIABLE.containFeature(feature)) {
78 | result += getFeatureProbability(feature, category);
79 | }
80 | }
81 | return result;
82 | }
83 |
84 | public String getCategory(Instance doc) {
85 | Collection categories = VARIABLE.getCategories();
86 | double best = Double.NEGATIVE_INFINITY;
87 | String bestName = null;
88 | for (String c : categories) {
89 | double current = getProbability(c, doc);
90 | // System.out.println(c + ":" + current);
91 | if (best < current) {
92 | best = current;
93 | bestName = c;
94 | }
95 | }
96 | return bestName;
97 | }
98 |
99 | public static void main(String[] args) throws IOException {
100 | NaiveBayesClassifier classifier = new NaiveBayesClassifier();
101 |
102 | // File samplePath = new File("./corpus/Sample");
103 | // for(File categoryPath:samplePath.listFiles()){
104 | // String category = categoryPath.getName();
105 | // for(File f:categoryPath.listFiles()){
106 | // classifier.training(new Instance(category, f, "GBK"));
107 | // }
108 | // }
109 | // classifier.save(new File("result.dat"));
110 | // System.out.println("Finished!");
111 |
112 | classifier.load(new File("result.dat"));
113 |
114 | Instance doc = new Instance(null, new File("/tmp/10.txt"), "GBK");
115 | System.out.println(classifier.getCategory(doc));
116 |
117 | }
118 |
119 | }
120 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/classification/Variable.java:
--------------------------------------------------------------------------------
1 | package zx.soft.classification;
2 |
3 | import java.io.DataInput;
4 | import java.io.DataOutput;
5 | import java.io.IOException;
6 | import java.util.Collection;
7 | import java.util.HashMap;
8 | import java.util.Map;
9 |
10 | /**
11 | * 分类的类别
12 | *
13 | */
14 | public class Variable {
15 |
16 | /** 类别信息 */
17 | Map categoryMap = new HashMap<>();
18 |
19 | Map features = new HashMap<>();
20 |
21 | /** 所有文档的数量 */
22 | private int docCount = 0;
23 |
24 | public void write(DataOutput out) throws IOException {
25 | //保存文档总数
26 | out.writeInt(docCount);
27 |
28 | //写入类别总数
29 | out.writeInt(categoryMap.size());
30 | for (String category : categoryMap.keySet()) {
31 | out.writeUTF(category);
32 | categoryMap.get(category).write(out);
33 | }
34 |
35 | //写入Feature总数
36 | out.writeInt(features.size());
37 | for (String key : features.keySet()) {
38 | out.writeUTF(key);
39 | features.get(key).write(out);
40 | }
41 | }
42 |
43 | public void readFields(DataInput in) throws IOException {
44 | this.docCount = in.readInt();
45 |
46 | int size = in.readInt();
47 | categoryMap = new HashMap();
48 | for (int i = 0; i < size; i++) {
49 | String category = in.readUTF();
50 | CategoryInfo info = CategoryInfo.read(in);
51 | categoryMap.put(category, info);
52 | }
53 |
54 | size = in.readInt();
55 | features = new HashMap();
56 | for (int i = 0; i < size; i++) {
57 | String word = in.readUTF();
58 | Feature feature = Feature.read(in);
59 | features.put(word, feature);
60 | }
61 | }
62 |
63 | public static Variable read(DataInput in) throws IOException {
64 | Variable v = new Variable();
65 | v.readFields(in);
66 | return v;
67 | }
68 |
69 | public Collection getCategories() {
70 | return categoryMap.keySet();
71 | }
72 |
73 | public int getFeatureCount() {
74 | return features.size();
75 | }
76 |
77 | public boolean containFeature(String feature) {
78 | return features.containsKey(feature);
79 | }
80 |
81 | public void incDocCount() {
82 | this.docCount++;
83 | }
84 |
85 | public int getDocCount() {
86 | return this.docCount;
87 | }
88 |
89 | /**
90 | * 获取置顶类别下的文档数量
91 | * @param category
92 | * @return
93 | */
94 | public int getDocCount(String category) {
95 | return categoryMap.get(category).getDocCount();
96 | }
97 |
98 | /**
99 | * 获取feature在指定类别下的文档出现数量
100 | * @param feature
101 | * @param category
102 | * @return
103 | */
104 | public int getDocCount(String feature, String category) {
105 | Feature f = features.get(feature);
106 | if (f != null) {
107 | return f.getDocCount(category);
108 | }
109 | return 0;
110 | }
111 |
112 | public void addInstance(Instance instance) {
113 | incDocCount();
114 | CategoryInfo info = null;
115 | if (categoryMap.containsKey(instance.getCategory())) {
116 | info = categoryMap.get(instance.getCategory());
117 | } else {
118 | info = new CategoryInfo();
119 | }
120 | info.incDocCount();
121 | categoryMap.put(instance.getCategory(), info);
122 |
123 | for (String word : instance.getWords()) {
124 | Feature feature = features.get(word);
125 |
126 | if (feature == null)
127 | feature = new Feature();
128 |
129 | feature.setName(word);
130 | feature.incDocCount(instance.getCategory());
131 |
132 | features.put(word, feature);
133 | }
134 | }
135 |
136 | public static class CategoryInfo {
137 | private int docCount;
138 |
139 | public int getDocCount() {
140 | return docCount;
141 | }
142 |
143 | public void incDocCount() {
144 | this.docCount++;
145 | }
146 |
147 | public void setDocCount(int docCount) {
148 | this.docCount = docCount;
149 | }
150 |
151 | public void write(DataOutput out) throws IOException {
152 | out.writeInt(docCount);
153 | }
154 |
155 | public void readFields(DataInput in) throws IOException {
156 | this.docCount = in.readInt();
157 | }
158 |
159 | public static CategoryInfo read(DataInput in) throws IOException {
160 | CategoryInfo c = new CategoryInfo();
161 | c.readFields(in);
162 | return c;
163 | }
164 | }
165 |
166 | }
167 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/Similaritable.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity;
2 |
/**
 * Contract for components that can score how similar two strings are.
 */
public interface Similaritable {

    /**
     * Computes the similarity of two strings. For sentences this is the
     * sentence similarity; for words it is the word similarity.
     *
     * @param item1 the first string taking part in the comparison
     * @param item2 the second string taking part in the comparison
     * @return the similarity of the two strings
     */
    double getSimilarity(String item1, String item2);

}
18 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/SimilarityFactory.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity;
2 |
3 | import zx.soft.similarity.sentence.SentenceSimilarity;
4 | import zx.soft.similarity.sentence.morphology.MorphoSimilarity;
5 | import zx.soft.similarity.word.WordSimilarity;
6 | import zx.soft.similarity.word.hownet2.concept.XiaConceptParser;
7 |
8 | public class SimilarityFactory {
9 |
10 | private static WordSimilarity wordSimilarity = XiaConceptParser.getInstance();
11 | private static SentenceSimilarity sentenceSimilarity = MorphoSimilarity.getInstance();
12 |
13 | private SimilarityFactory() {
14 | //
15 | }
16 |
17 | public static WordSimilarity getWordSimilarity() {
18 | return wordSimilarity;
19 | }
20 |
21 | public static SentenceSimilarity getSentenceSimilarity() {
22 | return sentenceSimilarity;
23 | }
24 |
25 | }
26 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/phrase/PhraseSimilarity.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.phrase;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 |
6 | import zx.soft.similarity.Similaritable;
7 |
8 | /**
9 | * A simple phrase similarity measure; for the algorithm see p. 69 of the book "Theory and Methods of Chinese Information Similarity Computation".
10 | * Empty phrases are handled explicitly so that none of the methods returns NaN from a 0/0 division.
11 | */
12 | public class PhraseSimilarity implements Similaritable {
13 | // Similarity is the mean of the two directional scores; two empty phrases count as identical (1.0), like any other pair of equal phrases.
14 | @Override
15 | public double getSimilarity(String item1, String item2) {
16 | return (item1.isEmpty() && item2.isEmpty()) ? 1.0 : (getSC(item1, item2) + getSC(item2, item1)) / 2.0;
17 | }
18 | // Collects every index of second whose character equals first.charAt(pos).
19 | public List getC(String first, String second, int pos) {
20 | List results = new ArrayList();
21 | char ch = first.charAt(pos);
22 | for (int i = 0; i < second.length(); i++) {
23 | if (ch == second.charAt(i)) {
24 | results.add(i);
25 | }
26 | }
27 | return results;
28 | }
29 | // Smallest |k - pos| over the matching indices k; second.length() when the character does not occur in second.
30 | public int getDistance(String first, String second, int pos) {
31 | int d = second.length();
32 | for (int k : getC(first, second, pos)) {
33 | int value = Math.abs(k - pos);
34 | if (d > value) {
35 | d = value;
36 | }
37 | }
38 |
39 | return d;
40 | }
41 | // Per-character closeness (second.length() - distance) / second.length(); 0.0 when second is empty (previously 0/0 = NaN).
42 | public double getCC(String first, String second, int pos) {
43 | return second.isEmpty() ? 0.0 : (second.length() - getDistance(first, second, pos)) * 1.0 / second.length();
44 | }
45 | // Directional score: mean of getCC over all positions of first; 0.0 when first is empty (previously 0/0 = NaN).
46 | public double getSC(String first, String second) {
47 | double total = 0.0;
48 | for (int i = 0; i < first.length(); i++) {
49 | total = total + getCC(first, second, i);
50 | }
51 | return first.isEmpty() ? 0.0 : total / first.length();
52 | }
53 |
54 | }
55 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/sentence/SegmentProxy.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.sentence;
2 |
3 | import java.awt.BorderLayout;
4 | import java.awt.GridLayout;
5 | import java.awt.event.ActionEvent;
6 | import java.awt.event.ActionListener;
7 | import java.util.ArrayList;
8 | import java.util.List;
9 |
10 | import javax.swing.BorderFactory;
11 | import javax.swing.JButton;
12 | import javax.swing.JLabel;
13 | import javax.swing.JPanel;
14 | import javax.swing.JScrollPane;
15 | import javax.swing.JTextArea;
16 | import javax.swing.JTextField;
17 |
18 | import org.ansj.domain.Term;
19 | import org.ansj.recognition.NatureRecognition;
20 | import org.ansj.splitWord.analysis.ToAnalysis;
21 |
22 | /**
23 | * Proxy around the lexical analyser (word segmentation plus part-of-speech tagging); the code below calls ansj's ToAnalysis and NatureRecognition.
24 | * NOTE(review): the original comment named Ictclas4j and a lazily loaded singleton, neither of which is visible in this class - confirm.
25 | *
26 | */
27 | public class SegmentProxy {
28 | // Simple mutable value object pairing a segmented word with its part-of-speech code.
29 | public static class Word {
30 | /**
31 | * Text of the word
32 | */
33 | private String word;
34 | /**
35 | * Part-of-speech code of the word
36 | */
37 | private String pos;
38 |
39 | public Word(String word, String pos) {
40 | this.word = word;
41 | this.pos = pos;
42 | }
43 |
44 | public String getWord() {
45 | return word;
46 | }
47 |
48 | public void setWord(String word) {
49 | this.word = word;
50 | }
51 |
52 | public String getPos() {
53 | return pos;
54 | }
55 |
56 | public void setPos(String pos) {
57 | this.pos = pos;
58 | }
59 | }
60 | // Segments the sentence with ToAnalysis, runs NatureRecognition over the terms and wraps each term as a Word(name, nature string).
61 | public static List segment(String sentence) {
62 | List results = new ArrayList();
63 | List terms = ToAnalysis.parse(sentence);
64 | new NatureRecognition(terms).recognition();
65 |
66 | for (Term term : terms) {
67 | results.add(new Word(term.getName(), term.getNatrue().natureStr));
68 | }
69 |
70 | return results;
71 | }
72 | // Renders the segmentation as "word/pos " tokens concatenated in order (each token is followed by one space).
73 | public static String getSegmentedString(String sentence) {
74 | List words = segment(sentence);
75 | StringBuilder sb = new StringBuilder();
76 | for (Word word : words) {
77 | sb.append(word.getWord() + "/" + word.getPos()).append(" ");
78 | }
79 | return sb.toString();
80 | }
81 | // Builds the Swing demo panel: a sentence input with a button on top and a result text area below.
82 | public static JPanel createPanel() {
83 | // the outer panel: fullPanel holds a northPanel and a centerPanel
84 | JPanel fullPanel = new JPanel();
85 | fullPanel.setLayout(new BorderLayout());
86 |
87 | JPanel northPanel = new JPanel();
88 | fullPanel.add(northPanel, "North");
89 |
90 | // centerPanel holds the scrollable result text area
91 | JPanel centerPanel = new JPanel();
92 | fullPanel.add(centerPanel, "Center");
93 | centerPanel.setLayout(new BorderLayout());
94 | final JTextArea result = new JTextArea();
95 | //result.setFont(new Font("宋体", Font.PLAIN, 16));
96 | result.setLineWrap(true);
97 | JScrollPane centerScrollPane = new JScrollPane(result);
98 | centerPanel.add(centerScrollPane, "Center");
99 |
100 | northPanel.setLayout(new GridLayout(1, 1));
101 |
102 | // below: the first (and only) panel added to northPanel
103 | final JTextField senField = new JTextField("什么是计算机病毒");
104 | senField.setColumns(50);
105 |
106 | JPanel mainPanel = new JPanel();
107 | mainPanel.setLayout(new GridLayout(2, 1));
108 |
109 | JPanel linePanel = new JPanel();
110 | linePanel.add(new JLabel("句子:"));
111 | linePanel.add(senField);
112 | mainPanel.add(linePanel);
113 |
114 | linePanel = new JPanel();
115 | JButton goButton = new JButton("词法分析");
116 | linePanel.add(goButton);
117 | mainPanel.add(linePanel);
118 | goButton.addActionListener(new ActionListener() {
119 | // On click: segment the entered sentence and put the new result above the text already shown.
120 | @Override
121 | public void actionPerformed(ActionEvent e) {
122 | String sentence = senField.getText();
123 | String text = "[" + sentence + "]的词法分析结果为:";
124 |
125 | text = text + "\n" + getSegmentedString(sentence);
126 | text = text + "\n________________________________\n" + result.getText();
127 | result.setText(text);
128 | }
129 |
130 | });
131 | mainPanel.setBorder(BorderFactory.createEtchedBorder());
132 | northPanel.add(mainPanel);
133 |
134 | return fullPanel;
135 | }
136 |
137 | }
138 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/sentence/SentenceSimilarity.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.sentence;
2 |
3 | import zx.soft.similarity.Similaritable;
4 |
5 | /**
6 | * Sentence similarity interface; adds no methods of its own to Similaritable.
7 | *
8 | * @author wanggang
9 | *
10 | */
11 | public interface SentenceSimilarity extends Similaritable {
12 |
13 | }
14 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/sentence/editdistance/Block.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.sentence.editdistance;
2 |
3 | public class Block {
4 | // Doubly linked list node holding one segment of a SuperString; globalPosition is the offset of the segment's first unit in the undivided string.
5 | private int globalPosition;
6 | /** Content of the block */
7 | private SuperString data;
8 | /** Links to the previous and next block */
9 | private Block prev, next;
10 | /** Whether this block has already been marked by divide(...) */
11 | private boolean divideFlag = false;
12 |
13 | public Block(SuperString string) {
14 | this.data = string;
15 | this.globalPosition = 0;
16 | }
17 |
18 | public Block(SuperString string, int globalBegin) {
19 | this.data = string;
20 | this.globalPosition = globalBegin;
21 | }
22 |
23 | public int getGlobalPosition() {
24 | return globalPosition;
25 | }
26 |
27 | public void setGlobalPosition(int globalPosition) {
28 | this.globalPosition = globalPosition;
29 | }
30 |
31 | public SuperString getData() {
32 | return data;
33 | }
34 |
35 | public void setData(SuperString data) {
36 | this.data = data;
37 | }
38 |
39 | public Block getPrev() {
40 | return prev;
41 | }
42 |
43 | public void setPrev(Block prev) {
44 | this.prev = prev;
45 | }
46 |
47 | public Block getNext() {
48 | return next;
49 | }
50 |
51 | public void setNext(Block next) {
52 | this.next = next;
53 | }
54 |
55 | public boolean isDivideFlag() {
56 | return divideFlag;
57 | }
58 |
59 | public void setDivideFlag(boolean divideFlag) {
60 | this.divideFlag = divideFlag;
61 | }
62 | /** Marks units [start, start + length) of this block as divided; any remainder before/after is split off into new unmarked neighbour blocks linked into the list. */
63 | public void divide(int start, int length) {
64 | if (start == 0 && length == data.length()) {
65 | this.divideFlag = true;
66 | return;
67 | } else if (start == 0) {
68 | // the leading part becomes the marked block, the remainder becomes an unmarked tail that begins `length` units later
69 | Block tail = new Block(data.substring(length), globalPosition + length);
70 | this.setDivideFlag(true);
71 | this.setData(data.substring(0, length));
72 | tail.next = this.next;
73 | if (tail.next != null)
74 | tail.next.prev = tail;
75 | this.next = tail;
76 | tail.prev = this;
77 | } else if (start + length == data.length()) {
78 | // the trailing part becomes the marked block, the part before it becomes an unmarked head
79 | Block head = new Block(data.substring(0, start), globalPosition);
80 |
81 | this.setDivideFlag(true);
82 | this.setData(data.substring(start));
83 | this.setGlobalPosition(globalPosition + start);
84 | head.prev = this.prev;
85 | if (head.prev != null)
86 | head.prev.next = head;
87 | head.next = this;
88 | this.prev = head;
89 | } else {
90 | // the middle part becomes the marked block, the parts before and after it become unmarked head and tail
91 | Block head = new Block(data.substring(0, start), globalPosition);
92 | Block tail = new Block(data.substring(start + length), globalPosition + start + length);
93 |
94 | this.setDivideFlag(true);
95 | this.setData(data.substring(start, start + length));
96 | this.setGlobalPosition(globalPosition + start);
97 |
98 | head.prev = this.prev;
99 | if (head.prev != null)
100 | head.prev.next = head;
101 | head.next = this;
102 | this.prev = head;
103 |
104 | tail.next = this.next;
105 | if (tail.next != null)
106 | tail.next.prev = tail;
107 | this.next = tail;
108 | tail.prev = this;
109 | }
110 | }
111 |
112 | }
113 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/sentence/editdistance/CharEditUnit.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.sentence.editdistance;
2 |
3 | public class CharEditUnit extends EditUnit {
4 | // Edit unit holding a single character, stored as a one-character string.
5 | private String content = "";
6 |
7 | public CharEditUnit(Character ch) {
8 | content = ch.toString(); // a null ch throws NullPointerException here
9 | }
10 | // Returns the wrapped character as a string.
11 | @Override
12 | public String getUnitString() {
13 | return content;
14 | }
15 |
16 | }
17 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/sentence/editdistance/ChunkEditUnit.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.sentence.editdistance;
2 |
3 | public class ChunkEditUnit extends EditUnit {
4 | // Edit unit that wraps a whole chunk, i.e. a SuperString of smaller edit units.
5 | private SuperString extends EditUnit> chunk = null;
6 |
7 | public ChunkEditUnit(SuperString extends EditUnit> chunk) {
8 | this.chunk = chunk;
9 | }
10 | // The unit string is the string form of the wrapped chunk.
11 | @Override
12 | public String getUnitString() {
13 | return chunk.toString();
14 | }
15 |
16 | /**
17 | * Substitution cost: chunk.length() against a non-chunk unit, 0 against an equal unit, otherwise the standard edit distance between the two chunks.
18 | */
19 | @Override
20 | public double getSubstitutionCost(EditUnit otherUnit) {
21 | if (!(otherUnit instanceof ChunkEditUnit))
22 | return chunk.length();
23 | if (equals(otherUnit))
24 | return 0.0;
25 |
26 | ChunkEditUnit other = (ChunkEditUnit) otherUnit;
27 | return new StandardEditDistance().getEditDistance(chunk, other.chunk);
28 | }
29 |
30 | /**
31 | * Deletion cost of this chunk: the number of units it contains
32 | * (overrides the default cost of 1.0 defined by EditUnit).
33 | * @return the deletion cost
34 | */
35 | @Override
36 | public double getDeletionCost() {
37 | return chunk.length();
38 | }
39 |
40 | /**
41 | * Insertion cost of this chunk: the number of units it contains
42 | * (overrides the default cost of 1.0 defined by EditUnit).
43 | */
44 | @Override
45 | public double getInsertionCost() {
46 | return chunk.length();
47 | }
48 |
49 | }
50 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/sentence/editdistance/EditDistance.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.sentence.editdistance;
2 |
3 | import zx.soft.similarity.Similaritable;
4 |
5 | /**
6 | * 编辑距离的父类,定义了其中的主要行为
7 | *
8 | */
9 | public abstract class EditDistance implements Similaritable {
10 |
11 | public abstract double getEditDistance(SuperString extends EditUnit> S, SuperString extends EditUnit> T);
12 |
13 | @Override
14 | public double getSimilarity(String s1, String s2) {
15 | SuperString S = SuperString.createWordSuperString(s1);
16 | SuperString T = SuperString.createWordSuperString(s2);
17 |
18 | return 1 - (getEditDistance(S, T)) / (Math.max(S.length(), T.length()));
19 | }
20 |
21 | }
22 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/sentence/editdistance/EditUnit.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.sentence.editdistance;
2 |
3 | /**
4 | * An edit unit: the atomic element that the edit-distance algorithms insert, delete or substitute.
5 | *
6 | */
7 | public abstract class EditUnit {
8 |
9 | /**
10 | * Returns the string content of this edit unit.
11 | * @return the unit's string
12 | */
13 | public abstract String getUnitString();
14 |
15 | /**
16 | * Substitution cost; by default 0 when the two units are equal
17 | * and 1 otherwise.
18 | */
19 | public double getSubstitutionCost(EditUnit other) {
20 | return this.equals(other) ? 0 : 1;
21 | }
22 |
23 | /**
24 | * Deletion cost; the standard algorithm uses 1.0 and so does this default.
25 | * Concrete edit units may override this method to use a different deletion cost.
26 | * @return the deletion cost
27 | */
28 | public double getDeletionCost() {
29 | return 1.0;
30 | }
31 |
32 | /**
33 | * Insertion cost; the standard algorithm uses 1.0 and so does this default.
34 | * Concrete edit units may override this method to use a different insertion cost.
35 | */
36 | public double getInsertionCost() {
37 | return 1.0;
38 | }
39 | // Two edit units are equal when their unit strings are equal. NOTE(review): hashCode() is not overridden alongside equals().
40 | @Override
41 | public boolean equals(Object other) {
42 | if (!(other instanceof EditUnit))
43 | return false;
44 | return getUnitString().equals(((EditUnit) other).getUnitString());
45 | }
46 | // The string form of a unit is its unit string.
47 | @Override
48 | public String toString() {
49 | return getUnitString();
50 | }
51 |
52 | }
53 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/sentence/editdistance/GregorEditDistance.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.sentence.editdistance;
2 |
3 | /**
4 | * Edit-distance variant proposed by Gregor Leusch that takes block transposition into account.
5 | * Time complexity is O(m^3 n^3) according to the original author.
6 | * For details see Gregor Leusch and Nicola Ueffing, "A Novel String-to-String Distance Measure With
7 | * Application to Machine Translation Evaluation".
8 | * Known issue:
9 | * word similarity can make the sentence-level result unintuitive, e.g. for "什么是计算机病毒" and "电脑病毒是什么"
10 | * the intuitive distance is 2: "什么是计算机病毒" first becomes "计算机病毒什么是" and then "计算机病毒是什么",
11 | * at a cost of 2; in practice, once "什么是计算机病毒" has become "计算机病毒什么是", substituting "什么是" by "是什么" costs only 0.2,
12 | * so no further swap is made and the total edit distance is 1.2.
13 | *
14 | */
15 | public class GregorEditDistance extends EditDistance {
16 |
17 | /** Cost of one block swap */
18 | public static double swapCost = 0.5;
19 | // The two sequences of the current getEditDistance call; stored in fields so that Q can read them.
20 | private SuperString extends EditUnit> S, T;
21 |
22 | /** Memo table for the distance from S(i0..i1) to T(j0..j1), so that each sub-problem is computed only once */
23 | private double[][][][] QArray;
24 | // Resets the memo table to "not computed" (Double.MAX_VALUE) and evaluates Q over the full ranges of S and T.
25 | @Override
26 | public double getEditDistance(SuperString extends EditUnit> S, SuperString extends EditUnit> T) {
27 | this.S = S;
28 | this.T = T;
29 | QArray = new double[S.length()][S.length()][T.length()][T.length()];
30 | for (int i = 0; i < S.length(); i++) {
31 | for (int i2 = 0; i2 < S.length(); i2++)
32 | for (int j = 0; j < T.length(); j++)
33 | for (int j2 = 0; j2 < T.length(); j2++) {
34 | QArray[i][i2][j][j2] = Double.MAX_VALUE;
35 | }
36 | }
37 |
38 | return Q(0, S.length() - 1, 0, T.length() - 1);
39 | }
40 | // Distance from S[i0..i1] to T[j0..j1] (inclusive bounds); an empty range is expressed as i1 < i0 or j1 < j0.
41 | private double Q(int i0, int i1, int j0, int j1) {
42 | double cost = 0;
43 |
44 | if (i1 < i0) { // S range empty: insert every unit of the T range
45 | for (int j = j0; j <= j1; j++) {
46 | cost += T.elementAt(j).getInsertionCost();
47 | }
48 | return cost;
49 | } else if (j1 < j0) { // T range empty: delete every unit of the S range
50 | for (int i = i0; i <= i1; i++) {
51 | cost += S.elementAt(i).getDeletionCost();
52 | }
53 | return cost;
54 | } else if (i1 == i0 && j1 == j0) { // one unit on each side: plain substitution
55 | cost = S.elementAt(i0).getSubstitutionCost(T.elementAt(j0));
56 | QArray[i0][i1][j0][j1] = cost;
57 | return cost;
58 | } else if (i1 == i0) { // single S unit: substitute it for the cheapest T unit and insert the others
59 | double minSubstituteValue = S.elementAt(i0).getSubstitutionCost(T.elementAt(j0)); // seed with a real cost; the old constant 1.0 under-reported costs above 1.0
60 | int minPosJ = j0;
61 | for (int j = j0; j <= j1; j++) {
62 | double subsitituteValue = S.elementAt(i0).getSubstitutionCost(T.elementAt(j));
63 | if (minSubstituteValue > subsitituteValue) {
64 | minSubstituteValue = subsitituteValue;
65 | minPosJ = j;
66 | }
67 | }
68 | for (int j = j0; j <= j1; j++) {
69 | if (j == minPosJ) {
70 | cost += minSubstituteValue;
71 | } else {
72 | cost += T.elementAt(j).getInsertionCost();
73 | }
74 | }
75 | } else if (j1 == j0) { // single T unit: substitute the cheapest S unit for it and delete the others
76 | double minSubstituteValue = S.elementAt(i0).getSubstitutionCost(T.elementAt(j0)); // seed with a real cost, as in the branch above
77 | int minPosI = i0;
78 | for (int i = i0; i <= i1; i++) {
79 | double subsitituteValue = S.elementAt(i).getSubstitutionCost(T.elementAt(j0));
80 | if (minSubstituteValue > subsitituteValue) {
81 | minSubstituteValue = subsitituteValue;
82 | minPosI = i;
83 | }
84 | }
85 | for (int i = i0; i <= i1; i++) {
86 | if (i == minPosI) {
87 | cost += minSubstituteValue;
88 | } else {
89 | cost += S.elementAt(i).getDeletionCost();
90 | }
91 | }
92 | } else { // general case: try every split point, with and without swapping the two T parts
93 | if (QArray[i0][i1][j0][j1] < Double.MAX_VALUE) {
94 | return QArray[i0][i1][j0][j1];
95 | }
96 | for (int i = i0; i < i1; i++) {
97 | for (int j = j0; j < j1; j++) {
98 | double c = Math.min(Q(i0, i, j0, j) + Q(i + 1, i1, j + 1, j1),
99 | Q(i0, i, j + 1, j1) + Q(i + 1, i1, j0, j) + swapCost);
100 | if (c < QArray[i0][i1][j0][j1]) {
101 | QArray[i0][i1][j0][j1] = c;
102 | }
103 | }
104 | }
105 | return QArray[i0][i1][j0][j1];
106 | }
107 | QArray[i0][i1][j0][j1] = cost;
108 | return cost;
109 | }
110 | // Small manual demo on two character sequences.
111 | public static void main(String[] argv) {
112 | String s1 = "abcxdef";
113 | String s2 = "defxabc";
114 | //String s2 = "我的密码我忘记了,我该怎样做呢?";
115 | GregorEditDistance ed = new GregorEditDistance();
116 | System.out.println(ed.getEditDistance(SuperString.createCharSuperString(s1),
117 | SuperString.createCharSuperString(s2)));
118 | }
119 |
120 | }
121 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/sentence/editdistance/Split.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.sentence.editdistance;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 |
6 | public class Split {
7 |
8 | public static boolean MERGE_FLAG = true;
9 |
10 | @SuppressWarnings({ "unchecked", "rawtypes" })
11 | public static Object[] split(SuperString extends EditUnit> X, SuperString extends EditUnit> Y) {
12 | Block extends EditUnit> LX = new Block(X);
13 | Block extends EditUnit> LY = new Block(Y);
14 | split(LX, LY);
15 | while (LY.getPrev() != null) {
16 | LY = LY.getPrev();
17 | }
18 | while (LX.getPrev() != null) {
19 | LX = LX.getPrev();
20 | }
21 | List first = new ArrayList();
22 | List second = new ArrayList();
23 | while (LX != null) {
24 | first.add(new ChunkEditUnit(LX.getData()));
25 | LX = LX.getNext();
26 | }
27 |
28 | while (LY != null) {
29 | second.add(new ChunkEditUnit(LY.getData()));
30 | LY = LY.getNext();
31 | }
32 | SuperString s1 = new SuperString(first);
33 | SuperString s2 = new SuperString(second);
34 | Object[] obj = new Object[] { s1, s2 };
35 | return obj;
36 | }
37 |
38 | private static void split(Block> bx, Block> LY) {
39 | LCS maxLCS = null;
40 | Block> by = LY;
41 | while (by.getPrev() != null) {
42 | by = by.getPrev();
43 | }
44 | Block> maxMatchedBy = by;
45 | while (by != null) {
46 | if (by.isDivideFlag()) {
47 | by = by.getNext();
48 | continue;
49 | }
50 |
51 | LCS lcs = LCS.parse(bx.getData(), by.getData());
52 | if (maxLCS == null || maxLCS.length < lcs.length) {
53 | maxLCS = lcs;
54 | maxMatchedBy = by;
55 | }
56 |
57 | by = by.getNext();
58 | }
59 |
60 | if (maxLCS != null && maxLCS.length > 0) {
61 | bx.divide(maxLCS.x_pos, maxLCS.length);
62 | maxMatchedBy.divide(maxLCS.y_pos, maxLCS.length);
63 | }
64 |
65 | if (bx.getPrev() != null && !bx.isDivideFlag()) {
66 | split(bx.getPrev(), LY);
67 | }
68 |
69 | if (bx.getNext() != null && !bx.getNext().isDivideFlag()) {
70 | split(bx.getNext(), LY);
71 | }
72 | }
73 |
74 | /**
75 | * Longest Common String
76 | * @author Gavin
77 | *
78 | */
79 | public static class LCS {
80 | public int length = 0; //LCS匹配的最长结果
81 | public int x_pos = 0; //LCS匹配的X的位置
82 | public int y_pos = 0; //LCS匹配的Y的位置
83 |
84 | public static LCS parse(SuperString> X, SuperString> Y) {
85 | LCS lcs = new LCS();
86 | for (int start = 0; start < X.length(); start++) {
87 | for (int end = start + 1; end <= X.length(); end++) {
88 | SuperString> tempX = X.substring(start, end);
89 |
90 | int pos = Y.indexOf(tempX);
91 | if (pos >= 0 && tempX.length() > lcs.length) {
92 | lcs.length = tempX.length();
93 | lcs.x_pos = start;
94 | lcs.y_pos = pos;
95 | }
96 | }
97 | }
98 | return lcs;
99 | }
100 |
101 | @Override
102 | public String toString() {
103 | return "length=" + length + ", x_pos=" + x_pos + ", y_pos=" + y_pos;
104 | }
105 | }
106 |
107 | public static void main(String[] args) {
108 | String s1 = "abcdefghijkabc";
109 | String s2 = "cdefghijklabccc";
110 | // s2 = "fgabcdehijklkdslfkasdflak";
111 | // s1 = "abcdefgxyzoxyjasdkfjjjaldsfa";
112 | // s1 = "I like the book";
113 | // s2 = "the book I like";
114 | s1 = "什么是计算机病毒";
115 | s2 = "电脑病毒是什么";
116 |
117 | // SuperString ss1 = SuperString.createCharSuperString(s1);
118 | // SuperString ss2 = SuperString.createCharSuperString(s2);
119 |
120 | SuperString ss1 = SuperString.createWordSuperString(s1);
121 | SuperString ss2 = SuperString.createWordSuperString(s2);
122 | Split.split(ss1, ss2);
123 | // LCS lcs = LCS.parse(ss1, ss2);
124 | // System.out.println(lcs);
125 | }
126 |
127 | }
128 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/sentence/editdistance/StandardEditDistance.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.sentence.editdistance;
2 |
3 | /**
4 | * Sentence similarity for Chinese based on the standard (dynamic programming) edit distance.
5 | *
6 | */
7 | public class StandardEditDistance extends EditDistance {
8 |
9 | /**
10 | * Computes the edit distance between two sequences of edit units, using each unit's own insertion, deletion and substitution costs.
11 | * @param X the first sequence
12 | * @param Y the second sequence
13 | * @return the edit distance from X to Y
14 | */
15 | @Override
16 | public double getEditDistance(SuperString extends EditUnit> X, SuperString extends EditUnit> Y) {
17 | double[][] D; // edit (cost) matrix, indexed as D[position in Y][position in X]
18 |
19 | int m = X.length(); // length of X
20 | int n = Y.length(); // length of Y
21 | //char ch_x_i; // i-th unit of X
22 | //char ch_y_j; // j-th unit of Y
23 | // X empty: the distance is the cost of inserting all of Y; Y empty: the cost of deleting all of X
24 | if (m == 0) {
25 | double distance = 0.0;
26 | for (int j = 0; j < n; j++) {
27 | distance += Y.elementAt(j).getInsertionCost();
28 | }
29 | return distance;
30 | } else if (n == 0) {
31 | double distance = 0.0;
32 | for (int i = 0; i < m; i++) {
33 | distance += X.elementAt(i).getDeletionCost();
34 | }
35 | return distance;
36 | }
37 |
38 | D = new double[n + 1][m + 1];
39 | D[0][0] = 0.0; // empty prefix to empty prefix costs nothing
40 |
41 | /** Initialise D[0][j]: delete the first j units of X */
42 | for (int j = 1; j <= m; j++) {
43 | D[0][j] = D[0][j - 1] + X.elementAt(j - 1).getDeletionCost();
44 | }
45 |
46 | /** Initialise D[i][0]: insert the first i units of Y */
47 | for (int i = 1; i <= n; i++) {
48 | D[i][0] = D[i - 1][0] + Y.elementAt(i - 1).getInsertionCost();
49 | }
50 | // Fill the matrix: each cell is the cheapest of insertion, deletion and substitution.
51 | for (int i = 1; i <= m; i++) {
52 | EditUnit unit_x_i = X.elementAt(i - 1);
53 | for (int j = 1; j <= n; j++) {
54 | EditUnit unit_y_j = Y.elementAt(j - 1);
55 | double cost = unit_x_i.getSubstitutionCost(unit_y_j);
56 | D[j][i] = Math.min(D[j - 1][i] + Y.elementAt(j - 1).getInsertionCost(), D[j][i - 1]
57 | + X.elementAt(i - 1).getDeletionCost());
58 | D[j][i] = Math.min(D[j][i], D[j - 1][i - 1] + cost);
59 | }
60 | }
61 |
62 | return D[n][m];
63 | }
64 | // Small manual demo: prints the character-level and the word-level distance of two sentences.
65 | public static void main(String[] args) {
66 | String s1 = "abcdefg";
67 | String s2 = "gcdefab";
68 |
69 | StandardEditDistance ed = new StandardEditDistance();
70 | s1 = "什么是计算机病毒";
71 | s2 = "什么是电脑病毒";
72 | System.out.println(ed.getEditDistance(SuperString.createCharSuperString(s1),
73 | SuperString.createCharSuperString(s2)));
74 | System.out.println(ed.getEditDistance(SuperString.createWordSuperString(s1),
75 | SuperString.createWordSuperString(s2)));
76 | }
77 |
78 | }
79 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/sentence/editdistance/SuperString.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.sentence.editdistance;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 |
6 | import zx.soft.similarity.sentence.SegmentProxy;
7 | import zx.soft.similarity.sentence.SegmentProxy.Word;
8 |
9 | /**
10 | * A "super string": a sequence whose elements are edit units of a specified type instead of plain characters.
11 | *
12 | */
13 | public class SuperString {
14 | // The units of this string, in order; the list passed to the constructor is used directly, not copied.
15 | private List contents = new ArrayList();
16 |
17 | public SuperString(List contents) {
18 | this.contents = contents;
19 | }
20 | // Builds a SuperString with one CharEditUnit per character of str.
21 | public static SuperString createCharSuperString(String str) {
22 | List list = new ArrayList(str.length());
23 | for (int i = 0; i < str.length(); i++) {
24 | list.add(new CharEditUnit(str.charAt(i)));
25 | }
26 | SuperString s = new SuperString(list);
27 | return s;
28 | }
29 | // Builds a SuperString with one WordEditUnit per word produced by SegmentProxy.segment(sentence).
30 | public static SuperString createWordSuperString(String sentence) {
31 | List wordList = SegmentProxy.segment(sentence);
32 | List unitList = new ArrayList(wordList.size());
33 | for (int i = 0; i < wordList.size(); i++) {
34 | unitList.add(new WordEditUnit(wordList.get(i)));
35 | }
36 | SuperString s = new SuperString(unitList);
37 | return s;
38 | }
39 | // Returns the unit at pos; throws ArrayIndexOutOfBoundsException when pos is outside [0, length()).
40 | public T elementAt(int pos) {
41 | if (pos < 0 || pos >= contents.size()) {
42 | throw new ArrayIndexOutOfBoundsException("下标越界");
43 | }
44 | return contents.get(pos);
45 | }
46 | // Index of the first occurrence of substring (units compared with equals), or -1 when it does not occur.
47 | public int indexOf(SuperString> substring) {
48 | int result = -1;
49 | for (int i = 0; i < length(); i++) {
50 | int j = 0;
51 | if (i + substring.length() > length())
52 | return -1;
53 |
54 | for (; j < substring.length(); j++) {
55 | if (elementAt(i + j).equals(substring.elementAt(j))) {
56 | continue;
57 | } else {
58 | break;
59 | }
60 | }
61 | if (j == substring.length()) {
62 | return i;
63 | }
64 | }
65 | return result;
66 | }
67 | // Units fromIndex (inclusive) to toIndex (exclusive); backed by a subList view of this string's list.
68 | public SuperString substring(int fromIndex, int toIndex) {
69 | return new SuperString(contents.subList(fromIndex, toIndex));
70 | }
71 | // Units from fromIndex to the end; backed by a subList view of this string's list.
72 | public SuperString substring(int fromIndex) {
73 | return new SuperString(contents.subList(fromIndex, contents.size()));
74 | }
75 | // Number of units in this string.
76 | public int length() {
77 | return contents.size();
78 | }
79 | // Concatenation of the string forms of all units.
80 | @Override
81 | public String toString() {
82 | StringBuilder sb = new StringBuilder();
83 | for (int i = 0; i < length(); i++) {
84 | sb.append(elementAt(i));
85 | }
86 | return sb.toString();
87 | }
88 | // Equality compares string forms; null is never equal (previously equals(null) threw NullPointerException).
89 | @Override
90 | public boolean equals(Object other) {
91 | return other != null && toString().equals(other.toString());
92 | }
93 | @Override public int hashCode() { return toString().hashCode(); } // keeps hashCode consistent with the toString-based equals
94 | }
95 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/sentence/editdistance/WordEditUnit.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.sentence.editdistance;
2 |
3 | import zx.soft.similarity.sentence.SegmentProxy.Word;
4 | import zx.soft.similarity.word.hownet2.concept.XiaConceptParser;
5 |
6 | public class WordEditUnit extends EditUnit {
7 | // Edit unit wrapping one segmented word together with its part-of-speech code.
8 | private Word word = null;
9 |
10 | public WordEditUnit(Word word) {
11 | this.word = word;
12 | }
13 | // The unit string is the text of the word.
14 | @Override
15 | public String getUnitString() {
16 | return word.getWord();
17 | }
18 |
19 | /**
20 | * Substitution cost derived from word similarity: 1.0 against a non-word unit or a word with a different part of speech, 0.0 against an equal unit, otherwise 1 - similarity.
21 | */
22 | @Override
23 | public double getSubstitutionCost(EditUnit otherUnit) {
24 | if (!(otherUnit instanceof WordEditUnit))
25 | return 1.0;
26 | if (equals(otherUnit))
27 | return 0.0;
28 |
29 | WordEditUnit other = (WordEditUnit) otherUnit;
30 | // different part of speech: cost is 1.0 (compared by value; the old != compared String references)
31 | if (!java.util.Objects.equals(word.getPos(), other.word.getPos())) {
32 | return 1.0;
33 | }
34 | return 1 - XiaConceptParser.getInstance().getSimilarity(getUnitString(), other.getUnitString());
35 | }
36 | // Two word units are equal when they share the part of speech and their similarity exceeds 0.85; hashCode() is not overridden.
37 | @Override
38 | public boolean equals(Object other) {
39 | if (!(other instanceof WordEditUnit))
40 | return false;
41 | WordEditUnit otherUnit = (WordEditUnit) other;
42 | Word otherWord = otherUnit.word;
43 | // different part of speech: never equal (compared by value; the old != compared String references)
44 | if (!java.util.Objects.equals(word.getPos(), otherWord.getPos())) {
45 | return false;
46 | }
47 | double sim = XiaConceptParser.getInstance().getSimilarity(getUnitString(), otherUnit.getUnitString());
48 | return sim > 0.85;
49 | }
50 |
51 | }
52 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/sentence/editdistance/XiatianEditDistance.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.sentence.editdistance;
2 |
3 | /**
4 | * A new edit-distance algorithm that also supports swapping blocks that are not adjacent.
5 | *
6 | */
7 | public class XiatianEditDistance extends EditDistance {
8 |
9 | /** Cost of one block swap */
10 | public static double swapCost = 0.5;
11 | // The two sequences of the current getEditDistance call, and the memo table indexed [i1][im][j1][jn].
12 | private SuperString extends EditUnit> S, T;
13 | private double[][][][] QArray;
14 | // Resets the memo table to "not computed" (Double.MAX_VALUE) and evaluates Q over the full ranges of S and T.
15 | @Override
16 | public double getEditDistance(SuperString extends EditUnit> S, SuperString extends EditUnit> T) {
17 | this.S = S;
18 | this.T = T;
19 | QArray = new double[S.length() + 1][S.length() + 1][T.length() + 1][T.length() + 1];
20 | for (int i = 0; i <= S.length(); i++) {
21 | for (int i2 = 0; i2 <= S.length(); i2++)
22 | for (int j = 0; j <= T.length(); j++)
23 | for (int j2 = 0; j2 <= T.length(); j2++) {
24 | QArray[i][i2][j][j2] = Double.MAX_VALUE;
25 | }
26 | }
27 | return Q(0, S.length() - 1, 0, T.length() - 1);
28 | }
29 | // Distance from S[i1..im] to T[j1..jn] (inclusive bounds). An upper bound of -1 occurs only for an empty S or T at the top level and must not index the memo table.
30 | private double Q(int i1, int im, int j1, int jn) {
31 | if (im >= 0 && jn >= 0 && QArray[i1][im][j1][jn] < Double.MAX_VALUE) {
32 | return QArray[i1][im][j1][jn];
33 | }
34 | double cost = 0;
35 | if (im < i1) { // S range empty: insert every unit of the T range
36 | for (int j = j1; j <= jn; j++) {
37 | cost += T.elementAt(j).getInsertionCost();
38 | }
39 | } else if (jn < j1) { // T range empty: delete every unit of the S range
40 | for (int i = i1; i <= im; i++) {
41 | cost += S.elementAt(i).getDeletionCost();
42 | }
43 | } else if (im == i1 && jn == j1) { // one unit on each side: plain substitution
44 | cost = S.elementAt(i1).getSubstitutionCost(T.elementAt(j1));
45 | } else if (i1 == im) { // single S unit: substitute it for the cheapest T unit and insert the others
46 | double minSubValue = S.elementAt(i1).getSubstitutionCost(T.elementAt(j1));
47 | int minPosJ = j1;
48 | for (int j = j1 + 1; j <= jn; j++) {
49 | double subValue = S.elementAt(i1).getSubstitutionCost(T.elementAt(j));
50 | if (minSubValue > subValue) {
51 | minSubValue = subValue;
52 | minPosJ = j;
53 | }
54 | }
55 | for (int j = j1; j <= jn; j++) {
56 | if (j == minPosJ) {
57 | cost += minSubValue;
58 | } else {
59 | cost += T.elementAt(j).getInsertionCost();
60 | }
61 | }
62 | } else if (j1 == jn) { // single T unit: substitute the cheapest S unit for it and delete the others
63 | int minPosI = i1;
64 | double minSubValue = S.elementAt(i1).getSubstitutionCost(T.elementAt(j1));
65 | for (int i = i1 + 1; i <= im; i++) {
66 | double subValue = S.elementAt(i).getSubstitutionCost(T.elementAt(j1));
67 | if (minSubValue > subValue) {
68 | minSubValue = subValue;
69 | minPosI = i;
70 | }
71 | }
72 | for (int i = i1; i <= im; i++) {
73 | if (i == minPosI) {
74 | cost += minSubValue;
75 | } else {
76 | cost += S.elementAt(i).getDeletionCost();
77 | }
78 | }
79 | } else {
80 | cost = QArray[i1][im][j1][jn];
81 | loop: for (int i = i1; i < im; i++) {
82 | // divide the X range into three parts
83 | for (int LX = 0; LX <= im - i; LX++) {
84 | // divide the Y range in the same way
85 | for (int j = j1; j < jn; j++) {
86 | //if(cost<=swapCost)break;
87 | for (int LY = 0; LY <= jn - j; LY++) {
88 | // cost without swapping the outer parts
89 | double cost1 = Q(i1, i, j1, j) + Q(i + 1, i + LX, j + 1, j + LY)
90 | + Q(i + LX + 1, im, j + LY + 1, jn);
91 | // cost with the outer parts swapped
92 | double cost2 = Q(i1, i, j + LY + 1, jn) + Q(i + 1, i + LX, j + 1, j + LY)
93 | + Q(i + LX + 1, im, j1, j) + swapCost;
94 | cost = Math.min(Math.min(cost1, cost2), cost);
95 | if (cost == 0)
96 | break loop;
97 | }
98 | }
99 | }
100 | }
101 | }
102 | // Memoise only ranges with valid indices (see the note on Q above).
103 | if (im >= 0 && jn >= 0) QArray[i1][im][j1][jn] = cost;
104 | return cost;
105 | }
106 | // Small manual demo on two character sequences.
107 | public static void main(String[] argv) {
108 | EditDistance ed = new XiatianEditDistance();
109 | String s1 = "abcxdef";
110 | String s2 = "def";
111 | //String s2 = "我的密码我忘记了,我该怎样做呢?";
112 | System.out.println(ed.getEditDistance(SuperString.createCharSuperString(s1),
113 | SuperString.createCharSuperString(s2)));
114 | }
115 |
116 | }
117 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/sentence/editdistance/XiatianEditDistance2.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.sentence.editdistance;
2 |
3 | /**
4 | * 夏天提出的新的支持非相邻块交互的编辑距离算法
5 | *
6 | */
7 | public class XiatianEditDistance2 extends EditDistance {
8 |
9 | /** 块交换代价 */
10 | private final double swapCost = 1.0;
11 |
12 | private SuperString extends EditUnit> S, T;
13 | private double[][][][] QArray;
14 |
15 | @Override
16 | @SuppressWarnings("unchecked")
17 | public double getEditDistance(SuperString extends EditUnit> S1, SuperString extends EditUnit> T1) {
18 | Object[] array = Split.split(S1, T1);
19 | this.S = (SuperString extends EditUnit>) array[0];
20 | this.T = (SuperString extends EditUnit>) array[1];
21 | QArray = new double[S.length() + 1][S.length() + 1][T.length() + 1][T.length() + 1];
22 | for (int i = 0; i <= S.length(); i++) {
23 | for (int i2 = 0; i2 <= S.length(); i2++)
24 | for (int j = 0; j <= T.length(); j++)
25 | for (int j2 = 0; j2 <= T.length(); j2++) {
26 | QArray[i][i2][j][j2] = Double.MAX_VALUE;
27 | }
28 | }
29 | return Q(0, S.length() - 1, 0, T.length() - 1);
30 | }
31 |
32 | private double Q(int i1, int im, int j1, int jn) {
33 | if (QArray[i1][im][j1][jn] < Double.MAX_VALUE) {
34 | return QArray[i1][im][j1][jn];
35 | }
36 | double cost = 0;
37 | if (im < i1) {
38 | for (int j = j1; j <= jn; j++) {
39 | cost += T.elementAt(j).getInsertionCost();
40 | }
41 | } else if (jn < j1) {
42 | for (int i = i1; i <= im; i++) {
43 | cost += S.elementAt(i).getDeletionCost();
44 | }
45 | } else if (im == i1 && jn == j1) {
46 | cost = S.elementAt(i1).getSubstitutionCost(T.elementAt(j1));
47 | } else if (i1 == im) {
48 | double minSubValue = S.elementAt(i1).getSubstitutionCost(T.elementAt(j1));
49 | int minPosJ = j1;
50 | for (int j = j1 + 1; j <= jn; j++) {
51 | double subValue = S.elementAt(i1).getSubstitutionCost(T.elementAt(j));
52 | if (minSubValue > subValue) {
53 | minSubValue = subValue;
54 | minPosJ = j;
55 | }
56 | }
57 | for (int j = j1; j <= jn; j++) {
58 | if (j == minPosJ) {
59 | cost += minSubValue;
60 | } else {
61 | cost += T.elementAt(j).getInsertionCost();
62 | }
63 | }
64 | } else if (j1 == jn) {
65 | int minPosI = i1;
66 | double minSubValue = S.elementAt(i1).getSubstitutionCost(T.elementAt(j1));
67 | for (int i = i1 + 1; i <= im; i++) {
68 | double subValue = S.elementAt(i).getSubstitutionCost(T.elementAt(j1));
69 | if (minSubValue > subValue) {
70 | minSubValue = subValue;
71 | minPosI = i;
72 | }
73 | }
74 | for (int i = i1; i <= im; i++) {
75 | if (i == minPosI) {
76 | cost += minSubValue;
77 | } else {
78 | cost += S.elementAt(i).getDeletionCost();
79 | }
80 | }
81 | } else {
82 | cost = QArray[i1][im][j1][jn];
83 | loop: for (int i = i1; i < im; i++) {
84 | //block X divide to 3 parts.
85 | for (int LX = 0; LX <= im - i; LX++) {
86 | //process Y sentence
87 | for (int j = j1; j < jn; j++) {
88 | //if(cost<=swapCost)break;
89 | for (int LY = 0; LY <= jn - j; LY++) {
90 | //不交换的代价
91 | double cost1 = Q(i1, i, j1, j) + Q(i + 1, i + LX, j + 1, j + LY)
92 | + Q(i + LX + 1, im, j + LY + 1, jn);
93 | //交互代价
94 | double cost2 = Q(i1, i, j + LY + 1, jn) + Q(i + 1, i + LX, j + 1, j + LY)
95 | + Q(i + LX + 1, im, j1, j) + swapCost;
96 | cost = Math.min(Math.min(cost1, cost2), cost);
97 | if (cost == 0)
98 | break loop;
99 | }
100 | }
101 | }
102 | }
103 | }
104 |
105 | QArray[i1][im][j1][jn] = cost;
106 | return cost;
107 | }
108 |
109 | public static void main(String[] argv) {
110 | EditDistance ed = new XiatianEditDistance2();
111 | String s1 = "abcxdef";
112 | String s2 = "def";
113 | //String s2 = "我的密码我忘记了,我该怎样做呢?";
114 | System.out.println(ed.getEditDistance(SuperString.createCharSuperString(s1),
115 | SuperString.createCharSuperString(s2)));
116 | }
117 |
118 | }
119 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/sentence/morphology/MorphoSimilarity.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.sentence.morphology;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 |
6 | import org.slf4j.Logger;
7 | import org.slf4j.LoggerFactory;
8 |
9 | import zx.soft.similarity.sentence.SegmentProxy;
10 | import zx.soft.similarity.sentence.SegmentProxy.Word;
11 | import zx.soft.similarity.sentence.SentenceSimilarity;
12 | import zx.soft.similarity.word.WordSimilarity;
13 | import zx.soft.similarity.word.hownet2.concept.XiaConceptParser;
14 |
15 | /**
16 | * 基于词形和词序的句子相似度计算算法,考虑了语义因素
17 | * 《中文信息相似度计算理论与方法》5.4.3小节所介绍的方法,在考虑语义时,
18 | * 无法直接获取OnceWS(A, B),因此,采用了两两匹配取最大值的方式。
19 | * 新的改进算法请参考{@code SemanticSimilarity}
20 | *
21 | */
22 | public class MorphoSimilarity implements SentenceSimilarity {
23 |
24 | private static Logger logger = LoggerFactory.getLogger(MorphoSimilarity.class);
25 |
26 | /** 词形相似度占总相似度的比重 */
27 | private final double LAMBDA1 = 1.0;
28 | /** 词序相似度占总相似度的比重 */
29 | private final double LAMBDA2 = 0.0;
30 | /** 词语相似度的计算 */
31 | private WordSimilarity wordSimilarity = null;
32 |
33 | private static String FILTER_CHARS = " ,。;?《》()|!,.;?<>|_^…!";
34 |
35 | private static MorphoSimilarity instance = null;
36 |
37 | public static MorphoSimilarity getInstance() {
38 | if (instance == null) {
39 | instance = new MorphoSimilarity();
40 | }
41 | return instance;
42 | }
43 |
44 | private MorphoSimilarity() {
45 | logger.debug("used hownet wordsimilarity.");
46 | this.wordSimilarity = XiaConceptParser.getInstance();
47 | //this.segmenter = SegmentFactory.getInstance().getParser();
48 | }
49 |
50 | /**
51 | * 滤掉词串中的空格、标点符号
52 | * @param word_list
53 | * @return
54 | */
55 | private String[] filter(String[] word_list) {
56 | List results = new ArrayList<>();
57 | for (String w : word_list) {
58 | if (!FILTER_CHARS.contains(w)) {
59 | results.add(w.toLowerCase());
60 | }
61 | }
62 |
63 | return results.toArray(new String[results.size()]);
64 | }
65 |
66 | /**
67 | * 计算两个句子的相似度
68 | * @see zx.soft.similarity.Similaritable
69 | */
70 | @Override
71 | public double getSimilarity(String firstSen, String secondSen) {
72 | //LOG.debug(segmenter.segmentToString(firstSen));
73 | //LOG.debug(segmenter.segmentToString(secondSen));
74 | String[] firstList = filter(segment(firstSen));
75 | String[] secondList = filter(segment(secondSen));
76 |
77 | double wordSim = getOccurrenceSimilarity(firstList, secondList);
78 | //LOG.debug("词形相似度="+wordSim);
79 |
80 | double orderSim = getOrderSimilarity(firstList, secondList);
81 | //LOG.debug("词序相似度="+orderSim);
82 |
83 | return LAMBDA1 * wordSim + LAMBDA2 * orderSim;
84 | }
85 |
86 | /**
87 | * 获取两个集合的词形相似度, 同时获取相对于第一个句子中的词语顺序,第二个句子词语的顺序变化次数
88 | * @param firstList
89 | * @param secondList
90 | * @return
91 | */
92 | public double getOccurrenceSimilarity(String[] firstList, String[] secondList) {
93 | int max = firstList.length > secondList.length ? firstList.length : secondList.length;
94 | if (max == 0) {
95 | return 0;
96 | }
97 |
98 | //首先计算出所有可能的组合
99 | double[][] scores = new double[max][max];
100 | for (int i = 0; i < firstList.length; i++) {
101 | for (int j = 0; j < secondList.length; j++) {
102 | scores[i][j] = wordSimilarity.getSimilarity(firstList[i], secondList[j]);
103 | }
104 | }
105 |
106 | double total_score = 0;
107 |
108 | //从scores[][]中挑选出最大的一个相似度,然后减去该元素,进一步求剩余元素中的最大相似度
109 | while (scores.length > 0) {
110 | double max_score = 0;
111 | int max_row = 0;
112 | int max_col = 0;
113 |
114 | //先挑出相似度最大的一对:
115 | for (int i = 0; i < scores.length; i++) {
116 | for (int j = 0; j < scores.length; j++) {
117 | if (max_score < scores[i][j]) {
118 | max_row = i;
119 | max_col = j;
120 | max_score = scores[i][j];
121 | }
122 | }
123 | }
124 |
125 | //从数组中去除最大的相似度,继续挑选
126 | double[][] tmp_scores = new double[scores.length - 1][scores.length - 1];
127 | for (int i = 0; i < scores.length; i++) {
128 | if (i == max_row)
129 | continue;
130 | for (int j = 0; j < scores.length; j++) {
131 | if (j == max_col)
132 | continue;
133 | int tmp_i = max_row > i ? i : i - 1;
134 | int tmp_j = max_col > j ? j : j - 1;
135 | tmp_scores[tmp_i][tmp_j] = scores[i][j];
136 | }
137 | }
138 | total_score += max_score;
139 | scores = tmp_scores;
140 | }
141 |
142 | return (2 * total_score) / (firstList.length + secondList.length);
143 | }
144 |
145 | /**
146 | * 获取两个集合的词序相似度
147 | * @param firstList
148 | * @param secondList
149 | * @return
150 | */
151 | public double getOrderSimilarity(String[] firstList, String[] secondList) {
152 | double similarity = 0.0;
153 |
154 | return similarity;
155 | }
156 |
157 | // @SuppressWarnings("unchecked")
158 | // public String[] segment(String sentence){
159 | // MPWordSegment ws = new MPWordSegment();
160 | // ws.parseReader(new StringReader(sentence));
161 | // Vector tokens = ws.getTokens();
162 | // String[] results = new String[tokens.size()];
163 | // for(int i=0; i list = SegmentProxy.segment(sentence);
173 | String[] results = new String[list.size()];
174 | for (int i = 0; i < list.size(); i++) {
175 | results[i] = list.get(i).getWord();
176 | }
177 | return results;
178 | }
179 |
180 | }
181 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/sentence/morphology/SemanticSimilarity.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.sentence.morphology;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 |
6 | import org.slf4j.Logger;
7 | import org.slf4j.LoggerFactory;
8 |
9 | import zx.soft.similarity.sentence.SegmentProxy;
10 | import zx.soft.similarity.sentence.SegmentProxy.Word;
11 | import zx.soft.similarity.sentence.SentenceSimilarity;
12 | import zx.soft.similarity.word.WordSimilarity;
13 | import zx.soft.similarity.word.hownet2.concept.XiaConceptParser;
14 |
15 | /**
16 | * 《中文信息相似度计算理论与方法》5.4.3小节所介绍的基于词形和词序的句子相似度计算算法
17 | * 在考虑语义时,无法直接获取OnceWS(A, B),为此,通过记录两个句子的词语匹配对中相似度
18 | * 大于某一阈值的词语对最为相同词语,计算次序相似度。
19 | *
20 | *
21 | */
22 | public class SemanticSimilarity implements SentenceSimilarity {
23 |
24 | private static Logger logger = LoggerFactory.getLogger(SemanticSimilarity.class);
25 |
26 | /** 词形相似度占总相似度的比重 */
27 | private final double LAMBDA1 = 0.8;
28 | /** 词序相似度占总相似度的比重 */
29 | private final double LAMBDA2 = 0.2;
30 |
31 | /** 如果两个词语的相似度大于了该阈值, 则作为相同词语,计算词序相似度 */
32 | private final double GAMMA = 0.6;
33 |
34 | /** 词语相似度的计算 */
35 | private WordSimilarity wordSimilarity = null;
36 |
37 | private static String FILTER_CHARS = " ,。;?《》()|!,.;?<>|_^…!";
38 |
39 | private static SemanticSimilarity instance = null;
40 |
41 | public static SemanticSimilarity getInstance() {
42 | if (instance == null) {
43 | instance = new SemanticSimilarity();
44 | }
45 | return instance;
46 | }
47 |
48 | private SemanticSimilarity() {
49 | logger.debug("used hownet wordsimilarity.");
50 | this.wordSimilarity = XiaConceptParser.getInstance();
51 | //this.segmenter = SegmentFactory.getInstance().getParser();
52 | }
53 |
54 | /**
55 | * 滤掉词串中的空格、标点符号
56 | * @param word_list
57 | * @return
58 | */
59 | private String[] filter(String[] word_list) {
60 | List results = new ArrayList();
61 | for (String w : word_list) {
62 | if (!FILTER_CHARS.contains(w)) {
63 | results.add(w.toLowerCase());
64 | }
65 | }
66 |
67 | return results.toArray(new String[results.size()]);
68 | }
69 |
70 | /**
71 | * 计算两个句子的相似度
72 | * @see zx.soft.similarity.Similaritable
73 | */
74 | @Override
75 | public double getSimilarity(String firstSen, String secondSen) {
76 | //LOG.debug(segmenter.segmentToString(firstSen));
77 | //LOG.debug(segmenter.segmentToString(secondSen));
78 | String[] firstList = filter(segment(firstSen));
79 | String[] secondList = filter(segment(secondSen));
80 |
81 | return calculate(firstList, secondList);
82 | }
83 |
84 | /**
85 | * 获取两个集合的词形相似度, 同时获取相对于第一个句子中的词语顺序,第二个句子词语的顺序变化次数
86 | * @param firstList
87 | * @param secondList
88 | * @return
89 | */
90 | public double calculate(String[] firstList, String[] secondList) {
91 | if (firstList.length == 0 || secondList.length == 0) {
92 | return 0;
93 | }
94 |
95 | //首先计算出所有可能的组合
96 | double[][] scores = new double[firstList.length][secondList.length];
97 |
98 | //代表第1个句子对应位置是否已经被使用, 默认为未使用,即false
99 | boolean[] firstFlags = new boolean[firstList.length];
100 |
101 | //代表第2个句子对应位置是否已经被使用, 默认为未使用,即false
102 | boolean[] secondFlags = new boolean[secondList.length];
103 |
104 | //PSecond的定义参见书中5.4.3节, 为避免无必要的初始化数组,
105 | //数组中0值表示在第一个句子中没有对应的相似词语,大于0的值
106 | //则表示在第一个句子中的位置(从1开始编号了)
107 | int[] PSecond = new int[secondList.length];
108 |
109 | for (int i = 0; i < firstList.length; i++) {
110 | //firstFlags[i] = false;
111 | for (int j = 0; j < secondList.length; j++) {
112 | scores[i][j] = wordSimilarity.getSimilarity(firstList[i], secondList[j]);
113 | }
114 | }
115 |
116 | double total_score = 0;
117 |
118 | //从scores[][]中挑选出最大的一个相似度,然后减去该元素(通过Flags数组表示),进一步求剩余元素中的最大相似度
119 | while (true) {
120 | double max_score = 0;
121 | int max_row = -1;
122 | int max_col = -1;
123 |
124 | //先挑出相似度最大的一对:
125 | for (int i = 0; i < scores.length; i++) {
126 | if (firstFlags[i])
127 | continue;
128 | for (int j = 0; j < scores.length; j++) {
129 | if (secondFlags[j])
130 | continue;
131 |
132 | if (max_score < scores[i][j]) {
133 | max_row = i;
134 | max_col = j;
135 | max_score = scores[i][j];
136 | }
137 | }
138 | }
139 |
140 | if (max_row >= 0) {
141 | total_score += max_score;
142 | firstFlags[max_row] = true;
143 | secondFlags[max_col] = true;
144 | if (max_score >= GAMMA) {
145 | PSecond[max_col] = max_row + 1;
146 | }
147 | } else {
148 | break;
149 | }
150 | }
151 |
152 | double wordSim = (2 * total_score) / (firstList.length + secondList.length);
153 |
154 | int previous = 0;
155 | int revOrdCount = 0;
156 | int onceWSSize = 0;
157 | for (int i = 0; i < PSecond.length; i++) {
158 | if (PSecond[i] > 0) {
159 | onceWSSize++;
160 | if (previous > 0 && (previous > PSecond[i])) {
161 | revOrdCount++;
162 | }
163 | previous = PSecond[i];
164 | }
165 | }
166 |
167 | double ordSim = 0;
168 | if (onceWSSize == 1) {
169 | ordSim = 1;
170 | } else if (onceWSSize == 0) {
171 | ordSim = 0;
172 | } else {
173 | ordSim = 1.0 - revOrdCount * 1.0 / (onceWSSize - 1);
174 | }
175 |
176 | System.out.println("wordSim ==> " + wordSim + ", ordSim ==> " + ordSim);
177 |
178 | return LAMBDA1 * wordSim + LAMBDA2 * ordSim;
179 | }
180 |
181 | public String[] segment(String sentence) {
182 | List list = SegmentProxy.segment(sentence);
183 | String[] results = new String[list.size()];
184 | for (int i = 0; i < list.size(); i++) {
185 | results[i] = list.get(i).getWord();
186 | }
187 | return results;
188 | }
189 |
190 | }
191 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/statistic/DictStatistic.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.statistic;
2 |
3 | import java.io.File;
4 | import java.io.FileInputStream;
5 | import java.io.InputStream;
6 | import java.util.zip.GZIPInputStream;
7 |
8 | import javax.xml.namespace.QName;
9 | import javax.xml.stream.XMLEventReader;
10 | import javax.xml.stream.XMLInputFactory;
11 | import javax.xml.stream.events.StartElement;
12 | import javax.xml.stream.events.XMLEvent;
13 |
14 | import org.slf4j.Logger;
15 | import org.slf4j.LoggerFactory;
16 |
17 | import zx.soft.similarity.word.hownet2.concept.XiaConceptParser;
18 |
19 | /**
20 | * 用于统计分词词典文件中的概念出现数量
21 | *
22 | */
23 | public class DictStatistic {
24 |
25 | private static Logger logger = LoggerFactory.getLogger(DictStatistic.class);
26 |
27 | /**
28 | * 从指定的xml文件加载词典文件
29 | * @param xmlFile
30 | * @param gzCompressed 是否再用gz格式对词典进行了压缩
31 | * @return
32 | */
33 | public void testFromXml(String xmlFile, boolean gzCompressed) {
34 | File file = new File(xmlFile);
35 | if (!file.canRead()) {
36 | logger.error("无法读取文件:{}", xmlFile);
37 | return;// fail while opening the file
38 | }
39 | int count = 0, conceptCount = 0;
40 | XMLInputFactory inputFactory = XMLInputFactory.newInstance();
41 | InputStream input = null;
42 | try {
43 | if (gzCompressed) {
44 | input = new GZIPInputStream(new FileInputStream(file));
45 | } else {
46 | input = new FileInputStream(file);
47 | }
48 | XMLEventReader xmlEventReader = inputFactory.createXMLEventReader(input);
49 | while (xmlEventReader.hasNext()) {
50 | XMLEvent event = xmlEventReader.nextEvent();
51 |
52 | if (event.isStartElement()) {
53 | StartElement startElement = event.asStartElement();
54 | if (startElement.getName().toString().equals("table")) {
55 | String head = startElement.getAttributeByName(QName.valueOf("head")).getValue();
56 | while (xmlEventReader.hasNext()) {
57 | XMLEvent itemEvent = xmlEventReader.nextEvent();
58 | if (itemEvent.isStartElement()) {
59 | StartElement itemStartElement = itemEvent.asStartElement();
60 | if (!itemStartElement.getName().toString().equals("item"))
61 | continue;
62 | String word = itemStartElement.getAttributeByName(QName.valueOf("word")).getValue();
63 | word = head + word;
64 | if (XiaConceptParser.getInstance().isConcept(word)) {
65 | conceptCount++;
66 | }
67 | count++;
68 | if (count % 1000 == 0) {
69 | logger.info("process words {} ...", count);
70 | }
71 | }
72 | }
73 | }
74 | }
75 | }
76 | input.close();
77 | logger.info(count + "\t" + conceptCount);
78 | return;
79 | } catch (Exception e) {
80 | logger.error("Exception:{}", e.getMessage());
81 | e.printStackTrace();
82 | }
83 | }
84 |
85 | }
86 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/statistic/LCMC.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.statistic;
2 |
3 | import java.io.File;
4 | import java.io.FileInputStream;
5 | import java.io.InputStream;
6 |
7 | import javax.xml.stream.XMLEventReader;
8 | import javax.xml.stream.XMLInputFactory;
9 | import javax.xml.stream.events.StartElement;
10 | import javax.xml.stream.events.XMLEvent;
11 |
12 | import org.slf4j.Logger;
13 | import org.slf4j.LoggerFactory;
14 |
15 | import zx.soft.similarity.word.hownet2.concept.XiaConceptParser;
16 |
17 | public class LCMC {
18 |
19 | private static Logger logger = LoggerFactory.getLogger(LCMC.class);
20 |
21 | public void countUnConceptWords(File xmlFile) throws Exception {
22 | int totalCount = 0, conceptCount = 0;
23 | XMLInputFactory inputFactory = XMLInputFactory.newInstance();
24 | InputStream input = null;
25 | input = new FileInputStream(xmlFile);
26 | XMLEventReader xmlEventReader = inputFactory.createXMLEventReader(input);
27 | while (xmlEventReader.hasNext()) {
28 | XMLEvent event = xmlEventReader.nextEvent();
29 |
30 | if (event.isStartElement()) {
31 | StartElement startElement = event.asStartElement();
32 | //如果是word开始
33 | if (startElement.getName().toString().equals("w")) {
34 | String word = xmlEventReader.getElementText();
35 | totalCount++;
36 | if (XiaConceptParser.getInstance().isConcept(word)) {
37 | conceptCount++;
38 | }
39 | }
40 | }
41 | }//
42 | input.close();
43 | logger.info(totalCount + "\t" + conceptCount);
44 | }
45 |
46 | public static void main(String[] args) throws Exception {
47 | LCMC lcmc = new LCMC();
48 | lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_A.XML"));
49 | lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_B.XML"));
50 | lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_C.XML"));
51 | lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_D.XML"));
52 | lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_E.XML"));
53 | lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_F.XML"));
54 | lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_G.XML"));
55 | lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_H.XML"));
56 | lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_J.XML"));
57 | lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_K.XML"));
58 | lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_L.XML"));
59 | lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_M.XML"));
60 | lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_N.XML"));
61 | lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_P.XML"));
62 | lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_R.XML"));
63 | }
64 |
65 | }
66 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/text/DiceSimilarity.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.text;
2 |
3 | import zx.soft.similarity.Similaritable;
4 |
5 | public class DiceSimilarity implements Similaritable {
6 |
7 | @Override
8 | public double getSimilarity(String item1, String item2) {
9 | return 0;
10 | }
11 |
12 | }
13 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/util/About.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.util;
2 |
3 | import java.awt.BorderLayout;
4 | import java.awt.Dimension;
5 | import java.io.IOException;
6 | import java.net.URL;
7 | import java.net.URLClassLoader;
8 |
9 | import javax.swing.JFrame;
10 | import javax.swing.JPanel;
11 | import javax.swing.JScrollPane;
12 | import javax.swing.JTextPane;
13 | import javax.swing.text.StyledEditorKit;
14 |
/**
 * Window showing the "about" page of the xsimilarity project.
 */
public class About extends JFrame {

    private static final long serialVersionUID = -2307582155443587993L;

    /**
     * Builds a panel containing a read-only, scrollable HTML view of the
     * classpath resource {@code data/about.html}. When the resource is
     * missing or cannot be loaded, the view shows a message instead.
     *
     * @return the populated panel
     */
    public static JPanel createPanel() {
        JPanel mainPanel = new JPanel();
        mainPanel.setLayout(new BorderLayout());
        JTextPane editorPane = new JTextPane();
        editorPane.setEditable(false);
        // Lets long text wrap automatically.
        editorPane.setEditorKit(new StyledEditorKit());
        editorPane.setContentType("text/html");

        String html = "data/about.html";
        // Ask the class loader through the ClassLoader API: casting it to
        // URLClassLoader throws ClassCastException on Java 9 and later.
        URL url = About.class.getClassLoader().getResource(html);
        if (url == null) {
            editorPane.setText("Resource not found: " + html);
        } else {
            try {
                editorPane.setPage(url);
            } catch (IOException e1) {
                editorPane.setText(e1.getMessage());
            }
        }

        mainPanel.add(new JScrollPane(editorPane), BorderLayout.CENTER);
        return mainPanel;
    }

    /** Creates the 600x400 window; closing it exits the application. */
    public About() {
        this.setTitle("关于Semantic-Similarity");

        this.setDefaultCloseOperation(EXIT_ON_CLOSE);
        this.setPreferredSize(new Dimension(600, 400));
        this.getContentPane().add(createPanel());
        this.pack();
    }

    public static void main(String[] args) {
        new About().setVisible(true);
    }

}
60 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/util/BlankUtils.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.util;
2 |
3 | import java.util.Collection;
4 |
/**
 * Helpers for testing whether strings, arrays and collections are blank.
 */
public class BlankUtils {

    /**
     * Tells whether a string is blank.
     *
     * @param string string to test
     * @return true if the string is null or empty after trimming
     */
    public static boolean isBlank(String string) {
        return string == null || string.trim().equals("");
    }

    /**
     * Tells whether an array is blank.
     *
     * @param array array to test
     * @return true if the array is null or has no elements
     */
    public static boolean isBlank(Object[] array) {
        return array == null || array.length == 0;
    }

    /**
     * Tells whether a collection is blank.
     *
     * @param array collection to test
     * @return true if the collection is null or has no elements
     */
    public static boolean isBlank(Collection<? extends Object> array) {
        return array == null || array.size() == 0;
    }

    /**
     * Tells whether all given collections are blank.
     *
     * @param collections collections to test
     * @return true if every collection is blank (also when none is given)
     */
    public static boolean isBlankAll(Collection<?>... collections) {
        for (Collection<?> c : collections) {
            if (!isBlank(c)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Tells whether all given strings are blank.
     *
     * @param strings strings to test
     * @return true if every string is blank (also when none is given)
     */
    public static boolean isBlankAll(String... strings) {
        for (String s : strings) {
            if (!isBlank(s)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Tells whether at least one of the given collections is blank.
     *
     * @param collections collections to test
     * @return true if any collection is blank
     */
    public static boolean isBlankAtLeastOne(Collection<?>... collections) {
        for (Collection<?> c : collections) {
            if (isBlank(c)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Tells whether at least one of the given strings is blank.
     *
     * @param strings strings to test
     * @return true if any string is blank
     */
    public static boolean isBlankAtLeastOne(String... strings) {
        for (String s : strings) {
            if (isBlank(s)) {
                return true;
            }
        }

        return false;
    }

}
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/util/EditDistance.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.util;
2 |
/**
 * Computes the edit distance between two strings with dynamic programming.
 * {@link #getEditDistance(String, String)} uses the overridable cost methods
 * of this class; {@link #getLevenshteinDistance(String, String)} is a static
 * variant with unit costs.
 */
public class EditDistance {

    /**
     * Cost of deleting one character.
     *
     * @return 1
     */
    public int getDeletionCost() {
        return 1;
    }

    /**
     * Cost of inserting one character.
     *
     * @return 1
     */
    public int getInsertionCost() {
        return 1;
    }

    /**
     * Cost of replacing character {@code a} by character {@code b}.
     *
     * @return 0 if both characters are equal, otherwise 1
     */
    public int getSubstitutionCost(char a, char b) {
        return (a == b) ? 0 : 1;
    }

    /**
     * Edit distance from {@code S} to {@code T} under the costs returned by
     * {@link #getDeletionCost()}, {@link #getInsertionCost()} and
     * {@link #getSubstitutionCost(char, char)}.
     *
     * @param S source string; null is treated as the empty string
     * @param T target string; null is treated as the empty string
     * @return the minimal total cost of turning S into T
     */
    public int getEditDistance(String S, String T) {
        if (S == null)
            S = "";
        if (T == null)
            T = "";

        char[] a = S.toCharArray();
        char[] b = T.toCharArray();

        int n = a.length; // length of S
        int m = b.length; // length of T

        // No shortcut for empty strings: the table below already yields
        // n deletions or m insertions, priced by the cost methods. The former
        // shortcut returned the raw length and ignored overridden costs.
        int[][] D = new int[n + 1][m + 1];

        // D[i][0]: delete the first i characters of S.
        for (int i = 1; i <= n; i++) {
            D[i][0] = D[i - 1][0] + getDeletionCost();
        }

        // D[0][j]: insert the first j characters of T.
        for (int j = 1; j <= m; j++) {
            D[0][j] = D[0][j - 1] + getInsertionCost();
        }

        for (int i = 1; i <= n; i++) {
            for (int j = 1; j <= m; j++) {
                int deletion = D[i - 1][j] + getDeletionCost();
                int insertion = D[i][j - 1] + getInsertionCost();
                int substitution = D[i - 1][j - 1] + getSubstitutionCost(a[i - 1], b[j - 1]);
                D[i][j] = Math.min(Math.min(deletion, insertion), substitution);
            }
        }

        return D[n][m];
    }

    /**
     * Levenshtein distance with unit costs; gives the same result as
     * {@link #getEditDistance(String, String)} with the default costs.
     *
     * @param s first string, must not be null
     * @param t second string, must not be null
     * @return the number of single-character edits between s and t
     * @throws IllegalArgumentException if either string is null
     */
    public static int getLevenshteinDistance(String s, String t) {
        if (s == null || t == null) {
            throw new IllegalArgumentException("Strings must not be null");
        }

        int n = s.length();
        int m = t.length();
        if (n == 0) {
            return m;
        }
        if (m == 0) {
            return n;
        }

        int[][] d = new int[n + 1][m + 1];
        for (int i = 0; i <= n; i++) {
            d[i][0] = i;
        }
        for (int j = 0; j <= m; j++) {
            d[0][j] = j;
        }

        for (int i = 1; i <= n; i++) {
            char s_i = s.charAt(i - 1);
            for (int j = 1; j <= m; j++) {
                int cost = (s_i == t.charAt(j - 1)) ? 0 : 1;
                d[i][j] = Math.min(Math.min(d[i - 1][j] + 1, d[i][j - 1] + 1), d[i - 1][j - 1] + cost);
            }
        }

        return d[n][m];
    }

}
142 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/util/FileUtils.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.util;
2 |
3 | import java.io.BufferedOutputStream;
4 | import java.io.BufferedReader;
5 | import java.io.File;
6 | import java.io.FileOutputStream;
7 | import java.io.IOException;
8 | import java.io.InputStream;
9 | import java.io.InputStreamReader;
10 |
11 | import org.slf4j.Logger;
12 | import org.slf4j.LoggerFactory;
13 |
14 | /**
15 | * 与文件相关的工具类
16 | *
17 | */
18 | public class FileUtils {
19 |
20 | private static Logger logger = LoggerFactory.getLogger(FileUtils.class);
21 |
22 | /**
23 | * 根据指定编码从输入流中依次遍历每一行文字
24 | *
25 | * @param input
26 | * 输入流
27 | * @param encoding
28 | * 输入流所用的文字编码
29 | * @param event
30 | * 遍历每一行时触发的事件处理
31 | * @throws IOException
32 | */
33 | public static void traverseLines(InputStream input, String encoding, TraverseEvent event)
34 | throws IOException {
35 | BufferedReader in = new BufferedReader(new InputStreamReader(input, encoding));
36 | String line = null;
37 |
38 | while ((line = in.readLine()) != null) {
39 | event.visit(line);
40 | }
41 |
42 | input.close();
43 | in.close();
44 | }
45 |
46 | /**
47 | * 保存字符串到文件中
48 | * @param content
49 | * @param fileName
50 | * @return
51 | */
52 | public static boolean saveStringToFile(String content, String fileName) {
53 | boolean rtn = false;
54 | BufferedOutputStream out = null;
55 | try {
56 | File file = new File(fileName);
57 | file.getParentFile().mkdirs();
58 |
59 | out = new BufferedOutputStream(new FileOutputStream(file));
60 | out.write(content.getBytes("GBK"));
61 | out.close();
62 | rtn = true;
63 | } catch (Exception e) {
64 | logger.error("saveStringToFile error:{}", e.getMessage());
65 | } finally {
66 | try {
67 | out.close();
68 | } catch (Exception e) {
69 | logger.error("Exception:{}", e.getMessage());
70 | }
71 | }
72 | return rtn;
73 | }
74 |
75 | public static void main(String[] args) {
76 | int count = 0;
77 | File dir = new File("G:/juanjuantx");
78 | for (File a : dir.listFiles()) {
79 | if (a.isDirectory()) {
80 | for (File zy : a.listFiles()) {
81 | if (zy.listFiles() != null)
82 | for (File rar : zy.listFiles()) {
83 | if (rar.isFile() && rar.getName().endsWith(".rar")) {
84 | count++;
85 | }
86 | }
87 | }
88 | }
89 | }
90 | System.out.println(count);
91 | }
92 |
93 | }
94 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/util/MathUtils.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.util;
2 |
/**
 * Small numeric helpers.
 */
public class MathUtils {

    /**
     * Returns the smallest of the given values.
     *
     * @param values candidates to compare
     * @return the minimum, or {@code Integer.MAX_VALUE} when no value is given
     */
    public static int min(int... values) {
        int smallest = Integer.MAX_VALUE;
        for (int index = 0; index < values.length; index++) {
            if (values[index] < smallest) {
                smallest = values[index];
            }
        }
        return smallest;
    }

}
14 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/util/PinyinUtils.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.util;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.IOException;
5 | import java.io.InputStream;
6 | import java.io.InputStreamReader;
7 | import java.util.HashMap;
8 | import java.util.HashSet;
9 | import java.util.Map;
10 | import java.util.Set;
11 |
/**
 * Pinyin helper: loads the pinyin dictionary from the classpath and looks up
 * the pinyin of single Chinese characters or of whole words.
 */
public class PinyinUtils {

    /** Classpath location of the pinyin dictionary file. */
    private static final String DICT_RESOURCE = "data/F02-GB2312-to-PuTongHua-PinYin.txt";

    /** Pinyin dictionary; a character may have several readings, all of them are kept in one set. */
    private Map<Character, Set<String>> pinyinDict = null;

    /** Singleton instance, created on first use. */
    private static PinyinUtils instance = null;

    /**
     * Reads the dictionary resource line by line and builds the lookup table.
     *
     * @throws IOException if the resource is missing or cannot be read
     */
    private PinyinUtils() throws IOException {
        MyTraverseEvent event = new MyTraverseEvent();

        // try-with-resources closes the stream even when reading fails half-way
        try (InputStream input = this.getClass().getClassLoader().getResourceAsStream(DICT_RESOURCE)) {
            if (input == null) {
                // getResourceAsStream reports a missing resource with null, not with an exception
                throw new IOException("pinyin dictionary not found on classpath: " + DICT_RESOURCE);
            }
            BufferedReader in = new BufferedReader(new InputStreamReader(input, "UTF-8"));
            String line;
            while ((line = in.readLine()) != null) {
                event.visit(line);
            }
        }

        this.pinyinDict = event.getPinyins();
    }

    /**
     * Returns the shared instance, creating it on first use.
     *
     * @return the instance, or {@code null} when the dictionary could not be
     *         loaded (the stack trace of the failure is printed)
     */
    public static PinyinUtils getInstance() {
        if (instance == null) {
            try {
                instance = new PinyinUtils();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        return instance;
    }

    /**
     * Returns the pinyin readings of one character. A character can have
     * several readings, hence a set.
     *
     * @param hanzi the character to look up
     * @return the readings from the dictionary; when the character is not in
     *         the dictionary, a set holding the character itself
     */
    public Set<String> getPinyin(Character hanzi) {
        Set<String> set = pinyinDict.get(hanzi);
        if (set == null || set.isEmpty()) {
            set = new HashSet<>();
            set.add(hanzi.toString());
        }
        return set;
    }

    /**
     * Returns every pinyin spelling of a word, i.e. all combinations of the
     * readings of its characters.
     *
     * @param word the word to convert
     * @return all combinations; empty for an empty word
     */
    public Set<String> getPinyin(String word) {
        Set<String> word_set = new HashSet<>();
        for (int i = 0; i < word.length(); i++) {
            Set<String> hanzi_set = getPinyin(word.charAt(i));
            if (word_set.isEmpty()) {
                word_set.addAll(hanzi_set);
                continue;
            }

            // cross product of the spellings so far with the readings of this character
            Set<String> tmp_set = new HashSet<>();
            for (String w : word_set) {
                for (String h : hanzi_set) {
                    tmp_set.add(w + h);
                }
            }

            word_set = tmp_set;
        }

        return word_set;
    }

    /**
     * Returns one pinyin string for the word; for a character with several
     * readings only the first one delivered by the set iterator is used.
     *
     * @param word the word to convert
     * @return the concatenated pinyin
     */
    public String getPinyinSingle(String word) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < word.length(); i++) {
            sb.append(getPinyin(word.charAt(i)).iterator().next());
        }
        return sb.toString();
    }

    /**
     * Returns a pinyin string that lists, for every character, all of its
     * readings (each character contributes the {@code toString()} of its set).
     *
     * @param word the word to convert
     * @return the concatenated reading sets
     */
    public String getPinyinString(String word) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < word.length(); i++) {
            Set<String> pinyin = getPinyin(word.charAt(i));
            sb.append(pinyin.toString());
        }
        return sb.toString();
    }

    /**
     * Returns the first letter of the pinyin of every character in the word.
     *
     * @param word the word to convert
     * @return the concatenated initials
     */
    public String getPinyinHead(String word) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < word.length(); i++) {
            sb.append(getPinyin(word.charAt(i)).iterator().next().charAt(0));
        }
        return sb.toString();
    }

    /** Collects the dictionary entries while the file is read line by line. */
    private static class MyTraverseEvent {
        /** One character maps to the set of all its readings. */
        private Map<Character, Set<String>> pinyins = null;

        public MyTraverseEvent() {
            this.pinyins = new HashMap<>();
        }

        public Map<Character, Set<String>> getPinyins() {
            return pinyins;
        }

        /**
         * Parses one dictionary line: the character at index 0, the pinyin
         * from index 2 on (index 1 is skipped, presumably a separator).
         *
         * @param item one line of the dictionary file
         * @return always {@code true}
         */
        public boolean visit(String item) {
            // Comment lines carry no entry; lines shorter than three characters
            // cannot hold a character, the skipped position and a reading, and
            // used to fail in charAt/substring or to register an empty reading.
            if (item.startsWith("//") || item.length() < 3) {
                return true;
            }

            char hanzi = item.charAt(0);
            String pinyin = item.substring(2);
            Set<String> set = pinyins.get(hanzi);
            if (set == null) {
                set = new HashSet<>();
                pinyins.put(hanzi, set);
            }
            set.add(pinyin);
            return true;
        }
    }

}
168 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/util/TraverseEvent.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.util;
2 |
/**
 * Traversal callback: whatever is being traversed hands each entry to an
 * implementation of this interface, which performs the actual processing.
 *
 * @param <T> type of the entries being traversed
 */
public interface TraverseEvent<T> {

    /**
     * Visits one entry during the traversal.
     *
     * @param item the entry currently being visited
     * @return a flag for the traversing code; how it is interpreted is decided
     *         by that code and is not specified by this interface
     */
    boolean visit(T item);

}
17 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/util/XmlException.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.util;
2 |
3 | /**
4 | * Runtime exception for XML handling.
5 | *
6 | */
7 | public class XmlException extends RuntimeException {
8 |
9 | private static final long serialVersionUID = 381260478228427716L;
10 |
11 | public static final String XML_PAYLOAD_EMPTY = "xml.payload.empty";
12 | public static final String XML_ENCODE_ERROR = "xml.encoding.invalid";
13 | public static final String FILE_NOT_FOUND = "xml.file.not.found";
14 | public static final String XML_PARSE_ERROR = "xml.parse.error";
15 | public static final String XML_READ_ERROR = "xml.read.error";
16 | public static final String XML_VALIDATE_ERROR = "xml.validate.error";
17 | public static final String XML_TRANSFORM_ERROR = "xml.transform.error";
18 |
19 | public XmlException() {
20 | super();
21 | }
22 |
23 | public XmlException(String key, Throwable cause) {
24 | super(key, cause);
25 | }
26 |
27 | public XmlException(String key) {
28 | super(key);
29 | }
30 |
31 | public XmlException(Throwable cause) {
32 | super(cause);
33 | }
34 |
35 | }
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/util/about.html:
--------------------------------------------------------------------------------
1 |
2 | xsimilarity
3 |
4 |
5 |
XSimilarity
6 | 项目地址:http://github.com/iamxiatian/xsimilarity/
7 |
8 |
9 | 有任何问题或建议请与我们联系,您的反馈将有助于该项目的进一步完善。
10 |
11 |
12 |
致谢
13 | 本项目在研究过程中,得到了恩师樊孝忠教授的悉心指导,师恩如海,难以言谢!
14 | 中国人民大学为本项目的持续研究提供了资金和计算机软硬件的支持,北京理工大学为本项目的早期研究提供了重要的基础设施,
15 | 这些支持与国家的投入密不可分,
16 | 本项目的开源和不断完善也算是对国家的点滴回报!
17 | 代码中许多算法的核心思想来源于我们的研究同行和先辈们的已公开成果,另外,许多使用xsimilarity的人员对xsimilarity
18 | 提出了宝贵的建议,在此一并表示深深的谢意!
19 | 本工程使用了如下开源组件,对原作者致以谢意!
20 |
23 |
24 |
25 |
联系方式
26 | 夏天
27 | 数据工程与知识工程教育部重点实验室(中国人民大学)
28 | 中国人民大学信息资源管理学院
29 | 电话: 86-10-82500675
30 | Email: xiat(at)ruc.edu.cn
31 |
32 |
33 |
34 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/CharBasedSimilarity.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.word;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 |
6 | import zx.soft.similarity.Similaritable;
7 |
8 | /**
9 | * 字面相似度计算方法
10 | *
11 | */
12 | public class CharBasedSimilarity implements Similaritable {
13 |
14 | private final double alpha = 0.6;
15 | private final double beta = 0.4;
16 |
17 | @Override
18 | public double getSimilarity(String word1, String word2) {
19 | if (isBlank(word1) && isBlank(word2)) {
20 | return 1.0;
21 | }
22 | if (isBlank(word1) || isBlank(word2)) {
23 | return 0.0;
24 | }
25 |
26 | List sameHZ = new ArrayList<>();
27 |
28 | String longString = word1.length() >= word2.length() ? word1 : word2;
29 | String shortString = word1.length() < word2.length() ? word1 : word2;
30 | for (int i = 0; i < longString.length(); i++) {
31 | Character ch = longString.charAt(i);
32 | if (shortString.contains(ch.toString())) {
33 | sameHZ.add(ch);
34 | }
35 | }
36 |
37 | double dp = Math.min(1.0 * word1.length() / word2.length(), 1.0 * word2.length() / word1.length());
38 | double part1 = alpha * (1.0 * sameHZ.size() / word1.length() + 1.0 * sameHZ.size() / word2.length()) / 2.0;
39 | double part2 = beta * dp * (getWeightedResult(word1, sameHZ) + getWeightedResult(word2, sameHZ)) / 2.0;
40 |
41 | return part1 + part2;
42 | }
43 |
44 | private double getWeightedResult(String word1, List sameHZ) {
45 | double top = 0;
46 | double bottom = 0;
47 | for (int i = 0; i < word1.length(); i++) {
48 | if (sameHZ.contains(word1.charAt(i))) {
49 | top += (i + 1);
50 | }
51 | bottom += (i + 1);
52 | }
53 | return 1.0 * top / bottom;
54 | }
55 |
56 | private boolean isBlank(String str) {
57 | return str == null || str.trim().equals("");
58 | }
59 |
60 | }
61 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/WordSimilarity.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.word;
2 |
3 | import zx.soft.similarity.Similaritable;
4 |
5 | public interface WordSimilarity extends Similaritable {
6 |
7 | }
8 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/cilin/Cilin.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.word.cilin;
2 |
3 | import java.util.Set;
4 |
5 | import zx.soft.similarity.Similaritable;
6 |
7 | public class Cilin implements Similaritable {
8 |
9 | private static Cilin instance = null;
10 |
11 | public static Cilin getInstance() {
12 | if (instance == null) {
13 | instance = new Cilin();
14 | }
15 | return instance;
16 | }
17 |
18 | private Cilin() {
19 |
20 | }
21 |
22 | @Override
23 | public double getSimilarity(String item1, String item2) {
24 | double sim = 0.0;
25 |
26 | if (item1 == null && item2 == null) {
27 | return 1.0;
28 | } else if (item1 == null || item2 == null) {
29 | return 0.0;
30 | } else if (item1.equalsIgnoreCase(item2)) {
31 | return 1.0;
32 | }
33 |
34 | Set codeSet1 = CilinDb.getInstance().getCilinCoding(item1);
35 | Set codeSet2 = CilinDb.getInstance().getCilinCoding(item2);
36 | if (codeSet1 == null || codeSet2 == null) {
37 | return 0.0;
38 | }
39 | for (String code1 : codeSet1) {
40 | for (String code2 : codeSet2) {
41 | double s = getSimilarityByCode(code1, code2);
42 | System.out.println(code1 + "-" + code2 + "-" + CilinCoding.calculateCommonWeight(code1, code2));
43 | if (sim < s)
44 | sim = s;
45 | }
46 | }
47 | return sim;
48 | }
49 |
50 | public double getSimilarityByCode(String code1, String code2) {
51 | return CilinCoding.calculateCommonWeight(code1, code2) / CilinCoding.TOTAL_WEIGHT;
52 | }
53 |
54 | }
55 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/cilin/CilinCoding.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.word.cilin;
2 |
/**
 * Rule-based code layout of the HIT Cilin (extended edition), table 2-3.
 *
 * <pre>
 * position : 1 | 2 | 3 4 | 5 | 6 7 | 8
 * example  : C | b | 0 7 | A | 0 3 | =
 * level    : 1 | 2 |  3  | 4 |  5  | flag
 * meaning  : major class | middle class | minor class | word group | atomic word group | word relation
 * </pre>
 *
 * Positions run from left to right. The flag at position 8 is one of "=",
 * "#" and "@": "=" marks the usual synonym relation, "#" marks related words,
 * and "@" marks a self-contained word that has neither synonyms nor related
 * words in the dictionary.
 */
public class CilinCoding {

    public static double[] WEIGHT = new double[] { 1.2, 1.2, 1.0, 1.0, 0.8, 0.4 };
    public static double TOTAL_WEIGHT = 5.6;

    /** Start offset of each level inside a code, indexed by level - 1. */
    private static final int[] LEVEL_START = { 0, 1, 2, 4, 5, 7 };
    /** End offset (exclusive) of levels 1 to 5; level 6 runs to the end of the code. */
    private static final int[] LEVEL_END = { 1, 2, 4, 5, 7 };

    /**
     * Extracts the part of a code that belongs to one level.
     *
     * @param code  the code
     * @param level level number, 1 to 6
     * @return the characters of that level; an empty string for any other level number
     */
    public static String getCodeLevel(String code, int level) {
        if (level < 1 || level > 6) {
            return "";
        }
        int from = LEVEL_START[level - 1];
        if (level == 6) {
            return code.substring(from);
        }
        return code.substring(from, LEVEL_END[level - 1]);
    }

    /**
     * Sums the weights of the leading levels on which both codes agree; the
     * comparison stops at the first level that differs.
     *
     * @param code1 first code
     * @param code2 second code
     * @return accumulated weight of the common leading levels
     */
    public static double calculateCommonWeight(String code1, String code2) {
        double total = 0.0;
        int level = 1;
        while (level <= 6 && getCodeLevel(code1, level).equals(getCodeLevel(code2, level))) {
            total += WEIGHT[level - 1];
            level++;
        }
        return total;
    }

    /**
     * Renders a code level by level, e.g. {@code [LEVEL_1: C, LEVEL_2: b, ...]}.
     *
     * @param code the code
     * @return the textual breakdown of all six levels
     */
    public static String printCoding(String code) {
        StringBuilder out = new StringBuilder("[");
        for (int level = 1; level <= 6; level++) {
            if (level > 1) {
                out.append(", ");
            }
            out.append("LEVEL_").append(level).append(": ").append(getCodeLevel(code, level));
        }
        return out.append("]").toString();
    }

}
83 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/cilin/CilinDb.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.word.cilin;
2 |
3 | import java.io.IOException;
4 | import java.io.InputStream;
5 | import java.util.HashMap;
6 | import java.util.HashSet;
7 | import java.util.Map;
8 | import java.util.Set;
9 | import java.util.zip.GZIPInputStream;
10 |
11 | import org.slf4j.Logger;
12 | import org.slf4j.LoggerFactory;
13 |
14 | import zx.soft.similarity.util.FileUtils;
15 | import zx.soft.similarity.util.TraverseEvent;
16 |
17 | /**
18 | * 词林数据库
19 | *
20 | */
21 | public class CilinDb {
22 |
23 | /** the logger */
24 | protected static Logger logger = LoggerFactory.getLogger(CilinDb.class);
25 | /** 以词语为主键的索引表 */
26 | private final Map> wordIndex = new HashMap<>();
27 | /** 以编码为主键的索引表 */
28 | private final Map> codeIndex = new HashMap<>();
29 |
30 | private static CilinDb instance = null;
31 |
32 | public static CilinDb getInstance() {
33 | if (instance == null) {
34 | try {
35 | instance = new CilinDb();
36 | } catch (IOException e) {
37 | logger.error("Exception:{}", e.getMessage());
38 | }
39 | }
40 | return instance;
41 | }
42 |
43 | private CilinDb() throws IOException {
44 | InputStream input = new GZIPInputStream(this.getClass().getClassLoader()
45 | .getResourceAsStream("data/cilin.db.gz"));
46 |
47 | TraverseEvent event = new TraverseEvent() {
48 | @Override
49 | public boolean visit(String line) {
50 | String[] items = line.split(" ");
51 | Set set = new HashSet<>();
52 | for (int i = 2; i < items.length; i++) {
53 | String code = items[i].trim();
54 | if (!code.equals("")) {
55 | set.add(code);
56 | //加入codeIndex编码
57 | Set codeWords = codeIndex.get(code);
58 | if (codeWords == null) {
59 | codeWords = new HashSet<>();
60 | }
61 | codeWords.add(items[0]);
62 | codeIndex.put(code, codeWords);
63 | }
64 | }
65 | wordIndex.put(items[0], set);
66 | items = null;
67 | return false;
68 | }
69 | };
70 | logger.info("loading cilin dictionary...");
71 | long time = System.currentTimeMillis();
72 |
73 | FileUtils.traverseLines(input, "UTF8", event);
74 |
75 | time = System.currentTimeMillis() - time;
76 | logger.info("loading cilin dictionary completely. time elapsed:{}", time);
77 | }
78 |
79 | /**
80 | * 获取某个词语的词林编码,一个词语可以有多个编码,通过Set给出
81 | * @param word
82 | * @return
83 | */
84 | public Set getCilinCoding(String word) {
85 | return wordIndex.get(word);
86 | }
87 |
88 | public Set getCilinWords(String code) {
89 | return codeIndex.get(code);
90 | }
91 |
92 | public static void main(String[] args) {
93 | CilinDb db = CilinDb.getInstance();
94 | String code = db.getCilinCoding("中国").iterator().next();
95 | System.out.println(CilinCoding.printCoding(code));
96 | System.out.println(db.getCilinWords(code));
97 | }
98 |
99 | }
100 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/cilin/cilin.db.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ml-distribution/semantic-similarity/8deb4382a41ad94a82db15ea945c6bfa8ac103f0/src/main/java/zx/soft/similarity/word/cilin/cilin.db.gz
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet/Hownet.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.word.hownet;
2 |
3 | import java.io.IOException;
4 |
5 | import org.slf4j.Logger;
6 | import org.slf4j.LoggerFactory;
7 |
8 | import zx.soft.similarity.Similaritable;
9 | import zx.soft.similarity.word.hownet2.concept.BaseConceptParser;
10 | import zx.soft.similarity.word.hownet2.concept.XiaConceptParser;
11 | import zx.soft.similarity.word.hownet2.sememe.BaseSememeParser;
12 | import zx.soft.similarity.word.hownet2.sememe.XiaSememeParser;
13 |
14 | /**
15 | * Hownet的主控制类, 通过知网的概念和义原及其关系计算汉语词语之间的相似度.
16 | * 相似度的计算理论参考论文《汉语词语语义相似度计算研究》
17 | *
18 | * @see zx.soft.similarity.Similaritable
19 | */
20 | public class Hownet implements Similaritable {
21 |
22 | /** the logger */
23 | private static final Logger logger = LoggerFactory.getLogger(Hownet.class);
24 | /** 知网的单例 */
25 | private static Hownet instance = null;
26 |
27 | private BaseConceptParser conceptParser = null;
28 |
29 | private Hownet() {
30 | try {
31 | BaseSememeParser sememeParser = new XiaSememeParser();
32 | conceptParser = new XiaConceptParser(sememeParser);
33 | } catch (IOException e) {
34 | logger.error("Exception:{}", e.getMessage());
35 | e.printStackTrace();
36 | }
37 | }
38 |
39 | /**
40 | * 单例获取知网对象
41 | * @return
42 | */
43 | public static Hownet instance() {
44 | if (null == instance) {
45 | instance = new Hownet();
46 | }
47 | return instance;
48 | }
49 |
50 | /**
51 | * 获取概念解析器
52 | * @return
53 | */
54 | public BaseConceptParser getConceptParser() {
55 | return conceptParser;
56 | }
57 |
58 | @Override
59 | public double getSimilarity(String item1, String item2) {
60 | return conceptParser.getSimilarity(item1, item2);
61 | }
62 |
63 | }
64 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet/HownetMeta.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.word.hownet;
2 |
/**
 * Constants shared by the Hownet classes: algorithm identifiers, the sememe
 * symbol table and the weights and thresholds of the similarity computation.
 * Interface fields are implicitly public, static and final.
 */
public interface HownetMeta {

    /** Algorithm of XIA Tian */
    int ALGORITHM_XIA = 1;

    /** Algorithm of LIU Qun */
    int ALGORITHM_LIU = 2;

    /**
     * Hownet symbols, one row per symbol: the symbol itself followed by its description.
     */
    String[][] Symbol_Descriptions = {
            { "#", "表示与其相关" },
            { "%", "是其部分" },
            { "$", "可以被该V处置,或是该V的受事、对象、领有物,或内容" },
            { "*", "施事或工具" },
            { "+", "所标记的角色是隐性的,几乎在实际语言中不会出现" },
            { "&", "指向" },
            { "~", "多半是,多半有,很可能" },
            { "@", "可以做V的空间或时间" },
            { "?", "可以使N的材料" },
            { "(", "至于其中的应该是一个词标记" },
            { "^", "不存在,或没有,或不能" },
            { "!", "表示某一属性为一敏感的属性,如味道之与食物" },
            { "[", "标示概念的共性属性" } };

    /** gamma: similarity between a concrete word and a sememe, a small constant. */
    double gamma = 0.2;

    /** delta: similarity between any non-empty value and an empty value, a small constant (0.2 here). */
    double delta = 0.2;

    /** beta1: weight of the first basic sememe description of a content-word concept. */
    double beta1 = 0.5;
    /** beta2: weight of the other basic sememe descriptions of a content-word concept. */
    double beta2 = 0.2;
    /** beta3: weight of the relation sememe descriptions of a content-word concept. */
    double beta3 = 0.17;
    /** beta4: weight of the symbol sememe descriptions of a content-word concept. */
    double beta4 = 0.13;

    /**
     * Theta: the maximum similarity between a sememe of the later concept and
     * all sememes of the reference concept is multiplied by the similarity of
     * the two concepts' main sememes (this is how the main sememe acts as a
     * constraint); only values above this threshold serve as a reference,
     * which drops redundant, unimportant sememes.
     */
    double PARAM_THETA = 0.5;
    /**
     * Omega: the maximum similarity between a sememe of the earlier concept
     * and all sememes of the reference concept is multiplied by the similarity
     * of the two concepts' main sememes; only values above this threshold lead
     * to an adjustment of the earlier concept's sememe symbol, as a correction.
     */
    double PARAM_OMEGA = 0.8;
    /** Xi: a further parameter; its role is not described in this file. */
    double PARAM_XI = 0.6;

}
52 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet/concept/Concept.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.word.hownet.concept;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 | import java.util.StringTokenizer;
6 |
7 | import zx.soft.similarity.word.hownet.HownetMeta;
8 |
9 | /**
10 | * 知网的概念表示类
example和英文部分对于相似度的计算不起作用,考虑到内存开销, 在概念的表示中去掉了这部分数据的对应定义
11 | *
12 | * @deprecated
13 | */
14 | @Deprecated
15 | public class Concept implements HownetMeta, Comparable {
16 |
17 | /** 中文概念名称 */
18 | protected String word;
19 | /** 词性: Part of Speech */
20 | protected String pos;
21 | /** 定义 */
22 | protected String define;
23 |
24 | /** 是否是实词,false表示为虚词, 一般为实词 */
25 | protected boolean bSubstantive;
26 | /** 第一基本义原 */
27 | protected String mainSememe;
28 | /** 其他基本义原 */
29 | protected String[] secondSememes;
30 | /** 关系义元原 */
31 | protected String[] relationSememes;
32 | /** 关系符号描述 */
33 | protected String[] symbolSememes;
34 |
35 | static String[][] Concept_Type = { { "=", "事件" }, { "aValue|属性值", "属性值" }, { "qValue|数量值", "数量值" },
36 | { "attribute|属性", "属性" }, { "quantity|数量", "数量" }, { "unit|", "单位" }, { "%", "部件" } };
37 |
38 | public Concept(String word, String pos, String def) {
39 | this.word = word;
40 | this.pos = pos;
41 | this.define = (def == null) ? "" : def.trim();
42 |
43 | // 虚词用{***}表示
44 | if (define.length() > 0 && define.charAt(0) == '{' && define.charAt(define.length() - 1) == '}') {
45 | this.bSubstantive = false;
46 | } else {
47 | this.bSubstantive = true;
48 | }
49 |
50 | parseDefine();
51 | }
52 |
53 | /**
54 | * 处理定义,把定义分为第一基本义元、其他基本义元、关系义元和符号义元四类
55 | */
56 | private void parseDefine() {
57 | List secondList = new ArrayList<>(); //其他基本义原
58 | List relationList = new ArrayList<>(); //关系义原
59 | List symbolList = new ArrayList<>(); //符号义原
60 |
61 | String tokenString = this.define;
62 |
63 | //如果不是实词,则处理“{}”中的内容
64 | if (!this.bSubstantive) {
65 | tokenString = define.substring(1, define.length() - 1);
66 | }
67 |
68 | StringTokenizer token = new StringTokenizer(tokenString, ",", false);
69 |
70 | // 第一个为第一基本义元
71 | if (token.hasMoreTokens()) {
72 | this.mainSememe = token.nextToken();
73 | }
74 |
75 | main_loop: while (token.hasMoreTokens()) {
76 | String item = token.nextToken();
77 | if (item.equals(""))
78 | continue;
79 |
80 | // 先判断是否为符号义元
81 | String symbol = item.substring(0, 1);
82 | for (int i = 0; i < Symbol_Descriptions.length; i++) {
83 | if (symbol.equals(Symbol_Descriptions[i][0])) {
84 | symbolList.add(item);
85 | continue main_loop;
86 | }
87 | }
88 |
89 | //如果不是符号义元,则进一步判断是关系义元还是第二基本义元, 带有“=”表示关系义原
90 | if (item.indexOf('=') > 0) {
91 | relationList.add(item);
92 | } else {
93 | secondList.add(item);
94 | }
95 | }
96 |
97 | this.secondSememes = secondList.toArray(new String[secondList.size()]);
98 | this.relationSememes = relationList.toArray(new String[relationList.size()]);
99 | this.symbolSememes = symbolList.toArray(new String[symbolList.size()]);
100 | }
101 |
102 | /**
103 | * 获取第一义元
104 | *
105 | * @return
106 | */
107 | public String getMainSememe() {
108 | return mainSememe;
109 | }
110 |
111 | /**
112 | * 获取其他基本义元描述
113 | *
114 | * @return
115 | */
116 | public String[] getSecondSememes() {
117 | return secondSememes;
118 | }
119 |
120 | /**
121 | * 获取关系义元描述
122 | *
123 | * @return
124 | */
125 | public String[] getRelationSememes() {
126 | return relationSememes;
127 | }
128 |
129 | /**
130 | * 获取符号义元描述
131 | *
132 | * @return
133 | */
134 | public String[] getSymbolSememes() {
135 | return symbolSememes;
136 | }
137 |
138 | @Override
139 | public String toString() {
140 | StringBuilder sb = new StringBuilder();
141 | sb.append("name=");
142 | sb.append(this.word);
143 | sb.append("; pos=");
144 | sb.append(this.pos);
145 | sb.append("; define=");
146 | sb.append(this.define);
147 | sb.append("; 第一基本义元:[" + mainSememe);
148 |
149 | sb.append("]; 其他基本义元描述:[");
150 | for (String sem : secondSememes) {
151 | sb.append(sem);
152 | sb.append(";");
153 | }
154 |
155 | sb.append("]; [关系义元描述:");
156 | for (String sem : relationSememes) {
157 | sb.append(sem);
158 | sb.append(";");
159 | }
160 |
161 | sb.append("]; [关系符号描述:");
162 | for (String sem : symbolSememes) {
163 | sb.append(sem);
164 | sb.append(";");
165 | }
166 | sb.append("]");
167 | return sb.toString();
168 | }
169 |
170 | /**
171 | * 是实词还是虚词
172 | *
173 | * @return true:实词;false:虚词
174 | */
175 | public boolean isSubstantive() {
176 | return this.bSubstantive;
177 | }
178 |
179 | public String getWord() {
180 | return word;
181 | }
182 |
183 | public void setWord(String word) {
184 | this.word = word;
185 | }
186 |
187 | public String getPos() {
188 | return pos;
189 | }
190 |
191 | public void setPos(String pos) {
192 | this.pos = pos;
193 | }
194 |
195 | public String getDefine() {
196 | return define;
197 | }
198 |
199 | public void setDefine(String define) {
200 | this.define = define;
201 | }
202 |
203 | /**
204 | * 获取该概念的类型
205 | *
206 | * @return
207 | */
208 | public String getType() {
209 | for (int i = 0; i < Concept_Type.length; i++) {
210 | if (define.toUpperCase().indexOf(Concept_Type[i][0].toUpperCase()) >= 0) {
211 | return Concept_Type[i][1];
212 | }
213 | }
214 | return "普通概念";
215 | }
216 |
217 | /**
218 | * 按照概念的名称进行比较
219 | */
220 | @Override
221 | public int compareTo(Concept o) {
222 | return word.compareTo(o.word);
223 | }
224 |
225 | //////////////////////////////////////////////
226 | /**
227 | * 方便在parse中比较概念词语加入的方法
228 | * @param another
229 | * @return
230 | */
231 | public int compareTo(String another) {
232 | return word.compareTo(another);
233 | }
234 |
235 | public boolean equals(String another) {
236 | return word.equals(another);
237 | }
238 |
239 | }
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet/concept/ConceptDictTraverseEvent.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.word.hownet.concept;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.File;
5 | import java.io.FileOutputStream;
6 | import java.io.InputStream;
7 | import java.io.InputStreamReader;
8 | import java.io.PrintWriter;
9 | import java.util.ArrayList;
10 | import java.util.Arrays;
11 | import java.util.List;
12 |
13 | import javax.xml.parsers.DocumentBuilder;
14 | import javax.xml.parsers.DocumentBuilderFactory;
15 | import javax.xml.transform.OutputKeys;
16 | import javax.xml.transform.Transformer;
17 | import javax.xml.transform.TransformerFactory;
18 | import javax.xml.transform.dom.DOMSource;
19 | import javax.xml.transform.stream.StreamResult;
20 |
21 | import org.w3c.dom.Document;
22 | import org.w3c.dom.Element;
23 |
24 | import zx.soft.similarity.util.TraverseEvent;
25 |
26 | /**
27 | * 实现遍历加载概念信息到概念表中, 概念词典的组织以知网导出的格式为标准,格式如下:
28 | * 阿斗 N human|人,ProperName|专,past|昔
29 | * 阿爸 N human|人,family|家,male|男
30 | * 即: <概念> <空格或者跳格> <词性> <空格或者跳格> <定义>"
31 | *
32 | * 概念保存到数组中,没有保存到Map中,可以降低对内存空间的使用
33 | *
34 | * @deprecated
35 | */
36 | @Deprecated
37 | public class ConceptDictTraverseEvent implements TraverseEvent {
38 |
39 | private List conceptList = null;
40 |
41 | public ConceptDictTraverseEvent() {
42 | conceptList = new ArrayList<>();
43 | }
44 |
45 | public Concept[] getConcepts() {
46 | Concept[] concepts = conceptList.toArray(new Concept[conceptList.size()]);
47 | Arrays.sort(concepts);
48 | return concepts;
49 | }
50 |
51 | /**
52 | * 读取概念词典中的一行,并进行解析处理
53 | */
54 | @Override
55 | public boolean visit(String line) {
56 | String word = null;
57 | String pos = null;
58 | String define = "";
59 | char ch;
60 |
61 | // 以符号//开始的是注释行
62 | if (line.startsWith("//")) {
63 | return true;
64 | }
65 |
66 | int lastPosition = 0; // 最近一次处理内容的有意义的开始位置
67 | int processFlag = 0; // 当前处理部分的标志 0:处理word; 1:词性;2:定义
68 | // 解析出一行中的概念各项数据
69 | loop: for (int position = 0; position < line.length(); position++) {
70 | ch = line.charAt(position);
71 |
72 | if ((ch == ' ') || (ch == '\t') || (position == (line.length() - 1))) {
73 | String item = line.substring(lastPosition, (position == (line.length() - 1)) ? (position + 1)
74 | : position);
75 | switch (processFlag) {
76 | case 0:
77 | word = item;
78 | processFlag++;
79 | break;
80 | case 1:
81 | pos = item;
82 | processFlag++;
83 | break;
84 | case 2:
85 | //define = item;
86 | //processFlag++;
87 | define = line.substring(lastPosition).trim();
88 | break loop;
89 | case 3:
90 | System.out.println(line);
91 | break;
92 | }
93 |
94 | for (; (position < line.length()); position++) {
95 | ch = line.charAt(position);
96 | if ((ch != ' ') && (ch != '\t')) {
97 | lastPosition = position;
98 | break;
99 | }
100 | }
101 |
102 | }
103 | }
104 | conceptList.add(new Concept(word, pos, define));
105 | return true;
106 | }
107 |
108 | public void saveToXML(File xmlFile) throws Exception {
109 | InputStream input = this.getClass().getClassLoader().getResourceAsStream("data/concept.dat");
110 | BufferedReader in = new BufferedReader(new InputStreamReader(input, "utf8"));
111 |
112 | DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
113 | DocumentBuilder builder = factory.newDocumentBuilder();
114 | Document document = builder.newDocument();
115 | Element root = document.createElement("concepts");
116 | document.appendChild(root);
117 |
118 | String line = null;
119 |
120 | while ((line = in.readLine()) != null) {
121 | saveLineToXML(document, root, line);
122 | }
123 |
124 | input.close();
125 | in.close();
126 |
127 | TransformerFactory tf = TransformerFactory.newInstance();
128 | Transformer transformer = tf.newTransformer();
129 | DOMSource source = new DOMSource(document);
130 | transformer.setOutputProperty(OutputKeys.ENCODING, "utf8");
131 | transformer.setOutputProperty(OutputKeys.INDENT, "yes");
132 | PrintWriter pw = new PrintWriter(new FileOutputStream(xmlFile));
133 | StreamResult result = new StreamResult(pw);
134 | transformer.transform(source, result);
135 | }
136 |
137 | /**
138 | * 读取概念词典中的一行,并进行解析处理
139 | */
140 | private boolean saveLineToXML(Document document, Element root, String line) {
141 | String word = null;
142 | String pos = null;
143 | String define = "";
144 | char ch;
145 |
146 | //以符号//开始的是注释行
147 | if (line.startsWith("//")) {
148 | return true;
149 | }
150 |
151 | int lastPosition = 0; //最近一次处理内容的有意义的开始位置
152 | int processFlag = 0; //当前处理部分的标志 0:处理word; 1:词性;2:定义
153 | //解析出一行中的概念各项数据
154 | loop: for (int position = 0; position < line.length(); position++) {
155 | ch = line.charAt(position);
156 |
157 | if ((ch == ' ') || (ch == '\t') || (position == (line.length() - 1))) {
158 | String item = line.substring(lastPosition, (position == (line.length() - 1)) ? (position + 1)
159 | : position);
160 | switch (processFlag) {
161 | case 0:
162 | word = item;
163 | processFlag++;
164 | break;
165 | case 1:
166 | pos = item;
167 | processFlag++;
168 | break;
169 | case 2:
170 | //define = item;
171 | //processFlag++;
172 | define = line.substring(lastPosition).trim();
173 | break loop;
174 | case 3:
175 | System.out.println(line);
176 | break;
177 | }
178 |
179 | for (; (position < line.length()); position++) {
180 | ch = line.charAt(position);
181 | if ((ch != ' ') && (ch != '\t')) {
182 | lastPosition = position;
183 | break;
184 | }
185 | }
186 |
187 | }
188 | }
189 |
190 | Element e = document.createElement("c");
191 | e.setAttribute("w", word);
192 | e.setAttribute("p", pos);
193 | e.setAttribute("d", define);
194 | root.appendChild(e);
195 | return true;
196 | }
197 |
198 | public static void main(String[] args) throws Exception {
199 | new ConceptDictTraverseEvent().saveToXML(new File("/XXX/concept.xml"));
200 | }
201 |
202 | }
203 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet/concept/ConceptLinkedList.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.word.hownet.concept;
2 |
3 | import java.util.LinkedList;
4 |
5 | /**
6 | * 用于概念处理的LinkedList
7 | *
8 | * @param
9 | * @deprecated
10 | */
11 | @Deprecated
12 | @SuppressWarnings("serial")
13 | public class ConceptLinkedList extends LinkedList {
14 |
15 | /**
16 | * 删除链表中最后面的size个元素
17 | * @param size
18 | */
19 | public void removeLast(int size) {
20 | for (int i = 0; i < size; i++) {
21 | this.removeLast();
22 | }
23 | }
24 |
25 | /**
26 | * 根据概念的定义判断是否已经加入到链表中
27 | * @param concept
28 | */
29 | public void addByDefine(Concept concept) {
30 | for (Concept c : this) {
31 | if (c.getDefine().equals(concept.getDefine())) {
32 | return;
33 | }
34 | }
35 |
36 | this.add(concept);
37 | }
38 |
39 | }
40 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet/concept/LiuConceptParser.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.word.hownet.concept;
2 |
3 | import java.io.IOException;
4 | import java.util.Collection;
5 |
6 | import zx.soft.similarity.util.BlankUtils;
7 | import zx.soft.similarity.word.hownet.sememe.LiuqunSememeParser;
8 | import zx.soft.similarity.word.hownet.sememe.SememeParser;
9 |
10 | /**
11 | * 刘群老师的相似度计算方式,对概念解析的处理方式
12 | *
13 | * @deprecated
14 | */
15 | @Deprecated
16 | public class LiuConceptParser extends ConceptParser {
17 |
18 | private static LiuConceptParser instance = null;
19 |
20 | public static LiuConceptParser getInstance() {
21 | if (instance == null) {
22 | try {
23 | instance = new LiuConceptParser();
24 | } catch (IOException e) {
25 | e.printStackTrace();
26 | }
27 | }
28 |
29 | return instance;
30 | }
31 |
32 | private LiuConceptParser(SememeParser sememeParser) throws IOException {
33 | super(sememeParser);
34 | }
35 |
36 | private LiuConceptParser() throws IOException {
37 | super(new LiuqunSememeParser());
38 | }
39 |
40 | @Override
41 | protected double calculate(double sim_v1, double sim_v2, double sim_v3, double sim_v4) {
42 | return beta1 * sim_v1 + beta2 * sim_v1 * sim_v2 + beta3 * sim_v1 * sim_v2 * sim_v3 + beta4 * sim_v1 * sim_v2
43 | * sim_v3 * sim_v4;
44 | }
45 |
46 | @Override
47 | public double getSimilarity(String word1, String word2) {
48 | double similarity = 0.0;
49 |
50 | // 如果两个句子相同,则直接返回1.0
51 | if (word1.equals(word2)) {
52 | return 1.0;
53 | }
54 |
55 | Collection concepts1 = getConcepts(word1);
56 | Collection concepts2 = getConcepts(word2);
57 |
58 | // 如果是blank,则说明是未登录词, 需要计算组合概念
59 | if (BlankUtils.isBlank(concepts1) || BlankUtils.isBlank(concepts2)) {
60 | return 0.0;
61 | }
62 |
63 | // 两个for循环分别计算词语所有可能的概念的相似度
64 | for (Concept c1 : concepts1) {
65 | for (Concept c2 : concepts2) {
66 | double v = getSimilarity(c1, c2);
67 |
68 | if (v > similarity) {
69 | similarity = v;
70 | }
71 |
72 | if (similarity == 1.0) {
73 | break;
74 | }
75 | }
76 | }
77 |
78 | return similarity;
79 | }
80 |
81 | }
82 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet/sememe/FastSimpleMap.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.word.hownet.sememe;
2 |
3 | import java.io.IOException;
4 | import java.util.ArrayList;
5 | import java.util.Collection;
6 |
7 | /**
8 | * 一种新的Map,跟标准的Map不同,它的的Key可以有重复, 内部采用快速排序和二分查找,
9 | * 保持较少的变量,结构简单,可根据主键查找返回的结果是一个数组
10 | *
11 | * @param
12 | * @param
13 | * @deprecated
14 | */
15 | @Deprecated
16 | public class FastSimpleMap, V> {
17 |
18 | private final K[] keys;
19 | private final V[] values;
20 |
21 | public FastSimpleMap(K[] keys, V[] values) throws IOException {
22 | if (keys.length != values.length) {
23 | throw new IOException("keys length must be equals values");
24 | }
25 | this.keys = keys;
26 | this.values = values;
27 |
28 | // 根据keys进行排序
29 | quicksort(0, keys.length - 1);
30 | }
31 |
32 | /**
33 | * 查找键对应的值集合
34 | * @param key
35 | * @return
36 | */
37 | public Collection get(K key) {
38 | int low = 0;
39 | int high = keys.length - 1;
40 |
41 | Collection results = new ArrayList();
42 |
43 | while (low <= high) {
44 | int mid = (low + high) >> 1;
45 | K item = keys[mid];
46 | int cmp = key.compareTo(item);
47 |
48 | if (cmp > 0) {
49 | low = mid + 1;
50 | } else if (cmp < 0) {
51 | high = mid - 1;
52 | } else {
53 | // 找到起始位置,该位置前后相同的都是该主键对应的值
54 | for (int i = mid; i >= 0 && keys[i].equals(key); i--) {
55 | results.add(values[i]);
56 | }
57 | for (int i = mid + 1; i < keys.length && keys[i].equals(key); i++) {
58 | results.add(values[i]);
59 | }
60 |
61 | break;
62 | }
63 | }
64 |
65 | return results;
66 | }
67 |
68 | /**
69 | * 根据keys快速排序,排序的同时交换values
70 | *
71 | * @param a
72 | * @param low
73 | * @param high
74 | */
75 | private void quicksort(int low, int high) {
76 | // low is the lower index, high is the upper index
77 | // of the region of array a that is to be sorted
78 | int i = low, j = high;
79 | K h;
80 | V v;
81 | K x = keys[(low + high) >> 1];
82 |
83 | // partition
84 | do {
85 | while (keys[i].compareTo(x) < 0)
86 | i++;
87 | while (keys[j].compareTo(x) > 0)
88 | j--;
89 |
90 | if (i <= j) {
91 | h = keys[i];
92 | keys[i] = keys[j];
93 | keys[j] = h;
94 | v = values[i];
95 | values[i] = values[j];
96 | values[j] = v;
97 | i++;
98 | j--;
99 | }
100 | } while (i <= j);
101 |
102 | // recursion
103 | if (low < j)
104 | quicksort(low, j);
105 | if (i < high)
106 | quicksort(i, high);
107 | }
108 |
109 | }
110 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet/sememe/LiuqunSememeParser.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.word.hownet.sememe;
2 |
3 | import java.io.IOException;
4 |
5 | /**
6 | * 刘群老师计算义原相似度的方法, 实现了SememeParser中定义的抽象方法
7 | *
8 | * @version 1.0
9 | * @deprecated
10 | */
11 | @Deprecated
12 | public class LiuqunSememeParser extends SememeParser {
13 |
14 | /** 计算义元相似度的可调节的参数,默认为1.6 */
15 | private final float alpha = 1.6f;
16 |
17 | public LiuqunSememeParser() throws IOException {
18 | super();
19 | }
20 |
21 | /**
22 | * 计算两个义元之间的相似度,由于义元可能相同,计算结果为其中相似度最大者
23 | *
similarity = alpha/(distance+alpha)
24 | *
25 | * @param key1
26 | * @param key2
27 | * @return
28 | */
29 | @Override
30 | public double getSimilarity(String item1, String item2) {
31 | int pos;
32 |
33 | // 如果为空串,直接返回0
34 | if (item1 == null || item2 == null || item1.equals("") || item2.equals(""))
35 | return 0.0;
36 |
37 | String key1 = item1.trim();
38 | String key2 = item2.trim();
39 |
40 | // 去掉()符号
41 | if ((key1.charAt(0) == '(') && (key1.charAt(key1.length() - 1) == ')')) {
42 | if (key2.charAt(0) == '(' && key2.charAt(key2.length() - 1) == ')') {
43 | key1 = key1.substring(1, key1.length() - 1);
44 | key2 = key2.substring(1, key2.length() - 1);
45 | } else {
46 | return 0.0;
47 | }
48 | }
49 |
50 | // 处理关系义元,即x=y的情况
51 | if ((pos = key1.indexOf('=')) > 0) {
52 | int pos2 = key2.indexOf('=');
53 | // 如果是关系义元,则判断前面部分是否相同,如果相同,则转为计算后面部分的相似度,否则为0
54 | if ((pos == pos2) && key1.substring(0, pos).equals(key2.substring(0, pos2))) {
55 | key1 = key1.substring(pos + 1);
56 | key2 = key2.substring(pos2 + 1);
57 | } else {
58 | return 0.0;
59 | }
60 | }
61 |
62 | // 处理符号义元,即前面有特殊符号的义元
63 | String symbol1 = key1.substring(0, 1);
64 | String symbol2 = key2.substring(0, 1);
65 |
66 | for (int i = 0; i < Symbol_Descriptions.length; i++) {
67 | if (symbol1.equals(Symbol_Descriptions[i][0])) {
68 | if (symbol1.equals(symbol2)) {
69 | key1 = item1.substring(1);
70 | key2 = item2.substring(1);
71 | break;
72 | } else {
73 | return 0.0; // 如果不是同一关系符号,则相似度直接返回0
74 | }
75 | }
76 | }
77 |
78 | if ((pos = key1.indexOf("|")) >= 0) {
79 | key1 = key1.substring(pos + 1);
80 | }
81 | if ((pos = key2.indexOf("|")) >= 0) {
82 | key2 = key2.substring(pos + 1);
83 | }
84 |
85 | int distance = getDistance(key1, key2);
86 | if (distance < 0)
87 | return 0.0;
88 | else
89 | return alpha / (distance + alpha);
90 | }
91 |
92 | @Override
93 | public double getSimilarity(Sememe sem1, Sememe sem2) {
94 | int distance = getDistance(sem1, sem2);
95 | if (distance <= 0)
96 | return 0.0f;
97 | else
98 | return alpha / (distance + alpha);
99 | }
100 |
101 | }
102 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet/sememe/MySememeParser.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.word.hownet.sememe;
2 |
3 | import java.io.IOException;
4 |
5 | import zx.soft.similarity.util.BlankUtils;
6 |
7 | /**
8 | * 义原相似度计算, 实现了SememeParser中定义的抽象方法
9 | *
10 | * @deprecated
11 | */
12 | @Deprecated
13 | public class MySememeParser extends SememeParser {
14 |
15 | public MySememeParser() throws IOException {
16 | super();
17 | }
18 |
19 | /**
20 | * 计算两个义原的相似度
21 | */
22 | @Override
23 | public double getSimilarity(final Sememe sememe1, final Sememe sememe2) {
24 | Sememe sem1 = sememe1;
25 | Sememe sem2 = sememe2;
26 |
27 | if (sememe1 == null || sememe2 == null) {
28 | return 0.0f;
29 | } else if (sememe1.getId() == sememe2.getId()) {
30 | return 1.0f;
31 | }
32 |
33 | //变为深度相同,然后一次上找共同的父节点
34 | int level = sememe1.getDepth() - sememe2.getDepth();
35 | for (int i = 0; i < ((level < 0) ? level * -1 : level); i++) {
36 | if (level > 0) {
37 | sem1 = SEMEMES[sem1.getParentId()];
38 | } else {
39 | sem2 = SEMEMES[sem2.getParentId()];
40 | }
41 | }
42 |
43 | while (sem1.getId() != sem2.getId()) {
44 | // 如果有一个已经到达根节点,仍然不同,则返回0
45 | if (sem1.getId() == sem1.getParentId() || sem2.getId() == sem2.getParentId()) {
46 | return 0.0f;
47 | }
48 |
49 | sem1 = SEMEMES[sem1.getParentId()];
50 | sem2 = SEMEMES[sem2.getParentId()];
51 | }
52 |
53 | return sem1.getDepth() * 2.0f / (sememe1.getDepth() + sememe2.getDepth());
54 | }
55 |
56 | /**
57 | * 计算两个义元之间的相似度,由于义元可能相同,计算结果为其中相似度最大者 similarity = alpha/(distance+alpha),
58 | * 如果两个字符串相同或都为空,直接返回1.0
59 | *
60 | * @param key1 第一个义原字符串
61 | * @param key2 第二个义原字符串
62 | * @return
63 | */
64 | @Override
65 | public double getSimilarity(String item1, String item2) {
66 | if (BlankUtils.isBlankAll(item2, item2)) {
67 | return 1.0;
68 | } else if (BlankUtils.isBlankAtLeastOne(item1, item2)) {
69 | return 0.0;
70 | } else if (item1.equals(item2)) {
71 | return 1.0;
72 | }
73 |
74 | String key1 = item1.trim();
75 | String key2 = item2.trim();
76 |
77 | // 去掉()符号
78 | if ((key1.charAt(0) == '(') && (key1.charAt(key1.length() - 1) == ')')) {
79 | if (key2.charAt(0) == '(' && key2.charAt(key2.length() - 1) == ')') {
80 | key1 = key1.substring(1, key1.length() - 1);
81 | key2 = key2.substring(1, key2.length() - 1);
82 | } else {
83 | return 0.0;
84 | }
85 | }
86 |
87 | // 处理关系义元,即x=y的情况
88 | int pos = key1.indexOf('=');
89 | if (pos > 0) {
90 | int pos2 = key2.indexOf('=');
91 | // 如果是关系义元,则判断前面部分是否相同,如果相同,则转为计算后面部分的相似度,否则为0
92 | if ((pos == pos2) && key1.substring(0, pos).equals(key2.substring(0, pos2))) {
93 | key1 = key1.substring(pos + 1);
94 | key2 = key2.substring(pos2 + 1);
95 | } else {
96 | return 0.0;
97 | }
98 | }
99 |
100 | // 处理符号义元,即前面有特殊符号的义元
101 | String symbol1 = key1.substring(0, 1);
102 | String symbol2 = key2.substring(0, 1);
103 |
104 | for (int i = 0; i < Symbol_Descriptions.length; i++) {
105 | if (symbol1.equals(Symbol_Descriptions[i][0])) {
106 | if (symbol1.equals(symbol2)) {
107 | key1 = item1.substring(1);
108 | key2 = item2.substring(1);
109 | break;
110 | } else {
111 | return 0.0; // 如果不是同一关系符号,则相似度直接返回0
112 | }
113 | }
114 | }
115 |
116 | if ((pos = key1.indexOf("|")) >= 0) {
117 | key1 = key1.substring(pos + 1);
118 | }
119 | if ((pos = key2.indexOf("|")) >= 0) {
120 | key2 = key2.substring(pos + 1);
121 | }
122 |
123 | // 如果两个字符串相等,直接返回距离为0
124 | if (key1.equals(key2)) {
125 | return 1.0;
126 | }
127 |
128 | Integer[] myset1 = getSememes(key1);
129 | Integer[] myset2 = getSememes(key2);
130 |
131 | double similarity = 0.0;
132 | for (int id1 : myset1) {
133 | for (int id2 : myset2) {
134 | double s = getSimilarity(SEMEMES[id1], SEMEMES[id2]);
135 | if (s > similarity) {
136 | similarity = s;
137 | }
138 | }
139 | }
140 |
141 | return similarity;
142 | }
143 |
144 | }
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet/sememe/Sememe.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.word.hownet.sememe;
2 |
/**
 * Basic value object describing one HowNet sememe: its position in the
 * sememe tree (id, parent id, depth) plus the names and the optional
 * definition parsed from one dictionary line.
 *
 * @deprecated
 */
@Deprecated
public class Sememe {

    /** Sememe id. */
    private int id;
    /** Id of the parent (hypernym) sememe. */
    private int parentId;
    /** Depth of the sememe in the sememe tree. */
    private int depth;
    /** Chinese name of the sememe. */
    private String cnWord;
    /** English name of the sememe. */
    private String enWord;
    /** Definition of the sememe; stays null when the source line carries none. */
    private String define;
    /** Sememe type. */
    private int type;

    /**
     * Creates a sememe from one dictionary line. Accepted shapes are
     * <pre>
     * be|是 {relevant,isa}/{relevant,descriptive}
     * official|官 [#organization|组织,#employee|员]
     * amount|多少
     * </pre>
     * The text before {@code |} becomes the English name, the text after it up
     * to the first space the Chinese name, and the trimmed remainder the
     * definition. A line without {@code |} is used as both names.
     *
     * @param id       sememe id
     * @param parentId id of the parent sememe
     * @param depth    depth in the sememe tree
     * @param item     the sememe part of one line read from the dictionary file
     */
    public Sememe(int id, int parentId, int depth, String item) {
        this.id = id;
        this.parentId = parentId;
        this.depth = depth;

        int bar = item.indexOf('|');
        if (bar < 0) {
            // No separator: the whole text serves as both names.
            this.enWord = item;
            this.cnWord = item;
            return;
        }

        this.enWord = item.substring(0, bar);

        // Everything after "|": Chinese name, optionally followed by the definition.
        String rest = item.substring(bar + 1);
        int space = rest.indexOf(' ');
        if (space > 0) {
            this.cnWord = rest.substring(0, space);
            this.define = rest.substring(space).trim();
        } else {
            this.cnWord = rest;
        }
    }

    public int getId() {
        return this.id;
    }

    public void setId(int id) {
        this.id = id;
    }

    public int getParentId() {
        return this.parentId;
    }

    public void setParentId(int parentId) {
        this.parentId = parentId;
    }

    public int getDepth() {
        return this.depth;
    }

    public void setDepth(int depth) {
        this.depth = depth;
    }

    public String getCnWord() {
        return this.cnWord;
    }

    public void setCnWord(String cnWord) {
        this.cnWord = cnWord;
    }

    public String getEnWord() {
        return this.enWord;
    }

    public void setEnWord(String enWord) {
        this.enWord = enWord;
    }

    public String getDefine() {
        return this.define;
    }

    public void setDefine(String define) {
        this.define = define;
    }

    public int getType() {
        return this.type;
    }

    public void setType(int type) {
        this.type = type;
    }

    /** Renders id, parent id, depth, both names and the definition; the type is not included. */
    @Override
    public String toString() {
        return "id=" + id
                + "; parentId=" + parentId
                + "; depth=" + depth
                + "; cnWord=" + cnWord
                + "; enWord=" + enWord
                + "; define=" + define;
    }

}
136 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet/sememe/SememeDictTraverseEvent.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.word.hownet.sememe;
2 |
3 | import java.io.FileOutputStream;
4 | import java.io.PrintWriter;
5 | import java.util.ArrayList;
6 | import java.util.List;
7 |
8 | import javax.xml.parsers.DocumentBuilder;
9 | import javax.xml.parsers.DocumentBuilderFactory;
10 | import javax.xml.transform.OutputKeys;
11 | import javax.xml.transform.Transformer;
12 | import javax.xml.transform.TransformerFactory;
13 | import javax.xml.transform.dom.DOMSource;
14 | import javax.xml.transform.stream.StreamResult;
15 |
16 | import org.w3c.dom.Document;
17 | import org.w3c.dom.Element;
18 |
19 | import zx.soft.similarity.util.TraverseEvent;
20 |
21 | /**
22 | * 实现遍历加载义原信息到义原表中, 义原词典的组织以知网导出的格式为标准,如:
23 | * - entity|实体
24 | * ├ thing|万物 [#time|时间,#space|空间]
25 | * │ ├ physical|物质 [!appearance|外观]
26 | * │ │ ├ animate|生物 [*alive|活着,!age|年龄,*die|死,*metabolize|代谢]
27 | * │ │ │ ├ AnimalHuman|动物 [!sex|性别,*AlterLocation|变空间位置,*StateMental|精神状态]
28 | * │ │ │ │
29 | * 等等
30 | *
31 | * @deprecated
32 | */
33 | @Deprecated
34 | public class SememeDictTraverseEvent implements TraverseEvent {
35 |
36 | /** 义原存放的列表, 按照顺序设置ID,存放到线性表中 */
37 | private List sememeList = null;
38 |
39 | public SememeDictTraverseEvent() {
40 | this.sememeList = new ArrayList<>();
41 | }
42 |
43 | /**
44 | * 获取加载后的义原信息,按照下标顺序存放,树的层次关系通过数组下标表示
45 | * @return
46 | */
47 | public Sememe[] getSememes() {
48 | return sememeList.toArray(new Sememe[sememeList.size()]);
49 | }
50 |
51 | private void processXML(Document document, Element root, int parentId, String fullParentId) {
52 | int position = 1;
53 | for (int i = 0; i < sememeList.size(); i++) {
54 | Sememe sememe = sememeList.get(i);
55 | if (sememe.getParentId() == parentId && sememe.getId() != parentId) {
56 | Element sememeNode = document.createElement("sememe");
57 | String fullId = fullParentId + "-" + (position++);
58 | sememeNode.setAttribute("id", fullId);
59 | sememeNode.setAttribute("cn", sememe.getCnWord());
60 | sememeNode.setAttribute("en", sememe.getEnWord());
61 | if (sememe.getDefine() != null && !sememe.getDefine().equals("")) {
62 | sememeNode.setAttribute("define", sememe.getDefine());
63 | }
64 | root.appendChild(sememeNode);
65 | processXML(document, root, sememe.getId(), fullId);
66 | }
67 | }
68 | }
69 |
70 | /**
71 | * 保存到XML文件中, 新版本的xsimilarity采用xml格式存储义原,其格式为
72 | * <sememes>
73 | * <sememe cn="事件" en="event" id="1"/>
74 | * <sememe cn="静态" en="static" id="1-1"/>
75 | * ...
76 | * </sememes>
77 | * @param xmlFile
78 | * @throws Exception
79 | */
80 | public void saveToXML(String xmlFile) throws Exception {
81 | DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
82 | DocumentBuilder builder = factory.newDocumentBuilder();
83 | Document document = builder.newDocument();
84 | Element root = document.createElement("sememes");
85 | document.appendChild(root);
86 | int position = 1;
87 | for (Sememe sememe : sememeList) {
88 | if (sememe.getId() != sememe.getParentId()) {
89 | continue;
90 | }
91 |
92 | Element sememeNode = document.createElement("sememe");
93 | String fullId = Integer.toString(position++);
94 |
95 | sememeNode.setAttribute("id", fullId);
96 | sememeNode.setAttribute("cn", sememe.getCnWord());
97 | sememeNode.setAttribute("en", sememe.getEnWord());
98 | if (sememe.getDefine() != null && !sememe.getDefine().equals("")) {
99 | sememeNode.setAttribute("define", sememe.getDefine());
100 | }
101 | root.appendChild(sememeNode);
102 | processXML(document, root, sememe.getId(), fullId);
103 | }
104 |
105 | TransformerFactory tf = TransformerFactory.newInstance();
106 | Transformer transformer = tf.newTransformer();
107 | DOMSource source = new DOMSource(document);
108 | transformer.setOutputProperty(OutputKeys.ENCODING, "utf8");
109 | transformer.setOutputProperty(OutputKeys.INDENT, "yes");
110 | PrintWriter pw = new PrintWriter(new FileOutputStream(xmlFile));
111 | StreamResult result = new StreamResult(pw);
112 | transformer.transform(source, result);
113 | }
114 |
115 | /**
116 | * 解析当前义原信息文本行
117 | * 判断读入的一行文本是义元树中的第几层,读入的格式形如:
118 | * - entity|实体
119 | * ├ thing|万物 [#time|时间,#space|空间]
120 | * │ ├ physical|物质 [!appearance|外观]
121 | * │ │ ├ animate|生物 [*alive|活着,!age|年龄,*die|死,*metabolize|代谢]
122 | *
123 | * @param item
124 | * @return 如果是义原,则info[0]返回层次深度(info[0]>=0); info[1]返回具体的义元内容起始位置;否则info[0]返回-1
125 | */
126 | private int[] parseSememeLine(String item) {
127 | int[] info = new int[2];
128 | info[0] = -1;
129 |
130 | int prefixLen = 0; // 前缀的数目,包括空格和"-,│,├"等符号,其中空格和"-"符号算一个长度,其他算2个
131 | for (int i = 0; i < item.length(); i++) {
132 | char ch = item.charAt(i);
133 | if ((ch == ' ') || (ch == '-')) {
134 | prefixLen++;
135 | } else if ((ch == '├') || (ch == '│') || (ch == '└')) {
136 | prefixLen += 2;
137 | } else {
138 | // 遇到非前缀字符,求解,根据前缀深度,如果为2,返回0,即第一级,否则,每增加3,深度加1
139 | if (prefixLen >= 2) {
140 | info[0] = (prefixLen - 2) / 3;
141 | info[1] = i;
142 | }
143 | break;
144 | }
145 | }
146 | return info;
147 | }
148 |
149 | /**
150 | * 根据字符串判断义元的类型
151 | *
152 | * @param item
153 | * @return
154 | */
155 | private int parseSememeType(String item) {
156 | String myItem = item.toLowerCase().trim();
157 | if (myItem.indexOf("event|") == 0)
158 | return SememeType.Event;
159 | else if (myItem.indexOf("entity|") == 0)
160 | return SememeType.Entity;
161 | else if (myItem.indexOf("attribute|") == 0)
162 | return SememeType.Attribute;
163 | else if (myItem.indexOf("quantity|") == 0)
164 | return SememeType.Quantity;
165 | else if (myItem.indexOf("avalue|") == 0)
166 | return SememeType.AValue;
167 | else if (myItem.indexOf("qvalue|") == 0)
168 | return SememeType.QValue;
169 | else if (myItem.indexOf("secondary feature") == 0)
170 | return SememeType.SecondaryFeature;
171 | else if (myItem.indexOf("syntax") == 0)
172 | return SememeType.Syntax;
173 | else if (myItem.indexOf("eventrole and features") == 0)
174 | return SememeType.EventRoleAndFeature;
175 | else
176 | return 0;
177 | }
178 |
179 | /**
180 | * 实现TraverseEvent的实际访问接口, 返回值没有使用
181 | * @see ke.commons.util.TraverseEvent
182 | */
183 | @Override
184 | public boolean visit(String line) {
185 | //判断是否为注释行
186 | if (line.trim().equals("") || line.trim().charAt(0) == '#')
187 | return true;
188 |
189 | //当前义原在整个义原列表中的位置
190 | int position = sememeList.size();
191 |
192 | //解析当前义原信息文本行, info[0]表示当前义原的层次, info[1]表示当前义原的实际信息在文本行中的开始位置
193 | int[] info = parseSememeLine(line);
194 | int curDepth = info[0];
195 |
196 | //如果深度<0,继续
197 | if (info[0] < 0)
198 | return false;
199 |
200 | //取出真正的义原字符串
201 | String sememeString = line.substring(info[1]);
202 |
203 | //深度为0,表示为根节点
204 | if (info[0] == 0) {
205 | Sememe sememe = new Sememe(position, position, 0, sememeString);
206 | int sememeType = parseSememeType(sememeString);
207 | sememe.setType(sememeType);
208 | sememeList.add(sememe);
209 | } else {
210 | Sememe parentSememe = sememeList.get(position - 1);
211 | //最近一个深度比当前深度大1的义原即为该义原的父节点
212 |
213 | while ((parentSememe.getDepth() - curDepth) != -1) {
214 | parentSememe = sememeList.get(parentSememe.getParentId());
215 | }
216 | Sememe sememe = new Sememe(position, parentSememe.getId(), curDepth, sememeString);
217 | sememe.setType(parentSememe.getType());
218 | sememeList.add(sememe);
219 | }
220 |
221 | return true;
222 | }
223 |
224 | }
225 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet/sememe/SememeParser.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.word.hownet.sememe;
2 |
3 | import java.io.IOException;
4 | import java.io.InputStream;
5 | import java.util.Collection;
6 |
7 | import org.slf4j.Logger;
8 | import org.slf4j.LoggerFactory;
9 |
10 | import zx.soft.similarity.Similaritable;
11 | import zx.soft.similarity.util.BlankUtils;
12 | import zx.soft.similarity.util.FileUtils;
13 | import zx.soft.similarity.word.hownet.HownetMeta;
14 |
15 | /**
16 | * 义原解析器, 包括义元数据的加载,义元的组织、索引、查询 以及义元的距离计算和相似度计算等.
17 | * 算法的核心思想请参看论文《汉语词语语义相似度计算研究》
18 | *
19 | * @see zx.soft.similarity.Similaritable
20 | * @deprecated
21 | */
22 | @Deprecated
23 | public abstract class SememeParser implements HownetMeta, Similaritable {
24 |
25 | protected Logger logger = LoggerFactory.getLogger(this.getClass());
26 |
27 | /** 所有的义原都存放到一个数组之中,并且义元的ID号与数组的下标相同 */
28 | protected Sememe[] SEMEMES;
29 |
30 | /** 通过对义原的汉语词义进行索引,根据该索引快速定位义原,找出义原的id,再到sememes中查找 */
31 | private FastSimpleMap sememeMap = null;
32 |
33 | public SememeParser() throws IOException {
34 | InputStream input = this.getClass().getClassLoader().getResourceAsStream("data/sememe.dat");
35 | load(input, "UTF-8");
36 | }
37 |
38 | /**
39 | * 获取两个义原描述串的相似度
40 | * @param sememeName1
41 | * @param sememeName2
42 | * @see ke.commons.similarity.Similariable
43 | * @return
44 | */
45 | @Override
46 | public abstract double getSimilarity(String sememeName1, String sememeName2);
47 |
48 | /**
49 | * 获取两个确定义原的相似度
50 | * @param sememe1
51 | * @param sememe2
52 | * @return
53 | */
54 | public abstract double getSimilarity(Sememe sememe1, Sememe sememe2);
55 |
56 | /**
57 | * 从文件中加载义元知识
58 | *
59 | * @throws IOException
60 | */
61 | public void load(InputStream input, String encoding) throws IOException {
62 | SememeDictTraverseEvent event = new SememeDictTraverseEvent();
63 | logger.info("loading sememe dictionary...");
64 | long time = System.currentTimeMillis();
65 | FileUtils.traverseLines(input, encoding, event);
66 | this.SEMEMES = event.getSememes();
67 |
68 | String[] keys = new String[SEMEMES.length];
69 | Integer[] values = new Integer[SEMEMES.length];
70 |
71 | //设置索引
72 | for (int i = 0; i < SEMEMES.length; i++) {
73 | keys[i] = SEMEMES[i].getCnWord();
74 | values[i] = SEMEMES[i].getId();
75 | }
76 | sememeMap = new FastSimpleMap(keys, values);
77 |
78 | time = System.currentTimeMillis() - time;
79 | logger.info("sememe dictionary load completely. time elapsed:{}", time);
80 | }
81 |
82 | /**
83 | * 根据汉语定义计算义元之间的距离,Integer.MAX_VALUE代表两个义元之间的距离为无穷大,
84 | *
由于可能多个义元有相同的汉语词语,故计算结果为其中距离最小者
85 | *
86 | * @param key1
87 | * @param key2
88 | * @return
89 | */
90 | public int getDistance(String key1, String key2) {
91 | int distance = Integer.MAX_VALUE;
92 |
93 | // 如果两个字符串相等,直接返回距离为0
94 | if (key1.equals(key2)) {
95 | return 0;
96 | }
97 |
98 | Integer[] semArray1 = getSememes(key1);
99 | Integer[] semArray2 = getSememes(key2);
100 |
101 | // 如果key1或者key2不是义元,并且key1<>key2,则返回无穷大
102 | if (semArray1.length == 0 || semArray2.length == 0) {
103 | return Integer.MAX_VALUE;
104 | }
105 |
106 | for (int i : semArray1) {
107 | for (int j : semArray2) {
108 | int d = getDistance(SEMEMES[i], SEMEMES[j]);
109 | if (d < distance) {
110 | distance = d;
111 | }
112 | }
113 | }
114 |
115 | return distance;
116 | }
117 |
118 | /**
119 | * 获取两个义元在义原树中的距离
120 | *
121 | * @param sem1
122 | * 第一个义原
123 | * @param sem2
124 | * 第二个义原
125 | * @return 两个义原的距离
126 | */
127 | public int getDistance(Sememe sem1, Sememe sem2) {
128 | Sememe mysem1 = sem1;
129 | Sememe mysem2 = sem2;
130 | int distance = 0;
131 |
132 | if (mysem1 == null || mysem2 == null)
133 | return Integer.MAX_VALUE;
134 |
135 | //变为深度相同,然后一次上找共同的父节点
136 | int level = mysem1.getDepth() - mysem2.getDepth();
137 | for (int i = 0; i < ((level < 0) ? level * -1 : level); i++) {
138 | if (level > 0)
139 | mysem1 = SEMEMES[mysem1.getParentId()];
140 | else
141 | mysem2 = SEMEMES[mysem2.getParentId()];
142 | distance++;
143 | }
144 |
145 | //从不同的分支(深度相同)同时向上寻找共同的祖先节点
146 | while (mysem1.getId() != mysem2.getId()) {
147 | // 如果已经到达根节点,仍然不同,则返回无穷大(-1)
148 | if (mysem1.getId() == mysem1.getParentId() || mysem2.getId() == mysem2.getParentId()) {
149 | distance = Integer.MAX_VALUE;
150 | break;
151 | }
152 |
153 | mysem1 = SEMEMES[mysem1.getParentId()];
154 | mysem2 = SEMEMES[mysem2.getParentId()];
155 | distance += 2;
156 | }
157 |
158 | return distance;
159 | }
160 |
161 | /**
162 | * 获取从该义元到根节点的路径表示字符串
163 | *
164 | * @param key
165 | * @return
166 | */
167 | public String getPath(String key) {
168 | StringBuilder path = new StringBuilder();
169 |
170 | Sememe sem = getSememe(key);
171 | while (sem != null && sem.getId() != sem.getParentId()) {
172 | path.insert(0, "->" + sem.getCnWord());
173 | sem = SEMEMES[sem.getParentId()];
174 | }
175 |
176 | if (sem != null) {
177 | path.insert(0, "->" + sem.getCnWord());
178 | }
179 | path.insert(0, "START");
180 | return path.toString();
181 | }
182 |
183 | /**
184 | * 根据义原的名字,获取该义原的位置信息,义原体系中有时会有一个名字对应多个义原,一并返回到
185 | * 义原数组中
186 | * @param sememeName
187 | * @return
188 | */
189 | public Integer[] getSememes(String sememeName) {
190 | Collection ids = sememeMap.get(sememeName);
191 |
192 | return ids.toArray(new Integer[ids.size()]);
193 | }
194 |
195 | /**
196 | * 获取其中的一个义原,大部分义原就只有一个
197 | * @param sememeName
198 | * @return
199 | */
200 | public Sememe getSememe(String sememeName) {
201 | Integer[] ids = getSememes(sememeName);
202 |
203 | if (BlankUtils.isBlank(ids)) {
204 | return null;
205 | } else {
206 | return SEMEMES[ids[0]];
207 | }
208 | }
209 |
210 | /**
211 | * 过滤义原字符串,去掉其中的英文部分
212 | * @param sememeString
213 | * @return
214 | */
215 | protected String filterSememeString(String sememeString) {
216 | int pos = sememeString.indexOf("|");
217 | if (pos >= 0) {
218 | sememeString = sememeString.substring(pos + 1);
219 | }
220 | return sememeString;
221 | }
222 |
223 | }
224 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet/sememe/SememeType.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.word.hownet.sememe;
2 |
/**
 * Numeric codes for the sememe types.
 * <ul>
 * <li>1: Event</li>
 * <li>2: Entity</li>
 * <li>3: Attribute</li>
 * <li>4: Quantity</li>
 * <li>5: aValue (attribute value)</li>
 * <li>6: qValue (quantity value)</li>
 * <li>7: Secondary Feature</li>
 * <li>8: Syntax</li>
 * <li>9: EventRole (and features)</li>
 * <li>10: EventFeatures</li>
 * <li>0: unknown</li>
 * </ul>
 * Codes 1-7 are basic sememes, 8 is the syntax sememe, 9 and 10 are relation sememes.
 *
 * @deprecated
 */
@Deprecated
public interface SememeType {

    // Interface fields are implicitly public static final.

    /** Type code for Event sememes. */
    int Event = 1;

    /** Type code for Entity sememes. */
    int Entity = 2;

    /** Type code for Attribute sememes. */
    int Attribute = 3;

    /** Type code for Quantity sememes. */
    int Quantity = 4;

    /** Type code for aValue (attribute value) sememes. */
    int AValue = 5;

    /** Type code for qValue (quantity value) sememes. */
    int QValue = 6;

    /** Type code for Secondary Feature sememes. */
    int SecondaryFeature = 7;

    /** Type code for Syntax sememes. */
    int Syntax = 8;

    /** Type code for EventRole sememes. */
    int EventRoleAndFeature = 9;

    /** Type code for EventFeatures sememes. */
    int EventFeature = 10;

    /** Type code for an unknown type. */
    int Unknown = 0;

}
60 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet2/concept/Concept.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.word.hownet2.concept;
2 |
3 | import java.util.ArrayList;
4 | import java.util.HashSet;
5 | import java.util.List;
6 | import java.util.Set;
7 | import java.util.StringTokenizer;
8 |
9 | import zx.soft.similarity.word.hownet.HownetMeta;
10 |
11 | /**
12 | * 知网的概念表示类
example和英文部分对于相似度的计算不起作用,考虑到内存开销, 在概念的表示中去掉了这部分数据的对应定义
13 | *
14 | */
15 | public class Concept implements HownetMeta {
16 |
17 | /** 中文概念名称 */
18 | protected String word;
19 | /** 词性: Part of Speech */
20 | protected String pos;
21 | /** 定义 */
22 | protected String define;
23 |
24 | /** 是否是实词,false表示为虚词, 一般为实词 */
25 | protected boolean bSubstantive;
26 | /** 第一基本义原 */
27 | protected String mainSememe;
28 | /** 其他基本义原 */
29 | protected String[] secondSememes;
30 | /** 关系义元原 */
31 | protected String[] relationSememes;
32 | /** 关系符号描述 */
33 | protected String[] symbolSememes;
34 |
35 | static String[][] Concept_Type = { { "=", "事件" }, { "aValue|属性值", "属性值" }, { "qValue|数量值", "数量值" },
36 | { "attribute|属性", "属性" }, { "quantity|数量", "数量" }, { "unit|", "单位" }, { "%", "部件" } };
37 |
38 | public Concept(String word, String pos, String def) {
39 | this.word = word;
40 | this.pos = pos;
41 | this.define = (def == null) ? "" : def.trim();
42 |
43 | // 虚词用{***}表示
44 | if (define.length() > 0 && define.charAt(0) == '{' && define.charAt(define.length() - 1) == '}') {
45 | this.bSubstantive = false;
46 | } else {
47 | this.bSubstantive = true;
48 | }
49 |
50 | parseDefine();
51 | }
52 |
53 | /**
54 | * 处理定义,把定义分为第一基本义元、其他基本义元、关系义元和符号义元四类
55 | */
56 | private void parseDefine() {
57 | List secondList = new ArrayList<>(); //其他基本义原
58 | List relationList = new ArrayList<>(); //关系义原
59 | List symbolList = new ArrayList<>(); //符号义原
60 |
61 | String tokenString = this.define;
62 |
63 | //如果不是实词,则处理“{}”中的内容
64 | if (!this.bSubstantive) {
65 | tokenString = define.substring(1, define.length() - 1);
66 | }
67 |
68 | StringTokenizer token = new StringTokenizer(tokenString, ",", false);
69 |
70 | // 第一个为第一基本义元
71 | if (token.hasMoreTokens()) {
72 | this.mainSememe = token.nextToken();
73 | }
74 |
75 | main_loop: while (token.hasMoreTokens()) {
76 | String item = token.nextToken();
77 | if (item.equals(""))
78 | continue;
79 |
80 | // 先判断是否为符号义元
81 | String symbol = item.substring(0, 1);
82 | for (int i = 0; i < Symbol_Descriptions.length; i++) {
83 | if (symbol.equals(Symbol_Descriptions[i][0])) {
84 | symbolList.add(item);
85 | continue main_loop;
86 | }
87 | }
88 |
89 | //如果不是符号义元,则进一步判断是关系义元还是第二基本义元, 带有“=”表示关系义原
90 | if (item.indexOf('=') > 0) {
91 | relationList.add(item);
92 | } else {
93 | secondList.add(item);
94 | }
95 | }
96 |
97 | this.secondSememes = secondList.toArray(new String[secondList.size()]);
98 | this.relationSememes = relationList.toArray(new String[relationList.size()]);
99 | this.symbolSememes = symbolList.toArray(new String[symbolList.size()]);
100 |
101 | }
102 |
103 | /**
104 | * 获取第一义元
105 | *
106 | * @return
107 | */
108 | public String getMainSememe() {
109 | return mainSememe;
110 | }
111 |
112 | /**
113 | * 获取其他基本义元描述
114 | *
115 | * @return
116 | */
117 | public String[] getSecondSememes() {
118 | return secondSememes;
119 | }
120 |
121 | /**
122 | * 获取关系义元描述
123 | *
124 | * @return
125 | */
126 | public String[] getRelationSememes() {
127 | return relationSememes;
128 | }
129 |
130 | /**
131 | * 获取符号义元描述
132 | *
133 | * @return
134 | */
135 | public String[] getSymbolSememes() {
136 | return symbolSememes;
137 | }
138 |
139 | public Set getAllSememeNames() {
140 | Set names = new HashSet<>();
141 |
142 | //加入主义原
143 | names.add(getMainSememe());
144 |
145 | //加入关系义原
146 | for (String item : getRelationSememes()) {
147 | names.add(item.substring(item.indexOf("=") + 1));
148 | }
149 |
150 | //加入符号义原
151 | for (String item : getSymbolSememes()) {
152 | names.add(item.substring(1));
153 | }
154 |
155 | //加入其他义原集合
156 | for (String item : getSecondSememes()) {
157 | names.add(item);
158 | }
159 | return names;
160 | }
161 |
162 | @Override
163 | public String toString() {
164 | StringBuilder sb = new StringBuilder();
165 | sb.append("name=");
166 | sb.append(this.word);
167 | sb.append("; pos=");
168 | sb.append(this.pos);
169 | sb.append("; define=");
170 | sb.append(this.define);
171 | sb.append("; 第一基本义元:[" + mainSememe);
172 |
173 | sb.append("]; 其他基本义元描述:[");
174 | for (String sem : secondSememes) {
175 | sb.append(sem);
176 | sb.append(";");
177 | }
178 |
179 | sb.append("]; [关系义元描述:");
180 | for (String sem : relationSememes) {
181 | sb.append(sem);
182 | sb.append(";");
183 | }
184 |
185 | sb.append("]; [关系符号描述:");
186 | for (String sem : symbolSememes) {
187 | sb.append(sem);
188 | sb.append(";");
189 | }
190 | sb.append("]");
191 | return sb.toString();
192 | }
193 |
194 | /**
195 | * 是实词还是虚词
196 | *
197 | * @return true:实词;false:虚词
198 | */
199 | public boolean isSubstantive() {
200 | return this.bSubstantive;
201 | }
202 |
203 | public String getWord() {
204 | return word;
205 | }
206 |
207 | public void setWord(String word) {
208 | this.word = word;
209 | }
210 |
211 | public String getPos() {
212 | return pos;
213 | }
214 |
215 | public void setPos(String pos) {
216 | this.pos = pos;
217 | }
218 |
219 | public String getDefine() {
220 | return define;
221 | }
222 |
223 | public void setDefine(String define) {
224 | this.define = define;
225 | }
226 |
227 | /**
228 | * 获取该概念的类型
229 | *
230 | * @return
231 | */
232 | public String getType() {
233 | for (int i = 0; i < Concept_Type.length; i++) {
234 | if (define.toUpperCase().indexOf(Concept_Type[i][0].toUpperCase()) >= 0) {
235 | return Concept_Type[i][1];
236 | }
237 | }
238 | return "普通概念";
239 | }
240 |
241 | @Override
242 | public int hashCode() {
243 | return define == null ? word.hashCode() : define.hashCode();
244 | }
245 |
246 | @Override
247 | public boolean equals(Object anObject) {
248 | if (anObject instanceof Concept) {
249 | Concept c = (Concept) anObject;
250 | return word.equals(c.word) && define.equals(c.define);
251 | } else {
252 | return false;
253 | }
254 | }
255 |
256 | }
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet2/concept/ConceptDictTraverseEvent.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.word.hownet2.concept;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.File;
5 | import java.io.FileOutputStream;
6 | import java.io.InputStream;
7 | import java.io.InputStreamReader;
8 | import java.io.PrintWriter;
9 | import java.util.ArrayList;
10 | import java.util.Arrays;
11 | import java.util.List;
12 |
13 | import javax.xml.parsers.DocumentBuilder;
14 | import javax.xml.parsers.DocumentBuilderFactory;
15 | import javax.xml.transform.OutputKeys;
16 | import javax.xml.transform.Transformer;
17 | import javax.xml.transform.TransformerFactory;
18 | import javax.xml.transform.dom.DOMSource;
19 | import javax.xml.transform.stream.StreamResult;
20 |
21 | import org.w3c.dom.Document;
22 | import org.w3c.dom.Element;
23 |
24 | import zx.soft.similarity.util.TraverseEvent;
25 |
26 | /**
27 | * 实现遍历加载概念信息到概念表中, 概念词典的组织以知网导出的格式为标准,格式如下:
28 | * 阿斗 N human|人,ProperName|专,past|昔
29 | * 阿爸 N human|人,family|家,male|男
30 | * 即: <概念> <空格或者跳格> <词性> <空格或者跳格> <定义>"
31 | *
32 | * 概念保存到数组中,没有保存到Map中,可以降低对内存空间的使用
33 | *
34 | */
35 | public class ConceptDictTraverseEvent implements TraverseEvent {
36 |
37 | private List conceptList = null;
38 |
39 | public ConceptDictTraverseEvent() {
40 | conceptList = new ArrayList<>();
41 | }
42 |
43 | public Concept[] getConcepts() {
44 | Concept[] concepts = conceptList.toArray(new Concept[conceptList.size()]);
45 | Arrays.sort(concepts);
46 | return concepts;
47 | }
48 |
49 | /**
50 | * 读取概念词典中的一行,并进行解析处理
51 | */
52 | @Override
53 | public boolean visit(String line) {
54 | String word = null;
55 | String pos = null;
56 | String define = "";
57 | char ch;
58 |
59 | //以符号//开始的是注释行
60 | if (line.startsWith("//")) {
61 | return true;
62 | }
63 |
64 | int lastPosition = 0; //最近一次处理内容的有意义的开始位置
65 | int processFlag = 0; //当前处理部分的标志 0:处理word; 1:词性;2:定义
66 | //解析出一行中的概念各项数据
67 | loop: for (int position = 0; position < line.length(); position++) {
68 | ch = line.charAt(position);
69 |
70 | if ((ch == ' ') || (ch == '\t') || (position == (line.length() - 1))) {
71 | String item = line.substring(lastPosition, (position == (line.length() - 1)) ? (position + 1)
72 | : position);
73 | switch (processFlag) {
74 | case 0:
75 | word = item;
76 | processFlag++;
77 | break;
78 | case 1:
79 | pos = item;
80 | processFlag++;
81 | break;
82 | case 2:
83 | //define = item;
84 | //processFlag++;
85 | define = line.substring(lastPosition).trim();
86 | break loop;
87 | case 3:
88 | System.out.println(line);
89 | break;
90 | }
91 |
92 | for (; (position < line.length()); position++) {
93 | ch = line.charAt(position);
94 | if ((ch != ' ') && (ch != '\t')) {
95 | lastPosition = position;
96 | break;
97 | }
98 | }
99 |
100 | }
101 | }
102 | conceptList.add(new Concept(word, pos, define));
103 | return true;
104 | }
105 |
106 | public void saveToXML(File xmlFile) throws Exception {
107 | InputStream input = this.getClass().getClassLoader().getResourceAsStream("data/concept.dat");
108 | BufferedReader in = new BufferedReader(new InputStreamReader(input, "utf8"));
109 |
110 | DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
111 | DocumentBuilder builder = factory.newDocumentBuilder();
112 | Document document = builder.newDocument();
113 | Element root = document.createElement("concepts");
114 | document.appendChild(root);
115 |
116 | String line = null;
117 |
118 | while ((line = in.readLine()) != null) {
119 | saveLineToXML(document, root, line);
120 | }
121 |
122 | input.close();
123 | in.close();
124 |
125 | TransformerFactory tf = TransformerFactory.newInstance();
126 | Transformer transformer = tf.newTransformer();
127 | DOMSource source = new DOMSource(document);
128 | transformer.setOutputProperty(OutputKeys.ENCODING, "utf8");
129 | transformer.setOutputProperty(OutputKeys.INDENT, "yes");
130 | PrintWriter pw = new PrintWriter(new FileOutputStream(xmlFile));
131 | StreamResult result = new StreamResult(pw);
132 | transformer.transform(source, result);
133 | }
134 |
135 | /**
136 | * 读取概念词典中的一行,并进行解析处理
137 | */
138 | private boolean saveLineToXML(Document document, Element root, String line) {
139 | String word = null;
140 | String pos = null;
141 | String define = "";
142 | char ch;
143 |
144 | //以符号//开始的是注释行
145 | if (line.startsWith("//")) {
146 | return true;
147 | }
148 |
149 | int lastPosition = 0; //最近一次处理内容的有意义的开始位置
150 | int processFlag = 0; //当前处理部分的标志 0:处理word; 1:词性;2:定义
151 | //解析出一行中的概念各项数据
152 | loop: for (int position = 0; position < line.length(); position++) {
153 | ch = line.charAt(position);
154 |
155 | if ((ch == ' ') || (ch == '\t') || (position == (line.length() - 1))) {
156 | String item = line.substring(lastPosition, (position == (line.length() - 1)) ? (position + 1)
157 | : position);
158 | switch (processFlag) {
159 | case 0:
160 | word = item;
161 | processFlag++;
162 | break;
163 | case 1:
164 | pos = item;
165 | processFlag++;
166 | break;
167 | case 2:
168 | //define = item;
169 | //processFlag++;
170 | define = line.substring(lastPosition).trim();
171 | break loop;
172 | case 3:
173 | System.out.println(line);
174 | break;
175 | }
176 |
177 | for (; (position < line.length()); position++) {
178 | ch = line.charAt(position);
179 | if ((ch != ' ') && (ch != '\t')) {
180 | lastPosition = position;
181 | break;
182 | }
183 | }
184 |
185 | }
186 | }
187 |
188 | Element e = document.createElement("c");
189 | e.setAttribute("w", word);
190 | e.setAttribute("p", pos);
191 | e.setAttribute("d", define);
192 | root.appendChild(e);
193 | return true;
194 | }
195 |
196 | public static void main(String[] args) throws Exception {
197 | new ConceptDictTraverseEvent().saveToXML(new File("/home/xiatian/Desktop/concept.xml"));
198 | }
199 |
200 | }
201 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet2/concept/ConceptLinkedList.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.word.hownet2.concept;
2 |
3 | import java.util.LinkedList;
4 |
5 | /**
6 | * 用于概念处理的LinkedList
7 | *
8 | * @param
9 | */
10 | public class ConceptLinkedList extends LinkedList {
11 |
12 | private static final long serialVersionUID = -1889819083192992375L;
13 |
14 | /**
15 | * 删除链表中最后面的size个元素
16 | * @param size
17 | */
18 | public void removeLast(int size) {
19 | for (int i = 0; i < size; i++) {
20 | this.removeLast();
21 | }
22 | }
23 |
24 | /**
25 | * 根据概念的定义判断是否已经加入到链表中
26 | * @param concept
27 | */
28 | public void addByDefine(Concept concept) {
29 | for (Concept c : this) {
30 | if (c.getDefine().equals(concept.getDefine())) {
31 | return;
32 | }
33 | }
34 |
35 | this.add(concept);
36 | }
37 |
38 | }
39 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet2/concept/LiuConceptParser.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.word.hownet2.concept;
2 |
3 | import java.io.IOException;
4 | import java.util.Collection;
5 |
6 | import zx.soft.similarity.util.BlankUtils;
7 | import zx.soft.similarity.word.hownet2.sememe.BaseSememeParser;
8 | import zx.soft.similarity.word.hownet2.sememe.LiuqunSememeParser;
9 |
10 | /**
11 | * 相似度计算方式,对概念解析的处理方式
12 | *
13 | */
14 | public class LiuConceptParser extends BaseConceptParser {
15 |
16 | private static LiuConceptParser instance = null;
17 |
18 | public static LiuConceptParser getInstance() {
19 | if (instance == null) {
20 | try {
21 | instance = new LiuConceptParser();
22 | } catch (IOException e) {
23 | e.printStackTrace();
24 | }
25 | }
26 |
27 | return instance;
28 | }
29 |
30 | private LiuConceptParser(BaseSememeParser sememeParser) throws IOException {
31 | super(sememeParser);
32 | }
33 |
34 | private LiuConceptParser() throws IOException {
35 | super(new LiuqunSememeParser());
36 | }
37 |
38 | @Override
39 | protected double calculate(double sim_v1, double sim_v2, double sim_v3, double sim_v4) {
40 | return beta1 * sim_v1 + beta2 * sim_v1 * sim_v2 + beta3 * sim_v1 * sim_v2 * sim_v3 + beta4 * sim_v1 * sim_v2
41 | * sim_v3 * sim_v4;
42 | }
43 |
44 | @Override
45 | public double getSimilarity(String word1, String word2) {
46 | double similarity = 0.0;
47 |
48 | // 如果两个句子相同,则直接返回1.0
49 | if (word1.equals(word2)) {
50 | return 1.0;
51 | }
52 |
53 | Collection concepts1 = getConcepts(word1);
54 | Collection concepts2 = getConcepts(word2);
55 |
56 | //如果是blank,则说明是未登录词, 需要计算组合概念
57 | if (BlankUtils.isBlank(concepts1) || BlankUtils.isBlank(concepts2)) {
58 | return 0.0;
59 | }
60 |
61 | //两个for循环分别计算词语所有可能的概念的相似度
62 | for (Concept c1 : concepts1) {
63 | for (Concept c2 : concepts2) {
64 | double v = getSimilarity(c1, c2);
65 |
66 | if (v > similarity) {
67 | similarity = v;
68 | }
69 |
70 | if (similarity == 1.0) {
71 | break;
72 | }
73 | }
74 | }
75 |
76 | return similarity;
77 | }
78 |
79 | }
80 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet2/concept/concept.xml.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ml-distribution/semantic-similarity/8deb4382a41ad94a82db15ea945c6bfa8ac103f0/src/main/java/zx/soft/similarity/word/hownet2/concept/concept.xml.gz
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet2/sememe/BaseSememeParser.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.word.hownet2.sememe;
2 |
3 | import java.io.IOException;
4 | import java.io.InputStream;
5 | import java.util.zip.GZIPInputStream;
6 |
7 | import javax.xml.namespace.QName;
8 | import javax.xml.stream.XMLEventReader;
9 | import javax.xml.stream.XMLInputFactory;
10 | import javax.xml.stream.events.StartElement;
11 | import javax.xml.stream.events.XMLEvent;
12 |
13 | import org.slf4j.Logger;
14 | import org.slf4j.LoggerFactory;
15 |
16 | import zx.soft.similarity.Similaritable;
17 | import zx.soft.similarity.word.hownet.HownetMeta;
18 |
19 | import com.google.common.collect.HashMultimap;
20 | import com.google.common.collect.Multimap;
21 |
22 | /**
23 | * 义原解析器基类,所有义原存储在xml文件中(当前package中的sememe.xml.tar.gz文件)。
24 | * 算法的核心思想请参看论文《汉语词语语义相似度计算研究》或《中文信息相似度计算理论与方法》一书第三章
25 | *
26 | * 为提高运算速度,义原的加载方式做了调整,只把义原的汉语定义和对应的Id加入到MultiMap对象中,并通过义原的层次化Id计算义原之间的相似度。
27 | *
28 | * @see {@link zx.soft.similarity.Similaritable}
29 | */
30 | public abstract class BaseSememeParser implements HownetMeta, Similaritable {
31 |
32 | protected Logger logger = LoggerFactory.getLogger(this.getClass());
33 |
34 | /** 所有的义原都存放到一个MultiMap, Key为Sememe的中文定义, Value为义原的Id */
35 | protected static Multimap SEMEMES = null;
36 |
37 | public BaseSememeParser() throws IOException {
38 | if (SEMEMES != null) {
39 | return;
40 | }
41 |
42 | SEMEMES = HashMultimap.create();
43 |
44 | InputStream input = this.getClass().getClassLoader().getResourceAsStream("data/sememe.xml.gz");
45 | input = new GZIPInputStream(input);
46 | load(input);
47 | }
48 |
49 | /**
50 | * 从文件中加载义元知识
51 | *
52 | * @throws IOException
53 | */
54 | public void load(InputStream input) throws IOException {
55 | System.out.print("loading sememes...");
56 | long time = System.currentTimeMillis();
57 | try {
58 | XMLInputFactory inputFactory = XMLInputFactory.newInstance();
59 | XMLEventReader xmlEventReader = inputFactory.createXMLEventReader(input);
60 |
61 | int count = 0;
62 | while (xmlEventReader.hasNext()) {
63 | XMLEvent event = xmlEventReader.nextEvent();
64 |
65 | if (event.isStartElement()) {
66 | StartElement startElement = event.asStartElement();
67 | if (startElement.getName().toString().equals("sememe")) {
68 | String cnWord = startElement.getAttributeByName(QName.valueOf("cn")).getValue();
69 | String id = startElement.getAttributeByName(QName.valueOf("id")).getValue();
70 | SEMEMES.put(cnWord, id);
71 | count++;
72 | if (count % 100 == 0) {
73 | System.out.print(".");
74 | }
75 | }
76 | }
77 | }
78 | input.close();
79 | } catch (Exception e) {
80 | throw new IOException(e);
81 | }
82 | time = System.currentTimeMillis() - time;
83 | System.out.println("\ncomplete!. time elapsed: " + (time / 1000) + "s");
84 | }
85 |
86 | /**
87 | * 计算两个义原之间的关联度
88 | *
89 | * @param sememeName1
90 | * @param sememeName2
91 | * @return
92 | */
93 | public double getAssociation(String sememeName1, String sememeName2) {
94 | return 0.0;
95 | }
96 |
97 | }
98 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet2/sememe/LiuqunSememeParser.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.word.hownet2.sememe;
2 |
3 | import java.io.IOException;
4 | import java.util.Collection;
5 |
6 | /**
7 | * 计算义原相似度的方法, 实现了SememeParser中定义的抽象方法
8 | *
9 | */
10 | public class LiuqunSememeParser extends BaseSememeParser {
11 |
12 | /** 计算义元相似度的可调节的参数,默认为1.6 */
13 | private final float alpha = 1.6f;
14 |
15 | public LiuqunSememeParser() throws IOException {
16 | super();
17 | }
18 |
19 | /**
20 | * 计算两个义元之间的相似度,由于义元可能相同,计算结果为其中相似度最大者
21 | *
similarity = alpha/(distance+alpha)
22 | *
23 | * @param key1
24 | * @param key2
25 | * @return
26 | */
27 | @Override
28 | public double getSimilarity(String item1, String item2) {
29 | int pos;
30 |
31 | // 如果为空串,直接返回0
32 | if (item1 == null || item2 == null || item1.equals("") || item2.equals(""))
33 | return 0.0;
34 |
35 | String key1 = item1.trim();
36 | String key2 = item2.trim();
37 |
38 | // 去掉()符号
39 | if ((key1.charAt(0) == '(') && (key1.charAt(key1.length() - 1) == ')')) {
40 | if (key2.charAt(0) == '(' && key2.charAt(key2.length() - 1) == ')') {
41 | key1 = key1.substring(1, key1.length() - 1);
42 | key2 = key2.substring(1, key2.length() - 1);
43 | } else {
44 | return 0.0;
45 | }
46 | }
47 |
48 | // 处理关系义元,即x=y的情况
49 | if ((pos = key1.indexOf('=')) > 0) {
50 | int pos2 = key2.indexOf('=');
51 | // 如果是关系义元,则判断前面部分是否相同,如果相同,则转为计算后面部分的相似度,否则为0
52 | if ((pos == pos2) && key1.substring(0, pos).equals(key2.substring(0, pos2))) {
53 | key1 = key1.substring(pos + 1);
54 | key2 = key2.substring(pos2 + 1);
55 | } else {
56 | return 0.0;
57 | }
58 | }
59 |
60 | // 处理符号义元,即前面有特殊符号的义元
61 | String symbol1 = key1.substring(0, 1);
62 | String symbol2 = key2.substring(0, 1);
63 |
64 | for (int i = 0; i < Symbol_Descriptions.length; i++) {
65 | if (symbol1.equals(Symbol_Descriptions[i][0])) {
66 | if (symbol1.equals(symbol2)) {
67 | key1 = item1.substring(1);
68 | key2 = item2.substring(1);
69 | break;
70 | } else {
71 | return 0.0; // 如果不是同一关系符号,则相似度直接返回0
72 | }
73 | }
74 | }
75 |
76 | if ((pos = key1.indexOf("|")) >= 0) {
77 | key1 = key1.substring(pos + 1);
78 | }
79 | if ((pos = key2.indexOf("|")) >= 0) {
80 | key2 = key2.substring(pos + 1);
81 | }
82 |
83 | int distance = getMinDistance(key1, key2);
84 | return alpha / (distance + alpha);
85 | }
86 |
87 | /**
88 | * 根据汉语定义计算义原之间的距离,Integer.MAX_VALUE代表两个义元之间的距离为无穷大,由于可能多个义元有相同的汉语词语,
89 | * 故计算结果为其中距离最小者
90 | *
91 | * @param key1
92 | * @param key2
93 | * @return
94 | */
95 | public int getMinDistance(String sememe1, String sememe2) {
96 | int distance = Integer.MAX_VALUE;
97 |
98 | // 如果两个字符串相等,直接返回距离为0
99 | if (sememe1.equals(sememe2)) {
100 | return 0;
101 | }
102 |
103 | Collection sememeIds1 = SEMEMES.get(sememe1);
104 | Collection sememeIds2 = SEMEMES.get(sememe2);
105 |
106 | // 如果sememe1或者sememe2不是义元,则返回无穷大
107 | if (sememeIds1.size() == 0 || sememeIds1.size() == 0) {
108 | return Integer.MAX_VALUE;
109 | }
110 |
111 | for (String id1 : sememeIds1) {
112 | for (String id2 : sememeIds2) {
113 | int d = getDistance(id1, id2);
114 | if (d < distance) {
115 | distance = d;
116 | }
117 | }
118 | }
119 |
120 | return distance;
121 | }
122 |
123 | /**
124 | * 根据义原的具有层次的Id获取两个义原之间的语义距离
125 | * @param id1
126 | * @param id2
127 | * @return
128 | */
129 | int getDistance(String id1, String id2) {
130 | // 两个Id相同的位置终止地方
131 | int position = 0;
132 | String[] array1 = id1.split("-");
133 | String[] array2 = id2.split("-");
134 | for (position = 0; position < array1.length && position < array2.length; position++) {
135 | if (!array1[position].equals(array2[position])) {
136 | return array1.length + array2.length - position - position;
137 | }
138 | }
139 |
140 | if (array1.length == array2.length) {
141 | return 0;
142 | } else if (array1.length == position) {
143 | return array2.length - position;
144 | } else {
145 | return array1.length - position;
146 | }
147 | }
148 |
149 | }
150 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet2/sememe/Sememe.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.word.hownet2.sememe;
2 |
3 | /**
4 | * 描述知网义原的基本对象, 出于性能考虑,把未用到的英文名称、定义等在加载时忽略, 更准确的做法是以[英文定义|中文定义]
5 | * 作为一个整理进行处理,不过绝大多数只根据中文定义就可以标识出来,因此忽略不计。
6 | * 义原编号采用父节点Id-子节点Id编码方式,如:
7 | * <sememe cn="成功" define="{experiencer,scope}" en="succeed" id="1-1-2-1-4-5"/>
8 | * 义原的id表明了义原之间的上下位关系和义原的深度。
9 | *
10 | */
11 | public class Sememe {
12 |
13 | /**
14 | * 义原编号,采用父节点Id-子节点Id编码方式,如<sememe cn="成功" define="{experiencer,scope}" en="succeed" id="1-1-2-1-4-5"/>
15 | * id表明了义原之间的上下位关系
16 | */
17 | private String id;
18 | /** 义原的中文名称*/
19 | private String cnWord;
20 | /** 义原的英文名称 */
21 | private String enWord;
22 | /** 义原的定义,如果没有(例如数量),则为空串 */
23 | private String define;
24 |
25 | /**
26 | * 每一行的形式为:be|是 {relevant,isa}/{relevant,descriptive}
27 | *
或者 official|官 [#organization|组织,#employee|员]
28 | *
或者 amount|多少
29 | *
把相应的部分赋予不同的属性
30 | * 出于性能考虑,把未用到的英文名称、定义等忽略
31 | * @param id
32 | * @param parentId
33 | * @param item 读取文件中的一行
34 | */
35 | public Sememe(String id, String en, String cn, String define) {
36 | this.id = id;
37 | this.cnWord = cn;
38 | //为提高效率,减少内存空间利用,可去掉以下两行
39 | this.enWord = en;
40 | this.define = define;
41 | }
42 |
43 | public String getId() {
44 | return id;
45 | }
46 |
47 | public void setId(String id) {
48 | this.id = id;
49 | }
50 |
51 | public String getCnWord() {
52 | return cnWord;
53 | }
54 |
55 | public void setCnWord(String cnWord) {
56 | this.cnWord = cnWord;
57 | }
58 |
59 | public String getEnWord() {
60 | return enWord;
61 | }
62 |
63 | public void setEnWord(String enWord) {
64 | this.enWord = enWord;
65 | }
66 |
67 | public String getDefine() {
68 | return define;
69 | }
70 |
71 | public void setDefine(String define) {
72 | this.define = define;
73 | }
74 |
75 | public int getType() {
76 | char ch = id.charAt(0);
77 | switch (ch) {
78 | case '1':
79 | return SememeType.Event;
80 | case '2':
81 | return SememeType.Entity;
82 | case '3':
83 | return SememeType.Attribute;
84 | case '4':
85 | return SememeType.Quantity;
86 | case '5':
87 | return SememeType.AValue;
88 | case '6':
89 | return SememeType.QValue;
90 | case '7':
91 | return SememeType.SecondaryFeature;
92 | case '8':
93 | return SememeType.Syntax;
94 | case '9':
95 | return SememeType.EventRoleAndFeature;
96 | default:
97 | return 0;
98 | }
99 | }
100 |
101 | @Override
102 | public String toString() {
103 | StringBuilder sb = new StringBuilder();
104 | sb.append("id=");
105 | sb.append(id);
106 | sb.append("; cnWord=");
107 | sb.append(cnWord);
108 | sb.append("; enWord=");
109 | sb.append(enWord);
110 | sb.append("; define=");
111 | sb.append(define);
112 | return sb.toString();
113 | }
114 |
115 | }
116 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet2/sememe/SememeType.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.word.hownet2.sememe;
2 |
/**
 * Numeric codes for the sememe types.
 *
 * <ul>
 * <li>1: Event</li>
 * <li>2: Entity</li>
 * <li>3: Attribute</li>
 * <li>4: Quantity</li>
 * <li>5: aValue (attribute value)</li>
 * <li>6: qValue (quantity value)</li>
 * <li>7: Secondary Feature</li>
 * <li>8: Syntax</li>
 * <li>9: EventRole and EventFeatures</li>
 * <li>0: unknown</li>
 * </ul>
 *
 * Codes 1 to 7 are basic sememes, 8 is the syntax sememe and 9 covers the
 * relation sememes.
 */
public interface SememeType {

    /** Event. */
    int Event = 1;

    /** Entity. */
    int Entity = 2;

    /** Attribute. */
    int Attribute = 3;

    /** Quantity. */
    int Quantity = 4;

    /** aValue: attribute value. */
    int AValue = 5;

    /** qValue: quantity value. */
    int QValue = 6;

    /** Secondary feature. */
    int SecondaryFeature = 7;

    /** Syntax. */
    int Syntax = 8;

    /** Event role and event feature. */
    int EventRoleAndFeature = 9;

    /** Unknown type. */
    int Unknown = 0;

}
55 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet2/sememe/XiaSememeParser.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.word.hownet2.sememe;
2 |
3 | import java.io.IOException;
4 | import java.util.Collection;
5 |
6 | import zx.soft.similarity.util.BlankUtils;
7 |
8 | /**
9 | * 义原相似度计算, 实现了SememeParser中定义的抽象方法
10 | *
11 | */
12 | public class XiaSememeParser extends BaseSememeParser {
13 |
14 | public XiaSememeParser() throws IOException {
15 | super();
16 | }
17 |
18 | /**
19 | * 计算两个义原的相似度
20 | */
21 | double getSimilarityBySememeId(final String id1, final String id2) {
22 |
23 | int position = 0;
24 | String[] array1 = id1.split("-");
25 | String[] array2 = id2.split("-");
26 | for (position = 0; position < array1.length && position < array2.length; position++) {
27 | if (!array1[position].equals(array2[position])) {
28 | break;
29 | }
30 | }
31 |
32 | return 2.0 * position / (array1.length + array2.length);
33 | }
34 |
35 | /**
36 | * 根据汉语定义计算义原之间的相似度,由于可能多个义元有相同的汉语词语,故计算结果为其中相似度最大者
37 | *
38 | * @param key1
39 | * @param key2
40 | * @return
41 | */
42 | public double getMaxSimilarity(String sememeName1, String sememeName2) {
43 | double maxValue = 0.0;
44 |
45 | // 如果两个字符串相等,直接返回距离为0
46 | if (sememeName1.equals(sememeName2)) {
47 | return 1.0;
48 | }
49 |
50 | Collection sememeIds1 = SEMEMES.get(sememeName1);
51 | Collection sememeIds2 = SEMEMES.get(sememeName2);
52 |
53 | // 如果sememe1或者sememe2不是义元,则返回0
54 | if (sememeIds1.size() == 0 || sememeIds1.size() == 0) {
55 | return 0.0;
56 | }
57 |
58 | for (String id1 : sememeIds1) {
59 | for (String id2 : sememeIds2) {
60 | double value = getSimilarityBySememeId(id1, id2);
61 | if (value > maxValue) {
62 | maxValue = value;
63 | }
64 | }
65 | }
66 |
67 | return maxValue;
68 | }
69 |
70 | /**
71 | * 计算两个义元之间的相似度,由于义元可能相同,计算结果为其中相似度最大者 similarity = alpha/(distance+alpha),
72 | * 如果两个字符串相同或都为空,直接返回1.0
73 | *
74 | * @param key1 第一个义原字符串
75 | * @param key2 第二个义原字符串
76 | * @return
77 | */
78 | @Override
79 | public double getSimilarity(String item1, String item2) {
80 | if (BlankUtils.isBlankAll(item2, item2)) {
81 | return 1.0;
82 | } else if (BlankUtils.isBlankAtLeastOne(item1, item2)) {
83 | return 0.0;
84 | } else if (item1.equals(item2)) {
85 | return 1.0;
86 | }
87 |
88 | String key1 = item1.trim();
89 | String key2 = item2.trim();
90 |
91 | // 去掉()符号
92 | if ((key1.charAt(0) == '(') && (key1.charAt(key1.length() - 1) == ')')) {
93 |
94 | if (key2.charAt(0) == '(' && key2.charAt(key2.length() - 1) == ')') {
95 | key1 = key1.substring(1, key1.length() - 1);
96 | key2 = key2.substring(1, key2.length() - 1);
97 | } else {
98 | return 0.0;
99 | }
100 |
101 | }
102 |
103 | // 处理关系义元,即x=y的情况
104 | int pos = key1.indexOf('=');
105 | if (pos > 0) {
106 | int pos2 = key2.indexOf('=');
107 | // 如果是关系义元,则判断前面部分是否相同,如果相同,则转为计算后面部分的相似度,否则为0
108 | if ((pos == pos2) && key1.substring(0, pos).equals(key2.substring(0, pos2))) {
109 | key1 = key1.substring(pos + 1);
110 | key2 = key2.substring(pos2 + 1);
111 | } else {
112 | return 0.0;
113 | }
114 | }
115 |
116 | // 处理符号义元,即前面有特殊符号的义元
117 | String symbol1 = key1.substring(0, 1);
118 | String symbol2 = key2.substring(0, 1);
119 |
120 | for (int i = 0; i < Symbol_Descriptions.length; i++) {
121 | if (symbol1.equals(Symbol_Descriptions[i][0])) {
122 | if (symbol1.equals(symbol2)) {
123 | key1 = item1.substring(1);
124 | key2 = item2.substring(1);
125 | break;
126 | } else {
127 | return 0.0; // 如果不是同一关系符号,则相似度直接返回0
128 | }
129 | }
130 | }
131 |
132 | if ((pos = key1.indexOf("|")) >= 0) {
133 | key1 = key1.substring(pos + 1);
134 | }
135 | if ((pos = key2.indexOf("|")) >= 0) {
136 | key2 = key2.substring(pos + 1);
137 | }
138 |
139 | // 如果两个字符串相等,直接返回距离为0
140 | if (key1.equals(key2)) {
141 | return 1.0;
142 | }
143 |
144 | return getMaxSimilarity(key1, key2);
145 | }
146 |
147 | }
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet2/sememe/sememe.xml.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ml-distribution/semantic-similarity/8deb4382a41ad94a82db15ea945c6bfa8ac103f0/src/main/java/zx/soft/similarity/word/hownet2/sememe/sememe.xml.gz
--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/pinyin/PinyinSimilarity.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.word.pinyin;
2 |
3 | import java.util.Set;
4 |
5 | import zx.soft.similarity.Similaritable;
6 | import zx.soft.similarity.util.EditDistance;
7 | import zx.soft.similarity.util.PinyinUtils;
8 |
9 | /**
10 | * 通过拼音计算两个词语是否相似,拼音的相似程度采用编辑距离算法,并进行归一化衡量
11 | *
12 | */
13 | public class PinyinSimilarity implements Similaritable {
14 |
15 | @Override
16 | public double getSimilarity(String item1, String item2) {
17 | Set pinyinSet1 = PinyinUtils.getInstance().getPinyin(item1);
18 | Set pinyinSet2 = PinyinUtils.getInstance().getPinyin(item2);
19 |
20 | double max = 0.0;
21 | for (String pinyin1 : pinyinSet1) {
22 | for (String pinyin2 : pinyinSet2) {
23 | double distance = new EditDistance().getEditDistance(pinyin1, pinyin2);
24 | double similarity = 1 - distance
25 | / ((pinyin1.length() > pinyin2.length()) ? pinyin1.length() : pinyin2.length());
26 | max = (max > similarity) ? max : similarity;
27 | if (max == 1.0) {
28 | return max;
29 | }
30 | }
31 | }
32 | return max;
33 | }
34 |
35 | }
36 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/tendency/word/HownetWordTendency.java:
--------------------------------------------------------------------------------
1 | package zx.soft.tendency.word;
2 |
3 | import java.io.IOException;
4 | import java.util.Collection;
5 | import java.util.HashSet;
6 | import java.util.Set;
7 |
8 | import org.slf4j.Logger;
9 | import org.slf4j.LoggerFactory;
10 |
11 | import zx.soft.similarity.word.hownet2.concept.BaseConceptParser;
12 | import zx.soft.similarity.word.hownet2.concept.Concept;
13 | import zx.soft.similarity.word.hownet2.concept.XiaConceptParser;
14 | import zx.soft.similarity.word.hownet2.sememe.BaseSememeParser;
15 | import zx.soft.similarity.word.hownet2.sememe.XiaSememeParser;
16 |
17 | /**
18 | * 基于知网实现的词语倾向性判别
19 | *
20 | */
21 | public class HownetWordTendency implements WordTendency {
22 |
23 | private static Logger logger = LoggerFactory.getLogger(HownetWordTendency.class);
24 |
25 | public static String[] POSITIVE_SEMEMES = new String[] { "良", "喜悦", "夸奖", "满意", "期望", "注意", "致敬", "喜欢", "专", "敬佩",
26 | "同意", "爱惜", "愿意", "思念", "拥护", "祝贺", "福", "需求", "奖励", "致谢", "欢迎", "羡慕", "感激", "爱恋" };
27 |
28 | public static String[] NEGATIVE_SEMEMES = new String[] { "莠", "谴责", "害怕", "生气", "悲哀", "着急", "轻视", "羞愧", "烦恼", "灰心",
29 | "犹豫", "为难", "懊悔", "厌恶", "怀疑", "怜悯", "忧愁", "示怒", "不满", "仇恨", "埋怨", "失望", "坏" };
30 | private BaseConceptParser conceptParser = null;
31 | private BaseSememeParser sememeParser = null;
32 |
33 | public HownetWordTendency() {
34 | this.conceptParser = XiaConceptParser.getInstance();
35 | try {
36 | this.sememeParser = new XiaSememeParser();
37 | } catch (IOException e) {
38 | logger.error("Exception:{}", e.getMessage());
39 | }
40 | }
41 |
42 | @Override
43 | public double getTendency(String word) {
44 | double positive = getSentiment(word, POSITIVE_SEMEMES);
45 | double negative = getSentiment(word, NEGATIVE_SEMEMES);
46 | return positive - negative;
47 | }
48 |
49 | public double getSentiment(String word, String[] candidateSememes) {
50 | Collection concepts = conceptParser.getConcepts(word);
51 | Set sememes = new HashSet<>();
52 | for (Concept c : concepts) {
53 | sememes.addAll(c.getAllSememeNames());
54 | }
55 |
56 | double max = 0.0;
57 | for (String item : sememes) {
58 | double total = 0.0;
59 | for (String positiveSememe : candidateSememes) {
60 | //如果有特别接近的义原,直接返回该相似值,避免其他干扰
61 | double value = sememeParser.getSimilarity(item, positiveSememe);
62 | if (value > 0.9) {
63 | return value;
64 | }
65 | total += value;
66 | }
67 | double sim = total / candidateSememes.length;
68 | if (sim > max) {
69 | max = sim;
70 | }
71 | }
72 | return max;
73 | }
74 |
75 | }
76 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/tendency/word/Training.java:
--------------------------------------------------------------------------------
1 | package zx.soft.tendency.word;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.File;
5 | import java.io.FileInputStream;
6 | import java.io.IOException;
7 | import java.io.InputStreamReader;
8 | import java.util.ArrayList;
9 | import java.util.Collection;
10 | import java.util.Collections;
11 | import java.util.HashMap;
12 | import java.util.List;
13 | import java.util.Map;
14 |
15 | import zx.soft.similarity.util.BlankUtils;
16 | import zx.soft.similarity.word.hownet2.concept.Concept;
17 | import zx.soft.similarity.word.hownet2.concept.XiaConceptParser;
18 | import zx.soft.similarity.word.hownet2.sememe.XiaSememeParser;
19 |
20 | import com.google.common.collect.HashMultimap;
21 | import com.google.common.collect.Multimap;
22 |
23 | /**
24 | * 临时训练及测试类
25 | *
26 | */
27 | public class Training {
28 |
29 | void test(boolean testPositive) throws IOException {
30 | WordTendency tendency = new HownetWordTendency();
31 | File f = new File("./dict/sentiment/负面情感词语(中文).txt");
32 | if (testPositive) {
33 | //f = new File("./dict/sentiment/正面情感词语(中文).txt");
34 | f = new File("./dict/sentiment/正面评价词语(中文).txt");
35 | }
36 | String encoding = "utf-8";
37 | String line;
38 | int wordCount = 0;
39 | int correctCount = 0;
40 |
41 | try (BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(f), encoding));) {
42 | while ((line = in.readLine()) != null) {
43 | if (line.length() > 5)
44 | continue;
45 | wordCount++;
46 |
47 | double value = tendency.getTendency(line.trim());
48 | if (value > 0 && testPositive) {
49 | correctCount++;
50 | } else if (value < 0 && !testPositive) {
51 | correctCount++;
52 | } else {
53 | System.out.println("error:" + line + "\t value:" + value);
54 | }
55 | }
56 | }
57 | System.out.println("correct:" + correctCount);
58 | System.out.println("total:" + wordCount);
59 | System.out.println("ratio:" + correctCount * 1.0 / wordCount);
60 | }
61 |
62 | /**
63 | * 该方法用于统计知网提供的情感词集合所涉及的义原以及出现频度
64 | * @throws IOException
65 | */
66 | void countSentimentDistribution() throws IOException {
67 | Map sememeMap = new HashMap<>();
68 | File f = new File("./dict/sentiment/负面情感词语(中文).txt");
69 | String encoding = "utf-8";
70 | boolean autoCombineConcept = false;
71 | BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(f), encoding));
72 |
73 | XiaConceptParser parser = new XiaConceptParser(new XiaSememeParser());
74 |
75 | String line = null;
76 |
77 | int conceptCount = 0;
78 | int wordCount = 0;
79 | while ((line = in.readLine()) != null) {
80 | if (line.length() > 5)
81 | continue;
82 | wordCount++;
83 | String word = line.trim();
84 | Collection concepts = parser.getInnerConcepts(word);
85 | //由于目前的词典为知网2000版本,所以默认情况下仅对词典中出现的概念进行统计
86 | if (BlankUtils.isBlank(concepts) && autoCombineConcept) {
87 | concepts = parser.autoCombineConcepts(word, null);
88 | }
89 | for (Concept c : concepts) {
90 | conceptCount++;
91 | List names = new ArrayList();
92 |
93 | //加入主义原
94 | names.add(c.getMainSememe());
95 |
96 | //加入关系义原
97 | for (String item : c.getRelationSememes()) {
98 | names.add(item.substring(item.indexOf("=") + 1));
99 | }
100 |
101 | //加入符号义原
102 | for (String item : c.getSymbolSememes()) {
103 | names.add(item.substring(1));
104 | }
105 |
106 | //加入其他义原集合
107 | for (String item : c.getSecondSememes()) {
108 | names.add(item);
109 | }
110 |
111 | for (String item : names) {
112 | Integer count = sememeMap.get(item);
113 | if (count == null) {
114 | sememeMap.put(item, 1);
115 | } else {
116 | sememeMap.put(item, count + 1);
117 | }
118 | }
119 | }
120 | }
121 | in.close();
122 |
123 | //以下是为了按照义原出现的数量进行排序的代码
124 | Multimap map2 = HashMultimap.create();
125 | for (String key : sememeMap.keySet()) {
126 | map2.put(sememeMap.get(key), key);
127 | }
128 | List keys = new ArrayList<>();
129 | for (Integer key : map2.keySet()) {
130 | keys.add(key);
131 | }
132 | Collections.sort(keys);
133 |
134 | int smallSememeCount = 0; //较少出现的不同义原数量
135 | int smallAppearTotal = 0; //较少出现的义原在概念众出现的次数总和
136 | for (int index = (keys.size() - 1); index >= 0; index--) {
137 | Integer key = keys.get(index);
138 | Collection values = map2.get(key);
139 | double ratio = (key * 100.0 / conceptCount);
140 | System.out.print(key + "(" + ratio + "%): ");
141 | for (String v : values) {
142 | System.out.print(v + "\t");
143 | }
144 | System.out.println();
145 | if (ratio < 0.7) {
146 | smallSememeCount += values.size();
147 | smallAppearTotal += key * values.size();
148 | }
149 | }
150 |
151 | System.out.println("small info: ");
152 | System.out.println("\tdifferent sememes:" + smallSememeCount);
153 | System.out.println("\tappear count:" + smallAppearTotal);
154 | System.out.println("\tratio:" + smallAppearTotal * 100.0 / conceptCount);
155 | System.out.println("wordCount:" + wordCount);
156 | System.out.println("conceptCount:" + conceptCount);
157 | }
158 |
159 | public static void main(String[] args) throws IOException {
160 | Training training = new Training();
161 | training.countSentimentDistribution();
162 | // System.out.println("test positive:");
163 | // training.test(true);
164 | //
165 | // System.out.println("test negative:");
166 | //training.test(false);
167 | }
168 |
169 | }
170 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/tendency/word/WordTendency.java:
--------------------------------------------------------------------------------
1 | package zx.soft.tendency.word;
2 |
/**
 * Computes the semantic tendency (sentiment polarity) of a word.
 *
 * <p>The tendency is a real number in the range [-1, 1]: the larger the value, the
 * more commendatory the word; the smaller the value, the more derogatory.
 */
public interface WordTendency {

    /**
     * Returns the semantic tendency of the given word.
     *
     * @param word the word to evaluate
     * @return a value in [-1, 1]; larger means more commendatory, smaller means more derogatory
     */
    double getTendency(String word);

}
17 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/ui/PhraseSimilarityUI.java:
--------------------------------------------------------------------------------
1 | package zx.soft.ui;
2 |
3 | import java.awt.BorderLayout;
4 | import java.awt.GridLayout;
5 | import java.awt.event.ActionEvent;
6 | import java.awt.event.ActionListener;
7 |
8 | import javax.swing.BorderFactory;
9 | import javax.swing.JButton;
10 | import javax.swing.JLabel;
11 | import javax.swing.JPanel;
12 | import javax.swing.JScrollPane;
13 | import javax.swing.JTextArea;
14 | import javax.swing.JTextField;
15 |
16 | import zx.soft.similarity.phrase.PhraseSimilarity;
17 |
18 | /**
19 | * 短语相似度的调用演示界面
20 | */
21 | public class PhraseSimilarityUI {
22 |
23 | /**
24 | * 短语相似度的演示面板
25 | *
26 | * @return
27 | */
28 | public static JPanel createPanel() {
29 | // 声明总的大面板, fullPanel包括一个NorthPanel和一个centerPanel
30 | JPanel fullPanel = new JPanel();
31 | fullPanel.setLayout(new BorderLayout());
32 |
33 | JPanel northPanel = new JPanel();
34 | fullPanel.add(northPanel, "North");
35 |
36 | // centerPanel包括了一个文本框
37 | JPanel centerPanel = new JPanel();
38 | fullPanel.add(centerPanel, "Center");
39 |
40 | centerPanel.setLayout(new BorderLayout());
41 | final JTextArea result = new JTextArea();
42 | // result.setFont(new Font("宋体", Font.PLAIN, 16));
43 | result.setLineWrap(true);
44 | JScrollPane centerScrollPane = new JScrollPane(result);
45 | centerPanel.add(centerScrollPane, "Center");
46 |
47 | northPanel.setLayout(new GridLayout(1, 1));
48 | // northPanel.add(createWordPanel());
49 | // northPanel.add(createCilinPanel());
50 |
51 | // 以下加入northPanel中的第一个面板
52 | final JTextField field1 = new JTextField("");
53 | final JTextField field2 = new JTextField("");
54 | field1.setColumns(50);
55 | field2.setColumns(50);
56 |
57 | JPanel mainPanel = new JPanel();
58 | mainPanel.setLayout(new GridLayout(3, 1));
59 |
60 | JPanel linePanel = new JPanel();
61 | linePanel.add(new JLabel("短语1:"));
62 | linePanel.add(field1);
63 | mainPanel.add(linePanel);
64 |
65 | linePanel = new JPanel();
66 | linePanel.add(new JLabel("短语2:"));
67 | linePanel.add(field2);
68 | mainPanel.add(linePanel);
69 |
70 | linePanel = new JPanel();
71 | JButton goButton = new JButton("计算相似度");
72 | linePanel.add(goButton);
73 | mainPanel.add(linePanel);
74 | goButton.addActionListener(new ActionListener() {
75 |
76 | @Override
77 | public void actionPerformed(ActionEvent e) {
78 | String phrase1 = field1.getText();
79 | String phrase2 = field2.getText();
80 | String text = "[" + phrase1 + "]与[" + phrase2 + "]的相似度为:";
81 | text = text + new PhraseSimilarity().getSimilarity(phrase1, phrase2);
82 | // text = text + "\n\n" + result.getText();
83 | result.setText(text);
84 | }
85 |
86 | });
87 | mainPanel.setBorder(BorderFactory.createEtchedBorder());
88 | northPanel.add(mainPanel);
89 |
90 | return fullPanel;
91 | }
92 |
93 | }
94 |
--------------------------------------------------------------------------------
/src/main/java/zx/soft/ui/Start.java:
--------------------------------------------------------------------------------
1 | package zx.soft.ui;
2 |
3 | import java.awt.Container;
4 | import java.awt.Font;
5 | import java.util.Enumeration;
6 |
7 | import javax.swing.JFrame;
8 | import javax.swing.JMenu;
9 | import javax.swing.JMenuBar;
10 | import javax.swing.JMenuItem;
11 | import javax.swing.JScrollPane;
12 | import javax.swing.JTabbedPane;
13 | import javax.swing.SwingUtilities;
14 | import javax.swing.UIManager;
15 | import javax.swing.plaf.FontUIResource;
16 |
17 | import zx.soft.similarity.sentence.SegmentProxy;
18 | import zx.soft.similarity.util.About;
19 |
20 | /**
21 | * 相似度计算软件包演示启动类
22 | *
23 | */
24 | public class Start extends JFrame {
25 |
26 | private static final long serialVersionUID = 85744461208L;
27 |
28 | public Start() {
29 | this.setTitle("相似度计算演示程序");
30 | this.setSize(420, 700);
31 | this.setLocationRelativeTo(null);
32 | this.setDefaultCloseOperation(EXIT_ON_CLOSE);
33 |
34 | // //////////////////////////////////
35 | // add menu
36 | JMenuBar menuBar = new JMenuBar();
37 | this.setJMenuBar(menuBar);
38 |
39 | JMenu fileMenu = new JMenu("File");
40 | menuBar.add(fileMenu);
41 | fileMenu.add(new JMenuItem("Exit"));
42 |
43 | JMenu helpMenu = new JMenu("Help");
44 | menuBar.add(helpMenu);
45 | helpMenu.add(new JMenuItem("Help"));
46 |
47 | Container contentPane = this.getContentPane();
48 | JTabbedPane tabbedPane = new JTabbedPane();
49 | tabbedPane.add("词语", WordSimlarityUI.createPanel());
50 | tabbedPane.add("短语", PhraseSimilarityUI.createPanel());
51 | tabbedPane.add("句子", SentenceSimilarityUI.createPanel());
52 | // tabbedPane.add("文本", WordSimlarityUI.createPanel());
53 | tabbedPane.add("词法分析", SegmentProxy.createPanel());
54 | tabbedPane.add("义原树", SememeTreeUI.createPanel());
55 | tabbedPane.add("情感分析", TendencyUI.createPanel());
56 | tabbedPane.add("关于", About.createPanel());
57 | JScrollPane scrollPane = new JScrollPane(tabbedPane);
58 | contentPane.add(scrollPane);
59 |
60 | this.pack();
61 | setExtendedState(MAXIMIZED_BOTH);
62 | }
63 |
64 | public static void InitGlobalFont(Font font) {
65 | FontUIResource fontRes = new FontUIResource(font);
66 | for (Enumeration