├── .gitignore
├── README.md
├── bin
    ├── start
    └── start.py
├── dict
    ├── sentiment
    │   ├── 主张词语（中文）.txt
    │   ├── 主张词语（英文）.txt
    │   ├── 正面情感词语（中文）.txt
    │   ├── 正面情感词语（英文）.txt
    │   ├── 正面评价词语（中文）.txt
    │   ├── 正面评价词语（英文）.txt
    │   ├── 程度级别词语（中文）.txt
    │   ├── 程度级别词语（英文）.txt
    │   ├── 统计结果.txt
    │   ├── 负面情感词语（中文）.txt
    │   ├── 负面情感词语（英文）.txt
    │   ├── 负面评价词语（中文）.txt
    │   └── 负面评价词语（英文）.txt
    ├── tendency
    │   └── tendency.xml
    └── user-concept.xml
├── doc
    ├── HISTORY.md
    ├── LCMC.zip
    ├── REVISION.md
    └── 中文信息相似度计算理论与方法图书目录.pdf
├── pom.xml
└── src
    ├── main
        ├── java
        │   └── zx
        │   │   └── soft
        │   │       ├── classification
        │   │           ├── Feature.java
        │   │           ├── Instance.java
        │   │           ├── NaiveBayesClassifier.java
        │   │           └── Variable.java
        │   │       ├── similarity
        │   │           ├── Similaritable.java
        │   │           ├── SimilarityFactory.java
        │   │           ├── phrase
        │   │           │   └── PhraseSimilarity.java
        │   │           ├── sentence
        │   │           │   ├── SegmentProxy.java
        │   │           │   ├── SentenceSimilarity.java
        │   │           │   ├── editdistance
        │   │           │   │   ├── Block.java
        │   │           │   │   ├── CharEditUnit.java
        │   │           │   │   ├── ChunkEditUnit.java
        │   │           │   │   ├── EditDistance.java
        │   │           │   │   ├── EditUnit.java
        │   │           │   │   ├── GregorEditDistance.java
        │   │           │   │   ├── Split.java
        │   │           │   │   ├── StandardEditDistance.java
        │   │           │   │   ├── SuperString.java
        │   │           │   │   ├── WordEditUnit.java
        │   │           │   │   ├── XiatianEditDistance.java
        │   │           │   │   └── XiatianEditDistance2.java
        │   │           │   └── morphology
        │   │           │   │   ├── MorphoSimilarity.java
        │   │           │   │   └── SemanticSimilarity.java
        │   │           ├── statistic
        │   │           │   ├── DictStatistic.java
        │   │           │   └── LCMC.java
        │   │           ├── text
        │   │           │   └── DiceSimilarity.java
        │   │           ├── util
        │   │           │   ├── About.java
        │   │           │   ├── BlankUtils.java
        │   │           │   ├── EditDistance.java
        │   │           │   ├── F02-GB2312-to-PuTongHua-PinYin.txt
        │   │           │   ├── FileUtils.java
        │   │           │   ├── MathUtils.java
        │   │           │   ├── PinyinUtils.java
        │   │           │   ├── TraverseEvent.java
        │   │           │   ├── XmlException.java
        │   │           │   ├── XmlUtils.java
        │   │           │   └── about.html
        │   │           └── word
        │   │           │   ├── CharBasedSimilarity.java
        │   │           │   ├── WordSimilarity.java
        │   │           │   ├── cilin
        │   │           │       ├── Cilin.java
        │   │           │       ├── CilinCoding.java
        │   │           │       ├── CilinDb.java
        │   │           │       └── cilin.db.gz
        │   │           │   ├── hownet
        │   │           │       ├── Hownet.java
        │   │           │       ├── HownetMeta.java
        │   │           │       ├── concept
        │   │           │       │   ├── Concept.java
        │   │           │       │   ├── ConceptDictTraverseEvent.java
        │   │           │       │   ├── ConceptLinkedList.java
        │   │           │       │   ├── ConceptParser.java
        │   │           │       │   ├── LiuConceptParser.java
        │   │           │       │   ├── MyConceptParser.java
        │   │           │       │   └── concept.dat
        │   │           │       └── sememe
        │   │           │       │   ├── FastSimpleMap.java
        │   │           │       │   ├── LiuqunSememeParser.java
        │   │           │       │   ├── MySememeParser.java
        │   │           │       │   ├── Sememe.java
        │   │           │       │   ├── SememeDictTraverseEvent.java
        │   │           │       │   ├── SememeParser.java
        │   │           │       │   ├── SememeType.java
        │   │           │       │   └── sememe.dat
        │   │           │   ├── hownet2
        │   │           │       ├── concept
        │   │           │       │   ├── BaseConceptParser.java
        │   │           │       │   ├── Concept.java
        │   │           │       │   ├── ConceptDictTraverseEvent.java
        │   │           │       │   ├── ConceptLinkedList.java
        │   │           │       │   ├── LiuConceptParser.java
        │   │           │       │   ├── XiaConceptParser.java
        │   │           │       │   └── concept.xml.gz
        │   │           │       └── sememe
        │   │           │       │   ├── BaseSememeParser.java
        │   │           │       │   ├── LiuqunSememeParser.java
        │   │           │       │   ├── Sememe.java
        │   │           │       │   ├── SememeType.java
        │   │           │       │   ├── XiaSememeParser.java
        │   │           │       │   └── sememe.xml.gz
        │   │           │   └── pinyin
        │   │           │       └── PinyinSimilarity.java
        │   │       ├── tendency
        │   │           └── word
        │   │           │   ├── HownetWordTendency.java
        │   │           │   ├── Training.java
        │   │           │   └── WordTendency.java
        │   │       └── ui
        │   │           ├── PhraseSimilarityUI.java
        │   │           ├── SememeTreeUI.java
        │   │           ├── SentenceSimilarityUI.java
        │   │           ├── Start.java
        │   │           ├── TendencyUI.java
        │   │           └── WordSimlarityUI.java
        └── resources
        │   ├── data
        │       ├── F02-GB2312-to-PuTongHua-PinYin.txt
        │       ├── about.html
        │       ├── cilin.db.gz
        │       ├── concept.dat
        │       ├── concept.xml.gz
        │       ├── sememe.dat
        │       └── sememe.xml.gz
        │   └── logback.xml
    └── test
        ├── java
            └── zx
            │   └── soft
            │       └── similarity
            │           ├── sentence
            │               └── SemanticSimilarityTest.java
            │           ├── statistic
            │               └── DictStatisticTest.java
            │           └── word
            │               ├── CharBasedSimilarityTest.java
            │               ├── hownet
            │                   ├── ConceptTest.java
            │                   └── SememeTest.java
            │               └── hownet2
            │                   └── HownetSimilarityTest.java
        └── resources
            ├── data
                ├── F02-GB2312-to-PuTongHua-PinYin.txt
                ├── about.html
                ├── cilin.db.gz
                ├── concept.dat
                ├── concept.xml.gz
                ├── sememe.dat
                └── sememe.xml.gz
            └── logback-test.xml


/.gitignore:
--------------------------------------------------------------------------------
1 | .classpath
2 | .project
3 | .settings/
4 | target/
5 | logs/
6 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # 中文语义相似度计算框架
 3 | 
 4 | > 汉语词语、组块、句子以及文本篇章等各个层面的相似度计算是中文信息处理领域的一项基础而又核心的工作，它直接决定着相关领域的研究发展状况，例如，在知识工程、基于实例的机器翻译、信息检索、自动问答以及拼写检查等方面，相似度计算都是一个非常关键的问题，长期以来一直是人们研究的一个热点和难点。相似度的研究涉及词语、组块、句子以及篇章等多个层面，目前的研究主要侧重于词语方面，提出了一些比较有代表性的理论与方法，如字面相似度算法、词素相似度算法，以及基于同义词词林、知网等语义词典的方法，国外的方法则主要包括基于构成字符的相似度计算方法、基于WORDNET的计算方法、基于词典注释的方法、基于大规模语料库统计的方法和基于搜索引擎的方法；有关组块、短语级别的相似度的研究现在还比较少，常用的方法是在词语相似度计算的基础上，借用句子相似度的计算方法计算组块之间的相似度。在句子层面的相似度计算方面，国外研究主要集中在字符串的相似度计算，国内则主要以词语为基本处理单元，通过计算相同词语所占的比重确定句子之间的相似度；文本层面的则集中于利用统计方法实现相似度计算。
 5 | 
 6 | > 在相似度计算的研究过程中，许多研究学者的成果公布和无私帮助让我们受益匪浅，我们把代码开源出来，既是对前辈们表达我们的尊重之情，也希望能对大家共同的研究社区能有点滴贡献，能避免一些重复工作。
 7 | 
 8 | 运行 zx.soft.ui.Start 进行快速测试。
 9 | 
10 | 
11 | 


--------------------------------------------------------------------------------
/bin/start:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Launch the similarity UI from the jar assembled by `mvn package`.
3 | # X_HOME may be preset in the environment; it defaults to this script's parent directory.
4 | X_HOME="${X_HOME:-$(cd "$(dirname "$0")/.." && pwd)}"
5 | # Jar name and main class (zx.soft.ui.Start, set in the manifest) follow pom.xml.
6 | exec java -Dswing.systemlaf=javax.swing.plaf.metal.MetalLookAndFeel \
7 |     -jar "$X_HOME/target/semantic-similarity-1.0.0-jar-with-dependencies.jar" "$@"
8 | 


--------------------------------------------------------------------------------
/bin/start.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import os, sys, re
 3 | 
 4 | #home to store project
 5 | HOME = './'
 6 | 
 7 | #loop directory and get all libraries
 8 | def getlibraries(HOME):
 9 | 	jars = ''
10 |         split_char = ':'
11 |         if os.name=='nt':
12 | 	        split_char = ';'
13 | 		jars = "."
14 | 	
15 |         jars = jars + split_char + home +  '/target/classes/main';
16 |         jars = jars + split_char + home + '/target/classes/test';
17 |         jars = jars + split_char + home + '/src/main/resources';
18 |         jars = jars + split_char + home + '/src/main/java';
19 |         libdir = home + "/lib";
20 | 
21 |         for jar in os.listdir(libdir):
22 |             if(jar==".svn"):continue
23 |             fullname = os.path.join(libdir,jar)
24 |             if os.path.isdir(fullname):
25 |                 for subjar in os.listdir(fullname):
26 |                     if subjar.endswith('.jar'):
27 |                         jars = jars + split_char + os.path.join(fullname, subjar)
28 |             else:
29 |                 jars = jars + split_char + fullname    
30 | 
31 |         return jars;
32 | 
33 | home = os.getcwd()
34 | if(os.path.basename(home)=='bin'):
35 |         home = os.path.join(home,'..')
36 |         
37 | libpath = getlibraries(home)
38 | command = 'java -Xmx256M -cp "' + libpath + '" '
39 | 
40 | if(len(sys.argv)==1):
41 | 	print "useage:./start.py runclass"
42 | 	command = command + ' ruc.irm.similarity.MainUI'
43 | else:
44 |     args = sys.argv
45 |     for i in range(1,len(args)):
46 |         command = command + ' ' + args[i]
47 | 
48 | print "execute ", command
49 | print "xiatian@ruc."
50 | print os.getcwd()
51 | os.system(command)
52 | 


--------------------------------------------------------------------------------
/dict/sentiment/主张词语（中文）.txt:
--------------------------------------------------------------------------------
 1 | 中文主张词语    38
 2 | 
 3 | 1. {perception|感知}  22  
 4 | 察觉
 5 | 触目
 6 | 耳闻
 7 | 发
 8 | 发觉
 9 | 发现
10 | 风闻
11 | 感
12 | 感觉
13 | 感觉到
14 | 感受到
15 | 见到
16 | 见得
17 | 觉
18 | 觉得
19 | 看得出来
20 | 窥见
21 | 领教
22 | 听说
23 | 痛感
24 | 预感
25 | 自觉
26 | 
27 | 2. {regard|认为}  16
28 | 抱定
29 | 当
30 | 道
31 | 感到
32 | 感觉
33 | 觉得
34 | 看
35 | 看待
36 | 论
37 | 认定
38 | 认为
39 | 认准
40 | 想
41 | 相信
42 | 以为
43 | 主张
44 | 


--------------------------------------------------------------------------------
/dict/sentiment/主张词语（英文）.txt:
--------------------------------------------------------------------------------
 1 | 英文主张词语    35
 2 | 
 3 | 1. {perception|感知}  21
 4 | be aware of
 5 | be conscious
 6 | be conscious of
 7 | be told
 8 | become aware of
 9 | detect
10 | discern
11 | discover
12 | feel
13 | find
14 | get a glimpse of
15 | get wind of
16 | have a premonition
17 | hear of
18 | keenly feel
19 | learn through hearsay
20 | meet the eye
21 | notice
22 | perceive
23 | see
24 | sense
25 | 
26 | 2. {regard|认为} 14
27 | advocate
28 | believe
29 | consider
30 | feel
31 | firmly believe
32 | hold
33 | look upon
34 | maintain
35 | regard
36 | sense
37 | set one's mind on
38 | stand for
39 | suppose
40 | think


--------------------------------------------------------------------------------
/dict/sentiment/程度级别词语（中文）.txt:
--------------------------------------------------------------------------------
  1 | 中文程度级别词语    219
  2 | 
  3 | 1. “极其|extreme / 最|most”  69
  4 | 百分之百
  5 | 倍加
  6 | 备至
  7 | 不得了
  8 | 不堪
  9 | 不可开交
 10 | 不亦乐乎
 11 | 不折不扣
 12 | 彻头彻尾
 13 | 充分
 14 | 到头
 15 | 地地道道
 16 | 非常
 17 | 极
 18 | 极度
 19 | 极端
 20 | 极其
 21 | 极为
 22 | 截然
 23 | 尽
 24 | 惊人地
 25 | 绝
 26 | 绝顶
 27 | 绝对
 28 | 绝对化
 29 | 刻骨
 30 | 酷
 31 | 满
 32 | 满贯
 33 | 满心
 34 | 莫大
 35 | 奇
 36 | 入骨
 37 | 甚为
 38 | 十二分
 39 | 十分
 40 | 十足
 41 | 死
 42 | 滔天
 43 | 痛
 44 | 透
 45 | 完全
 46 | 完完全全
 47 | 万
 48 | 万般
 49 | 万分
 50 | 万万
 51 | 无比
 52 | 无度
 53 | 无可估量
 54 | 无以复加
 55 | 无以伦比
 56 | 要命
 57 | 要死
 58 | 已极
 59 | 已甚
 60 | 异常
 61 | 逾常
 62 | 贼
 63 | 之极
 64 | 之至
 65 | 至极
 66 | 卓绝
 67 | 最为
 68 | 佼佼
 69 | 郅
 70 | 綦
 71 | 齁
 72 | 最
 73 | 
 74 | 2. “很|very” 42
 75 | 不过
 76 | 不少
 77 | 不胜
 78 | 惨
 79 | 沉
 80 | 沉沉
 81 | 出奇
 82 | 大为
 83 | 多
 84 | 多多
 85 | 多加
 86 | 多么
 87 | 分外
 88 | 格外
 89 | 够瞧的
 90 | 够戗
 91 | 好
 92 | 好不
 93 | 何等
 94 | 很
 95 | 很是
 96 | 坏
 97 | 可
 98 | 老
 99 | 老大
100 | 良
101 | 颇
102 | 颇为
103 | 甚
104 | 实在
105 | 太
106 | 太甚
107 | 特
108 | 特别
109 | 尤
110 | 尤其
111 | 尤为
112 | 尤以
113 | 远
114 | 着实
115 | 曷
116 | 碜
117 | 
118 | 3. “较|more” 37
119 | 大不了
120 | 多
121 | 更
122 | 更加
123 | 更进一步
124 | 更为
125 | 还
126 | 还要
127 | 较
128 | 较比
129 | 较为
130 | 进一步
131 | 那般
132 | 那么
133 | 那样
134 | 强
135 | 如斯
136 | 益
137 | 益发
138 | 尤甚
139 | 逾
140 | 愈
141 | 愈 ... 愈
142 | 愈发
143 | 愈加
144 | 愈来愈
145 | 愈益
146 | 远远
147 | 越 ... 越
148 | 越发
149 | 越加
150 | 越来越
151 | 越是
152 | 这般
153 | 这样
154 | 足
155 | 足足
156 | 
157 | 4. “稍|-ish” 29
158 | 点点滴滴
159 | 多多少少
160 | 怪
161 | 好生
162 | 还
163 | 或多或少
164 | 略
165 | 略加
166 | 略略
167 | 略微
168 | 略为
169 | 蛮
170 | 稍
171 | 稍稍
172 | 稍微
173 | 稍为
174 | 稍许
175 | 挺
176 | 未免
177 | 相当
178 | 些
179 | 些微
180 | 些小
181 | 一点
182 | 一点儿
183 | 一些
184 | 有点
185 | 有点儿
186 | 有些
187 | 
188 | 5. “欠|insufficiently” 12
189 | 半点
190 | 不大
191 | 不丁点儿
192 | 不甚
193 | 不怎么
194 | 聊
195 | 没怎么
196 | 轻度
197 | 弱
198 | 丝毫
199 | 微
200 | 相对
201 | 
202 | 6. “超|over” 30
203 | 不为过
204 | 超
205 | 超额
206 | 超外差
207 | 超微结构
208 | 超物质
209 | 出头
210 | 多
211 | 浮
212 | 过
213 | 过度
214 | 过分
215 | 过火
216 | 过劲
217 | 过了头
218 | 过猛
219 | 过热
220 | 过甚
221 | 过头
222 | 过于
223 | 过逾
224 | 何止
225 | 何啻
226 | 开外
227 | 苦
228 | 老
229 | 偏
230 | 强
231 | 溢
232 | 忒
233 | 
234 | 
235 | 
236 | 


--------------------------------------------------------------------------------
/dict/sentiment/程度级别词语（英文）.txt:
--------------------------------------------------------------------------------
  1 | 英文程度级别词语    170
  2 | 
  3 | 1. “极其|extreme / 最|most”  64
  4 | 100 percent
  5 | absolute
  6 | absolutely
  7 | alarmingly
  8 | amazingly
  9 | as fully as possible
 10 | astonishingly
 11 | awfully
 12 | beyond challenge
 13 | beyond compare
 14 | beyond comparison
 15 | beyond measure
 16 | bitterly
 17 | by all means
 18 | completely
 19 | deep-rooted
 20 | deep-seated
 21 | deeply
 22 | definitely
 23 | disastrously
 24 | downright
 25 | entirely
 26 | exceedingly
 27 | excessively
 28 | extreme
 29 | extremely
 30 | fully
 31 | greatest
 32 | greatly
 33 | heinous
 34 | hundred-percent
 35 | immensely
 36 | immoderate
 37 | in a penetrating way
 38 | in every possible way
 39 | in the extreme
 40 | incomparably
 41 | ingrained
 42 | matchlessly
 43 | monstrous
 44 | most
 45 | of the highest degree
 46 | out-and-out
 47 | outstanding
 48 | outstandingly
 49 | reach the limit
 50 | right-down
 51 | sharply
 52 | sheer
 53 | superb
 54 | terribly
 55 | to death
 56 | to the full
 57 | to the letter
 58 | to the limit
 59 | to the marrow
 60 | to the utmost
 61 | totally
 62 | towering
 63 | unusually
 64 | utmost
 65 | utterly
 66 | very much
 67 | most
 68 | 
 69 | 2. “很|very” 25
 70 | a lot
 71 | awfully
 72 | badly
 73 | better
 74 | by far
 75 | considerably
 76 | deep
 77 | disastrously
 78 | especially
 79 | extraordinarily
 80 | extremely
 81 | greatly
 82 | how
 83 | however
 84 | indeed
 85 | much
 86 | particularly
 87 | really
 88 | terribly
 89 | to a serious degree
 90 | too far
 91 | too much
 92 | unusually
 93 | very
 94 | what a
 95 | 
 96 | 3. “较|more” 22
 97 | all the more
 98 | as much as
 99 | at the worst
100 | by far
101 | comparatively
102 | even more
103 | further
104 | further more
105 | in that way
106 | increasingly
107 | like that
108 | more
109 | more and more
110 | more so
111 | much more
112 | plus
113 | relatively
114 | slightly more
115 | so
116 | still more
117 | such
118 | the more ... the more
119 | 
120 | 4. “稍|-ish” 15
121 | a bit
122 | a bit too
123 | a little
124 | a little bit
125 | a little more
126 | fairly
127 | more or less
128 | passably
129 | pretty
130 | quite
131 | rather
132 | slightly
133 | some
134 | somewhat
135 | to some extent
136 | 
137 | 5. “欠|insufficiently” 11
138 | a little less
139 | just
140 | light
141 | merely
142 | not particularly
143 | not too
144 | not very
145 | relative
146 | slight
147 | slightest degree of
148 | slightly
149 | 
150 | 6. “超|over” 33
151 | a little over
152 | above
153 | above measure
154 | above quota
155 | and more
156 | excessive
157 | excessively
158 | exorbitance
159 | extra
160 | far more than
161 | hyperphysical
162 | inflated
163 | inordinate
164 | not too much
165 | odd
166 | outrageousness
167 | over
168 | over-
169 | overdone
170 | overheated
171 | plus
172 | slightly more
173 | super
174 | superheated
175 | superheterodyne
176 | surplus
177 | to a fault
178 | too
179 | too much
180 | ultra
181 | ultrastructural
182 | undue
183 | unduly
184 | 
185 | 
186 | 


--------------------------------------------------------------------------------
/dict/tendency/tendency.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0"?>
 2 | 
 3 | <!-- 
 4 | 情感规则描述文件
 5 | -->
 6 | <tendency>
 7 | 	<positive>
 8 | 		<!-- 逗号隔开的义原列表 -->
 9 | 	   <sememe-list>
10 | 	   	   <sememe name="良" weight="1"/>
11 | 		   <sememe name="喜悦" weight="1"/>
12 | 	   </sememe-list>
13 | 	   <words>
14 | 	   	<word name="NIUBI" type="拼音" sentiment="1.0"/>
15 | 		<word name="NB" sentiment="1.0"/>
16 | 	   </words>
17 | 	</positive>
18 |     
19 | </tendency>
20 | 


--------------------------------------------------------------------------------
/dict/user-concept.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0"?>
 2 | <concepts>
 3 | 	<!--
 4 | 	example:
 5 | 	<c w="汉语词语" p="词性，取值为：V|N|ADJ|NUM|PREP等" d="对应的义原形式的定义"/>
 6 | 	-->
 7 |   	<c w="三聚氰胺" p="N" d="material|材料,#drinks|饮品"/>
 8 | 	<c w="山寨" p="V" d="produce|制造,means=imitate|模仿,pretend|假装,content=RegardAs|当作"/> 
 9 | </concepts>
10 | 


--------------------------------------------------------------------------------
/doc/HISTORY.md:
--------------------------------------------------------------------------------
1 | 变更历史
2 | ================
3 | 
4 | 2014-04： 把中文分词由原先的ictclas4j替换为ansj，在此对原作者表示感谢！把工程更改为maven工程，方便管理。


--------------------------------------------------------------------------------
/doc/LCMC.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ml-distribution/semantic-similarity/8deb4382a41ad94a82db15ea945c6bfa8ac103f0/doc/LCMC.zip


--------------------------------------------------------------------------------
/doc/REVISION.md:
--------------------------------------------------------------------------------
 1 | 错误修订
 2 | =====================
 3 | 
 4 | 1. 第三章概念词语的相似度计算部分的公式：
 5 | $Sim(C_1, C_2) = \beta_1 Sim_1(C_1, C_2) + \sum_{i=2}^{4} \beta_1 \beta_i Sim_i(C_1, C_2)$
 6 | 应为： $Sim(C_1, C_2) = \beta_1 Sim_1(C_1, C_2) + \sum_{i=2}^{4} Sim_1(C_1, C_2)\, \beta_i\, Sim_i(C_1, C_2)$
 7 | 可参考以下代码实现：
 8 |     @Override
 9 |     protected double calculate(double sim_v1, double sim_v2, double sim_v3, double sim_v4) {
10 |         return beta1 * sim_v1 + beta2 * sim_v1 * sim_v2 + beta3 * sim_v1 * sim_v3 + beta4 * sim_v1 * sim_v4;
11 |     }
12 | 


--------------------------------------------------------------------------------
/doc/中文信息相似度计算理论与方法图书目录.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ml-distribution/semantic-similarity/8deb4382a41ad94a82db15ea945c6bfa8ac103f0/doc/中文信息相似度计算理论与方法图书目录.pdf


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
  1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  2 | 	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  3 | 	<modelVersion>4.0.0</modelVersion>
  4 | 
  5 | 	<groupId>zx.soft</groupId>
  6 | 	<artifactId>semantic-similarity</artifactId>
  7 | 	<version>1.0.0</version>
  8 | 	<name>Semantic Similarity</name>
  9 | 
 10 | 	<properties>
 11 | 		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 12 | 		<junit.version>4.11</junit.version>
 13 | 		<slf4j.version>1.7.7</slf4j.version>
 14 | 		<logback.version>1.1.2</logback.version>
 15 | 		<commons.lang3.version>3.3.1</commons.lang3.version>
 16 | 	</properties>
 17 | 
 18 | 	<dependencies>
 19 | 		<!-- 日志包 -->
 20 | 		<dependency>
 21 | 			<groupId>ch.qos.logback</groupId>
 22 | 			<artifactId>logback-classic</artifactId>
 23 | 			<version>${logback.version}</version>
 24 | 		</dependency>
 25 | 		<dependency>
 26 | 			<groupId>ch.qos.logback</groupId>
 27 | 			<artifactId>logback-core</artifactId>
 28 | 			<version>${logback.version}</version>
 29 | 		</dependency>
 30 | 		<dependency>
 31 | 			<groupId>ch.qos.logback</groupId>
 32 | 			<artifactId>logback-access</artifactId>
 33 | 			<version>${logback.version}</version>
 34 | 		</dependency>
 35 | 		<dependency>
 36 | 			<groupId>org.slf4j</groupId>
 37 | 			<artifactId>slf4j-api</artifactId>
 38 | 			<version>${slf4j.version}</version>
 39 | 		</dependency>
 40 | 		<!-- 分词器 -->
 41 | 		<dependency>
 42 | 			<groupId>org.ansj</groupId>
 43 | 			<artifactId>tree_split</artifactId>
 44 | 			<version>1.2</version>
 45 | 		</dependency>
 46 | 		<dependency>
 47 | 			<groupId>org.ansj</groupId>
 48 | 			<artifactId>ansj_seg</artifactId>
 49 | 			<version>1.3</version>
 50 | 		</dependency>
 51 | 		<dependency>
 52 | 			<groupId>org.mockito</groupId>
 53 | 			<artifactId>mockito-all</artifactId>
 54 | 			<version>1.9.5</version>
 55 | 		</dependency>
 56 | 		<dependency>
 57 | 			<groupId>org.hamcrest</groupId>
 58 | 			<artifactId>hamcrest-all</artifactId>
 59 | 			<version>1.3</version>
 60 | 		</dependency>
 61 | 		<dependency>
 62 | 			<groupId>args4j</groupId>
 63 | 			<artifactId>args4j</artifactId>
 64 | 			<version>2.0.16</version>
 65 | 		</dependency>
 66 | 		<dependency>
 67 | 			<groupId>com.google.guava</groupId>
 68 | 			<artifactId>guava</artifactId>
 69 | 			<version>13.0.1</version>
 70 | 		</dependency>
 71 | 		<dependency>
 72 | 			<groupId>org.apache.commons</groupId>
 73 | 			<artifactId>commons-lang3</artifactId>
 74 | 			<version>${commons.lang3.version}</version>
 75 | 		</dependency>
 76 | 		<dependency>
 77 | 			<groupId>com.google.collections</groupId>
 78 | 			<artifactId>google-collections</artifactId>
 79 | 			<version>1.0</version>
 80 | 		</dependency>
 81 | 		<!-- 测试包 -->
 82 | 		<dependency>
 83 | 			<groupId>junit</groupId>
 84 | 			<artifactId>junit</artifactId>
 85 | 			<version>${junit.version}</version>
 86 | 			<scope>test</scope>
 87 | 		</dependency>
 88 | 	</dependencies>
 89 | 
 90 | 	<repositories>
 91 | 		<repository>
 92 | 			<id>cengtral</id>
 93 | 			<url>http://repo1.maven.org/maven2/</url>
 94 | 		</repository>
 95 | 		<!-- java.net maven repository, for example java mail -->
 96 | 		<repository>
 97 | 			<id>Java.Net</id>
 98 | 			<url>http://download.java.net/maven/2/</url>
 99 | 		</repository>
100 | 		<repository>
101 | 			<id>ansj-repo</id>
102 | 			<url>http://maven.ansj.org/</url>
103 | 		</repository>
104 | 		<repository>
105 | 			<id>info-bliki-repository</id>
106 | 			<url>http://gwtwiki.googlecode.com/svn/maven-repository/</url>
107 | 			<releases>
108 | 				<enabled>true</enabled>
109 | 			</releases>
110 | 			<snapshots>
111 | 				<enabled>false</enabled>
112 | 			</snapshots>
113 | 		</repository>
114 | 	</repositories>
115 | 
116 | 	<build>
117 | 		<plugins>
118 | 			<plugin>
119 | 				<groupId>org.apache.maven.plugins</groupId>
120 | 				<artifactId>maven-compiler-plugin</artifactId>
121 | 				<version>3.1</version>
122 | 				<configuration>
123 | 					<source>1.7</source>
124 | 					<target>1.7</target>
125 | 					<encoding>UTF-8</encoding>
126 | 				</configuration>
127 | 			</plugin>
128 | 			<plugin>
129 | 				<groupId>org.apache.maven.plugins</groupId>
130 | 				<artifactId>maven-source-plugin</artifactId>
131 | 				<version>2.2.1</version>
132 | 				<executions>
133 | 					<execution>
134 | 						<id>attach-sources</id>
135 | 						<phase>verify</phase>
136 | 						<goals>
137 | 							<goal>jar-no-fork</goal>
138 | 						</goals>
139 | 					</execution>
140 | 				</executions>
141 | 			</plugin>
142 | 			<plugin>
143 | 				<groupId>org.apache.maven.plugins</groupId>
144 | 				<artifactId>maven-resources-plugin</artifactId>
145 | 				<version>2.6</version>
146 | 				<configuration>
147 | 					<encoding>UTF-8</encoding>
148 | 				</configuration>
149 | 			</plugin>
150 | 			<plugin>
151 | 				<groupId>org.apache.maven.plugins</groupId>
152 | 				<artifactId>maven-surefire-plugin</artifactId>
153 | 				<version>2.16</version>
154 | 			</plugin>
155 | 			<plugin>
156 | 				<artifactId>maven-assembly-plugin</artifactId>
157 | 				<version>2.4</version>
158 | 				<configuration>
159 | 					<archive>
160 | 						<manifest>
161 | 							<mainClass>zx.soft.ui.Start</mainClass>
162 | 						</manifest>
163 | 					</archive>
164 | 					<descriptorRefs>
165 | 						<descriptorRef>jar-with-dependencies</descriptorRef>
166 | 					</descriptorRefs>
167 | 				</configuration>
168 | 				<executions>
169 | 					<execution>
170 | 						<id>make-assembly</id>
171 | 						<phase>package</phase>
172 | 						<goals>
173 | 							<goal>single</goal>
174 | 						</goals>
175 | 					</execution>
176 | 				</executions>
177 | 			</plugin>
178 | 			<plugin>
179 | 				<groupId>org.codehaus.mojo</groupId>
180 | 				<artifactId>exec-maven-plugin</artifactId>
181 | 			</plugin>
182 | 		</plugins>
183 | 
184 | 		<finalName>${project.artifactId}-${project.version}</finalName>
185 | 
186 | 	</build>
187 | 
188 | 	<!-- 部署仓库 -->
189 | 	<distributionManagement>
190 | 		<snapshotRepository>
191 | 			<id>zxsoft-snapshots</id>
192 | 			<name>Nexus Snapshot Repository</name>
193 | 			<url>http://192.168.3.23:18081/nexus/content/repositories/snapshots/</url>
194 | 		</snapshotRepository>
195 | 		<repository>
196 | 			<id>sentiment</id>
197 | 			<url>http://192.168.3.23:18081/nexus/content/repositories/sentiment</url>
198 | 		</repository>
199 | 	</distributionManagement>
200 | 
201 | </project>


--------------------------------------------------------------------------------
/src/main/java/zx/soft/classification/Feature.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.classification;
 2 | 
 3 | import java.io.DataInput;
 4 | import java.io.DataOutput;
 5 | import java.io.IOException;
 6 | import java.util.HashMap;
 7 | import java.util.Map;
 8 | 
 9 | /**
10 |  * A document feature (keyword) together with, per category, the number of
11 |  * documents in which it has been counted.
12 |  */
13 | public class Feature {
14 | 
15 | 	/** Category name mapped to the number of documents counted for this feature. */
16 | 	private Map<String, Integer> docCountMap = new HashMap<>();
17 | 	/** Feature name. */
18 | 	private String name;
19 | 
20 | 	public String getName() {
21 | 		return name;
22 | 	}
23 | 
24 | 	public void setName(String name) {
25 | 		this.name = name;
26 | 	}
27 | 
28 | 	/** Adds one to the count stored for {@code category}; an unseen category starts at 1. */
29 | 	public void incDocCount(String category) {
30 | 		Integer current = docCountMap.get(category);
31 | 		docCountMap.put(category, current == null ? 1 : current + 1);
32 | 	}
33 | 
34 | 	/** Returns the count stored for {@code category}, or 0 when none is stored. */
35 | 	public int getDocCount(String category) {
36 | 		Integer count = docCountMap.get(category);
37 | 		return count == null ? 0 : count;
38 | 	}
39 | 
40 | 	/**
41 | 	 * Serializes this feature: the name (empty string when null), the number of
42 | 	 * categories, then one (category, count) pair per map entry.
43 | 	 */
44 | 	public void write(DataOutput out) throws IOException {
45 | 		String label = (name != null) ? name : "";
46 | 		out.writeUTF(label);
47 | 
48 | 		out.writeInt(docCountMap.size());
49 | 		for (Map.Entry<String, Integer> entry : docCountMap.entrySet()) {
50 | 			out.writeUTF(entry.getKey());
51 | 			out.writeInt(entry.getValue());
52 | 		}
53 | 	}
54 | 
55 | 	/** Replaces this feature's name and counts with values read in the layout produced by {@link #write}. */
56 | 	public void readFields(DataInput in) throws IOException {
57 | 		name = in.readUTF();
58 | 
59 | 		docCountMap = new HashMap<>();
60 | 		for (int remaining = in.readInt(); remaining > 0; remaining--) {
61 | 			// Arguments are evaluated left to right: category first, then its count.
62 | 			docCountMap.put(in.readUTF(), in.readInt());
63 | 		}
64 | 	}
65 | 
66 | 	/** Creates a new feature and populates it from {@code in} via {@link #readFields}. */
67 | 	public static Feature read(DataInput in) throws IOException {
68 | 		Feature feature = new Feature();
69 | 		feature.readFields(in);
70 | 		return feature;
71 | 	}
72 | 
73 | }
73 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/classification/Instance.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.classification;
 2 | 
 3 | import java.io.BufferedReader;
 4 | import java.io.File;
 5 | import java.io.FileInputStream;
 6 | import java.io.IOException;
 7 | import java.io.InputStreamReader;
 8 | import java.util.HashSet;
 9 | import java.util.List;
10 | import java.util.Set;
11 | 
12 | import org.slf4j.Logger;
13 | import org.slf4j.LoggerFactory;
14 | 
15 | import zx.soft.similarity.sentence.SegmentProxy;
16 | import zx.soft.similarity.sentence.SegmentProxy.Word;
17 | 
18 | /**
19 |  * 代表一个文档实例
20 |  *
21 |  */
22 | public class Instance {
23 | 
24 | 	private static Logger logger = LoggerFactory.getLogger(Instance.class);
25 | 
26 | 	/** 文档类别 */
27 | 	private String category;
28 | 	/** 文档内容 */
29 | 	private final Set<String> bag = new HashSet<>();
30 | 
31 | 	public Instance() {
32 | 		//
33 | 	}
34 | 
35 | 	public Instance(String category, File f, String encoding) {
36 | 		this.category = category;
37 | 		String line = null;
38 | 
39 | 		try (BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(f), encoding));) {
40 | 			while ((line = in.readLine()) != null) {
41 | 				//				System.out.println(line);
42 | 				List<Word> words = SegmentProxy.segment(line);
43 | 				for (Word w : words) {
44 | 					if (w.getPos().endsWith("adj") || w.getPos().startsWith("n") || w.getPos().startsWith("v")) {
45 | 						bag.add(w.getWord());
46 | 					}
47 | 				}
48 | 			}
49 | 		} catch (IOException e) {
50 | 			logger.error("current file:{},current line:{}", f.getAbsolutePath(), line);
51 | 			e.printStackTrace();
52 | 		}
53 | 	}
54 | 
55 | 	public String getCategory() {
56 | 		return category;
57 | 	}
58 | 
59 | 	public void setCategory(String category) {
60 | 		this.category = category;
61 | 	}
62 | 
63 | 	public Set<String> getWords() {
64 | 		return bag;
65 | 	}
66 | 
67 | }
68 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/classification/NaiveBayesClassifier.java:
--------------------------------------------------------------------------------
  1 | package zx.soft.classification;
  2 | 
  3 | import java.io.DataInputStream;
  4 | import java.io.DataOutput;
  5 | import java.io.DataOutputStream;
  6 | import java.io.File;
  7 | import java.io.FileInputStream;
  8 | import java.io.FileOutputStream;
  9 | import java.io.IOException;
 10 | import java.util.Collection;
 11 | import java.util.HashMap;
 12 | import java.util.Map;
 13 | 
 14 | public class NaiveBayesClassifier {
 15 | 
 16 | 	/**
 17 | 	 * 记录每个类别下出现的文档数量, 用于计算P(C)使用
 18 | 	 */
 19 | 	Variable VARIABLE = new Variable();
 20 | 
 21 | 	/**
 22 | 	 * 词语在所有类别中的总数量
 23 | 	 */
 24 | 	Map<String, Integer> TERM_TOTAL_COUNT = new HashMap<>();
 25 | 
 26 | 	/**
 27 | 	 * 训练一篇文档
 28 | 	 * @param doc
 29 | 	 */
 30 | 	public void training(Instance doc) {
 31 | 		VARIABLE.addInstance(doc);
 32 | 	}
 33 | 
 34 | 	/**
 35 | 	 * 保存训练结果
 36 | 	 * @throws IOException
 37 | 	 */
 38 | 	void save(File file) throws IOException {
 39 | 		DataOutput out = new DataOutputStream(new FileOutputStream(file));
 40 | 		VARIABLE.write(out);
 41 | 	}
 42 | 
 43 | 	public void load(File file) throws IOException {
 44 | 		DataInputStream in = new DataInputStream(new FileInputStream(file));
 45 | 		VARIABLE = Variable.read(in);
 46 | 	}
 47 | 
 48 | 	/**
 49 | 	 * 计算P（C)
 50 | 	 * @param category
 51 | 	 * @return
 52 | 	 */
 53 | 	public double getCategoryProbability(String category) {
 54 | 		return Math.log(VARIABLE.getDocCount(category) * 1.0f / VARIABLE.getDocCount());
 55 | 	}
 56 | 
 57 | 	/**
 58 | 	 * 计算P(feature|cateogry),返回的是取对数后的数值
 59 | 	 * @param feature
 60 | 	 * @param category
 61 | 	 * @return
 62 | 	 */
 63 | 	public double getFeatureProbability(String feature, String category) {
 64 | 		int m = VARIABLE.getFeatureCount();
 65 | 		return Math.log((VARIABLE.getDocCount(feature, category) + 1.0) / (VARIABLE.getDocCount(category) + m));
 66 | 	}
 67 | 
 68 | 	/**
 69 | 	 * 计算给定实例文档属于指定类别的概率，返回的是取对数后的数值
 70 | 	 * @param category
 71 | 	 * @param doc
 72 | 	 * @return
 73 | 	 */
 74 | 	public double getProbability(String category, Instance doc) {
 75 | 		double result = getCategoryProbability(category);
 76 | 		for (String feature : doc.getWords()) {
 77 | 			if (VARIABLE.containFeature(feature)) {
 78 | 				result += getFeatureProbability(feature, category);
 79 | 			}
 80 | 		}
 81 | 		return result;
 82 | 	}
 83 | 
 84 | 	public String getCategory(Instance doc) {
 85 | 		Collection<String> categories = VARIABLE.getCategories();
 86 | 		double best = Double.NEGATIVE_INFINITY;
 87 | 		String bestName = null;
 88 | 		for (String c : categories) {
 89 | 			double current = getProbability(c, doc);
 90 | 			//			System.out.println(c + ":" + current);
 91 | 			if (best < current) {
 92 | 				best = current;
 93 | 				bestName = c;
 94 | 			}
 95 | 		}
 96 | 		return bestName;
 97 | 	}
 98 | 
 99 | 	public static void main(String[] args) throws IOException {
100 | 		NaiveBayesClassifier classifier = new NaiveBayesClassifier();
101 | 
102 | 		//		File samplePath = new File("./corpus/Sample");
103 | 		//		for(File categoryPath:samplePath.listFiles()){
104 | 		//			String category = categoryPath.getName();
105 | 		//			for(File f:categoryPath.listFiles()){
106 | 		//				classifier.training(new Instance(category, f, "GBK"));
107 | 		//			}
108 | 		//		}
109 | 		//		classifier.save(new File("result.dat"));
110 | 		//		System.out.println("Finished!");
111 | 
112 | 		classifier.load(new File("result.dat"));
113 | 
114 | 		Instance doc = new Instance(null, new File("/tmp/10.txt"), "GBK");
115 | 		System.out.println(classifier.getCategory(doc));
116 | 
117 | 	}
118 | 
119 | }
120 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/classification/Variable.java:
--------------------------------------------------------------------------------
  1 | package zx.soft.classification;
  2 | 
  3 | import java.io.DataInput;
  4 | import java.io.DataOutput;
  5 | import java.io.IOException;
  6 | import java.util.Collection;
  7 | import java.util.HashMap;
  8 | import java.util.Map;
  9 | 
 10 | /**
 11 |  * 分类的类别
 12 |  *
 13 |  */
 14 | public class Variable {
 15 | 
 16 | 	/** 类别信息 */
 17 | 	Map<String, CategoryInfo> categoryMap = new HashMap<>();
 18 | 
 19 | 	Map<String, Feature> features = new HashMap<>();
 20 | 
 21 | 	/** 所有文档的数量 */
 22 | 	private int docCount = 0;
 23 | 
 24 | 	public void write(DataOutput out) throws IOException {
 25 | 		//保存文档总数
 26 | 		out.writeInt(docCount);
 27 | 
 28 | 		//写入类别总数
 29 | 		out.writeInt(categoryMap.size());
 30 | 		for (String category : categoryMap.keySet()) {
 31 | 			out.writeUTF(category);
 32 | 			categoryMap.get(category).write(out);
 33 | 		}
 34 | 
 35 | 		//写入Feature总数
 36 | 		out.writeInt(features.size());
 37 | 		for (String key : features.keySet()) {
 38 | 			out.writeUTF(key);
 39 | 			features.get(key).write(out);
 40 | 		}
 41 | 	}
 42 | 
 43 | 	public void readFields(DataInput in) throws IOException {
 44 | 		this.docCount = in.readInt();
 45 | 
 46 | 		int size = in.readInt();
 47 | 		categoryMap = new HashMap<String, CategoryInfo>();
 48 | 		for (int i = 0; i < size; i++) {
 49 | 			String category = in.readUTF();
 50 | 			CategoryInfo info = CategoryInfo.read(in);
 51 | 			categoryMap.put(category, info);
 52 | 		}
 53 | 
 54 | 		size = in.readInt();
 55 | 		features = new HashMap<String, Feature>();
 56 | 		for (int i = 0; i < size; i++) {
 57 | 			String word = in.readUTF();
 58 | 			Feature feature = Feature.read(in);
 59 | 			features.put(word, feature);
 60 | 		}
 61 | 	}
 62 | 
 63 | 	public static Variable read(DataInput in) throws IOException {
 64 | 		Variable v = new Variable();
 65 | 		v.readFields(in);
 66 | 		return v;
 67 | 	}
 68 | 
 69 | 	public Collection<String> getCategories() {
 70 | 		return categoryMap.keySet();
 71 | 	}
 72 | 
 73 | 	public int getFeatureCount() {
 74 | 		return features.size();
 75 | 	}
 76 | 
 77 | 	public boolean containFeature(String feature) {
 78 | 		return features.containsKey(feature);
 79 | 	}
 80 | 
 81 | 	public void incDocCount() {
 82 | 		this.docCount++;
 83 | 	}
 84 | 
 85 | 	public int getDocCount() {
 86 | 		return this.docCount;
 87 | 	}
 88 | 
 89 | 	/**
 90 | 	 * 获取置顶类别下的文档数量
 91 | 	 * @param category
 92 | 	 * @return
 93 | 	 */
 94 | 	public int getDocCount(String category) {
 95 | 		return categoryMap.get(category).getDocCount();
 96 | 	}
 97 | 
 98 | 	/**
 99 | 	 * 获取feature在指定类别下的文档出现数量
100 | 	 * @param feature
101 | 	 * @param category
102 | 	 * @return
103 | 	 */
104 | 	public int getDocCount(String feature, String category) {
105 | 		Feature f = features.get(feature);
106 | 		if (f != null) {
107 | 			return f.getDocCount(category);
108 | 		}
109 | 		return 0;
110 | 	}
111 | 
112 | 	public void addInstance(Instance instance) {
113 | 		incDocCount();
114 | 		CategoryInfo info = null;
115 | 		if (categoryMap.containsKey(instance.getCategory())) {
116 | 			info = categoryMap.get(instance.getCategory());
117 | 		} else {
118 | 			info = new CategoryInfo();
119 | 		}
120 | 		info.incDocCount();
121 | 		categoryMap.put(instance.getCategory(), info);
122 | 
123 | 		for (String word : instance.getWords()) {
124 | 			Feature feature = features.get(word);
125 | 
126 | 			if (feature == null)
127 | 				feature = new Feature();
128 | 
129 | 			feature.setName(word);
130 | 			feature.incDocCount(instance.getCategory());
131 | 
132 | 			features.put(word, feature);
133 | 		}
134 | 	}
135 | 
136 | 	public static class CategoryInfo {
137 | 		private int docCount;
138 | 
139 | 		public int getDocCount() {
140 | 			return docCount;
141 | 		}
142 | 
143 | 		public void incDocCount() {
144 | 			this.docCount++;
145 | 		}
146 | 
147 | 		public void setDocCount(int docCount) {
148 | 			this.docCount = docCount;
149 | 		}
150 | 
151 | 		public void write(DataOutput out) throws IOException {
152 | 			out.writeInt(docCount);
153 | 		}
154 | 
155 | 		public void readFields(DataInput in) throws IOException {
156 | 			this.docCount = in.readInt();
157 | 		}
158 | 
159 | 		public static CategoryInfo read(DataInput in) throws IOException {
160 | 			CategoryInfo c = new CategoryInfo();
161 | 			c.readFields(in);
162 | 			return c;
163 | 		}
164 | 	}
165 | 
166 | }
167 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/Similaritable.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity;
 2 | 
 3 | /**
 4 |  * 可以计算相似度的接口
 5 |  * 
 6 |  */
 7 | public interface Similaritable {
 8 | 
 9 | 	/**
10 | 	 * 计算两个字符串的相似度，对于句子来说，计算的是句子相似度，对于词语则计算词语的相似度
11 | 	 * @param item1 参与相似度计算的第一个字符串
12 | 	 * @param item2 参与相似度计算的第二个字符串
13 | 	 * @return
14 | 	 */
15 | 	public double getSimilarity(String item1, String item2);
16 | 
17 | }
18 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/SimilarityFactory.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity;
 2 | 
 3 | import zx.soft.similarity.sentence.SentenceSimilarity;
 4 | import zx.soft.similarity.sentence.morphology.MorphoSimilarity;
 5 | import zx.soft.similarity.word.WordSimilarity;
 6 | import zx.soft.similarity.word.hownet2.concept.XiaConceptParser;
 7 | 
 8 | public class SimilarityFactory {
 9 | 
10 | 	private static WordSimilarity wordSimilarity = XiaConceptParser.getInstance();
11 | 	private static SentenceSimilarity sentenceSimilarity = MorphoSimilarity.getInstance();
12 | 
13 | 	private SimilarityFactory() {
14 | 		//
15 | 	}
16 | 
17 | 	public static WordSimilarity getWordSimilarity() {
18 | 		return wordSimilarity;
19 | 	}
20 | 
21 | 	public static SentenceSimilarity getSentenceSimilarity() {
22 | 		return sentenceSimilarity;
23 | 	}
24 | 
25 | }
26 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/phrase/PhraseSimilarity.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.phrase;
 2 | 
 3 | import java.util.ArrayList;
 4 | import java.util.List;
 5 | 
 6 | import zx.soft.similarity.Similaritable;
 7 | 
 8 | /**
 9 |  * 一种简单的短语相似度计算方法，算法原理请参考《中文信息相似度计算理论与方法》一书P69.
10 |  * 
11 |  */
12 | public class PhraseSimilarity implements Similaritable {
13 | 
14 | 	@Override
15 | 	public double getSimilarity(String item1, String item2) {
16 | 		return (getSC(item1, item2) + getSC(item2, item1)) / 2.0;
17 | 	}
18 | 
19 | 	public List<Integer> getC(String first, String second, int pos) {
20 | 		List<Integer> results = new ArrayList<Integer>();
21 | 		char ch = first.charAt(pos);
22 | 		for (int i = 0; i < second.length(); i++) {
23 | 			if (ch == second.charAt(i)) {
24 | 				results.add(i);
25 | 			}
26 | 		}
27 | 		return results;
28 | 	}
29 | 
30 | 	public int getDistance(String first, String second, int pos) {
31 | 		int d = second.length();
32 | 		for (int k : getC(first, second, pos)) {
33 | 			int value = Math.abs(k - pos);
34 | 			if (d > value) {
35 | 				d = value;
36 | 			}
37 | 		}
38 | 
39 | 		return d;
40 | 	}
41 | 
42 | 	public double getCC(String first, String second, int pos) {
43 | 		return (second.length() - getDistance(first, second, pos)) * 1.0 / second.length();
44 | 	}
45 | 
46 | 	public double getSC(String first, String second) {
47 | 		double total = 0.0;
48 | 		for (int i = 0; i < first.length(); i++) {
49 | 			total = total + getCC(first, second, i);
50 | 		}
51 | 		return total / first.length();
52 | 	}
53 | 
54 | }
55 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/sentence/SegmentProxy.java:
--------------------------------------------------------------------------------
  1 | package zx.soft.similarity.sentence;
  2 | 
  3 | import java.awt.BorderLayout;
  4 | import java.awt.GridLayout;
  5 | import java.awt.event.ActionEvent;
  6 | import java.awt.event.ActionListener;
  7 | import java.util.ArrayList;
  8 | import java.util.List;
  9 | 
 10 | import javax.swing.BorderFactory;
 11 | import javax.swing.JButton;
 12 | import javax.swing.JLabel;
 13 | import javax.swing.JPanel;
 14 | import javax.swing.JScrollPane;
 15 | import javax.swing.JTextArea;
 16 | import javax.swing.JTextField;
 17 | 
 18 | import org.ansj.domain.Term;
 19 | import org.ansj.recognition.NatureRecognition;
 20 | import org.ansj.splitWord.analysis.ToAnalysis;
 21 | 
 22 | /**
 23 |  * 对词法分析程序的封装代理，目前内部封装了对Ictclas4j（夏天改进版）的调用<br/>
 24 |  * 为方便演示程序快速启动，对Segment的调用采用了单例模式，实现需要时的延迟加载。
 25 |  *
 26 |  */
 27 | public class SegmentProxy {
 28 | 
 29 | 	public static class Word {
 30 | 		/**
 31 | 		 * 词语内容
 32 | 		 */
 33 | 		private String word;
 34 | 		/**
 35 | 		 * 词语词性代号
 36 | 		 */
 37 | 		private String pos;
 38 | 
 39 | 		public Word(String word, String pos) {
 40 | 			this.word = word;
 41 | 			this.pos = pos;
 42 | 		}
 43 | 
 44 | 		public String getWord() {
 45 | 			return word;
 46 | 		}
 47 | 
 48 | 		public void setWord(String word) {
 49 | 			this.word = word;
 50 | 		}
 51 | 
 52 | 		public String getPos() {
 53 | 			return pos;
 54 | 		}
 55 | 
 56 | 		public void setPos(String pos) {
 57 | 			this.pos = pos;
 58 | 		}
 59 | 	}
 60 | 
 61 | 	public static List<Word> segment(String sentence) {
 62 | 		List<Word> results = new ArrayList<Word>();
 63 | 		List<Term> terms = ToAnalysis.parse(sentence);
 64 | 		new NatureRecognition(terms).recognition();
 65 | 
 66 | 		for (Term term : terms) {
 67 | 			results.add(new Word(term.getName(), term.getNatrue().natureStr));
 68 | 		}
 69 | 
 70 | 		return results;
 71 | 	}
 72 | 
 73 | 	public static String getSegmentedString(String sentence) {
 74 | 		List<Word> words = segment(sentence);
 75 | 		StringBuilder sb = new StringBuilder();
 76 | 		for (Word word : words) {
 77 | 			sb.append(word.getWord() + "/" + word.getPos()).append(" ");
 78 | 		}
 79 | 		return sb.toString();
 80 | 	}
 81 | 
 82 | 	public static JPanel createPanel() {
 83 | 		//声明总的大面板, fullPanel包括一个NorthPanel和一个centerPanel
 84 | 		JPanel fullPanel = new JPanel();
 85 | 		fullPanel.setLayout(new BorderLayout());
 86 | 
 87 | 		JPanel northPanel = new JPanel();
 88 | 		fullPanel.add(northPanel, "North");
 89 | 
 90 | 		//centerPanel包括了一个文本框
 91 | 		JPanel centerPanel = new JPanel();
 92 | 		fullPanel.add(centerPanel, "Center");
 93 | 		centerPanel.setLayout(new BorderLayout());
 94 | 		final JTextArea result = new JTextArea();
 95 | 		//result.setFont(new Font("宋体", Font.PLAIN, 16));
 96 | 		result.setLineWrap(true);
 97 | 		JScrollPane centerScrollPane = new JScrollPane(result);
 98 | 		centerPanel.add(centerScrollPane, "Center");
 99 | 
100 | 		northPanel.setLayout(new GridLayout(1, 1));
101 | 
102 | 		//以下加入northPanel中的第一个面板
103 | 		final JTextField senField = new JTextField("什么是计算机病毒");
104 | 		senField.setColumns(50);
105 | 
106 | 		JPanel mainPanel = new JPanel();
107 | 		mainPanel.setLayout(new GridLayout(2, 1));
108 | 
109 | 		JPanel linePanel = new JPanel();
110 | 		linePanel.add(new JLabel("句子:"));
111 | 		linePanel.add(senField);
112 | 		mainPanel.add(linePanel);
113 | 
114 | 		linePanel = new JPanel();
115 | 		JButton goButton = new JButton("词法分析");
116 | 		linePanel.add(goButton);
117 | 		mainPanel.add(linePanel);
118 | 		goButton.addActionListener(new ActionListener() {
119 | 
120 | 			@Override
121 | 			public void actionPerformed(ActionEvent e) {
122 | 				String sentence = senField.getText();
123 | 				String text = "[" + sentence + "]的词法分析结果为:";
124 | 
125 | 				text = text + "\n" + getSegmentedString(sentence);
126 | 				text = text + "\n________________________________\n" + result.getText();
127 | 				result.setText(text);
128 | 			}
129 | 
130 | 		});
131 | 		mainPanel.setBorder(BorderFactory.createEtchedBorder());
132 | 		northPanel.add(mainPanel);
133 | 
134 | 		return fullPanel;
135 | 	}
136 | 
137 | }
138 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/sentence/SentenceSimilarity.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.sentence;
 2 | 
 3 | import zx.soft.similarity.Similaritable;
 4 | 
 5 | /**
 6 |  * 语句相似度接口
 7 |  *
 8 |  * @author wanggang
 9 |  *
10 |  */
11 | public interface SentenceSimilarity extends Similaritable {
12 | 
13 | }
14 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/sentence/editdistance/Block.java:
--------------------------------------------------------------------------------
  1 | package zx.soft.similarity.sentence.editdistance;
  2 | 
  3 | public class Block<T> {
  4 | 
  5 | 	private int globalPosition;
  6 | 	/** 块的内容 */
  7 | 	private SuperString<T> data;
  8 | 	/** 前后指针 */
  9 | 	private Block<T> prev, next;
 10 | 	/** 是否已经进行划分 */
 11 | 	private boolean divideFlag = false;
 12 | 
 13 | 	public Block(SuperString<T> string) {
 14 | 		this.data = string;
 15 | 		this.globalPosition = 0;
 16 | 	}
 17 | 
 18 | 	public Block(SuperString<T> string, int globalBegin) {
 19 | 		this.data = string;
 20 | 		this.globalPosition = globalBegin;
 21 | 	}
 22 | 
 23 | 	public int getGlobalPosition() {
 24 | 		return globalPosition;
 25 | 	}
 26 | 
 27 | 	public void setGlobalPosition(int globalPosition) {
 28 | 		this.globalPosition = globalPosition;
 29 | 	}
 30 | 
 31 | 	public SuperString<T> getData() {
 32 | 		return data;
 33 | 	}
 34 | 
 35 | 	public void setData(SuperString<T> data) {
 36 | 		this.data = data;
 37 | 	}
 38 | 
 39 | 	public Block<T> getPrev() {
 40 | 		return prev;
 41 | 	}
 42 | 
 43 | 	public void setPrev(Block<T> prev) {
 44 | 		this.prev = prev;
 45 | 	}
 46 | 
 47 | 	public Block<T> getNext() {
 48 | 		return next;
 49 | 	}
 50 | 
 51 | 	public void setNext(Block<T> next) {
 52 | 		this.next = next;
 53 | 	}
 54 | 
 55 | 	public boolean isDivideFlag() {
 56 | 		return divideFlag;
 57 | 	}
 58 | 
 59 | 	public void setDivideFlag(boolean divideFlag) {
 60 | 		this.divideFlag = divideFlag;
 61 | 	}
 62 | 
 63 | 	public void divide(int start, int length) {
 64 | 		if (start == 0 && length == data.length()) {
 65 | 			this.divideFlag = true;
 66 | 			return;
 67 | 		} else if (start == 0) {
 68 | 			//前面为已经分割的标记，后面应该为未分割的标记
 69 | 			Block<T> tail = new Block<T>(data.substring(length), globalPosition + start);
 70 | 			this.setDivideFlag(true);
 71 | 			this.setData(data.substring(0, length));
 72 | 			tail.next = this.next;
 73 | 			if (tail.next != null)
 74 | 				tail.next.prev = tail;
 75 | 			this.next = tail;
 76 | 			tail.prev = this;
 77 | 		} else if (start + length == data.length()) {
 78 | 			//后面为已经分割的标记，前面应该为未分割的标记
 79 | 			Block<T> head = new Block<T>(data.substring(0, start), globalPosition);
 80 | 
 81 | 			this.setDivideFlag(true);
 82 | 			this.setData(data.substring(start));
 83 | 
 84 | 			head.prev = this.prev;
 85 | 			if (head.prev != null)
 86 | 				head.prev.next = head;
 87 | 			head.next = this;
 88 | 			this.prev = head;
 89 | 		} else {
 90 | 			//中间为已经分割的标记，前面和后面应该为未分割的标记
 91 | 			Block<T> head = new Block<T>(data.substring(0, start), globalPosition);
 92 | 			Block<T> tail = new Block<T>(data.substring(start + length), globalPosition + start + length);
 93 | 
 94 | 			this.setDivideFlag(true);
 95 | 			this.setData(data.substring(start, start + length));
 96 | 			this.setGlobalPosition(globalPosition + start);
 97 | 
 98 | 			head.prev = this.prev;
 99 | 			if (head.prev != null)
100 | 				head.prev.next = head;
101 | 			head.next = this;
102 | 			this.prev = head;
103 | 
104 | 			tail.next = this.next;
105 | 			if (tail.next != null)
106 | 				tail.next.prev = tail;
107 | 			this.next = tail;
108 | 			tail.prev = this;
109 | 		}
110 | 	}
111 | 
112 | }
113 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/sentence/editdistance/CharEditUnit.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.sentence.editdistance;
 2 | 
 3 | public class CharEditUnit extends EditUnit {
 4 | 
 5 | 	private String content = "";
 6 | 
 7 | 	public CharEditUnit(Character ch) {
 8 | 		content = ch.toString();
 9 | 	}
10 | 
11 | 	@Override
12 | 	public String getUnitString() {
13 | 		return content;
14 | 	}
15 | 
16 | }
17 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/sentence/editdistance/ChunkEditUnit.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.sentence.editdistance;
 2 | 
 3 | public class ChunkEditUnit extends EditUnit {
 4 | 
 5 | 	private SuperString<? extends EditUnit> chunk = null;
 6 | 
 7 | 	public ChunkEditUnit(SuperString<? extends EditUnit> chunk) {
 8 | 		this.chunk = chunk;
 9 | 	}
10 | 
11 | 	@Override
12 | 	public String getUnitString() {
13 | 		return chunk.toString();
14 | 	}
15 | 
16 | 	/**
17 | 	 * 根据此语的相似度获取替换代价
18 | 	 */
19 | 	@Override
20 | 	public double getSubstitutionCost(EditUnit otherUnit) {
21 | 		if (!(otherUnit instanceof ChunkEditUnit))
22 | 			return chunk.length();
23 | 		if (equals(otherUnit))
24 | 			return 0.0;
25 | 
26 | 		ChunkEditUnit other = (ChunkEditUnit) otherUnit;
27 | 		return new StandardEditDistance().getEditDistance(chunk, other.chunk);
28 | 	}
29 | 
30 | 	/**
31 | 	 * 获取删除代价,标准算法的默认值为1.0, 此处也设为1.0
32 | 	 * 具体的编辑单元可以通过覆盖该方法设置不同的删除代价
33 | 	 * @return 删除代价
34 | 	 */
35 | 	@Override
36 | 	public double getDeletionCost() {
37 | 		return chunk.length();
38 | 	}
39 | 
40 | 	/**
41 | 	 * 获取插入代价,标准算法的默认值为1.0.
42 | 	 * 具体的编辑单元可以通过覆盖该方法设置不同的插入代价
43 | 	 */
44 | 	@Override
45 | 	public double getInsertionCost() {
46 | 		return chunk.length();
47 | 	}
48 | 
49 | }
50 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/sentence/editdistance/EditDistance.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.sentence.editdistance;
 2 | 
 3 | import zx.soft.similarity.Similaritable;
 4 | 
 5 | /**
 6 |  * 编辑距离的父类，定义了其中的主要行为
 7 |  * 
 8 |  */
 9 | public abstract class EditDistance implements Similaritable {
10 | 
11 | 	public abstract double getEditDistance(SuperString<? extends EditUnit> S, SuperString<? extends EditUnit> T);
12 | 
13 | 	@Override
14 | 	public double getSimilarity(String s1, String s2) {
15 | 		SuperString<WordEditUnit> S = SuperString.createWordSuperString(s1);
16 | 		SuperString<WordEditUnit> T = SuperString.createWordSuperString(s2);
17 | 
18 | 		return 1 - (getEditDistance(S, T)) / (Math.max(S.length(), T.length()));
19 | 	}
20 | 
21 | }
22 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/sentence/editdistance/EditUnit.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.sentence.editdistance;
 2 | 
 3 | /**
 4 |  * 编辑单元
 5 |  * 
 6 |  */
 7 | public abstract class EditUnit {
 8 | 
 9 | 	/**
10 | 	 * 获取编辑单元的内部字符串
11 | 	 * @return
12 | 	 */
13 | 	public abstract String getUnitString();
14 | 
15 | 	/**
16 | 	 * 获取替换代价，默认替换代价当替换单元的内容相同时为0，
17 | 	 * 不同时为1
18 | 	 */
19 | 	public double getSubstitutionCost(EditUnit other) {
20 | 		return this.equals(other) ? 0 : 1;
21 | 	}
22 | 
23 | 	/**
24 | 	 * 获取删除代价,标准算法的默认值为1.0, 此处也设为1.0
25 | 	 * 具体的编辑单元可以通过覆盖该方法设置不同的删除代价
26 | 	 * @return 删除代价
27 | 	 */
28 | 	public double getDeletionCost() {
29 | 		return 1.0;
30 | 	}
31 | 
32 | 	/**
33 | 	 * 获取插入代价,标准算法的默认值为1.0.
34 | 	 * 具体的编辑单元可以通过覆盖该方法设置不同的插入代价
35 | 	 */
36 | 	public double getInsertionCost() {
37 | 		return 1.0;
38 | 	}
39 | 
40 | 	@Override
41 | 	public boolean equals(Object other) {
42 | 		if (!(other instanceof EditUnit))
43 | 			return false;
44 | 		return getUnitString().equals(((EditUnit) other).getUnitString());
45 | 	}
46 | 
47 | 	@Override
48 | 	public String toString() {
49 | 		return getUnitString();
50 | 	}
51 | 
52 | }
53 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/sentence/editdistance/GregorEditDistance.java:
--------------------------------------------------------------------------------
  1 | package zx.soft.similarity.sentence.editdistance;
  2 | 
  3 | /**
  4 |  * 由Gregor提出的考虑块交换(Block Transposition)的编辑距离改进算法
  5 |  * 时间复杂度为O(m3n3)
  6 |  * 具体实现请参考GregorLeusch，Nicola Ueffing的文章《A Novel String-to-String Distance Measure With
  7 |  * Application to Machine Translation Evaluation》
  8 |  * 问题：<br/>
  9 |  * 相似度计算的问题会影响句子相似度计算的直观结果，例如“什么是计算机病毒”，“电脑病毒是什么”
 10 |  * 直觉应该是2，即“什么是计算机病毒”首先变为“计算机病毒什么是”，再变为“计算机病毒是什么”，
 11 |  * 编辑代价为2，但实际上，当由“什么是计算机病毒”变为“计算机病毒什么是”后，由于"什么是"与“是什么”的替换代价只有0.2，
 12 |  * 因而不再进行交互，故总的编辑距离为1.2
 13 |  * 
 14 |  */
 15 | public class GregorEditDistance extends EditDistance {
 16 | 
 17 | 	/** 块交换代价 */
 18 | 	public static double swapCost = 0.5;
 19 | 
 20 | 	private SuperString<? extends EditUnit> S, T;
 21 | 
 22 | 	/** 存放字符串从S(i0-i1)到T(j0-j1)的中间运算结果，避免多次运算，提高运算效率*/
 23 | 	private double[][][][] QArray;
 24 | 
 25 | 	@Override
 26 | 	public double getEditDistance(SuperString<? extends EditUnit> S, SuperString<? extends EditUnit> T) {
 27 | 		this.S = S;
 28 | 		this.T = T;
 29 | 		QArray = new double[S.length()][S.length()][T.length()][T.length()];
 30 | 		for (int i = 0; i < S.length(); i++) {
 31 | 			for (int i2 = 0; i2 < S.length(); i2++)
 32 | 				for (int j = 0; j < T.length(); j++)
 33 | 					for (int j2 = 0; j2 < T.length(); j2++) {
 34 | 						QArray[i][i2][j][j2] = Double.MAX_VALUE;
 35 | 					}
 36 | 		}
 37 | 
 38 | 		return Q(0, S.length() - 1, 0, T.length() - 1);
 39 | 	}
 40 | 
 41 | 	private double Q(int i0, int i1, int j0, int j1) {
 42 | 		double cost = 0;
 43 | 
 44 | 		if (i1 < i0) {
 45 | 			for (int j = j0; j <= j1; j++) {
 46 | 				cost += T.elementAt(j).getInsertionCost();
 47 | 			}
 48 | 			return cost;
 49 | 		} else if (j1 < j0) {
 50 | 			for (int i = i0; i <= i1; i++) {
 51 | 				cost += S.elementAt(i).getDeletionCost();
 52 | 			}
 53 | 			return cost;
 54 | 		} else if (i1 == i0 && j1 == j0) {
 55 | 			cost = S.elementAt(i0).getSubstitutionCost(T.elementAt(j0));
 56 | 			QArray[i0][i1][j0][j1] = cost;
 57 | 			return cost;
 58 | 		} else if (i1 == i0) {
 59 | 			double minSubstituteValue = 1.0;
 60 | 			int minPosJ = j0;
 61 | 			for (int j = j0; j <= j1; j++) {
 62 | 				double subsitituteValue = S.elementAt(i0).getSubstitutionCost(T.elementAt(j));
 63 | 				if (minSubstituteValue > subsitituteValue) {
 64 | 					minSubstituteValue = subsitituteValue;
 65 | 					minPosJ = j;
 66 | 				}
 67 | 			}
 68 | 			for (int j = j0; j <= j1; j++) {
 69 | 				if (j == minPosJ) {
 70 | 					cost += minSubstituteValue;
 71 | 				} else {
 72 | 					cost += T.elementAt(j).getInsertionCost();
 73 | 				}
 74 | 			}
 75 | 		} else if (j1 == j0) {
 76 | 			double minSubstituteValue = 1.0;
 77 | 			int minPosI = i0;
 78 | 			for (int i = i0; i <= i1; i++) {
 79 | 				double subsitituteValue = S.elementAt(i).getSubstitutionCost(T.elementAt(j0));
 80 | 				if (minSubstituteValue > subsitituteValue) {
 81 | 					minSubstituteValue = subsitituteValue;
 82 | 					minPosI = i;
 83 | 				}
 84 | 			}
 85 | 			for (int i = i0; i <= i1; i++) {
 86 | 				if (i == minPosI) {
 87 | 					cost += minSubstituteValue;
 88 | 				} else {
 89 | 					cost += S.elementAt(i).getDeletionCost();
 90 | 				}
 91 | 			}
 92 | 		} else {
 93 | 			if (QArray[i0][i1][j0][j1] < Double.MAX_VALUE) {
 94 | 				return QArray[i0][i1][j0][j1];
 95 | 			}
 96 | 			for (int i = i0; i < i1; i++) {
 97 | 				for (int j = j0; j < j1; j++) {
 98 | 					double c = Math.min(Q(i0, i, j0, j) + Q(i + 1, i1, j + 1, j1),
 99 | 							Q(i0, i, j + 1, j1) + Q(i + 1, i1, j0, j) + swapCost);
100 | 					if (c < QArray[i0][i1][j0][j1]) {
101 | 						QArray[i0][i1][j0][j1] = c;
102 | 					}
103 | 				}
104 | 			}
105 | 			return QArray[i0][i1][j0][j1];
106 | 		}
107 | 		QArray[i0][i1][j0][j1] = cost;
108 | 		return cost;
109 | 	}
110 | 
111 | 	public static void main(String[] argv) {
112 | 		String s1 = "abcxdef";
113 | 		String s2 = "defxabc";
114 | 		//String s2 = "我的密码我忘记了,我该怎样做呢?";
115 | 		GregorEditDistance ed = new GregorEditDistance();
116 | 		System.out.println(ed.getEditDistance(SuperString.createCharSuperString(s1),
117 | 				SuperString.createCharSuperString(s2)));
118 | 	}
119 | 
120 | }
121 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/sentence/editdistance/Split.java:
--------------------------------------------------------------------------------
  1 | package zx.soft.similarity.sentence.editdistance;
  2 | 
  3 | import java.util.ArrayList;
  4 | import java.util.List;
  5 | 
  6 | public class Split {
  7 | 
  8 | 	public static boolean MERGE_FLAG = true;
  9 | 
 10 | 	@SuppressWarnings({ "unchecked", "rawtypes" })
 11 | 	public static Object[] split(SuperString<? extends EditUnit> X, SuperString<? extends EditUnit> Y) {
 12 | 		Block<? extends EditUnit> LX = new Block(X);
 13 | 		Block<? extends EditUnit> LY = new Block(Y);
 14 | 		split(LX, LY);
 15 | 		while (LY.getPrev() != null) {
 16 | 			LY = LY.getPrev();
 17 | 		}
 18 | 		while (LX.getPrev() != null) {
 19 | 			LX = LX.getPrev();
 20 | 		}
 21 | 		List<ChunkEditUnit> first = new ArrayList<ChunkEditUnit>();
 22 | 		List<ChunkEditUnit> second = new ArrayList<ChunkEditUnit>();
 23 | 		while (LX != null) {
 24 | 			first.add(new ChunkEditUnit(LX.getData()));
 25 | 			LX = LX.getNext();
 26 | 		}
 27 | 
 28 | 		while (LY != null) {
 29 | 			second.add(new ChunkEditUnit(LY.getData()));
 30 | 			LY = LY.getNext();
 31 | 		}
 32 | 		SuperString<ChunkEditUnit> s1 = new SuperString<ChunkEditUnit>(first);
 33 | 		SuperString<ChunkEditUnit> s2 = new SuperString<ChunkEditUnit>(second);
 34 | 		Object[] obj = new Object[] { s1, s2 };
 35 | 		return obj;
 36 | 	}
 37 | 
 38 | 	private static void split(Block<?> bx, Block<?> LY) {
 39 | 		LCS maxLCS = null;
 40 | 		Block<?> by = LY;
 41 | 		while (by.getPrev() != null) {
 42 | 			by = by.getPrev();
 43 | 		}
 44 | 		Block<?> maxMatchedBy = by;
 45 | 		while (by != null) {
 46 | 			if (by.isDivideFlag()) {
 47 | 				by = by.getNext();
 48 | 				continue;
 49 | 			}
 50 | 
 51 | 			LCS lcs = LCS.parse(bx.getData(), by.getData());
 52 | 			if (maxLCS == null || maxLCS.length < lcs.length) {
 53 | 				maxLCS = lcs;
 54 | 				maxMatchedBy = by;
 55 | 			}
 56 | 
 57 | 			by = by.getNext();
 58 | 		}
 59 | 
 60 | 		if (maxLCS != null && maxLCS.length > 0) {
 61 | 			bx.divide(maxLCS.x_pos, maxLCS.length);
 62 | 			maxMatchedBy.divide(maxLCS.y_pos, maxLCS.length);
 63 | 		}
 64 | 
 65 | 		if (bx.getPrev() != null && !bx.isDivideFlag()) {
 66 | 			split(bx.getPrev(), LY);
 67 | 		}
 68 | 
 69 | 		if (bx.getNext() != null && !bx.getNext().isDivideFlag()) {
 70 | 			split(bx.getNext(), LY);
 71 | 		}
 72 | 	}
 73 | 
 74 | 	/**
 75 | 	 * Longest common substring (contiguous run) of two SuperStrings; length 0 means no match.
 76 | 	 * @author Gavin
 77 | 	 */
 78 | 	public static class LCS {
 79 | 		public int length = 0; // length of the longest common run
 80 | 		public int x_pos = 0; // start offset of the run in X
 81 | 		public int y_pos = 0; // start offset of the run in Y
 82 | 
 83 | 		/** Returns the first longest run, scanning X's start offsets in order and locating each candidate with Y.indexOf. */
 84 | 		public static LCS parse(SuperString<?> X, SuperString<?> Y) {
 85 | 			LCS lcs = new LCS();
 86 | 			for (int start = 0; start < X.length(); start++) {
 87 | 				// Only candidates longer than the best so far matter; one missing from Y cannot match when extended.
 88 | 				for (int end = start + lcs.length + 1; end <= X.length(); end++) {
 89 | 					int pos = Y.indexOf(X.substring(start, end));
 90 | 					if (pos < 0) {
 91 | 						break;
 92 | 					}
 93 | 					lcs.length = end - start;
 94 | 					lcs.x_pos = start;
 95 | 					lcs.y_pos = pos;
 96 | 				}
 97 | 			}
 98 | 			return lcs;
 99 | 		}
100 | 
101 | 		@Override
102 | 		public String toString() {
103 | 			return "length=" + length + ", x_pos=" + x_pos + ", y_pos=" + y_pos;
104 | 		}
105 | 	}
106 | 
107 | 	/**
108 | 	 * Manual smoke run: segments two sample questions into word units and splits them.
109 | 	 * SuperString.createCharSuperString can be substituted to experiment at character level,
110 | 	 * e.g. with "abcdefghijkabc" / "cdefghijklabccc" or "I like the book" / "the book I like".
111 | 	 * @param args unused
112 | 	 */
113 | 	public static void main(String[] args) {
114 | 		final String firstSentence = "什么是计算机病毒";
115 | 		final String secondSentence = "电脑病毒是什么";
116 | 
117 | 		SuperString<WordEditUnit> firstUnits = SuperString.createWordSuperString(firstSentence);
118 | 		SuperString<WordEditUnit> secondUnits = SuperString.createWordSuperString(secondSentence);
119 | 
120 | 		// The result of the split is not inspected here. To look at the raw match instead:
121 | 		//		LCS lcs = LCS.parse(firstUnits, secondUnits);
122 | 		//		System.out.println(lcs);
123 | 
124 | 		Split.split(firstUnits, secondUnits);
125 | 	}
126 | 
127 | }
128 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/sentence/editdistance/StandardEditDistance.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.sentence.editdistance;
 2 | 
 3 | /**
 4 |  * Standard edit distance between two SuperStrings, used for Chinese sentence similarity.
 5 |  * Insertion, deletion and substitution costs are supplied by the EditUnit elements.
 6 |  */
 7 | public class StandardEditDistance extends EditDistance {
 8 | 
 9 | 	/**
10 | 	 * Computes the edit distance between two sequences.
11 | 	 * @param X first sequence; its units are charged their deletion cost
12 | 	 * @param Y second sequence; its units are charged their insertion cost
13 | 	 * @return the minimal accumulated cost
14 | 	 */
15 | 	@Override
16 | 	public double getEditDistance(SuperString<? extends EditUnit> X, SuperString<? extends EditUnit> Y) {
17 | 		final int xLen = X.length();
18 | 		final int yLen = Y.length();
19 | 
20 | 		// With one side empty the distance is just inserting all of Y or deleting all of X.
21 | 		if (xLen == 0) {
22 | 			double total = 0.0;
23 | 			for (int k = 0; k < yLen; k++) {
24 | 				total += Y.elementAt(k).getInsertionCost();
25 | 			}
26 | 			return total;
27 | 		}
28 | 		if (yLen == 0) {
29 | 			double total = 0.0;
30 | 			for (int k = 0; k < xLen; k++) {
31 | 				total += X.elementAt(k).getDeletionCost();
32 | 			}
33 | 			return total;
34 | 		}
35 | 
36 | 		// table[row][col] = distance between the first col units of X and the first row units of Y
37 | 		final double[][] table = new double[yLen + 1][xLen + 1];
38 | 		table[0][0] = 0.0;
39 | 		for (int col = 1; col <= xLen; col++) {
40 | 			table[0][col] = table[0][col - 1] + X.elementAt(col - 1).getDeletionCost();
41 | 		}
42 | 		for (int row = 1; row <= yLen; row++) {
43 | 			table[row][0] = table[row - 1][0] + Y.elementAt(row - 1).getInsertionCost();
44 | 		}
45 | 
46 | 		for (int col = 1; col <= xLen; col++) {
47 | 			final EditUnit source = X.elementAt(col - 1);
48 | 			for (int row = 1; row <= yLen; row++) {
49 | 				final EditUnit target = Y.elementAt(row - 1);
50 | 				final double substituted = table[row - 1][col - 1] + source.getSubstitutionCost(target);
51 | 				final double inserted = table[row - 1][col] + target.getInsertionCost();
52 | 				final double deleted = table[row][col - 1] + source.getDeletionCost();
53 | 				table[row][col] = Math.min(Math.min(inserted, deleted), substituted);
54 | 			}
55 | 		}
56 | 		return table[yLen][xLen];
57 | 	}
58 | 
59 | 	/** Prints the character-level and word-level distances of two sample questions. */
60 | 	public static void main(String[] args) {
61 | 		final String first = "什么是计算机病毒";
62 | 		final String second = "什么是电脑病毒";
63 | 		StandardEditDistance ed = new StandardEditDistance();
64 | 
65 | 		System.out.println(ed.getEditDistance(SuperString.createCharSuperString(first),
66 | 				SuperString.createCharSuperString(second)));
67 | 		System.out.println(ed.getEditDistance(SuperString.createWordSuperString(first),
68 | 				SuperString.createWordSuperString(second)));
69 | 	}
70 | 
71 | }
72 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/sentence/editdistance/SuperString.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.sentence.editdistance;
 2 | 
 3 | import java.util.ArrayList;
 4 | import java.util.List;
 5 | 
 6 | import zx.soft.similarity.sentence.SegmentProxy;
 7 | import zx.soft.similarity.sentence.SegmentProxy.Word;
 8 | 
 9 | /**
10 |  * "Super string": a sequence of elements of an arbitrary type with String-like operations.
11 |  */
12 | public class SuperString<T> {
13 | 
14 | 	/** Backing list; stored as passed in, not copied. */
15 | 	private final List<T> contents;
16 | 
17 | 	public SuperString(List<T> contents) {
18 | 		this.contents = contents;
19 | 	}
20 | 
21 | 	/** Builds a SuperString holding one CharEditUnit per char of str. */
22 | 	public static SuperString<CharEditUnit> createCharSuperString(String str) {
23 | 		List<CharEditUnit> units = new ArrayList<CharEditUnit>(str.length());
24 | 		for (int i = 0; i < str.length(); i++) {
25 | 			units.add(new CharEditUnit(str.charAt(i)));
26 | 		}
27 | 		return new SuperString<CharEditUnit>(units);
28 | 	}
29 | 
30 | 	/** Segments sentence with SegmentProxy and wraps each word in a WordEditUnit. */
31 | 	public static SuperString<WordEditUnit> createWordSuperString(String sentence) {
32 | 		List<Word> words = SegmentProxy.segment(sentence);
33 | 		List<WordEditUnit> units = new ArrayList<WordEditUnit>(words.size());
34 | 		for (Word word : words) {
35 | 			units.add(new WordEditUnit(word));
36 | 		}
37 | 		return new SuperString<WordEditUnit>(units);
38 | 	}
39 | 
40 | 	/** Returns the element at pos; throws ArrayIndexOutOfBoundsException when pos is out of range. */
41 | 	public T elementAt(int pos) {
42 | 		if (pos < 0 || pos >= contents.size()) {
43 | 			throw new ArrayIndexOutOfBoundsException("下标越界");
44 | 		}
45 | 		return contents.get(pos);
46 | 	}
47 | 
48 | 	/**
49 | 	 * Returns the first index where substring occurs (elements compared with equals), or -1; an empty receiver always yields -1.
50 | 	 */
51 | 	public int indexOf(SuperString<?> substring) {
52 | 		final int lastStart = length() - substring.length();
53 | 		for (int start = 0; start < length() && start <= lastStart; start++) {
54 | 			int matched = 0;
55 | 			while (matched < substring.length() && elementAt(start + matched).equals(substring.elementAt(matched))) {
56 | 				matched++;
57 | 			}
58 | 			if (matched == substring.length()) {
59 | 				return start;
60 | 			}
61 | 		}
62 | 		return -1;
63 | 	}
64 | 
65 | 	/** Returns the elements in [fromIndex, toIndex) as a view (subList) of this string's list. */
66 | 	public SuperString<T> substring(int fromIndex, int toIndex) {
67 | 		return new SuperString<T>(contents.subList(fromIndex, toIndex));
68 | 	}
69 | 
70 | 	public SuperString<T> substring(int fromIndex) {
71 | 		return new SuperString<T>(contents.subList(fromIndex, contents.size()));
72 | 	}
73 | 
74 | 	public int length() {
75 | 		return contents.size();
76 | 	}
77 | 
78 | 	@Override
79 | 	public String toString() {
80 | 		StringBuilder sb = new StringBuilder();
81 | 		for (int i = 0; i < length(); i++) {
82 | 			sb.append(elementAt(i));
83 | 		}
84 | 		return sb.toString();
85 | 	}
86 | 
87 | 	/** Compares string forms; null is never equal. Any object with the same toString() still matches, as before. */
88 | 	@Override
89 | 	public boolean equals(Object other) {
90 | 		return other != null && toString().equals(other.toString());
91 | 	}
92 | 
93 | 	/** Hash of the string form, so that SuperStrings that are equal also hash alike. */
94 | 	@Override
95 | 	public int hashCode() {
96 | 		return toString().hashCode();
97 | 	}
98 | }
99 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/sentence/editdistance/WordEditUnit.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.sentence.editdistance;
 2 | 
 3 | import zx.soft.similarity.sentence.SegmentProxy.Word;
 4 | import zx.soft.similarity.word.hownet2.concept.XiaConceptParser;
 5 | 
 6 | /**
 7 |  * Edit unit wrapping one segmented word. Substitution cost and equality are both
 8 |  * derived from the word similarity reported by XiaConceptParser.
 9 |  */
10 | public class WordEditUnit extends EditUnit {
11 | 
12 | 	/** Two words with the same part of speech count as equal above this similarity. */
13 | 	private static final double EQUAL_THRESHOLD = 0.85;
14 | 
15 | 	private Word word = null;
16 | 
17 | 	public WordEditUnit(Word word) {
18 | 		this.word = word;
19 | 	}
20 | 
21 | 	@Override
22 | 	public String getUnitString() {
23 | 		return word.getWord();
24 | 	}
25 | 
26 | 	/**
27 | 	 * Substitution cost derived from word similarity: 1.0 for a non-word unit or a different
28 | 	 * part of speech, 0.0 when the words count as equal, otherwise 1 - similarity.
29 | 	 */
30 | 	@Override
31 | 	public double getSubstitutionCost(EditUnit otherUnit) {
32 | 		if (!(otherUnit instanceof WordEditUnit)) {
33 | 			return 1.0;
34 | 		}
35 | 		WordEditUnit other = (WordEditUnit) otherUnit;
36 | 		if (!samePos(other)) {
37 | 			return 1.0;
38 | 		}
39 | 		// Computed once; it decides both the "equal" case (cost 0) and the graded cost.
40 | 		double sim = similarityTo(other);
41 | 		return sim > EQUAL_THRESHOLD ? 0.0 : 1 - sim;
42 | 	}
43 | 
44 | 	/**
45 | 	 * Fuzzy equality: same part of speech and similarity above the threshold.
46 | 	 * NOTE(review): hashCode is not overridden in this class; confirm no hash-based use.
47 | 	 */
48 | 	@Override
49 | 	public boolean equals(Object other) {
50 | 		if (!(other instanceof WordEditUnit)) {
51 | 			return false;
52 | 		}
53 | 		WordEditUnit otherUnit = (WordEditUnit) other;
54 | 		return samePos(otherUnit) && similarityTo(otherUnit) > EQUAL_THRESHOLD;
55 | 	}
56 | 
57 | 	/** Compares the part-of-speech tags by value; the type returned by getPos() is not visible here. */
58 | 	private boolean samePos(WordEditUnit other) {
59 | 		return java.util.Objects.equals(word.getPos(), other.word.getPos());
60 | 	}
61 | 
62 | 	private double similarityTo(WordEditUnit other) {
63 | 		return XiaConceptParser.getInstance().getSimilarity(getUnitString(), other.getUnitString());
64 | 	}
65 | 
66 | }
67 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/sentence/editdistance/XiatianEditDistance.java:
--------------------------------------------------------------------------------
  1 | package zx.soft.similarity.sentence.editdistance;
  2 | 
  3 | /**
  4 |  * 新的支持非相邻块交互的编辑距离算法
  5 |  * 
  6 |  */
  7 | public class XiatianEditDistance extends EditDistance {
  8 | 
  9 | 	/** 块交换代价 */
 10 | 	public static double swapCost = 0.5;
 11 | 
 12 | 	private SuperString<? extends EditUnit> S, T;
 13 | 	private double[][][][] QArray;
 14 | 
 15 | 	@Override
 16 | 	public double getEditDistance(SuperString<? extends EditUnit> S, SuperString<? extends EditUnit> T) {
 17 | 		this.S = S;
 18 | 		this.T = T;
 19 | 		QArray = new double[S.length() + 1][S.length() + 1][T.length() + 1][T.length() + 1];
 20 | 		for (int i = 0; i <= S.length(); i++) {
 21 | 			for (int i2 = 0; i2 <= S.length(); i2++)
 22 | 				for (int j = 0; j <= T.length(); j++)
 23 | 					for (int j2 = 0; j2 <= T.length(); j2++) {
 24 | 						QArray[i][i2][j][j2] = Double.MAX_VALUE;
 25 | 					}
 26 | 		}
 27 | 		return Q(0, S.length() - 1, 0, T.length() - 1);
 28 | 	}
 29 | 
 30 | 	private double Q(int i1, int im, int j1, int jn) {
 31 | 		if (QArray[i1][im][j1][jn] < Double.MAX_VALUE) {
 32 | 			return QArray[i1][im][j1][jn];
 33 | 		}
 34 | 		double cost = 0;
 35 | 		if (im < i1) {
 36 | 			for (int j = j1; j <= jn; j++) {
 37 | 				cost += T.elementAt(j).getInsertionCost();
 38 | 			}
 39 | 		} else if (jn < j1) {
 40 | 			for (int i = i1; i <= im; i++) {
 41 | 				cost += S.elementAt(i).getDeletionCost();
 42 | 			}
 43 | 		} else if (im == i1 && jn == j1) {
 44 | 			cost = S.elementAt(i1).getSubstitutionCost(T.elementAt(j1));
 45 | 		} else if (i1 == im) {
 46 | 			double minSubValue = S.elementAt(i1).getSubstitutionCost(T.elementAt(j1));
 47 | 			int minPosJ = j1;
 48 | 			for (int j = j1 + 1; j <= jn; j++) {
 49 | 				double subValue = S.elementAt(i1).getSubstitutionCost(T.elementAt(j));
 50 | 				if (minSubValue > subValue) {
 51 | 					minSubValue = subValue;
 52 | 					minPosJ = j;
 53 | 				}
 54 | 			}
 55 | 			for (int j = j1; j <= jn; j++) {
 56 | 				if (j == minPosJ) {
 57 | 					cost += minSubValue;
 58 | 				} else {
 59 | 					cost += T.elementAt(j).getInsertionCost();
 60 | 				}
 61 | 			}
 62 | 		} else if (j1 == jn) {
 63 | 			int minPosI = i1;
 64 | 			double minSubValue = S.elementAt(i1).getSubstitutionCost(T.elementAt(j1));
 65 | 			for (int i = i1 + 1; i <= im; i++) {
 66 | 				double subValue = S.elementAt(i).getSubstitutionCost(T.elementAt(j1));
 67 | 				if (minSubValue > subValue) {
 68 | 					minSubValue = subValue;
 69 | 					minPosI = i;
 70 | 				}
 71 | 			}
 72 | 			for (int i = i1; i <= im; i++) {
 73 | 				if (i == minPosI) {
 74 | 					cost += minSubValue;
 75 | 				} else {
 76 | 					cost += S.elementAt(i).getDeletionCost();
 77 | 				}
 78 | 			}
 79 | 		} else {
 80 | 			cost = QArray[i1][im][j1][jn];
 81 | 			loop: for (int i = i1; i < im; i++) {
 82 | 				//block X divide to 3 parts.
 83 | 				for (int LX = 0; LX <= im - i; LX++) {
 84 | 					//process Y sentence
 85 | 					for (int j = j1; j < jn; j++) {
 86 | 						//if(cost<=swapCost)break;
 87 | 						for (int LY = 0; LY <= jn - j; LY++) {
 88 | 							//不交换的代价
 89 | 							double cost1 = Q(i1, i, j1, j) + Q(i + 1, i + LX, j + 1, j + LY)
 90 | 									+ Q(i + LX + 1, im, j + LY + 1, jn);
 91 | 							//交互代价
 92 | 							double cost2 = Q(i1, i, j + LY + 1, jn) + Q(i + 1, i + LX, j + 1, j + LY)
 93 | 									+ Q(i + LX + 1, im, j1, j) + swapCost;
 94 | 							cost = Math.min(Math.min(cost1, cost2), cost);
 95 | 							if (cost == 0)
 96 | 								break loop;
 97 | 						}
 98 | 					}
 99 | 				}
100 | 			}
101 | 		}
102 | 
103 | 		QArray[i1][im][j1][jn] = cost;
104 | 		return cost;
105 | 	}
106 | 
107 | 	public static void main(String[] argv) {
108 | 		EditDistance ed = new XiatianEditDistance();
109 | 		String s1 = "abcxdef";
110 | 		String s2 = "def";
111 | 		//String s2 = "我的密码我忘记了,我该怎样做呢?";
112 | 		System.out.println(ed.getEditDistance(SuperString.createCharSuperString(s1),
113 | 				SuperString.createCharSuperString(s2)));
114 | 	}
115 | 
116 | }
117 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/sentence/editdistance/XiatianEditDistance2.java:
--------------------------------------------------------------------------------
  1 | package zx.soft.similarity.sentence.editdistance;
  2 | 
  3 | /**
  4 |  * New edit distance algorithm proposed by Xia Tian that supports swapping non-adjacent
  5 |  * blocks; both inputs are first passed through Split.split and the result is compared.
  6 |  */
  7 | public class XiatianEditDistance2 extends EditDistance {
  8 | 
  9 | 	/** Cost of one block swap. */
 10 | 	private final double swapCost = 1.0;
 11 | 
 12 | 	private SuperString<? extends EditUnit> S, T;
 13 | 	/** Memo table indexed [i1][im][j1][jn]; Double.MAX_VALUE marks an entry not computed yet. */
 14 | 	private double[][][][] QArray;
 15 | 
 16 | 	/**
 17 | 	 * Computes the distance between the two sequences returned by Split.split(S1, T1).
 18 | 	 * If either of those is empty, the result is the summed insertion cost of T's units
 19 | 	 * plus the summed deletion cost of S's units.
 20 | 	 */
 21 | 	@Override
 22 | 	@SuppressWarnings("unchecked")
 23 | 	public double getEditDistance(SuperString<? extends EditUnit> S1, SuperString<? extends EditUnit> T1) {
 24 | 		Object[] array = Split.split(S1, T1);
 25 | 		this.S = (SuperString<? extends EditUnit>) array[0];
 26 | 		this.T = (SuperString<? extends EditUnit>) array[1];
 27 | 		final int m = S.length();
 28 | 		final int n = T.length();
 29 | 		if (m == 0 || n == 0) {
 30 | 			// Q(0, m - 1, 0, n - 1) would index QArray with -1 here, so answer directly.
 31 | 			double cost = 0;
 32 | 			for (int j = 0; j < n; j++) {
 33 | 				cost += T.elementAt(j).getInsertionCost();
 34 | 			}
 35 | 			for (int i = 0; i < m; i++) {
 36 | 				cost += S.elementAt(i).getDeletionCost();
 37 | 			}
 38 | 			return cost;
 39 | 		}
 40 | 		QArray = new double[m + 1][m + 1][n + 1][n + 1];
 41 | 		for (int i = 0; i <= m; i++) {
 42 | 			for (int i2 = 0; i2 <= m; i2++) {
 43 | 				for (int j = 0; j <= n; j++) {
 44 | 					for (int j2 = 0; j2 <= n; j2++) {
 45 | 						QArray[i][i2][j][j2] = Double.MAX_VALUE;
 46 | 					}
 47 | 				}
 48 | 			}
 49 | 		}
 50 | 		return Q(0, m - 1, 0, n - 1);
 51 | 	}
 52 | 
 53 | 	/**
 54 | 	 * Memoized cost of editing S[i1..im] into T[j1..jn]; bounds are inclusive and an end
 55 | 	 * below its start denotes an empty range.
 56 | 	 */
 57 | 	private double Q(int i1, int im, int j1, int jn) {
 58 | 		if (QArray[i1][im][j1][jn] < Double.MAX_VALUE) {
 59 | 			return QArray[i1][im][j1][jn];
 60 | 		}
 61 | 		double cost = 0;
 62 | 		if (im < i1) {
 63 | 			// empty S range: insert every unit of the T range
 64 | 			for (int j = j1; j <= jn; j++) {
 65 | 				cost += T.elementAt(j).getInsertionCost();
 66 | 			}
 67 | 		} else if (jn < j1) {
 68 | 			// empty T range: delete every unit of the S range
 69 | 			for (int i = i1; i <= im; i++) {
 70 | 				cost += S.elementAt(i).getDeletionCost();
 71 | 			}
 72 | 		} else if (im == i1 && jn == j1) {
 73 | 			cost = S.elementAt(i1).getSubstitutionCost(T.elementAt(j1));
 74 | 		} else if (i1 == im) {
 75 | 			// single S unit: substitute it for the cheapest T unit and insert the others
 76 | 			double minSubValue = S.elementAt(i1).getSubstitutionCost(T.elementAt(j1));
 77 | 			int minPosJ = j1;
 78 | 			for (int j = j1 + 1; j <= jn; j++) {
 79 | 				double subValue = S.elementAt(i1).getSubstitutionCost(T.elementAt(j));
 80 | 				if (minSubValue > subValue) {
 81 | 					minSubValue = subValue;
 82 | 					minPosJ = j;
 83 | 				}
 84 | 			}
 85 | 			for (int j = j1; j <= jn; j++) {
 86 | 				if (j == minPosJ) {
 87 | 					cost += minSubValue;
 88 | 				} else {
 89 | 					cost += T.elementAt(j).getInsertionCost();
 90 | 				}
 91 | 			}
 92 | 		} else if (j1 == jn) {
 93 | 			// single T unit: substitute the cheapest S unit for it and delete the others
 94 | 			int minPosI = i1;
 95 | 			double minSubValue = S.elementAt(i1).getSubstitutionCost(T.elementAt(j1));
 96 | 			for (int i = i1 + 1; i <= im; i++) {
 97 | 				double subValue = S.elementAt(i).getSubstitutionCost(T.elementAt(j1));
 98 | 				if (minSubValue > subValue) {
 99 | 					minSubValue = subValue;
100 | 					minPosI = i;
101 | 				}
102 | 			}
103 | 			for (int i = i1; i <= im; i++) {
104 | 				if (i == minPosI) {
105 | 					cost += minSubValue;
106 | 				} else {
107 | 					cost += S.elementAt(i).getDeletionCost();
108 | 				}
109 | 			}
110 | 		} else {
111 | 			cost = QArray[i1][im][j1][jn];
112 | 			// Cut both ranges into three consecutive parts (middle lengths LX and LY).
113 | 			loop: for (int i = i1; i < im; i++) {
114 | 				for (int LX = 0; LX <= im - i; LX++) {
115 | 					for (int j = j1; j < jn; j++) {
116 | 						for (int LY = 0; LY <= jn - j; LY++) {
117 | 							// without swap: parts are paired in order
118 | 							double cost1 = Q(i1, i, j1, j) + Q(i + 1, i + LX, j + 1, j + LY)
119 | 									+ Q(i + LX + 1, im, j + LY + 1, jn);
120 | 							// with swap: the outer parts are paired crosswise, plus swapCost
121 | 							double cost2 = Q(i1, i, j + LY + 1, jn) + Q(i + 1, i + LX, j + 1, j + LY)
122 | 									+ Q(i + LX + 1, im, j1, j) + swapCost;
123 | 							cost = Math.min(Math.min(cost1, cost2), cost);
124 | 							if (cost == 0)
125 | 								break loop;
126 | 						}
127 | 					}
128 | 				}
129 | 			}
130 | 		}
131 | 
132 | 		QArray[i1][im][j1][jn] = cost;
133 | 		return cost;
134 | 	}
135 | 
136 | 	public static void main(String[] argv) {
137 | 		EditDistance ed = new XiatianEditDistance2();
138 | 		String s1 = "abcxdef";
139 | 		String s2 = "def";
140 | 		//String s2 = "我的密码我忘记了,我该怎样做呢?";
141 | 		System.out.println(ed.getEditDistance(SuperString.createCharSuperString(s1),
142 | 				SuperString.createCharSuperString(s2)));
143 | 	}
144 | 
145 | }
146 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/sentence/morphology/MorphoSimilarity.java:
--------------------------------------------------------------------------------
  1 | package zx.soft.similarity.sentence.morphology;
  2 | 
  3 | import java.util.ArrayList;
  4 | import java.util.List;
  5 | 
  6 | import org.slf4j.Logger;
  7 | import org.slf4j.LoggerFactory;
  8 | 
  9 | import zx.soft.similarity.sentence.SegmentProxy;
 10 | import zx.soft.similarity.sentence.SegmentProxy.Word;
 11 | import zx.soft.similarity.sentence.SentenceSimilarity;
 12 | import zx.soft.similarity.word.WordSimilarity;
 13 | import zx.soft.similarity.word.hownet2.concept.XiaConceptParser;
 14 | 
 15 | /**
 16 |  * Sentence similarity based on word form and word order, taking semantics into account.<br/>
 17 |  * Follows the method of section 5.4.3 of 《中文信息相似度计算理论与方法》. Because OnceWS(A, B)
 18 |  * cannot be obtained directly once semantics are considered, words are matched pairwise,
 19 |  * always taking the most similar remaining pair.
 20 |  * See {@code SemanticSimilarity} for the improved algorithm.
 21 |  */
 22 | public class MorphoSimilarity implements SentenceSimilarity {
 23 | 
 24 | 	private static Logger logger = LoggerFactory.getLogger(MorphoSimilarity.class);
 25 | 
 26 | 	/** Weight of the word-form similarity in the total similarity. */
 27 | 	private final double LAMBDA1 = 1.0;
 28 | 	/** Weight of the word-order similarity in the total similarity. */
 29 | 	private final double LAMBDA2 = 0.0;
 30 | 	/** Word-level similarity measure. */
 31 | 	private WordSimilarity wordSimilarity = null;
 32 | 
 33 | 	private static String FILTER_CHARS = " 　，。；？《》()｜！,.;?<>|_^…!";
 34 | 
 35 | 	private static MorphoSimilarity instance = null;
 36 | 
 37 | 	/** Returns the shared instance, creating it on first use (no synchronization). */
 38 | 	public static MorphoSimilarity getInstance() {
 39 | 		if (instance == null) {
 40 | 			instance = new MorphoSimilarity();
 41 | 		}
 42 | 		return instance;
 43 | 	}
 44 | 
 45 | 	private MorphoSimilarity() {
 46 | 		logger.debug("used hownet wordsimilarity.");
 47 | 		this.wordSimilarity = XiaConceptParser.getInstance();
 48 | 	}
 49 | 
 50 | 	/**
 51 | 	 * Drops blanks and punctuation from a token list and lower-cases the rest.
 52 | 	 * A token is dropped when it occurs as a substring of FILTER_CHARS.
 53 | 	 * @param word_list tokens of one sentence
 54 | 	 * @return the remaining tokens, lower-cased, in their original order
 55 | 	 */
 56 | 	private String[] filter(String[] word_list) {
 57 | 		List<String> results = new ArrayList<>();
 58 | 		for (String w : word_list) {
 59 | 			if (!FILTER_CHARS.contains(w)) {
 60 | 				results.add(w.toLowerCase());
 61 | 			}
 62 | 		}
 63 | 
 64 | 		return results.toArray(new String[results.size()]);
 65 | 	}
 66 | 
 67 | 	/**
 68 | 	 * Computes the similarity of two sentences as
 69 | 	 * LAMBDA1 * word-form similarity + LAMBDA2 * word-order similarity.
 70 | 	 * @see zx.soft.similarity.Similaritable
 71 | 	 */
 72 | 	@Override
 73 | 	public double getSimilarity(String firstSen, String secondSen) {
 74 | 		String[] firstList = filter(segment(firstSen));
 75 | 		String[] secondList = filter(segment(secondSen));
 76 | 
 77 | 		double wordSim = getOccurrenceSimilarity(firstList, secondList);
 78 | 		double orderSim = getOrderSimilarity(firstList, secondList);
 79 | 
 80 | 		return LAMBDA1 * wordSim + LAMBDA2 * orderSim;
 81 | 	}
 82 | 
 83 | 	/**
 84 | 	 * Word-form similarity of two word lists: words are paired greedily, most similar
 85 | 	 * remaining pair first, and the result is 2 * (sum of pair similarities) divided by
 86 | 	 * the total number of words.
 87 | 	 * @param firstList words of the first sentence
 88 | 	 * @param secondList words of the second sentence
 89 | 	 * @return the word-form similarity; 0 when both lists are empty
 90 | 	 */
 91 | 	public double getOccurrenceSimilarity(String[] firstList, String[] secondList) {
 92 | 		if (firstList.length == 0 && secondList.length == 0) {
 93 | 			return 0;
 94 | 		}
 95 | 
 96 | 		// similarity of every possible pair
 97 | 		double[][] scores = new double[firstList.length][secondList.length];
 98 | 		for (int i = 0; i < firstList.length; i++) {
 99 | 			for (int j = 0; j < secondList.length; j++) {
100 | 				scores[i][j] = wordSimilarity.getSimilarity(firstList[i], secondList[j]);
101 | 			}
102 | 		}
103 | 
104 | 		// Words already paired are flagged instead of rebuilding a smaller matrix each round.
105 | 		boolean[] firstUsed = new boolean[firstList.length];
106 | 		boolean[] secondUsed = new boolean[secondList.length];
107 | 		double total_score = 0;
108 | 
109 | 		while (true) {
110 | 			double max_score = 0;
111 | 			int max_row = -1;
112 | 			int max_col = -1;
113 | 
114 | 			// pick the most similar pair among the words not yet used
115 | 			for (int i = 0; i < firstList.length; i++) {
116 | 				if (firstUsed[i]) {
117 | 					continue;
118 | 				}
119 | 				for (int j = 0; j < secondList.length; j++) {
120 | 					if (secondUsed[j]) {
121 | 						continue;
122 | 					}
123 | 					if (max_score < scores[i][j]) {
124 | 						max_row = i;
125 | 						max_col = j;
126 | 						max_score = scores[i][j];
127 | 					}
128 | 				}
129 | 			}
130 | 
131 | 			// no pair with a positive score is left: further rounds would only add 0
132 | 			if (max_row < 0) {
133 | 				break;
134 | 			}
135 | 			total_score += max_score;
136 | 			firstUsed[max_row] = true;
137 | 			secondUsed[max_col] = true;
138 | 		}
139 | 
140 | 		return (2 * total_score) / (firstList.length + secondList.length);
141 | 	}
142 | 
143 | 	/**
144 | 	 * Word-order similarity of two word lists. Currently always returns 0.0.
145 | 	 * @param firstList words of the first sentence
146 | 	 * @param secondList words of the second sentence
147 | 	 * @return 0.0
148 | 	 */
149 | 	public double getOrderSimilarity(String[] firstList, String[] secondList) {
150 | 		double similarity = 0.0;
151 | 
152 | 		return similarity;
153 | 	}
154 | 
155 | 	/** Segments a sentence with SegmentProxy and returns the word strings in order. */
156 | 	public String[] segment(String sentence) {
157 | 		List<Word> list = SegmentProxy.segment(sentence);
158 | 		String[] results = new String[list.size()];
159 | 		for (int i = 0; i < list.size(); i++) {
160 | 			results[i] = list.get(i).getWord();
161 | 		}
162 | 		return results;
163 | 	}
164 | 
165 | }
166 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/sentence/morphology/SemanticSimilarity.java:
--------------------------------------------------------------------------------
  1 | package zx.soft.similarity.sentence.morphology;
  2 | 
  3 | import java.util.ArrayList;
  4 | import java.util.List;
  5 | 
  6 | import org.slf4j.Logger;
  7 | import org.slf4j.LoggerFactory;
  8 | 
  9 | import zx.soft.similarity.sentence.SegmentProxy;
 10 | import zx.soft.similarity.sentence.SegmentProxy.Word;
 11 | import zx.soft.similarity.sentence.SentenceSimilarity;
 12 | import zx.soft.similarity.word.WordSimilarity;
 13 | import zx.soft.similarity.word.hownet2.concept.XiaConceptParser;
 14 | 
 15 | /**
 16 |  * Sentence similarity based on word form and word order, after section 5.4.3 of
 17 |  * 《中文信息相似度计算理论与方法》. Because OnceWS(A, B) cannot be obtained directly once
 18 |  * semantics are considered, matched word pairs whose similarity reaches a threshold are
 19 |  * treated as identical words, and the word-order similarity is computed from them.
 20 |  */
 21 | public class SemanticSimilarity implements SentenceSimilarity {
 22 | 
 23 | 	private static Logger logger = LoggerFactory.getLogger(SemanticSimilarity.class);
 24 | 
 25 | 	/** Weight of the word-form similarity in the total similarity. */
 26 | 	private final double LAMBDA1 = 0.8;
 27 | 	/** Weight of the word-order similarity in the total similarity. */
 28 | 	private final double LAMBDA2 = 0.2;
 29 | 
 30 | 	/** A matched pair with at least this similarity counts as the same word for word order. */
 31 | 	private final double GAMMA = 0.6;
 32 | 
 33 | 	/** Word-level similarity measure. */
 34 | 	private WordSimilarity wordSimilarity = null;
 35 | 
 36 | 	private static String FILTER_CHARS = " 　，。；？《》()｜！,.;?<>|_^…!";
 37 | 
 38 | 	private static SemanticSimilarity instance = null;
 39 | 
 40 | 	/** Returns the shared instance, creating it on first use (no synchronization). */
 41 | 	public static SemanticSimilarity getInstance() {
 42 | 		if (instance == null) {
 43 | 			instance = new SemanticSimilarity();
 44 | 		}
 45 | 		return instance;
 46 | 	}
 47 | 
 48 | 	private SemanticSimilarity() {
 49 | 		logger.debug("used hownet wordsimilarity.");
 50 | 		this.wordSimilarity = XiaConceptParser.getInstance();
 51 | 	}
 52 | 
 53 | 	/**
 54 | 	 * Drops blanks and punctuation from a token list and lower-cases the rest.
 55 | 	 * A token is dropped when it occurs as a substring of FILTER_CHARS.
 56 | 	 * @param word_list tokens of one sentence
 57 | 	 * @return the remaining tokens, lower-cased, in their original order
 58 | 	 */
 59 | 	private String[] filter(String[] word_list) {
 60 | 		List<String> results = new ArrayList<String>();
 61 | 		for (String w : word_list) {
 62 | 			if (!FILTER_CHARS.contains(w)) {
 63 | 				results.add(w.toLowerCase());
 64 | 			}
 65 | 		}
 66 | 
 67 | 		return results.toArray(new String[results.size()]);
 68 | 	}
 69 | 
 70 | 	/**
 71 | 	 * Computes the similarity of two sentences.
 72 | 	 * @see zx.soft.similarity.Similaritable
 73 | 	 */
 74 | 	@Override
 75 | 	public double getSimilarity(String firstSen, String secondSen) {
 76 | 		String[] firstList = filter(segment(firstSen));
 77 | 		String[] secondList = filter(segment(secondSen));
 78 | 
 79 | 		return calculate(firstList, secondList);
 80 | 	}
 81 | 
 82 | 	/**
 83 | 	 * Combines word-form and word-order similarity of two word lists. Words are paired
 84 | 	 * greedily (most similar remaining pair first); pairs reaching GAMMA also record the
 85 | 	 * position of the first-sentence word, from which order inversions are counted.
 86 | 	 * @param firstList words of the first sentence
 87 | 	 * @param secondList words of the second sentence
 88 | 	 * @return LAMBDA1 * word-form similarity + LAMBDA2 * word-order similarity;
 89 | 	 *         0 when either list is empty
 90 | 	 */
 91 | 	public double calculate(String[] firstList, String[] secondList) {
 92 | 		if (firstList.length == 0 || secondList.length == 0) {
 93 | 			return 0;
 94 | 		}
 95 | 
 96 | 		// similarity of every possible pair
 97 | 		double[][] scores = new double[firstList.length][secondList.length];
 98 | 
 99 | 		// whether the word at each position has already been paired
100 | 		boolean[] firstFlags = new boolean[firstList.length];
101 | 		boolean[] secondFlags = new boolean[secondList.length];
102 | 
103 | 		// PSecond as defined in section 5.4.3 of the book: 0 means the word of the second
104 | 		// sentence has no similar word in the first one, a value > 0 is the 1-based
105 | 		// position of its match in the first sentence.
106 | 		int[] PSecond = new int[secondList.length];
107 | 
108 | 		for (int i = 0; i < firstList.length; i++) {
109 | 			for (int j = 0; j < secondList.length; j++) {
110 | 				scores[i][j] = wordSimilarity.getSimilarity(firstList[i], secondList[j]);
111 | 			}
112 | 		}
113 | 
114 | 		double total_score = 0;
115 | 
116 | 		// Repeatedly take the largest remaining score and flag its row and column as used.
117 | 		while (true) {
118 | 			double max_score = 0;
119 | 			int max_row = -1;
120 | 			int max_col = -1;
121 | 
122 | 			for (int i = 0; i < firstList.length; i++) {
123 | 				if (firstFlags[i])
124 | 					continue;
125 | 				// columns belong to the second sentence, so they are bounded by its length
126 | 				for (int j = 0; j < secondList.length; j++) {
127 | 					if (secondFlags[j])
128 | 						continue;
129 | 
130 | 					if (max_score < scores[i][j]) {
131 | 						max_row = i;
132 | 						max_col = j;
133 | 						max_score = scores[i][j];
134 | 					}
135 | 				}
136 | 			}
137 | 
138 | 			if (max_row >= 0) {
139 | 				total_score += max_score;
140 | 				firstFlags[max_row] = true;
141 | 				secondFlags[max_col] = true;
142 | 				if (max_score >= GAMMA) {
143 | 					PSecond[max_col] = max_row + 1;
144 | 				}
145 | 			} else {
146 | 				break;
147 | 			}
148 | 		}
149 | 
150 | 		double wordSim = (2 * total_score) / (firstList.length + secondList.length);
151 | 
152 | 		// Count how often consecutive matched positions are in descending order.
153 | 		int previous = 0;
154 | 		int revOrdCount = 0;
155 | 		int onceWSSize = 0;
156 | 		for (int i = 0; i < PSecond.length; i++) {
157 | 			if (PSecond[i] > 0) {
158 | 				onceWSSize++;
159 | 				if (previous > 0 && (previous > PSecond[i])) {
160 | 					revOrdCount++;
161 | 				}
162 | 				previous = PSecond[i];
163 | 			}
164 | 		}
165 | 
166 | 		double ordSim = 0;
167 | 		if (onceWSSize == 1) {
168 | 			ordSim = 1;
169 | 		} else if (onceWSSize == 0) {
170 | 			ordSim = 0;
171 | 		} else {
172 | 			ordSim = 1.0 - revOrdCount * 1.0 / (onceWSSize - 1);
173 | 		}
174 | 
175 | 		logger.debug("wordSim ==> {}, ordSim ==> {}", wordSim, ordSim);
176 | 
177 | 		return LAMBDA1 * wordSim + LAMBDA2 * ordSim;
178 | 	}
179 | 
180 | 	/** Segments a sentence with SegmentProxy and returns the word strings in order. */
181 | 	public String[] segment(String sentence) {
182 | 		List<Word> list = SegmentProxy.segment(sentence);
183 | 		String[] results = new String[list.size()];
184 | 		for (int i = 0; i < list.size(); i++) {
185 | 			results[i] = list.get(i).getWord();
186 | 		}
187 | 		return results;
188 | 	}
189 | 
190 | }
191 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/statistic/DictStatistic.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.statistic;
 2 | 
 3 | import java.io.File;
 4 | import java.io.FileInputStream;
 5 | import java.io.InputStream;
 6 | import java.util.zip.GZIPInputStream;
 7 | 
 8 | import javax.xml.namespace.QName;
 9 | import javax.xml.stream.XMLEventReader;
10 | import javax.xml.stream.XMLInputFactory;
11 | import javax.xml.stream.events.StartElement;
12 | import javax.xml.stream.events.XMLEvent;
13 | 
14 | import org.slf4j.Logger;
15 | import org.slf4j.LoggerFactory;
16 | 
17 | import zx.soft.similarity.word.hownet2.concept.XiaConceptParser;
18 | 
19 | /**
20 |  * 用于统计分词词典文件中的概念出现数量
21 |  *
22 |  */
23 | public class DictStatistic {
24 | 
25 | 	private static Logger logger = LoggerFactory.getLogger(DictStatistic.class);
26 | 
27 | 	/**
28 | 	 * 从指定的xml文件加载词典文件
29 | 	 * @param xmlFile
30 | 	 * @param gzCompressed 是否再用gz格式对词典进行了压缩
31 | 	 * @return
32 | 	 */
33 | 	public void testFromXml(String xmlFile, boolean gzCompressed) {
34 | 		File file = new File(xmlFile);
35 | 		if (!file.canRead()) {
36 | 			logger.error("无法读取文件:{}", xmlFile);
37 | 			return;// fail while opening the file
38 | 		}
39 | 		int count = 0, conceptCount = 0;
40 | 		XMLInputFactory inputFactory = XMLInputFactory.newInstance();
41 | 		InputStream input = null;
42 | 		try {
43 | 			if (gzCompressed) {
44 | 				input = new GZIPInputStream(new FileInputStream(file));
45 | 			} else {
46 | 				input = new FileInputStream(file);
47 | 			}
48 | 			XMLEventReader xmlEventReader = inputFactory.createXMLEventReader(input);
49 | 			while (xmlEventReader.hasNext()) {
50 | 				XMLEvent event = xmlEventReader.nextEvent();
51 | 
52 | 				if (event.isStartElement()) {
53 | 					StartElement startElement = event.asStartElement();
54 | 					if (startElement.getName().toString().equals("table")) {
55 | 						String head = startElement.getAttributeByName(QName.valueOf("head")).getValue();
56 | 						while (xmlEventReader.hasNext()) {
57 | 							XMLEvent itemEvent = xmlEventReader.nextEvent();
58 | 							if (itemEvent.isStartElement()) {
59 | 								StartElement itemStartElement = itemEvent.asStartElement();
60 | 								if (!itemStartElement.getName().toString().equals("item"))
61 | 									continue;
62 | 								String word = itemStartElement.getAttributeByName(QName.valueOf("word")).getValue();
63 | 								word = head + word;
64 | 								if (XiaConceptParser.getInstance().isConcept(word)) {
65 | 									conceptCount++;
66 | 								}
67 | 								count++;
68 | 								if (count % 1000 == 0) {
69 | 									logger.info("process words {} ...", count);
70 | 								}
71 | 							}
72 | 						}
73 | 					}
74 | 				}
75 | 			}
76 | 			input.close();
77 | 			logger.info(count + "\t" + conceptCount);
78 | 			return;
79 | 		} catch (Exception e) {
80 | 			logger.error("Exception:{}", e.getMessage());
81 | 			e.printStackTrace();
82 | 		}
83 | 	}
84 | 
85 | }
86 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/statistic/LCMC.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.statistic;
 2 | 
 3 | import java.io.File;
 4 | import java.io.FileInputStream;
 5 | import java.io.InputStream;
 6 | 
 7 | import javax.xml.stream.XMLEventReader;
 8 | import javax.xml.stream.XMLInputFactory;
 9 | import javax.xml.stream.events.StartElement;
10 | import javax.xml.stream.events.XMLEvent;
11 | 
12 | import org.slf4j.Logger;
13 | import org.slf4j.LoggerFactory;
14 | 
15 | import zx.soft.similarity.word.hownet2.concept.XiaConceptParser;
16 | 
17 | public class LCMC {
18 | 
19 | 	private static Logger logger = LoggerFactory.getLogger(LCMC.class);
20 | 
21 | 	public void countUnConceptWords(File xmlFile) throws Exception {
22 | 		int totalCount = 0, conceptCount = 0;
23 | 		XMLInputFactory inputFactory = XMLInputFactory.newInstance();
24 | 		InputStream input = null;
25 | 		input = new FileInputStream(xmlFile);
26 | 		XMLEventReader xmlEventReader = inputFactory.createXMLEventReader(input);
27 | 		while (xmlEventReader.hasNext()) {
28 | 			XMLEvent event = xmlEventReader.nextEvent();
29 | 
30 | 			if (event.isStartElement()) {
31 | 				StartElement startElement = event.asStartElement();
32 | 				//如果是word开始
33 | 				if (startElement.getName().toString().equals("w")) {
34 | 					String word = xmlEventReader.getElementText();
35 | 					totalCount++;
36 | 					if (XiaConceptParser.getInstance().isConcept(word)) {
37 | 						conceptCount++;
38 | 					}
39 | 				}
40 | 			}
41 | 		}//
42 | 		input.close();
43 | 		logger.info(totalCount + "\t" + conceptCount);
44 | 	}
45 | 
46 | 	public static void main(String[] args) throws Exception {
47 | 		LCMC lcmc = new LCMC();
48 | 		lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_A.XML"));
49 | 		lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_B.XML"));
50 | 		lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_C.XML"));
51 | 		lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_D.XML"));
52 | 		lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_E.XML"));
53 | 		lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_F.XML"));
54 | 		lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_G.XML"));
55 | 		lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_H.XML"));
56 | 		lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_J.XML"));
57 | 		lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_K.XML"));
58 | 		lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_L.XML"));
59 | 		lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_M.XML"));
60 | 		lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_N.XML"));
61 | 		lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_P.XML"));
62 | 		lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_R.XML"));
63 | 	}
64 | 
65 | }
66 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/text/DiceSimilarity.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.text;
 2 | 
 3 | import zx.soft.similarity.Similaritable;
 4 | 
 5 | public class DiceSimilarity implements Similaritable {
 6 | 
 7 | 	@Override
 8 | 	public double getSimilarity(String item1, String item2) {
 9 | 		return 0;
10 | 	}
11 | 
12 | }
13 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/util/About.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.util;
 2 | 
 3 | import java.awt.BorderLayout;
 4 | import java.awt.Dimension;
 5 | import java.io.IOException;
 6 | import java.net.URL;
 7 | import java.net.URLClassLoader;
 8 | 
 9 | import javax.swing.JFrame;
10 | import javax.swing.JPanel;
11 | import javax.swing.JScrollPane;
12 | import javax.swing.JTextPane;
13 | import javax.swing.text.StyledEditorKit;
14 | 
/**
 * "About" window showing the description page of the xsimilarity project.
 */
public class About extends JFrame {

	private static final long serialVersionUID = -2307582155443587993L;

	/**
	 * Builds the panel that displays the {@code data/about.html} class-path
	 * resource inside a read-only, scrollable text pane. When the page cannot
	 * be loaded, the message of the resulting {@link IOException} is shown
	 * instead.
	 *
	 * @return a panel with the text pane in its centre
	 */
	public static JPanel createPanel() {
		JPanel mainPanel = new JPanel();
		mainPanel.setLayout(new BorderLayout());
		JTextPane editorPane = new JTextPane();
		editorPane.setEditable(false);
		// Let long text wrap automatically.
		editorPane.setEditorKit(new StyledEditorKit());
		editorPane.setContentType("text/html");
		try {
			String html = "data/about.html";
			System.out.println(html);
			// Ask the class loader directly instead of casting it to URLClassLoader:
			// since Java 9 the application class loader is no longer a URLClassLoader,
			// so the cast fails with a ClassCastException.
			URL url = About.class.getClassLoader().getResource(html);
			editorPane.setPage(url);
		} catch (IOException e1) {
			editorPane.setText(e1.getMessage());
		}

		mainPanel.add(new JScrollPane(editorPane), BorderLayout.CENTER);
		return mainPanel;
	}

	/**
	 * Creates the 600x400 window around {@link #createPanel()}; closing it
	 * exits the application.
	 */
	public About() {
		this.setTitle("关于Semantic-Similarity");

		this.setDefaultCloseOperation(EXIT_ON_CLOSE);
		this.setPreferredSize(new Dimension(600, 400));
		this.getContentPane().add(createPanel());
		this.pack();
	}

	/** Opens the window. */
	public static void main(String[] args) {
		new About().setVisible(true);
	}

}
60 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/util/BlankUtils.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.util;
 2 | 
 3 | import java.util.Collection;
 4 | 
 5 | /**
 6 |  * 判断是否为空的工具类
 7 |  * 
 8 |  */
 9 | public class BlankUtils {
10 | 
11 | 	/**
12 | 	 * 判断字符串s是否是空串
13 | 	 * @param s
14 | 	 * @return
15 | 	 */
16 | 	public static boolean isBlank(String string) {
17 | 		return string == null || string.trim().equals("");
18 | 	}
19 | 
20 | 	/**
21 | 	 * 判断数组是否是空
22 | 	 * @param array
23 | 	 * @return
24 | 	 */
25 | 	public static boolean isBlank(Object[] array) {
26 | 		return array == null || array.length == 0;
27 | 	}
28 | 
29 | 	/**
30 | 	 * 判断集合是否是空
31 | 	 * @param array
32 | 	 * @return
33 | 	 */
34 | 	public static boolean isBlank(Collection<? extends Object> array) {
35 | 		return array == null || array.size() == 0;
36 | 	}
37 | 
38 | 	/**
39 | 	 * 判断所有的集合是否都为空
40 | 	 * @param collections 
41 | 	 * @return
42 | 	 */
43 | 	public static boolean isBlankAll(Collection<?>... collections) {
44 | 		for (Collection<?> c : collections) {
45 | 			if (!isBlank(c)) {
46 | 				return false;
47 | 			}
48 | 		}
49 | 
50 | 		return true;
51 | 	}
52 | 
53 | 	/**
54 | 	 * 判断字符串strings中是否都是空串
55 | 	 * @param strings
56 | 	 * @return
57 | 	 */
58 | 	public static boolean isBlankAll(String... strings) {
59 | 		for (String s : strings) {
60 | 			if (!isBlank(s)) {
61 | 				return false;
62 | 			}
63 | 		}
64 | 
65 | 		return true;
66 | 	}
67 | 
68 | 	/**
69 | 	 * 判断collections集合中是否至少有一个为空
70 | 	 * @param collections
71 | 	 * @return
72 | 	 */
73 | 	public static boolean isBlankAtLeastOne(Collection<?>... collections) {
74 | 		for (Collection<?> c : collections) {
75 | 			if (isBlank(c)) {
76 | 				return true;
77 | 			}
78 | 		}
79 | 
80 | 		return false;
81 | 	}
82 | 
83 | 	/**
84 | 	 * 判断字符串strings中是否之首有一个为空
85 | 	 * @param strings
86 | 	 * @return
87 | 	 */
88 | 	public static boolean isBlankAtLeastOne(String... strings) {
89 | 		for (String s : strings) {
90 | 			if (isBlank(s)) {
91 | 				return true;
92 | 			}
93 | 		}
94 | 
95 | 		return false;
96 | 	}
97 | 
98 | }
99 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/util/EditDistance.java:
--------------------------------------------------------------------------------
  1 | package zx.soft.similarity.util;
  2 | 
  3 | /**
  4 |  * This class computes the edit distance between two strings using dynamic
  5 |  * programming. The dynamic programming part is in the method
  6 |  * printEditDistance().
  7 |  * 
  8 |  */
  9 | public class EditDistance {
 10 | 
 11 | 	/**
 12 | 	 * 获取删除代价
 13 | 	 * 
 14 | 	 * @return
 15 | 	 */
 16 | 	public int getDeletionCost() {
 17 | 		return 1;
 18 | 	}
 19 | 
 20 | 	/**
 21 | 	 * 获取插入代价
 22 | 	 * 
 23 | 	 * @return
 24 | 	 */
 25 | 	public int getInsertionCost() {
 26 | 		return 1;
 27 | 	}
 28 | 
 29 | 	/**
 30 | 	 * 获取替换代价
 31 | 	 * 
 32 | 	 * @return
 33 | 	 */
 34 | 	public int getSubstitutionCost(char a, char b) {
 35 | 		return (a == b) ? 0 : 1;
 36 | 	}
 37 | 
 38 | 	public int getEditDistance(String S, String T) {
 39 | 		int[][] D = null;
 40 | 		if (S == null)
 41 | 			S = "";
 42 | 		if (T == null)
 43 | 			T = "";
 44 | 
 45 | 		char[] a = S.toCharArray();
 46 | 		char[] b = T.toCharArray();
 47 | 
 48 | 		int n = a.length; // 字符串S的长度
 49 | 		int m = b.length; // 字符串T的长度
 50 | 
 51 | 		if (a.length == 0) {
 52 | 			return b.length;
 53 | 		} else if (b.length == 0) {
 54 | 			return a.length;
 55 | 		}
 56 | 
 57 | 		D = new int[a.length + 1][b.length + 1];
 58 | 
 59 | 		/** 初始化D[i][0] */
 60 | 		for (int i = 1; i <= n; i++) {
 61 | 			D[i][0] = D[i - 1][0] + getDeletionCost();
 62 | 		}
 63 | 
 64 | 		/** 初始化D[0][j] */
 65 | 		for (int j = 1; j <= m; j++) {
 66 | 			D[0][j] = D[0][j - 1] + getInsertionCost();
 67 | 		}
 68 | 
 69 | 		for (int i = 1; i <= n; i++) {
 70 | 			for (int j = 1; j <= m; j++) {
 71 | 				D[i][j] = MathUtils.min(D[i - 1][j] + getDeletionCost(), D[i][j - 1] + getInsertionCost(),
 72 | 						D[i - 1][j - 1] + getSubstitutionCost(a[i - 1], b[j - 1]));
 73 | 			}
 74 | 		}
 75 | 
 76 | 		return D[n][m];
 77 | 	}
 78 | 
 79 | 	/**
 80 | 	 * 应与getEditDistance(S, T)等同
 81 | 	 * @param s
 82 | 	 * @param t
 83 | 	 * @return
 84 | 	 */
 85 | 	public static int getLevenshteinDistance(String s, String t) {
 86 | 		if (s == null || t == null) {
 87 | 			throw new IllegalArgumentException("Strings must not be null");
 88 | 		}
 89 | 		int d[][]; // matrix
 90 | 		int n; // length of s
 91 | 		int m; // length of t
 92 | 		int i; // iterates through s
 93 | 		int j; // iterates through t
 94 | 		char s_i; // ith character of s
 95 | 		char t_j; // jth character of t
 96 | 		int cost; // cost
 97 | 
 98 | 		// Step 1
 99 | 		n = s.length();
100 | 		m = t.length();
101 | 		if (n == 0) {
102 | 			return m;
103 | 		}
104 | 		if (m == 0) {
105 | 			return n;
106 | 		}
107 | 		d = new int[n + 1][m + 1];
108 | 
109 | 		// Step 2
110 | 		for (i = 0; i <= n; i++) {
111 | 			d[i][0] = i;
112 | 		}
113 | 		for (j = 0; j <= m; j++) {
114 | 			d[0][j] = j;
115 | 		}
116 | 
117 | 		// Step 3
118 | 		for (i = 1; i <= n; i++) {
119 | 			s_i = s.charAt(i - 1);
120 | 
121 | 			// Step 4
122 | 			for (j = 1; j <= m; j++) {
123 | 				t_j = t.charAt(j - 1);
124 | 
125 | 				// Step 5
126 | 				if (s_i == t_j) {
127 | 					cost = 0;
128 | 				} else {
129 | 					cost = 1;
130 | 				}
131 | 
132 | 				// Step 6
133 | 				d[i][j] = MathUtils.min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost);
134 | 			}
135 | 		}
136 | 
137 | 		// Step 7
138 | 		return d[n][m];
139 | 	}
140 | 
141 | }
142 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/util/FileUtils.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.util;
 2 | 
 3 | import java.io.BufferedOutputStream;
 4 | import java.io.BufferedReader;
 5 | import java.io.File;
 6 | import java.io.FileOutputStream;
 7 | import java.io.IOException;
 8 | import java.io.InputStream;
 9 | import java.io.InputStreamReader;
10 | 
11 | import org.slf4j.Logger;
12 | import org.slf4j.LoggerFactory;
13 | 
14 | /**
15 |  * 与文件相关的工具类
16 |  *
17 |  */
18 | public class FileUtils {
19 | 
20 | 	private static Logger logger = LoggerFactory.getLogger(FileUtils.class);
21 | 
22 | 	/**
23 | 	 * 根据指定编码从输入流中依次遍历每一行文字
24 | 	 *
25 | 	 * @param input
26 | 	 *            输入流
27 | 	 * @param encoding
28 | 	 *            输入流所用的文字编码
29 | 	 * @param event
30 | 	 *            遍历每一行时触发的事件处理
31 | 	 * @throws IOException
32 | 	 */
33 | 	public static void traverseLines(InputStream input, String encoding, TraverseEvent<String> event)
34 | 			throws IOException {
35 | 		BufferedReader in = new BufferedReader(new InputStreamReader(input, encoding));
36 | 		String line = null;
37 | 
38 | 		while ((line = in.readLine()) != null) {
39 | 			event.visit(line);
40 | 		}
41 | 
42 | 		input.close();
43 | 		in.close();
44 | 	}
45 | 
46 | 	/**
47 | 	 * 保存字符串到文件中
48 | 	 * @param content
49 | 	 * @param fileName
50 | 	 * @return
51 | 	 */
52 | 	public static boolean saveStringToFile(String content, String fileName) {
53 | 		boolean rtn = false;
54 | 		BufferedOutputStream out = null;
55 | 		try {
56 | 			File file = new File(fileName);
57 | 			file.getParentFile().mkdirs();
58 | 
59 | 			out = new BufferedOutputStream(new FileOutputStream(file));
60 | 			out.write(content.getBytes("GBK"));
61 | 			out.close();
62 | 			rtn = true;
63 | 		} catch (Exception e) {
64 | 			logger.error("saveStringToFile error:{}", e.getMessage());
65 | 		} finally {
66 | 			try {
67 | 				out.close();
68 | 			} catch (Exception e) {
69 | 				logger.error("Exception:{}", e.getMessage());
70 | 			}
71 | 		}
72 | 		return rtn;
73 | 	}
74 | 
75 | 	public static void main(String[] args) {
76 | 		int count = 0;
77 | 		File dir = new File("G:/juanjuantx");
78 | 		for (File a : dir.listFiles()) {
79 | 			if (a.isDirectory()) {
80 | 				for (File zy : a.listFiles()) {
81 | 					if (zy.listFiles() != null)
82 | 						for (File rar : zy.listFiles()) {
83 | 							if (rar.isFile() && rar.getName().endsWith(".rar")) {
84 | 								count++;
85 | 							}
86 | 						}
87 | 				}
88 | 			}
89 | 		}
90 | 		System.out.println(count);
91 | 	}
92 | 
93 | }
94 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/util/MathUtils.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.util;
 2 | 
 3 | public class MathUtils {
 4 | 
 5 | 	public static int min(int... values) {
 6 | 		int min = Integer.MAX_VALUE;
 7 | 		for (int v : values) {
 8 | 			min = (v < min) ? v : min;
 9 | 		}
10 | 		return min;
11 | 	}
12 | 
13 | }
14 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/util/PinyinUtils.java:
--------------------------------------------------------------------------------
  1 | package zx.soft.similarity.util;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.IOException;
  5 | import java.io.InputStream;
  6 | import java.io.InputStreamReader;
  7 | import java.util.HashMap;
  8 | import java.util.HashSet;
  9 | import java.util.Map;
 10 | import java.util.Set;
 11 | 
 12 | /**
 13 |  * 拼音处理的工具，负责从拼音词典加载内容，根据汉字词语或汉字查找拼音
 14 |  *
 15 |  */
 16 | public class PinyinUtils {
 17 | 
 18 | 	/** 拼音的Map词典, 一个汉字可能对应多个拼音, 它所有的拼音放到一个集合中 */
 19 | 	private Map<Character, Set<String>> pinyinDict = null;
 20 | 
 21 | 	/** 单例 */
 22 | 	private static PinyinUtils instance = null;
 23 | 
 24 | 	private PinyinUtils() throws IOException {
 25 | 		// 从classpath中加载拼音词典文件
 26 | 		InputStream input = this.getClass().getClassLoader()
 27 | 				.getResourceAsStream("data/F02-GB2312-to-PuTongHua-PinYin.txt");
 28 | 
 29 | 		BufferedReader in = new BufferedReader(new InputStreamReader(input, "UTF-8"));
 30 | 		String line = null;
 31 | 
 32 | 		MyTraverseEvent event = new MyTraverseEvent();
 33 | 		while ((line = in.readLine()) != null) {
 34 | 			event.visit(line);
 35 | 		}
 36 | 
 37 | 		input.close();
 38 | 		in.close();
 39 | 
 40 | 		this.pinyinDict = event.getPinyins();
 41 | 	}
 42 | 
 43 | 	public static PinyinUtils getInstance() {
 44 | 		if (instance == null) {
 45 | 			try {
 46 | 				instance = new PinyinUtils();
 47 | 			} catch (IOException e) {
 48 | 				e.printStackTrace();
 49 | 			}
 50 | 		}
 51 | 
 52 | 		return instance;
 53 | 	}
 54 | 
 55 | 	/**
 56 | 	 * 获取汉字的拼音, 由于汉字具有多音字，故返回一个集合
 57 | 	 * @param hanzi
 58 | 	 * @return
 59 | 	 */
 60 | 	public Set<String> getPinyin(Character hanzi) {
 61 | 		Set<String> set = pinyinDict.get(hanzi);
 62 | 		if (set == null || set.size() == 0) {
 63 | 			set = new HashSet<>();
 64 | 			set.add(hanzi.toString());
 65 | 		}
 66 | 		return set;
 67 | 	}
 68 | 
 69 | 	/**
 70 | 	 * 获取词语的拼音, 一个词语可能对应多个拼音，把所有可能的组合放到集合中返回
 71 | 	 * @param word
 72 | 	 * @return
 73 | 	 */
 74 | 	public Set<String> getPinyin(String word) {
 75 | 		Set<String> word_set = new HashSet<>();
 76 | 		for (int i = 0; i < word.length(); i++) {
 77 | 			Set<String> hanzi_set = getPinyin(word.charAt(i));
 78 | 			if (word_set == null || word_set.size() == 0) {
 79 | 				word_set.addAll(hanzi_set);
 80 | 				continue;
 81 | 			}
 82 | 
 83 | 			Set<String> tmp_set = new HashSet<>();
 84 | 			for (String w : word_set) {
 85 | 				for (String h : hanzi_set) {
 86 | 					tmp_set.add(w + h);
 87 | 				}
 88 | 			}
 89 | 
 90 | 			word_set = tmp_set;
 91 | 		}
 92 | 
 93 | 		return word_set;
 94 | 	}
 95 | 
 96 | 	/**
 97 | 	 * 获取拼音字符串，多音字只取一个
 98 | 	 * @param word
 99 | 	 * @return
100 | 	 */
101 | 	public String getPinyinSingle(String word) {
102 | 		StringBuffer sb = new StringBuffer();
103 | 		for (int i = 0; i < word.length(); i++) {
104 | 			sb.append(getPinyin(word.charAt(i)).iterator().next());
105 | 		}
106 | 		return sb.toString();
107 | 	}
108 | 
109 | 	/**
110 | 	 * 获取拼音串，对于多音字，给出所有拼音
111 | 	 * @param word
112 | 	 * @return
113 | 	 */
114 | 	public String getPinyinString(String word) {
115 | 		StringBuffer sb = new StringBuffer();
116 | 		for (int i = 0; i < word.length(); i++) {
117 | 			Set<String> pinyin = getPinyin(word.charAt(i));
118 | 			sb.append(pinyin.toString());
119 | 		}
120 | 		return sb.toString();
121 | 	}
122 | 
123 | 	/**
124 | 	 * 获取拼音首字母
125 | 	 * @param word
126 | 	 * @return
127 | 	 */
128 | 	public String getPinyinHead(String word) {
129 | 		StringBuffer sb = new StringBuffer();
130 | 		for (int i = 0; i < word.length(); i++) {
131 | 			sb.append(getPinyin(word.charAt(i)).iterator().next().charAt(0));
132 | 		}
133 | 		return sb.toString();
134 | 	}
135 | 
136 | 	private static class MyTraverseEvent {
137 | 		/** 一个汉字对应多个拼音, 多个拼音放到集合中 */
138 | 		private Map<Character, Set<String>> pinyins = null;
139 | 
140 | 		public MyTraverseEvent() {
141 | 			this.pinyins = new HashMap<>();
142 | 		}
143 | 
144 | 		public Map<Character, Set<String>> getPinyins() {
145 | 			return pinyins;
146 | 		}
147 | 
148 | 		public boolean visit(String item) {
149 | 			if (item.startsWith("//")) {
150 | 				return true;
151 | 			}
152 | 
153 | 			char hanzi = item.charAt(0);
154 | 			//String pinyin = item.substring(2, item.length()-1);
155 | 			String pinyin = item.substring(2, item.length());
156 | 			Set<String> set = pinyins.get(hanzi);
157 | 			if (set == null) {
158 | 				set = new HashSet<>();
159 | 			}
160 | 			set.add(pinyin);
161 | 
162 | 			pinyins.put(hanzi, set);
163 | 			return true;
164 | 		}
165 | 	}
166 | 
167 | }
168 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/util/TraverseEvent.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.util;
 2 | 
 3 | /**
 4 |  * 遍历接口, 对于需要遍历的东西，通过传入该接口，可以实现实际的访问处理
 5 |  * 
 6 |  */
 7 | public interface TraverseEvent<T> {
 8 | 
 9 | 	/** 
10 | 	 * 遍历时访问其中的一个条目
11 | 	 * @param item
12 | 	 * @return
13 | 	 */
14 | 	public boolean visit(T item);
15 | 
16 | }
17 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/util/XmlException.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.util;
 2 | 
 3 | /**
 4 |  * Runtime exception for XML handling.
 5 |  * 
 6 |  */
 7 | public class XmlException extends RuntimeException {
 8 | 
 9 | 	private static final long serialVersionUID = 381260478228427716L;
10 | 
11 | 	public static final String XML_PAYLOAD_EMPTY = "xml.payload.empty";
12 | 	public static final String XML_ENCODE_ERROR = "xml.encoding.invalid";
13 | 	public static final String FILE_NOT_FOUND = "xml.file.not.found";
14 | 	public static final String XML_PARSE_ERROR = "xml.parse.error";
15 | 	public static final String XML_READ_ERROR = "xml.read.error";
16 | 	public static final String XML_VALIDATE_ERROR = "xml.validate.error";
17 | 	public static final String XML_TRANSFORM_ERROR = "xml.transform.error";
18 | 
19 | 	public XmlException() {
20 | 		super();
21 | 	}
22 | 
23 | 	public XmlException(String key, Throwable cause) {
24 | 		super(key, cause);
25 | 	}
26 | 
27 | 	public XmlException(String key) {
28 | 		super(key);
29 | 	}
30 | 
31 | 	public XmlException(Throwable cause) {
32 | 		super(cause);
33 | 	}
34 | 
35 | }


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/util/about.html:
--------------------------------------------------------------------------------
 1 | <html>
 2 | 	<head><title>xsimilarity</title></head>
 3 | 	<body>
 4 | 		<p>
 5 | 			<h2>XSimilarity</h2>
 6 | 		  项目地址：<a href="http://github.com/iamxiatian/xsimilarity/">http://github.com/iamxiatian/xsimilarity/</a>
 7 | 		</p>
 8 | 		<p>
 9 | 			有任何问题或建议请与我们联系，您的反馈将有助于该项目的进一步完善。
10 | 		</p>
11 | 		<p>
12 |       <h2>致谢</h2>
13 |             本项目在研究过程中，得到了恩师樊孝忠教授的悉心指导，师恩如海，难以言谢！<br/>
14 | 			中国人民大学为本项目的持续研究提供了资金和计算机软硬件的支持，北京理工大学为本项目的早期研究提供了重要的基础设施，<br/>
15 | 			这些支持与国家的投入密不可分，
16 | 			本项目的开源和不断完善也算是对国家的点滴回报！<br/>
17 | 			代码中许多算法的核心思想来源于我们的研究同行和先辈们的已公开成果，另外，许多使用xsimilarity的人员对xsimilarity<br/>
18 | 			提出了宝贵的建议，在此一并表示深深的谢意！     <br/>
19 |         本工程使用了如下开源组件，对原作者致以谢意！
20 |         <ul>
21 |             <li>ANSJ： </li>
22 |         </ul>
23 |     </p>
24 | 		<p>
25 | 			<h2>联系方式</h2>
26 | 			夏天<br/>
27 | 			数据工程与知识工程教育部重点实验室（中国人民大学）<br/>
28 |       中国人民大学信息资源管理学院<br/>
29 |       电话: 86-10-82500675<br/>
30 |       Email: xiat(at)ruc.edu.cn<br/>
31 | 		</p>
32 | 		
33 | 	</body>
34 | </html>


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/CharBasedSimilarity.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.word;
 2 | 
 3 | import java.util.ArrayList;
 4 | import java.util.List;
 5 | 
 6 | import zx.soft.similarity.Similaritable;
 7 | 
 8 | /**
 9 |  * 字面相似度计算方法
10 |  *
11 |  */
12 | public class CharBasedSimilarity implements Similaritable {
13 | 
14 | 	private final double alpha = 0.6;
15 | 	private final double beta = 0.4;
16 | 
17 | 	@Override
18 | 	public double getSimilarity(String word1, String word2) {
19 | 		if (isBlank(word1) && isBlank(word2)) {
20 | 			return 1.0;
21 | 		}
22 | 		if (isBlank(word1) || isBlank(word2)) {
23 | 			return 0.0;
24 | 		}
25 | 
26 | 		List<Character> sameHZ = new ArrayList<>();
27 | 
28 | 		String longString = word1.length() >= word2.length() ? word1 : word2;
29 | 		String shortString = word1.length() < word2.length() ? word1 : word2;
30 | 		for (int i = 0; i < longString.length(); i++) {
31 | 			Character ch = longString.charAt(i);
32 | 			if (shortString.contains(ch.toString())) {
33 | 				sameHZ.add(ch);
34 | 			}
35 | 		}
36 | 
37 | 		double dp = Math.min(1.0 * word1.length() / word2.length(), 1.0 * word2.length() / word1.length());
38 | 		double part1 = alpha * (1.0 * sameHZ.size() / word1.length() + 1.0 * sameHZ.size() / word2.length()) / 2.0;
39 | 		double part2 = beta * dp * (getWeightedResult(word1, sameHZ) + getWeightedResult(word2, sameHZ)) / 2.0;
40 | 
41 | 		return part1 + part2;
42 | 	}
43 | 
44 | 	private double getWeightedResult(String word1, List<Character> sameHZ) {
45 | 		double top = 0;
46 | 		double bottom = 0;
47 | 		for (int i = 0; i < word1.length(); i++) {
48 | 			if (sameHZ.contains(word1.charAt(i))) {
49 | 				top += (i + 1);
50 | 			}
51 | 			bottom += (i + 1);
52 | 		}
53 | 		return 1.0 * top / bottom;
54 | 	}
55 | 
56 | 	private boolean isBlank(String str) {
57 | 		return str == null || str.trim().equals("");
58 | 	}
59 | 
60 | }
61 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/WordSimilarity.java:
--------------------------------------------------------------------------------
1 | package zx.soft.similarity.word;
2 | 
3 | import zx.soft.similarity.Similaritable;
4 | 
/**
 * Marker interface for similarity measures that work on words. It declares no
 * members of its own beyond what it inherits from {@code Similaritable}.
 */
public interface WordSimilarity extends Similaritable {

}
8 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/cilin/Cilin.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.word.cilin;
 2 | 
 3 | import java.util.Set;
 4 | 
 5 | import zx.soft.similarity.Similaritable;
 6 | 
 7 | public class Cilin implements Similaritable {
 8 | 
 9 | 	private static Cilin instance = null;
10 | 
11 | 	public static Cilin getInstance() {
12 | 		if (instance == null) {
13 | 			instance = new Cilin();
14 | 		}
15 | 		return instance;
16 | 	}
17 | 
18 | 	private Cilin() {
19 | 
20 | 	}
21 | 
22 | 	@Override
23 | 	public double getSimilarity(String item1, String item2) {
24 | 		double sim = 0.0;
25 | 
26 | 		if (item1 == null && item2 == null) {
27 | 			return 1.0;
28 | 		} else if (item1 == null || item2 == null) {
29 | 			return 0.0;
30 | 		} else if (item1.equalsIgnoreCase(item2)) {
31 | 			return 1.0;
32 | 		}
33 | 
34 | 		Set<String> codeSet1 = CilinDb.getInstance().getCilinCoding(item1);
35 | 		Set<String> codeSet2 = CilinDb.getInstance().getCilinCoding(item2);
36 | 		if (codeSet1 == null || codeSet2 == null) {
37 | 			return 0.0;
38 | 		}
39 | 		for (String code1 : codeSet1) {
40 | 			for (String code2 : codeSet2) {
41 | 				double s = getSimilarityByCode(code1, code2);
42 | 				System.out.println(code1 + "-" + code2 + "-" + CilinCoding.calculateCommonWeight(code1, code2));
43 | 				if (sim < s)
44 | 					sim = s;
45 | 			}
46 | 		}
47 | 		return sim;
48 | 	}
49 | 
50 | 	public double getSimilarityByCode(String code1, String code2) {
51 | 		return CilinCoding.calculateCommonWeight(code1, code2) / CilinCoding.TOTAL_WEIGHT;
52 | 	}
53 | 
54 | }
55 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/cilin/CilinCoding.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.word.cilin;
 2 | 
 3 | /**
 4 |  * 表2-3 哈工大词林扩展版规则编码表<br/>
 5 |  * <table border="1" style="color:red;">
 6 |  * <tr>
 7 |  * <td>编码位</td><td>1</td><td>	2</td><td>3</td><td>4</td><td>5</td><td>6</td><td>7</td><td>8</td>
 8 |  * </tr>
 9 |  * <tr>
10 |  * <td>编码示例</td><td>C</td><td>b</td><td>0</td><td>7</td><td>A</td><td>0</td><td>3</td><td>=</td>
11 |  * </tr><tr>
12 |  * <td>类别级别</td><td>第一级</td><td>第二级</td><td colspan="2">第三级</td><td>第四级</td><td colspan="2">第五级</td><td>标记位</td><td>
13 |  * </tr><tr>
14 |  * <td>类别含义</td><td>大类</td><td>中类</td><td colspan="2">小类</td><td>词群</td><td colspan="2">原子词群</td><td>词语关系</td>
15 |  * </tr>
16 |  * </table>
17 |  * <br/>
18 |  * 表中编码位从左到右顺序排列，其中，第8位对应的标记位为“=”、“#”和“@”三种符号之一。其中“=”代表常见的“同义”关系，“#”代表词语之间的相关关系，“@”则代表词语自我封闭的独立性质，它在词典中既没有同义词，也没有相关词。
19 |  * 
20 |  * 
21 |  */
22 | public class CilinCoding {
23 | 
24 | 	public static double[] WEIGHT = new double[] { 1.2, 1.2, 1.0, 1.0, 0.8, 0.4 };
25 | 	public static double TOTAL_WEIGHT = 5.6;
26 | 
27 | 	public static String getCodeLevel(String code, int level) {
28 | 		switch (level) {
29 | 		case 1:
30 | 			return code.substring(0, 1);
31 | 		case 2:
32 | 			return code.substring(1, 2);
33 | 		case 3:
34 | 			return code.substring(2, 4);
35 | 		case 4:
36 | 			return code.substring(4, 5);
37 | 		case 5:
38 | 			return code.substring(5, 7);
39 | 		case 6:
40 | 			return code.substring(7);
41 | 		}
42 | 
43 | 		return "";
44 | 	}
45 | 
46 | 	/**
47 | 	 * 获取共同部分编码的权重
48 | 	 * @param code1
49 | 	 * @param code2
50 | 	 * @return
51 | 	 */
52 | 	public static double calculateCommonWeight(String code1, String code2) {
53 | 		double weight = 0.0;
54 | 		for (int i = 1; i <= 6; i++) {
55 | 			String c1 = getCodeLevel(code1, i);
56 | 			String c2 = getCodeLevel(code2, i);
57 | 			if (c1.equals(c2)) {
58 | 				weight += WEIGHT[i - 1];
59 | 			} else {
60 | 				break;
61 | 			}
62 | 		}
63 | 		return weight;
64 | 	}
65 | 
66 | 	public static String printCoding(String code) {
67 | 		StringBuilder sb = new StringBuilder();
68 | 		for (int i = 1; i <= 6; i++) {
69 | 			if (i == 1) {
70 | 				sb.append("[LEVEL_" + i);
71 | 			} else {
72 | 				sb.append(", LEVEL_" + i);
73 | 			}
74 | 			sb.append(": ");
75 | 			sb.append(getCodeLevel(code, i));
76 | 		}
77 | 		sb.append("]");
78 | 
79 | 		return sb.toString();
80 | 	}
81 | 
82 | }
83 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/cilin/CilinDb.java:
--------------------------------------------------------------------------------
  1 | package zx.soft.similarity.word.cilin;
  2 | 
  3 | import java.io.IOException;
  4 | import java.io.InputStream;
  5 | import java.util.HashMap;
  6 | import java.util.HashSet;
  7 | import java.util.Map;
  8 | import java.util.Set;
  9 | import java.util.zip.GZIPInputStream;
 10 | 
 11 | import org.slf4j.Logger;
 12 | import org.slf4j.LoggerFactory;
 13 | 
 14 | import zx.soft.similarity.util.FileUtils;
 15 | import zx.soft.similarity.util.TraverseEvent;
 16 | 
 17 | /**
 18 |  * 词林数据库
 19 |  *
 20 |  */
 21 | public class CilinDb {
 22 | 
 23 | 	/** the logger */
 24 | 	protected static Logger logger = LoggerFactory.getLogger(CilinDb.class);
 25 | 	/** 以词语为主键的索引表 */
 26 | 	private final Map<String, Set<String>> wordIndex = new HashMap<>();
 27 | 	/** 以编码为主键的索引表 */
 28 | 	private final Map<String, Set<String>> codeIndex = new HashMap<>();
 29 | 
 30 | 	private static CilinDb instance = null;
 31 | 
 32 | 	public static CilinDb getInstance() {
 33 | 		if (instance == null) {
 34 | 			try {
 35 | 				instance = new CilinDb();
 36 | 			} catch (IOException e) {
 37 | 				logger.error("Exception:{}", e.getMessage());
 38 | 			}
 39 | 		}
 40 | 		return instance;
 41 | 	}
 42 | 
 43 | 	private CilinDb() throws IOException {
 44 | 		InputStream input = new GZIPInputStream(this.getClass().getClassLoader()
 45 | 				.getResourceAsStream("data/cilin.db.gz"));
 46 | 
 47 | 		TraverseEvent<String> event = new TraverseEvent<String>() {
 48 | 			@Override
 49 | 			public boolean visit(String line) {
 50 | 				String[] items = line.split(" ");
 51 | 				Set<String> set = new HashSet<>();
 52 | 				for (int i = 2; i < items.length; i++) {
 53 | 					String code = items[i].trim();
 54 | 					if (!code.equals("")) {
 55 | 						set.add(code);
 56 | 						//加入codeIndex编码
 57 | 						Set<String> codeWords = codeIndex.get(code);
 58 | 						if (codeWords == null) {
 59 | 							codeWords = new HashSet<>();
 60 | 						}
 61 | 						codeWords.add(items[0]);
 62 | 						codeIndex.put(code, codeWords);
 63 | 					}
 64 | 				}
 65 | 				wordIndex.put(items[0], set);
 66 | 				items = null;
 67 | 				return false;
 68 | 			}
 69 | 		};
 70 | 		logger.info("loading cilin dictionary...");
 71 | 		long time = System.currentTimeMillis();
 72 | 
 73 | 		FileUtils.traverseLines(input, "UTF8", event);
 74 | 
 75 | 		time = System.currentTimeMillis() - time;
 76 | 		logger.info("loading cilin dictionary completely. time elapsed:{}", time);
 77 | 	}
 78 | 
 79 | 	/**
 80 | 	 * 获取某个词语的词林编码，一个词语可以有多个编码，通过Set给出
 81 | 	 * @param word
 82 | 	 * @return
 83 | 	 */
 84 | 	public Set<String> getCilinCoding(String word) {
 85 | 		return wordIndex.get(word);
 86 | 	}
 87 | 
 88 | 	public Set<String> getCilinWords(String code) {
 89 | 		return codeIndex.get(code);
 90 | 	}
 91 | 
 92 | 	public static void main(String[] args) {
 93 | 		CilinDb db = CilinDb.getInstance();
 94 | 		String code = db.getCilinCoding("中国").iterator().next();
 95 | 		System.out.println(CilinCoding.printCoding(code));
 96 | 		System.out.println(db.getCilinWords(code));
 97 | 	}
 98 | 
 99 | }
100 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/cilin/cilin.db.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ml-distribution/semantic-similarity/8deb4382a41ad94a82db15ea945c6bfa8ac103f0/src/main/java/zx/soft/similarity/word/cilin/cilin.db.gz


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet/Hownet.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.word.hownet;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | import org.slf4j.Logger;
 6 | import org.slf4j.LoggerFactory;
 7 | 
 8 | import zx.soft.similarity.Similaritable;
 9 | import zx.soft.similarity.word.hownet2.concept.BaseConceptParser;
10 | import zx.soft.similarity.word.hownet2.concept.XiaConceptParser;
11 | import zx.soft.similarity.word.hownet2.sememe.BaseSememeParser;
12 | import zx.soft.similarity.word.hownet2.sememe.XiaSememeParser;
13 | 
14 | /**
15 |  * Hownet的主控制类, 通过知网的概念和义原及其关系计算汉语词语之间的相似度.
16 |  * 相似度的计算理论参考论文《汉语词语语义相似度计算研究》
17 |  *
18 |  * @see zx.soft.similarity.Similaritable
19 |  */
20 | public class Hownet implements Similaritable {
21 | 
22 | 	/** the logger */
23 | 	private static final Logger logger = LoggerFactory.getLogger(Hownet.class);
24 | 	/** 知网的单例 */
25 | 	private static Hownet instance = null;
26 | 
27 | 	private BaseConceptParser conceptParser = null;
28 | 
29 | 	private Hownet() {
30 | 		try {
31 | 			BaseSememeParser sememeParser = new XiaSememeParser();
32 | 			conceptParser = new XiaConceptParser(sememeParser);
33 | 		} catch (IOException e) {
34 | 			logger.error("Exception:{}", e.getMessage());
35 | 			e.printStackTrace();
36 | 		}
37 | 	}
38 | 
39 | 	/**
40 | 	 * 单例获取知网对象
41 | 	 * @return
42 | 	 */
43 | 	public static Hownet instance() {
44 | 		if (null == instance) {
45 | 			instance = new Hownet();
46 | 		}
47 | 		return instance;
48 | 	}
49 | 
50 | 	/**
51 | 	 * 获取概念解析器
52 | 	 * @return
53 | 	 */
54 | 	public BaseConceptParser getConceptParser() {
55 | 		return conceptParser;
56 | 	}
57 | 
58 | 	@Override
59 | 	public double getSimilarity(String item1, String item2) {
60 | 		return conceptParser.getSimilarity(item1, item2);
61 | 	}
62 | 
63 | }
64 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet/HownetMeta.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.word.hownet;
 2 | 
 3 | /**
 4 |  * Metadata for Hownet
 5 |  * 
 6 |  */
 7 | public interface HownetMeta {
 8 | 
 9 | 	/** Algorithm of XIA Tian */
10 | 	public static final int ALGORITHM_XIA = 1;
11 | 
12 | 	/** Algorithm of LIU Qun */
13 | 	public static final int ALGORITHM_LIU = 2;
14 | 
15 | 	/**
16 | 	 * Hownet symbol descriptions
17 | 	 */
18 | 	public static final String Symbol_Descriptions[][] = { { "#", "表示与其相关" }, { "%", "是其部分" },
19 | 			{ "$", "可以被该V处置，或是该V的受事、对象、领有物，或内容" }, { "*", "施事或工具" }, { "+", "所标记的角色是隐性的，几乎在实际语言中不会出现" }, { "&", "指向" },
20 | 			{ "~", "多半是，多半有，很可能" }, { "@", "可以做V的空间或时间" }, { "?", "可以使N的材料" }, { "(", "至于其中的应该是一个词标记" },
21 | 			{ "^", "不存在，或没有，或不能" }, { "!", "表示某一属性为一敏感的属性，如味道之与食物" }, { "[", "标示概念的共性属性" } };
22 | 
23 | 	/** γ：具体词与义元的相似度一律为一个较小的常数 */
24 | 	public static final double gamma = 0.2;
25 | 
26 | 	/** δ:任一个非空值与空值的相似度为一个较小的常数，此处为0.2 */
27 | 	public static final double delta = 0.2;
28 | 
29 | 	/** β1实词概念第一基本义原描述式的权重 */
30 | 	public static final double beta1 = 0.5;
31 | 	/** β2实词概念其他基本义原描述式的权重 */
32 | 	public static final double beta2 = 0.2;
33 | 	/** β3实词概念关系义原描述式的权重 */
34 | 	public static final double beta3 = 0.17;
35 | 	/** β4实词概念符号义原描述式的权重 */
36 | 	public static final double beta4 = 0.13;
37 | 
38 | 	/** 
39 | 	 * Θ 计算后面概念的义原与参照概念所有义原的最大相似度, 并乘以两个概念主义原相似度的积(主义原通过该方式起约束作用),
40 | 	 * 如果数值大于该值时才会起参照作用, 去掉冗余的不重要义原 
41 | 	 */
42 | 	public static final double PARAM_THETA = 0.5;
43 | 	/** 
44 | 	 * Ω 计算前面概念的义原与参照概念所有义原的最大相似度，并乘以两个概念主义原相似度的积(主义原通过该方式起约束作用),
45 | 	 * 如果数值大于该值时才会调整前面概念的义原符号, 以起修正作用
46 | 	 */
47 | 	public static final double PARAM_OMEGA = 0.8;
48 | 	/** */
49 | 	public static final double PARAM_XI = 0.6;
50 | 
51 | }
52 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet/concept/Concept.java:
--------------------------------------------------------------------------------
  1 | package zx.soft.similarity.word.hownet.concept;
  2 | 
  3 | import java.util.ArrayList;
  4 | import java.util.List;
  5 | import java.util.StringTokenizer;
  6 | 
  7 | import zx.soft.similarity.word.hownet.HownetMeta;
  8 | 
  9 | /**
 10 |  * 知网的概念表示类 <br/>example和英文部分对于相似度的计算不起作用，考虑到内存开销， 在概念的表示中去掉了这部分数据的对应定义
 11 |  *
 12 |  * @deprecated
 13 |  */
 14 | @Deprecated
 15 | public class Concept implements HownetMeta, Comparable<Concept> {
 16 | 
 17 | 	/** 中文概念名称 */
 18 | 	protected String word;
 19 | 	/** 词性: Part of Speech */
 20 | 	protected String pos;
 21 | 	/** 定义 */
 22 | 	protected String define;
 23 | 
 24 | 	/** 是否是实词，false表示为虚词, 一般为实词 */
 25 | 	protected boolean bSubstantive;
 26 | 	/** 第一基本义原 */
 27 | 	protected String mainSememe;
 28 | 	/** 其他基本义原 */
 29 | 	protected String[] secondSememes;
 30 | 	/** 关系义元原 */
 31 | 	protected String[] relationSememes;
 32 | 	/** 关系符号描述 */
 33 | 	protected String[] symbolSememes;
 34 | 
 35 | 	static String[][] Concept_Type = { { "=", "事件" }, { "aValue|属性值", "属性值" }, { "qValue|数量值", "数量值" },
 36 | 			{ "attribute|属性", "属性" }, { "quantity|数量", "数量" }, { "unit|", "单位" }, { "%", "部件" } };
 37 | 
 38 | 	public Concept(String word, String pos, String def) {
 39 | 		this.word = word;
 40 | 		this.pos = pos;
 41 | 		this.define = (def == null) ? "" : def.trim();
 42 | 
 43 | 		// 虚词用{***}表示
 44 | 		if (define.length() > 0 && define.charAt(0) == '{' && define.charAt(define.length() - 1) == '}') {
 45 | 			this.bSubstantive = false;
 46 | 		} else {
 47 | 			this.bSubstantive = true;
 48 | 		}
 49 | 
 50 | 		parseDefine();
 51 | 	}
 52 | 
 53 | 	/**
 54 | 	 * 处理定义，把定义分为第一基本义元、其他基本义元、关系义元和符号义元四类
 55 | 	 */
 56 | 	private void parseDefine() {
 57 | 		List<String> secondList = new ArrayList<>(); //其他基本义原
 58 | 		List<String> relationList = new ArrayList<>(); //关系义原
 59 | 		List<String> symbolList = new ArrayList<>(); //符号义原
 60 | 
 61 | 		String tokenString = this.define;
 62 | 
 63 | 		//如果不是实词，则处理“{}”中的内容
 64 | 		if (!this.bSubstantive) {
 65 | 			tokenString = define.substring(1, define.length() - 1);
 66 | 		}
 67 | 
 68 | 		StringTokenizer token = new StringTokenizer(tokenString, ",", false);
 69 | 
 70 | 		// 第一个为第一基本义元
 71 | 		if (token.hasMoreTokens()) {
 72 | 			this.mainSememe = token.nextToken();
 73 | 		}
 74 | 
 75 | 		main_loop: while (token.hasMoreTokens()) {
 76 | 			String item = token.nextToken();
 77 | 			if (item.equals(""))
 78 | 				continue;
 79 | 
 80 | 			// 先判断是否为符号义元
 81 | 			String symbol = item.substring(0, 1);
 82 | 			for (int i = 0; i < Symbol_Descriptions.length; i++) {
 83 | 				if (symbol.equals(Symbol_Descriptions[i][0])) {
 84 | 					symbolList.add(item);
 85 | 					continue main_loop;
 86 | 				}
 87 | 			}
 88 | 
 89 | 			//如果不是符号义元，则进一步判断是关系义元还是第二基本义元, 带有“=”表示关系义原
 90 | 			if (item.indexOf('=') > 0) {
 91 | 				relationList.add(item);
 92 | 			} else {
 93 | 				secondList.add(item);
 94 | 			}
 95 | 		}
 96 | 
 97 | 		this.secondSememes = secondList.toArray(new String[secondList.size()]);
 98 | 		this.relationSememes = relationList.toArray(new String[relationList.size()]);
 99 | 		this.symbolSememes = symbolList.toArray(new String[symbolList.size()]);
100 | 	}
101 | 
102 | 	/**
103 | 	 * 获取第一义元
104 | 	 *
105 | 	 * @return
106 | 	 */
107 | 	public String getMainSememe() {
108 | 		return mainSememe;
109 | 	}
110 | 
111 | 	/**
112 | 	 * 获取其他基本义元描述
113 | 	 *
114 | 	 * @return
115 | 	 */
116 | 	public String[] getSecondSememes() {
117 | 		return secondSememes;
118 | 	}
119 | 
120 | 	/**
121 | 	 * 获取关系义元描述
122 | 	 *
123 | 	 * @return
124 | 	 */
125 | 	public String[] getRelationSememes() {
126 | 		return relationSememes;
127 | 	}
128 | 
129 | 	/**
130 | 	 * 获取符号义元描述
131 | 	 *
132 | 	 * @return
133 | 	 */
134 | 	public String[] getSymbolSememes() {
135 | 		return symbolSememes;
136 | 	}
137 | 
138 | 	@Override
139 | 	public String toString() {
140 | 		StringBuilder sb = new StringBuilder();
141 | 		sb.append("name=");
142 | 		sb.append(this.word);
143 | 		sb.append("; pos=");
144 | 		sb.append(this.pos);
145 | 		sb.append("; define=");
146 | 		sb.append(this.define);
147 | 		sb.append("; 第一基本义元:[" + mainSememe);
148 | 
149 | 		sb.append("]; 其他基本义元描述:[");
150 | 		for (String sem : secondSememes) {
151 | 			sb.append(sem);
152 | 			sb.append(";");
153 | 		}
154 | 
155 | 		sb.append("]; [关系义元描述:");
156 | 		for (String sem : relationSememes) {
157 | 			sb.append(sem);
158 | 			sb.append(";");
159 | 		}
160 | 
161 | 		sb.append("]; [关系符号描述:");
162 | 		for (String sem : symbolSememes) {
163 | 			sb.append(sem);
164 | 			sb.append(";");
165 | 		}
166 | 		sb.append("]");
167 | 		return sb.toString();
168 | 	}
169 | 
170 | 	/**
171 | 	 * 是实词还是虚词
172 | 	 *
173 | 	 * @return true:实词；false:虚词
174 | 	 */
175 | 	public boolean isSubstantive() {
176 | 		return this.bSubstantive;
177 | 	}
178 | 
179 | 	public String getWord() {
180 | 		return word;
181 | 	}
182 | 
183 | 	public void setWord(String word) {
184 | 		this.word = word;
185 | 	}
186 | 
187 | 	public String getPos() {
188 | 		return pos;
189 | 	}
190 | 
191 | 	public void setPos(String pos) {
192 | 		this.pos = pos;
193 | 	}
194 | 
195 | 	public String getDefine() {
196 | 		return define;
197 | 	}
198 | 
199 | 	public void setDefine(String define) {
200 | 		this.define = define;
201 | 	}
202 | 
203 | 	/**
204 | 	 * 获取该概念的类型
205 | 	 *
206 | 	 * @return
207 | 	 */
208 | 	public String getType() {
209 | 		for (int i = 0; i < Concept_Type.length; i++) {
210 | 			if (define.toUpperCase().indexOf(Concept_Type[i][0].toUpperCase()) >= 0) {
211 | 				return Concept_Type[i][1];
212 | 			}
213 | 		}
214 | 		return "普通概念";
215 | 	}
216 | 
217 | 	/**
218 | 	 * 按照概念的名称进行比较
219 | 	 */
220 | 	@Override
221 | 	public int compareTo(Concept o) {
222 | 		return word.compareTo(o.word);
223 | 	}
224 | 
225 | 	//////////////////////////////////////////////
226 | 	/**
227 | 	 * 方便在parse中比较概念词语加入的方法
228 | 	 * @param another
229 | 	 * @return
230 | 	 */
231 | 	public int compareTo(String another) {
232 | 		return word.compareTo(another);
233 | 	}
234 | 
235 | 	public boolean equals(String another) {
236 | 		return word.equals(another);
237 | 	}
238 | 
239 | }


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet/concept/ConceptDictTraverseEvent.java:
--------------------------------------------------------------------------------
  1 | package zx.soft.similarity.word.hownet.concept;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.File;
  5 | import java.io.FileOutputStream;
  6 | import java.io.InputStream;
  7 | import java.io.InputStreamReader;
  8 | import java.io.PrintWriter;
  9 | import java.util.ArrayList;
 10 | import java.util.Arrays;
 11 | import java.util.List;
 12 | 
 13 | import javax.xml.parsers.DocumentBuilder;
 14 | import javax.xml.parsers.DocumentBuilderFactory;
 15 | import javax.xml.transform.OutputKeys;
 16 | import javax.xml.transform.Transformer;
 17 | import javax.xml.transform.TransformerFactory;
 18 | import javax.xml.transform.dom.DOMSource;
 19 | import javax.xml.transform.stream.StreamResult;
 20 | 
 21 | import org.w3c.dom.Document;
 22 | import org.w3c.dom.Element;
 23 | 
 24 | import zx.soft.similarity.util.TraverseEvent;
 25 | 
 26 | /**
 27 |  * 实现遍历加载概念信息到概念表中, 概念词典的组织以知网导出的格式为标准，格式如下：<br/>
 28 |  * 阿斗                	N    	human|人,ProperName|专,past|昔<br/>
 29 |  * 阿爸                	N    	human|人,family|家,male|男<br/>
 30 |  * 即： &lt;概念&gt; &lt;空格或者跳格&gt; &lt;词性&gt; &lt;空格或者跳格&gt; &lt;定义&gt;"
 31 |  * <br/>
 32 |  * 概念保存到数组中，没有保存到Map中，可以降低对内存空间的使用
 33 |  *
 34 |  * @deprecated
 35 |  */
 36 | @Deprecated
 37 | public class ConceptDictTraverseEvent implements TraverseEvent<String> {
 38 | 
 39 | 	private List<Concept> conceptList = null;
 40 | 
 41 | 	public ConceptDictTraverseEvent() {
 42 | 		conceptList = new ArrayList<>();
 43 | 	}
 44 | 
 45 | 	public Concept[] getConcepts() {
 46 | 		Concept[] concepts = conceptList.toArray(new Concept[conceptList.size()]);
 47 | 		Arrays.sort(concepts);
 48 | 		return concepts;
 49 | 	}
 50 | 
 51 | 	/**
 52 | 	 * 读取概念词典中的一行，并进行解析处理
 53 | 	 */
 54 | 	@Override
 55 | 	public boolean visit(String line) {
 56 | 		String word = null;
 57 | 		String pos = null;
 58 | 		String define = "";
 59 | 		char ch;
 60 | 
 61 | 		// 以符号//开始的是注释行
 62 | 		if (line.startsWith("//")) {
 63 | 			return true;
 64 | 		}
 65 | 
 66 | 		int lastPosition = 0; // 最近一次处理内容的有意义的开始位置
 67 | 		int processFlag = 0; // 当前处理部分的标志 0：处理word； 1：词性；2：定义
 68 | 		// 解析出一行中的概念各项数据
 69 | 		loop: for (int position = 0; position < line.length(); position++) {
 70 | 			ch = line.charAt(position);
 71 | 
 72 | 			if ((ch == ' ') || (ch == '\t') || (position == (line.length() - 1))) {
 73 | 				String item = line.substring(lastPosition, (position == (line.length() - 1)) ? (position + 1)
 74 | 						: position);
 75 | 				switch (processFlag) {
 76 | 				case 0:
 77 | 					word = item;
 78 | 					processFlag++;
 79 | 					break;
 80 | 				case 1:
 81 | 					pos = item;
 82 | 					processFlag++;
 83 | 					break;
 84 | 				case 2:
 85 | 					//define = item;
 86 | 					//processFlag++;
 87 | 					define = line.substring(lastPosition).trim();
 88 | 					break loop;
 89 | 				case 3:
 90 | 					System.out.println(line);
 91 | 					break;
 92 | 				}
 93 | 
 94 | 				for (; (position < line.length()); position++) {
 95 | 					ch = line.charAt(position);
 96 | 					if ((ch != ' ') && (ch != '\t')) {
 97 | 						lastPosition = position;
 98 | 						break;
 99 | 					}
100 | 				}
101 | 
102 | 			}
103 | 		}
104 | 		conceptList.add(new Concept(word, pos, define));
105 | 		return true;
106 | 	}
107 | 
108 | 	public void saveToXML(File xmlFile) throws Exception {
109 | 		InputStream input = this.getClass().getClassLoader().getResourceAsStream("data/concept.dat");
110 | 		BufferedReader in = new BufferedReader(new InputStreamReader(input, "utf8"));
111 | 
112 | 		DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
113 | 		DocumentBuilder builder = factory.newDocumentBuilder();
114 | 		Document document = builder.newDocument();
115 | 		Element root = document.createElement("concepts");
116 | 		document.appendChild(root);
117 | 
118 | 		String line = null;
119 | 
120 | 		while ((line = in.readLine()) != null) {
121 | 			saveLineToXML(document, root, line);
122 | 		}
123 | 
124 | 		input.close();
125 | 		in.close();
126 | 
127 | 		TransformerFactory tf = TransformerFactory.newInstance();
128 | 		Transformer transformer = tf.newTransformer();
129 | 		DOMSource source = new DOMSource(document);
130 | 		transformer.setOutputProperty(OutputKeys.ENCODING, "utf8");
131 | 		transformer.setOutputProperty(OutputKeys.INDENT, "yes");
132 | 		PrintWriter pw = new PrintWriter(new FileOutputStream(xmlFile));
133 | 		StreamResult result = new StreamResult(pw);
134 | 		transformer.transform(source, result);
135 | 	}
136 | 
137 | 	/**
138 | 	 * 读取概念词典中的一行，并进行解析处理
139 | 	 */
140 | 	private boolean saveLineToXML(Document document, Element root, String line) {
141 | 		String word = null;
142 | 		String pos = null;
143 | 		String define = "";
144 | 		char ch;
145 | 
146 | 		//以符号//开始的是注释行
147 | 		if (line.startsWith("//")) {
148 | 			return true;
149 | 		}
150 | 
151 | 		int lastPosition = 0; //最近一次处理内容的有意义的开始位置
152 | 		int processFlag = 0; //当前处理部分的标志 0：处理word； 1：词性；2：定义
153 | 		//解析出一行中的概念各项数据
154 | 		loop: for (int position = 0; position < line.length(); position++) {
155 | 			ch = line.charAt(position);
156 | 
157 | 			if ((ch == ' ') || (ch == '\t') || (position == (line.length() - 1))) {
158 | 				String item = line.substring(lastPosition, (position == (line.length() - 1)) ? (position + 1)
159 | 						: position);
160 | 				switch (processFlag) {
161 | 				case 0:
162 | 					word = item;
163 | 					processFlag++;
164 | 					break;
165 | 				case 1:
166 | 					pos = item;
167 | 					processFlag++;
168 | 					break;
169 | 				case 2:
170 | 					//define = item;
171 | 					//processFlag++;
172 | 					define = line.substring(lastPosition).trim();
173 | 					break loop;
174 | 				case 3:
175 | 					System.out.println(line);
176 | 					break;
177 | 				}
178 | 
179 | 				for (; (position < line.length()); position++) {
180 | 					ch = line.charAt(position);
181 | 					if ((ch != ' ') && (ch != '\t')) {
182 | 						lastPosition = position;
183 | 						break;
184 | 					}
185 | 				}
186 | 
187 | 			}
188 | 		}
189 | 
190 | 		Element e = document.createElement("c");
191 | 		e.setAttribute("w", word);
192 | 		e.setAttribute("p", pos);
193 | 		e.setAttribute("d", define);
194 | 		root.appendChild(e);
195 | 		return true;
196 | 	}
197 | 
198 | 	public static void main(String[] args) throws Exception {
199 | 		new ConceptDictTraverseEvent().saveToXML(new File("/XXX/concept.xml"));
200 | 	}
201 | 
202 | }
203 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet/concept/ConceptLinkedList.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.word.hownet.concept;
 2 | 
 3 | import java.util.LinkedList;
 4 | 
 5 | /**
 6 |  * 用于概念处理的LinkedList
 7 |  * 
 8 |  * @param <T>
 9 |  * @deprecated
10 |  */
11 | @Deprecated
12 | @SuppressWarnings("serial")
13 | public class ConceptLinkedList extends LinkedList<Concept> {
14 | 
15 | 	/**
16 | 	 * 删除链表中最后面的size个元素
17 | 	 * @param size
18 | 	 */
19 | 	public void removeLast(int size) {
20 | 		for (int i = 0; i < size; i++) {
21 | 			this.removeLast();
22 | 		}
23 | 	}
24 | 
25 | 	/**
26 | 	 * 根据概念的定义判断是否已经加入到链表中
27 | 	 * @param concept
28 | 	 */
29 | 	public void addByDefine(Concept concept) {
30 | 		for (Concept c : this) {
31 | 			if (c.getDefine().equals(concept.getDefine())) {
32 | 				return;
33 | 			}
34 | 		}
35 | 
36 | 		this.add(concept);
37 | 	}
38 | 
39 | }
40 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet/concept/LiuConceptParser.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.word.hownet.concept;
 2 | 
 3 | import java.io.IOException;
 4 | import java.util.Collection;
 5 | 
 6 | import zx.soft.similarity.util.BlankUtils;
 7 | import zx.soft.similarity.word.hownet.sememe.LiuqunSememeParser;
 8 | import zx.soft.similarity.word.hownet.sememe.SememeParser;
 9 | 
10 | /**
11 |  * 刘群老师的相似度计算方式，对概念解析的处理方式
12 |  *
13 |  * @deprecated
14 |  */
15 | @Deprecated
16 | public class LiuConceptParser extends ConceptParser {
17 | 
18 | 	private static LiuConceptParser instance = null;
19 | 
20 | 	public static LiuConceptParser getInstance() {
21 | 		if (instance == null) {
22 | 			try {
23 | 				instance = new LiuConceptParser();
24 | 			} catch (IOException e) {
25 | 				e.printStackTrace();
26 | 			}
27 | 		}
28 | 
29 | 		return instance;
30 | 	}
31 | 
32 | 	private LiuConceptParser(SememeParser sememeParser) throws IOException {
33 | 		super(sememeParser);
34 | 	}
35 | 
36 | 	private LiuConceptParser() throws IOException {
37 | 		super(new LiuqunSememeParser());
38 | 	}
39 | 
40 | 	@Override
41 | 	protected double calculate(double sim_v1, double sim_v2, double sim_v3, double sim_v4) {
42 | 		return beta1 * sim_v1 + beta2 * sim_v1 * sim_v2 + beta3 * sim_v1 * sim_v2 * sim_v3 + beta4 * sim_v1 * sim_v2
43 | 				* sim_v3 * sim_v4;
44 | 	}
45 | 
46 | 	@Override
47 | 	public double getSimilarity(String word1, String word2) {
48 | 		double similarity = 0.0;
49 | 
50 | 		// 如果两个句子相同,则直接返回1.0
51 | 		if (word1.equals(word2)) {
52 | 			return 1.0;
53 | 		}
54 | 
55 | 		Collection<Concept> concepts1 = getConcepts(word1);
56 | 		Collection<Concept> concepts2 = getConcepts(word2);
57 | 
58 | 		// 如果是blank，则说明是未登录词, 需要计算组合概念
59 | 		if (BlankUtils.isBlank(concepts1) || BlankUtils.isBlank(concepts2)) {
60 | 			return 0.0;
61 | 		}
62 | 
63 | 		// 两个for循环分别计算词语所有可能的概念的相似度
64 | 		for (Concept c1 : concepts1) {
65 | 			for (Concept c2 : concepts2) {
66 | 				double v = getSimilarity(c1, c2);
67 | 
68 | 				if (v > similarity) {
69 | 					similarity = v;
70 | 				}
71 | 
72 | 				if (similarity == 1.0) {
73 | 					break;
74 | 				}
75 | 			}
76 | 		}
77 | 
78 | 		return similarity;
79 | 	}
80 | 
81 | }
82 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet/sememe/FastSimpleMap.java:
--------------------------------------------------------------------------------
  1 | package zx.soft.similarity.word.hownet.sememe;
  2 | 
  3 | import java.io.IOException;
  4 | import java.util.ArrayList;
  5 | import java.util.Collection;
  6 | 
  7 | /**
  8 |  * 一种新的Map，跟标准的Map不同，它的的Key可以有重复, 内部采用快速排序和二分查找,
  9 |  * 保持较少的变量，结构简单，可根据主键查找返回的结果是一个数组
 10 |  *
 11 |  * @param <T>
 12 |  * @param <V>
 13 |  * @deprecated
 14 |  */
 15 | @Deprecated
 16 | public class FastSimpleMap<K extends Comparable<K>, V> {
 17 | 
 18 | 	private final K[] keys;
 19 | 	private final V[] values;
 20 | 
 21 | 	public FastSimpleMap(K[] keys, V[] values) throws IOException {
 22 | 		if (keys.length != values.length) {
 23 | 			throw new IOException("keys length must be equals values");
 24 | 		}
 25 | 		this.keys = keys;
 26 | 		this.values = values;
 27 | 
 28 | 		// 根据keys进行排序
 29 | 		quicksort(0, keys.length - 1);
 30 | 	}
 31 | 
 32 | 	/**
 33 | 	 * 查找键对应的值集合
 34 | 	 * @param key
 35 | 	 * @return
 36 | 	 */
 37 | 	public Collection<V> get(K key) {
 38 | 		int low = 0;
 39 | 		int high = keys.length - 1;
 40 | 
 41 | 		Collection<V> results = new ArrayList<V>();
 42 | 
 43 | 		while (low <= high) {
 44 | 			int mid = (low + high) >> 1;
 45 | 			K item = keys[mid];
 46 | 			int cmp = key.compareTo(item);
 47 | 
 48 | 			if (cmp > 0) {
 49 | 				low = mid + 1;
 50 | 			} else if (cmp < 0) {
 51 | 				high = mid - 1;
 52 | 			} else {
 53 | 				// 找到起始位置，该位置前后相同的都是该主键对应的值
 54 | 				for (int i = mid; i >= 0 && keys[i].equals(key); i--) {
 55 | 					results.add(values[i]);
 56 | 				}
 57 | 				for (int i = mid + 1; i < keys.length && keys[i].equals(key); i++) {
 58 | 					results.add(values[i]);
 59 | 				}
 60 | 
 61 | 				break;
 62 | 			}
 63 | 		}
 64 | 
 65 | 		return results;
 66 | 	}
 67 | 
 68 | 	/**
 69 | 	 * 根据keys快速排序，排序的同时交换values
 70 | 	 *
 71 | 	 * @param a
 72 | 	 * @param low
 73 | 	 * @param high
 74 | 	 */
 75 | 	private void quicksort(int low, int high) {
 76 | 		// low is the lower index, high is the upper index
 77 | 		// of the region of array a that is to be sorted
 78 | 		int i = low, j = high;
 79 | 		K h;
 80 | 		V v;
 81 | 		K x = keys[(low + high) >> 1];
 82 | 
 83 | 		// partition
 84 | 		do {
 85 | 			while (keys[i].compareTo(x) < 0)
 86 | 				i++;
 87 | 			while (keys[j].compareTo(x) > 0)
 88 | 				j--;
 89 | 
 90 | 			if (i <= j) {
 91 | 				h = keys[i];
 92 | 				keys[i] = keys[j];
 93 | 				keys[j] = h;
 94 | 				v = values[i];
 95 | 				values[i] = values[j];
 96 | 				values[j] = v;
 97 | 				i++;
 98 | 				j--;
 99 | 			}
100 | 		} while (i <= j);
101 | 
102 | 		// recursion
103 | 		if (low < j)
104 | 			quicksort(low, j);
105 | 		if (i < high)
106 | 			quicksort(i, high);
107 | 	}
108 | 
109 | }
110 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet/sememe/LiuqunSememeParser.java:
--------------------------------------------------------------------------------
  1 | package zx.soft.similarity.word.hownet.sememe;
  2 | 
  3 | import java.io.IOException;
  4 | 
  5 | /**
  6 |  * 刘群老师计算义原相似度的方法, 实现了SememeParser中定义的抽象方法
  7 |  * 
  8 |  * @version 1.0
  9 |  * @deprecated
 10 |  */
 11 | @Deprecated
 12 | public class LiuqunSememeParser extends SememeParser {
 13 | 
 14 | 	/** 计算义元相似度的可调节的参数，默认为1.6 */
 15 | 	private final float alpha = 1.6f;
 16 | 
 17 | 	public LiuqunSememeParser() throws IOException {
 18 | 		super();
 19 | 	}
 20 | 
 21 | 	/**
 22 | 	 * 计算两个义元之间的相似度，由于义元可能相同，计算结果为其中相似度最大者 
 23 | 	 * <br/>similarity = alpha/(distance+alpha)
 24 | 	 * 
 25 | 	 * @param key1
 26 | 	 * @param key2
 27 | 	 * @return
 28 | 	 */
 29 | 	@Override
 30 | 	public double getSimilarity(String item1, String item2) {
 31 | 		int pos;
 32 | 
 33 | 		// 如果为空串，直接返回0
 34 | 		if (item1 == null || item2 == null || item1.equals("") || item2.equals(""))
 35 | 			return 0.0;
 36 | 
 37 | 		String key1 = item1.trim();
 38 | 		String key2 = item2.trim();
 39 | 
 40 | 		// 去掉()符号
 41 | 		if ((key1.charAt(0) == '(') && (key1.charAt(key1.length() - 1) == ')')) {
 42 | 			if (key2.charAt(0) == '(' && key2.charAt(key2.length() - 1) == ')') {
 43 | 				key1 = key1.substring(1, key1.length() - 1);
 44 | 				key2 = key2.substring(1, key2.length() - 1);
 45 | 			} else {
 46 | 				return 0.0;
 47 | 			}
 48 | 		}
 49 | 
 50 | 		// 处理关系义元,即x=y的情况
 51 | 		if ((pos = key1.indexOf('=')) > 0) {
 52 | 			int pos2 = key2.indexOf('=');
 53 | 			// 如果是关系义元，则判断前面部分是否相同，如果相同，则转为计算后面部分的相似度，否则为0
 54 | 			if ((pos == pos2) && key1.substring(0, pos).equals(key2.substring(0, pos2))) {
 55 | 				key1 = key1.substring(pos + 1);
 56 | 				key2 = key2.substring(pos2 + 1);
 57 | 			} else {
 58 | 				return 0.0;
 59 | 			}
 60 | 		}
 61 | 
 62 | 		// 处理符号义元,即前面有特殊符号的义元
 63 | 		String symbol1 = key1.substring(0, 1);
 64 | 		String symbol2 = key2.substring(0, 1);
 65 | 
 66 | 		for (int i = 0; i < Symbol_Descriptions.length; i++) {
 67 | 			if (symbol1.equals(Symbol_Descriptions[i][0])) {
 68 | 				if (symbol1.equals(symbol2)) {
 69 | 					key1 = item1.substring(1);
 70 | 					key2 = item2.substring(1);
 71 | 					break;
 72 | 				} else {
 73 | 					return 0.0; // 如果不是同一关系符号，则相似度直接返回0
 74 | 				}
 75 | 			}
 76 | 		}
 77 | 
 78 | 		if ((pos = key1.indexOf("|")) >= 0) {
 79 | 			key1 = key1.substring(pos + 1);
 80 | 		}
 81 | 		if ((pos = key2.indexOf("|")) >= 0) {
 82 | 			key2 = key2.substring(pos + 1);
 83 | 		}
 84 | 
 85 | 		int distance = getDistance(key1, key2);
 86 | 		if (distance < 0)
 87 | 			return 0.0;
 88 | 		else
 89 | 			return alpha / (distance + alpha);
 90 | 	}
 91 | 
 92 | 	@Override
 93 | 	public double getSimilarity(Sememe sem1, Sememe sem2) {
 94 | 		int distance = getDistance(sem1, sem2);
 95 | 		if (distance <= 0)
 96 | 			return 0.0f;
 97 | 		else
 98 | 			return alpha / (distance + alpha);
 99 | 	}
100 | 
101 | }
102 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet/sememe/MySememeParser.java:
--------------------------------------------------------------------------------
  1 | package zx.soft.similarity.word.hownet.sememe;
  2 | 
  3 | import java.io.IOException;
  4 | 
  5 | import zx.soft.similarity.util.BlankUtils;
  6 | 
  7 | /**
  8 |  * Sememe similarity calculator; implements the abstract methods of SememeParser.
  9 |  *
 10 |  * @deprecated
 11 |  */
 12 | @Deprecated
 13 | public class MySememeParser extends SememeParser {
 14 | 
 15 | 	public MySememeParser() throws IOException {
 16 | 		super();
 17 | 	}
 18 | 
 19 | 	/**
 20 | 	 * Scores two sememes by the depth of their nearest common ancestor:
 21 | 	 * {@code 2 * depth(ancestor) / (depth(sememe1) + depth(sememe2))}.
 22 | 	 *
 23 | 	 * @param sememe1 first sememe, may be null
 24 | 	 * @param sememe2 second sememe, may be null
 25 | 	 * @return 0.0 when either argument is null or the walk reaches a root without
 26 | 	 *         meeting, 1.0 when both ids are equal, otherwise the depth ratio above
 27 | 	 */
 28 | 	@Override
 29 | 	public double getSimilarity(final Sememe sememe1, final Sememe sememe2) {
 30 | 		Sememe sem1 = sememe1;
 31 | 		Sememe sem2 = sememe2;
 32 | 
 33 | 		if (sememe1 == null || sememe2 == null) {
 34 | 			return 0.0f;
 35 | 		} else if (sememe1.getId() == sememe2.getId()) {
 36 | 			return 1.0f;
 37 | 		}
 38 | 
 39 | 		// Lift the deeper sememe until both sit at the same depth.
 40 | 		int level = sememe1.getDepth() - sememe2.getDepth();
 41 | 		for (int i = 0; i < Math.abs(level); i++) {
 42 | 			if (level > 0) {
 43 | 				sem1 = SEMEMES[sem1.getParentId()];
 44 | 			} else {
 45 | 				sem2 = SEMEMES[sem2.getParentId()];
 46 | 			}
 47 | 		}
 48 | 
 49 | 		// Climb both branches in lock step until they meet.
 50 | 		while (sem1.getId() != sem2.getId()) {
 51 | 			// A root is its own parent; reaching one without meeting means the two
 52 | 			// sememes live in different trees.
 53 | 			if (sem1.getId() == sem1.getParentId() || sem2.getId() == sem2.getParentId()) {
 54 | 				return 0.0f;
 55 | 			}
 56 | 
 57 | 			sem1 = SEMEMES[sem1.getParentId()];
 58 | 			sem2 = SEMEMES[sem2.getParentId()];
 59 | 		}
 60 | 
 61 | 		return sem1.getDepth() * 2.0f / (sememe1.getDepth() + sememe2.getDepth());
 62 | 	}
 63 | 
 64 | 	/**
 65 | 	 * Computes the similarity of two sememe description strings. Both strings are
 66 | 	 * normalized in the same way (surrounding parentheses, an {@code x=} relation
 67 | 	 * prefix, a leading relation symbol and the English part before '|' are removed).
 68 | 	 * Because one Chinese name may map to several sememes, the result is the highest
 69 | 	 * similarity over all candidate pairs.
 70 | 	 *
 71 | 	 * @param item1 first sememe description
 72 | 	 * @param item2 second sememe description
 73 | 	 * @return 1.0 when both are blank or equal; 0.0 when only one is blank, when the
 74 | 	 *         relation prefixes or relation symbols differ, or when item1 is
 75 | 	 *         parenthesized and item2 is not; otherwise the best pairwise similarity
 76 | 	 */
 77 | 	@Override
 78 | 	public double getSimilarity(String item1, String item2) {
 79 | 		// Both arguments must be checked here; testing item2 twice made a blank
 80 | 		// item2 score 1.0 against any item1.
 81 | 		if (BlankUtils.isBlankAll(item1, item2)) {
 82 | 			return 1.0;
 83 | 		} else if (BlankUtils.isBlankAtLeastOne(item1, item2)) {
 84 | 			return 0.0;
 85 | 		} else if (item1.equals(item2)) {
 86 | 			return 1.0;
 87 | 		}
 88 | 
 89 | 		String key1 = item1.trim();
 90 | 		String key2 = item2.trim();
 91 | 		if (key1.isEmpty() || key2.isEmpty()) {
 92 | 			// Nothing left to compare; charAt(0) below would throw on an empty key.
 93 | 			return key1.equals(key2) ? 1.0 : 0.0;
 94 | 		}
 95 | 
 96 | 		// Remove a surrounding "(...)" pair; item1 parenthesized but item2 not scores 0.
 97 | 		if ((key1.charAt(0) == '(') && (key1.charAt(key1.length() - 1) == ')')) {
 98 | 			if (key2.charAt(0) == '(' && key2.charAt(key2.length() - 1) == ')') {
 99 | 				key1 = key1.substring(1, key1.length() - 1);
100 | 				key2 = key2.substring(1, key2.length() - 1);
101 | 			} else {
102 | 				return 0.0;
103 | 			}
104 | 		}
105 | 
106 | 		// Relation sememes of the form x=y: the x parts must match, then only the y
107 | 		// parts are compared.
108 | 		int pos = key1.indexOf('=');
109 | 		if (pos > 0) {
110 | 			int pos2 = key2.indexOf('=');
111 | 			if ((pos == pos2) && key1.substring(0, pos).equals(key2.substring(0, pos2))) {
112 | 				key1 = key1.substring(pos + 1);
113 | 				key2 = key2.substring(pos2 + 1);
114 | 			} else {
115 | 				return 0.0;
116 | 			}
117 | 		}
118 | 
119 | 		if (key1.isEmpty() || key2.isEmpty()) {
120 | 			// "()" or "x=" leaves nothing behind; substring(0, 1) would throw.
121 | 			return key1.equals(key2) ? 1.0 : 0.0;
122 | 		}
123 | 
124 | 		// Symbol sememes: a leading relation symbol must be the same on both sides.
125 | 		String symbol1 = key1.substring(0, 1);
126 | 		String symbol2 = key2.substring(0, 1);
127 | 
128 | 		for (int i = 0; i < Symbol_Descriptions.length; i++) {
129 | 			if (symbol1.equals(Symbol_Descriptions[i][0])) {
130 | 				if (symbol1.equals(symbol2)) {
131 | 					// Strip the symbol from the normalized keys, not from the raw
132 | 					// items, so the parenthesis/relation stripping above is kept.
133 | 					key1 = key1.substring(1);
134 | 					key2 = key2.substring(1);
135 | 					break;
136 | 				} else {
137 | 					return 0.0; // different relation symbols
138 | 				}
139 | 			}
140 | 		}
141 | 
142 | 		// Keep only the Chinese part after '|'.
143 | 		if ((pos = key1.indexOf("|")) >= 0) {
144 | 			key1 = key1.substring(pos + 1);
145 | 		}
146 | 		if ((pos = key2.indexOf("|")) >= 0) {
147 | 			key2 = key2.substring(pos + 1);
148 | 		}
149 | 
150 | 		// Identical names are fully similar even if they are not known sememes.
151 | 		if (key1.equals(key2)) {
152 | 			return 1.0;
153 | 		}
154 | 
155 | 		Integer[] myset1 = getSememes(key1);
156 | 		Integer[] myset2 = getSememes(key2);
157 | 
158 | 		// Best score over every pair of candidate sememes; 0.0 if either set is empty.
159 | 		double similarity = 0.0;
160 | 		for (int id1 : myset1) {
161 | 			for (int id2 : myset2) {
162 | 				double s = getSimilarity(SEMEMES[id1], SEMEMES[id2]);
163 | 				if (s > similarity) {
164 | 					similarity = s;
165 | 				}
166 | 			}
167 | 		}
168 | 
169 | 		return similarity;
170 | 	}
171 | 
172 | }


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet/sememe/Sememe.java:
--------------------------------------------------------------------------------
  1 | package zx.soft.similarity.word.hownet.sememe;
  2 | 
  3 | /**
  4 |  * Basic value object describing one HowNet sememe: its id, parent id, depth in the
  5 |  * sememe tree, Chinese and English names, optional definition text and type.
  6 |  *
  7 |  * @deprecated
  8 |  */
  9 | @Deprecated
 10 | public class Sememe {
 11 | 
 12 | 	/** Sememe id. */
 13 | 	private int id;
 14 | 	/** Id of the parent (hypernym) sememe. */
 15 | 	private int parentId;
 16 | 	/** Depth of this sememe in the sememe tree. */
 17 | 	private int depth;
 18 | 	/** Chinese name of the sememe. */
 19 | 	private String cnWord;
 20 | 	/** English name of the sememe. */
 21 | 	private String enWord;
 22 | 	/** Definition of the sememe; the empty string when the line carries none. */
 23 | 	private String define;
 24 | 	/** Sememe type. */
 25 | 	private int type;
 26 | 
 27 | 	/**
 28 | 	 * Builds a sememe from one dictionary line such as<br/>
 29 | 	 * be|是 {relevant,isa}/{relevant,descriptive}<br/>
 30 | 	 * official|官 [#organization|组织,#employee|员]<br/>
 31 | 	 * amount|多少<br/>
 32 | 	 * The text before '|' becomes the English name, the text up to the first space
 33 | 	 * after it the Chinese name, and the trimmed remainder the definition.
 34 | 	 * @param id sememe id
 35 | 	 * @param parentId id of the parent sememe
 36 | 	 * @param depth depth in the sememe tree
 37 | 	 * @param item one line read from the dictionary file
 38 | 	 */
 39 | 	public Sememe(int id, int parentId, int depth, String item) {
 40 | 		this.id = id;
 41 | 		this.parentId = parentId;
 42 | 		this.depth = depth;
 43 | 		// The field contract promises "" rather than null when no definition exists.
 44 | 		this.define = "";
 45 | 
 46 | 		int pos = item.indexOf('|');
 47 | 		if (pos < 0) {
 48 | 			// No separator: the whole line serves as both names.
 49 | 			this.cnWord = item;
 50 | 			this.enWord = item;
 51 | 		} else {
 52 | 			this.enWord = item.substring(0, pos);
 53 | 
 54 | 			// Skip the '|' itself.
 55 | 			String nextPart = item.substring(pos + 1);
 56 | 			pos = nextPart.indexOf(' ');
 57 | 			if (pos <= 0) {
 58 | 				this.cnWord = nextPart;
 59 | 			} else {
 60 | 				this.cnWord = nextPart.substring(0, pos);
 61 | 				this.define = nextPart.substring(pos).trim();
 62 | 			}
 63 | 		}
 64 | 	}
 65 | 
 66 | 	public int getId() {
 67 | 		return id;
 68 | 	}
 69 | 
 70 | 	public void setId(int id) {
 71 | 		this.id = id;
 72 | 	}
 73 | 
 74 | 	public int getParentId() {
 75 | 		return parentId;
 76 | 	}
 77 | 
 78 | 	public void setParentId(int parentId) {
 79 | 		this.parentId = parentId;
 80 | 	}
 81 | 
 82 | 	public int getDepth() {
 83 | 		return depth;
 84 | 	}
 85 | 
 86 | 	public void setDepth(int depth) {
 87 | 		this.depth = depth;
 88 | 	}
 89 | 
 90 | 	public String getCnWord() {
 91 | 		return cnWord;
 92 | 	}
 93 | 
 94 | 	public void setCnWord(String cnWord) {
 95 | 		this.cnWord = cnWord;
 96 | 	}
 97 | 
 98 | 	public String getEnWord() {
 99 | 		return enWord;
100 | 	}
101 | 
102 | 	public void setEnWord(String enWord) {
103 | 		this.enWord = enWord;
104 | 	}
105 | 
106 | 	public String getDefine() {
107 | 		return define;
108 | 	}
109 | 
110 | 	public void setDefine(String define) {
111 | 		this.define = define;
112 | 	}
113 | 
114 | 	public int getType() {
115 | 		return type;
116 | 	}
117 | 
118 | 	public void setType(int type) {
119 | 		this.type = type;
120 | 	}
121 | 
122 | 	/** Renders every field except the type as "name=value" pairs separated by "; ". */
123 | 	@Override
124 | 	public String toString() {
125 | 		StringBuilder sb = new StringBuilder();
126 | 		sb.append("id=").append(id);
127 | 		sb.append("; parentId=").append(parentId);
128 | 		sb.append("; depth=").append(depth);
129 | 		sb.append("; cnWord=").append(cnWord);
130 | 		sb.append("; enWord=").append(enWord);
131 | 		sb.append("; define=").append(define);
132 | 		return sb.toString();
133 | 	}
134 | 
135 | }
136 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet/sememe/SememeDictTraverseEvent.java:
--------------------------------------------------------------------------------
  1 | package zx.soft.similarity.word.hownet.sememe;
  2 | 
  3 | import java.io.FileOutputStream;
  4 | import java.io.PrintWriter;
  5 | import java.util.ArrayList;
  6 | import java.util.List;
  7 | 
  8 | import javax.xml.parsers.DocumentBuilder;
  9 | import javax.xml.parsers.DocumentBuilderFactory;
 10 | import javax.xml.transform.OutputKeys;
 11 | import javax.xml.transform.Transformer;
 12 | import javax.xml.transform.TransformerFactory;
 13 | import javax.xml.transform.dom.DOMSource;
 14 | import javax.xml.transform.stream.StreamResult;
 15 | 
 16 | import org.w3c.dom.Document;
 17 | import org.w3c.dom.Element;
 18 | 
 19 | import zx.soft.similarity.util.TraverseEvent;
 20 | 
 21 | /**
 22 |  * Line visitor that loads a sememe dictionary in HowNet export layout into a list, e.g.:<br/>
 23 |  * - entity|实体 <br/>
 24 |  * ├ thing|万物 [#time|时间,#space|空间] <br/>
 25 |  * │ ├ physical|物质 [!appearance|外观] <br/>
 26 |  * │ │ ├ animate|生物 [*alive|活着,!age|年龄,*die|死,*metabolize|代谢] <br/>
 27 |  * │ │ │ ├ AnimalHuman|动物 [!sex|性别,*AlterLocation|变空间位置,*StateMental|精神状态] <br/>
 28 |  * │ │ │ │<br/>
 29 |  * and so on. <br>
 30 |  *
 31 |  * @deprecated
 32 |  */
 33 | @Deprecated
 34 | public class SememeDictTraverseEvent implements TraverseEvent<String> {
 35 | 
 36 | 	/** Loaded sememes; a sememe's id equals its index in this list. */
 37 | 	private List<Sememe> sememeList = null;
 38 | 
 39 | 	public SememeDictTraverseEvent() {
 40 | 		this.sememeList = new ArrayList<>();
 41 | 	}
 42 | 
 43 | 	/**
 44 | 	 * Returns the loaded sememes in index order; parent links are expressed as indices.
 45 | 	 * @return a new array holding every sememe read so far
 46 | 	 */
 47 | 	public Sememe[] getSememes() {
 48 | 		return sememeList.toArray(new Sememe[sememeList.size()]);
 49 | 	}
 50 | 
 51 | 	/** Creates a &lt;sememe> element carrying id, names and, when non-empty, the definition. */
 52 | 	private Element createSememeElement(Document document, Sememe sememe, String fullId) {
 53 | 		Element sememeNode = document.createElement("sememe");
 54 | 		sememeNode.setAttribute("id", fullId);
 55 | 		sememeNode.setAttribute("cn", sememe.getCnWord());
 56 | 		sememeNode.setAttribute("en", sememe.getEnWord());
 57 | 		if (sememe.getDefine() != null && !sememe.getDefine().equals("")) {
 58 | 			sememeNode.setAttribute("define", sememe.getDefine());
 59 | 		}
 60 | 		return sememeNode;
 61 | 	}
 62 | 
 63 | 	/**
 64 | 	 * Appends every descendant of sememe {@code parentId} to {@code root} in depth-first
 65 | 	 * order. Elements stay flat under the root; the hierarchy is encoded in the ids,
 66 | 	 * which extend {@code fullParentId} with "-1", "-2", ... per child.
 67 | 	 */
 68 | 	private void processXML(Document document, Element root, int parentId, String fullParentId) {
 69 | 		int position = 1;
 70 | 		for (Sememe sememe : sememeList) {
 71 | 			// Roots are their own parent; skip them so they are not their own child.
 72 | 			if (sememe.getParentId() == parentId && sememe.getId() != parentId) {
 73 | 				String fullId = fullParentId + "-" + (position++);
 74 | 				root.appendChild(createSememeElement(document, sememe, fullId));
 75 | 				processXML(document, root, sememe.getId(), fullId);
 76 | 			}
 77 | 		}
 78 | 	}
 79 | 
 80 | 	/**
 81 | 	 * Writes the loaded sememes to an XML file encoded as UTF-8, in the form
 82 | 	 * &lt;sememes>
 83 | 	 *   &lt;sememe cn="事件" en="event" id="1"/>
 84 | 	 *   &lt;sememe cn="静态" en="static" id="1-1"/>
 85 | 	 * ...
 86 | 	 * &lt;/sememes>
 87 | 	 * @param xmlFile path of the file to create or overwrite
 88 | 	 * @throws Exception if the document cannot be built, transformed or written
 89 | 	 */
 90 | 	public void saveToXML(String xmlFile) throws Exception {
 91 | 		DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
 92 | 		Document document = builder.newDocument();
 93 | 		Element root = document.createElement("sememes");
 94 | 		document.appendChild(root);
 95 | 		int position = 1;
 96 | 		for (Sememe sememe : sememeList) {
 97 | 			// Only roots (own parent) start a subtree; processXML emits the rest.
 98 | 			if (sememe.getId() != sememe.getParentId()) {
 99 | 				continue;
100 | 			}
101 | 			String fullId = Integer.toString(position++);
102 | 			root.appendChild(createSememeElement(document, sememe, fullId));
103 | 			processXML(document, root, sememe.getId(), fullId);
104 | 		}
105 | 
106 | 		Transformer transformer = TransformerFactory.newInstance().newTransformer();
107 | 		transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
108 | 		transformer.setOutputProperty(OutputKeys.INDENT, "yes");
109 | 		// Hand the transformer a byte stream so it applies the declared encoding itself
110 | 		// (a PrintWriter would use the platform charset) and close the file when done.
111 | 		try (FileOutputStream out = new FileOutputStream(xmlFile)) {
112 | 			transformer.transform(new DOMSource(document), new StreamResult(out));
113 | 		}
114 | 	}
115 | 
116 | 	/**
117 | 	 * Parses the tree prefix of one dictionary line, for lines shaped like<br>
118 | 	 *  - entity|实体 <br>
119 | 	 *   ├ thing|万物 [#time|时间,#space|空间] <br>
120 | 	 *   │ ├ physical|物质 [!appearance|外观] <br>
121 | 	 *   │ │ ├ animate|生物 [*alive|活着,!age|年龄,*die|死,*metabolize|代谢] <br>
122 | 	 *
123 | 	 * @param item raw dictionary line
124 | 	 * @return info[0] is the depth (>= 0) and info[1] the index where the sememe text
125 | 	 *         starts; info[0] is -1 when the line is not a sememe entry
126 | 	 */
127 | 	private int[] parseSememeLine(String item) {
128 | 		int[] info = new int[2];
129 | 		info[0] = -1;
130 | 
131 | 		// Prefix width: a space or '-' counts 1, each box-drawing character counts 2.
132 | 		int prefixLen = 0;
133 | 		for (int i = 0; i < item.length(); i++) {
134 | 			char ch = item.charAt(i);
135 | 			if ((ch == ' ') || (ch == '-')) {
136 | 				prefixLen++;
137 | 			} else if ((ch == '├') || (ch == '│') || (ch == '└')) {
138 | 				prefixLen += 2;
139 | 			} else {
140 | 				// First content character: width 2 is depth 0, every further 3 adds one.
141 | 				if (prefixLen >= 2) {
142 | 					info[0] = (prefixLen - 2) / 3;
143 | 					info[1] = i;
144 | 				}
145 | 				break;
146 | 			}
147 | 		}
148 | 		return info;
149 | 	}
150 | 
151 | 	/**
152 | 	 * Derives the sememe type from the leading English name, ignoring case.
153 | 	 *
154 | 	 * @param item sememe text of a root entry
155 | 	 * @return the matching SememeType constant, or SememeType.Unknown if none matches
156 | 	 */
157 | 	private int parseSememeType(String item) {
158 | 		String myItem = item.toLowerCase().trim();
159 | 		String[] prefixes = { "event|", "entity|", "attribute|", "quantity|", "avalue|", "qvalue|",
160 | 				"secondary feature", "syntax", "eventrole and features" };
161 | 		int[] types = { SememeType.Event, SememeType.Entity, SememeType.Attribute, SememeType.Quantity,
162 | 				SememeType.AValue, SememeType.QValue, SememeType.SecondaryFeature, SememeType.Syntax,
163 | 				SememeType.EventRoleAndFeature };
164 | 		for (int i = 0; i < prefixes.length; i++) {
165 | 			if (myItem.startsWith(prefixes[i])) {
166 | 				return types[i];
167 | 			}
168 | 		}
169 | 		return SememeType.Unknown;
170 | 	}
171 | 
172 | 	/**
173 | 	 * Handles one dictionary line: blank lines and lines starting with '#' are skipped,
174 | 	 * every other well-formed line is appended to the sememe list with its parent link.
175 | 	 * (The original note says the traversal does not use the return value.)
176 | 	 * @param line raw dictionary line
177 | 	 * @return false when the line is not a sememe entry or no parent can be found for
178 | 	 *         it, true otherwise
179 | 	 */
180 | 	@Override
181 | 	public boolean visit(String line) {
182 | 		// Skip blank lines and comment lines.
183 | 		if (line.trim().equals("") || line.trim().charAt(0) == '#')
184 | 			return true;
185 | 
186 | 		// The new sememe's id is its index in the list.
187 | 		int position = sememeList.size();
188 | 
189 | 		// info[0]: depth of the entry, info[1]: index where the sememe text starts.
190 | 		int[] info = parseSememeLine(line);
191 | 		int curDepth = info[0];
192 | 		if (curDepth < 0)
193 | 			return false;
194 | 
195 | 		String sememeString = line.substring(info[1]);
196 | 
197 | 		if (curDepth == 0) {
198 | 			// Depth 0 marks a root, which is stored as its own parent.
199 | 			Sememe sememe = new Sememe(position, position, 0, sememeString);
200 | 			sememe.setType(parseSememeType(sememeString));
201 | 			sememeList.add(sememe);
202 | 			return true;
203 | 		}
204 | 
205 | 		// A child needs a predecessor to hang from; a nested first line is malformed.
206 | 		if (position == 0)
207 | 			return false;
208 | 
209 | 		// The parent is the closest preceding sememe exactly one level shallower.
210 | 		Sememe parentSememe = sememeList.get(position - 1);
211 | 		while ((parentSememe.getDepth() - curDepth) != -1) {
212 | 			// Reaching a root without a match means the line skipped a level; bail
213 | 			// out instead of looping forever on the root's self-reference.
214 | 			if (parentSememe.getId() == parentSememe.getParentId())
215 | 				return false;
216 | 			parentSememe = sememeList.get(parentSememe.getParentId());
217 | 		}
218 | 		Sememe sememe = new Sememe(position, parentSememe.getId(), curDepth, sememeString);
219 | 		sememe.setType(parentSememe.getType());
220 | 		sememeList.add(sememe);
221 | 		return true;
222 | 	}
223 | 
224 | }
225 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet/sememe/SememeParser.java:
--------------------------------------------------------------------------------
  1 | package zx.soft.similarity.word.hownet.sememe;
  2 | 
  3 | import java.io.IOException;
  4 | import java.io.InputStream;
  5 | import java.util.Collection;
  6 | 
  7 | import org.slf4j.Logger;
  8 | import org.slf4j.LoggerFactory;
  9 | 
 10 | import zx.soft.similarity.Similaritable;
 11 | import zx.soft.similarity.util.BlankUtils;
 12 | import zx.soft.similarity.util.FileUtils;
 13 | import zx.soft.similarity.word.hownet.HownetMeta;
 14 | 
 15 | /**
 16 |  * Sememe parser covering loading, indexing and lookup of sememe data as well as the
 17 |  * distance and similarity computations between sememes. The core idea is described in
 18 |  * the paper "汉语词语语义相似度计算研究".
 19 |  * @see zx.soft.similarity.Similaritable
 20 |  * @deprecated
 21 |  */
 22 | @Deprecated
 23 | public abstract class SememeParser implements HownetMeta, Similaritable {
 24 | 
 25 | 	protected Logger logger = LoggerFactory.getLogger(this.getClass());
 26 | 
 27 | 	/** All sememes in one array; a sememe's id equals its array index. */
 28 | 	protected Sememe[] SEMEMES;
 29 | 
 30 | 	/** Index from a sememe's Chinese name to the ids of the sememes carrying it. */
 31 | 	private FastSimpleMap<String, Integer> sememeMap = null;
 32 | 
 33 | 	/**
 34 | 	 * Loads the sememe dictionary from the class-path resource data/sememe.dat.
 35 | 	 * @throws IOException if the resource is missing or cannot be read
 36 | 	 */
 37 | 	public SememeParser() throws IOException {
 38 | 		// try-with-resources closes the stream even when loading fails.
 39 | 		try (InputStream input = this.getClass().getClassLoader().getResourceAsStream("data/sememe.dat")) {
 40 | 			if (input == null) {
 41 | 				throw new IOException("sememe dictionary resource not found: data/sememe.dat");
 42 | 			}
 43 | 			load(input, "UTF-8");
 44 | 		}
 45 | 	}
 46 | 
 47 | 	/**
 48 | 	 * Computes the similarity of two sememe description strings.
 49 | 	 * @param sememeName1 first sememe description
 50 | 	 * @param sememeName2 second sememe description
 51 | 	 * @return the similarity score defined by the concrete parser
 52 | 	 */
 53 | 	@Override
 54 | 	public abstract double getSimilarity(String sememeName1, String sememeName2);
 55 | 
 56 | 	/**
 57 | 	 * Computes the similarity of two resolved sememes.
 58 | 	 * @param sememe1 first sememe
 59 | 	 * @param sememe2 second sememe
 60 | 	 * @return the similarity score defined by the concrete parser
 61 | 	 */
 62 | 	public abstract double getSimilarity(Sememe sememe1, Sememe sememe2);
 63 | 
 64 | 	/**
 65 | 	 * Reads the sememe dictionary from {@code input} and rebuilds the name index.
 66 | 	 * @param input stream holding the dictionary text
 67 | 	 * @param encoding character encoding of the stream
 68 | 	 * @throws IOException if reading fails
 69 | 	 */
 70 | 	public void load(InputStream input, String encoding) throws IOException {
 71 | 		SememeDictTraverseEvent event = new SememeDictTraverseEvent();
 72 | 		logger.info("loading sememe dictionary...");
 73 | 		long time = System.currentTimeMillis();
 74 | 		FileUtils.traverseLines(input, encoding, event);
 75 | 		this.SEMEMES = event.getSememes();
 76 | 
 77 | 		// Index every sememe id under its Chinese name.
 78 | 		String[] keys = new String[SEMEMES.length];
 79 | 		Integer[] values = new Integer[SEMEMES.length];
 80 | 		for (int i = 0; i < SEMEMES.length; i++) {
 81 | 			keys[i] = SEMEMES[i].getCnWord();
 82 | 			values[i] = SEMEMES[i].getId();
 83 | 		}
 84 | 		sememeMap = new FastSimpleMap<String, Integer>(keys, values);
 85 | 
 86 | 		time = System.currentTimeMillis() - time;
 87 | 		logger.info("sememe dictionary load completely. time elapsed:{}", time);
 88 | 	}
 89 | 
 90 | 	/**
 91 | 	 * Distance between two sememes given by Chinese name. Several sememes may share a
 92 | 	 * name, so the smallest distance over all candidate pairs is returned.
 93 | 	 * @param key1 Chinese name of the first sememe
 94 | 	 * @param key2 Chinese name of the second sememe
 95 | 	 * @return 0 for equal names, Integer.MAX_VALUE (infinite) if a name is unknown
 96 | 	 *         or no pair shares an ancestor, otherwise the smallest tree distance
 97 | 	 */
 98 | 	public int getDistance(String key1, String key2) {
 99 | 		// Equal names are at distance 0 even when they are not sememes at all.
100 | 		if (key1.equals(key2)) {
101 | 			return 0;
102 | 		}
103 | 
104 | 		Integer[] semArray1 = getSememes(key1);
105 | 		Integer[] semArray2 = getSememes(key2);
106 | 
107 | 		// Stays at "infinite" when either name resolves to no sememe.
108 | 		int distance = Integer.MAX_VALUE;
109 | 		for (int i : semArray1) {
110 | 			for (int j : semArray2) {
111 | 				distance = Math.min(distance, getDistance(SEMEMES[i], SEMEMES[j]));
112 | 			}
113 | 		}
114 | 		return distance;
115 | 	}
116 | 
117 | 	/**
118 | 	 * Distance between two sememes in the sememe tree, i.e. the number of parent
119 | 	 * links on the path joining them through their nearest common ancestor.
120 | 	 * @param sem1 first sememe, may be null
121 | 	 * @param sem2 second sememe, may be null
122 | 	 * @return the path length, or Integer.MAX_VALUE if either argument is null or
123 | 	 *         the two sememes have no common ancestor
124 | 	 */
125 | 	public int getDistance(Sememe sem1, Sememe sem2) {
126 | 		Sememe mysem1 = sem1;
127 | 		Sememe mysem2 = sem2;
128 | 		int distance = 0;
129 | 
130 | 		if (mysem1 == null || mysem2 == null)
131 | 			return Integer.MAX_VALUE;
132 | 
133 | 		// Lift the deeper sememe until both sit at the same depth.
134 | 		int level = mysem1.getDepth() - mysem2.getDepth();
135 | 		for (int i = 0; i < Math.abs(level); i++) {
136 | 			if (level > 0)
137 | 				mysem1 = SEMEMES[mysem1.getParentId()];
138 | 			else
139 | 				mysem2 = SEMEMES[mysem2.getParentId()];
140 | 			distance++;
141 | 		}
142 | 
143 | 		// Climb both branches in lock step until they meet.
144 | 		while (mysem1.getId() != mysem2.getId()) {
145 | 			// A root is its own parent: reaching one without meeting means the
146 | 			// sememes live in different trees, i.e. the distance is infinite.
147 | 			if (mysem1.getId() == mysem1.getParentId() || mysem2.getId() == mysem2.getParentId()) {
148 | 				distance = Integer.MAX_VALUE;
149 | 				break;
150 | 			}
151 | 
152 | 			mysem1 = SEMEMES[mysem1.getParentId()];
153 | 			mysem2 = SEMEMES[mysem2.getParentId()];
154 | 			distance += 2;
155 | 		}
156 | 
157 | 		return distance;
158 | 	}
159 | 
160 | 	/**
161 | 	 * Builds a textual path from the root down to the sememe named {@code key}.
162 | 	 * @param key Chinese name of the sememe
163 | 	 * @return "START" followed by "->name" for each node from root to sememe; just
164 | 	 *         "START" when no sememe is found for the name
165 | 	 */
166 | 	public String getPath(String key) {
167 | 		StringBuilder path = new StringBuilder();
168 | 
169 | 		Sememe sem = getSememe(key);
170 | 		while (sem != null && sem.getId() != sem.getParentId()) {
171 | 			path.insert(0, "->" + sem.getCnWord());
172 | 			sem = SEMEMES[sem.getParentId()];
173 | 		}
174 | 
175 | 		// The loop stops at the root without emitting it; add it here.
176 | 		if (sem != null) {
177 | 			path.insert(0, "->" + sem.getCnWord());
178 | 		}
179 | 		path.insert(0, "START");
180 | 		return path.toString();
181 | 	}
182 | 
183 | 	/**
184 | 	 * Looks up the ids of all sememes carrying the given name; a name may be shared
185 | 	 * by several sememes.
186 | 	 * @param sememeName Chinese name of the sememe
187 | 	 * @return the ids found in the index; an empty array if the index returns null
188 | 	 */
189 | 	public Integer[] getSememes(String sememeName) {
190 | 		Collection<Integer> ids = sememeMap.get(sememeName);
191 | 		// Defensive: treat a missing entry as "no sememes" rather than failing.
192 | 		return (ids == null) ? new Integer[0] : ids.toArray(new Integer[ids.size()]);
193 | 	}
194 | 
195 | 	/**
196 | 	 * Returns one sememe carrying the given name; most names map to a single sememe.
197 | 	 * @param sememeName Chinese name of the sememe
198 | 	 * @return the first matching sememe, or null when BlankUtils reports no ids
199 | 	 */
200 | 	public Sememe getSememe(String sememeName) {
201 | 		Integer[] ids = getSememes(sememeName);
202 | 
203 | 		if (BlankUtils.isBlank(ids)) {
204 | 			return null;
205 | 		} else {
206 | 			return SEMEMES[ids[0]];
207 | 		}
208 | 	}
209 | 
210 | 	/**
211 | 	 * Drops the English part of a sememe string, i.e. everything up to the first '|'.
212 | 	 * @param sememeString sememe text such as "thing|万物"
213 | 	 * @return the text after the first '|', or the input unchanged if it has none
214 | 	 */
215 | 	protected String filterSememeString(String sememeString) {
216 | 		int pos = sememeString.indexOf("|");
217 | 		if (pos >= 0) {
218 | 			sememeString = sememeString.substring(pos + 1);
219 | 		}
220 | 		return sememeString;
221 | 	}
222 | 
223 | }
224 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet/sememe/SememeType.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.word.hownet.sememe;
 2 | 
 3 | /**
 4 |  * Numeric codes for the sememe types.<br/>
 5 |  * <ul>
 6 |  * <li>0: unknown</li>
 7 |  * <li>1: Event</li>
 8 |  * <li>2: Entity</li>
 9 |  * <li>3: Attribute</li>
10 |  * <li>4: Quantity</li>
11 |  * <li>5: aValue (attribute value)</li>
12 |  * <li>6: qValue (quantity value)</li>
13 |  * <li>7: Secondary Feature</li>
14 |  * <li>8: Syntax</li>
15 |  * <li>9: EventRole (dynamic role)</li>
16 |  * <li>10: EventFeatures (dynamic feature)</li>
17 |  * </ul>
18 |  *
19 |  * Codes 1-7 are basic sememes, 8 is the syntax sememe, 9 and 10 are relation sememes.<br/>
20 |  *
21 |  * @deprecated
22 |  */
23 | @Deprecated
24 | public interface SememeType {
25 | 
26 | 	/** Code for an unknown type. */
27 | 	int Unknown = 0;
28 | 
29 | 	/** Code for the Event type. */
30 | 	int Event = 1;
31 | 
32 | 	/** Code for the Entity type. */
33 | 	int Entity = 2;
34 | 
35 | 	/** Code for the Attribute type. */
36 | 	int Attribute = 3;
37 | 
38 | 	/** Code for the Quantity type. */
39 | 	int Quantity = 4;
40 | 
41 | 	/** Code for the aValue (attribute value) type. */
42 | 	int AValue = 5;
43 | 
44 | 	/** Code for the qValue (quantity value) type. */
45 | 	int QValue = 6;
46 | 
47 | 	/** Code for the Secondary Feature type. */
48 | 	int SecondaryFeature = 7;
49 | 
50 | 	/** Code for the Syntax type. */
51 | 	int Syntax = 8;
52 | 
53 | 	/** Code for the EventRole (dynamic role) type. */
54 | 	int EventRoleAndFeature = 9;
55 | 
56 | 	/** Code for the EventFeatures (dynamic feature) type. */
57 | 	int EventFeature = 10;
58 | 
59 | }
60 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet2/concept/Concept.java:
--------------------------------------------------------------------------------
  1 | package zx.soft.similarity.word.hownet2.concept;
  2 | 
  3 | import java.util.ArrayList;
  4 | import java.util.HashSet;
  5 | import java.util.List;
  6 | import java.util.Set;
  7 | import java.util.StringTokenizer;
  8 | 
  9 | import zx.soft.similarity.word.hownet.HownetMeta;
 10 | 
 11 | /**
 12 |  * A HowNet concept. <br/>The example and English parts of a dictionary entry play no role
 13 |  * in the similarity computation, so they are left out of this class to save memory.
 14 |  */
 15 | public class Concept implements HownetMeta {
 16 | 
 17 | 	/** Chinese name of the concept. */
 18 | 	protected String word;
 19 | 	/** Part of speech. */
 20 | 	protected String pos;
 21 | 	/** Definition text; never null after construction. */
 22 | 	protected String define;
 23 | 
 24 | 	/** True for a substantive (content word), false for a function word. */
 25 | 	protected boolean bSubstantive;
 26 | 	/** First basic sememe; null when the definition holds no token. */
 27 | 	protected String mainSememe;
 28 | 	/** Other basic sememes. */
 29 | 	protected String[] secondSememes;
 30 | 	/** Relation sememes (items containing '=' after the first character). */
 31 | 	protected String[] relationSememes;
 32 | 	/** Symbol sememes (items whose first character is a relation symbol). */
 33 | 	protected String[] symbolSememes;
 34 | 
 35 | 	static String[][] Concept_Type = { { "=", "事件" }, { "aValue|属性值", "属性值" }, { "qValue|数量值", "数量值" },
 36 | 			{ "attribute|属性", "属性" }, { "quantity|数量", "数量" }, { "unit|", "单位" }, { "%", "部件" } };
 37 | 
 38 | 	/**
 39 | 	 * Creates a concept and splits its definition into the four sememe groups.
 40 | 	 *
 41 | 	 * @param word Chinese name of the concept
 42 | 	 * @param pos part of speech
 43 | 	 * @param def definition text; null is treated as the empty string
 44 | 	 */
 45 | 	public Concept(String word, String pos, String def) {
 46 | 		this.word = word;
 47 | 		this.pos = pos;
 48 | 		this.define = (def == null) ? "" : def.trim();
 49 | 
 50 | 		// Function words are written as {***}; everything else is a substantive.
 51 | 		if (define.length() > 0 && define.charAt(0) == '{' && define.charAt(define.length() - 1) == '}') {
 52 | 			this.bSubstantive = false;
 53 | 		} else {
 54 | 			this.bSubstantive = true;
 55 | 		}
 56 | 
 57 | 		parseDefine();
 58 | 	}
 59 | 
 60 | 	/**
 61 | 	 * Splits the definition at commas into the first basic sememe, the other basic
 62 | 	 * sememes, the relation sememes and the symbol sememes.
 63 | 	 */
 64 | 	private void parseDefine() {
 65 | 		List<String> secondList = new ArrayList<>(); // other basic sememes
 66 | 		List<String> relationList = new ArrayList<>(); // relation sememes
 67 | 		List<String> symbolList = new ArrayList<>(); // symbol sememes
 68 | 
 69 | 		// For a function word only the text inside "{...}" is parsed.
 70 | 		String tokenString = this.bSubstantive ? this.define : define.substring(1, define.length() - 1);
 71 | 
 72 | 		StringTokenizer token = new StringTokenizer(tokenString, ",", false);
 73 | 
 74 | 		// The first token is the first basic sememe.
 75 | 		if (token.hasMoreTokens()) {
 76 | 			this.mainSememe = token.nextToken();
 77 | 		}
 78 | 
 79 | 		// StringTokenizer never yields empty tokens, so every item has a first character.
 80 | 		while (token.hasMoreTokens()) {
 81 | 			String item = token.nextToken();
 82 | 			if (isSymbolSememe(item)) {
 83 | 				symbolList.add(item);
 84 | 			} else if (item.indexOf('=') > 0) {
 85 | 				// Not a symbol sememe: '=' marks a relation sememe.
 86 | 				relationList.add(item);
 87 | 			} else {
 88 | 				secondList.add(item);
 89 | 			}
 90 | 		}
 91 | 
 92 | 		this.secondSememes = secondList.toArray(new String[secondList.size()]);
 93 | 		this.relationSememes = relationList.toArray(new String[relationList.size()]);
 94 | 		this.symbolSememes = symbolList.toArray(new String[symbolList.size()]);
 95 | 	}
 96 | 
 97 | 	/** Tells whether the item's first character is one of the known relation symbols. */
 98 | 	private static boolean isSymbolSememe(String item) {
 99 | 		String symbol = item.substring(0, 1);
100 | 		for (int i = 0; i < Symbol_Descriptions.length; i++) {
101 | 			if (symbol.equals(Symbol_Descriptions[i][0])) {
102 | 				return true;
103 | 			}
104 | 		}
105 | 		return false;
106 | 	}
107 | 
108 | 	/**
109 | 	 * Returns the first basic sememe.
110 | 	 *
111 | 	 * @return the first sememe, or null when the definition holds no token
112 | 	 */
113 | 	public String getMainSememe() {
114 | 		return mainSememe;
115 | 	}
116 | 
117 | 	/**
118 | 	 * Returns the other basic sememes.
119 | 	 *
120 | 	 * @return the array built from the definition, possibly empty
121 | 	 */
122 | 	public String[] getSecondSememes() {
123 | 		return secondSememes;
124 | 	}
125 | 
126 | 	/**
127 | 	 * Returns the relation sememes.
128 | 	 *
129 | 	 * @return the array built from the definition, possibly empty
130 | 	 */
131 | 	public String[] getRelationSememes() {
132 | 		return relationSememes;
133 | 	}
134 | 
135 | 	/**
136 | 	 * Returns the symbol sememes.
137 | 	 *
138 | 	 * @return the array built from the definition, possibly empty
139 | 	 */
140 | 	public String[] getSymbolSememes() {
141 | 		return symbolSememes;
142 | 	}
143 | 
144 | 	/**
145 | 	 * Collects the names of all sememes of this concept: the main sememe, the value
146 | 	 * part of each relation sememe, each symbol sememe without its symbol, and the
147 | 	 * other basic sememes.
148 | 	 *
149 | 	 * @return a new set of names; it never contains null
150 | 	 */
151 | 	public Set<String> getAllSememeNames() {
152 | 		Set<String> names = new HashSet<>();
153 | 
154 | 		// The main sememe is missing for an empty definition; do not add null.
155 | 		if (getMainSememe() != null) {
156 | 			names.add(getMainSememe());
157 | 		}
158 | 
159 | 		// Relation sememes contribute the part after '='.
160 | 		for (String item : getRelationSememes()) {
161 | 			names.add(item.substring(item.indexOf("=") + 1));
162 | 		}
163 | 
164 | 		// Symbol sememes contribute the part after the leading symbol.
165 | 		for (String item : getSymbolSememes()) {
166 | 			names.add(item.substring(1));
167 | 		}
168 | 
169 | 		// Other basic sememes are added unchanged.
170 | 		for (String item : getSecondSememes()) {
171 | 			names.add(item);
172 | 		}
173 | 		return names;
174 | 	}
175 | 
176 | 	@Override
177 | 	public String toString() {
178 | 		StringBuilder sb = new StringBuilder();
179 | 		sb.append("name=");
180 | 		sb.append(this.word);
181 | 		sb.append("; pos=");
182 | 		sb.append(this.pos);
183 | 		sb.append("; define=");
184 | 		sb.append(this.define);
185 | 		sb.append("; 第一基本义元:[" + mainSememe);
186 | 
187 | 		sb.append("]; 其他基本义元描述:[");
188 | 		for (String sem : secondSememes) {
189 | 			sb.append(sem);
190 | 			sb.append(";");
191 | 		}
192 | 
193 | 		sb.append("]; [关系义元描述:");
194 | 		for (String sem : relationSememes) {
195 | 			sb.append(sem);
196 | 			sb.append(";");
197 | 		}
198 | 
199 | 		sb.append("]; [关系符号描述:");
200 | 		for (String sem : symbolSememes) {
201 | 			sb.append(sem);
202 | 			sb.append(";");
203 | 		}
204 | 		sb.append("]");
205 | 		return sb.toString();
206 | 	}
207 | 
208 | 	/**
209 | 	 * Tells whether this concept is a substantive or a function word.
210 | 	 *
211 | 	 * @return true for a substantive, false for a function word
212 | 	 */
213 | 	public boolean isSubstantive() {
214 | 		return this.bSubstantive;
215 | 	}
216 | 
217 | 	public String getWord() {
218 | 		return word;
219 | 	}
220 | 
221 | 	public void setWord(String word) {
222 | 		this.word = word;
223 | 	}
224 | 
225 | 	public String getPos() {
226 | 		return pos;
227 | 	}
228 | 
229 | 	public void setPos(String pos) {
230 | 		this.pos = pos;
231 | 	}
232 | 
233 | 	public String getDefine() {
234 | 		return define;
235 | 	}
236 | 
237 | 	/** Replaces the definition text only; the parsed sememe groups are not rebuilt. */
238 | 	public void setDefine(String define) {
239 | 		this.define = define;
240 | 	}
241 | 
242 | 	/**
243 | 	 * Classifies the concept by the first Concept_Type marker found in its definition
244 | 	 * (case-insensitive substring match, checked in table order).
245 | 	 *
246 | 	 * @return the label of the first matching marker, or "普通概念" if none matches
247 | 	 */
248 | 	public String getType() {
249 | 		// Upper-case the definition once instead of once per table row.
250 | 		String upperDefine = define.toUpperCase();
251 | 		for (int i = 0; i < Concept_Type.length; i++) {
252 | 			if (upperDefine.indexOf(Concept_Type[i][0].toUpperCase()) >= 0) {
253 | 				return Concept_Type[i][1];
254 | 			}
255 | 		}
256 | 		return "普通概念";
257 | 	}
258 | 
259 | 	@Override
260 | 	public int hashCode() {
261 | 		return define == null ? word.hashCode() : define.hashCode();
262 | 	}
263 | 
264 | 	/** Two concepts are equal when both their word and their definition are equal. */
265 | 	@Override
266 | 	public boolean equals(Object anObject) {
267 | 		if (anObject instanceof Concept) {
268 | 			Concept c = (Concept) anObject;
269 | 			return word.equals(c.word) && define.equals(c.define);
270 | 		} else {
271 | 			return false;
272 | 		}
273 | 	}
274 | 
275 | }


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet2/concept/ConceptDictTraverseEvent.java:
--------------------------------------------------------------------------------
  1 | package zx.soft.similarity.word.hownet2.concept;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.File;
  5 | import java.io.FileOutputStream;
  6 | import java.io.InputStream;
  7 | import java.io.InputStreamReader;
  8 | import java.io.PrintWriter;
  9 | import java.util.ArrayList;
 10 | import java.util.Arrays;
 11 | import java.util.List;
 12 | 
 13 | import javax.xml.parsers.DocumentBuilder;
 14 | import javax.xml.parsers.DocumentBuilderFactory;
 15 | import javax.xml.transform.OutputKeys;
 16 | import javax.xml.transform.Transformer;
 17 | import javax.xml.transform.TransformerFactory;
 18 | import javax.xml.transform.dom.DOMSource;
 19 | import javax.xml.transform.stream.StreamResult;
 20 | 
 21 | import org.w3c.dom.Document;
 22 | import org.w3c.dom.Element;
 23 | 
 24 | import zx.soft.similarity.util.TraverseEvent;
 25 | 
 26 | /**
 27 |  * 实现遍历加载概念信息到概念表中, 概念词典的组织以知网导出的格式为标准，格式如下：<br/>
 28 |  * 阿斗                	N    	human|人,ProperName|专,past|昔<br/>
 29 |  * 阿爸                	N    	human|人,family|家,male|男<br/>
 30 |  * 即： &lt;概念&gt; &lt;空格或者跳格&gt; &lt;词性&gt; &lt;空格或者跳格&gt; &lt;定义&gt;"
 31 |  * <br/>
 32 |  * 概念保存到数组中，没有保存到Map中，可以降低对内存空间的使用
 33 |  *
 34 |  */
 35 | public class ConceptDictTraverseEvent implements TraverseEvent<String> {
 36 | 
 37 | 	private List<Concept> conceptList = null;
 38 | 
 39 | 	public ConceptDictTraverseEvent() {
 40 | 		conceptList = new ArrayList<>();
 41 | 	}
 42 | 
 43 | 	public Concept[] getConcepts() {
 44 | 		Concept[] concepts = conceptList.toArray(new Concept[conceptList.size()]);
 45 | 		Arrays.sort(concepts);
 46 | 		return concepts;
 47 | 	}
 48 | 
 49 | 	/**
 50 | 	 * 读取概念词典中的一行，并进行解析处理
 51 | 	 */
 52 | 	@Override
 53 | 	public boolean visit(String line) {
 54 | 		String word = null;
 55 | 		String pos = null;
 56 | 		String define = "";
 57 | 		char ch;
 58 | 
 59 | 		//以符号//开始的是注释行
 60 | 		if (line.startsWith("//")) {
 61 | 			return true;
 62 | 		}
 63 | 
 64 | 		int lastPosition = 0; //最近一次处理内容的有意义的开始位置
 65 | 		int processFlag = 0; //当前处理部分的标志 0：处理word； 1：词性；2：定义
 66 | 		//解析出一行中的概念各项数据
 67 | 		loop: for (int position = 0; position < line.length(); position++) {
 68 | 			ch = line.charAt(position);
 69 | 
 70 | 			if ((ch == ' ') || (ch == '\t') || (position == (line.length() - 1))) {
 71 | 				String item = line.substring(lastPosition, (position == (line.length() - 1)) ? (position + 1)
 72 | 						: position);
 73 | 				switch (processFlag) {
 74 | 				case 0:
 75 | 					word = item;
 76 | 					processFlag++;
 77 | 					break;
 78 | 				case 1:
 79 | 					pos = item;
 80 | 					processFlag++;
 81 | 					break;
 82 | 				case 2:
 83 | 					//define = item;
 84 | 					//processFlag++;
 85 | 					define = line.substring(lastPosition).trim();
 86 | 					break loop;
 87 | 				case 3:
 88 | 					System.out.println(line);
 89 | 					break;
 90 | 				}
 91 | 
 92 | 				for (; (position < line.length()); position++) {
 93 | 					ch = line.charAt(position);
 94 | 					if ((ch != ' ') && (ch != '\t')) {
 95 | 						lastPosition = position;
 96 | 						break;
 97 | 					}
 98 | 				}
 99 | 
100 | 			}
101 | 		}
102 | 		conceptList.add(new Concept(word, pos, define));
103 | 		return true;
104 | 	}
105 | 
106 | 	public void saveToXML(File xmlFile) throws Exception {
107 | 		InputStream input = this.getClass().getClassLoader().getResourceAsStream("data/concept.dat");
108 | 		BufferedReader in = new BufferedReader(new InputStreamReader(input, "utf8"));
109 | 
110 | 		DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
111 | 		DocumentBuilder builder = factory.newDocumentBuilder();
112 | 		Document document = builder.newDocument();
113 | 		Element root = document.createElement("concepts");
114 | 		document.appendChild(root);
115 | 
116 | 		String line = null;
117 | 
118 | 		while ((line = in.readLine()) != null) {
119 | 			saveLineToXML(document, root, line);
120 | 		}
121 | 
122 | 		input.close();
123 | 		in.close();
124 | 
125 | 		TransformerFactory tf = TransformerFactory.newInstance();
126 | 		Transformer transformer = tf.newTransformer();
127 | 		DOMSource source = new DOMSource(document);
128 | 		transformer.setOutputProperty(OutputKeys.ENCODING, "utf8");
129 | 		transformer.setOutputProperty(OutputKeys.INDENT, "yes");
130 | 		PrintWriter pw = new PrintWriter(new FileOutputStream(xmlFile));
131 | 		StreamResult result = new StreamResult(pw);
132 | 		transformer.transform(source, result);
133 | 	}
134 | 
135 | 	/**
136 | 	 * 读取概念词典中的一行，并进行解析处理
137 | 	 */
138 | 	private boolean saveLineToXML(Document document, Element root, String line) {
139 | 		String word = null;
140 | 		String pos = null;
141 | 		String define = "";
142 | 		char ch;
143 | 
144 | 		//以符号//开始的是注释行
145 | 		if (line.startsWith("//")) {
146 | 			return true;
147 | 		}
148 | 
149 | 		int lastPosition = 0; //最近一次处理内容的有意义的开始位置
150 | 		int processFlag = 0; //当前处理部分的标志 0：处理word； 1：词性；2：定义
151 | 		//解析出一行中的概念各项数据
152 | 		loop: for (int position = 0; position < line.length(); position++) {
153 | 			ch = line.charAt(position);
154 | 
155 | 			if ((ch == ' ') || (ch == '\t') || (position == (line.length() - 1))) {
156 | 				String item = line.substring(lastPosition, (position == (line.length() - 1)) ? (position + 1)
157 | 						: position);
158 | 				switch (processFlag) {
159 | 				case 0:
160 | 					word = item;
161 | 					processFlag++;
162 | 					break;
163 | 				case 1:
164 | 					pos = item;
165 | 					processFlag++;
166 | 					break;
167 | 				case 2:
168 | 					//define = item;
169 | 					//processFlag++;
170 | 					define = line.substring(lastPosition).trim();
171 | 					break loop;
172 | 				case 3:
173 | 					System.out.println(line);
174 | 					break;
175 | 				}
176 | 
177 | 				for (; (position < line.length()); position++) {
178 | 					ch = line.charAt(position);
179 | 					if ((ch != ' ') && (ch != '\t')) {
180 | 						lastPosition = position;
181 | 						break;
182 | 					}
183 | 				}
184 | 
185 | 			}
186 | 		}
187 | 
188 | 		Element e = document.createElement("c");
189 | 		e.setAttribute("w", word);
190 | 		e.setAttribute("p", pos);
191 | 		e.setAttribute("d", define);
192 | 		root.appendChild(e);
193 | 		return true;
194 | 	}
195 | 
196 | 	public static void main(String[] args) throws Exception {
197 | 		new ConceptDictTraverseEvent().saveToXML(new File("/home/xiatian/Desktop/concept.xml"));
198 | 	}
199 | 
200 | }
201 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet2/concept/ConceptLinkedList.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.word.hownet2.concept;
 2 | 
 3 | import java.util.LinkedList;
 4 | 
 5 | /**
 6 |  * 用于概念处理的LinkedList
 7 |  * 
 8 |  * @param <T>
 9 |  */
10 | public class ConceptLinkedList extends LinkedList<Concept> {
11 | 
12 | 	private static final long serialVersionUID = -1889819083192992375L;
13 | 
14 | 	/**
15 | 	 * 删除链表中最后面的size个元素
16 | 	 * @param size
17 | 	 */
18 | 	public void removeLast(int size) {
19 | 		for (int i = 0; i < size; i++) {
20 | 			this.removeLast();
21 | 		}
22 | 	}
23 | 
24 | 	/**
25 | 	 * 根据概念的定义判断是否已经加入到链表中
26 | 	 * @param concept
27 | 	 */
28 | 	public void addByDefine(Concept concept) {
29 | 		for (Concept c : this) {
30 | 			if (c.getDefine().equals(concept.getDefine())) {
31 | 				return;
32 | 			}
33 | 		}
34 | 
35 | 		this.add(concept);
36 | 	}
37 | 
38 | }
39 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet2/concept/LiuConceptParser.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.word.hownet2.concept;
 2 | 
 3 | import java.io.IOException;
 4 | import java.util.Collection;
 5 | 
 6 | import zx.soft.similarity.util.BlankUtils;
 7 | import zx.soft.similarity.word.hownet2.sememe.BaseSememeParser;
 8 | import zx.soft.similarity.word.hownet2.sememe.LiuqunSememeParser;
 9 | 
10 | /**
11 |  * 相似度计算方式，对概念解析的处理方式
12 |  * 
13 |  */
14 | public class LiuConceptParser extends BaseConceptParser {
15 | 
16 | 	private static LiuConceptParser instance = null;
17 | 
18 | 	public static LiuConceptParser getInstance() {
19 | 		if (instance == null) {
20 | 			try {
21 | 				instance = new LiuConceptParser();
22 | 			} catch (IOException e) {
23 | 				e.printStackTrace();
24 | 			}
25 | 		}
26 | 
27 | 		return instance;
28 | 	}
29 | 
30 | 	private LiuConceptParser(BaseSememeParser sememeParser) throws IOException {
31 | 		super(sememeParser);
32 | 	}
33 | 
34 | 	private LiuConceptParser() throws IOException {
35 | 		super(new LiuqunSememeParser());
36 | 	}
37 | 
38 | 	@Override
39 | 	protected double calculate(double sim_v1, double sim_v2, double sim_v3, double sim_v4) {
40 | 		return beta1 * sim_v1 + beta2 * sim_v1 * sim_v2 + beta3 * sim_v1 * sim_v2 * sim_v3 + beta4 * sim_v1 * sim_v2
41 | 				* sim_v3 * sim_v4;
42 | 	}
43 | 
44 | 	@Override
45 | 	public double getSimilarity(String word1, String word2) {
46 | 		double similarity = 0.0;
47 | 
48 | 		// 如果两个句子相同,则直接返回1.0
49 | 		if (word1.equals(word2)) {
50 | 			return 1.0;
51 | 		}
52 | 
53 | 		Collection<Concept> concepts1 = getConcepts(word1);
54 | 		Collection<Concept> concepts2 = getConcepts(word2);
55 | 
56 | 		//如果是blank，则说明是未登录词, 需要计算组合概念
57 | 		if (BlankUtils.isBlank(concepts1) || BlankUtils.isBlank(concepts2)) {
58 | 			return 0.0;
59 | 		}
60 | 
61 | 		//两个for循环分别计算词语所有可能的概念的相似度
62 | 		for (Concept c1 : concepts1) {
63 | 			for (Concept c2 : concepts2) {
64 | 				double v = getSimilarity(c1, c2);
65 | 
66 | 				if (v > similarity) {
67 | 					similarity = v;
68 | 				}
69 | 
70 | 				if (similarity == 1.0) {
71 | 					break;
72 | 				}
73 | 			}
74 | 		}
75 | 
76 | 		return similarity;
77 | 	}
78 | 
79 | }
80 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet2/concept/concept.xml.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ml-distribution/semantic-similarity/8deb4382a41ad94a82db15ea945c6bfa8ac103f0/src/main/java/zx/soft/similarity/word/hownet2/concept/concept.xml.gz


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet2/sememe/BaseSememeParser.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.word.hownet2.sememe;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.InputStream;
 5 | import java.util.zip.GZIPInputStream;
 6 | 
 7 | import javax.xml.namespace.QName;
 8 | import javax.xml.stream.XMLEventReader;
 9 | import javax.xml.stream.XMLInputFactory;
10 | import javax.xml.stream.events.StartElement;
11 | import javax.xml.stream.events.XMLEvent;
12 | 
13 | import org.slf4j.Logger;
14 | import org.slf4j.LoggerFactory;
15 | 
16 | import zx.soft.similarity.Similaritable;
17 | import zx.soft.similarity.word.hownet.HownetMeta;
18 | 
19 | import com.google.common.collect.HashMultimap;
20 | import com.google.common.collect.Multimap;
21 | 
22 | /**
23 |  * 义原解析器基类，所有义原存储在xml文件中（当前package中的sememe.xml.tar.gz文件）。<br/>
24 |  * 算法的核心思想请参看论文《汉语词语语义相似度计算研究》或《中文信息相似度计算理论与方法》一书第三章<br/>
25 |  *
26 |  * 为提高运算速度，义原的加载方式做了调整，只把义原的汉语定义和对应的Id加入到MultiMap对象中，并通过义原的层次化Id计算义原之间的相似度。<br/>
27 |  *
28 |  * @see {@link zx.soft.similarity.Similaritable}
29 |  */
30 | public abstract class BaseSememeParser implements HownetMeta, Similaritable {
31 | 
32 | 	protected Logger logger = LoggerFactory.getLogger(this.getClass());
33 | 
34 | 	/** 所有的义原都存放到一个MultiMap, Key为Sememe的中文定义, Value为义原的Id */
35 | 	protected static Multimap<String, String> SEMEMES = null;
36 | 
37 | 	public BaseSememeParser() throws IOException {
38 | 		if (SEMEMES != null) {
39 | 			return;
40 | 		}
41 | 
42 | 		SEMEMES = HashMultimap.create();
43 | 
44 | 		InputStream input = this.getClass().getClassLoader().getResourceAsStream("data/sememe.xml.gz");
45 | 		input = new GZIPInputStream(input);
46 | 		load(input);
47 | 	}
48 | 
49 | 	/**
50 | 	 * 从文件中加载义元知识
51 | 	 *
52 | 	 * @throws IOException
53 | 	 */
54 | 	public void load(InputStream input) throws IOException {
55 | 		System.out.print("loading sememes...");
56 | 		long time = System.currentTimeMillis();
57 | 		try {
58 | 			XMLInputFactory inputFactory = XMLInputFactory.newInstance();
59 | 			XMLEventReader xmlEventReader = inputFactory.createXMLEventReader(input);
60 | 
61 | 			int count = 0;
62 | 			while (xmlEventReader.hasNext()) {
63 | 				XMLEvent event = xmlEventReader.nextEvent();
64 | 
65 | 				if (event.isStartElement()) {
66 | 					StartElement startElement = event.asStartElement();
67 | 					if (startElement.getName().toString().equals("sememe")) {
68 | 						String cnWord = startElement.getAttributeByName(QName.valueOf("cn")).getValue();
69 | 						String id = startElement.getAttributeByName(QName.valueOf("id")).getValue();
70 | 						SEMEMES.put(cnWord, id);
71 | 						count++;
72 | 						if (count % 100 == 0) {
73 | 							System.out.print(".");
74 | 						}
75 | 					}
76 | 				}
77 | 			}
78 | 			input.close();
79 | 		} catch (Exception e) {
80 | 			throw new IOException(e);
81 | 		}
82 | 		time = System.currentTimeMillis() - time;
83 | 		System.out.println("\ncomplete!. time elapsed: " + (time / 1000) + "s");
84 | 	}
85 | 
86 | 	/**
87 | 	 * 计算两个义原之间的关联度
88 | 	 *
89 | 	 * @param sememeName1
90 | 	 * @param sememeName2
91 | 	 * @return
92 | 	 */
93 | 	public double getAssociation(String sememeName1, String sememeName2) {
94 | 		return 0.0;
95 | 	}
96 | 
97 | }
98 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet2/sememe/LiuqunSememeParser.java:
--------------------------------------------------------------------------------
  1 | package zx.soft.similarity.word.hownet2.sememe;
  2 | 
  3 | import java.io.IOException;
  4 | import java.util.Collection;
  5 | 
  6 | /**
  7 |  * 计算义原相似度的方法, 实现了SememeParser中定义的抽象方法
  8 |  * 
  9 |  */
 10 | public class LiuqunSememeParser extends BaseSememeParser {
 11 | 
 12 | 	/** 计算义元相似度的可调节的参数，默认为1.6 */
 13 | 	private final float alpha = 1.6f;
 14 | 
 15 | 	public LiuqunSememeParser() throws IOException {
 16 | 		super();
 17 | 	}
 18 | 
 19 | 	/**
 20 | 	 * 计算两个义元之间的相似度，由于义元可能相同，计算结果为其中相似度最大者 
 21 | 	 * <br/>similarity = alpha/(distance+alpha)
 22 | 	 * 
 23 | 	 * @param key1
 24 | 	 * @param key2
 25 | 	 * @return
 26 | 	 */
 27 | 	@Override
 28 | 	public double getSimilarity(String item1, String item2) {
 29 | 		int pos;
 30 | 
 31 | 		// 如果为空串，直接返回0
 32 | 		if (item1 == null || item2 == null || item1.equals("") || item2.equals(""))
 33 | 			return 0.0;
 34 | 
 35 | 		String key1 = item1.trim();
 36 | 		String key2 = item2.trim();
 37 | 
 38 | 		// 去掉()符号
 39 | 		if ((key1.charAt(0) == '(') && (key1.charAt(key1.length() - 1) == ')')) {
 40 | 			if (key2.charAt(0) == '(' && key2.charAt(key2.length() - 1) == ')') {
 41 | 				key1 = key1.substring(1, key1.length() - 1);
 42 | 				key2 = key2.substring(1, key2.length() - 1);
 43 | 			} else {
 44 | 				return 0.0;
 45 | 			}
 46 | 		}
 47 | 
 48 | 		// 处理关系义元,即x=y的情况
 49 | 		if ((pos = key1.indexOf('=')) > 0) {
 50 | 			int pos2 = key2.indexOf('=');
 51 | 			// 如果是关系义元，则判断前面部分是否相同，如果相同，则转为计算后面部分的相似度，否则为0
 52 | 			if ((pos == pos2) && key1.substring(0, pos).equals(key2.substring(0, pos2))) {
 53 | 				key1 = key1.substring(pos + 1);
 54 | 				key2 = key2.substring(pos2 + 1);
 55 | 			} else {
 56 | 				return 0.0;
 57 | 			}
 58 | 		}
 59 | 
 60 | 		// 处理符号义元,即前面有特殊符号的义元
 61 | 		String symbol1 = key1.substring(0, 1);
 62 | 		String symbol2 = key2.substring(0, 1);
 63 | 
 64 | 		for (int i = 0; i < Symbol_Descriptions.length; i++) {
 65 | 			if (symbol1.equals(Symbol_Descriptions[i][0])) {
 66 | 				if (symbol1.equals(symbol2)) {
 67 | 					key1 = item1.substring(1);
 68 | 					key2 = item2.substring(1);
 69 | 					break;
 70 | 				} else {
 71 | 					return 0.0; // 如果不是同一关系符号，则相似度直接返回0
 72 | 				}
 73 | 			}
 74 | 		}
 75 | 
 76 | 		if ((pos = key1.indexOf("|")) >= 0) {
 77 | 			key1 = key1.substring(pos + 1);
 78 | 		}
 79 | 		if ((pos = key2.indexOf("|")) >= 0) {
 80 | 			key2 = key2.substring(pos + 1);
 81 | 		}
 82 | 
 83 | 		int distance = getMinDistance(key1, key2);
 84 | 		return alpha / (distance + alpha);
 85 | 	}
 86 | 
 87 | 	/**
 88 | 	 * 根据汉语定义计算义原之间的距离,Integer.MAX_VALUE代表两个义元之间的距离为无穷大，由于可能多个义元有相同的汉语词语，
 89 | 	 * 故计算结果为其中距离最小者
 90 | 	 * 
 91 | 	 * @param key1
 92 | 	 * @param key2
 93 | 	 * @return
 94 | 	 */
 95 | 	public int getMinDistance(String sememe1, String sememe2) {
 96 | 		int distance = Integer.MAX_VALUE;
 97 | 
 98 | 		// 如果两个字符串相等，直接返回距离为0
 99 | 		if (sememe1.equals(sememe2)) {
100 | 			return 0;
101 | 		}
102 | 
103 | 		Collection<String> sememeIds1 = SEMEMES.get(sememe1);
104 | 		Collection<String> sememeIds2 = SEMEMES.get(sememe2);
105 | 
106 | 		// 如果sememe1或者sememe2不是义元,则返回无穷大
107 | 		if (sememeIds1.size() == 0 || sememeIds1.size() == 0) {
108 | 			return Integer.MAX_VALUE;
109 | 		}
110 | 
111 | 		for (String id1 : sememeIds1) {
112 | 			for (String id2 : sememeIds2) {
113 | 				int d = getDistance(id1, id2);
114 | 				if (d < distance) {
115 | 					distance = d;
116 | 				}
117 | 			}
118 | 		}
119 | 
120 | 		return distance;
121 | 	}
122 | 
123 | 	/**
124 | 	 * 根据义原的具有层次的Id获取两个义原之间的语义距离
125 | 	 * @param id1
126 | 	 * @param id2
127 | 	 * @return
128 | 	 */
129 | 	int getDistance(String id1, String id2) {
130 | 		// 两个Id相同的位置终止地方
131 | 		int position = 0;
132 | 		String[] array1 = id1.split("-");
133 | 		String[] array2 = id2.split("-");
134 | 		for (position = 0; position < array1.length && position < array2.length; position++) {
135 | 			if (!array1[position].equals(array2[position])) {
136 | 				return array1.length + array2.length - position - position;
137 | 			}
138 | 		}
139 | 
140 | 		if (array1.length == array2.length) {
141 | 			return 0;
142 | 		} else if (array1.length == position) {
143 | 			return array2.length - position;
144 | 		} else {
145 | 			return array1.length - position;
146 | 		}
147 | 	}
148 | 
149 | }
150 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet2/sememe/Sememe.java:
--------------------------------------------------------------------------------
  1 | package zx.soft.similarity.word.hownet2.sememe;
  2 | 
  3 | /**
  4 |  * 描述知网义原的基本对象, 出于性能考虑，把未用到的英文名称、定义等在加载时忽略, 更准确的做法是以[英文定义|中文定义]
  5 |  * 作为一个整理进行处理，不过绝大多数只根据中文定义就可以标识出来，因此忽略不计。<br/>
  6 |  * 义原编号采用父节点Id-子节点Id编码方式，如:
  7 |  * &lt;sememe cn="成功" define="{experiencer,scope}" en="succeed" id="1-1-2-1-4-5"/>
  8 |  * 义原的id表明了义原之间的上下位关系和义原的深度。
  9 |  * 
 10 |  */
 11 | public class Sememe {
 12 | 
 13 | 	/** 
 14 | 	 * 义原编号,采用父节点Id-子节点Id编码方式，如&lt;sememe cn="成功" define="{experiencer,scope}" en="succeed" id="1-1-2-1-4-5"/>
 15 | 	 * id表明了义原之间的上下位关系  
 16 | 	 */
 17 | 	private String id;
 18 | 	/** 义原的中文名称*/
 19 | 	private String cnWord;
 20 | 	/** 义原的英文名称 */
 21 | 	private String enWord;
 22 | 	/** 义原的定义，如果没有(例如数量)，则为空串 */
 23 | 	private String define;
 24 | 
 25 | 	/**
 26 | 	 * 每一行的形式为：be|是 {relevant,isa}/{relevant,descriptive} 
 27 | 	 * <br/>或者 official|官 [#organization|组织,#employee|员] 
 28 | 	 * <br/>或者 amount|多少 
 29 | 	 * <br/>把相应的部分赋予不同的属性
 30 | 	 * 出于性能考虑，把未用到的英文名称、定义等忽略
 31 | 	 * @param id
 32 | 	 * @param parentId
 33 | 	 * @param item 读取文件中的一行
 34 | 	 */
 35 | 	public Sememe(String id, String en, String cn, String define) {
 36 | 		this.id = id;
 37 | 		this.cnWord = cn;
 38 | 		//为提高效率，减少内存空间利用，可去掉以下两行
 39 | 		this.enWord = en;
 40 | 		this.define = define;
 41 | 	}
 42 | 
 43 | 	public String getId() {
 44 | 		return id;
 45 | 	}
 46 | 
 47 | 	public void setId(String id) {
 48 | 		this.id = id;
 49 | 	}
 50 | 
 51 | 	public String getCnWord() {
 52 | 		return cnWord;
 53 | 	}
 54 | 
 55 | 	public void setCnWord(String cnWord) {
 56 | 		this.cnWord = cnWord;
 57 | 	}
 58 | 
 59 | 	public String getEnWord() {
 60 | 		return enWord;
 61 | 	}
 62 | 
 63 | 	public void setEnWord(String enWord) {
 64 | 		this.enWord = enWord;
 65 | 	}
 66 | 
 67 | 	public String getDefine() {
 68 | 		return define;
 69 | 	}
 70 | 
 71 | 	public void setDefine(String define) {
 72 | 		this.define = define;
 73 | 	}
 74 | 
 75 | 	public int getType() {
 76 | 		char ch = id.charAt(0);
 77 | 		switch (ch) {
 78 | 		case '1':
 79 | 			return SememeType.Event;
 80 | 		case '2':
 81 | 			return SememeType.Entity;
 82 | 		case '3':
 83 | 			return SememeType.Attribute;
 84 | 		case '4':
 85 | 			return SememeType.Quantity;
 86 | 		case '5':
 87 | 			return SememeType.AValue;
 88 | 		case '6':
 89 | 			return SememeType.QValue;
 90 | 		case '7':
 91 | 			return SememeType.SecondaryFeature;
 92 | 		case '8':
 93 | 			return SememeType.Syntax;
 94 | 		case '9':
 95 | 			return SememeType.EventRoleAndFeature;
 96 | 		default:
 97 | 			return 0;
 98 | 		}
 99 | 	}
100 | 
101 | 	@Override
102 | 	public String toString() {
103 | 		StringBuilder sb = new StringBuilder();
104 | 		sb.append("id=");
105 | 		sb.append(id);
106 | 		sb.append("; cnWord=");
107 | 		sb.append(cnWord);
108 | 		sb.append("; enWord=");
109 | 		sb.append(enWord);
110 | 		sb.append("; define=");
111 | 		sb.append(define);
112 | 		return sb.toString();
113 | 	}
114 | 
115 | }
116 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet2/sememe/SememeType.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.word.hownet2.sememe;
 2 | 
 3 | /**
 4 |  * 义原的类型定义<br/>
 5 |  * <ul>
 6 |  * <li>1：Event|事件</li>
 7 |  * <li>2：Entity|实体 </li>
 8 |  * <li>3:Attribute|属性 </li>
 9 |  * <li>4：Quantity|数量</li>
10 |  * <li>5：aValue|属性值</li>
11 |  * <li>6：qValue|数量值</li>
12 |  * <li>7: Secondary Feature|第二特征</li>
13 |  * <li>8: Syntax|语法</li>
14 |  * <li>9: EventRole|动态角色</li>
15 |  * <li>10:EventFeatures|动态属性</li>
16 |  * <li>0：未知</li>
17 |  * </ul>
18 |  * 
19 |  * 其中1~7为基本义元，8为语法义元，9、10为关系义元<br/>
20 |  * 
21 |  */
22 | public interface SememeType {
23 | 
24 | 	/** Event|事件类型定义 */
25 | 	public static final int Event = 1;
26 | 
27 | 	/** Entity|实体类型定义*/
28 | 	public static final int Entity = 2;
29 | 
30 | 	/** Attribute|属性类型定义*/
31 | 	public static final int Attribute = 3;
32 | 
33 | 	/** Quantity|数量类型定义*/
34 | 	public static final int Quantity = 4;
35 | 
36 | 	/** aValue|属性值类型定义*/
37 | 	public static final int AValue = 5;
38 | 
39 | 	/** qValue|数量值类型定义*/
40 | 	public static final int QValue = 6;
41 | 
42 | 	/** Secondary Feature|第二特征类型定义*/
43 | 	public static final int SecondaryFeature = 7;
44 | 
45 | 	/** Syntax|语法类型定义*/
46 | 	public static final int Syntax = 8;
47 | 
48 | 	/** EventRole|动态角色类型定义*/
49 | 	public static final int EventRoleAndFeature = 9;
50 | 
51 | 	/** 未知类型定义*/
52 | 	public static final int Unknown = 0;
53 | 
54 | }
55 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet2/sememe/XiaSememeParser.java:
--------------------------------------------------------------------------------
  1 | package zx.soft.similarity.word.hownet2.sememe;
  2 | 
  3 | import java.io.IOException;
  4 | import java.util.Collection;
  5 | 
  6 | import zx.soft.similarity.util.BlankUtils;
  7 | 
  8 | /**
  9 |  * 义原相似度计算, 实现了SememeParser中定义的抽象方法
 10 |  * 
 11 |  */
 12 | public class XiaSememeParser extends BaseSememeParser {
 13 | 
 14 | 	public XiaSememeParser() throws IOException {
 15 | 		super();
 16 | 	}
 17 | 
 18 | 	/**
 19 | 	 * 计算两个义原的相似度	 
 20 | 	 */
 21 | 	double getSimilarityBySememeId(final String id1, final String id2) {
 22 | 
 23 | 		int position = 0;
 24 | 		String[] array1 = id1.split("-");
 25 | 		String[] array2 = id2.split("-");
 26 | 		for (position = 0; position < array1.length && position < array2.length; position++) {
 27 | 			if (!array1[position].equals(array2[position])) {
 28 | 				break;
 29 | 			}
 30 | 		}
 31 | 
 32 | 		return 2.0 * position / (array1.length + array2.length);
 33 | 	}
 34 | 
 35 | 	/**
 36 | 	 * 根据汉语定义计算义原之间的相似度，由于可能多个义元有相同的汉语词语，故计算结果为其中相似度最大者
 37 | 	 * 
 38 | 	 * @param key1
 39 | 	 * @param key2
 40 | 	 * @return
 41 | 	 */
 42 | 	public double getMaxSimilarity(String sememeName1, String sememeName2) {
 43 | 		double maxValue = 0.0;
 44 | 
 45 | 		// 如果两个字符串相等，直接返回距离为0
 46 | 		if (sememeName1.equals(sememeName2)) {
 47 | 			return 1.0;
 48 | 		}
 49 | 
 50 | 		Collection<String> sememeIds1 = SEMEMES.get(sememeName1);
 51 | 		Collection<String> sememeIds2 = SEMEMES.get(sememeName2);
 52 | 
 53 | 		// 如果sememe1或者sememe2不是义元,则返回0
 54 | 		if (sememeIds1.size() == 0 || sememeIds1.size() == 0) {
 55 | 			return 0.0;
 56 | 		}
 57 | 
 58 | 		for (String id1 : sememeIds1) {
 59 | 			for (String id2 : sememeIds2) {
 60 | 				double value = getSimilarityBySememeId(id1, id2);
 61 | 				if (value > maxValue) {
 62 | 					maxValue = value;
 63 | 				}
 64 | 			}
 65 | 		}
 66 | 
 67 | 		return maxValue;
 68 | 	}
 69 | 
 70 | 	/**
 71 | 	 * 计算两个义元之间的相似度，由于义元可能相同，计算结果为其中相似度最大者 similarity = alpha/(distance+alpha),
 72 | 	 * 如果两个字符串相同或都为空，直接返回1.0
 73 | 	 * 
 74 | 	 * @param key1 第一个义原字符串
 75 | 	 * @param key2 第二个义原字符串
 76 | 	 * @return
 77 | 	 */
 78 | 	@Override
 79 | 	public double getSimilarity(String item1, String item2) {
 80 | 		if (BlankUtils.isBlankAll(item2, item2)) {
 81 | 			return 1.0;
 82 | 		} else if (BlankUtils.isBlankAtLeastOne(item1, item2)) {
 83 | 			return 0.0;
 84 | 		} else if (item1.equals(item2)) {
 85 | 			return 1.0;
 86 | 		}
 87 | 
 88 | 		String key1 = item1.trim();
 89 | 		String key2 = item2.trim();
 90 | 
 91 | 		// 去掉()符号
 92 | 		if ((key1.charAt(0) == '(') && (key1.charAt(key1.length() - 1) == ')')) {
 93 | 
 94 | 			if (key2.charAt(0) == '(' && key2.charAt(key2.length() - 1) == ')') {
 95 | 				key1 = key1.substring(1, key1.length() - 1);
 96 | 				key2 = key2.substring(1, key2.length() - 1);
 97 | 			} else {
 98 | 				return 0.0;
 99 | 			}
100 | 
101 | 		}
102 | 
103 | 		// 处理关系义元,即x=y的情况
104 | 		int pos = key1.indexOf('=');
105 | 		if (pos > 0) {
106 | 			int pos2 = key2.indexOf('=');
107 | 			// 如果是关系义元，则判断前面部分是否相同，如果相同，则转为计算后面部分的相似度，否则为0
108 | 			if ((pos == pos2) && key1.substring(0, pos).equals(key2.substring(0, pos2))) {
109 | 				key1 = key1.substring(pos + 1);
110 | 				key2 = key2.substring(pos2 + 1);
111 | 			} else {
112 | 				return 0.0;
113 | 			}
114 | 		}
115 | 
116 | 		// 处理符号义元,即前面有特殊符号的义元
117 | 		String symbol1 = key1.substring(0, 1);
118 | 		String symbol2 = key2.substring(0, 1);
119 | 
120 | 		for (int i = 0; i < Symbol_Descriptions.length; i++) {
121 | 			if (symbol1.equals(Symbol_Descriptions[i][0])) {
122 | 				if (symbol1.equals(symbol2)) {
123 | 					key1 = item1.substring(1);
124 | 					key2 = item2.substring(1);
125 | 					break;
126 | 				} else {
127 | 					return 0.0; // 如果不是同一关系符号，则相似度直接返回0
128 | 				}
129 | 			}
130 | 		}
131 | 
132 | 		if ((pos = key1.indexOf("|")) >= 0) {
133 | 			key1 = key1.substring(pos + 1);
134 | 		}
135 | 		if ((pos = key2.indexOf("|")) >= 0) {
136 | 			key2 = key2.substring(pos + 1);
137 | 		}
138 | 
139 | 		// 如果两个字符串相等，直接返回距离为0
140 | 		if (key1.equals(key2)) {
141 | 			return 1.0;
142 | 		}
143 | 
144 | 		return getMaxSimilarity(key1, key2);
145 | 	}
146 | 
147 | }


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/hownet2/sememe/sememe.xml.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ml-distribution/semantic-similarity/8deb4382a41ad94a82db15ea945c6bfa8ac103f0/src/main/java/zx/soft/similarity/word/hownet2/sememe/sememe.xml.gz


--------------------------------------------------------------------------------
/src/main/java/zx/soft/similarity/word/pinyin/PinyinSimilarity.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.word.pinyin;
 2 | 
 3 | import java.util.Set;
 4 | 
 5 | import zx.soft.similarity.Similaritable;
 6 | import zx.soft.similarity.util.EditDistance;
 7 | import zx.soft.similarity.util.PinyinUtils;
 8 | 
 9 | /**
10 |  * 通过拼音计算两个词语是否相似，拼音的相似程度采用编辑距离算法，并进行归一化衡量
11 |  * 
12 |  */
13 | public class PinyinSimilarity implements Similaritable {
14 | 
15 | 	@Override
16 | 	public double getSimilarity(String item1, String item2) {
17 | 		Set<String> pinyinSet1 = PinyinUtils.getInstance().getPinyin(item1);
18 | 		Set<String> pinyinSet2 = PinyinUtils.getInstance().getPinyin(item2);
19 | 
20 | 		double max = 0.0;
21 | 		for (String pinyin1 : pinyinSet1) {
22 | 			for (String pinyin2 : pinyinSet2) {
23 | 				double distance = new EditDistance().getEditDistance(pinyin1, pinyin2);
24 | 				double similarity = 1 - distance
25 | 						/ ((pinyin1.length() > pinyin2.length()) ? pinyin1.length() : pinyin2.length());
26 | 				max = (max > similarity) ? max : similarity;
27 | 				if (max == 1.0) {
28 | 					return max;
29 | 				}
30 | 			}
31 | 		}
32 | 		return max;
33 | 	}
34 | 
35 | }
36 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/tendency/word/HownetWordTendency.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.tendency.word;
 2 | 
 3 | import java.io.IOException;
 4 | import java.util.Collection;
 5 | import java.util.HashSet;
 6 | import java.util.Set;
 7 | 
 8 | import org.slf4j.Logger;
 9 | import org.slf4j.LoggerFactory;
10 | 
11 | import zx.soft.similarity.word.hownet2.concept.BaseConceptParser;
12 | import zx.soft.similarity.word.hownet2.concept.Concept;
13 | import zx.soft.similarity.word.hownet2.concept.XiaConceptParser;
14 | import zx.soft.similarity.word.hownet2.sememe.BaseSememeParser;
15 | import zx.soft.similarity.word.hownet2.sememe.XiaSememeParser;
16 | 
17 | /**
18 |  * 基于知网实现的词语倾向性判别
19 |  *
20 |  */
21 | public class HownetWordTendency implements WordTendency {
22 | 
23 | 	private static Logger logger = LoggerFactory.getLogger(HownetWordTendency.class);
24 | 
25 | 	public static String[] POSITIVE_SEMEMES = new String[] { "良", "喜悦", "夸奖", "满意", "期望", "注意", "致敬", "喜欢", "专", "敬佩",
26 | 			"同意", "爱惜", "愿意", "思念", "拥护", "祝贺", "福", "需求", "奖励", "致谢", "欢迎", "羡慕", "感激", "爱恋" };
27 | 
28 | 	public static String[] NEGATIVE_SEMEMES = new String[] { "莠", "谴责", "害怕", "生气", "悲哀", "着急", "轻视", "羞愧", "烦恼", "灰心",
29 | 			"犹豫", "为难", "懊悔", "厌恶", "怀疑", "怜悯", "忧愁", "示怒", "不满", "仇恨", "埋怨", "失望", "坏" };
30 | 	private BaseConceptParser conceptParser = null;
31 | 	private BaseSememeParser sememeParser = null;
32 | 
33 | 	public HownetWordTendency() {
34 | 		this.conceptParser = XiaConceptParser.getInstance();
35 | 		try {
36 | 			this.sememeParser = new XiaSememeParser();
37 | 		} catch (IOException e) {
38 | 			logger.error("Exception:{}", e.getMessage());
39 | 		}
40 | 	}
41 | 
42 | 	@Override
43 | 	public double getTendency(String word) {
44 | 		double positive = getSentiment(word, POSITIVE_SEMEMES);
45 | 		double negative = getSentiment(word, NEGATIVE_SEMEMES);
46 | 		return positive - negative;
47 | 	}
48 | 
49 | 	public double getSentiment(String word, String[] candidateSememes) {
50 | 		Collection<Concept> concepts = conceptParser.getConcepts(word);
51 | 		Set<String> sememes = new HashSet<>();
52 | 		for (Concept c : concepts) {
53 | 			sememes.addAll(c.getAllSememeNames());
54 | 		}
55 | 
56 | 		double max = 0.0;
57 | 		for (String item : sememes) {
58 | 			double total = 0.0;
59 | 			for (String positiveSememe : candidateSememes) {
60 | 				//如果有特别接近的义原，直接返回该相似值，避免其他干扰
61 | 				double value = sememeParser.getSimilarity(item, positiveSememe);
62 | 				if (value > 0.9) {
63 | 					return value;
64 | 				}
65 | 				total += value;
66 | 			}
67 | 			double sim = total / candidateSememes.length;
68 | 			if (sim > max) {
69 | 				max = sim;
70 | 			}
71 | 		}
72 | 		return max;
73 | 	}
74 | 
75 | }
76 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/tendency/word/Training.java:
--------------------------------------------------------------------------------
  1 | package zx.soft.tendency.word;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.File;
  5 | import java.io.FileInputStream;
  6 | import java.io.IOException;
  7 | import java.io.InputStreamReader;
  8 | import java.util.ArrayList;
  9 | import java.util.Collection;
 10 | import java.util.Collections;
 11 | import java.util.HashMap;
 12 | import java.util.List;
 13 | import java.util.Map;
 14 | 
 15 | import zx.soft.similarity.util.BlankUtils;
 16 | import zx.soft.similarity.word.hownet2.concept.Concept;
 17 | import zx.soft.similarity.word.hownet2.concept.XiaConceptParser;
 18 | import zx.soft.similarity.word.hownet2.sememe.XiaSememeParser;
 19 | 
 20 | import com.google.common.collect.HashMultimap;
 21 | import com.google.common.collect.Multimap;
 22 | 
 23 | /**
 24 |  * 临时训练及测试类
 25 |  *
 26 |  */
 27 | public class Training {
 28 | 
 29 | 	void test(boolean testPositive) throws IOException {
 30 | 		WordTendency tendency = new HownetWordTendency();
 31 | 		File f = new File("./dict/sentiment/负面情感词语（中文）.txt");
 32 | 		if (testPositive) {
 33 | 			//f = new File("./dict/sentiment/正面情感词语（中文）.txt");
 34 | 			f = new File("./dict/sentiment/正面评价词语（中文）.txt");
 35 | 		}
 36 | 		String encoding = "utf-8";
 37 | 		String line;
 38 | 		int wordCount = 0;
 39 | 		int correctCount = 0;
 40 | 
 41 | 		try (BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(f), encoding));) {
 42 | 			while ((line = in.readLine()) != null) {
 43 | 				if (line.length() > 5)
 44 | 					continue;
 45 | 				wordCount++;
 46 | 
 47 | 				double value = tendency.getTendency(line.trim());
 48 | 				if (value > 0 && testPositive) {
 49 | 					correctCount++;
 50 | 				} else if (value < 0 && !testPositive) {
 51 | 					correctCount++;
 52 | 				} else {
 53 | 					System.out.println("error:" + line + "\t value:" + value);
 54 | 				}
 55 | 			}
 56 | 		}
 57 | 		System.out.println("correct:" + correctCount);
 58 | 		System.out.println("total:" + wordCount);
 59 | 		System.out.println("ratio:" + correctCount * 1.0 / wordCount);
 60 | 	}
 61 | 
 62 | 	/**
 63 | 	 * 该方法用于统计知网提供的情感词集合所涉及的义原以及出现频度
 64 | 	 * @throws IOException
 65 | 	 */
 66 | 	void countSentimentDistribution() throws IOException {
 67 | 		Map<String, Integer> sememeMap = new HashMap<>();
 68 | 		File f = new File("./dict/sentiment/负面情感词语（中文）.txt");
 69 | 		String encoding = "utf-8";
 70 | 		boolean autoCombineConcept = false;
 71 | 		BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(f), encoding));
 72 | 
 73 | 		XiaConceptParser parser = new XiaConceptParser(new XiaSememeParser());
 74 | 
 75 | 		String line = null;
 76 | 
 77 | 		int conceptCount = 0;
 78 | 		int wordCount = 0;
 79 | 		while ((line = in.readLine()) != null) {
 80 | 			if (line.length() > 5)
 81 | 				continue;
 82 | 			wordCount++;
 83 | 			String word = line.trim();
 84 | 			Collection<Concept> concepts = parser.getInnerConcepts(word);
 85 | 			//由于目前的词典为知网2000版本，所以默认情况下仅对词典中出现的概念进行统计
 86 | 			if (BlankUtils.isBlank(concepts) && autoCombineConcept) {
 87 | 				concepts = parser.autoCombineConcepts(word, null);
 88 | 			}
 89 | 			for (Concept c : concepts) {
 90 | 				conceptCount++;
 91 | 				List<String> names = new ArrayList<String>();
 92 | 
 93 | 				//加入主义原
 94 | 				names.add(c.getMainSememe());
 95 | 
 96 | 				//加入关系义原
 97 | 				for (String item : c.getRelationSememes()) {
 98 | 					names.add(item.substring(item.indexOf("=") + 1));
 99 | 				}
100 | 
101 | 				//加入符号义原
102 | 				for (String item : c.getSymbolSememes()) {
103 | 					names.add(item.substring(1));
104 | 				}
105 | 
106 | 				//加入其他义原集合
107 | 				for (String item : c.getSecondSememes()) {
108 | 					names.add(item);
109 | 				}
110 | 
111 | 				for (String item : names) {
112 | 					Integer count = sememeMap.get(item);
113 | 					if (count == null) {
114 | 						sememeMap.put(item, 1);
115 | 					} else {
116 | 						sememeMap.put(item, count + 1);
117 | 					}
118 | 				}
119 | 			}
120 | 		}
121 | 		in.close();
122 | 
123 | 		//以下是为了按照义原出现的数量进行排序的代码
124 | 		Multimap<Integer, String> map2 = HashMultimap.create();
125 | 		for (String key : sememeMap.keySet()) {
126 | 			map2.put(sememeMap.get(key), key);
127 | 		}
128 | 		List<Integer> keys = new ArrayList<>();
129 | 		for (Integer key : map2.keySet()) {
130 | 			keys.add(key);
131 | 		}
132 | 		Collections.sort(keys);
133 | 
134 | 		int smallSememeCount = 0; //较少出现的不同义原数量
135 | 		int smallAppearTotal = 0; //较少出现的义原在概念众出现的次数总和
136 | 		for (int index = (keys.size() - 1); index >= 0; index--) {
137 | 			Integer key = keys.get(index);
138 | 			Collection<String> values = map2.get(key);
139 | 			double ratio = (key * 100.0 / conceptCount);
140 | 			System.out.print(key + "(" + ratio + "%): ");
141 | 			for (String v : values) {
142 | 				System.out.print(v + "\t");
143 | 			}
144 | 			System.out.println();
145 | 			if (ratio < 0.7) {
146 | 				smallSememeCount += values.size();
147 | 				smallAppearTotal += key * values.size();
148 | 			}
149 | 		}
150 | 
151 | 		System.out.println("small info: ");
152 | 		System.out.println("\tdifferent sememes:" + smallSememeCount);
153 | 		System.out.println("\tappear count:" + smallAppearTotal);
154 | 		System.out.println("\tratio:" + smallAppearTotal * 100.0 / conceptCount);
155 | 		System.out.println("wordCount:" + wordCount);
156 | 		System.out.println("conceptCount:" + conceptCount);
157 | 	}
158 | 
159 | 	public static void main(String[] args) throws IOException {
160 | 		Training training = new Training();
161 | 		training.countSentimentDistribution();
162 | 		//        System.out.println("test positive:");
163 | 		//        training.test(true);
164 | 		//
165 | 		//        System.out.println("test negative:");
166 | 		//training.test(false);
167 | 	}
168 | 
169 | }
170 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/tendency/word/WordTendency.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.tendency.word;
 2 | 
 3 | /**
 4 |  * 计算词语的语义倾向性，词语的语义倾向性为一个介于[-1, 1]之间的实数，数值越大，褒义性越强，否则，贬义性越强
 5 |  * 
 6 |  */
 7 | public interface WordTendency {
 8 | 
 9 | 	/**
10 | 	 * 获取词语的语义倾向性，词语的语义倾向性为一个介于[-1, 1]之间的实数，数值越大，褒义性越强，否则，贬义性越强
11 | 	 * @param word
12 | 	 * @return
13 | 	 */
14 | 	public double getTendency(String word);
15 | 
16 | }
17 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/ui/PhraseSimilarityUI.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.ui;
 2 | 
 3 | import java.awt.BorderLayout;
 4 | import java.awt.GridLayout;
 5 | import java.awt.event.ActionEvent;
 6 | import java.awt.event.ActionListener;
 7 | 
 8 | import javax.swing.BorderFactory;
 9 | import javax.swing.JButton;
10 | import javax.swing.JLabel;
11 | import javax.swing.JPanel;
12 | import javax.swing.JScrollPane;
13 | import javax.swing.JTextArea;
14 | import javax.swing.JTextField;
15 | 
16 | import zx.soft.similarity.phrase.PhraseSimilarity;
17 | 
18 | /**
19 |  * 短语相似度的调用演示界面
20 |  */
21 | public class PhraseSimilarityUI {
22 | 
23 | 	/**
24 | 	 * 短语相似度的演示面板
25 | 	 * 
26 | 	 * @return
27 | 	 */
28 | 	public static JPanel createPanel() {
29 | 		// 声明总的大面板, fullPanel包括一个NorthPanel和一个centerPanel
30 | 		JPanel fullPanel = new JPanel();
31 | 		fullPanel.setLayout(new BorderLayout());
32 | 
33 | 		JPanel northPanel = new JPanel();
34 | 		fullPanel.add(northPanel, "North");
35 | 
36 | 		// centerPanel包括了一个文本框
37 | 		JPanel centerPanel = new JPanel();
38 | 		fullPanel.add(centerPanel, "Center");
39 | 
40 | 		centerPanel.setLayout(new BorderLayout());
41 | 		final JTextArea result = new JTextArea();
42 | 		// result.setFont(new Font("宋体", Font.PLAIN, 16));
43 | 		result.setLineWrap(true);
44 | 		JScrollPane centerScrollPane = new JScrollPane(result);
45 | 		centerPanel.add(centerScrollPane, "Center");
46 | 
47 | 		northPanel.setLayout(new GridLayout(1, 1));
48 | 		// northPanel.add(createWordPanel());
49 | 		// northPanel.add(createCilinPanel());
50 | 
51 | 		// 以下加入northPanel中的第一个面板
52 | 		final JTextField field1 = new JTextField("");
53 | 		final JTextField field2 = new JTextField("");
54 | 		field1.setColumns(50);
55 | 		field2.setColumns(50);
56 | 
57 | 		JPanel mainPanel = new JPanel();
58 | 		mainPanel.setLayout(new GridLayout(3, 1));
59 | 
60 | 		JPanel linePanel = new JPanel();
61 | 		linePanel.add(new JLabel("短语1:"));
62 | 		linePanel.add(field1);
63 | 		mainPanel.add(linePanel);
64 | 
65 | 		linePanel = new JPanel();
66 | 		linePanel.add(new JLabel("短语2:"));
67 | 		linePanel.add(field2);
68 | 		mainPanel.add(linePanel);
69 | 
70 | 		linePanel = new JPanel();
71 | 		JButton goButton = new JButton("计算相似度");
72 | 		linePanel.add(goButton);
73 | 		mainPanel.add(linePanel);
74 | 		goButton.addActionListener(new ActionListener() {
75 | 
76 | 			@Override
77 | 			public void actionPerformed(ActionEvent e) {
78 | 				String phrase1 = field1.getText();
79 | 				String phrase2 = field2.getText();
80 | 				String text = "[" + phrase1 + "]与[" + phrase2 + "]的相似度为:";
81 | 				text = text + new PhraseSimilarity().getSimilarity(phrase1, phrase2);
82 | 				// text = text + "\n\n" + result.getText();
83 | 				result.setText(text);
84 | 			}
85 | 
86 | 		});
87 | 		mainPanel.setBorder(BorderFactory.createEtchedBorder());
88 | 		northPanel.add(mainPanel);
89 | 
90 | 		return fullPanel;
91 | 	}
92 | 
93 | }
94 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/ui/Start.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.ui;
 2 | 
 3 | import java.awt.Container;
 4 | import java.awt.Font;
 5 | import java.util.Enumeration;
 6 | 
 7 | import javax.swing.JFrame;
 8 | import javax.swing.JMenu;
 9 | import javax.swing.JMenuBar;
10 | import javax.swing.JMenuItem;
11 | import javax.swing.JScrollPane;
12 | import javax.swing.JTabbedPane;
13 | import javax.swing.SwingUtilities;
14 | import javax.swing.UIManager;
15 | import javax.swing.plaf.FontUIResource;
16 | 
17 | import zx.soft.similarity.sentence.SegmentProxy;
18 | import zx.soft.similarity.util.About;
19 | 
20 | /**
21 |  * 相似度计算软件包演示启动类
22 |  * 
23 |  */
24 | public class Start extends JFrame {
25 | 
26 | 	private static final long serialVersionUID = 85744461208L;
27 | 
28 | 	public Start() {
29 | 		this.setTitle("相似度计算演示程序");
30 | 		this.setSize(420, 700);
31 | 		this.setLocationRelativeTo(null);
32 | 		this.setDefaultCloseOperation(EXIT_ON_CLOSE);
33 | 
34 | 		// //////////////////////////////////
35 | 		// add menu
36 | 		JMenuBar menuBar = new JMenuBar();
37 | 		this.setJMenuBar(menuBar);
38 | 
39 | 		JMenu fileMenu = new JMenu("File");
40 | 		menuBar.add(fileMenu);
41 | 		fileMenu.add(new JMenuItem("Exit"));
42 | 
43 | 		JMenu helpMenu = new JMenu("Help");
44 | 		menuBar.add(helpMenu);
45 | 		helpMenu.add(new JMenuItem("Help"));
46 | 
47 | 		Container contentPane = this.getContentPane();
48 | 		JTabbedPane tabbedPane = new JTabbedPane();
49 | 		tabbedPane.add("词语", WordSimlarityUI.createPanel());
50 | 		tabbedPane.add("短语", PhraseSimilarityUI.createPanel());
51 | 		tabbedPane.add("句子", SentenceSimilarityUI.createPanel());
52 | 		// tabbedPane.add("文本", WordSimlarityUI.createPanel());
53 | 		tabbedPane.add("词法分析", SegmentProxy.createPanel());
54 | 		tabbedPane.add("义原树", SememeTreeUI.createPanel());
55 | 		tabbedPane.add("情感分析", TendencyUI.createPanel());
56 | 		tabbedPane.add("关于", About.createPanel());
57 | 		JScrollPane scrollPane = new JScrollPane(tabbedPane);
58 | 		contentPane.add(scrollPane);
59 | 
60 | 		this.pack();
61 | 		setExtendedState(MAXIMIZED_BOTH);
62 | 	}
63 | 
64 | 	public static void InitGlobalFont(Font font) {
65 | 		FontUIResource fontRes = new FontUIResource(font);
66 | 		for (Enumeration<Object> keys = UIManager.getDefaults().keys(); keys.hasMoreElements();) {
67 | 			Object key = keys.nextElement();
68 | 			Object value = UIManager.get(key);
69 | 			if (value instanceof FontUIResource) {
70 | 				UIManager.put(key, fontRes);
71 | 			}
72 | 		}
73 | 	}
74 | 
75 | 	public static void main(String[] args) {
76 | 		//JFrame.setDefaultLookAndFeelDecorated(true);
77 | 		//解决字体在Ubuntu中显示有乱码的问题
78 | 		InitGlobalFont(new Font("Microsoft YaHei", Font.TRUETYPE_FONT, 12));
79 | 		SwingUtilities.invokeLater(new Runnable() {
80 | 
81 | 			@Override
82 | 			public void run() {
83 | 				Start w = new Start();
84 | 				w.setVisible(true);
85 | 			}
86 | 		});
87 | 	}
88 | 
89 | }
90 | 


--------------------------------------------------------------------------------
/src/main/java/zx/soft/ui/TendencyUI.java:
--------------------------------------------------------------------------------
  1 | package zx.soft.ui;
  2 | 
  3 | import java.awt.BorderLayout;
  4 | import java.awt.GridLayout;
  5 | import java.awt.event.ActionEvent;
  6 | import java.awt.event.ActionListener;
  7 | 
  8 | import javax.swing.BorderFactory;
  9 | import javax.swing.JButton;
 10 | import javax.swing.JFrame;
 11 | import javax.swing.JLabel;
 12 | import javax.swing.JPanel;
 13 | import javax.swing.JScrollPane;
 14 | import javax.swing.JTextArea;
 15 | import javax.swing.JTextField;
 16 | 
 17 | import zx.soft.tendency.word.HownetWordTendency;
 18 | 
 19 | /**
 20 |  * 测试词语倾向性的用户调用演示界面
 21 |  * 
 22 |  */
 23 | public class TendencyUI extends JFrame {
 24 | 
 25 | 	private static final long serialVersionUID = -3976827963973640651L;
 26 | 
 27 | 	public static JPanel createPanel() {
 28 | 		//声明总的大面板, fullPanel包括一个NorthPanel和一个centerPanel
 29 | 		JPanel fullPanel = new JPanel();
 30 | 		fullPanel.setLayout(new BorderLayout());
 31 | 
 32 | 		JPanel northPanel = new JPanel();
 33 | 		fullPanel.add(northPanel, "North");
 34 | 
 35 | 		//centerPanel包括了一个文本框
 36 | 		JPanel centerPanel = new JPanel();
 37 | 		fullPanel.add(centerPanel, "Center");
 38 | 		centerPanel.setLayout(new BorderLayout());
 39 | 		final JTextArea result = new JTextArea();
 40 | 		//result.setFont(new Font("宋体", Font.PLAIN, 16));
 41 | 		result.setLineWrap(true);
 42 | 		JScrollPane centerScrollPane = new JScrollPane(result);
 43 | 		centerPanel.add(centerScrollPane, "Center");
 44 | 
 45 | 		northPanel.setLayout(new GridLayout(1, 1));
 46 | 
 47 | 		//以下加入northPanel中的第一个面板
 48 | 		final JTextField wordField = new JTextField("恶心");
 49 | 		wordField.setColumns(40);
 50 | 
 51 | 		JPanel mainPanel = new JPanel();
 52 | 		mainPanel.setLayout(new GridLayout(2, 1));
 53 | 
 54 | 		JPanel linePanel = new JPanel();
 55 | 		linePanel.add(new JLabel("输入词语:"));
 56 | 		linePanel.add(wordField);
 57 | 		mainPanel.add(linePanel);
 58 | 
 59 | 		linePanel = new JPanel();
 60 | 		JButton goButton = new JButton("计算词语倾向");
 61 | 		linePanel.add(goButton);
 62 | 		mainPanel.add(linePanel);
 63 | 		goButton.addActionListener(new ActionListener() {
 64 | 			HownetWordTendency tendency = new HownetWordTendency();
 65 | 
 66 | 			@Override
 67 | 			public void actionPerformed(ActionEvent e) {
 68 | 				String word = wordField.getText();
 69 | 				double positive = tendency.getSentiment(word, HownetWordTendency.POSITIVE_SEMEMES);
 70 | 				double negative = tendency.getSentiment(word, HownetWordTendency.NEGATIVE_SEMEMES);
 71 | 				String text = "[" + word + "]的倾向分析结果为:";
 72 | 
 73 | 				text = text + "\n正面接近程度=" + positive;
 74 | 				text = text + "\n负面接近程度=" + negative;
 75 | 				text = text + "\n倾向性=" + (positive - negative);
 76 | 				text = text + "\n________________________________\n" + result.getText();
 77 | 				result.setText(text);
 78 | 				result.setCaretPosition(0);
 79 | 			}
 80 | 
 81 | 		});
 82 | 		mainPanel.setBorder(BorderFactory.createEtchedBorder());
 83 | 		northPanel.add(mainPanel);
 84 | 
 85 | 		return fullPanel;
 86 | 	}
 87 | 
 88 | 	public TendencyUI() {
 89 | 		this.setTitle("词语倾向性演示");
 90 | 		this.setSize(420, 700);
 91 | 		this.setLocationRelativeTo(null);
 92 | 		this.setDefaultCloseOperation(EXIT_ON_CLOSE);
 93 | 		this.getContentPane().setLayout(new BorderLayout());
 94 | 		this.getContentPane().add(createPanel());
 95 | 	}
 96 | 
 97 | 	public static void main(String[] args) {
 98 | 		new TendencyUI().setVisible(true);
 99 | 	}
100 | 
101 | }
102 | 


--------------------------------------------------------------------------------
/src/main/resources/data/about.html:
--------------------------------------------------------------------------------
 1 | <html>
 2 | 	<head><title>xsimilarity</title></head>
 3 | 	<body>
 4 | 		<p>
 5 | 			<h2>XSimilarity</h2>
 6 | 		  项目地址：<a href="http://github.com/iamxiatian/xsimilarity/">http://github.com/iamxiatian/xsimilarity/</a>
 7 | 		</p>
 8 | 		<p>
 9 | 			有任何问题或建议请与我们联系，您的反馈将有助于该项目的进一步完善。
10 | 		</p>
11 | 		<p>
12 |       <h2>致谢</h2>
13 |             本项目在研究过程中，得到了恩师樊孝忠教授的悉心指导，师恩如海，难以言谢！<br/>
14 | 			中国人民大学为本项目的持续研究提供了资金和计算机软硬件的支持，北京理工大学为本项目的早期研究提供了重要的基础设施，<br/>
15 | 			这些支持与国家的投入密不可分，
16 | 			本项目的开源和不断完善也算是对国家的点滴回报！<br/>
17 | 			代码中许多算法的核心思想来源于我们的研究同行和先辈们的已公开成果，另外，许多使用xsimilarity的人员对xsimilarity<br/>
18 | 			提出了宝贵的建议，在此一并表示深深的谢意！     <br/>
19 |         本工程使用了如下开源组件，对原作者致以谢意！
20 |         <ul>
21 |             <li>ANSJ： </li>
22 |         </ul>
23 |     </p>
24 | 		<p>
25 | 			<h2>联系方式</h2>
26 | 			夏天<br/>
27 | 			数据工程与知识工程教育部重点实验室（中国人民大学）<br/>
28 |       中国人民大学信息资源管理学院<br/>
29 |       电话: 86-10-82500675<br/>
30 |       Email: xiat(at)ruc.edu.cn<br/>
31 | 		</p>
32 | 		
33 | 	</body>
34 | </html>


--------------------------------------------------------------------------------
/src/main/resources/data/cilin.db.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ml-distribution/semantic-similarity/8deb4382a41ad94a82db15ea945c6bfa8ac103f0/src/main/resources/data/cilin.db.gz


--------------------------------------------------------------------------------
/src/main/resources/data/concept.xml.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ml-distribution/semantic-similarity/8deb4382a41ad94a82db15ea945c6bfa8ac103f0/src/main/resources/data/concept.xml.gz


--------------------------------------------------------------------------------
/src/main/resources/data/sememe.xml.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ml-distribution/semantic-similarity/8deb4382a41ad94a82db15ea945c6bfa8ac103f0/src/main/resources/data/sememe.xml.gz


--------------------------------------------------------------------------------
/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | 
 3 | <configuration>
 4 | 
 5 | 	<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
 6 | 		<encoder>
 7 | 			<pattern>%d{ISO8601} [%thread] %-5level %logger{36} [Line:%-3L] - %msg%n</pattern>
 8 | 		</encoder>
 9 | 		<!-- 只输出level级别的日志 -->
10 | 		<filter class="ch.qos.logback.classic.filter.LevelFilter">
11 | 			<level>INFO</level>
12 | 			<onMatch>ACCEPT</onMatch>
13 | 			<onMismatch>DENY</onMismatch>
14 | 		</filter>
15 | 	</appender>
16 | 
17 | 	<appender name="FILE"
18 | 		class="ch.qos.logback.core.rolling.RollingFileAppender">
19 | 		<File>logs/semantic-similarity.log</File>
20 | 		<encoder>
21 | 			<pattern>%d{ISO8601} [%thread] %-5level %logger{36} [Line:%-3L] - %msg%n</pattern>
22 | 		</encoder>
23 | 		<!-- 只输出level级别以上的日志 -->
24 | 		<filter class="ch.qos.logback.classic.filter.ThresholdFilter">
25 | 			<level>INFO</level>
26 | 		</filter>
27 | 		<rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
28 | 			<fileNamePattern>logs/semantic-similarity.log.%d{yyyy-MM-dd}.gz</fileNamePattern>
29 | 			<maxHistory>30</maxHistory>
30 | 		</rollingPolicy>
31 | 	</appender>
32 | 
33 | 	<!-- org.apache.commons.httpclient包中的java类 -->
34 | 	<logger name="org.apache.commons.httpclient" level="ERROR"
35 | 		addtivity="false">
36 | 		<appender-ref ref="FILE" />
37 | 	</logger>
38 | 
39 | 	<!-- org.restlet包中的java类 -->
40 | 	<logger name="org.restlet" level="WARN" addtivity="false">
41 | 		<appender-ref ref="FILE" />
42 | 	</logger>
43 | 
44 | 	<root level="DEBUGE">
45 | 		<appender-ref ref="FILE" />
46 | 		<appender-ref ref="STDOUT" />
47 | 	</root>
48 | 
49 | </configuration>


--------------------------------------------------------------------------------
/src/test/java/zx/soft/similarity/sentence/SemanticSimilarityTest.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.sentence;
 2 | 
 3 | import org.junit.Test;
 4 | 
 5 | import zx.soft.similarity.sentence.morphology.SemanticSimilarity;
 6 | 
 7 | public class SemanticSimilarityTest {
 8 | 
 9 | 	@Test
10 | 	public void test() {
11 | 		String s1 = "一个伟大的国家，中国";
12 | 		String s2 = "中国是一个伟大的国家";
13 | 
14 | 		SemanticSimilarity similarity = SemanticSimilarity.getInstance();
15 | 		double sim = similarity.getSimilarity(s1, s2);
16 | 		System.out.println("sim ==> " + sim);
17 | 	}
18 | 
19 | }
20 | 


--------------------------------------------------------------------------------
/src/test/java/zx/soft/similarity/statistic/DictStatisticTest.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.statistic;
 2 | 
 3 | import zx.soft.similarity.statistic.DictStatistic;
 4 | import junit.framework.TestCase;
 5 | 
 6 | /**
 7 |  * ./db/coredict.xml.gz是利用的ictclas4j的词典文件，这个文件可以从lib/ictclas4j.jar文件中得到。
 8 |  * 即：把ictclas4j.jar文件解压开，里面的dictionary目录下有coredict.xml.gz文件。
 9 |  * 
10 | */
11 | public class DictStatisticTest extends TestCase {
12 | 
13 | 	public void testCount() {
14 | 		DictStatistic ds = new DictStatistic();
15 | 		ds.testFromXml("./db/coredict.xml.gz", true);
16 | 	}
17 | 
18 | }
19 | 


--------------------------------------------------------------------------------
/src/test/java/zx/soft/similarity/word/CharBasedSimilarityTest.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.word;
 2 | 
 3 | import zx.soft.similarity.word.CharBasedSimilarity;
 4 | import junit.framework.TestCase;
 5 | 
 6 | public class CharBasedSimilarityTest extends TestCase {
 7 | 
 8 | 	public void test() {
 9 | 		CharBasedSimilarity sim = new CharBasedSimilarity();
10 | 		String s1 = "手机";
11 | 		String s2 = "飞机";
12 | 
13 | 		assertTrue(sim.getSimilarity(s1, s2) > 0);
14 | 	}
15 | 
16 | }
17 | 


--------------------------------------------------------------------------------
/src/test/java/zx/soft/similarity/word/hownet/ConceptTest.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.word.hownet;
 2 | 
 3 | import java.util.Collection;
 4 | 
 5 | import zx.soft.similarity.word.hownet2.concept.Concept;
 6 | 
 7 | import com.google.common.collect.HashMultimap;
 8 | import com.google.common.collect.Multimap;
 9 | 
10 | public class ConceptTest {
11 | 
12 | 	public static void main(String[] args) {
13 | 		Multimap<String, Concept> CONCEPTS = HashMultimap.create();
14 | 		//		CONCEPTS = ArrayListMultimap.create();
15 | 
16 | 		CONCEPTS.put("打", new Concept("打", "V", "TakeOutOfWater|捞起"));
17 | 		CONCEPTS.put("打", new Concept("打", "V", "TakeOutOfWater|捞起"));
18 | 		CONCEPTS.put("打", new Concept("打", "V", "TakeOutOfWater|捞起"));
19 | 		CONCEPTS.put("打", new Concept("打", "V", "TakeOutOfWater|捞起"));
20 | 
21 | 		Collection<Concept> collection = CONCEPTS.get("打");
22 | 		for (Concept c : collection) {
23 | 			System.out.println(c);
24 | 		}
25 | 
26 | 		Multimap<String, Integer> map = HashMultimap.create();
27 | 		//	map = ArrayListMultimap.create();
28 | 
29 | 		map.put("打", 1);
30 | 		map.put("打", 1);
31 | 		map.put("打", 1);
32 | 		map.put("打", 2);
33 | 
34 | 		Collection<Integer> cc = map.get("打");
35 | 		for (Integer i : cc) {
36 | 			System.out.println(i);
37 | 		}
38 | 	}
39 | 
40 | }
41 | 


--------------------------------------------------------------------------------
/src/test/java/zx/soft/similarity/word/hownet/SememeTest.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.word.hownet;
 2 | 
 3 | import java.io.InputStream;
 4 | 
 5 | import zx.soft.similarity.util.FileUtils;
 6 | import zx.soft.similarity.word.hownet.sememe.Sememe;
 7 | import zx.soft.similarity.word.hownet.sememe.SememeDictTraverseEvent;
 8 | import zx.soft.similarity.word.hownet2.sememe.XiaSememeParser;
 9 | 
10 | /**
11 |  * 针对义原的测试
12 |  * 
13 |  * @author wanggang
14 |  *
15 |  */
16 | @SuppressWarnings("deprecation")
17 | public class SememeTest {
18 | 
19 | 	public static void main(String[] args) throws Exception {
20 | 		String id1 = "2-1-3-4";
21 | 		//		String id2 = "2-1-2";
22 | 		//		System.out.println(getDistance(id1, id2));
23 | 		//		System.out.println(getSimilarityBySememeId(id1, id2));
24 | 
25 | 		int pos = id1.lastIndexOf("-");
26 | 		String parentId = "root";
27 | 		if (pos > 0) {
28 | 			parentId = id1.substring(0, pos);
29 | 		}
30 | 		System.out.println(parentId);
31 | 		new XiaSememeParser().getSimilarity("test", "hello");
32 | 	}
33 | 
34 | 	static void saveXML() throws Exception {
35 | 		String sememeFile = Sememe.class.getPackage().getName().replaceAll("\\.", "/") + "/sememe.dat";
36 | 		InputStream input = Sememe.class.getClassLoader().getResourceAsStream(sememeFile);
37 | 		SememeDictTraverseEvent event = new SememeDictTraverseEvent();
38 | 
39 | 		FileUtils.traverseLines(input, "utf8", event);
40 | 		event.saveToXML("/home/xiatian/Desktop/sememe.xml");
41 | 	}
42 | 
43 | 	static double getSimilarityBySememeId(final String id1, final String id2) {
44 | 
45 | 		int position = 0;
46 | 		String[] array1 = id1.split("-");
47 | 		String[] array2 = id2.split("-");
48 | 		for (position = 0; position < array1.length && position < array2.length; position++) {
49 | 			if (!array1[position].equals(array2[position])) {
50 | 				break;
51 | 			}
52 | 		}
53 | 
54 | 		return 2.0 * position / (array1.length + array2.length);
55 | 	}
56 | 
57 | 	static int getDistance(String id1, String id2) {
58 | 		// 两个Id相同的位置终止地方
59 | 		int position = 0;
60 | 		String[] array1 = id1.split("-");
61 | 		String[] array2 = id2.split("-");
62 | 		for (position = 0; position < array1.length && position < array2.length; position++) {
63 | 			if (!array1[position].equals(array2[position])) {
64 | 				return array1.length + array2.length - position - position;
65 | 			}
66 | 		}
67 | 
68 | 		if (array1.length == array2.length) {
69 | 			return 0;
70 | 		} else if (array1.length == position) {
71 | 			return array2.length - position;
72 | 		} else {
73 | 			return array1.length - position;
74 | 		}
75 | 	}
76 | 
77 | }
78 | 


--------------------------------------------------------------------------------
/src/test/java/zx/soft/similarity/word/hownet2/HownetSimilarityTest.java:
--------------------------------------------------------------------------------
 1 | package zx.soft.similarity.word.hownet2;
 2 | 
 3 | import junit.framework.TestCase;
 4 | 
 5 | import org.junit.Before;
 6 | import org.junit.Test;
 7 | 
 8 | import zx.soft.similarity.word.hownet2.concept.LiuConceptParser;
 9 | import zx.soft.similarity.word.hownet2.concept.XiaConceptParser;
10 | 
11 | public class HownetSimilarityTest extends TestCase {
12 | 
13 | 	XiaConceptParser xParser = null;
14 | 	LiuConceptParser lParser = null;
15 | 
16 | 	@Override
17 | 	@Before
18 | 	public void setUp() {
19 | 		xParser = XiaConceptParser.getInstance();
20 | 		lParser = LiuConceptParser.getInstance();
21 | 	}
22 | 
23 | 	@Test
24 | 	public void testWordSimiarltiy() {
25 | 		String word1 = "电动车";
26 | 		String word2 = "自行车";
27 | 		double x_sim = xParser.getSimilarity(word1, word2);
28 | 		double l_sim = lParser.getSimilarity(word1, word2);
29 | 		assertTrue(x_sim > l_sim);
30 | 		assertTrue(x_sim > 0.2);
31 | 	}
32 | 
33 | 	/**
34 | 	 * 该词语计算相似度时出现死循环，bug由北京大学计算语言学研究所万富强提供，fqw0000@gmail.com
35 | 	 */
36 | 	@Test
37 | 	public void testWordSimiarltiy2() {
38 | 		String word1 = "算法";
39 | 		String word2 = "安提瓜和巴布达";
40 | 		double x_sim = xParser.getSimilarity(word1, word2);
41 | 		double l_sim = lParser.getSimilarity(word1, word2);
42 | 		assertTrue(x_sim >= l_sim);
43 | 		System.out.println("x_sim:" + x_sim);
44 | 		System.out.println("l_sim:" + l_sim);
45 | 	}
46 | 
47 | }
48 | 


--------------------------------------------------------------------------------
/src/test/resources/data/about.html:
--------------------------------------------------------------------------------
 1 | <html>
 2 | 	<head><title>xsimilarity</title></head>
 3 | 	<body>
 4 | 		<p>
 5 | 			<h2>XSimilarity</h2>
 6 | 		  项目地址：<a href="http://github.com/iamxiatian/xsimilarity/">http://github.com/iamxiatian/xsimilarity/</a>
 7 | 		</p>
 8 | 		<p>
 9 | 			有任何问题或建议请与我们联系，您的反馈将有助于该项目的进一步完善。
10 | 		</p>
11 | 		<p>
12 |       <h2>致谢</h2>
13 |             本项目在研究过程中，得到了恩师樊孝忠教授的悉心指导，师恩如海，难以言谢！<br/>
14 | 			中国人民大学为本项目的持续研究提供了资金和计算机软硬件的支持，北京理工大学为本项目的早期研究提供了重要的基础设施，<br/>
15 | 			这些支持与国家的投入密不可分，
16 | 			本项目的开源和不断完善也算是对国家的点滴回报！<br/>
17 | 			代码中许多算法的核心思想来源于我们的研究同行和先辈们的已公开成果，另外，许多使用xsimilarity的人员对xsimilarity<br/>
18 | 			提出了宝贵的建议，在此一并表示深深的谢意！     <br/>
19 |         本工程使用了如下开源组件，对原作者致以谢意！
20 |         <ul>
21 |             <li>ANSJ： </li>
22 |         </ul>
23 |     </p>
24 | 		<p>
25 | 			<h2>联系方式</h2>
26 | 			夏天<br/>
27 | 			数据工程与知识工程教育部重点实验室（中国人民大学）<br/>
28 |       中国人民大学信息资源管理学院<br/>
29 |       电话: 86-10-82500675<br/>
30 |       Email: xiat(at)ruc.edu.cn<br/>
31 | 		</p>
32 | 		
33 | 	</body>
34 | </html>


--------------------------------------------------------------------------------
/src/test/resources/data/cilin.db.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ml-distribution/semantic-similarity/8deb4382a41ad94a82db15ea945c6bfa8ac103f0/src/test/resources/data/cilin.db.gz


--------------------------------------------------------------------------------
/src/test/resources/data/concept.xml.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ml-distribution/semantic-similarity/8deb4382a41ad94a82db15ea945c6bfa8ac103f0/src/test/resources/data/concept.xml.gz


--------------------------------------------------------------------------------
/src/test/resources/data/sememe.xml.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ml-distribution/semantic-similarity/8deb4382a41ad94a82db15ea945c6bfa8ac103f0/src/test/resources/data/sememe.xml.gz


--------------------------------------------------------------------------------
/src/test/resources/logback-test.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | 
 3 | <configuration>
 4 | 
 5 | 	<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
 6 | 		<encoder>
 7 | 			<pattern>%d{MMdd.HHmmss.SSS} [%-20t] [%-5p] [%-20c] [L:%-3L] - %m%n</pattern>
 8 | 		</encoder>
 9 | 	</appender>
10 | 
11 | 	<logger name="zx.soft" level="DEBUG">
12 | 		<appender-ref ref="STDOUT" />
13 | 	</logger>
14 | 
15 | 	<root level="DEBUG">
16 | 		<appender-ref ref="STDOUT" />
17 | 	</root>
18 | 
19 | </configuration>


--------------------------------------------------------------------------------