├── HISTORY.md ├── README.md ├── REVISION.md ├── TODO.md ├── dict ├── sentiment │ ├── 主张词语(中文).txt │ ├── 主张词语(英文).txt │ ├── 正面情感词语(中文).txt │ ├── 正面情感词语(英文).txt │ ├── 正面评价词语(中文).txt │ ├── 正面评价词语(英文).txt │ ├── 程度级别词语(中文).txt │ ├── 程度级别词语(英文).txt │ ├── 统计结果.txt │ ├── 负面情感词语(中文).txt │ ├── 负面情感词语(英文).txt │ ├── 负面评价词语(中文).txt │ └── 负面评价词语(英文).txt ├── tendency │ └── tendency.xml └── user-concept.xml ├── docs ├── LCMC.zip └── 中文信息相似度计算理论与方法图书目录.pdf ├── pom.xml ├── src ├── main │ ├── java │ │ └── ruc │ │ │ └── irm │ │ │ ├── classification │ │ │ ├── Feature.java │ │ │ ├── Instance.java │ │ │ ├── NaiveBayesClassifier.java │ │ │ └── Variable.java │ │ │ ├── similarity │ │ │ ├── Similaritable.java │ │ │ ├── SimilarityFactory.java │ │ │ ├── phrase │ │ │ │ └── PhraseSimilarity.java │ │ │ ├── sentence │ │ │ │ ├── SegmentProxy.java │ │ │ │ ├── SentenceSimilarity.java │ │ │ │ ├── editdistance │ │ │ │ │ ├── Block.java │ │ │ │ │ ├── CharEditUnit.java │ │ │ │ │ ├── ChunkEditUnit.java │ │ │ │ │ ├── EditDistance.java │ │ │ │ │ ├── EditUnit.java │ │ │ │ │ ├── GregorEditDistance.java │ │ │ │ │ ├── Split.java │ │ │ │ │ ├── StandardEditDistance.java │ │ │ │ │ ├── SuperString.java │ │ │ │ │ ├── WordEditUnit.java │ │ │ │ │ ├── XiatianEditDistance.java │ │ │ │ │ └── XiatianEditDistance2.java │ │ │ │ └── morphology │ │ │ │ │ ├── MorphoSimilarity.java │ │ │ │ │ └── SemanticSimilarity.java │ │ │ ├── statistic │ │ │ │ ├── DictStatistic.java │ │ │ │ └── LCMC.java │ │ │ ├── text │ │ │ │ └── DiceSimilarity.java │ │ │ ├── util │ │ │ │ ├── About.java │ │ │ │ ├── BlankUtils.java │ │ │ │ ├── EditDistance.java │ │ │ │ ├── FileUtils.java │ │ │ │ ├── MathUtils.java │ │ │ │ ├── PinyinUtils.java │ │ │ │ ├── TraverseEvent.java │ │ │ │ ├── XmlException.java │ │ │ │ └── XmlUtils.java │ │ │ └── word │ │ │ │ ├── CharBasedSimilarity.java │ │ │ │ ├── WordSimilarity.java │ │ │ │ ├── cilin │ │ │ │ ├── Cilin.java │ │ │ │ ├── CilinCoding.java │ │ │ │ └── CilinDb.java │ │ │ │ ├── hownet │ │ │ │ ├── Hownet.java │ │ │ │ ├── 
HownetMeta.java │ │ │ │ ├── concept │ │ │ │ │ ├── Concept.java │ │ │ │ │ ├── ConceptDictTraverseEvent.java │ │ │ │ │ ├── ConceptLinkedList.java │ │ │ │ │ ├── ConceptParser.java │ │ │ │ │ ├── LiuConceptParser.java │ │ │ │ │ ├── MyConceptParser.java │ │ │ │ │ └── concept.dat │ │ │ │ └── sememe │ │ │ │ │ ├── FastSimpleMap.java │ │ │ │ │ ├── LiuqunSememeParser.java │ │ │ │ │ ├── MySememeParser.java │ │ │ │ │ ├── Sememe.java │ │ │ │ │ ├── SememeDictTraverseEvent.java │ │ │ │ │ ├── SememeParser.java │ │ │ │ │ ├── SememeType.java │ │ │ │ │ └── sememe.dat │ │ │ │ ├── hownet2 │ │ │ │ ├── concept │ │ │ │ │ ├── BaseConceptParser.java │ │ │ │ │ ├── Concept.java │ │ │ │ │ ├── ConceptDictTraverseEvent.java │ │ │ │ │ ├── ConceptLinkedList.java │ │ │ │ │ ├── LiuConceptParser.java │ │ │ │ │ └── XiaConceptParser.java │ │ │ │ └── sememe │ │ │ │ │ ├── BaseSememeParser.java │ │ │ │ │ ├── LiuqunSememeParser.java │ │ │ │ │ ├── Sememe.java │ │ │ │ │ ├── SememeType.java │ │ │ │ │ └── XiaSememeParser.java │ │ │ │ └── pinyin │ │ │ │ └── PinyinSimilarity.java │ │ │ ├── tendency │ │ │ └── word │ │ │ │ ├── HownetWordTendency.java │ │ │ │ ├── Training.java │ │ │ │ └── WordTendency.java │ │ │ └── ui │ │ │ ├── PhraseSimilarityUI.java │ │ │ ├── SememeTreeUI.java │ │ │ ├── SentenceSimilarityUI.java │ │ │ ├── Start.java │ │ │ ├── TendencyUI.java │ │ │ └── WordSimlarityUI.java │ └── resources │ │ ├── about.html │ │ ├── data │ │ ├── F02-GB2312-to-PuTongHua-PinYin.txt │ │ ├── cilin.db.gz │ │ ├── concept.xml.gz │ │ └── sememe.xml.gz │ │ ├── log4j.dtd │ │ └── log4j.xml └── test │ └── java │ └── ruc │ └── irm │ └── similarity │ ├── sentence │ ├── MorphoSimilarityTest.java │ └── SemanticSimilarityTest.java │ ├── statistic │ └── DictStatisticTest.java │ └── word │ ├── CharBasedSimilarityTest.java │ ├── hownet │ ├── ConceptTest.java │ └── SememeTest.java │ └── hownet2 │ └── HownetSimilarityTest.java └── 中文信息相似度计算理论与方法图书目录.pdf /HISTORY.md: 
-------------------------------------------------------------------------------- 1 | 变更历史 2 | ================ 3 | 4 | 2014-04: 把中文分词由原先的ictclas4j替换为ansj,在此对原作者表示感谢!把工程更改为maven工程,方便管理。 5 | 2014-08: 修正了SemanticSimilarity中的数组循环错误 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 说明 3 | ===================== 4 | 汉语词语、组块、句子以及文本篇章等各个层面的相似度计算是中文信息处理领域的一项基础而又核心的工作,它直接决定着相关领域的研究发展状况,例如,在知识工程、基于实例的机器翻译、信息检索、自动问答以及拼写检查等方面,相似度计算都是一个非常关键的问题,长期以来一直是人们研究的一个热点和难点。相似度的研究涉及词语、组块、句子以及篇章等多个层面,目前的研究主要侧重于词语方面,提出了一些比较有代表性的理论与方法,如字面相似度算法、词素相似度算法,以及基于同义词词林、知网等语义词典的方法,国外的方法则主要包括基于构成字符的相似度计算方法、基于WORDNET的计算方法、基于词典注释的方法、基于大规模语料库统计的方法和基于搜索引擎的方法;有关组块、短语级别的相似度的研究现在还比较少,常用的方法是在词语相似度计算的基础上,借用句子相似度的计算方法计算组块之间的相似度。在句子层面的相似度计算方面,国外研究主要集中在字符串的相似度计算,国内则主要以词语为基本处理单元,通过计算相同词语所占的比重确定句子之间的相似度;文本层面的则集中于利用统计方法实现相似度计算。 5 | 6 | xsimilarity项目为我们在相似度计算领域所取得的部分成果的Java代码实现,部分凌乱的代码已被去除,待重构之后再加入到工程之中。在相似度计算的研究过程中,许多研究学者的成果公布和无私帮助让我们受益匪浅,我们把代码开源出来,既是对前辈们表达我们的尊重之情,也希望能对大家共同的研究社区有点滴贡献,避免一些重复工作。 7 | 8 | xsimilarity项目中所体现的思想或许还比较幼稚,希望高手们能用宽容的胸襟对待,并不吝赐教,我们也将根据研究进展情况和大家的实际需求,不断改进,同时也欢迎大家加入到这个项目的开发过程中来,共同推进相似度计算在中国的研究。 9 | 10 | xsimilarity项目中的理论知识大家可以参考docs目录下的文章,以及《中文信息相似度计算理论与方法》一书,重要的参考资料、程序资源在书中已经提到,如有需要,我们在今后将单独整理成列表,供大家参考。 11 | 12 | 大家可以通过Eclipse导入项目,并运行ruc.irm.ui.Start进行快速测试。 13 | 14 | 联系方式:xiat(at)ruc.edu.cn 15 | 16 | 17 | 编译运行 18 | ======================= 19 | 首先确保系统中安装maven. 
20 | 21 | 如果要生成Intellij IDEA的工程文件,请进入命令行,在项目主目录下执行: 22 | 23 | ```mvn idea:idea``` 24 | 25 | 如要生成eclipse的工程文件,则执行: 26 | 27 | ```mvn eclipse:eclipse``` 28 | 29 | 要编译代码并在命令行运行测试: 30 | 31 | ```mvn compile``` 32 | 33 | ```mvn dependency:copy-dependencies``` 34 | 35 | ```./run.py Start``` 36 | 37 | 即可打开主界面,进行测试 38 | 39 | (注:开发测试所用的操作系统为Ubuntu,如为Windows,请自行修改run.py脚本) 40 | 41 | 42 | 43 | 设想 44 | ======================== 45 | 尝试把潜在和显性语义分析技术加入到xsimilarity中,并简化使用方式,方便初学者使用,但因个人精力受限,目前尚未开始集成处理。 46 | 47 | 定个时间点:如果star数量超过500,再开始更新并把最近几年的相关研究成果集成进去。 48 | 49 | 50 | 欢迎有兴趣的人员与我联系,一起扩展xsimilarity的功能和实用性。 51 | 52 | 53 | 54 | 致谢 55 | ======================== 56 | ansj中文分词 57 | 58 | -------------------------------------------------------------------------------- /REVISION.md: -------------------------------------------------------------------------------- 1 | 错误修订 2 | ===================== 3 | 4 | 1. 第三章概念词语的相似度计算部分的公式: 5 | $Sim(C_1, C_2) = \beta_1 Sim_1(C_1, C_2) + \sum_{i=2}^{4} \beta_1 \beta_i Sim_i(C_1, C_2)$ 6 | 应为: $Sim(C_1, C_2) = \beta_1 Sim_1(C_1, C_2) + \sum_{i=2}^{4} Sim_1(C_1, C_2) \beta_i Sim_i(C_1, C_2)$ 7 | 可参考以下代码实现: 8 | @Override 9 | protected double calculate(double sim_v1, double sim_v2, double sim_v3, double sim_v4) { 10 | return beta1 * sim_v1 + beta2 * sim_v1 * sim_v2 + beta3 * sim_v1 * sim_v3 + beta4 * sim_v1 * sim_v4; 11 | } 12 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | PLAN 2 | ============================= 3 | 4 | * 加入ESA和LSA处理,这两部分已经单独实现,但都比较复杂,如有精力和时间,考虑把ESA的某个快照结果打包,加入xsimilarity -------------------------------------------------------------------------------- /dict/sentiment/主张词语(中文).txt: -------------------------------------------------------------------------------- 1 | 中文主张词语 38 2 | 3 | 1. 
{perception|感知} 22 4 | 察觉 5 | 触目 6 | 耳闻 7 | 发 8 | 发觉 9 | 发现 10 | 风闻 11 | 感 12 | 感觉 13 | 感觉到 14 | 感受到 15 | 见到 16 | 见得 17 | 觉 18 | 觉得 19 | 看得出来 20 | 窥见 21 | 领教 22 | 听说 23 | 痛感 24 | 预感 25 | 自觉 26 | 27 | 2. {regard|认为} 16 28 | 抱定 29 | 当 30 | 道 31 | 感到 32 | 感觉 33 | 觉得 34 | 看 35 | 看待 36 | 论 37 | 认定 38 | 认为 39 | 认准 40 | 想 41 | 相信 42 | 以为 43 | 主张 44 | -------------------------------------------------------------------------------- /dict/sentiment/主张词语(英文).txt: -------------------------------------------------------------------------------- 1 | 英文主张词语 35 2 | 3 | 1. {perception|感知} 21 4 | be aware of 5 | be conscious 6 | be conscious of 7 | be told 8 | become aware of 9 | detect 10 | discern 11 | discover 12 | feel 13 | find 14 | get a glimpse of 15 | get wind of 16 | have a premonition 17 | hear of 18 | keenly feel 19 | learn through hearsay 20 | meet the eye 21 | notice 22 | perceive 23 | see 24 | sense 25 | 26 | {regard|认为} 14 27 | advocate 28 | believe 29 | consider 30 | feel 31 | firmly believe 32 | hold 33 | look upon 34 | maintain 35 | regard 36 | sense 37 | set one's mind on 38 | stand for 39 | suppose 40 | think -------------------------------------------------------------------------------- /dict/sentiment/程度级别词语(中文).txt: -------------------------------------------------------------------------------- 1 | 中文程度级别词语 219 2 | 3 | 1. “极其|extreme / 最|most” 69 4 | 百分之百 5 | 倍加 6 | 备至 7 | 不得了 8 | 不堪 9 | 不可开交 10 | 不亦乐乎 11 | 不折不扣 12 | 彻头彻尾 13 | 充分 14 | 到头 15 | 地地道道 16 | 非常 17 | 极 18 | 极度 19 | 极端 20 | 极其 21 | 极为 22 | 截然 23 | 尽 24 | 惊人地 25 | 绝 26 | 绝顶 27 | 绝对 28 | 绝对化 29 | 刻骨 30 | 酷 31 | 满 32 | 满贯 33 | 满心 34 | 莫大 35 | 奇 36 | 入骨 37 | 甚为 38 | 十二分 39 | 十分 40 | 十足 41 | 死 42 | 滔天 43 | 痛 44 | 透 45 | 完全 46 | 完完全全 47 | 万 48 | 万般 49 | 万分 50 | 万万 51 | 无比 52 | 无度 53 | 无可估量 54 | 无以复加 55 | 无以伦比 56 | 要命 57 | 要死 58 | 已极 59 | 已甚 60 | 异常 61 | 逾常 62 | 贼 63 | 之极 64 | 之至 65 | 至极 66 | 卓绝 67 | 最为 68 | 佼佼 69 | 郅 70 | 綦 71 | 齁 72 | 最 73 | 74 | 2. 
“很|very” 42 75 | 不过 76 | 不少 77 | 不胜 78 | 惨 79 | 沉 80 | 沉沉 81 | 出奇 82 | 大为 83 | 多 84 | 多多 85 | 多加 86 | 多么 87 | 分外 88 | 格外 89 | 够瞧的 90 | 够戗 91 | 好 92 | 好不 93 | 何等 94 | 很 95 | 很是 96 | 坏 97 | 可 98 | 老 99 | 老大 100 | 良 101 | 颇 102 | 颇为 103 | 甚 104 | 实在 105 | 太 106 | 太甚 107 | 特 108 | 特别 109 | 尤 110 | 尤其 111 | 尤为 112 | 尤以 113 | 远 114 | 着实 115 | 曷 116 | 碜 117 | 118 | 3. “较|more” 37 119 | 大不了 120 | 多 121 | 更 122 | 更加 123 | 更进一步 124 | 更为 125 | 还 126 | 还要 127 | 较 128 | 较比 129 | 较为 130 | 进一步 131 | 那般 132 | 那么 133 | 那样 134 | 强 135 | 如斯 136 | 益 137 | 益发 138 | 尤甚 139 | 逾 140 | 愈 141 | 愈 ... 愈 142 | 愈发 143 | 愈加 144 | 愈来愈 145 | 愈益 146 | 远远 147 | 越 ... 越 148 | 越发 149 | 越加 150 | 越来越 151 | 越是 152 | 这般 153 | 这样 154 | 足 155 | 足足 156 | 157 | 4. “稍|-ish” 29 158 | 点点滴滴 159 | 多多少少 160 | 怪 161 | 好生 162 | 还 163 | 或多或少 164 | 略 165 | 略加 166 | 略略 167 | 略微 168 | 略为 169 | 蛮 170 | 稍 171 | 稍稍 172 | 稍微 173 | 稍为 174 | 稍许 175 | 挺 176 | 未免 177 | 相当 178 | 些 179 | 些微 180 | 些小 181 | 一点 182 | 一点儿 183 | 一些 184 | 有点 185 | 有点儿 186 | 有些 187 | 188 | 5. “欠|insufficiently” 12 189 | 半点 190 | 不大 191 | 不丁点儿 192 | 不甚 193 | 不怎么 194 | 聊 195 | 没怎么 196 | 轻度 197 | 弱 198 | 丝毫 199 | 微 200 | 相对 201 | 202 | 6. “超|over” 30 203 | 不为过 204 | 超 205 | 超额 206 | 超外差 207 | 超微结构 208 | 超物质 209 | 出头 210 | 多 211 | 浮 212 | 过 213 | 过度 214 | 过分 215 | 过火 216 | 过劲 217 | 过了头 218 | 过猛 219 | 过热 220 | 过甚 221 | 过头 222 | 过于 223 | 过逾 224 | 何止 225 | 何啻 226 | 开外 227 | 苦 228 | 老 229 | 偏 230 | 强 231 | 溢 232 | 忒 233 | 234 | 235 | 236 | -------------------------------------------------------------------------------- /dict/sentiment/程度级别词语(英文).txt: -------------------------------------------------------------------------------- 1 | 英文程度级别词语 170 2 | 3 | 1. 
“极其|extreme / 最|most” 64 4 | 100 percent 5 | absolute 6 | absolutely 7 | alarmingly 8 | amazingly 9 | as fully as possible 10 | astonishingly 11 | awfully 12 | beyond challenge 13 | beyond compare 14 | beyond comparison 15 | beyond measure 16 | bitterly 17 | by all means 18 | completely 19 | deep-rooted 20 | deep-seated 21 | deeply 22 | definitely 23 | disastrously 24 | downright 25 | entirely 26 | exceedingly 27 | excessively 28 | extreme 29 | extremely 30 | fully 31 | greatest 32 | greatly 33 | heinous 34 | hundred-percent 35 | immensely 36 | immoderate 37 | in a penetrating way 38 | in every possible way 39 | in the extreme 40 | incomparably 41 | ingrained 42 | matchlessly 43 | monstrous 44 | most 45 | of the highest degree 46 | out-and-out 47 | outstanding 48 | outstandingly 49 | reach the limit 50 | right-down 51 | sharply 52 | sheer 53 | superb 54 | terribly 55 | to death 56 | to the full 57 | to the letter 58 | to the limit 59 | to the marrow 60 | to the utmost 61 | totally 62 | towering 63 | unusually 64 | utmost 65 | utterly 66 | very much 67 | most 68 | 69 | 2. “很|very” 25 70 | a lot 71 | awfully 72 | badly 73 | better 74 | by far 75 | considerably 76 | deep 77 | disastrously 78 | especially 79 | extraordinarily 80 | extremely 81 | greatly 82 | how 83 | however 84 | indeed 85 | much 86 | particularly 87 | really 88 | terribly 89 | to a serious degree 90 | too far 91 | too much 92 | unusually 93 | very 94 | what a 95 | 96 | 3. “较|more” 22 97 | all the more 98 | as much as 99 | at the worst 100 | by far 101 | comparatively 102 | even more 103 | further 104 | further more 105 | in that way 106 | increasingly 107 | like that 108 | more 109 | more and more 110 | more so 111 | much more 112 | plus 113 | relatively 114 | slightly more 115 | so 116 | still more 117 | such 118 | the more ... the more 119 | 120 | 4. 
“稍|-ish” 15 121 | a bit 122 | a bit too 123 | a little 124 | a little bit 125 | a little more 126 | fairly 127 | more or less 128 | passably 129 | pretty 130 | quite 131 | rather 132 | slightly 133 | some 134 | somewhat 135 | to some extent 136 | 137 | 5. “欠|insufficiently” 11 138 | a little less 139 | just 140 | light 141 | merely 142 | not particularly 143 | not too 144 | not very 145 | relative 146 | slight 147 | slightest degree of 148 | slightly 149 | 150 | 6. “超|over” 33 151 | a little over 152 | above 153 | above measure 154 | above quota 155 | and more 156 | excessive 157 | excessively 158 | exorbitance 159 | extra 160 | far more than 161 | hyperphysical 162 | inflated 163 | inordinate 164 | not too much 165 | odd 166 | outrageousness 167 | over 168 | over- 169 | overdone 170 | overheated 171 | plus 172 | slightly more 173 | super 174 | superheated 175 | superheterodyne 176 | surplus 177 | to a fault 178 | too 179 | too much 180 | ultra 181 | ultrastructural 182 | undue 183 | unduly 184 | 185 | 186 | -------------------------------------------------------------------------------- /dict/tendency/tendency.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /dict/user-concept.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /docs/LCMC.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamxiatian/xsimilarity/812ec85d3341731e615c3cd91af17c1df4840853/docs/LCMC.zip -------------------------------------------------------------------------------- /docs/中文信息相似度计算理论与方法图书目录.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/iamxiatian/xsimilarity/812ec85d3341731e615c3cd91af17c1df4840853/docs/中文信息相似度计算理论与方法图书目录.pdf -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | ruc.irm 5 | xsimilarity 6 | jar 7 | xsimilarity 8 | 0.1 9 | xsimilarity 10 | https://github.com/iamxiatian/xsimilarity 11 | 12 | 13 | The Apache Software License, Version 2.0 14 | http://www.apache.org/licenses/LICENSE-2.0.txt 15 | repo 16 | 17 | 18 | 19 | 20 | 21 | 22 | cengtral 23 | http://repo1.maven.org/maven2/ 24 | 25 | 26 | 27 | 28 | 29 | summer 30 | summer 31 | xiat(at)ruc.edu.cn 32 | 33 | 34 | 35 | 36 | 1.8 37 | 1.8 38 | UTF-8 39 | 3.3.1 40 | 1.7.1 41 | 1.2.3 42 | 43 | 44 | 45 | 46 | org.slf4j 47 | slf4j-api 48 | ${slf4j.version} 49 | 50 | 51 | 52 | ch.qos.logback 53 | logback-core 54 | ${logback.version} 55 | 56 | 57 | 58 | ch.qos.logback 59 | logback-classic 60 | ${logback.version} 61 | 62 | 63 | 64 | org.apache.commons 65 | commons-lang3 66 | ${commons.lang3.version} 67 | 68 | 69 | 70 | 71 | com.google.guava 72 | guava 73 | 23.5-jre 74 | 75 | 76 | 77 | org.ansj 78 | ansj_seg 79 | 5.1.1 80 | 81 | 82 | 83 | junit 84 | junit 85 | 4.12 86 | test 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/classification/Feature.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.classification; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import java.util.HashMap; 7 | import java.util.Map; 8 | 9 | /** 10 | * 文档的特征 11 | * 12 | * @author xiatian 13 | * 14 | */ 15 | public class Feature { 16 | /** 每个关键词在不同类别中出现的文档数量 */ 17 | private Map docCountMap = new HashMap(); 18 | /** 特征名称 */ 19 | private String name; 20 | 21 | public String getName() { 22 | return name; 23 | } 24 | 
public void setName(String name) { 25 | this.name = name; 26 | } 27 | public void incDocCount(String category){ 28 | if(docCountMap.containsKey(category)){ 29 | docCountMap.put(category, docCountMap.get(category)+1); 30 | }else{ 31 | docCountMap.put(category, 1); 32 | } 33 | } 34 | public int getDocCount(String category){ 35 | if(docCountMap.containsKey(category)){ 36 | return docCountMap.get(category); 37 | }else{ 38 | return 0; 39 | } 40 | } 41 | 42 | public void write(DataOutput out) throws IOException{ 43 | out.writeUTF(name==null?"":name); 44 | 45 | out.writeInt(docCountMap.size()); 46 | for(String category:docCountMap.keySet()){ 47 | out.writeUTF(category); 48 | out.writeInt(docCountMap.get(category)); 49 | } 50 | } 51 | 52 | public void readFields(DataInput in) throws IOException { 53 | this.name = in.readUTF(); 54 | 55 | docCountMap = new HashMap(); 56 | int size = in.readInt(); 57 | for(int i=0; i bag = new HashSet(); 26 | 27 | public Instance() { 28 | } 29 | 30 | public Instance(String category, File f, String encoding) { 31 | this.category = category; 32 | String line = null; 33 | 34 | try { 35 | BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(f), encoding)); 36 | 37 | while ((line = in.readLine()) != null) { 38 | System.out.println(line); 39 | List words = SegmentProxy.segment(line); 40 | for(Word w:words) { 41 | if (w.getPos().endsWith("adj") 42 | || w.getPos().startsWith("n") 43 | || w.getPos().startsWith("v")) { 44 | bag.add(w.getWord()); 45 | } 46 | } 47 | } 48 | } catch (IOException e) { 49 | System.out.println("current file:" + f.getAbsolutePath()); 50 | System.out.println("current line:" + line); 51 | e.printStackTrace(); 52 | } 53 | } 54 | 55 | public String getCategory() { 56 | return category; 57 | } 58 | 59 | public void setCategory(String category) { 60 | this.category = category; 61 | } 62 | 63 | public Set getWords() { 64 | return bag; 65 | } 66 | 67 | } 68 | 
-------------------------------------------------------------------------------- /src/main/java/ruc/irm/classification/NaiveBayesClassifier.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.classification; 2 | 3 | import java.io.DataInputStream; 4 | import java.io.DataOutput; 5 | import java.io.DataOutputStream; 6 | import java.io.File; 7 | import java.io.FileInputStream; 8 | import java.io.FileOutputStream; 9 | import java.io.IOException; 10 | import java.util.Collection; 11 | import java.util.HashMap; 12 | import java.util.Map; 13 | 14 | public class NaiveBayesClassifier { 15 | /** 16 | * 记录每个类别下出现的文档数量, 用于计算P(C)使用 17 | */ 18 | Variable VARIABLE = new Variable(); 19 | 20 | /** 21 | * 词语在所有类别中的总数量 22 | */ 23 | Map TERM_TOTAL_COUNT = new HashMap(); 24 | 25 | /** 26 | * 训练一篇文档 27 | * @param doc 28 | */ 29 | public void training(Instance doc) { 30 | VARIABLE.addInstance(doc); 31 | } 32 | 33 | /** 34 | * 保存训练结果 35 | * @throws IOException 36 | */ 37 | void save(File file) throws IOException{ 38 | DataOutput out = new DataOutputStream(new FileOutputStream(file)); 39 | VARIABLE.write(out); 40 | } 41 | 42 | public void load(File file) throws IOException{ 43 | DataInputStream in = new DataInputStream(new FileInputStream(file)); 44 | VARIABLE = Variable.read(in); 45 | } 46 | 47 | /** 48 | * 计算P(C) 49 | * @param category 50 | * @return 51 | */ 52 | public double getCategoryProbability(String category){ 53 | return Math.log(VARIABLE.getDocCount(category)*1.0f/VARIABLE.getDocCount()); 54 | } 55 | 56 | /** 57 | * 计算P(feature|cateogry),返回的是取对数后的数值 58 | * @param feature 59 | * @param category 60 | * @return 61 | */ 62 | public double getFeatureProbability(String feature, String category){ 63 | int m = VARIABLE.getFeatureCount(); 64 | return Math.log((VARIABLE.getDocCount(feature, category)+1.0)/(VARIABLE.getDocCount(category)+m)); 65 | } 66 | 67 | /** 68 | * 计算给定实例文档属于指定类别的概率,返回的是取对数后的数值 69 | * @param category 70 | * @param 
doc 71 | * @return 72 | */ 73 | public double getProbability(String category, Instance doc) { 74 | double result = getCategoryProbability(category); 75 | for(String feature:doc.getWords()){ 76 | if(VARIABLE.containFeature(feature)){ 77 | result += getFeatureProbability(feature, category); 78 | } 79 | } 80 | return result; 81 | } 82 | 83 | public String getCategory(Instance doc){ 84 | Collection categories = VARIABLE.getCategories(); 85 | double best = Double.NEGATIVE_INFINITY; 86 | String bestName = null; 87 | for(String c:categories){ 88 | double current = getProbability(c, doc); 89 | // System.out.println(c + ":" + current); 90 | if(best categoryMap = new HashMap(); 19 | 20 | Map features = new HashMap(); 21 | 22 | /** 所有文档的数量 */ 23 | private int docCount = 0; 24 | 25 | public void write(DataOutput out) throws IOException{ 26 | //保存文档总数 27 | out.writeInt(docCount); 28 | 29 | //写入类别总数 30 | out.writeInt(categoryMap.size()); 31 | for(String category:categoryMap.keySet()){ 32 | out.writeUTF(category); 33 | categoryMap.get(category).write(out); 34 | } 35 | 36 | //写入Feature总数 37 | out.writeInt(features.size()); 38 | for(String key:features.keySet()){ 39 | out.writeUTF(key); 40 | features.get(key).write(out); 41 | } 42 | } 43 | 44 | public void readFields(DataInput in) throws IOException { 45 | this.docCount = in.readInt(); 46 | 47 | int size = in.readInt(); 48 | categoryMap = new HashMap(); 49 | for(int i=0; i(); 57 | for(int i=0; i getCategories(){ 71 | return categoryMap.keySet(); 72 | } 73 | 74 | public int getFeatureCount(){ 75 | return features.size(); 76 | } 77 | 78 | public boolean containFeature(String feature){ 79 | return features.containsKey(feature); 80 | } 81 | 82 | public void incDocCount(){ 83 | this.docCount++; 84 | } 85 | 86 | public int getDocCount(){ 87 | return this.docCount; 88 | } 89 | 90 | /** 91 | * 获取置顶类别下的文档数量 92 | * @param category 93 | * @return 94 | */ 95 | public int getDocCount(String category){ 96 | return 
categoryMap.get(category).getDocCount(); 97 | } 98 | 99 | /** 100 | * 获取feature在指定类别下的文档出现数量 101 | * @param feature 102 | * @param category 103 | * @return 104 | */ 105 | public int getDocCount(String feature, String category){ 106 | Feature f = features.get(feature); 107 | if(f!=null){ 108 | return f.getDocCount(category); 109 | } 110 | return 0; 111 | } 112 | 113 | public void addInstance(Instance instance){ 114 | incDocCount(); 115 | CategoryInfo info = null; 116 | if(categoryMap.containsKey(instance.getCategory())){ 117 | info = categoryMap.get(instance.getCategory()); 118 | }else{ 119 | info = new CategoryInfo(); 120 | } 121 | info.incDocCount(); 122 | categoryMap.put(instance.getCategory(), info); 123 | 124 | for(String word:instance.getWords()){ 125 | Feature feature = features.get(word); 126 | 127 | if(feature==null) feature = new Feature(); 128 | 129 | feature.setName(word); 130 | feature.incDocCount(instance.getCategory()); 131 | 132 | features.put(word, feature); 133 | } 134 | } 135 | 136 | public static class CategoryInfo { 137 | private int docCount; 138 | 139 | public int getDocCount() { 140 | return docCount; 141 | } 142 | public void incDocCount(){ 143 | this.docCount++; 144 | } 145 | public void setDocCount(int docCount) { 146 | this.docCount = docCount; 147 | } 148 | 149 | public void write(DataOutput out) throws IOException{ 150 | out.writeInt(docCount); 151 | } 152 | 153 | public void readFields(DataInput in) throws IOException { 154 | this.docCount = in.readInt(); 155 | } 156 | 157 | public static CategoryInfo read(DataInput in) throws IOException{ 158 | CategoryInfo c = new CategoryInfo(); 159 | c.readFields(in); 160 | return c; 161 | } 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/Similaritable.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity; 2 | 3 | /** 4 | * 可以计算相似度的接口 5 | * 6 | * 
@author 夏天 7 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 8 | */ 9 | public interface Similaritable { 10 | /** 11 | * 计算两个字符串的相似度,对于句子来说,计算的是句子相似度,对于词语则计算词语的相似度 12 | * @param item1 参与相似度计算的第一个字符串 13 | * @param item2 参与相似度计算的第二个字符串 14 | * @return 15 | */ 16 | public double getSimilarity(String item1, String item2); 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/SimilarityFactory.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity; 2 | 3 | import ruc.irm.similarity.sentence.SentenceSimilarity; 4 | import ruc.irm.similarity.sentence.morphology.MorphoSimilarity; 5 | import ruc.irm.similarity.word.WordSimilarity; 6 | import ruc.irm.similarity.word.hownet2.concept.XiaConceptParser; 7 | 8 | public class SimilarityFactory { 9 | private static WordSimilarity wordSimilarity = XiaConceptParser.getInstance(); 10 | private static SentenceSimilarity sentenceSimilarity = MorphoSimilarity.getInstance(); 11 | 12 | private SimilarityFactory(){} 13 | 14 | public static WordSimilarity getWordSimilarity(){ 15 | return wordSimilarity; 16 | } 17 | 18 | public static SentenceSimilarity getSentenceSimilarity(){ 19 | return sentenceSimilarity; 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/phrase/PhraseSimilarity.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.phrase; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import ruc.irm.similarity.Similaritable; 7 | 8 | /** 9 | * 一种简单的短语相似度计算方法,算法原理请参考《中文信息相似度计算理论与方法》一书P69. 
10 | * 11 | * @author 夏天 12 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 13 | */ 14 | public class PhraseSimilarity implements Similaritable { 15 | 16 | @Override 17 | public double getSimilarity(String item1, String item2) { 18 | return (getSC(item1, item2) + getSC(item2, item1)) / 2.0; 19 | } 20 | 21 | public List getC(String first, String second, int pos) { 22 | List results = new ArrayList(); 23 | char ch = first.charAt(pos); 24 | for (int i = 0; i < second.length(); i++) { 25 | if (ch == second.charAt(i)) { 26 | results.add(i); 27 | } 28 | } 29 | return results; 30 | } 31 | 32 | public int getDistance(String first, String second, int pos) { 33 | int d = second.length(); 34 | for (int k : getC(first, second, pos)) { 35 | int value = Math.abs(k - pos); 36 | if (d > value) { 37 | d = value; 38 | } 39 | } 40 | 41 | return d; 42 | } 43 | 44 | public double getCC(String first, String second, int pos) { 45 | return (second.length() - getDistance(first, second, pos)) * 1.0 / second.length(); 46 | } 47 | 48 | public double getSC(String first, String second) { 49 | double total = 0.0; 50 | for (int i = 0; i < first.length(); i++) { 51 | total = total + getCC(first, second, i); 52 | } 53 | return total / first.length(); 54 | } 55 | 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/sentence/SegmentProxy.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.sentence; 2 | 3 | import java.awt.BorderLayout; 4 | import java.awt.GridLayout; 5 | import java.awt.event.ActionEvent; 6 | import java.awt.event.ActionListener; 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | 10 | import javax.swing.BorderFactory; 11 | import javax.swing.JButton; 12 | import javax.swing.JLabel; 13 | import javax.swing.JPanel; 14 | import javax.swing.JScrollPane; 15 | import javax.swing.JTextArea; 16 | import javax.swing.JTextField; 17 | 18 | import 
org.ansj.domain.Result; 19 | import org.ansj.domain.Term; 20 | import org.ansj.splitWord.analysis.ToAnalysis; 21 | 22 | /** 23 | * 对词法分析程序的封装代理,目前内部封装了对Ictclas4j(夏天改进版)的调用
24 | * 为方便演示程序快速启动,对Segment的调用采用了单例模式,实现需要时的延迟加载。 25 | * 26 | * @CHANGE 2014/04/04 采用Ansj词法分析器取代Ictclas4j-summer version 27 | * 28 | * @author 夏天 29 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 30 | */ 31 | public class SegmentProxy { 32 | 33 | public static class Word { 34 | /** 35 | * 词语内容 36 | */ 37 | private String word; 38 | /** 39 | * 词语词性代号 40 | */ 41 | private String pos; 42 | 43 | public Word(String word, String pos) { 44 | this.word = word; 45 | this.pos = pos; 46 | } 47 | 48 | public String getWord() { 49 | return word; 50 | } 51 | 52 | public void setWord(String word) { 53 | this.word = word; 54 | } 55 | 56 | public String getPos() { 57 | return pos; 58 | } 59 | 60 | public void setPos(String pos) { 61 | this.pos = pos; 62 | } 63 | } 64 | 65 | public static List segment(String sentence) { 66 | List results = new ArrayList(); 67 | Result terms = ToAnalysis.parse(sentence); 68 | 69 | for (Term term : terms) { 70 | results.add(new Word(term.getName(), term.natrue().natureStr)); 71 | } 72 | 73 | return results; 74 | } 75 | 76 | public static String getSegmentedString(String sentence) { 77 | List words = segment(sentence); 78 | StringBuilder sb = new StringBuilder(); 79 | for (Word word : words) { 80 | sb.append(word.getWord() + "/" + word.getPos()).append(" "); 81 | } 82 | return sb.toString(); 83 | } 84 | 85 | public static JPanel createPanel() { 86 | //声明总的大面板, fullPanel包括一个NorthPanel和一个centerPanel 87 | JPanel fullPanel = new JPanel(); 88 | fullPanel.setLayout(new BorderLayout()); 89 | 90 | JPanel northPanel = new JPanel(); 91 | fullPanel.add(northPanel, "North"); 92 | 93 | //centerPanel包括了一个文本框 94 | JPanel centerPanel = new JPanel(); 95 | fullPanel.add(centerPanel, "Center"); 96 | centerPanel.setLayout(new BorderLayout()); 97 | final JTextArea result = new JTextArea(); 98 | //result.setFont(new Font("宋体", Font.PLAIN, 16)); 99 | result.setLineWrap(true); 100 | JScrollPane centerScrollPane = new JScrollPane(result); 101 | centerPanel.add(centerScrollPane, 
"Center"); 102 | 103 | northPanel.setLayout(new GridLayout(1, 1)); 104 | 105 | //以下加入northPanel中的第一个面板 106 | final JTextField senField = new JTextField("什么是计算机病毒"); 107 | senField.setColumns(50); 108 | 109 | JPanel mainPanel = new JPanel(); 110 | mainPanel.setLayout(new GridLayout(2, 1)); 111 | 112 | JPanel linePanel = new JPanel(); 113 | linePanel.add(new JLabel("句子:")); 114 | linePanel.add(senField); 115 | mainPanel.add(linePanel); 116 | 117 | linePanel = new JPanel(); 118 | JButton goButton = new JButton("词法分析"); 119 | linePanel.add(goButton); 120 | mainPanel.add(linePanel); 121 | goButton.addActionListener(new ActionListener() { 122 | 123 | @Override 124 | public void actionPerformed(ActionEvent e) { 125 | String sentence = senField.getText(); 126 | String text = "[" + sentence + "]的词法分析结果为:"; 127 | 128 | text = text + "\n" + getSegmentedString(sentence); 129 | text = text + "\n________________________________\n" + result.getText(); 130 | result.setText(text); 131 | } 132 | 133 | }); 134 | mainPanel.setBorder(BorderFactory.createEtchedBorder()); 135 | northPanel.add(mainPanel); 136 | 137 | return fullPanel; 138 | } 139 | } 140 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/sentence/SentenceSimilarity.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.sentence; 2 | 3 | import ruc.irm.similarity.Similaritable; 4 | 5 | public interface SentenceSimilarity extends Similaritable { 6 | 7 | } 8 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/sentence/editdistance/Block.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.sentence.editdistance; 2 | 3 | 4 | 5 | public class Block { 6 | 7 | private int globalPosition; 8 | /** 块的内容 */ 9 | private SuperString data; 10 | /** 前后指针 */ 11 | private Block prev, 
next; 12 | /** 是否已经进行划分 */ 13 | private boolean divideFlag = false; 14 | 15 | public Block(SuperString string){ 16 | this.data = string; 17 | this.globalPosition = 0; 18 | } 19 | 20 | public Block(SuperString string, int globalBegin){ 21 | this.data = string; 22 | this.globalPosition = globalBegin; 23 | } 24 | 25 | public int getGlobalPosition() { 26 | return globalPosition; 27 | } 28 | 29 | public void setGlobalPosition(int globalPosition) { 30 | this.globalPosition = globalPosition; 31 | } 32 | 33 | public SuperString getData() { 34 | return data; 35 | } 36 | 37 | public void setData(SuperString data) { 38 | this.data = data; 39 | } 40 | 41 | public Block getPrev() { 42 | return prev; 43 | } 44 | 45 | public void setPrev(Block prev) { 46 | this.prev = prev; 47 | } 48 | 49 | public Block getNext() { 50 | return next; 51 | } 52 | 53 | public void setNext(Block next) { 54 | this.next = next; 55 | } 56 | 57 | public boolean isDivideFlag() { 58 | return divideFlag; 59 | } 60 | 61 | public void setDivideFlag(boolean divideFlag) { 62 | this.divideFlag = divideFlag; 63 | } 64 | 65 | public void divide(int start, int length){ 66 | if(start==0 && length==data.length()){ 67 | this.divideFlag = true; 68 | return; 69 | }else if(start==0){ 70 | //前面为已经分割的标记,后面应该为未分割的标记 71 | Block tail = new Block(data.substring(length), globalPosition + start); 72 | this.setDivideFlag(true); 73 | this.setData(data.substring(0, length)); 74 | tail.next = this.next; 75 | if(tail.next!=null) tail.next.prev = tail; 76 | this.next = tail; 77 | tail.prev = this; 78 | }else if(start+length == data.length()){ 79 | //后面为已经分割的标记,前面应该为未分割的标记 80 | Block head = new Block(data.substring(0, start), globalPosition); 81 | 82 | this.setDivideFlag(true); 83 | this.setData(data.substring(start)); 84 | 85 | head.prev = this.prev; 86 | if(head.prev!=null) head.prev.next = head; 87 | head.next = this; 88 | this.prev = head; 89 | }else{ 90 | //中间为已经分割的标记,前面和后面应该为未分割的标记 91 | Block head = new Block(data.substring(0, 
start), globalPosition); 92 | Block tail = new Block(data.substring(start+length), globalPosition + start+length); 93 | 94 | this.setDivideFlag(true); 95 | this.setData(data.substring(start, start+length)); 96 | this.setGlobalPosition(globalPosition + start); 97 | 98 | head.prev = this.prev; 99 | if(head.prev!=null) head.prev.next = head; 100 | head.next = this; 101 | this.prev = head; 102 | 103 | tail.next = this.next; 104 | if(tail.next!=null) tail.next.prev = tail; 105 | this.next = tail; 106 | tail.prev = this; 107 | } 108 | 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/sentence/editdistance/CharEditUnit.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.sentence.editdistance; 2 | 3 | public class CharEditUnit extends EditUnit { 4 | private String content = ""; 5 | 6 | public CharEditUnit(Character ch){ 7 | content = ch.toString(); 8 | } 9 | 10 | @Override 11 | public String getUnitString() { 12 | return content; 13 | } 14 | 15 | } 16 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/sentence/editdistance/ChunkEditUnit.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.sentence.editdistance; 2 | 3 | 4 | public class ChunkEditUnit extends EditUnit { 5 | private SuperString chunk = null; 6 | 7 | public ChunkEditUnit(SuperString chunk){ 8 | this.chunk = chunk; 9 | } 10 | 11 | public String getUnitString() { 12 | return chunk.toString(); 13 | } 14 | 15 | /** 16 | * 根据此语的相似度获取替换代价 17 | */ 18 | @Override 19 | public double getSubstitutionCost(EditUnit otherUnit){ 20 | if(!(otherUnit instanceof ChunkEditUnit)) return chunk.length(); 21 | if(equals(otherUnit)) return 0.0; 22 | 23 | ChunkEditUnit other = (ChunkEditUnit)otherUnit; 24 | return new 
StandardEditDistance().getEditDistance(chunk, other.chunk); 25 | } 26 | 27 | /** 28 | * 获取删除代价,标准算法的默认值为1.0, 此处也设为1.0 29 | * 具体的编辑单元可以通过覆盖该方法设置不同的删除代价 30 | * @return 删除代价 31 | */ 32 | public double getDeletionCost(){ 33 | return chunk.length(); 34 | } 35 | 36 | /** 37 | * 获取插入代价,标准算法的默认值为1.0. 38 | * 具体的编辑单元可以通过覆盖该方法设置不同的插入代价 39 | */ 40 | public double getInsertionCost(){ 41 | return chunk.length(); 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/sentence/editdistance/EditDistance.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.sentence.editdistance; 2 | 3 | import ruc.irm.similarity.Similaritable; 4 | 5 | 6 | /** 7 | * 编辑距离的父类,定义了其中的主要行为 8 | * 9 | * @author 夏天 10 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 11 | */ 12 | public abstract class EditDistance implements Similaritable { 13 | 14 | public abstract double getEditDistance(SuperString S, SuperString T); 15 | 16 | public double getSimilarity(String s1, String s2){ 17 | SuperString S = SuperString.createWordSuperString(s1); 18 | SuperString T = SuperString.createWordSuperString(s2); 19 | 20 | return 1-(getEditDistance(S, T))/(Math.max(S.length(), T.length())); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/sentence/editdistance/EditUnit.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.sentence.editdistance; 2 | 3 | /** 4 | * 编辑单元 5 | * 6 | * @author 夏天 7 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 8 | */ 9 | public abstract class EditUnit { 10 | /** 11 | * 获取编辑单元的内部字符串 12 | * @return 13 | */ 14 | public abstract String getUnitString(); 15 | 16 | /** 17 | * 获取替换代价,默认替换代价当替换单元的内容相同时为0, 18 | * 不同时为1 19 | */ 20 | public double getSubstitutionCost(EditUnit other){ 21 | return this.equals(other)?0:1; 22 | } 23 | 
24 | /** 25 | * 获取删除代价,标准算法的默认值为1.0, 此处也设为1.0 26 | * 具体的编辑单元可以通过覆盖该方法设置不同的删除代价 27 | * @return 删除代价 28 | */ 29 | public double getDeletionCost(){ 30 | return 1.0; 31 | } 32 | 33 | /** 34 | * 获取插入代价,标准算法的默认值为1.0. 35 | * 具体的编辑单元可以通过覆盖该方法设置不同的插入代价 36 | */ 37 | public double getInsertionCost(){ 38 | return 1.0; 39 | } 40 | 41 | @Override 42 | public boolean equals(Object other){ 43 | if(!(other instanceof EditUnit)) return false; 44 | return getUnitString().equals(((EditUnit)other).getUnitString()); 45 | } 46 | 47 | @Override 48 | public String toString(){ 49 | return getUnitString(); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/sentence/editdistance/GregorEditDistance.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.sentence.editdistance; 2 | 3 | /** 4 | * 由Gregor提出的考虑块交换(Block Transposition)的编辑距离改进算法 5 | * 时间复杂度为O(m3n3) 6 | * 具体实现请参考GregorLeusch,Nicola Ueffing的文章《A Novel String-to-String Distance Measure With 7 | * Application to Machine Translation Evaluation》 8 | * 问题:
9 | * 相似度计算的问题会影响句子相似度计算的直观结果,例如“什么是计算机病毒”,“电脑病毒是什么” 10 | * 直觉应该是2,即“什么是计算机病毒”首先变为“计算机病毒什么是”,再变为“计算机病毒是什么”, 11 | * 编辑代价为2,但实际上,当由“什么是计算机病毒”变为“计算机病毒什么是”后,由于"什么是"与“是什么”的替换代价只有0.2, 12 | * 因而不再进行交互,故总的编辑距离为1.2 13 | * 14 | * @author 夏天 15 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 16 | */ 17 | public class GregorEditDistance extends EditDistance { 18 | /** 块交换代价 */ 19 | public static double swapCost = 0.5; 20 | 21 | private SuperString S,T; 22 | /** 存放字符串从S(i0-i1)到T(j0-j1)的中间运算结果,避免多次运算,提高运算效率*/ 23 | private double[][][][] QArray; 24 | 25 | public double getEditDistance(SuperString S,SuperString T){ 26 | this.S = S; 27 | this.T = T; 28 | QArray = new double[S.length()][S.length()][T.length()][T.length()]; 29 | for(int i=0;i subsitituteValue){ 63 | minSubstituteValue = subsitituteValue; 64 | minPosJ = j; 65 | } 66 | } 67 | for(int j=j0;j<=j1;j++){ 68 | if(j == minPosJ){ 69 | cost += minSubstituteValue; 70 | }else{ 71 | cost += T.elementAt(j).getInsertionCost(); 72 | } 73 | } 74 | }else if(j1==j0){ 75 | double minSubstituteValue = 1.0; 76 | int minPosI = i0; 77 | for(int i=i0;i<=i1;i++){ 78 | double subsitituteValue = S.elementAt(i).getSubstitutionCost(T.elementAt(j0)); 79 | if(minSubstituteValue > subsitituteValue){ 80 | minSubstituteValue = subsitituteValue; 81 | minPosI = i; 82 | } 83 | } 84 | for(int i=i0;i<=i1;i++){ 85 | if(i == minPosI){ 86 | cost += minSubstituteValue; 87 | }else{ 88 | cost += S.elementAt(i).getDeletionCost(); 89 | } 90 | } 91 | }else{ 92 | if(QArray[i0][i1][j0][j1] X, SuperString Y){ 10 | Block LX = new Block(X); 11 | Block LY = new Block(Y); 12 | split(LX,LY); 13 | while(LY.getPrev()!=null){ 14 | LY = LY.getPrev(); 15 | } 16 | while(LX.getPrev()!=null){ 17 | LX = LX.getPrev(); 18 | } 19 | List first = new ArrayList(); 20 | List second = new ArrayList(); 21 | while(LX!=null){ 22 | first.add(new ChunkEditUnit(LX.getData())); 23 | LX = LX.getNext(); 24 | } 25 | 26 | while(LY!=null){ 27 | second.add(new ChunkEditUnit(LY.getData())); 28 | LY = 
LY.getNext(); 29 | } 30 | SuperString s1 = new SuperString(first); 31 | SuperString s2 = new SuperString(second); 32 | Object[] obj = new Object[]{s1, s2}; 33 | return obj; 34 | } 35 | 36 | private static void split(Block bx, Block LY){ 37 | LCS maxLCS = null; 38 | Block by = LY; 39 | while(by.getPrev()!=null){ 40 | by = by.getPrev(); 41 | } 42 | Block maxMatchedBy = by; 43 | while(by!=null){ 44 | if(by.isDivideFlag()){ 45 | by = by.getNext(); 46 | continue; 47 | } 48 | 49 | LCS lcs = LCS.parse(bx.getData(), by.getData()); 50 | if(maxLCS==null || maxLCS.length0){ 59 | bx.divide(maxLCS.x_pos, maxLCS.length); 60 | maxMatchedBy.divide(maxLCS.y_pos, maxLCS.length); 61 | } 62 | 63 | if(bx.getPrev()!=null && !bx.isDivideFlag()){ 64 | split(bx.getPrev(), LY); 65 | } 66 | 67 | if(bx.getNext()!=null &&!bx.getNext().isDivideFlag()){ 68 | split(bx.getNext(), LY); 69 | } 70 | 71 | } 72 | 73 | /** 74 | * longest common string 75 | * @author Gavin 76 | * 77 | */ 78 | public static class LCS { 79 | public int length = 0; //LCS匹配的最长结果 80 | public int x_pos = 0; //LCS匹配的X的位置 81 | public int y_pos = 0; //LCS匹配的Y的位置 82 | 83 | public static LCS parse(SuperString X, SuperString Y){ 84 | LCS lcs = new LCS(); 85 | for(int start=0; start tempX = X.substring(start, end); 88 | 89 | int pos = Y.indexOf(tempX); 90 | if(pos>=0 && tempX.length()>lcs.length){ 91 | lcs.length = tempX.length(); 92 | lcs.x_pos = start; 93 | lcs.y_pos = pos; 94 | } 95 | } 96 | } 97 | return lcs; 98 | } 99 | 100 | public String toString(){ 101 | return "length=" + length + ", x_pos=" + x_pos + ", y_pos=" + y_pos; 102 | } 103 | } 104 | 105 | public static void main(String[] args) { 106 | String s1 = "abcdefghijkabc"; 107 | String s2 = "cdefghijklabccc"; 108 | // s2 = "fgabcdehijklkdslfkasdflak"; 109 | // s1 = "abcdefgxyzoxyjasdkfjjjaldsfa"; 110 | // s1 = "I like the book"; 111 | // s2 = "the book I like"; 112 | s1 = "什么是计算机病毒"; 113 | s2 = "电脑病毒是什么"; 114 | 115 | // SuperString ss1 = 
SuperString.createCharSuperString(s1); 116 | // SuperString ss2 = SuperString.createCharSuperString(s2); 117 | 118 | SuperString ss1 = SuperString.createWordSuperString(s1); 119 | SuperString ss2 = SuperString.createWordSuperString(s2); 120 | Split.split(ss1, ss2); 121 | // LCS lcs = LCS.parse(ss1, ss2); 122 | // System.out.println(lcs); 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/sentence/editdistance/StandardEditDistance.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.sentence.editdistance; 2 | 3 | 4 | /** 5 | * 基于编辑距离的汉语句子相似度计算 6 | * 7 | * @author 夏天 8 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 9 | */ 10 | public class StandardEditDistance extends EditDistance { 11 | /** 12 | * 获取两个串的编辑距离 13 | * @param S 字符串1 14 | * @param T 字符串2 15 | * @return 两个串的编辑距离 16 | */ 17 | public double getEditDistance(SuperString X, SuperString Y){ 18 | double[][] D; //编辑矩阵 19 | 20 | int m = X.length(); //字符串X的长度 21 | int n = Y.length(); //字符串Y的长度 22 | //char ch_x_i; //字符串X的第i个词 23 | //char ch_y_j; //字符串Y的第j个词 24 | 25 | if(m == 0){ 26 | double distance = 0.0; 27 | for(int j=0; j夏天 14 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 15 | * 16 | * @param 17 | */ 18 | public class SuperString { 19 | private List contents = new ArrayList(); 20 | 21 | public SuperString(List contents){ 22 | this.contents = contents; 23 | } 24 | 25 | public static SuperString createCharSuperString(String str){ 26 | List list = new ArrayList(str.length()); 27 | for(int i=0; i s = new SuperString(list); 31 | return s; 32 | } 33 | 34 | public static SuperString createWordSuperString(String sentence){ 35 | List wordList = SegmentProxy.segment(sentence); 36 | List unitList = new ArrayList(wordList.size()); 37 | for(int i=0; i s = new SuperString(unitList); 41 | return s; 42 | } 43 | 44 | 45 | public T elementAt(int pos){ 46 | if(pos<0 || pos>=contents.size()){ 
47 | throw new ArrayIndexOutOfBoundsException("下标越界"); 48 | } 49 | return contents.get(pos); 50 | } 51 | 52 | public int indexOf(SuperString substring){ 53 | int result = -1; 54 | for(int i=0; ilength()) return -1; 57 | 58 | for(;j substring(int fromIndex, int toIndex){ 73 | return new SuperString(contents.subList(fromIndex, toIndex)); 74 | } 75 | 76 | public SuperString substring(int fromIndex){ 77 | return new SuperString(contents.subList(fromIndex, contents.size())); 78 | } 79 | 80 | public int length(){ 81 | return contents.size(); 82 | } 83 | 84 | @Override 85 | public String toString(){ 86 | StringBuilder sb = new StringBuilder(); 87 | for(int i=0; i0.85; 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/sentence/editdistance/XiatianEditDistance.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.sentence.editdistance; 2 | 3 | 4 | /** 5 | * 夏天提出的新的支持非相邻块交互的编辑距离算法 6 | * 7 | * @author 夏天 8 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 9 | */ 10 | public class XiatianEditDistance extends EditDistance { 11 | /** 块交换代价 */ 12 | public static double swapCost = 0.5; 13 | 14 | private SuperString S,T; 15 | private double[][][][] QArray; 16 | 17 | public double getEditDistance(SuperString S, SuperString T){ 18 | this.S = S; 19 | this.T = T; 20 | QArray = new double[S.length()+1][S.length()+1][T.length()+1][T.length()+1]; 21 | for(int i=0;i<=S.length();i++){ 22 | for(int i2=0;i2<=S.length();i2++) 23 | for(int j=0;j<=T.length();j++) 24 | for(int j2=0;j2<=T.length();j2++){ 25 | QArray[i][i2][j][j2]=Double.MAX_VALUE; 26 | } 27 | } 28 | return Q(0,S.length()-1,0,T.length()-1); 29 | } 30 | 31 | private double Q(int i1,int im,int j1,int jn){ 32 | if(QArray[i1][im][j1][jn] subValue){ 52 | minSubValue = subValue; 53 | minPosJ = j; 54 | } 55 | } 56 | for(int j=j1;j<=jn;j++){ 57 | if(j == minPosJ){ 58 | cost += minSubValue; 59 | }else{ 
60 | cost += T.elementAt(j).getInsertionCost(); 61 | } 62 | } 63 | }else if(j1==jn){ 64 | int minPosI = i1; 65 | double minSubValue = S.elementAt(i1).getSubstitutionCost(T.elementAt(j1)); 66 | for(int i=i1+1;i<=im;i++){ 67 | double subValue = S.elementAt(i).getSubstitutionCost(T.elementAt(j1)); 68 | if(minSubValue > subValue){ 69 | minSubValue = subValue; 70 | minPosI = i; 71 | } 72 | } 73 | for(int i=i1;i<=im;i++){ 74 | if(i == minPosI){ 75 | cost += minSubValue; 76 | }else{ 77 | cost += S.elementAt(i).getDeletionCost(); 78 | } 79 | } 80 | }else{ 81 | cost = QArray[i1][im][j1][jn]; 82 | loop:for(int i=i1;i夏天 8 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 9 | */ 10 | public class XiatianEditDistance2 extends EditDistance { 11 | /** 块交换代价 */ 12 | private double swapCost = 1.0; 13 | 14 | private SuperString S,T; 15 | private double[][][][] QArray; 16 | 17 | @SuppressWarnings("unchecked") 18 | public double getEditDistance(SuperString S1, SuperString T1){ 19 | Object[] array = Split.split(S1, T1); 20 | this.S = (SuperString)array[0]; 21 | this.T = (SuperString)array[1]; 22 | QArray = new double[S.length()+1][S.length()+1][T.length()+1][T.length()+1]; 23 | for(int i=0;i<=S.length();i++){ 24 | for(int i2=0;i2<=S.length();i2++) 25 | for(int j=0;j<=T.length();j++) 26 | for(int j2=0;j2<=T.length();j2++){ 27 | QArray[i][i2][j][j2]=Double.MAX_VALUE; 28 | } 29 | } 30 | return Q(0,S.length()-1,0,T.length()-1); 31 | } 32 | 33 | private double Q(int i1,int im,int j1,int jn){ 34 | if(QArray[i1][im][j1][jn] subValue){ 54 | minSubValue = subValue; 55 | minPosJ = j; 56 | } 57 | } 58 | for(int j=j1;j<=jn;j++){ 59 | if(j == minPosJ){ 60 | cost += minSubValue; 61 | }else{ 62 | cost += T.elementAt(j).getInsertionCost(); 63 | } 64 | } 65 | }else if(j1==jn){ 66 | int minPosI = i1; 67 | double minSubValue = S.elementAt(i1).getSubstitutionCost(T.elementAt(j1)); 68 | for(int i=i1+1;i<=im;i++){ 69 | double subValue = S.elementAt(i).getSubstitutionCost(T.elementAt(j1)); 70 | if(minSubValue > 
subValue){ 71 | minSubValue = subValue; 72 | minPosI = i; 73 | } 74 | } 75 | for(int i=i1;i<=im;i++){ 76 | if(i == minPosI){ 77 | cost += minSubValue; 78 | }else{ 79 | cost += S.elementAt(i).getDeletionCost(); 80 | } 81 | } 82 | }else{ 83 | cost = QArray[i1][im][j1][jn]; 84 | loop:for(int i=i1;i 16 | * 《中文信息相似度计算理论与方法》5.4.3小节所介绍的方法,在考虑语义时, 17 | * 无法直接获取OnceWS(A, B),因此,采用了两两匹配取最大值的方式。 18 | * 新的改进算法请参考{@code SemanticSimilarity} 19 | * 20 | * @author 夏天 21 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 22 | * 23 | */ 24 | public class MorphoSimilarity implements SentenceSimilarity { 25 | private static Logger LOG = LoggerFactory.getLogger(MorphoSimilarity.class); 26 | 27 | /** 词形相似度占总相似度的比重 */ 28 | private final double LAMBDA1 = 1.0; 29 | /** 词序相似度占总相似度的比重 */ 30 | private final double LAMBDA2 = 0.0; 31 | /** 词语相似度的计算 */ 32 | private WordSimilarity wordSimilarity = null; 33 | 34 | private static String FILTER_CHARS = "  ,。;?《》()|!,.;?<>|_^…!"; 35 | 36 | private static MorphoSimilarity instance = null; 37 | 38 | public static MorphoSimilarity getInstance(){ 39 | if(instance == null){ 40 | instance = new MorphoSimilarity(); 41 | } 42 | return instance; 43 | } 44 | 45 | private MorphoSimilarity(){ 46 | LOG.debug("used hownet wordsimilarity."); 47 | this.wordSimilarity = XiaConceptParser.getInstance(); 48 | //this.segmenter = SegmentFactory.getInstance().getParser(); 49 | } 50 | 51 | /** 52 | * 滤掉词串中的空格、标点符号 53 | * @param word_list 54 | * @return 55 | */ 56 | private String[] filter(String[] word_list){ 57 | List results = new ArrayList(); 58 | for(String w:word_list){ 59 | if(!FILTER_CHARS.contains(w)){ 60 | results.add(w.toLowerCase()); 61 | } 62 | } 63 | 64 | return results.toArray(new String[results.size()]); 65 | } 66 | 67 | /** 68 | * 计算两个句子的相似度 69 | * @see ruc.irm.similarity.Similaritable 70 | */ 71 | public double getSimilarity(String firstSen,String secondSen){ 72 | //LOG.debug(segmenter.segmentToString(firstSen)); 73 | //LOG.debug(segmenter.segmentToString(secondSen)); 
74 | String[] firstList = filter(segment(firstSen)); 75 | String[] secondList = filter(segment(secondSen)); 76 | 77 | double wordSim = getOccurrenceSimilarity(firstList,secondList); 78 | //LOG.debug("词形相似度="+wordSim); 79 | 80 | double orderSim = getOrderSimilarity(firstList,secondList); 81 | //LOG.debug("词序相似度="+orderSim); 82 | 83 | return LAMBDA1*wordSim+LAMBDA2*orderSim; 84 | } 85 | 86 | /** 87 | * 获取两个集合的词形相似度, 同时获取相对于第一个句子中的词语顺序,第二个句子词语的顺序变化次数 88 | * @param firstList 89 | * @param secondList 90 | * @return 91 | */ 92 | public double getOccurrenceSimilarity(String[] firstList, String[] secondList){ 93 | int max = firstList.length>secondList.length?firstList.length:secondList.length; 94 | if(max==0){ 95 | return 0; 96 | } 97 | 98 | //首先计算出所有可能的组合 99 | double[][] scores = new double[max][max]; 100 | for(int i=0; i 0){ 110 | double max_score = 0; 111 | int max_row = 0; 112 | int max_col = 0; 113 | 114 | //先挑出相似度最大的一对: 115 | for(int i=0; ii?i:i-1; 132 | int tmp_j = max_col>j?j:j-1; 133 | tmp_scores[tmp_i][tmp_j] = scores[i][j]; 134 | } 135 | } 136 | total_score += max_score; 137 | scores = tmp_scores; 138 | } 139 | 140 | return (2*total_score) / (firstList.length + secondList.length); 141 | } 142 | 143 | /** 144 | * 获取两个集合的词序相似度 145 | * @param firstList 146 | * @param secondList 147 | * @return 148 | */ 149 | public double getOrderSimilarity(String[] firstList, String[] secondList){ 150 | double similarity = 0.0; 151 | 152 | return similarity; 153 | } 154 | 155 | // @SuppressWarnings("unchecked") 156 | // public String[] segment(String sentence){ 157 | // MPWordSegment ws = new MPWordSegment(); 158 | // ws.parseReader(new StringReader(sentence)); 159 | // Vector tokens = ws.getTokens(); 160 | // String[] results = new String[tokens.size()]; 161 | // for(int i=0; i list = SegmentProxy.segment(sentence); 171 | String[] results = new String[list.size()]; 172 | for(int i=0; i夏天 22 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 23 | * 24 | */ 25 | public class 
SemanticSimilarity implements SentenceSimilarity { 26 | private static Logger LOG = LoggerFactory.getLogger(SemanticSimilarity.class); 27 | 28 | /** 词形相似度占总相似度的比重 */ 29 | private final double LAMBDA1 = 0.8; 30 | /** 词序相似度占总相似度的比重 */ 31 | private final double LAMBDA2 = 0.2; 32 | 33 | /** 如果两个词语的相似度大于了该阈值, 则作为相同词语,计算词序相似度 */ 34 | private final double GAMMA = 0.6; 35 | 36 | /** 词语相似度的计算 */ 37 | private WordSimilarity wordSimilarity = null; 38 | 39 | private static String FILTER_CHARS = "  ,。;?《》()|!,.;?<>|_^…!"; 40 | 41 | private static SemanticSimilarity instance = null; 42 | 43 | public static SemanticSimilarity getInstance(){ 44 | if(instance == null){ 45 | instance = new SemanticSimilarity(); 46 | } 47 | return instance; 48 | } 49 | 50 | private SemanticSimilarity(){ 51 | LOG.debug("used hownet wordsimilarity."); 52 | this.wordSimilarity = XiaConceptParser.getInstance(); 53 | //this.segmenter = SegmentFactory.getInstance().getParser(); 54 | } 55 | 56 | /** 57 | * 滤掉词串中的空格、标点符号 58 | * @param word_list 59 | * @return 60 | */ 61 | private String[] filter(String[] word_list){ 62 | List results = new ArrayList(); 63 | for(String w:word_list){ 64 | if(!FILTER_CHARS.contains(w)){ 65 | results.add(w.toLowerCase()); 66 | } 67 | } 68 | 69 | return results.toArray(new String[results.size()]); 70 | } 71 | 72 | /** 73 | * 计算两个句子的相似度 74 | * @see ruc.irm.similarity.Similaritable 75 | */ 76 | public double getSimilarity(String firstSen,String secondSen){ 77 | //LOG.debug(segmenter.segmentToString(firstSen)); 78 | //LOG.debug(segmenter.segmentToString(secondSen)); 79 | String[] firstList = filter(segment(firstSen)); 80 | String[] secondList = filter(segment(secondSen)); 81 | 82 | return calculate(firstList,secondList); 83 | } 84 | 85 | /** 86 | * 获取两个集合的词形相似度, 同时获取相对于第一个句子中的词语顺序,第二个句子词语的顺序变化次数 87 | * @param firstList 88 | * @param secondList 89 | * @return 90 | */ 91 | public double calculate(String[] firstList, String[] secondList){ 92 | if(firstList.length == 0 || 
secondList.length == 0){ 93 | return 0; 94 | } 95 | 96 | //首先计算出所有可能的组合 97 | double[][] scores = new double[firstList.length][secondList.length]; 98 | 99 | //代表第1个句子对应位置是否已经被使用, 默认为未使用,即false 100 | boolean[] firstFlags = new boolean[firstList.length]; 101 | 102 | //代表第2个句子对应位置是否已经被使用, 默认为未使用,即false 103 | boolean[] secondFlags = new boolean[secondList.length]; 104 | 105 | //PSecond的定义参见书中5.4.3节, 为避免无必要的初始化数组, 106 | //数组中0值表示在第一个句子中没有对应的相似词语,大于0的值 107 | //则表示在第一个句子中的位置(从1开始编号了) 108 | int[] PSecond = new int[secondList.length]; 109 | 110 | for(int i=0; i 126 | for(int i=0; i=0) { 140 | total_score += max_score; 141 | firstFlags[max_row] = true; 142 | secondFlags[max_col] = true; 143 | if(max_score>=GAMMA) { 144 | PSecond[max_col] = max_row+1; 145 | } 146 | } else { 147 | break; 148 | } 149 | } 150 | 151 | double wordSim = (2*total_score) / (firstList.length + secondList.length); 152 | 153 | int previous = 0; 154 | int revOrdCount = 0; 155 | int onceWSSize = 0; 156 | for(int i=0; i0) { 158 | onceWSSize++; 159 | if(previous>0 && (previous>PSecond[i])) { 160 | revOrdCount++; 161 | } 162 | previous = PSecond[i]; 163 | } 164 | } 165 | 166 | double ordSim = 0; 167 | if(onceWSSize==1) { 168 | ordSim = 1; 169 | } else if(onceWSSize == 0) { 170 | ordSim = 0; 171 | } else { 172 | ordSim = 1.0 - revOrdCount*1.0/(onceWSSize-1); 173 | } 174 | 175 | System.out.println("wordSim ==> " + wordSim + ", ordSim ==> " + ordSim); 176 | 177 | return LAMBDA1*wordSim+LAMBDA2*ordSim; 178 | } 179 | 180 | public String[] segment(String sentence){ 181 | List list = SegmentProxy.segment(sentence); 182 | String[] results = new String[list.size()]; 183 | for(int i=0; i夏天 20 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 21 | */ 22 | public class DictStatistic { 23 | /** 24 | * 从指定的xml文件加载词典文件 25 | * @param xmlFile 26 | * @param gzCompressed 是否再用gz格式对词典进行了压缩 27 | * @return 28 | */ 29 | public void testFromXml(String xmlFile, boolean gzCompressed) { 30 | File file = new File(xmlFile); 31 | if 
(!file.canRead()){ 32 | System.out.println("无法读取文件:" + xmlFile); 33 | return;// fail while opening the file 34 | } 35 | int count = 0, conceptCount=0; 36 | XMLInputFactory inputFactory = XMLInputFactory.newInstance(); 37 | InputStream input = null; 38 | try { 39 | if(gzCompressed){ 40 | input = new GZIPInputStream(new FileInputStream(file)); 41 | }else{ 42 | input = new FileInputStream(file); 43 | } 44 | XMLEventReader xmlEventReader = inputFactory.createXMLEventReader(input); 45 | while (xmlEventReader.hasNext()) { 46 | XMLEvent event = xmlEventReader.nextEvent(); 47 | 48 | if (event.isStartElement()) { 49 | StartElement startElement = event.asStartElement(); 50 | if(startElement.getName().toString().equals("table")){ 51 | String head = startElement.getAttributeByName(QName.valueOf("head")).getValue(); 52 | while (xmlEventReader.hasNext()) { 53 | XMLEvent itemEvent = xmlEventReader.nextEvent(); 54 | if(itemEvent.isStartElement()){ 55 | StartElement itemStartElement = itemEvent.asStartElement(); 56 | if(!itemStartElement.getName().toString().equals("item")) continue; 57 | String word = itemStartElement.getAttributeByName(QName.valueOf("word")).getValue(); 58 | word = head + word; 59 | if(XiaConceptParser.getInstance().isConcept(word)){ 60 | conceptCount++; 61 | } 62 | count++; 63 | if(count%1000==0){ 64 | System.out.println("process words " + count + "..."); 65 | } 66 | } 67 | } 68 | } 69 | } 70 | } 71 | input.close(); 72 | System.out.println(count + "\t" + conceptCount); 73 | return; 74 | } catch (Exception e) { 75 | e.printStackTrace(); 76 | } 77 | } 78 | 79 | } 80 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/statistic/LCMC.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.statistic; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.InputStream; 6 | 7 | import javax.xml.stream.XMLEventReader; 
8 | import javax.xml.stream.XMLInputFactory; 9 | import javax.xml.stream.events.StartElement; 10 | import javax.xml.stream.events.XMLEvent; 11 | 12 | import ruc.irm.similarity.word.hownet2.concept.XiaConceptParser; 13 | 14 | 15 | 16 | public class LCMC { 17 | 18 | public void countUnConceptWords(File xmlFile) throws Exception{ 19 | int totalCount = 0, conceptCount = 0; 20 | XMLInputFactory inputFactory = XMLInputFactory.newInstance(); 21 | InputStream input = null; 22 | input = new FileInputStream(xmlFile); 23 | XMLEventReader xmlEventReader = inputFactory.createXMLEventReader(input); 24 | while (xmlEventReader.hasNext()) { 25 | XMLEvent event = xmlEventReader.nextEvent(); 26 | 27 | if (event.isStartElement()) { 28 | StartElement startElement = event.asStartElement(); 29 | //如果是word开始 30 | if(startElement.getName().toString().equals("w")){ 31 | String word = xmlEventReader.getElementText(); 32 | totalCount++; 33 | if(XiaConceptParser.getInstance().isConcept(word)){ 34 | conceptCount++; 35 | } 36 | } 37 | } 38 | }// 39 | input.close(); 40 | System.out.println(totalCount + "\t" + conceptCount); 41 | } 42 | 43 | public static void main(String[] args) throws Exception { 44 | LCMC lcmc = new LCMC(); 45 | lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_A.XML")); 46 | lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_B.XML")); 47 | lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_C.XML")); 48 | lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_D.XML")); 49 | lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_E.XML")); 50 | lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_F.XML")); 51 | lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_G.XML")); 52 | lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_H.XML")); 53 | lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_J.XML")); 54 | lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_K.XML")); 55 | lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_L.XML")); 56 | lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_M.XML")); 57 | 
lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_N.XML")); 58 | lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_P.XML")); 59 | lcmc.countUnConceptWords(new File("./db/lcmc/LCMC_R.XML")); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/text/DiceSimilarity.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.text; 2 | 3 | import ruc.irm.similarity.Similaritable; 4 | 5 | public class DiceSimilarity implements Similaritable { 6 | 7 | @Override 8 | public double getSimilarity(String item1, String item2) { 9 | // TODO Auto-generated method stub 10 | return 0; 11 | } 12 | 13 | } 14 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/util/About.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.util; 2 | 3 | import com.google.common.io.Resources; 4 | 5 | import javax.swing.*; 6 | import javax.swing.text.StyledEditorKit; 7 | import java.awt.*; 8 | import java.io.IOException; 9 | import java.net.URL; 10 | import java.net.URLClassLoader; 11 | 12 | /** 13 | * 关于xsimilarity项目的说明信息 14 | * 15 | * @author 夏天 16 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 17 | */ 18 | public class About extends JFrame { 19 | private static final long serialVersionUID = -2307582155443587993L; 20 | 21 | public static JPanel createPanel() { 22 | JPanel mainPanel = new JPanel(); 23 | mainPanel.setLayout(new BorderLayout()); 24 | JTextPane editorPane = new JTextPane(); 25 | editorPane.setEditable(false); 26 | //让长文本自动换行 27 | editorPane.setEditorKit(new StyledEditorKit()); 28 | editorPane.setContentType("text/html"); 29 | try { 30 | URL url = Resources.getResource("about.html");//可以用html格式文件做你的帮助系统了 31 | editorPane.setPage(url); 32 | } catch (IOException e1) { 33 | editorPane.setText(e1.getMessage()); 34 | } 35 | 
//editorPane.setText("个人主页:http://xiatian.irm.cn/"); 36 | 37 | 38 | mainPanel.add(new JScrollPane(editorPane), BorderLayout.CENTER); 39 | return mainPanel; 40 | } 41 | 42 | public About() { 43 | this.setTitle("关于XSimilarity"); 44 | 45 | this.setDefaultCloseOperation(EXIT_ON_CLOSE); 46 | this.setPreferredSize(new Dimension(600, 400)); 47 | this.getContentPane().add(createPanel()); 48 | this.pack(); 49 | } 50 | 51 | public static void main(String[] args) { 52 | new About().setVisible(true); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/util/BlankUtils.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.util; 2 | 3 | import java.util.Collection; 4 | 5 | /** 6 | * 判断是否为空的工具类 7 | * 8 | * @author 夏天 9 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 10 | */ 11 | public class BlankUtils { 12 | /** 13 | * 判断字符串s是否是空串 14 | * @param s 15 | * @return 16 | */ 17 | public static boolean isBlank(String string){ 18 | return string==null || string.trim().equals(""); 19 | } 20 | 21 | /** 22 | * 判断数组是否是空 23 | * @param array 24 | * @return 25 | */ 26 | public static boolean isBlank(Object[] array){ 27 | return array==null || array.length==0; 28 | } 29 | 30 | /** 31 | * 判断集合是否是空 32 | * @param array 33 | * @return 34 | */ 35 | public static boolean isBlank(Collection array){ 36 | return array==null || array.size()==0; 37 | } 38 | 39 | /** 40 | * 判断所有的集合是否都为空 41 | * @param collections 42 | * @return 43 | */ 44 | public static boolean isBlankAll(Collection...collections){ 45 | for(Collection c:collections){ 46 | if(!isBlank(c)){ 47 | return false; 48 | } 49 | } 50 | 51 | return true; 52 | } 53 | 54 | /** 55 | * 判断字符串strings中是否都是空串 56 | * @param strings 57 | * @return 58 | */ 59 | public static boolean isBlankAll(String... 
strings){ 60 | for(String s:strings){ 61 | if(!isBlank(s)){ 62 | return false; 63 | } 64 | } 65 | 66 | return true; 67 | } 68 | 69 | /** 70 | * 判断collections集合中是否至少有一个为空 71 | * @param collections 72 | * @return 73 | */ 74 | public static boolean isBlankAtLeastOne(Collection...collections){ 75 | for(Collection c:collections){ 76 | if(isBlank(c)){ 77 | return true; 78 | } 79 | } 80 | 81 | return false; 82 | } 83 | 84 | /** 85 | * 判断字符串strings中是否之首有一个为空 86 | * @param strings 87 | * @return 88 | */ 89 | public static boolean isBlankAtLeastOne(String... strings){ 90 | for(String s:strings){ 91 | if(isBlank(s)){ 92 | return true; 93 | } 94 | } 95 | 96 | return false; 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/util/EditDistance.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.util; 2 | 3 | /** 4 | * 5 | * This class computes the edit distance between two strings using dynamic 6 | * programming. The dynamic programming part is in the method 7 | * printEditDistance(). 8 | * 9 | * @author 夏天 10 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 11 | */ 12 | public class EditDistance { 13 | /** 14 | * 获取删除代价 15 | * 16 | * @return 17 | */ 18 | public int getDeletionCost() { 19 | return 1; 20 | } 21 | 22 | /** 23 | * 获取插入代价 24 | * 25 | * @return 26 | */ 27 | public int getInsertionCost() { 28 | return 1; 29 | } 30 | 31 | /** 32 | * 获取替换代价 33 | * 34 | * @return 35 | */ 36 | public int getSubstitutionCost(char a, char b) { 37 | return (a == b) ? 
0 : 1; 38 | } 39 | 40 | public int getEditDistance(String S, String T) { 41 | int[][] D = null; 42 | if (S == null) 43 | S = ""; 44 | if (T == null) 45 | T = ""; 46 | 47 | char[] a = S.toCharArray(); 48 | char[] b = T.toCharArray(); 49 | 50 | int n = a.length; // 字符串S的长度 51 | int m = b.length; // 字符串T的长度 52 | 53 | if (a.length == 0) { 54 | return b.length; 55 | } else if (b.length == 0) { 56 | return a.length; 57 | } 58 | 59 | D = new int[a.length + 1][b.length + 1]; 60 | 61 | /** 初始化D[i][0] */ 62 | for (int i = 1; i <= n; i++) { 63 | D[i][0] = D[i - 1][0] + getDeletionCost(); 64 | } 65 | 66 | /** 初始化D[0][j] */ 67 | for (int j = 1; j <= m; j++) { 68 | D[0][j] = D[0][j - 1] + getInsertionCost(); 69 | } 70 | 71 | for (int i = 1; i <= n; i++) { 72 | for (int j = 1; j <= m; j++) { 73 | D[i][j] = MathUtils.min(D[i - 1][j] + getDeletionCost(), 74 | D[i][j - 1] + getInsertionCost(), D[i - 1][j - 1] 75 | + getSubstitutionCost(a[i - 1], b[j - 1])); 76 | } 77 | } 78 | 79 | return D[n][m]; 80 | } 81 | 82 | /** 83 | * 应与getEditDistance(S, T)等同 84 | * @param s 85 | * @param t 86 | * @return 87 | */ 88 | public static int getLevenshteinDistance(String s, String t) { 89 | if (s == null || t == null) { 90 | throw new IllegalArgumentException("Strings must not be null"); 91 | } 92 | int d[][]; // matrix 93 | int n; // length of s 94 | int m; // length of t 95 | int i; // iterates through s 96 | int j; // iterates through t 97 | char s_i; // ith character of s 98 | char t_j; // jth character of t 99 | int cost; // cost 100 | 101 | // Step 1 102 | n = s.length(); 103 | m = t.length(); 104 | if (n == 0) { 105 | return m; 106 | } 107 | if (m == 0) { 108 | return n; 109 | } 110 | d = new int[n + 1][m + 1]; 111 | 112 | // Step 2 113 | for (i = 0; i <= n; i++) { 114 | d[i][0] = i; 115 | } 116 | for (j = 0; j <= m; j++) { 117 | d[0][j] = j; 118 | } 119 | 120 | // Step 3 121 | for (i = 1; i <= n; i++) { 122 | s_i = s.charAt(i - 1); 123 | 124 | // Step 4 125 | for (j = 1; j <= m; j++) { 126 
| t_j = t.charAt(j - 1); 127 | 128 | // Step 5 129 | if (s_i == t_j) { 130 | cost = 0; 131 | } else { 132 | cost = 1; 133 | } 134 | 135 | // Step 6 136 | d[i][j] = MathUtils.min(d[i - 1][j] + 1, d[i][j - 1] + 1, 137 | d[i - 1][j - 1] + cost); 138 | } 139 | } 140 | 141 | // Step 7 142 | return d[n][m]; 143 | } 144 | 145 | } 146 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/util/FileUtils.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.util; 2 | 3 | import java.io.BufferedOutputStream; 4 | import java.io.BufferedReader; 5 | import java.io.File; 6 | import java.io.FileOutputStream; 7 | import java.io.IOException; 8 | import java.io.InputStream; 9 | import java.io.InputStreamReader; 10 | 11 | /** 12 | * 与文件相关的工具类 13 | * 14 | * @author 夏天 15 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 16 | */ 17 | public class FileUtils { 18 | /** 19 | * 根据指定编码从输入流中依次遍历每一行文字 20 | * 21 | * @param input 22 | * 输入流 23 | * @param encoding 24 | * 输入流所用的文字编码 25 | * @param event 26 | * 遍历每一行时触发的事件处理 27 | * @throws IOException 28 | */ 29 | public static void traverseLines(InputStream input, String encoding, TraverseEvent event) throws IOException { 30 | BufferedReader in = new BufferedReader(new InputStreamReader(input, encoding)); 31 | String line = null; 32 | 33 | while ((line = in.readLine()) != null) { 34 | event.visit(line); 35 | } 36 | 37 | input.close(); 38 | in.close(); 39 | } 40 | 41 | /** 42 | * 保存字符串到文件中 43 | * @param content 44 | * @param fileName 45 | * @return 46 | */ 47 | public static boolean saveStringToFile(String content, String fileName) { 48 | boolean rtn = false; 49 | BufferedOutputStream out = null; 50 | try { 51 | File file = new File(fileName); 52 | file.getParentFile().mkdirs(); 53 | 54 | out = new BufferedOutputStream(new FileOutputStream(file)); 55 | out.write(content.getBytes("GBK")); 56 | out.close(); 57 | rtn = true; 58 | } 
catch (Exception e) { 59 | System.out.println("saveStringToFile error:" + e.getMessage()); 60 | } finally { 61 | try { 62 | out.close(); 63 | } catch (Exception e) { 64 | } 65 | } 66 | return rtn; 67 | } 68 | 69 | public static void main(String[] args) { 70 | int count = 0; 71 | File dir = new File("G:/juanjuantx"); 72 | for(File a:dir.listFiles()){ 73 | if(a.isDirectory()){ 74 | for(File zy: a.listFiles()){ 75 | if(zy.listFiles()!=null) 76 | for(File rar:zy.listFiles()){ 77 | if(rar.isFile() && rar.getName().endsWith(".rar")){ 78 | count++; 79 | } 80 | } 81 | } 82 | } 83 | } 84 | System.out.println(count); 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/util/MathUtils.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.util; 2 | 3 | public class MathUtils { 4 | public static int min(int... values){ 5 | int min = Integer.MAX_VALUE; 6 | for(int v:values){ 7 | min = (v夏天 16 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 17 | */ 18 | public class PinyinUtils { 19 | /** 拼音的Map词典, 一个汉字可能对应多个拼音, 它所有的拼音放到一个集合中 */ 20 | private Map> pinyinDict = null; 21 | 22 | /** 单例 */ 23 | private static PinyinUtils instance = null; 24 | 25 | private PinyinUtils() throws IOException{ 26 | //从classpath中加载拼音词典文件 27 | InputStream input = this.getClass().getResourceAsStream("/data/F02-GB2312-to-PuTongHua-PinYin.txt"); 28 | 29 | BufferedReader in = new BufferedReader(new InputStreamReader(input, "UTF-8")); 30 | String line = null; 31 | 32 | MyTraverseEvent event = new MyTraverseEvent(); 33 | while ((line = in.readLine()) != null) { 34 | event.visit(line); 35 | } 36 | 37 | input.close(); 38 | in.close(); 39 | 40 | this.pinyinDict = event.getPinyins(); 41 | } 42 | 43 | public static PinyinUtils getInstance(){ 44 | if(instance == null){ 45 | try { 46 | instance = new PinyinUtils(); 47 | } catch (IOException e) { 48 | e.printStackTrace(); 49 | } 50 | } 
51 | 52 | return instance; 53 | } 54 | 55 | /** 56 | * 获取汉字的拼音, 由于汉字具有多音字,故返回一个集合 57 | * @param hanzi 58 | * @return 59 | */ 60 | public Set getPinyin(Character hanzi){ 61 | Set set = pinyinDict.get(hanzi); 62 | if(set==null || set.size()==0){ 63 | set = new HashSet(); 64 | set.add(hanzi.toString()); 65 | } 66 | return set; 67 | } 68 | 69 | /** 70 | * 获取词语的拼音, 一个词语可能对应多个拼音,把所有可能的组合放到集合中返回 71 | * @param word 72 | * @return 73 | */ 74 | public Set getPinyin(String word){ 75 | Set word_set = new HashSet(); 76 | for(int i=0; i hanzi_set = getPinyin(word.charAt(i)); 78 | if(word_set==null || word_set.size()==0){ 79 | word_set.addAll(hanzi_set); 80 | continue; 81 | } 82 | 83 | Set tmp_set = new HashSet(); 84 | for(String w:word_set){ 85 | for(String h:hanzi_set){ 86 | tmp_set.add(w + h); 87 | } 88 | } 89 | 90 | word_set = tmp_set; 91 | } 92 | 93 | return word_set; 94 | } 95 | 96 | /** 97 | * 获取拼音字符串,多音字只取一个 98 | * @param word 99 | * @return 100 | */ 101 | public String getPinyinSingle(String word){ 102 | StringBuffer sb = new StringBuffer(); 103 | for(int i=0; i pinyin = getPinyin(word.charAt(i)); 118 | sb.append(pinyin.toString()); 119 | } 120 | return sb.toString(); 121 | } 122 | 123 | /** 124 | * 获取拼音首字母 125 | * @param word 126 | * @return 127 | */ 128 | public String getPinyinHead(String word){ 129 | StringBuffer sb = new StringBuffer(); 130 | for(int i=0; i> pinyins = null; 139 | 140 | public MyTraverseEvent(){ 141 | this.pinyins = new HashMap>(); 142 | } 143 | 144 | public Map> getPinyins(){ 145 | return pinyins; 146 | } 147 | 148 | public boolean visit(String item) { 149 | if(item.startsWith("//")){ 150 | return true; 151 | } 152 | 153 | char hanzi = item.charAt(0); 154 | //String pinyin = item.substring(2, item.length()-1); 155 | String pinyin = item.substring(2, item.length()); 156 | Set set = pinyins.get(hanzi); 157 | if(set==null){ 158 | set = new HashSet(); 159 | } 160 | set.add(pinyin); 161 | 162 | pinyins.put(hanzi, set); 163 | return true; 164 | } 165 | } 
/**
 * Unchecked exception raised while handling XML. The message is normally one
 * of the key constants declared in this class.
 *
 * @author carver
 */
public class XmlException extends RuntimeException {

	private static final long serialVersionUID = 381260478228427716L;

	/* ---- message keys ---- */
	public static final String FILE_NOT_FOUND = "xml.file.not.found";
	public static final String XML_PAYLOAD_EMPTY = "xml.payload.empty";
	public static final String XML_ENCODE_ERROR = "xml.encoding.invalid";
	public static final String XML_READ_ERROR = "xml.read.error";
	public static final String XML_PARSE_ERROR = "xml.parse.error";
	public static final String XML_VALIDATE_ERROR = "xml.validate.error";
	public static final String XML_TRANSFORM_ERROR = "xml.transform.error";

	/** Creates an exception without message and without cause. */
	public XmlException() {
	}

	/**
	 * Creates an exception carrying only a message key.
	 *
	 * @param key the message (key) of the exception
	 */
	public XmlException(String key) {
		super(key);
	}

	/**
	 * Creates an exception that wraps another throwable.
	 *
	 * @param cause the underlying failure
	 */
	public XmlException(Throwable cause) {
		super(cause);
	}

	/**
	 * Creates an exception carrying a message key and the underlying failure.
	 *
	 * @param key   the message (key) of the exception
	 * @param cause the underlying failure
	 */
	public XmlException(String key, Throwable cause) {
		super(key, cause);
	}

}
/**
 * Helper for the codes of the Cilin thesaurus. A code has eight characters
 * that are read from left to right as six levels:
 *
 * <pre>
 * position  1            2             3-4          5           6-7                8
 * example   C            b             07           A           03                 =
 * level     1st          2nd           3rd          4th         5th                flag
 * meaning   major class  middle class  minor class  word group  atomic word group  word relation
 * </pre>
 *
 * The flag at position 8 is one of "=", "#" and "@": "=" marks the usual
 * synonym relation, "#" marks words that are related to each other, and "@"
 * marks a self-contained word that has neither synonyms nor related words in
 * the dictionary.
 *
 * @author 夏天
 * @organization 中国人民大学信息资源管理学院 知识工程实验室
 */
public class CilinCoding {
    /** Weight of each level, indexed by level - 1. */
    public static double[] WEIGHT = new double[]{1.2, 1.2, 1.0, 1.0, 0.8, 0.4};
    /** Sum of all level weights. */
    public static double TOTAL_WEIGHT = 5.6;

    /**
     * Extracts the part of a code that belongs to one level.
     *
     * @param code  the Cilin code
     * @param level the level, 1 to 6
     * @return the characters of that level, or the empty string when the
     *         level is outside 1..6
     */
    public static String getCodeLevel(String code,int level){
        if (level == 1) {
            return code.substring(0, 1);
        }
        if (level == 2) {
            return code.substring(1, 2);
        }
        if (level == 3) {
            return code.substring(2, 4);
        }
        if (level == 4) {
            return code.substring(4, 5);
        }
        if (level == 5) {
            return code.substring(5, 7);
        }
        if (level == 6) {
            return code.substring(7);
        }
        return "";
    }

    /**
     * Sums the weights of the leading levels on which two codes agree. The
     * comparison runs from level 1 upwards and stops at the first level whose
     * parts differ.
     *
     * @param code1 first Cilin code
     * @param code2 second Cilin code
     * @return the accumulated weight of the common leading levels
     */
    public static double calculateCommonWeight(String code1, String code2){
        double sum = 0.0;
        int level = 1;
        while (level <= 6 && getCodeLevel(code1, level).equals(getCodeLevel(code2, level))) {
            sum += WEIGHT[level - 1];
            level++;
        }
        return sum;
    }

    /**
     * Renders a code as a readable list of its six level parts, in the form
     * "[LEVEL_1: x, LEVEL_2: y, ...]".
     *
     * @param code the Cilin code
     * @return the formatted text
     */
    public static String printCoding(String code){
        StringBuilder text = new StringBuilder("[");
        for (int level = 1; level <= 6; level++) {
            if (level > 1) {
                text.append(", ");
            }
            text.append("LEVEL_").append(level).append(": ").append(getCodeLevel(code, level));
        }
        return text.append("]").toString();
    }
}
java.util.Set; 9 | import java.util.zip.GZIPInputStream; 10 | 11 | import org.slf4j.Logger; 12 | import org.slf4j.LoggerFactory; 13 | import ruc.irm.similarity.util.FileUtils; 14 | import ruc.irm.similarity.util.TraverseEvent; 15 | 16 | /** 17 | * 词林数据库 18 | * 19 | * @author 夏天 20 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 21 | */ 22 | public class CilinDb { 23 | /** the logger */ 24 | protected static Logger LOG = LoggerFactory.getLogger(CilinDb.class); 25 | /** 以词语为主键的索引表 */ 26 | private Map> wordIndex = new HashMap>(); 27 | /** 以编码为主键的索引表 */ 28 | private Map> codeIndex = new HashMap>(); 29 | 30 | private static CilinDb instance = null; 31 | 32 | public static CilinDb getInstance(){ 33 | if(instance == null){ 34 | try { 35 | instance = new CilinDb(); 36 | } catch (IOException e) { 37 | LOG.error(e.toString()); 38 | } 39 | } 40 | return instance; 41 | } 42 | 43 | private CilinDb() throws IOException{ 44 | InputStream input = new GZIPInputStream(this.getClass().getResourceAsStream("/data/cilin.db.gz")); 45 | 46 | TraverseEvent event = new TraverseEvent(){ 47 | @Override 48 | public boolean visit(String line) { 49 | String[] items = line.split(" "); 50 | Set set = new HashSet(); 51 | for(int i=2; i codeWords = codeIndex.get(code); 58 | if(codeWords==null){ 59 | codeWords = new HashSet(); 60 | } 61 | codeWords.add(items[0]); 62 | codeIndex.put(code, codeWords); 63 | } 64 | } 65 | wordIndex.put(items[0], set); 66 | items = null; 67 | return false; 68 | }}; 69 | LOG.info("loading cilin dictionary..."); 70 | long time = System.currentTimeMillis(); 71 | 72 | FileUtils.traverseLines(input, "UTF8", event); 73 | 74 | time = System.currentTimeMillis() - time; 75 | LOG.info("loading cilin dictionary completely. 
time elapsed: " + time); 76 | 77 | } 78 | 79 | /** 80 | * 获取某个词语的词林编码,一个词语可以有多个编码,通过Set给出 81 | * @param word 82 | * @return 83 | */ 84 | public Set getCilinCoding(String word){ 85 | return wordIndex.get(word); 86 | } 87 | 88 | public Set getCilinWords(String code){ 89 | return codeIndex.get(code); 90 | } 91 | 92 | public static void main(String[] args) { 93 | CilinDb db = CilinDb.getInstance(); 94 | String code = db.getCilinCoding("中国").iterator().next(); 95 | System.out.println(CilinCoding.printCoding(code)); 96 | System.out.println(db.getCilinWords(code)); 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/word/hownet/Hownet.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.word.hownet; 2 | 3 | import java.io.IOException; 4 | 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | import ruc.irm.similarity.Similaritable; 8 | import ruc.irm.similarity.word.hownet2.concept.BaseConceptParser; 9 | import ruc.irm.similarity.word.hownet2.concept.XiaConceptParser; 10 | import ruc.irm.similarity.word.hownet2.sememe.XiaSememeParser; 11 | import ruc.irm.similarity.word.hownet2.sememe.BaseSememeParser; 12 | 13 | /** 14 | * Hownet的主控制类, 通过知网的概念和义原及其关系计算汉语词语之间的相似度. 
15 | * 相似度的计算理论参考论文《汉语词语语义相似度计算研究》 16 | * 17 | * @author 夏天 18 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 19 | * 20 | * @see ruc.irm.similarity.Similaritable 21 | */ 22 | public class Hownet implements Similaritable{ 23 | /** the logger */ 24 | private static final Logger LOG = LoggerFactory.getLogger(Hownet.class); 25 | /** 知网的单例 */ 26 | private static Hownet instance = null; 27 | 28 | private BaseConceptParser conceptParser = null; 29 | 30 | private Hownet(){ 31 | try { 32 | BaseSememeParser sememeParser = new XiaSememeParser(); 33 | conceptParser = new XiaConceptParser(sememeParser); 34 | } catch (IOException e) { 35 | e.printStackTrace(); 36 | LOG.error(e.toString()); 37 | } 38 | } 39 | 40 | /** 41 | * 单例获取知网对象 42 | * @return 43 | */ 44 | public static Hownet instance(){ 45 | if(null == instance){ 46 | instance = new Hownet(); 47 | } 48 | 49 | return instance; 50 | } 51 | 52 | /** 53 | * 获取概念解析器 54 | * @return 55 | */ 56 | public BaseConceptParser getConceptParser(){ 57 | return conceptParser; 58 | } 59 | 60 | public double getSimilarity(String item1, String item2) { 61 | return conceptParser.getSimilarity(item1, item2); 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/word/hownet/HownetMeta.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.word.hownet; 2 | 3 | /** 4 | * Metadata for Hownet 5 | * 6 | * @author 夏天 7 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 8 | */ 9 | public interface HownetMeta { 10 | /** Algorithm of XIA Tian */ 11 | public static final int ALGORITHM_XIA = 1; 12 | 13 | /** Algorithm of LIU Qun */ 14 | public static final int ALGORITHM_LIU = 2; 15 | 16 | /** 17 | * Hownet symbol descriptions 18 | */ 19 | public static final String Symbol_Descriptions[][] = { 20 | { 21 | "#", "表示与其相关"} 22 | , { 23 | "%", "是其部分"} 24 | , { 25 | "$", "可以被该V处置,或是该V的受事、对象、领有物,或内容"} 26 | , { 27 | "*", "施事或工具"} 28 | 
, { 29 | "+", "所标记的角色是隐性的,几乎在实际语言中不会出现"} 30 | , { 31 | "&", "指向"} 32 | , { 33 | "~", "多半是,多半有,很可能"} 34 | , { 35 | "@", "可以做V的空间或时间"} 36 | , { 37 | "?", "可以使N的材料"} 38 | , { 39 | "(", "至于其中的应该是一个词标记"} 40 | , { 41 | "^", "不存在,或没有,或不能"} 42 | , { 43 | "!", "表示某一属性为一敏感的属性,如味道之与食物"} 44 | , { 45 | "[", "标示概念的共性属性"} 46 | }; 47 | 48 | /** γ:具体词与义元的相似度一律为一个较小的常数 */ 49 | public static final double gamma = 0.2; 50 | 51 | /** δ:任一个非空值与空值的相似度为一个较小的常数,此处为0.2 */ 52 | public static final double delta = 0.2; 53 | 54 | /** β1实词概念第一基本义原描述式的权重 */ 55 | public static final double beta1 = 0.5; 56 | /** β2实词概念其他基本义原描述式的权重 */ 57 | public static final double beta2 = 0.2; 58 | /** β3实词概念关系义原描述式的权重 */ 59 | public static final double beta3 = 0.17; 60 | /** β4实词概念符号义原描述式的权重 */ 61 | public static final double beta4 = 0.13; 62 | 63 | /** 64 | * Θ 计算后面概念的义原与参照概念所有义原的最大相似度, 并乘以两个概念主义原相似度的积(主义原通过该方式起约束作用), 65 | * 如果数值大于该值时才会起参照作用, 去掉冗余的不重要义原 66 | */ 67 | public static final double PARAM_THETA = 0.5; 68 | /** 69 | * Ω 计算前面概念的义原与参照概念所有义原的最大相似度,并乘以两个概念主义原相似度的积(主义原通过该方式起约束作用), 70 | * 如果数值大于该值时才会调整前面概念的义原符号, 以起修正作用 71 | */ 72 | public static final double PARAM_OMEGA = 0.8; 73 | /** */ 74 | public static final double PARAM_XI = 0.6; 75 | 76 | } 77 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/word/hownet/concept/Concept.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.word.hownet.concept; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | import java.util.StringTokenizer; 6 | 7 | import ruc.irm.similarity.word.hownet.HownetMeta; 8 | 9 | 10 | /** 11 | * 知网的概念表示类
example和英文部分对于相似度的计算不起作用,考虑到内存开销, 在概念的表示中去掉了这部分数据的对应定义 12 | * 13 | * @author 夏天 14 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 15 | * @deprecated 16 | */ 17 | public class Concept implements HownetMeta, Comparable { 18 | /** 中文概念名称 */ 19 | protected String word; 20 | /** 词性: Part of Speech */ 21 | protected String pos; 22 | /** 定义 */ 23 | protected String define; 24 | 25 | /** 是否是实词,false表示为虚词, 一般为实词 */ 26 | protected boolean bSubstantive; 27 | /** 第一基本义原 */ 28 | protected String mainSememe; 29 | /** 其他基本义原 */ 30 | protected String[] secondSememes; 31 | /** 关系义元原 */ 32 | protected String[] relationSememes; 33 | /** 关系符号描述 */ 34 | protected String[] symbolSememes; 35 | 36 | static String[][] Concept_Type = { { "=", "事件" }, 37 | { "aValue|属性值", "属性值" }, { "qValue|数量值", "数量值" }, 38 | { "attribute|属性", "属性" }, { "quantity|数量", "数量" }, 39 | { "unit|", "单位" }, { "%", "部件" } }; 40 | 41 | public Concept(String word, String pos, String def) { 42 | this.word = word; 43 | this.pos = pos; 44 | this.define = (def == null) ? 
"" : def.trim(); 45 | 46 | // 虚词用{***}表示 47 | if (define.length() > 0 48 | && define.charAt(0) == '{' 49 | && define.charAt(define.length() - 1) == '}'){ 50 | this.bSubstantive = false; 51 | } else { 52 | this.bSubstantive = true; 53 | } 54 | 55 | parseDefine(); 56 | } 57 | 58 | /** 59 | * 处理定义,把定义分为第一基本义元、其他基本义元、关系义元和符号义元四类 60 | */ 61 | private void parseDefine() { 62 | List secondList = new ArrayList(); //其他基本义原 63 | List relationList = new ArrayList(); //关系义原 64 | List symbolList = new ArrayList(); //符号义原 65 | 66 | String tokenString = this.define; 67 | 68 | //如果不是实词,则处理“{}”中的内容 69 | if (!this.bSubstantive) { 70 | tokenString = define.substring(1, define.length() - 1); 71 | } 72 | 73 | StringTokenizer token = new StringTokenizer(tokenString, ",", false); 74 | 75 | // 第一个为第一基本义元 76 | if (token.hasMoreTokens()) { 77 | this.mainSememe = token.nextToken(); 78 | } 79 | 80 | main_loop: while (token.hasMoreTokens()) { 81 | String item = token.nextToken(); 82 | if (item.equals("")) continue; 83 | 84 | // 先判断是否为符号义元 85 | String symbol = item.substring(0, 1); 86 | for(int i=0;i< Symbol_Descriptions.length;i++){ 87 | if(symbol.equals( Symbol_Descriptions[i][0])){ 88 | symbolList.add(item); 89 | continue main_loop; 90 | } 91 | } 92 | 93 | //如果不是符号义元,则进一步判断是关系义元还是第二基本义元, 带有“=”表示关系义原 94 | if (item.indexOf('=') > 0){ 95 | relationList.add(item); 96 | } else { 97 | secondList.add(item); 98 | } 99 | } 100 | 101 | this.secondSememes = secondList.toArray(new String[secondList.size()]); 102 | this.relationSememes = relationList.toArray(new String[relationList.size()]); 103 | this.symbolSememes = symbolList.toArray(new String[symbolList.size()]); 104 | } 105 | 106 | /** 107 | * 获取第一义元 108 | * 109 | * @return 110 | */ 111 | public String getMainSememe() { 112 | return mainSememe; 113 | } 114 | 115 | /** 116 | * 获取其他基本义元描述 117 | * 118 | * @return 119 | */ 120 | public String[] getSecondSememes() { 121 | return secondSememes; 122 | } 123 | 124 | /** 125 | * 获取关系义元描述 126 | * 127 | * 
@return 128 | */ 129 | public String[] getRelationSememes() { 130 | return relationSememes; 131 | } 132 | 133 | /** 134 | * 获取符号义元描述 135 | * 136 | * @return 137 | */ 138 | public String[] getSymbolSememes() { 139 | return symbolSememes; 140 | } 141 | 142 | @Override 143 | public String toString() { 144 | StringBuilder sb = new StringBuilder(); 145 | sb.append("name="); 146 | sb.append(this.word); 147 | sb.append("; pos="); 148 | sb.append(this.pos); 149 | sb.append("; define="); 150 | sb.append(this.define); 151 | sb.append("; 第一基本义元:[" + mainSememe); 152 | 153 | sb.append("]; 其他基本义元描述:["); 154 | for(String sem: secondSememes){ 155 | sb.append(sem); 156 | sb.append(";"); 157 | } 158 | 159 | sb.append("]; [关系义元描述:"); 160 | for(String sem: relationSememes){ 161 | sb.append(sem); 162 | sb.append(";"); 163 | } 164 | 165 | sb.append("]; [关系符号描述:"); 166 | for(String sem: symbolSememes){ 167 | sb.append(sem); 168 | sb.append(";"); 169 | } 170 | sb.append("]"); 171 | return sb.toString(); 172 | } 173 | 174 | /** 175 | * 是实词还是虚词 176 | * 177 | * @return true:实词;false:虚词 178 | */ 179 | public boolean isSubstantive() { 180 | return this.bSubstantive; 181 | } 182 | 183 | public String getWord() { 184 | return word; 185 | } 186 | 187 | public void setWord(String word) { 188 | this.word = word; 189 | } 190 | 191 | public String getPos() { 192 | return pos; 193 | } 194 | 195 | public void setPos(String pos) { 196 | this.pos = pos; 197 | } 198 | 199 | public String getDefine() { 200 | return define; 201 | } 202 | 203 | public void setDefine(String define) { 204 | this.define = define; 205 | } 206 | 207 | /** 208 | * 获取该概念的类型 209 | * 210 | * @return 211 | */ 212 | public String getType() { 213 | for (int i = 0; i < Concept_Type.length; i++) { 214 | if (define.toUpperCase().indexOf(Concept_Type[i][0].toUpperCase()) >= 0) { 215 | return Concept_Type[i][1]; 216 | } 217 | } 218 | return "普通概念"; 219 | } 220 | 221 | /** 222 | * 按照概念的名称进行比较 223 | */ 224 | public int compareTo(Concept o) { 
225 | return word.compareTo(o.word); 226 | } 227 | 228 | ////////////////////////////////////////////// 229 | /** 230 | * 方便在parse中比较概念词语加入的方法 231 | * @param another 232 | * @return 233 | */ 234 | public int compareTo(String another){ 235 | return word.compareTo(another); 236 | } 237 | 238 | public boolean equals(String another){ 239 | return word.equals(another); 240 | } 241 | } -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/word/hownet/concept/ConceptDictTraverseEvent.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.word.hownet.concept; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileOutputStream; 6 | import java.io.InputStream; 7 | import java.io.InputStreamReader; 8 | import java.io.PrintWriter; 9 | import java.util.ArrayList; 10 | import java.util.Arrays; 11 | import java.util.List; 12 | 13 | import javax.xml.parsers.DocumentBuilder; 14 | import javax.xml.parsers.DocumentBuilderFactory; 15 | import javax.xml.transform.OutputKeys; 16 | import javax.xml.transform.Transformer; 17 | import javax.xml.transform.TransformerFactory; 18 | import javax.xml.transform.dom.DOMSource; 19 | import javax.xml.transform.stream.StreamResult; 20 | 21 | import org.w3c.dom.Document; 22 | import org.w3c.dom.Element; 23 | 24 | import ruc.irm.similarity.util.TraverseEvent; 25 | 26 | /** 27 | * 实现遍历加载概念信息到概念表中, 概念词典的组织以知网导出的格式为标准,格式如下:
28 | * 阿斗 N human|人,ProperName|专,past|昔
29 | * 阿爸 N human|人,family|家,male|男
30 | * 即: <概念> <空格或者跳格> <词性> <空格或者跳格> <定义>" 31 | *
32 | * 概念保存到数组中,没有保存到Map中,可以降低对内存空间的使用 33 | * 34 | * @author 夏天 35 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 36 | * @deprecated 37 | */ 38 | public class ConceptDictTraverseEvent implements TraverseEvent { 39 | private List conceptList = null; 40 | 41 | public ConceptDictTraverseEvent(){ 42 | conceptList = new ArrayList(); 43 | } 44 | 45 | public Concept[] getConcepts(){ 46 | Concept[] concepts = conceptList.toArray(new Concept[conceptList.size()]); 47 | Arrays.sort(concepts); 48 | return concepts; 49 | } 50 | 51 | /** 52 | * 读取概念词典中的一行,并进行解析处理 53 | */ 54 | public boolean visit(String line) { 55 | String word = null; 56 | String pos = null; 57 | String define = ""; 58 | char ch; 59 | 60 | //以符号//开始的是注释行 61 | if(line.startsWith("//")){ 62 | return true; 63 | } 64 | 65 | int lastPosition = 0; //最近一次处理内容的有意义的开始位置 66 | int processFlag = 0; //当前处理部分的标志 0:处理word; 1:词性;2:定义 67 | //解析出一行中的概念各项数据 68 | loop: for (int position = 0; position < line.length(); position++) { 69 | ch = line.charAt(position); 70 | 71 | if ((ch == ' ') || (ch == '\t') || (position==(line.length()-1))) { 72 | String item = line.substring(lastPosition, (position==(line.length()-1))?(position+1):position); 73 | switch(processFlag){ 74 | case 0: 75 | word = item; 76 | processFlag++; 77 | break; 78 | case 1: 79 | pos = item; 80 | processFlag++; 81 | break; 82 | case 2: 83 | //define = item; 84 | //processFlag++; 85 | define = line.substring(lastPosition).trim(); 86 | break loop; 87 | case 3: 88 | System.out.println(line); 89 | break; 90 | } 91 | 92 | for( ;(position < line.length()); position++){ 93 | ch = line.charAt(position); 94 | if ((ch != ' ') && (ch != '\t')) { 95 | lastPosition = position; 96 | break; 97 | } 98 | } 99 | 100 | } 101 | } 102 | conceptList.add(new Concept(word, pos, define)); 103 | return true; 104 | } 105 | 106 | public void saveToXML(File xmlFile) throws Exception{ 107 | String conceptFile = getClass().getPackage().getName().replaceAll("\\.", "/") + "/concept.dat"; 108 | 
InputStream input = this.getClass().getClassLoader().getResourceAsStream(conceptFile); 109 | BufferedReader in = new BufferedReader(new InputStreamReader(input, "utf8")); 110 | 111 | DocumentBuilderFactory factory=DocumentBuilderFactory.newInstance(); 112 | DocumentBuilder builder=factory.newDocumentBuilder(); 113 | Document document=builder.newDocument(); 114 | Element root=document.createElement("concepts"); 115 | document.appendChild(root); 116 | 117 | String line = null; 118 | 119 | while ((line = in.readLine()) != null) { 120 | saveLineToXML(document, root, line); 121 | } 122 | 123 | input.close(); 124 | in.close(); 125 | 126 | TransformerFactory tf=TransformerFactory.newInstance(); 127 | Transformer transformer=tf.newTransformer(); 128 | DOMSource source=new DOMSource(document); 129 | transformer.setOutputProperty(OutputKeys.ENCODING,"utf8"); 130 | transformer.setOutputProperty(OutputKeys.INDENT,"yes"); 131 | PrintWriter pw=new PrintWriter(xmlFile, "UTF-8"); // explicit charset: a Writer decides the bytes on disk, so it must match the encoding declared above instead of the platform default 132 | StreamResult result=new StreamResult(pw); 133 | try { transformer.transform(source,result); } finally { pw.close(); } // always flush and release the output file, even if the transform fails 134 | } 135 | 136 | 137 | /** 138 | * 读取概念词典中的一行,并进行解析处理 139 | */ 140 | private boolean saveLineToXML(Document document, Element root, String line) { 141 | String word = null; 142 | String pos = null; 143 | String define = ""; 144 | char ch; 145 | 146 | //以符号//开始的是注释行 147 | if(line.startsWith("//")){ 148 | return true; 149 | } 150 | 151 | int lastPosition = 0; //最近一次处理内容的有意义的开始位置 152 | int processFlag = 0; //当前处理部分的标志 0:处理word; 1:词性;2:定义 153 | //解析出一行中的概念各项数据 154 | loop: for (int position = 0; position < line.length(); position++) { 155 | ch = line.charAt(position); 156 | 157 | if ((ch == ' ') || (ch == '\t') || (position==(line.length()-1))) { 158 | String item = line.substring(lastPosition, (position==(line.length()-1))?(position+1):position); 159 | switch(processFlag){ 160 | case 0: 161 | word = item; 162 | processFlag++; 163 | break; 164 | case 1: 165 | pos = item; 166 | processFlag++; 167 | break; 
168 | case 2: 169 | //define = item; 170 | //processFlag++; 171 | define = line.substring(lastPosition).trim(); 172 | break loop; 173 | case 3: 174 | System.out.println(line); 175 | break; 176 | } 177 | 178 | for( ;(position < line.length()); position++){ 179 | ch = line.charAt(position); 180 | if ((ch != ' ') && (ch != '\t')) { 181 | lastPosition = position; 182 | break; 183 | } 184 | } 185 | 186 | } 187 | } 188 | 189 | Element e = document.createElement("c"); 190 | e.setAttribute("w", word); 191 | e.setAttribute("p", pos); 192 | e.setAttribute("d", define); 193 | root.appendChild(e); 194 | return true; 195 | } 196 | 197 | public static void main(String[] args) throws Exception { 198 | new ConceptDictTraverseEvent().saveToXML(new File("/home/xiatian/Desktop/concept.xml")); 199 | } 200 | 201 | } 202 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/word/hownet/concept/ConceptLinkedList.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.word.hownet.concept; 2 | 3 | import java.util.LinkedList; 4 | 5 | /** 6 | * 用于概念处理的LinkedList 7 | * 8 | * @author 夏天 9 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 10 | * 11 | * @param 12 | * @deprecated 13 | */ 14 | @SuppressWarnings("serial") 15 | public class ConceptLinkedList extends LinkedList { 16 | 17 | /** 18 | * 删除链表中最后面的size个元素 19 | * @param size 20 | */ 21 | public void removeLast(int size){ 22 | for(int i=0;i夏天 15 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 16 | * @deprecated 17 | */ 18 | public class LiuConceptParser extends ConceptParser{ 19 | 20 | private static LiuConceptParser instance = null; 21 | 22 | public static LiuConceptParser getInstance(){ 23 | if(instance == null){ 24 | try { 25 | instance = new LiuConceptParser(); 26 | } catch (IOException e) { 27 | e.printStackTrace(); 28 | } 29 | } 30 | 31 | return instance; 32 | } 33 | 34 | private LiuConceptParser(SememeParser sememeParser) 
throws IOException { 35 | super(sememeParser); 36 | } 37 | 38 | private LiuConceptParser() throws IOException{ 39 | super(new LiuqunSememeParser()); 40 | } 41 | 42 | @Override 43 | protected double calculate(double sim_v1, double sim_v2, double sim_v3, double sim_v4){ 44 | return beta1 * sim_v1 45 | + beta2 * sim_v1 * sim_v2 46 | + beta3 * sim_v1 * sim_v2 * sim_v3 47 | + beta4 * sim_v1 * sim_v2 * sim_v3 * sim_v4; 48 | } 49 | 50 | @Override 51 | public double getSimilarity(String word1, String word2) { 52 | double similarity = 0.0; 53 | 54 | // 如果两个句子相同,则直接返回1.0 55 | if (word1.equals(word2)) { 56 | return 1.0; 57 | } 58 | 59 | Collection concepts1 = getConcepts(word1); 60 | Collection concepts2 = getConcepts(word2); 61 | 62 | //如果是blank,则说明是未登录词, 需要计算组合概念 63 | if(BlankUtils.isBlank(concepts1) || BlankUtils.isBlank(concepts2)){ 64 | return 0.0; 65 | } 66 | 67 | //两个for循环分别计算词语所有可能的概念的相似度 68 | for(Concept c1:concepts1){ 69 | for(Concept c2:concepts2){ 70 | double v = getSimilarity(c1, c2); 71 | 72 | if(v>similarity){ 73 | similarity = v; 74 | } 75 | 76 | if(similarity == 1.0){ 77 | break; 78 | } 79 | } 80 | } 81 | 82 | return similarity; 83 | } 84 | 85 | } 86 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/word/hownet/sememe/FastSimpleMap.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.word.hownet.sememe; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.Collection; 6 | 7 | /** 8 | * 一种新的Map,跟标准的Map不同,它的的Key可以有重复, 内部采用快速排序和二分查找, 9 | * 保持较少的变量,结构简单,可根据主键查找返回的结果是一个数组 10 | * 11 | * @author 夏天 12 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 13 | * 14 | * @param 15 | * @param 16 | * @deprecated 17 | */ 18 | public class FastSimpleMap, V> { 19 | private K[] keys; 20 | private V[] values; 21 | 22 | public FastSimpleMap(K[] keys, V[] values) throws IOException{ 23 | if(keys.length!=values.length){ 24 | 
throw new IOException("keys length must be equals values"); 25 | } 26 | this.keys = keys; 27 | this.values = values; 28 | 29 | // 根据keys进行排序 30 | quicksort(0, keys.length-1); 31 | } 32 | 33 | /** 34 | * 查找键对应的值集合 35 | * @param key 36 | * @return 37 | */ 38 | public Collection get(K key) { 39 | int low = 0; 40 | int high = keys.length - 1; 41 | 42 | Collection results = new ArrayList(); 43 | 44 | while (low <= high) { 45 | int mid = (low + high) >> 1; 46 | K item = keys[mid]; 47 | int cmp = key.compareTo(item); 48 | 49 | if (cmp > 0) { 50 | low = mid + 1; 51 | } else if (cmp < 0) { 52 | high = mid - 1; 53 | } else { 54 | // 找到起始位置,该位置前后相同的都是该主键对应的值 55 | for(int i=mid;i>=0 && keys[i].equals(key); i--){ 56 | results.add(values[i]); 57 | } 58 | for(int i=mid+1; i>1]; 84 | 85 | //partition 86 | do { 87 | while (keys[i].compareTo(x)<0) i++; 88 | while (keys[j].compareTo(x)>0) j--; 89 | 90 | if (i<=j) 91 | { 92 | h=keys[i]; keys[i]=keys[j]; keys[j]=h; 93 | v=values[i]; values[i]=values[j]; values[j]=v; 94 | i++; j--; 95 | } 96 | } while (i<=j); 97 | 98 | // recursion 99 | if (low夏天 9 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 10 | * 11 | * @author xiatian 12 | * @version 1.0 13 | * @deprecated 14 | */ 15 | public class LiuqunSememeParser extends SememeParser { 16 | 17 | /** 计算义元相似度的可调节的参数,默认为1.6 */ 18 | private final float alpha = 1.6f; 19 | 20 | public LiuqunSememeParser() throws IOException { 21 | super(); 22 | } 23 | 24 | /** 25 | * 计算两个义元之间的相似度,由于义元可能相同,计算结果为其中相似度最大者 26 | *
similarity = alpha/(distance+alpha) 27 | * 28 | * @param item1 the first sememe description string 29 | * @param item2 the second sememe description string 30 | * @return the similarity; 0.0 when either string is null or empty, or when the two descriptions are structurally incompatible 31 | */ 32 | @Override 33 | public double getSimilarity(String item1, String item2) { 34 | int pos; 35 | 36 | // null or empty input: similarity is 0 37 | if (item1 == null || item2 == null || item1.equals("") 38 | || item2.equals("")) 39 | return 0.0; 40 | 41 | String key1 = item1.trim(); 42 | String key2 = item2.trim(); 43 | 44 | // strip the enclosing parentheses; both keys must be parenthesised, otherwise 0 45 | if ((key1.charAt(0) == '(') && (key1.charAt(key1.length() - 1) == ')')) { 46 | if (key2.charAt(0) == '(' && key2.charAt(key2.length() - 1) == ')') { 47 | key1 = key1.substring(1, key1.length() - 1); 48 | key2 = key2.substring(1, key2.length() - 1); 49 | } else { 50 | return 0.0; 51 | } 52 | } 53 | 54 | // relation sememes, i.e. the form x=y 55 | if ((pos = key1.indexOf('=')) > 0) { 56 | int pos2 = key2.indexOf('='); 57 | // the parts before '=' must be identical; if so, continue with the parts after '=', otherwise 0 58 | if ((pos == pos2) 59 | && key1.substring(0, pos).equals(key2.substring(0, pos2))) { 60 | key1 = key1.substring(pos + 1); 61 | key2 = key2.substring(pos2 + 1); 62 | } else { 63 | return 0.0; 64 | } 65 | } 66 | 67 | // symbol sememes, i.e. sememes prefixed with a special symbol 68 | String symbol1 = key1.substring(0, 1); 69 | String symbol2 = key2.substring(0, 1); 70 | 71 | for (int i = 0; i < Symbol_Descriptions.length; i++) { 72 | if (symbol1.equals(Symbol_Descriptions[i][0])) { 73 | if (symbol1.equals(symbol2)) { 74 | key1 = key1.substring(1); // drop the symbol from the normalised key, not from the raw input, so the trimming and stripping above are kept 75 | key2 = key2.substring(1); 76 | break; 77 | } else { 78 | return 0.0; // different symbols: similarity is 0 79 | } 80 | } 81 | } 82 | 83 | if ((pos = key1.indexOf("|")) >= 0) { 84 | key1 = key1.substring(pos + 1); 85 | } 86 | if ((pos = key2.indexOf("|")) >= 0) { 87 | key2 = key2.substring(pos + 1); 88 | } 89 | 90 | int distance = getDistance(key1, key2); 91 | if (distance < 0) 92 | return 0.0; 93 | else 94 | return alpha / (distance + alpha); 95 | } 96 | 97 | @Override 98 | public double getSimilarity(Sememe sem1, Sememe sem2) { 99 | int distance = getDistance(sem1, sem2); 100 | if (distance <= 0) 101 | return 
0.0f; 102 | else 103 | return alpha / (distance + alpha); 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/word/hownet/sememe/MySememeParser.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.word.hownet.sememe; 2 | 3 | import java.io.IOException; 4 | 5 | import ruc.irm.similarity.util.BlankUtils; 6 | 7 | 8 | /** 9 | * 义原相似度计算, 实现了SememeParser中定义的抽象方法 10 | * 11 | * @author 夏天 12 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 13 | * @deprecated 14 | */ 15 | public class MySememeParser extends SememeParser { 16 | 17 | public MySememeParser() throws IOException{ 18 | super(); 19 | } 20 | 21 | /** 22 | * 计算两个义原的相似度 23 | */ 24 | @Override 25 | public double getSimilarity(final Sememe sememe1, final Sememe sememe2) { 26 | Sememe sem1 = sememe1; 27 | Sememe sem2 = sememe2; 28 | 29 | if (sememe1 == null || sememe2 == null){ 30 | return 0.0f; 31 | }else if(sememe1.getId() == sememe2.getId()){ 32 | return 1.0f; 33 | } 34 | 35 | //变为深度相同,然后一次上找共同的父节点 36 | int level = sememe1.getDepth() - sememe2.getDepth(); 37 | for (int i = 0; i < ((level < 0) ? 
level * -1 : level); i++) { 38 | if (level > 0){ 39 | sem1 = SEMEMES[sem1.getParentId()]; 40 | }else{ 41 | sem2 = SEMEMES[sem2.getParentId()]; 42 | } 43 | } 44 | 45 | while(sem1.getId() != sem2.getId()){ 46 | // 如果有一个已经到达根节点,仍然不同,则返回0 47 | if (sem1.getId() == sem1.getParentId() 48 | || sem2.getId() == sem2.getParentId()) { 49 | return 0.0f; 50 | } 51 | 52 | sem1 = SEMEMES[sem1.getParentId()]; 53 | sem2 = SEMEMES[sem2.getParentId()]; 54 | } 55 | 56 | return sem1.getDepth()*2.0f/(sememe1.getDepth() + sememe2.getDepth()); 57 | } 58 | 59 | /** 60 | * 计算两个义元之间的相似度,由于义元可能相同,计算结果为其中相似度最大者 similarity = alpha/(distance+alpha), 61 | * 如果两个字符串相同或都为空,直接返回1.0 62 | * 63 | * @param key1 第一个义原字符串 64 | * @param key2 第二个义原字符串 65 | * @return 66 | */ 67 | @Override 68 | public double getSimilarity(String item1, String item2) { 69 | if(BlankUtils.isBlankAll(item2, item2)){ 70 | return 1.0; 71 | } else if(BlankUtils.isBlankAtLeastOne(item1, item2)){ 72 | return 0.0; 73 | } else if(item1.equals(item2)){ 74 | return 1.0; 75 | } 76 | 77 | String key1 = item1.trim(); 78 | String key2 = item2.trim(); 79 | 80 | // 去掉()符号 81 | if ((key1.charAt(0) == '(') && (key1.charAt(key1.length() - 1) == ')')) { 82 | 83 | if (key2.charAt(0) == '(' && key2.charAt(key2.length() - 1) == ')') { 84 | key1 = key1.substring(1, key1.length() - 1); 85 | key2 = key2.substring(1, key2.length() - 1); 86 | } else { 87 | return 0.0; 88 | } 89 | 90 | } 91 | 92 | // 处理关系义元,即x=y的情况 93 | int pos = key1.indexOf('='); 94 | if (pos > 0) { 95 | int pos2 = key2.indexOf('='); 96 | // 如果是关系义元,则判断前面部分是否相同,如果相同,则转为计算后面部分的相似度,否则为0 97 | if ((pos == pos2) 98 | && key1.substring(0, pos).equals(key2.substring(0, pos2))) { 99 | key1 = key1.substring(pos + 1); 100 | key2 = key2.substring(pos2 + 1); 101 | } else { 102 | return 0.0; 103 | } 104 | } 105 | 106 | // 处理符号义元,即前面有特殊符号的义元 107 | String symbol1 = key1.substring(0, 1); 108 | String symbol2 = key2.substring(0, 1); 109 | 110 | for (int i = 0; i < Symbol_Descriptions.length; i++) { 111 | 
if (symbol1.equals(Symbol_Descriptions[i][0])) { 112 | if (symbol1.equals(symbol2)) { 113 | key1 = item1.substring(1); 114 | key2 = item2.substring(1); 115 | break; 116 | } else { 117 | return 0.0; // 如果不是同一关系符号,则相似度直接返回0 118 | } 119 | } 120 | } 121 | 122 | if ((pos = key1.indexOf("|")) >= 0) { 123 | key1 = key1.substring(pos + 1); 124 | } 125 | if ((pos = key2.indexOf("|")) >= 0) { 126 | key2 = key2.substring(pos + 1); 127 | } 128 | 129 | // 如果两个字符串相等,直接返回距离为0 130 | if (key1.equals(key2)) { 131 | return 1.0; 132 | } 133 | 134 | Integer[] myset1 = getSememes(key1); 135 | Integer[] myset2 = getSememes(key2); 136 | 137 | double similarity = 0.0; 138 | for(int id1:myset1){ 139 | for(int id2:myset2){ 140 | double s = getSimilarity(SEMEMES[id1], SEMEMES[id2]); 141 | if(s>similarity){ 142 | similarity = s; 143 | } 144 | } 145 | } 146 | 147 | return similarity; 148 | } 149 | 150 | 151 | } -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/word/hownet/sememe/Sememe.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.word.hownet.sememe; 2 | 3 | /** 4 | * 描述知网义原的基本对象, 出于性能考虑,把未用到的英文名称、定义等在加载时忽略, 更准确的做法是以[英文定义|中文定义] 5 | * 作为一个整理进行处理,不过绝大多数只根据中文定义就可以标识出来,因此忽略不计。 6 | * 7 | * @author 夏天 8 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 9 | * @deprecated 10 | */ 11 | public class Sememe { 12 | /** 义原编号 */ 13 | private int id; 14 | /** 指向上位义元号 */ 15 | private int parentId; 16 | /** 义原在义原树中的深度 */ 17 | private int depth; 18 | /** 义原的中文名称*/ 19 | private String cnWord; 20 | /** 义原的英文名称 */ 21 | private String enWord; 22 | /** 义原的定义,如果没有(例如数量),则为空串 */ 23 | private String define; 24 | /** 义原的类型 */ 25 | private int type; 26 | 27 | /** 28 | * 每一行的形式为:be|是 {relevant,isa}/{relevant,descriptive} 29 | *
或者 official|官 [#organization|组织,#employee|员] 30 | *
或者 amount|多少 31 | *
把相应的部分赋予不同的属性 32 | * 出于性能考虑,把未用到的英文名称、定义等忽略 33 | * @param id 34 | * @param parentId 35 | * @param item 读取文件中的一行 36 | */ 37 | public Sememe(int id, int parentId, int depth, String item) { 38 | this.id = id; 39 | this.parentId = parentId; 40 | this.depth = depth; 41 | 42 | int pos = item.indexOf('|'); 43 | if (pos < 0) { 44 | this.cnWord = item; 45 | this.enWord = item; 46 | } else { 47 | this.enWord = item.substring(0, pos); 48 | 49 | // 去掉"|"符号 50 | String nextPart = item.substring(pos + 1); 51 | pos = nextPart.indexOf(' '); 52 | if (pos <= 0) { 53 | this.cnWord = nextPart; 54 | } else { 55 | this.cnWord = nextPart.substring(0, pos); 56 | this.define = nextPart.substring(pos).trim(); 57 | } 58 | } 59 | } 60 | 61 | public int getId() { 62 | return id; 63 | } 64 | 65 | public void setId(int id) { 66 | this.id = id; 67 | } 68 | 69 | public int getParentId() { 70 | return parentId; 71 | } 72 | 73 | public void setParentId(int parentId) { 74 | this.parentId = parentId; 75 | } 76 | 77 | public int getDepth() { 78 | return depth; 79 | } 80 | 81 | public void setDepth(int depth) { 82 | this.depth = depth; 83 | } 84 | 85 | public String getCnWord() { 86 | return cnWord; 87 | } 88 | 89 | public void setCnWord(String cnWord) { 90 | this.cnWord = cnWord; 91 | } 92 | 93 | public String getEnWord() { 94 | return enWord; 95 | } 96 | 97 | public void setEnWord(String enWord) { 98 | this.enWord = enWord; 99 | } 100 | 101 | public String getDefine() { 102 | return define; 103 | } 104 | 105 | public void setDefine(String define) { 106 | this.define = define; 107 | } 108 | 109 | public int getType() { 110 | return type; 111 | } 112 | 113 | public void setType(int type) { 114 | this.type = type; 115 | } 116 | 117 | @Override 118 | public String toString(){ 119 | StringBuilder sb = new StringBuilder(); 120 | sb.append("id="); 121 | sb.append(id); 122 | sb.append("; parentId="); 123 | sb.append(parentId); 124 | sb.append("; depth="); 125 | sb.append(depth); 126 | sb.append("; cnWord="); 
127 | sb.append(cnWord); 128 | sb.append("; enWord="); 129 | sb.append(enWord); 130 | sb.append("; define="); 131 | sb.append(define); 132 | return sb.toString(); 133 | } 134 | 135 | } 136 | 137 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/word/hownet/sememe/SememeParser.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.word.hownet.sememe; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.util.Collection; 6 | 7 | import org.slf4j.Logger; 8 | import org.slf4j.LoggerFactory; 9 | import ruc.irm.similarity.Similaritable; 10 | import ruc.irm.similarity.util.BlankUtils; 11 | import ruc.irm.similarity.util.FileUtils; 12 | import ruc.irm.similarity.word.hownet.HownetMeta; 13 | 14 | /** 15 | * 义原解析器, 包括义元数据的加载,义元的组织、索引、查询 以及义元的距离计算和相似度计算等. 16 | * 算法的核心思想请参看论文《汉语词语语义相似度计算研究》 17 | * 18 | * @author 夏天 19 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 20 | * 21 | * @see ruc.irm.similarity.Similaritable 22 | * @deprecated 23 | */ 24 | public abstract class SememeParser implements HownetMeta, Similaritable { 25 | protected Logger LOG = LoggerFactory.getLogger(this.getClass()); 26 | 27 | /** 所有的义原都存放到一个数组之中,并且义元的ID号与数组的下标相同 */ 28 | protected Sememe[] SEMEMES; 29 | 30 | /** 通过对义原的汉语词义进行索引,根据该索引快速定位义原,找出义原的id,再到sememes中查找 */ 31 | private FastSimpleMap sememeMap = null; 32 | 33 | public SememeParser() throws IOException{ 34 | String sememeFile = getClass().getPackage().getName().replaceAll("\\.", "/") + "/sememe.dat"; 35 | 36 | InputStream input = this.getClass().getClassLoader().getResourceAsStream(sememeFile); 37 | load(input, "UTF-8"); 38 | } 39 | 40 | /** 41 | * 获取两个义原描述串的相似度 42 | * @param sememeName1 43 | * @param sememeName2 44 | * @see ke.commons.similarity.Similariable 45 | * @return 46 | */ 47 | public abstract double getSimilarity(String sememeName1, String sememeName2); 48 | 49 | /** 50 | * 获取两个确定义原的相似度 51 | 
* @param sememe1 52 | * @param sememe2 53 | * @return 54 | */ 55 | public abstract double getSimilarity(Sememe sememe1, Sememe sememe2); 56 | 57 | /** 58 | * 从文件中加载义元知识 59 | * 60 | * @throws IOException 61 | */ 62 | public void load(InputStream input, String encoding) throws IOException { 63 | SememeDictTraverseEvent event = new SememeDictTraverseEvent(); 64 | LOG.info("loading sememe dictionary..."); 65 | long time = System.currentTimeMillis(); 66 | FileUtils.traverseLines(input, encoding, event); 67 | this.SEMEMES = event.getSememes(); 68 | 69 | String[] keys = new String[SEMEMES.length]; 70 | Integer[] values = new Integer[SEMEMES.length]; 71 | 72 | //设置索引 73 | for(int i=0; i(keys, values); 78 | 79 | time = System.currentTimeMillis() - time; 80 | LOG.info("sememe dictionary load completely. time elapsed: " + time); 81 | } 82 | 83 | /** 84 | * 根据汉语定义计算义元之间的距离,Integer.MAX_VALUE代表两个义元之间的距离为无穷大, 85 | *
由于可能多个义元有相同的汉语词语,故计算结果为其中距离最小者 86 | * 87 | * @param key1 88 | * @param key2 89 | * @return 90 | */ 91 | public int getDistance(String key1, String key2) { 92 | int distance = Integer.MAX_VALUE; 93 | 94 | // 如果两个字符串相等,直接返回距离为0 95 | if (key1.equals(key2)) { 96 | return 0; 97 | } 98 | 99 | Integer[] semArray1 = getSememes(key1); 100 | Integer[] semArray2 = getSememes(key2); 101 | 102 | // 如果key1或者key2不是义元,并且key1<>key2,则返回无穷大 103 | if (semArray1.length == 0 || semArray2.length == 0) { 104 | return Integer.MAX_VALUE; 105 | } 106 | 107 | for(int i:semArray1){ 108 | for(int j:semArray2){ 109 | int d = getDistance(SEMEMES[i], SEMEMES[j]); 110 | if(d 0) 140 | mysem1 = SEMEMES[mysem1.getParentId()]; 141 | else 142 | mysem2 = SEMEMES[mysem2.getParentId()]; 143 | distance++; 144 | } 145 | 146 | //从不同的分支(深度相同)同时向上寻找共同的祖先节点 147 | while (mysem1.getId() != mysem2.getId()) { 148 | // 如果已经到达根节点,仍然不同,则返回无穷大(-1) 149 | if (mysem1.getId() == mysem1.getParentId() 150 | || mysem2.getId() == mysem2.getParentId()) { 151 | distance = Integer.MAX_VALUE; 152 | break; 153 | } 154 | 155 | mysem1 = SEMEMES[mysem1.getParentId()]; 156 | mysem2 = SEMEMES[mysem2.getParentId()]; 157 | distance += 2; 158 | } 159 | 160 | return distance; 161 | } 162 | 163 | /** 164 | * 获取从该义元到根节点的路径表示字符串 165 | * 166 | * @param key 167 | * @return 168 | */ 169 | public String getPath(String key) { 170 | StringBuilder path = new StringBuilder(); 171 | 172 | Sememe sem = getSememe(key); 173 | while (sem != null && sem.getId() != sem.getParentId()) { 174 | path.insert(0, "->" + sem.getCnWord()); 175 | sem = SEMEMES[sem.getParentId()]; 176 | } 177 | 178 | if (sem != null){ 179 | path.insert(0, "->" + sem.getCnWord()); 180 | } 181 | path.insert(0, "START"); 182 | return path.toString(); 183 | } 184 | 185 | /** 186 | * 根据义原的名字,获取该义原的位置信息,义原体系中有时会有一个名字对应多个义原,一并返回到 187 | * 义原数组中 188 | * @param sememeName 189 | * @return 190 | */ 191 | public Integer[] getSememes(String sememeName) { 192 | Collection ids = 
sememeMap.get(sememeName); 193 | 194 | return ids.toArray(new Integer[ids.size()]); 195 | } 196 | 197 | /** 198 | * 获取其中的一个义原,大部分义原就只有一个 199 | * @param sememeName 200 | * @return 201 | */ 202 | public Sememe getSememe(String sememeName){ 203 | Integer[] ids = getSememes(sememeName); 204 | 205 | if(BlankUtils.isBlank(ids)){ 206 | return null; 207 | }else{ 208 | return SEMEMES[ids[0]]; 209 | } 210 | } 211 | 212 | /** 213 | * 过滤义原字符串,去掉其中的英文部分 214 | * @param sememeString 215 | * @return 216 | */ 217 | protected String filterSememeString(String sememeString){ 218 | int pos = sememeString.indexOf("|"); 219 | if (pos >= 0) { 220 | sememeString = sememeString.substring(pos + 1); 221 | } 222 | return sememeString; 223 | } 224 | 225 | } 226 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/word/hownet/sememe/SememeType.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.word.hownet.sememe; 2 | 3 | /** 4 | * 义原的类型定义
5 | *
    6 | *
  • 1:Event|事件
  • 7 | *
  • 2:Entity|实体
  • 8 | *
  • 3:Attribute|属性
  • 9 | *
  • 4:Quantity|数量
  • 10 | *
  • 5:aValue|属性值
  • 11 | *
  • 6:qValue|数量值
  • 12 | *
  • 7: Secondary Feature|第二特征
  • 13 | *
  • 8: Syntax|语法
  • 14 | *
  • 9: EventRole|动态角色
  • 15 | *
  • 10:EventFeatures|动态属性
  • 16 | *
  • 0:未知
  • 17 | *
18 | * 19 | * 其中1~7为基本义元,8为语法义元,9、10为关系义元
20 | * 21 | * @author 夏天 22 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 23 | * @deprecated 24 | */ 25 | public interface SememeType { 26 | /** Event|事件类型定义 */ 27 | public static final int Event = 1; 28 | 29 | /** Entity|实体类型定义*/ 30 | public static final int Entity = 2; 31 | 32 | /** Attribute|属性类型定义*/ 33 | public static final int Attribute = 3; 34 | 35 | /** Quantity|数量类型定义*/ 36 | public static final int Quantity = 4; 37 | 38 | /** aValue|属性值类型定义*/ 39 | public static final int AValue = 5; 40 | 41 | /** qValue|数量值类型定义*/ 42 | public static final int QValue = 6; 43 | 44 | /** Secondary Feature|第二特征类型定义*/ 45 | public static final int SecondaryFeature = 7; 46 | 47 | /** Syntax|语法类型定义*/ 48 | public static final int Syntax = 8; 49 | 50 | /** EventRole|动态角色类型定义*/ 51 | public static final int EventRoleAndFeature = 9; 52 | 53 | /** EventFeatures|动态属性类型定义*/ 54 | public static final int EventFeature = 10; 55 | 56 | /** 未知类型定义*/ 57 | public static final int Unknown = 0; 58 | 59 | } 60 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/word/hownet2/concept/ConceptDictTraverseEvent.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.word.hownet2.concept; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileOutputStream; 6 | import java.io.InputStream; 7 | import java.io.InputStreamReader; 8 | import java.io.PrintWriter; 9 | import java.util.ArrayList; 10 | import java.util.Arrays; 11 | import java.util.List; 12 | 13 | import javax.xml.parsers.DocumentBuilder; 14 | import javax.xml.parsers.DocumentBuilderFactory; 15 | import javax.xml.transform.OutputKeys; 16 | import javax.xml.transform.Transformer; 17 | import javax.xml.transform.TransformerFactory; 18 | import javax.xml.transform.dom.DOMSource; 19 | import javax.xml.transform.stream.StreamResult; 20 | 21 | import org.w3c.dom.Document; 22 | import org.w3c.dom.Element; 
23 | 24 | import ruc.irm.similarity.util.TraverseEvent; 25 | import ruc.irm.similarity.word.hownet2.concept.Concept; 26 | 27 | 28 | /** 29 | * 实现遍历加载概念信息到概念表中, 概念词典的组织以知网导出的格式为标准,格式如下:
30 | * 阿斗 N human|人,ProperName|专,past|昔
31 | * 阿爸 N human|人,family|家,male|男
32 | * 即: <概念> <空格或者跳格> <词性> <空格或者跳格> <定义>" 33 | *
34 | * 概念保存到数组中,没有保存到Map中,可以降低对内存空间的使用 35 | * 36 | * @author 夏天 37 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 38 | */ 39 | public class ConceptDictTraverseEvent implements TraverseEvent { 40 | private List conceptList = null; 41 | 42 | public ConceptDictTraverseEvent(){ 43 | conceptList = new ArrayList(); 44 | } 45 | 46 | public Concept[] getConcepts(){ 47 | Concept[] concepts = conceptList.toArray(new Concept[conceptList.size()]); 48 | Arrays.sort(concepts); 49 | return concepts; 50 | } 51 | 52 | /** 53 | * 读取概念词典中的一行,并进行解析处理 54 | */ 55 | public boolean visit(String line) { 56 | String word = null; 57 | String pos = null; 58 | String define = ""; 59 | char ch; 60 | 61 | //以符号//开始的是注释行 62 | if(line.startsWith("//")){ 63 | return true; 64 | } 65 | 66 | int lastPosition = 0; //最近一次处理内容的有意义的开始位置 67 | int processFlag = 0; //当前处理部分的标志 0:处理word; 1:词性;2:定义 68 | //解析出一行中的概念各项数据 69 | loop: for (int position = 0; position < line.length(); position++) { 70 | ch = line.charAt(position); 71 | 72 | if ((ch == ' ') || (ch == '\t') || (position==(line.length()-1))) { 73 | String item = line.substring(lastPosition, (position==(line.length()-1))?(position+1):position); 74 | switch(processFlag){ 75 | case 0: 76 | word = item; 77 | processFlag++; 78 | break; 79 | case 1: 80 | pos = item; 81 | processFlag++; 82 | break; 83 | case 2: 84 | //define = item; 85 | //processFlag++; 86 | define = line.substring(lastPosition).trim(); 87 | break loop; 88 | case 3: 89 | System.out.println(line); 90 | break; 91 | } 92 | 93 | for( ;(position < line.length()); position++){ 94 | ch = line.charAt(position); 95 | if ((ch != ' ') && (ch != '\t')) { 96 | lastPosition = position; 97 | break; 98 | } 99 | } 100 | 101 | } 102 | } 103 | conceptList.add(new Concept(word, pos, define)); 104 | return true; 105 | } 106 | 107 | public void saveToXML(File xmlFile) throws Exception{ 108 | String conceptFile = getClass().getPackage().getName().replaceAll("\\.", "/") + "/concept.dat"; 109 | InputStream input = 
this.getClass().getClassLoader().getResourceAsStream(conceptFile); 110 | BufferedReader in = new BufferedReader(new InputStreamReader(input, "utf8")); 111 | 112 | DocumentBuilderFactory factory=DocumentBuilderFactory.newInstance(); 113 | DocumentBuilder builder=factory.newDocumentBuilder(); 114 | Document document=builder.newDocument(); 115 | Element root=document.createElement("concepts"); 116 | document.appendChild(root); 117 | 118 | String line = null; 119 | 120 | while ((line = in.readLine()) != null) { 121 | saveLineToXML(document, root, line); 122 | } 123 | 124 | input.close(); 125 | in.close(); 126 | 127 | TransformerFactory tf=TransformerFactory.newInstance(); 128 | Transformer transformer=tf.newTransformer(); 129 | DOMSource source=new DOMSource(document); 130 | transformer.setOutputProperty(OutputKeys.ENCODING,"utf8"); 131 | transformer.setOutputProperty(OutputKeys.INDENT,"yes"); 132 | PrintWriter pw=new PrintWriter(xmlFile, "UTF-8"); // explicit charset: a Writer decides the bytes on disk, so it must match the encoding declared above instead of the platform default 133 | StreamResult result=new StreamResult(pw); 134 | try { transformer.transform(source,result); } finally { pw.close(); } // always flush and release the output file, even if the transform fails 135 | } 136 | 137 | 138 | /** 139 | * 读取概念词典中的一行,并进行解析处理 140 | */ 141 | private boolean saveLineToXML(Document document, Element root, String line) { 142 | String word = null; 143 | String pos = null; 144 | String define = ""; 145 | char ch; 146 | 147 | //以符号//开始的是注释行 148 | if(line.startsWith("//")){ 149 | return true; 150 | } 151 | 152 | int lastPosition = 0; //最近一次处理内容的有意义的开始位置 153 | int processFlag = 0; //当前处理部分的标志 0:处理word; 1:词性;2:定义 154 | //解析出一行中的概念各项数据 155 | loop: for (int position = 0; position < line.length(); position++) { 156 | ch = line.charAt(position); 157 | 158 | if ((ch == ' ') || (ch == '\t') || (position==(line.length()-1))) { 159 | String item = line.substring(lastPosition, (position==(line.length()-1))?(position+1):position); 160 | switch(processFlag){ 161 | case 0: 162 | word = item; 163 | processFlag++; 164 | break; 165 | case 1: 166 | pos = item; 167 | processFlag++; 168 | break; 169 | case 2: 170 | 
//define = item; 171 | //processFlag++; 172 | define = line.substring(lastPosition).trim(); 173 | break loop; 174 | case 3: 175 | System.out.println(line); 176 | break; 177 | } 178 | 179 | for( ;(position < line.length()); position++){ 180 | ch = line.charAt(position); 181 | if ((ch != ' ') && (ch != '\t')) { 182 | lastPosition = position; 183 | break; 184 | } 185 | } 186 | 187 | } 188 | } 189 | 190 | Element e = document.createElement("c"); 191 | e.setAttribute("w", word); 192 | e.setAttribute("p", pos); 193 | e.setAttribute("d", define); 194 | root.appendChild(e); 195 | return true; 196 | } 197 | 198 | public static void main(String[] args) throws Exception { 199 | new ConceptDictTraverseEvent().saveToXML(new File("/home/xiatian/Desktop/concept.xml")); 200 | } 201 | 202 | } 203 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/word/hownet2/concept/ConceptLinkedList.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.word.hownet2.concept; 2 | 3 | import java.util.LinkedList; 4 | 5 | /** 6 | * 用于概念处理的LinkedList 7 | * 8 | * @author 夏天 9 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 10 | * 11 | * @param 12 | */ 13 | @SuppressWarnings("serial") 14 | public class ConceptLinkedList extends LinkedList { 15 | 16 | /** 17 | * 删除链表中最后面的size个元素 18 | * @param size 19 | */ 20 | public void removeLast(int size){ 21 | for(int i=0;i夏天 15 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 16 | */ 17 | public class LiuConceptParser extends BaseConceptParser{ 18 | 19 | private static LiuConceptParser instance = null; 20 | 21 | public static LiuConceptParser getInstance(){ 22 | if(instance == null){ 23 | try { 24 | instance = new LiuConceptParser(); 25 | } catch (IOException e) { 26 | e.printStackTrace(); 27 | } 28 | } 29 | 30 | return instance; 31 | } 32 | 33 | private LiuConceptParser(BaseSememeParser sememeParser) throws IOException { 34 | super(sememeParser); 35 | 
} 36 | 37 | private LiuConceptParser() throws IOException{ 38 | super(new LiuqunSememeParser()); 39 | } 40 | 41 | @Override 42 | protected double calculate(double sim_v1, double sim_v2, double sim_v3, double sim_v4){ 43 | return beta1 * sim_v1 44 | + beta2 * sim_v1 * sim_v2 45 | + beta3 * sim_v1 * sim_v2 * sim_v3 46 | + beta4 * sim_v1 * sim_v2 * sim_v3 * sim_v4; 47 | } 48 | 49 | @Override 50 | public double getSimilarity(String word1, String word2) { 51 | double similarity = 0.0; 52 | 53 | // 如果两个句子相同,则直接返回1.0 54 | if (word1.equals(word2)) { 55 | return 1.0; 56 | } 57 | 58 | Collection concepts1 = getConcepts(word1); 59 | Collection concepts2 = getConcepts(word2); 60 | 61 | //如果是blank,则说明是未登录词, 需要计算组合概念 62 | if(BlankUtils.isBlank(concepts1) || BlankUtils.isBlank(concepts2)){ 63 | return 0.0; 64 | } 65 | 66 | //两个for循环分别计算词语所有可能的概念的相似度 67 | for(Concept c1:concepts1){ 68 | for(Concept c2:concepts2){ 69 | double v = getSimilarity(c1, c2); 70 | 71 | if(v>similarity){ 72 | similarity = v; 73 | } 74 | 75 | if(similarity == 1.0){ 76 | break; 77 | } 78 | } 79 | } 80 | 81 | return similarity; 82 | } 83 | 84 | } 85 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/word/hownet2/sememe/BaseSememeParser.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.word.hownet2.sememe; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.util.zip.GZIPInputStream; 6 | 7 | import javax.xml.namespace.QName; 8 | import javax.xml.stream.XMLEventReader; 9 | import javax.xml.stream.XMLInputFactory; 10 | import javax.xml.stream.events.StartElement; 11 | import javax.xml.stream.events.XMLEvent; 12 | 13 | import org.slf4j.Logger; 14 | import org.slf4j.LoggerFactory; 15 | import ruc.irm.similarity.Similaritable; 16 | import ruc.irm.similarity.word.hownet.HownetMeta; 17 | 18 | import com.google.common.collect.HashMultimap; 19 | import 
com.google.common.collect.Multimap; 20 | 21 | /** 22 | * 义原解析器基类,所有义原存储在xml文件中(当前package中的sememe.xml.tar.gz文件)。
23 | * 算法的核心思想请参看论文《汉语词语语义相似度计算研究》或《中文信息相似度计算理论与方法》一书第三章
24 | * 25 | * 为提高运算速度,义原的加载方式做了调整,只把义原的汉语定义和对应的Id加入到MultiMap对象中,并通过义原的层次化Id计算义原之间的相似度。
26 | * 27 | * @author 夏天 28 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 29 | * 30 | * @see {@link ruc.irm.similarity.Similaritable} 31 | */ 32 | public abstract class BaseSememeParser implements HownetMeta, Similaritable { 33 | protected Logger LOG = LoggerFactory.getLogger(this.getClass()); 34 | 35 | /** 所有的义原都存放到一个MultiMap, Key为Sememe的中文定义, Value为义原的Id */ 36 | protected static Multimap SEMEMES = null; 37 | 38 | public BaseSememeParser() throws IOException { 39 | if (SEMEMES != null) { 40 | return; 41 | } 42 | 43 | SEMEMES = HashMultimap.create(); 44 | 45 | InputStream input = this.getClass().getResourceAsStream("/data/sememe.xml.gz"); 46 | input = new GZIPInputStream(input); 47 | load(input); 48 | } 49 | 50 | /** 51 | * 从文件中加载义元知识 52 | * 53 | * @throws IOException 54 | */ 55 | public void load(InputStream input) throws IOException { 56 | System.out.print("loading sememes..."); 57 | long time = System.currentTimeMillis(); 58 | try { 59 | XMLInputFactory inputFactory = XMLInputFactory.newInstance(); 60 | XMLEventReader xmlEventReader = inputFactory.createXMLEventReader(input); 61 | 62 | int count = 0; 63 | while (xmlEventReader.hasNext()) { 64 | XMLEvent event = xmlEventReader.nextEvent(); 65 | 66 | if (event.isStartElement()) { 67 | StartElement startElement = event.asStartElement(); 68 | if (startElement.getName().toString().equals("sememe")) { 69 | String cnWord = startElement.getAttributeByName(QName.valueOf("cn")).getValue(); 70 | String id = startElement.getAttributeByName(QName.valueOf("id")).getValue(); 71 | SEMEMES.put(cnWord, id); 72 | count++; 73 | if (count % 100 == 0) { 74 | System.out.print("."); 75 | } 76 | } 77 | } 78 | } 79 | input.close(); 80 | } catch (Exception e) { 81 | throw new IOException(e); 82 | } 83 | time = System.currentTimeMillis() - time; 84 | System.out.println("\ncomplete!. 
time elapsed: " + (time / 1000) + "s"); 85 | } 86 | 87 | /** 88 | * 计算两个义原之间的关联度 89 | * 90 | * @param sememeName1 91 | * @param sememeName2 92 | * @return 93 | */ 94 | public double getAssociation(String sememeName1, String sememeName2) { 95 | return 0.0; 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/word/hownet2/sememe/LiuqunSememeParser.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.word.hownet2.sememe; 2 | 3 | import java.io.IOException; 4 | import java.util.Collection; 5 | 6 | /** 7 | * 刘群老师计算义原相似度的方法, 实现了SememeParser中定义的抽象方法 8 | * 9 | * @author 夏天 10 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 11 | * 12 | * @author xiatian 13 | * @version 1.0 14 | */ 15 | public class LiuqunSememeParser extends BaseSememeParser { 16 | 17 | /** 计算义元相似度的可调节的参数,默认为1.6 */ 18 | private final float alpha = 1.6f; 19 | 20 | public LiuqunSememeParser() throws IOException { 21 | super(); 22 | } 23 | 24 | /** 25 | * 计算两个义元之间的相似度,由于义元可能相同,计算结果为其中相似度最大者 26 | *
similarity = alpha/(distance+alpha) 27 | * 28 | * @param key1 29 | * @param key2 30 | * @return 31 | */ 32 | @Override 33 | public double getSimilarity(String item1, String item2) { 34 | int pos; 35 | 36 | // 如果为空串,直接返回0 37 | if (item1 == null || item2 == null || item1.equals("") 38 | || item2.equals("")) 39 | return 0.0; 40 | 41 | String key1 = item1.trim(); 42 | String key2 = item2.trim(); 43 | 44 | // 去掉()符号 45 | if ((key1.charAt(0) == '(') && (key1.charAt(key1.length() - 1) == ')')) { 46 | if (key2.charAt(0) == '(' && key2.charAt(key2.length() - 1) == ')') { 47 | key1 = key1.substring(1, key1.length() - 1); 48 | key2 = key2.substring(1, key2.length() - 1); 49 | } else { 50 | return 0.0; 51 | } 52 | } 53 | 54 | // 处理关系义元,即x=y的情况 55 | if ((pos = key1.indexOf('=')) > 0) { 56 | int pos2 = key2.indexOf('='); 57 | // 如果是关系义元,则判断前面部分是否相同,如果相同,则转为计算后面部分的相似度,否则为0 58 | if ((pos == pos2) 59 | && key1.substring(0, pos).equals(key2.substring(0, pos2))) { 60 | key1 = key1.substring(pos + 1); 61 | key2 = key2.substring(pos2 + 1); 62 | } else { 63 | return 0.0; 64 | } 65 | } 66 | 67 | // 处理符号义元,即前面有特殊符号的义元 68 | String symbol1 = key1.substring(0, 1); 69 | String symbol2 = key2.substring(0, 1); 70 | 71 | for (int i = 0; i < Symbol_Descriptions.length; i++) { 72 | if (symbol1.equals(Symbol_Descriptions[i][0])) { 73 | if (symbol1.equals(symbol2)) { 74 | key1 = item1.substring(1); 75 | key2 = item2.substring(1); 76 | break; 77 | } else { 78 | return 0.0; // 如果不是同一关系符号,则相似度直接返回0 79 | } 80 | } 81 | } 82 | 83 | if ((pos = key1.indexOf("|")) >= 0) { 84 | key1 = key1.substring(pos + 1); 85 | } 86 | if ((pos = key2.indexOf("|")) >= 0) { 87 | key2 = key2.substring(pos + 1); 88 | } 89 | 90 | int distance = getMinDistance(key1, key2); 91 | return alpha / (distance + alpha); 92 | } 93 | 94 | /** 95 | * 根据汉语定义计算义原之间的距离,Integer.MAX_VALUE代表两个义元之间的距离为无穷大,由于可能多个义元有相同的汉语词语, 96 | * 故计算结果为其中距离最小者 97 | * 98 | * @param key1 99 | * @param key2 100 | * @return 101 | */ 102 | public int 
getMinDistance(String sememe1, String sememe2) { 103 | int distance = Integer.MAX_VALUE; 104 | 105 | // 如果两个字符串相等,直接返回距离为0 106 | if (sememe1.equals(sememe2)) { 107 | return 0; 108 | } 109 | 110 | Collection sememeIds1 = SEMEMES.get(sememe1); 111 | Collection sememeIds2 = SEMEMES.get(sememe2); 112 | 113 | // 如果sememe1或者sememe2不是义元,则返回无穷大 114 | if (sememeIds1.size() == 0 || sememeIds1.size() == 0) { 115 | return Integer.MAX_VALUE; 116 | } 117 | 118 | for(String id1:sememeIds1){ 119 | for(String id2:sememeIds2){ 120 | int d = getDistance(id1, id2); 121 | if(d 6 | * 义原编号采用父节点Id-子节点Id编码方式,如: 7 | * <sememe cn="成功" define="{experiencer,scope}" en="succeed" id="1-1-2-1-4-5"/> 8 | * 义原的id表明了义原之间的上下位关系和义原的深度。 9 | * 10 | * @author 夏天 11 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 12 | */ 13 | public class Sememe { 14 | /** 15 | * 义原编号,采用父节点Id-子节点Id编码方式,如<sememe cn="成功" define="{experiencer,scope}" en="succeed" id="1-1-2-1-4-5"/> 16 | * id表明了义原之间的上下位关系 17 | */ 18 | private String id; 19 | /** 义原的中文名称*/ 20 | private String cnWord; 21 | /** 义原的英文名称 */ 22 | private String enWord; 23 | /** 义原的定义,如果没有(例如数量),则为空串 */ 24 | private String define; 25 | 26 | /** 27 | * 每一行的形式为:be|是 {relevant,isa}/{relevant,descriptive} 28 | *
或者 official|官 [#organization|组织,#employee|员] 29 | *
或者 amount|多少 30 | *
把相应的部分赋予不同的属性 31 | * 出于性能考虑,把未用到的英文名称、定义等忽略 32 | * @param id 33 | */ 34 | public Sememe(String id, String en, String cn, String define) { 35 | this.id = id; 36 | this.cnWord = cn; 37 | //为提高效率,减少内存空间利用,可去掉以下两行 38 | this.enWord = en; 39 | this.define = define; 40 | } 41 | 42 | public String getId() { 43 | return id; 44 | } 45 | 46 | public void setId(String id) { 47 | this.id = id; 48 | } 49 | 50 | public String getCnWord() { 51 | return cnWord; 52 | } 53 | 54 | public void setCnWord(String cnWord) { 55 | this.cnWord = cnWord; 56 | } 57 | 58 | public String getEnWord() { 59 | return enWord; 60 | } 61 | 62 | public void setEnWord(String enWord) { 63 | this.enWord = enWord; 64 | } 65 | 66 | public String getDefine() { 67 | return define; 68 | } 69 | 70 | public void setDefine(String define) { 71 | this.define = define; 72 | } 73 | 74 | public int getType() { 75 | char ch = id.charAt(0); 76 | switch (ch) { 77 | case '1': 78 | return SememeType.Event; 79 | case '2': 80 | return SememeType.Entity; 81 | case '3': 82 | return SememeType.Attribute; 83 | case '4': 84 | return SememeType.Quantity; 85 | case '5': 86 | return SememeType.AValue; 87 | case '6': 88 | return SememeType.QValue; 89 | case '7': 90 | return SememeType.SecondaryFeature; 91 | case '8': 92 | return SememeType.Syntax; 93 | case '9': 94 | return SememeType.EventRoleAndFeature; 95 | default: 96 | return 0; 97 | } 98 | } 99 | 100 | @Override 101 | public String toString(){ 102 | StringBuilder sb = new StringBuilder(); 103 | sb.append("id="); 104 | sb.append(id); 105 | sb.append("; cnWord="); 106 | sb.append(cnWord); 107 | sb.append("; enWord="); 108 | sb.append(enWord); 109 | sb.append("; define="); 110 | sb.append(define); 111 | return sb.toString(); 112 | } 113 | 114 | } 115 | 116 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/word/hownet2/sememe/SememeType.java: 
-------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.word.hownet2.sememe; 2 | 3 | /** 4 | * 义原的类型定义
 * <ul>
 * <li>1:Event|事件</li>
 * <li>2:Entity|实体</li>
 * <li>3:Attribute|属性</li>
 * <li>4:Quantity|数量</li>
 * <li>5:aValue|属性值</li>
 * <li>6:qValue|数量值</li>
 * <li>7: Secondary Feature|第二特征</li>
 * <li>8: Syntax|语法</li>
 * <li>9: EventRole|动态角色</li>
 * <li>10:EventFeatures|动态属性</li>
 * <li>0:未知</li>
 * </ul>
18 | * 19 | * 其中1~7为基本义元,8为语法义元,9、10为关系义元
20 | * 21 | * @author 夏天 22 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 23 | */ 24 | public interface SememeType { 25 | /** Event|事件类型定义 */ 26 | public static final int Event = 1; 27 | 28 | /** Entity|实体类型定义*/ 29 | public static final int Entity = 2; 30 | 31 | /** Attribute|属性类型定义*/ 32 | public static final int Attribute = 3; 33 | 34 | /** Quantity|数量类型定义*/ 35 | public static final int Quantity = 4; 36 | 37 | /** aValue|属性值类型定义*/ 38 | public static final int AValue = 5; 39 | 40 | /** qValue|数量值类型定义*/ 41 | public static final int QValue = 6; 42 | 43 | /** Secondary Feature|第二特征类型定义*/ 44 | public static final int SecondaryFeature = 7; 45 | 46 | /** Syntax|语法类型定义*/ 47 | public static final int Syntax = 8; 48 | 49 | /** EventRole|动态角色类型定义*/ 50 | public static final int EventRoleAndFeature = 9; 51 | 52 | /** 未知类型定义*/ 53 | public static final int Unknown = 0; 54 | 55 | } 56 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/word/hownet2/sememe/XiaSememeParser.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.word.hownet2.sememe; 2 | 3 | import java.io.IOException; 4 | import java.util.Collection; 5 | 6 | import ruc.irm.similarity.util.BlankUtils; 7 | 8 | 9 | /** 10 | * 义原相似度计算, 实现了SememeParser中定义的抽象方法 11 | * 12 | * @author 夏天 13 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 14 | */ 15 | public class XiaSememeParser extends BaseSememeParser { 16 | 17 | public XiaSememeParser() throws IOException{ 18 | super(); 19 | } 20 | 21 | /** 22 | * 计算两个义原的相似度 23 | */ 24 | double getSimilarityBySememeId(final String id1, final String id2) { 25 | 26 | int position = 0; 27 | String[] array1 = id1.split("-"); 28 | String[] array2 = id2.split("-"); 29 | for (position = 0; position < array1.length && position < array2.length; position++) { 30 | if (!array1[position].equals(array2[position])) { 31 | break; 32 | } 33 | } 34 | 35 | return 2.0*position/(array1.length + 
array2.length); 36 | } 37 | 38 | /** 39 | * 根据汉语定义计算义原之间的相似度,由于可能多个义元有相同的汉语词语,故计算结果为其中相似度最大者 40 | * 41 | * @param key1 42 | * @param key2 43 | * @return 44 | */ 45 | public double getMaxSimilarity(String sememeName1, String sememeName2) { 46 | double maxValue = 0.0; 47 | 48 | // 如果两个字符串相等,直接返回距离为0 49 | if (sememeName1.equals(sememeName2)) { 50 | return 1.0; 51 | } 52 | 53 | Collection sememeIds1 = SEMEMES.get(sememeName1); 54 | Collection sememeIds2 = SEMEMES.get(sememeName2); 55 | 56 | // 如果sememe1或者sememe2不是义元,则返回0 57 | if (sememeIds1.size() == 0 || sememeIds1.size() == 0) { 58 | return 0.0; 59 | } 60 | 61 | for(String id1:sememeIds1){ 62 | for(String id2:sememeIds2){ 63 | double value = getSimilarityBySememeId(id1, id2); 64 | if(value > maxValue){ 65 | maxValue = value; 66 | } 67 | } 68 | } 69 | 70 | return maxValue; 71 | } 72 | 73 | /** 74 | * 计算两个义元之间的相似度,由于义元可能相同,计算结果为其中相似度最大者 similarity = alpha/(distance+alpha), 75 | * 如果两个字符串相同或都为空,直接返回1.0 76 | * 77 | * @param key1 第一个义原字符串 78 | * @param key2 第二个义原字符串 79 | * @return 80 | */ 81 | @Override 82 | public double getSimilarity(String item1, String item2) { 83 | if(BlankUtils.isBlankAll(item2, item2)){ 84 | return 1.0; 85 | } else if(BlankUtils.isBlankAtLeastOne(item1, item2)){ 86 | return 0.0; 87 | } else if(item1.equals(item2)){ 88 | return 1.0; 89 | } 90 | 91 | String key1 = item1.trim(); 92 | String key2 = item2.trim(); 93 | 94 | // 去掉()符号 95 | if ((key1.charAt(0) == '(') && (key1.charAt(key1.length() - 1) == ')')) { 96 | 97 | if (key2.charAt(0) == '(' && key2.charAt(key2.length() - 1) == ')') { 98 | key1 = key1.substring(1, key1.length() - 1); 99 | key2 = key2.substring(1, key2.length() - 1); 100 | } else { 101 | return 0.0; 102 | } 103 | 104 | } 105 | 106 | // 处理关系义元,即x=y的情况 107 | int pos = key1.indexOf('='); 108 | if (pos > 0) { 109 | int pos2 = key2.indexOf('='); 110 | // 如果是关系义元,则判断前面部分是否相同,如果相同,则转为计算后面部分的相似度,否则为0 111 | if ((pos == pos2) 112 | && key1.substring(0, pos).equals(key2.substring(0, pos2))) { 
113 | key1 = key1.substring(pos + 1); 114 | key2 = key2.substring(pos2 + 1); 115 | } else { 116 | return 0.0; 117 | } 118 | } 119 | 120 | // 处理符号义元,即前面有特殊符号的义元 121 | String symbol1 = key1.substring(0, 1); 122 | String symbol2 = key2.substring(0, 1); 123 | 124 | for (int i = 0; i < Symbol_Descriptions.length; i++) { 125 | if (symbol1.equals(Symbol_Descriptions[i][0])) { 126 | if (symbol1.equals(symbol2)) { 127 | key1 = item1.substring(1); 128 | key2 = item2.substring(1); 129 | break; 130 | } else { 131 | return 0.0; // 如果不是同一关系符号,则相似度直接返回0 132 | } 133 | } 134 | } 135 | 136 | if ((pos = key1.indexOf("|")) >= 0) { 137 | key1 = key1.substring(pos + 1); 138 | } 139 | if ((pos = key2.indexOf("|")) >= 0) { 140 | key2 = key2.substring(pos + 1); 141 | } 142 | 143 | // 如果两个字符串相等,直接返回距离为0 144 | if (key1.equals(key2)) { 145 | return 1.0; 146 | } 147 | 148 | return getMaxSimilarity(key1, key2); 149 | } 150 | 151 | 152 | } -------------------------------------------------------------------------------- /src/main/java/ruc/irm/similarity/word/pinyin/PinyinSimilarity.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.similarity.word.pinyin; 2 | 3 | import java.util.Set; 4 | 5 | import ruc.irm.similarity.Similaritable; 6 | import ruc.irm.similarity.util.EditDistance; 7 | import ruc.irm.similarity.util.PinyinUtils; 8 | 9 | 10 | /** 11 | * 通过拼音计算两个词语是否相似,拼音的相似程度采用编辑距离算法,并进行归一化衡量 12 | * 13 | * @author 夏天 14 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 15 | */ 16 | public class PinyinSimilarity implements Similaritable { 17 | 18 | public double getSimilarity(String item1, String item2) { 19 | Set pinyinSet1 = PinyinUtils.getInstance().getPinyin(item1); 20 | Set pinyinSet2 = PinyinUtils.getInstance().getPinyin(item2); 21 | 22 | double max = 0.0; 23 | for(String pinyin1:pinyinSet1){ 24 | for(String pinyin2:pinyinSet2){ 25 | double distance = new EditDistance().getEditDistance(pinyin1, pinyin2); 26 | double similarity = 1 - distance/( 
(pinyin1.length()>pinyin2.length())?pinyin1.length():pinyin2.length()); 27 | max = (max>similarity)?max:similarity; 28 | if(max==1.0){ 29 | return max; 30 | } 31 | } 32 | } 33 | return max; 34 | } 35 | 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/tendency/word/HownetWordTendency.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.tendency.word; 2 | 3 | import java.io.IOException; 4 | import java.util.Collection; 5 | import java.util.HashSet; 6 | import java.util.Set; 7 | 8 | import ruc.irm.similarity.word.hownet2.concept.BaseConceptParser; 9 | import ruc.irm.similarity.word.hownet2.concept.Concept; 10 | import ruc.irm.similarity.word.hownet2.concept.XiaConceptParser; 11 | import ruc.irm.similarity.word.hownet2.sememe.BaseSememeParser; 12 | import ruc.irm.similarity.word.hownet2.sememe.XiaSememeParser; 13 | 14 | /** 15 | * 基于知网实现的词语倾向性判别 16 | * 17 | * @author 夏天 18 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 19 | */ 20 | public class HownetWordTendency implements WordTendency { 21 | public static String[] POSITIVE_SEMEMES = new String[]{ 22 | "良", 23 | "喜悦", 24 | "夸奖", 25 | "满意", 26 | "期望", 27 | "注意", 28 | "致敬", 29 | "喜欢", 30 | "专", 31 | "敬佩", 32 | "同意", 33 | "爱惜", 34 | "愿意", 35 | "思念", 36 | "拥护", 37 | "祝贺", 38 | "福", 39 | "需求", 40 | "奖励", 41 | "致谢", 42 | "欢迎", 43 | "羡慕", 44 | "感激", 45 | "爱恋" 46 | }; 47 | 48 | public static String[] NEGATIVE_SEMEMES = new String[]{ 49 | "莠", 50 | "谴责", 51 | "害怕", 52 | "生气", 53 | "悲哀", 54 | "着急", 55 | "轻视", 56 | "羞愧", 57 | "烦恼", 58 | "灰心", 59 | "犹豫", 60 | "为难", 61 | "懊悔", 62 | "厌恶", 63 | "怀疑", 64 | "怜悯", 65 | "忧愁", 66 | "示怒", 67 | "不满", 68 | "仇恨", 69 | "埋怨", 70 | "失望", 71 | "坏" 72 | }; 73 | private BaseConceptParser conceptParser = null; 74 | private BaseSememeParser sememeParser = null; 75 | 76 | public HownetWordTendency(){ 77 | this.conceptParser =XiaConceptParser.getInstance(); 78 | try { 79 | 
this.sememeParser = new XiaSememeParser(); 80 | } catch (IOException e) { 81 | e.printStackTrace(); 82 | } 83 | } 84 | 85 | @Override 86 | public double getTendency(String word) { 87 | double positive = getSentiment(word, POSITIVE_SEMEMES); 88 | double negative = getSentiment(word, NEGATIVE_SEMEMES);; 89 | return positive - negative; 90 | } 91 | 92 | public double getSentiment(String word, String[] candidateSememes) { 93 | Collection concepts = conceptParser.getConcepts(word); 94 | Set sememes = new HashSet(); 95 | for (Concept c : concepts) { 96 | sememes.addAll(c.getAllSememeNames()); 97 | } 98 | 99 | double max = 0.0; 100 | for(String item:sememes){ 101 | double total = 0.0; 102 | for(String positiveSememe:candidateSememes){ 103 | //如果有特别接近的义原,直接返回该相似值,避免其他干扰 104 | double value = sememeParser.getSimilarity(item, positiveSememe); 105 | if(value>0.9){ 106 | return value; 107 | } 108 | total += value; 109 | } 110 | double sim = total / candidateSememes.length; 111 | if(sim>max){ 112 | max = sim; 113 | } 114 | } 115 | return max; 116 | } 117 | 118 | } 119 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/tendency/word/Training.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.tendency.word; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.util.ArrayList; 9 | import java.util.Collection; 10 | import java.util.Collections; 11 | import java.util.HashMap; 12 | import java.util.List; 13 | import java.util.Map; 14 | 15 | import ruc.irm.similarity.util.BlankUtils; 16 | import ruc.irm.similarity.word.hownet2.concept.Concept; 17 | import ruc.irm.similarity.word.hownet2.concept.XiaConceptParser; 18 | import ruc.irm.similarity.word.hownet2.sememe.XiaSememeParser; 19 | 20 | import com.google.common.collect.HashMultimap; 21 | import 
com.google.common.collect.Multimap; 22 | 23 | /** 24 | * 临时训练及测试类 25 | * 26 | * @author 夏天 27 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 28 | */ 29 | public class Training { 30 | 31 | void test(boolean testPositive) throws IOException{ 32 | WordTendency tendency = new HownetWordTendency(); 33 | File f = new File("./dict/sentiment/负面情感词语(中文).txt"); 34 | if(testPositive){ 35 | //f = new File("./dict/sentiment/正面情感词语(中文).txt"); 36 | f = new File("./dict/sentiment/正面评价词语(中文).txt"); 37 | } 38 | String encoding = "utf-8"; 39 | BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(f), encoding)); 40 | String line; 41 | int wordCount = 0; 42 | int correctCount = 0; 43 | while ((line = in.readLine()) != null) { 44 | if(line.length()>5) continue; 45 | wordCount++; 46 | 47 | double value =tendency.getTendency(line.trim()); 48 | if(value>0 && testPositive){ 49 | correctCount++; 50 | }else if(value<0 && !testPositive){ 51 | correctCount++; 52 | }else{ 53 | System.out.println("error:" + line + "\t value:" + value); 54 | } 55 | } 56 | System.out.println("correct:" + correctCount); 57 | System.out.println("total:" + wordCount); 58 | System.out.println("ratio:" + correctCount*1.0/wordCount); 59 | } 60 | 61 | /** 62 | * 该方法用于统计知网提供的情感词集合所涉及的义原以及出现频度 63 | * @throws IOException 64 | */ 65 | /** 66 | * @throws IOException 67 | */ 68 | void countSentimentDistribution() throws IOException{ 69 | Map sememeMap = new HashMap(); 70 | File f = new File("./dict/sentiment/负面情感词语(中文).txt"); 71 | String encoding = "utf-8"; 72 | boolean autoCombineConcept = false; 73 | BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(f), encoding)); 74 | 75 | XiaConceptParser parser = new XiaConceptParser(new XiaSememeParser()); 76 | 77 | String line = null; 78 | 79 | int conceptCount = 0; 80 | int wordCount = 0; 81 | while ((line = in.readLine()) != null) { 82 | if(line.length()>5) continue; 83 | wordCount++; 84 | String word = line.trim(); 85 | Collection 
concepts = parser.getInnerConcepts(word); 86 | //由于目前的词典为知网2000版本,所以默认情况下仅对词典中出现的概念进行统计 87 | if(BlankUtils.isBlank(concepts) && autoCombineConcept ){ 88 | concepts = parser.autoCombineConcepts(word, null); 89 | } 90 | for(Concept c: concepts){ 91 | conceptCount++; 92 | List names = new ArrayList(); 93 | 94 | //加入主义原 95 | names.add(c.getMainSememe()); 96 | 97 | //加入关系义原 98 | for(String item:c.getRelationSememes()){ 99 | names.add(item.substring(item.indexOf("=") + 1)); 100 | } 101 | 102 | //加入符号义原 103 | for(String item:c.getSymbolSememes()){ 104 | names.add(item.substring(1)); 105 | } 106 | 107 | //加入其他义原集合 108 | for(String item:c.getSecondSememes()){ 109 | names.add(item); 110 | } 111 | 112 | for(String item:names){ 113 | Integer count = sememeMap.get(item); 114 | if(count==null){ 115 | sememeMap.put(item, 1); 116 | }else{ 117 | sememeMap.put(item, count+1); 118 | } 119 | } 120 | } 121 | } 122 | in.close(); 123 | 124 | //以下是为了按照义原出现的数量进行排序的代码 125 | Multimap map2 = HashMultimap.create(); 126 | for(String key:sememeMap.keySet()){ 127 | map2.put(sememeMap.get(key), key); 128 | } 129 | List keys = new ArrayList(); 130 | for(Integer key: map2.keySet()){ 131 | keys.add(key); 132 | } 133 | Collections.sort(keys); 134 | 135 | int smallSememeCount = 0; //较少出现的不同义原数量 136 | int smallAppearTotal = 0; //较少出现的义原在概念众出现的次数总和 137 | for(int index=(keys.size()-1); index>=0; index--){ 138 | Integer key = keys.get(index); 139 | Collection values = map2.get(key); 140 | double ratio = (key*100.0/conceptCount); 141 | System.out.print(key + "(" + ratio + "%): "); 142 | for(String v:values){ 143 | System.out.print(v+ "\t"); 144 | } 145 | System.out.println(); 146 | if(ratio<0.7){ 147 | smallSememeCount += values.size(); 148 | smallAppearTotal += key*values.size(); 149 | } 150 | } 151 | 152 | System.out.println("small info: "); 153 | System.out.println("\tdifferent sememes:" + smallSememeCount); 154 | System.out.println("\tappear count:" + smallAppearTotal); 155 | 
System.out.println("\tratio:" + smallAppearTotal*100.0/conceptCount); 156 | System.out.println("wordCount:" + wordCount); 157 | System.out.println("conceptCount:" + conceptCount); 158 | } 159 | 160 | public static void main(String[] args) throws IOException { 161 | Training training = new Training(); 162 | training.countSentimentDistribution(); 163 | // System.out.println("test positive:"); 164 | // training.test(true); 165 | // 166 | // System.out.println("test negative:"); 167 | //training.test(false); 168 | } 169 | } 170 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/tendency/word/WordTendency.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.tendency.word; 2 | 3 | /** 4 | * 计算词语的语义倾向性,词语的语义倾向性为一个介于[-1, 1]之间的实数,数值越大,褒义性越强,否则,贬义性越强 5 | * 6 | * @author 夏天 7 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 8 | */ 9 | public interface WordTendency { 10 | /** 11 | * 获取词语的语义倾向性,词语的语义倾向性为一个介于[-1, 1]之间的实数,数值越大,褒义性越强,否则,贬义性越强 12 | * @param word 13 | * @return 14 | */ 15 | public double getTendency(String word); 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/ruc/irm/ui/PhraseSimilarityUI.java: -------------------------------------------------------------------------------- 1 | package ruc.irm.ui; 2 | 3 | import java.awt.BorderLayout; 4 | import java.awt.GridLayout; 5 | import java.awt.event.ActionEvent; 6 | import java.awt.event.ActionListener; 7 | 8 | import javax.swing.BorderFactory; 9 | import javax.swing.JButton; 10 | import javax.swing.JLabel; 11 | import javax.swing.JPanel; 12 | import javax.swing.JScrollPane; 13 | import javax.swing.JTextArea; 14 | import javax.swing.JTextField; 15 | 16 | import ruc.irm.similarity.phrase.PhraseSimilarity; 17 | 18 | /** 19 | * 短语相似度的调用演示界面 20 | * @author 夏天 21 | * @organization 中国人民大学信息资源管理学院 知识工程实验室 22 | */ 23 | public class PhraseSimilarityUI { 24 | 25 | /** 26 | * 
短语相似度的演示面板 27 | * 28 | * @return 29 | */ 30 | public static JPanel createPanel() { 31 | // 声明总的大面板, fullPanel包括一个NorthPanel和一个centerPanel 32 | JPanel fullPanel = new JPanel(); 33 | fullPanel.setLayout(new BorderLayout()); 34 | 35 | JPanel northPanel = new JPanel(); 36 | fullPanel.add(northPanel, "North"); 37 | 38 | // centerPanel包括了一个文本框 39 | JPanel centerPanel = new JPanel(); 40 | fullPanel.add(centerPanel, "Center"); 41 | 42 | centerPanel.setLayout(new BorderLayout()); 43 | final JTextArea result = new JTextArea(); 44 | // result.setFont(new Font("宋体", Font.PLAIN, 16)); 45 | result.setLineWrap(true); 46 | JScrollPane centerScrollPane = new JScrollPane(result); 47 | centerPanel.add(centerScrollPane, "Center"); 48 | 49 | northPanel.setLayout(new GridLayout(1, 1)); 50 | // northPanel.add(createWordPanel()); 51 | // northPanel.add(createCilinPanel()); 52 | 53 | // 以下加入northPanel中的第一个面板 54 | final JTextField field1 = new JTextField(""); 55 | final JTextField field2 = new JTextField(""); 56 | field1.setColumns(50); 57 | field2.setColumns(50); 58 | 59 | JPanel mainPanel = new JPanel(); 60 | mainPanel.setLayout(new GridLayout(3, 1)); 61 | 62 | JPanel linePanel = new JPanel(); 63 | linePanel.add(new JLabel("短语1:")); 64 | linePanel.add(field1); 65 | mainPanel.add(linePanel); 66 | 67 | linePanel = new JPanel(); 68 | linePanel.add(new JLabel("短语2:")); 69 | linePanel.add(field2); 70 | mainPanel.add(linePanel); 71 | 72 | linePanel = new JPanel(); 73 | JButton goButton = new JButton("计算相似度"); 74 | linePanel.add(goButton); 75 | mainPanel.add(linePanel); 76 | goButton.addActionListener(new ActionListener() { 77 | 78 | @Override 79 | public void actionPerformed(ActionEvent e) { 80 | String phrase1 = field1.getText(); 81 | String phrase2 = field2.getText(); 82 | String text = "[" + phrase1 + "]与[" + phrase2 + "]的相似度为:"; 83 | text = text + new PhraseSimilarity().getSimilarity(phrase1, phrase2); 84 | // text = text + "\n\n" + result.getText(); 85 | result.setText(text); 86 | } 87 | 
88 | }); 89 | mainPanel.setBorder(BorderFactory.createEtchedBorder()); 90 | northPanel.add(mainPanel); 91 | 92 | return fullPanel; 93 | } 94 | } 95 |
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/ui/Start.java:
--------------------------------------------------------------------------------
package ruc.irm.ui;

import java.awt.Container;
import java.awt.Font;
import java.util.Enumeration;

import javax.swing.JFrame;
import javax.swing.JMenu;
import javax.swing.JMenuBar;
import javax.swing.JMenuItem;
import javax.swing.JScrollPane;
import javax.swing.JTabbedPane;
import javax.swing.SwingUtilities;
import javax.swing.UIManager;
import javax.swing.plaf.FontUIResource;

import ruc.irm.similarity.sentence.SegmentProxy;
import ruc.irm.similarity.util.About;

/**
 * Entry point of the similarity toolkit demo: a frame holding one tab per
 * demo panel (word, phrase, sentence, lexical analysis, sememe tree,
 * sentiment tendency, about).
 *
 * @author 夏天
 * @organization 中国人民大学信息资源管理学院 知识工程实验室
 */
public class Start extends JFrame {

    private static final long serialVersionUID = 85744461208L;

    /**
     * Builds the menu bar and the tabbed pane, then packs and maximizes the
     * frame. The frame is not made visible here; see {@link #main(String[])}.
     */
    public Start() {
        this.setTitle("相似度计算演示程序");
        this.setDefaultCloseOperation(EXIT_ON_CLOSE);

        // //////////////////////////////////
        // add menu
        JMenuBar menuBar = new JMenuBar();
        this.setJMenuBar(menuBar);

        JMenu fileMenu = new JMenu("File");
        menuBar.add(fileMenu);
        JMenuItem exitItem = new JMenuItem("Exit");
        // The item used to be added without any listener, so clicking it did
        // nothing. Exiting the JVM mirrors the frame's EXIT_ON_CLOSE policy.
        exitItem.addActionListener(new java.awt.event.ActionListener() {
            @Override
            public void actionPerformed(java.awt.event.ActionEvent e) {
                System.exit(0);
            }
        });
        fileMenu.add(exitItem);

        JMenu helpMenu = new JMenu("Help");
        menuBar.add(helpMenu);
        helpMenu.add(new JMenuItem("Help"));

        Container contentPane = this.getContentPane();
        JTabbedPane tabbedPane = new JTabbedPane();
        tabbedPane.add("词语", WordSimlarityUI.createPanel());
        tabbedPane.add("短语", PhraseSimilarityUI.createPanel());
        tabbedPane.add("句子", SentenceSimilarityUI.createPanel());
        // tabbedPane.add("文本", WordSimlarityUI.createPanel());
        tabbedPane.add("词法分析", SegmentProxy.createPanel());
        tabbedPane.add("义原树", SememeTreeUI.createPanel());
        tabbedPane.add("情感分析", TendencyUI.createPanel());
        tabbedPane.add("关于", About.createPanel());
        JScrollPane scrollPane = new JScrollPane(tabbedPane);
        contentPane.add(scrollPane);

        // pack() replaces any size set earlier, so the former setSize(420, 700)
        // call had no effect and was dropped. Centering must happen after
        // pack() so it is computed from the final (packed) size.
        this.pack();
        this.setLocationRelativeTo(null);
        setExtendedState(MAXIMIZED_BOTH);
    }

    /**
     * Replaces every font entry in the UIManager defaults with the given font.
     *
     * @param font the font to install for all UI defaults whose current value
     *             is a {@link FontUIResource}
     */
    public static void InitGlobalFont(Font font) {
        FontUIResource fontRes = new FontUIResource(font);
        for (Enumeration<Object> keys = UIManager.getDefaults().keys(); keys.hasMoreElements();) {
            Object key = keys.nextElement();
            Object value = UIManager.get(key);
            if (value instanceof FontUIResource) {
                UIManager.put(key, fontRes);
            }
        }
    }

    /**
     * Installs the global font and shows the demo frame on the event
     * dispatch thread.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        //JFrame.setDefaultLookAndFeelDecorated(true);
        // Works around garbled text rendering on Ubuntu.
        // Font.PLAIN is the style constant; the previous Font.TRUETYPE_FONT is
        // a font-format constant that only happened to share the value 0.
        InitGlobalFont(new Font("Microsoft YaHei", Font.PLAIN, 12));
        SwingUtilities.invokeLater(new Runnable() {

            public void run() {
                Start w = new Start();
                w.setVisible(true);
            }
        });
    }

}
--------------------------------------------------------------------------------
/src/main/java/ruc/irm/ui/TendencyUI.java:
--------------------------------------------------------------------------------
package ruc.irm.ui;

import java.awt.BorderLayout;
import java.awt.GridLayout;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;

import javax.swing.BorderFactory;
import javax.swing.JButton;
import javax.swing.JFrame;
import javax.swing.JLabel;
import javax.swing.JPanel;
import javax.swing.JScrollPane;
import javax.swing.JTextArea;
import javax.swing.JTextField;

import ruc.irm.tendency.word.HownetWordTendency;

/**
 * Demo UI for querying the sentiment tendency of a single word.
 *
 * @author 夏天
 * @organization 中国人民大学信息资源管理学院 知识工程实验室
 */
public class TendencyUI extends JFrame {
    private static final long serialVersionUID = -3976827963973640651L;

    /**
     * Creates the demo panel: an input row and a button in the north area,
     * and a scrollable result text area in the center.
     *
     * @return the fully assembled panel
     */
    public static JPanel createPanel(){
        // Outer panel: holds a north panel (inputs) and a center panel (results).
        JPanel fullPanel = new JPanel();
        fullPanel.setLayout(new BorderLayout());

        JPanel northPanel = new JPanel();
        fullPanel.add(northPanel, "North");

        // The center panel contains the result text area.
        JPanel centerPanel = new JPanel();
        fullPanel.add(centerPanel, "Center");
        centerPanel.setLayout(new BorderLayout());
        final JTextArea result = new JTextArea();
        //result.setFont(new Font("宋体", Font.PLAIN, 16));
        result.setLineWrap(true);
        JScrollPane centerScrollPane = new JScrollPane(result);
        centerPanel.add(centerScrollPane, "Center");

        northPanel.setLayout(new GridLayout(1, 1));

        // The single panel placed inside northPanel.
        final JTextField wordField = new JTextField("恶心");
        wordField.setColumns(40);

        JPanel mainPanel = new JPanel();
        mainPanel.setLayout(new GridLayout(2, 1));

        JPanel linePanel = new JPanel();
        linePanel.add(new JLabel("输入词语:"));
        linePanel.add(wordField);
        mainPanel.add(linePanel);

        linePanel = new JPanel();
        JButton goButton = new JButton("计算词语倾向");
        linePanel.add(goButton);
        mainPanel.add(linePanel);
        goButton.addActionListener(new ActionListener(){
            // Created once per panel and reused for every click.
            HownetWordTendency tendency = new HownetWordTendency();

            @Override
            public void actionPerformed(ActionEvent e) {
                String word = wordField.getText();
                double positive = tendency.getSentiment(word, HownetWordTendency.POSITIVE_SEMEMES);
                double negative = tendency.getSentiment(word, HownetWordTendency.NEGATIVE_SEMEMES);
                String text = "[" + word + "]的倾向分析结果为:" ;

                text = text + "\n正面接近程度=" + positive;
                text = text + "\n负面接近程度=" + negative;
                text = text + "\n倾向性=" + (positive - negative);
                // Newest result goes on top; earlier results are kept below it.
                text = text + "\n________________________________\n" + result.getText();
                result.setText(text);
                result.setCaretPosition(0);
            }

        });
        mainPanel.setBorder(BorderFactory.createEtchedBorder());
        northPanel.add(mainPanel);

        return fullPanel;
    }

    /**
     * Builds a standalone frame that shows only the tendency demo panel.
     */
    public TendencyUI(){
        this.setTitle("词语倾向性演示");
        this.setSize(420, 700);
        this.setLocationRelativeTo(null);
        this.setDefaultCloseOperation(EXIT_ON_CLOSE);
        this.getContentPane().setLayout(new BorderLayout());
        this.getContentPane().add(createPanel());
    }

    /**
     * Shows the standalone demo frame.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Build and show the frame on the event dispatch thread, consistent
        // with Start.main; it was previously constructed on the main thread.
        javax.swing.SwingUtilities.invokeLater(new Runnable() {
            @Override
            public void run() {
                new TendencyUI().setVisible(true);
            }
        });
    }
}
--------------------------------------------------------------------------------
/src/main/resources/about.html:
--------------------------------------------------------------------------------
1 | 2 | xsimilarity 3 | 4 |

5 |

XSimilarity

6 | 项目地址:http://github.com/iamxiatian/xsimilarity/ 7 |

8 |

9 | 有任何问题或建议请与我们联系,您的反馈将有助于该项目的进一步完善。 10 |

11 |

12 |

致谢

13 | 本项目在研究过程中,得到了恩师樊孝忠教授的悉心指导,师恩如海,难以言谢!
14 | 中国人民大学为本项目的持续研究提供了资金和计算机软硬件的支持,北京理工大学为本项目的早期研究提供了重要的基础设施,
15 | 这些支持与国家的投入密不可分, 16 | 本项目的开源和不断完善也算是对国家的点滴回报!
17 | 代码中许多算法的核心思想来源于我们的研究同行和先辈们的已公开成果,另外,许多使用xsimilarity的人员对xsimilarity
18 | 提出了宝贵的建议,在此一并表示深深的谢意!
19 | 本工程使用了如下开源组件,对原作者致以谢意! 20 |
    21 |
  • ANSJ:
  • 22 |
23 |

24 |

25 |

联系方式

26 | 夏天
27 | 数据工程与知识工程教育部重点实验室(中国人民大学)
28 | 中国人民大学信息资源管理学院
29 | 电话: 86-10-82500675
30 | Email: xiat(at)ruc.edu.cn
31 |

32 | 33 | 34 | -------------------------------------------------------------------------------- /src/main/resources/data/cilin.db.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamxiatian/xsimilarity/812ec85d3341731e615c3cd91af17c1df4840853/src/main/resources/data/cilin.db.gz -------------------------------------------------------------------------------- /src/main/resources/data/concept.xml.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamxiatian/xsimilarity/812ec85d3341731e615c3cd91af17c1df4840853/src/main/resources/data/concept.xml.gz -------------------------------------------------------------------------------- /src/main/resources/data/sememe.xml.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamxiatian/xsimilarity/812ec85d3341731e615c3cd91af17c1df4840853/src/main/resources/data/sememe.xml.gz -------------------------------------------------------------------------------- /src/main/resources/log4j.dtd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 10 | 11 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 33 | 34 | 35 | 36 | 37 | 38 | 42 | 43 | 44 | 45 | 46 | 47 | 51 | 52 | 53 | 56 | 57 | 58 | 61 | 62 | 63 | 64 | 65 | 66 | 69 | 70 | 71 | 72 | 73 | 76 | 77 | 78 | 82 | 83 | 84 | 85 | 86 | 90 | 91 | 92 | 93 | 97 | 98 | 99 | 100 | 101 | 102 | 107 | 108 | 109 | 110 | 111 | 115 | 116 | 117 | 118 | 120 | 121 | 122 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 141 | 142 | 143 | 144 | 146 | 147 | 148 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 167 | -------------------------------------------------------------------------------- /src/main/resources/log4j.xml: 
--------------------------------------------------------------------------------
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 |
--------------------------------------------------------------------------------
/src/test/java/ruc/irm/similarity/sentence/MorphoSimilarityTest.java:
--------------------------------------------------------------------------------
package ruc.irm.similarity.sentence;

import org.junit.Test;
import ruc.irm.similarity.sentence.morphology.MorphoSimilarity;
import ruc.irm.similarity.sentence.morphology.SemanticSimilarity;

/**
 * Smoke test that prints the morphology-based similarity of one sentence pair.
 */
public class MorphoSimilarityTest {

    @Test
    public void test() {
        // Alternative input pair kept for manual experiments:
        // "一个伟大的国家,中国" / "中国是一个伟大的国家"
        final String first = "修改下密码";
        final String second = "密码修改";

        MorphoSimilarity measure = MorphoSimilarity.getInstance();
        double score = measure.getSimilarity(first, second);
        System.out.println("sim ==> " + score);
    }

}
--------------------------------------------------------------------------------
/src/test/java/ruc/irm/similarity/sentence/SemanticSimilarityTest.java:
--------------------------------------------------------------------------------
package ruc.irm.similarity.sentence;

import org.junit.Test;

import ruc.irm.similarity.sentence.morphology.SemanticSimilarity;

/**
 * Smoke test that prints the semantic similarity of one sentence pair.
 */
public class SemanticSimilarityTest {

    @Test
    public void test() {
        // Alternative input pair kept for manual experiments:
        // "修改下密码" / "密码修改"
        final String first = "一个伟大的国家,中国";
        final String second = "中国是一个伟大的国家";

        SemanticSimilarity measure = SemanticSimilarity.getInstance();
        double score = measure.getSimilarity(first, second);
        System.out.println("sim ==> " + score);
    }

}
--------------------------------------------------------------------------------
/src/test/java/ruc/irm/similarity/statistic/DictStatisticTest.java:
--------------------------------------------------------------------------------
package ruc.irm.similarity.statistic;

import junit.framework.TestCase;

/**
 * ./db/coredict.xml.gz is the dictionary file of ictclas4j; it can be obtained
 * from lib/ictclas4j.jar: unpack ictclas4j.jar and take coredict.xml.gz from
 * its dictionary directory.
 *
 * @author 夏天
 * @organization 中国人民大学信息资源管理学院 知识工程实验室
 */
public class DictStatisticTest extends TestCase {
    public void testCount(){
        DictStatistic ds = new DictStatistic();
        ds.testFromXml("./db/coredict.xml.gz", true);
    }
}
--------------------------------------------------------------------------------
/src/test/java/ruc/irm/similarity/word/CharBasedSimilarityTest.java:
--------------------------------------------------------------------------------
package ruc.irm.similarity.word;

import junit.framework.TestCase;

/**
 * Checks that two words sharing a character get a positive similarity.
 */
public class CharBasedSimilarityTest extends TestCase {
    public void test() {
        CharBasedSimilarity sim = new CharBasedSimilarity();
        String s1 = "手机";
        String s2 = "飞机";

        assertTrue(sim.getSimilarity(s1, s2) > 0);
    }
}
--------------------------------------------------------------------------------
/src/test/java/ruc/irm/similarity/word/hownet/ConceptTest.java:
--------------------------------------------------------------------------------
package ruc.irm.similarity.word.hownet;

import java.util.Collection;

import ruc.irm.similarity.word.hownet2.concept.Concept;

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;

/**
 * Manual experiment: puts the same key/value pairs into a HashMultimap several
 * times and prints what is stored, once for Concept values and once for
 * Integer values.
 */
public class ConceptTest {
    public static void main(String[] args) {
        // Type arguments are required: iterating a raw Collection as Concept
        // (or Integer) below does not compile.
        Multimap<String, Concept> CONCEPTS = HashMultimap.create();
        // CONCEPTS = ArrayListMultimap.create();

        CONCEPTS.put("打", new Concept("打", "V", "TakeOutOfWater|捞起"));
        CONCEPTS.put("打", new Concept("打", "V", "TakeOutOfWater|捞起"));
        CONCEPTS.put("打", new Concept("打", "V", "TakeOutOfWater|捞起"));
        CONCEPTS.put("打", new Concept("打", "V", "TakeOutOfWater|捞起"));

        Collection<Concept> collection = CONCEPTS.get("打");
        for(Concept c:collection){
            System.out.println(c);
        }

        Multimap<String, Integer> map = HashMultimap.create();
        // map = ArrayListMultimap.create();

        map.put("打", 1);
        map.put("打", 1);
        map.put("打", 1);
        map.put("打", 2);

        Collection<Integer> cc = map.get("打");
        for(Integer i:cc){
            System.out.println(i);
        }
    }
}
--------------------------------------------------------------------------------
/src/test/java/ruc/irm/similarity/word/hownet/SememeTest.java:
--------------------------------------------------------------------------------
package ruc.irm.similarity.word.hownet;

import java.io.InputStream;

import ruc.irm.similarity.util.FileUtils;
import ruc.irm.similarity.word.hownet.sememe.Sememe;
import ruc.irm.similarity.word.hownet.sememe.SememeDictTraverseEvent;
import ruc.irm.similarity.word.hownet2.sememe.XiaSememeParser;


/**
 * Experiments around sememes and their hyphen-separated path ids.
 *
 * @author 夏天
 * @organization 中国人民大学信息资源管理学院 知识工程实验室
 */
public class SememeTest {
    public static void main(String[] args) throws Exception{
        String id1 = "2-1-3-4";
        // String id2 = "2-1-2";
        // System.out.println(getDistance(id1, id2));
        // System.out.println(getSimilarityBySememeId(id1, id2));

        // The parent id is everything before the last '-'; ids without a
        // '-' fall back to "root".
        int pos = id1.lastIndexOf("-");
        String parentId = "root";
        if(pos>0){
            parentId = id1.substring(0, pos);
        }
        System.out.println(parentId);
        new XiaSememeParser().getSimilarity("test", "hello");
    }

    /**
     * Reads the sememe.dat classpath resource located in the package of
     * {@link Sememe} and saves it as XML to a fixed local path.
     *
     * @throws Exception if the resource is missing or reading/saving fails
     */
    static void saveXML() throws Exception{
        String sememeFile = Sememe.class.getPackage().getName().replaceAll("\\.", "/") + "/sememe.dat";
        InputStream input = Sememe.class.getClassLoader().getResourceAsStream(sememeFile);
        if (input == null) {
            // getResourceAsStream returns null instead of throwing when the
            // resource does not exist; fail with a clear message.
            throw new java.io.FileNotFoundException("classpath resource not found: " + sememeFile);
        }
        try {
            SememeDictTraverseEvent event = new SememeDictTraverseEvent();

            FileUtils.traverseLines(input, "utf8", event);
            event.saveToXML("/home/xiatian/Desktop/sememe.xml");
        } finally {
            // The stream was previously never closed.
            // NOTE(review): if FileUtils.traverseLines already closes it, this
            // second close is a harmless no-op for classpath streams — confirm.
            input.close();
        }
    }

    /**
     * Similarity of two hyphen-separated ids, computed as
     * 2 * (number of leading segments they share) / (total number of segments
     * in both ids).
     *
     * @param id1 first id, e.g. "2-1-3-4"
     * @param id2 second id, e.g. "2-1-2"
     * @return a value between 0 (no shared leading segment) and 1 (equal ids)
     */
    static double getSimilarityBySememeId(final String id1, final String id2) {

        int position = 0;
        String[] array1 = id1.split("-");
        String[] array2 = id2.split("-");
        for (position = 0; position < array1.length && position < array2.length; position++) {
            if (!array1[position].equals(array2[position])) {
                break;
            }
        }

        return 2.0*position/(array1.length + array2.length);
    }

    /**
     * Distance between two hyphen-separated ids: the number of segments of
     * each id that lie beyond their shared leading segments, added together.
     *
     * @param id1 first id
     * @param id2 second id
     * @return 0 for equal ids, otherwise the summed segment counts past the
     *         common prefix
     */
    static int getDistance(String id1, String id2) {
        // Index at which the common prefix of the two ids ends.
        int position = 0;
        String[] array1 = id1.split("-");
        String[] array2 = id2.split("-");
        for (position = 0; position < array1.length && position < array2.length; position++) {
            if (!array1[position].equals(array2[position])) {
                return array1.length + array2.length - position - position;
            }
        }

        // No mismatch found: the ids are equal or one is a prefix of the other.
        if (array1.length == array2.length) {
            return 0;
        } else if (array1.length == position) {
            return array2.length - position;
        } else {
            return array1.length - position;
        }
    }
}
--------------------------------------------------------------------------------
/src/test/java/ruc/irm/similarity/word/hownet2/HownetSimilarityTest.java:
--------------------------------------------------------------------------------
package ruc.irm.similarity.word.hownet2;

import junit.framework.TestCase;

import org.junit.Before;
import org.junit.Test;

import ruc.irm.similarity.word.hownet2.concept.LiuConceptParser;
import ruc.irm.similarity.word.hownet2.concept.XiaConceptParser;

/**
 * Compares the word similarity computed by XiaConceptParser with the one
 * computed by LiuConceptParser.
 */
public class HownetSimilarityTest extends TestCase {
    XiaConceptParser xParser = null;
    LiuConceptParser lParser = null;

    @Before
    public void setUp(){
        xParser = XiaConceptParser.getInstance();
        lParser = LiuConceptParser.getInstance();
    }

    @Test
    public void testWordSimiarltiy(){
        String word1 = "电动车";
        String word2 = "自行车";
        double x_sim = xParser.getSimilarity(word1, word2);
        double l_sim = lParser.getSimilarity(word1, word2);
        assertTrue(x_sim>l_sim);
        assertTrue(x_sim>0.2);
    }

    /**
     * Computing the similarity for this word used to run into an endless loop;
     * the bug was reported by 万富强 of the Institute of Computational
     * Linguistics, Peking University, fqw0000@gmail.com
     */
    @Test
    public void testWordSimiarltiy2(){
        String word1 = "算法";
        String word2 = "安提瓜和巴布达";
        double x_sim = xParser.getSimilarity(word1, word2);
        double l_sim = lParser.getSimilarity(word1, word2);
        assertTrue(x_sim>=l_sim);
        System.out.println("x_sim:" + x_sim);
        System.out.println("l_sim:" + l_sim);

    }
}


--------------------------------------------------------------------------------
/中文信息相似度计算理论与方法图书目录.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamxiatian/xsimilarity/812ec85d3341731e615c3cd91af17c1df4840853/中文信息相似度计算理论与方法图书目录.pdf
--------------------------------------------------------------------------------