├── .gitignore ├── README.md ├── build.xml ├── dic ├── administrative.dic ├── appellation.dic ├── company.dic ├── comupter-science.dic ├── contemporary-words.dic ├── division │ ├── africa.dic │ ├── america.dic │ ├── china.dic │ ├── europe.dic │ ├── japan.dic │ ├── korea.dic │ ├── oceania.dic │ ├── readme.txt │ └── taiwan.dic ├── festival.dic ├── language.dic ├── locale │ ├── beijing.dic │ ├── fuzhou.dic │ ├── quanzhou.dic │ ├── readme.txt │ └── xiamen.dic ├── name-foreign.dic ├── nation.dic ├── org-domestic.dic ├── org-foreign.dic ├── paoding-dic-names.properties ├── star-domestic.dic ├── star-foreign.dic ├── t-base.dic ├── x-confucian-family-name.dic ├── x-for-combinatorics.dic ├── x-noise-charactor.dic ├── x-noise-word.dic └── x-unit.dic ├── dist └── paoding-analysis-2.0.4.jar ├── pom.xml └── src ├── main ├── java │ └── net │ │ └── paoding │ │ └── analysis │ │ ├── Constants.java │ │ ├── analyzer │ │ ├── PaodingAnalyzer.java │ │ ├── PaodingAnalyzerBean.java │ │ ├── PaodingTokenizer.java │ │ ├── TokenCollector.java │ │ ├── estimate │ │ │ ├── Estimate.java │ │ │ └── TryPaodingAnalyzer.java │ │ └── impl │ │ │ ├── CompiledFileDictionaries.java │ │ │ ├── MaxWordLengthTokenCollector.java │ │ │ ├── MostWordsModeDictionariesCompiler.java │ │ │ ├── MostWordsTokenCollector.java │ │ │ └── SortingDictionariesCompiler.java │ │ ├── dictionary │ │ ├── BinaryDictionary.java │ │ ├── Dictionary.java │ │ ├── DictionaryDelegate.java │ │ ├── HashBinaryDictionary.java │ │ ├── Hit.java │ │ ├── Word.java │ │ └── support │ │ │ ├── detection │ │ │ ├── Detector.java │ │ │ ├── Difference.java │ │ │ ├── DifferenceListener.java │ │ │ ├── ExtensionFileFilter.java │ │ │ ├── Node.java │ │ │ └── Snapshot.java │ │ │ └── filewords │ │ │ ├── FileWordsReader.java │ │ │ ├── ReadListener.java │ │ │ ├── SimpleReadListener.java │ │ │ └── SimpleReadListener2.java │ │ ├── exception │ │ └── PaodingAnalysisException.java │ │ ├── ext │ │ └── PaodingAnalyzerListener.java │ │ └── knife │ │ ├── Beef.java │ │ ├── 
CJKKnife.java │ │ ├── CharSet.java │ │ ├── Collector.java │ │ ├── CollectorStdoutImpl.java │ │ ├── CombinatoricsKnife.java │ │ ├── Dictionaries.java │ │ ├── DictionariesCompiler.java │ │ ├── DictionariesWare.java │ │ ├── FakeKnife.java │ │ ├── FileDictionaries.java │ │ ├── FileDictionariesDifferenceListener.java │ │ ├── Knife.java │ │ ├── KnifeBox.java │ │ ├── LetterKnife.java │ │ ├── NumberKnife.java │ │ ├── Paoding.java │ │ ├── PaodingMaker.java │ │ └── SmartKnifeBox.java └── resources │ ├── paoding-analysis-default.properties │ ├── paoding-analysis.properties │ ├── paoding-analyzer.properties │ ├── paoding-dic-home.properties │ ├── paoding-knives-user.properties │ └── paoding-knives.properties └── test └── java └── net └── paoding └── analysis └── t ├── AnalysisCompare.java ├── InMemoryShortExample.java └── SplitTest.java /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | .project 3 | .classpath 4 | *.iml 5 | *.war 6 | *.ear 7 | 8 | 9 | .idea 10 | .settings 11 | target 12 | dic -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #Paoding分詞器,基於Lucene4.x 2 | 3 | 修正了Lucene 4.9版本中已經移除的方法(見http://goo.gl/qg4aKJ) 4 | 並重新編譯使其可以在4.9版正確載入執行。 5 | 6 | 注意!由於我編譯過的Paoding是以Java 7來編譯的,而Lucene 4.9只支援 Java 7u55以上版本,包含Java 8,若要使用於4.7以下版本則需要重新使用Java 6編譯Paoding成jar後方可使用。否則將會出現NoClassDefFoundError: org/apache/lucene/analysis/Token的錯誤。 7 | 8 | 原作者: http://git.oschina.net/zhzhenqin/paoding-analysis 9 | 10 | #Paoding分词器基于Lucene4.x 11 | 12 | 原项目见 https://code.google.com/p/paoding/ 13 | 14 | #Paoding Analysis摘要 15 | 16 | Paoding's Knives 中文分词具有极 高效率 和 高扩展性 。引入隐喻,采用完全的面向对象设计,构思先进。 17 | 18 | 高效率:在PIII 1G内存个人机器上,1秒 可准确分词 100万 汉字。 19 | 20 | 采用基于 不限制个数 的词典文件对文章进行有效切分,使能够将对词汇分类定义。 21 | 22 | 能够对未知的词汇进行合理解析 23 | 24 | 用心的贡献,极其能鼓励人 25 | 26 | ----------------------! 
27 | 28 | 分词示例如下: 29 | 30 | 31 | TokenStream ts = analyzer.tokenStream("text", new StringReader(text)); 32 | //添加工具类 注意:以下这些与之前lucene2.x版本不同的地方 33 | CharTermAttribute offAtt = (CharTermAttribute) ts.addAttribute(CharTermAttribute.class); 34 | // 循环打印出分词的结果,及分词出现的位置 35 | while (ts.incrementToken()) { 36 | System.out.print(offAtt.toString() + "\t"); 37 | } 38 | 39 | 40 | #编译说明 41 | 42 | 项目默认可以使用Maven直接编译. 43 | 44 | 如果使用Ant,可把依赖的lib放入 {pro_workspace}/target/dependency/ 下. 然后使用ant可以直接编译. 45 | 编译的结果存放在 {pro_workspace}/target/dist/{version}/ 下 46 | 47 | 48 | 可使用Maven的 copy-dependencies 命令直接copy依赖到{pro_workspace}/target/dependency/,然后使用ant编译 49 | 50 | 51 | mvn dependency:copy-dependencies 52 | 53 | 54 | #Solr4.x使用说明 55 | 56 | Solr 4.x以上可以直接配置Lucene的Analyzer. 57 | 配置如: 58 | 59 | 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /build.xml: -------------------------------------------------------------------------------- 1 |  2 | 3 | Builds, tests, and runs the project ndetl. 
4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /dic/administrative.dic: -------------------------------------------------------------------------------- 1 | 人事部 2 | 信息产业部 3 | 农业部 4 | 医管局 5 | 发改委 6 | 国土资源部 7 | 国防部 8 | 外交部 9 | 教育部 10 | 文化部 11 | 民政部 12 | 能源部 13 | 能源部 14 | 财政部 15 | 铁道部 16 | 防卫厅 17 | 防卫省 18 | 革命委员会 -------------------------------------------------------------------------------- /dic/appellation.dic: -------------------------------------------------------------------------------- 1 | 中队长 2 | 主任 3 | 主席 4 | 军长 5 | 医生 6 | 博士 7 | 厂长 8 | 司令 9 | 大队长 10 | 夫人 11 | 小队长 12 | 局长 13 | 师傅 14 | 师长 15 | 总统 16 | 指导 17 | 排长 18 | 教授 19 | 教练 20 | 旅长 21 | 校长 22 | 班长 23 | 秘书 24 | 组长 25 | 经理 26 | 老师 27 | 营长 28 | 董事 29 | 董事长 30 | 连长 31 | 队长 32 | -------------------------------------------------------------------------------- /dic/company.dic: -------------------------------------------------------------------------------- 1 | 中国中央电视台 2 | 中国电信有限公司 3 | 中国移动通讯有限公司 4 | 中国网通有限公司 5 | 中国联合通讯有限公司 6 | 中国联通 7 | 中央电视台 8 | 北京百度科技发展有限公司 9 | 央视 10 | 电信 11 | 百度 12 | 移动 13 | 网通 14 | 联通 15 | -------------------------------------------------------------------------------- /dic/comupter-science.dic: -------------------------------------------------------------------------------- 1 | 主板 2 | 内存 3 | 键盘 4 | -------------------------------------------------------------------------------- /dic/contemporary-words.dic: -------------------------------------------------------------------------------- 1 | 支付宝 2 | 斑竹 3 | 站长 4 | 
贝宝 5 | 陶宝 -------------------------------------------------------------------------------- /dic/division/africa.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ruyaoyao/paoding-analysis/72ce2a4e542dbb36436093c969a12aa4f5c3f685/dic/division/africa.dic -------------------------------------------------------------------------------- /dic/division/america.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ruyaoyao/paoding-analysis/72ce2a4e542dbb36436093c969a12aa4f5c3f685/dic/division/america.dic -------------------------------------------------------------------------------- /dic/division/europe.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ruyaoyao/paoding-analysis/72ce2a4e542dbb36436093c969a12aa4f5c3f685/dic/division/europe.dic -------------------------------------------------------------------------------- /dic/division/japan.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ruyaoyao/paoding-analysis/72ce2a4e542dbb36436093c969a12aa4f5c3f685/dic/division/japan.dic -------------------------------------------------------------------------------- /dic/division/korea.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ruyaoyao/paoding-analysis/72ce2a4e542dbb36436093c969a12aa4f5c3f685/dic/division/korea.dic -------------------------------------------------------------------------------- /dic/division/oceania.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ruyaoyao/paoding-analysis/72ce2a4e542dbb36436093c969a12aa4f5c3f685/dic/division/oceania.dic -------------------------------------------------------------------------------- /dic/division/readme.txt: 
-------------------------------------------------------------------------------- 1 | 地区划分在此记录 2 | 比如中国的省市县,国外的洲、河流等 -------------------------------------------------------------------------------- /dic/division/taiwan.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ruyaoyao/paoding-analysis/72ce2a4e542dbb36436093c969a12aa4f5c3f685/dic/division/taiwan.dic -------------------------------------------------------------------------------- /dic/festival.dic: -------------------------------------------------------------------------------- 1 | 七七纪念日 2 | 七夕 3 | 七夕情人节 4 | 七夕节 5 | 万圣节 6 | 世界人权日 7 | 世界儿歌节 8 | 世界儿童节 9 | 世界动物日 10 | 世界卫生日 11 | 世界地球日 12 | 世界教师日 13 | 世界无烟日 14 | 世界无童工日 15 | 世界林业节 16 | 世界森林日 17 | 世界水日 18 | 世界海洋日 19 | 世界湿地日 20 | 世界献血日 21 | 世界环境日 22 | 世界电视日 23 | 世界睡眠日 24 | 世界粮食日 25 | 世界精神卫生日 26 | 世界红十字日 27 | 世界问候日 28 | 中国人民抗日战争纪念日 29 | 中国国耻日 30 | 中国学生营养日 31 | 中国爱牙日 32 | 中国爱耳日 33 | 中国青年志愿者服务日 34 | 中国青年节 35 | 中秋 36 | 中秋节 37 | 人口日 38 | 人权日 39 | 儿歌节 40 | 儿童节 41 | 元宵 42 | 元宵节 43 | 元旦 44 | 党生日 45 | 全国中小学生安全教育日 46 | 全国助残日 47 | 全国爱眼日 48 | 全国爱耳日 49 | 六十亿人口日 50 | 六四纪念日 51 | 冬至 52 | 减轻自然灾害日 53 | 动物日 54 | 助残日 55 | 劳动妇女节 56 | 劳动节 57 | 博物馆日 58 | 卫生日 59 | 和平日 60 | 国庆 61 | 国庆节 62 | 国耻日 63 | 国际儿童节 64 | 国际减轻自然灾害日 65 | 国际劳动妇女节 66 | 国际劳动节 67 | 国际博物馆日 68 | 国际和平日 69 | 国际奥林匹克日 70 | 国际妇女节 71 | 国际容忍日 72 | 国际左撇子日 73 | 国际志愿者日 74 | 国际护士节 75 | 国际无车日 76 | 国际残疾人日 77 | 国际母语日 78 | 国际气象节 79 | 国际消费者权益日 80 | 国际牛奶日 81 | 国际盲人节 82 | 国际禁毒日 83 | 国际老人日 84 | 国际臭氧层保护日 85 | 国际非洲儿童日 86 | 国际音乐日 87 | 国际麻风日 88 | 圣诞节 89 | 地球日 90 | 处暑 91 | 复活节 92 | 夏至 93 | 大寒 94 | 大暑 95 | 大雪 96 | 奥林匹克日 97 | 妇女节 98 | 学生营养日 99 | 安全教育日 100 | 安全日 101 | 容忍日 102 | 寒露 103 | 小寒 104 | 小年 105 | 小暑 106 | 小满 107 | 小雪 108 | 左撇子日 109 | 平安夜 110 | 建党日 111 | 建军节 112 | 志愿人员日 113 | 志愿者日 114 | 情人节 115 | 惊蛰 116 | 愚人节 117 | 感恩节 118 | 扫房日 119 | 抗日战争纪念日 120 | 抗日纪念日 121 | 护士节 122 | 教师日 123 | 教师节 124 | 文化遗产日 125 | 无烟日 126 | 无童工日 127 | 无车日 128 | 春分 129 | 春节 130 | 植树节 131 | 残疾人日 132 | 母亲节 133 
| 母语日 134 | 气象节 135 | 水日 136 | 海洋日 137 | 消费者权益日 138 | 清明 139 | 清明节 140 | 湿地日 141 | 爱牙日 142 | 爱眼日 143 | 爱耳日 144 | 父亲节 145 | 牛奶日 146 | 独立日 147 | 献血日 148 | 环境日 149 | 电视日 150 | 白露 151 | 盲人节 152 | 睡眠日 153 | 秋分 154 | 立冬 155 | 立夏 156 | 立春 157 | 立秋 158 | 端午节 159 | 粮食日 160 | 精神卫生日 161 | 红十字日 162 | 老人日 163 | 联合国日 164 | 腊八节 165 | 腊日 166 | 臭氧保护日 167 | 臭氧层保护日 168 | 芒种 169 | 营养日 170 | 谷雨 171 | 重阳 172 | 重阳节 173 | 问候日 174 | 除夕 175 | 雨水 176 | 霜降 177 | 青年志愿者服务日 178 | 青年节 179 | 非洲儿童日 180 | 音乐日 181 | 麻风日 182 | 龙头节 183 | -182 184 | -------------------------------------------------------------------------------- /dic/language.dic: -------------------------------------------------------------------------------- 1 | 中文 2 | 台湾话 3 | 台语 4 | 客家话 5 | 汉字 6 | 汉语 7 | 法文 8 | 法语 9 | 福建话 10 | 粤语 11 | 美语 12 | 英文 13 | 英语 14 | 西班牙语 15 | 闽南语 16 | -15 -------------------------------------------------------------------------------- /dic/locale/beijing.dic: -------------------------------------------------------------------------------- 1 | 健翔桥 2 | 北医大 3 | 四惠东 4 | 复兴门 5 | 天安门 6 | 德胜门 7 | 德胜门西 8 | 新街口 9 | 朝阳门 10 | 正阳门 11 | 水立方 12 | 积水潭 13 | 积水潭桥 14 | 苹果园 15 | 西直门 16 | 长安街 17 | -15 18 | -------------------------------------------------------------------------------- /dic/locale/quanzhou.dic: -------------------------------------------------------------------------------- 1 | 东西塔 2 | 崇武 3 | 惠安 4 | 洛阳桥 5 | -------------------------------------------------------------------------------- /dic/locale/readme.txt: -------------------------------------------------------------------------------- 1 | 各地方街道等在此录入 -------------------------------------------------------------------------------- /dic/locale/xiamen.dic: -------------------------------------------------------------------------------- 1 | 思明区 -------------------------------------------------------------------------------- /dic/name-foreign.dic: -------------------------------------------------------------------------------- 1 | 亚历山大 2 | 克林顿 3 | 克里斯汀 4 | 布什 5 
| 布莱尔 6 | 科特勒 7 | 约翰 8 | 约翰逊 9 | 蒂娜 10 | -11 11 | -------------------------------------------------------------------------------- /dic/nation.dic: -------------------------------------------------------------------------------- 1 | 东非 2 | 中华 3 | 中华 4 | 中华人民共和国 5 | 中华民国 6 | 中国 7 | 中国 8 | 中非 9 | 乌克兰 10 | 也门 11 | 以色列 12 | 伊拉克 13 | 伊朗 14 | 俄罗斯 15 | 分类 16 | 加拿大 17 | 南非 18 | 古巴 19 | 台湾 20 | 埃及 21 | 塞尔维亚 22 | 墨西哥 23 | 威尔士 24 | 尼日利亚 25 | 巴比伦 26 | 希腊 27 | 德国 28 | 德意志 29 | 意大利 30 | 捷克 31 | 日本 32 | 朝鲜 33 | 比利时 34 | 法兰西 35 | 法国 36 | 波兰 37 | 波黑 38 | 瑞典 39 | 瑞士 40 | 白俄罗斯 41 | 缅甸 42 | 美利坚 43 | 美利坚合众国 44 | 美国 45 | 老挝 46 | 苏格兰 47 | 苏联 48 | 英国 49 | 英格兰 50 | 葡萄牙 51 | 蒙古 52 | 西班牙 53 | 越南 54 | 韩国 55 | -------------------------------------------------------------------------------- /dic/org-domestic.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ruyaoyao/paoding-analysis/72ce2a4e542dbb36436093c969a12aa4f5c3f685/dic/org-domestic.dic -------------------------------------------------------------------------------- /dic/org-foreign.dic: -------------------------------------------------------------------------------- 1 | 上海合作组织 2 | 世卫 3 | 世界卫生组织 4 | 世界银行 5 | 东盟 6 | 亚太经合组织 7 | 人权理事会 8 | 六方会谈 9 | 北约 10 | 哈马斯 11 | 安全理事会 12 | 安理会 13 | 欧佩克 14 | 红十字会 15 | 联合国 16 | -------------------------------------------------------------------------------- /dic/paoding-dic-names.properties: -------------------------------------------------------------------------------- 1 | #dictionary character encoding 2 | #paoding.dic.charset=UTF-8 3 | 4 | ### Set maximum word length (Chinese character) that analyzer can support. Longer words will be ignored. 5 | ### By default, it is set to "0", which means all words will be analyzed. 
6 | #paoding.dic.maxWordLen=0 7 | 8 | #dictionaries which are skip 9 | #paoding.dic.skip.prefix=x- 10 | 11 | #chinese/cjk charactors that will not token 12 | #paoding.dic.noise-charactor=x-noise-charactor 13 | 14 | #chinese/cjk words that will not token 15 | #paoding.dic.noise-word=x-noise-word 16 | 17 | #unit words, like "ge", "zhi", ... 18 | #paoding.dic.unit=x-unit 19 | 20 | #like "Wang", "Zhang", ... 21 | #paoding.dic.confucian-family-name=x-confucian-family-name 22 | 23 | #linke "uPAN", "cdHE" 24 | #paoding.dic.for-combinatorics=x-for-combinatorics 25 | 26 | 27 | -------------------------------------------------------------------------------- /dic/star-domestic.dic: -------------------------------------------------------------------------------- 1 | 丁俊辉 2 | 乾隆 3 | 刘德华 4 | 刘翔 5 | 华仔 6 | 周杰伦 7 | 姚明 8 | 小丁 9 | 小辉 10 | 庖丁 11 | 康熙 12 | 张学友 13 | 朱军 14 | 朱德 15 | 朱德茂 16 | 朱镕基 17 | 李世民 18 | 李瑞环 19 | 武则天 20 | 毛主席 21 | 毛泽东 22 | 江泽民 23 | 老许 24 | 胡志明 25 | 胡锦涛 26 | 许静蕾 27 | 诸葛亮 28 | 赵本山 29 | 陈佩斯 30 | 马云 31 | 马加爵 32 | -30 33 | #历史、政治、学术、企业、娱乐、体育、社会现象 34 | -------------------------------------------------------------------------------- /dic/star-foreign.dic: -------------------------------------------------------------------------------- 1 | 比尔 2 | 盖茨 3 | -2 4 | -------------------------------------------------------------------------------- /dic/x-confucian-family-name.dic: -------------------------------------------------------------------------------- 1 | 丁 2 | 万 3 | 上官 4 | 丘 5 | 东郭 6 | 严 7 | 丰 8 | 乃 9 | 乌 10 | 乐 11 | 乔 12 | 习 13 | 云 14 | 亚 15 | 什 16 | 仇 17 | 仝 18 | 任 19 | 伊 20 | 伍 21 | 伏 22 | 休 23 | 伦 24 | 伯 25 | 何 26 | 佘 27 | 余 28 | 佟 29 | 佩 30 | 侯 31 | 俄 32 | 保 33 | 俞 34 | 倪 35 | 傅 36 | 储 37 | 元 38 | 克 39 | 公孙 40 | 兰 41 | 关 42 | 兹 43 | 内 44 | 冈 45 | 冉 46 | 农 47 | 冯 48 | 况 49 | 冷 50 | 冼 51 | 凌 52 | 凤 53 | 凯 54 | 刀 55 | 刁 56 | 切 57 | 刘 58 | 利 59 | 加 60 | 努 61 | 励 62 | 劳 63 | 勒 64 | 勾 65 | 包 66 | 匡 67 | 匹 68 | 华 69 | 卓 70 | 单 71 | 单于 72 | 博 73 | 卜 74 | 卞 75 | 卡 76 | 卢 77 | 
卫 78 | 印 79 | 危 80 | 厄 81 | 厉 82 | 及 83 | 古 84 | 可 85 | 史 86 | 叶 87 | 司徒 88 | 司空 89 | 司马 90 | 合 91 | 吉 92 | 吕 93 | 吴 94 | 周 95 | 哈 96 | 哥 97 | 唐 98 | 商 99 | 喀 100 | 喻 101 | 图 102 | 土 103 | 坦 104 | 埃 105 | 基 106 | 塔 107 | 塞 108 | 墨 109 | 夏 110 | 多 111 | 大 112 | 夫 113 | 奇 114 | 奚 115 | 奥 116 | 姆 117 | 姚 118 | 姜 119 | 姬 120 | 娄 121 | 孔 122 | 孙 123 | 孟 124 | 季 125 | 安 126 | 宋 127 | 宗 128 | 官 129 | 宝 130 | 宣 131 | 宫 132 | 容 133 | 宾 134 | 寿 135 | 封 136 | 尉 137 | 小泉 138 | 尔 139 | 尤 140 | 尹 141 | 尼 142 | 居 143 | 屈 144 | 屠 145 | 岑 146 | 岳 147 | 崔 148 | 左 149 | 巫 150 | 巴 151 | 布 152 | 希 153 | 帕 154 | 常 155 | 平 156 | 幸 157 | 庄 158 | 库 159 | 应 160 | 庞 161 | 康 162 | 廉 163 | 廖 164 | 延 165 | 弗 166 | 张 167 | 强 168 | 彦 169 | 彭 170 | 徐 171 | 德 172 | 慕容 173 | 戈 174 | 成 175 | 戚 176 | 戴 177 | 房 178 | 托 179 | 拉 180 | 招 181 | 摩 182 | 敖 183 | 斐 184 | 斯 185 | 方 186 | 於 187 | 昌 188 | 明 189 | 易 190 | 晋 191 | 晏 192 | 普 193 | 曹 194 | 曼 195 | 曾 196 | 朗 197 | 朱 198 | 朴 199 | 权 200 | 李 201 | 杜 202 | 来 203 | 杨 204 | 杭 205 | 杰 206 | 林 207 | 柏 208 | 查 209 | 柯 210 | 柳 211 | 柴 212 | 根 213 | 格 214 | 桂 215 | 桑 216 | 梁 217 | 梅 218 | 森 219 | 楚 220 | 楼 221 | 樊 222 | 欧阳 223 | 武 224 | 段 225 | 殷 226 | 比 227 | 毕 228 | 毛 229 | 江 230 | 池 231 | 汤 232 | 汪 233 | 沃 234 | 沈 235 | 沙 236 | 法 237 | 波 238 | 泰 239 | 泽 240 | 洛 241 | 洪 242 | 浦 243 | 涂 244 | 淳 245 | 温 246 | 游 247 | 湛 248 | 溥 249 | 滕 250 | 满 251 | 潘 252 | 澳 253 | 澹台 254 | 烈 255 | 焦 256 | 熊 257 | 燕 258 | 爱 259 | 爱新觉罗 260 | 牛 261 | 牟 262 | 特 263 | 狄 264 | 王 265 | 班 266 | 理 267 | 瑞 268 | 瑶 269 | 瓦 270 | 甄 271 | 甘 272 | 田 273 | 申 274 | 登 275 | 白 276 | 皇甫 277 | 皮 278 | 盖 279 | 盛 280 | 瞿 281 | 石 282 | 祁 283 | 祖 284 | 祝 285 | 福 286 | 禹 287 | 禾 288 | 科 289 | 秦 290 | 程 291 | 稽 292 | 穆 293 | 空 294 | 窦 295 | 章 296 | 端 297 | 竺 298 | 简 299 | 管 300 | 米 301 | 索 302 | 累 303 | 纪 304 | 纳 305 | 练 306 | 维 307 | 缪 308 | 罗 309 | 翁 310 | 翟 311 | 翦 312 | 耶 313 | 耿 314 | 聂 315 | 胡 316 | 胥 317 | 腓 318 | 腾 319 | 臧 320 | 舍 321 | 舒 322 | 良 323 | 艾 324 | 芬 325 | 芮 326 | 花 327 | 苏 328 | 苗 
329 | 苟 330 | 英 331 | 范 332 | 茅 333 | 茨 334 | 荀 335 | 荆 336 | 荣 337 | 莫 338 | 莱 339 | 萧 340 | 萨 341 | 董 342 | 蒂 343 | 蒋 344 | 蒙 345 | 蒲 346 | 蓝 347 | 蓬 348 | 蔚 349 | 蔡 350 | 薛 351 | 虞 352 | 蚁 353 | 衡 354 | 袁 355 | 裘 356 | 裴 357 | 褚 358 | 西 359 | 解 360 | 言 361 | 詹 362 | 许 363 | 诸 364 | 诸葛 365 | 诺 366 | 谈 367 | 谢 368 | 谭 369 | 谷 370 | 贝 371 | 费 372 | 贺 373 | 贾 374 | 赖 375 | 赛 376 | 赫 377 | 赵 378 | 路 379 | 辛 380 | 辜 381 | 边 382 | 达 383 | 迈 384 | 连 385 | 迟 386 | 迪 387 | 逊 388 | 邓 389 | 邝 390 | 邢 391 | 那 392 | 邬 393 | 邰 394 | 邱 395 | 邵 396 | 邹 397 | 郁 398 | 郎 399 | 郑 400 | 郝 401 | 郭 402 | 都 403 | 里 404 | 金 405 | 钟 406 | 钮 407 | 钱 408 | 银 409 | 闵 410 | 闻 411 | 阎 412 | 阮 413 | 阳 414 | 阿 415 | 陆 416 | 陈 417 | 陶 418 | 隆 419 | 雅 420 | 雷 421 | 霍 422 | 靳 423 | 韦 424 | 韩 425 | 项 426 | 顾 427 | 颜 428 | 饶 429 | 马 430 | 骆 431 | 高 432 | 魏 433 | 鱼 434 | 鲁 435 | 鲍 436 | 鲜 437 | 麦 438 | 麻 439 | 黄 440 | 黎 441 | 黛 442 | 齐 443 | 龙 444 | 龚 445 | -444 446 | -------------------------------------------------------------------------------- /dic/x-for-combinatorics.dic: -------------------------------------------------------------------------------- 1 | U盘 2 | CD盒 3 | CD机 4 | C盘 5 | D盘 6 | E盘 7 | F盘 8 | G盘 9 | H盘 10 | I盘 11 | J盘 12 | K盘 13 | Z盘 14 | K歌之王 15 | A座 16 | B座 17 | C座 18 | D座 19 | E座 20 | F座 21 | A计划 22 | B计划 23 | B超 24 | Q版 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /dic/x-noise-charactor.dic: -------------------------------------------------------------------------------- 1 | 的 2 | 一 3 | 不 4 | 在 5 | 人 6 | 有 7 | 是 8 | 为 9 | 以 10 | 于 11 | 上 12 | 他 13 | 而 14 | 后 15 | 之 16 | 来 17 | 及 18 | 了 19 | 因 20 | 下 21 | 可 22 | 到 23 | 由 24 | 这 25 | 与 26 | 也 27 | 此 28 | 但 29 | 并 30 | 个 31 | 其 32 | 已 33 | 无 34 | 小 35 | 我 36 | 们 37 | 起 38 | 最 39 | 再 40 | 今 41 | 去 42 | 好 43 | 只 44 | 又 45 | 或 46 | 很 47 | 亦 48 | 某 49 | 把 50 | 那 51 | 你 52 | 乃 53 | 它 54 | 吧 55 | 被 56 | 比 57 | 别 58 | 趁 59 | 当 60 | 从 61 | 到 62 | 得 63 | 打 64 | 凡 65 | 儿 66 | 尔 67 | 该 68 
| 各 69 | 给 70 | 跟 71 | 和 72 | 何 73 | 还 74 | 即 75 | 几 76 | 既 77 | 看 78 | 据 79 | 距 80 | 靠 81 | 啦 82 | 了 83 | 另 84 | 么 85 | 每 86 | 们 87 | 嘛 88 | 拿 89 | 哪 90 | 那 91 | 您 92 | 凭 93 | 且 94 | 却 95 | 让 96 | 仍 97 | 啥 98 | 如 99 | 若 100 | 使 101 | 谁 102 | 虽 103 | 随 104 | 同 105 | 所 106 | 她 107 | 哇 108 | 嗡 109 | 往 110 | 哪 111 | 些 112 | 向 113 | 沿 114 | 哟 115 | 用 116 | 于 117 | 咱 118 | 则 119 | 怎 120 | 曾 121 | 至 122 | 致 123 | 着 124 | 诸 125 | 自 -------------------------------------------------------------------------------- /dic/x-noise-word.dic: -------------------------------------------------------------------------------- 1 | 你们 2 | 那样 3 | 所以 4 | 得了 5 | 当地 6 | 有关 7 | 所有 8 | 因之 9 | 用来 10 | 所在 11 | 对待 12 | 而外 13 | 分别 14 | 某些 15 | 对方 16 | 不只 17 | 虽然 18 | 无论 19 | 不论 20 | 无论如何 21 | 但是 22 | 全部 23 | 尽管 24 | 大家 25 | 以便 26 | 自己 27 | 可是 28 | 反之 29 | 这些 30 | 什么 31 | 由此 32 | 万一 33 | 而已 34 | 何以 35 | 咱们 36 | 值此 37 | 向着 38 | 哪怕 39 | 倘若 40 | 出于 41 | 如上 42 | 如若 43 | 替代 44 | 什么样 45 | 如是 46 | 照着 47 | 此处 48 | 这样 49 | 每当 50 | 此次 51 | 至于 52 | 此地 53 | 要不然 54 | 逐步 55 | 格里斯 56 | 本地 57 | 要不 58 | 其次 59 | 尽管如此 60 | 遵循 61 | 乃至 62 | 若是 63 | 并且 64 | 如下 65 | 可以 66 | 才能 67 | 以及 68 | 彼此 69 | 根据 70 | 随后 71 | 有时 72 | -------------------------------------------------------------------------------- /dic/x-unit.dic: -------------------------------------------------------------------------------- 1 | 万 2 | 世 3 | 世纪 4 | 两 5 | 个 6 | 中 7 | 乘 8 | 井 9 | 亩 10 | 人 11 | 人工作日 12 | 人日 13 | 人月日 14 | 亿 15 | 仙 16 | 代 17 | 件 18 | 份 19 | 伏 20 | 伏特 21 | 位 22 | 例 23 | 倍 24 | 元 25 | 兆 26 | 光年 27 | 克 28 | 党 29 | 公顷 30 | 册 31 | 出 32 | 分 33 | 分钟 34 | 划 35 | 列 36 | 刻 37 | 剧 38 | 包 39 | 匹 40 | 区 41 | 千 42 | 升 43 | 单 44 | 卫 45 | 卷 46 | 厂 47 | 厅 48 | 厨 49 | 口 50 | 句 51 | 只 52 | 台 53 | 号 54 | 吨 55 | 员 56 | 周 57 | 周岁 58 | 周年 59 | 品 60 | 回 61 | 团 62 | 国 63 | 圆 64 | 圈 65 | 场 66 | 坪 67 | 堆 68 | 堵 69 | 声 70 | 壶 71 | 处 72 | 夜 73 | 大 74 | 天 75 | 头 76 | 女 77 | 孔 78 | 季 79 | 安 80 | 安培 81 | 宗 82 | 室 83 | 家 84 | 寸 85 | 尺 86 | 尾 87 | 局 88 | 层 89 | 届 90 | 岁 
91 | 市 92 | 带 93 | 幅 94 | 幕 95 | 平方米 96 | 年 97 | 年级 98 | 床 99 | 店 100 | 度 101 | 座 102 | 弄 103 | 式 104 | 张 105 | 微克 106 | 微秒 107 | 微米 108 | 快 109 | 成 110 | 房 111 | 批 112 | 把 113 | 折 114 | 抽 115 | 捧 116 | 撮 117 | 支 118 | 斤 119 | 族 120 | 日 121 | 时 122 | 晚 123 | 曲 124 | 月 125 | 期 126 | 本 127 | 朵 128 | 束 129 | 条 130 | 杯 131 | 柜 132 | 栋 133 | 样 134 | 根 135 | 桌 136 | 桶 137 | 楼 138 | 次 139 | 步 140 | 段 141 | 毫 142 | 毫克 143 | 毫分 144 | 毫升 145 | 毫秒 146 | 毫米 147 | 洞 148 | 派 149 | 滴 150 | 点 151 | 片 152 | 牛 153 | 环 154 | 班 155 | 瓶 156 | 男 157 | 盏 158 | 盒 159 | 盘 160 | 种 161 | 科 162 | 秒 163 | 秒钟 164 | 立方米 165 | 站 166 | 章 167 | 笔 168 | 等 169 | 箱 170 | 米 171 | 粒 172 | 级 173 | 线 174 | 维 175 | 缸 176 | 群 177 | 翻 178 | 艘 179 | 节 180 | 英寸 181 | 行 182 | 袋 183 | 角 184 | 课 185 | 路 186 | 车 187 | 轮 188 | 辆 189 | 辈 190 | 辑 191 | 道 192 | 部 193 | 里 194 | 重 195 | 针 196 | 钱 197 | 门 198 | 间 199 | 阶 200 | 隔 201 | 集 202 | 面 203 | 页 204 | 颗 205 | 首 206 | -205 207 | -------------------------------------------------------------------------------- /dist/paoding-analysis-2.0.4.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ruyaoyao/paoding-analysis/72ce2a4e542dbb36436093c969a12aa4f5c3f685/dist/paoding-analysis-2.0.4.jar -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | net.paoding 8 | paoding-analysis 9 | 2.0.4 10 | 11 | jar 12 | 13 | paoding-analysis, 基于Lucene4.x的分词器 14 | https://code.google.com/p/paoding/ 15 | 16 | 17 | 18 | ZhenQin 19 | zhzhenqin@gmail.com 20 | http://my.oschina.net/zhzhenqin/home 21 | 22 | 23 | 24 | 25 | 4.6.0 26 | 27 | UTF-8 28 | GBK 29 | 30 | ZhenQin 31 | http://my.oschina.net/zhzhenqin/home 32 | 33 | /home/zhenqin/software/apache-tomcat-6.0.35 34 | net.paoding.analysis.analyzer.PaodingAnalyzer 35 | 36 | 37 | 38 | 39 | commons-logging 40 | commons-logging 41 | 1.1.1 
42 | 43 | 44 | 45 | 46 | org.apache.lucene 47 | lucene-core 48 | ${lucene.version} 49 | 50 | 51 | 52 | org.apache.lucene 53 | lucene-analyzers-common 54 | ${lucene.version} 55 | 56 | 57 | 58 | org.apache.lucene 59 | lucene-queries 60 | ${lucene.version} 61 | 62 | 63 | jakarta-regexp 64 | jakarta-regexp 65 | 66 | 67 | 68 | 69 | 70 | 71 | org.apache.lucene 72 | lucene-highlighter 73 | ${lucene.version} 74 | 75 | 76 | 77 | org.apache.lucene 78 | lucene-grouping 79 | ${lucene.version} 80 | 81 | 82 | 83 | org.apache.lucene 84 | lucene-queryparser 85 | ${lucene.version} 86 | 87 | 88 | 89 | junit 90 | junit 91 | 4.8.2 92 | test 93 | 94 | 95 | 96 | 97 | ${project.artifactId} 98 | 99 | 100 | org.apache.maven.plugins 101 | maven-compiler-plugin 102 | 2.5.1 103 | 104 | 1.6 105 | 1.6 106 | UTF-8 107 | 108 | 109 | 110 | 111 | org.codehaus.mojo 112 | exec-maven-plugin 113 | 1.2.1 114 | 115 | ${java.main.class} 116 | 125 | 126 | 127 | 128 | 129 | java 130 | 131 | 132 | 133 | 134 | 135 | 136 | org.mortbay.jetty 137 | maven-jetty-plugin 138 | 6.1.26 139 | 140 | 10 141 | 142 | ${project.artifactId} 143 | 144 | 145 | 146 | 147 | 148 | org.apache.maven.plugins 149 | maven-war-plugin 150 | 151 | ${project.artifactId} 152 | 153 | 154 | true 155 | true 156 | false 157 | 158 | true 159 | true 160 | true 161 | 162 | 163 | ${project.url} 164 | ${vendor.name} 165 | ${verdor.domain} 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | org.apache.maven.plugins 174 | maven-javadoc-plugin 175 | 2.5 176 | 177 | true 178 | zh_CN 179 | ${project.artifactId} Javadoc 180 | 181 | 182 | 183 | 184 | org.codehaus.mojo 185 | tomcat-maven-plugin 186 | 1.1 187 | 188 | 189 | ${tomcat.home} 190 | 8080 191 | ISO-8859-1 192 | /${project.artifactId} 193 | ${project.build.outputDirectory} 194 | ${project.basedir}/src/main/webapp 195 | ${project.basedir}/src/main/webapp/WEB-INF/web.xml 196 | 197 | 198 | 199 | 200 | org.apache.maven.plugins 201 | maven-eclipse-plugin 202 | 2.9 203 | 204 | 205 | 
org.eclipse.jdt.core.javanature 206 | org.eclipse.m2e.core.maven2Nature 207 | org.springframework.ide.eclipse.core.springnature 208 | 209 | 210 | org.eclipse.jdt.core.javabuilder 211 | org.eclipse.m2e.core.maven2Builder 212 | org.springframework.ide.eclipse.core.springbuilder 213 | 214 | 215 | 216 | 217 | 218 | org.apache.maven.plugins 219 | maven-idea-plugin 220 | 2.2 221 | 222 | 223 | 224 | 225 | 226 | src/main/java 227 | 228 | **/*.xml 229 | 230 | 231 | 232 | 233 | src/main/resources 234 | 235 | 236 | 237 | 238 | 239 | src/test/java 240 | 241 | 242 | src/test/resources 243 | 244 | 245 | 246 | 247 | 248 | -------------------------------------------------------------------------------- /src/main/java/net/paoding/analysis/Constants.java: -------------------------------------------------------------------------------- 1 | package net.paoding.analysis; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | import java.util.Properties; 6 | 7 | /** 8 | * 9 | * @author Zhiliang Wang [qieqie.wang@gmail.com] 10 | * 11 | * @since 2.0.0 12 | */ 13 | public class Constants { 14 | 15 | /** 16 | * "词典目录安装目录"配置的优先级别 17 | *

18 | * "system-env"以及其他非"this"的配置,表示优先从环境变量PAODING_DIC_HOME的值找词典目录安装环境 19 | * "this"表示优先从本配置文件的paoding.dic.home配置项找
20 | * 只有在高优先级没有配置,才会找低优先级的配置。 默认环境变量的优先级别高于paoding-analysis.properties属性文件配置。 21 | */ 22 | public static final String DIC_HOME_CONFIG_FIRST = "paoding.dic.home.config-first"; 23 | public static final String DIC_HOME_CONFIG_FIRST_DEFAULT = "system-env"; 24 | 25 | /** 26 | * 词典安装目录环境变量名 27 | */ 28 | public static final String ENV_PAODING_DIC_HOME = "PAODING_DIC_HOME"; 29 | 30 | // ------------------------------------------------------------- 31 | /** 32 | * 词典安装目录 33 | *

34 | * 默认值为null,以在环境变量和配置文件都没有配置paoding.dic.home的情况下,让PaodingMaker尝试从当前工作目录下、类路径下探索是否存在dic目录 35 | */ 36 | public static final String DIC_HOME = "paoding.dic.home"; 37 | public static final String DIC_HOME_DEFAULT = null; 38 | 39 | // ------------------------------------------------------------- 40 | // 41 | public static final String DIC_CHARSET = "paoding.dic.charset"; 42 | public static final String DIC_CHARSET_DEFAULT = "UTF-8"; 43 | 44 | // dictionary word length limit 45 | public static final String DIC_MAXWORDLEN = "paoding.dic.maxWordLen"; 46 | public static final String DIC_MAXWORDLEN_DEFAULT = "0"; 47 | 48 | // ------------------------------------------------------------- 49 | // dictionaries which are skip 50 | public static final String DIC_SKIP_PREFIX = "paoding.dic.skip.prefix"; 51 | public static final String DIC_SKIP_PREFIX_DEFAULT = "x-"; 52 | 53 | // ------------------------------------------------------------- 54 | // chinese/cjk charactors that will not token 55 | public static final String DIC_NOISE_CHARACTOR = "paoding.dic.noise-charactor"; 56 | public static final String DIC_NOISE_CHARACTOR_DEFAULT = "x-noise-charactor"; 57 | 58 | // ------------------------------------------------------------- 59 | // chinese/cjk words that will not token 60 | public static final String DIC_NOISE_WORD = "paoding.dic.noise-word"; 61 | public static final String DIC_NOISE_WORD_DEFAULT = "x-noise-word"; 62 | 63 | // ------------------------------------------------------------- 64 | // unit words, like "ge", "zhi", ... 65 | public static final String DIC_UNIT = "paoding.dic.unit"; 66 | public static final String DIC_UNIT_DEFAULT = "x-unit"; 67 | 68 | // ------------------------------------------------------------- 69 | // like "Wang", "Zhang", ... 
70 | public static final String DIC_CONFUCIAN_FAMILY_NAME = "paoding.dic.confucian-family-name"; 71 | public static final String DIC_CONFUCIAN_FAMILY_NAME_DEFAULT = "x-confucian-family-name"; 72 | 73 | // ------------------------------------------------------------- 74 | // like 75 | public static final String DIC_FOR_COMBINATORICS = "paoding.dic.for-combinatorics"; 76 | public static final String DIC_FOR_COMBINATORICS_DEFAULT = "x-for-combinatorics"; 77 | 78 | // ------------------------------------------------------------- 79 | // like 80 | public static final String DIC_DETECTOR_INTERVAL = "paoding.dic.detector.interval"; 81 | public static final String DIC_DETECTOR_INTERVAL_DEFAULT = "60"; 82 | 83 | // ------------------------------------------------------------- 84 | // like "default", "max", ... 85 | public static final String ANALYZER_MODE = "paoding.analyzer.mode"; 86 | public static final String ANALYZER_MOE_DEFAULT = "most-words"; 87 | 88 | // ------------------------------------------------------------- 89 | // 90 | public static final String ANALYZER_DICTIONARIES_COMPILER = "paoding.analyzer.dictionaries.compiler"; 91 | public static final String ANALYZER_DICTIONARIES_COMPILER_DEFAULT = null; 92 | 93 | // ------------------------------------------------------------- 94 | private static final Map map = new HashMap(); 95 | 96 | static { 97 | map.put(DIC_HOME_CONFIG_FIRST, DIC_HOME_CONFIG_FIRST_DEFAULT); 98 | map.put(DIC_HOME, DIC_HOME_DEFAULT); 99 | map.put(DIC_CHARSET, DIC_CHARSET_DEFAULT); 100 | map.put(DIC_MAXWORDLEN, DIC_MAXWORDLEN_DEFAULT); 101 | map.put(DIC_SKIP_PREFIX, DIC_SKIP_PREFIX_DEFAULT); 102 | map.put(DIC_NOISE_CHARACTOR, DIC_NOISE_CHARACTOR_DEFAULT); 103 | map.put(DIC_NOISE_WORD, DIC_NOISE_WORD_DEFAULT); 104 | map.put(DIC_UNIT, DIC_UNIT_DEFAULT); 105 | map.put(DIC_CONFUCIAN_FAMILY_NAME, DIC_CONFUCIAN_FAMILY_NAME_DEFAULT); 106 | map.put(DIC_FOR_COMBINATORICS, DIC_FOR_COMBINATORICS_DEFAULT); 107 | map.put(DIC_DETECTOR_INTERVAL, 
DIC_DETECTOR_INTERVAL_DEFAULT); 108 | map.put(ANALYZER_MODE, ANALYZER_MOE_DEFAULT); 109 | map.put(ANALYZER_DICTIONARIES_COMPILER, ANALYZER_DICTIONARIES_COMPILER_DEFAULT); 110 | } 111 | 112 | // 113 | public static final String KNIFE_CLASS = "paoding.knife.class."; 114 | 115 | public static String getProperty(Properties p, String name) { 116 | return p.getProperty(name, (String) map.get(name)); 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /src/main/java/net/paoding/analysis/analyzer/PaodingAnalyzer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2007 The Apache Software Foundation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package net.paoding.analysis.analyzer; 17 | 18 | import java.util.Properties; 19 | 20 | import net.paoding.analysis.Constants; 21 | import net.paoding.analysis.analyzer.estimate.TryPaodingAnalyzer; 22 | import net.paoding.analysis.knife.Knife; 23 | import net.paoding.analysis.knife.Paoding; 24 | import net.paoding.analysis.knife.PaodingMaker; 25 | 26 | /** 27 | * PaodingAnalyzer是基于“庖丁解牛”框架的Lucene词语分析器,是“庖丁解牛”框架对Lucene的适配器。 28 | *

29 | * 30 | * PaodingAnalyzer是线程安全的:并发情况下使用同一个PaodingAnalyzer实例是可行的。
31 | * PaodingAnalyzer是可复用的：推荐多次使用同一个PaodingAnalyzer实例。 32 | *

33 | * 34 | * PaodingAnalyzer自动读取类路径下的paoding-analysis.properties属性文件,装配PaodingAnalyzer 35 | *

36 | * 37 | * @author Zhiliang Wang [qieqie.wang@gmail.com] 38 | * 39 | * @see PaodingAnalyzerBean 40 | * 41 | * @since 1.0 42 | * 43 | */ 44 | public class PaodingAnalyzer extends PaodingAnalyzerBean { 45 | 46 | /** 47 | * 根据类路径下的paoding-analysis.properties构建一个PaodingAnalyzer对象 48 | *

49 | * 在一个JVM中,可多次创建,而并不会多次读取属性文件,不会重复读取字典。 50 | */ 51 | public PaodingAnalyzer() { 52 | this(PaodingMaker.DEFAULT_PROPERTIES_PATH); 53 | } 54 | 55 | /** 56 | * @param propertiesPath null表示使用类路径下的paoding-analysis.properties 57 | */ 58 | public PaodingAnalyzer(String propertiesPath) { 59 | init(propertiesPath); 60 | } 61 | 62 | protected void init(String propertiesPath) { 63 | // 根据PaodingMaker说明, 64 | // 1、多次调用getProperties(),返回的都是同一个properties实例(只要属性文件没发生过修改) 65 | // 2、相同的properties实例,PaodingMaker也将返回同一个Paoding实例 66 | // 根据以上1、2点说明,在此能够保证多次创建PaodingAnalyzer并不会多次装载属性文件和词典 67 | if (propertiesPath == null) { 68 | propertiesPath = PaodingMaker.DEFAULT_PROPERTIES_PATH; 69 | } 70 | Properties properties = PaodingMaker.getProperties(propertiesPath); 71 | String mode = Constants 72 | .getProperty(properties, Constants.ANALYZER_MODE); 73 | Paoding paoding = PaodingMaker.make(properties); 74 | setKnife(paoding); 75 | setMode(mode); 76 | } 77 | 78 | /** 79 | * 本方法为PaodingAnalyzer附带的测试评估方法。
80 | * 执行之可以查看分词效果。以下任选一种方式进行: 81 | *

82 | * 83 | * java net...PaodingAnalyzer
84 | * java net...PaodingAnalyzer --help
85 | * java net...PaodingAnalyzer 中华人民共和国
86 | * java net...PaodingAnalyzer -m max 中华人民共和国
87 | * java net...PaodingAnalyzer -f c:/text.txt
88 | * java net...PaodingAnalyzer -f c:/text.txt -c utf-8
89 | * 90 | * @param args 91 | */ 92 | public static void main(String[] args) { 93 | if (System.getProperty("paoding.try.app") == null) { 94 | System.setProperty("paoding.try.app", "PaodingAnalyzer"); 95 | System.setProperty("paoding.try.cmd", "java PaodingAnalyzer"); 96 | } 97 | TryPaodingAnalyzer.main(args); 98 | } 99 | 100 | // -------------------------------------------------- 101 | 102 | /** 103 | * @param knife 104 | * @param mode default_mode 105 | * @deprecated 106 | */ 107 | public PaodingAnalyzer(Knife knife, int mode) { 108 | super(knife, mode); 109 | } 110 | 111 | /** 112 | * 等价于maxMode() 113 | * 114 | * @param knife 115 | * @return 116 | * @deprecated 117 | */ 118 | public static PaodingAnalyzer queryMode(Knife knife) { 119 | return maxMode(knife); 120 | } 121 | 122 | /** 123 | * 124 | * @param knife 125 | * @return 126 | * @deprecated 127 | */ 128 | public static PaodingAnalyzer defaultMode(Knife knife) { 129 | return new PaodingAnalyzer(knife, MOST_WORDS_MODE); 130 | } 131 | 132 | /** 133 | * 134 | * @param knife 135 | * @return 136 | * @deprecated 137 | */ 138 | public static PaodingAnalyzer maxMode(Knife knife) { 139 | return new PaodingAnalyzer(knife, MAX_WORD_LENGTH_MODE); 140 | } 141 | 142 | /** 143 | * 等价于defaultMode() 144 | * 145 | * @param knife 146 | * @return 147 | * @deprecated 148 | * 149 | */ 150 | public static PaodingAnalyzer writerMode(Knife knife) { 151 | return defaultMode(knife); 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /src/main/java/net/paoding/analysis/analyzer/PaodingAnalyzerBean.java: -------------------------------------------------------------------------------- 1 | package net.paoding.analysis.analyzer; 2 | 3 | import java.io.Reader; 4 | 5 | import net.paoding.analysis.analyzer.impl.MaxWordLengthTokenCollector; 6 | import net.paoding.analysis.analyzer.impl.MostWordsTokenCollector; 7 | import net.paoding.analysis.knife.Knife; 8 | 9 | import 
org.apache.lucene.analysis.Analyzer; 10 | 11 | 12 | /** 13 | * 14 | * 15 | * @author ZhenQin, linliangyi 16 | */ 17 | public class PaodingAnalyzerBean extends Analyzer { 18 | 19 | // ------------------------------------------------- 20 | 21 | /** 22 | * 最多切分 23 | */ 24 | public static final int MOST_WORDS_MODE = 1; 25 | 26 | /** 27 | * 按最大切分 28 | */ 29 | public static final int MAX_WORD_LENGTH_MODE = 2; 30 | 31 | // ------------------------------------------------- 32 | /** 33 | * 用于向PaodingTokenizer提供,分解文本字符 34 | * 35 | * @see net.paoding.analysis.analyzer.PaodingTokenizer#incrementToken() 36 | * 37 | */ 38 | private Knife knife; 39 | 40 | /** 41 | * @see #MOST_WORDS_MODE 42 | * @see #MAX_WORD_LENGTH_MODE 43 | */ 44 | private int mode = MOST_WORDS_MODE; 45 | 46 | /** 47 | * 反射执行默认构造方法 48 | */ 49 | private Class modeClass; 50 | 51 | // ------------------------------------------------- 52 | 53 | public PaodingAnalyzerBean() { 54 | } 55 | 56 | /** 57 | * @see #setKnife(Knife) 58 | * @param knife 59 | */ 60 | public PaodingAnalyzerBean(Knife knife) { 61 | this.knife = knife; 62 | } 63 | 64 | /** 65 | * @see #setKnife(Knife) 66 | * @see #setMode(int) 67 | * @param knife 68 | * @param mode 69 | */ 70 | public PaodingAnalyzerBean(Knife knife, int mode) { 71 | this.knife = knife; 72 | this.mode = mode; 73 | } 74 | 75 | /** 76 | * @see #setKnife(Knife) 77 | * @see #setMode(int) 78 | * @param knife 79 | * @param mode 80 | */ 81 | public PaodingAnalyzerBean(Knife knife, String mode) { 82 | this.knife = knife; 83 | this.setMode(mode); 84 | } 85 | 86 | 87 | @Override 88 | protected TokenStreamComponents createComponents(String fieldName, Reader reader) { 89 | if (knife == null) { 90 | throw new NullPointerException("knife should be set before token"); 91 | } 92 | // PaodingTokenizer是TokenStream实现,使用knife解析reader流入的文本 93 | return new TokenStreamComponents(new PaodingTokenizer(reader, 94 | knife, createTokenCollector())); 95 | } 96 | 97 | 98 | 99 | protected TokenCollector 
createTokenCollector() { 100 | if (modeClass != null) { 101 | try { 102 | return (TokenCollector) modeClass.newInstance(); 103 | } catch (InstantiationException e) { 104 | throw new IllegalArgumentException("wrong mode class:" + e.getMessage()); 105 | } catch (IllegalAccessException e) { 106 | throw new IllegalArgumentException("wrong mode class:" + e.getMessage()); 107 | } 108 | } 109 | switch (mode) { 110 | case MOST_WORDS_MODE: 111 | return new MostWordsTokenCollector(); 112 | case MAX_WORD_LENGTH_MODE: 113 | return new MaxWordLengthTokenCollector(); 114 | default: 115 | throw new Error("never happened"); 116 | } 117 | } 118 | 119 | 120 | // ------------------------------------------------- 121 | 122 | public Knife getKnife() { 123 | return knife; 124 | } 125 | 126 | public void setKnife(Knife knife) { 127 | this.knife = knife; 128 | } 129 | 130 | public int getMode() { 131 | return mode; 132 | } 133 | 134 | /** 135 | * 设置分析器模式. 136 | *

137 | * 138 | * @param mode 139 | */ 140 | public void setMode(int mode) { 141 | if (mode != MOST_WORDS_MODE && mode != MAX_WORD_LENGTH_MODE) { 142 | throw new IllegalArgumentException("wrong mode:" + mode); 143 | } 144 | this.mode = mode; 145 | this.modeClass = null; 146 | } 147 | 148 | /** 149 | * 设置分析器模式类。 150 | * 151 | * @param modeClass 152 | * TokenCollector的实现类。 153 | */ 154 | public void setModeClass(Class modeClass) { 155 | this.modeClass = modeClass; 156 | } 157 | 158 | public void setModeClass(String modeClass) { 159 | try { 160 | this.modeClass = Class.forName(modeClass); 161 | } catch (ClassNotFoundException e) { 162 | throw new IllegalArgumentException("not found mode class:" + e.getMessage()); 163 | } 164 | } 165 | 166 | 167 | public void setMode(String mode) { 168 | if (mode.startsWith("class:")) { 169 | setModeClass(mode.substring("class:".length())); 170 | } else { 171 | if ("most-words".equalsIgnoreCase(mode) 172 | || "default".equalsIgnoreCase(mode) 173 | || ("" + MOST_WORDS_MODE).equals(mode)) { 174 | setMode(MOST_WORDS_MODE); 175 | } else if ("max-word-length".equalsIgnoreCase(mode) 176 | || ("" + MAX_WORD_LENGTH_MODE).equals(mode)) { 177 | setMode(MAX_WORD_LENGTH_MODE); 178 | } else { 179 | throw new IllegalArgumentException("不合法的分析器Mode参数设置:" + mode); 180 | } 181 | } 182 | } 183 | 184 | // ------------------------------------------------- 185 | } 186 | -------------------------------------------------------------------------------- /src/main/java/net/paoding/analysis/analyzer/PaodingTokenizer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2007 The Apache Software Foundation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package net.paoding.analysis.analyzer; 17 | 18 | import net.paoding.analysis.analyzer.impl.MostWordsTokenCollector; 19 | import net.paoding.analysis.knife.Beef; 20 | import net.paoding.analysis.knife.Collector; 21 | import net.paoding.analysis.knife.Knife; 22 | import net.paoding.analysis.knife.Paoding; 23 | import org.apache.lucene.analysis.Token; 24 | import org.apache.lucene.analysis.Tokenizer; 25 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 26 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 27 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; 28 | import org.apache.lucene.util.BytesRef; 29 | 30 | import java.io.IOException; 31 | import java.io.Reader; 32 | import java.util.Iterator; 33 | 34 | /** 35 | * PaodingTokenizer是基于“庖丁解牛”框架的TokenStream实现,为PaodingAnalyzer使用。 36 | *

37 | * 38 | * @author Zhiliang Wang [qieqie.wang@gmail.com] 39 | * 40 | * @see Beef 41 | * @see Knife 42 | * @see Paoding 43 | * @see Tokenizer 44 | * @see PaodingAnalyzer 45 | * 46 | * @see Collector 47 | * @see TokenCollector 48 | * @see net.paoding.analysis.analyzer.impl.MaxWordLengthTokenCollector 49 | * @see MostWordsTokenCollector 50 | * 51 | * @since 1.0 52 | */ 53 | public final class PaodingTokenizer extends Tokenizer implements Collector { 54 | 55 | // ------------------------------------------------- 56 | 57 | /** 58 | * 从input读入的总字符数 59 | */ 60 | private int inputLength; 61 | 62 | /** 63 | * 64 | */ 65 | private static final int bufferLength = 128; 66 | 67 | /** 68 | * 接收来自{@link #input}的文本字符 69 | * 70 | * @see #incrementToken() 71 | */ 72 | protected final char[] buffer = new char[bufferLength]; 73 | 74 | /** 75 | * {@link #buffer}[0]在{@link #input}中的偏移 76 | * 77 | * @see #collect(String, int, int) 78 | * @see #incrementToken() 79 | */ 80 | private int offset; 81 | 82 | /** 83 | * 84 | */ 85 | private final Beef beef = new Beef(buffer, 0, 0); 86 | 87 | /** 88 | * 89 | */ 90 | private int dissected; 91 | 92 | /** 93 | * 用于分解beef中的文本字符,由PaodingAnalyzer提供 94 | * 95 | * @see #incrementToken() 96 | */ 97 | private Knife knife; 98 | 99 | /** 100 | * 切分句子后在这里保存所有的词 101 | */ 102 | private TokenCollector tokenCollector; 103 | 104 | /** 105 | * tokens迭代器,用于next()方法顺序读取tokens中的Token对象 106 | * 107 | * @see #tokenCollector 108 | * @see #incrementToken() 109 | */ 110 | private Iterator tokenIteractor; 111 | 112 | 113 | 114 | private CharTermAttribute termAtt; 115 | private OffsetAttribute offsetAtt; 116 | private PositionIncrementAttribute positionIncrementAttribute; 117 | 118 | // ------------------------------------------------- 119 | 120 | /** 121 | * 122 | * @param input 123 | * @param knife 124 | * @param tokenCollector 125 | */ 126 | public PaodingTokenizer(Reader input, Knife knife, 127 | TokenCollector tokenCollector) { 128 | super(input); 129 | this.input = 
input; 130 | this.knife = knife; 131 | this.tokenCollector = tokenCollector; 132 | init(); 133 | } 134 | 135 | 136 | private void init() { 137 | termAtt = addAttribute(CharTermAttribute.class); 138 | offsetAtt = addAttribute(OffsetAttribute.class); 139 | positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class); 140 | //typeAtt = addAttribute(TypeAttribute.class); 141 | } 142 | 143 | // ------------------------------------------------- 144 | 145 | public TokenCollector getTokenCollector() { 146 | return tokenCollector; 147 | } 148 | 149 | public void setTokenCollector(TokenCollector tokenCollector) { 150 | this.tokenCollector = tokenCollector; 151 | } 152 | 153 | // ------------------------------------------------- 154 | 155 | public void collect(String word, int offset, int end) { 156 | tokenCollector.collect(word, this.offset + offset, this.offset + end); 157 | } 158 | 159 | // ------------------------------------------------- 160 | public int getInputLength() { 161 | return inputLength; 162 | } 163 | 164 | @Override 165 | public boolean incrementToken() throws IOException { 166 | // 已经穷尽tokensIteractor的Token对象,则继续请求reader流入数据 167 | while (tokenIteractor == null || !tokenIteractor.hasNext()) { 168 | // System.out.println(dissected); 169 | int read = 0; 170 | int remainning = -1;// 重新从reader读入字符前,buffer中还剩下的字符数,负数表示当前暂不需要从reader中读入字符 171 | if (dissected >= beef.length()) { 172 | remainning = 0; 173 | } else if (dissected < 0) { 174 | remainning = bufferLength + dissected; 175 | } 176 | if (remainning >= 0) { 177 | if (remainning > 0) { 178 | System.arraycopy(buffer, -dissected, buffer, 0, remainning); 179 | } 180 | read = input 181 | .read(buffer, remainning, bufferLength - remainning); 182 | inputLength += read; 183 | int charCount = remainning + read; 184 | if (charCount < 0) { 185 | // reader已尽,按接口next()要求返回null. 
186 | return false; 187 | } 188 | if (charCount < bufferLength) { 189 | buffer[charCount++] = 0; 190 | } 191 | // 构造“牛”，并使用knife“解”之 192 | beef.set(0, charCount); 193 | offset += Math.abs(dissected); 194 | // offset -= remainning; 195 | dissected = 0; 196 | } 197 | dissected = knife.dissect(this, beef, dissected); 198 | // offset += read;// !!! 199 | tokenIteractor = tokenCollector.iterator(); 200 | } 201 | 202 | if(tokenIteractor.hasNext()) { 203 | // 返回tokensIteractor下一个Token对象 204 | Token token = tokenIteractor.next(); 205 | termAtt.setEmpty(); 206 | termAtt.append(token); 207 | offsetAtt.setOffset(correctOffset(token.startOffset()), 208 | correctOffset(token.endOffset())); 209 | positionIncrementAttribute.setPositionIncrement(token.getPositionIncrement()); /* FIX: was token.endOffset() — endOffset is a character offset, not a position gap; feeding it to PositionIncrementAttribute inflates token positions and breaks phrase/slop queries. Token.getPositionIncrement() defaults to 1, the correct gap for adjacent tokens. */ 210 | return true; 211 | } 212 | return tokenIteractor.hasNext(); 213 | } 214 | 215 | @Override 216 | public void reset() throws IOException { 217 | super.reset(); 218 | offset = 0; 219 | inputLength = 0; 220 | tokenIteractor = null; 221 | } 222 | } 223 | -------------------------------------------------------------------------------- /src/main/java/net/paoding/analysis/analyzer/TokenCollector.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2007 The Apache Software Foundation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | package net.paoding.analysis.analyzer; 17 | 18 | import java.util.Iterator; 19 | 20 | import org.apache.lucene.analysis.Token; 21 | 22 | import net.paoding.analysis.knife.Collector; 23 | 24 | /** 25 | * 26 | * @author Zhiliang Wang [qieqie.wang@gmail.com] 27 | * 28 | * @since 1.1 29 | */ 30 | public interface TokenCollector extends Collector { 31 | 32 | /** 33 | * 34 | * @return 35 | */ 36 | public Iterator iterator(); 37 | } 38 | -------------------------------------------------------------------------------- /src/main/java/net/paoding/analysis/analyzer/estimate/Estimate.java: -------------------------------------------------------------------------------- 1 | package net.paoding.analysis.analyzer.estimate; 2 | 3 | import java.io.IOException; 4 | import java.io.PrintStream; 5 | import java.io.Reader; 6 | import java.io.StringReader; 7 | import java.util.LinkedList; 8 | 9 | import net.paoding.analysis.analyzer.PaodingTokenizer; 10 | 11 | import org.apache.lucene.analysis.Analyzer; 12 | import org.apache.lucene.analysis.TokenStream; 13 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 14 | 15 | public class Estimate { 16 | private Analyzer analyzer; 17 | private String print; 18 | private PrintGate printGate; 19 | 20 | public Estimate() { 21 | this.setPrint("50");// 默认只打印前50行分词效果 22 | } 23 | 24 | public Estimate(Analyzer analyzer) { 25 | setAnalyzer(analyzer); 26 | this.setPrint("50");// 默认只打印前50行分词效果 27 | } 28 | 29 | public void setAnalyzer(Analyzer analyzer) { 30 | this.analyzer = analyzer; 31 | } 32 | 33 | public Analyzer getAnalyzer() { 34 | return analyzer; 35 | } 36 | 37 | public void setPrint(String print) { 38 | if (print == null || print.length() == 0 39 | || print.equalsIgnoreCase("null") 40 | || print.equalsIgnoreCase("no")) { 41 | printGate = null; 42 | this.print = null; 43 | } else { 44 | printGate = new LinePrintGate(); 45 | printGate.setPrint(print, 10); 46 | this.print = print; 47 | } 48 | } 49 | 50 | public String 
getPrint() { 51 | return print; 52 | } 53 | 54 | public void test(String input) { 55 | this.test(System.out, input); 56 | } 57 | 58 | public void test(PrintStream out, String input) { 59 | Reader reader = new StringReaderEx(input); 60 | this.test(out, reader); 61 | } 62 | 63 | public void test(PrintStream out, Reader reader) { 64 | try { 65 | long begin = System.currentTimeMillis(); 66 | 67 | LinkedList list = new LinkedList(); 68 | int wordsCount = 0; 69 | 70 | //collect token 71 | TokenStream ts = analyzer.tokenStream("", reader); 72 | ts.reset(); 73 | CharTermAttribute termAtt = (CharTermAttribute) ts 74 | .addAttribute(CharTermAttribute.class); 75 | while (ts.incrementToken()) { 76 | if (printGate != null && printGate.filter(wordsCount)) { 77 | list.add(new CToken(termAtt.toString(), wordsCount)); 78 | } 79 | wordsCount++; 80 | } 81 | 82 | long end = System.currentTimeMillis(); 83 | int c = 0; 84 | if (list.size() > 0) { 85 | for (CToken ctoken : list) { 86 | c = ctoken.i; 87 | if (c % 10 == 0) { 88 | if (c != 0) { 89 | out.println(); 90 | } 91 | out.print((c / 10 + 1) + ":\t"); 92 | } 93 | out.print(ctoken.t + "/"); 94 | } 95 | } 96 | if (wordsCount == 0) { 97 | System.out.println("\tAll are noise characters or words"); 98 | } else { 99 | if (c % 10 != 1) { 100 | System.out.println(); 101 | } 102 | String inputLength = "<未知>"; 103 | if (reader instanceof StringReaderEx) { 104 | inputLength = "" + ((StringReaderEx) reader).inputLength; 105 | } else if (ts instanceof PaodingTokenizer) { 106 | inputLength = "" + ((PaodingTokenizer) ts).getInputLength(); 107 | } 108 | System.out.println(); 109 | System.out.println("\t分词器" + analyzer.getClass().getName()); 110 | System.out.println("\t内容长度 " + inputLength + "字符, 分 " 111 | + wordsCount + "个词"); 112 | System.out.println("\t分词耗时 " + (end - begin) + "ms "); 113 | } 114 | } catch (IOException e) { 115 | e.printStackTrace(); 116 | } finally { 117 | try { 118 | reader.close(); 119 | } catch (IOException e) { 120 | } 121 | 
} 122 | } 123 | 124 | // ------------------------------------------- 125 | 126 | static class CToken { 127 | String t; 128 | int i; 129 | 130 | CToken(String t, int i) { 131 | this.t = t; 132 | this.i = i; 133 | } 134 | } 135 | 136 | static interface PrintGate { 137 | public void setPrint(String print, int unitSize); 138 | 139 | boolean filter(int count); 140 | } 141 | 142 | static class PrintGateToken implements PrintGate { 143 | private int begin; 144 | private int end; 145 | 146 | public void setBegin(int begin) { 147 | this.begin = begin; 148 | } 149 | 150 | public void setEnd(int end) { 151 | this.end = end; 152 | } 153 | 154 | public void setPrint(String print, int unitSize) { 155 | int i = print.indexOf('-'); 156 | if (i > 0) { 157 | int bv = Integer.parseInt(print.substring(0, i)); 158 | int ev = Integer.parseInt(print.substring(i + 1)); 159 | setBegin(unitSize * (Math.abs(bv) - 1));// 第5行,是从第40开始的 160 | setEnd(unitSize * Math.abs(ev));// 到第10行,是截止于100(不包含该边界) 161 | } else { 162 | setBegin(0); 163 | int v = Integer.parseInt(print); 164 | setEnd(unitSize * (Math.abs(v))); 165 | } 166 | } 167 | 168 | public boolean filter(int count) { 169 | return count >= begin && count < end; 170 | } 171 | } 172 | 173 | static class LinePrintGate implements PrintGate { 174 | 175 | private PrintGate[] list; 176 | 177 | public void setPrint(String print, int unitSize) { 178 | String[] prints = print.split(","); 179 | list = new PrintGate[prints.length]; 180 | for (int i = 0; i < prints.length; i++) { 181 | PrintGateToken pg = new PrintGateToken(); 182 | pg.setPrint(prints[i], unitSize); 183 | list[i] = pg; 184 | } 185 | } 186 | 187 | public boolean filter(int count) { 188 | for (int i = 0; i < list.length; i++) { 189 | if (list[i].filter(count)) { 190 | return true; 191 | } 192 | } 193 | return false; 194 | } 195 | 196 | } 197 | 198 | static class StringReaderEx extends StringReader { 199 | private int inputLength; 200 | 201 | public StringReaderEx(String s) { 202 | super(s); 
203 | inputLength = s.length(); 204 | } 205 | } 206 | 207 | } 208 | -------------------------------------------------------------------------------- /src/main/java/net/paoding/analysis/analyzer/estimate/TryPaodingAnalyzer.java: -------------------------------------------------------------------------------- 1 | package net.paoding.analysis.analyzer.estimate; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStream; 8 | import java.io.InputStreamReader; 9 | import java.io.Reader; 10 | import java.net.URL; 11 | 12 | import net.paoding.analysis.analyzer.PaodingAnalyzer; 13 | import net.paoding.analysis.knife.PaodingMaker; 14 | 15 | import org.apache.lucene.analysis.Analyzer; 16 | 17 | public class TryPaodingAnalyzer { 18 | private static final String ARGS_TIP = ":"; 19 | static String input = null; 20 | static String file = null; 21 | static Reader reader = null; 22 | static String charset = null; 23 | static String mode = null; 24 | static String analyzerName = null; 25 | static String print = null; 26 | static String properties = PaodingMaker.DEFAULT_PROPERTIES_PATH; 27 | 28 | public static void main(String[] args) { 29 | try { 30 | resetArgs(); 31 | 32 | int inInput = 0; 33 | for (int i = 0; i < args.length; i++) { 34 | if (args[i] == null || (args[i] = args[i].trim()).length() == 0) { 35 | continue; 36 | } 37 | if (args[i].equals("--file") || args[i].equals("-f")) { 38 | file = args[++i]; 39 | } else if (args[i].equals("--charset") || args[i].equals("-c")) { 40 | charset = args[++i]; 41 | } else if (args[i].equals("--mode") || args[i].equals("-m")) { 42 | mode = args[++i]; 43 | } else if (args[i].equals("--properties") || args[i].equals("-p")) { 44 | properties = args[++i]; 45 | } else if (args[i].equals("--analyzer") || args[i].equals("-a")) { 46 | analyzerName = args[++i]; 47 | } else if (args[i].equals("--print") || args[i].equals("-P")) { 48 | print = 
args[++i]; 49 | } else if (args[i].equals("--input") || args[i].equals("-i")) { 50 | inInput++; 51 | } else if (args[i].equals("--help") || args[i].equals("-h") 52 | || args[i].equals("?")) { 53 | printHelp(); 54 | return; 55 | } else { 56 | // 非选项的参数数组视为input 57 | if (!args[i].startsWith("-") 58 | && (i == 0 || args[i - 1].equals("-i") || args[i - 1].equals("--input") || !args[i - 1].startsWith("-"))) { 59 | if (input == null) { 60 | input = args[i];// !!没有++i 61 | } else { 62 | input = input + ' ' + args[i];// !!没有++i 63 | } 64 | inInput++; 65 | } 66 | } 67 | } 68 | if (file != null) { 69 | input = null; 70 | reader = getReader(file, charset); 71 | } 72 | // 73 | analysing(); 74 | } catch (Exception e1) { 75 | resetArgs(); 76 | e1.printStackTrace(); 77 | } 78 | } 79 | 80 | 81 | 82 | private static void resetArgs() { 83 | input = null; 84 | file = null; 85 | reader = null; 86 | charset = null; 87 | mode = null; 88 | print = null; 89 | analyzerName = null; 90 | properties = PaodingMaker.DEFAULT_PROPERTIES_PATH; 91 | } 92 | 93 | 94 | 95 | private static void analysing() throws Exception { 96 | Analyzer analyzer; 97 | if (analyzerName == null || analyzerName.length() == 0 || analyzerName.equalsIgnoreCase("paoding")) { 98 | //properties==null等同于new new PaodingAnalyzer(); 99 | analyzer = new PaodingAnalyzer(properties); 100 | if (mode != null) { 101 | ((PaodingAnalyzer) analyzer).setMode(mode); 102 | } 103 | } 104 | else { 105 | Class clz; 106 | if (analyzerName.equalsIgnoreCase("standard")) { 107 | analyzerName = "org.apache.lucene.analysis.standard.StandardAnalyzer"; 108 | } 109 | else if (analyzerName.equalsIgnoreCase("cjk")) { 110 | analyzerName = "org.apache.lucene.analysis.cjk.CJKAnalyzer"; 111 | } 112 | else if (analyzerName.equalsIgnoreCase("cn") || analyzerName.equalsIgnoreCase("chinese")) { 113 | analyzerName = "org.apache.lucene.analysis.cn.ChineseAnalyzer"; 114 | } 115 | else if (analyzerName.equalsIgnoreCase("st") || 
analyzerName.equalsIgnoreCase("standard")) { 116 | analyzerName = "org.apache.lucene.analysis.standard.StandardAnalyzer"; 117 | } 118 | clz = Class.forName(analyzerName); 119 | analyzer = (Analyzer) clz.newInstance(); 120 | } 121 | boolean readInputFromConsle = false; 122 | Estimate estimate = new Estimate(analyzer); 123 | if (print != null) { 124 | estimate.setPrint(print); 125 | } 126 | while (true) { 127 | if (reader == null) { 128 | if (input == null || input.length() == 0 || readInputFromConsle) { 129 | input = getInputFromConsole(); 130 | readInputFromConsle = true; 131 | } 132 | if (input == null || input.length() == 0) { 133 | System.out.println("Warn: none charactors you input!!"); 134 | continue; 135 | } 136 | else if (input.startsWith(ARGS_TIP)) { 137 | String argsStr = input.substring(ARGS_TIP.length()); 138 | main(argsStr.split(" ")); 139 | continue; 140 | } 141 | } 142 | if (reader != null) { 143 | estimate.test(System.out, reader); 144 | reader = null; 145 | } 146 | else { 147 | estimate.test(System.out, input); 148 | input = null; 149 | } 150 | System.out.println("--------------------------------------------------"); 151 | if (false == readInputFromConsle) { 152 | return; 153 | } 154 | } 155 | } 156 | 157 | private static String getInputFromConsole() throws IOException { 158 | printTitleIfNotPrinted(""); 159 | String input = null; 160 | BufferedReader reader = new BufferedReader(new InputStreamReader( 161 | System.in)); 162 | String line; 163 | do { 164 | System.out.print("paoding> "); 165 | line = reader.readLine(); 166 | if (line == null || line.length() == 0) { 167 | continue; 168 | } 169 | if (line.equals(ARGS_TIP + "clear") || line.equals(ARGS_TIP + "c")) { 170 | input = null; 171 | System.out.println("paoding> Cleared"); 172 | return getInputFromConsole(); 173 | } 174 | else if (line.equals(ARGS_TIP + "exit") || line.equals(ARGS_TIP + "quit") || line.equals(ARGS_TIP + "e") || line.equals(ARGS_TIP + "q") ) { 175 | System.out.println("Bye!"); 
176 | System.exit(0); 177 | } 178 | else if (input == null && line.startsWith(ARGS_TIP)) { 179 | input = line; 180 | break; 181 | } 182 | else { 183 | if (line.endsWith(";")) { 184 | if (line.length() > ";".length()) { 185 | input = line.substring(0, line.length() - ";".length()); 186 | } 187 | break; 188 | } 189 | else { 190 | if (input == null) { 191 | input = line; 192 | } else { 193 | input = input + "\n" + line; 194 | } 195 | } 196 | } 197 | } while (true); 198 | return input == null ? null : input.trim(); 199 | } 200 | 201 | private static void printHelp() { 202 | String app = System.getProperty("paoding.try.app", 203 | "TryPaodingAnalyzer"); 204 | String cmd = System.getProperty("paoding.try.cmd", "java " 205 | + TryPaodingAnalyzer.class.getName()); 206 | System.out.println(app + "的用法:"); 207 | System.out.println("\t" + cmd + " [OPTIONS] [text_content]"); 208 | System.out.println("\nOPTIONS:"); 209 | System.out.println("\t--file, -f:\n\t\t文章以文件的形式输入,在前缀加上\"classpath:\"表示从类路径中寻找该文件。"); 210 | System.out.println("\t--charset, -c:\n\t\t文章的字符集编码,比如gbk,utf-8等。如果没有设置该选项,则使用Java环境默认的字符集编码。"); 211 | System.out.println("\t--properties, -p:\n\t\t不读取默认的类路径下的庖丁分词属性文件,而使用指定的文件,在前缀加上\"classpath:\"表示从类路径中寻找该文件。"); 212 | System.out.println("\t--mode, -m:\n\t\t强制使用给定的mode的分词器;可以设定为default,most-words,max-word-length或指定类名的其他mode(指定类名的,需要加前缀\"class:\")。"); 213 | System.out.println("\t--input, -i:\n\t\t要被分词的文章内容;当没有通过-f或--file指定文章输入文件时可选择这个选项指定要被分词的内容。"); 214 | System.out.println("\t--analyzer, -a:\n\t\t测试其他分词器,通过--analyzer或-a指定其完整类名。特别地,paoding、cjk、chinese、st分别对应PaodingAnalyzer、CJKAnalyzer、ChineseAnalyzer、StandardAnalyzer"); 215 | System.out.println("\t--print, -P:\n\t\t 是否打印分词结果。默认打印前50行。规则:no表示不打印;50等价于1-50行;1-50表示打印1至50行;可以以逗号组合使用,如20,40-50表示打印1-20以及40-50行"); 216 | System.out.println("\n示例:"); 217 | System.out.println("\t" + cmd); 218 | System.out.println("\t" + cmd + " ?"); 219 | System.out.println("\t" + cmd + " 中华人民共和国"); 220 | System.out.println("\t" + cmd + " -m max 
中华人民共和国"); 221 | System.out.println("\t" + cmd + " -f e:/content.txt -c utf8"); 222 | System.out.println("\t" + cmd + " -f e:/content.txt -c utf8 -m max-word-length"); 223 | System.out.println("\t" + cmd + " -f e:/content.txt -c utf8 -a cjk"); 224 | System.out.println("\n若是控制台进入\"paoding>\"后:"); 225 | titlePrinted = false; 226 | printTitleIfNotPrinted("\t"); 227 | } 228 | 229 | 230 | private static boolean titlePrinted = false; 231 | private static boolean welcomePrinted = false; 232 | private static void printTitleIfNotPrinted(String prefix) { 233 | if (!titlePrinted) { 234 | System.out.println(); 235 | if (!welcomePrinted) { 236 | System.out.println("Welcome to Paoding Analyser(2.0.4-alpha2)"); 237 | System.out.println(); 238 | welcomePrinted = true; 239 | } 240 | System.out.println(prefix + "直接输入或粘贴要被分词的内容,以分号;结束,回车后开始分词。"); 241 | System.out.println(prefix + "另起一行输入:clear或:c,使此次输入无效,用以重新输入。"); 242 | System.out.println(prefix + "要使用命令行参数读入文件内容或其他参数请以冒号:开始,然后输入参数选项。"); 243 | System.out.println(prefix + "退出,请输入:quit或:q、:exit、:e"); 244 | System.out.println(prefix + "需要帮助,请输入:?"); 245 | System.out.println(prefix + "注意:指定对文件分词之前要了解该文件的编码,如果系统编码和文件编码不一致,要通过-c指定文件的编码。"); 246 | System.out.println(); 247 | titlePrinted = true; 248 | } 249 | } 250 | 251 | 252 | static String getContent(String path, String encoding) throws IOException { 253 | return (String) read(path, encoding, true); 254 | } 255 | 256 | static Reader getReader(String path, String encoding) throws IOException { 257 | return (Reader) read(path, encoding, false); 258 | } 259 | 260 | static Object read(String path, String encoding, boolean return_string) throws IOException { 261 | InputStream in; 262 | if (path.startsWith("classpath:")) { 263 | path = path.substring("classpath:".length()); 264 | URL url = Estimate.class.getClassLoader().getResource(path); 265 | if (url == null) { 266 | throw new IllegalArgumentException("Not found " + path 267 | + " in classpath."); 268 | } 269 | System.out.println("read 
content from:" + url.getFile()); 270 | in = url.openStream(); 271 | } else { 272 | File f = new File(path); 273 | if (!f.exists()) { 274 | throw new IllegalArgumentException("Not found " + path 275 | + " in system."); 276 | } 277 | System.out.println("read content from:" + f.getAbsolutePath()); 278 | in = new FileInputStream(f); 279 | } 280 | Reader re; 281 | if (encoding != null) { 282 | re = new InputStreamReader(in, encoding); 283 | } else { 284 | re = new InputStreamReader(in); 285 | } 286 | if (!return_string) { 287 | return re; 288 | } 289 | char[] chs = new char[1024]; 290 | int count; 291 | // 为兼容低版本的JDK,使用StringBuffer而不是StringBuilder 292 | StringBuffer content = new StringBuffer(); 293 | while ((count = re.read(chs)) != -1) { 294 | content.append(chs, 0, count); 295 | } 296 | re.close(); 297 | return content.toString(); 298 | } 299 | } 300 | -------------------------------------------------------------------------------- /src/main/java/net/paoding/analysis/analyzer/impl/CompiledFileDictionaries.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2007 The Apache Software Foundation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package net.paoding.analysis.analyzer.impl; 17 | 18 | import java.io.File; 19 | import java.io.FileFilter; 20 | import java.io.IOException; 21 | import java.util.Collection; 22 | import java.util.LinkedList; 23 | import java.util.Map; 24 | 25 | import net.paoding.analysis.dictionary.BinaryDictionary; 26 | import net.paoding.analysis.dictionary.Dictionary; 27 | import net.paoding.analysis.dictionary.HashBinaryDictionary; 28 | import net.paoding.analysis.dictionary.Word; 29 | import net.paoding.analysis.dictionary.support.detection.Detector; 30 | import net.paoding.analysis.dictionary.support.detection.DifferenceListener; 31 | import net.paoding.analysis.dictionary.support.filewords.FileWordsReader; 32 | import net.paoding.analysis.exception.PaodingAnalysisException; 33 | import net.paoding.analysis.ext.PaodingAnalyzerListener; 34 | import net.paoding.analysis.knife.CJKKnife; 35 | import net.paoding.analysis.knife.Dictionaries; 36 | 37 | import org.apache.commons.logging.Log; 38 | import org.apache.commons.logging.LogFactory; 39 | 40 | /** 41 | * 中文字典缓存根据地,为{@link CJKKnife}所用。
42 | * 从本对象可以获取中文需要的相关字典。包括词汇表、姓氏表、计量单位表、忽略的词或单字等。 43 | *

44 | * 45 | * @author Zhiliang Wang [qieqie.wang@gmail.com] 46 | * 47 | * @see CJKKnife 48 | * 49 | * @since 1.0 50 | */ 51 | public class CompiledFileDictionaries implements Dictionaries { 52 | 53 | // ------------------------------------------------- 54 | 55 | protected Log log = LogFactory.getLog(this.getClass()); 56 | 57 | // ------------------------------------------------- 58 | 59 | /** 60 | * 词汇表字典 61 | */ 62 | protected Dictionary vocabularyDictionary; 63 | 64 | /** 65 | * lantin+cjk的词典 66 | */ 67 | protected Dictionary combinatoricsDictionary; 68 | 69 | /** 70 | * 姓氏字典 71 | * 72 | */ 73 | protected Dictionary confucianFamilyNamesDictionary; 74 | 75 | /** 76 | * 忽略的单字 77 | */ 78 | protected Dictionary noiseCharactorsDictionary; 79 | 80 | /** 81 | * 忽略的词语 82 | * 83 | */ 84 | protected Dictionary noiseWordsDictionary; 85 | 86 | /** 87 | * 计量单位 88 | */ 89 | protected Dictionary unitsDictionary; 90 | 91 | // ------------------------------------------------- 92 | 93 | protected String dicHome; 94 | protected String noiseCharactor; 95 | protected String noiseWord; 96 | protected String unit; 97 | protected String confucianFamilyName; 98 | protected String combinatorics; 99 | protected String charsetName; 100 | protected int maxWordLen; 101 | 102 | private PaodingAnalyzerListener listener = null; 103 | // ---------------------- 104 | 105 | public CompiledFileDictionaries() { 106 | } 107 | 108 | public CompiledFileDictionaries(String dicHome, String noiseCharactor, 109 | String noiseWord, String unit, String confucianFamilyName, 110 | String combinatorics, String charsetName, int maxWordLen) { 111 | this.dicHome = dicHome; 112 | this.noiseCharactor = noiseCharactor; 113 | this.noiseWord = noiseWord; 114 | this.unit = unit; 115 | this.confucianFamilyName = confucianFamilyName; 116 | this.combinatorics = combinatorics; 117 | this.charsetName = charsetName; 118 | this.maxWordLen = maxWordLen; 119 | } 120 | 121 | public String getDicHome() { 122 | return dicHome; 123 | 
} 124 | 125 | public void setDicHome(String dicHome) { 126 | this.dicHome = dicHome; 127 | } 128 | 129 | public String getNoiseCharactor() { 130 | return noiseCharactor; 131 | } 132 | 133 | public void setNoiseCharactor(String noiseCharactor) { 134 | this.noiseCharactor = noiseCharactor; 135 | } 136 | 137 | public String getNoiseWord() { 138 | return noiseWord; 139 | } 140 | 141 | public void setNoiseWord(String noiseWord) { 142 | this.noiseWord = noiseWord; 143 | } 144 | 145 | public String getUnit() { 146 | return unit; 147 | } 148 | 149 | public void setUnit(String unit) { 150 | this.unit = unit; 151 | } 152 | 153 | public String getConfucianFamilyName() { 154 | return confucianFamilyName; 155 | } 156 | 157 | public void setConfucianFamilyName(String confucianFamilyName) { 158 | this.confucianFamilyName = confucianFamilyName; 159 | } 160 | 161 | public String getCharsetName() { 162 | return charsetName; 163 | } 164 | 165 | public void setCharsetName(String charsetName) { 166 | this.charsetName = charsetName; 167 | } 168 | 169 | public int getMaxWordLen() { 170 | return maxWordLen; 171 | } 172 | 173 | public void setMaxWordLen(int maxWordLen) { 174 | this.maxWordLen = maxWordLen; 175 | } 176 | 177 | public void setLantinFllowedByCjk(String lantinFllowedByCjk) { 178 | this.combinatorics = lantinFllowedByCjk; 179 | } 180 | 181 | public String getLantinFllowedByCjk() { 182 | return combinatorics; 183 | } 184 | 185 | // ------------------------------------------------- 186 | 187 | /** 188 | * 词汇表字典 189 | * 190 | * @return 191 | */ 192 | public synchronized Dictionary getVocabularyDictionary() { 193 | if (vocabularyDictionary == null) { 194 | // 大概有5639个字有词语,故取0x2fff=x^13>8000>8000*0.75=6000>5639 195 | vocabularyDictionary = new HashBinaryDictionary( 196 | getVocabularyWords(), 0x2fff, 0.75f); 197 | } 198 | return vocabularyDictionary; 199 | } 200 | 201 | /** 202 | * 姓氏字典 203 | * 204 | * @return 205 | */ 206 | public synchronized Dictionary 
getConfucianFamilyNamesDictionary() { 207 | if (confucianFamilyNamesDictionary == null) { 208 | confucianFamilyNamesDictionary = new BinaryDictionary( 209 | getConfucianFamilyNames()); 210 | } 211 | return confucianFamilyNamesDictionary; 212 | } 213 | 214 | /** 215 | * 忽略的词语 216 | * 217 | * @return 218 | */ 219 | public synchronized Dictionary getNoiseCharactorsDictionary() { 220 | if (noiseCharactorsDictionary == null) { 221 | noiseCharactorsDictionary = new HashBinaryDictionary( 222 | getNoiseCharactors(), 256, 0.75f); 223 | } 224 | return noiseCharactorsDictionary; 225 | } 226 | 227 | /** 228 | * 忽略的单字 229 | * 230 | * @return 231 | */ 232 | public synchronized Dictionary getNoiseWordsDictionary() { 233 | if (noiseWordsDictionary == null) { 234 | noiseWordsDictionary = new BinaryDictionary(getNoiseWords()); 235 | } 236 | return noiseWordsDictionary; 237 | } 238 | 239 | /** 240 | * 计量单位 241 | * 242 | * @return 243 | */ 244 | public synchronized Dictionary getUnitsDictionary() { 245 | if (unitsDictionary == null) { 246 | unitsDictionary = new HashBinaryDictionary(getUnits(), 1024, 0.75f); 247 | } 248 | return unitsDictionary; 249 | } 250 | 251 | public synchronized Dictionary getCombinatoricsDictionary() { 252 | if (combinatoricsDictionary == null) { 253 | combinatoricsDictionary = new BinaryDictionary( 254 | getCombinatoricsWords()); 255 | } 256 | return combinatoricsDictionary; 257 | } 258 | 259 | private Detector detector; 260 | 261 | public synchronized void startDetecting(int interval, DifferenceListener l) { 262 | if (detector != null || interval < 0) { 263 | return; 264 | } 265 | Detector detector = new Detector(); 266 | detector.setHome(dicHome); 267 | detector.setFilter(null); 268 | detector.setFilter(new FileFilter() { 269 | public boolean accept(File pathname) { 270 | return pathname.getPath().endsWith(".dic.compiled") 271 | || pathname.getPath().endsWith(".metadata"); 272 | } 273 | }); 274 | detector.setLastSnapshot(detector.flash()); 275 | 
detector.setListener(l); 276 | detector.setInterval(interval); 277 | detector.start(true); 278 | this.detector = detector; 279 | } 280 | 281 | public synchronized void stopDetecting() { 282 | if (detector == null) { 283 | return; 284 | } 285 | detector.setStop(); 286 | detector = null; 287 | } 288 | 289 | // --------------------------------------------------------------- 290 | // 以下为辅助性的方式-类私有或package私有 291 | 292 | protected Word[] getDictionaryWords(String dicNameRelativeDicHome) { 293 | File f = new File(this.dicHome, "/" + dicNameRelativeDicHome 294 | + ".dic.compiled"); 295 | if (!f.exists()) { 296 | return new Word[0]; 297 | } 298 | try { 299 | if(this.listener != null){ 300 | this.listener.readCompileDic(f.getAbsolutePath()); 301 | } 302 | Map> map = FileWordsReader.readWords(f.getAbsolutePath(), 303 | charsetName, maxWordLen, LinkedList.class, ".dic.compiled"); 304 | Collection wordsList = map.values().iterator().next(); 305 | if(this.listener != null){ 306 | this.listener.readCompileDicFinished(f.getAbsolutePath(), wordsList); 307 | } 308 | return (Word[]) wordsList.toArray(new Word[wordsList.size()]); 309 | } catch (IOException e) { 310 | throw toRuntimeException(e); 311 | } 312 | } 313 | 314 | 315 | protected Word[] getVocabularyWords() { 316 | return getDictionaryWords("vocabulary"); 317 | } 318 | 319 | protected Word[] getConfucianFamilyNames() { 320 | return getDictionaryWords(confucianFamilyName); 321 | } 322 | 323 | protected Word[] getNoiseWords() { 324 | return getDictionaryWords(noiseWord); 325 | } 326 | 327 | protected Word[] getNoiseCharactors() { 328 | return getDictionaryWords(noiseCharactor); 329 | } 330 | 331 | protected Word[] getUnits() { 332 | return getDictionaryWords(unit); 333 | } 334 | 335 | protected Word[] getCombinatoricsWords() { 336 | return getDictionaryWords(combinatorics); 337 | } 338 | 339 | // -------------------------------------- 340 | 341 | protected RuntimeException toRuntimeException(IOException e) { 342 | return new 
PaodingAnalysisException(e); 343 | } 344 | 345 | public void setAnalyzerListener(PaodingAnalyzerListener listener) { 346 | this.listener = listener; 347 | 348 | } 349 | } 350 | -------------------------------------------------------------------------------- /src/main/java/net/paoding/analysis/analyzer/impl/MaxWordLengthTokenCollector.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2007 The Apache Software Foundation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package net.paoding.analysis.analyzer.impl; 17 | 18 | import java.util.Iterator; 19 | import java.util.LinkedList; 20 | 21 | 22 | import net.paoding.analysis.analyzer.TokenCollector; 23 | 24 | import org.apache.lucene.analysis.Token; 25 | 26 | 27 | /** 28 | * 29 | * @author Zhiliang Wang [qieqie.wang@gmail.com] 30 | * 31 | * @since 1.1 32 | */ 33 | public class MaxWordLengthTokenCollector implements TokenCollector { 34 | 35 | /** 36 | * 存储当前被knife分解而成的Token对象 37 | * 38 | */ 39 | private LinkedList tokens = new LinkedList(); 40 | 41 | private Token candidate; 42 | 43 | private Token last; 44 | 45 | 46 | public MaxWordLengthTokenCollector() { 47 | } 48 | 49 | public Iterator iterator() { 50 | if (candidate != null) { 51 | this.tokens.add(candidate); 52 | candidate = null; 53 | } 54 | Iterator iter = this.tokens.iterator(); 55 | this.tokens = new LinkedList(); 56 | return iter; 57 | } 58 | 59 | public void collect(String word, int offset, int end) { 60 | Token c = candidate != null ? 
candidate : last; 61 | if (c == null) { 62 | candidate = new Token(word, offset, end); 63 | } else if (offset == c.startOffset()) { 64 | if (end > c.endOffset()) { 65 | candidate = new Token(word, offset, end); 66 | } 67 | } else if (offset > c.startOffset()) { 68 | if (candidate != null) { 69 | select(candidate); 70 | } 71 | if (end > c.endOffset()) { 72 | candidate = new Token(word, offset, end); 73 | } else { 74 | candidate = null; 75 | } 76 | } else if (end >= c.endOffset()) { 77 | if (last != null && last.startOffset() >= offset 78 | && last.endOffset() <= end) { 79 | for (Iterator iter = tokens.iterator(); iter.hasNext();) { 80 | last = (Token) iter.next(); 81 | if (last.startOffset() >= offset && last.endOffset() <= end) { 82 | iter.remove(); 83 | } 84 | } 85 | } 86 | last = null; 87 | candidate = new Token(word, offset, end); 88 | } 89 | } 90 | 91 | protected void select(Token t) { 92 | this.tokens.add(t); 93 | this.last = t; 94 | } 95 | 96 | } 97 | -------------------------------------------------------------------------------- /src/main/java/net/paoding/analysis/analyzer/impl/MostWordsModeDictionariesCompiler.java: -------------------------------------------------------------------------------- 1 | package net.paoding.analysis.analyzer.impl; 2 | 3 | import java.io.BufferedOutputStream; 4 | import java.io.File; 5 | import java.io.FileFilter; 6 | import java.io.FileInputStream; 7 | import java.io.FileNotFoundException; 8 | import java.io.FileOutputStream; 9 | import java.io.IOException; 10 | import java.io.InputStream; 11 | import java.io.OutputStream; 12 | import java.io.UnsupportedEncodingException; 13 | import java.util.BitSet; 14 | import java.util.Properties; 15 | 16 | import net.paoding.analysis.Constants; 17 | import net.paoding.analysis.dictionary.Dictionary; 18 | import net.paoding.analysis.dictionary.Hit; 19 | import net.paoding.analysis.dictionary.Word; 20 | import net.paoding.analysis.dictionary.support.detection.Snapshot; 21 | import 
net.paoding.analysis.knife.Beef; 22 | import net.paoding.analysis.knife.Collector; 23 | import net.paoding.analysis.knife.Dictionaries; 24 | import net.paoding.analysis.knife.DictionariesCompiler; 25 | import net.paoding.analysis.knife.Knife; 26 | 27 | public class MostWordsModeDictionariesCompiler implements DictionariesCompiler { 28 | public static final String VERSION = "2"; 29 | 30 | public boolean shouldCompile(Properties p) throws Exception { 31 | String dicHome = p.getProperty("paoding.dic.home.absolute.path"); 32 | File dicHomeFile = new File(dicHome); 33 | File compliedMetadataFile = new File(dicHomeFile, ".compiled/most-words-mode/.metadata"); 34 | if (compliedMetadataFile.exists() && compliedMetadataFile.isFile()) { 35 | // get checksum for all compiled dictionaries 36 | String checksum = Snapshot.flash( 37 | new File(dicHomeFile, ".compiled/most-words-mode"), 38 | new FileFilter() { 39 | public boolean accept(File pathname) { 40 | return pathname.getPath().endsWith(".dic.compiled"); 41 | } 42 | }).getCheckSum(); 43 | 44 | Properties compiledProperties = new Properties(); 45 | InputStream compiledPropertiesInput = new FileInputStream(compliedMetadataFile); 46 | compiledProperties.load(compiledPropertiesInput); 47 | compiledPropertiesInput.close(); 48 | String compiledCheckSum = compiledProperties.getProperty("paoding.analysis.compiler.checksum"); 49 | String clazz = compiledProperties.getProperty("paoding.analysis.compiler.class"); 50 | String version = compiledProperties.getProperty("paoding.analysis.compiler.version"); 51 | if (checksum.equals(compiledCheckSum) && this.getClass().getName().equalsIgnoreCase(clazz) 52 | && VERSION.equalsIgnoreCase(version)) { 53 | return false; 54 | } 55 | } 56 | return true; 57 | } 58 | 59 | public void compile(Dictionaries dictionaries, Knife knife, Properties p) throws Exception { 60 | String dicHome = p.getProperty("paoding.dic.home.absolute.path"); 61 | String noiseCharactor = getProperty(p, 
Constants.DIC_NOISE_CHARACTOR); 62 | String noiseWord = getProperty(p, Constants.DIC_NOISE_WORD); 63 | String unit = getProperty(p, Constants.DIC_UNIT); 64 | String confucianFamilyName = getProperty(p, Constants.DIC_CONFUCIAN_FAMILY_NAME); 65 | String combinatorics = getProperty(p, Constants.DIC_FOR_COMBINATORICS); 66 | String charsetName = getProperty(p, Constants.DIC_CHARSET); 67 | 68 | File dicHomeFile = new File(dicHome); 69 | File compiledDicHomeFile = new File(dicHomeFile, ".compiled/most-words-mode"); 70 | compiledDicHomeFile.mkdirs(); 71 | // 72 | Dictionary vocabularyDictionary = dictionaries.getVocabularyDictionary(); 73 | File vocabularyFile = new File(compiledDicHomeFile, "vocabulary.dic.compiled"); 74 | compileVocabulary(vocabularyDictionary, knife, vocabularyFile, charsetName); 75 | 76 | // 77 | Dictionary noiseCharactorsDictionary = dictionaries.getNoiseCharactorsDictionary(); 78 | File noiseCharactorsDictionaryFile = new File(compiledDicHomeFile, noiseCharactor + ".dic.compiled"); 79 | sortCompile(noiseCharactorsDictionary, noiseCharactorsDictionaryFile, charsetName); 80 | // 81 | Dictionary noiseWordsDictionary = dictionaries.getNoiseWordsDictionary(); 82 | File noiseWordsDictionaryFile = new File(compiledDicHomeFile, noiseWord + ".dic.compiled"); 83 | sortCompile(noiseWordsDictionary, noiseWordsDictionaryFile, charsetName); 84 | // 85 | Dictionary unitsDictionary = dictionaries.getUnitsDictionary(); 86 | File unitsDictionaryFile = new File(compiledDicHomeFile, unit + ".dic.compiled"); 87 | sortCompile(unitsDictionary, unitsDictionaryFile, charsetName); 88 | // 89 | Dictionary confucianFamilyDictionary = dictionaries.getConfucianFamilyNamesDictionary(); 90 | File confucianFamilyDictionaryFile = new File(compiledDicHomeFile, confucianFamilyName + ".dic.compiled"); 91 | sortCompile(confucianFamilyDictionary, confucianFamilyDictionaryFile, charsetName); 92 | // 93 | Dictionary combinatoricsDictionary = dictionaries.getCombinatoricsDictionary(); 94 | 
File combinatoricsDictionaryFile = new File(compiledDicHomeFile, combinatorics + ".dic.compiled"); 95 | sortCompile(combinatoricsDictionary, combinatoricsDictionaryFile, charsetName); 96 | 97 | // 98 | File compliedMetadataFile = new File(dicHomeFile, ".compiled/most-words-mode/.metadata"); 99 | if (compliedMetadataFile.exists()) { 100 | //compliedMetadataFile.setWritable(true); 101 | compliedMetadataFile.delete(); 102 | } 103 | else { 104 | compliedMetadataFile.getParentFile().mkdirs(); 105 | } 106 | OutputStream compiledPropertiesOutput = new FileOutputStream(compliedMetadataFile); 107 | Properties compiledProperties = new Properties(); 108 | String lastModifiedsKey = "paoding.analysis.properties.lastModifieds"; 109 | String filesKey = "paoding.analysis.properties.files"; 110 | compiledProperties.setProperty(lastModifiedsKey, p.getProperty(lastModifiedsKey)); 111 | compiledProperties.setProperty(filesKey, p.getProperty(filesKey)); 112 | compiledProperties.setProperty("paoding.analysis.compiler.checksum", 113 | Snapshot.flash( 114 | new File(dicHomeFile, ".compiled/most-words-mode"), 115 | new FileFilter() { 116 | public boolean accept(File pathname) { 117 | return pathname.getPath().endsWith( 118 | ".dic.compiled"); 119 | } 120 | }).getCheckSum()); 121 | compiledProperties.setProperty("paoding.analysis.compiler.class", this.getClass().getName()); 122 | compiledProperties.setProperty("paoding.analysis.compiler.version", VERSION); 123 | compiledProperties.store(compiledPropertiesOutput, "dont edit it! 
this file was auto generated by paoding."); 124 | compiledPropertiesOutput.close(); 125 | compliedMetadataFile.setReadOnly(); 126 | } 127 | 128 | 129 | public Dictionaries readCompliedDictionaries(Properties p) { 130 | String dicHomeAbsolutePath = p.getProperty("paoding.dic.home.absolute.path"); 131 | String noiseCharactor = getProperty(p, Constants.DIC_NOISE_CHARACTOR); 132 | String noiseWord = getProperty(p, Constants.DIC_NOISE_WORD); 133 | String unit = getProperty(p, Constants.DIC_UNIT); 134 | String confucianFamilyName = getProperty(p, Constants.DIC_CONFUCIAN_FAMILY_NAME); 135 | String combinatorics = getProperty(p, Constants.DIC_FOR_COMBINATORICS); 136 | String charsetName = getProperty(p, Constants.DIC_CHARSET); 137 | int maxWordLen = Integer.valueOf(getProperty(p, Constants.DIC_MAXWORDLEN)); 138 | return new CompiledFileDictionaries( 139 | dicHomeAbsolutePath + "/.compiled/most-words-mode", 140 | noiseCharactor, noiseWord, unit, 141 | confucianFamilyName, combinatorics, charsetName,maxWordLen); 142 | } 143 | 144 | private static String getProperty(Properties p, String name) { 145 | return Constants.getProperty(p, name); 146 | } 147 | 148 | 149 | private void sortCompile(final Dictionary dictionary, 150 | File dicFile, String charsetName) throws FileNotFoundException, 151 | IOException, UnsupportedEncodingException { 152 | int wordsSize = dictionary.size(); 153 | if (dicFile.exists()) { 154 | //dicFile.setWritable(true); 155 | dicFile.delete(); 156 | } 157 | BufferedOutputStream out = new BufferedOutputStream( 158 | new FileOutputStream(dicFile), 1024 * 16); 159 | 160 | for (int i = 0; i < wordsSize; i++) { 161 | Word word = dictionary.get(i); 162 | out.write(word.getText().getBytes(charsetName)); 163 | if (word.getModifiers() != Word.DEFAUL) { 164 | out.write("[m=".getBytes()); 165 | out.write(String.valueOf(word.getModifiers()).getBytes()); 166 | out.write(']'); 167 | } 168 | out.write('\r'); 169 | out.write('\n'); 170 | } 171 | out.flush(); 172 | 
out.close(); 173 | dicFile.setReadOnly(); 174 | } 175 | 176 | private void compileVocabulary(final Dictionary vocabularyDictionary, Knife knife, 177 | File vocabularyFile, String charsetName) throws FileNotFoundException, 178 | IOException, UnsupportedEncodingException { 179 | int vocabularySize = vocabularyDictionary.size(); 180 | Word[] vocabularyWords = new Word[vocabularySize]; 181 | char[] chs = new char[128]; 182 | for (int i = 0; i < vocabularySize; i ++) { 183 | final Word curWord = vocabularyDictionary.get(i); 184 | curWord.getText().getChars(0, curWord.length(), chs, 0); 185 | chs[curWord.length()] = (char) -1; 186 | Beef beef = new Beef(chs, 0, curWord.length() + 1); 187 | final BitSet bs = new BitSet(curWord.length()); 188 | knife.dissect(new Collector(){ 189 | public void collect(String word, int offset, int end) { 190 | Hit hit = vocabularyDictionary.search(word, 0, word.length()); 191 | if (hit.isHit() && hit.getWord().length() != curWord.length()) { 192 | for (int j = offset; j < end; j++) { 193 | bs.set(j, true); 194 | } 195 | } 196 | } 197 | 198 | }, beef, 0); 199 | 200 | for (int j = 0; j < curWord.length();j++) { 201 | if (!bs.get(j)) { 202 | vocabularyWords[i] = curWord; 203 | break; 204 | } 205 | } 206 | } 207 | if (vocabularyFile.exists()) { 208 | //vocabularyFile.setWritable(true); 209 | vocabularyFile.delete(); 210 | } 211 | BufferedOutputStream out = new BufferedOutputStream( 212 | new FileOutputStream(vocabularyFile), 1024 * 16); 213 | 214 | for (int i = 0; i < vocabularySize; i++) { 215 | if (vocabularyWords[i] != null) { 216 | out.write(vocabularyWords[i].getText().getBytes(charsetName)); 217 | if (vocabularyWords[i].getModifiers() != Word.DEFAUL) { 218 | out.write("[m=".getBytes()); 219 | out.write(String.valueOf(vocabularyWords[i].getModifiers()).getBytes()); 220 | out.write(']'); 221 | } 222 | out.write('\r'); 223 | out.write('\n'); 224 | } 225 | } 226 | out.flush(); 227 | out.close(); 228 | vocabularyFile.setReadOnly(); 229 | } 230 
| 231 | } 232 | -------------------------------------------------------------------------------- /src/main/java/net/paoding/analysis/analyzer/impl/MostWordsTokenCollector.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2007 The Apache Software Foundation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package net.paoding.analysis.analyzer.impl; 17 | 18 | import java.util.Iterator; 19 | 20 | import net.paoding.analysis.analyzer.TokenCollector; 21 | 22 | import org.apache.lucene.analysis.Token; 23 | 24 | /** 25 | * 26 | * @author Zhiliang Wang [qieqie.wang@gmail.com] 27 | * 28 | * @since 1.1 29 | */ 30 | public class MostWordsTokenCollector implements TokenCollector, Iterator { 31 | 32 | private LinkedToken firstToken; 33 | private LinkedToken lastToken; 34 | 35 | /** 36 | * Collector接口实现。
37 | * 构造词语Token对象,并放置在tokens中 38 | * 39 | */ 40 | public void collect(String word, int begin, int end) { 41 | LinkedToken tokenToAdd = new LinkedToken(word, begin, end); 42 | if (firstToken == null) { 43 | firstToken = tokenToAdd; 44 | lastToken = tokenToAdd; 45 | return; 46 | } 47 | if (tokenToAdd.compareTo(lastToken) > 0) { 48 | tokenToAdd.pre = lastToken; 49 | lastToken.next = tokenToAdd; 50 | lastToken = tokenToAdd; 51 | // 52 | } else { 53 | LinkedToken curTokenToTry = lastToken.pre; 54 | while (curTokenToTry != null 55 | && tokenToAdd.compareTo(curTokenToTry) < 0) { 56 | curTokenToTry = curTokenToTry.pre; 57 | } 58 | if (curTokenToTry == null) { 59 | firstToken.pre = tokenToAdd; 60 | tokenToAdd.next = firstToken; 61 | firstToken = tokenToAdd; 62 | } else { 63 | tokenToAdd.next = curTokenToTry.next; 64 | curTokenToTry.next.pre = tokenToAdd; 65 | tokenToAdd.pre = curTokenToTry; 66 | curTokenToTry.next = tokenToAdd; 67 | 68 | } 69 | } 70 | } 71 | 72 | private LinkedToken nextLinkedToken; 73 | 74 | public Iterator iterator() { 75 | nextLinkedToken = firstToken; 76 | firstToken = null; 77 | return this; 78 | } 79 | 80 | public boolean hasNext() { 81 | return nextLinkedToken != null; 82 | } 83 | 84 | public Token next() { 85 | LinkedToken ret = nextLinkedToken; 86 | nextLinkedToken = nextLinkedToken.next; 87 | return ret; 88 | } 89 | 90 | public void remove() { 91 | 92 | } 93 | 94 | private static class LinkedToken extends Token implements Comparable { 95 | /** 96 | * 97 | */ 98 | private static final long serialVersionUID = 118708L; 99 | 100 | public LinkedToken pre; 101 | public LinkedToken next; 102 | 103 | public LinkedToken(String word, int begin, int end) { 104 | super(word, begin, end); 105 | } 106 | 107 | public int compareTo(LinkedToken obj) { 108 | // 简单/单单/简简单单/ 109 | if (this.endOffset() > obj.endOffset()) 110 | return 1; 111 | if (this.endOffset() == obj.endOffset()) { 112 | return obj.startOffset() - this.startOffset(); 113 | } 114 | return -1; 115 | 
} 116 | } 117 | 118 | } 119 | -------------------------------------------------------------------------------- /src/main/java/net/paoding/analysis/analyzer/impl/SortingDictionariesCompiler.java: -------------------------------------------------------------------------------- 1 | package net.paoding.analysis.analyzer.impl; 2 | 3 | import java.io.BufferedOutputStream; 4 | import java.io.File; 5 | import java.io.FileFilter; 6 | import java.io.FileInputStream; 7 | import java.io.FileNotFoundException; 8 | import java.io.FileOutputStream; 9 | import java.io.IOException; 10 | import java.io.InputStream; 11 | import java.io.OutputStream; 12 | import java.io.UnsupportedEncodingException; 13 | import java.util.Properties; 14 | 15 | import net.paoding.analysis.Constants; 16 | import net.paoding.analysis.dictionary.Dictionary; 17 | import net.paoding.analysis.dictionary.Word; 18 | import net.paoding.analysis.dictionary.support.detection.Snapshot; 19 | import net.paoding.analysis.knife.Dictionaries; 20 | import net.paoding.analysis.knife.DictionariesCompiler; 21 | import net.paoding.analysis.knife.Knife; 22 | 23 | public class SortingDictionariesCompiler implements DictionariesCompiler { 24 | public static final String VERSION = "2"; 25 | 26 | 27 | public boolean shouldCompile(Properties p) throws Exception { 28 | String dicHome = p.getProperty("paoding.dic.home.absolute.path"); 29 | File dicHomeFile = new File(dicHome); 30 | File compliedMetadataFile = new File(dicHomeFile, ".compiled/sorting/.metadata"); 31 | if (compliedMetadataFile.exists() && compliedMetadataFile.isFile()) { 32 | // get checksum for all compiled dictionaries 33 | String checksum = Snapshot.flash( 34 | new File(dicHomeFile, ".compiled/sorting"), 35 | new FileFilter() { 36 | public boolean accept(File pathname) { 37 | return pathname.getPath().endsWith(".dic.compiled"); 38 | } 39 | }).getCheckSum(); 40 | 41 | Properties compiledProperties = new Properties(); 42 | InputStream compiledPropertiesInput = new 
FileInputStream(compliedMetadataFile); 43 | compiledProperties.load(compiledPropertiesInput); 44 | compiledPropertiesInput.close(); 45 | String compiledCheckSum = compiledProperties.getProperty("paoding.analysis.compiler.checksum"); 46 | String clazz = compiledProperties.getProperty("paoding.analysis.compiler.class"); 47 | String version = compiledProperties.getProperty("paoding.analysis.compiler.version"); 48 | if (checksum.equals(compiledCheckSum) && this.getClass().getName().equalsIgnoreCase(clazz) 49 | && VERSION.equalsIgnoreCase(version)) { 50 | return false; 51 | } 52 | } 53 | return true; 54 | } 55 | 56 | 57 | public void compile(Dictionaries dictionaries, Knife knife, Properties p) throws Exception { 58 | 59 | String dicHome = p.getProperty("paoding.dic.home.absolute.path"); 60 | String noiseCharactor = getProperty(p, Constants.DIC_NOISE_CHARACTOR); 61 | String noiseWord = getProperty(p, Constants.DIC_NOISE_WORD); 62 | String unit = getProperty(p, Constants.DIC_UNIT); 63 | String confucianFamilyName = getProperty(p, Constants.DIC_CONFUCIAN_FAMILY_NAME); 64 | String combinatorics = getProperty(p, Constants.DIC_FOR_COMBINATORICS); 65 | String charsetName = getProperty(p, Constants.DIC_CHARSET); 66 | 67 | File dicHomeFile = new File(dicHome); 68 | File compiledDicHomeFile = new File(dicHomeFile, ".compiled/sorting"); 69 | compiledDicHomeFile.mkdirs(); 70 | 71 | // 72 | Dictionary vocabularyDictionary = dictionaries.getVocabularyDictionary(); 73 | File vocabularyFile = new File(compiledDicHomeFile, "vocabulary.dic.compiled"); 74 | sortCompile(vocabularyDictionary, vocabularyFile, charsetName); 75 | 76 | // 77 | Dictionary noiseCharactorsDictionary = dictionaries.getNoiseCharactorsDictionary(); 78 | File noiseCharactorsDictionaryFile = new File(compiledDicHomeFile, noiseCharactor + ".dic.compiled"); 79 | sortCompile(noiseCharactorsDictionary, noiseCharactorsDictionaryFile, charsetName); 80 | // 81 | Dictionary noiseWordsDictionary = 
dictionaries.getNoiseWordsDictionary(); 82 | File noiseWordsDictionaryFile = new File(compiledDicHomeFile, noiseWord + ".dic.compiled"); 83 | sortCompile(noiseWordsDictionary, noiseWordsDictionaryFile, charsetName); 84 | // 85 | Dictionary unitsDictionary = dictionaries.getUnitsDictionary(); 86 | File unitsDictionaryFile = new File(compiledDicHomeFile, unit + ".dic.compiled"); 87 | sortCompile(unitsDictionary, unitsDictionaryFile, charsetName); 88 | // 89 | Dictionary confucianFamilyDictionary = dictionaries.getConfucianFamilyNamesDictionary(); 90 | File confucianFamilyDictionaryFile = new File(compiledDicHomeFile, confucianFamilyName + ".dic.compiled"); 91 | sortCompile(confucianFamilyDictionary, confucianFamilyDictionaryFile, charsetName); 92 | // 93 | Dictionary combinatoricsDictionary = dictionaries.getCombinatoricsDictionary(); 94 | File combinatoricsDictionaryFile = new File(compiledDicHomeFile, combinatorics + ".dic.compiled"); 95 | sortCompile(combinatoricsDictionary, combinatoricsDictionaryFile, charsetName); 96 | 97 | // 98 | File compliedMetadataFile = new File(dicHomeFile, ".compiled/sorting/.metadata"); 99 | if (compliedMetadataFile.exists()) { 100 | //compliedMetadataFile.setWritable(true); 101 | compliedMetadataFile.delete(); 102 | } 103 | else { 104 | compliedMetadataFile.getParentFile().mkdirs(); 105 | } 106 | OutputStream compiledPropertiesOutput = new FileOutputStream(compliedMetadataFile); 107 | Properties compiledProperties = new Properties(); 108 | String lastModifiedsKey = "paoding.analysis.properties.lastModifieds"; 109 | String filesKey = "paoding.analysis.properties.files"; 110 | compiledProperties.setProperty(lastModifiedsKey, p.getProperty(lastModifiedsKey)); 111 | compiledProperties.setProperty(filesKey, p.getProperty(filesKey)); 112 | compiledProperties.setProperty("paoding.analysis.compiler.checksum", 113 | Snapshot.flash( 114 | new File(dicHomeFile, ".compiled/sorting"), 115 | new FileFilter() { 116 | public boolean accept(File 
pathname) { 117 | return pathname.getPath().endsWith( 118 | ".dic.compiled"); 119 | } 120 | }).getCheckSum()); 121 | compiledProperties.setProperty("paoding.analysis.compiler.class", this.getClass().getName()); 122 | compiledProperties.setProperty("paoding.analysis.compiler.version", VERSION); 123 | compiledProperties.store(compiledPropertiesOutput, "dont edit it! this file was auto generated by paoding."); 124 | compiledPropertiesOutput.close(); 125 | compliedMetadataFile.setReadOnly(); 126 | } 127 | 128 | 129 | 130 | private void sortCompile(final Dictionary dictionary, 131 | File dicFile, String charsetName) throws FileNotFoundException, 132 | IOException, UnsupportedEncodingException { 133 | int wordsSize = dictionary.size(); 134 | if (dicFile.exists()) { 135 | //dicFile.setWritable(true); 136 | dicFile.delete(); 137 | } 138 | BufferedOutputStream out = new BufferedOutputStream( 139 | new FileOutputStream(dicFile), 1024 * 16); 140 | 141 | for (int i = 0; i < wordsSize; i++) { 142 | Word word = dictionary.get(i); 143 | out.write(word.getText().getBytes(charsetName)); 144 | if (word.getModifiers() != Word.DEFAUL) { 145 | out.write("[m=".getBytes()); 146 | out.write(String.valueOf(word.getModifiers()).getBytes()); 147 | out.write(']'); 148 | } 149 | out.write('\r'); 150 | out.write('\n'); 151 | } 152 | out.flush(); 153 | out.close(); 154 | dicFile.setReadOnly(); 155 | } 156 | 157 | public Dictionaries readCompliedDictionaries(Properties p) { 158 | String dicHomeAbsolutePath = p.getProperty("paoding.dic.home.absolute.path"); 159 | String noiseCharactor = getProperty(p, Constants.DIC_NOISE_CHARACTOR); 160 | String noiseWord = getProperty(p, Constants.DIC_NOISE_WORD); 161 | String unit = getProperty(p, Constants.DIC_UNIT); 162 | String confucianFamilyName = getProperty(p, Constants.DIC_CONFUCIAN_FAMILY_NAME); 163 | String combinatorics = getProperty(p, Constants.DIC_FOR_COMBINATORICS); 164 | String charsetName = getProperty(p, Constants.DIC_CHARSET); 165 | int 
maxWordLen = Integer.valueOf(getProperty(p, Constants.DIC_MAXWORDLEN)); 166 | return new CompiledFileDictionaries( 167 | dicHomeAbsolutePath + "/.compiled/sorting", 168 | noiseCharactor, noiseWord, unit, 169 | confucianFamilyName, combinatorics, charsetName, maxWordLen); 170 | } 171 | 172 | private static String getProperty(Properties p, String name) { 173 | return Constants.getProperty(p, name); 174 | } 175 | } 176 | -------------------------------------------------------------------------------- /src/main/java/net/paoding/analysis/dictionary/BinaryDictionary.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2007 The Apache Software Foundation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package net.paoding.analysis.dictionary; 17 | 18 | /** 19 | * Dictionary的二叉查找实现。 20 | *

21 | * 22 | * @author Zhiliang Wang [qieqie.wang@gmail.com] 23 | * 24 | * @since 1.0 25 | * 26 | */ 27 | public class BinaryDictionary implements Dictionary { 28 | 29 | // ------------------------------------------------- 30 | 31 | private Word[] ascWords; 32 | 33 | private final int start; 34 | private final int end; 35 | private final int count; 36 | 37 | // ------------------------------------------------- 38 | 39 | /** 40 | * 以一组升序排列的词语构造二叉查找字典 41 | *

42 | * 43 | * @param ascWords 44 | * 升序排列词语 45 | */ 46 | public BinaryDictionary(Word[] ascWords) { 47 | this(ascWords, 0, ascWords.length); 48 | } 49 | 50 | public BinaryDictionary(Word[] ascWords, int start, int end) { 51 | this.ascWords = ascWords; 52 | this.start = start; 53 | this.end = end; 54 | this.count = end - start; 55 | } 56 | 57 | // ------------------------------------------------- 58 | 59 | public Word get(int index) { 60 | return ascWords[start + index]; 61 | } 62 | 63 | public int size() { 64 | return count; 65 | } 66 | 67 | public Hit search(CharSequence input, int begin, int count) { 68 | int left = this.start; 69 | int right = this.end - 1; 70 | int pointer = 0; 71 | Word word = null; 72 | int relation; 73 | // 74 | while (left <= right) { 75 | pointer = (left + right) >> 1; 76 | word = ascWords[pointer]; 77 | relation = compare(input, begin, count, word); 78 | if (relation == 0) { 79 | // System.out.println(new String(input,begin, count)+"***" + 80 | // word); 81 | int nextWordIndex = pointer + 1; 82 | if (nextWordIndex >= ascWords.length) { 83 | return new Hit(pointer, word, null); 84 | } else { 85 | return new Hit(pointer, word, ascWords[nextWordIndex]); 86 | } 87 | } 88 | if (relation < 0) 89 | right = pointer - 1; 90 | else 91 | left = pointer + 1; 92 | } 93 | // 94 | if (left >= ascWords.length) { 95 | return Hit.UNDEFINED; 96 | } 97 | // 98 | boolean asPrex = true; 99 | Word nextWord = ascWords[left]; 100 | if (nextWord.length() < count) { 101 | asPrex = false; 102 | } 103 | for (int i = begin, j = 0; asPrex && j < count; i++, j++) { 104 | if (input.charAt(i) != nextWord.charAt(j)) { 105 | asPrex = false; 106 | } 107 | } 108 | return asPrex ? 
new Hit(Hit.UNCLOSED_INDEX, null, nextWord) 109 | : Hit.UNDEFINED; 110 | } 111 | 112 | public static int compare(CharSequence one, int begin, int count, 113 | CharSequence theOther) { 114 | for (int i = begin, j = 0; i < one.length() 115 | && j < Math.min(theOther.length(), count); i++, j++) { 116 | if (one.charAt(i) > theOther.charAt(j)) { 117 | return 1; 118 | } else if (one.charAt(i) < theOther.charAt(j)) { 119 | return -1; 120 | } 121 | } 122 | return count - theOther.length(); 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /src/main/java/net/paoding/analysis/dictionary/Dictionary.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2007 The Apache Software Foundation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package net.paoding.analysis.dictionary; 17 | 18 | /** 19 | * Dictionary是一个只读字典,用于查找是否包含某个词语,以及相关信息。 20 | *

/**
 * Dictionary is a read-only dictionary used to check whether it contains a
 * given word and to retrieve related information about it.
 *
 * @author Zhiliang Wang [qieqie.wang@gmail.com]
 *
 * @see BinaryDictionary
 * @see HashBinaryDictionary
 *
 * @since 1.0
 */
public interface Dictionary {

	/**
	 * Returns the number of words in the dictionary, always >= 0.
	 *
	 * @return word count
	 */
	public int size();

	/**
	 * Returns the word at the given position.
	 *
	 * @param index
	 *            0,1,2,...,size-1
	 * @return the word stored at that index
	 */
	public Word get(int index);

	/**
	 * Searches whether the dictionary contains the word formed by the
	 * characters input[offset] .. input[offset+count-1] (both inclusive).
	 * The result is always reported through a non-null Hit object.
	 *
	 * @param input the string to search is a contiguous part of this sequence
	 * @param offset start of the searched substring, relative to input
	 * @param count number of characters in the searched substring
	 * @return a non-null Hit; callers use its methods to interpret the result
	 *
	 * @see Hit
	 */
	public Hit search(CharSequence input, int offset, int count);
}
15 | */ 16 | package net.paoding.analysis.dictionary; 17 | /** 18 | * 19 | * @author Zhiliang Wang [qieqie.wang@gmail.com] 20 | * 21 | * @since 1.1 22 | */ 23 | public class DictionaryDelegate implements Dictionary { 24 | private Dictionary target; 25 | 26 | public DictionaryDelegate() { 27 | } 28 | 29 | public DictionaryDelegate(Dictionary target) { 30 | this.target = target; 31 | } 32 | 33 | public Dictionary getTarget() { 34 | return target; 35 | } 36 | 37 | public void setTarget(Dictionary target) { 38 | this.target = target; 39 | } 40 | 41 | public Word get(int index) { 42 | return target.get(index); 43 | } 44 | 45 | public Hit search(CharSequence input, int offset, int count) { 46 | return target.search(input, offset, count); 47 | } 48 | 49 | public int size() { 50 | return target.size(); 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/net/paoding/analysis/dictionary/HashBinaryDictionary.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2007 The Apache Software Foundation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package net.paoding.analysis.dictionary; 17 | 18 | import java.util.HashMap; 19 | import java.util.Map; 20 | 21 | /** 22 | * Dictionary的散列+二叉查找实现。 23 | *

/**
 * Hash + binary-search implementation of Dictionary.
 * <p>
 * Intended for large word sets where many words share the same leading
 * character (e.g. a Chinese vocabulary); lookups are then faster than a
 * plain binary dictionary.
 * <p>
 * HashBinaryDictionary takes an already-sorted word array; all words sharing
 * the character at position hashIndex are grouped into one sub-dictionary
 * (a BinaryDictionary, or recursively another HashBinaryDictionary).
 * A search first hashes on that character to pick the sub-dictionary, then
 * locates the word inside it.
 *
 * @author Zhiliang Wang [qieqie.wang@gmail.com]
 *
 * @see BinaryDictionary
 *
 * @since 1.0
 */
public class HashBinaryDictionary implements Dictionary {

	// -------------------------------------------------

	/**
	 * All words of the dictionary; kept to serve {@link #get(int)}.
	 */
	private Word[] ascWords;

	/**
	 * Maps the hash character (via {@link #keyOf(char)}) to its sub-dictionary.
	 * NOTE(review): raw Map as dumped; presumably Map&lt;Object, SubDictionaryWrap&gt;.
	 */
	private Map subs;

	/**
	 * Character position (0-based) used to hash words into sub-dictionaries.
	 */
	private final int hashIndex;

	// slice [start, end) of ascWords owned by this dictionary
	private final int start;
	private final int end;
	private final int count;

	// -------------------------------------------------

	/**
	 * Builds a dictionary over the whole array, hashing on the first character.
	 *
	 * @param ascWords
	 *            words sorted in ascending order
	 * @param initialCapacity
	 * @param loadFactor
	 */
	public HashBinaryDictionary(Word[] ascWords, int initialCapacity,
			float loadFactor) {
		this(ascWords, 0, 0, ascWords.length, initialCapacity, loadFactor);
	}

	public HashBinaryDictionary(Word[] ascWords, int hashIndex, int start,
			int end, int initialCapacity, float loadFactor) {
		this.ascWords = ascWords;
		this.start = start;
		this.end = end;
		this.count = end - start;
		this.hashIndex = hashIndex;
		subs = new HashMap(initialCapacity,
				loadFactor);
		createSubDictionaries();
	}

	// -------------------------------------------------

	/**
	 * Builds the sub-dictionary map; called from the constructor.
	 * Scans the (sorted) slice once, cutting a new sub-dictionary each time
	 * the character at hashIndex changes.
	 */
	protected void createSubDictionaries() {
		if (this.start >= ascWords.length) {
			return;
		}

		// locate runs of words sharing the same hash character
		int beginIndex = this.start;
		int endIndex = this.start + 1;

		char beginHashChar = getChar(ascWords[start], hashIndex);
		char endHashChar;
		for (; endIndex < this.end; endIndex++) {
			endHashChar = getChar(ascWords[endIndex], hashIndex);
			if (endHashChar != beginHashChar) {
				addSubDictionary(beginHashChar, beginIndex, endIndex);
				beginIndex = endIndex;
				beginHashChar = endHashChar;
			}
		}
		// flush the final run
		addSubDictionary(beginHashChar, beginIndex, this.end);
	}

	/**
	 * Returns s.charAt(index), or '\0' when the word is shorter than index+1
	 * (so short words hash into the '\0' bucket instead of throwing).
	 */
	protected char getChar(CharSequence s, int index) {
		if (index >= s.length()) {
			return (char) 0;
		}
		return s.charAt(index);
	}

	/**
	 * Wraps the words between beginIndex and endIndex (endIndex exclusive)
	 * as one sub-dictionary keyed by hashChar.
	 *
	 * @param hashChar
	 * @param beginIndex
	 * @param endIndex
	 */
	protected void addSubDictionary(char hashChar, int beginIndex, int endIndex) {
		Dictionary subDic = createSubDictionary(ascWords, beginIndex, endIndex);
		SubDictionaryWrap subDicWrap = new SubDictionaryWrap(hashChar,
				subDic, beginIndex);
		subs.put(keyOf(hashChar), subDicWrap);
	}

	/**
	 * Small runs (&lt; 16 words) become plain BinaryDictionaries; larger runs
	 * recurse into another HashBinaryDictionary hashing on the next character.
	 */
	protected Dictionary createSubDictionary(Word[] ascWords, int beginIndex,
			int endIndex) {
		int count = endIndex - beginIndex;
		if (count < 16) {
			return new BinaryDictionary(ascWords, beginIndex, endIndex);
		} else {
			return new HashBinaryDictionary(ascWords, hashIndex + 1,
					beginIndex, endIndex, getCapacity(count), 0.75f);
		}
	}

	// candidate HashMap capacities ("candiate" sic in original)
	protected static final int[] capacityCandiate = { 16, 32, 64, 128, 256,
			512, 1024, 2048, 4096, 10192 };

	/**
	 * Picks a HashMap capacity for count entries: scales count by 4/3 (+1) so
	 * the map stays below its 0.75 load factor, then rounds up to the next
	 * candidate capacity (capped at the largest candidate).
	 */
	protected int getCapacity(int count) {
		int capacity = -1;
		count <<= 2;
		count /= 3;
		count += 1;
		for (int i = 0; i < capacityCandiate.length; i++) {
			if (count <= capacityCandiate[i]) {
				capacity = capacityCandiate[i];
				break;
			}
		}
		if (capacity < 0) {
			capacity = capacityCandiate[capacityCandiate.length - 1];
		}
		return capacity;
	}

	// -------------------------------------------------

	public Word get(int index) {
		return ascWords[start + index];
	}

	public Hit search(CharSequence input, int begin, int count) {
		// pick the sub-dictionary via the character at the hash position
		SubDictionaryWrap subDic = (SubDictionaryWrap) subs.get(keyOf(input
				.charAt(hashIndex + begin)));
		if (subDic == null) {
			return Hit.UNDEFINED;
		}
		Dictionary dic = subDic.dic;
		// special case: the query ends exactly at the hash position
		// (count == hashIndex + 1)
		if (count == hashIndex + 1) {
			Word header = dic.get(0);
			if (header.length() == hashIndex + 1) {
				// the sub-dictionary's first word IS the query
				if (subDic.wordIndexOffset + 1 < this.ascWords.length) {
					return new Hit(subDic.wordIndexOffset, header,
							this.ascWords[subDic.wordIndexOffset + 1]);
				} else {
					return new Hit(subDic.wordIndexOffset, header, null);
				}
			} else {
				// query not present, but longer words start with it
				return new Hit(Hit.UNCLOSED_INDEX, null, header);
			}
		}
		// count > hashIndex + 1: delegate and rebase the hit index
		Hit word = dic.search(input, begin, count);
		if (word.isHit()) {
			int index = subDic.wordIndexOffset + word.getIndex();
			word.setIndex(index);
			// NOTE(review): get(index + 1) adds this.start to an index that
			// already includes wordIndexOffset (an absolute offset), and
			// index < size() still permits index + 1 == size(); for nested
			// dictionaries with start > 0 this arithmetic looks off — confirm
			// against callers before changing.
			if (word.getNext() == null && index < size()) {
				word.setNext(get(index + 1));
			}
		}
		return word;
	}

	public int size() {
		return count;
	}

	// -------------------------------------------------

	/**
	 * Key of a character in {@link #subs}.
	 *
	 * @param theChar
	 * @return boxed character code used as the map key
	 *
	 * @see #subs
	 */
	protected Object keyOf(char theChar) {
		// return theChar - 0x4E00;// '一'==0x4E00
		return new Integer(theChar);
	}

	/**
	 * Wrapper bundling a sub-dictionary with its hash character and its
	 * absolute offset into the full word array.
	 */
	static class SubDictionaryWrap {
		/**
		 * Leading (hash) character shared by this sub-dictionary's words.
		 */
		char hashChar;

		/**
		 * The sub-dictionary itself.
		 */
		Dictionary dic;

		/**
		 * Offset of the sub-dictionary's first word within all words.
		 */
		int wordIndexOffset;

		public SubDictionaryWrap(char hashChar, Dictionary dic,
				int wordIndexOffset) {
			this.hashChar = hashChar;
			this.dic = dic;
			this.wordIndexOffset = wordIndexOffset;
		}
	}

}
-------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2007 The Apache Software Foundation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package net.paoding.analysis.dictionary; 17 | 18 | /** 19 | * Hit是检索字典时返回的结果。检索字典时,总是返回一个非空的Hit对象表示可能的各种情况。 20 | *

21 | * 22 | * Hit对象包含2类判断信息: 23 | *

  • 要检索的词语是否存在于词典中: {@link #isHit()}
  • 24 | *
  • 词典是否含有以给定字符串开头的其他词语: {@link #isUnclosed()}
  • 25 | *
    26 | * 如果上面2个信息都是否定的,则 {@link #isUndefined()}返回true,否则返回false.
    27 | *
    28 | * 29 | * 如果{@link #isHit()}返回true,则{@link #getWord()}返回查找结果,{@link #getNext()}返回下一个词语。
    30 | * 如果{@link #isHit()}返回false,但{@link #isUnclosed()}返回true,{@link #getNext()}返回以所查询词语开头的位置最靠前的词语。 31 | *

    32 | * 33 | * @author Zhiliang Wang [qieqie.wang@gmail.com] 34 | * 35 | * @see Dictionary 36 | * @see BinaryDictionary 37 | * @see HashBinaryDictionary 38 | * 39 | * @since 1.0 40 | * 41 | */ 42 | public class Hit { 43 | 44 | // ------------------------------------------------- 45 | 46 | public final static int UNCLOSED_INDEX = -1; 47 | 48 | public final static int UNDEFINED_INDEX = -2; 49 | 50 | public final static Hit UNDEFINED = new Hit(UNDEFINED_INDEX, null, null); 51 | 52 | // ------------------------------------------------- 53 | 54 | /** 55 | * 目标词语在词典中的位置,或者在字典没有该词语是表示其他意思(参见以上静态变量定义的情况) 56 | */ 57 | private int index; 58 | 59 | /** 60 | * 查找命中时,词典中相应的词 61 | */ 62 | private Word word; 63 | 64 | /** 65 | * 词典中命中词的下一个单词,或{@link #isUnclosed()}为true时最接近的下一个词(参见本类的注释) 66 | */ 67 | private Word next; 68 | 69 | // ------------------------------------------------- 70 | 71 | /** 72 | * 73 | * @param index 74 | * 目标词语在词典中的位置,或者在字典没有该词语是表示其他意思(参见以上静态变量定义的情况) 75 | * @param word 76 | * 查找命中时,词典中相应的词 77 | * @param next 78 | * 词典中命中词的下一个单词,或{@link #isUnclosed()}为true时最接近的下一个词(参见本类的注释) 79 | */ 80 | public Hit(int index, Word word, Word next) { 81 | this.index = index; 82 | this.word = word; 83 | this.next = next; 84 | } 85 | 86 | // ------------------------------------------------- 87 | 88 | /** 89 | * 查找命中时,词典中相应的词 90 | */ 91 | public Word getWord() { 92 | return word; 93 | } 94 | 95 | /** 96 | * 目标词语在词典中的位置,或者在字典没有该词语是表示其他意思(参见以上静态变量定义的情况) 97 | * @return 98 | */ 99 | public int getIndex() { 100 | return index; 101 | } 102 | 103 | /** 104 | * 词典中命中词的下一个单词,或{@link #isUnclosed()}为true时最接近的下一个词(参见本类的注释) 105 | * @return 106 | */ 107 | public Word getNext() { 108 | return next; 109 | } 110 | 111 | /** 112 | * 是否在字典中检索到要检索的词语 113 | * @return 114 | */ 115 | public boolean isHit() { 116 | return this.index >= 0; 117 | } 118 | 119 | /** 120 | * 是否有以当前检索词语开头的词语 121 | * @return 122 | */ 123 | public boolean isUnclosed() { 124 | return UNCLOSED_INDEX == this.index 125 | || 
(this.next != null 126 | && this.next.length() >= this.word.length() && this.next 127 | .startsWith(word)); 128 | } 129 | 130 | /** 131 | * 字典中没有当前检索的词语,或以其开头的词语 132 | * @return 133 | */ 134 | public boolean isUndefined() { 135 | return UNDEFINED.index == this.index; 136 | } 137 | 138 | // ------------------------------------------------- 139 | 140 | void setIndex(int index) { 141 | this.index = index; 142 | } 143 | 144 | void setWord(Word key) { 145 | this.word = key; 146 | } 147 | 148 | void setNext(Word next) { 149 | this.next = next; 150 | } 151 | 152 | // ------------------------------------------------- 153 | 154 | public int hashCode() { 155 | final int PRIME = 31; 156 | int result = 1; 157 | result = PRIME * result + ((word == null) ? 0 : word.hashCode()); 158 | result = PRIME * result + index; 159 | return result; 160 | } 161 | 162 | public boolean equals(Object obj) { 163 | if (this == obj) 164 | return true; 165 | if (obj == null) 166 | return false; 167 | if (getClass() != obj.getClass()) 168 | return false; 169 | final Hit other = (Hit) obj; 170 | if (word == null) { 171 | if (other.word != null) 172 | return false; 173 | } else if (!word.equals(other.word)) 174 | return false; 175 | if (index != other.index) 176 | return false; 177 | return true; 178 | } 179 | 180 | public String toString() { 181 | if (isUnclosed()) { 182 | return "[UNCLOSED]"; 183 | } else if (isUndefined()) { 184 | return "[UNDEFINED]"; 185 | } 186 | return "[" + index + ']' + word; 187 | } 188 | 189 | } 190 | -------------------------------------------------------------------------------- /src/main/java/net/paoding/analysis/dictionary/Word.java: -------------------------------------------------------------------------------- 1 | package net.paoding.analysis.dictionary; 2 | 3 | public class Word implements Comparable, CharSequence { 4 | 5 | public static final int DEFAUL = 0; 6 | private String text; 7 | private int modifiers = DEFAUL; 8 | 9 | public Word() { 10 | } 11 | 12 | public 
/**
 * A dictionary word: its text plus an int bit-set of modifiers
 * (bit 0 = noise character, bit 1 = noise word).
 * Comparable by text; also usable wherever a CharSequence is expected.
 */
public class Word implements Comparable<Word>, CharSequence {

	// default (empty) modifier set; name kept for source compatibility ("DEFAUL" sic)
	public static final int DEFAUL = 0;

	private String text;

	// bit 0: noise character; bit 1: noise word
	private int modifiers = DEFAUL;

	public Word() {
	}

	public Word(String text) {
		this.text = text;
	}

	public Word(String text, int modifiers) {
		this.text = text;
		this.modifiers = modifiers;
	}

	public String getText() {
		return text;
	}

	public void setText(String text) {
		this.text = text;
	}

	public int getModifiers() {
		return modifiers;
	}

	public void setModifiers(int modifiers) {
		this.modifiers = modifiers;
	}

	public int compareTo(Word obj) {
		return this.text.compareTo(obj.text);
	}

	public String toString() {
		return text;
	}

	public int length() {
		return text.length();
	}

	/** Whether this word's text starts with the given word's text. */
	public boolean startsWith(Word word) {
		return text.startsWith(word.text);
	}

	public char charAt(int j) {
		return text.charAt(j);
	}

	/**
	 * Completes the CharSequence contract by delegating to the underlying
	 * String (previously threw UnsupportedOperationException).
	 */
	public CharSequence subSequence(int start, int end) {
		return text.subSequence(start, end);
	}

	public int hashCode() {
		return text.hashCode();
	}

	/**
	 * equals contract fix: returns false for null or non-Word arguments
	 * instead of throwing NPE/ClassCastException. Consistent with hashCode.
	 */
	public boolean equals(Object obj) {
		if (this == obj) {
			return true;
		}
		if (!(obj instanceof Word)) {
			return false;
		}
		return text.equals(((Word) obj).text);
	}

	/** Marks this word as a noise character (bit 0). */
	public void setNoiseCharactor() {
		modifiers |= 1;
	}

	/** Marks this word as a noise word (bit 1). */
	public void setNoiseWord() {
		modifiers |= (1 << 1);
	}

	public boolean isNoiseCharactor() {
		return (modifiers & 1) == 1;
	}

	/** True when either noise flag is set. */
	public boolean isNoise() {
		return isNoiseCharactor() || isNoiseWord();
	}

	public boolean isNoiseWord() {
		return (modifiers >> 1 & 1) == 1;
	}

	public static void main(String[] args) {
		Word w = new Word("");
		System.out.println(w.isNoiseCharactor());
		w.setNoiseCharactor();
		System.out.println(w.isNoiseCharactor());
		System.out.println(w.isNoiseWord());
		w.setNoiseWord();
		System.out.println(w.isNoiseWord());
	}

}
/src/main/java/net/paoding/analysis/dictionary/support/detection/Detector.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2007 The Apache Software Foundation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package net.paoding.analysis.dictionary.support.detection; 17 | 18 | import java.io.File; 19 | import java.io.FileFilter; 20 | 21 | import org.apache.commons.logging.Log; 22 | import org.apache.commons.logging.LogFactory; 23 | 24 | /** 25 | * 26 | * @author Zhiliang Wang [qieqie.wang@gmail.com] 27 | * 28 | * @since 2.0.2 29 | * 30 | */ 31 | public class Detector implements Runnable { 32 | 33 | private Log log = LogFactory.getLog(this.getClass()); 34 | 35 | private DifferenceListener listener; 36 | 37 | private File home; 38 | 39 | private FileFilter filter; 40 | 41 | private long interval; 42 | 43 | private Snapshot lastSnapshot; 44 | 45 | private Thread thread; 46 | 47 | private boolean alive = true; 48 | 49 | public void setListener(DifferenceListener listener) { 50 | this.listener = listener; 51 | } 52 | 53 | public Detector() { 54 | } 55 | 56 | /** 57 | * 检查间隔 58 | * 59 | * @param interval 60 | */ 61 | public void setInterval(int interval) { 62 | this.interval = interval * 1000; 63 | } 64 | 65 | public void setHome(File home) { 66 | this.home = home; 67 | } 68 | 69 | public void setHome(String home) { 70 | this.home = new File(home); 71 | } 72 | 73 
| public void setFilter(FileFilter filter) { 74 | this.filter = filter; 75 | } 76 | 77 | public Snapshot flash(){ 78 | return Snapshot.flash(home, filter); 79 | } 80 | 81 | public void start(boolean daemon) { 82 | if (lastSnapshot == null) { 83 | lastSnapshot = flash(); 84 | } 85 | thread = new Thread(this); 86 | thread.setDaemon(daemon); 87 | thread.start(); 88 | } 89 | 90 | 91 | public Snapshot getLastSnapshot() { 92 | return lastSnapshot; 93 | } 94 | 95 | public void setLastSnapshot(Snapshot last) { 96 | this.lastSnapshot = last; 97 | } 98 | 99 | public void run() { 100 | if (interval <= 0) 101 | throw new IllegalArgumentException( 102 | "should set a interval(>0) for the detection."); 103 | while (alive) { 104 | sleep(); 105 | forceDetecting(); 106 | } 107 | } 108 | 109 | public void forceDetecting() { 110 | Snapshot current = flash(); 111 | Difference diff = current.diff(lastSnapshot); 112 | if (!diff.isEmpty()) { 113 | try { 114 | listener.on(diff); 115 | log.info("found differen for " + home); 116 | log.info(diff); 117 | lastSnapshot = current; 118 | } catch (Exception e) { 119 | log.error("", e); 120 | } 121 | } 122 | } 123 | 124 | public void setStop() { 125 | alive = false; 126 | thread = null; 127 | } 128 | 129 | private void sleep() { 130 | try { 131 | Thread.sleep(interval); 132 | } catch (InterruptedException e) { 133 | e.printStackTrace(); 134 | } 135 | } 136 | 137 | public static void main(String[] args) { 138 | Detector d = new Detector(); 139 | d.setInterval(1); 140 | d.setHome(new File("dic")); 141 | d.setFilter(new ExtensionFileFilter(".dic")); 142 | d.setListener(new DifferenceListener() { 143 | public void on(Difference diff) { 144 | System.out.println(diff); 145 | } 146 | 147 | }); 148 | d.start(false); 149 | } 150 | 151 | } 152 | -------------------------------------------------------------------------------- /src/main/java/net/paoding/analysis/dictionary/support/detection/Difference.java: 
-------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2007 The Apache Software Foundation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package net.paoding.analysis.dictionary.support.detection; 17 | 18 | import java.util.LinkedList; 19 | import java.util.List; 20 | 21 | /** 22 | * 23 | * @author Zhiliang Wang [qieqie.wang@gmail.com] 24 | * 25 | * @since 2.0.2 26 | * 27 | */ 28 | public class Difference { 29 | 30 | /** 31 | * 变更了的 32 | * 33 | * @return 34 | */ 35 | private List modified = new LinkedList(); 36 | 37 | /** 38 | * 删除了的 39 | * 40 | * @return 41 | */ 42 | private List deleted = new LinkedList(); 43 | 44 | /** 45 | * 新加的 46 | * 47 | * @return 48 | */ 49 | private List newcome = new LinkedList(); 50 | 51 | private Snapshot older; 52 | private Snapshot younger; 53 | 54 | public List getModified() { 55 | return modified; 56 | } 57 | 58 | public void setModified(List modified) { 59 | this.modified = modified; 60 | } 61 | 62 | public List getDeleted() { 63 | return deleted; 64 | } 65 | 66 | public void setDeleted(List deleted) { 67 | this.deleted = deleted; 68 | } 69 | 70 | public List getNewcome() { 71 | return newcome; 72 | } 73 | 74 | public void setNewcome(List newcome) { 75 | this.newcome = newcome; 76 | } 77 | 78 | public Snapshot getOlder() { 79 | return older; 80 | } 81 | 82 | public void setOlder(Snapshot older) { 83 | this.older = older; 84 | } 85 | 
86 | public Snapshot getYounger() { 87 | return younger; 88 | } 89 | 90 | public void setYounger(Snapshot younger) { 91 | this.younger = younger; 92 | } 93 | 94 | public boolean isEmpty() { 95 | return deleted.isEmpty() && modified.isEmpty() && newcome.isEmpty(); 96 | } 97 | 98 | public String toString() { 99 | String smodified = ArraysToString(modified.toArray(new Node[] {})); 100 | String snewcome = ArraysToString(newcome.toArray(new Node[] {})); 101 | String sdeleted = ArraysToString(deleted.toArray(new Node[] {})); 102 | return "modified=" + smodified + ";newcome=" + snewcome + ";deleted=" 103 | + sdeleted; 104 | } 105 | 106 | // 低于JDK1.5无Arrays.toString()方法,故有以下方法 107 | private static String ArraysToString(Object[] a) { 108 | if (a == null) 109 | return "null"; 110 | int iMax = a.length - 1; 111 | if (iMax == -1) 112 | return "[]"; 113 | 114 | StringBuffer b = new StringBuffer(); 115 | b.append('['); 116 | for (int i = 0;; i++) { 117 | b.append(String.valueOf(a[i])); 118 | if (i == iMax) 119 | return b.append(']').toString(); 120 | b.append(", "); 121 | } 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /src/main/java/net/paoding/analysis/dictionary/support/detection/DifferenceListener.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2007 The Apache Software Foundation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
/**
 * Callback invoked by a Detector when the watched directory has changed.
 *
 * @author Zhiliang Wang [qieqie.wang@gmail.com]
 *
 * @since 2.0.2
 *
 */
public interface DifferenceListener {

	/**
	 * Called with the difference between the last and the current snapshot.
	 * May throw; the caller (Detector) logs the failure and keeps the old
	 * snapshot so the change is re-delivered on the next poll.
	 *
	 * @param diff the detected changes, never empty when delivered
	 * @throws Exception if handling the change fails
	 */
	public void on(Difference diff) throws Exception ;
}
15 | */ 16 | package net.paoding.analysis.dictionary.support.detection; 17 | 18 | import java.io.File; 19 | import java.io.FileFilter; 20 | /** 21 | * 22 | * @author Zhiliang Wang [qieqie.wang@gmail.com] 23 | * 24 | * @since 2.0.2 25 | * 26 | */ 27 | public class ExtensionFileFilter implements FileFilter { 28 | private String end; 29 | 30 | public ExtensionFileFilter() { 31 | } 32 | 33 | public ExtensionFileFilter(String end) { 34 | this.end = end; 35 | } 36 | 37 | public void setEnd(String end) { 38 | this.end = end; 39 | } 40 | 41 | public String getEnd() { 42 | return end; 43 | } 44 | 45 | public boolean accept(File pathname) { 46 | return pathname.isDirectory() || pathname.getName().endsWith(end); 47 | } 48 | 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/net/paoding/analysis/dictionary/support/detection/Node.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2007 The Apache Software Foundation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package net.paoding.analysis.dictionary.support.detection; 17 | 18 | /** 19 | * 20 | * @author Zhiliang Wang [qieqie.wang@gmail.com] 21 | * 22 | * @since 2.0.2 23 | * 24 | */ 25 | public class Node implements Comparable{ 26 | 27 | String path; 28 | 29 | boolean isFile; 30 | 31 | public Node() { 32 | } 33 | 34 | public Node(String path, boolean isFile) { 35 | this.path = path; 36 | this.isFile = isFile; 37 | } 38 | 39 | /** 40 | * 返回结点路径 41 | *

    42 | * 如果该结点为根,则返回根的绝对路径
    43 | * 如果该结点为根下的目录或文件,则返回其相对与根的路径
    44 | * 45 | * @return 46 | */ 47 | public String getPath() { 48 | return path; 49 | } 50 | 51 | /** 52 | * 该结点当时的属性:是否为文件 53 | * 54 | * @return 55 | */ 56 | public boolean isFile() { 57 | return isFile; 58 | } 59 | 60 | public String toString() { 61 | return path; 62 | } 63 | 64 | public int hashCode() { 65 | final int prime = 31; 66 | int result = 1; 67 | result = prime * result + ((path == null) ? 0 : path.hashCode()); 68 | return result; 69 | } 70 | 71 | public boolean equals(Object obj) { 72 | if (this == obj) 73 | return true; 74 | if (obj == null) 75 | return false; 76 | if (getClass() != obj.getClass()) 77 | return false; 78 | final Node other = (Node) obj; 79 | if (path == null) { 80 | if (other.path != null) 81 | return false; 82 | } else if (!path.equals(other.path)) 83 | return false; 84 | return true; 85 | } 86 | 87 | public int compareTo(Node o) { 88 | //path 89 | if (this.path != null && o.path != null){ 90 | int cmp = this.path.compareTo(o.path); 91 | if (cmp != 0) return cmp; 92 | } else { 93 | if (this.path != null && o.path == null) return 1; 94 | if (this.path == null && o.path != null) return -1; 95 | } 96 | 97 | //isfile 98 | if (this.isFile && !o.isFile) return 1; 99 | if (!this.isFile && o.isFile) return -1; 100 | return 0; 101 | } 102 | 103 | } 104 | -------------------------------------------------------------------------------- /src/main/java/net/paoding/analysis/dictionary/support/detection/Snapshot.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2007 The Apache Software Foundation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
/* (end of Apache License 2.0 header) */
package net.paoding.analysis.dictionary.support.detection;

import java.io.File;
import java.io.FileFilter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;

/**
 * A point-in-time snapshot of a dictionary directory tree, used to detect
 * files that were added, modified or deleted between two moments.
 *
 * @author Zhiliang Wang [qieqie.wang@gmail.com]
 *
 * @since 2.0.2
 */
public class Snapshot {

    // snapshot version: the wall-clock time at which it was taken
    private long version;

    // absolute root path, with '/' as the directory separator
    private String root;

    // key: path relative to root ('/' separated); value: the captured node
    private Map<String, InnerNode> nodesMap = new HashMap<String, InnerNode>();

    // every node below the root, sorted so the checksum is deterministic
    private InnerNode[] nodes;

    // lazily built checksum of this snapshot (see getCheckSum)
    private String checksum;

    private Snapshot() {
    }

    public static Snapshot flash(String root, FileFilter filter) {
        return flash(new File(root), filter);
    }

    public static Snapshot flash(File rootFile, FileFilter filter) {
        Snapshot snapshot = new Snapshot();
        snapshot.implFlash(rootFile, filter);
        return snapshot;
    }

    private void implFlash(File rootFile, FileFilter filter) {
        version = System.currentTimeMillis();
        root = rootFile.getAbsolutePath().replace('\\', '/');
        // FIX: the original only assigned `nodes` when the root was missing
        // or was a directory; an existing *file* root left it null and
        // Arrays.sort(nodes) below threw NullPointerException.
        nodes = new InnerNode[0];
        if (rootFile.exists()) {
            InnerNode rootNode = new InnerNode();
            rootNode.path = root;
            rootNode.isFile = rootFile.isFile();
            rootNode.lastModified = rootFile.lastModified();
            nodesMap.put(root, rootNode);
            if (rootFile.isDirectory()) {
                LinkedList<File> files = getPosterity(rootFile, filter);
                nodes = new InnerNode[files.size()];
                Iterator<File> iter = files.iterator();
                for (int i = 0; i < nodes.length; i++) {
                    File f = iter.next();
                    String path = f.getAbsolutePath()
                            .substring(this.root.length() + 1)
                            .replace('\\', '/');
                    InnerNode node = new InnerNode();
                    node.path = path;
                    node.isFile = f.isFile();
                    node.lastModified = f.lastModified();
                    int index = path.lastIndexOf('/');
                    node.parent = index == -1 ? root : path.substring(0, index);
                    nodes[i] = node;
                    nodesMap.put(path, node);
                }
            }
        }

        // sort so the checksum does not depend on file-system iteration order
        Arrays.sort(nodes);
        checksum = null;
    }

    /**
     * Builds the checksum of this snapshot from every node's path, type,
     * parent and lastModified timestamp. The algorithm (a short accumulator
     * seeded with -631 over the UTF-8 bytes) is kept unchanged so checksums
     * remain comparable with previously stored values.
     */
    private void buildCheckSum() {
        short sum = -631;
        short multiplier = 1;

        StringBuilder value = new StringBuilder();
        for (int i = 0; i < nodes.length; i++) {
            value.append(nodes[i].path);
            value.append(nodes[i].isFile);
            value.append(nodes[i].parent);
            value.append(nodes[i].lastModified);
        }

        byte[] data;
        try {
            data = value.toString().getBytes("UTF-8");
        } catch (java.io.UnsupportedEncodingException ex) {
            // UTF-8 support is mandated by the Java platform; the original
            // silently swallowed this, yielding a bogus checksum.
            throw new IllegalStateException("UTF-8 unsupported?", ex);
        }
        for (int b = 0; b < data.length; ++b) {
            sum += data[b] * multiplier++;
        }

        this.checksum = String.valueOf(sum);
    }

    public long getVersion() {
        return version;
    }

    public void setVersion(long version) {
        this.version = version;
    }

    public String getRoot() {
        return root;
    }

    public void setRoot(String root) {
        this.root = root;
    }

    /**
     * Returns the checksum, building it lazily on first access.
     */
    public String getCheckSum() {
        if (checksum == null) buildCheckSum();
        return checksum;
    }

    /**
     * Computes the difference between this snapshot and another one of the
     * same root; the older of the two (by version) is used as the baseline.
     *
     * @throws IllegalArgumentException if the snapshots have different roots
     */
    public Difference diff(Snapshot that) {
        Snapshot older = that;
        Snapshot younger = this;
        if (that.version > this.version) {
            older = this;
            younger = that;
        }
        Difference diff = new Difference();
        if (!younger.root.equals(older.root)) {
            throw new IllegalArgumentException("the snaps should be same root");
        }
        for (int i = 0; i < older.nodes.length; i++) {
            InnerNode olderNode = older.nodes[i];
            InnerNode yongerNode = older == null ? null : younger.nodesMap.get(olderNode.path);
            if (yongerNode == null) {
                diff.getDeleted().add(olderNode);
            } else if (yongerNode.lastModified != olderNode.lastModified) {
                diff.getModified().add(olderNode);
            }
        }

        for (int i = 0; i < younger.nodes.length; i++) {
            InnerNode yongerNode = younger.nodes[i];
            InnerNode olderNode = older.nodesMap.get(yongerNode.path);
            if (olderNode == null) {
                diff.getNewcome().add(yongerNode);
            }
        }
        diff.setOlder(older);
        diff.setYounger(younger);
        return diff;
    }

    public static void main(String[] args) throws InterruptedException {
        File f = new File("dic");
        Snapshot snapshot1 = Snapshot.flash(f, null);
        System.out.println("----");
        Thread.sleep(3000);
        System.out.println("----");
        Thread.sleep(3000);
        System.out.println("----");
        Snapshot snapshot2 = Snapshot.flash(f, null);
        Difference diff = snapshot2.diff(snapshot1);
        System.out.println("deleted: " + Arrays.toString(diff.getDeleted().toArray()));
        System.out.println("modified: " + Arrays.toString(diff.getModified().toArray()));
        System.out.println("newcome: " + Arrays.toString(diff.getNewcome().toArray()));
    }

    // --------------------------------------------

    /**
     * Breadth-first collection of every file under root that the filter
     * accepts (directories are descended into, not returned).
     */
    private LinkedList<File> getPosterity(File root, FileFilter filter) {
        ArrayList<File> dirs = new ArrayList<File>();
        LinkedList<File> files = new LinkedList<File>();
        dirs.add(root);
        int index = 0;
        while (index < dirs.size()) {
            File cur = dirs.get(index++);
            File[] children = cur.listFiles();
            if (children == null) {
                // I/O error, or the directory vanished mid-scan; the
                // original NPE'd here
                continue;
            }
            for (int i = 0; i < children.length; i++) {
                File f = children[i];
                if (filter == null || filter.accept(f)) {
                    if (f.isDirectory()) {
                        dirs.add(f);
                    } else {
                        files.add(f);
                    }
                }
            }
        }
        return files;
    }

    /**
     * A {@link Node} enriched with its parent path and lastModified time;
     * the ordering extends Node's with these two extra keys.
     */
    class InnerNode extends Node {
        String parent;
        long lastModified;

        @Override
        public int compareTo(Node o) {
            // super compare (path, then isFile)
            int result = super.compareTo(o);
            if (result != 0)
                return result;

            if (o instanceof InnerNode) {
                InnerNode node = (InnerNode) o;

                // parent; null sorts first
                if (this.parent != null && node.parent != null) {
                    int cmp = this.parent.compareTo(node.parent);
                    if (cmp != 0)
                        return cmp;
                } else {
                    if (this.parent != null && node.parent == null)
                        return 1;
                    if (this.parent == null && node.parent != null)
                        return -1;
                }

                // lastModified
                if (this.lastModified > node.lastModified)
                    return 1;
                if (this.lastModified < node.lastModified)
                    return -1;
            }
            return 0;
        }
    }
}

// --------------------------------------------------------------------
// /src/main/java/net/paoding/analysis/dictionary/support/filewords/FileWordsReader.java
// (Apache License 2.0 header)
/* (end of Apache License 2.0 header) */
package net.paoding.analysis.dictionary.support.filewords;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;

import net.paoding.analysis.dictionary.Word;
import net.paoding.analysis.knife.CharSet;

/**
 * Reads dictionary words from a file or a whole directory tree, feeding each
 * line to a {@link ReadListener}.
 *
 * @author Zhiliang Wang [qieqie.wang@gmail.com]
 *
 * @since 1.0
 */
public class FileWordsReader {

    /**
     * Reads all *.dic files under the given path into sets of words, keyed
     * by the dictionary name (path without extension).
     */
    public static Map<String, Set<Word>> readWords(
            String fileOrDirectory, String charsetName, int maxWordLen) throws IOException {
        SimpleReadListener l = new SimpleReadListener();
        readWords(fileOrDirectory, l, charsetName, maxWordLen);
        return l.getResult();
    }

    /**
     * Variant used for compiled dictionaries: words are collected into
     * instances of the given collection class, and files are matched by the
     * given extension.
     */
    public static Map<String, Collection<Word>> readWords(
            String fileOrDirectory, String charsetName, int maxWordLen, Class collectionClass, String ext) throws IOException {
        SimpleReadListener2 l = new SimpleReadListener2(collectionClass, ext);
        readWords(fileOrDirectory, l, charsetName, maxWordLen);
        return l.getResult();
    }

    /**
     * Walks the given file or directory (supports a "classpath:" prefix) and
     * streams every line of every file to the listener, stripping a leading
     * BOM and dropping words longer than maxWordLen (when positive).
     *
     * @throws FileNotFoundException if the path does not exist
     */
    public static void readWords(String fileOrDirectory, ReadListener l, String charsetName, int maxWordLen)
            throws IOException {
        File file;
        if (fileOrDirectory.startsWith("classpath:")) {
            String name = fileOrDirectory.substring("classpath:".length());
            URL url = FileWordsReader.class.getClassLoader().getResource(name);
            if (url == null) {
                throw new FileNotFoundException("file \"" + name + "\" not found in classpath!");
            }
            file = new File(getUrlPath(url));
        }
        else {
            file = new File(fileOrDirectory);
            if (!file.exists()) {
                throw new FileNotFoundException("file \"" + fileOrDirectory + "\" not found!");
            }
        }
        ArrayList<File> dirs = new ArrayList<File>();
        LinkedList<File> dics = new LinkedList<File>();
        String dir;
        if (file.isDirectory()) {
            dirs.add(file);
            dir = file.getAbsolutePath();
        } else {
            dics.add(file);
            dir = file.getParentFile().getAbsolutePath();
        }
        // breadth-first directory walk
        int index = 0;
        while (index < dirs.size()) {
            File cur = dirs.get(index++);
            File[] files = cur.listFiles();
            if (files == null) {
                // I/O error or directory removed mid-walk; original NPE'd
                continue;
            }
            for (int i = 0; i < files.length; i++) {
                File f = files[i];
                if (f.isDirectory()) {
                    dirs.add(f);
                } else {
                    dics.add(f);
                }
            }
        }
        for (Iterator<File> iter = dics.iterator(); iter.hasNext();) {
            File f = iter.next();
            String name = f.getAbsolutePath().substring(dir.length() + 1);
            name = name.replace('\\', '/');
            if (!l.onFileBegin(name)) {
                continue;
            }
            BufferedReader in = new BufferedReader(new InputStreamReader(
                    new FileInputStream(f), charsetName));
            // FIX: close the reader even if reading or a listener throws;
            // the original leaked the stream on any exception.
            try {
                String word;
                boolean firstInDic = true;
                while ((word = in.readLine()) != null) {
                    if (firstInDic) {
                        firstInDic = false;
                        // ref: http://www.w3.org/International/questions/qa-utf8-bom
                        // Notepad prefixes UTF-8/Unicode files with a BOM
                        // (ZERO WIDTH NO-BREAK SPACE); strip it from the
                        // first word of each dictionary.
                        if (word.length() > 0 && CharSet.isBom(word.charAt(0))) {
                            word = word.substring(1);
                        }
                    }

                    // maximum word length limitation
                    if (maxWordLen <= 0 || word.length() <= maxWordLen) {
                        l.onWord(word);
                    }
                }
                l.onFileEnd(name);
            } finally {
                in.close();
            }
        }
    }

    /**
     * Resolves a URL to a file-system path, preferring the decoded URI path
     * and deliberately falling back to {@code url.getFile()} when the URL is
     * not a valid URI.
     */
    private static String getUrlPath(URL url) {
        if (url == null) return null;
        String urlPath = null;
        try {
            urlPath = url.toURI().getPath();
        } catch (URISyntaxException e) {
            // intentional: fall through to url.getFile() below
        }
        if (urlPath == null) {
            urlPath = url.getFile();
        }
        return urlPath;
    }

}

// --------------------------------------------------------------------
// /src/main/java/net/paoding/analysis/dictionary/support/filewords/ReadListener.java
// (Apache License 2.0 header)

/**
 * Receives the words read by {@link FileWordsReader}, one dictionary file at
 * a time.
 *
 * @author Zhiliang Wang [qieqie.wang@gmail.com]
 *
 * @since 1.0
 */
interface ReadListener {
    /** @return false to skip this file entirely */
    public boolean onFileBegin(String file);
    public void onFileEnd(String file);
    public void onWord(String word);
}

// --------------------------------------------------------------------
// /src/main/java/net/paoding/analysis/dictionary/support/filewords/SimpleReadListener.java
// (Apache License 2.0 header)
/* (end of Apache License 2.0 header) */
package net.paoding.analysis.dictionary.support.filewords;

import java.util.HashSet;
import java.util.Hashtable;
import java.util.Map;
import java.util.Set;

import net.paoding.analysis.dictionary.Word;
import net.paoding.analysis.knife.CharSet;

/**
 * Collects words from source (non-compiled) dictionary files into per-file
 * sets, skipping comment lines and pure Chinese-numeral words.
 *
 * @author Zhiliang Wang [qieqie.wang@gmail.com]
 *
 * @since 1.0
 */
public class SimpleReadListener implements ReadListener {
    // dictionary name (path without extension) -> its words
    private Map<String, Set<Word>> dics = new Hashtable<String, Set<Word>>();
    private HashSet<Word> words = new HashSet<Word>();
    private String ext = ".dic";

    public SimpleReadListener(String ext) {
        this.ext = ext;
    }

    public SimpleReadListener() {
    }

    public boolean onFileBegin(String file) {
        if (!file.endsWith(ext)) {
            return false;
        }
        words = new HashSet<Word>();
        return true;
    }

    public void onFileEnd(String file) {
        // FIX: strip the configured extension's length; the original
        // hard-coded 4 (".dic") even when a custom ext was supplied.
        String name = file.substring(0, file.length() - ext.length());
        dics.put(name, words);
        words = null;
    }

    public void onWord(String wordText) {
        wordText = wordText.trim().toLowerCase();
        // skip blank lines and lines commented out with '#' or '-'
        if (wordText.length() == 0 || wordText.charAt(0) == '#'
                || wordText.charAt(0) == '-') {
            return;
        }
        // drop words consisting purely of (Chinese) numerals
        for (int i = 0; i < wordText.length(); i++) {
            char ch = wordText.charAt(i);
            int num = CharSet.toNumber(ch);
            if (num >= 0) {
                if (i == 0) {
                    if (num > 10) { // "十二" vs "千万"
                        break;
                    }
                }
                if (num == 2) {
                    if (wordText.equals("两") || wordText.equals("两两")) {
                        break;
                    }
                }
                if (i + 1 == wordText.length()) {
                    return; // every character was a numeral -> skip
                }
            } else {
                break;
            }
        }
        int index = wordText.indexOf('[');
        if (index == -1) {
            words.add(new Word(wordText));
        }
        else {
            // "word[m=N]" syntax: parse the modifier value
            Word w = new Word(wordText.substring(0, index));
            int mindex = wordText.indexOf("m=", index);
            if (mindex != -1) {
                int mEndIndex = wordText.indexOf("]", mindex);
                String m = wordText.substring(mindex + "m=".length(), mEndIndex);
                w.setModifiers(Integer.parseInt(m));
                words.add(w);
            }
        }
    }

    public Map<String, Set<Word>> getResult() {
        return dics;
    }

}

// --------------------------------------------------------------------
// /src/main/java/net/paoding/analysis/dictionary/support/filewords/SimpleReadListener2.java
// (Apache License 2.0 header)
/* (end of Apache License 2.0 header) */
package net.paoding.analysis.dictionary.support.filewords;

import java.util.Collection;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Map;

import net.paoding.analysis.dictionary.Word;

/**
 * Reads compiled dictionary files, collecting words into instances of a
 * configurable collection class.
 *
 * @author Zhiliang Wang [qieqie.wang@gmail.com]
 *
 * @since 1.0
 */
public class SimpleReadListener2 implements ReadListener {
    // dictionary name (path without extension) -> its words
    private Map<String, Collection<Word>> dics = new Hashtable<String, Collection<Word>>();
    private Class collectionClass = HashSet.class;
    private Collection<Word> words;
    private String ext = ".dic";

    public SimpleReadListener2(Class collectionClass, String ext) {
        this.ext = ext;
        this.collectionClass = collectionClass;
    }

    public SimpleReadListener2() {
    }

    public boolean onFileBegin(String file) {
        if (!file.endsWith(ext)) {
            return false;
        }
        try {
            words = (Collection<Word>) collectionClass.newInstance();
        } catch (InstantiationException e) {
            e.printStackTrace();
        } catch (IllegalAccessException e) {
            e.printStackTrace();
        }
        return true;
    }

    public void onFileEnd(String file) {
        // FIX: strip the configured extension's length; the original
        // hard-coded 4 (".dic") even when a custom ext was supplied.
        String name = file.substring(0, file.length() - ext.length());
        dics.put(name, words);
        words = null;
    }

    public void onWord(String wordText) {
        wordText = wordText.trim().toLowerCase();
        // skip blank lines and lines commented out with '#' or '-'
        if (wordText.length() == 0 || wordText.charAt(0) == '#'
                || wordText.charAt(0) == '-') {
            return;
        }

        if (!wordText.endsWith("]")) {
            words.add(new Word(wordText));
        }
        else {
            // "word[m=N]" syntax: parse the modifier value
            int index = wordText.indexOf('[');
            Word w = new Word(wordText.substring(0, index));
            int mindex = wordText.indexOf("m=", index);
            // FIX: guard against a bracket suffix without "m=" — the
            // original computed substring indices from -1.
            if (mindex != -1) {
                int mEndIndex = wordText.indexOf("]", mindex);
                String m = wordText.substring(mindex + "m=".length(), mEndIndex);
                w.setModifiers(Integer.parseInt(m));
            }
            words.add(w);
        }
    }

    public Map<String, Collection<Word>> getResult() {
        return dics;
    }

}

// --------------------------------------------------------------------
// /src/main/java/net/paoding/analysis/exception/PaodingAnalysisException.java
// (Apache License 2.0 header)
// package net.paoding.analysis.exception;

/**
 * Root runtime exception of the paoding analysis library.
 *
 * @author Zhiliang Wang [qieqie.wang@gmail.com]
 */
class PaodingAnalysisException extends RuntimeException {

    private static final long serialVersionUID = 5319477662251490296L;

    public PaodingAnalysisException() {
        super();
    }

    public PaodingAnalysisException(String message, Throwable cause) {
        super(message, cause);
    }

    public PaodingAnalysisException(String message) {
        super(message);
    }

    public PaodingAnalysisException(Throwable cause) {
        super(cause);
    }

}

// --------------------------------------------------------------------
// /src/main/java/net/paoding/analysis/ext/PaodingAnalyzerListener.java
// package net.paoding.analysis.ext;
// import java.util.Collection;
// import net.paoding.analysis.dictionary.Word;
// (interface javadoc and body continue below)
| * 13 | */ 14 | public abstract interface PaodingAnalyzerListener { 15 | 16 | public abstract void readDic(String dicPath); 17 | 18 | 19 | public abstract void readDicFinished(String dicPath, Collection conllec); 20 | 21 | 22 | public abstract void refreshDic(String dicPath, Collection conllec); 23 | 24 | public abstract void readCompileDic(String dicPath); 25 | 26 | public abstract void readCompileDicFinished(String dicPath, Collection conllec); 27 | 28 | } 29 | -------------------------------------------------------------------------------- /src/main/java/net/paoding/analysis/knife/Beef.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2007 The Apache Software Foundation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package net.paoding.analysis.knife; 17 | 18 | /** 19 | * {@link Beef}是要被庖丁“解”的“牛骨肉”,是对文本字符流的高效封装,可以从中读取指定位置的字符。 20 | *

    21 | * {@link Beef}和{@link String}对象的不同之处在于:
    22 | * {@link Beef}共享输入的char数组,{@link String}的策略是对共享数组进行克隆,克隆损耗了性能。
    23 | * 同时,{@link Beef}在 {@link #charAt(int)}方法还进行对字符的预处理,使返回时符合规则:1)toLowerCase 24 | * 2)全角转半角等 25 | *

    26 | * 27 | * @author Zhiliang Wang [qieqie.wang@gmail.com] 28 | * 29 | * @since 1.0 30 | * 31 | */ 32 | public class Beef implements CharSequence { 33 | 34 | // ------------------------------------------------- 35 | 36 | /** 37 | * 文本字符数组 38 | */ 39 | private final char[] value; 40 | 41 | /** 42 | * 字符开始位置,即charAt(i)返回value[offset+i]字符 43 | */ 44 | private int offset; 45 | 46 | /** 47 | * 从offset位置开始的字符数 48 | */ 49 | private int count; 50 | 51 | /** Cache the hash code for the beef */ 52 | private int hash; // Default to 0 53 | 54 | // ------------------------------------------------- 55 | 56 | /** 57 | * 构造函数 58 | * 59 | * @param body 60 | * 被本对象中直接拥有的文本字符数组 61 | * @param offset 62 | * 字符开始位置,即get(i)返回body[offset+i]字符 63 | * @param count 64 | * 从offset位置开始的字符数 65 | */ 66 | public Beef(char[] value, int offset, int count) { 67 | this.value = value; 68 | set(offset, count); 69 | } 70 | 71 | // ------------------------------------------------- 72 | 73 | public void set(int offset, int count) { 74 | if (offset < 0) { 75 | throw new StringIndexOutOfBoundsException(offset); 76 | } 77 | if (count < 0) { 78 | throw new StringIndexOutOfBoundsException(count); 79 | } 80 | if (offset > value.length - count) { 81 | throw new StringIndexOutOfBoundsException(offset + count); 82 | } 83 | this.offset = offset; 84 | this.count = count; 85 | } 86 | 87 | public char[] getValue() { 88 | return value; 89 | } 90 | 91 | 92 | public int getCount() { 93 | return count; 94 | } 95 | 96 | public int getOffset() { 97 | return offset; 98 | } 99 | 100 | // ------------------------------------------------- 101 | 102 | /** 103 | * 获取指定位置的字符。返回之前将被预处理:1)toLowerCase,2)全角转半角等 104 | */ 105 | public char charAt(int index) { 106 | if (index >= 0 && index < count) { 107 | char src = value[offset + index]; 108 | if (src > 65280 && src < 65375) { 109 | src = (char) (src - 65248); 110 | value[offset + index] = src; 111 | } 112 | if (src >= 'A' && src <= 'Z') { 113 | src += 32; 114 | value[offset + 
index] = src; 115 | } else if (src == 12288) { 116 | src = 32; 117 | value[offset + index] = 32; 118 | } 119 | return src; 120 | } 121 | return (char) -1; 122 | } 123 | 124 | public int length() { 125 | return count; 126 | } 127 | 128 | public CharSequence subSequence(int start, int end) { 129 | return new String(value, offset + start, end - start); 130 | } 131 | 132 | // ------------------------------------------------- 133 | 134 | public String toString() { 135 | return new String(value, offset, count); 136 | } 137 | 138 | public int hashCode() { 139 | int h = hash; 140 | if (h == 0) { 141 | int off = offset; 142 | char val[] = value; 143 | int len = count; 144 | 145 | for (int i = 0; i < len; i++) { 146 | h = 31 * h + val[off++]; 147 | } 148 | hash = h; 149 | } 150 | return h; 151 | } 152 | 153 | } 154 | -------------------------------------------------------------------------------- /src/main/java/net/paoding/analysis/knife/CharSet.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2007 The Apache Software Foundation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
/* (end of Apache License 2.0 header) */

/**
 * Character classification helpers used throughout the segmenter.
 *
 * @author Zhiliang Wang [qieqie.wang@gmail.com]
 *
 * @since 1.0
 */
public class CharSet {

    /** ASCII digit '0'..'9'? */
    public static boolean isArabianNumber(char ch) {
        return (ch >= '0' && ch <= '9');
    }

    /** ASCII letter a-z or A-Z? */
    public static boolean isLantingLetter(char ch) {
        return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
    }

    /** Within the CJK Unified Ideographs range handled by the knives? */
    public static boolean isCjkUnifiedIdeographs(char ch) {
        return (ch >= 0x4E00) && (ch < 0xA000);
    }

    /** Byte-order-mark character? */
    public static boolean isBom(char ch) {
        // ref: http://www.w3.org/International/questions/qa-utf8-bom
        return (ch == 0xFEFF) || (ch == 0xFFFE);
    }

    /**
     * Maps an ASCII or Chinese numeral character to its numeric value, or
     * -1 for non-numerals. '万'/'亿' are intentionally excluded (fix for
     * issue 12: overflow bug).
     */
    public static int toNumber(char ch) {
        switch (ch) {
        case '0':
        case '零':
        case '〇':
            return 0;
        case '1':
        case '一':
        case '壹':
            return 1;
        case '2':
        case '二':
        case '两':
        case '俩':
        case '貳':
            return 2;
        case '3':
        case '三':
        case '叁':
            return 3;
        case '4':
        case '四':
        case '肆':
            return 4;
        case '5':
        case '五':
        case '伍':
            return 5;
        case '6':
        case '六':
        case '陆':
            return 6;
        case '7':
        case '柒':
        case '七':
            return 7;
        case '8':
        case '捌':
        case '八':
            return 8;
        case '9':
        case '九':
        case '玖':
            return 9;
        case '十':
        case '什':
            return 10;
        case '百':
        case '佰':
            return 100;
        case '千':
        case '仟':
            return 1000;
        /*
         * Fix issue 12: overflow bug — deliberately not mapped:
         * case '万': case '萬': return 10000;
         * case '亿': case '億': return 100000000;
         */
        default:
            return -1;
        }
    }

}

// --------------------------------------------------------------------
// /src/main/java/net/paoding/analysis/knife/Collector.java
// (Apache License 2.0 header)
/**
 * Collector receives the words that a {@link Knife} carves out of a text
 * stream.
 *
 * @author Zhiliang Wang [qieqie.wang@gmail.com]
 *
 * @see Knife
 *
 * @since 1.0
 *
 */
public interface Collector {

	/**
	 * Invoked each time a Knife recognizes a word in the text stream.
	 * Whether invocations follow the order of the words within the stream
	 * is implementation-dependent.
	 * <p>
	 * For example, when a Knife meets "社会" inside the stream
	 * "中国当代社会现象", the arguments are ("社会", 4, 6).
	 *
	 * @param word
	 *            the recognized word
	 * @param offset
	 *            offset of the word within the text stream
	 * @param end
	 *            position just past the word's last character (the
	 *            character at {@code end} is not part of the word), so
	 *            that {@code end - offset} is the word's length
	 */
	public void collect(String word, int offset, int end);
}
15 | */ 16 | package net.paoding.analysis.knife; 17 | 18 | /** 19 | * 20 | * @author Zhiliang Wang [qieqie.wang@gmail.com] 21 | * 22 | */ 23 | public class CollectorStdoutImpl implements Collector { 24 | 25 | private static ThreadLocal tl = new ThreadLocal() { 26 | protected Integer initialValue() { 27 | return new Integer(0); 28 | } 29 | }; 30 | 31 | public void collect(String word, int begin, int end) { 32 | int last = ((Integer) tl.get()).intValue(); 33 | Integer c = new Integer(last + 1); 34 | tl.set(c); 35 | System.out.println(c + ":\t[" + begin + ", " + end + ")=" + word); 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/net/paoding/analysis/knife/CombinatoricsKnife.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2007 The Apache Software Foundation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package net.paoding.analysis.knife; 17 | 18 | import java.util.HashSet; 19 | 20 | import net.paoding.analysis.dictionary.Dictionary; 21 | import net.paoding.analysis.dictionary.Hit; 22 | 23 | /** 24 | * 排列组合Knife。 25 | *

    26 | * 27 | * 该Knife把遇到的非LIMIT字符视为一个单词分出。
    28 | * 同时如果有以该词语开头的字符串在x-for-combinatorics.dic出现也会切出 29 | * 30 | * @author Zhiliang Wang [qieqie.wang@gmail.com] 31 | * 32 | * @since 1.0 33 | * 34 | */ 35 | public abstract class CombinatoricsKnife implements Knife, DictionariesWare { 36 | 37 | protected Dictionary combinatoricsDictionary; 38 | 39 | protected HashSet noiseTable; 40 | 41 | public CombinatoricsKnife() { 42 | } 43 | 44 | public CombinatoricsKnife(String[] noiseWords) { 45 | setNoiseWords(noiseWords); 46 | } 47 | 48 | public void setNoiseWords(String[] noiseWords) { 49 | noiseTable = new HashSet((int) (noiseWords.length * 1.5)); 50 | for (int i = 0; i < noiseWords.length; i++) { 51 | noiseTable.add(noiseWords[i]); 52 | } 53 | } 54 | 55 | public void setDictionaries(Dictionaries dictionaries) { 56 | combinatoricsDictionary = dictionaries.getCombinatoricsDictionary(); 57 | } 58 | 59 | public int dissect(Collector collector, Beef beef, int offset) { 60 | // 当point == -1时表示本次分解没有遇到POINT性质的字符; 61 | // 如果point != -1,该值表示POINT性质字符的开始位置, 62 | // 这个位置将被返回,下一个Knife将从point位置开始分词 63 | int point = -1; 64 | 65 | // 记录同质字符分词结束极限位置(不包括limit位置的字符)-也就是assignable方法遇到LIMIT性质的字符的位置 66 | // 如果point==-1,limit将被返回,下一个Knife将从limit位置开始尝试分词 67 | int limit = offset + 1; 68 | 69 | // 构建point和limit变量的值: 70 | // 往前直到遇到LIMIT字符; 71 | // 其中如果遇到第一次POINT字符,则会将它记录为point 72 | GO_UNTIL_LIMIT: while (true) { 73 | switch (assignable(beef, offset, limit)) { 74 | case LIMIT: 75 | break GO_UNTIL_LIMIT; 76 | case POINT: 77 | if (point == -1) { 78 | point = limit; 79 | } 80 | } 81 | limit++; 82 | } 83 | // 如果最后一个字符也是ASSIGNED以及POINT, 84 | // 且beef之前已经被分解了一部分(从而能够腾出空间以读入新的字符),则需要重新读入字符后再分词 85 | if (limit == beef.length() && offset > 0) { 86 | return -offset; 87 | } 88 | 89 | // 检索是否有以该词语位前缀的词典词语 90 | // 若有,则将它解出 91 | int dicWordVote = -1; 92 | if (combinatoricsDictionary != null && beef.charAt(limit) > 0xFF) { 93 | dicWordVote = tryDicWord(collector, beef, offset, limit); 94 | } 95 | 96 | // 收集从offset分别到point以及limit的词 97 | // 
注意这里不收集从point到limit的词 98 | // ->当然可能从point到limit的字符也可能是一个词,不过这不是本次分解的责任 99 | // ->如果认为它应该是个词,那么只要配置对应的其它Knife实例,该Knife会有机会把它切出来的 100 | // ->因为我们会返回point作为下一个Knife分词的开始。 101 | 102 | int pointVote = collectPoint(collector, beef, offset, point, limit, 103 | dicWordVote); 104 | int limitVote = collectLimit(collector, beef, offset, point, limit, 105 | dicWordVote); 106 | 107 | return nextOffset(beef, offset, point, limit, pointVote, limitVote, 108 | dicWordVote); 109 | } 110 | 111 | /** 112 | * 通知收集从offset到第一个LIMIT字符的词,并投票下一个Knife开始的分词位置。如果不存在POINT字符,则Point的值为-1。 113 | *

    114 | * 115 | * 默认方法实现:如果不存在POINT性质的字符,则直接返回不做任何切词处理。 116 | * 117 | * @param collector 118 | * @param beef 119 | * @param offset 120 | * 本次分解的内容在beef中的开始位置 121 | * @param point 122 | * 本次分解的内容的第一个POINT性质字符的位置,-1表示不存在该性质的字符 123 | * @param limit 124 | * 本次分解的内容的LIMIT性质字符 125 | * @return 投票下一个Knife开始分词的位置;-1表示弃权。默认方法实现:弃权。 126 | */ 127 | protected int collectPoint(Collector collector, Beef beef, int offset, 128 | int point, int limit, int dicWordVote) { 129 | if (point != -1 && dicWordVote == -1) { 130 | collectIfNotNoise(collector, beef, offset, point); 131 | } 132 | return -1; 133 | } 134 | 135 | /** 136 | * 通知收集从offset到第一个LIMIT字符的词,并投票下一个Knife开始的分词位置。 137 | *

    138 | * 139 | * 默认方法实现:把从offset位置到limit位置止(不包含边界)的字符串视为一个词切出。 140 | * 141 | * @param collector 142 | * @param beef 143 | * @param offset 144 | * 本次分解的内容在beef中的开始位置 145 | * @param point 146 | * 本次分解的内容的第一个POINT性质字符的位置,-1表示不存在该性质的字符 147 | * @param limit 148 | * 本次分解的内容的LIMIT性质字符 149 | * 150 | * @param dicWordVote 151 | * 152 | * @return 投票下一个Knife开始分词的位置;-1表示弃权。默认方法实现:弃权。 153 | */ 154 | protected int collectLimit(Collector collector, Beef beef, int offset, 155 | int point, int limit, int dicWordVote) { 156 | if (dicWordVote == -1) { 157 | collectIfNotNoise(collector, beef, offset, limit); 158 | } 159 | return -1; 160 | } 161 | 162 | /** 163 | * 尝试从combinatorics字典中检索,如果存在以offset到limit位置止(不包含limit边界)字符串开始的词语,则切出该词语。 164 | *

    165 | * 如没有检索到这样的词语,则本方法返回-1弃权投票下一个Knife的开始分解位置。
    166 | * 如果检索到这样的词语,在切出在词语的同时,投票返回这个词语的结束位置(词语本身不包含该结束位置的字符) 167 | *

    168 | * 169 | * (for version 2.0.4+):
    170 | * 本方法目前存在的局限:
    171 | * 如果字典中的某个词语刚好分隔在两次beef之中,比如"U"刚好是此次beef的最后字符,而"盘"是下一次beef的第一个字符,
    172 | * 这种情况现在 {@link CombinatoricsKnife}还没机制办法识别将之处理为一个词语 173 | * 174 | * @param collector 175 | * @param beef 176 | * @param offset 177 | * @param limit 178 | * @return 179 | */ 180 | protected int tryDicWord(Collector collector, Beef beef, int offset, 181 | int limit) { 182 | int ret = limit; 183 | for (int end = limit + 1, count = limit - offset + 1; end <= beef 184 | .length(); end++, count++) { 185 | Hit hit = combinatoricsDictionary.search(beef, offset, count); 186 | if (hit.isUndefined()) { 187 | break; 188 | } else if (hit.isHit()) { 189 | collectIfNotNoise(collector, beef, offset, end); 190 | // 收到词语,将ret设置为该词语的end 191 | ret = end; 192 | } 193 | // gotoNextChar为true表示在词典中存在以当前词为开头的词, 194 | boolean gotoNextChar = hit.isUnclosed() && end < beef.length() 195 | && beef.charAt(end) >= hit.getNext().charAt(count); 196 | if (!gotoNextChar) { 197 | break; 198 | } 199 | } 200 | return ret <= limit ? -1 : ret; 201 | // TODO: 202 | // 存在的局限: 203 | // 刚好词语分隔在两次beef之中,比如"U"刚好是此次beef的最后字符,而"盘"是下一次beef的第一个字符 204 | // 这种情况现在CombinatoricsKnife还没机制办法识别将之处理为一个词语 205 | } 206 | 207 | /** 208 | * 当Knife决定切出从offset始到end位置止(不包含结束位置的字符)的词语时,本方法能够过滤掉可能是noise的词,使最终不切出。 209 | * 210 | * @param collector 211 | * @param beef 212 | * @param offset 213 | * @param end 214 | */ 215 | protected void collectIfNotNoise(Collector collector, Beef beef, 216 | int offset, int end) { 217 | // 将offset和end之间的词(不包含end位置)创建出来给word 218 | // 如果该词语为噪音词,则重新丢弃之(设置为null), 219 | String word = beef.subSequence(offset, end).toString(); 220 | if (noiseTable != null && noiseTable.contains(word)) { 221 | word = null; 222 | } 223 | 224 | // 否则发送消息给collect方法,表示Knife新鲜出炉了一个内容为word的候选词语 225 | // 即:最终决定是否要把这个词语通知给collector的是collect方法 226 | if (word != null) { 227 | doCollect(collector, word, beef, offset, end); 228 | } 229 | } 230 | 231 | /** 232 | * 233 | * 当Knife决定切出从offset始到end位置止(不包含结束位置的字符)的词语时,本方法直接调用{@link #doCollect(Collector, String, Beef, int, int)}切出词语(而不过滤noise词汇) 234 | * 235 | * @param collector 236 | 
* @param beef 237 | * @param offset 238 | * @param end 239 | */ 240 | protected void collect(Collector collector, Beef beef, int offset, int end) { 241 | String word = beef.subSequence(offset, end).toString(); 242 | doCollect(collector, word, beef, offset, end); 243 | } 244 | 245 | /** 246 | * 收集分解出的候选词语。 默认实现是将该候选词语通知给收集器collector。
    247 | * 子类覆盖本方法可以更灵活地控制词语的收录,例如控制仅当word满足一些额外条件再决定是否收集,
    248 | * 或依上下文环境收集更多的相关词语 249 | * 250 | * @param collector 251 | * @param word 252 | * @param beef 253 | * @param offset 254 | * @param end 255 | */ 256 | protected void doCollect(Collector collector, String word, Beef beef, 257 | int offset, int end) { 258 | collector.collect(word, offset, end); 259 | } 260 | 261 | /** 262 | * 根据字符串性质位置,以及分词结果投票,决出下一个Knife应该从哪一个位置开始探测切词 263 | * 264 | * @param beef 265 | * @param offset 266 | * 本次分词的开始位置 267 | * @param point 268 | * 本次分词的第一个POINT性质的字符位置,-1表示没有该性质的字符 269 | * @param limit 270 | * 本次分词的第一个LIMIT性质的字符位置 271 | * @param pointVote 272 | * 收集从offset到第一个POINT性质字符词汇时的投票,-1表示弃权 273 | * @param limitVote 274 | * 收集从offset到第一个LIMIT性质字符词汇时的投票,-1表示弃权 275 | * @param dicWordVote 276 | * 收集combinatorics词典词语时的投票,-1表示弃权 277 | * @return 278 | */ 279 | protected int nextOffset(Beef beef, int offset, int point, int limit, 280 | int pointVote, int limitVote, int dicWordVote) { 281 | int max = pointVote > limitVote ? pointVote : limitVote; 282 | max = max > dicWordVote ? max : dicWordVote; 283 | if (max == -1) { 284 | return point != -1 ? point : limit; 285 | } else if (max > limit) { 286 | return max; 287 | } else { 288 | return limit; 289 | } 290 | } 291 | } 292 | -------------------------------------------------------------------------------- /src/main/java/net/paoding/analysis/knife/Dictionaries.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2007 The Apache Software Foundation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package net.paoding.analysis.knife; 17 | 18 | import net.paoding.analysis.dictionary.Dictionary; 19 | import net.paoding.analysis.dictionary.support.detection.DifferenceListener; 20 | import net.paoding.analysis.ext.PaodingAnalyzerListener; 21 | 22 | /** 23 | * 中文字典缓存根据地,为{@link CJKKnife}所用。
    24 | * 从本对象可以获取中文需要的相关字典。包括词汇表、姓氏表、计量单位表、忽略的词或单字等。 25 | *

    26 | * 27 | * @author Zhiliang Wang [qieqie.wang@gmail.com] 28 | * 29 | * @see CJKKnife 30 | * 31 | * @since 1.0 32 | */ 33 | public interface Dictionaries { 34 | /** 35 | * 词汇表字典 36 | * 37 | * @return 38 | */ 39 | public Dictionary getVocabularyDictionary(); 40 | 41 | /** 42 | * 姓氏字典 43 | * 44 | * @return 45 | */ 46 | public Dictionary getConfucianFamilyNamesDictionary(); 47 | 48 | /** 49 | * 忽略的词语 50 | * 51 | * @return 52 | */ 53 | public Dictionary getNoiseCharactorsDictionary(); 54 | 55 | /** 56 | * 忽略的单字 57 | * 58 | * @return 59 | */ 60 | public Dictionary getNoiseWordsDictionary(); 61 | 62 | /** 63 | * 计量单位 64 | * 65 | * @return 66 | */ 67 | public Dictionary getUnitsDictionary(); 68 | 69 | /** 70 | * lantin+cjk, num+cjk 71 | * @return 72 | */ 73 | public Dictionary getCombinatoricsDictionary(); 74 | 75 | /** 76 | * 77 | * @param l 78 | */ 79 | public void startDetecting(int interval, DifferenceListener l); 80 | 81 | 82 | public void stopDetecting(); 83 | 84 | 85 | public abstract void setAnalyzerListener(PaodingAnalyzerListener listener); 86 | } 87 | -------------------------------------------------------------------------------- /src/main/java/net/paoding/analysis/knife/DictionariesCompiler.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2007 The Apache Software Foundation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package net.paoding.analysis.knife; 17 | 18 | 19 | import java.util.Properties; 20 | 21 | import net.paoding.analysis.ext.PaodingAnalyzerListener; 22 | /** 23 | * 24 | * @author Zhiliang Wang [qieqie.wang@gmail.com] 25 | * 26 | * @since 2.0.4 27 | */ 28 | public interface DictionariesCompiler { 29 | 30 | /** 31 | * 32 | * @param p 33 | * @return 34 | * @throws Exception 35 | */ 36 | public boolean shouldCompile(Properties p) throws Exception; 37 | 38 | /** 39 | * 40 | * @param dictionaries 41 | * @param knife 42 | * @param p 43 | * @throws Exception 44 | */ 45 | public void compile(Dictionaries dictionaries, Knife knife, Properties p) throws Exception; 46 | 47 | /** 48 | * 49 | * @param p 50 | * @return 51 | * @throws Exception 52 | */ 53 | public Dictionaries readCompliedDictionaries(Properties p) throws Exception; 54 | 55 | 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/net/paoding/analysis/knife/DictionariesWare.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2007 The Apache Software Foundation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package net.paoding.analysis.knife; 17 | 18 | /** 19 | * 20 | * @author zhiliang.wang [qieqie.wang@gmail.com] 21 | * @since 2.0.2 22 | */ 23 | public interface DictionariesWare { 24 | 25 | public void setDictionaries(Dictionaries dictionaries); 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/net/paoding/analysis/knife/FakeKnife.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2007 The Apache Software Foundation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package net.paoding.analysis.knife; 17 | 18 | import org.apache.commons.logging.Log; 19 | import org.apache.commons.logging.LogFactory; 20 | 21 | /** 22 | * 无实际用处的Knife。用于示例装载Knife并进行属性设置。 23 | *

    24 | * 25 | * @see paoding-knives-user.properties 26 | * @author zhiliang.wang [qieqie.wang@gmail.com] 27 | * @since 2.0.2 28 | */ 29 | 30 | public class FakeKnife implements Knife, DictionariesWare { 31 | 32 | private Log log = LogFactory.getLog(this.getClass()); 33 | 34 | private String name; 35 | 36 | private int intParam; 37 | 38 | private Inner inner = new Inner(); 39 | 40 | public void setName(String name) { 41 | this.name = name; 42 | log.info("set property: name=" + name); 43 | } 44 | 45 | public String getName() { 46 | return name; 47 | } 48 | 49 | public int getIntParam() { 50 | return intParam; 51 | } 52 | 53 | public void setIntParam(int intParam) { 54 | this.intParam = intParam; 55 | log.info("set property: intParam=" + intParam); 56 | } 57 | 58 | public void setInner(Inner inner) { 59 | this.inner = inner; 60 | } 61 | 62 | public Inner getInner() { 63 | return inner; 64 | } 65 | 66 | public int assignable(Beef beef, int offset, int index) { 67 | return LIMIT; 68 | } 69 | 70 | public int dissect(Collector collector, Beef beef, int offset) { 71 | throw new Error("this knife doesn't accept any beef"); 72 | } 73 | 74 | public void setDictionaries(Dictionaries dictionaries) { 75 | } 76 | 77 | class Inner { 78 | private boolean bool; 79 | 80 | public void setBool(boolean bool) { 81 | this.bool = bool; 82 | log.info("set property: bool=" + bool); 83 | } 84 | 85 | public boolean isBool() { 86 | return bool; 87 | } 88 | } 89 | 90 | } 91 | -------------------------------------------------------------------------------- /src/main/java/net/paoding/analysis/knife/FileDictionariesDifferenceListener.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2007 The Apache Software Foundation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package net.paoding.analysis.knife; 17 | 18 | import java.util.Iterator; 19 | import java.util.LinkedList; 20 | import java.util.List; 21 | 22 | import net.paoding.analysis.dictionary.support.detection.Difference; 23 | import net.paoding.analysis.dictionary.support.detection.DifferenceListener; 24 | import net.paoding.analysis.dictionary.support.detection.Node; 25 | 26 | /** 27 | * 28 | * @author Zhiliang Wang [qieqie.wang@gmail.com] 29 | * 30 | * @since 2.0.2 31 | * 32 | */ 33 | public class FileDictionariesDifferenceListener implements DifferenceListener { 34 | 35 | private FileDictionaries dictionaries; 36 | 37 | private KnifeBox knifeBox; 38 | 39 | public FileDictionariesDifferenceListener() { 40 | } 41 | 42 | public FileDictionariesDifferenceListener(Dictionaries dictionaries, 43 | KnifeBox knifeBox) { 44 | this.dictionaries = (FileDictionaries) dictionaries; 45 | this.knifeBox = knifeBox; 46 | } 47 | 48 | public Dictionaries getDictionaries() { 49 | return dictionaries; 50 | } 51 | 52 | public void setDictionaries(Dictionaries dictionaries) { 53 | this.dictionaries = (FileDictionaries) dictionaries; 54 | } 55 | 56 | public KnifeBox getKnifeBox() { 57 | return knifeBox; 58 | } 59 | 60 | public void setKnifeBox(KnifeBox knifeBox) { 61 | this.knifeBox = knifeBox; 62 | } 63 | 64 | public synchronized void on(Difference diff) { 65 | List all = new LinkedList(); 66 | all.addAll(diff.getDeleted()); 67 | all.addAll(diff.getModified()); 68 | all.addAll(diff.getNewcome()); 69 | for (Iterator iter = all.iterator(); 
iter.hasNext();) { 70 | Node node = iter.next(); 71 | if (node.isFile()) { 72 | dictionaries.refreshDicWords(node.getPath()); 73 | } 74 | } 75 | Knife[] knives = knifeBox.getKnives(); 76 | for (int i = 0; i < knives.length; i ++) { 77 | Knife knife = knives[i]; 78 | if (knife instanceof DictionariesWare) { 79 | ((DictionariesWare) knife).setDictionaries(dictionaries); 80 | } 81 | } 82 | } 83 | 84 | } 85 | -------------------------------------------------------------------------------- /src/main/java/net/paoding/analysis/knife/Knife.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2007 The Apache Software Foundation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package net.paoding.analysis.knife; 17 | 18 | /** 19 | * Knife规定如何分解字符串成词语,并将分解成的词语告知{@link Collector}接口。 20 | *

    21 | * 22 | * @author Zhiliang Wang [qieqie.wang@gmail.com] 23 | * 24 | * @see Collector 25 | * @see Paoding 26 | * @see CJKKnife 27 | * @see CombinatoricsKnife 28 | * @see NumberKnife 29 | * @see LetterKnife 30 | * 31 | * @since 1.0 32 | * 33 | */ 34 | public interface Knife { 35 | 36 | /** 37 | * 表征 {@link #assignable(Beef beef, int offset, int indec)}对index位置字符的性质规定。 38 | * ASSIGNED性质的字符表示该字符可以被Knife接受进行分词。 39 | *

    40 | * {@link KnifeBox}据此将一段由这种性质字符开始的内容(由beef和offset封装)交给Knife分解。 41 | *

    42 | * 同样的一段内容的一个位置的字符,对不同的Knife来说,往往具有不同的性质结果。 43 | *

    44 | * 45 | * @see KnifeBox#dissect(Collector, Beef, int) 46 | */ 47 | int ASSIGNED = 1; 48 | 49 | /** 50 | * 表征 {@link #assignable(Beef beef, int offset, int indec)}对index位置字符的性质规定。 51 | * POINT性质的字符表示如果给定分解的内容之前存在ASSIGNED性质的字符时,该字符可以被Knife接受进行分词。 52 | *

    53 | * {@link KnifeBox}不关心给定的文本内容是否包含POINT性质的字符。
    54 | * 这种性质的字符的最大关心者是{@link Knife}本身。 55 | * 一般情况下,如果存在POINT性质的字符,下一个合适的Knife将从第一个这样性质的字符开始分解内容
    56 | * (这仅是一般的情况,具体是由{@link #dissect(Collector, Beef, int)}来确定)。 57 | *

    58 | * 同样的一段内容的一个位置的字符,对不同的Knife来说,往往具有不同的性质结果。 59 | *

    60 | */ 61 | int POINT = 0; 62 | 63 | /** 64 | * 表征 {@link #assignable(Beef beef, int offset, int indec)}对index位置字符的性质规定。 65 | * LIMIT性质的字符表示给定的字符不属于此Knife的分解范畴。本Knife分解应该到此为止。
    66 | * 一般情况下,如果不存在POINT性质的字符,下一个合适的Knife将从这样性质的字符开始分解内容
    67 | * (这仅是一般的情况,具体是由{@link #dissect(Collector, Beef, int)}来确定)。 68 | *

    69 | * 同样的一段内容的一个位置的字符,对不同的Knife来说,往往具有不同的性质结果。 70 | *

    71 | */ 72 | int LIMIT = -1; 73 | 74 | /** 75 | * 返回beef的index位置字符的性质,{@link KnifeBox}据此决定将一段文本内容“交给”一个合适的Knife切词 76 | * 77 | * @param beef 78 | * 要被分词的字符串 79 | * @param offset 80 | * Knife开始或有可能开始切词的始发位置。 81 | * @param index 82 | * 被判断的字符的位置,本方法返回的即时该位置字符的性质。index>=offset。
    83 | * 当{@link KnifeBox}根据字符的性质(是否为{@link #ASSIGNED})选择Knife分解时,index=offset。 84 | * @return index位置的字符在本Knife中的性质规定
    85 | * 当offset==index时,仅当返回ASSIGNED时,该Knife才有机会被{@link KnifeBox}分配接收文本内容进行分词
    86 | * (即才有机会调用dissect方法) 87 | * @see #LIMIT 88 | * @see #ASSIGNED 89 | * @see #POINT 90 | */ 91 | public int assignable(Beef beef, int offset, int index); 92 | 93 | /** 94 | * 分解词语,并将分解成的词语相关信息告知{@link Collector}接口。 95 | *

    96 | * 分解从beef的offset位置开始,直至可能的结束的位置,结束时返回具有特定意义的一个非0数字。
    97 | * 98 | * @param collector 99 | * 当分解到词语时,collector将被通知接收该词语 100 | * @param beef 101 | * 待分解的字符串内容,这个字符串可能是所要分解的全部字符串的一部分(比如文章中的某一部分),当beef的最后一个字符为'\0'时,表示此次分解是文章最后一段。 102 | * @param offset 103 | * 此次分解从beef的offset位置开始,即本此分解只需从beef.charAt(offset)开始 104 | * @return 非0的整数,即正整数或负整数。
    105 | * 正数时:表示此次分解到该结束位置(不包括该边界),即此次成功分解了从offset到该位置的文本流。
    106 | * 特别地,当其>=beef.lenght()表示已经把beef所有的词语分解完毕
    107 | * 如果,当其==offset时,表示{@link KnifeBox}应该继续遍历还未遍历的Knife,确定是否有其他Knife接收分解offset位置开始的文本内容
    108 | *

    109 | * 负数时:该负数的绝对值必须>=offset。这个绝对值表示此次成功分解了从offset到该绝对值的文本流,剩下的字符,该knife已经不能正确解析。(一般此时应该重新传入新的beef对象解析) 110 | *

    111 | * 比如,有内容为"hello yang!"的文章,先读入8个字符"hello ya",
    112 | * 此时分解后应该返回-5,表示正确解析到5这个位置,即"hello",但必须读入新的字符然后再继续解析。 113 | * 此时beef构造者就读入剩下的字符"ng!"并与前次剩下的" ya"
    114 | * 构成串" yang!",这样才能继续解析,从而解析出"yang"! 115 | * 116 | * 117 | */ 118 | public int dissect(Collector collector, Beef beef, int offset); 119 | } 120 | -------------------------------------------------------------------------------- /src/main/java/net/paoding/analysis/knife/KnifeBox.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2007 The Apache Software Foundation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package net.paoding.analysis.knife; 17 | 18 | import java.util.ArrayList; 19 | import java.util.Iterator; 20 | import java.util.List; 21 | 22 | /** 23 | * KnifeBox负责决策当遇到字符串指定位置时应使用的Knife对象. 24 | *

    25 | * 26 | * @author Zhiliang Wang [qieqie.wang@gmail.com] 27 | * 28 | * @see Paoding 29 | * 30 | * @since 1.0 31 | * 32 | */ 33 | public class KnifeBox implements Knife { 34 | 35 | private Knife[] knives; 36 | 37 | private int size; 38 | 39 | public KnifeBox() { 40 | } 41 | 42 | public KnifeBox(List knives) { 43 | this.setKnives(knives); 44 | } 45 | 46 | public KnifeBox(Knife[] knives) { 47 | this.setKnives(knives); 48 | } 49 | 50 | /** 51 | * 返回配置的所有Knife
    52 | * !!!不要去变更返回数组中的元素 53 | * 54 | * @return 55 | */ 56 | public Knife[] getKnives() { 57 | return knives; 58 | } 59 | 60 | public void setKnives(List knifeList) { 61 | if (knifeList == null) { 62 | knifeList = new ArrayList(0); 63 | } 64 | size = knifeList.size(); 65 | this.knives = new Knife[size]; 66 | Iterator iter = knifeList.iterator(); 67 | for (int i = 0; i < size; i++) { 68 | this.knives[i] = iter.next(); 69 | } 70 | } 71 | 72 | public void setKnives(Knife[] knives) { 73 | if (knives == null) { 74 | knives = new Knife[0]; 75 | } 76 | size = knives.length; 77 | this.knives = new Knife[size]; 78 | System.arraycopy(knives, 0, this.knives, 0, size); 79 | } 80 | 81 | public int assignable(Beef beef, int offset, int index) { 82 | return ASSIGNED; 83 | } 84 | 85 | public int dissect(Collector collector, Beef beef, int offset) { 86 | Knife knife; 87 | for (int i = 0; i < size; i++) { 88 | knife = knives[i]; 89 | if (ASSIGNED == knife.assignable(beef, offset, offset)) { 90 | int lastLimit = knife.dissect(collector, beef, offset); 91 | // 如果返回的下一个分词点发生了变化(可进可退),则直接返回之, 92 | // 否则继续让下一个Knife有机会分词 93 | if (lastLimit != offset) { 94 | return lastLimit; 95 | } 96 | } 97 | } 98 | return ++offset; 99 | } 100 | 101 | } 102 | -------------------------------------------------------------------------------- /src/main/java/net/paoding/analysis/knife/LetterKnife.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2007 The Apache Software Foundation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package net.paoding.analysis.knife; 17 | 18 | 19 | /** 20 | * 21 | * @author Zhiliang Wang [qieqie.wang@gmail.com] 22 | * 23 | */ 24 | public class LetterKnife extends CombinatoricsKnife { 25 | 26 | public static final String[] DEFAULT_NOISE = { "a", "an", "and", "are", "as", "at", 27 | "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", 28 | "not", "of", "on", "or", "such", "that", "the", "their", "then", 29 | "there", "these", "they", "this", "to", "was", "will", "with", 30 | "www" }; 31 | 32 | 33 | public LetterKnife() { 34 | super(DEFAULT_NOISE); 35 | } 36 | 37 | public LetterKnife(String[] noiseWords) { 38 | super(noiseWords); 39 | } 40 | 41 | public int assignable(Beef beef, int offset, int index) { 42 | char ch = beef.charAt(index); 43 | if (CharSet.isLantingLetter(ch)) { 44 | return ASSIGNED; 45 | } 46 | if (index > offset) { 47 | if ((ch >= '0' && ch <= '9') || ch == '-' || ch == '_') { 48 | return POINT; 49 | } 50 | } 51 | return LIMIT; 52 | } 53 | 54 | 55 | } 56 | -------------------------------------------------------------------------------- /src/main/java/net/paoding/analysis/knife/NumberKnife.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2007 The Apache Software Foundation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package net.paoding.analysis.knife;

import java.math.BigInteger;

import net.paoding.analysis.dictionary.Dictionary;
import net.paoding.analysis.dictionary.Hit;

/**
 * Knife that recognizes numbers: Arabic-digit runs (possibly extended by
 * letters, '.', '-' or '_') and Chinese-numeral sequences, the latter
 * optionally followed by a measure-unit word looked up in the units
 * dictionary supplied via {@link #setDictionaries(Dictionaries)}.
 *
 * @author Zhiliang Wang [qieqie.wang@gmail.com]
 *
 */
public class NumberKnife extends CombinatoricsKnife implements DictionariesWare {

	// measure-unit dictionary (from Dictionaries.getUnitsDictionary());
	// remains null until setDictionaries is called
	private Dictionary units;

	public NumberKnife() {
	}

	public NumberKnife(Dictionaries dictionaries) {
		setDictionaries(dictionaries);
	}

	public void setDictionaries(Dictionaries dictionaries) {
		super.setDictionaries(dictionaries);
		units = dictionaries.getUnitsDictionary();
	}

	/**
	 * An Arabic digit always belongs to the token (ASSIGNED); a letter,
	 * '.', '-' or '_' may continue a token but can never start one; any
	 * other character ends the token (LIMIT).
	 */
	public int assignable(Beef beef, int offset, int index) {
		char ch = beef.charAt(index);
		if (CharSet.isArabianNumber(ch))
			return ASSIGNED;
		if (index > offset) {
			if (CharSet.isLantingLetter(ch) || ch == '.'
					|| ch == '-' || ch == '_') {
				if (ch == '-' || ch == '_' || CharSet.isLantingLetter(ch)
						|| !CharSet.isArabianNumber(beef.charAt(index + 1))) {
					// Tokenization effect:
					// 123.456    -> 123.456/
					// 123.abc.34 -> 123/123.abc.34/abc/34/  ["abc" and "abc/34" are produced by LetterKnife, not NumberKnife]
					// Without the !CharSet.isArabianNumber(beef.charAt(index + 1)) disjunct, "123." would be emitted instead of "123":
					// 123.abc.34 -> 123./123.abc.34/abc/34/
					return POINT;
				}
				return ASSIGNED;
			}
		}
		return LIMIT;
	}

	/**
	 * Collects either the token delegated to super (when a POINT such as in
	 * "123abc" was recorded) or a Chinese-numeral value converted to its
	 * Arabic form, plus an optional trailing measure-unit word.
	 */
	protected int collectLimit(Collector collector, Beef beef,
			int offset, int point, int limit, int dicWordVote) {
		// Inputs like "123abc" (a POINT was recorded): delegate straight to super.
		if (point != -1) {
			return super.collectLimit(collector, beef, offset, point, limit, dicWordVote);
		}
		//
		// e.g. 2.2两
		//         ^=_point
		//
		final int _point = limit;
		// position of the character currently being examined
		int curTail = offset;
		/*
		 * Fix issue 56: follow-up of the Chinese-numeral parsing problem
		 */
		// number1 accumulates the completed value, number2 the digit group
		// currently being read; -1 means "no value yet".
		BigInteger number1 = BigInteger.valueOf(-1);
		BigInteger number2 = BigInteger.valueOf(-1);
		int bitValue = 0;
		int maxUnit = 0;
		// TODO: this loop re-scans from curTail (== offset), repeating the numeral
		// test already performed earlier — a redundant computation, but its impact
		// on overall segmentation performance is negligible, so it is left
		// unoptimized for now.
		for (; (bitValue = CharSet.toNumber(beef.charAt(curTail))) >= 0; curTail++) {
			// '两'/'俩'/'倆' count as the numeral 2 only at the start of the
			// run; anywhere else they terminate the numeral.
			if (bitValue == 2
					&& (beef.charAt(curTail) == '两' || beef.charAt(curTail) == '俩' || beef
							.charAt(curTail) == '倆')) {
				if (curTail != offset) {
					break;
				}
			}
			// Handle runs of single-digit Chinese numerals: "三四五六" -> "3456"
			if (bitValue >= 0 && bitValue < 10) {
				if (number2.compareTo(BigInteger.ZERO) < 0)
					number2 = BigInteger.valueOf(bitValue);
				else {
					number2 = number2.multiply(BigInteger.valueOf(10));
					number2 = number2.add(BigInteger.valueOf(bitValue));
				}
			} else {
				// bitValue is a multiplier unit (10, 100, 1000, ...)
				if (number2.compareTo(BigInteger.ZERO) < 0) {
					if (number1.compareTo(BigInteger.ZERO) < 0) {
						number1 = BigInteger.ONE;
					}
					number1 = number1.multiply(BigInteger.valueOf(bitValue));
				} else {
					if (number1.compareTo(BigInteger.ZERO) < 0) {
						number1 = BigInteger.ZERO;
					}
					if (bitValue >= maxUnit) {
						number1 = number1.add(number2);
						number1 = number1.multiply(BigInteger.valueOf(bitValue));
						maxUnit = bitValue;
					} else {
						number1 = number1.add(number2.multiply(BigInteger.valueOf(bitValue)));
					}
				}
				number2 = BigInteger.valueOf(-1);
			}
		}
		// fold in any trailing digit group
		if (number2.compareTo(BigInteger.ZERO) > 0) {
			if (number1.compareTo(BigInteger.ZERO) < 0) {
				number1 = number2;
			} else {
				number1 = number1.add(number2);
			}
		}
		if (number1.compareTo(BigInteger.ZERO) >= 0 && curTail > _point) {
			doCollect(collector, String.valueOf(number1), beef, offset, curTail);
		}
		else {
			super.collectLimit(collector, beef, offset, point, limit, dicWordVote);
		}

		curTail = curTail > limit ? curTail : limit;

		//
		// A measure-unit word may follow the number.
		if (units != null && CharSet.isCjkUnifiedIdeographs(beef.charAt(curTail))) {
			Hit wd = null;
			Hit wd2 = null;
			int i = curTail + 1;

			/*
			 * Fix issue 48: out-of-range highlighting error caused by the
			 * measure-unit lookup
			 */
			while (i <= limit && (wd = units.search(beef, curTail, i - curTail)).isHit()) {
				wd2 = wd;
				i++;
				if (!wd.isUnclosed()) {
					break;
				}
			}
			i --;
			if (wd2 != null) {
				collector.collect(wd2.getWord().getText(), curTail, i);
				return i;
			}
		}
		//

		return curTail > limit ? curTail : -1;
	}


}

/**
 * Copyright 2007 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package net.paoding.analysis.knife;


/**
 * Paoding is the cook of the Zhuangzi parable ("庖丁") who devoted his life
 * to dissecting oxen, carrying a box containing all kinds of knives.

 * Because he owns all these different "knives" and can tell which "meat"
 * (characters) calls for which knife, he dissects the whole ox with ease,
 * producing well-cut "slices of meat" (words).
 * Here each "knife" is played by a {@link Knife} implementation; the knives
 * are managed by the "knife box" {@link KnifeBox} (a Paoding object is itself
 * a KnifeBox), and the KnifeBox decides when to wield which knife.
 *
 * @author Zhiliang Wang [qieqie.wang@gmail.com]
 *
 * @see Knife
 * @see KnifeBox
 *
 * @since 1.0
 */
public class Paoding extends SmartKnifeBox implements Knife {

}

/**
 * Copyright 2007 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
15 | */ 16 | package net.paoding.analysis.knife; 17 | 18 | public class SmartKnifeBox extends KnifeBox implements Knife { 19 | 20 | public int dissect(Collector collector, Beef beef, int offset) { 21 | final int beefLength = beef.length(); 22 | while (offset >= 0 && offset < beefLength) { 23 | offset = super.dissect(collector, beef, offset); 24 | } 25 | return offset; 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/main/resources/paoding-analysis-default.properties: -------------------------------------------------------------------------------- 1 | 2 | paoding.imports=\ 3 | ifexists:classpath:paoding-analyzer.properties;\ 4 | ifexists:classpath:paoding-dic-home.properties;\ 5 | ifexists:dic-home:paoding-dic-names.properties;\ 6 | ifexists:classpath:paoding-knives.properties; 7 | 8 | -------------------------------------------------------------------------------- /src/main/resources/paoding-analysis.properties: -------------------------------------------------------------------------------- 1 | 2 | paoding.imports=\ 3 | ifexists:classpath:paoding-analysis-default.properties;\ 4 | ifexists:classpath:paoding-analysis-user.properties;\ 5 | ifexists:classpath:paoding-knives-user.properties 6 | 7 | -------------------------------------------------------------------------------- /src/main/resources/paoding-analyzer.properties: -------------------------------------------------------------------------------- 1 | 2 | 3 | #PaodingAnlyzer Mode, "most-words", "max-word-length", "class:com.xxx.MyTokenCollectorImpl"... 
4 | #paoding.analyzer.mode=most-words 5 | #paoding.analyzer.dictionaries.compiler=net.paoding.analysis.analyzer.impl.MostWordsModeDictionariesCompiler 6 | #paoding.analyzer.mode=max-word-length 7 | #paoding.analyzer.dictionaries.compiler=net.paoding.analysis.analyzer.impl.SortingDictionariesCompiler 8 | -------------------------------------------------------------------------------- /src/main/resources/paoding-dic-home.properties: -------------------------------------------------------------------------------- 1 | 2 | #values are "system-env" or "this"; 3 | #if value is "this" , using the paoding.dic.home as dicHome if configed! 4 | #paoding.dic.home.config-first=system-env 5 | 6 | #dictionary home (directory) 7 | #"classpath:xxx" means dictionary home is in classpath. 8 | #e.g "classpath:dic" means dictionaries are in "classes/dic" directory or any other classpath directory 9 | paoding.dic.home=dic 10 | 11 | #seconds for dic modification detection 12 | #paoding.dic.detector.interval=60 13 | -------------------------------------------------------------------------------- /src/main/resources/paoding-knives-user.properties: -------------------------------------------------------------------------------- 1 | 2 | # an example of knife with properties setting 3 | #paoding.knife.class.fakeKnife=net.paoding.analysis.knife.FakeKnife 4 | #paoding.knife.class.fakeKnife.name=Fake Knife 5 | #paoding.knife.class.fakeKnife.intParam=34 6 | #paoding.knife.class.fakeKnife.inner.bool=true 7 | 8 | -------------------------------------------------------------------------------- /src/main/resources/paoding-knives.properties: -------------------------------------------------------------------------------- 1 | 2 | paoding.knife.class.letterKnife=net.paoding.analysis.knife.LetterKnife 3 | paoding.knife.class.numberKnife=net.paoding.analysis.knife.NumberKnife 4 | paoding.knife.class.cjkKnife=net.paoding.analysis.knife.CJKKnife 5 | 
/**
 *
 */
package net.paoding.analysis.t;

import java.io.IOException;
import java.io.StringReader;

import net.paoding.analysis.analyzer.PaodingAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/**
 * Demonstrates that a PaodingAnalyzer can be reused: tokenizes the same text
 * several times and prints the resulting terms.
 *
 * @author ZhenQin
 *
 */
public class AnalysisCompare {

	public AnalysisCompare() {

	}

	/**
	 * Tokenizes {@code text} with {@code analyzer} and prints each term
	 * followed by a tab, ending the line with a newline.
	 *
	 * Bug fix: the TokenStream is now released in a finally block, and
	 * end() is invoked per the Lucene 4.x TokenStream workflow, so an
	 * exception thrown while iterating no longer leaks the stream and the
	 * analyzer remains reusable for the next call.
	 *
	 * @throws IOException if the token stream fails while reading
	 */
	public static void parse(Analyzer analyzer, String text) throws IOException {
		TokenStream ts = analyzer.tokenStream("text", new StringReader(text));
		try {
			// register the attribute before consuming the stream (Lucene 4.x API,
			// different from the lucene 2.x way of reading tokens)
			CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
			ts.reset();
			// print every term produced by the segmentation
			while (ts.incrementToken()) {
				System.out.print(termAtt.toString() + "\t");
			}
			System.out.println();
			ts.end();
		} finally {
			ts.close();
		}
	}

	public static void main(String[] args) throws IOException {
		Analyzer paodingAnalyzer = new PaodingAnalyzer();

		String text = "你吃饭了吗";
		// analyze the same text five times to show the analyzer is reusable
		for (int i = 0; i < 5; i++) {
			parse(paodingAnalyzer, text);
		}
	}

}

package net.paoding.analysis.t;

import java.io.File;
import java.io.IOException;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import
org.apache.lucene.analysis.Analyzer; 9 | import org.apache.lucene.document.Document; 10 | import org.apache.lucene.document.Field.Store; 11 | import org.apache.lucene.document.TextField; 12 | import org.apache.lucene.index.DirectoryReader; 13 | import org.apache.lucene.index.IndexWriter; 14 | import org.apache.lucene.index.IndexWriterConfig; 15 | import org.apache.lucene.queryparser.classic.QueryParser; 16 | import org.apache.lucene.search.IndexSearcher; 17 | import org.apache.lucene.search.MatchAllDocsQuery; 18 | import org.apache.lucene.store.Directory; 19 | import org.apache.lucene.store.FSDirectory; 20 | import org.apache.lucene.store.RAMDirectory; 21 | import org.apache.lucene.util.Version; 22 | import org.junit.Assert; 23 | 24 | public class InMemoryShortExample { 25 | 26 | private static final Analyzer ANALYZER = new PaodingAnalyzer(); 27 | 28 | public static void main(String[] args) { 29 | // Construct a RAMDirectory to hold the in-memory representation 30 | // of the index. 31 | 32 | try { 33 | // Directory idx = FSDirectory.open(new File("F:/data/lucene/fix")); 34 | Directory idx = new RAMDirectory(); 35 | // Make an writer to create the index 36 | IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_46, 37 | ANALYZER); 38 | 39 | IndexWriter writer = new IndexWriter(idx, iwc); 40 | 41 | // Add some Document objects containing quotes 42 | writer.addDocument(createDocument("维基百科:关于中文维基百科", "维基百科:关于中文维基百科")); 43 | 44 | writer.commit(); 45 | 46 | IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(idx)); 47 | Assert.assertTrue(searcher.search(new QueryParser(Version.LUCENE_46, 48 | "title", ANALYZER).parse("title:'维基'"), 10).totalHits > 0); 49 | } catch (Exception ioe) { 50 | // In this example we aren't really doing an I/O, so this 51 | // exception should never actually be thrown. 52 | ioe.printStackTrace(); 53 | } 54 | } 55 | 56 | /** 57 | * Make a Document object with an un-indexed title field and an indexed 58 | * content field. 
59 | */ 60 | private static Document createDocument(String title, String content) { 61 | Document doc = new Document(); 62 | 63 | // Add the title as an unindexed field... 64 | doc.add(new TextField("title", title, Store.YES)); 65 | 66 | // ...and the content as an indexed field. Note that indexed 67 | // Text fields are constructed using a Reader. Lucene can read 68 | // and index very large chunks of text, without storing the 69 | // entire content verbatim in the index. In this example we 70 | // can just wrap the content string in a StringReader. 71 | doc.add(new TextField("content", content, Store.YES)); 72 | 73 | return doc; 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/test/java/net/paoding/analysis/t/SplitTest.java: -------------------------------------------------------------------------------- 1 | package net.paoding.analysis.t; 2 | 3 | import net.paoding.analysis.analyzer.PaodingAnalyzer; 4 | import net.paoding.analysis.analyzer.PaodingTokenizer; 5 | import net.paoding.analysis.analyzer.impl.MaxWordLengthTokenCollector; 6 | import net.paoding.analysis.knife.Paoding; 7 | import org.apache.lucene.analysis.Analyzer; 8 | import org.apache.lucene.queryparser.classic.QueryParser; 9 | import org.apache.lucene.search.Query; 10 | import org.apache.lucene.util.Version; 11 | import org.junit.Test; 12 | 13 | import java.io.StringReader; 14 | 15 | /** 16 | *

    17 |  *
    18 |  * Created by IntelliJ IDEA.
    19 |  * User: ZhenQin
    20 |  * Date: 14-1-7
    21 |  * Time: 下午3:42
    22 |  * To change this template use File | Settings | File Templates.
    23 |  *
    24 |  * 
    25 | * 26 | * @author ZhenQin 27 | */ 28 | public class SplitTest { 29 | 30 | Analyzer ANALYZER = new PaodingAnalyzer(); 31 | 32 | public SplitTest() { 33 | } 34 | 35 | 36 | @Test 37 | public void testSplitChinese() throws Exception { 38 | String txt = "汉文化和服装 汉文化"; 39 | PaodingTokenizer tokenizer = new PaodingTokenizer( 40 | new StringReader(txt), 41 | new Paoding(), 42 | new MaxWordLengthTokenCollector()); 43 | 44 | System.out.println(tokenizer); 45 | } 46 | 47 | 48 | @Test 49 | public void testParse() throws Exception { 50 | Query query = new QueryParser(Version.LUCENE_46, 51 | "title", ANALYZER).parse("title:你吃饭被撑死了吗"); 52 | 53 | System.out.println(query); 54 | } 55 | } 56 | --------------------------------------------------------------------------------